diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,140393 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 20050, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.987531172069826e-05, + "grad_norm": 2.6472109546739415, + "learning_rate": 0.0, + "loss": 1.1899, + "step": 1 + }, + { + "epoch": 9.975062344139652e-05, + "grad_norm": 4.413802674662548, + "learning_rate": 8.305647840531561e-08, + "loss": 1.1836, + "step": 2 + }, + { + "epoch": 0.00014962593516209475, + "grad_norm": 2.5388195920867886, + "learning_rate": 1.6611295681063123e-07, + "loss": 1.1753, + "step": 3 + }, + { + "epoch": 0.00019950124688279303, + "grad_norm": 2.7096737630695045, + "learning_rate": 2.4916943521594683e-07, + "loss": 1.1622, + "step": 4 + }, + { + "epoch": 0.00024937655860349125, + "grad_norm": 3.0894056979095508, + "learning_rate": 3.3222591362126246e-07, + "loss": 1.1512, + "step": 5 + }, + { + "epoch": 0.0002992518703241895, + "grad_norm": 3.032336489981861, + "learning_rate": 4.1528239202657814e-07, + "loss": 1.2183, + "step": 6 + }, + { + "epoch": 0.00034912718204488776, + "grad_norm": 2.792395197805048, + "learning_rate": 4.983388704318937e-07, + "loss": 1.2415, + "step": 7 + }, + { + "epoch": 0.00039900249376558606, + "grad_norm": 2.750379330231259, + "learning_rate": 5.813953488372093e-07, + "loss": 1.1956, + "step": 8 + }, + { + "epoch": 0.0004488778054862843, + "grad_norm": 3.8250437177436836, + "learning_rate": 6.644518272425249e-07, + "loss": 1.2141, + "step": 9 + }, + { + "epoch": 0.0004987531172069825, + "grad_norm": 6.423501056664673, + "learning_rate": 7.475083056478406e-07, + "loss": 1.1801, + "step": 10 + }, + { + "epoch": 0.0005486284289276808, + "grad_norm": 2.190217546355279, + "learning_rate": 8.305647840531563e-07, + "loss": 1.0985, + "step": 11 + }, + { + "epoch": 0.000598503740648379, + "grad_norm": 2.3744659373491106, + "learning_rate": 9.136212624584719e-07, + "loss": 1.0862, + "step": 12 + }, + { + "epoch": 0.0006483790523690773, + "grad_norm": 2.1896398272114714, + "learning_rate": 9.966777408637873e-07, + "loss": 1.1623, + "step": 13 + }, + { + "epoch": 0.0006982543640897755, + "grad_norm": 2.7661390502047394, + "learning_rate": 1.079734219269103e-06, + "loss": 1.1871, + "step": 14 + }, + { + "epoch": 0.0007481296758104738, + "grad_norm": 2.5405457782252108, + "learning_rate": 1.1627906976744186e-06, + "loss": 1.1345, + "step": 15 + }, + { + "epoch": 0.0007980049875311721, + "grad_norm": 1.8231167740606211, + "learning_rate": 1.2458471760797342e-06, + "loss": 1.1412, + "step": 16 + }, + { + "epoch": 0.0008478802992518703, + "grad_norm": 2.1572161393138143, + "learning_rate": 1.3289036544850498e-06, + "loss": 1.1825, + "step": 17 + }, + { + "epoch": 0.0008977556109725686, + "grad_norm": 2.5951068231825656, + "learning_rate": 1.4119601328903657e-06, + "loss": 1.1631, + "step": 18 + }, + { + "epoch": 0.0009476309226932668, + "grad_norm": 3.807132297012551, + "learning_rate": 1.4950166112956813e-06, + "loss": 1.1236, + "step": 19 + }, + { + "epoch": 0.000997506234413965, + "grad_norm": 2.17715716285994, + "learning_rate": 1.5780730897009967e-06, + "loss": 1.1903, + "step": 20 + }, + { + "epoch": 0.0010473815461346633, + "grad_norm": 1.83628453668541, + "learning_rate": 1.6611295681063126e-06, + "loss": 1.1669, + "step": 21 + }, + { + "epoch": 0.0010972568578553616, + "grad_norm": 1.8669822483173097, + "learning_rate": 1.744186046511628e-06, + "loss": 1.1749, + "step": 22 + }, + { + "epoch": 0.00114713216957606, + "grad_norm": 1.6127334140261442, + "learning_rate": 1.8272425249169438e-06, + "loss": 1.1341, + "step": 23 + }, + { + "epoch": 0.001197007481296758, + "grad_norm": 1.3709831142757472, + "learning_rate": 1.9102990033222592e-06, + "loss": 1.142, + "step": 24 + }, + { + "epoch": 0.0012468827930174563, + "grad_norm": 1.693752543379472, + "learning_rate": 1.9933554817275746e-06, + "loss": 1.0768, + "step": 25 + }, + { + "epoch": 0.0012967581047381546, + "grad_norm": 1.315914832601601, + "learning_rate": 2.0764119601328905e-06, + "loss": 1.0784, + "step": 26 + }, + { + "epoch": 0.001346633416458853, + "grad_norm": 1.2619052432019053, + "learning_rate": 2.159468438538206e-06, + "loss": 1.0822, + "step": 27 + }, + { + "epoch": 0.001396508728179551, + "grad_norm": 1.2738817816319803, + "learning_rate": 2.2425249169435217e-06, + "loss": 1.0502, + "step": 28 + }, + { + "epoch": 0.0014463840399002493, + "grad_norm": 1.4934925482961487, + "learning_rate": 2.325581395348837e-06, + "loss": 1.0275, + "step": 29 + }, + { + "epoch": 0.0014962593516209476, + "grad_norm": 1.249018683364538, + "learning_rate": 2.408637873754153e-06, + "loss": 1.0926, + "step": 30 + }, + { + "epoch": 0.001546134663341646, + "grad_norm": 1.3000781321417525, + "learning_rate": 2.4916943521594684e-06, + "loss": 1.0538, + "step": 31 + }, + { + "epoch": 0.0015960099750623442, + "grad_norm": 1.5066115563152203, + "learning_rate": 2.5747508305647842e-06, + "loss": 1.0709, + "step": 32 + }, + { + "epoch": 0.0016458852867830423, + "grad_norm": 1.174731989845452, + "learning_rate": 2.6578073089700997e-06, + "loss": 1.0448, + "step": 33 + }, + { + "epoch": 0.0016957605985037406, + "grad_norm": 1.1568545895133024, + "learning_rate": 2.7408637873754155e-06, + "loss": 1.0879, + "step": 34 + }, + { + "epoch": 0.001745635910224439, + "grad_norm": 1.3196006697633855, + "learning_rate": 2.8239202657807313e-06, + "loss": 1.0978, + "step": 35 + }, + { + "epoch": 0.0017955112219451373, + "grad_norm": 1.390570460769799, + "learning_rate": 2.9069767441860468e-06, + "loss": 1.0501, + "step": 36 + }, + { + "epoch": 0.0018453865336658353, + "grad_norm": 1.1644551720368836, + "learning_rate": 2.9900332225913626e-06, + "loss": 1.0574, + "step": 37 + }, + { + "epoch": 0.0018952618453865336, + "grad_norm": 1.0770387846959188, + "learning_rate": 3.073089700996678e-06, + "loss": 1.035, + "step": 38 + }, + { + "epoch": 0.001945137157107232, + "grad_norm": 1.1427489612559232, + "learning_rate": 3.1561461794019934e-06, + "loss": 1.0718, + "step": 39 + }, + { + "epoch": 0.00199501246882793, + "grad_norm": 1.7274631865374839, + "learning_rate": 3.239202657807309e-06, + "loss": 1.0574, + "step": 40 + }, + { + "epoch": 0.0020448877805486283, + "grad_norm": 1.3307066117195687, + "learning_rate": 3.322259136212625e-06, + "loss": 1.0756, + "step": 41 + }, + { + "epoch": 0.0020947630922693266, + "grad_norm": 2.568802276368685, + "learning_rate": 3.4053156146179405e-06, + "loss": 1.0574, + "step": 42 + }, + { + "epoch": 0.002144638403990025, + "grad_norm": 1.6634357068605048, + "learning_rate": 3.488372093023256e-06, + "loss": 1.0885, + "step": 43 + }, + { + "epoch": 0.0021945137157107233, + "grad_norm": 0.986971479198236, + "learning_rate": 3.5714285714285714e-06, + "loss": 1.1068, + "step": 44 + }, + { + "epoch": 0.0022443890274314216, + "grad_norm": 1.4467661598839698, + "learning_rate": 3.6544850498338876e-06, + "loss": 1.0633, + "step": 45 + }, + { + "epoch": 0.00229426433915212, + "grad_norm": 0.9857178530323059, + "learning_rate": 3.737541528239203e-06, + "loss": 1.042, + "step": 46 + }, + { + "epoch": 0.0023441396508728177, + "grad_norm": 0.9967231434325955, + "learning_rate": 3.8205980066445185e-06, + "loss": 1.0239, + "step": 47 + }, + { + "epoch": 0.002394014962593516, + "grad_norm": 0.8633942954854189, + "learning_rate": 3.903654485049834e-06, + "loss": 1.0511, + "step": 48 + }, + { + "epoch": 0.0024438902743142144, + "grad_norm": 1.5643646818103363, + "learning_rate": 3.986710963455149e-06, + "loss": 1.0647, + "step": 49 + }, + { + "epoch": 0.0024937655860349127, + "grad_norm": 1.8446223341221197, + "learning_rate": 4.0697674418604655e-06, + "loss": 1.0657, + "step": 50 + }, + { + "epoch": 0.002543640897755611, + "grad_norm": 1.9554218378289059, + "learning_rate": 4.152823920265781e-06, + "loss": 1.0, + "step": 51 + }, + { + "epoch": 0.0025935162094763093, + "grad_norm": 1.3474525588045507, + "learning_rate": 4.235880398671096e-06, + "loss": 1.0277, + "step": 52 + }, + { + "epoch": 0.0026433915211970076, + "grad_norm": 0.963110884552223, + "learning_rate": 4.318936877076412e-06, + "loss": 0.9887, + "step": 53 + }, + { + "epoch": 0.002693266832917706, + "grad_norm": 1.1578718396435577, + "learning_rate": 4.401993355481727e-06, + "loss": 1.0782, + "step": 54 + }, + { + "epoch": 0.002743142144638404, + "grad_norm": 1.2994073648991007, + "learning_rate": 4.4850498338870435e-06, + "loss": 0.9999, + "step": 55 + }, + { + "epoch": 0.002793017456359102, + "grad_norm": 1.0420545339594225, + "learning_rate": 4.568106312292359e-06, + "loss": 1.0184, + "step": 56 + }, + { + "epoch": 0.0028428927680798004, + "grad_norm": 0.9457931409633268, + "learning_rate": 4.651162790697674e-06, + "loss": 1.011, + "step": 57 + }, + { + "epoch": 0.0028927680798004987, + "grad_norm": 1.6117746603326408, + "learning_rate": 4.73421926910299e-06, + "loss": 1.0195, + "step": 58 + }, + { + "epoch": 0.002942643391521197, + "grad_norm": 0.9013209543007481, + "learning_rate": 4.817275747508306e-06, + "loss": 0.9724, + "step": 59 + }, + { + "epoch": 0.0029925187032418953, + "grad_norm": 1.619350524154054, + "learning_rate": 4.900332225913621e-06, + "loss": 1.0362, + "step": 60 + }, + { + "epoch": 0.0030423940149625936, + "grad_norm": 2.4024458031429248, + "learning_rate": 4.983388704318937e-06, + "loss": 1.0027, + "step": 61 + }, + { + "epoch": 0.003092269326683292, + "grad_norm": 1.2454712359509146, + "learning_rate": 5.066445182724252e-06, + "loss": 0.9887, + "step": 62 + }, + { + "epoch": 0.00314214463840399, + "grad_norm": 1.7379804376161592, + "learning_rate": 5.1495016611295685e-06, + "loss": 1.0134, + "step": 63 + }, + { + "epoch": 0.0031920199501246885, + "grad_norm": 1.6059434270612978, + "learning_rate": 5.232558139534884e-06, + "loss": 1.0129, + "step": 64 + }, + { + "epoch": 0.0032418952618453864, + "grad_norm": 1.1280663878630746, + "learning_rate": 5.315614617940199e-06, + "loss": 0.97, + "step": 65 + }, + { + "epoch": 0.0032917705735660847, + "grad_norm": 1.0778755614052078, + "learning_rate": 5.398671096345516e-06, + "loss": 0.9858, + "step": 66 + }, + { + "epoch": 0.003341645885286783, + "grad_norm": 0.8505701199931851, + "learning_rate": 5.481727574750831e-06, + "loss": 0.9689, + "step": 67 + }, + { + "epoch": 0.0033915211970074813, + "grad_norm": 0.9212890210890757, + "learning_rate": 5.5647840531561464e-06, + "loss": 1.006, + "step": 68 + }, + { + "epoch": 0.0034413965087281796, + "grad_norm": 0.9910560177706003, + "learning_rate": 5.647840531561463e-06, + "loss": 0.9873, + "step": 69 + }, + { + "epoch": 0.003491271820448878, + "grad_norm": 1.0863500276579228, + "learning_rate": 5.730897009966778e-06, + "loss": 1.0039, + "step": 70 + }, + { + "epoch": 0.003541147132169576, + "grad_norm": 0.985068636320863, + "learning_rate": 5.8139534883720935e-06, + "loss": 1.0056, + "step": 71 + }, + { + "epoch": 0.0035910224438902745, + "grad_norm": 0.8676092165929453, + "learning_rate": 5.897009966777409e-06, + "loss": 0.9828, + "step": 72 + }, + { + "epoch": 0.0036408977556109724, + "grad_norm": 0.9508228323041943, + "learning_rate": 5.980066445182725e-06, + "loss": 0.9908, + "step": 73 + }, + { + "epoch": 0.0036907730673316707, + "grad_norm": 1.2134044539428448, + "learning_rate": 6.063122923588041e-06, + "loss": 0.9274, + "step": 74 + }, + { + "epoch": 0.003740648379052369, + "grad_norm": 1.142035687345757, + "learning_rate": 6.146179401993356e-06, + "loss": 0.9893, + "step": 75 + }, + { + "epoch": 0.0037905236907730673, + "grad_norm": 1.0640651623187343, + "learning_rate": 6.2292358803986714e-06, + "loss": 0.9976, + "step": 76 + }, + { + "epoch": 0.0038403990024937656, + "grad_norm": 0.8987646953079174, + "learning_rate": 6.312292358803987e-06, + "loss": 0.9696, + "step": 77 + }, + { + "epoch": 0.003890274314214464, + "grad_norm": 0.9289266588187, + "learning_rate": 6.395348837209303e-06, + "loss": 1.0034, + "step": 78 + }, + { + "epoch": 0.003940149625935162, + "grad_norm": 1.0808365503753237, + "learning_rate": 6.478405315614618e-06, + "loss": 1.0381, + "step": 79 + }, + { + "epoch": 0.00399002493765586, + "grad_norm": 1.3187720617400025, + "learning_rate": 6.561461794019934e-06, + "loss": 0.9887, + "step": 80 + }, + { + "epoch": 0.004039900249376558, + "grad_norm": 0.8358174503203746, + "learning_rate": 6.64451827242525e-06, + "loss": 0.9435, + "step": 81 + }, + { + "epoch": 0.004089775561097257, + "grad_norm": 1.0732765937938802, + "learning_rate": 6.727574750830565e-06, + "loss": 0.9583, + "step": 82 + }, + { + "epoch": 0.004139650872817955, + "grad_norm": 0.7972294594600061, + "learning_rate": 6.810631229235881e-06, + "loss": 0.9539, + "step": 83 + }, + { + "epoch": 0.004189526184538653, + "grad_norm": 0.9215463362054117, + "learning_rate": 6.893687707641196e-06, + "loss": 0.955, + "step": 84 + }, + { + "epoch": 0.004239401496259352, + "grad_norm": 1.0308602674660858, + "learning_rate": 6.976744186046512e-06, + "loss": 0.9351, + "step": 85 + }, + { + "epoch": 0.00428927680798005, + "grad_norm": 0.9582778854636269, + "learning_rate": 7.059800664451828e-06, + "loss": 0.9857, + "step": 86 + }, + { + "epoch": 0.004339152119700748, + "grad_norm": 1.1760927382812223, + "learning_rate": 7.142857142857143e-06, + "loss": 1.0077, + "step": 87 + }, + { + "epoch": 0.0043890274314214465, + "grad_norm": 0.8061479213129779, + "learning_rate": 7.225913621262459e-06, + "loss": 0.9846, + "step": 88 + }, + { + "epoch": 0.004438902743142145, + "grad_norm": 1.4507285791294249, + "learning_rate": 7.308970099667775e-06, + "loss": 0.9983, + "step": 89 + }, + { + "epoch": 0.004488778054862843, + "grad_norm": 1.1451608859205809, + "learning_rate": 7.39202657807309e-06, + "loss": 1.0132, + "step": 90 + }, + { + "epoch": 0.004538653366583541, + "grad_norm": 0.8791956202857473, + "learning_rate": 7.475083056478406e-06, + "loss": 0.956, + "step": 91 + }, + { + "epoch": 0.00458852867830424, + "grad_norm": 0.9107703405526227, + "learning_rate": 7.558139534883721e-06, + "loss": 0.9986, + "step": 92 + }, + { + "epoch": 0.004638403990024938, + "grad_norm": 0.8617344826547315, + "learning_rate": 7.641196013289037e-06, + "loss": 0.9573, + "step": 93 + }, + { + "epoch": 0.0046882793017456355, + "grad_norm": 6.002843556279148, + "learning_rate": 7.724252491694352e-06, + "loss": 0.9646, + "step": 94 + }, + { + "epoch": 0.004738154613466334, + "grad_norm": 1.2994488071128132, + "learning_rate": 7.807308970099668e-06, + "loss": 1.0338, + "step": 95 + }, + { + "epoch": 0.004788029925187032, + "grad_norm": 0.9148923389689512, + "learning_rate": 7.890365448504985e-06, + "loss": 0.9635, + "step": 96 + }, + { + "epoch": 0.00483790523690773, + "grad_norm": 1.1105685947534945, + "learning_rate": 7.973421926910299e-06, + "loss": 0.9855, + "step": 97 + }, + { + "epoch": 0.004887780548628429, + "grad_norm": 0.9518790865979369, + "learning_rate": 8.056478405315616e-06, + "loss": 0.9457, + "step": 98 + }, + { + "epoch": 0.004937655860349127, + "grad_norm": 0.8608799517773011, + "learning_rate": 8.139534883720931e-06, + "loss": 1.0071, + "step": 99 + }, + { + "epoch": 0.004987531172069825, + "grad_norm": 1.0288850448881515, + "learning_rate": 8.222591362126247e-06, + "loss": 0.8855, + "step": 100 + }, + { + "epoch": 0.005037406483790524, + "grad_norm": 1.0163593301048701, + "learning_rate": 8.305647840531562e-06, + "loss": 0.9917, + "step": 101 + }, + { + "epoch": 0.005087281795511222, + "grad_norm": 0.9027324644256767, + "learning_rate": 8.388704318936877e-06, + "loss": 0.9568, + "step": 102 + }, + { + "epoch": 0.00513715710723192, + "grad_norm": 1.3072287461370216, + "learning_rate": 8.471760797342193e-06, + "loss": 0.9741, + "step": 103 + }, + { + "epoch": 0.0051870324189526185, + "grad_norm": 1.9914235862029142, + "learning_rate": 8.55481727574751e-06, + "loss": 0.9864, + "step": 104 + }, + { + "epoch": 0.005236907730673317, + "grad_norm": 0.8613032618542104, + "learning_rate": 8.637873754152824e-06, + "loss": 0.9381, + "step": 105 + }, + { + "epoch": 0.005286783042394015, + "grad_norm": 0.9329680767768779, + "learning_rate": 8.72093023255814e-06, + "loss": 1.015, + "step": 106 + }, + { + "epoch": 0.0053366583541147134, + "grad_norm": 1.7039092944674432, + "learning_rate": 8.803986710963454e-06, + "loss": 0.9542, + "step": 107 + }, + { + "epoch": 0.005386533665835412, + "grad_norm": 0.9570975091245273, + "learning_rate": 8.887043189368772e-06, + "loss": 1.0001, + "step": 108 + }, + { + "epoch": 0.00543640897755611, + "grad_norm": 0.9514044490375047, + "learning_rate": 8.970099667774087e-06, + "loss": 0.9753, + "step": 109 + }, + { + "epoch": 0.005486284289276808, + "grad_norm": 0.916011484848683, + "learning_rate": 9.053156146179402e-06, + "loss": 0.9816, + "step": 110 + }, + { + "epoch": 0.005536159600997507, + "grad_norm": 0.908312725905651, + "learning_rate": 9.136212624584718e-06, + "loss": 0.9336, + "step": 111 + }, + { + "epoch": 0.005586034912718204, + "grad_norm": 1.1380772158074282, + "learning_rate": 9.219269102990033e-06, + "loss": 0.9964, + "step": 112 + }, + { + "epoch": 0.005635910224438902, + "grad_norm": 1.2918557485050404, + "learning_rate": 9.302325581395349e-06, + "loss": 0.9873, + "step": 113 + }, + { + "epoch": 0.005685785536159601, + "grad_norm": 0.9696200518328774, + "learning_rate": 9.385382059800666e-06, + "loss": 0.9714, + "step": 114 + }, + { + "epoch": 0.005735660847880299, + "grad_norm": 0.8854766670320557, + "learning_rate": 9.46843853820598e-06, + "loss": 1.0065, + "step": 115 + }, + { + "epoch": 0.005785536159600997, + "grad_norm": 1.0921841120096407, + "learning_rate": 9.551495016611297e-06, + "loss": 0.9306, + "step": 116 + }, + { + "epoch": 0.005835411471321696, + "grad_norm": 1.0114014217943825, + "learning_rate": 9.634551495016612e-06, + "loss": 0.9578, + "step": 117 + }, + { + "epoch": 0.005885286783042394, + "grad_norm": 2.118364885629767, + "learning_rate": 9.717607973421927e-06, + "loss": 0.9795, + "step": 118 + }, + { + "epoch": 0.005935162094763092, + "grad_norm": 2.5941086232308876, + "learning_rate": 9.800664451827243e-06, + "loss": 0.9429, + "step": 119 + }, + { + "epoch": 0.0059850374064837905, + "grad_norm": 1.3781646468317763, + "learning_rate": 9.883720930232558e-06, + "loss": 0.9383, + "step": 120 + }, + { + "epoch": 0.006034912718204489, + "grad_norm": 1.006007577156106, + "learning_rate": 9.966777408637874e-06, + "loss": 0.9864, + "step": 121 + }, + { + "epoch": 0.006084788029925187, + "grad_norm": 1.102137994178984, + "learning_rate": 1.004983388704319e-05, + "loss": 0.9537, + "step": 122 + }, + { + "epoch": 0.0061346633416458855, + "grad_norm": 2.0629456342342323, + "learning_rate": 1.0132890365448504e-05, + "loss": 0.9776, + "step": 123 + }, + { + "epoch": 0.006184538653366584, + "grad_norm": 1.4650873502340087, + "learning_rate": 1.0215946843853822e-05, + "loss": 0.9472, + "step": 124 + }, + { + "epoch": 0.006234413965087282, + "grad_norm": 0.9838833519760324, + "learning_rate": 1.0299003322259137e-05, + "loss": 0.9761, + "step": 125 + }, + { + "epoch": 0.00628428927680798, + "grad_norm": 1.4303320848850762, + "learning_rate": 1.0382059800664452e-05, + "loss": 0.9629, + "step": 126 + }, + { + "epoch": 0.006334164588528679, + "grad_norm": 1.1633508772048078, + "learning_rate": 1.0465116279069768e-05, + "loss": 0.9764, + "step": 127 + }, + { + "epoch": 0.006384039900249377, + "grad_norm": 3.893660533389512, + "learning_rate": 1.0548172757475083e-05, + "loss": 0.9623, + "step": 128 + }, + { + "epoch": 0.006433915211970074, + "grad_norm": 1.0311364735340547, + "learning_rate": 1.0631229235880399e-05, + "loss": 0.9288, + "step": 129 + }, + { + "epoch": 0.006483790523690773, + "grad_norm": 1.0813950590936987, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.9807, + "step": 130 + }, + { + "epoch": 0.006533665835411471, + "grad_norm": 2.0059160260687, + "learning_rate": 1.0797342192691031e-05, + "loss": 0.9355, + "step": 131 + }, + { + "epoch": 0.006583541147132169, + "grad_norm": 0.9984365514423702, + "learning_rate": 1.0880398671096347e-05, + "loss": 0.9695, + "step": 132 + }, + { + "epoch": 0.006633416458852868, + "grad_norm": 1.235039849865121, + "learning_rate": 1.0963455149501662e-05, + "loss": 0.9458, + "step": 133 + }, + { + "epoch": 0.006683291770573566, + "grad_norm": 1.222714144327992, + "learning_rate": 1.1046511627906977e-05, + "loss": 1.0148, + "step": 134 + }, + { + "epoch": 0.006733167082294264, + "grad_norm": 1.002907111165248, + "learning_rate": 1.1129568106312293e-05, + "loss": 0.9639, + "step": 135 + }, + { + "epoch": 0.0067830423940149626, + "grad_norm": 1.2847800105577805, + "learning_rate": 1.1212624584717608e-05, + "loss": 0.9983, + "step": 136 + }, + { + "epoch": 0.006832917705735661, + "grad_norm": 2.4186598746770533, + "learning_rate": 1.1295681063122925e-05, + "loss": 0.9702, + "step": 137 + }, + { + "epoch": 0.006882793017456359, + "grad_norm": 1.5099187085635464, + "learning_rate": 1.1378737541528239e-05, + "loss": 0.9808, + "step": 138 + }, + { + "epoch": 0.0069326683291770575, + "grad_norm": 0.9808552839923038, + "learning_rate": 1.1461794019933556e-05, + "loss": 0.9482, + "step": 139 + }, + { + "epoch": 0.006982543640897756, + "grad_norm": 1.0460307719421527, + "learning_rate": 1.1544850498338872e-05, + "loss": 0.9572, + "step": 140 + }, + { + "epoch": 0.007032418952618454, + "grad_norm": 1.1022461697971893, + "learning_rate": 1.1627906976744187e-05, + "loss": 0.9812, + "step": 141 + }, + { + "epoch": 0.007082294264339152, + "grad_norm": 1.052422755502857, + "learning_rate": 1.1710963455149502e-05, + "loss": 0.9972, + "step": 142 + }, + { + "epoch": 0.007132169576059851, + "grad_norm": 1.5281673423327167, + "learning_rate": 1.1794019933554818e-05, + "loss": 0.9351, + "step": 143 + }, + { + "epoch": 0.007182044887780549, + "grad_norm": 0.9870069570642992, + "learning_rate": 1.1877076411960133e-05, + "loss": 0.9713, + "step": 144 + }, + { + "epoch": 0.007231920199501247, + "grad_norm": 0.888515596585298, + "learning_rate": 1.196013289036545e-05, + "loss": 0.9689, + "step": 145 + }, + { + "epoch": 0.007281795511221945, + "grad_norm": 1.1984043794843815, + "learning_rate": 1.2043189368770764e-05, + "loss": 0.9543, + "step": 146 + }, + { + "epoch": 0.007331670822942643, + "grad_norm": 0.8310961805074422, + "learning_rate": 1.2126245847176081e-05, + "loss": 0.9572, + "step": 147 + }, + { + "epoch": 0.007381546134663341, + "grad_norm": 0.8727004384344264, + "learning_rate": 1.2209302325581395e-05, + "loss": 0.9957, + "step": 148 + }, + { + "epoch": 0.00743142144638404, + "grad_norm": 1.359606587205955, + "learning_rate": 1.2292358803986712e-05, + "loss": 0.9715, + "step": 149 + }, + { + "epoch": 0.007481296758104738, + "grad_norm": 1.1356158452100718, + "learning_rate": 1.2375415282392027e-05, + "loss": 0.953, + "step": 150 + }, + { + "epoch": 0.007531172069825436, + "grad_norm": 0.9237266945545487, + "learning_rate": 1.2458471760797343e-05, + "loss": 0.9596, + "step": 151 + }, + { + "epoch": 0.007581047381546135, + "grad_norm": 0.8466889748568027, + "learning_rate": 1.2541528239202657e-05, + "loss": 0.9001, + "step": 152 + }, + { + "epoch": 0.007630922693266833, + "grad_norm": 0.8991391398016045, + "learning_rate": 1.2624584717607974e-05, + "loss": 0.9733, + "step": 153 + }, + { + "epoch": 0.007680798004987531, + "grad_norm": 0.8886223754121466, + "learning_rate": 1.2707641196013289e-05, + "loss": 0.9543, + "step": 154 + }, + { + "epoch": 0.0077306733167082295, + "grad_norm": 1.3058842869147627, + "learning_rate": 1.2790697674418606e-05, + "loss": 0.9536, + "step": 155 + }, + { + "epoch": 0.007780548628428928, + "grad_norm": 1.2865882509294142, + "learning_rate": 1.2873754152823922e-05, + "loss": 0.9714, + "step": 156 + }, + { + "epoch": 0.007830423940149626, + "grad_norm": 0.9571048433838016, + "learning_rate": 1.2956810631229235e-05, + "loss": 0.9317, + "step": 157 + }, + { + "epoch": 0.007880299251870324, + "grad_norm": 1.480869800690198, + "learning_rate": 1.303986710963455e-05, + "loss": 0.9881, + "step": 158 + }, + { + "epoch": 0.007930174563591023, + "grad_norm": 0.7192241502440309, + "learning_rate": 1.3122923588039868e-05, + "loss": 0.942, + "step": 159 + }, + { + "epoch": 0.00798004987531172, + "grad_norm": 0.8350091251759587, + "learning_rate": 1.3205980066445183e-05, + "loss": 0.922, + "step": 160 + }, + { + "epoch": 0.00802992518703242, + "grad_norm": 1.1763502638604988, + "learning_rate": 1.32890365448505e-05, + "loss": 0.9494, + "step": 161 + }, + { + "epoch": 0.008079800498753117, + "grad_norm": 0.8711630908684843, + "learning_rate": 1.3372093023255814e-05, + "loss": 0.9672, + "step": 162 + }, + { + "epoch": 0.008129675810473816, + "grad_norm": 1.611564309615934, + "learning_rate": 1.345514950166113e-05, + "loss": 0.9768, + "step": 163 + }, + { + "epoch": 0.008179551122194513, + "grad_norm": 0.9895290881972809, + "learning_rate": 1.3538205980066445e-05, + "loss": 1.0015, + "step": 164 + }, + { + "epoch": 0.008229426433915213, + "grad_norm": 1.3428027586486775, + "learning_rate": 1.3621262458471762e-05, + "loss": 0.9456, + "step": 165 + }, + { + "epoch": 0.00827930174563591, + "grad_norm": 0.966298231122404, + "learning_rate": 1.3704318936877078e-05, + "loss": 0.9888, + "step": 166 + }, + { + "epoch": 0.00832917705735661, + "grad_norm": 0.8057181656858916, + "learning_rate": 1.3787375415282391e-05, + "loss": 0.9395, + "step": 167 + }, + { + "epoch": 0.008379052369077307, + "grad_norm": 0.9458260504929873, + "learning_rate": 1.3870431893687708e-05, + "loss": 0.9473, + "step": 168 + }, + { + "epoch": 0.008428927680798006, + "grad_norm": 0.9816447371924518, + "learning_rate": 1.3953488372093024e-05, + "loss": 0.9736, + "step": 169 + }, + { + "epoch": 0.008478802992518703, + "grad_norm": 1.0989267486198677, + "learning_rate": 1.403654485049834e-05, + "loss": 0.9475, + "step": 170 + }, + { + "epoch": 0.0085286783042394, + "grad_norm": 1.6792464071154445, + "learning_rate": 1.4119601328903656e-05, + "loss": 0.9981, + "step": 171 + }, + { + "epoch": 0.0085785536159601, + "grad_norm": 0.8810829808411863, + "learning_rate": 1.420265780730897e-05, + "loss": 0.9573, + "step": 172 + }, + { + "epoch": 0.008628428927680797, + "grad_norm": 0.8736326665690809, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.9684, + "step": 173 + }, + { + "epoch": 0.008678304239401496, + "grad_norm": 1.2259712637292866, + "learning_rate": 1.4368770764119603e-05, + "loss": 0.9909, + "step": 174 + }, + { + "epoch": 0.008728179551122194, + "grad_norm": 0.9988943834829557, + "learning_rate": 1.4451827242524918e-05, + "loss": 0.9449, + "step": 175 + }, + { + "epoch": 0.008778054862842893, + "grad_norm": 0.8998477545110397, + "learning_rate": 1.4534883720930233e-05, + "loss": 1.0146, + "step": 176 + }, + { + "epoch": 0.00882793017456359, + "grad_norm": 1.0227012135724447, + "learning_rate": 1.461794019933555e-05, + "loss": 0.9581, + "step": 177 + }, + { + "epoch": 0.00887780548628429, + "grad_norm": 2.5804740528009975, + "learning_rate": 1.4700996677740864e-05, + "loss": 0.9351, + "step": 178 + }, + { + "epoch": 0.008927680798004987, + "grad_norm": 1.4635606083583559, + "learning_rate": 1.478405315614618e-05, + "loss": 0.9623, + "step": 179 + }, + { + "epoch": 0.008977556109725686, + "grad_norm": 1.8570347101611404, + "learning_rate": 1.4867109634551497e-05, + "loss": 1.0335, + "step": 180 + }, + { + "epoch": 0.009027431421446384, + "grad_norm": 1.0246998860017718, + "learning_rate": 1.4950166112956812e-05, + "loss": 0.9549, + "step": 181 + }, + { + "epoch": 0.009077306733167083, + "grad_norm": 0.8139410903325798, + "learning_rate": 1.5033222591362128e-05, + "loss": 0.9858, + "step": 182 + }, + { + "epoch": 0.00912718204488778, + "grad_norm": 0.8454963506199463, + "learning_rate": 1.5116279069767441e-05, + "loss": 0.9297, + "step": 183 + }, + { + "epoch": 0.00917705735660848, + "grad_norm": 0.9222532417654145, + "learning_rate": 1.5199335548172758e-05, + "loss": 1.0247, + "step": 184 + }, + { + "epoch": 0.009226932668329177, + "grad_norm": 1.1138581289360452, + "learning_rate": 1.5282392026578074e-05, + "loss": 0.9466, + "step": 185 + }, + { + "epoch": 0.009276807980049876, + "grad_norm": 0.944512896559732, + "learning_rate": 1.536544850498339e-05, + "loss": 0.9174, + "step": 186 + }, + { + "epoch": 0.009326683291770574, + "grad_norm": 0.7317545414016215, + "learning_rate": 1.5448504983388705e-05, + "loss": 0.9207, + "step": 187 + }, + { + "epoch": 0.009376558603491271, + "grad_norm": 1.329728486945316, + "learning_rate": 1.553156146179402e-05, + "loss": 0.9736, + "step": 188 + }, + { + "epoch": 0.00942643391521197, + "grad_norm": 0.9792818198380825, + "learning_rate": 1.5614617940199335e-05, + "loss": 0.9452, + "step": 189 + }, + { + "epoch": 0.009476309226932668, + "grad_norm": 1.0149181937844727, + "learning_rate": 1.569767441860465e-05, + "loss": 0.9682, + "step": 190 + }, + { + "epoch": 0.009526184538653367, + "grad_norm": 2.354461038628803, + "learning_rate": 1.578073089700997e-05, + "loss": 0.9497, + "step": 191 + }, + { + "epoch": 0.009576059850374064, + "grad_norm": 1.087476612455294, + "learning_rate": 1.5863787375415285e-05, + "loss": 0.9829, + "step": 192 + }, + { + "epoch": 0.009625935162094763, + "grad_norm": 1.1634133583529949, + "learning_rate": 1.5946843853820597e-05, + "loss": 0.9226, + "step": 193 + }, + { + "epoch": 0.00967581047381546, + "grad_norm": 1.0727308629842893, + "learning_rate": 1.6029900332225913e-05, + "loss": 0.975, + "step": 194 + }, + { + "epoch": 0.00972568578553616, + "grad_norm": 0.8075333331302329, + "learning_rate": 1.611295681063123e-05, + "loss": 0.9223, + "step": 195 + }, + { + "epoch": 0.009775561097256857, + "grad_norm": 1.4139326219681962, + "learning_rate": 1.6196013289036547e-05, + "loss": 0.9273, + "step": 196 + }, + { + "epoch": 0.009825436408977557, + "grad_norm": 0.9974309836214726, + "learning_rate": 1.6279069767441862e-05, + "loss": 0.9317, + "step": 197 + }, + { + "epoch": 0.009875311720698254, + "grad_norm": 0.839376997774348, + "learning_rate": 1.6362126245847174e-05, + "loss": 0.9885, + "step": 198 + }, + { + "epoch": 0.009925187032418953, + "grad_norm": 1.027122402125168, + "learning_rate": 1.6445182724252493e-05, + "loss": 0.9142, + "step": 199 + }, + { + "epoch": 0.00997506234413965, + "grad_norm": 1.3387019326312704, + "learning_rate": 1.652823920265781e-05, + "loss": 0.9427, + "step": 200 + }, + { + "epoch": 0.01002493765586035, + "grad_norm": 1.135288689067514, + "learning_rate": 1.6611295681063124e-05, + "loss": 0.9413, + "step": 201 + }, + { + "epoch": 0.010074812967581047, + "grad_norm": 0.982610423556646, + "learning_rate": 1.669435215946844e-05, + "loss": 0.9495, + "step": 202 + }, + { + "epoch": 0.010124688279301746, + "grad_norm": 0.9464627595827015, + "learning_rate": 1.6777408637873755e-05, + "loss": 0.9376, + "step": 203 + }, + { + "epoch": 0.010174563591022444, + "grad_norm": 1.167701870185551, + "learning_rate": 1.686046511627907e-05, + "loss": 0.9519, + "step": 204 + }, + { + "epoch": 0.010224438902743141, + "grad_norm": 0.9798751311031798, + "learning_rate": 1.6943521594684386e-05, + "loss": 0.9268, + "step": 205 + }, + { + "epoch": 0.01027431421446384, + "grad_norm": 1.7091499933859817, + "learning_rate": 1.70265780730897e-05, + "loss": 0.9408, + "step": 206 + }, + { + "epoch": 0.010324189526184538, + "grad_norm": 1.27186760216749, + "learning_rate": 1.710963455149502e-05, + "loss": 0.9369, + "step": 207 + }, + { + "epoch": 0.010374064837905237, + "grad_norm": 1.682225646704079, + "learning_rate": 1.7192691029900332e-05, + "loss": 0.9335, + "step": 208 + }, + { + "epoch": 0.010423940149625935, + "grad_norm": 0.9026614580314402, + "learning_rate": 1.7275747508305647e-05, + "loss": 0.9533, + "step": 209 + }, + { + "epoch": 0.010473815461346634, + "grad_norm": 1.3103102272856253, + "learning_rate": 1.7358803986710963e-05, + "loss": 0.9192, + "step": 210 + }, + { + "epoch": 0.010523690773067331, + "grad_norm": 1.189320363546404, + "learning_rate": 1.744186046511628e-05, + "loss": 0.8955, + "step": 211 + }, + { + "epoch": 0.01057356608478803, + "grad_norm": 1.0131377066272715, + "learning_rate": 1.7524916943521597e-05, + "loss": 0.9172, + "step": 212 + }, + { + "epoch": 0.010623441396508728, + "grad_norm": 1.011833127279949, + "learning_rate": 1.760797342192691e-05, + "loss": 0.9475, + "step": 213 + }, + { + "epoch": 0.010673316708229427, + "grad_norm": 1.4155865409441144, + "learning_rate": 1.7691029900332228e-05, + "loss": 0.9147, + "step": 214 + }, + { + "epoch": 0.010723192019950124, + "grad_norm": 1.0370384558454584, + "learning_rate": 1.7774086378737543e-05, + "loss": 0.9479, + "step": 215 + }, + { + "epoch": 0.010773067331670824, + "grad_norm": 1.0394093533915347, + "learning_rate": 1.785714285714286e-05, + "loss": 0.9369, + "step": 216 + }, + { + "epoch": 0.010822942643391521, + "grad_norm": 0.7237823561810162, + "learning_rate": 1.7940199335548174e-05, + "loss": 0.9256, + "step": 217 + }, + { + "epoch": 0.01087281795511222, + "grad_norm": 0.9416301902977466, + "learning_rate": 1.802325581395349e-05, + "loss": 0.96, + "step": 218 + }, + { + "epoch": 0.010922693266832918, + "grad_norm": 0.9135004198613729, + "learning_rate": 1.8106312292358805e-05, + "loss": 0.9654, + "step": 219 + }, + { + "epoch": 0.010972568578553617, + "grad_norm": 0.8541802261905664, + "learning_rate": 1.818936877076412e-05, + "loss": 0.9356, + "step": 220 + }, + { + "epoch": 0.011022443890274314, + "grad_norm": 0.7652139649511556, + "learning_rate": 1.8272425249169436e-05, + "loss": 0.9674, + "step": 221 + }, + { + "epoch": 0.011072319201995013, + "grad_norm": 3.960923100822008, + "learning_rate": 1.835548172757475e-05, + "loss": 0.9496, + "step": 222 + }, + { + "epoch": 0.01112219451371571, + "grad_norm": 0.9363055345633922, + "learning_rate": 1.8438538205980066e-05, + "loss": 0.9505, + "step": 223 + }, + { + "epoch": 0.011172069825436408, + "grad_norm": 1.0488574377983093, + "learning_rate": 1.8521594684385382e-05, + "loss": 0.9263, + "step": 224 + }, + { + "epoch": 0.011221945137157107, + "grad_norm": 1.0931525741592518, + "learning_rate": 1.8604651162790697e-05, + "loss": 0.9797, + "step": 225 + }, + { + "epoch": 0.011271820448877805, + "grad_norm": 0.7993212454302326, + "learning_rate": 1.8687707641196016e-05, + "loss": 0.9462, + "step": 226 + }, + { + "epoch": 0.011321695760598504, + "grad_norm": 0.792758879660068, + "learning_rate": 1.877076411960133e-05, + "loss": 0.9698, + "step": 227 + }, + { + "epoch": 0.011371571072319201, + "grad_norm": 0.7113877582360928, + "learning_rate": 1.8853820598006647e-05, + "loss": 0.9226, + "step": 228 + }, + { + "epoch": 0.0114214463840399, + "grad_norm": 1.268407200227433, + "learning_rate": 1.893687707641196e-05, + "loss": 0.9568, + "step": 229 + }, + { + "epoch": 0.011471321695760598, + "grad_norm": 0.8733851464906407, + "learning_rate": 1.9019933554817278e-05, + "loss": 0.9451, + "step": 230 + }, + { + "epoch": 0.011521197007481297, + "grad_norm": 1.0550910924497896, + "learning_rate": 1.9102990033222593e-05, + "loss": 0.9597, + "step": 231 + }, + { + "epoch": 0.011571072319201995, + "grad_norm": 1.0577384326140673, + "learning_rate": 1.918604651162791e-05, + "loss": 0.9126, + "step": 232 + }, + { + "epoch": 0.011620947630922694, + "grad_norm": 0.8511473213905226, + "learning_rate": 1.9269102990033224e-05, + "loss": 0.9116, + "step": 233 + }, + { + "epoch": 0.011670822942643391, + "grad_norm": 0.9988108769360206, + "learning_rate": 1.935215946843854e-05, + "loss": 0.9232, + "step": 234 + }, + { + "epoch": 0.01172069825436409, + "grad_norm": 0.8310514353688783, + "learning_rate": 1.9435215946843855e-05, + "loss": 0.9606, + "step": 235 + }, + { + "epoch": 0.011770573566084788, + "grad_norm": 0.9434425590782999, + "learning_rate": 1.951827242524917e-05, + "loss": 0.9385, + "step": 236 + }, + { + "epoch": 0.011820448877805487, + "grad_norm": 1.3204247543153744, + "learning_rate": 1.9601328903654486e-05, + "loss": 0.9597, + "step": 237 + }, + { + "epoch": 0.011870324189526184, + "grad_norm": 0.8355790358223315, + "learning_rate": 1.9684385382059804e-05, + "loss": 0.9238, + "step": 238 + }, + { + "epoch": 0.011920199501246884, + "grad_norm": 1.4501889829158001, + "learning_rate": 1.9767441860465116e-05, + "loss": 0.9303, + "step": 239 + }, + { + "epoch": 0.011970074812967581, + "grad_norm": 0.7275520115228064, + "learning_rate": 1.9850498338870432e-05, + "loss": 0.9311, + "step": 240 + }, + { + "epoch": 0.012019950124688279, + "grad_norm": 3.1714687869402467, + "learning_rate": 1.9933554817275747e-05, + "loss": 0.9192, + "step": 241 + }, + { + "epoch": 0.012069825436408978, + "grad_norm": 0.8642172785252655, + "learning_rate": 2.0016611295681066e-05, + "loss": 0.9697, + "step": 242 + }, + { + "epoch": 0.012119700748129675, + "grad_norm": 0.8351274349567464, + "learning_rate": 2.009966777408638e-05, + "loss": 0.9475, + "step": 243 + }, + { + "epoch": 0.012169576059850374, + "grad_norm": 0.8478917073320702, + "learning_rate": 2.0182724252491694e-05, + "loss": 0.8748, + "step": 244 + }, + { + "epoch": 0.012219451371571072, + "grad_norm": 1.035689058160803, + "learning_rate": 2.026578073089701e-05, + "loss": 0.9346, + "step": 245 + }, + { + "epoch": 0.012269326683291771, + "grad_norm": 0.8830366100613483, + "learning_rate": 2.0348837209302328e-05, + "loss": 0.9232, + "step": 246 + }, + { + "epoch": 0.012319201995012468, + "grad_norm": 0.7358644112447094, + "learning_rate": 2.0431893687707643e-05, + "loss": 0.9567, + "step": 247 + }, + { + "epoch": 0.012369077306733168, + "grad_norm": 0.8043119953164141, + "learning_rate": 2.051495016611296e-05, + "loss": 0.9661, + "step": 248 + }, + { + "epoch": 0.012418952618453865, + "grad_norm": 1.0243967313871385, + "learning_rate": 2.0598006644518274e-05, + "loss": 0.9317, + "step": 249 + }, + { + "epoch": 0.012468827930174564, + "grad_norm": 0.7816658491622034, + "learning_rate": 2.068106312292359e-05, + "loss": 0.9475, + "step": 250 + }, + { + "epoch": 0.012518703241895262, + "grad_norm": 0.8523320093290063, + "learning_rate": 2.0764119601328905e-05, + "loss": 0.9544, + "step": 251 + }, + { + "epoch": 0.01256857855361596, + "grad_norm": 1.072043398873135, + "learning_rate": 2.084717607973422e-05, + "loss": 0.9537, + "step": 252 + }, + { + "epoch": 0.012618453865336658, + "grad_norm": 0.8147353466358987, + "learning_rate": 2.0930232558139536e-05, + "loss": 0.9967, + "step": 253 + }, + { + "epoch": 0.012668329177057357, + "grad_norm": 0.929196552374494, + "learning_rate": 2.101328903654485e-05, + "loss": 0.9474, + "step": 254 + }, + { + "epoch": 0.012718204488778055, + "grad_norm": 0.7738959216678126, + "learning_rate": 2.1096345514950166e-05, + "loss": 0.8952, + "step": 255 + }, + { + "epoch": 0.012768079800498754, + "grad_norm": 1.1758576883556828, + "learning_rate": 2.1179401993355482e-05, + "loss": 0.9229, + "step": 256 + }, + { + "epoch": 0.012817955112219451, + "grad_norm": 1.4063618639243949, + "learning_rate": 2.1262458471760797e-05, + "loss": 0.9308, + "step": 257 + }, + { + "epoch": 0.012867830423940149, + "grad_norm": 0.8392446393975065, + "learning_rate": 2.1345514950166116e-05, + "loss": 0.9215, + "step": 258 + }, + { + "epoch": 0.012917705735660848, + "grad_norm": 1.0357309556328578, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.9596, + "step": 259 + }, + { + "epoch": 0.012967581047381545, + "grad_norm": 0.766374191013659, + "learning_rate": 2.1511627906976744e-05, + "loss": 0.9348, + "step": 260 + }, + { + "epoch": 0.013017456359102245, + "grad_norm": 0.823338716592112, + "learning_rate": 2.1594684385382062e-05, + "loss": 0.9372, + "step": 261 + }, + { + "epoch": 0.013067331670822942, + "grad_norm": 0.9914085050724056, + "learning_rate": 2.1677740863787378e-05, + "loss": 0.9278, + "step": 262 + }, + { + "epoch": 0.013117206982543641, + "grad_norm": 0.816334582985557, + "learning_rate": 2.1760797342192693e-05, + "loss": 0.9671, + "step": 263 + }, + { + "epoch": 0.013167082294264339, + "grad_norm": 1.67601407668965, + "learning_rate": 2.1843853820598005e-05, + "loss": 0.98, + "step": 264 + }, + { + "epoch": 0.013216957605985038, + "grad_norm": 1.8516025938977814, + "learning_rate": 2.1926910299003324e-05, + "loss": 0.9411, + "step": 265 + }, + { + "epoch": 0.013266832917705735, + "grad_norm": 0.8118423185893441, + "learning_rate": 2.200996677740864e-05, + "loss": 0.9889, + "step": 266 + }, + { + "epoch": 0.013316708229426434, + "grad_norm": 1.1388770082366129, + "learning_rate": 2.2093023255813955e-05, + "loss": 0.9859, + "step": 267 + }, + { + "epoch": 0.013366583541147132, + "grad_norm": 0.92302865205334, + "learning_rate": 2.217607973421927e-05, + "loss": 0.9015, + "step": 268 + }, + { + "epoch": 0.013416458852867831, + "grad_norm": 0.8503310580568787, + "learning_rate": 2.2259136212624586e-05, + "loss": 0.9348, + "step": 269 + }, + { + "epoch": 0.013466334164588529, + "grad_norm": 1.404686024018993, + "learning_rate": 2.23421926910299e-05, + "loss": 0.9395, + "step": 270 + }, + { + "epoch": 0.013516209476309228, + "grad_norm": 1.1036328275825233, + "learning_rate": 2.2425249169435217e-05, + "loss": 0.9921, + "step": 271 + }, + { + "epoch": 0.013566084788029925, + "grad_norm": 1.1267683260100383, + "learning_rate": 2.2508305647840532e-05, + "loss": 0.9603, + "step": 272 + }, + { + "epoch": 0.013615960099750624, + "grad_norm": 0.8423351683077813, + "learning_rate": 2.259136212624585e-05, + "loss": 0.9353, + "step": 273 + }, + { + "epoch": 0.013665835411471322, + "grad_norm": 1.2757745808417327, + "learning_rate": 2.2674418604651163e-05, + "loss": 0.9245, + "step": 274 + }, + { + "epoch": 0.01371571072319202, + "grad_norm": 0.8488718473096204, + "learning_rate": 2.2757475083056478e-05, + "loss": 0.9802, + "step": 275 + }, + { + "epoch": 0.013765586034912718, + "grad_norm": 0.8506180969323726, + "learning_rate": 2.2840531561461794e-05, + "loss": 0.9333, + "step": 276 + }, + { + "epoch": 0.013815461346633416, + "grad_norm": 0.7633085660276074, + "learning_rate": 2.2923588039867112e-05, + "loss": 0.9256, + "step": 277 + }, + { + "epoch": 0.013865336658354115, + "grad_norm": 0.835507192342244, + "learning_rate": 2.3006644518272428e-05, + "loss": 0.926, + "step": 278 + }, + { + "epoch": 0.013915211970074812, + "grad_norm": 0.8017615980647528, + "learning_rate": 2.3089700996677743e-05, + "loss": 0.9483, + "step": 279 + }, + { + "epoch": 0.013965087281795512, + "grad_norm": 1.0049349477784817, + "learning_rate": 2.3172757475083055e-05, + "loss": 0.9415, + "step": 280 + }, + { + "epoch": 0.014014962593516209, + "grad_norm": 0.910954170688395, + "learning_rate": 2.3255813953488374e-05, + "loss": 0.9506, + "step": 281 + }, + { + "epoch": 0.014064837905236908, + "grad_norm": 0.9726144653906347, + "learning_rate": 2.333887043189369e-05, + "loss": 0.9423, + "step": 282 + }, + { + "epoch": 0.014114713216957606, + "grad_norm": 1.3699435609321005, + "learning_rate": 2.3421926910299005e-05, + "loss": 0.9414, + "step": 283 + }, + { + "epoch": 0.014164588528678305, + "grad_norm": 0.9155220682868462, + "learning_rate": 2.350498338870432e-05, + "loss": 1.0042, + "step": 284 + }, + { + "epoch": 0.014214463840399002, + "grad_norm": 0.9274465023604508, + "learning_rate": 2.3588039867109636e-05, + "loss": 0.9527, + "step": 285 + }, + { + "epoch": 0.014264339152119701, + "grad_norm": 1.0345881809812298, + "learning_rate": 2.367109634551495e-05, + "loss": 0.9499, + "step": 286 + }, + { + "epoch": 0.014314214463840399, + "grad_norm": 1.0756199959877868, + "learning_rate": 2.3754152823920267e-05, + "loss": 0.937, + "step": 287 + }, + { + "epoch": 0.014364089775561098, + "grad_norm": 1.2503959509389555, + "learning_rate": 2.3837209302325582e-05, + "loss": 0.9176, + "step": 288 + }, + { + "epoch": 0.014413965087281795, + "grad_norm": 0.7455142768585155, + "learning_rate": 2.39202657807309e-05, + "loss": 0.9714, + "step": 289 + }, + { + "epoch": 0.014463840399002495, + "grad_norm": 1.4532241414820268, + "learning_rate": 2.4003322259136213e-05, + "loss": 0.9121, + "step": 290 + }, + { + "epoch": 0.014513715710723192, + "grad_norm": 0.9264902718122139, + "learning_rate": 2.4086378737541528e-05, + "loss": 0.9571, + "step": 291 + }, + { + "epoch": 0.01456359102244389, + "grad_norm": 1.2516896282253307, + "learning_rate": 2.4169435215946844e-05, + "loss": 0.9702, + "step": 292 + }, + { + "epoch": 0.014613466334164589, + "grad_norm": 1.054756792024253, + "learning_rate": 2.4252491694352162e-05, + "loss": 0.9686, + "step": 293 + }, + { + "epoch": 0.014663341645885286, + "grad_norm": 0.7932351265619986, + "learning_rate": 2.4335548172757478e-05, + "loss": 1.0016, + "step": 294 + }, + { + "epoch": 0.014713216957605985, + "grad_norm": 0.7208227181108974, + "learning_rate": 2.441860465116279e-05, + "loss": 0.9518, + "step": 295 + }, + { + "epoch": 0.014763092269326683, + "grad_norm": 0.7986147690319754, + "learning_rate": 2.4501661129568105e-05, + "loss": 0.9329, + "step": 296 + }, + { + "epoch": 0.014812967581047382, + "grad_norm": 0.8053990968141096, + "learning_rate": 2.4584717607973424e-05, + "loss": 0.9191, + "step": 297 + }, + { + "epoch": 0.01486284289276808, + "grad_norm": 1.5691258725559938, + "learning_rate": 2.466777408637874e-05, + "loss": 0.9797, + "step": 298 + }, + { + "epoch": 0.014912718204488778, + "grad_norm": 1.028405877143215, + "learning_rate": 2.4750830564784055e-05, + "loss": 0.9136, + "step": 299 + }, + { + "epoch": 0.014962593516209476, + "grad_norm": 1.5742448522658126, + "learning_rate": 2.483388704318937e-05, + "loss": 0.9639, + "step": 300 + }, + { + "epoch": 0.015012468827930175, + "grad_norm": 0.9462664575073146, + "learning_rate": 2.4916943521594686e-05, + "loss": 0.9519, + "step": 301 + }, + { + "epoch": 0.015062344139650873, + "grad_norm": 0.7613950190254303, + "learning_rate": 2.5e-05, + "loss": 0.9255, + "step": 302 + }, + { + "epoch": 0.015112219451371572, + "grad_norm": 1.0411818682487262, + "learning_rate": 2.5083056478405313e-05, + "loss": 0.9417, + "step": 303 + }, + { + "epoch": 0.01516209476309227, + "grad_norm": 0.8158822159008656, + "learning_rate": 2.5166112956810632e-05, + "loss": 0.9722, + "step": 304 + }, + { + "epoch": 0.015211970074812968, + "grad_norm": 0.9740230879677, + "learning_rate": 2.5249169435215947e-05, + "loss": 0.9631, + "step": 305 + }, + { + "epoch": 0.015261845386533666, + "grad_norm": 0.8891662195828673, + "learning_rate": 2.5332225913621266e-05, + "loss": 0.9212, + "step": 306 + }, + { + "epoch": 0.015311720698254365, + "grad_norm": 0.9222214534709314, + "learning_rate": 2.5415282392026578e-05, + "loss": 0.8997, + "step": 307 + }, + { + "epoch": 0.015361596009975062, + "grad_norm": 0.6342949865863581, + "learning_rate": 2.5498338870431894e-05, + "loss": 0.9296, + "step": 308 + }, + { + "epoch": 0.01541147132169576, + "grad_norm": 3.168835459565602, + "learning_rate": 2.5581395348837212e-05, + "loss": 0.9005, + "step": 309 + }, + { + "epoch": 0.015461346633416459, + "grad_norm": 0.8938761277633008, + "learning_rate": 2.5664451827242525e-05, + "loss": 0.9235, + "step": 310 + }, + { + "epoch": 0.015511221945137156, + "grad_norm": 0.7542568218566514, + "learning_rate": 2.5747508305647843e-05, + "loss": 0.9776, + "step": 311 + }, + { + "epoch": 0.015561097256857856, + "grad_norm": 1.2230970396634897, + "learning_rate": 2.583056478405316e-05, + "loss": 0.9365, + "step": 312 + }, + { + "epoch": 0.015610972568578553, + "grad_norm": 0.7555731237263875, + "learning_rate": 2.591362126245847e-05, + "loss": 0.9178, + "step": 313 + }, + { + "epoch": 0.015660847880299252, + "grad_norm": 0.993253288288048, + "learning_rate": 2.599667774086379e-05, + "loss": 0.9756, + "step": 314 + }, + { + "epoch": 0.01571072319201995, + "grad_norm": 0.7020265854719794, + "learning_rate": 2.60797342192691e-05, + "loss": 0.9254, + "step": 315 + }, + { + "epoch": 0.015760598503740647, + "grad_norm": 0.9603229353753396, + "learning_rate": 2.616279069767442e-05, + "loss": 0.9077, + "step": 316 + }, + { + "epoch": 0.015810473815461346, + "grad_norm": 0.8640178784208472, + "learning_rate": 2.6245847176079736e-05, + "loss": 0.989, + "step": 317 + }, + { + "epoch": 0.015860349127182045, + "grad_norm": 0.7585751633311673, + "learning_rate": 2.6328903654485048e-05, + "loss": 0.9185, + "step": 318 + }, + { + "epoch": 0.015910224438902745, + "grad_norm": 0.7192409983311282, + "learning_rate": 2.6411960132890367e-05, + "loss": 0.9094, + "step": 319 + }, + { + "epoch": 0.01596009975062344, + "grad_norm": 0.8754786281228678, + "learning_rate": 2.6495016611295682e-05, + "loss": 0.935, + "step": 320 + }, + { + "epoch": 0.01600997506234414, + "grad_norm": 1.0734286017920276, + "learning_rate": 2.6578073089701e-05, + "loss": 0.9596, + "step": 321 + }, + { + "epoch": 0.01605985037406484, + "grad_norm": 2.043430238426615, + "learning_rate": 2.6661129568106313e-05, + "loss": 0.9222, + "step": 322 + }, + { + "epoch": 0.016109725685785538, + "grad_norm": 0.8918928339187571, + "learning_rate": 2.674418604651163e-05, + "loss": 0.96, + "step": 323 + }, + { + "epoch": 0.016159600997506234, + "grad_norm": 0.796828755590491, + "learning_rate": 2.6827242524916947e-05, + "loss": 0.9802, + "step": 324 + }, + { + "epoch": 0.016209476309226933, + "grad_norm": 1.6070125587532815, + "learning_rate": 2.691029900332226e-05, + "loss": 1.0019, + "step": 325 + }, + { + "epoch": 0.016259351620947632, + "grad_norm": 1.1296531433254475, + "learning_rate": 2.6993355481727578e-05, + "loss": 0.9447, + "step": 326 + }, + { + "epoch": 0.016309226932668328, + "grad_norm": 0.7390810748578298, + "learning_rate": 2.707641196013289e-05, + "loss": 0.9934, + "step": 327 + }, + { + "epoch": 0.016359102244389027, + "grad_norm": 0.7238432176636873, + "learning_rate": 2.7159468438538205e-05, + "loss": 0.9507, + "step": 328 + }, + { + "epoch": 0.016408977556109726, + "grad_norm": 0.7994584649765913, + "learning_rate": 2.7242524916943524e-05, + "loss": 0.9297, + "step": 329 + }, + { + "epoch": 0.016458852867830425, + "grad_norm": 0.7308481633481674, + "learning_rate": 2.7325581395348836e-05, + "loss": 0.9729, + "step": 330 + }, + { + "epoch": 0.01650872817955112, + "grad_norm": 0.7977855637502483, + "learning_rate": 2.7408637873754155e-05, + "loss": 0.9714, + "step": 331 + }, + { + "epoch": 0.01655860349127182, + "grad_norm": 1.393963948261056, + "learning_rate": 2.749169435215947e-05, + "loss": 0.9496, + "step": 332 + }, + { + "epoch": 0.01660847880299252, + "grad_norm": 0.7266381164358315, + "learning_rate": 2.7574750830564782e-05, + "loss": 0.923, + "step": 333 + }, + { + "epoch": 0.01665835411471322, + "grad_norm": 0.8670524680072641, + "learning_rate": 2.76578073089701e-05, + "loss": 0.9478, + "step": 334 + }, + { + "epoch": 0.016708229426433914, + "grad_norm": 0.9819407006841205, + "learning_rate": 2.7740863787375417e-05, + "loss": 0.9489, + "step": 335 + }, + { + "epoch": 0.016758104738154613, + "grad_norm": 0.8280579358883221, + "learning_rate": 2.7823920265780736e-05, + "loss": 0.9116, + "step": 336 + }, + { + "epoch": 0.016807980049875312, + "grad_norm": 2.4912041515571253, + "learning_rate": 2.7906976744186048e-05, + "loss": 0.9662, + "step": 337 + }, + { + "epoch": 0.01685785536159601, + "grad_norm": 1.4051868180401406, + "learning_rate": 2.799003322259136e-05, + "loss": 1.0049, + "step": 338 + }, + { + "epoch": 0.016907730673316707, + "grad_norm": 0.7779316091764464, + "learning_rate": 2.807308970099668e-05, + "loss": 0.9405, + "step": 339 + }, + { + "epoch": 0.016957605985037406, + "grad_norm": 0.7755650406217628, + "learning_rate": 2.8156146179401994e-05, + "loss": 0.9737, + "step": 340 + }, + { + "epoch": 0.017007481296758106, + "grad_norm": 0.7787034992239164, + "learning_rate": 2.8239202657807313e-05, + "loss": 0.9684, + "step": 341 + }, + { + "epoch": 0.0170573566084788, + "grad_norm": 0.715804103323462, + "learning_rate": 2.8322259136212625e-05, + "loss": 0.943, + "step": 342 + }, + { + "epoch": 0.0171072319201995, + "grad_norm": 0.9403277955655155, + "learning_rate": 2.840531561461794e-05, + "loss": 0.9429, + "step": 343 + }, + { + "epoch": 0.0171571072319202, + "grad_norm": 0.9453693009048316, + "learning_rate": 2.848837209302326e-05, + "loss": 0.9622, + "step": 344 + }, + { + "epoch": 0.0172069825436409, + "grad_norm": 1.8171324883864861, + "learning_rate": 2.857142857142857e-05, + "loss": 0.9519, + "step": 345 + }, + { + "epoch": 0.017256857855361595, + "grad_norm": 0.9663025268352989, + "learning_rate": 2.865448504983389e-05, + "loss": 1.0239, + "step": 346 + }, + { + "epoch": 0.017306733167082294, + "grad_norm": 0.7812516867536193, + "learning_rate": 2.8737541528239205e-05, + "loss": 0.8846, + "step": 347 + }, + { + "epoch": 0.017356608478802993, + "grad_norm": 0.9160411591325556, + "learning_rate": 2.8820598006644517e-05, + "loss": 0.9479, + "step": 348 + }, + { + "epoch": 0.017406483790523692, + "grad_norm": 1.0790778238762015, + "learning_rate": 2.8903654485049836e-05, + "loss": 0.9686, + "step": 349 + }, + { + "epoch": 0.017456359102244388, + "grad_norm": 0.8562301415033778, + "learning_rate": 2.8986710963455148e-05, + "loss": 0.9635, + "step": 350 + }, + { + "epoch": 0.017506234413965087, + "grad_norm": 0.6951232943458114, + "learning_rate": 2.9069767441860467e-05, + "loss": 0.969, + "step": 351 + }, + { + "epoch": 0.017556109725685786, + "grad_norm": 0.7680430730798842, + "learning_rate": 2.9152823920265782e-05, + "loss": 0.9396, + "step": 352 + }, + { + "epoch": 0.017605985037406485, + "grad_norm": 0.7448203226714932, + "learning_rate": 2.92358803986711e-05, + "loss": 0.9205, + "step": 353 + }, + { + "epoch": 0.01765586034912718, + "grad_norm": 0.7904339325926112, + "learning_rate": 2.9318936877076413e-05, + "loss": 0.94, + "step": 354 + }, + { + "epoch": 0.01770573566084788, + "grad_norm": 0.7609168141440174, + "learning_rate": 2.940199335548173e-05, + "loss": 0.9086, + "step": 355 + }, + { + "epoch": 0.01775561097256858, + "grad_norm": 1.0138442652435744, + "learning_rate": 2.9485049833887047e-05, + "loss": 0.9257, + "step": 356 + }, + { + "epoch": 0.01780548628428928, + "grad_norm": 0.9918057154877717, + "learning_rate": 2.956810631229236e-05, + "loss": 0.9451, + "step": 357 + }, + { + "epoch": 0.017855361596009974, + "grad_norm": 0.7139987928979613, + "learning_rate": 2.9651162790697678e-05, + "loss": 0.9114, + "step": 358 + }, + { + "epoch": 0.017905236907730673, + "grad_norm": 1.8269177270803696, + "learning_rate": 2.9734219269102993e-05, + "loss": 0.9509, + "step": 359 + }, + { + "epoch": 0.017955112219451373, + "grad_norm": 0.6819557391891737, + "learning_rate": 2.9817275747508305e-05, + "loss": 0.937, + "step": 360 + }, + { + "epoch": 0.018004987531172068, + "grad_norm": 0.8183080954814556, + "learning_rate": 2.9900332225913624e-05, + "loss": 0.9746, + "step": 361 + }, + { + "epoch": 0.018054862842892767, + "grad_norm": 1.2842952717558518, + "learning_rate": 2.9983388704318936e-05, + "loss": 0.9534, + "step": 362 + }, + { + "epoch": 0.018104738154613467, + "grad_norm": 2.508541286365909, + "learning_rate": 3.0066445182724255e-05, + "loss": 0.9755, + "step": 363 + }, + { + "epoch": 0.018154613466334166, + "grad_norm": 0.718244713143972, + "learning_rate": 3.014950166112957e-05, + "loss": 0.9406, + "step": 364 + }, + { + "epoch": 0.01820448877805486, + "grad_norm": 0.7268455211223784, + "learning_rate": 3.0232558139534883e-05, + "loss": 0.9639, + "step": 365 + }, + { + "epoch": 0.01825436408977556, + "grad_norm": 0.9017901566960013, + "learning_rate": 3.03156146179402e-05, + "loss": 0.929, + "step": 366 + }, + { + "epoch": 0.01830423940149626, + "grad_norm": 0.7067687090499235, + "learning_rate": 3.0398671096345517e-05, + "loss": 0.9348, + "step": 367 + }, + { + "epoch": 0.01835411471321696, + "grad_norm": 0.6932304339604254, + "learning_rate": 3.0481727574750836e-05, + "loss": 0.9087, + "step": 368 + }, + { + "epoch": 0.018403990024937655, + "grad_norm": 0.7977969871986622, + "learning_rate": 3.056478405315615e-05, + "loss": 0.9503, + "step": 369 + }, + { + "epoch": 0.018453865336658354, + "grad_norm": 0.8508832937364185, + "learning_rate": 3.064784053156146e-05, + "loss": 0.9375, + "step": 370 + }, + { + "epoch": 0.018503740648379053, + "grad_norm": 0.7569390552077694, + "learning_rate": 3.073089700996678e-05, + "loss": 0.9263, + "step": 371 + }, + { + "epoch": 0.018553615960099752, + "grad_norm": 1.0633477639223543, + "learning_rate": 3.081395348837209e-05, + "loss": 0.9285, + "step": 372 + }, + { + "epoch": 0.018603491271820448, + "grad_norm": 0.7651063884069274, + "learning_rate": 3.089700996677741e-05, + "loss": 0.9267, + "step": 373 + }, + { + "epoch": 0.018653366583541147, + "grad_norm": 0.7194251828913655, + "learning_rate": 3.098006644518273e-05, + "loss": 0.9194, + "step": 374 + }, + { + "epoch": 0.018703241895261846, + "grad_norm": 1.0218532601716979, + "learning_rate": 3.106312292358804e-05, + "loss": 0.956, + "step": 375 + }, + { + "epoch": 0.018753117206982542, + "grad_norm": 0.7483394775233244, + "learning_rate": 3.114617940199336e-05, + "loss": 0.9437, + "step": 376 + }, + { + "epoch": 0.01880299251870324, + "grad_norm": 1.0561898423634666, + "learning_rate": 3.122923588039867e-05, + "loss": 0.9512, + "step": 377 + }, + { + "epoch": 0.01885286783042394, + "grad_norm": 1.3461552151414256, + "learning_rate": 3.131229235880399e-05, + "loss": 0.9376, + "step": 378 + }, + { + "epoch": 0.01890274314214464, + "grad_norm": 1.0734353787372208, + "learning_rate": 3.13953488372093e-05, + "loss": 0.9667, + "step": 379 + }, + { + "epoch": 0.018952618453865335, + "grad_norm": 0.7784081145099461, + "learning_rate": 3.147840531561462e-05, + "loss": 0.9577, + "step": 380 + }, + { + "epoch": 0.019002493765586034, + "grad_norm": 1.0426959132118154, + "learning_rate": 3.156146179401994e-05, + "loss": 0.9225, + "step": 381 + }, + { + "epoch": 0.019052369077306733, + "grad_norm": 0.805146055041698, + "learning_rate": 3.164451827242525e-05, + "loss": 0.9167, + "step": 382 + }, + { + "epoch": 0.019102244389027433, + "grad_norm": 0.6065568569233086, + "learning_rate": 3.172757475083057e-05, + "loss": 0.929, + "step": 383 + }, + { + "epoch": 0.01915211970074813, + "grad_norm": 1.6964418046649716, + "learning_rate": 3.181063122923588e-05, + "loss": 0.9653, + "step": 384 + }, + { + "epoch": 0.019201995012468828, + "grad_norm": 0.8824728295773107, + "learning_rate": 3.1893687707641194e-05, + "loss": 0.993, + "step": 385 + }, + { + "epoch": 0.019251870324189527, + "grad_norm": 0.7959530064501845, + "learning_rate": 3.197674418604651e-05, + "loss": 0.9546, + "step": 386 + }, + { + "epoch": 0.019301745635910226, + "grad_norm": 0.8238130483982664, + "learning_rate": 3.2059800664451825e-05, + "loss": 0.9757, + "step": 387 + }, + { + "epoch": 0.01935162094763092, + "grad_norm": 0.8417211287646655, + "learning_rate": 3.2142857142857144e-05, + "loss": 0.9286, + "step": 388 + }, + { + "epoch": 0.01940149625935162, + "grad_norm": 0.7382771326872768, + "learning_rate": 3.222591362126246e-05, + "loss": 0.9729, + "step": 389 + }, + { + "epoch": 0.01945137157107232, + "grad_norm": 0.7502163182202308, + "learning_rate": 3.2308970099667775e-05, + "loss": 0.9029, + "step": 390 + }, + { + "epoch": 0.01950124688279302, + "grad_norm": 0.7320224758098881, + "learning_rate": 3.2392026578073094e-05, + "loss": 0.9584, + "step": 391 + }, + { + "epoch": 0.019551122194513715, + "grad_norm": 0.6192943289109216, + "learning_rate": 3.2475083056478406e-05, + "loss": 0.9268, + "step": 392 + }, + { + "epoch": 0.019600997506234414, + "grad_norm": 0.7439526191436129, + "learning_rate": 3.2558139534883724e-05, + "loss": 0.9762, + "step": 393 + }, + { + "epoch": 0.019650872817955113, + "grad_norm": 0.8891384929076337, + "learning_rate": 3.2641196013289036e-05, + "loss": 0.9431, + "step": 394 + }, + { + "epoch": 0.01970074812967581, + "grad_norm": 0.7269259394220505, + "learning_rate": 3.272425249169435e-05, + "loss": 0.9174, + "step": 395 + }, + { + "epoch": 0.019750623441396508, + "grad_norm": 0.7901563549847901, + "learning_rate": 3.280730897009967e-05, + "loss": 0.9332, + "step": 396 + }, + { + "epoch": 0.019800498753117207, + "grad_norm": 0.9182588649507222, + "learning_rate": 3.2890365448504986e-05, + "loss": 0.9655, + "step": 397 + }, + { + "epoch": 0.019850374064837906, + "grad_norm": 0.7714080911276233, + "learning_rate": 3.2973421926910305e-05, + "loss": 0.9527, + "step": 398 + }, + { + "epoch": 0.019900249376558602, + "grad_norm": 0.7972824854879719, + "learning_rate": 3.305647840531562e-05, + "loss": 0.9116, + "step": 399 + }, + { + "epoch": 0.0199501246882793, + "grad_norm": 0.63898751663419, + "learning_rate": 3.313953488372093e-05, + "loss": 0.9777, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 0.8691847010235902, + "learning_rate": 3.322259136212625e-05, + "loss": 0.9388, + "step": 401 + }, + { + "epoch": 0.0200498753117207, + "grad_norm": 1.2352332391992702, + "learning_rate": 3.330564784053156e-05, + "loss": 0.9731, + "step": 402 + }, + { + "epoch": 0.020099750623441395, + "grad_norm": 0.782525757485585, + "learning_rate": 3.338870431893688e-05, + "loss": 0.9762, + "step": 403 + }, + { + "epoch": 0.020149625935162094, + "grad_norm": 0.7049087459266875, + "learning_rate": 3.34717607973422e-05, + "loss": 0.9347, + "step": 404 + }, + { + "epoch": 0.020199501246882794, + "grad_norm": 0.7490000595622441, + "learning_rate": 3.355481727574751e-05, + "loss": 0.9082, + "step": 405 + }, + { + "epoch": 0.020249376558603493, + "grad_norm": 0.9113272878924942, + "learning_rate": 3.363787375415283e-05, + "loss": 1.0115, + "step": 406 + }, + { + "epoch": 0.02029925187032419, + "grad_norm": 0.8033219919132726, + "learning_rate": 3.372093023255814e-05, + "loss": 0.9369, + "step": 407 + }, + { + "epoch": 0.020349127182044888, + "grad_norm": 1.0087638172094475, + "learning_rate": 3.380398671096346e-05, + "loss": 0.9521, + "step": 408 + }, + { + "epoch": 0.020399002493765587, + "grad_norm": 0.9186531746109131, + "learning_rate": 3.388704318936877e-05, + "loss": 0.9663, + "step": 409 + }, + { + "epoch": 0.020448877805486283, + "grad_norm": 0.8622952510397068, + "learning_rate": 3.397009966777408e-05, + "loss": 0.9041, + "step": 410 + }, + { + "epoch": 0.02049875311720698, + "grad_norm": 0.8581011896128237, + "learning_rate": 3.40531561461794e-05, + "loss": 0.9551, + "step": 411 + }, + { + "epoch": 0.02054862842892768, + "grad_norm": 0.680953849232426, + "learning_rate": 3.413621262458472e-05, + "loss": 0.9508, + "step": 412 + }, + { + "epoch": 0.02059850374064838, + "grad_norm": 0.6856179259717801, + "learning_rate": 3.421926910299004e-05, + "loss": 0.9374, + "step": 413 + }, + { + "epoch": 0.020648379052369076, + "grad_norm": 0.9261964064906869, + "learning_rate": 3.430232558139535e-05, + "loss": 0.9311, + "step": 414 + }, + { + "epoch": 0.020698254364089775, + "grad_norm": 0.7590195590715442, + "learning_rate": 3.4385382059800664e-05, + "loss": 0.9805, + "step": 415 + }, + { + "epoch": 0.020748129675810474, + "grad_norm": 0.7033414309391244, + "learning_rate": 3.446843853820598e-05, + "loss": 0.8864, + "step": 416 + }, + { + "epoch": 0.020798004987531173, + "grad_norm": 0.6018114688639853, + "learning_rate": 3.4551495016611294e-05, + "loss": 0.8851, + "step": 417 + }, + { + "epoch": 0.02084788029925187, + "grad_norm": 0.7203908760637877, + "learning_rate": 3.463455149501661e-05, + "loss": 0.9282, + "step": 418 + }, + { + "epoch": 0.020897755610972568, + "grad_norm": 0.754268920403004, + "learning_rate": 3.4717607973421925e-05, + "loss": 0.9591, + "step": 419 + }, + { + "epoch": 0.020947630922693267, + "grad_norm": 0.9436218758371969, + "learning_rate": 3.4800664451827244e-05, + "loss": 0.9921, + "step": 420 + }, + { + "epoch": 0.020997506234413967, + "grad_norm": 0.8483196982000188, + "learning_rate": 3.488372093023256e-05, + "loss": 0.9853, + "step": 421 + }, + { + "epoch": 0.021047381546134662, + "grad_norm": 0.6574902055424124, + "learning_rate": 3.4966777408637875e-05, + "loss": 0.8969, + "step": 422 + }, + { + "epoch": 0.02109725685785536, + "grad_norm": 0.6779162142797192, + "learning_rate": 3.5049833887043194e-05, + "loss": 0.9781, + "step": 423 + }, + { + "epoch": 0.02114713216957606, + "grad_norm": 0.7162435844611758, + "learning_rate": 3.5132890365448506e-05, + "loss": 0.9589, + "step": 424 + }, + { + "epoch": 0.02119700748129676, + "grad_norm": 0.8173691030806729, + "learning_rate": 3.521594684385382e-05, + "loss": 0.9308, + "step": 425 + }, + { + "epoch": 0.021246882793017455, + "grad_norm": 0.8503435006207157, + "learning_rate": 3.5299003322259136e-05, + "loss": 0.9623, + "step": 426 + }, + { + "epoch": 0.021296758104738155, + "grad_norm": 0.6940768564328201, + "learning_rate": 3.5382059800664455e-05, + "loss": 0.9818, + "step": 427 + }, + { + "epoch": 0.021346633416458854, + "grad_norm": 0.6414991989484956, + "learning_rate": 3.5465116279069774e-05, + "loss": 0.9211, + "step": 428 + }, + { + "epoch": 0.02139650872817955, + "grad_norm": 0.686991749912539, + "learning_rate": 3.5548172757475086e-05, + "loss": 0.9696, + "step": 429 + }, + { + "epoch": 0.02144638403990025, + "grad_norm": 0.6238211312052699, + "learning_rate": 3.56312292358804e-05, + "loss": 0.9291, + "step": 430 + }, + { + "epoch": 0.021496259351620948, + "grad_norm": 1.4202438546353275, + "learning_rate": 3.571428571428572e-05, + "loss": 0.9427, + "step": 431 + }, + { + "epoch": 0.021546134663341647, + "grad_norm": 0.6913454290625768, + "learning_rate": 3.579734219269103e-05, + "loss": 0.9202, + "step": 432 + }, + { + "epoch": 0.021596009975062343, + "grad_norm": 0.7842934514979679, + "learning_rate": 3.588039867109635e-05, + "loss": 0.9365, + "step": 433 + }, + { + "epoch": 0.021645885286783042, + "grad_norm": 1.0406128832710622, + "learning_rate": 3.596345514950166e-05, + "loss": 0.9728, + "step": 434 + }, + { + "epoch": 0.02169576059850374, + "grad_norm": 0.7125921440431922, + "learning_rate": 3.604651162790698e-05, + "loss": 0.9615, + "step": 435 + }, + { + "epoch": 0.02174563591022444, + "grad_norm": 0.6636014251342814, + "learning_rate": 3.61295681063123e-05, + "loss": 0.9485, + "step": 436 + }, + { + "epoch": 0.021795511221945136, + "grad_norm": 0.7903697248260289, + "learning_rate": 3.621262458471761e-05, + "loss": 0.9378, + "step": 437 + }, + { + "epoch": 0.021845386533665835, + "grad_norm": 0.6347785553347228, + "learning_rate": 3.629568106312293e-05, + "loss": 0.9277, + "step": 438 + }, + { + "epoch": 0.021895261845386534, + "grad_norm": 0.7505462800405609, + "learning_rate": 3.637873754152824e-05, + "loss": 0.8828, + "step": 439 + }, + { + "epoch": 0.021945137157107233, + "grad_norm": 0.8930726420137262, + "learning_rate": 3.646179401993355e-05, + "loss": 0.9367, + "step": 440 + }, + { + "epoch": 0.02199501246882793, + "grad_norm": 0.6515847255112687, + "learning_rate": 3.654485049833887e-05, + "loss": 0.9252, + "step": 441 + }, + { + "epoch": 0.02204488778054863, + "grad_norm": 0.7902314587725474, + "learning_rate": 3.662790697674418e-05, + "loss": 0.9275, + "step": 442 + }, + { + "epoch": 0.022094763092269327, + "grad_norm": 0.8529409381048747, + "learning_rate": 3.67109634551495e-05, + "loss": 0.9564, + "step": 443 + }, + { + "epoch": 0.022144638403990027, + "grad_norm": 0.7086471115168301, + "learning_rate": 3.679401993355482e-05, + "loss": 0.9483, + "step": 444 + }, + { + "epoch": 0.022194513715710722, + "grad_norm": 0.8533533401184438, + "learning_rate": 3.687707641196013e-05, + "loss": 0.9953, + "step": 445 + }, + { + "epoch": 0.02224438902743142, + "grad_norm": 0.7832266044933317, + "learning_rate": 3.696013289036545e-05, + "loss": 0.9694, + "step": 446 + }, + { + "epoch": 0.02229426433915212, + "grad_norm": 0.6543336224770847, + "learning_rate": 3.7043189368770764e-05, + "loss": 0.9248, + "step": 447 + }, + { + "epoch": 0.022344139650872816, + "grad_norm": 0.7413805646429145, + "learning_rate": 3.712624584717608e-05, + "loss": 0.9223, + "step": 448 + }, + { + "epoch": 0.022394014962593516, + "grad_norm": 0.7467951466078785, + "learning_rate": 3.7209302325581394e-05, + "loss": 0.9457, + "step": 449 + }, + { + "epoch": 0.022443890274314215, + "grad_norm": 0.6556075699476793, + "learning_rate": 3.729235880398671e-05, + "loss": 0.9358, + "step": 450 + }, + { + "epoch": 0.022493765586034914, + "grad_norm": 0.8797135159068772, + "learning_rate": 3.737541528239203e-05, + "loss": 0.9109, + "step": 451 + }, + { + "epoch": 0.02254364089775561, + "grad_norm": 0.8027452516404762, + "learning_rate": 3.7458471760797344e-05, + "loss": 0.9908, + "step": 452 + }, + { + "epoch": 0.02259351620947631, + "grad_norm": 0.7225614244013685, + "learning_rate": 3.754152823920266e-05, + "loss": 0.9426, + "step": 453 + }, + { + "epoch": 0.022643391521197008, + "grad_norm": 1.2712995668188605, + "learning_rate": 3.7624584717607975e-05, + "loss": 0.9646, + "step": 454 + }, + { + "epoch": 0.022693266832917707, + "grad_norm": 1.7144637758424277, + "learning_rate": 3.7707641196013294e-05, + "loss": 0.9064, + "step": 455 + }, + { + "epoch": 0.022743142144638403, + "grad_norm": 0.8840926875208391, + "learning_rate": 3.7790697674418606e-05, + "loss": 0.9604, + "step": 456 + }, + { + "epoch": 0.022793017456359102, + "grad_norm": 1.1284614278478577, + "learning_rate": 3.787375415282392e-05, + "loss": 0.9555, + "step": 457 + }, + { + "epoch": 0.0228428927680798, + "grad_norm": 0.9206926370305665, + "learning_rate": 3.7956810631229237e-05, + "loss": 0.9534, + "step": 458 + }, + { + "epoch": 0.0228927680798005, + "grad_norm": 0.7495276783147385, + "learning_rate": 3.8039867109634555e-05, + "loss": 0.9568, + "step": 459 + }, + { + "epoch": 0.022942643391521196, + "grad_norm": 0.7749149312316771, + "learning_rate": 3.8122923588039874e-05, + "loss": 0.9624, + "step": 460 + }, + { + "epoch": 0.022992518703241895, + "grad_norm": 0.6170754708887753, + "learning_rate": 3.8205980066445186e-05, + "loss": 0.9341, + "step": 461 + }, + { + "epoch": 0.023042394014962594, + "grad_norm": 0.6237507775949767, + "learning_rate": 3.82890365448505e-05, + "loss": 0.9453, + "step": 462 + }, + { + "epoch": 0.02309226932668329, + "grad_norm": 0.5340327067143867, + "learning_rate": 3.837209302325582e-05, + "loss": 0.9603, + "step": 463 + }, + { + "epoch": 0.02314214463840399, + "grad_norm": 0.6490999666359529, + "learning_rate": 3.845514950166113e-05, + "loss": 0.979, + "step": 464 + }, + { + "epoch": 0.02319201995012469, + "grad_norm": 0.7358895966603276, + "learning_rate": 3.853820598006645e-05, + "loss": 0.9834, + "step": 465 + }, + { + "epoch": 0.023241895261845388, + "grad_norm": 0.7607305474384325, + "learning_rate": 3.862126245847176e-05, + "loss": 0.9824, + "step": 466 + }, + { + "epoch": 0.023291770573566083, + "grad_norm": 0.7584190665173771, + "learning_rate": 3.870431893687708e-05, + "loss": 0.9803, + "step": 467 + }, + { + "epoch": 0.023341645885286783, + "grad_norm": 0.6962961332383152, + "learning_rate": 3.87873754152824e-05, + "loss": 0.9317, + "step": 468 + }, + { + "epoch": 0.02339152119700748, + "grad_norm": 0.6034823787279822, + "learning_rate": 3.887043189368771e-05, + "loss": 0.942, + "step": 469 + }, + { + "epoch": 0.02344139650872818, + "grad_norm": 0.8272615831071036, + "learning_rate": 3.895348837209303e-05, + "loss": 0.9459, + "step": 470 + }, + { + "epoch": 0.023491271820448877, + "grad_norm": 0.6113970535215366, + "learning_rate": 3.903654485049834e-05, + "loss": 0.9113, + "step": 471 + }, + { + "epoch": 0.023541147132169576, + "grad_norm": 0.8338898427499155, + "learning_rate": 3.911960132890365e-05, + "loss": 0.9457, + "step": 472 + }, + { + "epoch": 0.023591022443890275, + "grad_norm": 0.7194299943635565, + "learning_rate": 3.920265780730897e-05, + "loss": 0.9401, + "step": 473 + }, + { + "epoch": 0.023640897755610974, + "grad_norm": 0.713256452602886, + "learning_rate": 3.928571428571429e-05, + "loss": 0.9268, + "step": 474 + }, + { + "epoch": 0.02369077306733167, + "grad_norm": 0.6652973765790708, + "learning_rate": 3.936877076411961e-05, + "loss": 0.9706, + "step": 475 + }, + { + "epoch": 0.02374064837905237, + "grad_norm": 0.8674379502574068, + "learning_rate": 3.945182724252492e-05, + "loss": 0.9123, + "step": 476 + }, + { + "epoch": 0.023790523690773068, + "grad_norm": 0.8085432019431961, + "learning_rate": 3.953488372093023e-05, + "loss": 0.9729, + "step": 477 + }, + { + "epoch": 0.023840399002493767, + "grad_norm": 0.6858236357698302, + "learning_rate": 3.961794019933555e-05, + "loss": 0.9534, + "step": 478 + }, + { + "epoch": 0.023890274314214463, + "grad_norm": 0.7204491479273631, + "learning_rate": 3.9700996677740864e-05, + "loss": 0.9691, + "step": 479 + }, + { + "epoch": 0.023940149625935162, + "grad_norm": 0.7022198410418068, + "learning_rate": 3.978405315614618e-05, + "loss": 0.958, + "step": 480 + }, + { + "epoch": 0.02399002493765586, + "grad_norm": 0.7185749786608483, + "learning_rate": 3.9867109634551495e-05, + "loss": 0.9592, + "step": 481 + }, + { + "epoch": 0.024039900249376557, + "grad_norm": 0.8609153964908303, + "learning_rate": 3.995016611295681e-05, + "loss": 0.9583, + "step": 482 + }, + { + "epoch": 0.024089775561097256, + "grad_norm": 0.7205734570676173, + "learning_rate": 4.003322259136213e-05, + "loss": 0.9845, + "step": 483 + }, + { + "epoch": 0.024139650872817955, + "grad_norm": 0.6168339577085465, + "learning_rate": 4.0116279069767444e-05, + "loss": 0.955, + "step": 484 + }, + { + "epoch": 0.024189526184538655, + "grad_norm": 0.70787485676011, + "learning_rate": 4.019933554817276e-05, + "loss": 1.0167, + "step": 485 + }, + { + "epoch": 0.02423940149625935, + "grad_norm": 0.7431871271085058, + "learning_rate": 4.0282392026578075e-05, + "loss": 0.921, + "step": 486 + }, + { + "epoch": 0.02428927680798005, + "grad_norm": 1.7907726071534111, + "learning_rate": 4.036544850498339e-05, + "loss": 0.9344, + "step": 487 + }, + { + "epoch": 0.02433915211970075, + "grad_norm": 0.6978497570714501, + "learning_rate": 4.0448504983388706e-05, + "loss": 0.9752, + "step": 488 + }, + { + "epoch": 0.024389027431421448, + "grad_norm": 0.5977961899865282, + "learning_rate": 4.053156146179402e-05, + "loss": 0.9776, + "step": 489 + }, + { + "epoch": 0.024438902743142144, + "grad_norm": 0.7255800959060158, + "learning_rate": 4.061461794019934e-05, + "loss": 0.9008, + "step": 490 + }, + { + "epoch": 0.024488778054862843, + "grad_norm": 0.7380435424088552, + "learning_rate": 4.0697674418604655e-05, + "loss": 0.9705, + "step": 491 + }, + { + "epoch": 0.024538653366583542, + "grad_norm": 0.7934969421623795, + "learning_rate": 4.078073089700997e-05, + "loss": 0.9305, + "step": 492 + }, + { + "epoch": 0.02458852867830424, + "grad_norm": 0.6705156870537614, + "learning_rate": 4.0863787375415286e-05, + "loss": 0.9761, + "step": 493 + }, + { + "epoch": 0.024638403990024937, + "grad_norm": 0.8652749080473969, + "learning_rate": 4.09468438538206e-05, + "loss": 0.9426, + "step": 494 + }, + { + "epoch": 0.024688279301745636, + "grad_norm": 0.6361436026909547, + "learning_rate": 4.102990033222592e-05, + "loss": 0.9553, + "step": 495 + }, + { + "epoch": 0.024738154613466335, + "grad_norm": 0.6399613393284006, + "learning_rate": 4.111295681063123e-05, + "loss": 0.9013, + "step": 496 + }, + { + "epoch": 0.02478802992518703, + "grad_norm": 0.6085679251076378, + "learning_rate": 4.119601328903655e-05, + "loss": 0.9992, + "step": 497 + }, + { + "epoch": 0.02483790523690773, + "grad_norm": 2.0401365175221766, + "learning_rate": 4.127906976744187e-05, + "loss": 0.9461, + "step": 498 + }, + { + "epoch": 0.02488778054862843, + "grad_norm": 0.6194189037262003, + "learning_rate": 4.136212624584718e-05, + "loss": 0.9806, + "step": 499 + }, + { + "epoch": 0.02493765586034913, + "grad_norm": 0.9782589155488013, + "learning_rate": 4.14451827242525e-05, + "loss": 0.9698, + "step": 500 + }, + { + "epoch": 0.024987531172069824, + "grad_norm": 0.662484924165895, + "learning_rate": 4.152823920265781e-05, + "loss": 0.9556, + "step": 501 + }, + { + "epoch": 0.025037406483790523, + "grad_norm": 0.5478050617475646, + "learning_rate": 4.161129568106312e-05, + "loss": 0.9593, + "step": 502 + }, + { + "epoch": 0.025087281795511222, + "grad_norm": 0.792064560388656, + "learning_rate": 4.169435215946844e-05, + "loss": 0.9387, + "step": 503 + }, + { + "epoch": 0.02513715710723192, + "grad_norm": 0.6218494808773649, + "learning_rate": 4.177740863787375e-05, + "loss": 0.9537, + "step": 504 + }, + { + "epoch": 0.025187032418952617, + "grad_norm": 0.706270615439779, + "learning_rate": 4.186046511627907e-05, + "loss": 0.9786, + "step": 505 + }, + { + "epoch": 0.025236907730673316, + "grad_norm": 0.5692868699821403, + "learning_rate": 4.194352159468439e-05, + "loss": 0.917, + "step": 506 + }, + { + "epoch": 0.025286783042394016, + "grad_norm": 0.762690823762792, + "learning_rate": 4.20265780730897e-05, + "loss": 0.975, + "step": 507 + }, + { + "epoch": 0.025336658354114715, + "grad_norm": 0.6257273676265818, + "learning_rate": 4.210963455149502e-05, + "loss": 0.9238, + "step": 508 + }, + { + "epoch": 0.02538653366583541, + "grad_norm": 0.6979177382541194, + "learning_rate": 4.219269102990033e-05, + "loss": 0.9501, + "step": 509 + }, + { + "epoch": 0.02543640897755611, + "grad_norm": 0.5818992936033001, + "learning_rate": 4.227574750830565e-05, + "loss": 0.9031, + "step": 510 + }, + { + "epoch": 0.02548628428927681, + "grad_norm": 0.7276066846700705, + "learning_rate": 4.2358803986710964e-05, + "loss": 0.9285, + "step": 511 + }, + { + "epoch": 0.025536159600997508, + "grad_norm": 0.7613871251343347, + "learning_rate": 4.2441860465116276e-05, + "loss": 0.95, + "step": 512 + }, + { + "epoch": 0.025586034912718204, + "grad_norm": 0.5713023268321495, + "learning_rate": 4.2524916943521595e-05, + "loss": 0.9304, + "step": 513 + }, + { + "epoch": 0.025635910224438903, + "grad_norm": 0.9073504514463973, + "learning_rate": 4.2607973421926913e-05, + "loss": 0.9597, + "step": 514 + }, + { + "epoch": 0.025685785536159602, + "grad_norm": 0.9160538726308138, + "learning_rate": 4.269102990033223e-05, + "loss": 0.9244, + "step": 515 + }, + { + "epoch": 0.025735660847880298, + "grad_norm": 0.6658346622776171, + "learning_rate": 4.2774086378737544e-05, + "loss": 0.9491, + "step": 516 + }, + { + "epoch": 0.025785536159600997, + "grad_norm": 0.5473562444320847, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.9344, + "step": 517 + }, + { + "epoch": 0.025835411471321696, + "grad_norm": 0.5948307908968519, + "learning_rate": 4.2940199335548175e-05, + "loss": 0.9652, + "step": 518 + }, + { + "epoch": 0.025885286783042395, + "grad_norm": 0.8062415565406383, + "learning_rate": 4.302325581395349e-05, + "loss": 0.9714, + "step": 519 + }, + { + "epoch": 0.02593516209476309, + "grad_norm": 0.7612309199839943, + "learning_rate": 4.3106312292358806e-05, + "loss": 0.892, + "step": 520 + }, + { + "epoch": 0.02598503740648379, + "grad_norm": 0.6633466494671779, + "learning_rate": 4.3189368770764125e-05, + "loss": 0.9493, + "step": 521 + }, + { + "epoch": 0.02603491271820449, + "grad_norm": 0.7771965466792687, + "learning_rate": 4.327242524916944e-05, + "loss": 0.9531, + "step": 522 + }, + { + "epoch": 0.02608478802992519, + "grad_norm": 0.7306752429788227, + "learning_rate": 4.3355481727574756e-05, + "loss": 0.9373, + "step": 523 + }, + { + "epoch": 0.026134663341645884, + "grad_norm": 0.774892646740655, + "learning_rate": 4.343853820598007e-05, + "loss": 0.9314, + "step": 524 + }, + { + "epoch": 0.026184538653366583, + "grad_norm": 0.6256538059928489, + "learning_rate": 4.3521594684385386e-05, + "loss": 0.9445, + "step": 525 + }, + { + "epoch": 0.026234413965087282, + "grad_norm": 0.8043739299771692, + "learning_rate": 4.36046511627907e-05, + "loss": 0.914, + "step": 526 + }, + { + "epoch": 0.02628428927680798, + "grad_norm": 2.684926920076861, + "learning_rate": 4.368770764119601e-05, + "loss": 0.9734, + "step": 527 + }, + { + "epoch": 0.026334164588528677, + "grad_norm": 0.7032758781134834, + "learning_rate": 4.377076411960133e-05, + "loss": 0.9555, + "step": 528 + }, + { + "epoch": 0.026384039900249377, + "grad_norm": 0.6869483730353917, + "learning_rate": 4.385382059800665e-05, + "loss": 0.97, + "step": 529 + }, + { + "epoch": 0.026433915211970076, + "grad_norm": 0.8086762108061772, + "learning_rate": 4.393687707641197e-05, + "loss": 0.9828, + "step": 530 + }, + { + "epoch": 0.02648379052369077, + "grad_norm": 0.6769252768397763, + "learning_rate": 4.401993355481728e-05, + "loss": 0.9576, + "step": 531 + }, + { + "epoch": 0.02653366583541147, + "grad_norm": 0.8888288068830779, + "learning_rate": 4.410299003322259e-05, + "loss": 0.9645, + "step": 532 + }, + { + "epoch": 0.02658354114713217, + "grad_norm": 0.9642993446523587, + "learning_rate": 4.418604651162791e-05, + "loss": 0.9292, + "step": 533 + }, + { + "epoch": 0.02663341645885287, + "grad_norm": 0.9230227813803529, + "learning_rate": 4.426910299003322e-05, + "loss": 0.8957, + "step": 534 + }, + { + "epoch": 0.026683291770573565, + "grad_norm": 0.6881576559734043, + "learning_rate": 4.435215946843854e-05, + "loss": 0.9832, + "step": 535 + }, + { + "epoch": 0.026733167082294264, + "grad_norm": 0.8116730731772974, + "learning_rate": 4.443521594684385e-05, + "loss": 0.9773, + "step": 536 + }, + { + "epoch": 0.026783042394014963, + "grad_norm": 0.7236379349588005, + "learning_rate": 4.451827242524917e-05, + "loss": 0.9838, + "step": 537 + }, + { + "epoch": 0.026832917705735662, + "grad_norm": 0.6262855915689262, + "learning_rate": 4.460132890365449e-05, + "loss": 0.9383, + "step": 538 + }, + { + "epoch": 0.026882793017456358, + "grad_norm": 0.7504452442156262, + "learning_rate": 4.46843853820598e-05, + "loss": 0.9211, + "step": 539 + }, + { + "epoch": 0.026932668329177057, + "grad_norm": 0.624148134601973, + "learning_rate": 4.476744186046512e-05, + "loss": 0.9441, + "step": 540 + }, + { + "epoch": 0.026982543640897756, + "grad_norm": 0.6526680900437255, + "learning_rate": 4.485049833887043e-05, + "loss": 0.9772, + "step": 541 + }, + { + "epoch": 0.027032418952618455, + "grad_norm": 0.6679758942381377, + "learning_rate": 4.4933554817275745e-05, + "loss": 0.9788, + "step": 542 + }, + { + "epoch": 0.02708229426433915, + "grad_norm": 0.6398997829636935, + "learning_rate": 4.5016611295681064e-05, + "loss": 0.8929, + "step": 543 + }, + { + "epoch": 0.02713216957605985, + "grad_norm": 0.6761254796366162, + "learning_rate": 4.509966777408638e-05, + "loss": 0.963, + "step": 544 + }, + { + "epoch": 0.02718204488778055, + "grad_norm": 0.6592344765328979, + "learning_rate": 4.51827242524917e-05, + "loss": 0.9602, + "step": 545 + }, + { + "epoch": 0.02723192019950125, + "grad_norm": 0.6515983979099513, + "learning_rate": 4.5265780730897014e-05, + "loss": 0.9556, + "step": 546 + }, + { + "epoch": 0.027281795511221944, + "grad_norm": 0.6517317775730219, + "learning_rate": 4.5348837209302326e-05, + "loss": 0.9811, + "step": 547 + }, + { + "epoch": 0.027331670822942643, + "grad_norm": 0.6668092337183903, + "learning_rate": 4.5431893687707644e-05, + "loss": 0.9163, + "step": 548 + }, + { + "epoch": 0.027381546134663343, + "grad_norm": 0.6266383931002819, + "learning_rate": 4.5514950166112956e-05, + "loss": 0.9008, + "step": 549 + }, + { + "epoch": 0.02743142144638404, + "grad_norm": 0.9880793326652614, + "learning_rate": 4.5598006644518275e-05, + "loss": 0.963, + "step": 550 + }, + { + "epoch": 0.027481296758104738, + "grad_norm": 0.5849247448713651, + "learning_rate": 4.568106312292359e-05, + "loss": 0.9504, + "step": 551 + }, + { + "epoch": 0.027531172069825437, + "grad_norm": 0.6663219678701662, + "learning_rate": 4.5764119601328906e-05, + "loss": 0.95, + "step": 552 + }, + { + "epoch": 0.027581047381546136, + "grad_norm": 0.6418040487138629, + "learning_rate": 4.5847176079734225e-05, + "loss": 0.9321, + "step": 553 + }, + { + "epoch": 0.02763092269326683, + "grad_norm": 0.8169480304242015, + "learning_rate": 4.593023255813954e-05, + "loss": 0.9277, + "step": 554 + }, + { + "epoch": 0.02768079800498753, + "grad_norm": 0.6986788018410984, + "learning_rate": 4.6013289036544856e-05, + "loss": 0.9278, + "step": 555 + }, + { + "epoch": 0.02773067331670823, + "grad_norm": 0.6554701918833876, + "learning_rate": 4.609634551495017e-05, + "loss": 0.9605, + "step": 556 + }, + { + "epoch": 0.02778054862842893, + "grad_norm": 0.8860031559210767, + "learning_rate": 4.6179401993355486e-05, + "loss": 1.0152, + "step": 557 + }, + { + "epoch": 0.027830423940149625, + "grad_norm": 0.649493564283632, + "learning_rate": 4.62624584717608e-05, + "loss": 0.9628, + "step": 558 + }, + { + "epoch": 0.027880299251870324, + "grad_norm": 0.7754026963190074, + "learning_rate": 4.634551495016611e-05, + "loss": 0.961, + "step": 559 + }, + { + "epoch": 0.027930174563591023, + "grad_norm": 0.7734445590061989, + "learning_rate": 4.642857142857143e-05, + "loss": 0.9329, + "step": 560 + }, + { + "epoch": 0.027980049875311722, + "grad_norm": 0.7202092539493791, + "learning_rate": 4.651162790697675e-05, + "loss": 0.947, + "step": 561 + }, + { + "epoch": 0.028029925187032418, + "grad_norm": 0.8644793277603828, + "learning_rate": 4.659468438538207e-05, + "loss": 0.9672, + "step": 562 + }, + { + "epoch": 0.028079800498753117, + "grad_norm": 0.6625622168769278, + "learning_rate": 4.667774086378738e-05, + "loss": 0.9383, + "step": 563 + }, + { + "epoch": 0.028129675810473816, + "grad_norm": 0.5998637638139648, + "learning_rate": 4.676079734219269e-05, + "loss": 0.9483, + "step": 564 + }, + { + "epoch": 0.028179551122194512, + "grad_norm": 0.6925873208949996, + "learning_rate": 4.684385382059801e-05, + "loss": 0.9676, + "step": 565 + }, + { + "epoch": 0.02822942643391521, + "grad_norm": 0.6537917077388807, + "learning_rate": 4.692691029900332e-05, + "loss": 0.954, + "step": 566 + }, + { + "epoch": 0.02827930174563591, + "grad_norm": 0.6754937128226459, + "learning_rate": 4.700996677740864e-05, + "loss": 0.9627, + "step": 567 + }, + { + "epoch": 0.02832917705735661, + "grad_norm": 0.646135179571174, + "learning_rate": 4.709302325581396e-05, + "loss": 0.977, + "step": 568 + }, + { + "epoch": 0.028379052369077305, + "grad_norm": 0.7319656842846003, + "learning_rate": 4.717607973421927e-05, + "loss": 0.9745, + "step": 569 + }, + { + "epoch": 0.028428927680798004, + "grad_norm": 0.7208092455534506, + "learning_rate": 4.725913621262459e-05, + "loss": 0.9408, + "step": 570 + }, + { + "epoch": 0.028478802992518704, + "grad_norm": 0.8071334442558962, + "learning_rate": 4.73421926910299e-05, + "loss": 0.9931, + "step": 571 + }, + { + "epoch": 0.028528678304239403, + "grad_norm": 0.8366290366855041, + "learning_rate": 4.742524916943522e-05, + "loss": 0.9837, + "step": 572 + }, + { + "epoch": 0.0285785536159601, + "grad_norm": 0.9003317183407521, + "learning_rate": 4.750830564784053e-05, + "loss": 0.9473, + "step": 573 + }, + { + "epoch": 0.028628428927680798, + "grad_norm": 0.59695540179719, + "learning_rate": 4.7591362126245845e-05, + "loss": 0.945, + "step": 574 + }, + { + "epoch": 0.028678304239401497, + "grad_norm": 0.7156591280150888, + "learning_rate": 4.7674418604651164e-05, + "loss": 1.0458, + "step": 575 + }, + { + "epoch": 0.028728179551122196, + "grad_norm": 0.6933087888630552, + "learning_rate": 4.775747508305648e-05, + "loss": 0.96, + "step": 576 + }, + { + "epoch": 0.02877805486284289, + "grad_norm": 0.6317903016914131, + "learning_rate": 4.78405315614618e-05, + "loss": 0.9288, + "step": 577 + }, + { + "epoch": 0.02882793017456359, + "grad_norm": 0.7756929585486549, + "learning_rate": 4.7923588039867114e-05, + "loss": 0.9631, + "step": 578 + }, + { + "epoch": 0.02887780548628429, + "grad_norm": 0.5614713001276419, + "learning_rate": 4.8006644518272426e-05, + "loss": 0.9733, + "step": 579 + }, + { + "epoch": 0.02892768079800499, + "grad_norm": 0.7462265479409996, + "learning_rate": 4.8089700996677744e-05, + "loss": 1.0069, + "step": 580 + }, + { + "epoch": 0.028977556109725685, + "grad_norm": 0.6834366051107059, + "learning_rate": 4.8172757475083056e-05, + "loss": 0.9675, + "step": 581 + }, + { + "epoch": 0.029027431421446384, + "grad_norm": 0.6617798844189302, + "learning_rate": 4.8255813953488375e-05, + "loss": 0.9543, + "step": 582 + }, + { + "epoch": 0.029077306733167083, + "grad_norm": 1.4090940475132008, + "learning_rate": 4.833887043189369e-05, + "loss": 0.9698, + "step": 583 + }, + { + "epoch": 0.02912718204488778, + "grad_norm": 0.7040467914934694, + "learning_rate": 4.8421926910299006e-05, + "loss": 0.9677, + "step": 584 + }, + { + "epoch": 0.029177057356608478, + "grad_norm": 0.6625037832146246, + "learning_rate": 4.8504983388704325e-05, + "loss": 0.9522, + "step": 585 + }, + { + "epoch": 0.029226932668329177, + "grad_norm": 0.6859758062316357, + "learning_rate": 4.858803986710964e-05, + "loss": 0.9668, + "step": 586 + }, + { + "epoch": 0.029276807980049877, + "grad_norm": 0.6717781675446652, + "learning_rate": 4.8671096345514956e-05, + "loss": 0.9127, + "step": 587 + }, + { + "epoch": 0.029326683291770572, + "grad_norm": 0.6586057260997682, + "learning_rate": 4.875415282392027e-05, + "loss": 0.9352, + "step": 588 + }, + { + "epoch": 0.02937655860349127, + "grad_norm": 0.6330075211968533, + "learning_rate": 4.883720930232558e-05, + "loss": 0.947, + "step": 589 + }, + { + "epoch": 0.02942643391521197, + "grad_norm": 0.6880060115913379, + "learning_rate": 4.89202657807309e-05, + "loss": 0.9362, + "step": 590 + }, + { + "epoch": 0.02947630922693267, + "grad_norm": 0.603763329257647, + "learning_rate": 4.900332225913621e-05, + "loss": 0.9459, + "step": 591 + }, + { + "epoch": 0.029526184538653365, + "grad_norm": 1.190567294455111, + "learning_rate": 4.9086378737541536e-05, + "loss": 0.9907, + "step": 592 + }, + { + "epoch": 0.029576059850374065, + "grad_norm": 0.7416724578366496, + "learning_rate": 4.916943521594685e-05, + "loss": 0.9524, + "step": 593 + }, + { + "epoch": 0.029625935162094764, + "grad_norm": 0.8123624348744337, + "learning_rate": 4.925249169435216e-05, + "loss": 0.927, + "step": 594 + }, + { + "epoch": 0.029675810473815463, + "grad_norm": 0.6784261241665075, + "learning_rate": 4.933554817275748e-05, + "loss": 0.9908, + "step": 595 + }, + { + "epoch": 0.02972568578553616, + "grad_norm": 0.9297298874718595, + "learning_rate": 4.941860465116279e-05, + "loss": 0.9122, + "step": 596 + }, + { + "epoch": 0.029775561097256858, + "grad_norm": 0.7792572925410369, + "learning_rate": 4.950166112956811e-05, + "loss": 0.9751, + "step": 597 + }, + { + "epoch": 0.029825436408977557, + "grad_norm": 0.7326750089052368, + "learning_rate": 4.958471760797342e-05, + "loss": 0.977, + "step": 598 + }, + { + "epoch": 0.029875311720698253, + "grad_norm": 0.7479049160092787, + "learning_rate": 4.966777408637874e-05, + "loss": 0.9707, + "step": 599 + }, + { + "epoch": 0.029925187032418952, + "grad_norm": 0.651011671479872, + "learning_rate": 4.975083056478406e-05, + "loss": 0.9901, + "step": 600 + }, + { + "epoch": 0.02997506234413965, + "grad_norm": 0.6597743060021202, + "learning_rate": 4.983388704318937e-05, + "loss": 0.9494, + "step": 601 + }, + { + "epoch": 0.03002493765586035, + "grad_norm": 0.6717043244819247, + "learning_rate": 4.991694352159469e-05, + "loss": 0.9824, + "step": 602 + }, + { + "epoch": 0.030074812967581046, + "grad_norm": 0.6472597416233251, + "learning_rate": 5e-05, + "loss": 0.986, + "step": 603 + }, + { + "epoch": 0.030124688279301745, + "grad_norm": 0.9057581459952748, + "learning_rate": 4.99999996738181e-05, + "loss": 0.9604, + "step": 604 + }, + { + "epoch": 0.030174563591022444, + "grad_norm": 1.3464663838161057, + "learning_rate": 4.999999869527239e-05, + "loss": 0.937, + "step": 605 + }, + { + "epoch": 0.030224438902743143, + "grad_norm": 0.6198388554718106, + "learning_rate": 4.9999997064362904e-05, + "loss": 0.9741, + "step": 606 + }, + { + "epoch": 0.03027431421446384, + "grad_norm": 0.799912143627179, + "learning_rate": 4.999999478108968e-05, + "loss": 0.9585, + "step": 607 + }, + { + "epoch": 0.03032418952618454, + "grad_norm": 0.5548985480612364, + "learning_rate": 4.999999184545279e-05, + "loss": 0.958, + "step": 608 + }, + { + "epoch": 0.030374064837905237, + "grad_norm": 0.5485546329030055, + "learning_rate": 4.999998825745229e-05, + "loss": 0.993, + "step": 609 + }, + { + "epoch": 0.030423940149625937, + "grad_norm": 0.5778262700558926, + "learning_rate": 4.9999984017088285e-05, + "loss": 0.9731, + "step": 610 + }, + { + "epoch": 0.030473815461346632, + "grad_norm": 0.697139748361692, + "learning_rate": 4.9999979124360894e-05, + "loss": 0.9922, + "step": 611 + }, + { + "epoch": 0.03052369077306733, + "grad_norm": 0.6197474265865703, + "learning_rate": 4.999997357927023e-05, + "loss": 0.9543, + "step": 612 + }, + { + "epoch": 0.03057356608478803, + "grad_norm": 0.7051729340477995, + "learning_rate": 4.999996738181645e-05, + "loss": 0.9806, + "step": 613 + }, + { + "epoch": 0.03062344139650873, + "grad_norm": 0.7782840692340735, + "learning_rate": 4.999996053199971e-05, + "loss": 0.9355, + "step": 614 + }, + { + "epoch": 0.030673316708229426, + "grad_norm": 0.675098843186283, + "learning_rate": 4.999995302982018e-05, + "loss": 0.9536, + "step": 615 + }, + { + "epoch": 0.030723192019950125, + "grad_norm": 0.6585142195452456, + "learning_rate": 4.999994487527807e-05, + "loss": 1.0102, + "step": 616 + }, + { + "epoch": 0.030773067331670824, + "grad_norm": 0.7850972275165652, + "learning_rate": 4.9999936068373584e-05, + "loss": 0.9734, + "step": 617 + }, + { + "epoch": 0.03082294264339152, + "grad_norm": 0.6194488119565065, + "learning_rate": 4.999992660910696e-05, + "loss": 0.9424, + "step": 618 + }, + { + "epoch": 0.03087281795511222, + "grad_norm": 0.6936671585295319, + "learning_rate": 4.999991649747844e-05, + "loss": 0.9581, + "step": 619 + }, + { + "epoch": 0.030922693266832918, + "grad_norm": 0.621207697647042, + "learning_rate": 4.9999905733488285e-05, + "loss": 0.9439, + "step": 620 + }, + { + "epoch": 0.030972568578553617, + "grad_norm": 0.5486614133455657, + "learning_rate": 4.999989431713678e-05, + "loss": 0.9531, + "step": 621 + }, + { + "epoch": 0.031022443890274313, + "grad_norm": 0.7311377572135008, + "learning_rate": 4.999988224842421e-05, + "loss": 0.9888, + "step": 622 + }, + { + "epoch": 0.031072319201995012, + "grad_norm": 0.7998671747513934, + "learning_rate": 4.999986952735092e-05, + "loss": 1.0268, + "step": 623 + }, + { + "epoch": 0.03112219451371571, + "grad_norm": 0.7643961072740879, + "learning_rate": 4.9999856153917205e-05, + "loss": 1.019, + "step": 624 + }, + { + "epoch": 0.03117206982543641, + "grad_norm": 0.6394655732733041, + "learning_rate": 4.999984212812344e-05, + "loss": 0.975, + "step": 625 + }, + { + "epoch": 0.031221945137157106, + "grad_norm": 0.6600069590248846, + "learning_rate": 4.9999827449969986e-05, + "loss": 1.0219, + "step": 626 + }, + { + "epoch": 0.03127182044887781, + "grad_norm": 0.8909849162265572, + "learning_rate": 4.999981211945722e-05, + "loss": 0.9258, + "step": 627 + }, + { + "epoch": 0.031321695760598504, + "grad_norm": 0.6201655305550693, + "learning_rate": 4.9999796136585544e-05, + "loss": 0.9379, + "step": 628 + }, + { + "epoch": 0.0313715710723192, + "grad_norm": 0.7264198073702824, + "learning_rate": 4.9999779501355384e-05, + "loss": 0.9535, + "step": 629 + }, + { + "epoch": 0.0314214463840399, + "grad_norm": 0.6378072616495957, + "learning_rate": 4.999976221376715e-05, + "loss": 0.985, + "step": 630 + }, + { + "epoch": 0.0314713216957606, + "grad_norm": 0.5335963897339883, + "learning_rate": 4.9999744273821326e-05, + "loss": 0.9175, + "step": 631 + }, + { + "epoch": 0.031521197007481294, + "grad_norm": 0.5861388643683807, + "learning_rate": 4.999972568151836e-05, + "loss": 0.9703, + "step": 632 + }, + { + "epoch": 0.031571072319202, + "grad_norm": 0.5509734283553909, + "learning_rate": 4.999970643685874e-05, + "loss": 0.9598, + "step": 633 + }, + { + "epoch": 0.03162094763092269, + "grad_norm": 0.5570010272685344, + "learning_rate": 4.999968653984296e-05, + "loss": 0.9389, + "step": 634 + }, + { + "epoch": 0.03167082294264339, + "grad_norm": 0.7727561721568762, + "learning_rate": 4.999966599047156e-05, + "loss": 0.9428, + "step": 635 + }, + { + "epoch": 0.03172069825436409, + "grad_norm": 0.5525428339488174, + "learning_rate": 4.999964478874506e-05, + "loss": 0.9585, + "step": 636 + }, + { + "epoch": 0.03177057356608479, + "grad_norm": 0.6159459856638831, + "learning_rate": 4.9999622934664016e-05, + "loss": 0.9223, + "step": 637 + }, + { + "epoch": 0.03182044887780549, + "grad_norm": 0.5848070389826782, + "learning_rate": 4.9999600428229e-05, + "loss": 0.967, + "step": 638 + }, + { + "epoch": 0.031870324189526185, + "grad_norm": 0.5810170592782861, + "learning_rate": 4.99995772694406e-05, + "loss": 0.965, + "step": 639 + }, + { + "epoch": 0.03192019950124688, + "grad_norm": 0.5782096576036426, + "learning_rate": 4.999955345829942e-05, + "loss": 0.9849, + "step": 640 + }, + { + "epoch": 0.03197007481296758, + "grad_norm": 0.7680193731167869, + "learning_rate": 4.999952899480608e-05, + "loss": 0.9474, + "step": 641 + }, + { + "epoch": 0.03201995012468828, + "grad_norm": 0.7116509841814649, + "learning_rate": 4.999950387896123e-05, + "loss": 0.9852, + "step": 642 + }, + { + "epoch": 0.032069825436408975, + "grad_norm": 1.0624406141331644, + "learning_rate": 4.99994781107655e-05, + "loss": 1.0077, + "step": 643 + }, + { + "epoch": 0.03211970074812968, + "grad_norm": 0.679525593238316, + "learning_rate": 4.999945169021958e-05, + "loss": 0.9066, + "step": 644 + }, + { + "epoch": 0.03216957605985037, + "grad_norm": 0.49445222871964856, + "learning_rate": 4.999942461732415e-05, + "loss": 0.9463, + "step": 645 + }, + { + "epoch": 0.032219451371571076, + "grad_norm": 0.5527217724758239, + "learning_rate": 4.999939689207994e-05, + "loss": 0.9596, + "step": 646 + }, + { + "epoch": 0.03226932668329177, + "grad_norm": 0.554262146812102, + "learning_rate": 4.999936851448764e-05, + "loss": 0.9314, + "step": 647 + }, + { + "epoch": 0.03231920199501247, + "grad_norm": 0.7388354527996661, + "learning_rate": 4.999933948454802e-05, + "loss": 0.9647, + "step": 648 + }, + { + "epoch": 0.03236907730673317, + "grad_norm": 0.6998134836471683, + "learning_rate": 4.999930980226182e-05, + "loss": 0.9127, + "step": 649 + }, + { + "epoch": 0.032418952618453865, + "grad_norm": 0.6364249005316754, + "learning_rate": 4.999927946762981e-05, + "loss": 0.9784, + "step": 650 + }, + { + "epoch": 0.03246882793017456, + "grad_norm": 0.7395564620576146, + "learning_rate": 4.99992484806528e-05, + "loss": 0.9713, + "step": 651 + }, + { + "epoch": 0.032518703241895264, + "grad_norm": 0.5483166202967775, + "learning_rate": 4.9999216841331587e-05, + "loss": 0.9766, + "step": 652 + }, + { + "epoch": 0.03256857855361596, + "grad_norm": 0.5729102672609816, + "learning_rate": 4.9999184549667e-05, + "loss": 0.9747, + "step": 653 + }, + { + "epoch": 0.032618453865336655, + "grad_norm": 2.00008350360326, + "learning_rate": 4.999915160565988e-05, + "loss": 0.9857, + "step": 654 + }, + { + "epoch": 0.03266832917705736, + "grad_norm": 0.6351960684480858, + "learning_rate": 4.9999118009311084e-05, + "loss": 0.967, + "step": 655 + }, + { + "epoch": 0.032718204488778053, + "grad_norm": 1.21835241287884, + "learning_rate": 4.999908376062149e-05, + "loss": 0.9517, + "step": 656 + }, + { + "epoch": 0.032768079800498756, + "grad_norm": 0.7492575241846614, + "learning_rate": 4.9999048859592005e-05, + "loss": 1.0134, + "step": 657 + }, + { + "epoch": 0.03281795511221945, + "grad_norm": 0.7196030833931714, + "learning_rate": 4.999901330622352e-05, + "loss": 0.9727, + "step": 658 + }, + { + "epoch": 0.03286783042394015, + "grad_norm": 0.6756438973693614, + "learning_rate": 4.999897710051697e-05, + "loss": 0.9301, + "step": 659 + }, + { + "epoch": 0.03291770573566085, + "grad_norm": 0.5695177794416507, + "learning_rate": 4.99989402424733e-05, + "loss": 0.9727, + "step": 660 + }, + { + "epoch": 0.032967581047381546, + "grad_norm": 0.6396916797188517, + "learning_rate": 4.9998902732093475e-05, + "loss": 1.0143, + "step": 661 + }, + { + "epoch": 0.03301745635910224, + "grad_norm": 0.6268264755471655, + "learning_rate": 4.9998864569378476e-05, + "loss": 0.9473, + "step": 662 + }, + { + "epoch": 0.033067331670822944, + "grad_norm": 2.2054935696647924, + "learning_rate": 4.999882575432928e-05, + "loss": 0.9277, + "step": 663 + }, + { + "epoch": 0.03311720698254364, + "grad_norm": 0.768184687084241, + "learning_rate": 4.999878628694693e-05, + "loss": 0.9141, + "step": 664 + }, + { + "epoch": 0.03316708229426434, + "grad_norm": 0.6494990307161023, + "learning_rate": 4.999874616723243e-05, + "loss": 1.006, + "step": 665 + }, + { + "epoch": 0.03321695760598504, + "grad_norm": 0.6353391216675238, + "learning_rate": 4.999870539518684e-05, + "loss": 1.0132, + "step": 666 + }, + { + "epoch": 0.033266832917705734, + "grad_norm": 0.6105262123965232, + "learning_rate": 4.999866397081122e-05, + "loss": 0.9928, + "step": 667 + }, + { + "epoch": 0.03331670822942644, + "grad_norm": 0.5811784551073073, + "learning_rate": 4.999862189410665e-05, + "loss": 0.9863, + "step": 668 + }, + { + "epoch": 0.03336658354114713, + "grad_norm": 0.7708554042949399, + "learning_rate": 4.9998579165074235e-05, + "loss": 0.9336, + "step": 669 + }, + { + "epoch": 0.03341645885286783, + "grad_norm": 0.6499513458243908, + "learning_rate": 4.9998535783715074e-05, + "loss": 0.9527, + "step": 670 + }, + { + "epoch": 0.03346633416458853, + "grad_norm": 0.8001009611924429, + "learning_rate": 4.9998491750030315e-05, + "loss": 0.9978, + "step": 671 + }, + { + "epoch": 0.033516209476309226, + "grad_norm": 0.6464363064848875, + "learning_rate": 4.99984470640211e-05, + "loss": 1.0006, + "step": 672 + }, + { + "epoch": 0.03356608478802992, + "grad_norm": 0.667909865412408, + "learning_rate": 4.99984017256886e-05, + "loss": 0.9473, + "step": 673 + }, + { + "epoch": 0.033615960099750625, + "grad_norm": 0.6748269168330914, + "learning_rate": 4.9998355735034e-05, + "loss": 0.9789, + "step": 674 + }, + { + "epoch": 0.03366583541147132, + "grad_norm": 0.6597076337572386, + "learning_rate": 4.999830909205849e-05, + "loss": 0.9499, + "step": 675 + }, + { + "epoch": 0.03371571072319202, + "grad_norm": 0.6429650503081036, + "learning_rate": 4.999826179676329e-05, + "loss": 0.9844, + "step": 676 + }, + { + "epoch": 0.03376558603491272, + "grad_norm": 0.7703435453623265, + "learning_rate": 4.9998213849149636e-05, + "loss": 0.9526, + "step": 677 + }, + { + "epoch": 0.033815461346633414, + "grad_norm": 0.6981740005245012, + "learning_rate": 4.999816524921879e-05, + "loss": 0.9562, + "step": 678 + }, + { + "epoch": 0.03386533665835412, + "grad_norm": 0.655824600502768, + "learning_rate": 4.9998115996971995e-05, + "loss": 0.9636, + "step": 679 + }, + { + "epoch": 0.03391521197007481, + "grad_norm": 0.7188563952742776, + "learning_rate": 4.9998066092410554e-05, + "loss": 0.9881, + "step": 680 + }, + { + "epoch": 0.03396508728179551, + "grad_norm": 0.6728415174139637, + "learning_rate": 4.999801553553577e-05, + "loss": 0.9663, + "step": 681 + }, + { + "epoch": 0.03401496259351621, + "grad_norm": 0.7694770288934963, + "learning_rate": 4.999796432634895e-05, + "loss": 0.9528, + "step": 682 + }, + { + "epoch": 0.03406483790523691, + "grad_norm": 0.6038904889435409, + "learning_rate": 4.999791246485145e-05, + "loss": 0.9667, + "step": 683 + }, + { + "epoch": 0.0341147132169576, + "grad_norm": 0.5886886245421686, + "learning_rate": 4.999785995104461e-05, + "loss": 0.9773, + "step": 684 + }, + { + "epoch": 0.034164588528678305, + "grad_norm": 0.5611361563749221, + "learning_rate": 4.999780678492979e-05, + "loss": 0.8921, + "step": 685 + }, + { + "epoch": 0.034214463840399, + "grad_norm": 0.5469940069613259, + "learning_rate": 4.99977529665084e-05, + "loss": 0.9542, + "step": 686 + }, + { + "epoch": 0.034264339152119704, + "grad_norm": 0.6153297618572126, + "learning_rate": 4.9997698495781833e-05, + "loss": 1.0013, + "step": 687 + }, + { + "epoch": 0.0343142144638404, + "grad_norm": 0.5902815835466874, + "learning_rate": 4.9997643372751514e-05, + "loss": 0.9946, + "step": 688 + }, + { + "epoch": 0.034364089775561095, + "grad_norm": 0.912127122727569, + "learning_rate": 4.999758759741887e-05, + "loss": 0.9817, + "step": 689 + }, + { + "epoch": 0.0344139650872818, + "grad_norm": 0.6174740815970142, + "learning_rate": 4.999753116978537e-05, + "loss": 0.9401, + "step": 690 + }, + { + "epoch": 0.03446384039900249, + "grad_norm": 0.5878753017617502, + "learning_rate": 4.999747408985249e-05, + "loss": 0.9703, + "step": 691 + }, + { + "epoch": 0.03451371571072319, + "grad_norm": 0.6035897489498795, + "learning_rate": 4.9997416357621695e-05, + "loss": 0.9413, + "step": 692 + }, + { + "epoch": 0.03456359102244389, + "grad_norm": 0.611661155283646, + "learning_rate": 4.9997357973094515e-05, + "loss": 0.9565, + "step": 693 + }, + { + "epoch": 0.03461346633416459, + "grad_norm": 0.6482266175651825, + "learning_rate": 4.9997298936272474e-05, + "loss": 0.968, + "step": 694 + }, + { + "epoch": 0.03466334164588529, + "grad_norm": 0.7714426092183057, + "learning_rate": 4.9997239247157095e-05, + "loss": 0.9602, + "step": 695 + }, + { + "epoch": 0.034713216957605986, + "grad_norm": 0.824584589527587, + "learning_rate": 4.9997178905749945e-05, + "loss": 0.9883, + "step": 696 + }, + { + "epoch": 0.03476309226932668, + "grad_norm": 0.6665201751182107, + "learning_rate": 4.99971179120526e-05, + "loss": 0.9278, + "step": 697 + }, + { + "epoch": 0.034812967581047384, + "grad_norm": 1.2566255636328074, + "learning_rate": 4.999705626606666e-05, + "loss": 1.0128, + "step": 698 + }, + { + "epoch": 0.03486284289276808, + "grad_norm": 0.6205651055226449, + "learning_rate": 4.9996993967793706e-05, + "loss": 0.9936, + "step": 699 + }, + { + "epoch": 0.034912718204488775, + "grad_norm": 0.6419403676912723, + "learning_rate": 4.999693101723539e-05, + "loss": 0.9631, + "step": 700 + }, + { + "epoch": 0.03496259351620948, + "grad_norm": 0.6505063323664741, + "learning_rate": 4.999686741439335e-05, + "loss": 1.0292, + "step": 701 + }, + { + "epoch": 0.035012468827930174, + "grad_norm": 0.587064859752242, + "learning_rate": 4.999680315926924e-05, + "loss": 0.9187, + "step": 702 + }, + { + "epoch": 0.03506234413965087, + "grad_norm": 0.9778771279471984, + "learning_rate": 4.999673825186474e-05, + "loss": 0.905, + "step": 703 + }, + { + "epoch": 0.03511221945137157, + "grad_norm": 0.5486218733869204, + "learning_rate": 4.9996672692181533e-05, + "loss": 0.9332, + "step": 704 + }, + { + "epoch": 0.03516209476309227, + "grad_norm": 0.6069342220243035, + "learning_rate": 4.999660648022135e-05, + "loss": 0.923, + "step": 705 + }, + { + "epoch": 0.03521197007481297, + "grad_norm": 0.5640871707193909, + "learning_rate": 4.99965396159859e-05, + "loss": 0.9532, + "step": 706 + }, + { + "epoch": 0.035261845386533666, + "grad_norm": 0.6783933760757003, + "learning_rate": 4.9996472099476943e-05, + "loss": 1.0101, + "step": 707 + }, + { + "epoch": 0.03531172069825436, + "grad_norm": 0.665135742906579, + "learning_rate": 4.999640393069623e-05, + "loss": 1.0013, + "step": 708 + }, + { + "epoch": 0.035361596009975065, + "grad_norm": 0.6006564882130462, + "learning_rate": 4.9996335109645545e-05, + "loss": 0.9325, + "step": 709 + }, + { + "epoch": 0.03541147132169576, + "grad_norm": 0.5779024260240714, + "learning_rate": 4.999626563632668e-05, + "loss": 0.9589, + "step": 710 + }, + { + "epoch": 0.035461346633416456, + "grad_norm": 0.5874277913224523, + "learning_rate": 4.9996195510741454e-05, + "loss": 1.0181, + "step": 711 + }, + { + "epoch": 0.03551122194513716, + "grad_norm": 0.6736292273668448, + "learning_rate": 4.999612473289169e-05, + "loss": 0.942, + "step": 712 + }, + { + "epoch": 0.035561097256857854, + "grad_norm": 0.5566105268009506, + "learning_rate": 4.999605330277923e-05, + "loss": 0.9584, + "step": 713 + }, + { + "epoch": 0.03561097256857856, + "grad_norm": 0.5606929645432325, + "learning_rate": 4.9995981220405964e-05, + "loss": 1.02, + "step": 714 + }, + { + "epoch": 0.03566084788029925, + "grad_norm": 0.6251720167417135, + "learning_rate": 4.999590848577374e-05, + "loss": 0.9832, + "step": 715 + }, + { + "epoch": 0.03571072319201995, + "grad_norm": 0.7098865685556054, + "learning_rate": 4.999583509888448e-05, + "loss": 1.0026, + "step": 716 + }, + { + "epoch": 0.03576059850374065, + "grad_norm": 0.5870297838116822, + "learning_rate": 4.999576105974008e-05, + "loss": 0.9845, + "step": 717 + }, + { + "epoch": 0.03581047381546135, + "grad_norm": 0.8163452832064648, + "learning_rate": 4.9995686368342484e-05, + "loss": 0.9709, + "step": 718 + }, + { + "epoch": 0.03586034912718204, + "grad_norm": 0.618756449073573, + "learning_rate": 4.999561102469364e-05, + "loss": 0.9699, + "step": 719 + }, + { + "epoch": 0.035910224438902745, + "grad_norm": 0.7139608236448045, + "learning_rate": 4.9995535028795524e-05, + "loss": 1.0062, + "step": 720 + }, + { + "epoch": 0.03596009975062344, + "grad_norm": 0.6104203065994354, + "learning_rate": 4.999545838065009e-05, + "loss": 1.0012, + "step": 721 + }, + { + "epoch": 0.036009975062344136, + "grad_norm": 0.6103705861181122, + "learning_rate": 4.9995381080259374e-05, + "loss": 1.0091, + "step": 722 + }, + { + "epoch": 0.03605985037406484, + "grad_norm": 0.5641390950919231, + "learning_rate": 4.999530312762536e-05, + "loss": 0.9584, + "step": 723 + }, + { + "epoch": 0.036109725685785535, + "grad_norm": 0.6768380234246784, + "learning_rate": 4.999522452275011e-05, + "loss": 0.9913, + "step": 724 + }, + { + "epoch": 0.03615960099750624, + "grad_norm": 0.9040532349883682, + "learning_rate": 4.999514526563565e-05, + "loss": 0.9558, + "step": 725 + }, + { + "epoch": 0.03620947630922693, + "grad_norm": 0.6022663343582007, + "learning_rate": 4.999506535628406e-05, + "loss": 0.9794, + "step": 726 + }, + { + "epoch": 0.03625935162094763, + "grad_norm": 0.8138492533798739, + "learning_rate": 4.999498479469744e-05, + "loss": 0.9108, + "step": 727 + }, + { + "epoch": 0.03630922693266833, + "grad_norm": 0.63031229426115, + "learning_rate": 4.9994903580877875e-05, + "loss": 0.9735, + "step": 728 + }, + { + "epoch": 0.03635910224438903, + "grad_norm": 0.8342968855298679, + "learning_rate": 4.999482171482748e-05, + "loss": 0.9666, + "step": 729 + }, + { + "epoch": 0.03640897755610972, + "grad_norm": 0.5574152513678963, + "learning_rate": 4.99947391965484e-05, + "loss": 0.9729, + "step": 730 + }, + { + "epoch": 0.036458852867830426, + "grad_norm": 0.6111648132648818, + "learning_rate": 4.9994656026042784e-05, + "loss": 0.9375, + "step": 731 + }, + { + "epoch": 0.03650872817955112, + "grad_norm": 0.5593085381721143, + "learning_rate": 4.999457220331281e-05, + "loss": 0.928, + "step": 732 + }, + { + "epoch": 0.036558603491271824, + "grad_norm": 0.5529415029430927, + "learning_rate": 4.9994487728360665e-05, + "loss": 0.955, + "step": 733 + }, + { + "epoch": 0.03660847880299252, + "grad_norm": 0.6277341578938163, + "learning_rate": 4.999440260118854e-05, + "loss": 0.9427, + "step": 734 + }, + { + "epoch": 0.036658354114713215, + "grad_norm": 0.5606930258330485, + "learning_rate": 4.999431682179867e-05, + "loss": 0.9603, + "step": 735 + }, + { + "epoch": 0.03670822942643392, + "grad_norm": 0.6601713860702235, + "learning_rate": 4.999423039019329e-05, + "loss": 0.9337, + "step": 736 + }, + { + "epoch": 0.036758104738154614, + "grad_norm": 0.7676258868630939, + "learning_rate": 4.999414330637464e-05, + "loss": 0.9183, + "step": 737 + }, + { + "epoch": 0.03680798004987531, + "grad_norm": 1.497879909941237, + "learning_rate": 4.999405557034502e-05, + "loss": 0.9824, + "step": 738 + }, + { + "epoch": 0.03685785536159601, + "grad_norm": 0.5445395356023575, + "learning_rate": 4.99939671821067e-05, + "loss": 0.969, + "step": 739 + }, + { + "epoch": 0.03690773067331671, + "grad_norm": 0.6390532068659581, + "learning_rate": 4.9993878141662e-05, + "loss": 0.958, + "step": 740 + }, + { + "epoch": 0.0369576059850374, + "grad_norm": 0.894903708667274, + "learning_rate": 4.999378844901323e-05, + "loss": 0.9695, + "step": 741 + }, + { + "epoch": 0.037007481296758106, + "grad_norm": 0.5530053097301907, + "learning_rate": 4.999369810416274e-05, + "loss": 0.9562, + "step": 742 + }, + { + "epoch": 0.0370573566084788, + "grad_norm": 0.550607207665196, + "learning_rate": 4.999360710711288e-05, + "loss": 1.0236, + "step": 743 + }, + { + "epoch": 0.037107231920199504, + "grad_norm": 0.6183169313267887, + "learning_rate": 4.999351545786603e-05, + "loss": 0.983, + "step": 744 + }, + { + "epoch": 0.0371571072319202, + "grad_norm": 0.6116073937616492, + "learning_rate": 4.999342315642458e-05, + "loss": 0.9444, + "step": 745 + }, + { + "epoch": 0.037206982543640896, + "grad_norm": 0.5146532115007049, + "learning_rate": 4.999333020279093e-05, + "loss": 0.9543, + "step": 746 + }, + { + "epoch": 0.0372568578553616, + "grad_norm": 0.6415993476372998, + "learning_rate": 4.999323659696752e-05, + "loss": 0.9637, + "step": 747 + }, + { + "epoch": 0.037306733167082294, + "grad_norm": 0.5606964869023946, + "learning_rate": 4.999314233895679e-05, + "loss": 0.9798, + "step": 748 + }, + { + "epoch": 0.03735660847880299, + "grad_norm": 0.7467470938328805, + "learning_rate": 4.9993047428761196e-05, + "loss": 0.9238, + "step": 749 + }, + { + "epoch": 0.03740648379052369, + "grad_norm": 0.8319341239752357, + "learning_rate": 4.999295186638321e-05, + "loss": 0.9792, + "step": 750 + }, + { + "epoch": 0.03745635910224439, + "grad_norm": 0.5808978397531785, + "learning_rate": 4.999285565182533e-05, + "loss": 0.9364, + "step": 751 + }, + { + "epoch": 0.037506234413965084, + "grad_norm": 0.4631124937741128, + "learning_rate": 4.999275878509008e-05, + "loss": 0.9124, + "step": 752 + }, + { + "epoch": 0.037556109725685786, + "grad_norm": 0.5749420239325533, + "learning_rate": 4.999266126617996e-05, + "loss": 0.947, + "step": 753 + }, + { + "epoch": 0.03760598503740648, + "grad_norm": 0.6601177490326156, + "learning_rate": 4.999256309509753e-05, + "loss": 1.0141, + "step": 754 + }, + { + "epoch": 0.037655860349127185, + "grad_norm": 0.4943514532814682, + "learning_rate": 4.9992464271845366e-05, + "loss": 0.9803, + "step": 755 + }, + { + "epoch": 0.03770573566084788, + "grad_norm": 0.6076791489488654, + "learning_rate": 4.9992364796426016e-05, + "loss": 0.978, + "step": 756 + }, + { + "epoch": 0.037755610972568576, + "grad_norm": 0.6007459062646651, + "learning_rate": 4.99922646688421e-05, + "loss": 0.9422, + "step": 757 + }, + { + "epoch": 0.03780548628428928, + "grad_norm": 0.773425484208989, + "learning_rate": 4.9992163889096215e-05, + "loss": 0.9275, + "step": 758 + }, + { + "epoch": 0.037855361596009975, + "grad_norm": 0.6447096445333871, + "learning_rate": 4.9992062457191e-05, + "loss": 0.9351, + "step": 759 + }, + { + "epoch": 0.03790523690773067, + "grad_norm": 3.564279070167445, + "learning_rate": 4.999196037312911e-05, + "loss": 0.9441, + "step": 760 + }, + { + "epoch": 0.03795511221945137, + "grad_norm": 0.6125019118300299, + "learning_rate": 4.999185763691319e-05, + "loss": 0.9766, + "step": 761 + }, + { + "epoch": 0.03800498753117207, + "grad_norm": 0.5297943662049451, + "learning_rate": 4.999175424854593e-05, + "loss": 0.9689, + "step": 762 + }, + { + "epoch": 0.03805486284289277, + "grad_norm": 0.6587212056341268, + "learning_rate": 4.999165020803003e-05, + "loss": 0.9582, + "step": 763 + }, + { + "epoch": 0.03810473815461347, + "grad_norm": 0.7626536840349609, + "learning_rate": 4.9991545515368205e-05, + "loss": 1.0017, + "step": 764 + }, + { + "epoch": 0.03815461346633416, + "grad_norm": 0.566662135414825, + "learning_rate": 4.999144017056319e-05, + "loss": 1.0521, + "step": 765 + }, + { + "epoch": 0.038204488778054865, + "grad_norm": 0.5637918436018202, + "learning_rate": 4.999133417361771e-05, + "loss": 0.9642, + "step": 766 + }, + { + "epoch": 0.03825436408977556, + "grad_norm": 0.5454251915923417, + "learning_rate": 4.999122752453456e-05, + "loss": 0.9674, + "step": 767 + }, + { + "epoch": 0.03830423940149626, + "grad_norm": 0.6367078730323311, + "learning_rate": 4.9991120223316514e-05, + "loss": 0.9857, + "step": 768 + }, + { + "epoch": 0.03835411471321696, + "grad_norm": 0.7102863860274227, + "learning_rate": 4.999101226996637e-05, + "loss": 1.0098, + "step": 769 + }, + { + "epoch": 0.038403990024937655, + "grad_norm": 0.813187557797593, + "learning_rate": 4.999090366448694e-05, + "loss": 0.9872, + "step": 770 + }, + { + "epoch": 0.03845386533665835, + "grad_norm": 0.6409070680110633, + "learning_rate": 4.999079440688107e-05, + "loss": 0.9892, + "step": 771 + }, + { + "epoch": 0.03850374064837905, + "grad_norm": 0.76148165970864, + "learning_rate": 4.99906844971516e-05, + "loss": 0.9888, + "step": 772 + }, + { + "epoch": 0.03855361596009975, + "grad_norm": 0.5344141406416965, + "learning_rate": 4.999057393530141e-05, + "loss": 0.9488, + "step": 773 + }, + { + "epoch": 0.03860349127182045, + "grad_norm": 0.5499081744399893, + "learning_rate": 4.9990462721333364e-05, + "loss": 0.9723, + "step": 774 + }, + { + "epoch": 0.03865336658354115, + "grad_norm": 0.6010741130319192, + "learning_rate": 4.999035085525039e-05, + "loss": 0.95, + "step": 775 + }, + { + "epoch": 0.03870324189526184, + "grad_norm": 0.6758074253838644, + "learning_rate": 4.999023833705539e-05, + "loss": 0.9596, + "step": 776 + }, + { + "epoch": 0.038753117206982546, + "grad_norm": 0.7257435492629042, + "learning_rate": 4.99901251667513e-05, + "loss": 0.9391, + "step": 777 + }, + { + "epoch": 0.03880299251870324, + "grad_norm": 0.5993643362283673, + "learning_rate": 4.999001134434109e-05, + "loss": 0.9537, + "step": 778 + }, + { + "epoch": 0.03885286783042394, + "grad_norm": 0.8538436500275378, + "learning_rate": 4.9989896869827705e-05, + "loss": 0.9511, + "step": 779 + }, + { + "epoch": 0.03890274314214464, + "grad_norm": 0.7246781628883716, + "learning_rate": 4.9989781743214154e-05, + "loss": 0.9131, + "step": 780 + }, + { + "epoch": 0.038952618453865336, + "grad_norm": 0.8163156629535854, + "learning_rate": 4.998966596450343e-05, + "loss": 0.971, + "step": 781 + }, + { + "epoch": 0.03900249376558604, + "grad_norm": 0.5916096469056507, + "learning_rate": 4.9989549533698554e-05, + "loss": 0.9131, + "step": 782 + }, + { + "epoch": 0.039052369077306734, + "grad_norm": 0.568967283043005, + "learning_rate": 4.998943245080256e-05, + "loss": 0.9497, + "step": 783 + }, + { + "epoch": 0.03910224438902743, + "grad_norm": 0.6559007036518825, + "learning_rate": 4.998931471581852e-05, + "loss": 0.9916, + "step": 784 + }, + { + "epoch": 0.03915211970074813, + "grad_norm": 0.6646968033723838, + "learning_rate": 4.99891963287495e-05, + "loss": 0.9772, + "step": 785 + }, + { + "epoch": 0.03920199501246883, + "grad_norm": 0.7016539583818766, + "learning_rate": 4.998907728959858e-05, + "loss": 0.9359, + "step": 786 + }, + { + "epoch": 0.039251870324189524, + "grad_norm": 0.5781409391282898, + "learning_rate": 4.998895759836887e-05, + "loss": 0.944, + "step": 787 + }, + { + "epoch": 0.039301745635910226, + "grad_norm": 0.8747330355819025, + "learning_rate": 4.99888372550635e-05, + "loss": 0.9643, + "step": 788 + }, + { + "epoch": 0.03935162094763092, + "grad_norm": 0.8934824772216426, + "learning_rate": 4.9988716259685606e-05, + "loss": 0.9845, + "step": 789 + }, + { + "epoch": 0.03940149625935162, + "grad_norm": 0.7676754852686023, + "learning_rate": 4.9988594612238336e-05, + "loss": 0.9419, + "step": 790 + }, + { + "epoch": 0.03945137157107232, + "grad_norm": 0.664063487002954, + "learning_rate": 4.998847231272488e-05, + "loss": 0.9344, + "step": 791 + }, + { + "epoch": 0.039501246882793016, + "grad_norm": 0.8634372909583764, + "learning_rate": 4.9988349361148426e-05, + "loss": 0.982, + "step": 792 + }, + { + "epoch": 0.03955112219451372, + "grad_norm": 0.6913761382262624, + "learning_rate": 4.998822575751216e-05, + "loss": 0.9251, + "step": 793 + }, + { + "epoch": 0.039600997506234414, + "grad_norm": 0.7264604432105288, + "learning_rate": 4.998810150181934e-05, + "loss": 0.9606, + "step": 794 + }, + { + "epoch": 0.03965087281795511, + "grad_norm": 0.6429736433505028, + "learning_rate": 4.99879765940732e-05, + "loss": 0.9852, + "step": 795 + }, + { + "epoch": 0.03970074812967581, + "grad_norm": 0.8198146445837352, + "learning_rate": 4.9987851034276986e-05, + "loss": 0.9337, + "step": 796 + }, + { + "epoch": 0.03975062344139651, + "grad_norm": 0.5288549678713407, + "learning_rate": 4.998772482243399e-05, + "loss": 0.9515, + "step": 797 + }, + { + "epoch": 0.039800498753117204, + "grad_norm": 0.5904738052157701, + "learning_rate": 4.998759795854749e-05, + "loss": 1.0008, + "step": 798 + }, + { + "epoch": 0.03985037406483791, + "grad_norm": 1.749213548877051, + "learning_rate": 4.998747044262081e-05, + "loss": 0.9449, + "step": 799 + }, + { + "epoch": 0.0399002493765586, + "grad_norm": 1.0646267249273698, + "learning_rate": 4.998734227465726e-05, + "loss": 0.9511, + "step": 800 + }, + { + "epoch": 0.039950124688279305, + "grad_norm": 1.331114194243731, + "learning_rate": 4.998721345466021e-05, + "loss": 0.9978, + "step": 801 + }, + { + "epoch": 0.04, + "grad_norm": 0.6838498251933827, + "learning_rate": 4.9987083982632996e-05, + "loss": 0.9666, + "step": 802 + }, + { + "epoch": 0.0400498753117207, + "grad_norm": 0.5888912975398136, + "learning_rate": 4.9986953858579015e-05, + "loss": 0.9765, + "step": 803 + }, + { + "epoch": 0.0400997506234414, + "grad_norm": 0.9092466949433758, + "learning_rate": 4.9986823082501656e-05, + "loss": 0.9715, + "step": 804 + }, + { + "epoch": 0.040149625935162095, + "grad_norm": 0.6740765968581873, + "learning_rate": 4.998669165440433e-05, + "loss": 1.0422, + "step": 805 + }, + { + "epoch": 0.04019950124688279, + "grad_norm": 0.5591414674568833, + "learning_rate": 4.998655957429047e-05, + "loss": 0.9376, + "step": 806 + }, + { + "epoch": 0.04024937655860349, + "grad_norm": 0.6114687554286341, + "learning_rate": 4.9986426842163515e-05, + "loss": 0.9954, + "step": 807 + }, + { + "epoch": 0.04029925187032419, + "grad_norm": 0.7918810557724382, + "learning_rate": 4.998629345802694e-05, + "loss": 0.9654, + "step": 808 + }, + { + "epoch": 0.040349127182044885, + "grad_norm": 0.7133629779277595, + "learning_rate": 4.9986159421884214e-05, + "loss": 0.9612, + "step": 809 + }, + { + "epoch": 0.04039900249376559, + "grad_norm": 0.6029159461422904, + "learning_rate": 4.9986024733738846e-05, + "loss": 1.0086, + "step": 810 + }, + { + "epoch": 0.04044887780548628, + "grad_norm": 1.3723111933371936, + "learning_rate": 4.9985889393594345e-05, + "loss": 0.9516, + "step": 811 + }, + { + "epoch": 0.040498753117206986, + "grad_norm": 0.884718934020404, + "learning_rate": 4.9985753401454237e-05, + "loss": 0.9493, + "step": 812 + }, + { + "epoch": 0.04054862842892768, + "grad_norm": 0.8251818147258011, + "learning_rate": 4.998561675732208e-05, + "loss": 0.9579, + "step": 813 + }, + { + "epoch": 0.04059850374064838, + "grad_norm": 0.5982192784652337, + "learning_rate": 4.998547946120144e-05, + "loss": 0.974, + "step": 814 + }, + { + "epoch": 0.04064837905236908, + "grad_norm": 0.5521362129770191, + "learning_rate": 4.998534151309588e-05, + "loss": 0.9739, + "step": 815 + }, + { + "epoch": 0.040698254364089775, + "grad_norm": 0.5968501901048083, + "learning_rate": 4.9985202913009024e-05, + "loss": 0.9301, + "step": 816 + }, + { + "epoch": 0.04074812967581047, + "grad_norm": 0.7520978094555741, + "learning_rate": 4.998506366094449e-05, + "loss": 0.9456, + "step": 817 + }, + { + "epoch": 0.040798004987531174, + "grad_norm": 0.9799146826956132, + "learning_rate": 4.998492375690589e-05, + "loss": 0.9958, + "step": 818 + }, + { + "epoch": 0.04084788029925187, + "grad_norm": 0.65620824472049, + "learning_rate": 4.998478320089689e-05, + "loss": 0.9598, + "step": 819 + }, + { + "epoch": 0.040897755610972565, + "grad_norm": 0.5597122673613064, + "learning_rate": 4.9984641992921146e-05, + "loss": 0.9401, + "step": 820 + }, + { + "epoch": 0.04094763092269327, + "grad_norm": 0.5230036041454944, + "learning_rate": 4.998450013298236e-05, + "loss": 0.9973, + "step": 821 + }, + { + "epoch": 0.04099750623441396, + "grad_norm": 0.8519955056418422, + "learning_rate": 4.998435762108422e-05, + "loss": 0.9619, + "step": 822 + }, + { + "epoch": 0.041047381546134666, + "grad_norm": 0.6177235834420004, + "learning_rate": 4.998421445723046e-05, + "loss": 0.9531, + "step": 823 + }, + { + "epoch": 0.04109725685785536, + "grad_norm": 0.5326216481048018, + "learning_rate": 4.99840706414248e-05, + "loss": 0.943, + "step": 824 + }, + { + "epoch": 0.04114713216957606, + "grad_norm": 0.6733534904677565, + "learning_rate": 4.998392617367099e-05, + "loss": 0.9429, + "step": 825 + }, + { + "epoch": 0.04119700748129676, + "grad_norm": 0.5923935954072325, + "learning_rate": 4.998378105397281e-05, + "loss": 0.9662, + "step": 826 + }, + { + "epoch": 0.041246882793017456, + "grad_norm": 0.5904084518706598, + "learning_rate": 4.9983635282334055e-05, + "loss": 0.9966, + "step": 827 + }, + { + "epoch": 0.04129675810473815, + "grad_norm": 0.5590710608739573, + "learning_rate": 4.9983488858758506e-05, + "loss": 0.9146, + "step": 828 + }, + { + "epoch": 0.041346633416458854, + "grad_norm": 0.6209380203467152, + "learning_rate": 4.998334178325e-05, + "loss": 0.9706, + "step": 829 + }, + { + "epoch": 0.04139650872817955, + "grad_norm": 0.6482136083610778, + "learning_rate": 4.998319405581238e-05, + "loss": 0.9673, + "step": 830 + }, + { + "epoch": 0.04144638403990025, + "grad_norm": 0.6897829740149526, + "learning_rate": 4.998304567644948e-05, + "loss": 0.9783, + "step": 831 + }, + { + "epoch": 0.04149625935162095, + "grad_norm": 0.5809733313940589, + "learning_rate": 4.998289664516519e-05, + "loss": 1.013, + "step": 832 + }, + { + "epoch": 0.041546134663341644, + "grad_norm": 0.5990144717601744, + "learning_rate": 4.998274696196339e-05, + "loss": 0.9358, + "step": 833 + }, + { + "epoch": 0.04159600997506235, + "grad_norm": 0.6399517822024567, + "learning_rate": 4.9982596626847994e-05, + "loss": 0.9467, + "step": 834 + }, + { + "epoch": 0.04164588528678304, + "grad_norm": 0.5725330343930151, + "learning_rate": 4.998244563982292e-05, + "loss": 0.9986, + "step": 835 + }, + { + "epoch": 0.04169576059850374, + "grad_norm": 0.6219529222640667, + "learning_rate": 4.99822940008921e-05, + "loss": 0.9358, + "step": 836 + }, + { + "epoch": 0.04174563591022444, + "grad_norm": 0.6820563111451692, + "learning_rate": 4.99821417100595e-05, + "loss": 0.9834, + "step": 837 + }, + { + "epoch": 0.041795511221945136, + "grad_norm": 0.9028988014759163, + "learning_rate": 4.99819887673291e-05, + "loss": 0.9825, + "step": 838 + }, + { + "epoch": 0.04184538653366583, + "grad_norm": 0.5484743426901246, + "learning_rate": 4.998183517270488e-05, + "loss": 0.9803, + "step": 839 + }, + { + "epoch": 0.041895261845386535, + "grad_norm": 0.6153719365072531, + "learning_rate": 4.9981680926190854e-05, + "loss": 0.8855, + "step": 840 + }, + { + "epoch": 0.04194513715710723, + "grad_norm": 0.6402317160449152, + "learning_rate": 4.9981526027791036e-05, + "loss": 0.9613, + "step": 841 + }, + { + "epoch": 0.04199501246882793, + "grad_norm": 0.5987020192892047, + "learning_rate": 4.9981370477509484e-05, + "loss": 0.9938, + "step": 842 + }, + { + "epoch": 0.04204488778054863, + "grad_norm": 0.6345015079478099, + "learning_rate": 4.998121427535025e-05, + "loss": 0.9592, + "step": 843 + }, + { + "epoch": 0.042094763092269324, + "grad_norm": 0.6265338989109271, + "learning_rate": 4.9981057421317404e-05, + "loss": 0.9219, + "step": 844 + }, + { + "epoch": 0.04214463840399003, + "grad_norm": 0.5387045808532186, + "learning_rate": 4.998089991541505e-05, + "loss": 0.9726, + "step": 845 + }, + { + "epoch": 0.04219451371571072, + "grad_norm": 0.6818748688701074, + "learning_rate": 4.998074175764729e-05, + "loss": 0.9181, + "step": 846 + }, + { + "epoch": 0.04224438902743142, + "grad_norm": 0.5398386383805541, + "learning_rate": 4.998058294801825e-05, + "loss": 0.9222, + "step": 847 + }, + { + "epoch": 0.04229426433915212, + "grad_norm": 0.6140156194995896, + "learning_rate": 4.998042348653209e-05, + "loss": 0.9193, + "step": 848 + }, + { + "epoch": 0.04234413965087282, + "grad_norm": 0.5293375706652872, + "learning_rate": 4.998026337319295e-05, + "loss": 0.9322, + "step": 849 + }, + { + "epoch": 0.04239401496259352, + "grad_norm": 0.5914260739336242, + "learning_rate": 4.998010260800502e-05, + "loss": 0.9797, + "step": 850 + }, + { + "epoch": 0.042443890274314215, + "grad_norm": 0.5653809687468093, + "learning_rate": 4.997994119097249e-05, + "loss": 0.9143, + "step": 851 + }, + { + "epoch": 0.04249376558603491, + "grad_norm": 0.5541692623746631, + "learning_rate": 4.997977912209957e-05, + "loss": 0.9445, + "step": 852 + }, + { + "epoch": 0.042543640897755614, + "grad_norm": 0.5419901263759581, + "learning_rate": 4.997961640139049e-05, + "loss": 0.9576, + "step": 853 + }, + { + "epoch": 0.04259351620947631, + "grad_norm": 0.5887862684246901, + "learning_rate": 4.997945302884952e-05, + "loss": 0.9979, + "step": 854 + }, + { + "epoch": 0.042643391521197005, + "grad_norm": 0.5386285421109237, + "learning_rate": 4.9979289004480887e-05, + "loss": 0.9766, + "step": 855 + }, + { + "epoch": 0.04269326683291771, + "grad_norm": 0.722303663220358, + "learning_rate": 4.9979124328288886e-05, + "loss": 0.9943, + "step": 856 + }, + { + "epoch": 0.0427431421446384, + "grad_norm": 0.6272092776362252, + "learning_rate": 4.997895900027782e-05, + "loss": 0.9415, + "step": 857 + }, + { + "epoch": 0.0427930174563591, + "grad_norm": 0.5315905834622574, + "learning_rate": 4.9978793020451994e-05, + "loss": 0.986, + "step": 858 + }, + { + "epoch": 0.0428428927680798, + "grad_norm": 0.6547074849790296, + "learning_rate": 4.9978626388815743e-05, + "loss": 0.9274, + "step": 859 + }, + { + "epoch": 0.0428927680798005, + "grad_norm": 0.9329218087014122, + "learning_rate": 4.997845910537342e-05, + "loss": 0.9422, + "step": 860 + }, + { + "epoch": 0.0429426433915212, + "grad_norm": 0.5289185361116875, + "learning_rate": 4.997829117012939e-05, + "loss": 0.9352, + "step": 861 + }, + { + "epoch": 0.042992518703241896, + "grad_norm": 0.5572018172785145, + "learning_rate": 4.9978122583088016e-05, + "loss": 0.9777, + "step": 862 + }, + { + "epoch": 0.04304239401496259, + "grad_norm": 0.5897427670722515, + "learning_rate": 4.9977953344253725e-05, + "loss": 0.9733, + "step": 863 + }, + { + "epoch": 0.043092269326683294, + "grad_norm": 0.600534939396614, + "learning_rate": 4.997778345363091e-05, + "loss": 0.9322, + "step": 864 + }, + { + "epoch": 0.04314214463840399, + "grad_norm": 0.6344576219301273, + "learning_rate": 4.9977612911224025e-05, + "loss": 0.9619, + "step": 865 + }, + { + "epoch": 0.043192019950124685, + "grad_norm": 0.5465606623477558, + "learning_rate": 4.99774417170375e-05, + "loss": 0.9483, + "step": 866 + }, + { + "epoch": 0.04324189526184539, + "grad_norm": 0.5935590550287712, + "learning_rate": 4.997726987107582e-05, + "loss": 0.9841, + "step": 867 + }, + { + "epoch": 0.043291770573566084, + "grad_norm": 0.5980587003239265, + "learning_rate": 4.997709737334345e-05, + "loss": 0.9569, + "step": 868 + }, + { + "epoch": 0.043341645885286786, + "grad_norm": 0.7661891156645358, + "learning_rate": 4.9976924223844914e-05, + "loss": 0.9933, + "step": 869 + }, + { + "epoch": 0.04339152119700748, + "grad_norm": 0.6899906147440243, + "learning_rate": 4.997675042258472e-05, + "loss": 0.9657, + "step": 870 + }, + { + "epoch": 0.04344139650872818, + "grad_norm": 0.6589372049703298, + "learning_rate": 4.9976575969567386e-05, + "loss": 1.023, + "step": 871 + }, + { + "epoch": 0.04349127182044888, + "grad_norm": 0.5413508752457057, + "learning_rate": 4.997640086479749e-05, + "loss": 0.9729, + "step": 872 + }, + { + "epoch": 0.043541147132169576, + "grad_norm": 0.7077986121117841, + "learning_rate": 4.9976225108279596e-05, + "loss": 0.9271, + "step": 873 + }, + { + "epoch": 0.04359102244389027, + "grad_norm": 0.8320282718670213, + "learning_rate": 4.997604870001828e-05, + "loss": 0.9435, + "step": 874 + }, + { + "epoch": 0.043640897755610975, + "grad_norm": 0.6171160457418762, + "learning_rate": 4.9975871640018154e-05, + "loss": 0.9926, + "step": 875 + }, + { + "epoch": 0.04369077306733167, + "grad_norm": 0.5984533878881236, + "learning_rate": 4.997569392828383e-05, + "loss": 0.9656, + "step": 876 + }, + { + "epoch": 0.043740648379052366, + "grad_norm": 0.5433256199820273, + "learning_rate": 4.997551556481995e-05, + "loss": 0.9373, + "step": 877 + }, + { + "epoch": 0.04379052369077307, + "grad_norm": 0.6285516542627578, + "learning_rate": 4.997533654963118e-05, + "loss": 0.9727, + "step": 878 + }, + { + "epoch": 0.043840399002493764, + "grad_norm": 0.6513171620897134, + "learning_rate": 4.9975156882722174e-05, + "loss": 1.0333, + "step": 879 + }, + { + "epoch": 0.04389027431421447, + "grad_norm": 0.5571826402334467, + "learning_rate": 4.997497656409762e-05, + "loss": 0.9448, + "step": 880 + }, + { + "epoch": 0.04394014962593516, + "grad_norm": 0.7183697564515016, + "learning_rate": 4.9974795593762237e-05, + "loss": 0.9584, + "step": 881 + }, + { + "epoch": 0.04399002493765586, + "grad_norm": 0.5262672141200818, + "learning_rate": 4.997461397172074e-05, + "loss": 0.9467, + "step": 882 + }, + { + "epoch": 0.04403990024937656, + "grad_norm": 0.5145713193443257, + "learning_rate": 4.997443169797786e-05, + "loss": 0.9276, + "step": 883 + }, + { + "epoch": 0.04408977556109726, + "grad_norm": 0.7260495517194058, + "learning_rate": 4.9974248772538376e-05, + "loss": 0.9569, + "step": 884 + }, + { + "epoch": 0.04413965087281795, + "grad_norm": 0.6037140935627474, + "learning_rate": 4.9974065195407036e-05, + "loss": 0.9595, + "step": 885 + }, + { + "epoch": 0.044189526184538655, + "grad_norm": 0.5591340264822197, + "learning_rate": 4.997388096658865e-05, + "loss": 0.9908, + "step": 886 + }, + { + "epoch": 0.04423940149625935, + "grad_norm": 0.8327787475250829, + "learning_rate": 4.997369608608801e-05, + "loss": 0.9265, + "step": 887 + }, + { + "epoch": 0.04428927680798005, + "grad_norm": 1.1927486744778373, + "learning_rate": 4.997351055390995e-05, + "loss": 0.9526, + "step": 888 + }, + { + "epoch": 0.04433915211970075, + "grad_norm": 0.5278186588898514, + "learning_rate": 4.997332437005931e-05, + "loss": 0.9788, + "step": 889 + }, + { + "epoch": 0.044389027431421445, + "grad_norm": 0.5405985190028993, + "learning_rate": 4.997313753454095e-05, + "loss": 0.9523, + "step": 890 + }, + { + "epoch": 0.04443890274314215, + "grad_norm": 0.545979014576783, + "learning_rate": 4.997295004735974e-05, + "loss": 0.9381, + "step": 891 + }, + { + "epoch": 0.04448877805486284, + "grad_norm": 0.6328425763679026, + "learning_rate": 4.997276190852057e-05, + "loss": 1.0134, + "step": 892 + }, + { + "epoch": 0.04453865336658354, + "grad_norm": 0.5532947629902613, + "learning_rate": 4.9972573118028365e-05, + "loss": 0.944, + "step": 893 + }, + { + "epoch": 0.04458852867830424, + "grad_norm": 0.6454873593818109, + "learning_rate": 4.997238367588803e-05, + "loss": 0.987, + "step": 894 + }, + { + "epoch": 0.04463840399002494, + "grad_norm": 0.7923780159757668, + "learning_rate": 4.997219358210452e-05, + "loss": 0.9412, + "step": 895 + }, + { + "epoch": 0.04468827930174563, + "grad_norm": 0.5046168368298027, + "learning_rate": 4.99720028366828e-05, + "loss": 0.9172, + "step": 896 + }, + { + "epoch": 0.044738154613466335, + "grad_norm": 0.751437765854575, + "learning_rate": 4.997181143962784e-05, + "loss": 0.9659, + "step": 897 + }, + { + "epoch": 0.04478802992518703, + "grad_norm": 0.6268852831660976, + "learning_rate": 4.9971619390944635e-05, + "loss": 0.9803, + "step": 898 + }, + { + "epoch": 0.044837905236907734, + "grad_norm": 0.6413258469185685, + "learning_rate": 4.99714266906382e-05, + "loss": 0.9409, + "step": 899 + }, + { + "epoch": 0.04488778054862843, + "grad_norm": 0.5436296045204058, + "learning_rate": 4.9971233338713565e-05, + "loss": 0.9278, + "step": 900 + }, + { + "epoch": 0.044937655860349125, + "grad_norm": 0.5495526177811833, + "learning_rate": 4.997103933517576e-05, + "loss": 0.9416, + "step": 901 + }, + { + "epoch": 0.04498753117206983, + "grad_norm": 0.5442940467388078, + "learning_rate": 4.997084468002987e-05, + "loss": 0.9727, + "step": 902 + }, + { + "epoch": 0.045037406483790524, + "grad_norm": 0.6775384005975199, + "learning_rate": 4.997064937328096e-05, + "loss": 1.0164, + "step": 903 + }, + { + "epoch": 0.04508728179551122, + "grad_norm": 0.5806448694920193, + "learning_rate": 4.997045341493413e-05, + "loss": 0.9332, + "step": 904 + }, + { + "epoch": 0.04513715710723192, + "grad_norm": 3.0987398130282693, + "learning_rate": 4.99702568049945e-05, + "loss": 0.9561, + "step": 905 + }, + { + "epoch": 0.04518703241895262, + "grad_norm": 0.5954094780872909, + "learning_rate": 4.997005954346718e-05, + "loss": 0.9546, + "step": 906 + }, + { + "epoch": 0.04523690773067331, + "grad_norm": 0.5497853966420592, + "learning_rate": 4.996986163035734e-05, + "loss": 0.9781, + "step": 907 + }, + { + "epoch": 0.045286783042394016, + "grad_norm": 0.6431561723223476, + "learning_rate": 4.9969663065670135e-05, + "loss": 0.9261, + "step": 908 + }, + { + "epoch": 0.04533665835411471, + "grad_norm": 0.6662880139974592, + "learning_rate": 4.996946384941075e-05, + "loss": 0.9632, + "step": 909 + }, + { + "epoch": 0.045386533665835414, + "grad_norm": 0.5399885724855445, + "learning_rate": 4.996926398158437e-05, + "loss": 0.94, + "step": 910 + }, + { + "epoch": 0.04543640897755611, + "grad_norm": 0.667566780665213, + "learning_rate": 4.996906346219623e-05, + "loss": 0.9568, + "step": 911 + }, + { + "epoch": 0.045486284289276806, + "grad_norm": 0.6597275089254264, + "learning_rate": 4.996886229125155e-05, + "loss": 0.9497, + "step": 912 + }, + { + "epoch": 0.04553615960099751, + "grad_norm": 0.6291182981610605, + "learning_rate": 4.996866046875559e-05, + "loss": 0.9435, + "step": 913 + }, + { + "epoch": 0.045586034912718204, + "grad_norm": 0.5948160730015776, + "learning_rate": 4.99684579947136e-05, + "loss": 0.97, + "step": 914 + }, + { + "epoch": 0.0456359102244389, + "grad_norm": 0.8559785766200818, + "learning_rate": 4.996825486913088e-05, + "loss": 0.9063, + "step": 915 + }, + { + "epoch": 0.0456857855361596, + "grad_norm": 0.5773711245199225, + "learning_rate": 4.996805109201272e-05, + "loss": 0.9231, + "step": 916 + }, + { + "epoch": 0.0457356608478803, + "grad_norm": 0.6189419689043758, + "learning_rate": 4.996784666336444e-05, + "loss": 1.0045, + "step": 917 + }, + { + "epoch": 0.045785536159601, + "grad_norm": 0.5171810813641193, + "learning_rate": 4.9967641583191384e-05, + "loss": 0.9632, + "step": 918 + }, + { + "epoch": 0.045835411471321696, + "grad_norm": 1.000958399866004, + "learning_rate": 4.996743585149889e-05, + "loss": 0.9346, + "step": 919 + }, + { + "epoch": 0.04588528678304239, + "grad_norm": 0.6557401087475035, + "learning_rate": 4.996722946829233e-05, + "loss": 0.9351, + "step": 920 + }, + { + "epoch": 0.045935162094763095, + "grad_norm": 0.677634466395948, + "learning_rate": 4.996702243357709e-05, + "loss": 0.9928, + "step": 921 + }, + { + "epoch": 0.04598503740648379, + "grad_norm": 0.8157953006268719, + "learning_rate": 4.996681474735857e-05, + "loss": 0.9174, + "step": 922 + }, + { + "epoch": 0.046034912718204486, + "grad_norm": 0.52943409094963, + "learning_rate": 4.99666064096422e-05, + "loss": 0.9561, + "step": 923 + }, + { + "epoch": 0.04608478802992519, + "grad_norm": 0.5482378015289547, + "learning_rate": 4.996639742043341e-05, + "loss": 0.9703, + "step": 924 + }, + { + "epoch": 0.046134663341645885, + "grad_norm": 0.5994739784315362, + "learning_rate": 4.9966187779737644e-05, + "loss": 0.9803, + "step": 925 + }, + { + "epoch": 0.04618453865336658, + "grad_norm": 0.7186919373079311, + "learning_rate": 4.996597748756039e-05, + "loss": 0.932, + "step": 926 + }, + { + "epoch": 0.04623441396508728, + "grad_norm": 0.5730203328702969, + "learning_rate": 4.9965766543907125e-05, + "loss": 0.974, + "step": 927 + }, + { + "epoch": 0.04628428927680798, + "grad_norm": 0.5814942875797533, + "learning_rate": 4.996555494878335e-05, + "loss": 0.9557, + "step": 928 + }, + { + "epoch": 0.04633416458852868, + "grad_norm": 0.724627520938203, + "learning_rate": 4.99653427021946e-05, + "loss": 0.9494, + "step": 929 + }, + { + "epoch": 0.04638403990024938, + "grad_norm": 0.5390338225837246, + "learning_rate": 4.9965129804146395e-05, + "loss": 0.9516, + "step": 930 + }, + { + "epoch": 0.04643391521197007, + "grad_norm": 0.5551877627821863, + "learning_rate": 4.9964916254644305e-05, + "loss": 0.9718, + "step": 931 + }, + { + "epoch": 0.046483790523690775, + "grad_norm": 0.5712547660655777, + "learning_rate": 4.9964702053693905e-05, + "loss": 0.9762, + "step": 932 + }, + { + "epoch": 0.04653366583541147, + "grad_norm": 0.5881637202338748, + "learning_rate": 4.996448720130077e-05, + "loss": 0.9526, + "step": 933 + }, + { + "epoch": 0.04658354114713217, + "grad_norm": 0.57680795481041, + "learning_rate": 4.9964271697470516e-05, + "loss": 0.9602, + "step": 934 + }, + { + "epoch": 0.04663341645885287, + "grad_norm": 0.5655459889884724, + "learning_rate": 4.996405554220876e-05, + "loss": 0.991, + "step": 935 + }, + { + "epoch": 0.046683291770573565, + "grad_norm": 0.5460402241248598, + "learning_rate": 4.9963838735521156e-05, + "loss": 1.0316, + "step": 936 + }, + { + "epoch": 0.04673316708229427, + "grad_norm": 0.5677287880144719, + "learning_rate": 4.9963621277413354e-05, + "loss": 0.9485, + "step": 937 + }, + { + "epoch": 0.04678304239401496, + "grad_norm": 0.5578080447336001, + "learning_rate": 4.9963403167891015e-05, + "loss": 0.9681, + "step": 938 + }, + { + "epoch": 0.04683291770573566, + "grad_norm": 1.3566213615763518, + "learning_rate": 4.996318440695985e-05, + "loss": 0.9104, + "step": 939 + }, + { + "epoch": 0.04688279301745636, + "grad_norm": 0.633978319699535, + "learning_rate": 4.996296499462556e-05, + "loss": 0.9806, + "step": 940 + }, + { + "epoch": 0.04693266832917706, + "grad_norm": 0.5057575696611929, + "learning_rate": 4.996274493089387e-05, + "loss": 0.9352, + "step": 941 + }, + { + "epoch": 0.04698254364089775, + "grad_norm": 0.5559490144607117, + "learning_rate": 4.9962524215770524e-05, + "loss": 0.976, + "step": 942 + }, + { + "epoch": 0.047032418952618456, + "grad_norm": 0.5285564160533602, + "learning_rate": 4.996230284926128e-05, + "loss": 0.9713, + "step": 943 + }, + { + "epoch": 0.04708229426433915, + "grad_norm": 0.5216571378487292, + "learning_rate": 4.996208083137191e-05, + "loss": 0.9517, + "step": 944 + }, + { + "epoch": 0.04713216957605985, + "grad_norm": 0.6907603588162067, + "learning_rate": 4.996185816210821e-05, + "loss": 1.0033, + "step": 945 + }, + { + "epoch": 0.04718204488778055, + "grad_norm": 0.4991344825178465, + "learning_rate": 4.9961634841476004e-05, + "loss": 0.9429, + "step": 946 + }, + { + "epoch": 0.047231920199501246, + "grad_norm": 0.6069104356716213, + "learning_rate": 4.99614108694811e-05, + "loss": 0.9887, + "step": 947 + }, + { + "epoch": 0.04728179551122195, + "grad_norm": 0.5141535638559881, + "learning_rate": 4.996118624612936e-05, + "loss": 0.9469, + "step": 948 + }, + { + "epoch": 0.047331670822942644, + "grad_norm": 0.6440182644014464, + "learning_rate": 4.996096097142663e-05, + "loss": 0.9864, + "step": 949 + }, + { + "epoch": 0.04738154613466334, + "grad_norm": 0.6651204537973273, + "learning_rate": 4.9960735045378794e-05, + "loss": 0.9514, + "step": 950 + }, + { + "epoch": 0.04743142144638404, + "grad_norm": 0.6439073973043821, + "learning_rate": 4.996050846799174e-05, + "loss": 0.9855, + "step": 951 + }, + { + "epoch": 0.04748129675810474, + "grad_norm": 0.7968848096421386, + "learning_rate": 4.99602812392714e-05, + "loss": 0.9491, + "step": 952 + }, + { + "epoch": 0.047531172069825434, + "grad_norm": 0.7546942536067098, + "learning_rate": 4.996005335922369e-05, + "loss": 0.9929, + "step": 953 + }, + { + "epoch": 0.047581047381546136, + "grad_norm": 0.49628659649624696, + "learning_rate": 4.995982482785455e-05, + "loss": 0.9178, + "step": 954 + }, + { + "epoch": 0.04763092269326683, + "grad_norm": 0.5766387858708986, + "learning_rate": 4.995959564516997e-05, + "loss": 0.9316, + "step": 955 + }, + { + "epoch": 0.047680798004987535, + "grad_norm": 0.6371058220449609, + "learning_rate": 4.9959365811175894e-05, + "loss": 0.958, + "step": 956 + }, + { + "epoch": 0.04773067331670823, + "grad_norm": 0.5784035509587576, + "learning_rate": 4.9959135325878345e-05, + "loss": 0.9324, + "step": 957 + }, + { + "epoch": 0.047780548628428926, + "grad_norm": 0.5136407363781558, + "learning_rate": 4.995890418928333e-05, + "loss": 0.9532, + "step": 958 + }, + { + "epoch": 0.04783042394014963, + "grad_norm": 0.5733390664366471, + "learning_rate": 4.995867240139687e-05, + "loss": 0.9696, + "step": 959 + }, + { + "epoch": 0.047880299251870324, + "grad_norm": 0.6696960347825631, + "learning_rate": 4.9958439962225036e-05, + "loss": 0.9462, + "step": 960 + }, + { + "epoch": 0.04793017456359102, + "grad_norm": 0.6231797585207305, + "learning_rate": 4.995820687177388e-05, + "loss": 0.959, + "step": 961 + }, + { + "epoch": 0.04798004987531172, + "grad_norm": 0.8012249955271594, + "learning_rate": 4.9957973130049486e-05, + "loss": 0.982, + "step": 962 + }, + { + "epoch": 0.04802992518703242, + "grad_norm": 0.5219173285996406, + "learning_rate": 4.9957738737057945e-05, + "loss": 0.9175, + "step": 963 + }, + { + "epoch": 0.048079800498753114, + "grad_norm": 0.5857756358043334, + "learning_rate": 4.9957503692805384e-05, + "loss": 0.914, + "step": 964 + }, + { + "epoch": 0.04812967581047382, + "grad_norm": 0.5559701315830776, + "learning_rate": 4.995726799729794e-05, + "loss": 0.9481, + "step": 965 + }, + { + "epoch": 0.04817955112219451, + "grad_norm": 0.6714404689206218, + "learning_rate": 4.9957031650541755e-05, + "loss": 0.9805, + "step": 966 + }, + { + "epoch": 0.048229426433915215, + "grad_norm": 0.49207281015262, + "learning_rate": 4.9956794652542994e-05, + "loss": 1.0089, + "step": 967 + }, + { + "epoch": 0.04827930174563591, + "grad_norm": 0.5857917559735074, + "learning_rate": 4.9956557003307844e-05, + "loss": 0.9663, + "step": 968 + }, + { + "epoch": 0.048329177057356607, + "grad_norm": 0.6500877458799768, + "learning_rate": 4.995631870284252e-05, + "loss": 0.9797, + "step": 969 + }, + { + "epoch": 0.04837905236907731, + "grad_norm": 0.6263342255367412, + "learning_rate": 4.995607975115322e-05, + "loss": 0.9122, + "step": 970 + }, + { + "epoch": 0.048428927680798005, + "grad_norm": 0.5442396876054525, + "learning_rate": 4.9955840148246185e-05, + "loss": 0.9211, + "step": 971 + }, + { + "epoch": 0.0484788029925187, + "grad_norm": 0.5883529797516494, + "learning_rate": 4.995559989412767e-05, + "loss": 0.9098, + "step": 972 + }, + { + "epoch": 0.0485286783042394, + "grad_norm": 0.6087711702416694, + "learning_rate": 4.995535898880395e-05, + "loss": 0.9643, + "step": 973 + }, + { + "epoch": 0.0485785536159601, + "grad_norm": 0.5175923498374277, + "learning_rate": 4.9955117432281305e-05, + "loss": 0.9373, + "step": 974 + }, + { + "epoch": 0.048628428927680795, + "grad_norm": 0.6831966353399924, + "learning_rate": 4.9954875224566036e-05, + "loss": 0.9407, + "step": 975 + }, + { + "epoch": 0.0486783042394015, + "grad_norm": 0.7113719121219958, + "learning_rate": 4.995463236566447e-05, + "loss": 0.9818, + "step": 976 + }, + { + "epoch": 0.04872817955112219, + "grad_norm": 0.52437432786071, + "learning_rate": 4.995438885558294e-05, + "loss": 0.9496, + "step": 977 + }, + { + "epoch": 0.048778054862842896, + "grad_norm": 0.5415306948428946, + "learning_rate": 4.99541446943278e-05, + "loss": 0.9266, + "step": 978 + }, + { + "epoch": 0.04882793017456359, + "grad_norm": 0.48125977817469634, + "learning_rate": 4.995389988190542e-05, + "loss": 0.9389, + "step": 979 + }, + { + "epoch": 0.04887780548628429, + "grad_norm": 0.8367134155208416, + "learning_rate": 4.995365441832219e-05, + "loss": 0.9336, + "step": 980 + }, + { + "epoch": 0.04892768079800499, + "grad_norm": 0.6775044119795929, + "learning_rate": 4.995340830358453e-05, + "loss": 0.9922, + "step": 981 + }, + { + "epoch": 0.048977556109725685, + "grad_norm": 0.5477283028065931, + "learning_rate": 4.995316153769883e-05, + "loss": 0.9603, + "step": 982 + }, + { + "epoch": 0.04902743142144638, + "grad_norm": 0.5662894523518894, + "learning_rate": 4.995291412067155e-05, + "loss": 0.942, + "step": 983 + }, + { + "epoch": 0.049077306733167084, + "grad_norm": 0.490841956071452, + "learning_rate": 4.9952666052509145e-05, + "loss": 0.9876, + "step": 984 + }, + { + "epoch": 0.04912718204488778, + "grad_norm": 0.5453152641684522, + "learning_rate": 4.995241733321809e-05, + "loss": 0.9706, + "step": 985 + }, + { + "epoch": 0.04917705735660848, + "grad_norm": 0.5455027592936649, + "learning_rate": 4.995216796280488e-05, + "loss": 0.9478, + "step": 986 + }, + { + "epoch": 0.04922693266832918, + "grad_norm": 0.5866272346311713, + "learning_rate": 4.995191794127599e-05, + "loss": 0.9363, + "step": 987 + }, + { + "epoch": 0.04927680798004987, + "grad_norm": 0.6349231808846311, + "learning_rate": 4.995166726863799e-05, + "loss": 1.0203, + "step": 988 + }, + { + "epoch": 0.049326683291770576, + "grad_norm": 0.5450670891129318, + "learning_rate": 4.99514159448974e-05, + "loss": 0.9484, + "step": 989 + }, + { + "epoch": 0.04937655860349127, + "grad_norm": 0.5152924601371346, + "learning_rate": 4.9951163970060765e-05, + "loss": 0.9323, + "step": 990 + }, + { + "epoch": 0.04942643391521197, + "grad_norm": 0.5771346144360361, + "learning_rate": 4.9950911344134673e-05, + "loss": 0.9821, + "step": 991 + }, + { + "epoch": 0.04947630922693267, + "grad_norm": 0.6185718674288464, + "learning_rate": 4.995065806712572e-05, + "loss": 0.9761, + "step": 992 + }, + { + "epoch": 0.049526184538653366, + "grad_norm": 0.8117808421224699, + "learning_rate": 4.995040413904052e-05, + "loss": 0.94, + "step": 993 + }, + { + "epoch": 0.04957605985037406, + "grad_norm": 0.5420903363056987, + "learning_rate": 4.9950149559885683e-05, + "loss": 0.9523, + "step": 994 + }, + { + "epoch": 0.049625935162094764, + "grad_norm": 0.9998307372716037, + "learning_rate": 4.994989432966786e-05, + "loss": 0.9478, + "step": 995 + }, + { + "epoch": 0.04967581047381546, + "grad_norm": 0.5778558509339365, + "learning_rate": 4.994963844839371e-05, + "loss": 0.9917, + "step": 996 + }, + { + "epoch": 0.04972568578553616, + "grad_norm": 2.339027389027845, + "learning_rate": 4.99493819160699e-05, + "loss": 0.968, + "step": 997 + }, + { + "epoch": 0.04977556109725686, + "grad_norm": 0.5953501771081688, + "learning_rate": 4.9949124732703155e-05, + "loss": 0.9749, + "step": 998 + }, + { + "epoch": 0.049825436408977554, + "grad_norm": 0.8605487412924123, + "learning_rate": 4.994886689830015e-05, + "loss": 0.9275, + "step": 999 + }, + { + "epoch": 0.04987531172069826, + "grad_norm": 0.5923032396811511, + "learning_rate": 4.9948608412867636e-05, + "loss": 0.925, + "step": 1000 + }, + { + "epoch": 0.04992518703241895, + "grad_norm": 0.640796079365727, + "learning_rate": 4.9948349276412355e-05, + "loss": 0.9496, + "step": 1001 + }, + { + "epoch": 0.04997506234413965, + "grad_norm": 0.575990937542128, + "learning_rate": 4.9948089488941055e-05, + "loss": 0.9546, + "step": 1002 + }, + { + "epoch": 0.05002493765586035, + "grad_norm": 0.469198627400431, + "learning_rate": 4.994782905046054e-05, + "loss": 0.9349, + "step": 1003 + }, + { + "epoch": 0.050074812967581046, + "grad_norm": 0.5086594666163888, + "learning_rate": 4.9947567960977573e-05, + "loss": 1.0025, + "step": 1004 + }, + { + "epoch": 0.05012468827930175, + "grad_norm": 0.5567209146552639, + "learning_rate": 4.9947306220499e-05, + "loss": 0.9603, + "step": 1005 + }, + { + "epoch": 0.050174563591022445, + "grad_norm": 0.6097019429421071, + "learning_rate": 4.9947043829031626e-05, + "loss": 0.9727, + "step": 1006 + }, + { + "epoch": 0.05022443890274314, + "grad_norm": 0.5217690447880077, + "learning_rate": 4.994678078658231e-05, + "loss": 0.9654, + "step": 1007 + }, + { + "epoch": 0.05027431421446384, + "grad_norm": 0.9087498958916564, + "learning_rate": 4.994651709315792e-05, + "loss": 0.9499, + "step": 1008 + }, + { + "epoch": 0.05032418952618454, + "grad_norm": 0.6253465273127226, + "learning_rate": 4.994625274876532e-05, + "loss": 0.9637, + "step": 1009 + }, + { + "epoch": 0.050374064837905234, + "grad_norm": 1.0227946072131957, + "learning_rate": 4.994598775341143e-05, + "loss": 0.9981, + "step": 1010 + }, + { + "epoch": 0.05042394014962594, + "grad_norm": 0.6128311214339719, + "learning_rate": 4.994572210710315e-05, + "loss": 0.9214, + "step": 1011 + }, + { + "epoch": 0.05047381546134663, + "grad_norm": 0.8164263432786923, + "learning_rate": 4.9945455809847417e-05, + "loss": 0.9597, + "step": 1012 + }, + { + "epoch": 0.05052369077306733, + "grad_norm": 0.5518105207186819, + "learning_rate": 4.9945188861651175e-05, + "loss": 0.9822, + "step": 1013 + }, + { + "epoch": 0.05057356608478803, + "grad_norm": 0.6035068598945419, + "learning_rate": 4.99449212625214e-05, + "loss": 0.9473, + "step": 1014 + }, + { + "epoch": 0.05062344139650873, + "grad_norm": 0.5917253066493942, + "learning_rate": 4.994465301246506e-05, + "loss": 0.9797, + "step": 1015 + }, + { + "epoch": 0.05067331670822943, + "grad_norm": 0.6358992662996322, + "learning_rate": 4.994438411148917e-05, + "loss": 0.9745, + "step": 1016 + }, + { + "epoch": 0.050723192019950125, + "grad_norm": 0.582499715326855, + "learning_rate": 4.994411455960074e-05, + "loss": 0.9444, + "step": 1017 + }, + { + "epoch": 0.05077306733167082, + "grad_norm": 0.45656706354221077, + "learning_rate": 4.99438443568068e-05, + "loss": 0.9587, + "step": 1018 + }, + { + "epoch": 0.050822942643391524, + "grad_norm": 0.684903683469994, + "learning_rate": 4.99435735031144e-05, + "loss": 0.9871, + "step": 1019 + }, + { + "epoch": 0.05087281795511222, + "grad_norm": 0.7130081919213543, + "learning_rate": 4.994330199853063e-05, + "loss": 0.96, + "step": 1020 + }, + { + "epoch": 0.050922693266832915, + "grad_norm": 0.6431882880865212, + "learning_rate": 4.994302984306255e-05, + "loss": 0.9187, + "step": 1021 + }, + { + "epoch": 0.05097256857855362, + "grad_norm": 0.6100786872738713, + "learning_rate": 4.994275703671727e-05, + "loss": 0.9225, + "step": 1022 + }, + { + "epoch": 0.05102244389027431, + "grad_norm": 0.5290818841284674, + "learning_rate": 4.99424835795019e-05, + "loss": 0.9089, + "step": 1023 + }, + { + "epoch": 0.051072319201995016, + "grad_norm": 0.6266973487014353, + "learning_rate": 4.994220947142358e-05, + "loss": 0.935, + "step": 1024 + }, + { + "epoch": 0.05112219451371571, + "grad_norm": 0.562017273875548, + "learning_rate": 4.994193471248948e-05, + "loss": 0.9412, + "step": 1025 + }, + { + "epoch": 0.05117206982543641, + "grad_norm": 0.5379773649147657, + "learning_rate": 4.994165930270676e-05, + "loss": 0.9025, + "step": 1026 + }, + { + "epoch": 0.05122194513715711, + "grad_norm": 1.1120490672251864, + "learning_rate": 4.994138324208258e-05, + "loss": 0.9536, + "step": 1027 + }, + { + "epoch": 0.051271820448877806, + "grad_norm": 0.5782764424473325, + "learning_rate": 4.994110653062419e-05, + "loss": 0.9874, + "step": 1028 + }, + { + "epoch": 0.0513216957605985, + "grad_norm": 0.5963811990378513, + "learning_rate": 4.994082916833878e-05, + "loss": 1.0188, + "step": 1029 + }, + { + "epoch": 0.051371571072319204, + "grad_norm": 0.760750005716995, + "learning_rate": 4.994055115523359e-05, + "loss": 0.9178, + "step": 1030 + }, + { + "epoch": 0.0514214463840399, + "grad_norm": 1.011949779903545, + "learning_rate": 4.994027249131588e-05, + "loss": 0.9583, + "step": 1031 + }, + { + "epoch": 0.051471321695760595, + "grad_norm": 0.6331360830904552, + "learning_rate": 4.9939993176592926e-05, + "loss": 0.9717, + "step": 1032 + }, + { + "epoch": 0.0515211970074813, + "grad_norm": 0.6177285801237851, + "learning_rate": 4.9939713211072006e-05, + "loss": 1.0003, + "step": 1033 + }, + { + "epoch": 0.051571072319201994, + "grad_norm": 0.5284977808941355, + "learning_rate": 4.993943259476044e-05, + "loss": 0.9911, + "step": 1034 + }, + { + "epoch": 0.051620947630922696, + "grad_norm": 0.645437136297629, + "learning_rate": 4.993915132766553e-05, + "loss": 0.9467, + "step": 1035 + }, + { + "epoch": 0.05167082294264339, + "grad_norm": 0.585242515330079, + "learning_rate": 4.993886940979464e-05, + "loss": 0.9621, + "step": 1036 + }, + { + "epoch": 0.05172069825436409, + "grad_norm": 0.7442675639711394, + "learning_rate": 4.993858684115511e-05, + "loss": 0.9407, + "step": 1037 + }, + { + "epoch": 0.05177057356608479, + "grad_norm": 0.5846583905170915, + "learning_rate": 4.9938303621754314e-05, + "loss": 0.9326, + "step": 1038 + }, + { + "epoch": 0.051820448877805486, + "grad_norm": 0.5472937201167768, + "learning_rate": 4.993801975159964e-05, + "loss": 0.958, + "step": 1039 + }, + { + "epoch": 0.05187032418952618, + "grad_norm": 0.6127203787924429, + "learning_rate": 4.9937735230698514e-05, + "loss": 0.9412, + "step": 1040 + }, + { + "epoch": 0.051920199501246884, + "grad_norm": 0.5915130862276614, + "learning_rate": 4.9937450059058346e-05, + "loss": 0.9623, + "step": 1041 + }, + { + "epoch": 0.05197007481296758, + "grad_norm": 0.6524428650267332, + "learning_rate": 4.9937164236686574e-05, + "loss": 0.9845, + "step": 1042 + }, + { + "epoch": 0.052019950124688276, + "grad_norm": 0.5463484410201607, + "learning_rate": 4.9936877763590664e-05, + "loss": 0.942, + "step": 1043 + }, + { + "epoch": 0.05206982543640898, + "grad_norm": 0.600511380939234, + "learning_rate": 4.993659063977809e-05, + "loss": 0.9615, + "step": 1044 + }, + { + "epoch": 0.052119700748129674, + "grad_norm": 0.6075630814907041, + "learning_rate": 4.993630286525634e-05, + "loss": 0.9713, + "step": 1045 + }, + { + "epoch": 0.05216957605985038, + "grad_norm": 0.5364802131102584, + "learning_rate": 4.993601444003293e-05, + "loss": 0.9482, + "step": 1046 + }, + { + "epoch": 0.05221945137157107, + "grad_norm": 0.5535481245474024, + "learning_rate": 4.993572536411538e-05, + "loss": 0.9725, + "step": 1047 + }, + { + "epoch": 0.05226932668329177, + "grad_norm": 0.5389407992419777, + "learning_rate": 4.993543563751124e-05, + "loss": 0.9454, + "step": 1048 + }, + { + "epoch": 0.05231920199501247, + "grad_norm": 0.5006846122875087, + "learning_rate": 4.993514526022807e-05, + "loss": 0.9506, + "step": 1049 + }, + { + "epoch": 0.05236907730673317, + "grad_norm": 0.6035739062908743, + "learning_rate": 4.993485423227343e-05, + "loss": 0.9748, + "step": 1050 + }, + { + "epoch": 0.05241895261845386, + "grad_norm": 0.5240264350812038, + "learning_rate": 4.993456255365494e-05, + "loss": 0.9865, + "step": 1051 + }, + { + "epoch": 0.052468827930174565, + "grad_norm": 0.6935953445738624, + "learning_rate": 4.99342702243802e-05, + "loss": 0.9362, + "step": 1052 + }, + { + "epoch": 0.05251870324189526, + "grad_norm": 0.49785033994140065, + "learning_rate": 4.993397724445683e-05, + "loss": 0.9485, + "step": 1053 + }, + { + "epoch": 0.05256857855361596, + "grad_norm": 1.260566332456426, + "learning_rate": 4.993368361389249e-05, + "loss": 0.8934, + "step": 1054 + }, + { + "epoch": 0.05261845386533666, + "grad_norm": 0.5289605147369478, + "learning_rate": 4.993338933269483e-05, + "loss": 0.8836, + "step": 1055 + }, + { + "epoch": 0.052668329177057355, + "grad_norm": 0.5217919491909623, + "learning_rate": 4.993309440087154e-05, + "loss": 0.9547, + "step": 1056 + }, + { + "epoch": 0.05271820448877806, + "grad_norm": 0.5420542804894125, + "learning_rate": 4.99327988184303e-05, + "loss": 0.9152, + "step": 1057 + }, + { + "epoch": 0.05276807980049875, + "grad_norm": 0.7003120658577693, + "learning_rate": 4.993250258537884e-05, + "loss": 0.974, + "step": 1058 + }, + { + "epoch": 0.05281795511221945, + "grad_norm": 0.5884072546849305, + "learning_rate": 4.993220570172489e-05, + "loss": 0.9556, + "step": 1059 + }, + { + "epoch": 0.05286783042394015, + "grad_norm": 0.7122464467586139, + "learning_rate": 4.9931908167476185e-05, + "loss": 0.9439, + "step": 1060 + }, + { + "epoch": 0.05291770573566085, + "grad_norm": 0.6356932332080081, + "learning_rate": 4.993160998264049e-05, + "loss": 0.9923, + "step": 1061 + }, + { + "epoch": 0.05296758104738154, + "grad_norm": 0.6174612768575815, + "learning_rate": 4.993131114722559e-05, + "loss": 0.9201, + "step": 1062 + }, + { + "epoch": 0.053017456359102245, + "grad_norm": 0.6249199407847729, + "learning_rate": 4.993101166123928e-05, + "loss": 0.9212, + "step": 1063 + }, + { + "epoch": 0.05306733167082294, + "grad_norm": 0.5278347781151589, + "learning_rate": 4.993071152468939e-05, + "loss": 0.9679, + "step": 1064 + }, + { + "epoch": 0.053117206982543644, + "grad_norm": 0.5811608865955284, + "learning_rate": 4.9930410737583734e-05, + "loss": 0.8846, + "step": 1065 + }, + { + "epoch": 0.05316708229426434, + "grad_norm": 0.5952793040157394, + "learning_rate": 4.9930109299930175e-05, + "loss": 0.9424, + "step": 1066 + }, + { + "epoch": 0.053216957605985035, + "grad_norm": 0.503968689950385, + "learning_rate": 4.9929807211736556e-05, + "loss": 0.9812, + "step": 1067 + }, + { + "epoch": 0.05326683291770574, + "grad_norm": 0.6298080297903228, + "learning_rate": 4.992950447301079e-05, + "loss": 0.946, + "step": 1068 + }, + { + "epoch": 0.053316708229426434, + "grad_norm": 0.6850942474242019, + "learning_rate": 4.992920108376075e-05, + "loss": 0.9813, + "step": 1069 + }, + { + "epoch": 0.05336658354114713, + "grad_norm": 0.583340242305425, + "learning_rate": 4.992889704399437e-05, + "loss": 0.9578, + "step": 1070 + }, + { + "epoch": 0.05341645885286783, + "grad_norm": 0.5322531386421931, + "learning_rate": 4.992859235371958e-05, + "loss": 0.9363, + "step": 1071 + }, + { + "epoch": 0.05346633416458853, + "grad_norm": 0.7428157137135306, + "learning_rate": 4.992828701294433e-05, + "loss": 0.9634, + "step": 1072 + }, + { + "epoch": 0.05351620947630923, + "grad_norm": 0.9955019540143981, + "learning_rate": 4.9927981021676585e-05, + "loss": 0.9801, + "step": 1073 + }, + { + "epoch": 0.053566084788029926, + "grad_norm": 0.5278909313451903, + "learning_rate": 4.9927674379924325e-05, + "loss": 0.9861, + "step": 1074 + }, + { + "epoch": 0.05361596009975062, + "grad_norm": 0.6039667583891174, + "learning_rate": 4.9927367087695566e-05, + "loss": 0.9564, + "step": 1075 + }, + { + "epoch": 0.053665835411471324, + "grad_norm": 0.5636197096021079, + "learning_rate": 4.992705914499832e-05, + "loss": 0.9627, + "step": 1076 + }, + { + "epoch": 0.05371571072319202, + "grad_norm": 0.5681961228891154, + "learning_rate": 4.992675055184062e-05, + "loss": 0.9405, + "step": 1077 + }, + { + "epoch": 0.053765586034912716, + "grad_norm": 0.6275112101521029, + "learning_rate": 4.992644130823051e-05, + "loss": 0.959, + "step": 1078 + }, + { + "epoch": 0.05381546134663342, + "grad_norm": 0.6814179141061105, + "learning_rate": 4.992613141417608e-05, + "loss": 0.9663, + "step": 1079 + }, + { + "epoch": 0.053865336658354114, + "grad_norm": 0.9240906810967128, + "learning_rate": 4.9925820869685404e-05, + "loss": 0.9707, + "step": 1080 + }, + { + "epoch": 0.05391521197007481, + "grad_norm": 0.5828748016903266, + "learning_rate": 4.992550967476659e-05, + "loss": 0.9127, + "step": 1081 + }, + { + "epoch": 0.05396508728179551, + "grad_norm": 0.6868816901421742, + "learning_rate": 4.992519782942775e-05, + "loss": 0.9403, + "step": 1082 + }, + { + "epoch": 0.05401496259351621, + "grad_norm": 0.6576496252634296, + "learning_rate": 4.9924885333677035e-05, + "loss": 0.9408, + "step": 1083 + }, + { + "epoch": 0.05406483790523691, + "grad_norm": 0.8835881547985361, + "learning_rate": 4.992457218752259e-05, + "loss": 0.9924, + "step": 1084 + }, + { + "epoch": 0.054114713216957606, + "grad_norm": 0.6450812845111684, + "learning_rate": 4.992425839097259e-05, + "loss": 0.9095, + "step": 1085 + }, + { + "epoch": 0.0541645885286783, + "grad_norm": 0.6069030672408074, + "learning_rate": 4.992394394403522e-05, + "loss": 0.9608, + "step": 1086 + }, + { + "epoch": 0.054214463840399005, + "grad_norm": 0.6557222083124667, + "learning_rate": 4.9923628846718685e-05, + "loss": 0.9831, + "step": 1087 + }, + { + "epoch": 0.0542643391521197, + "grad_norm": 0.592027565532143, + "learning_rate": 4.992331309903121e-05, + "loss": 0.9941, + "step": 1088 + }, + { + "epoch": 0.054314214463840396, + "grad_norm": 0.6858096786735499, + "learning_rate": 4.9922996700981033e-05, + "loss": 0.9797, + "step": 1089 + }, + { + "epoch": 0.0543640897755611, + "grad_norm": 0.6701355116537427, + "learning_rate": 4.9922679652576414e-05, + "loss": 0.9717, + "step": 1090 + }, + { + "epoch": 0.054413965087281795, + "grad_norm": 0.5616555755747896, + "learning_rate": 4.992236195382562e-05, + "loss": 0.9594, + "step": 1091 + }, + { + "epoch": 0.0544638403990025, + "grad_norm": 0.5691254327330539, + "learning_rate": 4.992204360473695e-05, + "loss": 0.9165, + "step": 1092 + }, + { + "epoch": 0.05451371571072319, + "grad_norm": 0.6033425166741402, + "learning_rate": 4.9921724605318705e-05, + "loss": 0.9599, + "step": 1093 + }, + { + "epoch": 0.05456359102244389, + "grad_norm": 0.5210414781237817, + "learning_rate": 4.992140495557921e-05, + "loss": 0.9465, + "step": 1094 + }, + { + "epoch": 0.05461346633416459, + "grad_norm": 0.6762702476247244, + "learning_rate": 4.99210846555268e-05, + "loss": 0.9644, + "step": 1095 + }, + { + "epoch": 0.05466334164588529, + "grad_norm": 0.6400102082637606, + "learning_rate": 4.992076370516985e-05, + "loss": 0.9921, + "step": 1096 + }, + { + "epoch": 0.05471321695760598, + "grad_norm": 0.6383640340765017, + "learning_rate": 4.9920442104516716e-05, + "loss": 0.9372, + "step": 1097 + }, + { + "epoch": 0.054763092269326685, + "grad_norm": 0.6297804826232918, + "learning_rate": 4.99201198535758e-05, + "loss": 0.946, + "step": 1098 + }, + { + "epoch": 0.05481296758104738, + "grad_norm": 0.7132736794271398, + "learning_rate": 4.99197969523555e-05, + "loss": 0.9734, + "step": 1099 + }, + { + "epoch": 0.05486284289276808, + "grad_norm": 0.6136692262454758, + "learning_rate": 4.9919473400864266e-05, + "loss": 0.9827, + "step": 1100 + }, + { + "epoch": 0.05491271820448878, + "grad_norm": 0.6385905200875996, + "learning_rate": 4.991914919911052e-05, + "loss": 0.9691, + "step": 1101 + }, + { + "epoch": 0.054962593516209475, + "grad_norm": 0.7679019998090914, + "learning_rate": 4.9918824347102724e-05, + "loss": 0.9324, + "step": 1102 + }, + { + "epoch": 0.05501246882793018, + "grad_norm": 0.5810354527099293, + "learning_rate": 4.991849884484937e-05, + "loss": 0.8933, + "step": 1103 + }, + { + "epoch": 0.05506234413965087, + "grad_norm": 0.613722392186837, + "learning_rate": 4.991817269235893e-05, + "loss": 0.9715, + "step": 1104 + }, + { + "epoch": 0.05511221945137157, + "grad_norm": 0.5857891034270317, + "learning_rate": 4.991784588963993e-05, + "loss": 0.9974, + "step": 1105 + }, + { + "epoch": 0.05516209476309227, + "grad_norm": 0.878099062862141, + "learning_rate": 4.9917518436700894e-05, + "loss": 0.9745, + "step": 1106 + }, + { + "epoch": 0.05521197007481297, + "grad_norm": 1.4080498182895556, + "learning_rate": 4.991719033355037e-05, + "loss": 0.9328, + "step": 1107 + }, + { + "epoch": 0.05526184538653366, + "grad_norm": 0.7096528482173888, + "learning_rate": 4.9916861580196904e-05, + "loss": 0.9368, + "step": 1108 + }, + { + "epoch": 0.055311720698254366, + "grad_norm": 0.4857436453709342, + "learning_rate": 4.991653217664909e-05, + "loss": 0.9097, + "step": 1109 + }, + { + "epoch": 0.05536159600997506, + "grad_norm": 0.6677711613641206, + "learning_rate": 4.9916202122915526e-05, + "loss": 0.9447, + "step": 1110 + }, + { + "epoch": 0.05541147132169576, + "grad_norm": 0.5706169281549437, + "learning_rate": 4.991587141900481e-05, + "loss": 0.9546, + "step": 1111 + }, + { + "epoch": 0.05546134663341646, + "grad_norm": 0.9437446121331742, + "learning_rate": 4.991554006492559e-05, + "loss": 0.9455, + "step": 1112 + }, + { + "epoch": 0.055511221945137156, + "grad_norm": 2.1564240690499323, + "learning_rate": 4.9915208060686504e-05, + "loss": 0.9788, + "step": 1113 + }, + { + "epoch": 0.05556109725685786, + "grad_norm": 0.7141574687195973, + "learning_rate": 4.99148754062962e-05, + "loss": 1.0072, + "step": 1114 + }, + { + "epoch": 0.055610972568578554, + "grad_norm": 0.6118057287368831, + "learning_rate": 4.9914542101763385e-05, + "loss": 0.9867, + "step": 1115 + }, + { + "epoch": 0.05566084788029925, + "grad_norm": 1.1721440174417834, + "learning_rate": 4.9914208147096744e-05, + "loss": 0.9394, + "step": 1116 + }, + { + "epoch": 0.05571072319201995, + "grad_norm": 0.5719031405587359, + "learning_rate": 4.991387354230499e-05, + "loss": 0.9394, + "step": 1117 + }, + { + "epoch": 0.05576059850374065, + "grad_norm": 0.5681670883755816, + "learning_rate": 4.991353828739685e-05, + "loss": 0.9135, + "step": 1118 + }, + { + "epoch": 0.055810473815461344, + "grad_norm": 1.1283760779064478, + "learning_rate": 4.991320238238108e-05, + "loss": 0.9068, + "step": 1119 + }, + { + "epoch": 0.055860349127182046, + "grad_norm": 0.6742981750683704, + "learning_rate": 4.991286582726644e-05, + "loss": 0.9567, + "step": 1120 + }, + { + "epoch": 0.05591022443890274, + "grad_norm": 0.6348459531032139, + "learning_rate": 4.9912528622061725e-05, + "loss": 0.985, + "step": 1121 + }, + { + "epoch": 0.055960099750623445, + "grad_norm": 0.523133366411906, + "learning_rate": 4.991219076677572e-05, + "loss": 0.9558, + "step": 1122 + }, + { + "epoch": 0.05600997506234414, + "grad_norm": 0.5543117634866371, + "learning_rate": 4.991185226141725e-05, + "loss": 0.9402, + "step": 1123 + }, + { + "epoch": 0.056059850374064836, + "grad_norm": 0.844455311461443, + "learning_rate": 4.991151310599514e-05, + "loss": 0.94, + "step": 1124 + }, + { + "epoch": 0.05610972568578554, + "grad_norm": 0.5260806416655279, + "learning_rate": 4.991117330051825e-05, + "loss": 0.9832, + "step": 1125 + }, + { + "epoch": 0.056159600997506234, + "grad_norm": 0.5786655235935645, + "learning_rate": 4.991083284499544e-05, + "loss": 0.937, + "step": 1126 + }, + { + "epoch": 0.05620947630922693, + "grad_norm": 0.6787033656813424, + "learning_rate": 4.99104917394356e-05, + "loss": 0.9491, + "step": 1127 + }, + { + "epoch": 0.05625935162094763, + "grad_norm": 0.5324370114091712, + "learning_rate": 4.991014998384761e-05, + "loss": 0.9387, + "step": 1128 + }, + { + "epoch": 0.05630922693266833, + "grad_norm": 0.845211722298692, + "learning_rate": 4.990980757824042e-05, + "loss": 0.9545, + "step": 1129 + }, + { + "epoch": 0.056359102244389024, + "grad_norm": 0.8374497647485278, + "learning_rate": 4.990946452262294e-05, + "loss": 0.9331, + "step": 1130 + }, + { + "epoch": 0.05640897755610973, + "grad_norm": 0.5408440642791508, + "learning_rate": 4.990912081700413e-05, + "loss": 0.9654, + "step": 1131 + }, + { + "epoch": 0.05645885286783042, + "grad_norm": 0.7460323888002198, + "learning_rate": 4.990877646139297e-05, + "loss": 0.9521, + "step": 1132 + }, + { + "epoch": 0.056508728179551125, + "grad_norm": 0.5697287620878962, + "learning_rate": 4.9908431455798434e-05, + "loss": 0.9699, + "step": 1133 + }, + { + "epoch": 0.05655860349127182, + "grad_norm": 0.5683511353520709, + "learning_rate": 4.990808580022952e-05, + "loss": 0.9529, + "step": 1134 + }, + { + "epoch": 0.056608478802992516, + "grad_norm": 0.5494026012059758, + "learning_rate": 4.990773949469526e-05, + "loss": 0.9687, + "step": 1135 + }, + { + "epoch": 0.05665835411471322, + "grad_norm": 0.5840699233756826, + "learning_rate": 4.990739253920468e-05, + "loss": 0.9522, + "step": 1136 + }, + { + "epoch": 0.056708229426433915, + "grad_norm": 0.6895407411864873, + "learning_rate": 4.9907044933766846e-05, + "loss": 0.9949, + "step": 1137 + }, + { + "epoch": 0.05675810473815461, + "grad_norm": 0.9111639914484848, + "learning_rate": 4.990669667839082e-05, + "loss": 0.9403, + "step": 1138 + }, + { + "epoch": 0.05680798004987531, + "grad_norm": 0.5392430767370474, + "learning_rate": 4.990634777308568e-05, + "loss": 0.9364, + "step": 1139 + }, + { + "epoch": 0.05685785536159601, + "grad_norm": 0.535476940222611, + "learning_rate": 4.9905998217860554e-05, + "loss": 0.9105, + "step": 1140 + }, + { + "epoch": 0.05690773067331671, + "grad_norm": 0.538156349169334, + "learning_rate": 4.9905648012724546e-05, + "loss": 0.9666, + "step": 1141 + }, + { + "epoch": 0.05695760598503741, + "grad_norm": 0.5743418682939911, + "learning_rate": 4.99052971576868e-05, + "loss": 0.9533, + "step": 1142 + }, + { + "epoch": 0.0570074812967581, + "grad_norm": 0.5636392050313658, + "learning_rate": 4.9904945652756475e-05, + "loss": 0.9454, + "step": 1143 + }, + { + "epoch": 0.057057356608478806, + "grad_norm": 0.5338428693303727, + "learning_rate": 4.9904593497942735e-05, + "loss": 0.9165, + "step": 1144 + }, + { + "epoch": 0.0571072319201995, + "grad_norm": 0.5465944633298307, + "learning_rate": 4.990424069325478e-05, + "loss": 0.8983, + "step": 1145 + }, + { + "epoch": 0.0571571072319202, + "grad_norm": 0.48191119800576576, + "learning_rate": 4.99038872387018e-05, + "loss": 0.9184, + "step": 1146 + }, + { + "epoch": 0.0572069825436409, + "grad_norm": 0.6896130283309716, + "learning_rate": 4.990353313429303e-05, + "loss": 0.8938, + "step": 1147 + }, + { + "epoch": 0.057256857855361595, + "grad_norm": 0.6433780414871598, + "learning_rate": 4.990317838003771e-05, + "loss": 0.9882, + "step": 1148 + }, + { + "epoch": 0.05730673316708229, + "grad_norm": 0.5236614122761967, + "learning_rate": 4.990282297594509e-05, + "loss": 0.95, + "step": 1149 + }, + { + "epoch": 0.057356608478802994, + "grad_norm": 0.5347266702146068, + "learning_rate": 4.990246692202446e-05, + "loss": 0.9532, + "step": 1150 + }, + { + "epoch": 0.05740648379052369, + "grad_norm": 0.5519098197272881, + "learning_rate": 4.99021102182851e-05, + "loss": 0.9548, + "step": 1151 + }, + { + "epoch": 0.05745635910224439, + "grad_norm": 0.6040121623598399, + "learning_rate": 4.9901752864736315e-05, + "loss": 0.9497, + "step": 1152 + }, + { + "epoch": 0.05750623441396509, + "grad_norm": 0.5926355041233667, + "learning_rate": 4.990139486138743e-05, + "loss": 0.9701, + "step": 1153 + }, + { + "epoch": 0.05755610972568578, + "grad_norm": 0.7912622386693451, + "learning_rate": 4.9901036208247794e-05, + "loss": 1.0109, + "step": 1154 + }, + { + "epoch": 0.057605985037406486, + "grad_norm": 0.6297740111334815, + "learning_rate": 4.990067690532676e-05, + "loss": 0.9184, + "step": 1155 + }, + { + "epoch": 0.05765586034912718, + "grad_norm": 0.5662539122696412, + "learning_rate": 4.990031695263371e-05, + "loss": 0.9871, + "step": 1156 + }, + { + "epoch": 0.05770573566084788, + "grad_norm": 0.6023264302599639, + "learning_rate": 4.989995635017803e-05, + "loss": 1.0015, + "step": 1157 + }, + { + "epoch": 0.05775561097256858, + "grad_norm": 0.5120911765633108, + "learning_rate": 4.989959509796913e-05, + "loss": 0.9981, + "step": 1158 + }, + { + "epoch": 0.057805486284289276, + "grad_norm": 0.7495526128127634, + "learning_rate": 4.989923319601645e-05, + "loss": 0.9606, + "step": 1159 + }, + { + "epoch": 0.05785536159600998, + "grad_norm": 0.594116683904021, + "learning_rate": 4.989887064432941e-05, + "loss": 0.9085, + "step": 1160 + }, + { + "epoch": 0.057905236907730674, + "grad_norm": 0.604186751773846, + "learning_rate": 4.9898507442917495e-05, + "loss": 0.9241, + "step": 1161 + }, + { + "epoch": 0.05795511221945137, + "grad_norm": 0.7476198795217108, + "learning_rate": 4.989814359179017e-05, + "loss": 0.9311, + "step": 1162 + }, + { + "epoch": 0.05800498753117207, + "grad_norm": 0.5533814682115498, + "learning_rate": 4.989777909095692e-05, + "loss": 0.9369, + "step": 1163 + }, + { + "epoch": 0.05805486284289277, + "grad_norm": 0.836672066019084, + "learning_rate": 4.989741394042727e-05, + "loss": 0.9624, + "step": 1164 + }, + { + "epoch": 0.058104738154613464, + "grad_norm": 0.6105297734904499, + "learning_rate": 4.989704814021076e-05, + "loss": 0.9907, + "step": 1165 + }, + { + "epoch": 0.05815461346633417, + "grad_norm": 0.6214099903453992, + "learning_rate": 4.989668169031691e-05, + "loss": 1.0388, + "step": 1166 + }, + { + "epoch": 0.05820448877805486, + "grad_norm": 0.7381752313021123, + "learning_rate": 4.989631459075529e-05, + "loss": 0.9733, + "step": 1167 + }, + { + "epoch": 0.05825436408977556, + "grad_norm": 0.6331082056603984, + "learning_rate": 4.989594684153549e-05, + "loss": 0.9619, + "step": 1168 + }, + { + "epoch": 0.05830423940149626, + "grad_norm": 0.5207183593902434, + "learning_rate": 4.98955784426671e-05, + "loss": 0.9327, + "step": 1169 + }, + { + "epoch": 0.058354114713216956, + "grad_norm": 0.5649866193034204, + "learning_rate": 4.989520939415973e-05, + "loss": 0.9344, + "step": 1170 + }, + { + "epoch": 0.05840399002493766, + "grad_norm": 0.729426286553499, + "learning_rate": 4.989483969602301e-05, + "loss": 0.9668, + "step": 1171 + }, + { + "epoch": 0.058453865336658355, + "grad_norm": 0.6178899114711653, + "learning_rate": 4.989446934826659e-05, + "loss": 0.9991, + "step": 1172 + }, + { + "epoch": 0.05850374064837905, + "grad_norm": 0.5357055507659813, + "learning_rate": 4.9894098350900144e-05, + "loss": 0.9591, + "step": 1173 + }, + { + "epoch": 0.05855361596009975, + "grad_norm": 0.5909093700984692, + "learning_rate": 4.989372670393333e-05, + "loss": 0.956, + "step": 1174 + }, + { + "epoch": 0.05860349127182045, + "grad_norm": 0.6081234514276662, + "learning_rate": 4.989335440737586e-05, + "loss": 0.9541, + "step": 1175 + }, + { + "epoch": 0.058653366583541144, + "grad_norm": 0.6556440327942947, + "learning_rate": 4.989298146123745e-05, + "loss": 0.9745, + "step": 1176 + }, + { + "epoch": 0.05870324189526185, + "grad_norm": 0.6587289810442492, + "learning_rate": 4.9892607865527834e-05, + "loss": 0.9761, + "step": 1177 + }, + { + "epoch": 0.05875311720698254, + "grad_norm": 0.6122434438054968, + "learning_rate": 4.9892233620256755e-05, + "loss": 0.9409, + "step": 1178 + }, + { + "epoch": 0.05880299251870324, + "grad_norm": 0.5541816240810234, + "learning_rate": 4.9891858725433976e-05, + "loss": 0.9616, + "step": 1179 + }, + { + "epoch": 0.05885286783042394, + "grad_norm": 0.6965789332413582, + "learning_rate": 4.9891483181069286e-05, + "loss": 0.994, + "step": 1180 + }, + { + "epoch": 0.05890274314214464, + "grad_norm": 0.5209374417479031, + "learning_rate": 4.989110698717248e-05, + "loss": 0.9727, + "step": 1181 + }, + { + "epoch": 0.05895261845386534, + "grad_norm": 0.5149400356747165, + "learning_rate": 4.989073014375338e-05, + "loss": 0.9412, + "step": 1182 + }, + { + "epoch": 0.059002493765586035, + "grad_norm": 0.7299114397984757, + "learning_rate": 4.989035265082181e-05, + "loss": 0.9791, + "step": 1183 + }, + { + "epoch": 0.05905236907730673, + "grad_norm": 0.5208523760176506, + "learning_rate": 4.988997450838764e-05, + "loss": 0.9575, + "step": 1184 + }, + { + "epoch": 0.059102244389027433, + "grad_norm": 0.46987186325013164, + "learning_rate": 4.9889595716460715e-05, + "loss": 0.9263, + "step": 1185 + }, + { + "epoch": 0.05915211970074813, + "grad_norm": 0.557575938023096, + "learning_rate": 4.988921627505093e-05, + "loss": 0.9264, + "step": 1186 + }, + { + "epoch": 0.059201995012468825, + "grad_norm": 0.5559899630896806, + "learning_rate": 4.9888836184168186e-05, + "loss": 0.9413, + "step": 1187 + }, + { + "epoch": 0.05925187032418953, + "grad_norm": 0.6831878347305186, + "learning_rate": 4.988845544382239e-05, + "loss": 1.0105, + "step": 1188 + }, + { + "epoch": 0.05930174563591022, + "grad_norm": 0.5465850536450751, + "learning_rate": 4.98880740540235e-05, + "loss": 0.9132, + "step": 1189 + }, + { + "epoch": 0.059351620947630926, + "grad_norm": 0.5748453640095563, + "learning_rate": 4.988769201478146e-05, + "loss": 0.941, + "step": 1190 + }, + { + "epoch": 0.05940149625935162, + "grad_norm": 0.6102224239363866, + "learning_rate": 4.988730932610623e-05, + "loss": 0.9772, + "step": 1191 + }, + { + "epoch": 0.05945137157107232, + "grad_norm": 0.6120223101587041, + "learning_rate": 4.98869259880078e-05, + "loss": 0.99, + "step": 1192 + }, + { + "epoch": 0.05950124688279302, + "grad_norm": 0.6170918267386301, + "learning_rate": 4.988654200049617e-05, + "loss": 0.9646, + "step": 1193 + }, + { + "epoch": 0.059551122194513716, + "grad_norm": 0.5455990467900586, + "learning_rate": 4.988615736358138e-05, + "loss": 0.9573, + "step": 1194 + }, + { + "epoch": 0.05960099750623441, + "grad_norm": 0.517073639962138, + "learning_rate": 4.9885772077273426e-05, + "loss": 0.9704, + "step": 1195 + }, + { + "epoch": 0.059650872817955114, + "grad_norm": 0.6077832930395533, + "learning_rate": 4.988538614158241e-05, + "loss": 0.9352, + "step": 1196 + }, + { + "epoch": 0.05970074812967581, + "grad_norm": 0.5609754022124787, + "learning_rate": 4.988499955651837e-05, + "loss": 0.9515, + "step": 1197 + }, + { + "epoch": 0.059750623441396505, + "grad_norm": 0.8081999584181783, + "learning_rate": 4.9884612322091406e-05, + "loss": 0.9709, + "step": 1198 + }, + { + "epoch": 0.05980049875311721, + "grad_norm": 0.6541760317210888, + "learning_rate": 4.988422443831162e-05, + "loss": 0.9932, + "step": 1199 + }, + { + "epoch": 0.059850374064837904, + "grad_norm": 0.6771351641539817, + "learning_rate": 4.988383590518913e-05, + "loss": 0.9789, + "step": 1200 + }, + { + "epoch": 0.059900249376558606, + "grad_norm": 0.5510604057783666, + "learning_rate": 4.9883446722734084e-05, + "loss": 0.9247, + "step": 1201 + }, + { + "epoch": 0.0599501246882793, + "grad_norm": 1.3274237384563305, + "learning_rate": 4.988305689095662e-05, + "loss": 1.0138, + "step": 1202 + }, + { + "epoch": 0.06, + "grad_norm": 0.7118659077784354, + "learning_rate": 4.988266640986694e-05, + "loss": 0.9732, + "step": 1203 + }, + { + "epoch": 0.0600498753117207, + "grad_norm": 0.588565535970083, + "learning_rate": 4.988227527947521e-05, + "loss": 0.9518, + "step": 1204 + }, + { + "epoch": 0.060099750623441396, + "grad_norm": 0.4687257713613307, + "learning_rate": 4.9881883499791636e-05, + "loss": 0.9282, + "step": 1205 + }, + { + "epoch": 0.06014962593516209, + "grad_norm": 1.075607750469402, + "learning_rate": 4.988149107082646e-05, + "loss": 1.008, + "step": 1206 + }, + { + "epoch": 0.060199501246882794, + "grad_norm": 0.6724944741108851, + "learning_rate": 4.98810979925899e-05, + "loss": 0.97, + "step": 1207 + }, + { + "epoch": 0.06024937655860349, + "grad_norm": 0.49520143090141483, + "learning_rate": 4.9880704265092225e-05, + "loss": 0.9314, + "step": 1208 + }, + { + "epoch": 0.06029925187032419, + "grad_norm": 0.5279079635561776, + "learning_rate": 4.988030988834371e-05, + "loss": 0.9613, + "step": 1209 + }, + { + "epoch": 0.06034912718204489, + "grad_norm": 0.5889753319758342, + "learning_rate": 4.9879914862354636e-05, + "loss": 0.96, + "step": 1210 + }, + { + "epoch": 0.060399002493765584, + "grad_norm": 0.6134286350712965, + "learning_rate": 4.987951918713533e-05, + "loss": 0.9828, + "step": 1211 + }, + { + "epoch": 0.06044887780548629, + "grad_norm": 0.5034934586646698, + "learning_rate": 4.987912286269609e-05, + "loss": 0.9637, + "step": 1212 + }, + { + "epoch": 0.06049875311720698, + "grad_norm": 0.7642294030260058, + "learning_rate": 4.987872588904728e-05, + "loss": 0.9922, + "step": 1213 + }, + { + "epoch": 0.06054862842892768, + "grad_norm": 0.6578212610450697, + "learning_rate": 4.9878328266199257e-05, + "loss": 0.9864, + "step": 1214 + }, + { + "epoch": 0.06059850374064838, + "grad_norm": 0.841466725671512, + "learning_rate": 4.98779299941624e-05, + "loss": 0.9978, + "step": 1215 + }, + { + "epoch": 0.06064837905236908, + "grad_norm": 0.7039826570733747, + "learning_rate": 4.9877531072947083e-05, + "loss": 0.9888, + "step": 1216 + }, + { + "epoch": 0.06069825436408977, + "grad_norm": 0.5422445727456023, + "learning_rate": 4.9877131502563725e-05, + "loss": 0.9783, + "step": 1217 + }, + { + "epoch": 0.060748129675810475, + "grad_norm": 0.6395309225655869, + "learning_rate": 4.987673128302275e-05, + "loss": 0.9413, + "step": 1218 + }, + { + "epoch": 0.06079800498753117, + "grad_norm": 0.5398308430322171, + "learning_rate": 4.9876330414334614e-05, + "loss": 0.9335, + "step": 1219 + }, + { + "epoch": 0.06084788029925187, + "grad_norm": 0.6336433840101772, + "learning_rate": 4.987592889650977e-05, + "loss": 0.9343, + "step": 1220 + }, + { + "epoch": 0.06089775561097257, + "grad_norm": 0.6047758617932444, + "learning_rate": 4.987552672955869e-05, + "loss": 1.0037, + "step": 1221 + }, + { + "epoch": 0.060947630922693265, + "grad_norm": 0.5895094744797705, + "learning_rate": 4.987512391349187e-05, + "loss": 0.952, + "step": 1222 + }, + { + "epoch": 0.06099750623441397, + "grad_norm": 1.359908331710762, + "learning_rate": 4.9874720448319835e-05, + "loss": 0.9395, + "step": 1223 + }, + { + "epoch": 0.06104738154613466, + "grad_norm": 0.6436013338067919, + "learning_rate": 4.987431633405309e-05, + "loss": 0.9556, + "step": 1224 + }, + { + "epoch": 0.06109725685785536, + "grad_norm": 0.5685533087909577, + "learning_rate": 4.9873911570702204e-05, + "loss": 0.9563, + "step": 1225 + }, + { + "epoch": 0.06114713216957606, + "grad_norm": 0.6274745653608832, + "learning_rate": 4.9873506158277714e-05, + "loss": 0.9595, + "step": 1226 + }, + { + "epoch": 0.06119700748129676, + "grad_norm": 1.4425270258169098, + "learning_rate": 4.9873100096790226e-05, + "loss": 0.9625, + "step": 1227 + }, + { + "epoch": 0.06124688279301746, + "grad_norm": 0.6207907459737049, + "learning_rate": 4.987269338625031e-05, + "loss": 0.9597, + "step": 1228 + }, + { + "epoch": 0.061296758104738155, + "grad_norm": 0.5323936965998165, + "learning_rate": 4.9872286026668604e-05, + "loss": 0.9535, + "step": 1229 + }, + { + "epoch": 0.06134663341645885, + "grad_norm": 0.6733464047253338, + "learning_rate": 4.987187801805572e-05, + "loss": 0.9463, + "step": 1230 + }, + { + "epoch": 0.061396508728179554, + "grad_norm": 0.6233672759465626, + "learning_rate": 4.9871469360422304e-05, + "loss": 0.9753, + "step": 1231 + }, + { + "epoch": 0.06144638403990025, + "grad_norm": 0.9027917618600975, + "learning_rate": 4.987106005377904e-05, + "loss": 0.961, + "step": 1232 + }, + { + "epoch": 0.061496259351620945, + "grad_norm": 0.6228871841907744, + "learning_rate": 4.987065009813659e-05, + "loss": 0.9561, + "step": 1233 + }, + { + "epoch": 0.06154613466334165, + "grad_norm": 0.5410966675802428, + "learning_rate": 4.987023949350565e-05, + "loss": 0.8991, + "step": 1234 + }, + { + "epoch": 0.061596009975062344, + "grad_norm": 0.5748540296323545, + "learning_rate": 4.986982823989694e-05, + "loss": 0.9232, + "step": 1235 + }, + { + "epoch": 0.06164588528678304, + "grad_norm": 0.5909795890311754, + "learning_rate": 4.98694163373212e-05, + "loss": 0.9595, + "step": 1236 + }, + { + "epoch": 0.06169576059850374, + "grad_norm": 0.6588852773325664, + "learning_rate": 4.986900378578917e-05, + "loss": 1.0029, + "step": 1237 + }, + { + "epoch": 0.06174563591022444, + "grad_norm": 0.7124981401590961, + "learning_rate": 4.9868590585311616e-05, + "loss": 0.9562, + "step": 1238 + }, + { + "epoch": 0.06179551122194514, + "grad_norm": 0.5410512973269404, + "learning_rate": 4.9868176735899317e-05, + "loss": 0.9299, + "step": 1239 + }, + { + "epoch": 0.061845386533665836, + "grad_norm": 0.5715747595467042, + "learning_rate": 4.9867762237563075e-05, + "loss": 0.9084, + "step": 1240 + }, + { + "epoch": 0.06189526184538653, + "grad_norm": 0.7478820353727663, + "learning_rate": 4.986734709031371e-05, + "loss": 0.9618, + "step": 1241 + }, + { + "epoch": 0.061945137157107234, + "grad_norm": 0.7087441712701797, + "learning_rate": 4.986693129416205e-05, + "loss": 0.9685, + "step": 1242 + }, + { + "epoch": 0.06199501246882793, + "grad_norm": 0.5640016820584633, + "learning_rate": 4.986651484911895e-05, + "loss": 0.956, + "step": 1243 + }, + { + "epoch": 0.062044887780548626, + "grad_norm": 0.813536878843417, + "learning_rate": 4.986609775519527e-05, + "loss": 0.9922, + "step": 1244 + }, + { + "epoch": 0.06209476309226933, + "grad_norm": 0.6003136856804661, + "learning_rate": 4.9865680012401895e-05, + "loss": 0.959, + "step": 1245 + }, + { + "epoch": 0.062144638403990024, + "grad_norm": 0.5374790099248875, + "learning_rate": 4.9865261620749734e-05, + "loss": 0.9555, + "step": 1246 + }, + { + "epoch": 0.06219451371571072, + "grad_norm": 0.5697111465125527, + "learning_rate": 4.98648425802497e-05, + "loss": 0.9836, + "step": 1247 + }, + { + "epoch": 0.06224438902743142, + "grad_norm": 0.5696332675128292, + "learning_rate": 4.986442289091272e-05, + "loss": 0.9494, + "step": 1248 + }, + { + "epoch": 0.06229426433915212, + "grad_norm": 0.5646168159444551, + "learning_rate": 4.9864002552749755e-05, + "loss": 0.9588, + "step": 1249 + }, + { + "epoch": 0.06234413965087282, + "grad_norm": 0.7349527781854978, + "learning_rate": 4.986358156577178e-05, + "loss": 0.9343, + "step": 1250 + }, + { + "epoch": 0.062394014962593516, + "grad_norm": 0.5338147687682004, + "learning_rate": 4.9863159929989763e-05, + "loss": 0.944, + "step": 1251 + }, + { + "epoch": 0.06244389027431421, + "grad_norm": 0.5988258226031281, + "learning_rate": 4.986273764541471e-05, + "loss": 0.9113, + "step": 1252 + }, + { + "epoch": 0.062493765586034915, + "grad_norm": 0.7363879738847974, + "learning_rate": 4.9862314712057654e-05, + "loss": 1.0074, + "step": 1253 + }, + { + "epoch": 0.06254364089775562, + "grad_norm": 0.7942433597153404, + "learning_rate": 4.986189112992963e-05, + "loss": 0.9444, + "step": 1254 + }, + { + "epoch": 0.06259351620947631, + "grad_norm": 0.6247945429538575, + "learning_rate": 4.986146689904166e-05, + "loss": 0.9645, + "step": 1255 + }, + { + "epoch": 0.06264339152119701, + "grad_norm": 1.2354937179929786, + "learning_rate": 4.986104201940486e-05, + "loss": 0.9845, + "step": 1256 + }, + { + "epoch": 0.0626932668329177, + "grad_norm": 0.6351066695086108, + "learning_rate": 4.986061649103029e-05, + "loss": 0.9787, + "step": 1257 + }, + { + "epoch": 0.0627431421446384, + "grad_norm": 0.5582966704882777, + "learning_rate": 4.986019031392905e-05, + "loss": 0.9657, + "step": 1258 + }, + { + "epoch": 0.0627930174563591, + "grad_norm": 1.0944078428819801, + "learning_rate": 4.9859763488112276e-05, + "loss": 0.9597, + "step": 1259 + }, + { + "epoch": 0.0628428927680798, + "grad_norm": 0.6266112763717814, + "learning_rate": 4.9859336013591095e-05, + "loss": 0.9761, + "step": 1260 + }, + { + "epoch": 0.0628927680798005, + "grad_norm": 0.5525605619916414, + "learning_rate": 4.985890789037667e-05, + "loss": 0.9349, + "step": 1261 + }, + { + "epoch": 0.0629426433915212, + "grad_norm": 0.5606160881871805, + "learning_rate": 4.985847911848017e-05, + "loss": 0.95, + "step": 1262 + }, + { + "epoch": 0.06299251870324189, + "grad_norm": 0.8635230481360262, + "learning_rate": 4.985804969791279e-05, + "loss": 0.951, + "step": 1263 + }, + { + "epoch": 0.06304239401496259, + "grad_norm": 0.45910889987129866, + "learning_rate": 4.9857619628685716e-05, + "loss": 0.9218, + "step": 1264 + }, + { + "epoch": 0.0630922693266833, + "grad_norm": 0.6027351413352272, + "learning_rate": 4.985718891081018e-05, + "loss": 0.9734, + "step": 1265 + }, + { + "epoch": 0.063142144638404, + "grad_norm": 0.7466123062340434, + "learning_rate": 4.985675754429744e-05, + "loss": 0.9201, + "step": 1266 + }, + { + "epoch": 0.06319201995012469, + "grad_norm": 0.6472419703047166, + "learning_rate": 4.985632552915872e-05, + "loss": 0.9847, + "step": 1267 + }, + { + "epoch": 0.06324189526184539, + "grad_norm": 0.5205491616875704, + "learning_rate": 4.985589286540532e-05, + "loss": 0.9473, + "step": 1268 + }, + { + "epoch": 0.06329177057356608, + "grad_norm": 0.5337409866256771, + "learning_rate": 4.985545955304851e-05, + "loss": 0.9948, + "step": 1269 + }, + { + "epoch": 0.06334164588528678, + "grad_norm": 0.5713227384538233, + "learning_rate": 4.9855025592099616e-05, + "loss": 0.9329, + "step": 1270 + }, + { + "epoch": 0.06339152119700749, + "grad_norm": 0.5432811643596704, + "learning_rate": 4.985459098256995e-05, + "loss": 0.954, + "step": 1271 + }, + { + "epoch": 0.06344139650872818, + "grad_norm": 0.5006613148620638, + "learning_rate": 4.9854155724470856e-05, + "loss": 0.879, + "step": 1272 + }, + { + "epoch": 0.06349127182044888, + "grad_norm": 0.8770204361719702, + "learning_rate": 4.985371981781369e-05, + "loss": 0.9254, + "step": 1273 + }, + { + "epoch": 0.06354114713216957, + "grad_norm": 0.6230440022258723, + "learning_rate": 4.985328326260983e-05, + "loss": 0.9772, + "step": 1274 + }, + { + "epoch": 0.06359102244389027, + "grad_norm": 0.8919924769711846, + "learning_rate": 4.9852846058870664e-05, + "loss": 1.0048, + "step": 1275 + }, + { + "epoch": 0.06364089775561098, + "grad_norm": 0.5974098199904021, + "learning_rate": 4.9852408206607605e-05, + "loss": 0.9383, + "step": 1276 + }, + { + "epoch": 0.06369077306733167, + "grad_norm": 0.5347887606094823, + "learning_rate": 4.985196970583208e-05, + "loss": 0.9628, + "step": 1277 + }, + { + "epoch": 0.06374064837905237, + "grad_norm": 0.5438391991857643, + "learning_rate": 4.985153055655552e-05, + "loss": 0.9233, + "step": 1278 + }, + { + "epoch": 0.06379052369077307, + "grad_norm": 0.4929702907100234, + "learning_rate": 4.98510907587894e-05, + "loss": 0.9553, + "step": 1279 + }, + { + "epoch": 0.06384039900249376, + "grad_norm": 0.5141354040897548, + "learning_rate": 4.985065031254518e-05, + "loss": 0.9581, + "step": 1280 + }, + { + "epoch": 0.06389027431421446, + "grad_norm": 0.8318942056846191, + "learning_rate": 4.985020921783437e-05, + "loss": 0.956, + "step": 1281 + }, + { + "epoch": 0.06394014962593517, + "grad_norm": 0.6228969454263937, + "learning_rate": 4.984976747466846e-05, + "loss": 1.0063, + "step": 1282 + }, + { + "epoch": 0.06399002493765586, + "grad_norm": 0.6273860178882668, + "learning_rate": 4.9849325083059e-05, + "loss": 1.0061, + "step": 1283 + }, + { + "epoch": 0.06403990024937656, + "grad_norm": 0.5318992770774287, + "learning_rate": 4.984888204301752e-05, + "loss": 0.9465, + "step": 1284 + }, + { + "epoch": 0.06408977556109725, + "grad_norm": 0.7157023394514224, + "learning_rate": 4.9848438354555584e-05, + "loss": 0.9478, + "step": 1285 + }, + { + "epoch": 0.06413965087281795, + "grad_norm": 0.6180221121070334, + "learning_rate": 4.984799401768477e-05, + "loss": 0.9898, + "step": 1286 + }, + { + "epoch": 0.06418952618453866, + "grad_norm": 2.2486086005021093, + "learning_rate": 4.984754903241668e-05, + "loss": 0.964, + "step": 1287 + }, + { + "epoch": 0.06423940149625935, + "grad_norm": 0.5467365349139356, + "learning_rate": 4.9847103398762905e-05, + "loss": 0.9568, + "step": 1288 + }, + { + "epoch": 0.06428927680798005, + "grad_norm": 0.810170733089244, + "learning_rate": 4.984665711673509e-05, + "loss": 0.951, + "step": 1289 + }, + { + "epoch": 0.06433915211970075, + "grad_norm": 0.6582412555219004, + "learning_rate": 4.984621018634488e-05, + "loss": 0.9601, + "step": 1290 + }, + { + "epoch": 0.06438902743142144, + "grad_norm": 0.5001019183097053, + "learning_rate": 4.984576260760394e-05, + "loss": 0.9596, + "step": 1291 + }, + { + "epoch": 0.06443890274314215, + "grad_norm": 0.6433923421378779, + "learning_rate": 4.984531438052394e-05, + "loss": 0.9433, + "step": 1292 + }, + { + "epoch": 0.06448877805486285, + "grad_norm": 0.5606890026069785, + "learning_rate": 4.984486550511658e-05, + "loss": 0.9434, + "step": 1293 + }, + { + "epoch": 0.06453865336658354, + "grad_norm": 0.6728079464884927, + "learning_rate": 4.984441598139357e-05, + "loss": 0.989, + "step": 1294 + }, + { + "epoch": 0.06458852867830424, + "grad_norm": 0.5130528334556217, + "learning_rate": 4.984396580936665e-05, + "loss": 0.9186, + "step": 1295 + }, + { + "epoch": 0.06463840399002493, + "grad_norm": 0.49869440223539285, + "learning_rate": 4.984351498904756e-05, + "loss": 0.9304, + "step": 1296 + }, + { + "epoch": 0.06468827930174563, + "grad_norm": 0.7082498618484018, + "learning_rate": 4.984306352044806e-05, + "loss": 1.0179, + "step": 1297 + }, + { + "epoch": 0.06473815461346634, + "grad_norm": 0.5247849416405472, + "learning_rate": 4.984261140357994e-05, + "loss": 0.947, + "step": 1298 + }, + { + "epoch": 0.06478802992518704, + "grad_norm": 0.5511273374603597, + "learning_rate": 4.9842158638454994e-05, + "loss": 0.9823, + "step": 1299 + }, + { + "epoch": 0.06483790523690773, + "grad_norm": 0.5294780834314383, + "learning_rate": 4.9841705225085034e-05, + "loss": 0.9689, + "step": 1300 + }, + { + "epoch": 0.06488778054862843, + "grad_norm": 0.7587798626635227, + "learning_rate": 4.984125116348189e-05, + "loss": 0.974, + "step": 1301 + }, + { + "epoch": 0.06493765586034912, + "grad_norm": 0.5478938991299172, + "learning_rate": 4.9840796453657424e-05, + "loss": 0.9158, + "step": 1302 + }, + { + "epoch": 0.06498753117206983, + "grad_norm": 0.729844482773071, + "learning_rate": 4.984034109562349e-05, + "loss": 0.979, + "step": 1303 + }, + { + "epoch": 0.06503740648379053, + "grad_norm": 0.5649599856330921, + "learning_rate": 4.983988508939197e-05, + "loss": 0.9432, + "step": 1304 + }, + { + "epoch": 0.06508728179551122, + "grad_norm": 0.5491291979542232, + "learning_rate": 4.983942843497476e-05, + "loss": 0.9194, + "step": 1305 + }, + { + "epoch": 0.06513715710723192, + "grad_norm": 0.5690888612542544, + "learning_rate": 4.9838971132383785e-05, + "loss": 0.9137, + "step": 1306 + }, + { + "epoch": 0.06518703241895261, + "grad_norm": 0.5880639001925462, + "learning_rate": 4.983851318163097e-05, + "loss": 0.9072, + "step": 1307 + }, + { + "epoch": 0.06523690773067331, + "grad_norm": 0.5892176131121223, + "learning_rate": 4.9838054582728284e-05, + "loss": 0.9272, + "step": 1308 + }, + { + "epoch": 0.06528678304239402, + "grad_norm": 0.510484320303581, + "learning_rate": 4.983759533568767e-05, + "loss": 0.9539, + "step": 1309 + }, + { + "epoch": 0.06533665835411472, + "grad_norm": 0.5842227625036751, + "learning_rate": 4.983713544052112e-05, + "loss": 0.9276, + "step": 1310 + }, + { + "epoch": 0.06538653366583541, + "grad_norm": 0.8681360739693418, + "learning_rate": 4.983667489724064e-05, + "loss": 0.9263, + "step": 1311 + }, + { + "epoch": 0.06543640897755611, + "grad_norm": 0.5588456748750498, + "learning_rate": 4.983621370585825e-05, + "loss": 0.987, + "step": 1312 + }, + { + "epoch": 0.0654862842892768, + "grad_norm": 0.5658210664190397, + "learning_rate": 4.983575186638597e-05, + "loss": 0.9544, + "step": 1313 + }, + { + "epoch": 0.06553615960099751, + "grad_norm": 0.5718451833815291, + "learning_rate": 4.983528937883586e-05, + "loss": 0.9304, + "step": 1314 + }, + { + "epoch": 0.06558603491271821, + "grad_norm": 0.5266063835131582, + "learning_rate": 4.983482624321999e-05, + "loss": 0.9409, + "step": 1315 + }, + { + "epoch": 0.0656359102244389, + "grad_norm": 0.5509546774232962, + "learning_rate": 4.983436245955044e-05, + "loss": 0.9144, + "step": 1316 + }, + { + "epoch": 0.0656857855361596, + "grad_norm": 1.018101285334747, + "learning_rate": 4.983389802783933e-05, + "loss": 1.0028, + "step": 1317 + }, + { + "epoch": 0.0657356608478803, + "grad_norm": 0.7234787409359356, + "learning_rate": 4.9833432948098754e-05, + "loss": 0.9863, + "step": 1318 + }, + { + "epoch": 0.06578553615960099, + "grad_norm": 0.5094437520424322, + "learning_rate": 4.9832967220340854e-05, + "loss": 0.9819, + "step": 1319 + }, + { + "epoch": 0.0658354114713217, + "grad_norm": 0.8076317131792601, + "learning_rate": 4.98325008445778e-05, + "loss": 0.9607, + "step": 1320 + }, + { + "epoch": 0.0658852867830424, + "grad_norm": 0.6203659292742402, + "learning_rate": 4.983203382082175e-05, + "loss": 0.9388, + "step": 1321 + }, + { + "epoch": 0.06593516209476309, + "grad_norm": 0.5121311843834504, + "learning_rate": 4.983156614908488e-05, + "loss": 0.935, + "step": 1322 + }, + { + "epoch": 0.06598503740648379, + "grad_norm": 0.7230487480173262, + "learning_rate": 4.983109782937941e-05, + "loss": 0.9597, + "step": 1323 + }, + { + "epoch": 0.06603491271820448, + "grad_norm": 0.6393174098465085, + "learning_rate": 4.983062886171755e-05, + "loss": 0.9457, + "step": 1324 + }, + { + "epoch": 0.06608478802992519, + "grad_norm": 0.6266157102467893, + "learning_rate": 4.983015924611155e-05, + "loss": 1.0237, + "step": 1325 + }, + { + "epoch": 0.06613466334164589, + "grad_norm": 0.607568146292229, + "learning_rate": 4.982968898257366e-05, + "loss": 0.9292, + "step": 1326 + }, + { + "epoch": 0.06618453865336658, + "grad_norm": 0.5760664714793293, + "learning_rate": 4.982921807111614e-05, + "loss": 0.9387, + "step": 1327 + }, + { + "epoch": 0.06623441396508728, + "grad_norm": 0.7913799042099212, + "learning_rate": 4.982874651175129e-05, + "loss": 0.9654, + "step": 1328 + }, + { + "epoch": 0.06628428927680798, + "grad_norm": 0.6472615044677702, + "learning_rate": 4.982827430449141e-05, + "loss": 0.9599, + "step": 1329 + }, + { + "epoch": 0.06633416458852869, + "grad_norm": 0.6120422671022023, + "learning_rate": 4.9827801449348824e-05, + "loss": 1.0077, + "step": 1330 + }, + { + "epoch": 0.06638403990024938, + "grad_norm": 0.7017621544332643, + "learning_rate": 4.9827327946335875e-05, + "loss": 1.0016, + "step": 1331 + }, + { + "epoch": 0.06643391521197008, + "grad_norm": 0.6388309778141721, + "learning_rate": 4.982685379546491e-05, + "loss": 0.9485, + "step": 1332 + }, + { + "epoch": 0.06648379052369077, + "grad_norm": 0.9824135065831091, + "learning_rate": 4.982637899674831e-05, + "loss": 0.9431, + "step": 1333 + }, + { + "epoch": 0.06653366583541147, + "grad_norm": 0.5634638146420621, + "learning_rate": 4.982590355019846e-05, + "loss": 1.0093, + "step": 1334 + }, + { + "epoch": 0.06658354114713216, + "grad_norm": 0.6014830300575162, + "learning_rate": 4.9825427455827765e-05, + "loss": 0.9546, + "step": 1335 + }, + { + "epoch": 0.06663341645885287, + "grad_norm": 0.45948365248071876, + "learning_rate": 4.982495071364866e-05, + "loss": 0.9168, + "step": 1336 + }, + { + "epoch": 0.06668329177057357, + "grad_norm": 0.6143228816801636, + "learning_rate": 4.982447332367356e-05, + "loss": 0.9984, + "step": 1337 + }, + { + "epoch": 0.06673316708229426, + "grad_norm": 0.6389212398152172, + "learning_rate": 4.982399528591495e-05, + "loss": 0.9845, + "step": 1338 + }, + { + "epoch": 0.06678304239401496, + "grad_norm": 0.5060480974476048, + "learning_rate": 4.9823516600385287e-05, + "loss": 0.9445, + "step": 1339 + }, + { + "epoch": 0.06683291770573566, + "grad_norm": 0.4382361759068401, + "learning_rate": 4.982303726709707e-05, + "loss": 0.9204, + "step": 1340 + }, + { + "epoch": 0.06688279301745637, + "grad_norm": 0.5104617900130708, + "learning_rate": 4.98225572860628e-05, + "loss": 0.962, + "step": 1341 + }, + { + "epoch": 0.06693266832917706, + "grad_norm": 0.4706042959589064, + "learning_rate": 4.982207665729501e-05, + "loss": 0.9478, + "step": 1342 + }, + { + "epoch": 0.06698254364089776, + "grad_norm": 0.49362555458400154, + "learning_rate": 4.9821595380806244e-05, + "loss": 0.9661, + "step": 1343 + }, + { + "epoch": 0.06703241895261845, + "grad_norm": 0.5239157726179693, + "learning_rate": 4.982111345660906e-05, + "loss": 0.9465, + "step": 1344 + }, + { + "epoch": 0.06708229426433915, + "grad_norm": 0.5605506570572009, + "learning_rate": 4.982063088471601e-05, + "loss": 0.9724, + "step": 1345 + }, + { + "epoch": 0.06713216957605984, + "grad_norm": 0.5384815353340027, + "learning_rate": 4.982014766513972e-05, + "loss": 0.9489, + "step": 1346 + }, + { + "epoch": 0.06718204488778055, + "grad_norm": 0.5193601197997788, + "learning_rate": 4.981966379789278e-05, + "loss": 0.9693, + "step": 1347 + }, + { + "epoch": 0.06723192019950125, + "grad_norm": 0.5301720178819554, + "learning_rate": 4.981917928298783e-05, + "loss": 0.9897, + "step": 1348 + }, + { + "epoch": 0.06728179551122195, + "grad_norm": 1.0249698055277159, + "learning_rate": 4.9818694120437495e-05, + "loss": 0.9427, + "step": 1349 + }, + { + "epoch": 0.06733167082294264, + "grad_norm": 0.4971089833138187, + "learning_rate": 4.9818208310254445e-05, + "loss": 0.9278, + "step": 1350 + }, + { + "epoch": 0.06738154613466334, + "grad_norm": 0.5305445427609841, + "learning_rate": 4.981772185245135e-05, + "loss": 1.0004, + "step": 1351 + }, + { + "epoch": 0.06743142144638405, + "grad_norm": 0.6012505820972964, + "learning_rate": 4.981723474704092e-05, + "loss": 0.9121, + "step": 1352 + }, + { + "epoch": 0.06748129675810474, + "grad_norm": 0.6389866445422214, + "learning_rate": 4.9816746994035853e-05, + "loss": 0.9371, + "step": 1353 + }, + { + "epoch": 0.06753117206982544, + "grad_norm": 0.623907205267529, + "learning_rate": 4.981625859344888e-05, + "loss": 0.9507, + "step": 1354 + }, + { + "epoch": 0.06758104738154613, + "grad_norm": 0.5322172750831615, + "learning_rate": 4.981576954529275e-05, + "loss": 0.9527, + "step": 1355 + }, + { + "epoch": 0.06763092269326683, + "grad_norm": 0.5876482036317673, + "learning_rate": 4.981527984958021e-05, + "loss": 0.9246, + "step": 1356 + }, + { + "epoch": 0.06768079800498752, + "grad_norm": 0.6182494710423029, + "learning_rate": 4.981478950632405e-05, + "loss": 0.9417, + "step": 1357 + }, + { + "epoch": 0.06773067331670823, + "grad_norm": 0.6356065138059049, + "learning_rate": 4.9814298515537065e-05, + "loss": 0.9722, + "step": 1358 + }, + { + "epoch": 0.06778054862842893, + "grad_norm": 0.576799467641953, + "learning_rate": 4.9813806877232074e-05, + "loss": 0.9788, + "step": 1359 + }, + { + "epoch": 0.06783042394014963, + "grad_norm": 0.5744039857636298, + "learning_rate": 4.981331459142189e-05, + "loss": 0.938, + "step": 1360 + }, + { + "epoch": 0.06788029925187032, + "grad_norm": 0.4590239259491444, + "learning_rate": 4.981282165811936e-05, + "loss": 0.9353, + "step": 1361 + }, + { + "epoch": 0.06793017456359102, + "grad_norm": 0.5580468415541674, + "learning_rate": 4.981232807733737e-05, + "loss": 0.965, + "step": 1362 + }, + { + "epoch": 0.06798004987531173, + "grad_norm": 0.5099345469914223, + "learning_rate": 4.981183384908877e-05, + "loss": 0.9721, + "step": 1363 + }, + { + "epoch": 0.06802992518703242, + "grad_norm": 0.5413818302940384, + "learning_rate": 4.981133897338647e-05, + "loss": 0.934, + "step": 1364 + }, + { + "epoch": 0.06807980049875312, + "grad_norm": 0.8021359347591792, + "learning_rate": 4.98108434502434e-05, + "loss": 0.9473, + "step": 1365 + }, + { + "epoch": 0.06812967581047381, + "grad_norm": 0.5180794589957973, + "learning_rate": 4.981034727967245e-05, + "loss": 0.9585, + "step": 1366 + }, + { + "epoch": 0.06817955112219451, + "grad_norm": 0.5795535716513666, + "learning_rate": 4.9809850461686606e-05, + "loss": 0.905, + "step": 1367 + }, + { + "epoch": 0.0682294264339152, + "grad_norm": 0.6432005743005291, + "learning_rate": 4.980935299629882e-05, + "loss": 0.9525, + "step": 1368 + }, + { + "epoch": 0.06827930174563591, + "grad_norm": 0.6399502537801677, + "learning_rate": 4.980885488352206e-05, + "loss": 0.9758, + "step": 1369 + }, + { + "epoch": 0.06832917705735661, + "grad_norm": 0.4682323325823096, + "learning_rate": 4.980835612336934e-05, + "loss": 0.9141, + "step": 1370 + }, + { + "epoch": 0.0683790523690773, + "grad_norm": 0.5248467724324887, + "learning_rate": 4.9807856715853674e-05, + "loss": 0.9385, + "step": 1371 + }, + { + "epoch": 0.068428927680798, + "grad_norm": 0.7076776152610713, + "learning_rate": 4.980735666098808e-05, + "loss": 0.944, + "step": 1372 + }, + { + "epoch": 0.0684788029925187, + "grad_norm": 0.5318119975679535, + "learning_rate": 4.9806855958785625e-05, + "loss": 0.9393, + "step": 1373 + }, + { + "epoch": 0.06852867830423941, + "grad_norm": 0.5347585027677593, + "learning_rate": 4.9806354609259366e-05, + "loss": 0.9525, + "step": 1374 + }, + { + "epoch": 0.0685785536159601, + "grad_norm": 1.4736643587730853, + "learning_rate": 4.9805852612422384e-05, + "loss": 0.9373, + "step": 1375 + }, + { + "epoch": 0.0686284289276808, + "grad_norm": 0.7566264295311148, + "learning_rate": 4.980534996828777e-05, + "loss": 0.942, + "step": 1376 + }, + { + "epoch": 0.0686783042394015, + "grad_norm": 0.7248008482403351, + "learning_rate": 4.9804846676868664e-05, + "loss": 0.9611, + "step": 1377 + }, + { + "epoch": 0.06872817955112219, + "grad_norm": 0.537545822909423, + "learning_rate": 4.980434273817818e-05, + "loss": 0.9764, + "step": 1378 + }, + { + "epoch": 0.0687780548628429, + "grad_norm": 0.5479894644332081, + "learning_rate": 4.9803838152229476e-05, + "loss": 0.9771, + "step": 1379 + }, + { + "epoch": 0.0688279301745636, + "grad_norm": 0.5999954125774022, + "learning_rate": 4.9803332919035714e-05, + "loss": 0.9361, + "step": 1380 + }, + { + "epoch": 0.06887780548628429, + "grad_norm": 0.5018621916909277, + "learning_rate": 4.980282703861008e-05, + "loss": 0.9418, + "step": 1381 + }, + { + "epoch": 0.06892768079800499, + "grad_norm": 0.5325822867785832, + "learning_rate": 4.980232051096577e-05, + "loss": 0.9993, + "step": 1382 + }, + { + "epoch": 0.06897755610972568, + "grad_norm": 0.7127085702270437, + "learning_rate": 4.980181333611601e-05, + "loss": 0.9744, + "step": 1383 + }, + { + "epoch": 0.06902743142144638, + "grad_norm": 0.555689447435498, + "learning_rate": 4.980130551407403e-05, + "loss": 0.9371, + "step": 1384 + }, + { + "epoch": 0.06907730673316709, + "grad_norm": 0.601173247246776, + "learning_rate": 4.980079704485308e-05, + "loss": 0.933, + "step": 1385 + }, + { + "epoch": 0.06912718204488778, + "grad_norm": 0.6529389133283713, + "learning_rate": 4.980028792846644e-05, + "loss": 0.954, + "step": 1386 + }, + { + "epoch": 0.06917705735660848, + "grad_norm": 0.6484962895504177, + "learning_rate": 4.9799778164927374e-05, + "loss": 0.9756, + "step": 1387 + }, + { + "epoch": 0.06922693266832917, + "grad_norm": 0.5560076682187491, + "learning_rate": 4.97992677542492e-05, + "loss": 0.9686, + "step": 1388 + }, + { + "epoch": 0.06927680798004987, + "grad_norm": 0.5094023528792441, + "learning_rate": 4.9798756696445236e-05, + "loss": 0.9391, + "step": 1389 + }, + { + "epoch": 0.06932668329177058, + "grad_norm": 0.5107138101071128, + "learning_rate": 4.979824499152881e-05, + "loss": 0.9507, + "step": 1390 + }, + { + "epoch": 0.06937655860349128, + "grad_norm": 0.5424968864375692, + "learning_rate": 4.9797732639513284e-05, + "loss": 0.9137, + "step": 1391 + }, + { + "epoch": 0.06942643391521197, + "grad_norm": 0.5472825950667758, + "learning_rate": 4.9797219640412016e-05, + "loss": 0.9478, + "step": 1392 + }, + { + "epoch": 0.06947630922693267, + "grad_norm": 0.45157163370949605, + "learning_rate": 4.97967059942384e-05, + "loss": 0.9471, + "step": 1393 + }, + { + "epoch": 0.06952618453865336, + "grad_norm": 0.595893003566699, + "learning_rate": 4.979619170100585e-05, + "loss": 0.9622, + "step": 1394 + }, + { + "epoch": 0.06957605985037406, + "grad_norm": 2.09540201355348, + "learning_rate": 4.979567676072776e-05, + "loss": 0.9805, + "step": 1395 + }, + { + "epoch": 0.06962593516209477, + "grad_norm": 0.4759008584166797, + "learning_rate": 4.9795161173417596e-05, + "loss": 0.9361, + "step": 1396 + }, + { + "epoch": 0.06967581047381546, + "grad_norm": 0.5961120624244711, + "learning_rate": 4.9794644939088786e-05, + "loss": 1.0051, + "step": 1397 + }, + { + "epoch": 0.06972568578553616, + "grad_norm": 0.6595572836590278, + "learning_rate": 4.979412805775482e-05, + "loss": 0.9146, + "step": 1398 + }, + { + "epoch": 0.06977556109725686, + "grad_norm": 0.6410689084983942, + "learning_rate": 4.979361052942917e-05, + "loss": 0.9987, + "step": 1399 + }, + { + "epoch": 0.06982543640897755, + "grad_norm": 0.8472725981534789, + "learning_rate": 4.979309235412536e-05, + "loss": 0.9112, + "step": 1400 + }, + { + "epoch": 0.06987531172069826, + "grad_norm": 0.5337176935710969, + "learning_rate": 4.97925735318569e-05, + "loss": 0.9106, + "step": 1401 + }, + { + "epoch": 0.06992518703241896, + "grad_norm": 0.9171569458969693, + "learning_rate": 4.979205406263733e-05, + "loss": 0.9097, + "step": 1402 + }, + { + "epoch": 0.06997506234413965, + "grad_norm": 0.5178992446370942, + "learning_rate": 4.9791533946480195e-05, + "loss": 0.9276, + "step": 1403 + }, + { + "epoch": 0.07002493765586035, + "grad_norm": 0.8877064136154038, + "learning_rate": 4.979101318339908e-05, + "loss": 1.0129, + "step": 1404 + }, + { + "epoch": 0.07007481296758104, + "grad_norm": 0.6186554318649221, + "learning_rate": 4.9790491773407575e-05, + "loss": 0.9296, + "step": 1405 + }, + { + "epoch": 0.07012468827930174, + "grad_norm": 0.5361089250265075, + "learning_rate": 4.978996971651928e-05, + "loss": 0.9921, + "step": 1406 + }, + { + "epoch": 0.07017456359102245, + "grad_norm": 0.560203823292989, + "learning_rate": 4.978944701274782e-05, + "loss": 0.9621, + "step": 1407 + }, + { + "epoch": 0.07022443890274314, + "grad_norm": 1.2206891365814028, + "learning_rate": 4.9788923662106834e-05, + "loss": 0.9294, + "step": 1408 + }, + { + "epoch": 0.07027431421446384, + "grad_norm": 0.763333943246238, + "learning_rate": 4.9788399664609985e-05, + "loss": 0.9529, + "step": 1409 + }, + { + "epoch": 0.07032418952618454, + "grad_norm": 0.4777601604013, + "learning_rate": 4.978787502027094e-05, + "loss": 0.9114, + "step": 1410 + }, + { + "epoch": 0.07037406483790523, + "grad_norm": 0.6335645947623577, + "learning_rate": 4.978734972910338e-05, + "loss": 0.9465, + "step": 1411 + }, + { + "epoch": 0.07042394014962594, + "grad_norm": 0.6328512395004214, + "learning_rate": 4.9786823791121026e-05, + "loss": 0.944, + "step": 1412 + }, + { + "epoch": 0.07047381546134664, + "grad_norm": 0.5182919961776953, + "learning_rate": 4.97862972063376e-05, + "loss": 0.9722, + "step": 1413 + }, + { + "epoch": 0.07052369077306733, + "grad_norm": 0.5649114816068027, + "learning_rate": 4.9785769974766846e-05, + "loss": 0.9944, + "step": 1414 + }, + { + "epoch": 0.07057356608478803, + "grad_norm": 0.5601376707808662, + "learning_rate": 4.978524209642251e-05, + "loss": 0.8946, + "step": 1415 + }, + { + "epoch": 0.07062344139650872, + "grad_norm": 0.5765640869739759, + "learning_rate": 4.9784713571318377e-05, + "loss": 0.9276, + "step": 1416 + }, + { + "epoch": 0.07067331670822943, + "grad_norm": 0.7913725726885316, + "learning_rate": 4.978418439946824e-05, + "loss": 0.9167, + "step": 1417 + }, + { + "epoch": 0.07072319201995013, + "grad_norm": 0.4917452160944717, + "learning_rate": 4.9783654580885895e-05, + "loss": 0.9263, + "step": 1418 + }, + { + "epoch": 0.07077306733167082, + "grad_norm": 0.8183494067421307, + "learning_rate": 4.978312411558518e-05, + "loss": 0.9621, + "step": 1419 + }, + { + "epoch": 0.07082294264339152, + "grad_norm": 0.6755688021243628, + "learning_rate": 4.978259300357993e-05, + "loss": 0.9532, + "step": 1420 + }, + { + "epoch": 0.07087281795511222, + "grad_norm": 0.5309481148235388, + "learning_rate": 4.978206124488401e-05, + "loss": 0.9577, + "step": 1421 + }, + { + "epoch": 0.07092269326683291, + "grad_norm": 0.5721023866703666, + "learning_rate": 4.978152883951129e-05, + "loss": 0.9999, + "step": 1422 + }, + { + "epoch": 0.07097256857855362, + "grad_norm": 0.7449561575157762, + "learning_rate": 4.978099578747566e-05, + "loss": 0.9998, + "step": 1423 + }, + { + "epoch": 0.07102244389027432, + "grad_norm": 0.5834590332960466, + "learning_rate": 4.978046208879105e-05, + "loss": 0.9564, + "step": 1424 + }, + { + "epoch": 0.07107231920199501, + "grad_norm": 0.7465678832415767, + "learning_rate": 4.977992774347136e-05, + "loss": 0.9914, + "step": 1425 + }, + { + "epoch": 0.07112219451371571, + "grad_norm": 0.5465922803183628, + "learning_rate": 4.977939275153055e-05, + "loss": 0.9388, + "step": 1426 + }, + { + "epoch": 0.0711720698254364, + "grad_norm": 0.7771264868544874, + "learning_rate": 4.9778857112982575e-05, + "loss": 0.9401, + "step": 1427 + }, + { + "epoch": 0.07122194513715711, + "grad_norm": 0.7484900063332274, + "learning_rate": 4.9778320827841416e-05, + "loss": 0.9787, + "step": 1428 + }, + { + "epoch": 0.07127182044887781, + "grad_norm": 0.5520254933162064, + "learning_rate": 4.977778389612106e-05, + "loss": 0.9684, + "step": 1429 + }, + { + "epoch": 0.0713216957605985, + "grad_norm": 0.9136286856208791, + "learning_rate": 4.977724631783552e-05, + "loss": 0.966, + "step": 1430 + }, + { + "epoch": 0.0713715710723192, + "grad_norm": 0.705897187864546, + "learning_rate": 4.977670809299883e-05, + "loss": 0.9214, + "step": 1431 + }, + { + "epoch": 0.0714214463840399, + "grad_norm": 1.1805867284983818, + "learning_rate": 4.9776169221625034e-05, + "loss": 0.9542, + "step": 1432 + }, + { + "epoch": 0.07147132169576059, + "grad_norm": 0.5800818424252216, + "learning_rate": 4.977562970372819e-05, + "loss": 0.9984, + "step": 1433 + }, + { + "epoch": 0.0715211970074813, + "grad_norm": 0.5996215608867939, + "learning_rate": 4.977508953932237e-05, + "loss": 0.8609, + "step": 1434 + }, + { + "epoch": 0.071571072319202, + "grad_norm": 0.5566210760249943, + "learning_rate": 4.977454872842169e-05, + "loss": 0.9702, + "step": 1435 + }, + { + "epoch": 0.0716209476309227, + "grad_norm": 0.6265108325342185, + "learning_rate": 4.977400727104024e-05, + "loss": 0.9231, + "step": 1436 + }, + { + "epoch": 0.07167082294264339, + "grad_norm": 0.5390395942601829, + "learning_rate": 4.977346516719216e-05, + "loss": 0.9689, + "step": 1437 + }, + { + "epoch": 0.07172069825436408, + "grad_norm": 0.5281166455098062, + "learning_rate": 4.9772922416891584e-05, + "loss": 0.9426, + "step": 1438 + }, + { + "epoch": 0.0717705735660848, + "grad_norm": 0.8402185206329389, + "learning_rate": 4.977237902015269e-05, + "loss": 0.9366, + "step": 1439 + }, + { + "epoch": 0.07182044887780549, + "grad_norm": 0.5940819823564383, + "learning_rate": 4.977183497698966e-05, + "loss": 0.9719, + "step": 1440 + }, + { + "epoch": 0.07187032418952619, + "grad_norm": 1.0809148927928793, + "learning_rate": 4.977129028741667e-05, + "loss": 0.9687, + "step": 1441 + }, + { + "epoch": 0.07192019950124688, + "grad_norm": 0.5806855215201483, + "learning_rate": 4.9770744951447954e-05, + "loss": 0.9889, + "step": 1442 + }, + { + "epoch": 0.07197007481296758, + "grad_norm": 0.6031410742826746, + "learning_rate": 4.9770198969097735e-05, + "loss": 0.9994, + "step": 1443 + }, + { + "epoch": 0.07201995012468827, + "grad_norm": 0.5399241048508379, + "learning_rate": 4.976965234038026e-05, + "loss": 1.0133, + "step": 1444 + }, + { + "epoch": 0.07206982543640898, + "grad_norm": 0.5929497014915798, + "learning_rate": 4.976910506530979e-05, + "loss": 0.9444, + "step": 1445 + }, + { + "epoch": 0.07211970074812968, + "grad_norm": 0.5751611880372249, + "learning_rate": 4.976855714390061e-05, + "loss": 0.9494, + "step": 1446 + }, + { + "epoch": 0.07216957605985037, + "grad_norm": 0.4886752751803387, + "learning_rate": 4.9768008576167014e-05, + "loss": 0.9205, + "step": 1447 + }, + { + "epoch": 0.07221945137157107, + "grad_norm": 0.5712539363380504, + "learning_rate": 4.976745936212332e-05, + "loss": 0.9438, + "step": 1448 + }, + { + "epoch": 0.07226932668329177, + "grad_norm": 0.7278529578582801, + "learning_rate": 4.9766909501783854e-05, + "loss": 0.9743, + "step": 1449 + }, + { + "epoch": 0.07231920199501247, + "grad_norm": 0.5427642535944329, + "learning_rate": 4.9766358995162974e-05, + "loss": 0.928, + "step": 1450 + }, + { + "epoch": 0.07236907730673317, + "grad_norm": 0.8939872035602385, + "learning_rate": 4.9765807842275044e-05, + "loss": 1.0089, + "step": 1451 + }, + { + "epoch": 0.07241895261845387, + "grad_norm": 0.8101074137213611, + "learning_rate": 4.976525604313444e-05, + "loss": 1.0138, + "step": 1452 + }, + { + "epoch": 0.07246882793017456, + "grad_norm": 0.5833058622526406, + "learning_rate": 4.976470359775556e-05, + "loss": 0.9577, + "step": 1453 + }, + { + "epoch": 0.07251870324189526, + "grad_norm": 0.48461019012251133, + "learning_rate": 4.9764150506152815e-05, + "loss": 0.9168, + "step": 1454 + }, + { + "epoch": 0.07256857855361595, + "grad_norm": 0.6619834555407026, + "learning_rate": 4.976359676834066e-05, + "loss": 0.9573, + "step": 1455 + }, + { + "epoch": 0.07261845386533666, + "grad_norm": 0.5410809962317092, + "learning_rate": 4.976304238433352e-05, + "loss": 0.9348, + "step": 1456 + }, + { + "epoch": 0.07266832917705736, + "grad_norm": 0.5853480933938967, + "learning_rate": 4.976248735414588e-05, + "loss": 0.988, + "step": 1457 + }, + { + "epoch": 0.07271820448877805, + "grad_norm": 0.5224038993403454, + "learning_rate": 4.976193167779221e-05, + "loss": 0.9518, + "step": 1458 + }, + { + "epoch": 0.07276807980049875, + "grad_norm": 0.5718278443567185, + "learning_rate": 4.976137535528701e-05, + "loss": 0.931, + "step": 1459 + }, + { + "epoch": 0.07281795511221945, + "grad_norm": 0.7572062250567707, + "learning_rate": 4.9760818386644806e-05, + "loss": 0.9118, + "step": 1460 + }, + { + "epoch": 0.07286783042394016, + "grad_norm": 0.556386656035192, + "learning_rate": 4.976026077188013e-05, + "loss": 0.9333, + "step": 1461 + }, + { + "epoch": 0.07291770573566085, + "grad_norm": 0.6428136175039203, + "learning_rate": 4.975970251100753e-05, + "loss": 0.9823, + "step": 1462 + }, + { + "epoch": 0.07296758104738155, + "grad_norm": 0.5183629649251321, + "learning_rate": 4.975914360404157e-05, + "loss": 0.9617, + "step": 1463 + }, + { + "epoch": 0.07301745635910224, + "grad_norm": 0.5568536416693627, + "learning_rate": 4.975858405099685e-05, + "loss": 0.9597, + "step": 1464 + }, + { + "epoch": 0.07306733167082294, + "grad_norm": 0.5787612184497213, + "learning_rate": 4.975802385188795e-05, + "loss": 0.9563, + "step": 1465 + }, + { + "epoch": 0.07311720698254365, + "grad_norm": 0.8255029511076242, + "learning_rate": 4.975746300672949e-05, + "loss": 0.9754, + "step": 1466 + }, + { + "epoch": 0.07316708229426434, + "grad_norm": 0.5727247717922554, + "learning_rate": 4.975690151553613e-05, + "loss": 0.9404, + "step": 1467 + }, + { + "epoch": 0.07321695760598504, + "grad_norm": 0.5561705293385386, + "learning_rate": 4.975633937832249e-05, + "loss": 0.921, + "step": 1468 + }, + { + "epoch": 0.07326683291770573, + "grad_norm": 0.5445839368291833, + "learning_rate": 4.975577659510327e-05, + "loss": 0.9286, + "step": 1469 + }, + { + "epoch": 0.07331670822942643, + "grad_norm": 0.6373963887984142, + "learning_rate": 4.975521316589312e-05, + "loss": 0.9524, + "step": 1470 + }, + { + "epoch": 0.07336658354114713, + "grad_norm": 0.5260695751453104, + "learning_rate": 4.975464909070677e-05, + "loss": 0.9517, + "step": 1471 + }, + { + "epoch": 0.07341645885286784, + "grad_norm": 0.6672501577522688, + "learning_rate": 4.975408436955893e-05, + "loss": 0.9342, + "step": 1472 + }, + { + "epoch": 0.07346633416458853, + "grad_norm": 0.474063212114565, + "learning_rate": 4.975351900246434e-05, + "loss": 0.9616, + "step": 1473 + }, + { + "epoch": 0.07351620947630923, + "grad_norm": 0.5269839699687592, + "learning_rate": 4.9752952989437744e-05, + "loss": 0.9562, + "step": 1474 + }, + { + "epoch": 0.07356608478802992, + "grad_norm": 0.7580422538776704, + "learning_rate": 4.975238633049392e-05, + "loss": 0.976, + "step": 1475 + }, + { + "epoch": 0.07361596009975062, + "grad_norm": 0.4932953523712596, + "learning_rate": 4.9751819025647655e-05, + "loss": 0.9433, + "step": 1476 + }, + { + "epoch": 0.07366583541147133, + "grad_norm": 0.85791480563788, + "learning_rate": 4.9751251074913754e-05, + "loss": 1.0172, + "step": 1477 + }, + { + "epoch": 0.07371571072319202, + "grad_norm": 0.517602263427787, + "learning_rate": 4.975068247830702e-05, + "loss": 0.8844, + "step": 1478 + }, + { + "epoch": 0.07376558603491272, + "grad_norm": 0.7076248449652807, + "learning_rate": 4.9750113235842316e-05, + "loss": 0.9383, + "step": 1479 + }, + { + "epoch": 0.07381546134663342, + "grad_norm": 0.6737499743294352, + "learning_rate": 4.974954334753448e-05, + "loss": 0.948, + "step": 1480 + }, + { + "epoch": 0.07386533665835411, + "grad_norm": 0.7121289507581631, + "learning_rate": 4.9748972813398385e-05, + "loss": 0.9523, + "step": 1481 + }, + { + "epoch": 0.0739152119700748, + "grad_norm": 0.5086401219381693, + "learning_rate": 4.974840163344893e-05, + "loss": 1.023, + "step": 1482 + }, + { + "epoch": 0.07396508728179552, + "grad_norm": 0.5694662041901528, + "learning_rate": 4.9747829807701e-05, + "loss": 0.9348, + "step": 1483 + }, + { + "epoch": 0.07401496259351621, + "grad_norm": 0.5910023547955426, + "learning_rate": 4.974725733616953e-05, + "loss": 0.9429, + "step": 1484 + }, + { + "epoch": 0.07406483790523691, + "grad_norm": 0.7639123307483823, + "learning_rate": 4.974668421886945e-05, + "loss": 0.9474, + "step": 1485 + }, + { + "epoch": 0.0741147132169576, + "grad_norm": 0.5136177275409491, + "learning_rate": 4.974611045581572e-05, + "loss": 0.9265, + "step": 1486 + }, + { + "epoch": 0.0741645885286783, + "grad_norm": 0.7162162270026238, + "learning_rate": 4.9745536047023324e-05, + "loss": 0.9461, + "step": 1487 + }, + { + "epoch": 0.07421446384039901, + "grad_norm": 0.520708491038692, + "learning_rate": 4.974496099250723e-05, + "loss": 0.9788, + "step": 1488 + }, + { + "epoch": 0.0742643391521197, + "grad_norm": 0.6687230805997049, + "learning_rate": 4.974438529228246e-05, + "loss": 0.9632, + "step": 1489 + }, + { + "epoch": 0.0743142144638404, + "grad_norm": 0.7055074905292422, + "learning_rate": 4.9743808946364025e-05, + "loss": 0.9711, + "step": 1490 + }, + { + "epoch": 0.0743640897755611, + "grad_norm": 0.5500299069774499, + "learning_rate": 4.9743231954766966e-05, + "loss": 0.9762, + "step": 1491 + }, + { + "epoch": 0.07441396508728179, + "grad_norm": 0.5091497209978034, + "learning_rate": 4.974265431750635e-05, + "loss": 0.9625, + "step": 1492 + }, + { + "epoch": 0.07446384039900249, + "grad_norm": 0.6329753365009386, + "learning_rate": 4.974207603459724e-05, + "loss": 0.9516, + "step": 1493 + }, + { + "epoch": 0.0745137157107232, + "grad_norm": 0.5180335281968764, + "learning_rate": 4.974149710605473e-05, + "loss": 0.9659, + "step": 1494 + }, + { + "epoch": 0.07456359102244389, + "grad_norm": 0.7450218728181539, + "learning_rate": 4.974091753189393e-05, + "loss": 0.9397, + "step": 1495 + }, + { + "epoch": 0.07461346633416459, + "grad_norm": 0.6633011611501229, + "learning_rate": 4.974033731212995e-05, + "loss": 0.9407, + "step": 1496 + }, + { + "epoch": 0.07466334164588528, + "grad_norm": 0.6085737718400898, + "learning_rate": 4.9739756446777944e-05, + "loss": 0.9774, + "step": 1497 + }, + { + "epoch": 0.07471321695760598, + "grad_norm": 0.673156836293869, + "learning_rate": 4.973917493585307e-05, + "loss": 0.9472, + "step": 1498 + }, + { + "epoch": 0.07476309226932669, + "grad_norm": 0.6028613885353431, + "learning_rate": 4.973859277937049e-05, + "loss": 0.9642, + "step": 1499 + }, + { + "epoch": 0.07481296758104738, + "grad_norm": 0.5686244193897644, + "learning_rate": 4.9738009977345404e-05, + "loss": 0.9559, + "step": 1500 + }, + { + "epoch": 0.07486284289276808, + "grad_norm": 0.5049460668210568, + "learning_rate": 4.973742652979303e-05, + "loss": 0.9331, + "step": 1501 + }, + { + "epoch": 0.07491271820448878, + "grad_norm": 0.8448847089536125, + "learning_rate": 4.973684243672857e-05, + "loss": 1.0067, + "step": 1502 + }, + { + "epoch": 0.07496259351620947, + "grad_norm": 0.57706537884565, + "learning_rate": 4.9736257698167275e-05, + "loss": 0.9857, + "step": 1503 + }, + { + "epoch": 0.07501246882793017, + "grad_norm": 0.5610499428278507, + "learning_rate": 4.9735672314124414e-05, + "loss": 0.9465, + "step": 1504 + }, + { + "epoch": 0.07506234413965088, + "grad_norm": 0.9277778653250264, + "learning_rate": 4.973508628461525e-05, + "loss": 0.9522, + "step": 1505 + }, + { + "epoch": 0.07511221945137157, + "grad_norm": 0.7155157571805001, + "learning_rate": 4.973449960965509e-05, + "loss": 0.9647, + "step": 1506 + }, + { + "epoch": 0.07516209476309227, + "grad_norm": 0.6226069463339771, + "learning_rate": 4.973391228925922e-05, + "loss": 0.9715, + "step": 1507 + }, + { + "epoch": 0.07521197007481296, + "grad_norm": 0.49087481920423853, + "learning_rate": 4.9733324323442986e-05, + "loss": 0.9214, + "step": 1508 + }, + { + "epoch": 0.07526184538653366, + "grad_norm": 0.5583942205016936, + "learning_rate": 4.973273571222172e-05, + "loss": 0.9555, + "step": 1509 + }, + { + "epoch": 0.07531172069825437, + "grad_norm": 0.6202575835720227, + "learning_rate": 4.9732146455610786e-05, + "loss": 0.9951, + "step": 1510 + }, + { + "epoch": 0.07536159600997507, + "grad_norm": 0.6359033664109851, + "learning_rate": 4.9731556553625555e-05, + "loss": 0.9729, + "step": 1511 + }, + { + "epoch": 0.07541147132169576, + "grad_norm": 0.5441644328482863, + "learning_rate": 4.973096600628142e-05, + "loss": 0.9715, + "step": 1512 + }, + { + "epoch": 0.07546134663341646, + "grad_norm": 0.49388496460573816, + "learning_rate": 4.9730374813593804e-05, + "loss": 0.9118, + "step": 1513 + }, + { + "epoch": 0.07551122194513715, + "grad_norm": 0.589519329607115, + "learning_rate": 4.972978297557812e-05, + "loss": 0.9523, + "step": 1514 + }, + { + "epoch": 0.07556109725685786, + "grad_norm": 1.0838341249219277, + "learning_rate": 4.972919049224982e-05, + "loss": 1.0223, + "step": 1515 + }, + { + "epoch": 0.07561097256857856, + "grad_norm": 0.5639977357280903, + "learning_rate": 4.972859736362436e-05, + "loss": 0.9598, + "step": 1516 + }, + { + "epoch": 0.07566084788029925, + "grad_norm": 0.5880047543951917, + "learning_rate": 4.9728003589717223e-05, + "loss": 0.9773, + "step": 1517 + }, + { + "epoch": 0.07571072319201995, + "grad_norm": 0.562170518715957, + "learning_rate": 4.972740917054389e-05, + "loss": 0.9613, + "step": 1518 + }, + { + "epoch": 0.07576059850374064, + "grad_norm": 0.5614369146758856, + "learning_rate": 4.972681410611989e-05, + "loss": 0.9251, + "step": 1519 + }, + { + "epoch": 0.07581047381546134, + "grad_norm": 0.6189198396261271, + "learning_rate": 4.9726218396460745e-05, + "loss": 0.9591, + "step": 1520 + }, + { + "epoch": 0.07586034912718205, + "grad_norm": 0.6267367710107746, + "learning_rate": 4.972562204158199e-05, + "loss": 0.9566, + "step": 1521 + }, + { + "epoch": 0.07591022443890275, + "grad_norm": 0.6449511067032628, + "learning_rate": 4.972502504149919e-05, + "loss": 0.947, + "step": 1522 + }, + { + "epoch": 0.07596009975062344, + "grad_norm": 0.6709164084174305, + "learning_rate": 4.972442739622793e-05, + "loss": 0.9724, + "step": 1523 + }, + { + "epoch": 0.07600997506234414, + "grad_norm": 0.6170394168490728, + "learning_rate": 4.97238291057838e-05, + "loss": 0.9494, + "step": 1524 + }, + { + "epoch": 0.07605985037406483, + "grad_norm": 0.5933210929296995, + "learning_rate": 4.9723230170182425e-05, + "loss": 0.9836, + "step": 1525 + }, + { + "epoch": 0.07610972568578554, + "grad_norm": 0.6443832285102064, + "learning_rate": 4.972263058943942e-05, + "loss": 0.9669, + "step": 1526 + }, + { + "epoch": 0.07615960099750624, + "grad_norm": 0.5864112283787146, + "learning_rate": 4.9722030363570425e-05, + "loss": 0.8983, + "step": 1527 + }, + { + "epoch": 0.07620947630922693, + "grad_norm": 0.5881220756773624, + "learning_rate": 4.9721429492591123e-05, + "loss": 0.976, + "step": 1528 + }, + { + "epoch": 0.07625935162094763, + "grad_norm": 0.5548933610291457, + "learning_rate": 4.9720827976517174e-05, + "loss": 0.9134, + "step": 1529 + }, + { + "epoch": 0.07630922693266833, + "grad_norm": 0.5882118445618535, + "learning_rate": 4.972022581536428e-05, + "loss": 0.9437, + "step": 1530 + }, + { + "epoch": 0.07635910224438902, + "grad_norm": 0.5415945531912335, + "learning_rate": 4.9719623009148166e-05, + "loss": 0.9947, + "step": 1531 + }, + { + "epoch": 0.07640897755610973, + "grad_norm": 0.5299734382685901, + "learning_rate": 4.971901955788455e-05, + "loss": 0.9248, + "step": 1532 + }, + { + "epoch": 0.07645885286783043, + "grad_norm": 0.7452114686669802, + "learning_rate": 4.971841546158918e-05, + "loss": 0.9759, + "step": 1533 + }, + { + "epoch": 0.07650872817955112, + "grad_norm": 0.7849778528981957, + "learning_rate": 4.971781072027782e-05, + "loss": 0.8903, + "step": 1534 + }, + { + "epoch": 0.07655860349127182, + "grad_norm": 0.6523030584928486, + "learning_rate": 4.971720533396625e-05, + "loss": 0.9391, + "step": 1535 + }, + { + "epoch": 0.07660847880299251, + "grad_norm": 0.5713871689299911, + "learning_rate": 4.971659930267028e-05, + "loss": 0.9705, + "step": 1536 + }, + { + "epoch": 0.07665835411471322, + "grad_norm": 0.7029342796198136, + "learning_rate": 4.97159926264057e-05, + "loss": 0.963, + "step": 1537 + }, + { + "epoch": 0.07670822942643392, + "grad_norm": 0.7978172381613076, + "learning_rate": 4.971538530518836e-05, + "loss": 0.9402, + "step": 1538 + }, + { + "epoch": 0.07675810473815461, + "grad_norm": 1.0399404582974718, + "learning_rate": 4.97147773390341e-05, + "loss": 0.988, + "step": 1539 + }, + { + "epoch": 0.07680798004987531, + "grad_norm": 0.5590442958363543, + "learning_rate": 4.971416872795879e-05, + "loss": 0.9434, + "step": 1540 + }, + { + "epoch": 0.076857855361596, + "grad_norm": 0.512586634515604, + "learning_rate": 4.97135594719783e-05, + "loss": 0.9748, + "step": 1541 + }, + { + "epoch": 0.0769077306733167, + "grad_norm": 0.6302134952983288, + "learning_rate": 4.971294957110853e-05, + "loss": 0.9522, + "step": 1542 + }, + { + "epoch": 0.07695760598503741, + "grad_norm": 0.6773896673942441, + "learning_rate": 4.971233902536542e-05, + "loss": 0.9138, + "step": 1543 + }, + { + "epoch": 0.0770074812967581, + "grad_norm": 0.5580819683135534, + "learning_rate": 4.971172783476487e-05, + "loss": 0.9113, + "step": 1544 + }, + { + "epoch": 0.0770573566084788, + "grad_norm": 0.8911285595548943, + "learning_rate": 4.9711115999322844e-05, + "loss": 0.9177, + "step": 1545 + }, + { + "epoch": 0.0771072319201995, + "grad_norm": 0.5576577437584506, + "learning_rate": 4.9710503519055305e-05, + "loss": 0.9563, + "step": 1546 + }, + { + "epoch": 0.0771571072319202, + "grad_norm": 0.6541957316975131, + "learning_rate": 4.970989039397824e-05, + "loss": 0.9741, + "step": 1547 + }, + { + "epoch": 0.0772069825436409, + "grad_norm": 0.5388149607140722, + "learning_rate": 4.970927662410764e-05, + "loss": 0.9396, + "step": 1548 + }, + { + "epoch": 0.0772568578553616, + "grad_norm": 0.5216065903817785, + "learning_rate": 4.970866220945953e-05, + "loss": 0.9377, + "step": 1549 + }, + { + "epoch": 0.0773067331670823, + "grad_norm": 0.5940475223748407, + "learning_rate": 4.970804715004993e-05, + "loss": 0.9072, + "step": 1550 + }, + { + "epoch": 0.07735660847880299, + "grad_norm": 0.5416010513535247, + "learning_rate": 4.9707431445894904e-05, + "loss": 0.9493, + "step": 1551 + }, + { + "epoch": 0.07740648379052369, + "grad_norm": 0.5276556583976059, + "learning_rate": 4.9706815097010505e-05, + "loss": 1.0112, + "step": 1552 + }, + { + "epoch": 0.0774563591022444, + "grad_norm": 0.6476346646236032, + "learning_rate": 4.970619810341283e-05, + "loss": 0.9387, + "step": 1553 + }, + { + "epoch": 0.07750623441396509, + "grad_norm": 0.4559790032339434, + "learning_rate": 4.9705580465117974e-05, + "loss": 0.9368, + "step": 1554 + }, + { + "epoch": 0.07755610972568579, + "grad_norm": 0.5003244985567401, + "learning_rate": 4.9704962182142044e-05, + "loss": 0.9768, + "step": 1555 + }, + { + "epoch": 0.07760598503740648, + "grad_norm": 0.57290987702454, + "learning_rate": 4.970434325450119e-05, + "loss": 0.9543, + "step": 1556 + }, + { + "epoch": 0.07765586034912718, + "grad_norm": 0.6954244272857166, + "learning_rate": 4.970372368221155e-05, + "loss": 0.966, + "step": 1557 + }, + { + "epoch": 0.07770573566084787, + "grad_norm": 0.5235196785206919, + "learning_rate": 4.97031034652893e-05, + "loss": 0.8888, + "step": 1558 + }, + { + "epoch": 0.07775561097256858, + "grad_norm": 0.5283389694840467, + "learning_rate": 4.970248260375062e-05, + "loss": 0.9417, + "step": 1559 + }, + { + "epoch": 0.07780548628428928, + "grad_norm": 0.515416019318318, + "learning_rate": 4.970186109761171e-05, + "loss": 0.9434, + "step": 1560 + }, + { + "epoch": 0.07785536159600998, + "grad_norm": 0.4784034583550614, + "learning_rate": 4.9701238946888796e-05, + "loss": 0.9897, + "step": 1561 + }, + { + "epoch": 0.07790523690773067, + "grad_norm": 0.6250541879940422, + "learning_rate": 4.97006161515981e-05, + "loss": 0.9103, + "step": 1562 + }, + { + "epoch": 0.07795511221945137, + "grad_norm": 0.5034799624424634, + "learning_rate": 4.969999271175588e-05, + "loss": 0.921, + "step": 1563 + }, + { + "epoch": 0.07800498753117208, + "grad_norm": 0.4971176828183686, + "learning_rate": 4.9699368627378405e-05, + "loss": 0.915, + "step": 1564 + }, + { + "epoch": 0.07805486284289277, + "grad_norm": 0.5391904484258543, + "learning_rate": 4.9698743898481955e-05, + "loss": 0.9386, + "step": 1565 + }, + { + "epoch": 0.07810473815461347, + "grad_norm": 0.6071897821403325, + "learning_rate": 4.9698118525082846e-05, + "loss": 0.9346, + "step": 1566 + }, + { + "epoch": 0.07815461346633416, + "grad_norm": 0.5295278304652928, + "learning_rate": 4.969749250719739e-05, + "loss": 0.959, + "step": 1567 + }, + { + "epoch": 0.07820448877805486, + "grad_norm": 0.529950156754443, + "learning_rate": 4.969686584484191e-05, + "loss": 0.9784, + "step": 1568 + }, + { + "epoch": 0.07825436408977555, + "grad_norm": 0.8659106495213978, + "learning_rate": 4.969623853803277e-05, + "loss": 0.9445, + "step": 1569 + }, + { + "epoch": 0.07830423940149626, + "grad_norm": 0.6085932944240213, + "learning_rate": 4.969561058678634e-05, + "loss": 0.9819, + "step": 1570 + }, + { + "epoch": 0.07835411471321696, + "grad_norm": 0.5931431074089325, + "learning_rate": 4.9694981991119004e-05, + "loss": 0.9848, + "step": 1571 + }, + { + "epoch": 0.07840399002493766, + "grad_norm": 0.5700553109749608, + "learning_rate": 4.9694352751047165e-05, + "loss": 0.9411, + "step": 1572 + }, + { + "epoch": 0.07845386533665835, + "grad_norm": 0.5336548993714735, + "learning_rate": 4.969372286658725e-05, + "loss": 0.953, + "step": 1573 + }, + { + "epoch": 0.07850374064837905, + "grad_norm": 0.5090310519756827, + "learning_rate": 4.9693092337755674e-05, + "loss": 0.9616, + "step": 1574 + }, + { + "epoch": 0.07855361596009976, + "grad_norm": 0.5039134063457171, + "learning_rate": 4.969246116456891e-05, + "loss": 0.9447, + "step": 1575 + }, + { + "epoch": 0.07860349127182045, + "grad_norm": 0.5216077674429331, + "learning_rate": 4.969182934704343e-05, + "loss": 0.9322, + "step": 1576 + }, + { + "epoch": 0.07865336658354115, + "grad_norm": 0.5088096727812386, + "learning_rate": 4.969119688519571e-05, + "loss": 0.9162, + "step": 1577 + }, + { + "epoch": 0.07870324189526184, + "grad_norm": 0.6213109824345566, + "learning_rate": 4.969056377904226e-05, + "loss": 0.9354, + "step": 1578 + }, + { + "epoch": 0.07875311720698254, + "grad_norm": 0.49106204676892223, + "learning_rate": 4.968993002859959e-05, + "loss": 0.9704, + "step": 1579 + }, + { + "epoch": 0.07880299251870324, + "grad_norm": 0.5552753411272333, + "learning_rate": 4.968929563388425e-05, + "loss": 0.9321, + "step": 1580 + }, + { + "epoch": 0.07885286783042394, + "grad_norm": 0.5813828522492329, + "learning_rate": 4.968866059491279e-05, + "loss": 0.9781, + "step": 1581 + }, + { + "epoch": 0.07890274314214464, + "grad_norm": 0.5162674350964562, + "learning_rate": 4.968802491170178e-05, + "loss": 0.979, + "step": 1582 + }, + { + "epoch": 0.07895261845386534, + "grad_norm": 1.0321824286434274, + "learning_rate": 4.968738858426781e-05, + "loss": 0.9664, + "step": 1583 + }, + { + "epoch": 0.07900249376558603, + "grad_norm": 0.48838756880675294, + "learning_rate": 4.968675161262749e-05, + "loss": 0.9292, + "step": 1584 + }, + { + "epoch": 0.07905236907730673, + "grad_norm": 0.5299128098763938, + "learning_rate": 4.9686113996797426e-05, + "loss": 0.961, + "step": 1585 + }, + { + "epoch": 0.07910224438902744, + "grad_norm": 1.4732641611617416, + "learning_rate": 4.968547573679426e-05, + "loss": 0.92, + "step": 1586 + }, + { + "epoch": 0.07915211970074813, + "grad_norm": 0.4429642529417359, + "learning_rate": 4.968483683263466e-05, + "loss": 0.9465, + "step": 1587 + }, + { + "epoch": 0.07920199501246883, + "grad_norm": 0.8680672887908185, + "learning_rate": 4.96841972843353e-05, + "loss": 0.9391, + "step": 1588 + }, + { + "epoch": 0.07925187032418952, + "grad_norm": 0.6018209913003714, + "learning_rate": 4.968355709191285e-05, + "loss": 0.9579, + "step": 1589 + }, + { + "epoch": 0.07930174563591022, + "grad_norm": 0.5005442409003795, + "learning_rate": 4.968291625538402e-05, + "loss": 0.9414, + "step": 1590 + }, + { + "epoch": 0.07935162094763092, + "grad_norm": 0.9676441877569383, + "learning_rate": 4.968227477476554e-05, + "loss": 0.9763, + "step": 1591 + }, + { + "epoch": 0.07940149625935163, + "grad_norm": 0.5059472741330054, + "learning_rate": 4.968163265007415e-05, + "loss": 0.9246, + "step": 1592 + }, + { + "epoch": 0.07945137157107232, + "grad_norm": 0.6249233975894147, + "learning_rate": 4.96809898813266e-05, + "loss": 0.9137, + "step": 1593 + }, + { + "epoch": 0.07950124688279302, + "grad_norm": 0.7033615609299664, + "learning_rate": 4.968034646853966e-05, + "loss": 1.0153, + "step": 1594 + }, + { + "epoch": 0.07955112219451371, + "grad_norm": 0.5442898042819992, + "learning_rate": 4.9679702411730136e-05, + "loss": 0.8964, + "step": 1595 + }, + { + "epoch": 0.07960099750623441, + "grad_norm": 0.5414955457790812, + "learning_rate": 4.967905771091481e-05, + "loss": 0.9939, + "step": 1596 + }, + { + "epoch": 0.07965087281795512, + "grad_norm": 0.5159839877047793, + "learning_rate": 4.9678412366110525e-05, + "loss": 0.9879, + "step": 1597 + }, + { + "epoch": 0.07970074812967581, + "grad_norm": 0.5803258330687777, + "learning_rate": 4.967776637733411e-05, + "loss": 0.9412, + "step": 1598 + }, + { + "epoch": 0.07975062344139651, + "grad_norm": 0.6087039785283586, + "learning_rate": 4.967711974460243e-05, + "loss": 0.928, + "step": 1599 + }, + { + "epoch": 0.0798004987531172, + "grad_norm": 0.49347050514568075, + "learning_rate": 4.9676472467932355e-05, + "loss": 0.9222, + "step": 1600 + }, + { + "epoch": 0.0798503740648379, + "grad_norm": 0.6105431140278338, + "learning_rate": 4.9675824547340766e-05, + "loss": 0.9556, + "step": 1601 + }, + { + "epoch": 0.07990024937655861, + "grad_norm": 0.5240940677432819, + "learning_rate": 4.967517598284459e-05, + "loss": 0.897, + "step": 1602 + }, + { + "epoch": 0.0799501246882793, + "grad_norm": 0.5940629619074814, + "learning_rate": 4.9674526774460734e-05, + "loss": 1.0326, + "step": 1603 + }, + { + "epoch": 0.08, + "grad_norm": 0.47725146421270725, + "learning_rate": 4.967387692220614e-05, + "loss": 0.9237, + "step": 1604 + }, + { + "epoch": 0.0800498753117207, + "grad_norm": 0.5353329178829889, + "learning_rate": 4.967322642609778e-05, + "loss": 0.9413, + "step": 1605 + }, + { + "epoch": 0.0800997506234414, + "grad_norm": 0.5850926050287447, + "learning_rate": 4.967257528615261e-05, + "loss": 0.9038, + "step": 1606 + }, + { + "epoch": 0.08014962593516209, + "grad_norm": 0.842126713989176, + "learning_rate": 4.967192350238764e-05, + "loss": 0.969, + "step": 1607 + }, + { + "epoch": 0.0801995012468828, + "grad_norm": 0.5621846683616673, + "learning_rate": 4.967127107481986e-05, + "loss": 0.9423, + "step": 1608 + }, + { + "epoch": 0.0802493765586035, + "grad_norm": 0.4566908989148234, + "learning_rate": 4.9670618003466304e-05, + "loss": 0.9506, + "step": 1609 + }, + { + "epoch": 0.08029925187032419, + "grad_norm": 0.5709867451022749, + "learning_rate": 4.966996428834402e-05, + "loss": 0.9743, + "step": 1610 + }, + { + "epoch": 0.08034912718204489, + "grad_norm": 0.8381564473354384, + "learning_rate": 4.966930992947005e-05, + "loss": 0.97, + "step": 1611 + }, + { + "epoch": 0.08039900249376558, + "grad_norm": 1.4957116113445996, + "learning_rate": 4.966865492686148e-05, + "loss": 0.9767, + "step": 1612 + }, + { + "epoch": 0.08044887780548629, + "grad_norm": 1.6474657051484771, + "learning_rate": 4.966799928053539e-05, + "loss": 0.9693, + "step": 1613 + }, + { + "epoch": 0.08049875311720699, + "grad_norm": 0.518892132304445, + "learning_rate": 4.966734299050892e-05, + "loss": 0.9676, + "step": 1614 + }, + { + "epoch": 0.08054862842892768, + "grad_norm": 0.5451014156068865, + "learning_rate": 4.966668605679916e-05, + "loss": 0.9305, + "step": 1615 + }, + { + "epoch": 0.08059850374064838, + "grad_norm": 0.5438152023642968, + "learning_rate": 4.966602847942326e-05, + "loss": 0.937, + "step": 1616 + }, + { + "epoch": 0.08064837905236907, + "grad_norm": 0.6197768433021634, + "learning_rate": 4.9665370258398404e-05, + "loss": 0.9378, + "step": 1617 + }, + { + "epoch": 0.08069825436408977, + "grad_norm": 0.6176981587041569, + "learning_rate": 4.966471139374174e-05, + "loss": 0.9826, + "step": 1618 + }, + { + "epoch": 0.08074812967581048, + "grad_norm": 0.4802754771443494, + "learning_rate": 4.966405188547047e-05, + "loss": 0.9207, + "step": 1619 + }, + { + "epoch": 0.08079800498753117, + "grad_norm": 0.5762438115611037, + "learning_rate": 4.966339173360181e-05, + "loss": 0.9967, + "step": 1620 + }, + { + "epoch": 0.08084788029925187, + "grad_norm": 0.6429074407199623, + "learning_rate": 4.966273093815298e-05, + "loss": 0.9141, + "step": 1621 + }, + { + "epoch": 0.08089775561097257, + "grad_norm": 0.5464049149103422, + "learning_rate": 4.966206949914122e-05, + "loss": 0.9494, + "step": 1622 + }, + { + "epoch": 0.08094763092269326, + "grad_norm": 0.5556581144535617, + "learning_rate": 4.966140741658379e-05, + "loss": 0.8997, + "step": 1623 + }, + { + "epoch": 0.08099750623441397, + "grad_norm": 0.49763419419756016, + "learning_rate": 4.9660744690497976e-05, + "loss": 0.9378, + "step": 1624 + }, + { + "epoch": 0.08104738154613467, + "grad_norm": 0.5610885685187498, + "learning_rate": 4.9660081320901065e-05, + "loss": 0.8815, + "step": 1625 + }, + { + "epoch": 0.08109725685785536, + "grad_norm": 0.5471368050667507, + "learning_rate": 4.965941730781037e-05, + "loss": 0.9705, + "step": 1626 + }, + { + "epoch": 0.08114713216957606, + "grad_norm": 0.5943583602407245, + "learning_rate": 4.9658752651243214e-05, + "loss": 0.9409, + "step": 1627 + }, + { + "epoch": 0.08119700748129675, + "grad_norm": 0.5124136173098117, + "learning_rate": 4.965808735121694e-05, + "loss": 0.9943, + "step": 1628 + }, + { + "epoch": 0.08124688279301745, + "grad_norm": 0.5162192714624475, + "learning_rate": 4.9657421407748915e-05, + "loss": 0.9171, + "step": 1629 + }, + { + "epoch": 0.08129675810473816, + "grad_norm": 0.49711601437924763, + "learning_rate": 4.965675482085652e-05, + "loss": 0.9145, + "step": 1630 + }, + { + "epoch": 0.08134663341645886, + "grad_norm": 0.5637857783770405, + "learning_rate": 4.9656087590557136e-05, + "loss": 0.9549, + "step": 1631 + }, + { + "epoch": 0.08139650872817955, + "grad_norm": 0.6427204864084701, + "learning_rate": 4.9655419716868186e-05, + "loss": 0.9574, + "step": 1632 + }, + { + "epoch": 0.08144638403990025, + "grad_norm": 0.6287428881652459, + "learning_rate": 4.965475119980709e-05, + "loss": 0.9725, + "step": 1633 + }, + { + "epoch": 0.08149625935162094, + "grad_norm": 0.5545480796150988, + "learning_rate": 4.9654082039391295e-05, + "loss": 0.9782, + "step": 1634 + }, + { + "epoch": 0.08154613466334165, + "grad_norm": 0.559326189070364, + "learning_rate": 4.9653412235638265e-05, + "loss": 0.9166, + "step": 1635 + }, + { + "epoch": 0.08159600997506235, + "grad_norm": 0.4987271594698246, + "learning_rate": 4.9652741788565473e-05, + "loss": 0.9401, + "step": 1636 + }, + { + "epoch": 0.08164588528678304, + "grad_norm": 0.5065281314557135, + "learning_rate": 4.965207069819041e-05, + "loss": 0.9477, + "step": 1637 + }, + { + "epoch": 0.08169576059850374, + "grad_norm": 0.6882759775777302, + "learning_rate": 4.9651398964530605e-05, + "loss": 0.947, + "step": 1638 + }, + { + "epoch": 0.08174563591022443, + "grad_norm": 0.7001987402646999, + "learning_rate": 4.965072658760358e-05, + "loss": 1.0592, + "step": 1639 + }, + { + "epoch": 0.08179551122194513, + "grad_norm": 0.5248918994414358, + "learning_rate": 4.965005356742687e-05, + "loss": 0.9219, + "step": 1640 + }, + { + "epoch": 0.08184538653366584, + "grad_norm": 0.5290130771386123, + "learning_rate": 4.9649379904018046e-05, + "loss": 0.9785, + "step": 1641 + }, + { + "epoch": 0.08189526184538654, + "grad_norm": 0.539521393756467, + "learning_rate": 4.9648705597394684e-05, + "loss": 1.0037, + "step": 1642 + }, + { + "epoch": 0.08194513715710723, + "grad_norm": 0.6716782678580987, + "learning_rate": 4.964803064757438e-05, + "loss": 0.975, + "step": 1643 + }, + { + "epoch": 0.08199501246882793, + "grad_norm": 0.48800766858750494, + "learning_rate": 4.964735505457475e-05, + "loss": 0.9562, + "step": 1644 + }, + { + "epoch": 0.08204488778054862, + "grad_norm": 0.5281597188021482, + "learning_rate": 4.964667881841342e-05, + "loss": 1.0006, + "step": 1645 + }, + { + "epoch": 0.08209476309226933, + "grad_norm": 1.1326739943740687, + "learning_rate": 4.964600193910803e-05, + "loss": 0.9507, + "step": 1646 + }, + { + "epoch": 0.08214463840399003, + "grad_norm": 0.7370831731243167, + "learning_rate": 4.964532441667626e-05, + "loss": 0.9792, + "step": 1647 + }, + { + "epoch": 0.08219451371571072, + "grad_norm": 0.46767541312700545, + "learning_rate": 4.964464625113578e-05, + "loss": 0.9268, + "step": 1648 + }, + { + "epoch": 0.08224438902743142, + "grad_norm": 0.4737497660876726, + "learning_rate": 4.9643967442504266e-05, + "loss": 0.9222, + "step": 1649 + }, + { + "epoch": 0.08229426433915212, + "grad_norm": 0.6326352082658009, + "learning_rate": 4.964328799079947e-05, + "loss": 0.9515, + "step": 1650 + }, + { + "epoch": 0.08234413965087282, + "grad_norm": 0.5561214573129949, + "learning_rate": 4.964260789603909e-05, + "loss": 0.9354, + "step": 1651 + }, + { + "epoch": 0.08239401496259352, + "grad_norm": 0.6238089045318654, + "learning_rate": 4.96419271582409e-05, + "loss": 0.9846, + "step": 1652 + }, + { + "epoch": 0.08244389027431422, + "grad_norm": 0.4458305668977047, + "learning_rate": 4.964124577742264e-05, + "loss": 0.9146, + "step": 1653 + }, + { + "epoch": 0.08249376558603491, + "grad_norm": 1.3393306804799596, + "learning_rate": 4.964056375360211e-05, + "loss": 0.9142, + "step": 1654 + }, + { + "epoch": 0.08254364089775561, + "grad_norm": 0.4680825310544083, + "learning_rate": 4.963988108679708e-05, + "loss": 0.922, + "step": 1655 + }, + { + "epoch": 0.0825935162094763, + "grad_norm": 0.5550518146901856, + "learning_rate": 4.963919777702539e-05, + "loss": 0.914, + "step": 1656 + }, + { + "epoch": 0.08264339152119701, + "grad_norm": 0.575602844152435, + "learning_rate": 4.963851382430486e-05, + "loss": 0.9916, + "step": 1657 + }, + { + "epoch": 0.08269326683291771, + "grad_norm": 0.7132036431384218, + "learning_rate": 4.9637829228653346e-05, + "loss": 1.0083, + "step": 1658 + }, + { + "epoch": 0.0827431421446384, + "grad_norm": 0.5190647042552606, + "learning_rate": 4.963714399008869e-05, + "loss": 0.9675, + "step": 1659 + }, + { + "epoch": 0.0827930174563591, + "grad_norm": 0.5991093308444604, + "learning_rate": 4.9636458108628794e-05, + "loss": 0.9428, + "step": 1660 + }, + { + "epoch": 0.0828428927680798, + "grad_norm": 0.4586545239936725, + "learning_rate": 4.963577158429155e-05, + "loss": 0.9579, + "step": 1661 + }, + { + "epoch": 0.0828927680798005, + "grad_norm": 0.4965953400978677, + "learning_rate": 4.963508441709487e-05, + "loss": 0.9006, + "step": 1662 + }, + { + "epoch": 0.0829426433915212, + "grad_norm": 0.5561053314888833, + "learning_rate": 4.963439660705669e-05, + "loss": 0.9577, + "step": 1663 + }, + { + "epoch": 0.0829925187032419, + "grad_norm": 0.5354552379221946, + "learning_rate": 4.9633708154194956e-05, + "loss": 0.967, + "step": 1664 + }, + { + "epoch": 0.08304239401496259, + "grad_norm": 0.4656640609483977, + "learning_rate": 4.963301905852763e-05, + "loss": 0.9486, + "step": 1665 + }, + { + "epoch": 0.08309226932668329, + "grad_norm": 0.519349955182645, + "learning_rate": 4.96323293200727e-05, + "loss": 0.9493, + "step": 1666 + }, + { + "epoch": 0.08314214463840398, + "grad_norm": 1.037641750828994, + "learning_rate": 4.963163893884816e-05, + "loss": 0.926, + "step": 1667 + }, + { + "epoch": 0.0831920199501247, + "grad_norm": 0.49234269161496425, + "learning_rate": 4.963094791487202e-05, + "loss": 0.9432, + "step": 1668 + }, + { + "epoch": 0.08324189526184539, + "grad_norm": 0.519220665811618, + "learning_rate": 4.963025624816232e-05, + "loss": 0.9898, + "step": 1669 + }, + { + "epoch": 0.08329177057356608, + "grad_norm": 0.5198224113722305, + "learning_rate": 4.9629563938737114e-05, + "loss": 0.9475, + "step": 1670 + }, + { + "epoch": 0.08334164588528678, + "grad_norm": 0.6118946140257355, + "learning_rate": 4.962887098661445e-05, + "loss": 0.9851, + "step": 1671 + }, + { + "epoch": 0.08339152119700748, + "grad_norm": 0.48751964667891556, + "learning_rate": 4.962817739181243e-05, + "loss": 0.9383, + "step": 1672 + }, + { + "epoch": 0.08344139650872819, + "grad_norm": 0.5001009456883262, + "learning_rate": 4.962748315434913e-05, + "loss": 0.9441, + "step": 1673 + }, + { + "epoch": 0.08349127182044888, + "grad_norm": 0.7805712287650592, + "learning_rate": 4.962678827424268e-05, + "loss": 0.9215, + "step": 1674 + }, + { + "epoch": 0.08354114713216958, + "grad_norm": 0.498527168851709, + "learning_rate": 4.962609275151122e-05, + "loss": 0.9503, + "step": 1675 + }, + { + "epoch": 0.08359102244389027, + "grad_norm": 0.5057813039784549, + "learning_rate": 4.9625396586172885e-05, + "loss": 0.9506, + "step": 1676 + }, + { + "epoch": 0.08364089775561097, + "grad_norm": 0.4545795207693407, + "learning_rate": 4.962469977824585e-05, + "loss": 0.9547, + "step": 1677 + }, + { + "epoch": 0.08369077306733166, + "grad_norm": 0.700926809060445, + "learning_rate": 4.962400232774829e-05, + "loss": 0.9693, + "step": 1678 + }, + { + "epoch": 0.08374064837905237, + "grad_norm": 0.5228275802255371, + "learning_rate": 4.962330423469842e-05, + "loss": 0.9733, + "step": 1679 + }, + { + "epoch": 0.08379052369077307, + "grad_norm": 0.5246148633400315, + "learning_rate": 4.962260549911444e-05, + "loss": 0.9355, + "step": 1680 + }, + { + "epoch": 0.08384039900249377, + "grad_norm": 0.7069195842983409, + "learning_rate": 4.9621906121014584e-05, + "loss": 0.9518, + "step": 1681 + }, + { + "epoch": 0.08389027431421446, + "grad_norm": 0.6551197359369914, + "learning_rate": 4.962120610041711e-05, + "loss": 0.9538, + "step": 1682 + }, + { + "epoch": 0.08394014962593516, + "grad_norm": 0.47788800554958577, + "learning_rate": 4.9620505437340286e-05, + "loss": 0.9563, + "step": 1683 + }, + { + "epoch": 0.08399002493765587, + "grad_norm": 0.5026349835922814, + "learning_rate": 4.9619804131802385e-05, + "loss": 0.9435, + "step": 1684 + }, + { + "epoch": 0.08403990024937656, + "grad_norm": 0.5961848465745941, + "learning_rate": 4.961910218382172e-05, + "loss": 0.9457, + "step": 1685 + }, + { + "epoch": 0.08408977556109726, + "grad_norm": 0.4875149366895006, + "learning_rate": 4.9618399593416595e-05, + "loss": 0.969, + "step": 1686 + }, + { + "epoch": 0.08413965087281795, + "grad_norm": 1.1707125323292953, + "learning_rate": 4.961769636060536e-05, + "loss": 0.935, + "step": 1687 + }, + { + "epoch": 0.08418952618453865, + "grad_norm": 0.5333212057844975, + "learning_rate": 4.9616992485406346e-05, + "loss": 0.9809, + "step": 1688 + }, + { + "epoch": 0.08423940149625936, + "grad_norm": 0.5269588532092921, + "learning_rate": 4.961628796783794e-05, + "loss": 0.928, + "step": 1689 + }, + { + "epoch": 0.08428927680798005, + "grad_norm": 0.7382395215763023, + "learning_rate": 4.961558280791851e-05, + "loss": 0.9651, + "step": 1690 + }, + { + "epoch": 0.08433915211970075, + "grad_norm": 0.6058434610907523, + "learning_rate": 4.961487700566646e-05, + "loss": 0.9906, + "step": 1691 + }, + { + "epoch": 0.08438902743142145, + "grad_norm": 0.6370320200742688, + "learning_rate": 4.961417056110021e-05, + "loss": 0.9635, + "step": 1692 + }, + { + "epoch": 0.08443890274314214, + "grad_norm": 0.8092695208471901, + "learning_rate": 4.961346347423821e-05, + "loss": 0.9602, + "step": 1693 + }, + { + "epoch": 0.08448877805486284, + "grad_norm": 0.6765184555480286, + "learning_rate": 4.9612755745098885e-05, + "loss": 0.9771, + "step": 1694 + }, + { + "epoch": 0.08453865336658355, + "grad_norm": 0.5635313600417429, + "learning_rate": 4.961204737370071e-05, + "loss": 0.9331, + "step": 1695 + }, + { + "epoch": 0.08458852867830424, + "grad_norm": 0.5131406342315562, + "learning_rate": 4.961133836006218e-05, + "loss": 0.9381, + "step": 1696 + }, + { + "epoch": 0.08463840399002494, + "grad_norm": 0.5551890627818525, + "learning_rate": 4.961062870420179e-05, + "loss": 0.9595, + "step": 1697 + }, + { + "epoch": 0.08468827930174563, + "grad_norm": 0.49622414727660025, + "learning_rate": 4.960991840613806e-05, + "loss": 0.9647, + "step": 1698 + }, + { + "epoch": 0.08473815461346633, + "grad_norm": 0.5358962316155428, + "learning_rate": 4.960920746588952e-05, + "loss": 0.9379, + "step": 1699 + }, + { + "epoch": 0.08478802992518704, + "grad_norm": 0.5141426155305826, + "learning_rate": 4.960849588347473e-05, + "loss": 0.9606, + "step": 1700 + }, + { + "epoch": 0.08483790523690773, + "grad_norm": 0.45117081603471876, + "learning_rate": 4.9607783658912245e-05, + "loss": 0.9396, + "step": 1701 + }, + { + "epoch": 0.08488778054862843, + "grad_norm": 1.0694954784230815, + "learning_rate": 4.960707079222066e-05, + "loss": 0.8698, + "step": 1702 + }, + { + "epoch": 0.08493765586034913, + "grad_norm": 0.5197466927224696, + "learning_rate": 4.9606357283418575e-05, + "loss": 0.9379, + "step": 1703 + }, + { + "epoch": 0.08498753117206982, + "grad_norm": 0.647177226224031, + "learning_rate": 4.960564313252461e-05, + "loss": 0.9397, + "step": 1704 + }, + { + "epoch": 0.08503740648379052, + "grad_norm": 0.646693816285039, + "learning_rate": 4.96049283395574e-05, + "loss": 0.9294, + "step": 1705 + }, + { + "epoch": 0.08508728179551123, + "grad_norm": 0.6031677686723481, + "learning_rate": 4.9604212904535596e-05, + "loss": 0.9854, + "step": 1706 + }, + { + "epoch": 0.08513715710723192, + "grad_norm": 0.5037766232046639, + "learning_rate": 4.9603496827477866e-05, + "loss": 0.9511, + "step": 1707 + }, + { + "epoch": 0.08518703241895262, + "grad_norm": 0.5254936591494513, + "learning_rate": 4.96027801084029e-05, + "loss": 0.9309, + "step": 1708 + }, + { + "epoch": 0.08523690773067331, + "grad_norm": 0.6306667693695363, + "learning_rate": 4.9602062747329404e-05, + "loss": 0.9663, + "step": 1709 + }, + { + "epoch": 0.08528678304239401, + "grad_norm": 0.6364065283750207, + "learning_rate": 4.960134474427608e-05, + "loss": 0.9662, + "step": 1710 + }, + { + "epoch": 0.08533665835411472, + "grad_norm": 0.522927816857988, + "learning_rate": 4.960062609926168e-05, + "loss": 0.9797, + "step": 1711 + }, + { + "epoch": 0.08538653366583542, + "grad_norm": 0.5832174254057866, + "learning_rate": 4.959990681230495e-05, + "loss": 0.9433, + "step": 1712 + }, + { + "epoch": 0.08543640897755611, + "grad_norm": 0.5560079962883351, + "learning_rate": 4.959918688342467e-05, + "loss": 0.9601, + "step": 1713 + }, + { + "epoch": 0.0854862842892768, + "grad_norm": 0.7272028054828951, + "learning_rate": 4.959846631263961e-05, + "loss": 0.9729, + "step": 1714 + }, + { + "epoch": 0.0855361596009975, + "grad_norm": 0.6728535768082675, + "learning_rate": 4.959774509996858e-05, + "loss": 0.9585, + "step": 1715 + }, + { + "epoch": 0.0855860349127182, + "grad_norm": 0.5673702263570002, + "learning_rate": 4.95970232454304e-05, + "loss": 0.9392, + "step": 1716 + }, + { + "epoch": 0.08563591022443891, + "grad_norm": 0.654360126938565, + "learning_rate": 4.959630074904391e-05, + "loss": 1.0051, + "step": 1717 + }, + { + "epoch": 0.0856857855361596, + "grad_norm": 1.2843076530203212, + "learning_rate": 4.959557761082796e-05, + "loss": 0.9797, + "step": 1718 + }, + { + "epoch": 0.0857356608478803, + "grad_norm": 0.6915917155239428, + "learning_rate": 4.959485383080141e-05, + "loss": 0.9196, + "step": 1719 + }, + { + "epoch": 0.085785536159601, + "grad_norm": 0.5231460339761687, + "learning_rate": 4.959412940898317e-05, + "loss": 0.9605, + "step": 1720 + }, + { + "epoch": 0.08583541147132169, + "grad_norm": 0.5437875226268921, + "learning_rate": 4.959340434539213e-05, + "loss": 1.0016, + "step": 1721 + }, + { + "epoch": 0.0858852867830424, + "grad_norm": 1.6310980174666239, + "learning_rate": 4.9592678640047196e-05, + "loss": 0.9691, + "step": 1722 + }, + { + "epoch": 0.0859351620947631, + "grad_norm": 0.5510201363624458, + "learning_rate": 4.959195229296733e-05, + "loss": 0.9261, + "step": 1723 + }, + { + "epoch": 0.08598503740648379, + "grad_norm": 0.6059362355817293, + "learning_rate": 4.9591225304171465e-05, + "loss": 0.9478, + "step": 1724 + }, + { + "epoch": 0.08603491271820449, + "grad_norm": 1.0026276259324192, + "learning_rate": 4.959049767367859e-05, + "loss": 0.9916, + "step": 1725 + }, + { + "epoch": 0.08608478802992518, + "grad_norm": 1.492970226050654, + "learning_rate": 4.958976940150768e-05, + "loss": 0.9439, + "step": 1726 + }, + { + "epoch": 0.08613466334164588, + "grad_norm": 0.5625845721746592, + "learning_rate": 4.9589040487677754e-05, + "loss": 0.984, + "step": 1727 + }, + { + "epoch": 0.08618453865336659, + "grad_norm": 0.6149520075763715, + "learning_rate": 4.9588310932207804e-05, + "loss": 0.9527, + "step": 1728 + }, + { + "epoch": 0.08623441396508728, + "grad_norm": 0.5231660584839657, + "learning_rate": 4.958758073511689e-05, + "loss": 0.9687, + "step": 1729 + }, + { + "epoch": 0.08628428927680798, + "grad_norm": 0.6435888338343053, + "learning_rate": 4.958684989642407e-05, + "loss": 0.9049, + "step": 1730 + }, + { + "epoch": 0.08633416458852868, + "grad_norm": 0.7614356140179578, + "learning_rate": 4.958611841614839e-05, + "loss": 0.9606, + "step": 1731 + }, + { + "epoch": 0.08638403990024937, + "grad_norm": 0.5440080508999755, + "learning_rate": 4.9585386294308966e-05, + "loss": 0.9489, + "step": 1732 + }, + { + "epoch": 0.08643391521197008, + "grad_norm": 0.49943560054114283, + "learning_rate": 4.9584653530924884e-05, + "loss": 0.9444, + "step": 1733 + }, + { + "epoch": 0.08648379052369078, + "grad_norm": 0.5212393906938875, + "learning_rate": 4.958392012601527e-05, + "loss": 0.9998, + "step": 1734 + }, + { + "epoch": 0.08653366583541147, + "grad_norm": 0.5744769011642324, + "learning_rate": 4.958318607959926e-05, + "loss": 0.9457, + "step": 1735 + }, + { + "epoch": 0.08658354114713217, + "grad_norm": 0.8607921076589745, + "learning_rate": 4.9582451391696015e-05, + "loss": 0.9691, + "step": 1736 + }, + { + "epoch": 0.08663341645885286, + "grad_norm": 0.48971123293799806, + "learning_rate": 4.9581716062324715e-05, + "loss": 0.9159, + "step": 1737 + }, + { + "epoch": 0.08668329177057357, + "grad_norm": 0.4623256522393489, + "learning_rate": 4.958098009150451e-05, + "loss": 0.9658, + "step": 1738 + }, + { + "epoch": 0.08673316708229427, + "grad_norm": 0.7876273514060618, + "learning_rate": 4.958024347925465e-05, + "loss": 0.9608, + "step": 1739 + }, + { + "epoch": 0.08678304239401496, + "grad_norm": 0.7282912796967644, + "learning_rate": 4.957950622559433e-05, + "loss": 0.9446, + "step": 1740 + }, + { + "epoch": 0.08683291770573566, + "grad_norm": 0.5256531783819415, + "learning_rate": 4.95787683305428e-05, + "loss": 0.9821, + "step": 1741 + }, + { + "epoch": 0.08688279301745636, + "grad_norm": 0.5781831184751103, + "learning_rate": 4.9578029794119305e-05, + "loss": 0.9751, + "step": 1742 + }, + { + "epoch": 0.08693266832917705, + "grad_norm": 0.7838589210485014, + "learning_rate": 4.957729061634313e-05, + "loss": 0.9884, + "step": 1743 + }, + { + "epoch": 0.08698254364089776, + "grad_norm": 0.7028455084659978, + "learning_rate": 4.957655079723355e-05, + "loss": 0.9384, + "step": 1744 + }, + { + "epoch": 0.08703241895261846, + "grad_norm": 0.5713156135021383, + "learning_rate": 4.9575810336809866e-05, + "loss": 1.0071, + "step": 1745 + }, + { + "epoch": 0.08708229426433915, + "grad_norm": 0.8101942178448204, + "learning_rate": 4.957506923509143e-05, + "loss": 0.9742, + "step": 1746 + }, + { + "epoch": 0.08713216957605985, + "grad_norm": 0.521753091120948, + "learning_rate": 4.957432749209755e-05, + "loss": 0.9709, + "step": 1747 + }, + { + "epoch": 0.08718204488778054, + "grad_norm": 0.869438125227189, + "learning_rate": 4.9573585107847584e-05, + "loss": 0.9594, + "step": 1748 + }, + { + "epoch": 0.08723192019950125, + "grad_norm": 0.4984435928845525, + "learning_rate": 4.9572842082360926e-05, + "loss": 1.0006, + "step": 1749 + }, + { + "epoch": 0.08728179551122195, + "grad_norm": 0.5388840376435589, + "learning_rate": 4.957209841565695e-05, + "loss": 0.9534, + "step": 1750 + }, + { + "epoch": 0.08733167082294264, + "grad_norm": 0.5820195382178501, + "learning_rate": 4.957135410775505e-05, + "loss": 0.9571, + "step": 1751 + }, + { + "epoch": 0.08738154613466334, + "grad_norm": 0.5706028775693689, + "learning_rate": 4.9570609158674676e-05, + "loss": 0.9362, + "step": 1752 + }, + { + "epoch": 0.08743142144638404, + "grad_norm": 0.562975518425692, + "learning_rate": 4.9569863568435246e-05, + "loss": 0.9229, + "step": 1753 + }, + { + "epoch": 0.08748129675810473, + "grad_norm": 0.5487587142648664, + "learning_rate": 4.956911733705623e-05, + "loss": 0.9523, + "step": 1754 + }, + { + "epoch": 0.08753117206982544, + "grad_norm": 0.6106969068641889, + "learning_rate": 4.956837046455708e-05, + "loss": 0.9147, + "step": 1755 + }, + { + "epoch": 0.08758104738154614, + "grad_norm": 0.4663877864435529, + "learning_rate": 4.9567622950957306e-05, + "loss": 0.9434, + "step": 1756 + }, + { + "epoch": 0.08763092269326683, + "grad_norm": 0.6646423651739818, + "learning_rate": 4.9566874796276406e-05, + "loss": 0.9178, + "step": 1757 + }, + { + "epoch": 0.08768079800498753, + "grad_norm": 0.4935393617620018, + "learning_rate": 4.95661260005339e-05, + "loss": 0.9534, + "step": 1758 + }, + { + "epoch": 0.08773067331670822, + "grad_norm": 0.5811334272060847, + "learning_rate": 4.956537656374933e-05, + "loss": 0.9787, + "step": 1759 + }, + { + "epoch": 0.08778054862842893, + "grad_norm": 0.6327355140984843, + "learning_rate": 4.9564626485942254e-05, + "loss": 0.9323, + "step": 1760 + }, + { + "epoch": 0.08783042394014963, + "grad_norm": 0.5105099663259015, + "learning_rate": 4.956387576713224e-05, + "loss": 0.9147, + "step": 1761 + }, + { + "epoch": 0.08788029925187033, + "grad_norm": 0.5602918834908777, + "learning_rate": 4.956312440733889e-05, + "loss": 0.9495, + "step": 1762 + }, + { + "epoch": 0.08793017456359102, + "grad_norm": 0.5051384968070634, + "learning_rate": 4.956237240658179e-05, + "loss": 0.9687, + "step": 1763 + }, + { + "epoch": 0.08798004987531172, + "grad_norm": 0.46356554285429247, + "learning_rate": 4.9561619764880586e-05, + "loss": 0.9345, + "step": 1764 + }, + { + "epoch": 0.08802992518703241, + "grad_norm": 0.47204485689661096, + "learning_rate": 4.95608664822549e-05, + "loss": 0.9284, + "step": 1765 + }, + { + "epoch": 0.08807980049875312, + "grad_norm": 0.8973324997655098, + "learning_rate": 4.95601125587244e-05, + "loss": 0.9247, + "step": 1766 + }, + { + "epoch": 0.08812967581047382, + "grad_norm": 0.66049794525186, + "learning_rate": 4.9559357994308755e-05, + "loss": 0.9865, + "step": 1767 + }, + { + "epoch": 0.08817955112219451, + "grad_norm": 0.4864075298809596, + "learning_rate": 4.955860278902764e-05, + "loss": 0.9545, + "step": 1768 + }, + { + "epoch": 0.08822942643391521, + "grad_norm": 0.5780418303174063, + "learning_rate": 4.955784694290079e-05, + "loss": 0.9175, + "step": 1769 + }, + { + "epoch": 0.0882793017456359, + "grad_norm": 0.6264902630657936, + "learning_rate": 4.9557090455947915e-05, + "loss": 0.9484, + "step": 1770 + }, + { + "epoch": 0.08832917705735661, + "grad_norm": 0.6392558745389987, + "learning_rate": 4.955633332818875e-05, + "loss": 0.9106, + "step": 1771 + }, + { + "epoch": 0.08837905236907731, + "grad_norm": 0.4814276020120612, + "learning_rate": 4.955557555964306e-05, + "loss": 0.931, + "step": 1772 + }, + { + "epoch": 0.088428927680798, + "grad_norm": 0.6828794323360242, + "learning_rate": 4.955481715033061e-05, + "loss": 0.943, + "step": 1773 + }, + { + "epoch": 0.0884788029925187, + "grad_norm": 0.5111006188933688, + "learning_rate": 4.95540581002712e-05, + "loss": 0.9792, + "step": 1774 + }, + { + "epoch": 0.0885286783042394, + "grad_norm": 0.7920649344472368, + "learning_rate": 4.9553298409484634e-05, + "loss": 0.9646, + "step": 1775 + }, + { + "epoch": 0.0885785536159601, + "grad_norm": 0.6385396738481535, + "learning_rate": 4.955253807799073e-05, + "loss": 0.9233, + "step": 1776 + }, + { + "epoch": 0.0886284289276808, + "grad_norm": 0.825568674753704, + "learning_rate": 4.9551777105809336e-05, + "loss": 0.9722, + "step": 1777 + }, + { + "epoch": 0.0886783042394015, + "grad_norm": 0.46812420751580835, + "learning_rate": 4.955101549296031e-05, + "loss": 0.9018, + "step": 1778 + }, + { + "epoch": 0.0887281795511222, + "grad_norm": 0.47335242767557034, + "learning_rate": 4.9550253239463516e-05, + "loss": 0.9377, + "step": 1779 + }, + { + "epoch": 0.08877805486284289, + "grad_norm": 0.5411686900488565, + "learning_rate": 4.954949034533885e-05, + "loss": 0.9594, + "step": 1780 + }, + { + "epoch": 0.08882793017456359, + "grad_norm": 0.5722692067346473, + "learning_rate": 4.954872681060623e-05, + "loss": 0.9624, + "step": 1781 + }, + { + "epoch": 0.0888778054862843, + "grad_norm": 0.632490685714298, + "learning_rate": 4.954796263528556e-05, + "loss": 0.9119, + "step": 1782 + }, + { + "epoch": 0.08892768079800499, + "grad_norm": 0.6965262522963734, + "learning_rate": 4.9547197819396794e-05, + "loss": 0.9361, + "step": 1783 + }, + { + "epoch": 0.08897755610972569, + "grad_norm": 0.7188182642727342, + "learning_rate": 4.954643236295989e-05, + "loss": 0.9543, + "step": 1784 + }, + { + "epoch": 0.08902743142144638, + "grad_norm": 0.5721243840382685, + "learning_rate": 4.954566626599482e-05, + "loss": 0.9585, + "step": 1785 + }, + { + "epoch": 0.08907730673316708, + "grad_norm": 0.48466726955114653, + "learning_rate": 4.9544899528521574e-05, + "loss": 0.943, + "step": 1786 + }, + { + "epoch": 0.08912718204488779, + "grad_norm": 0.5391691117059866, + "learning_rate": 4.9544132150560154e-05, + "loss": 0.9481, + "step": 1787 + }, + { + "epoch": 0.08917705735660848, + "grad_norm": 0.5743124165953953, + "learning_rate": 4.95433641321306e-05, + "loss": 0.9746, + "step": 1788 + }, + { + "epoch": 0.08922693266832918, + "grad_norm": 0.49977416690805315, + "learning_rate": 4.954259547325293e-05, + "loss": 0.9496, + "step": 1789 + }, + { + "epoch": 0.08927680798004987, + "grad_norm": 0.6031161338946215, + "learning_rate": 4.9541826173947226e-05, + "loss": 0.9898, + "step": 1790 + }, + { + "epoch": 0.08932668329177057, + "grad_norm": 0.5246066610674346, + "learning_rate": 4.954105623423354e-05, + "loss": 0.9785, + "step": 1791 + }, + { + "epoch": 0.08937655860349127, + "grad_norm": 0.5703588676033386, + "learning_rate": 4.9540285654131984e-05, + "loss": 0.9493, + "step": 1792 + }, + { + "epoch": 0.08942643391521198, + "grad_norm": 0.47546711654280105, + "learning_rate": 4.953951443366266e-05, + "loss": 0.9556, + "step": 1793 + }, + { + "epoch": 0.08947630922693267, + "grad_norm": 0.4636959943931576, + "learning_rate": 4.9538742572845674e-05, + "loss": 0.9445, + "step": 1794 + }, + { + "epoch": 0.08952618453865337, + "grad_norm": 0.5230881802653221, + "learning_rate": 4.95379700717012e-05, + "loss": 0.9457, + "step": 1795 + }, + { + "epoch": 0.08957605985037406, + "grad_norm": 0.7045277808744861, + "learning_rate": 4.953719693024937e-05, + "loss": 0.9859, + "step": 1796 + }, + { + "epoch": 0.08962593516209476, + "grad_norm": 0.6408932451514563, + "learning_rate": 4.9536423148510366e-05, + "loss": 0.9072, + "step": 1797 + }, + { + "epoch": 0.08967581047381547, + "grad_norm": 0.6496374326327318, + "learning_rate": 4.953564872650438e-05, + "loss": 0.959, + "step": 1798 + }, + { + "epoch": 0.08972568578553616, + "grad_norm": 0.6570704073327831, + "learning_rate": 4.953487366425163e-05, + "loss": 0.8967, + "step": 1799 + }, + { + "epoch": 0.08977556109725686, + "grad_norm": 0.86710250778229, + "learning_rate": 4.9534097961772324e-05, + "loss": 0.9125, + "step": 1800 + }, + { + "epoch": 0.08982543640897755, + "grad_norm": 0.6287947572368646, + "learning_rate": 4.953332161908671e-05, + "loss": 1.0035, + "step": 1801 + }, + { + "epoch": 0.08987531172069825, + "grad_norm": 0.5618727464935752, + "learning_rate": 4.953254463621506e-05, + "loss": 0.988, + "step": 1802 + }, + { + "epoch": 0.08992518703241895, + "grad_norm": 0.5048311953031319, + "learning_rate": 4.9531767013177626e-05, + "loss": 0.996, + "step": 1803 + }, + { + "epoch": 0.08997506234413966, + "grad_norm": 0.5770403741269733, + "learning_rate": 4.9530988749994716e-05, + "loss": 0.9973, + "step": 1804 + }, + { + "epoch": 0.09002493765586035, + "grad_norm": 0.8830412152066295, + "learning_rate": 4.953020984668663e-05, + "loss": 0.9502, + "step": 1805 + }, + { + "epoch": 0.09007481296758105, + "grad_norm": 0.566469458720332, + "learning_rate": 4.95294303032737e-05, + "loss": 0.935, + "step": 1806 + }, + { + "epoch": 0.09012468827930174, + "grad_norm": 0.5795035960776156, + "learning_rate": 4.952865011977627e-05, + "loss": 0.961, + "step": 1807 + }, + { + "epoch": 0.09017456359102244, + "grad_norm": 0.5040281346467336, + "learning_rate": 4.952786929621468e-05, + "loss": 0.9366, + "step": 1808 + }, + { + "epoch": 0.09022443890274315, + "grad_norm": 0.5575022089302298, + "learning_rate": 4.952708783260932e-05, + "loss": 0.9611, + "step": 1809 + }, + { + "epoch": 0.09027431421446384, + "grad_norm": 0.42922626976021433, + "learning_rate": 4.9526305728980584e-05, + "loss": 0.9009, + "step": 1810 + }, + { + "epoch": 0.09032418952618454, + "grad_norm": 0.700800278592465, + "learning_rate": 4.952552298534887e-05, + "loss": 0.9264, + "step": 1811 + }, + { + "epoch": 0.09037406483790524, + "grad_norm": 0.6084348531134438, + "learning_rate": 4.9524739601734615e-05, + "loss": 0.9399, + "step": 1812 + }, + { + "epoch": 0.09042394014962593, + "grad_norm": 0.5272978728296547, + "learning_rate": 4.952395557815826e-05, + "loss": 0.9612, + "step": 1813 + }, + { + "epoch": 0.09047381546134663, + "grad_norm": 0.5228918683302937, + "learning_rate": 4.9523170914640256e-05, + "loss": 0.9383, + "step": 1814 + }, + { + "epoch": 0.09052369077306734, + "grad_norm": 0.8597110652790194, + "learning_rate": 4.952238561120108e-05, + "loss": 0.9621, + "step": 1815 + }, + { + "epoch": 0.09057356608478803, + "grad_norm": 0.4623215682235851, + "learning_rate": 4.952159966786123e-05, + "loss": 0.9998, + "step": 1816 + }, + { + "epoch": 0.09062344139650873, + "grad_norm": 0.5767531398604127, + "learning_rate": 4.95208130846412e-05, + "loss": 0.9652, + "step": 1817 + }, + { + "epoch": 0.09067331670822942, + "grad_norm": 0.6642347012304951, + "learning_rate": 4.952002586156154e-05, + "loss": 1.0057, + "step": 1818 + }, + { + "epoch": 0.09072319201995012, + "grad_norm": 0.6714465417178971, + "learning_rate": 4.951923799864278e-05, + "loss": 0.949, + "step": 1819 + }, + { + "epoch": 0.09077306733167083, + "grad_norm": 0.5062973944286542, + "learning_rate": 4.951844949590547e-05, + "loss": 0.9632, + "step": 1820 + }, + { + "epoch": 0.09082294264339152, + "grad_norm": 0.5866544459290072, + "learning_rate": 4.95176603533702e-05, + "loss": 0.9382, + "step": 1821 + }, + { + "epoch": 0.09087281795511222, + "grad_norm": 0.6487974365605339, + "learning_rate": 4.951687057105755e-05, + "loss": 0.9844, + "step": 1822 + }, + { + "epoch": 0.09092269326683292, + "grad_norm": 0.5935544631752129, + "learning_rate": 4.951608014898813e-05, + "loss": 0.9563, + "step": 1823 + }, + { + "epoch": 0.09097256857855361, + "grad_norm": 0.6222372493517733, + "learning_rate": 4.951528908718258e-05, + "loss": 0.9763, + "step": 1824 + }, + { + "epoch": 0.09102244389027432, + "grad_norm": 0.7872260061759477, + "learning_rate": 4.951449738566153e-05, + "loss": 0.9286, + "step": 1825 + }, + { + "epoch": 0.09107231920199502, + "grad_norm": 0.5888069043865489, + "learning_rate": 4.951370504444565e-05, + "loss": 0.9456, + "step": 1826 + }, + { + "epoch": 0.09112219451371571, + "grad_norm": 0.5175507460896316, + "learning_rate": 4.95129120635556e-05, + "loss": 0.926, + "step": 1827 + }, + { + "epoch": 0.09117206982543641, + "grad_norm": 0.6630626085163197, + "learning_rate": 4.951211844301207e-05, + "loss": 0.9812, + "step": 1828 + }, + { + "epoch": 0.0912219451371571, + "grad_norm": 0.5774403393461616, + "learning_rate": 4.9511324182835796e-05, + "loss": 0.9385, + "step": 1829 + }, + { + "epoch": 0.0912718204488778, + "grad_norm": 0.728224305144558, + "learning_rate": 4.9510529283047476e-05, + "loss": 0.9387, + "step": 1830 + }, + { + "epoch": 0.09132169576059851, + "grad_norm": 0.5341250851278113, + "learning_rate": 4.950973374366787e-05, + "loss": 0.9236, + "step": 1831 + }, + { + "epoch": 0.0913715710723192, + "grad_norm": 0.6457077426536788, + "learning_rate": 4.950893756471773e-05, + "loss": 0.9591, + "step": 1832 + }, + { + "epoch": 0.0914214463840399, + "grad_norm": 0.5195801204209378, + "learning_rate": 4.950814074621783e-05, + "loss": 0.9717, + "step": 1833 + }, + { + "epoch": 0.0914713216957606, + "grad_norm": 0.5729544376624697, + "learning_rate": 4.9507343288188964e-05, + "loss": 0.9494, + "step": 1834 + }, + { + "epoch": 0.09152119700748129, + "grad_norm": 0.5741374703664491, + "learning_rate": 4.950654519065195e-05, + "loss": 0.9225, + "step": 1835 + }, + { + "epoch": 0.091571072319202, + "grad_norm": 0.5094544479458663, + "learning_rate": 4.95057464536276e-05, + "loss": 0.9457, + "step": 1836 + }, + { + "epoch": 0.0916209476309227, + "grad_norm": 0.6734388350446948, + "learning_rate": 4.950494707713677e-05, + "loss": 0.9635, + "step": 1837 + }, + { + "epoch": 0.09167082294264339, + "grad_norm": 0.5287624439305191, + "learning_rate": 4.9504147061200314e-05, + "loss": 0.9365, + "step": 1838 + }, + { + "epoch": 0.09172069825436409, + "grad_norm": 0.6126174985419914, + "learning_rate": 4.95033464058391e-05, + "loss": 0.933, + "step": 1839 + }, + { + "epoch": 0.09177057356608478, + "grad_norm": 0.7854308676447742, + "learning_rate": 4.950254511107403e-05, + "loss": 0.9514, + "step": 1840 + }, + { + "epoch": 0.09182044887780548, + "grad_norm": 0.6130448711672836, + "learning_rate": 4.950174317692602e-05, + "loss": 0.9726, + "step": 1841 + }, + { + "epoch": 0.09187032418952619, + "grad_norm": 0.49566437857907414, + "learning_rate": 4.950094060341598e-05, + "loss": 0.9397, + "step": 1842 + }, + { + "epoch": 0.09192019950124689, + "grad_norm": 0.5634631857800804, + "learning_rate": 4.950013739056486e-05, + "loss": 0.9064, + "step": 1843 + }, + { + "epoch": 0.09197007481296758, + "grad_norm": 0.626741994407735, + "learning_rate": 4.9499333538393615e-05, + "loss": 0.9838, + "step": 1844 + }, + { + "epoch": 0.09201995012468828, + "grad_norm": 0.7514020176945335, + "learning_rate": 4.949852904692323e-05, + "loss": 0.9327, + "step": 1845 + }, + { + "epoch": 0.09206982543640897, + "grad_norm": 0.5579134887947637, + "learning_rate": 4.94977239161747e-05, + "loss": 0.9635, + "step": 1846 + }, + { + "epoch": 0.09211970074812968, + "grad_norm": 0.56967193005487, + "learning_rate": 4.949691814616901e-05, + "loss": 0.997, + "step": 1847 + }, + { + "epoch": 0.09216957605985038, + "grad_norm": 0.599635038516747, + "learning_rate": 4.949611173692723e-05, + "loss": 0.911, + "step": 1848 + }, + { + "epoch": 0.09221945137157107, + "grad_norm": 0.5378677672777087, + "learning_rate": 4.9495304688470356e-05, + "loss": 0.9237, + "step": 1849 + }, + { + "epoch": 0.09226932668329177, + "grad_norm": 0.7328250664386595, + "learning_rate": 4.949449700081948e-05, + "loss": 0.9793, + "step": 1850 + }, + { + "epoch": 0.09231920199501246, + "grad_norm": 0.49352987354949374, + "learning_rate": 4.949368867399567e-05, + "loss": 0.9516, + "step": 1851 + }, + { + "epoch": 0.09236907730673316, + "grad_norm": 0.4986361356888549, + "learning_rate": 4.949287970802e-05, + "loss": 0.9512, + "step": 1852 + }, + { + "epoch": 0.09241895261845387, + "grad_norm": 0.6697602695248073, + "learning_rate": 4.9492070102913607e-05, + "loss": 1.0092, + "step": 1853 + }, + { + "epoch": 0.09246882793017457, + "grad_norm": 0.7712849592180754, + "learning_rate": 4.94912598586976e-05, + "loss": 0.982, + "step": 1854 + }, + { + "epoch": 0.09251870324189526, + "grad_norm": 0.5704387946932442, + "learning_rate": 4.9490448975393134e-05, + "loss": 0.9673, + "step": 1855 + }, + { + "epoch": 0.09256857855361596, + "grad_norm": 0.6815671142115007, + "learning_rate": 4.9489637453021365e-05, + "loss": 1.0284, + "step": 1856 + }, + { + "epoch": 0.09261845386533665, + "grad_norm": 0.6869209296194221, + "learning_rate": 4.948882529160346e-05, + "loss": 0.944, + "step": 1857 + }, + { + "epoch": 0.09266832917705736, + "grad_norm": 0.6041997162104968, + "learning_rate": 4.9488012491160624e-05, + "loss": 0.9377, + "step": 1858 + }, + { + "epoch": 0.09271820448877806, + "grad_norm": 0.641359773853594, + "learning_rate": 4.948719905171406e-05, + "loss": 0.9273, + "step": 1859 + }, + { + "epoch": 0.09276807980049875, + "grad_norm": 0.6786508852018263, + "learning_rate": 4.9486384973284994e-05, + "loss": 0.957, + "step": 1860 + }, + { + "epoch": 0.09281795511221945, + "grad_norm": 0.5815292267986764, + "learning_rate": 4.948557025589467e-05, + "loss": 0.9158, + "step": 1861 + }, + { + "epoch": 0.09286783042394015, + "grad_norm": 0.6388659319440045, + "learning_rate": 4.9484754899564343e-05, + "loss": 0.9348, + "step": 1862 + }, + { + "epoch": 0.09291770573566084, + "grad_norm": 0.6920200022456544, + "learning_rate": 4.9483938904315305e-05, + "loss": 1.0106, + "step": 1863 + }, + { + "epoch": 0.09296758104738155, + "grad_norm": 0.5884607294705967, + "learning_rate": 4.948312227016884e-05, + "loss": 0.9452, + "step": 1864 + }, + { + "epoch": 0.09301745635910225, + "grad_norm": 0.6348710931477252, + "learning_rate": 4.948230499714625e-05, + "loss": 0.9561, + "step": 1865 + }, + { + "epoch": 0.09306733167082294, + "grad_norm": 0.48982830281643425, + "learning_rate": 4.948148708526887e-05, + "loss": 0.9584, + "step": 1866 + }, + { + "epoch": 0.09311720698254364, + "grad_norm": 0.4742703979055658, + "learning_rate": 4.948066853455804e-05, + "loss": 0.9399, + "step": 1867 + }, + { + "epoch": 0.09316708229426433, + "grad_norm": 0.5583199965324993, + "learning_rate": 4.9479849345035125e-05, + "loss": 0.9386, + "step": 1868 + }, + { + "epoch": 0.09321695760598504, + "grad_norm": 0.5109954659899774, + "learning_rate": 4.947902951672149e-05, + "loss": 0.97, + "step": 1869 + }, + { + "epoch": 0.09326683291770574, + "grad_norm": 0.523161767100801, + "learning_rate": 4.947820904963854e-05, + "loss": 0.9409, + "step": 1870 + }, + { + "epoch": 0.09331670822942643, + "grad_norm": 0.8361610233962907, + "learning_rate": 4.947738794380768e-05, + "loss": 0.9782, + "step": 1871 + }, + { + "epoch": 0.09336658354114713, + "grad_norm": 0.5216976220328099, + "learning_rate": 4.947656619925033e-05, + "loss": 0.9576, + "step": 1872 + }, + { + "epoch": 0.09341645885286783, + "grad_norm": 0.5248310021600002, + "learning_rate": 4.9475743815987945e-05, + "loss": 0.9287, + "step": 1873 + }, + { + "epoch": 0.09346633416458854, + "grad_norm": 0.638683318414399, + "learning_rate": 4.947492079404198e-05, + "loss": 0.98, + "step": 1874 + }, + { + "epoch": 0.09351620947630923, + "grad_norm": 0.6049346313301468, + "learning_rate": 4.9474097133433906e-05, + "loss": 0.9296, + "step": 1875 + }, + { + "epoch": 0.09356608478802993, + "grad_norm": 0.5697238455922966, + "learning_rate": 4.947327283418522e-05, + "loss": 0.9296, + "step": 1876 + }, + { + "epoch": 0.09361596009975062, + "grad_norm": 0.5203787742456534, + "learning_rate": 4.9472447896317444e-05, + "loss": 0.9594, + "step": 1877 + }, + { + "epoch": 0.09366583541147132, + "grad_norm": 0.5025173748945324, + "learning_rate": 4.9471622319852085e-05, + "loss": 0.9545, + "step": 1878 + }, + { + "epoch": 0.09371571072319201, + "grad_norm": 0.5449233305431965, + "learning_rate": 4.947079610481069e-05, + "loss": 0.9602, + "step": 1879 + }, + { + "epoch": 0.09376558603491272, + "grad_norm": 0.5127167692689341, + "learning_rate": 4.946996925121482e-05, + "loss": 0.9549, + "step": 1880 + }, + { + "epoch": 0.09381546134663342, + "grad_norm": 0.49590166520649337, + "learning_rate": 4.946914175908606e-05, + "loss": 0.9902, + "step": 1881 + }, + { + "epoch": 0.09386533665835411, + "grad_norm": 0.43366391315011477, + "learning_rate": 4.9468313628445996e-05, + "loss": 0.912, + "step": 1882 + }, + { + "epoch": 0.09391521197007481, + "grad_norm": 0.856768475977194, + "learning_rate": 4.9467484859316236e-05, + "loss": 0.9817, + "step": 1883 + }, + { + "epoch": 0.0939650872817955, + "grad_norm": 0.715769347187793, + "learning_rate": 4.946665545171841e-05, + "loss": 0.9262, + "step": 1884 + }, + { + "epoch": 0.09401496259351622, + "grad_norm": 0.7209099384650267, + "learning_rate": 4.946582540567416e-05, + "loss": 0.9691, + "step": 1885 + }, + { + "epoch": 0.09406483790523691, + "grad_norm": 0.6779289921622428, + "learning_rate": 4.946499472120515e-05, + "loss": 0.9686, + "step": 1886 + }, + { + "epoch": 0.09411471321695761, + "grad_norm": 0.5926317347693282, + "learning_rate": 4.946416339833305e-05, + "loss": 0.9072, + "step": 1887 + }, + { + "epoch": 0.0941645885286783, + "grad_norm": 0.7036975468166067, + "learning_rate": 4.9463331437079554e-05, + "loss": 0.9831, + "step": 1888 + }, + { + "epoch": 0.094214463840399, + "grad_norm": 1.6746601567941513, + "learning_rate": 4.946249883746637e-05, + "loss": 1.0071, + "step": 1889 + }, + { + "epoch": 0.0942643391521197, + "grad_norm": 0.5179545338240371, + "learning_rate": 4.946166559951523e-05, + "loss": 0.9391, + "step": 1890 + }, + { + "epoch": 0.0943142144638404, + "grad_norm": 0.5759389240948928, + "learning_rate": 4.946083172324787e-05, + "loss": 0.9238, + "step": 1891 + }, + { + "epoch": 0.0943640897755611, + "grad_norm": 0.6343325872624167, + "learning_rate": 4.945999720868605e-05, + "loss": 0.9578, + "step": 1892 + }, + { + "epoch": 0.0944139650872818, + "grad_norm": 0.5335806047175212, + "learning_rate": 4.945916205585156e-05, + "loss": 0.9409, + "step": 1893 + }, + { + "epoch": 0.09446384039900249, + "grad_norm": 0.7174676066175079, + "learning_rate": 4.9458326264766173e-05, + "loss": 0.9677, + "step": 1894 + }, + { + "epoch": 0.09451371571072319, + "grad_norm": 0.626398262140688, + "learning_rate": 4.945748983545172e-05, + "loss": 0.9644, + "step": 1895 + }, + { + "epoch": 0.0945635910224439, + "grad_norm": 0.5514223561866939, + "learning_rate": 4.945665276793e-05, + "loss": 0.9503, + "step": 1896 + }, + { + "epoch": 0.09461346633416459, + "grad_norm": 0.5975629936235342, + "learning_rate": 4.945581506222289e-05, + "loss": 0.9683, + "step": 1897 + }, + { + "epoch": 0.09466334164588529, + "grad_norm": 0.4944116546678987, + "learning_rate": 4.945497671835222e-05, + "loss": 0.9256, + "step": 1898 + }, + { + "epoch": 0.09471321695760598, + "grad_norm": 0.6298116495275985, + "learning_rate": 4.9454137736339876e-05, + "loss": 0.9611, + "step": 1899 + }, + { + "epoch": 0.09476309226932668, + "grad_norm": 0.5450596151798184, + "learning_rate": 4.9453298116207754e-05, + "loss": 0.9867, + "step": 1900 + }, + { + "epoch": 0.09481296758104737, + "grad_norm": 0.8520807669857589, + "learning_rate": 4.9452457857977755e-05, + "loss": 0.9661, + "step": 1901 + }, + { + "epoch": 0.09486284289276808, + "grad_norm": 0.5605361254010389, + "learning_rate": 4.945161696167182e-05, + "loss": 1.0064, + "step": 1902 + }, + { + "epoch": 0.09491271820448878, + "grad_norm": 0.5030621486662652, + "learning_rate": 4.945077542731188e-05, + "loss": 0.9611, + "step": 1903 + }, + { + "epoch": 0.09496259351620948, + "grad_norm": 0.5073838422057084, + "learning_rate": 4.94499332549199e-05, + "loss": 0.8872, + "step": 1904 + }, + { + "epoch": 0.09501246882793017, + "grad_norm": 0.5523588225231085, + "learning_rate": 4.9449090444517856e-05, + "loss": 0.9626, + "step": 1905 + }, + { + "epoch": 0.09506234413965087, + "grad_norm": 1.1632598581683928, + "learning_rate": 4.944824699612773e-05, + "loss": 0.9719, + "step": 1906 + }, + { + "epoch": 0.09511221945137158, + "grad_norm": 0.5071902692181302, + "learning_rate": 4.9447402909771555e-05, + "loss": 0.9143, + "step": 1907 + }, + { + "epoch": 0.09516209476309227, + "grad_norm": 0.5877522793955318, + "learning_rate": 4.944655818547133e-05, + "loss": 0.9235, + "step": 1908 + }, + { + "epoch": 0.09521197007481297, + "grad_norm": 0.5916556346550824, + "learning_rate": 4.9445712823249115e-05, + "loss": 0.9706, + "step": 1909 + }, + { + "epoch": 0.09526184538653366, + "grad_norm": 0.4960969507157417, + "learning_rate": 4.9444866823126965e-05, + "loss": 0.9818, + "step": 1910 + }, + { + "epoch": 0.09531172069825436, + "grad_norm": 0.7210157601902175, + "learning_rate": 4.944402018512695e-05, + "loss": 0.9249, + "step": 1911 + }, + { + "epoch": 0.09536159600997507, + "grad_norm": 0.5936260432494239, + "learning_rate": 4.944317290927117e-05, + "loss": 0.985, + "step": 1912 + }, + { + "epoch": 0.09541147132169576, + "grad_norm": 0.551744058499687, + "learning_rate": 4.9442324995581735e-05, + "loss": 0.9617, + "step": 1913 + }, + { + "epoch": 0.09546134663341646, + "grad_norm": 0.5604315277408102, + "learning_rate": 4.944147644408077e-05, + "loss": 0.8902, + "step": 1914 + }, + { + "epoch": 0.09551122194513716, + "grad_norm": 0.547428264856266, + "learning_rate": 4.944062725479042e-05, + "loss": 0.9048, + "step": 1915 + }, + { + "epoch": 0.09556109725685785, + "grad_norm": 0.4830462024993349, + "learning_rate": 4.9439777427732825e-05, + "loss": 0.9219, + "step": 1916 + }, + { + "epoch": 0.09561097256857855, + "grad_norm": 0.601638796959806, + "learning_rate": 4.9438926962930185e-05, + "loss": 0.9278, + "step": 1917 + }, + { + "epoch": 0.09566084788029926, + "grad_norm": 0.7380854385260889, + "learning_rate": 4.943807586040468e-05, + "loss": 0.9634, + "step": 1918 + }, + { + "epoch": 0.09571072319201995, + "grad_norm": 0.7318321101728559, + "learning_rate": 4.9437224120178526e-05, + "loss": 0.9747, + "step": 1919 + }, + { + "epoch": 0.09576059850374065, + "grad_norm": 0.7276022292445773, + "learning_rate": 4.9436371742273946e-05, + "loss": 0.9192, + "step": 1920 + }, + { + "epoch": 0.09581047381546134, + "grad_norm": 2.700838446554186, + "learning_rate": 4.9435518726713185e-05, + "loss": 0.9523, + "step": 1921 + }, + { + "epoch": 0.09586034912718204, + "grad_norm": 0.606677910113384, + "learning_rate": 4.943466507351848e-05, + "loss": 0.9093, + "step": 1922 + }, + { + "epoch": 0.09591022443890275, + "grad_norm": 0.45570387959469827, + "learning_rate": 4.943381078271214e-05, + "loss": 0.9293, + "step": 1923 + }, + { + "epoch": 0.09596009975062345, + "grad_norm": 0.5398244795894599, + "learning_rate": 4.9432955854316444e-05, + "loss": 0.9642, + "step": 1924 + }, + { + "epoch": 0.09600997506234414, + "grad_norm": 1.0462559414217074, + "learning_rate": 4.94321002883537e-05, + "loss": 0.9438, + "step": 1925 + }, + { + "epoch": 0.09605985037406484, + "grad_norm": 0.6163956513618857, + "learning_rate": 4.943124408484623e-05, + "loss": 0.9583, + "step": 1926 + }, + { + "epoch": 0.09610972568578553, + "grad_norm": 0.6021743286862115, + "learning_rate": 4.943038724381638e-05, + "loss": 0.8967, + "step": 1927 + }, + { + "epoch": 0.09615960099750623, + "grad_norm": 0.7600219105384504, + "learning_rate": 4.942952976528651e-05, + "loss": 0.9284, + "step": 1928 + }, + { + "epoch": 0.09620947630922694, + "grad_norm": 0.5245417201316994, + "learning_rate": 4.942867164927899e-05, + "loss": 0.9314, + "step": 1929 + }, + { + "epoch": 0.09625935162094763, + "grad_norm": 0.562869858776199, + "learning_rate": 4.942781289581622e-05, + "loss": 0.9311, + "step": 1930 + }, + { + "epoch": 0.09630922693266833, + "grad_norm": 0.5490005528975049, + "learning_rate": 4.9426953504920607e-05, + "loss": 0.9621, + "step": 1931 + }, + { + "epoch": 0.09635910224438902, + "grad_norm": 0.9809344638558385, + "learning_rate": 4.942609347661457e-05, + "loss": 0.9448, + "step": 1932 + }, + { + "epoch": 0.09640897755610972, + "grad_norm": 0.6292367518224367, + "learning_rate": 4.9425232810920555e-05, + "loss": 0.9497, + "step": 1933 + }, + { + "epoch": 0.09645885286783043, + "grad_norm": 0.5807937837504179, + "learning_rate": 4.942437150786102e-05, + "loss": 0.8982, + "step": 1934 + }, + { + "epoch": 0.09650872817955113, + "grad_norm": 0.5615602615696179, + "learning_rate": 4.942350956745845e-05, + "loss": 0.9256, + "step": 1935 + }, + { + "epoch": 0.09655860349127182, + "grad_norm": 0.49737514698085844, + "learning_rate": 4.9422646989735315e-05, + "loss": 0.9942, + "step": 1936 + }, + { + "epoch": 0.09660847880299252, + "grad_norm": 0.9566751299403289, + "learning_rate": 4.942178377471415e-05, + "loss": 0.9522, + "step": 1937 + }, + { + "epoch": 0.09665835411471321, + "grad_norm": 0.5149159823095968, + "learning_rate": 4.9420919922417455e-05, + "loss": 0.9633, + "step": 1938 + }, + { + "epoch": 0.09670822942643391, + "grad_norm": 0.9901387826603798, + "learning_rate": 4.9420055432867794e-05, + "loss": 0.9798, + "step": 1939 + }, + { + "epoch": 0.09675810473815462, + "grad_norm": 0.6318324532915355, + "learning_rate": 4.941919030608772e-05, + "loss": 0.9501, + "step": 1940 + }, + { + "epoch": 0.09680798004987531, + "grad_norm": 0.5956409790975074, + "learning_rate": 4.9418324542099785e-05, + "loss": 0.9873, + "step": 1941 + }, + { + "epoch": 0.09685785536159601, + "grad_norm": 0.47069754395020896, + "learning_rate": 4.941745814092661e-05, + "loss": 0.9832, + "step": 1942 + }, + { + "epoch": 0.0969077306733167, + "grad_norm": 0.5154115065517053, + "learning_rate": 4.9416591102590793e-05, + "loss": 0.972, + "step": 1943 + }, + { + "epoch": 0.0969576059850374, + "grad_norm": 0.494729596229757, + "learning_rate": 4.941572342711496e-05, + "loss": 0.9767, + "step": 1944 + }, + { + "epoch": 0.09700748129675811, + "grad_norm": 0.6370338882449584, + "learning_rate": 4.9414855114521745e-05, + "loss": 0.9834, + "step": 1945 + }, + { + "epoch": 0.0970573566084788, + "grad_norm": 0.5394695948160029, + "learning_rate": 4.941398616483382e-05, + "loss": 0.9345, + "step": 1946 + }, + { + "epoch": 0.0971072319201995, + "grad_norm": 0.5428701247108031, + "learning_rate": 4.941311657807385e-05, + "loss": 0.9223, + "step": 1947 + }, + { + "epoch": 0.0971571072319202, + "grad_norm": 0.5954582054509, + "learning_rate": 4.9412246354264524e-05, + "loss": 0.9621, + "step": 1948 + }, + { + "epoch": 0.0972069825436409, + "grad_norm": 0.7422258929749015, + "learning_rate": 4.941137549342856e-05, + "loss": 0.9779, + "step": 1949 + }, + { + "epoch": 0.09725685785536159, + "grad_norm": 0.5715323485099961, + "learning_rate": 4.9410503995588674e-05, + "loss": 0.9377, + "step": 1950 + }, + { + "epoch": 0.0973067331670823, + "grad_norm": 0.5574197908656783, + "learning_rate": 4.9409631860767614e-05, + "loss": 0.9292, + "step": 1951 + }, + { + "epoch": 0.097356608478803, + "grad_norm": 0.5008460150896991, + "learning_rate": 4.940875908898813e-05, + "loss": 0.9333, + "step": 1952 + }, + { + "epoch": 0.09740648379052369, + "grad_norm": 0.5610575385758684, + "learning_rate": 4.9407885680273015e-05, + "loss": 0.9366, + "step": 1953 + }, + { + "epoch": 0.09745635910224439, + "grad_norm": 0.5834869629578998, + "learning_rate": 4.940701163464503e-05, + "loss": 0.9668, + "step": 1954 + }, + { + "epoch": 0.09750623441396508, + "grad_norm": 0.5918291781371305, + "learning_rate": 4.9406136952127015e-05, + "loss": 0.9752, + "step": 1955 + }, + { + "epoch": 0.09755610972568579, + "grad_norm": 0.5643830742891026, + "learning_rate": 4.940526163274177e-05, + "loss": 0.9724, + "step": 1956 + }, + { + "epoch": 0.09760598503740649, + "grad_norm": 0.498771618691821, + "learning_rate": 4.940438567651215e-05, + "loss": 0.9555, + "step": 1957 + }, + { + "epoch": 0.09765586034912718, + "grad_norm": 0.5248497712267713, + "learning_rate": 4.9403509083460996e-05, + "loss": 0.9741, + "step": 1958 + }, + { + "epoch": 0.09770573566084788, + "grad_norm": 0.5433127205422995, + "learning_rate": 4.94026318536112e-05, + "loss": 0.9549, + "step": 1959 + }, + { + "epoch": 0.09775561097256857, + "grad_norm": 0.4539808445047683, + "learning_rate": 4.940175398698566e-05, + "loss": 1.0186, + "step": 1960 + }, + { + "epoch": 0.09780548628428928, + "grad_norm": 0.6008706637980555, + "learning_rate": 4.940087548360725e-05, + "loss": 0.9339, + "step": 1961 + }, + { + "epoch": 0.09785536159600998, + "grad_norm": 0.5469834028893946, + "learning_rate": 4.939999634349892e-05, + "loss": 0.954, + "step": 1962 + }, + { + "epoch": 0.09790523690773068, + "grad_norm": 0.5286015057898806, + "learning_rate": 4.939911656668361e-05, + "loss": 0.9643, + "step": 1963 + }, + { + "epoch": 0.09795511221945137, + "grad_norm": 0.5431773789275489, + "learning_rate": 4.939823615318427e-05, + "loss": 0.9865, + "step": 1964 + }, + { + "epoch": 0.09800498753117207, + "grad_norm": 0.533613354287781, + "learning_rate": 4.9397355103023885e-05, + "loss": 0.9407, + "step": 1965 + }, + { + "epoch": 0.09805486284289276, + "grad_norm": 0.6432421251793572, + "learning_rate": 4.939647341622543e-05, + "loss": 0.9768, + "step": 1966 + }, + { + "epoch": 0.09810473815461347, + "grad_norm": 0.4779895701682198, + "learning_rate": 4.939559109281192e-05, + "loss": 0.9311, + "step": 1967 + }, + { + "epoch": 0.09815461346633417, + "grad_norm": 0.581151331388875, + "learning_rate": 4.9394708132806376e-05, + "loss": 0.9581, + "step": 1968 + }, + { + "epoch": 0.09820448877805486, + "grad_norm": 0.4716537541757809, + "learning_rate": 4.9393824536231846e-05, + "loss": 0.9234, + "step": 1969 + }, + { + "epoch": 0.09825436408977556, + "grad_norm": 0.4656234990538594, + "learning_rate": 4.9392940303111386e-05, + "loss": 0.8695, + "step": 1970 + }, + { + "epoch": 0.09830423940149625, + "grad_norm": 0.5668736807444992, + "learning_rate": 4.9392055433468055e-05, + "loss": 0.9122, + "step": 1971 + }, + { + "epoch": 0.09835411471321696, + "grad_norm": 0.6406023475035002, + "learning_rate": 4.9391169927324966e-05, + "loss": 0.9316, + "step": 1972 + }, + { + "epoch": 0.09840399002493766, + "grad_norm": 0.5727644981365109, + "learning_rate": 4.939028378470521e-05, + "loss": 0.9854, + "step": 1973 + }, + { + "epoch": 0.09845386533665836, + "grad_norm": 0.594509432125477, + "learning_rate": 4.93893970056319e-05, + "loss": 0.9811, + "step": 1974 + }, + { + "epoch": 0.09850374064837905, + "grad_norm": 0.5281685243378034, + "learning_rate": 4.938850959012821e-05, + "loss": 0.9456, + "step": 1975 + }, + { + "epoch": 0.09855361596009975, + "grad_norm": 0.709687417391776, + "learning_rate": 4.938762153821726e-05, + "loss": 0.934, + "step": 1976 + }, + { + "epoch": 0.09860349127182044, + "grad_norm": 0.5756907111497145, + "learning_rate": 4.938673284992225e-05, + "loss": 0.9924, + "step": 1977 + }, + { + "epoch": 0.09865336658354115, + "grad_norm": 0.8088067666913515, + "learning_rate": 4.938584352526636e-05, + "loss": 0.927, + "step": 1978 + }, + { + "epoch": 0.09870324189526185, + "grad_norm": 0.5128024286850484, + "learning_rate": 4.93849535642728e-05, + "loss": 0.9707, + "step": 1979 + }, + { + "epoch": 0.09875311720698254, + "grad_norm": 0.6151552903537502, + "learning_rate": 4.9384062966964786e-05, + "loss": 0.9814, + "step": 1980 + }, + { + "epoch": 0.09880299251870324, + "grad_norm": 0.572333850492315, + "learning_rate": 4.938317173336556e-05, + "loss": 0.9724, + "step": 1981 + }, + { + "epoch": 0.09885286783042393, + "grad_norm": 0.5670359447844409, + "learning_rate": 4.938227986349838e-05, + "loss": 0.97, + "step": 1982 + }, + { + "epoch": 0.09890274314214464, + "grad_norm": 0.5514069710601073, + "learning_rate": 4.938138735738652e-05, + "loss": 0.9575, + "step": 1983 + }, + { + "epoch": 0.09895261845386534, + "grad_norm": 0.5186057759638378, + "learning_rate": 4.938049421505327e-05, + "loss": 0.9696, + "step": 1984 + }, + { + "epoch": 0.09900249376558604, + "grad_norm": 0.4920474116307642, + "learning_rate": 4.937960043652193e-05, + "loss": 0.9551, + "step": 1985 + }, + { + "epoch": 0.09905236907730673, + "grad_norm": 0.5386166567631285, + "learning_rate": 4.9378706021815833e-05, + "loss": 0.9517, + "step": 1986 + }, + { + "epoch": 0.09910224438902743, + "grad_norm": 0.5757303660377009, + "learning_rate": 4.9377810970958313e-05, + "loss": 0.9617, + "step": 1987 + }, + { + "epoch": 0.09915211970074812, + "grad_norm": 0.5501944516779771, + "learning_rate": 4.937691528397273e-05, + "loss": 0.9703, + "step": 1988 + }, + { + "epoch": 0.09920199501246883, + "grad_norm": 0.47638436504772885, + "learning_rate": 4.937601896088244e-05, + "loss": 0.9554, + "step": 1989 + }, + { + "epoch": 0.09925187032418953, + "grad_norm": 0.49354022521533203, + "learning_rate": 4.937512200171086e-05, + "loss": 0.9031, + "step": 1990 + }, + { + "epoch": 0.09930174563591022, + "grad_norm": 0.5314887755793279, + "learning_rate": 4.9374224406481365e-05, + "loss": 0.9385, + "step": 1991 + }, + { + "epoch": 0.09935162094763092, + "grad_norm": 0.5646325202498967, + "learning_rate": 4.9373326175217406e-05, + "loss": 0.9443, + "step": 1992 + }, + { + "epoch": 0.09940149625935162, + "grad_norm": 0.4446809569822825, + "learning_rate": 4.93724273079424e-05, + "loss": 0.9289, + "step": 1993 + }, + { + "epoch": 0.09945137157107233, + "grad_norm": 0.4548335888954954, + "learning_rate": 4.937152780467981e-05, + "loss": 0.9124, + "step": 1994 + }, + { + "epoch": 0.09950124688279302, + "grad_norm": 0.5490925968639324, + "learning_rate": 4.937062766545312e-05, + "loss": 0.9535, + "step": 1995 + }, + { + "epoch": 0.09955112219451372, + "grad_norm": 0.5787681157283028, + "learning_rate": 4.93697268902858e-05, + "loss": 0.9845, + "step": 1996 + }, + { + "epoch": 0.09960099750623441, + "grad_norm": 0.5906787012655946, + "learning_rate": 4.936882547920136e-05, + "loss": 0.9141, + "step": 1997 + }, + { + "epoch": 0.09965087281795511, + "grad_norm": 0.46212903134819194, + "learning_rate": 4.936792343222333e-05, + "loss": 0.9363, + "step": 1998 + }, + { + "epoch": 0.0997007481296758, + "grad_norm": 0.534177868533397, + "learning_rate": 4.936702074937524e-05, + "loss": 0.9948, + "step": 1999 + }, + { + "epoch": 0.09975062344139651, + "grad_norm": 0.5414169104407907, + "learning_rate": 4.9366117430680646e-05, + "loss": 0.972, + "step": 2000 + }, + { + "epoch": 0.09980049875311721, + "grad_norm": 0.4850180739180147, + "learning_rate": 4.936521347616313e-05, + "loss": 0.9612, + "step": 2001 + }, + { + "epoch": 0.0998503740648379, + "grad_norm": 0.48116442394315107, + "learning_rate": 4.9364308885846266e-05, + "loss": 0.9672, + "step": 2002 + }, + { + "epoch": 0.0999002493765586, + "grad_norm": 0.5267500428281823, + "learning_rate": 4.936340365975367e-05, + "loss": 0.98, + "step": 2003 + }, + { + "epoch": 0.0999501246882793, + "grad_norm": 0.50368280794104, + "learning_rate": 4.9362497797908965e-05, + "loss": 0.9623, + "step": 2004 + }, + { + "epoch": 0.1, + "grad_norm": 0.6977258506697281, + "learning_rate": 4.936159130033577e-05, + "loss": 0.9508, + "step": 2005 + }, + { + "epoch": 0.1000498753117207, + "grad_norm": 0.42975874388571356, + "learning_rate": 4.9360684167057767e-05, + "loss": 0.9161, + "step": 2006 + }, + { + "epoch": 0.1000997506234414, + "grad_norm": 0.4821868683353146, + "learning_rate": 4.93597763980986e-05, + "loss": 0.9621, + "step": 2007 + }, + { + "epoch": 0.10014962593516209, + "grad_norm": 0.48066168290437555, + "learning_rate": 4.9358867993481985e-05, + "loss": 0.948, + "step": 2008 + }, + { + "epoch": 0.10019950124688279, + "grad_norm": 0.460329059362941, + "learning_rate": 4.9357958953231596e-05, + "loss": 0.9534, + "step": 2009 + }, + { + "epoch": 0.1002493765586035, + "grad_norm": 0.4854145150528035, + "learning_rate": 4.935704927737118e-05, + "loss": 0.9773, + "step": 2010 + }, + { + "epoch": 0.1002992518703242, + "grad_norm": 0.49425864588851676, + "learning_rate": 4.935613896592446e-05, + "loss": 0.9462, + "step": 2011 + }, + { + "epoch": 0.10034912718204489, + "grad_norm": 0.5835361064429015, + "learning_rate": 4.935522801891519e-05, + "loss": 0.9325, + "step": 2012 + }, + { + "epoch": 0.10039900249376559, + "grad_norm": 0.48261488919507367, + "learning_rate": 4.935431643636715e-05, + "loss": 0.926, + "step": 2013 + }, + { + "epoch": 0.10044887780548628, + "grad_norm": 0.5131769367189688, + "learning_rate": 4.935340421830412e-05, + "loss": 0.9543, + "step": 2014 + }, + { + "epoch": 0.10049875311720698, + "grad_norm": 0.5404511248092001, + "learning_rate": 4.935249136474991e-05, + "loss": 0.9683, + "step": 2015 + }, + { + "epoch": 0.10054862842892769, + "grad_norm": 0.47590016587715345, + "learning_rate": 4.935157787572833e-05, + "loss": 0.9978, + "step": 2016 + }, + { + "epoch": 0.10059850374064838, + "grad_norm": 0.5377993968673375, + "learning_rate": 4.935066375126323e-05, + "loss": 0.9563, + "step": 2017 + }, + { + "epoch": 0.10064837905236908, + "grad_norm": 0.5734811966063976, + "learning_rate": 4.934974899137845e-05, + "loss": 1.0205, + "step": 2018 + }, + { + "epoch": 0.10069825436408977, + "grad_norm": 0.835469809416651, + "learning_rate": 4.934883359609788e-05, + "loss": 0.9852, + "step": 2019 + }, + { + "epoch": 0.10074812967581047, + "grad_norm": 0.4807048283186035, + "learning_rate": 4.934791756544538e-05, + "loss": 0.9377, + "step": 2020 + }, + { + "epoch": 0.10079800498753118, + "grad_norm": 0.588301036048829, + "learning_rate": 4.934700089944487e-05, + "loss": 0.9556, + "step": 2021 + }, + { + "epoch": 0.10084788029925187, + "grad_norm": 0.5217280081220529, + "learning_rate": 4.934608359812027e-05, + "loss": 0.9328, + "step": 2022 + }, + { + "epoch": 0.10089775561097257, + "grad_norm": 0.4748595776390641, + "learning_rate": 4.934516566149552e-05, + "loss": 0.9155, + "step": 2023 + }, + { + "epoch": 0.10094763092269327, + "grad_norm": 0.43570824003582986, + "learning_rate": 4.934424708959457e-05, + "loss": 0.9172, + "step": 2024 + }, + { + "epoch": 0.10099750623441396, + "grad_norm": 0.49005779209542794, + "learning_rate": 4.934332788244138e-05, + "loss": 0.9295, + "step": 2025 + }, + { + "epoch": 0.10104738154613466, + "grad_norm": 0.5862439908720501, + "learning_rate": 4.934240804005994e-05, + "loss": 0.9541, + "step": 2026 + }, + { + "epoch": 0.10109725685785537, + "grad_norm": 0.6568988900777731, + "learning_rate": 4.934148756247427e-05, + "loss": 0.9531, + "step": 2027 + }, + { + "epoch": 0.10114713216957606, + "grad_norm": 0.5839271159201149, + "learning_rate": 4.934056644970836e-05, + "loss": 0.9479, + "step": 2028 + }, + { + "epoch": 0.10119700748129676, + "grad_norm": 0.47056343296130254, + "learning_rate": 4.933964470178627e-05, + "loss": 0.9045, + "step": 2029 + }, + { + "epoch": 0.10124688279301745, + "grad_norm": 0.4921257408047291, + "learning_rate": 4.933872231873204e-05, + "loss": 0.9873, + "step": 2030 + }, + { + "epoch": 0.10129675810473815, + "grad_norm": 0.5042182639910562, + "learning_rate": 4.933779930056975e-05, + "loss": 0.9403, + "step": 2031 + }, + { + "epoch": 0.10134663341645886, + "grad_norm": 0.5430184344426024, + "learning_rate": 4.933687564732348e-05, + "loss": 0.954, + "step": 2032 + }, + { + "epoch": 0.10139650872817955, + "grad_norm": 0.5125417164210758, + "learning_rate": 4.933595135901732e-05, + "loss": 0.9226, + "step": 2033 + }, + { + "epoch": 0.10144638403990025, + "grad_norm": 0.5493188707436614, + "learning_rate": 4.9335026435675414e-05, + "loss": 0.922, + "step": 2034 + }, + { + "epoch": 0.10149625935162095, + "grad_norm": 0.5473577500462488, + "learning_rate": 4.9334100877321876e-05, + "loss": 0.9633, + "step": 2035 + }, + { + "epoch": 0.10154613466334164, + "grad_norm": 0.5043099778624235, + "learning_rate": 4.933317468398087e-05, + "loss": 0.9026, + "step": 2036 + }, + { + "epoch": 0.10159600997506234, + "grad_norm": 0.5780349153446642, + "learning_rate": 4.933224785567656e-05, + "loss": 0.9544, + "step": 2037 + }, + { + "epoch": 0.10164588528678305, + "grad_norm": 0.5581041139897681, + "learning_rate": 4.933132039243313e-05, + "loss": 0.9383, + "step": 2038 + }, + { + "epoch": 0.10169576059850374, + "grad_norm": 0.5091355469691884, + "learning_rate": 4.9330392294274787e-05, + "loss": 0.9604, + "step": 2039 + }, + { + "epoch": 0.10174563591022444, + "grad_norm": 0.45160691205578996, + "learning_rate": 4.932946356122574e-05, + "loss": 0.9263, + "step": 2040 + }, + { + "epoch": 0.10179551122194513, + "grad_norm": 0.5387255549418987, + "learning_rate": 4.932853419331024e-05, + "loss": 0.9819, + "step": 2041 + }, + { + "epoch": 0.10184538653366583, + "grad_norm": 0.4367557509206927, + "learning_rate": 4.9327604190552526e-05, + "loss": 0.9145, + "step": 2042 + }, + { + "epoch": 0.10189526184538654, + "grad_norm": 0.5001696712215089, + "learning_rate": 4.9326673552976856e-05, + "loss": 0.9856, + "step": 2043 + }, + { + "epoch": 0.10194513715710724, + "grad_norm": 0.54087175225334, + "learning_rate": 4.932574228060754e-05, + "loss": 0.9804, + "step": 2044 + }, + { + "epoch": 0.10199501246882793, + "grad_norm": 0.5415088678191591, + "learning_rate": 4.932481037346886e-05, + "loss": 0.901, + "step": 2045 + }, + { + "epoch": 0.10204488778054863, + "grad_norm": 0.7206828262651819, + "learning_rate": 4.932387783158514e-05, + "loss": 0.9699, + "step": 2046 + }, + { + "epoch": 0.10209476309226932, + "grad_norm": 0.55118323711449, + "learning_rate": 4.932294465498072e-05, + "loss": 0.9858, + "step": 2047 + }, + { + "epoch": 0.10214463840399003, + "grad_norm": 0.6132294145073011, + "learning_rate": 4.932201084367993e-05, + "loss": 0.9304, + "step": 2048 + }, + { + "epoch": 0.10219451371571073, + "grad_norm": 0.5445028233905249, + "learning_rate": 4.932107639770717e-05, + "loss": 0.9382, + "step": 2049 + }, + { + "epoch": 0.10224438902743142, + "grad_norm": 0.4853690908049323, + "learning_rate": 4.93201413170868e-05, + "loss": 0.9301, + "step": 2050 + }, + { + "epoch": 0.10229426433915212, + "grad_norm": 0.5134063871200132, + "learning_rate": 4.931920560184322e-05, + "loss": 0.9343, + "step": 2051 + }, + { + "epoch": 0.10234413965087281, + "grad_norm": 0.5825535513528318, + "learning_rate": 4.931826925200086e-05, + "loss": 0.9227, + "step": 2052 + }, + { + "epoch": 0.10239401496259351, + "grad_norm": 0.48199230099872764, + "learning_rate": 4.931733226758415e-05, + "loss": 0.9656, + "step": 2053 + }, + { + "epoch": 0.10244389027431422, + "grad_norm": 0.46580088602511577, + "learning_rate": 4.9316394648617524e-05, + "loss": 0.9013, + "step": 2054 + }, + { + "epoch": 0.10249376558603492, + "grad_norm": 0.5514382302213424, + "learning_rate": 4.931545639512547e-05, + "loss": 0.9683, + "step": 2055 + }, + { + "epoch": 0.10254364089775561, + "grad_norm": 0.8322100479744435, + "learning_rate": 4.931451750713247e-05, + "loss": 0.9596, + "step": 2056 + }, + { + "epoch": 0.10259351620947631, + "grad_norm": 0.5084261853794861, + "learning_rate": 4.931357798466301e-05, + "loss": 0.9588, + "step": 2057 + }, + { + "epoch": 0.102643391521197, + "grad_norm": 0.5142104703579916, + "learning_rate": 4.9312637827741623e-05, + "loss": 0.9089, + "step": 2058 + }, + { + "epoch": 0.10269326683291771, + "grad_norm": 0.5211260440340465, + "learning_rate": 4.931169703639282e-05, + "loss": 0.9503, + "step": 2059 + }, + { + "epoch": 0.10274314214463841, + "grad_norm": 0.5590751551311005, + "learning_rate": 4.931075561064117e-05, + "loss": 0.9656, + "step": 2060 + }, + { + "epoch": 0.1027930174563591, + "grad_norm": 0.530081939328558, + "learning_rate": 4.930981355051123e-05, + "loss": 1.0094, + "step": 2061 + }, + { + "epoch": 0.1028428927680798, + "grad_norm": 0.545165558220875, + "learning_rate": 4.9308870856027586e-05, + "loss": 0.9356, + "step": 2062 + }, + { + "epoch": 0.1028927680798005, + "grad_norm": 0.5676626971030038, + "learning_rate": 4.930792752721484e-05, + "loss": 0.9265, + "step": 2063 + }, + { + "epoch": 0.10294264339152119, + "grad_norm": 0.48606314073235624, + "learning_rate": 4.93069835640976e-05, + "loss": 0.9287, + "step": 2064 + }, + { + "epoch": 0.1029925187032419, + "grad_norm": 0.5277707401754324, + "learning_rate": 4.9306038966700504e-05, + "loss": 0.9453, + "step": 2065 + }, + { + "epoch": 0.1030423940149626, + "grad_norm": 0.6545986838211143, + "learning_rate": 4.93050937350482e-05, + "loss": 0.9673, + "step": 2066 + }, + { + "epoch": 0.10309226932668329, + "grad_norm": 0.5382845263429458, + "learning_rate": 4.930414786916535e-05, + "loss": 0.9674, + "step": 2067 + }, + { + "epoch": 0.10314214463840399, + "grad_norm": 0.48622405766954946, + "learning_rate": 4.930320136907664e-05, + "loss": 0.9404, + "step": 2068 + }, + { + "epoch": 0.10319201995012468, + "grad_norm": 0.4860044950765356, + "learning_rate": 4.930225423480677e-05, + "loss": 0.9474, + "step": 2069 + }, + { + "epoch": 0.10324189526184539, + "grad_norm": 0.45435456972462246, + "learning_rate": 4.9301306466380445e-05, + "loss": 0.9448, + "step": 2070 + }, + { + "epoch": 0.10329177057356609, + "grad_norm": 0.5951146081119648, + "learning_rate": 4.9300358063822406e-05, + "loss": 0.9288, + "step": 2071 + }, + { + "epoch": 0.10334164588528678, + "grad_norm": 0.5035955019473398, + "learning_rate": 4.92994090271574e-05, + "loss": 0.9422, + "step": 2072 + }, + { + "epoch": 0.10339152119700748, + "grad_norm": 0.7583857956892734, + "learning_rate": 4.929845935641019e-05, + "loss": 0.8964, + "step": 2073 + }, + { + "epoch": 0.10344139650872818, + "grad_norm": 0.5424090567779116, + "learning_rate": 4.9297509051605555e-05, + "loss": 0.9532, + "step": 2074 + }, + { + "epoch": 0.10349127182044887, + "grad_norm": 0.7084742177918198, + "learning_rate": 4.92965581127683e-05, + "loss": 0.9845, + "step": 2075 + }, + { + "epoch": 0.10354114713216958, + "grad_norm": 0.5957339177341288, + "learning_rate": 4.9295606539923235e-05, + "loss": 0.9596, + "step": 2076 + }, + { + "epoch": 0.10359102244389028, + "grad_norm": 0.6456995021033907, + "learning_rate": 4.9294654333095186e-05, + "loss": 0.9372, + "step": 2077 + }, + { + "epoch": 0.10364089775561097, + "grad_norm": 0.49301474374518667, + "learning_rate": 4.929370149230901e-05, + "loss": 0.9176, + "step": 2078 + }, + { + "epoch": 0.10369077306733167, + "grad_norm": 0.6834817990614821, + "learning_rate": 4.929274801758956e-05, + "loss": 0.9029, + "step": 2079 + }, + { + "epoch": 0.10374064837905236, + "grad_norm": 0.61396397556625, + "learning_rate": 4.9291793908961724e-05, + "loss": 0.9489, + "step": 2080 + }, + { + "epoch": 0.10379052369077307, + "grad_norm": 0.5704790865174945, + "learning_rate": 4.92908391664504e-05, + "loss": 0.97, + "step": 2081 + }, + { + "epoch": 0.10384039900249377, + "grad_norm": 0.5268322109853713, + "learning_rate": 4.92898837900805e-05, + "loss": 0.9122, + "step": 2082 + }, + { + "epoch": 0.10389027431421446, + "grad_norm": 0.5088279418255894, + "learning_rate": 4.928892777987696e-05, + "loss": 0.917, + "step": 2083 + }, + { + "epoch": 0.10394014962593516, + "grad_norm": 0.4910673217773618, + "learning_rate": 4.928797113586471e-05, + "loss": 0.962, + "step": 2084 + }, + { + "epoch": 0.10399002493765586, + "grad_norm": 0.5453388669977404, + "learning_rate": 4.928701385806872e-05, + "loss": 0.9716, + "step": 2085 + }, + { + "epoch": 0.10403990024937655, + "grad_norm": 0.5680935580264959, + "learning_rate": 4.928605594651398e-05, + "loss": 0.9152, + "step": 2086 + }, + { + "epoch": 0.10408977556109726, + "grad_norm": 0.4937566859654678, + "learning_rate": 4.928509740122548e-05, + "loss": 0.9863, + "step": 2087 + }, + { + "epoch": 0.10413965087281796, + "grad_norm": 0.6916277390020092, + "learning_rate": 4.928413822222822e-05, + "loss": 0.9784, + "step": 2088 + }, + { + "epoch": 0.10418952618453865, + "grad_norm": 0.4735494244530301, + "learning_rate": 4.9283178409547256e-05, + "loss": 0.9312, + "step": 2089 + }, + { + "epoch": 0.10423940149625935, + "grad_norm": 0.5855544958883222, + "learning_rate": 4.928221796320761e-05, + "loss": 0.9356, + "step": 2090 + }, + { + "epoch": 0.10428927680798004, + "grad_norm": 0.5730255628671901, + "learning_rate": 4.928125688323436e-05, + "loss": 0.9982, + "step": 2091 + }, + { + "epoch": 0.10433915211970075, + "grad_norm": 0.5663925777279484, + "learning_rate": 4.928029516965257e-05, + "loss": 0.9629, + "step": 2092 + }, + { + "epoch": 0.10438902743142145, + "grad_norm": 0.5206428321267554, + "learning_rate": 4.927933282248736e-05, + "loss": 0.939, + "step": 2093 + }, + { + "epoch": 0.10443890274314215, + "grad_norm": 0.4880502374127603, + "learning_rate": 4.927836984176381e-05, + "loss": 0.9229, + "step": 2094 + }, + { + "epoch": 0.10448877805486284, + "grad_norm": 0.4958944381037113, + "learning_rate": 4.927740622750707e-05, + "loss": 0.967, + "step": 2095 + }, + { + "epoch": 0.10453865336658354, + "grad_norm": 0.5336187420360184, + "learning_rate": 4.9276441979742275e-05, + "loss": 0.9503, + "step": 2096 + }, + { + "epoch": 0.10458852867830425, + "grad_norm": 0.7438820572972136, + "learning_rate": 4.92754770984946e-05, + "loss": 0.9074, + "step": 2097 + }, + { + "epoch": 0.10463840399002494, + "grad_norm": 0.6625368169356541, + "learning_rate": 4.927451158378921e-05, + "loss": 0.9196, + "step": 2098 + }, + { + "epoch": 0.10468827930174564, + "grad_norm": 0.954705236915944, + "learning_rate": 4.92735454356513e-05, + "loss": 0.9793, + "step": 2099 + }, + { + "epoch": 0.10473815461346633, + "grad_norm": 0.581812352187464, + "learning_rate": 4.9272578654106096e-05, + "loss": 0.9771, + "step": 2100 + }, + { + "epoch": 0.10478802992518703, + "grad_norm": 0.7328699733367601, + "learning_rate": 4.927161123917881e-05, + "loss": 0.9474, + "step": 2101 + }, + { + "epoch": 0.10483790523690772, + "grad_norm": 0.4928612142940803, + "learning_rate": 4.9270643190894684e-05, + "loss": 0.9783, + "step": 2102 + }, + { + "epoch": 0.10488778054862843, + "grad_norm": 0.48902087882122885, + "learning_rate": 4.9269674509278996e-05, + "loss": 0.9894, + "step": 2103 + }, + { + "epoch": 0.10493765586034913, + "grad_norm": 0.5614570109431621, + "learning_rate": 4.926870519435701e-05, + "loss": 1.0039, + "step": 2104 + }, + { + "epoch": 0.10498753117206983, + "grad_norm": 0.5550410337747804, + "learning_rate": 4.926773524615402e-05, + "loss": 0.9654, + "step": 2105 + }, + { + "epoch": 0.10503740648379052, + "grad_norm": 0.5470621959417535, + "learning_rate": 4.9266764664695345e-05, + "loss": 0.9446, + "step": 2106 + }, + { + "epoch": 0.10508728179551122, + "grad_norm": 0.6271584878825355, + "learning_rate": 4.92657934500063e-05, + "loss": 0.9459, + "step": 2107 + }, + { + "epoch": 0.10513715710723193, + "grad_norm": 0.6231625715907899, + "learning_rate": 4.926482160211224e-05, + "loss": 0.9151, + "step": 2108 + }, + { + "epoch": 0.10518703241895262, + "grad_norm": 0.54081800395512, + "learning_rate": 4.9263849121038516e-05, + "loss": 0.9341, + "step": 2109 + }, + { + "epoch": 0.10523690773067332, + "grad_norm": 0.4970762010957273, + "learning_rate": 4.926287600681051e-05, + "loss": 0.9446, + "step": 2110 + }, + { + "epoch": 0.10528678304239401, + "grad_norm": 0.5717283383650162, + "learning_rate": 4.9261902259453614e-05, + "loss": 0.9789, + "step": 2111 + }, + { + "epoch": 0.10533665835411471, + "grad_norm": 0.6262210357298206, + "learning_rate": 4.926092787899324e-05, + "loss": 0.9462, + "step": 2112 + }, + { + "epoch": 0.1053865336658354, + "grad_norm": 0.5434136771262591, + "learning_rate": 4.92599528654548e-05, + "loss": 0.9675, + "step": 2113 + }, + { + "epoch": 0.10543640897755611, + "grad_norm": 0.8996487213639118, + "learning_rate": 4.925897721886375e-05, + "loss": 0.9266, + "step": 2114 + }, + { + "epoch": 0.10548628428927681, + "grad_norm": 0.4889231453874916, + "learning_rate": 4.925800093924556e-05, + "loss": 0.9529, + "step": 2115 + }, + { + "epoch": 0.1055361596009975, + "grad_norm": 0.49117238606400804, + "learning_rate": 4.925702402662567e-05, + "loss": 0.9541, + "step": 2116 + }, + { + "epoch": 0.1055860349127182, + "grad_norm": 0.5921300330731675, + "learning_rate": 4.9256046481029614e-05, + "loss": 0.9151, + "step": 2117 + }, + { + "epoch": 0.1056359102244389, + "grad_norm": 0.45885612924632424, + "learning_rate": 4.925506830248287e-05, + "loss": 0.8938, + "step": 2118 + }, + { + "epoch": 0.10568578553615961, + "grad_norm": 0.5694573963655896, + "learning_rate": 4.925408949101098e-05, + "loss": 0.9932, + "step": 2119 + }, + { + "epoch": 0.1057356608478803, + "grad_norm": 0.6766018044219436, + "learning_rate": 4.925311004663948e-05, + "loss": 0.9718, + "step": 2120 + }, + { + "epoch": 0.105785536159601, + "grad_norm": 0.5137165326207576, + "learning_rate": 4.9252129969393926e-05, + "loss": 0.9727, + "step": 2121 + }, + { + "epoch": 0.1058354114713217, + "grad_norm": 0.6860957374731452, + "learning_rate": 4.9251149259299886e-05, + "loss": 0.972, + "step": 2122 + }, + { + "epoch": 0.10588528678304239, + "grad_norm": 0.5329916664309843, + "learning_rate": 4.925016791638297e-05, + "loss": 0.9365, + "step": 2123 + }, + { + "epoch": 0.10593516209476309, + "grad_norm": 0.48443406478672507, + "learning_rate": 4.9249185940668775e-05, + "loss": 0.9268, + "step": 2124 + }, + { + "epoch": 0.1059850374064838, + "grad_norm": 0.6632049955418646, + "learning_rate": 4.924820333218292e-05, + "loss": 0.9787, + "step": 2125 + }, + { + "epoch": 0.10603491271820449, + "grad_norm": 0.4951086018752935, + "learning_rate": 4.924722009095105e-05, + "loss": 1.0066, + "step": 2126 + }, + { + "epoch": 0.10608478802992519, + "grad_norm": 0.9240876469752369, + "learning_rate": 4.9246236216998824e-05, + "loss": 0.9472, + "step": 2127 + }, + { + "epoch": 0.10613466334164588, + "grad_norm": 0.8943972946399521, + "learning_rate": 4.9245251710351916e-05, + "loss": 0.9744, + "step": 2128 + }, + { + "epoch": 0.10618453865336658, + "grad_norm": 0.8363392646866553, + "learning_rate": 4.924426657103602e-05, + "loss": 1.0115, + "step": 2129 + }, + { + "epoch": 0.10623441396508729, + "grad_norm": 0.510288135654721, + "learning_rate": 4.924328079907684e-05, + "loss": 0.9671, + "step": 2130 + }, + { + "epoch": 0.10628428927680798, + "grad_norm": 0.4492298493801823, + "learning_rate": 4.924229439450009e-05, + "loss": 0.9091, + "step": 2131 + }, + { + "epoch": 0.10633416458852868, + "grad_norm": 0.4825880121850957, + "learning_rate": 4.924130735733152e-05, + "loss": 0.9137, + "step": 2132 + }, + { + "epoch": 0.10638403990024937, + "grad_norm": 0.6283465195197356, + "learning_rate": 4.924031968759688e-05, + "loss": 0.9864, + "step": 2133 + }, + { + "epoch": 0.10643391521197007, + "grad_norm": 0.49399808213466895, + "learning_rate": 4.9239331385321955e-05, + "loss": 0.9679, + "step": 2134 + }, + { + "epoch": 0.10648379052369077, + "grad_norm": 0.5139214984831153, + "learning_rate": 4.9238342450532516e-05, + "loss": 0.9429, + "step": 2135 + }, + { + "epoch": 0.10653366583541148, + "grad_norm": 0.44795076265834155, + "learning_rate": 4.9237352883254386e-05, + "loss": 0.9891, + "step": 2136 + }, + { + "epoch": 0.10658354114713217, + "grad_norm": 0.7168856118976078, + "learning_rate": 4.923636268351337e-05, + "loss": 0.9497, + "step": 2137 + }, + { + "epoch": 0.10663341645885287, + "grad_norm": 0.49138968383666365, + "learning_rate": 4.923537185133532e-05, + "loss": 1.0123, + "step": 2138 + }, + { + "epoch": 0.10668329177057356, + "grad_norm": 0.5815385110166404, + "learning_rate": 4.923438038674609e-05, + "loss": 0.9279, + "step": 2139 + }, + { + "epoch": 0.10673316708229426, + "grad_norm": 0.7449060362794471, + "learning_rate": 4.923338828977155e-05, + "loss": 0.9667, + "step": 2140 + }, + { + "epoch": 0.10678304239401497, + "grad_norm": 1.1273215042230254, + "learning_rate": 4.9232395560437594e-05, + "loss": 0.9456, + "step": 2141 + }, + { + "epoch": 0.10683291770573566, + "grad_norm": 0.8047689692172918, + "learning_rate": 4.9231402198770115e-05, + "loss": 0.9205, + "step": 2142 + }, + { + "epoch": 0.10688279301745636, + "grad_norm": 0.4980994050979361, + "learning_rate": 4.9230408204795034e-05, + "loss": 0.8947, + "step": 2143 + }, + { + "epoch": 0.10693266832917706, + "grad_norm": 0.7533083985672074, + "learning_rate": 4.9229413578538305e-05, + "loss": 0.9507, + "step": 2144 + }, + { + "epoch": 0.10698254364089775, + "grad_norm": 0.503244828165899, + "learning_rate": 4.922841832002587e-05, + "loss": 0.9495, + "step": 2145 + }, + { + "epoch": 0.10703241895261846, + "grad_norm": 0.48640551323687276, + "learning_rate": 4.92274224292837e-05, + "loss": 0.8999, + "step": 2146 + }, + { + "epoch": 0.10708229426433916, + "grad_norm": 0.5917671383871614, + "learning_rate": 4.922642590633778e-05, + "loss": 0.9615, + "step": 2147 + }, + { + "epoch": 0.10713216957605985, + "grad_norm": 0.8129658694327965, + "learning_rate": 4.922542875121413e-05, + "loss": 0.9362, + "step": 2148 + }, + { + "epoch": 0.10718204488778055, + "grad_norm": 0.524577501576292, + "learning_rate": 4.922443096393875e-05, + "loss": 0.933, + "step": 2149 + }, + { + "epoch": 0.10723192019950124, + "grad_norm": 0.5375601180526146, + "learning_rate": 4.922343254453768e-05, + "loss": 1.0101, + "step": 2150 + }, + { + "epoch": 0.10728179551122194, + "grad_norm": 0.6199765481676232, + "learning_rate": 4.922243349303699e-05, + "loss": 0.9695, + "step": 2151 + }, + { + "epoch": 0.10733167082294265, + "grad_norm": 0.4403637605832131, + "learning_rate": 4.922143380946273e-05, + "loss": 0.9371, + "step": 2152 + }, + { + "epoch": 0.10738154613466334, + "grad_norm": 0.4994987781710597, + "learning_rate": 4.922043349384101e-05, + "loss": 0.9529, + "step": 2153 + }, + { + "epoch": 0.10743142144638404, + "grad_norm": 0.5325807003523317, + "learning_rate": 4.92194325461979e-05, + "loss": 0.9446, + "step": 2154 + }, + { + "epoch": 0.10748129675810474, + "grad_norm": 0.6480431023064931, + "learning_rate": 4.921843096655955e-05, + "loss": 0.9285, + "step": 2155 + }, + { + "epoch": 0.10753117206982543, + "grad_norm": 0.872301779360183, + "learning_rate": 4.9217428754952074e-05, + "loss": 0.9481, + "step": 2156 + }, + { + "epoch": 0.10758104738154614, + "grad_norm": 0.5191513190080643, + "learning_rate": 4.921642591140164e-05, + "loss": 0.9654, + "step": 2157 + }, + { + "epoch": 0.10763092269326684, + "grad_norm": 0.6781794662490251, + "learning_rate": 4.92154224359344e-05, + "loss": 0.9556, + "step": 2158 + }, + { + "epoch": 0.10768079800498753, + "grad_norm": 0.603198568430948, + "learning_rate": 4.921441832857656e-05, + "loss": 0.9751, + "step": 2159 + }, + { + "epoch": 0.10773067331670823, + "grad_norm": 0.5640038925656238, + "learning_rate": 4.9213413589354304e-05, + "loss": 0.9923, + "step": 2160 + }, + { + "epoch": 0.10778054862842892, + "grad_norm": 0.5547868451034846, + "learning_rate": 4.9212408218293853e-05, + "loss": 0.9289, + "step": 2161 + }, + { + "epoch": 0.10783042394014962, + "grad_norm": 0.5675472426916536, + "learning_rate": 4.921140221542145e-05, + "loss": 0.9054, + "step": 2162 + }, + { + "epoch": 0.10788029925187033, + "grad_norm": 0.9834186542450469, + "learning_rate": 4.921039558076335e-05, + "loss": 0.9495, + "step": 2163 + }, + { + "epoch": 0.10793017456359102, + "grad_norm": 0.5377210374148377, + "learning_rate": 4.92093883143458e-05, + "loss": 0.928, + "step": 2164 + }, + { + "epoch": 0.10798004987531172, + "grad_norm": 0.47652362109116186, + "learning_rate": 4.92083804161951e-05, + "loss": 0.9492, + "step": 2165 + }, + { + "epoch": 0.10802992518703242, + "grad_norm": 0.5531180608322463, + "learning_rate": 4.920737188633755e-05, + "loss": 0.9809, + "step": 2166 + }, + { + "epoch": 0.10807980049875311, + "grad_norm": 0.8077614111534287, + "learning_rate": 4.920636272479946e-05, + "loss": 0.9439, + "step": 2167 + }, + { + "epoch": 0.10812967581047382, + "grad_norm": 0.5229879276910051, + "learning_rate": 4.920535293160717e-05, + "loss": 0.9733, + "step": 2168 + }, + { + "epoch": 0.10817955112219452, + "grad_norm": 0.5198234112088786, + "learning_rate": 4.920434250678703e-05, + "loss": 0.9393, + "step": 2169 + }, + { + "epoch": 0.10822942643391521, + "grad_norm": 0.5837966298337645, + "learning_rate": 4.920333145036541e-05, + "loss": 0.953, + "step": 2170 + }, + { + "epoch": 0.10827930174563591, + "grad_norm": 0.5883297041215377, + "learning_rate": 4.920231976236868e-05, + "loss": 0.9504, + "step": 2171 + }, + { + "epoch": 0.1083291770573566, + "grad_norm": 0.5849302313556681, + "learning_rate": 4.920130744282325e-05, + "loss": 0.9607, + "step": 2172 + }, + { + "epoch": 0.1083790523690773, + "grad_norm": 0.5278577177816393, + "learning_rate": 4.920029449175553e-05, + "loss": 0.945, + "step": 2173 + }, + { + "epoch": 0.10842892768079801, + "grad_norm": 0.652139064572397, + "learning_rate": 4.919928090919196e-05, + "loss": 0.944, + "step": 2174 + }, + { + "epoch": 0.1084788029925187, + "grad_norm": 0.5282161013132144, + "learning_rate": 4.9198266695158984e-05, + "loss": 0.9468, + "step": 2175 + }, + { + "epoch": 0.1085286783042394, + "grad_norm": 0.48903382050432825, + "learning_rate": 4.9197251849683066e-05, + "loss": 0.9301, + "step": 2176 + }, + { + "epoch": 0.1085785536159601, + "grad_norm": 0.49560620564627267, + "learning_rate": 4.919623637279069e-05, + "loss": 0.9742, + "step": 2177 + }, + { + "epoch": 0.10862842892768079, + "grad_norm": 0.6514606452811192, + "learning_rate": 4.919522026450836e-05, + "loss": 0.9588, + "step": 2178 + }, + { + "epoch": 0.1086783042394015, + "grad_norm": 0.5136082917032899, + "learning_rate": 4.919420352486258e-05, + "loss": 0.9392, + "step": 2179 + }, + { + "epoch": 0.1087281795511222, + "grad_norm": 0.5040259035470902, + "learning_rate": 4.919318615387989e-05, + "loss": 0.9106, + "step": 2180 + }, + { + "epoch": 0.1087780548628429, + "grad_norm": 0.45841196986323746, + "learning_rate": 4.919216815158684e-05, + "loss": 0.9508, + "step": 2181 + }, + { + "epoch": 0.10882793017456359, + "grad_norm": 0.5140352057326685, + "learning_rate": 4.9191149518009976e-05, + "loss": 0.913, + "step": 2182 + }, + { + "epoch": 0.10887780548628428, + "grad_norm": 0.4779132446173379, + "learning_rate": 4.91901302531759e-05, + "loss": 0.9686, + "step": 2183 + }, + { + "epoch": 0.108927680798005, + "grad_norm": 1.681093849397516, + "learning_rate": 4.91891103571112e-05, + "loss": 0.9086, + "step": 2184 + }, + { + "epoch": 0.10897755610972569, + "grad_norm": 0.7030836684205758, + "learning_rate": 4.918808982984249e-05, + "loss": 1.0598, + "step": 2185 + }, + { + "epoch": 0.10902743142144639, + "grad_norm": 0.9318452914324424, + "learning_rate": 4.9187068671396405e-05, + "loss": 0.9894, + "step": 2186 + }, + { + "epoch": 0.10907730673316708, + "grad_norm": 0.5232937768811354, + "learning_rate": 4.918604688179958e-05, + "loss": 0.9451, + "step": 2187 + }, + { + "epoch": 0.10912718204488778, + "grad_norm": 0.5863007302801533, + "learning_rate": 4.9185024461078695e-05, + "loss": 0.9362, + "step": 2188 + }, + { + "epoch": 0.10917705735660847, + "grad_norm": 0.45255454780313953, + "learning_rate": 4.918400140926042e-05, + "loss": 0.9086, + "step": 2189 + }, + { + "epoch": 0.10922693266832918, + "grad_norm": 0.49251902194449243, + "learning_rate": 4.918297772637144e-05, + "loss": 0.9228, + "step": 2190 + }, + { + "epoch": 0.10927680798004988, + "grad_norm": 0.48596099975207085, + "learning_rate": 4.9181953412438485e-05, + "loss": 0.9513, + "step": 2191 + }, + { + "epoch": 0.10932668329177057, + "grad_norm": 0.6516068952199603, + "learning_rate": 4.9180928467488284e-05, + "loss": 0.9313, + "step": 2192 + }, + { + "epoch": 0.10937655860349127, + "grad_norm": 0.5072922942795988, + "learning_rate": 4.917990289154757e-05, + "loss": 0.9212, + "step": 2193 + }, + { + "epoch": 0.10942643391521197, + "grad_norm": 0.61661321227931, + "learning_rate": 4.9178876684643116e-05, + "loss": 0.9731, + "step": 2194 + }, + { + "epoch": 0.10947630922693267, + "grad_norm": 0.6217243078970434, + "learning_rate": 4.9177849846801696e-05, + "loss": 0.9282, + "step": 2195 + }, + { + "epoch": 0.10952618453865337, + "grad_norm": 0.49440521363839784, + "learning_rate": 4.91768223780501e-05, + "loss": 0.9467, + "step": 2196 + }, + { + "epoch": 0.10957605985037407, + "grad_norm": 0.5106643634379535, + "learning_rate": 4.9175794278415144e-05, + "loss": 0.9383, + "step": 2197 + }, + { + "epoch": 0.10962593516209476, + "grad_norm": 0.5437301164970021, + "learning_rate": 4.9174765547923665e-05, + "loss": 0.9999, + "step": 2198 + }, + { + "epoch": 0.10967581047381546, + "grad_norm": 0.5893395545666452, + "learning_rate": 4.9173736186602495e-05, + "loss": 0.994, + "step": 2199 + }, + { + "epoch": 0.10972568578553615, + "grad_norm": 0.7564780592156951, + "learning_rate": 4.917270619447849e-05, + "loss": 0.9902, + "step": 2200 + }, + { + "epoch": 0.10977556109725686, + "grad_norm": 0.5824040807185759, + "learning_rate": 4.917167557157854e-05, + "loss": 0.903, + "step": 2201 + }, + { + "epoch": 0.10982543640897756, + "grad_norm": 0.5133730030653803, + "learning_rate": 4.9170644317929534e-05, + "loss": 0.9504, + "step": 2202 + }, + { + "epoch": 0.10987531172069825, + "grad_norm": 0.5694467949737324, + "learning_rate": 4.9169612433558385e-05, + "loss": 0.9855, + "step": 2203 + }, + { + "epoch": 0.10992518703241895, + "grad_norm": 0.667376657104272, + "learning_rate": 4.9168579918492006e-05, + "loss": 0.9305, + "step": 2204 + }, + { + "epoch": 0.10997506234413965, + "grad_norm": 1.1045509616013234, + "learning_rate": 4.916754677275736e-05, + "loss": 0.981, + "step": 2205 + }, + { + "epoch": 0.11002493765586036, + "grad_norm": 0.6476002532922259, + "learning_rate": 4.91665129963814e-05, + "loss": 1.0244, + "step": 2206 + }, + { + "epoch": 0.11007481296758105, + "grad_norm": 0.571621401943337, + "learning_rate": 4.916547858939109e-05, + "loss": 0.9157, + "step": 2207 + }, + { + "epoch": 0.11012468827930175, + "grad_norm": 0.5689048476548823, + "learning_rate": 4.9164443551813434e-05, + "loss": 0.9818, + "step": 2208 + }, + { + "epoch": 0.11017456359102244, + "grad_norm": 0.5878869567092491, + "learning_rate": 4.916340788367544e-05, + "loss": 0.9951, + "step": 2209 + }, + { + "epoch": 0.11022443890274314, + "grad_norm": 1.1683348651709629, + "learning_rate": 4.916237158500413e-05, + "loss": 0.9646, + "step": 2210 + }, + { + "epoch": 0.11027431421446383, + "grad_norm": 0.818516689699712, + "learning_rate": 4.9161334655826555e-05, + "loss": 0.9458, + "step": 2211 + }, + { + "epoch": 0.11032418952618454, + "grad_norm": 0.6947915028975327, + "learning_rate": 4.916029709616975e-05, + "loss": 0.9005, + "step": 2212 + }, + { + "epoch": 0.11037406483790524, + "grad_norm": 0.9834996061705764, + "learning_rate": 4.915925890606081e-05, + "loss": 0.9673, + "step": 2213 + }, + { + "epoch": 0.11042394014962593, + "grad_norm": 0.8366330085425976, + "learning_rate": 4.915822008552683e-05, + "loss": 0.9516, + "step": 2214 + }, + { + "epoch": 0.11047381546134663, + "grad_norm": 0.8673434203065927, + "learning_rate": 4.91571806345949e-05, + "loss": 0.9716, + "step": 2215 + }, + { + "epoch": 0.11052369077306733, + "grad_norm": 1.1017289481808372, + "learning_rate": 4.915614055329216e-05, + "loss": 0.9529, + "step": 2216 + }, + { + "epoch": 0.11057356608478804, + "grad_norm": 0.49280039426363803, + "learning_rate": 4.915509984164573e-05, + "loss": 0.9385, + "step": 2217 + }, + { + "epoch": 0.11062344139650873, + "grad_norm": 0.5573426231435731, + "learning_rate": 4.915405849968279e-05, + "loss": 0.9493, + "step": 2218 + }, + { + "epoch": 0.11067331670822943, + "grad_norm": 0.5779957749037176, + "learning_rate": 4.915301652743051e-05, + "loss": 0.9282, + "step": 2219 + }, + { + "epoch": 0.11072319201995012, + "grad_norm": 0.5811709190929899, + "learning_rate": 4.915197392491606e-05, + "loss": 1.0013, + "step": 2220 + }, + { + "epoch": 0.11077306733167082, + "grad_norm": 0.5518406595056247, + "learning_rate": 4.915093069216666e-05, + "loss": 0.948, + "step": 2221 + }, + { + "epoch": 0.11082294264339151, + "grad_norm": 0.4606324689679128, + "learning_rate": 4.914988682920955e-05, + "loss": 0.9274, + "step": 2222 + }, + { + "epoch": 0.11087281795511222, + "grad_norm": 0.66721176503625, + "learning_rate": 4.914884233607193e-05, + "loss": 0.982, + "step": 2223 + }, + { + "epoch": 0.11092269326683292, + "grad_norm": 0.5066529624998839, + "learning_rate": 4.914779721278109e-05, + "loss": 0.9271, + "step": 2224 + }, + { + "epoch": 0.11097256857855362, + "grad_norm": 0.5866963808394862, + "learning_rate": 4.9146751459364285e-05, + "loss": 0.9361, + "step": 2225 + }, + { + "epoch": 0.11102244389027431, + "grad_norm": 0.6297290902781976, + "learning_rate": 4.9145705075848805e-05, + "loss": 0.9708, + "step": 2226 + }, + { + "epoch": 0.111072319201995, + "grad_norm": 0.7176759120801034, + "learning_rate": 4.914465806226195e-05, + "loss": 0.9669, + "step": 2227 + }, + { + "epoch": 0.11112219451371572, + "grad_norm": 0.5757112475002585, + "learning_rate": 4.914361041863106e-05, + "loss": 0.9265, + "step": 2228 + }, + { + "epoch": 0.11117206982543641, + "grad_norm": 0.5287196077170804, + "learning_rate": 4.914256214498346e-05, + "loss": 0.9343, + "step": 2229 + }, + { + "epoch": 0.11122194513715711, + "grad_norm": 0.6484129180110411, + "learning_rate": 4.9141513241346506e-05, + "loss": 0.9134, + "step": 2230 + }, + { + "epoch": 0.1112718204488778, + "grad_norm": 0.5433610445502484, + "learning_rate": 4.914046370774756e-05, + "loss": 0.9364, + "step": 2231 + }, + { + "epoch": 0.1113216957605985, + "grad_norm": 0.6918001121441978, + "learning_rate": 4.913941354421403e-05, + "loss": 0.9522, + "step": 2232 + }, + { + "epoch": 0.11137157107231921, + "grad_norm": 0.5927584209122541, + "learning_rate": 4.91383627507733e-05, + "loss": 0.9166, + "step": 2233 + }, + { + "epoch": 0.1114214463840399, + "grad_norm": 0.676619751856722, + "learning_rate": 4.91373113274528e-05, + "loss": 0.9672, + "step": 2234 + }, + { + "epoch": 0.1114713216957606, + "grad_norm": 0.574280947084235, + "learning_rate": 4.9136259274279955e-05, + "loss": 0.9323, + "step": 2235 + }, + { + "epoch": 0.1115211970074813, + "grad_norm": 0.5779639867197836, + "learning_rate": 4.913520659128223e-05, + "loss": 0.9574, + "step": 2236 + }, + { + "epoch": 0.11157107231920199, + "grad_norm": 0.9510821609874338, + "learning_rate": 4.913415327848709e-05, + "loss": 0.9261, + "step": 2237 + }, + { + "epoch": 0.11162094763092269, + "grad_norm": 0.5357542732409855, + "learning_rate": 4.913309933592203e-05, + "loss": 0.9396, + "step": 2238 + }, + { + "epoch": 0.1116708229426434, + "grad_norm": 0.4159610405953999, + "learning_rate": 4.913204476361454e-05, + "loss": 0.9547, + "step": 2239 + }, + { + "epoch": 0.11172069825436409, + "grad_norm": 0.7978087788532823, + "learning_rate": 4.913098956159213e-05, + "loss": 0.9167, + "step": 2240 + }, + { + "epoch": 0.11177057356608479, + "grad_norm": 1.7770234871014963, + "learning_rate": 4.912993372988236e-05, + "loss": 0.9308, + "step": 2241 + }, + { + "epoch": 0.11182044887780548, + "grad_norm": 0.6145305696624237, + "learning_rate": 4.9128877268512765e-05, + "loss": 0.987, + "step": 2242 + }, + { + "epoch": 0.11187032418952618, + "grad_norm": 0.543526615582436, + "learning_rate": 4.912782017751092e-05, + "loss": 0.9901, + "step": 2243 + }, + { + "epoch": 0.11192019950124689, + "grad_norm": 0.5793940228791333, + "learning_rate": 4.912676245690439e-05, + "loss": 0.9289, + "step": 2244 + }, + { + "epoch": 0.11197007481296758, + "grad_norm": 0.45789251762143607, + "learning_rate": 4.912570410672082e-05, + "loss": 0.9221, + "step": 2245 + }, + { + "epoch": 0.11201995012468828, + "grad_norm": 0.5102769388885923, + "learning_rate": 4.912464512698778e-05, + "loss": 0.9262, + "step": 2246 + }, + { + "epoch": 0.11206982543640898, + "grad_norm": 0.43880765801509874, + "learning_rate": 4.9123585517732915e-05, + "loss": 0.9171, + "step": 2247 + }, + { + "epoch": 0.11211970074812967, + "grad_norm": 0.6562694278134905, + "learning_rate": 4.912252527898389e-05, + "loss": 0.9641, + "step": 2248 + }, + { + "epoch": 0.11216957605985037, + "grad_norm": 0.5584108610624001, + "learning_rate": 4.912146441076837e-05, + "loss": 0.9384, + "step": 2249 + }, + { + "epoch": 0.11221945137157108, + "grad_norm": 0.6762878503697805, + "learning_rate": 4.912040291311403e-05, + "loss": 0.9471, + "step": 2250 + }, + { + "epoch": 0.11226932668329177, + "grad_norm": 0.6086948768888529, + "learning_rate": 4.911934078604856e-05, + "loss": 0.9227, + "step": 2251 + }, + { + "epoch": 0.11231920199501247, + "grad_norm": 0.5782555938070619, + "learning_rate": 4.9118278029599695e-05, + "loss": 0.9543, + "step": 2252 + }, + { + "epoch": 0.11236907730673316, + "grad_norm": 0.7610688238071476, + "learning_rate": 4.911721464379516e-05, + "loss": 0.9687, + "step": 2253 + }, + { + "epoch": 0.11241895261845386, + "grad_norm": 0.564896167115592, + "learning_rate": 4.91161506286627e-05, + "loss": 0.9646, + "step": 2254 + }, + { + "epoch": 0.11246882793017457, + "grad_norm": 0.529415183167095, + "learning_rate": 4.911508598423008e-05, + "loss": 0.966, + "step": 2255 + }, + { + "epoch": 0.11251870324189527, + "grad_norm": 2.0917029887770986, + "learning_rate": 4.911402071052509e-05, + "loss": 0.9347, + "step": 2256 + }, + { + "epoch": 0.11256857855361596, + "grad_norm": 0.5474756958448572, + "learning_rate": 4.911295480757552e-05, + "loss": 0.8886, + "step": 2257 + }, + { + "epoch": 0.11261845386533666, + "grad_norm": 0.702361307656155, + "learning_rate": 4.911188827540918e-05, + "loss": 0.9464, + "step": 2258 + }, + { + "epoch": 0.11266832917705735, + "grad_norm": 0.6159334704089001, + "learning_rate": 4.911082111405391e-05, + "loss": 0.9232, + "step": 2259 + }, + { + "epoch": 0.11271820448877805, + "grad_norm": 0.5608610654394519, + "learning_rate": 4.9109753323537556e-05, + "loss": 0.9914, + "step": 2260 + }, + { + "epoch": 0.11276807980049876, + "grad_norm": 0.4754210459153907, + "learning_rate": 4.9108684903887976e-05, + "loss": 0.9468, + "step": 2261 + }, + { + "epoch": 0.11281795511221945, + "grad_norm": 0.6287658549755187, + "learning_rate": 4.910761585513305e-05, + "loss": 0.9487, + "step": 2262 + }, + { + "epoch": 0.11286783042394015, + "grad_norm": 1.3570789733808584, + "learning_rate": 4.910654617730068e-05, + "loss": 0.9893, + "step": 2263 + }, + { + "epoch": 0.11291770573566084, + "grad_norm": 0.503773191175658, + "learning_rate": 4.910547587041878e-05, + "loss": 0.9518, + "step": 2264 + }, + { + "epoch": 0.11296758104738154, + "grad_norm": 0.5814800708959543, + "learning_rate": 4.910440493451527e-05, + "loss": 0.9396, + "step": 2265 + }, + { + "epoch": 0.11301745635910225, + "grad_norm": 0.4354079206648906, + "learning_rate": 4.910333336961811e-05, + "loss": 0.9683, + "step": 2266 + }, + { + "epoch": 0.11306733167082295, + "grad_norm": 1.7809351093353158, + "learning_rate": 4.910226117575524e-05, + "loss": 0.9028, + "step": 2267 + }, + { + "epoch": 0.11311720698254364, + "grad_norm": 0.5098914818065249, + "learning_rate": 4.910118835295466e-05, + "loss": 0.9473, + "step": 2268 + }, + { + "epoch": 0.11316708229426434, + "grad_norm": 0.5515152774177942, + "learning_rate": 4.9100114901244356e-05, + "loss": 0.9154, + "step": 2269 + }, + { + "epoch": 0.11321695760598503, + "grad_norm": 0.7355564009260457, + "learning_rate": 4.909904082065234e-05, + "loss": 0.9063, + "step": 2270 + }, + { + "epoch": 0.11326683291770574, + "grad_norm": 0.5508593740242869, + "learning_rate": 4.9097966111206634e-05, + "loss": 0.9615, + "step": 2271 + }, + { + "epoch": 0.11331670822942644, + "grad_norm": 0.46909326541548296, + "learning_rate": 4.909689077293529e-05, + "loss": 0.9596, + "step": 2272 + }, + { + "epoch": 0.11336658354114713, + "grad_norm": 0.6770973241588076, + "learning_rate": 4.909581480586637e-05, + "loss": 0.9354, + "step": 2273 + }, + { + "epoch": 0.11341645885286783, + "grad_norm": 0.5187064651791063, + "learning_rate": 4.909473821002794e-05, + "loss": 0.9502, + "step": 2274 + }, + { + "epoch": 0.11346633416458853, + "grad_norm": 0.8486480549025225, + "learning_rate": 4.9093660985448097e-05, + "loss": 0.9398, + "step": 2275 + }, + { + "epoch": 0.11351620947630922, + "grad_norm": 0.49431764985619275, + "learning_rate": 4.909258313215496e-05, + "loss": 0.9487, + "step": 2276 + }, + { + "epoch": 0.11356608478802993, + "grad_norm": 0.5524848821892562, + "learning_rate": 4.909150465017664e-05, + "loss": 0.9692, + "step": 2277 + }, + { + "epoch": 0.11361596009975063, + "grad_norm": 0.541445975569177, + "learning_rate": 4.909042553954131e-05, + "loss": 0.9501, + "step": 2278 + }, + { + "epoch": 0.11366583541147132, + "grad_norm": 0.6091338867483569, + "learning_rate": 4.908934580027709e-05, + "loss": 0.9483, + "step": 2279 + }, + { + "epoch": 0.11371571072319202, + "grad_norm": 0.47417546439983066, + "learning_rate": 4.908826543241217e-05, + "loss": 0.9254, + "step": 2280 + }, + { + "epoch": 0.11376558603491271, + "grad_norm": 0.7617028799668646, + "learning_rate": 4.9087184435974745e-05, + "loss": 0.9937, + "step": 2281 + }, + { + "epoch": 0.11381546134663342, + "grad_norm": 0.5219625125675567, + "learning_rate": 4.9086102810993027e-05, + "loss": 1.0057, + "step": 2282 + }, + { + "epoch": 0.11386533665835412, + "grad_norm": 0.49002833698464204, + "learning_rate": 4.9085020557495236e-05, + "loss": 0.973, + "step": 2283 + }, + { + "epoch": 0.11391521197007481, + "grad_norm": 0.7460697174652865, + "learning_rate": 4.908393767550961e-05, + "loss": 0.95, + "step": 2284 + }, + { + "epoch": 0.11396508728179551, + "grad_norm": 0.6409580464081932, + "learning_rate": 4.908285416506441e-05, + "loss": 0.9844, + "step": 2285 + }, + { + "epoch": 0.1140149625935162, + "grad_norm": 0.4566076187570015, + "learning_rate": 4.9081770026187914e-05, + "loss": 0.9378, + "step": 2286 + }, + { + "epoch": 0.1140648379052369, + "grad_norm": 0.6730581203077601, + "learning_rate": 4.90806852589084e-05, + "loss": 0.99, + "step": 2287 + }, + { + "epoch": 0.11411471321695761, + "grad_norm": 0.5238252616017227, + "learning_rate": 4.907959986325418e-05, + "loss": 0.9147, + "step": 2288 + }, + { + "epoch": 0.1141645885286783, + "grad_norm": 0.4815690914367857, + "learning_rate": 4.907851383925358e-05, + "loss": 0.893, + "step": 2289 + }, + { + "epoch": 0.114214463840399, + "grad_norm": 0.46752451166222964, + "learning_rate": 4.907742718693494e-05, + "loss": 0.9329, + "step": 2290 + }, + { + "epoch": 0.1142643391521197, + "grad_norm": 0.5569894087921675, + "learning_rate": 4.9076339906326605e-05, + "loss": 0.9664, + "step": 2291 + }, + { + "epoch": 0.1143142144638404, + "grad_norm": 0.47182074423565107, + "learning_rate": 4.9075251997456966e-05, + "loss": 0.9519, + "step": 2292 + }, + { + "epoch": 0.1143640897755611, + "grad_norm": 0.4858673707514734, + "learning_rate": 4.9074163460354395e-05, + "loss": 0.9372, + "step": 2293 + }, + { + "epoch": 0.1144139650872818, + "grad_norm": 0.47292152118405834, + "learning_rate": 4.90730742950473e-05, + "loss": 0.9515, + "step": 2294 + }, + { + "epoch": 0.1144638403990025, + "grad_norm": 0.5671130503637769, + "learning_rate": 4.907198450156412e-05, + "loss": 0.9263, + "step": 2295 + }, + { + "epoch": 0.11451371571072319, + "grad_norm": 0.5096036975009256, + "learning_rate": 4.907089407993326e-05, + "loss": 1.0105, + "step": 2296 + }, + { + "epoch": 0.11456359102244389, + "grad_norm": 0.5212646217227578, + "learning_rate": 4.90698030301832e-05, + "loss": 1.0044, + "step": 2297 + }, + { + "epoch": 0.11461346633416458, + "grad_norm": 0.502558561233029, + "learning_rate": 4.90687113523424e-05, + "loss": 0.9546, + "step": 2298 + }, + { + "epoch": 0.11466334164588529, + "grad_norm": 0.6885918647090485, + "learning_rate": 4.906761904643935e-05, + "loss": 0.9625, + "step": 2299 + }, + { + "epoch": 0.11471321695760599, + "grad_norm": 0.47190837410830677, + "learning_rate": 4.9066526112502557e-05, + "loss": 0.9415, + "step": 2300 + }, + { + "epoch": 0.11476309226932668, + "grad_norm": 1.220029551354219, + "learning_rate": 4.9065432550560533e-05, + "loss": 0.9751, + "step": 2301 + }, + { + "epoch": 0.11481296758104738, + "grad_norm": 0.5513843334942132, + "learning_rate": 4.906433836064181e-05, + "loss": 0.9763, + "step": 2302 + }, + { + "epoch": 0.11486284289276807, + "grad_norm": 0.40806238233479986, + "learning_rate": 4.906324354277495e-05, + "loss": 0.8808, + "step": 2303 + }, + { + "epoch": 0.11491271820448878, + "grad_norm": 0.4780742734283721, + "learning_rate": 4.906214809698852e-05, + "loss": 0.9452, + "step": 2304 + }, + { + "epoch": 0.11496259351620948, + "grad_norm": 0.5978723694718165, + "learning_rate": 4.9061052023311105e-05, + "loss": 0.9316, + "step": 2305 + }, + { + "epoch": 0.11501246882793018, + "grad_norm": 0.4800554106360697, + "learning_rate": 4.9059955321771315e-05, + "loss": 0.9323, + "step": 2306 + }, + { + "epoch": 0.11506234413965087, + "grad_norm": 0.4346044495754462, + "learning_rate": 4.905885799239774e-05, + "loss": 0.9371, + "step": 2307 + }, + { + "epoch": 0.11511221945137157, + "grad_norm": 1.2296386990212687, + "learning_rate": 4.905776003521905e-05, + "loss": 0.9519, + "step": 2308 + }, + { + "epoch": 0.11516209476309226, + "grad_norm": 0.5374060307670102, + "learning_rate": 4.905666145026387e-05, + "loss": 0.9968, + "step": 2309 + }, + { + "epoch": 0.11521197007481297, + "grad_norm": 0.5162006100752872, + "learning_rate": 4.9055562237560874e-05, + "loss": 0.9254, + "step": 2310 + }, + { + "epoch": 0.11526184538653367, + "grad_norm": 0.7046031499366718, + "learning_rate": 4.905446239713875e-05, + "loss": 0.9057, + "step": 2311 + }, + { + "epoch": 0.11531172069825436, + "grad_norm": 0.48400051606933536, + "learning_rate": 4.90533619290262e-05, + "loss": 0.931, + "step": 2312 + }, + { + "epoch": 0.11536159600997506, + "grad_norm": 0.8175727198337899, + "learning_rate": 4.9052260833251926e-05, + "loss": 0.9784, + "step": 2313 + }, + { + "epoch": 0.11541147132169575, + "grad_norm": 0.43608753379819865, + "learning_rate": 4.905115910984468e-05, + "loss": 0.9173, + "step": 2314 + }, + { + "epoch": 0.11546134663341646, + "grad_norm": 0.8334529331798007, + "learning_rate": 4.9050056758833195e-05, + "loss": 0.9601, + "step": 2315 + }, + { + "epoch": 0.11551122194513716, + "grad_norm": 0.6216005120562097, + "learning_rate": 4.9048953780246245e-05, + "loss": 0.968, + "step": 2316 + }, + { + "epoch": 0.11556109725685786, + "grad_norm": 0.4131385670592664, + "learning_rate": 4.90478501741126e-05, + "loss": 0.9079, + "step": 2317 + }, + { + "epoch": 0.11561097256857855, + "grad_norm": 0.5765992088588467, + "learning_rate": 4.9046745940461083e-05, + "loss": 0.9458, + "step": 2318 + }, + { + "epoch": 0.11566084788029925, + "grad_norm": 0.4784408870874017, + "learning_rate": 4.9045641079320484e-05, + "loss": 0.9421, + "step": 2319 + }, + { + "epoch": 0.11571072319201996, + "grad_norm": 0.5032978871245061, + "learning_rate": 4.9044535590719636e-05, + "loss": 0.9582, + "step": 2320 + }, + { + "epoch": 0.11576059850374065, + "grad_norm": 0.5194459911607076, + "learning_rate": 4.90434294746874e-05, + "loss": 0.963, + "step": 2321 + }, + { + "epoch": 0.11581047381546135, + "grad_norm": 0.500935899447256, + "learning_rate": 4.9042322731252634e-05, + "loss": 0.9341, + "step": 2322 + }, + { + "epoch": 0.11586034912718204, + "grad_norm": 0.4065320328515857, + "learning_rate": 4.904121536044421e-05, + "loss": 0.8821, + "step": 2323 + }, + { + "epoch": 0.11591022443890274, + "grad_norm": 0.9848422871636325, + "learning_rate": 4.904010736229103e-05, + "loss": 0.9588, + "step": 2324 + }, + { + "epoch": 0.11596009975062344, + "grad_norm": 0.514068336691405, + "learning_rate": 4.903899873682202e-05, + "loss": 0.9442, + "step": 2325 + }, + { + "epoch": 0.11600997506234415, + "grad_norm": 0.545056024308351, + "learning_rate": 4.9037889484066085e-05, + "loss": 1.0204, + "step": 2326 + }, + { + "epoch": 0.11605985037406484, + "grad_norm": 0.503196031109919, + "learning_rate": 4.903677960405219e-05, + "loss": 0.9638, + "step": 2327 + }, + { + "epoch": 0.11610972568578554, + "grad_norm": 0.5424058248554454, + "learning_rate": 4.903566909680928e-05, + "loss": 0.9367, + "step": 2328 + }, + { + "epoch": 0.11615960099750623, + "grad_norm": 0.5427669478225429, + "learning_rate": 4.9034557962366345e-05, + "loss": 0.9529, + "step": 2329 + }, + { + "epoch": 0.11620947630922693, + "grad_norm": 1.0657555449043097, + "learning_rate": 4.9033446200752375e-05, + "loss": 0.9663, + "step": 2330 + }, + { + "epoch": 0.11625935162094764, + "grad_norm": 0.4840992451955306, + "learning_rate": 4.903233381199638e-05, + "loss": 0.9596, + "step": 2331 + }, + { + "epoch": 0.11630922693266833, + "grad_norm": 0.5752410194168571, + "learning_rate": 4.90312207961274e-05, + "loss": 0.9296, + "step": 2332 + }, + { + "epoch": 0.11635910224438903, + "grad_norm": 1.0055111055878867, + "learning_rate": 4.9030107153174466e-05, + "loss": 0.9449, + "step": 2333 + }, + { + "epoch": 0.11640897755610972, + "grad_norm": 0.5386417103345164, + "learning_rate": 4.902899288316663e-05, + "loss": 0.9269, + "step": 2334 + }, + { + "epoch": 0.11645885286783042, + "grad_norm": 0.47125369658667776, + "learning_rate": 4.9027877986132985e-05, + "loss": 0.917, + "step": 2335 + }, + { + "epoch": 0.11650872817955112, + "grad_norm": 0.4877805047622845, + "learning_rate": 4.902676246210262e-05, + "loss": 0.9605, + "step": 2336 + }, + { + "epoch": 0.11655860349127183, + "grad_norm": 0.6143410189768057, + "learning_rate": 4.902564631110464e-05, + "loss": 0.9322, + "step": 2337 + }, + { + "epoch": 0.11660847880299252, + "grad_norm": 0.5549275457850135, + "learning_rate": 4.9024529533168166e-05, + "loss": 0.9428, + "step": 2338 + }, + { + "epoch": 0.11665835411471322, + "grad_norm": 0.5501607155115366, + "learning_rate": 4.9023412128322356e-05, + "loss": 0.9146, + "step": 2339 + }, + { + "epoch": 0.11670822942643391, + "grad_norm": 0.5335714794903254, + "learning_rate": 4.902229409659635e-05, + "loss": 0.9635, + "step": 2340 + }, + { + "epoch": 0.11675810473815461, + "grad_norm": 0.5535109954376495, + "learning_rate": 4.9021175438019334e-05, + "loss": 0.9717, + "step": 2341 + }, + { + "epoch": 0.11680798004987532, + "grad_norm": 0.44822044235156344, + "learning_rate": 4.90200561526205e-05, + "loss": 0.9583, + "step": 2342 + }, + { + "epoch": 0.11685785536159601, + "grad_norm": 0.6820926005572606, + "learning_rate": 4.901893624042904e-05, + "loss": 0.9769, + "step": 2343 + }, + { + "epoch": 0.11690773067331671, + "grad_norm": 0.49985036199094224, + "learning_rate": 4.901781570147419e-05, + "loss": 0.9109, + "step": 2344 + }, + { + "epoch": 0.1169576059850374, + "grad_norm": 1.9662079813649411, + "learning_rate": 4.901669453578519e-05, + "loss": 0.9249, + "step": 2345 + }, + { + "epoch": 0.1170074812967581, + "grad_norm": 0.6155764474439888, + "learning_rate": 4.9015572743391306e-05, + "loss": 0.926, + "step": 2346 + }, + { + "epoch": 0.1170573566084788, + "grad_norm": 0.577417497594275, + "learning_rate": 4.9014450324321786e-05, + "loss": 0.934, + "step": 2347 + }, + { + "epoch": 0.1171072319201995, + "grad_norm": 0.983459465357608, + "learning_rate": 4.9013327278605936e-05, + "loss": 0.9593, + "step": 2348 + }, + { + "epoch": 0.1171571072319202, + "grad_norm": 0.48812297494570794, + "learning_rate": 4.9012203606273054e-05, + "loss": 0.9351, + "step": 2349 + }, + { + "epoch": 0.1172069825436409, + "grad_norm": 0.4806257761200891, + "learning_rate": 4.901107930735247e-05, + "loss": 0.9826, + "step": 2350 + }, + { + "epoch": 0.1172568578553616, + "grad_norm": 0.4672885218099158, + "learning_rate": 4.9009954381873516e-05, + "loss": 0.9794, + "step": 2351 + }, + { + "epoch": 0.11730673316708229, + "grad_norm": 0.5770911812350793, + "learning_rate": 4.900882882986555e-05, + "loss": 0.9449, + "step": 2352 + }, + { + "epoch": 0.117356608478803, + "grad_norm": 0.45756142158421814, + "learning_rate": 4.900770265135794e-05, + "loss": 0.9275, + "step": 2353 + }, + { + "epoch": 0.1174064837905237, + "grad_norm": 0.7300155309875741, + "learning_rate": 4.900657584638008e-05, + "loss": 0.964, + "step": 2354 + }, + { + "epoch": 0.11745635910224439, + "grad_norm": 0.5633072980890175, + "learning_rate": 4.9005448414961354e-05, + "loss": 0.9605, + "step": 2355 + }, + { + "epoch": 0.11750623441396509, + "grad_norm": 0.5439742287700864, + "learning_rate": 4.90043203571312e-05, + "loss": 0.9659, + "step": 2356 + }, + { + "epoch": 0.11755610972568578, + "grad_norm": 0.5933604742808923, + "learning_rate": 4.9003191672919044e-05, + "loss": 0.9438, + "step": 2357 + }, + { + "epoch": 0.11760598503740648, + "grad_norm": 0.5706646687798892, + "learning_rate": 4.900206236235436e-05, + "loss": 0.9741, + "step": 2358 + }, + { + "epoch": 0.11765586034912719, + "grad_norm": 0.5187040699065467, + "learning_rate": 4.900093242546658e-05, + "loss": 0.9401, + "step": 2359 + }, + { + "epoch": 0.11770573566084788, + "grad_norm": 0.5130927450303975, + "learning_rate": 4.8999801862285226e-05, + "loss": 0.9234, + "step": 2360 + }, + { + "epoch": 0.11775561097256858, + "grad_norm": 0.49080732606162686, + "learning_rate": 4.8998670672839775e-05, + "loss": 0.951, + "step": 2361 + }, + { + "epoch": 0.11780548628428927, + "grad_norm": 0.9458017018849781, + "learning_rate": 4.899753885715976e-05, + "loss": 0.9888, + "step": 2362 + }, + { + "epoch": 0.11785536159600997, + "grad_norm": 0.871680234934215, + "learning_rate": 4.89964064152747e-05, + "loss": 0.9922, + "step": 2363 + }, + { + "epoch": 0.11790523690773068, + "grad_norm": 0.47960576052004167, + "learning_rate": 4.899527334721416e-05, + "loss": 0.9394, + "step": 2364 + }, + { + "epoch": 0.11795511221945137, + "grad_norm": 0.5760074814269001, + "learning_rate": 4.8994139653007697e-05, + "loss": 0.9134, + "step": 2365 + }, + { + "epoch": 0.11800498753117207, + "grad_norm": 0.6915295927348785, + "learning_rate": 4.8993005332684906e-05, + "loss": 0.9589, + "step": 2366 + }, + { + "epoch": 0.11805486284289277, + "grad_norm": 0.6635449946678983, + "learning_rate": 4.8991870386275375e-05, + "loss": 0.9302, + "step": 2367 + }, + { + "epoch": 0.11810473815461346, + "grad_norm": 0.714804058377665, + "learning_rate": 4.8990734813808715e-05, + "loss": 1.0011, + "step": 2368 + }, + { + "epoch": 0.11815461346633417, + "grad_norm": 0.5569909717672481, + "learning_rate": 4.898959861531457e-05, + "loss": 0.9722, + "step": 2369 + }, + { + "epoch": 0.11820448877805487, + "grad_norm": 0.5557409995169279, + "learning_rate": 4.8988461790822594e-05, + "loss": 0.942, + "step": 2370 + }, + { + "epoch": 0.11825436408977556, + "grad_norm": 0.5337393024426004, + "learning_rate": 4.898732434036244e-05, + "loss": 0.9352, + "step": 2371 + }, + { + "epoch": 0.11830423940149626, + "grad_norm": 0.5857024803843853, + "learning_rate": 4.898618626396378e-05, + "loss": 0.9764, + "step": 2372 + }, + { + "epoch": 0.11835411471321695, + "grad_norm": 0.7506249261212816, + "learning_rate": 4.898504756165635e-05, + "loss": 0.9477, + "step": 2373 + }, + { + "epoch": 0.11840399002493765, + "grad_norm": 0.5009428950491539, + "learning_rate": 4.898390823346982e-05, + "loss": 0.9008, + "step": 2374 + }, + { + "epoch": 0.11845386533665836, + "grad_norm": 0.8541713470792434, + "learning_rate": 4.898276827943394e-05, + "loss": 0.9542, + "step": 2375 + }, + { + "epoch": 0.11850374064837906, + "grad_norm": 0.4615921680716651, + "learning_rate": 4.8981627699578456e-05, + "loss": 0.942, + "step": 2376 + }, + { + "epoch": 0.11855361596009975, + "grad_norm": 0.5453164431254267, + "learning_rate": 4.898048649393313e-05, + "loss": 0.9535, + "step": 2377 + }, + { + "epoch": 0.11860349127182045, + "grad_norm": 0.6909447261455798, + "learning_rate": 4.897934466252774e-05, + "loss": 0.976, + "step": 2378 + }, + { + "epoch": 0.11865336658354114, + "grad_norm": 0.6583695485173273, + "learning_rate": 4.897820220539209e-05, + "loss": 0.9367, + "step": 2379 + }, + { + "epoch": 0.11870324189526185, + "grad_norm": 1.0092957232469477, + "learning_rate": 4.897705912255598e-05, + "loss": 0.9509, + "step": 2380 + }, + { + "epoch": 0.11875311720698255, + "grad_norm": 0.4829790488483068, + "learning_rate": 4.897591541404924e-05, + "loss": 0.9588, + "step": 2381 + }, + { + "epoch": 0.11880299251870324, + "grad_norm": 0.5060578625129476, + "learning_rate": 4.8974771079901714e-05, + "loss": 0.9534, + "step": 2382 + }, + { + "epoch": 0.11885286783042394, + "grad_norm": 0.581520851481397, + "learning_rate": 4.897362612014328e-05, + "loss": 0.9237, + "step": 2383 + }, + { + "epoch": 0.11890274314214463, + "grad_norm": 0.5137094901315904, + "learning_rate": 4.897248053480379e-05, + "loss": 0.9361, + "step": 2384 + }, + { + "epoch": 0.11895261845386533, + "grad_norm": 0.5022198740578299, + "learning_rate": 4.8971334323913154e-05, + "loss": 0.9639, + "step": 2385 + }, + { + "epoch": 0.11900249376558604, + "grad_norm": 0.5207926038856604, + "learning_rate": 4.897018748750127e-05, + "loss": 0.967, + "step": 2386 + }, + { + "epoch": 0.11905236907730674, + "grad_norm": 0.4905434966939875, + "learning_rate": 4.8969040025598075e-05, + "loss": 0.9595, + "step": 2387 + }, + { + "epoch": 0.11910224438902743, + "grad_norm": 1.0410653014370748, + "learning_rate": 4.896789193823352e-05, + "loss": 0.9332, + "step": 2388 + }, + { + "epoch": 0.11915211970074813, + "grad_norm": 0.4975561555148221, + "learning_rate": 4.896674322543754e-05, + "loss": 0.9308, + "step": 2389 + }, + { + "epoch": 0.11920199501246882, + "grad_norm": 0.59090906423337, + "learning_rate": 4.896559388724012e-05, + "loss": 0.9126, + "step": 2390 + }, + { + "epoch": 0.11925187032418953, + "grad_norm": 1.0623611665069557, + "learning_rate": 4.8964443923671254e-05, + "loss": 0.9694, + "step": 2391 + }, + { + "epoch": 0.11930174563591023, + "grad_norm": 0.5454746884225166, + "learning_rate": 4.896329333476095e-05, + "loss": 0.9187, + "step": 2392 + }, + { + "epoch": 0.11935162094763092, + "grad_norm": 1.203398709920314, + "learning_rate": 4.896214212053923e-05, + "loss": 0.9827, + "step": 2393 + }, + { + "epoch": 0.11940149625935162, + "grad_norm": 0.6087488411581927, + "learning_rate": 4.896099028103615e-05, + "loss": 0.911, + "step": 2394 + }, + { + "epoch": 0.11945137157107232, + "grad_norm": 0.5094345893563287, + "learning_rate": 4.895983781628173e-05, + "loss": 0.9541, + "step": 2395 + }, + { + "epoch": 0.11950124688279301, + "grad_norm": 0.48340455221000367, + "learning_rate": 4.895868472630608e-05, + "loss": 0.9584, + "step": 2396 + }, + { + "epoch": 0.11955112219451372, + "grad_norm": 0.5367669066178233, + "learning_rate": 4.895753101113927e-05, + "loss": 0.9717, + "step": 2397 + }, + { + "epoch": 0.11960099750623442, + "grad_norm": 0.5003058416980505, + "learning_rate": 4.895637667081141e-05, + "loss": 0.9673, + "step": 2398 + }, + { + "epoch": 0.11965087281795511, + "grad_norm": 0.44730815413442343, + "learning_rate": 4.895522170535262e-05, + "loss": 0.933, + "step": 2399 + }, + { + "epoch": 0.11970074812967581, + "grad_norm": 0.5532318102743795, + "learning_rate": 4.895406611479304e-05, + "loss": 0.9379, + "step": 2400 + }, + { + "epoch": 0.1197506234413965, + "grad_norm": 0.6843730591958428, + "learning_rate": 4.8952909899162826e-05, + "loss": 0.9433, + "step": 2401 + }, + { + "epoch": 0.11980049875311721, + "grad_norm": 0.5203275312936958, + "learning_rate": 4.895175305849215e-05, + "loss": 0.962, + "step": 2402 + }, + { + "epoch": 0.11985037406483791, + "grad_norm": 0.6241033051819104, + "learning_rate": 4.89505955928112e-05, + "loss": 0.9477, + "step": 2403 + }, + { + "epoch": 0.1199002493765586, + "grad_norm": 0.5744011395609376, + "learning_rate": 4.894943750215018e-05, + "loss": 0.9315, + "step": 2404 + }, + { + "epoch": 0.1199501246882793, + "grad_norm": 0.5250056577915708, + "learning_rate": 4.894827878653929e-05, + "loss": 0.9402, + "step": 2405 + }, + { + "epoch": 0.12, + "grad_norm": 0.5375898757971812, + "learning_rate": 4.894711944600879e-05, + "loss": 0.9699, + "step": 2406 + }, + { + "epoch": 0.1200498753117207, + "grad_norm": 0.7969048532913617, + "learning_rate": 4.894595948058893e-05, + "loss": 0.9265, + "step": 2407 + }, + { + "epoch": 0.1200997506234414, + "grad_norm": 0.6351399016101766, + "learning_rate": 4.8944798890309975e-05, + "loss": 0.9121, + "step": 2408 + }, + { + "epoch": 0.1201496259351621, + "grad_norm": 0.512008510987852, + "learning_rate": 4.894363767520221e-05, + "loss": 0.9331, + "step": 2409 + }, + { + "epoch": 0.12019950124688279, + "grad_norm": 0.7260972470914815, + "learning_rate": 4.894247583529593e-05, + "loss": 0.9429, + "step": 2410 + }, + { + "epoch": 0.12024937655860349, + "grad_norm": 0.5922644330318759, + "learning_rate": 4.894131337062147e-05, + "loss": 0.9413, + "step": 2411 + }, + { + "epoch": 0.12029925187032418, + "grad_norm": 0.7340963406156664, + "learning_rate": 4.894015028120914e-05, + "loss": 0.9105, + "step": 2412 + }, + { + "epoch": 0.1203491271820449, + "grad_norm": 0.603901741999449, + "learning_rate": 4.8938986567089305e-05, + "loss": 0.9028, + "step": 2413 + }, + { + "epoch": 0.12039900249376559, + "grad_norm": 0.5311841380620012, + "learning_rate": 4.893782222829233e-05, + "loss": 0.9847, + "step": 2414 + }, + { + "epoch": 0.12044887780548628, + "grad_norm": 0.9417691071212156, + "learning_rate": 4.893665726484859e-05, + "loss": 0.9476, + "step": 2415 + }, + { + "epoch": 0.12049875311720698, + "grad_norm": 1.0725735951497017, + "learning_rate": 4.89354916767885e-05, + "loss": 0.9877, + "step": 2416 + }, + { + "epoch": 0.12054862842892768, + "grad_norm": 0.7155214770709912, + "learning_rate": 4.893432546414247e-05, + "loss": 0.9067, + "step": 2417 + }, + { + "epoch": 0.12059850374064839, + "grad_norm": 0.5483928305161668, + "learning_rate": 4.893315862694092e-05, + "loss": 0.9617, + "step": 2418 + }, + { + "epoch": 0.12064837905236908, + "grad_norm": 0.5881416849933534, + "learning_rate": 4.893199116521432e-05, + "loss": 0.8936, + "step": 2419 + }, + { + "epoch": 0.12069825436408978, + "grad_norm": 0.5067418493513367, + "learning_rate": 4.8930823078993106e-05, + "loss": 0.9718, + "step": 2420 + }, + { + "epoch": 0.12074812967581047, + "grad_norm": 1.0119009155055843, + "learning_rate": 4.892965436830778e-05, + "loss": 0.9556, + "step": 2421 + }, + { + "epoch": 0.12079800498753117, + "grad_norm": 0.6082373125500735, + "learning_rate": 4.8928485033188834e-05, + "loss": 0.9706, + "step": 2422 + }, + { + "epoch": 0.12084788029925186, + "grad_norm": 0.6488574103124252, + "learning_rate": 4.8927315073666777e-05, + "loss": 0.9606, + "step": 2423 + }, + { + "epoch": 0.12089775561097257, + "grad_norm": 0.7565160282884863, + "learning_rate": 4.892614448977214e-05, + "loss": 0.9132, + "step": 2424 + }, + { + "epoch": 0.12094763092269327, + "grad_norm": 0.5431092522564835, + "learning_rate": 4.892497328153548e-05, + "loss": 0.9301, + "step": 2425 + }, + { + "epoch": 0.12099750623441397, + "grad_norm": 0.5869034414148703, + "learning_rate": 4.892380144898734e-05, + "loss": 1.0043, + "step": 2426 + }, + { + "epoch": 0.12104738154613466, + "grad_norm": 1.459827555083662, + "learning_rate": 4.8922628992158315e-05, + "loss": 0.9212, + "step": 2427 + }, + { + "epoch": 0.12109725685785536, + "grad_norm": 0.5115139154107659, + "learning_rate": 4.892145591107899e-05, + "loss": 0.9687, + "step": 2428 + }, + { + "epoch": 0.12114713216957607, + "grad_norm": 0.8659083584733718, + "learning_rate": 4.892028220577998e-05, + "loss": 0.9492, + "step": 2429 + }, + { + "epoch": 0.12119700748129676, + "grad_norm": 0.6099707232261683, + "learning_rate": 4.89191078762919e-05, + "loss": 0.9333, + "step": 2430 + }, + { + "epoch": 0.12124688279301746, + "grad_norm": 0.6684825274166072, + "learning_rate": 4.891793292264542e-05, + "loss": 0.9845, + "step": 2431 + }, + { + "epoch": 0.12129675810473815, + "grad_norm": 0.46660514565519157, + "learning_rate": 4.891675734487118e-05, + "loss": 0.9365, + "step": 2432 + }, + { + "epoch": 0.12134663341645885, + "grad_norm": 0.5640299413366532, + "learning_rate": 4.8915581142999854e-05, + "loss": 0.9835, + "step": 2433 + }, + { + "epoch": 0.12139650872817954, + "grad_norm": 0.6763084857693059, + "learning_rate": 4.891440431706215e-05, + "loss": 0.9679, + "step": 2434 + }, + { + "epoch": 0.12144638403990025, + "grad_norm": 0.5061487751616663, + "learning_rate": 4.8913226867088765e-05, + "loss": 0.9444, + "step": 2435 + }, + { + "epoch": 0.12149625935162095, + "grad_norm": 0.5942301096830495, + "learning_rate": 4.891204879311042e-05, + "loss": 0.9362, + "step": 2436 + }, + { + "epoch": 0.12154613466334165, + "grad_norm": 0.6347334705739908, + "learning_rate": 4.891087009515788e-05, + "loss": 0.9107, + "step": 2437 + }, + { + "epoch": 0.12159600997506234, + "grad_norm": 0.548777274222863, + "learning_rate": 4.8909690773261876e-05, + "loss": 0.9171, + "step": 2438 + }, + { + "epoch": 0.12164588528678304, + "grad_norm": 0.5261060204598063, + "learning_rate": 4.890851082745319e-05, + "loss": 0.917, + "step": 2439 + }, + { + "epoch": 0.12169576059850375, + "grad_norm": 1.6269574709707564, + "learning_rate": 4.8907330257762616e-05, + "loss": 0.9757, + "step": 2440 + }, + { + "epoch": 0.12174563591022444, + "grad_norm": 0.687745463418894, + "learning_rate": 4.8906149064220964e-05, + "loss": 0.8715, + "step": 2441 + }, + { + "epoch": 0.12179551122194514, + "grad_norm": 0.7192617173956446, + "learning_rate": 4.890496724685904e-05, + "loss": 0.9489, + "step": 2442 + }, + { + "epoch": 0.12184538653366583, + "grad_norm": 0.6081613853377669, + "learning_rate": 4.8903784805707694e-05, + "loss": 0.9412, + "step": 2443 + }, + { + "epoch": 0.12189526184538653, + "grad_norm": 0.4781848445547259, + "learning_rate": 4.8902601740797795e-05, + "loss": 0.9139, + "step": 2444 + }, + { + "epoch": 0.12194513715710723, + "grad_norm": 0.8007873713331632, + "learning_rate": 4.8901418052160196e-05, + "loss": 0.9585, + "step": 2445 + }, + { + "epoch": 0.12199501246882793, + "grad_norm": 0.5211180872823745, + "learning_rate": 4.890023373982578e-05, + "loss": 0.9281, + "step": 2446 + }, + { + "epoch": 0.12204488778054863, + "grad_norm": 0.9015610148888223, + "learning_rate": 4.889904880382548e-05, + "loss": 0.9591, + "step": 2447 + }, + { + "epoch": 0.12209476309226933, + "grad_norm": 0.7630708485260597, + "learning_rate": 4.889786324419018e-05, + "loss": 0.9697, + "step": 2448 + }, + { + "epoch": 0.12214463840399002, + "grad_norm": 0.5609311017964478, + "learning_rate": 4.889667706095084e-05, + "loss": 0.9394, + "step": 2449 + }, + { + "epoch": 0.12219451371571072, + "grad_norm": 0.5833344839411728, + "learning_rate": 4.8895490254138404e-05, + "loss": 0.9765, + "step": 2450 + }, + { + "epoch": 0.12224438902743143, + "grad_norm": 0.775134634177984, + "learning_rate": 4.889430282378385e-05, + "loss": 0.9954, + "step": 2451 + }, + { + "epoch": 0.12229426433915212, + "grad_norm": 0.4968230097116518, + "learning_rate": 4.8893114769918146e-05, + "loss": 0.9511, + "step": 2452 + }, + { + "epoch": 0.12234413965087282, + "grad_norm": 0.7074808564468735, + "learning_rate": 4.8891926092572315e-05, + "loss": 0.9434, + "step": 2453 + }, + { + "epoch": 0.12239401496259351, + "grad_norm": 0.5969996530797171, + "learning_rate": 4.8890736791777364e-05, + "loss": 0.9373, + "step": 2454 + }, + { + "epoch": 0.12244389027431421, + "grad_norm": 0.726769476955545, + "learning_rate": 4.888954686756433e-05, + "loss": 0.9897, + "step": 2455 + }, + { + "epoch": 0.12249376558603492, + "grad_norm": 0.8902429758095256, + "learning_rate": 4.8888356319964254e-05, + "loss": 0.9534, + "step": 2456 + }, + { + "epoch": 0.12254364089775562, + "grad_norm": 0.587639780957528, + "learning_rate": 4.888716514900822e-05, + "loss": 0.947, + "step": 2457 + }, + { + "epoch": 0.12259351620947631, + "grad_norm": 0.919166804396578, + "learning_rate": 4.88859733547273e-05, + "loss": 0.9767, + "step": 2458 + }, + { + "epoch": 0.122643391521197, + "grad_norm": 0.5402100190183102, + "learning_rate": 4.888478093715259e-05, + "loss": 0.9692, + "step": 2459 + }, + { + "epoch": 0.1226932668329177, + "grad_norm": 0.7494292221282577, + "learning_rate": 4.888358789631522e-05, + "loss": 0.9666, + "step": 2460 + }, + { + "epoch": 0.1227431421446384, + "grad_norm": 0.5911541483095543, + "learning_rate": 4.8882394232246305e-05, + "loss": 0.9425, + "step": 2461 + }, + { + "epoch": 0.12279301745635911, + "grad_norm": 0.6440627402128314, + "learning_rate": 4.8881199944977e-05, + "loss": 0.9867, + "step": 2462 + }, + { + "epoch": 0.1228428927680798, + "grad_norm": 0.8814320263274155, + "learning_rate": 4.888000503453848e-05, + "loss": 0.9304, + "step": 2463 + }, + { + "epoch": 0.1228927680798005, + "grad_norm": 2.8586972469861407, + "learning_rate": 4.887880950096191e-05, + "loss": 0.9389, + "step": 2464 + }, + { + "epoch": 0.1229426433915212, + "grad_norm": 3.03998901699724, + "learning_rate": 4.887761334427849e-05, + "loss": 0.9545, + "step": 2465 + }, + { + "epoch": 0.12299251870324189, + "grad_norm": 0.6236620460373751, + "learning_rate": 4.8876416564519435e-05, + "loss": 0.967, + "step": 2466 + }, + { + "epoch": 0.1230423940149626, + "grad_norm": 0.6622913981183838, + "learning_rate": 4.887521916171598e-05, + "loss": 0.9342, + "step": 2467 + }, + { + "epoch": 0.1230922693266833, + "grad_norm": 0.5508136105514899, + "learning_rate": 4.887402113589936e-05, + "loss": 0.9672, + "step": 2468 + }, + { + "epoch": 0.12314214463840399, + "grad_norm": 0.48053802285146996, + "learning_rate": 4.8872822487100856e-05, + "loss": 0.9266, + "step": 2469 + }, + { + "epoch": 0.12319201995012469, + "grad_norm": 0.5124049873340832, + "learning_rate": 4.887162321535173e-05, + "loss": 0.9311, + "step": 2470 + }, + { + "epoch": 0.12324189526184538, + "grad_norm": 0.5766181729151347, + "learning_rate": 4.887042332068327e-05, + "loss": 0.9751, + "step": 2471 + }, + { + "epoch": 0.12329177057356608, + "grad_norm": 0.5134577437951356, + "learning_rate": 4.88692228031268e-05, + "loss": 0.9496, + "step": 2472 + }, + { + "epoch": 0.12334164588528679, + "grad_norm": 0.7520808749587375, + "learning_rate": 4.886802166271364e-05, + "loss": 0.9558, + "step": 2473 + }, + { + "epoch": 0.12339152119700748, + "grad_norm": 0.5389643287613471, + "learning_rate": 4.8866819899475145e-05, + "loss": 0.9685, + "step": 2474 + }, + { + "epoch": 0.12344139650872818, + "grad_norm": 0.5189931708815265, + "learning_rate": 4.886561751344266e-05, + "loss": 0.9081, + "step": 2475 + }, + { + "epoch": 0.12349127182044888, + "grad_norm": 0.5809964287247955, + "learning_rate": 4.886441450464757e-05, + "loss": 0.9081, + "step": 2476 + }, + { + "epoch": 0.12354114713216957, + "grad_norm": 0.5546442998333757, + "learning_rate": 4.886321087312127e-05, + "loss": 0.9242, + "step": 2477 + }, + { + "epoch": 0.12359102244389028, + "grad_norm": 0.5777003874792671, + "learning_rate": 4.886200661889515e-05, + "loss": 0.9578, + "step": 2478 + }, + { + "epoch": 0.12364089775561098, + "grad_norm": 0.5586791000741818, + "learning_rate": 4.886080174200065e-05, + "loss": 0.9425, + "step": 2479 + }, + { + "epoch": 0.12369077306733167, + "grad_norm": 0.47993732066535916, + "learning_rate": 4.885959624246921e-05, + "loss": 0.9206, + "step": 2480 + }, + { + "epoch": 0.12374064837905237, + "grad_norm": 0.48195396895860804, + "learning_rate": 4.8858390120332284e-05, + "loss": 0.872, + "step": 2481 + }, + { + "epoch": 0.12379052369077306, + "grad_norm": 0.494060449512675, + "learning_rate": 4.8857183375621345e-05, + "loss": 0.9263, + "step": 2482 + }, + { + "epoch": 0.12384039900249376, + "grad_norm": 0.47539102233912733, + "learning_rate": 4.885597600836788e-05, + "loss": 0.9015, + "step": 2483 + }, + { + "epoch": 0.12389027431421447, + "grad_norm": 0.6889602851110915, + "learning_rate": 4.88547680186034e-05, + "loss": 0.9697, + "step": 2484 + }, + { + "epoch": 0.12394014962593516, + "grad_norm": 0.5857269842338075, + "learning_rate": 4.8853559406359425e-05, + "loss": 0.9281, + "step": 2485 + }, + { + "epoch": 0.12399002493765586, + "grad_norm": 0.7274703151930465, + "learning_rate": 4.885235017166749e-05, + "loss": 0.9529, + "step": 2486 + }, + { + "epoch": 0.12403990024937656, + "grad_norm": 0.5366687251485107, + "learning_rate": 4.885114031455916e-05, + "loss": 0.8771, + "step": 2487 + }, + { + "epoch": 0.12408977556109725, + "grad_norm": 0.6058988728626282, + "learning_rate": 4.8849929835065985e-05, + "loss": 0.9866, + "step": 2488 + }, + { + "epoch": 0.12413965087281796, + "grad_norm": 0.556262607416156, + "learning_rate": 4.884871873321958e-05, + "loss": 1.0008, + "step": 2489 + }, + { + "epoch": 0.12418952618453866, + "grad_norm": 0.7429117003113522, + "learning_rate": 4.8847507009051525e-05, + "loss": 0.915, + "step": 2490 + }, + { + "epoch": 0.12423940149625935, + "grad_norm": 0.5389775049243046, + "learning_rate": 4.884629466259345e-05, + "loss": 0.8814, + "step": 2491 + }, + { + "epoch": 0.12428927680798005, + "grad_norm": 0.59867044732777, + "learning_rate": 4.884508169387698e-05, + "loss": 0.9852, + "step": 2492 + }, + { + "epoch": 0.12433915211970074, + "grad_norm": 0.8834232142317922, + "learning_rate": 4.884386810293379e-05, + "loss": 0.9069, + "step": 2493 + }, + { + "epoch": 0.12438902743142144, + "grad_norm": 0.6198243160970676, + "learning_rate": 4.884265388979552e-05, + "loss": 0.926, + "step": 2494 + }, + { + "epoch": 0.12443890274314215, + "grad_norm": 0.5003579316458756, + "learning_rate": 4.884143905449387e-05, + "loss": 0.9575, + "step": 2495 + }, + { + "epoch": 0.12448877805486284, + "grad_norm": 0.49434237885390675, + "learning_rate": 4.884022359706054e-05, + "loss": 0.8807, + "step": 2496 + }, + { + "epoch": 0.12453865336658354, + "grad_norm": 0.5749061027891887, + "learning_rate": 4.8839007517527234e-05, + "loss": 0.9185, + "step": 2497 + }, + { + "epoch": 0.12458852867830424, + "grad_norm": 0.5380332775694912, + "learning_rate": 4.883779081592571e-05, + "loss": 0.9818, + "step": 2498 + }, + { + "epoch": 0.12463840399002493, + "grad_norm": 0.6377818704446677, + "learning_rate": 4.8836573492287694e-05, + "loss": 0.9389, + "step": 2499 + }, + { + "epoch": 0.12468827930174564, + "grad_norm": 0.7349087941684574, + "learning_rate": 4.8835355546644964e-05, + "loss": 0.9557, + "step": 2500 + }, + { + "epoch": 0.12473815461346634, + "grad_norm": 0.619988301651807, + "learning_rate": 4.88341369790293e-05, + "loss": 0.9143, + "step": 2501 + }, + { + "epoch": 0.12478802992518703, + "grad_norm": 0.5279292480738341, + "learning_rate": 4.8832917789472496e-05, + "loss": 0.9474, + "step": 2502 + }, + { + "epoch": 0.12483790523690773, + "grad_norm": 0.894951520048544, + "learning_rate": 4.883169797800637e-05, + "loss": 0.9347, + "step": 2503 + }, + { + "epoch": 0.12488778054862842, + "grad_norm": 0.5877405070494633, + "learning_rate": 4.8830477544662754e-05, + "loss": 0.9078, + "step": 2504 + }, + { + "epoch": 0.12493765586034913, + "grad_norm": 0.5310189715771492, + "learning_rate": 4.882925648947349e-05, + "loss": 0.9955, + "step": 2505 + }, + { + "epoch": 0.12498753117206983, + "grad_norm": 0.5102255434587081, + "learning_rate": 4.882803481247044e-05, + "loss": 0.9598, + "step": 2506 + }, + { + "epoch": 0.12503740648379053, + "grad_norm": 0.6469212698045455, + "learning_rate": 4.8826812513685487e-05, + "loss": 0.9662, + "step": 2507 + }, + { + "epoch": 0.12508728179551123, + "grad_norm": 0.5029301336622319, + "learning_rate": 4.882558959315052e-05, + "loss": 0.9986, + "step": 2508 + }, + { + "epoch": 0.12513715710723192, + "grad_norm": 0.5342260976739189, + "learning_rate": 4.8824366050897464e-05, + "loss": 0.9567, + "step": 2509 + }, + { + "epoch": 0.12518703241895263, + "grad_norm": 0.6254898026929315, + "learning_rate": 4.8823141886958234e-05, + "loss": 0.945, + "step": 2510 + }, + { + "epoch": 0.1252369077306733, + "grad_norm": 0.554041126601272, + "learning_rate": 4.882191710136478e-05, + "loss": 0.9582, + "step": 2511 + }, + { + "epoch": 0.12528678304239402, + "grad_norm": 0.4942534825136452, + "learning_rate": 4.882069169414906e-05, + "loss": 0.9259, + "step": 2512 + }, + { + "epoch": 0.1253366583541147, + "grad_norm": 0.558628867898959, + "learning_rate": 4.881946566534305e-05, + "loss": 0.9433, + "step": 2513 + }, + { + "epoch": 0.1253865336658354, + "grad_norm": 0.5426648846447328, + "learning_rate": 4.881823901497875e-05, + "loss": 0.9417, + "step": 2514 + }, + { + "epoch": 0.12543640897755612, + "grad_norm": 0.8037888506313767, + "learning_rate": 4.881701174308815e-05, + "loss": 1.0162, + "step": 2515 + }, + { + "epoch": 0.1254862842892768, + "grad_norm": 0.6170852004618459, + "learning_rate": 4.88157838497033e-05, + "loss": 0.8998, + "step": 2516 + }, + { + "epoch": 0.1255361596009975, + "grad_norm": 0.4842104030682749, + "learning_rate": 4.8814555334856216e-05, + "loss": 0.8914, + "step": 2517 + }, + { + "epoch": 0.1255860349127182, + "grad_norm": 0.5382961172348333, + "learning_rate": 4.881332619857898e-05, + "loss": 0.9382, + "step": 2518 + }, + { + "epoch": 0.1256359102244389, + "grad_norm": 1.2355012569116994, + "learning_rate": 4.881209644090365e-05, + "loss": 0.959, + "step": 2519 + }, + { + "epoch": 0.1256857855361596, + "grad_norm": 0.5179230150312538, + "learning_rate": 4.881086606186232e-05, + "loss": 0.9752, + "step": 2520 + }, + { + "epoch": 0.1257356608478803, + "grad_norm": 0.6620591111437965, + "learning_rate": 4.8809635061487094e-05, + "loss": 0.9035, + "step": 2521 + }, + { + "epoch": 0.125785536159601, + "grad_norm": 0.5010575468537708, + "learning_rate": 4.88084034398101e-05, + "loss": 0.9568, + "step": 2522 + }, + { + "epoch": 0.12583541147132168, + "grad_norm": 0.5585444552191468, + "learning_rate": 4.880717119686347e-05, + "loss": 0.9326, + "step": 2523 + }, + { + "epoch": 0.1258852867830424, + "grad_norm": 0.7764123308774592, + "learning_rate": 4.880593833267937e-05, + "loss": 0.9649, + "step": 2524 + }, + { + "epoch": 0.1259351620947631, + "grad_norm": 0.5919385974915856, + "learning_rate": 4.880470484728996e-05, + "loss": 0.9522, + "step": 2525 + }, + { + "epoch": 0.12598503740648379, + "grad_norm": 0.6569105316988939, + "learning_rate": 4.8803470740727425e-05, + "loss": 0.9404, + "step": 2526 + }, + { + "epoch": 0.1260349127182045, + "grad_norm": 0.674954032310399, + "learning_rate": 4.8802236013023975e-05, + "loss": 0.966, + "step": 2527 + }, + { + "epoch": 0.12608478802992518, + "grad_norm": 0.47023140452441164, + "learning_rate": 4.880100066421184e-05, + "loss": 0.959, + "step": 2528 + }, + { + "epoch": 0.1261346633416459, + "grad_norm": 0.5929544302841644, + "learning_rate": 4.8799764694323235e-05, + "loss": 0.9058, + "step": 2529 + }, + { + "epoch": 0.1261845386533666, + "grad_norm": 0.5035111574143871, + "learning_rate": 4.8798528103390414e-05, + "loss": 0.9808, + "step": 2530 + }, + { + "epoch": 0.12623441396508728, + "grad_norm": 1.2257802287764075, + "learning_rate": 4.879729089144567e-05, + "loss": 0.9962, + "step": 2531 + }, + { + "epoch": 0.126284289276808, + "grad_norm": 0.5407403113201992, + "learning_rate": 4.8796053058521266e-05, + "loss": 0.9389, + "step": 2532 + }, + { + "epoch": 0.12633416458852867, + "grad_norm": 0.4800512816417536, + "learning_rate": 4.8794814604649496e-05, + "loss": 0.9366, + "step": 2533 + }, + { + "epoch": 0.12638403990024938, + "grad_norm": 0.9868802415920658, + "learning_rate": 4.87935755298627e-05, + "loss": 0.93, + "step": 2534 + }, + { + "epoch": 0.1264339152119701, + "grad_norm": 0.5312830304217958, + "learning_rate": 4.8792335834193194e-05, + "loss": 0.9155, + "step": 2535 + }, + { + "epoch": 0.12648379052369077, + "grad_norm": 0.628425790770739, + "learning_rate": 4.8791095517673334e-05, + "loss": 0.9476, + "step": 2536 + }, + { + "epoch": 0.12653366583541148, + "grad_norm": 0.4737020093588516, + "learning_rate": 4.878985458033548e-05, + "loss": 0.9123, + "step": 2537 + }, + { + "epoch": 0.12658354114713216, + "grad_norm": 0.7565514611548096, + "learning_rate": 4.878861302221203e-05, + "loss": 0.9008, + "step": 2538 + }, + { + "epoch": 0.12663341645885287, + "grad_norm": 0.613078606696238, + "learning_rate": 4.8787370843335355e-05, + "loss": 0.9263, + "step": 2539 + }, + { + "epoch": 0.12668329177057355, + "grad_norm": 0.4547629292261244, + "learning_rate": 4.878612804373789e-05, + "loss": 0.9419, + "step": 2540 + }, + { + "epoch": 0.12673316708229426, + "grad_norm": 0.4397809225001333, + "learning_rate": 4.878488462345206e-05, + "loss": 0.9265, + "step": 2541 + }, + { + "epoch": 0.12678304239401497, + "grad_norm": 0.6419141728385566, + "learning_rate": 4.8783640582510305e-05, + "loss": 0.9931, + "step": 2542 + }, + { + "epoch": 0.12683291770573565, + "grad_norm": 0.6027833523144082, + "learning_rate": 4.87823959209451e-05, + "loss": 0.9669, + "step": 2543 + }, + { + "epoch": 0.12688279301745636, + "grad_norm": 0.5132976054919736, + "learning_rate": 4.878115063878891e-05, + "loss": 0.9657, + "step": 2544 + }, + { + "epoch": 0.12693266832917705, + "grad_norm": 0.49754388104009534, + "learning_rate": 4.877990473607424e-05, + "loss": 0.9192, + "step": 2545 + }, + { + "epoch": 0.12698254364089775, + "grad_norm": 0.5111503927599647, + "learning_rate": 4.87786582128336e-05, + "loss": 0.9086, + "step": 2546 + }, + { + "epoch": 0.12703241895261846, + "grad_norm": 0.8350984827222339, + "learning_rate": 4.877741106909952e-05, + "loss": 0.9753, + "step": 2547 + }, + { + "epoch": 0.12708229426433915, + "grad_norm": 0.5514407513069567, + "learning_rate": 4.8776163304904533e-05, + "loss": 0.9572, + "step": 2548 + }, + { + "epoch": 0.12713216957605986, + "grad_norm": 0.5508319652336738, + "learning_rate": 4.877491492028121e-05, + "loss": 0.9841, + "step": 2549 + }, + { + "epoch": 0.12718204488778054, + "grad_norm": 0.5506214311923125, + "learning_rate": 4.877366591526212e-05, + "loss": 0.9188, + "step": 2550 + }, + { + "epoch": 0.12723192019950125, + "grad_norm": 0.543380037048668, + "learning_rate": 4.877241628987986e-05, + "loss": 0.9248, + "step": 2551 + }, + { + "epoch": 0.12728179551122196, + "grad_norm": 0.47150864816914995, + "learning_rate": 4.877116604416703e-05, + "loss": 0.9571, + "step": 2552 + }, + { + "epoch": 0.12733167082294264, + "grad_norm": 0.47554412346806596, + "learning_rate": 4.876991517815627e-05, + "loss": 0.9233, + "step": 2553 + }, + { + "epoch": 0.12738154613466335, + "grad_norm": 1.0811683771435168, + "learning_rate": 4.87686636918802e-05, + "loss": 0.9511, + "step": 2554 + }, + { + "epoch": 0.12743142144638403, + "grad_norm": 0.6700061939331423, + "learning_rate": 4.8767411585371493e-05, + "loss": 0.8958, + "step": 2555 + }, + { + "epoch": 0.12748129675810474, + "grad_norm": 0.8478627599855716, + "learning_rate": 4.876615885866282e-05, + "loss": 0.9407, + "step": 2556 + }, + { + "epoch": 0.12753117206982545, + "grad_norm": 0.47568034216490523, + "learning_rate": 4.8764905511786866e-05, + "loss": 0.9505, + "step": 2557 + }, + { + "epoch": 0.12758104738154613, + "grad_norm": 0.6385106014302973, + "learning_rate": 4.876365154477634e-05, + "loss": 0.9189, + "step": 2558 + }, + { + "epoch": 0.12763092269326684, + "grad_norm": 0.566439871811003, + "learning_rate": 4.8762396957663956e-05, + "loss": 0.9428, + "step": 2559 + }, + { + "epoch": 0.12768079800498752, + "grad_norm": 0.5598493826329296, + "learning_rate": 4.876114175048247e-05, + "loss": 0.9303, + "step": 2560 + }, + { + "epoch": 0.12773067331670823, + "grad_norm": 0.7353731237405755, + "learning_rate": 4.875988592326462e-05, + "loss": 0.9653, + "step": 2561 + }, + { + "epoch": 0.1277805486284289, + "grad_norm": 0.5168011122374757, + "learning_rate": 4.875862947604317e-05, + "loss": 0.9588, + "step": 2562 + }, + { + "epoch": 0.12783042394014962, + "grad_norm": 0.7347319365465642, + "learning_rate": 4.875737240885093e-05, + "loss": 0.9341, + "step": 2563 + }, + { + "epoch": 0.12788029925187033, + "grad_norm": 0.5074551510573369, + "learning_rate": 4.875611472172068e-05, + "loss": 0.9273, + "step": 2564 + }, + { + "epoch": 0.12793017456359101, + "grad_norm": 0.5490982778973754, + "learning_rate": 4.875485641468526e-05, + "loss": 0.9236, + "step": 2565 + }, + { + "epoch": 0.12798004987531172, + "grad_norm": 0.6262601957213807, + "learning_rate": 4.875359748777748e-05, + "loss": 0.918, + "step": 2566 + }, + { + "epoch": 0.1280299251870324, + "grad_norm": 0.45263661624689727, + "learning_rate": 4.875233794103021e-05, + "loss": 0.9234, + "step": 2567 + }, + { + "epoch": 0.12807980049875312, + "grad_norm": 1.2063263801757003, + "learning_rate": 4.8751077774476314e-05, + "loss": 0.9593, + "step": 2568 + }, + { + "epoch": 0.12812967581047383, + "grad_norm": 0.5345695067820017, + "learning_rate": 4.8749816988148666e-05, + "loss": 0.9341, + "step": 2569 + }, + { + "epoch": 0.1281795511221945, + "grad_norm": 0.5020322583502898, + "learning_rate": 4.874855558208018e-05, + "loss": 0.9289, + "step": 2570 + }, + { + "epoch": 0.12822942643391522, + "grad_norm": 0.5527097611570031, + "learning_rate": 4.874729355630376e-05, + "loss": 0.9523, + "step": 2571 + }, + { + "epoch": 0.1282793017456359, + "grad_norm": 0.5613476545279154, + "learning_rate": 4.8746030910852346e-05, + "loss": 0.9512, + "step": 2572 + }, + { + "epoch": 0.1283291770573566, + "grad_norm": 0.52272634995969, + "learning_rate": 4.8744767645758875e-05, + "loss": 0.9252, + "step": 2573 + }, + { + "epoch": 0.12837905236907732, + "grad_norm": 0.6450338252920194, + "learning_rate": 4.874350376105633e-05, + "loss": 0.9451, + "step": 2574 + }, + { + "epoch": 0.128428927680798, + "grad_norm": 0.5418385439279217, + "learning_rate": 4.8742239256777674e-05, + "loss": 0.9306, + "step": 2575 + }, + { + "epoch": 0.1284788029925187, + "grad_norm": 0.9540209839822271, + "learning_rate": 4.874097413295591e-05, + "loss": 0.9602, + "step": 2576 + }, + { + "epoch": 0.1285286783042394, + "grad_norm": 0.45327541827085954, + "learning_rate": 4.873970838962405e-05, + "loss": 0.9339, + "step": 2577 + }, + { + "epoch": 0.1285785536159601, + "grad_norm": 0.6160839367360481, + "learning_rate": 4.8738442026815134e-05, + "loss": 0.975, + "step": 2578 + }, + { + "epoch": 0.1286284289276808, + "grad_norm": 0.6680720745256787, + "learning_rate": 4.873717504456219e-05, + "loss": 0.984, + "step": 2579 + }, + { + "epoch": 0.1286783042394015, + "grad_norm": 0.6482503582586282, + "learning_rate": 4.873590744289828e-05, + "loss": 0.9308, + "step": 2580 + }, + { + "epoch": 0.1287281795511222, + "grad_norm": 1.0137624374222918, + "learning_rate": 4.8734639221856495e-05, + "loss": 0.91, + "step": 2581 + }, + { + "epoch": 0.12877805486284288, + "grad_norm": 0.6342332533783799, + "learning_rate": 4.873337038146992e-05, + "loss": 0.9593, + "step": 2582 + }, + { + "epoch": 0.1288279301745636, + "grad_norm": 0.8102576703717834, + "learning_rate": 4.873210092177167e-05, + "loss": 0.9454, + "step": 2583 + }, + { + "epoch": 0.1288778054862843, + "grad_norm": 0.6147253624100785, + "learning_rate": 4.873083084279487e-05, + "loss": 0.9561, + "step": 2584 + }, + { + "epoch": 0.12892768079800498, + "grad_norm": 0.6949691090929959, + "learning_rate": 4.872956014457265e-05, + "loss": 0.9613, + "step": 2585 + }, + { + "epoch": 0.1289775561097257, + "grad_norm": 0.5486424321885098, + "learning_rate": 4.872828882713818e-05, + "loss": 0.9278, + "step": 2586 + }, + { + "epoch": 0.12902743142144638, + "grad_norm": 0.6676737038975041, + "learning_rate": 4.872701689052464e-05, + "loss": 0.9237, + "step": 2587 + }, + { + "epoch": 0.12907730673316709, + "grad_norm": 0.6141254363640949, + "learning_rate": 4.872574433476521e-05, + "loss": 0.8869, + "step": 2588 + }, + { + "epoch": 0.12912718204488777, + "grad_norm": 0.574787603380592, + "learning_rate": 4.87244711598931e-05, + "loss": 0.9486, + "step": 2589 + }, + { + "epoch": 0.12917705735660848, + "grad_norm": 0.6067393058361944, + "learning_rate": 4.872319736594153e-05, + "loss": 0.9445, + "step": 2590 + }, + { + "epoch": 0.1292269326683292, + "grad_norm": 0.5425199083973833, + "learning_rate": 4.872192295294374e-05, + "loss": 0.8994, + "step": 2591 + }, + { + "epoch": 0.12927680798004987, + "grad_norm": 0.5246376179491908, + "learning_rate": 4.872064792093299e-05, + "loss": 0.9265, + "step": 2592 + }, + { + "epoch": 0.12932668329177058, + "grad_norm": 0.5281109680951204, + "learning_rate": 4.871937226994255e-05, + "loss": 0.9072, + "step": 2593 + }, + { + "epoch": 0.12937655860349126, + "grad_norm": 0.4734867839922408, + "learning_rate": 4.871809600000571e-05, + "loss": 0.9646, + "step": 2594 + }, + { + "epoch": 0.12942643391521197, + "grad_norm": 0.5947213269104761, + "learning_rate": 4.871681911115577e-05, + "loss": 0.9213, + "step": 2595 + }, + { + "epoch": 0.12947630922693268, + "grad_norm": 0.726602212196281, + "learning_rate": 4.871554160342604e-05, + "loss": 0.9964, + "step": 2596 + }, + { + "epoch": 0.12952618453865336, + "grad_norm": 0.6210619433820318, + "learning_rate": 4.8714263476849863e-05, + "loss": 0.9382, + "step": 2597 + }, + { + "epoch": 0.12957605985037407, + "grad_norm": 0.5106644902608305, + "learning_rate": 4.8712984731460596e-05, + "loss": 0.9549, + "step": 2598 + }, + { + "epoch": 0.12962593516209475, + "grad_norm": 0.479936036748056, + "learning_rate": 4.8711705367291616e-05, + "loss": 0.8958, + "step": 2599 + }, + { + "epoch": 0.12967581047381546, + "grad_norm": 0.6229844639055854, + "learning_rate": 4.871042538437629e-05, + "loss": 0.9555, + "step": 2600 + }, + { + "epoch": 0.12972568578553617, + "grad_norm": 0.6061719164466731, + "learning_rate": 4.870914478274802e-05, + "loss": 0.9307, + "step": 2601 + }, + { + "epoch": 0.12977556109725685, + "grad_norm": 0.6896908779402554, + "learning_rate": 4.870786356244024e-05, + "loss": 0.9507, + "step": 2602 + }, + { + "epoch": 0.12982543640897756, + "grad_norm": 0.7250033215736948, + "learning_rate": 4.8706581723486355e-05, + "loss": 0.948, + "step": 2603 + }, + { + "epoch": 0.12987531172069824, + "grad_norm": 0.517308161551889, + "learning_rate": 4.870529926591984e-05, + "loss": 0.999, + "step": 2604 + }, + { + "epoch": 0.12992518703241895, + "grad_norm": 0.5912579907988132, + "learning_rate": 4.870401618977415e-05, + "loss": 0.9175, + "step": 2605 + }, + { + "epoch": 0.12997506234413966, + "grad_norm": 0.5883840669150645, + "learning_rate": 4.870273249508276e-05, + "loss": 0.9203, + "step": 2606 + }, + { + "epoch": 0.13002493765586035, + "grad_norm": 0.8286424461744715, + "learning_rate": 4.870144818187918e-05, + "loss": 0.9895, + "step": 2607 + }, + { + "epoch": 0.13007481296758105, + "grad_norm": 0.4971831895520287, + "learning_rate": 4.8700163250196914e-05, + "loss": 0.9148, + "step": 2608 + }, + { + "epoch": 0.13012468827930174, + "grad_norm": 0.4702517763059919, + "learning_rate": 4.86988777000695e-05, + "loss": 0.9485, + "step": 2609 + }, + { + "epoch": 0.13017456359102245, + "grad_norm": 1.0047542805650513, + "learning_rate": 4.869759153153047e-05, + "loss": 0.9215, + "step": 2610 + }, + { + "epoch": 0.13022443890274316, + "grad_norm": 0.6308363246344593, + "learning_rate": 4.86963047446134e-05, + "loss": 0.9573, + "step": 2611 + }, + { + "epoch": 0.13027431421446384, + "grad_norm": 0.4997833025308874, + "learning_rate": 4.869501733935186e-05, + "loss": 0.9539, + "step": 2612 + }, + { + "epoch": 0.13032418952618455, + "grad_norm": 0.5207066740858599, + "learning_rate": 4.869372931577944e-05, + "loss": 0.9205, + "step": 2613 + }, + { + "epoch": 0.13037406483790523, + "grad_norm": 0.46110345391241636, + "learning_rate": 4.869244067392977e-05, + "loss": 0.9535, + "step": 2614 + }, + { + "epoch": 0.13042394014962594, + "grad_norm": 0.5344022387391283, + "learning_rate": 4.869115141383646e-05, + "loss": 0.9315, + "step": 2615 + }, + { + "epoch": 0.13047381546134662, + "grad_norm": 0.7733743646798085, + "learning_rate": 4.8689861535533155e-05, + "loss": 0.9131, + "step": 2616 + }, + { + "epoch": 0.13052369077306733, + "grad_norm": 0.5016398834734225, + "learning_rate": 4.868857103905352e-05, + "loss": 0.9454, + "step": 2617 + }, + { + "epoch": 0.13057356608478804, + "grad_norm": 0.4660491134691784, + "learning_rate": 4.868727992443122e-05, + "loss": 0.9417, + "step": 2618 + }, + { + "epoch": 0.13062344139650872, + "grad_norm": 1.3077079015777746, + "learning_rate": 4.868598819169995e-05, + "loss": 0.8902, + "step": 2619 + }, + { + "epoch": 0.13067331670822943, + "grad_norm": 0.6042964227410672, + "learning_rate": 4.868469584089343e-05, + "loss": 0.9828, + "step": 2620 + }, + { + "epoch": 0.1307231920199501, + "grad_norm": 0.5764394799208817, + "learning_rate": 4.868340287204536e-05, + "loss": 0.9314, + "step": 2621 + }, + { + "epoch": 0.13077306733167082, + "grad_norm": 0.5719268023110273, + "learning_rate": 4.868210928518949e-05, + "loss": 0.959, + "step": 2622 + }, + { + "epoch": 0.13082294264339153, + "grad_norm": 0.7126468376470663, + "learning_rate": 4.8680815080359585e-05, + "loss": 0.932, + "step": 2623 + }, + { + "epoch": 0.13087281795511221, + "grad_norm": 0.5506925839647817, + "learning_rate": 4.867952025758941e-05, + "loss": 0.9691, + "step": 2624 + }, + { + "epoch": 0.13092269326683292, + "grad_norm": 0.6429577064045894, + "learning_rate": 4.8678224816912744e-05, + "loss": 0.9793, + "step": 2625 + }, + { + "epoch": 0.1309725685785536, + "grad_norm": 0.5965852329533774, + "learning_rate": 4.8676928758363395e-05, + "loss": 0.9558, + "step": 2626 + }, + { + "epoch": 0.13102244389027431, + "grad_norm": 0.7988024046121336, + "learning_rate": 4.86756320819752e-05, + "loss": 0.94, + "step": 2627 + }, + { + "epoch": 0.13107231920199502, + "grad_norm": 0.5077260082317199, + "learning_rate": 4.867433478778197e-05, + "loss": 0.9281, + "step": 2628 + }, + { + "epoch": 0.1311221945137157, + "grad_norm": 0.4412064766328583, + "learning_rate": 4.867303687581757e-05, + "loss": 0.9521, + "step": 2629 + }, + { + "epoch": 0.13117206982543642, + "grad_norm": 0.5387448296655984, + "learning_rate": 4.867173834611587e-05, + "loss": 0.9421, + "step": 2630 + }, + { + "epoch": 0.1312219451371571, + "grad_norm": 0.5001636204469312, + "learning_rate": 4.867043919871076e-05, + "loss": 1.0, + "step": 2631 + }, + { + "epoch": 0.1312718204488778, + "grad_norm": 0.6643131715153875, + "learning_rate": 4.866913943363612e-05, + "loss": 0.9549, + "step": 2632 + }, + { + "epoch": 0.13132169576059852, + "grad_norm": 0.8851297312813998, + "learning_rate": 4.866783905092589e-05, + "loss": 0.9456, + "step": 2633 + }, + { + "epoch": 0.1313715710723192, + "grad_norm": 0.6267266246298642, + "learning_rate": 4.866653805061398e-05, + "loss": 0.926, + "step": 2634 + }, + { + "epoch": 0.1314214463840399, + "grad_norm": 0.6541933383806251, + "learning_rate": 4.866523643273436e-05, + "loss": 0.9286, + "step": 2635 + }, + { + "epoch": 0.1314713216957606, + "grad_norm": 0.49373370803790745, + "learning_rate": 4.866393419732099e-05, + "loss": 0.9768, + "step": 2636 + }, + { + "epoch": 0.1315211970074813, + "grad_norm": 0.5281370373886759, + "learning_rate": 4.866263134440784e-05, + "loss": 0.9468, + "step": 2637 + }, + { + "epoch": 0.13157107231920198, + "grad_norm": 0.5094832933808822, + "learning_rate": 4.866132787402892e-05, + "loss": 0.9188, + "step": 2638 + }, + { + "epoch": 0.1316209476309227, + "grad_norm": 0.4741774046517763, + "learning_rate": 4.866002378621823e-05, + "loss": 0.9555, + "step": 2639 + }, + { + "epoch": 0.1316708229426434, + "grad_norm": 0.6894758544939477, + "learning_rate": 4.865871908100982e-05, + "loss": 0.9359, + "step": 2640 + }, + { + "epoch": 0.13172069825436408, + "grad_norm": 0.7859337800980302, + "learning_rate": 4.8657413758437714e-05, + "loss": 0.9022, + "step": 2641 + }, + { + "epoch": 0.1317705735660848, + "grad_norm": 0.5293974861965746, + "learning_rate": 4.865610781853599e-05, + "loss": 0.9017, + "step": 2642 + }, + { + "epoch": 0.13182044887780547, + "grad_norm": 0.5717644409756125, + "learning_rate": 4.865480126133872e-05, + "loss": 0.9734, + "step": 2643 + }, + { + "epoch": 0.13187032418952618, + "grad_norm": 0.46425215245775214, + "learning_rate": 4.8653494086879994e-05, + "loss": 0.9104, + "step": 2644 + }, + { + "epoch": 0.1319201995012469, + "grad_norm": 0.42421219246258607, + "learning_rate": 4.8652186295193926e-05, + "loss": 0.9224, + "step": 2645 + }, + { + "epoch": 0.13197007481296757, + "grad_norm": 0.6643774813028477, + "learning_rate": 4.865087788631464e-05, + "loss": 0.9773, + "step": 2646 + }, + { + "epoch": 0.13201995012468828, + "grad_norm": 0.5245717760193878, + "learning_rate": 4.864956886027628e-05, + "loss": 0.9687, + "step": 2647 + }, + { + "epoch": 0.13206982543640897, + "grad_norm": 0.6004811524045136, + "learning_rate": 4.864825921711301e-05, + "loss": 0.9832, + "step": 2648 + }, + { + "epoch": 0.13211970074812968, + "grad_norm": 0.5459399189332681, + "learning_rate": 4.8646948956859e-05, + "loss": 0.945, + "step": 2649 + }, + { + "epoch": 0.13216957605985039, + "grad_norm": 0.4322656946221957, + "learning_rate": 4.864563807954844e-05, + "loss": 0.913, + "step": 2650 + }, + { + "epoch": 0.13221945137157107, + "grad_norm": 0.5444558963202328, + "learning_rate": 4.8644326585215526e-05, + "loss": 0.9657, + "step": 2651 + }, + { + "epoch": 0.13226932668329178, + "grad_norm": 0.6023285246946475, + "learning_rate": 4.86430144738945e-05, + "loss": 0.9542, + "step": 2652 + }, + { + "epoch": 0.13231920199501246, + "grad_norm": 0.5306517236001483, + "learning_rate": 4.864170174561959e-05, + "loss": 0.9642, + "step": 2653 + }, + { + "epoch": 0.13236907730673317, + "grad_norm": 0.4901700133263797, + "learning_rate": 4.8640388400425064e-05, + "loss": 0.9397, + "step": 2654 + }, + { + "epoch": 0.13241895261845388, + "grad_norm": 0.5564082828009401, + "learning_rate": 4.863907443834517e-05, + "loss": 0.9311, + "step": 2655 + }, + { + "epoch": 0.13246882793017456, + "grad_norm": 0.6063890061544215, + "learning_rate": 4.863775985941421e-05, + "loss": 0.9407, + "step": 2656 + }, + { + "epoch": 0.13251870324189527, + "grad_norm": 0.42140698159230516, + "learning_rate": 4.863644466366649e-05, + "loss": 0.9349, + "step": 2657 + }, + { + "epoch": 0.13256857855361595, + "grad_norm": 0.5137408187112721, + "learning_rate": 4.863512885113632e-05, + "loss": 0.9538, + "step": 2658 + }, + { + "epoch": 0.13261845386533666, + "grad_norm": 0.5232755852667, + "learning_rate": 4.8633812421858045e-05, + "loss": 0.9569, + "step": 2659 + }, + { + "epoch": 0.13266832917705737, + "grad_norm": 0.508867739758271, + "learning_rate": 4.8632495375866004e-05, + "loss": 0.9134, + "step": 2660 + }, + { + "epoch": 0.13271820448877805, + "grad_norm": 0.5909221078912489, + "learning_rate": 4.8631177713194586e-05, + "loss": 0.9428, + "step": 2661 + }, + { + "epoch": 0.13276807980049876, + "grad_norm": 0.5028386096402265, + "learning_rate": 4.862985943387815e-05, + "loss": 0.9508, + "step": 2662 + }, + { + "epoch": 0.13281795511221944, + "grad_norm": 0.6127109077532618, + "learning_rate": 4.862854053795111e-05, + "loss": 0.9515, + "step": 2663 + }, + { + "epoch": 0.13286783042394015, + "grad_norm": 0.5399962344909455, + "learning_rate": 4.8627221025447883e-05, + "loss": 0.9612, + "step": 2664 + }, + { + "epoch": 0.13291770573566083, + "grad_norm": 0.49185463513524197, + "learning_rate": 4.86259008964029e-05, + "loss": 0.9422, + "step": 2665 + }, + { + "epoch": 0.13296758104738154, + "grad_norm": 0.44843219933133127, + "learning_rate": 4.86245801508506e-05, + "loss": 0.9572, + "step": 2666 + }, + { + "epoch": 0.13301745635910225, + "grad_norm": 0.5111931393419638, + "learning_rate": 4.862325878882546e-05, + "loss": 0.9201, + "step": 2667 + }, + { + "epoch": 0.13306733167082294, + "grad_norm": 0.6709967178760219, + "learning_rate": 4.8621936810361945e-05, + "loss": 0.9706, + "step": 2668 + }, + { + "epoch": 0.13311720698254365, + "grad_norm": 0.45596781778377177, + "learning_rate": 4.8620614215494565e-05, + "loss": 0.9276, + "step": 2669 + }, + { + "epoch": 0.13316708229426433, + "grad_norm": 0.7960211662954614, + "learning_rate": 4.861929100425783e-05, + "loss": 0.9449, + "step": 2670 + }, + { + "epoch": 0.13321695760598504, + "grad_norm": 0.5807786159263544, + "learning_rate": 4.8617967176686266e-05, + "loss": 0.9491, + "step": 2671 + }, + { + "epoch": 0.13326683291770575, + "grad_norm": 0.7093965524426001, + "learning_rate": 4.8616642732814416e-05, + "loss": 0.9149, + "step": 2672 + }, + { + "epoch": 0.13331670822942643, + "grad_norm": 0.511617279036058, + "learning_rate": 4.861531767267685e-05, + "loss": 0.9404, + "step": 2673 + }, + { + "epoch": 0.13336658354114714, + "grad_norm": 0.9431852592601128, + "learning_rate": 4.8613991996308126e-05, + "loss": 0.9089, + "step": 2674 + }, + { + "epoch": 0.13341645885286782, + "grad_norm": 0.4805400798489245, + "learning_rate": 4.861266570374286e-05, + "loss": 0.9127, + "step": 2675 + }, + { + "epoch": 0.13346633416458853, + "grad_norm": 0.45668202747964326, + "learning_rate": 4.861133879501565e-05, + "loss": 0.9018, + "step": 2676 + }, + { + "epoch": 0.13351620947630924, + "grad_norm": 0.5742283564978234, + "learning_rate": 4.861001127016112e-05, + "loss": 0.9476, + "step": 2677 + }, + { + "epoch": 0.13356608478802992, + "grad_norm": 0.5019496133940687, + "learning_rate": 4.860868312921391e-05, + "loss": 0.9756, + "step": 2678 + }, + { + "epoch": 0.13361596009975063, + "grad_norm": 0.49536322114874193, + "learning_rate": 4.860735437220868e-05, + "loss": 0.925, + "step": 2679 + }, + { + "epoch": 0.1336658354114713, + "grad_norm": 0.6485954293362409, + "learning_rate": 4.86060249991801e-05, + "loss": 0.9821, + "step": 2680 + }, + { + "epoch": 0.13371571072319202, + "grad_norm": 0.490729620465909, + "learning_rate": 4.860469501016286e-05, + "loss": 0.9223, + "step": 2681 + }, + { + "epoch": 0.13376558603491273, + "grad_norm": 1.0523260976031736, + "learning_rate": 4.860336440519168e-05, + "loss": 0.9436, + "step": 2682 + }, + { + "epoch": 0.1338154613466334, + "grad_norm": 0.7397657990131172, + "learning_rate": 4.860203318430126e-05, + "loss": 0.9665, + "step": 2683 + }, + { + "epoch": 0.13386533665835412, + "grad_norm": 0.49933779097115777, + "learning_rate": 4.860070134752635e-05, + "loss": 0.899, + "step": 2684 + }, + { + "epoch": 0.1339152119700748, + "grad_norm": 0.49992586144958484, + "learning_rate": 4.85993688949017e-05, + "loss": 1.0225, + "step": 2685 + }, + { + "epoch": 0.13396508728179551, + "grad_norm": 0.7388072355115396, + "learning_rate": 4.859803582646208e-05, + "loss": 0.9465, + "step": 2686 + }, + { + "epoch": 0.1340149625935162, + "grad_norm": 0.7130865759669265, + "learning_rate": 4.859670214224228e-05, + "loss": 0.9824, + "step": 2687 + }, + { + "epoch": 0.1340648379052369, + "grad_norm": 0.5686119589448129, + "learning_rate": 4.8595367842277096e-05, + "loss": 0.8731, + "step": 2688 + }, + { + "epoch": 0.13411471321695762, + "grad_norm": 0.6018478645599136, + "learning_rate": 4.859403292660134e-05, + "loss": 0.9123, + "step": 2689 + }, + { + "epoch": 0.1341645885286783, + "grad_norm": 0.6749427359007767, + "learning_rate": 4.8592697395249864e-05, + "loss": 1.0051, + "step": 2690 + }, + { + "epoch": 0.134214463840399, + "grad_norm": 2.361973068647843, + "learning_rate": 4.859136124825751e-05, + "loss": 0.9658, + "step": 2691 + }, + { + "epoch": 0.1342643391521197, + "grad_norm": 0.7029031523348647, + "learning_rate": 4.8590024485659134e-05, + "loss": 0.9423, + "step": 2692 + }, + { + "epoch": 0.1343142144638404, + "grad_norm": 0.546992032948314, + "learning_rate": 4.858868710748963e-05, + "loss": 1.0049, + "step": 2693 + }, + { + "epoch": 0.1343640897755611, + "grad_norm": 0.5637773450687439, + "learning_rate": 4.858734911378389e-05, + "loss": 0.937, + "step": 2694 + }, + { + "epoch": 0.1344139650872818, + "grad_norm": 1.008629721741846, + "learning_rate": 4.858601050457684e-05, + "loss": 0.9228, + "step": 2695 + }, + { + "epoch": 0.1344638403990025, + "grad_norm": 0.5837656068927529, + "learning_rate": 4.8584671279903396e-05, + "loss": 0.9253, + "step": 2696 + }, + { + "epoch": 0.13451371571072318, + "grad_norm": 0.710758722852074, + "learning_rate": 4.8583331439798504e-05, + "loss": 0.9439, + "step": 2697 + }, + { + "epoch": 0.1345635910224439, + "grad_norm": 0.5172329522544653, + "learning_rate": 4.8581990984297146e-05, + "loss": 0.9289, + "step": 2698 + }, + { + "epoch": 0.1346134663341646, + "grad_norm": 0.5279423329960198, + "learning_rate": 4.8580649913434273e-05, + "loss": 0.8881, + "step": 2699 + }, + { + "epoch": 0.13466334164588528, + "grad_norm": 0.4980750609680397, + "learning_rate": 4.85793082272449e-05, + "loss": 0.939, + "step": 2700 + }, + { + "epoch": 0.134713216957606, + "grad_norm": 0.4822392932889666, + "learning_rate": 4.8577965925764035e-05, + "loss": 0.9716, + "step": 2701 + }, + { + "epoch": 0.13476309226932667, + "grad_norm": 0.45964156132851075, + "learning_rate": 4.85766230090267e-05, + "loss": 0.9716, + "step": 2702 + }, + { + "epoch": 0.13481296758104738, + "grad_norm": 0.4770113164789637, + "learning_rate": 4.8575279477067934e-05, + "loss": 0.9212, + "step": 2703 + }, + { + "epoch": 0.1348628428927681, + "grad_norm": 0.6393340230748977, + "learning_rate": 4.8573935329922804e-05, + "loss": 0.9817, + "step": 2704 + }, + { + "epoch": 0.13491271820448877, + "grad_norm": 0.5250172691493201, + "learning_rate": 4.8572590567626386e-05, + "loss": 0.8737, + "step": 2705 + }, + { + "epoch": 0.13496259351620948, + "grad_norm": 0.8785302181077489, + "learning_rate": 4.857124519021376e-05, + "loss": 0.963, + "step": 2706 + }, + { + "epoch": 0.13501246882793017, + "grad_norm": 0.5468392537607896, + "learning_rate": 4.856989919772005e-05, + "loss": 0.9439, + "step": 2707 + }, + { + "epoch": 0.13506234413965088, + "grad_norm": 0.5030383683933787, + "learning_rate": 4.856855259018035e-05, + "loss": 0.9508, + "step": 2708 + }, + { + "epoch": 0.13511221945137158, + "grad_norm": 0.7022822784806978, + "learning_rate": 4.8567205367629835e-05, + "loss": 0.9198, + "step": 2709 + }, + { + "epoch": 0.13516209476309227, + "grad_norm": 0.46038544571018236, + "learning_rate": 4.856585753010364e-05, + "loss": 0.9797, + "step": 2710 + }, + { + "epoch": 0.13521197007481298, + "grad_norm": 0.5900644814533815, + "learning_rate": 4.856450907763693e-05, + "loss": 0.9362, + "step": 2711 + }, + { + "epoch": 0.13526184538653366, + "grad_norm": 0.4812778643636692, + "learning_rate": 4.856316001026492e-05, + "loss": 0.9899, + "step": 2712 + }, + { + "epoch": 0.13531172069825437, + "grad_norm": 0.4358041986433433, + "learning_rate": 4.8561810328022786e-05, + "loss": 0.898, + "step": 2713 + }, + { + "epoch": 0.13536159600997505, + "grad_norm": 0.7203045648617928, + "learning_rate": 4.8560460030945747e-05, + "loss": 0.9348, + "step": 2714 + }, + { + "epoch": 0.13541147132169576, + "grad_norm": 0.4898729364713474, + "learning_rate": 4.855910911906906e-05, + "loss": 0.9281, + "step": 2715 + }, + { + "epoch": 0.13546134663341647, + "grad_norm": 0.5637299665792728, + "learning_rate": 4.8557757592427956e-05, + "loss": 0.9515, + "step": 2716 + }, + { + "epoch": 0.13551122194513715, + "grad_norm": 0.8234707011620007, + "learning_rate": 4.855640545105772e-05, + "loss": 0.9583, + "step": 2717 + }, + { + "epoch": 0.13556109725685786, + "grad_norm": 0.5643085495246281, + "learning_rate": 4.855505269499362e-05, + "loss": 0.9945, + "step": 2718 + }, + { + "epoch": 0.13561097256857854, + "grad_norm": 0.4572285795907801, + "learning_rate": 4.8553699324270964e-05, + "loss": 0.9487, + "step": 2719 + }, + { + "epoch": 0.13566084788029925, + "grad_norm": 0.4968932667050419, + "learning_rate": 4.855234533892506e-05, + "loss": 0.9471, + "step": 2720 + }, + { + "epoch": 0.13571072319201996, + "grad_norm": 0.4785506688659571, + "learning_rate": 4.855099073899125e-05, + "loss": 0.9311, + "step": 2721 + }, + { + "epoch": 0.13576059850374064, + "grad_norm": 0.7746465637113852, + "learning_rate": 4.8549635524504874e-05, + "loss": 0.9488, + "step": 2722 + }, + { + "epoch": 0.13581047381546135, + "grad_norm": 0.48607232362749003, + "learning_rate": 4.85482796955013e-05, + "loss": 0.9741, + "step": 2723 + }, + { + "epoch": 0.13586034912718203, + "grad_norm": 0.5247477769489393, + "learning_rate": 4.854692325201591e-05, + "loss": 0.9277, + "step": 2724 + }, + { + "epoch": 0.13591022443890274, + "grad_norm": 0.5574019612700817, + "learning_rate": 4.854556619408409e-05, + "loss": 0.9858, + "step": 2725 + }, + { + "epoch": 0.13596009975062345, + "grad_norm": 0.4518957614358897, + "learning_rate": 4.854420852174126e-05, + "loss": 0.8964, + "step": 2726 + }, + { + "epoch": 0.13600997506234414, + "grad_norm": 0.5258091539969528, + "learning_rate": 4.854285023502284e-05, + "loss": 0.9724, + "step": 2727 + }, + { + "epoch": 0.13605985037406484, + "grad_norm": 0.46566590749465986, + "learning_rate": 4.854149133396429e-05, + "loss": 0.9104, + "step": 2728 + }, + { + "epoch": 0.13610972568578553, + "grad_norm": 0.5047547267322936, + "learning_rate": 4.854013181860105e-05, + "loss": 0.9315, + "step": 2729 + }, + { + "epoch": 0.13615960099750624, + "grad_norm": 0.5282057257629854, + "learning_rate": 4.853877168896861e-05, + "loss": 0.9184, + "step": 2730 + }, + { + "epoch": 0.13620947630922695, + "grad_norm": 0.7614536747878053, + "learning_rate": 4.853741094510246e-05, + "loss": 0.9408, + "step": 2731 + }, + { + "epoch": 0.13625935162094763, + "grad_norm": 0.5303685277142434, + "learning_rate": 4.85360495870381e-05, + "loss": 0.9502, + "step": 2732 + }, + { + "epoch": 0.13630922693266834, + "grad_norm": 0.9293914218027111, + "learning_rate": 4.8534687614811064e-05, + "loss": 0.9512, + "step": 2733 + }, + { + "epoch": 0.13635910224438902, + "grad_norm": 0.6299947324283861, + "learning_rate": 4.853332502845689e-05, + "loss": 0.9598, + "step": 2734 + }, + { + "epoch": 0.13640897755610973, + "grad_norm": 0.58784212450682, + "learning_rate": 4.8531961828011124e-05, + "loss": 0.933, + "step": 2735 + }, + { + "epoch": 0.1364588528678304, + "grad_norm": 0.5085783211256001, + "learning_rate": 4.853059801350935e-05, + "loss": 0.9609, + "step": 2736 + }, + { + "epoch": 0.13650872817955112, + "grad_norm": 0.49952741158599095, + "learning_rate": 4.852923358498715e-05, + "loss": 0.9482, + "step": 2737 + }, + { + "epoch": 0.13655860349127183, + "grad_norm": 0.5026551148044648, + "learning_rate": 4.852786854248013e-05, + "loss": 0.943, + "step": 2738 + }, + { + "epoch": 0.1366084788029925, + "grad_norm": 0.46638602523518785, + "learning_rate": 4.85265028860239e-05, + "loss": 0.9137, + "step": 2739 + }, + { + "epoch": 0.13665835411471322, + "grad_norm": 0.7261898476170359, + "learning_rate": 4.8525136615654126e-05, + "loss": 0.9446, + "step": 2740 + }, + { + "epoch": 0.1367082294264339, + "grad_norm": 0.6116005074760351, + "learning_rate": 4.852376973140643e-05, + "loss": 0.9995, + "step": 2741 + }, + { + "epoch": 0.1367581047381546, + "grad_norm": 0.5127461516838465, + "learning_rate": 4.8522402233316486e-05, + "loss": 0.9603, + "step": 2742 + }, + { + "epoch": 0.13680798004987532, + "grad_norm": 0.4941342011221172, + "learning_rate": 4.8521034121419996e-05, + "loss": 0.9147, + "step": 2743 + }, + { + "epoch": 0.136857855361596, + "grad_norm": 0.6760677377951865, + "learning_rate": 4.851966539575264e-05, + "loss": 0.9133, + "step": 2744 + }, + { + "epoch": 0.1369077306733167, + "grad_norm": 0.48692231366354255, + "learning_rate": 4.851829605635014e-05, + "loss": 0.9122, + "step": 2745 + }, + { + "epoch": 0.1369576059850374, + "grad_norm": 0.45638179243769056, + "learning_rate": 4.851692610324823e-05, + "loss": 0.9373, + "step": 2746 + }, + { + "epoch": 0.1370074812967581, + "grad_norm": 0.6051605783282017, + "learning_rate": 4.851555553648266e-05, + "loss": 0.948, + "step": 2747 + }, + { + "epoch": 0.13705735660847881, + "grad_norm": 0.6610765688402372, + "learning_rate": 4.8514184356089195e-05, + "loss": 0.9752, + "step": 2748 + }, + { + "epoch": 0.1371072319201995, + "grad_norm": 0.6206300393292109, + "learning_rate": 4.851281256210361e-05, + "loss": 0.9469, + "step": 2749 + }, + { + "epoch": 0.1371571072319202, + "grad_norm": 0.7331339571697003, + "learning_rate": 4.851144015456171e-05, + "loss": 0.9309, + "step": 2750 + }, + { + "epoch": 0.1372069825436409, + "grad_norm": 0.42027928880182736, + "learning_rate": 4.851006713349929e-05, + "loss": 0.8957, + "step": 2751 + }, + { + "epoch": 0.1372568578553616, + "grad_norm": 0.5020824871695827, + "learning_rate": 4.85086934989522e-05, + "loss": 0.9224, + "step": 2752 + }, + { + "epoch": 0.1373067331670823, + "grad_norm": 0.5924490135713155, + "learning_rate": 4.850731925095626e-05, + "loss": 0.9893, + "step": 2753 + }, + { + "epoch": 0.137356608478803, + "grad_norm": 0.626691420851097, + "learning_rate": 4.8505944389547355e-05, + "loss": 0.9543, + "step": 2754 + }, + { + "epoch": 0.1374064837905237, + "grad_norm": 0.4851005221774201, + "learning_rate": 4.850456891476135e-05, + "loss": 0.9083, + "step": 2755 + }, + { + "epoch": 0.13745635910224438, + "grad_norm": 0.5520128908864785, + "learning_rate": 4.850319282663414e-05, + "loss": 0.9537, + "step": 2756 + }, + { + "epoch": 0.1375062344139651, + "grad_norm": 0.5159736005790654, + "learning_rate": 4.8501816125201635e-05, + "loss": 0.9227, + "step": 2757 + }, + { + "epoch": 0.1375561097256858, + "grad_norm": 0.9341656174368099, + "learning_rate": 4.850043881049975e-05, + "loss": 0.9277, + "step": 2758 + }, + { + "epoch": 0.13760598503740648, + "grad_norm": 0.5567667422628811, + "learning_rate": 4.849906088256443e-05, + "loss": 0.9705, + "step": 2759 + }, + { + "epoch": 0.1376558603491272, + "grad_norm": 0.47193880421574935, + "learning_rate": 4.849768234143164e-05, + "loss": 0.9277, + "step": 2760 + }, + { + "epoch": 0.13770573566084787, + "grad_norm": 0.5987553581534371, + "learning_rate": 4.849630318713734e-05, + "loss": 0.9792, + "step": 2761 + }, + { + "epoch": 0.13775561097256858, + "grad_norm": 0.48178157679071276, + "learning_rate": 4.849492341971753e-05, + "loss": 0.9662, + "step": 2762 + }, + { + "epoch": 0.13780548628428926, + "grad_norm": 0.5253387564237567, + "learning_rate": 4.84935430392082e-05, + "loss": 1.01, + "step": 2763 + }, + { + "epoch": 0.13785536159600997, + "grad_norm": 0.5234386183382258, + "learning_rate": 4.8492162045645385e-05, + "loss": 0.9663, + "step": 2764 + }, + { + "epoch": 0.13790523690773068, + "grad_norm": 0.46490262968640694, + "learning_rate": 4.849078043906511e-05, + "loss": 0.9723, + "step": 2765 + }, + { + "epoch": 0.13795511221945136, + "grad_norm": 0.462631667311106, + "learning_rate": 4.8489398219503436e-05, + "loss": 0.9167, + "step": 2766 + }, + { + "epoch": 0.13800498753117207, + "grad_norm": 0.516083006822955, + "learning_rate": 4.848801538699643e-05, + "loss": 0.8918, + "step": 2767 + }, + { + "epoch": 0.13805486284289276, + "grad_norm": 0.5442862676564455, + "learning_rate": 4.848663194158018e-05, + "loss": 0.9486, + "step": 2768 + }, + { + "epoch": 0.13810473815461347, + "grad_norm": 0.5307075116860203, + "learning_rate": 4.8485247883290765e-05, + "loss": 0.9395, + "step": 2769 + }, + { + "epoch": 0.13815461346633418, + "grad_norm": 0.5099302781846576, + "learning_rate": 4.848386321216433e-05, + "loss": 0.9561, + "step": 2770 + }, + { + "epoch": 0.13820448877805486, + "grad_norm": 0.498417002033428, + "learning_rate": 4.848247792823698e-05, + "loss": 0.9847, + "step": 2771 + }, + { + "epoch": 0.13825436408977557, + "grad_norm": 0.47450630276737094, + "learning_rate": 4.8481092031544886e-05, + "loss": 0.9168, + "step": 2772 + }, + { + "epoch": 0.13830423940149625, + "grad_norm": 0.508084651095044, + "learning_rate": 4.847970552212421e-05, + "loss": 0.9505, + "step": 2773 + }, + { + "epoch": 0.13835411471321696, + "grad_norm": 0.6126355745978821, + "learning_rate": 4.847831840001112e-05, + "loss": 0.9092, + "step": 2774 + }, + { + "epoch": 0.13840399002493767, + "grad_norm": 1.160487580696267, + "learning_rate": 4.847693066524182e-05, + "loss": 0.941, + "step": 2775 + }, + { + "epoch": 0.13845386533665835, + "grad_norm": 0.5686115018352543, + "learning_rate": 4.8475542317852524e-05, + "loss": 0.9222, + "step": 2776 + }, + { + "epoch": 0.13850374064837906, + "grad_norm": 0.5077776715653681, + "learning_rate": 4.847415335787946e-05, + "loss": 0.9281, + "step": 2777 + }, + { + "epoch": 0.13855361596009974, + "grad_norm": 0.4669824323102154, + "learning_rate": 4.847276378535886e-05, + "loss": 0.9355, + "step": 2778 + }, + { + "epoch": 0.13860349127182045, + "grad_norm": 0.6727484736838982, + "learning_rate": 4.8471373600326996e-05, + "loss": 0.9784, + "step": 2779 + }, + { + "epoch": 0.13865336658354116, + "grad_norm": 0.5235211114694498, + "learning_rate": 4.8469982802820144e-05, + "loss": 0.9212, + "step": 2780 + }, + { + "epoch": 0.13870324189526184, + "grad_norm": 0.4626898461080583, + "learning_rate": 4.8468591392874604e-05, + "loss": 0.9077, + "step": 2781 + }, + { + "epoch": 0.13875311720698255, + "grad_norm": 0.4723383931426947, + "learning_rate": 4.846719937052666e-05, + "loss": 0.939, + "step": 2782 + }, + { + "epoch": 0.13880299251870323, + "grad_norm": 0.46463556664943045, + "learning_rate": 4.8465806735812665e-05, + "loss": 0.9237, + "step": 2783 + }, + { + "epoch": 0.13885286783042394, + "grad_norm": 0.7580521322152758, + "learning_rate": 4.8464413488768936e-05, + "loss": 1.002, + "step": 2784 + }, + { + "epoch": 0.13890274314214462, + "grad_norm": 0.51004831799601, + "learning_rate": 4.8463019629431835e-05, + "loss": 0.9482, + "step": 2785 + }, + { + "epoch": 0.13895261845386533, + "grad_norm": 0.9304243240368569, + "learning_rate": 4.846162515783775e-05, + "loss": 0.9331, + "step": 2786 + }, + { + "epoch": 0.13900249376558604, + "grad_norm": 0.5133294530156629, + "learning_rate": 4.846023007402305e-05, + "loss": 0.9469, + "step": 2787 + }, + { + "epoch": 0.13905236907730673, + "grad_norm": 0.5311116299663052, + "learning_rate": 4.845883437802414e-05, + "loss": 0.9364, + "step": 2788 + }, + { + "epoch": 0.13910224438902744, + "grad_norm": 0.5436508776631017, + "learning_rate": 4.845743806987745e-05, + "loss": 0.9783, + "step": 2789 + }, + { + "epoch": 0.13915211970074812, + "grad_norm": 0.4338743553771233, + "learning_rate": 4.8456041149619416e-05, + "loss": 0.9784, + "step": 2790 + }, + { + "epoch": 0.13920199501246883, + "grad_norm": 0.6146373294008188, + "learning_rate": 4.845464361728648e-05, + "loss": 0.9651, + "step": 2791 + }, + { + "epoch": 0.13925187032418954, + "grad_norm": 0.6104397453216924, + "learning_rate": 4.845324547291512e-05, + "loss": 0.9443, + "step": 2792 + }, + { + "epoch": 0.13930174563591022, + "grad_norm": 0.6637251415259088, + "learning_rate": 4.845184671654182e-05, + "loss": 0.9522, + "step": 2793 + }, + { + "epoch": 0.13935162094763093, + "grad_norm": 0.5665582760268185, + "learning_rate": 4.845044734820306e-05, + "loss": 0.9804, + "step": 2794 + }, + { + "epoch": 0.1394014962593516, + "grad_norm": 0.804965146441483, + "learning_rate": 4.844904736793538e-05, + "loss": 0.9569, + "step": 2795 + }, + { + "epoch": 0.13945137157107232, + "grad_norm": 0.571191583195471, + "learning_rate": 4.844764677577531e-05, + "loss": 0.9864, + "step": 2796 + }, + { + "epoch": 0.13950124688279303, + "grad_norm": 0.45183117823643365, + "learning_rate": 4.844624557175938e-05, + "loss": 0.9291, + "step": 2797 + }, + { + "epoch": 0.1395511221945137, + "grad_norm": 0.5577003203164205, + "learning_rate": 4.844484375592417e-05, + "loss": 0.9083, + "step": 2798 + }, + { + "epoch": 0.13960099750623442, + "grad_norm": 0.6666102422645944, + "learning_rate": 4.844344132830626e-05, + "loss": 0.9791, + "step": 2799 + }, + { + "epoch": 0.1396508728179551, + "grad_norm": 0.6237694740916077, + "learning_rate": 4.844203828894223e-05, + "loss": 0.9085, + "step": 2800 + }, + { + "epoch": 0.1397007481296758, + "grad_norm": 0.46766118332495565, + "learning_rate": 4.844063463786871e-05, + "loss": 0.9316, + "step": 2801 + }, + { + "epoch": 0.13975062344139652, + "grad_norm": 0.5100454818721553, + "learning_rate": 4.843923037512232e-05, + "loss": 0.9375, + "step": 2802 + }, + { + "epoch": 0.1398004987531172, + "grad_norm": 0.7288717168215191, + "learning_rate": 4.8437825500739696e-05, + "loss": 0.922, + "step": 2803 + }, + { + "epoch": 0.1398503740648379, + "grad_norm": 0.48430296762935116, + "learning_rate": 4.8436420014757514e-05, + "loss": 0.9426, + "step": 2804 + }, + { + "epoch": 0.1399002493765586, + "grad_norm": 0.6540535136653632, + "learning_rate": 4.8435013917212434e-05, + "loss": 0.9226, + "step": 2805 + }, + { + "epoch": 0.1399501246882793, + "grad_norm": 0.5285966102845481, + "learning_rate": 4.843360720814116e-05, + "loss": 0.9131, + "step": 2806 + }, + { + "epoch": 0.14, + "grad_norm": 0.5324017585913219, + "learning_rate": 4.8432199887580385e-05, + "loss": 0.9528, + "step": 2807 + }, + { + "epoch": 0.1400498753117207, + "grad_norm": 0.703222689338647, + "learning_rate": 4.843079195556684e-05, + "loss": 0.9627, + "step": 2808 + }, + { + "epoch": 0.1400997506234414, + "grad_norm": 0.6734085562514355, + "learning_rate": 4.842938341213727e-05, + "loss": 0.8748, + "step": 2809 + }, + { + "epoch": 0.1401496259351621, + "grad_norm": 0.5400078979422495, + "learning_rate": 4.842797425732843e-05, + "loss": 0.9451, + "step": 2810 + }, + { + "epoch": 0.1401995012468828, + "grad_norm": 0.823401936285285, + "learning_rate": 4.842656449117708e-05, + "loss": 0.9289, + "step": 2811 + }, + { + "epoch": 0.14024937655860348, + "grad_norm": 0.45729009511104934, + "learning_rate": 4.842515411372002e-05, + "loss": 0.9323, + "step": 2812 + }, + { + "epoch": 0.1402992518703242, + "grad_norm": 0.9799154797853348, + "learning_rate": 4.842374312499405e-05, + "loss": 0.9405, + "step": 2813 + }, + { + "epoch": 0.1403491271820449, + "grad_norm": 0.5560409164623067, + "learning_rate": 4.842233152503598e-05, + "loss": 0.999, + "step": 2814 + }, + { + "epoch": 0.14039900249376558, + "grad_norm": 0.7540773835071312, + "learning_rate": 4.842091931388265e-05, + "loss": 0.9456, + "step": 2815 + }, + { + "epoch": 0.1404488778054863, + "grad_norm": 0.8486108259689784, + "learning_rate": 4.841950649157091e-05, + "loss": 0.9302, + "step": 2816 + }, + { + "epoch": 0.14049875311720697, + "grad_norm": 0.5006150960048187, + "learning_rate": 4.8418093058137635e-05, + "loss": 0.9642, + "step": 2817 + }, + { + "epoch": 0.14054862842892768, + "grad_norm": 0.693518371986037, + "learning_rate": 4.84166790136197e-05, + "loss": 0.9411, + "step": 2818 + }, + { + "epoch": 0.1405985037406484, + "grad_norm": 0.4798950378834567, + "learning_rate": 4.8415264358054005e-05, + "loss": 0.9415, + "step": 2819 + }, + { + "epoch": 0.14064837905236907, + "grad_norm": 0.5659795194700037, + "learning_rate": 4.841384909147747e-05, + "loss": 0.973, + "step": 2820 + }, + { + "epoch": 0.14069825436408978, + "grad_norm": 0.509651251224239, + "learning_rate": 4.8412433213927015e-05, + "loss": 0.939, + "step": 2821 + }, + { + "epoch": 0.14074812967581046, + "grad_norm": 0.802512250044122, + "learning_rate": 4.84110167254396e-05, + "loss": 0.9727, + "step": 2822 + }, + { + "epoch": 0.14079800498753117, + "grad_norm": 0.5398416618664165, + "learning_rate": 4.840959962605218e-05, + "loss": 0.9926, + "step": 2823 + }, + { + "epoch": 0.14084788029925188, + "grad_norm": 0.5067569686086432, + "learning_rate": 4.840818191580173e-05, + "loss": 0.969, + "step": 2824 + }, + { + "epoch": 0.14089775561097256, + "grad_norm": 0.6435416229377735, + "learning_rate": 4.840676359472525e-05, + "loss": 0.9348, + "step": 2825 + }, + { + "epoch": 0.14094763092269327, + "grad_norm": 0.4916552554571038, + "learning_rate": 4.840534466285975e-05, + "loss": 0.9422, + "step": 2826 + }, + { + "epoch": 0.14099750623441396, + "grad_norm": 0.6101026481563577, + "learning_rate": 4.8403925120242264e-05, + "loss": 0.9776, + "step": 2827 + }, + { + "epoch": 0.14104738154613466, + "grad_norm": 0.46943763201661937, + "learning_rate": 4.8402504966909814e-05, + "loss": 0.9265, + "step": 2828 + }, + { + "epoch": 0.14109725685785537, + "grad_norm": 0.8094144278835527, + "learning_rate": 4.8401084202899474e-05, + "loss": 0.9599, + "step": 2829 + }, + { + "epoch": 0.14114713216957606, + "grad_norm": 0.8891178552614428, + "learning_rate": 4.839966282824832e-05, + "loss": 0.9521, + "step": 2830 + }, + { + "epoch": 0.14119700748129677, + "grad_norm": 0.5606760442955999, + "learning_rate": 4.8398240842993437e-05, + "loss": 0.962, + "step": 2831 + }, + { + "epoch": 0.14124688279301745, + "grad_norm": 0.9357825811249945, + "learning_rate": 4.8396818247171925e-05, + "loss": 0.9424, + "step": 2832 + }, + { + "epoch": 0.14129675810473816, + "grad_norm": 0.5179392869598999, + "learning_rate": 4.839539504082091e-05, + "loss": 0.9093, + "step": 2833 + }, + { + "epoch": 0.14134663341645887, + "grad_norm": 0.9377969728734213, + "learning_rate": 4.839397122397754e-05, + "loss": 0.9321, + "step": 2834 + }, + { + "epoch": 0.14139650872817955, + "grad_norm": 1.1499631577882288, + "learning_rate": 4.839254679667895e-05, + "loss": 0.9301, + "step": 2835 + }, + { + "epoch": 0.14144638403990026, + "grad_norm": 0.6099061973918265, + "learning_rate": 4.839112175896233e-05, + "loss": 0.9798, + "step": 2836 + }, + { + "epoch": 0.14149625935162094, + "grad_norm": 0.5128925545414293, + "learning_rate": 4.838969611086486e-05, + "loss": 0.9291, + "step": 2837 + }, + { + "epoch": 0.14154613466334165, + "grad_norm": 0.47157525140957546, + "learning_rate": 4.838826985242373e-05, + "loss": 0.9575, + "step": 2838 + }, + { + "epoch": 0.14159600997506233, + "grad_norm": 0.5084074085902975, + "learning_rate": 4.8386842983676164e-05, + "loss": 0.9664, + "step": 2839 + }, + { + "epoch": 0.14164588528678304, + "grad_norm": 0.5107800028985001, + "learning_rate": 4.83854155046594e-05, + "loss": 0.9076, + "step": 2840 + }, + { + "epoch": 0.14169576059850375, + "grad_norm": 0.9871909617173171, + "learning_rate": 4.838398741541068e-05, + "loss": 0.952, + "step": 2841 + }, + { + "epoch": 0.14174563591022443, + "grad_norm": 0.48019598855131285, + "learning_rate": 4.8382558715967275e-05, + "loss": 0.9777, + "step": 2842 + }, + { + "epoch": 0.14179551122194514, + "grad_norm": 0.49355871842528276, + "learning_rate": 4.838112940636647e-05, + "loss": 0.9624, + "step": 2843 + }, + { + "epoch": 0.14184538653366582, + "grad_norm": 0.4597273962038915, + "learning_rate": 4.837969948664556e-05, + "loss": 0.9788, + "step": 2844 + }, + { + "epoch": 0.14189526184538653, + "grad_norm": 0.5388413781036112, + "learning_rate": 4.837826895684184e-05, + "loss": 0.9368, + "step": 2845 + }, + { + "epoch": 0.14194513715710724, + "grad_norm": 0.5077833968402461, + "learning_rate": 4.837683781699267e-05, + "loss": 0.9329, + "step": 2846 + }, + { + "epoch": 0.14199501246882792, + "grad_norm": 0.5469050268054341, + "learning_rate": 4.837540606713538e-05, + "loss": 0.9294, + "step": 2847 + }, + { + "epoch": 0.14204488778054863, + "grad_norm": 0.49003236874560935, + "learning_rate": 4.837397370730732e-05, + "loss": 0.9601, + "step": 2848 + }, + { + "epoch": 0.14209476309226932, + "grad_norm": 0.45132619197095547, + "learning_rate": 4.837254073754588e-05, + "loss": 0.9604, + "step": 2849 + }, + { + "epoch": 0.14214463840399003, + "grad_norm": 0.4726711576526331, + "learning_rate": 4.8371107157888453e-05, + "loss": 0.9561, + "step": 2850 + }, + { + "epoch": 0.14219451371571074, + "grad_norm": 0.5534723561770074, + "learning_rate": 4.836967296837244e-05, + "loss": 0.9203, + "step": 2851 + }, + { + "epoch": 0.14224438902743142, + "grad_norm": 0.4444200341611965, + "learning_rate": 4.836823816903527e-05, + "loss": 0.9193, + "step": 2852 + }, + { + "epoch": 0.14229426433915213, + "grad_norm": 0.47840487143573457, + "learning_rate": 4.836680275991438e-05, + "loss": 0.8913, + "step": 2853 + }, + { + "epoch": 0.1423441396508728, + "grad_norm": 0.6099831860534698, + "learning_rate": 4.8365366741047235e-05, + "loss": 0.9647, + "step": 2854 + }, + { + "epoch": 0.14239401496259352, + "grad_norm": 0.6053650716022659, + "learning_rate": 4.83639301124713e-05, + "loss": 0.9553, + "step": 2855 + }, + { + "epoch": 0.14244389027431423, + "grad_norm": 1.2841285525080945, + "learning_rate": 4.836249287422406e-05, + "loss": 0.952, + "step": 2856 + }, + { + "epoch": 0.1424937655860349, + "grad_norm": 0.6779031318459267, + "learning_rate": 4.836105502634304e-05, + "loss": 0.9551, + "step": 2857 + }, + { + "epoch": 0.14254364089775562, + "grad_norm": 0.5280054104325638, + "learning_rate": 4.835961656886573e-05, + "loss": 0.9547, + "step": 2858 + }, + { + "epoch": 0.1425935162094763, + "grad_norm": 0.4641539645964674, + "learning_rate": 4.835817750182968e-05, + "loss": 0.9334, + "step": 2859 + }, + { + "epoch": 0.142643391521197, + "grad_norm": 0.47127260752071476, + "learning_rate": 4.835673782527245e-05, + "loss": 0.9431, + "step": 2860 + }, + { + "epoch": 0.1426932668329177, + "grad_norm": 0.49290688379531317, + "learning_rate": 4.8355297539231596e-05, + "loss": 0.9492, + "step": 2861 + }, + { + "epoch": 0.1427431421446384, + "grad_norm": 0.5803210210057487, + "learning_rate": 4.8353856643744705e-05, + "loss": 0.9433, + "step": 2862 + }, + { + "epoch": 0.1427930174563591, + "grad_norm": 0.4844345473861679, + "learning_rate": 4.835241513884938e-05, + "loss": 0.9085, + "step": 2863 + }, + { + "epoch": 0.1428428927680798, + "grad_norm": 0.7841712315340228, + "learning_rate": 4.8350973024583225e-05, + "loss": 0.9458, + "step": 2864 + }, + { + "epoch": 0.1428927680798005, + "grad_norm": 0.978350143154205, + "learning_rate": 4.8349530300983885e-05, + "loss": 0.9417, + "step": 2865 + }, + { + "epoch": 0.14294264339152118, + "grad_norm": 0.5337750484254812, + "learning_rate": 4.8348086968089e-05, + "loss": 0.9337, + "step": 2866 + }, + { + "epoch": 0.1429925187032419, + "grad_norm": 1.0226422074833124, + "learning_rate": 4.834664302593624e-05, + "loss": 0.9718, + "step": 2867 + }, + { + "epoch": 0.1430423940149626, + "grad_norm": 2.805671824635171, + "learning_rate": 4.834519847456327e-05, + "loss": 0.9516, + "step": 2868 + }, + { + "epoch": 0.14309226932668329, + "grad_norm": 0.5374591079210695, + "learning_rate": 4.83437533140078e-05, + "loss": 0.9613, + "step": 2869 + }, + { + "epoch": 0.143142144638404, + "grad_norm": 0.5799058351747999, + "learning_rate": 4.834230754430753e-05, + "loss": 1.0142, + "step": 2870 + }, + { + "epoch": 0.14319201995012468, + "grad_norm": 0.6119390574479838, + "learning_rate": 4.834086116550019e-05, + "loss": 0.9514, + "step": 2871 + }, + { + "epoch": 0.1432418952618454, + "grad_norm": 0.4666628855521122, + "learning_rate": 4.833941417762353e-05, + "loss": 0.9147, + "step": 2872 + }, + { + "epoch": 0.1432917705735661, + "grad_norm": 0.5404457624681877, + "learning_rate": 4.83379665807153e-05, + "loss": 0.9446, + "step": 2873 + }, + { + "epoch": 0.14334164588528678, + "grad_norm": 0.561402863354051, + "learning_rate": 4.833651837481328e-05, + "loss": 0.9329, + "step": 2874 + }, + { + "epoch": 0.1433915211970075, + "grad_norm": 0.47614244654159465, + "learning_rate": 4.833506955995526e-05, + "loss": 0.9045, + "step": 2875 + }, + { + "epoch": 0.14344139650872817, + "grad_norm": 0.6226523109687307, + "learning_rate": 4.8333620136179034e-05, + "loss": 0.9731, + "step": 2876 + }, + { + "epoch": 0.14349127182044888, + "grad_norm": 0.617991794378198, + "learning_rate": 4.833217010352243e-05, + "loss": 1.0137, + "step": 2877 + }, + { + "epoch": 0.1435411471321696, + "grad_norm": 0.5879221574039614, + "learning_rate": 4.83307194620233e-05, + "loss": 0.9716, + "step": 2878 + }, + { + "epoch": 0.14359102244389027, + "grad_norm": 0.8733454203776156, + "learning_rate": 4.832926821171949e-05, + "loss": 0.9306, + "step": 2879 + }, + { + "epoch": 0.14364089775561098, + "grad_norm": 0.6273129782572292, + "learning_rate": 4.832781635264886e-05, + "loss": 0.8823, + "step": 2880 + }, + { + "epoch": 0.14369077306733166, + "grad_norm": 0.8858014784203883, + "learning_rate": 4.83263638848493e-05, + "loss": 0.9579, + "step": 2881 + }, + { + "epoch": 0.14374064837905237, + "grad_norm": 0.5137091235489465, + "learning_rate": 4.832491080835871e-05, + "loss": 0.9279, + "step": 2882 + }, + { + "epoch": 0.14379052369077308, + "grad_norm": 0.7335890235278312, + "learning_rate": 4.832345712321502e-05, + "loss": 0.9536, + "step": 2883 + }, + { + "epoch": 0.14384039900249376, + "grad_norm": 0.6326194007566512, + "learning_rate": 4.832200282945616e-05, + "loss": 1.0118, + "step": 2884 + }, + { + "epoch": 0.14389027431421447, + "grad_norm": 0.5823750209148326, + "learning_rate": 4.832054792712006e-05, + "loss": 0.976, + "step": 2885 + }, + { + "epoch": 0.14394014962593515, + "grad_norm": 0.5073860961184392, + "learning_rate": 4.8319092416244704e-05, + "loss": 0.9428, + "step": 2886 + }, + { + "epoch": 0.14399002493765586, + "grad_norm": 0.5282249734872966, + "learning_rate": 4.831763629686807e-05, + "loss": 0.9648, + "step": 2887 + }, + { + "epoch": 0.14403990024937655, + "grad_norm": 0.645543302560389, + "learning_rate": 4.8316179569028154e-05, + "loss": 0.9588, + "step": 2888 + }, + { + "epoch": 0.14408977556109726, + "grad_norm": 0.5252299409080681, + "learning_rate": 4.831472223276296e-05, + "loss": 0.9789, + "step": 2889 + }, + { + "epoch": 0.14413965087281796, + "grad_norm": 0.5339145610782133, + "learning_rate": 4.8313264288110524e-05, + "loss": 0.982, + "step": 2890 + }, + { + "epoch": 0.14418952618453865, + "grad_norm": 0.5635151469038945, + "learning_rate": 4.8311805735108894e-05, + "loss": 0.9738, + "step": 2891 + }, + { + "epoch": 0.14423940149625936, + "grad_norm": 0.48304037942046524, + "learning_rate": 4.831034657379613e-05, + "loss": 0.9182, + "step": 2892 + }, + { + "epoch": 0.14428927680798004, + "grad_norm": 0.925143219597791, + "learning_rate": 4.83088868042103e-05, + "loss": 0.9971, + "step": 2893 + }, + { + "epoch": 0.14433915211970075, + "grad_norm": 0.5056012512949535, + "learning_rate": 4.83074264263895e-05, + "loss": 0.9543, + "step": 2894 + }, + { + "epoch": 0.14438902743142146, + "grad_norm": 0.5710963829529715, + "learning_rate": 4.8305965440371845e-05, + "loss": 0.9365, + "step": 2895 + }, + { + "epoch": 0.14443890274314214, + "grad_norm": 0.4834201053539779, + "learning_rate": 4.830450384619544e-05, + "loss": 0.9416, + "step": 2896 + }, + { + "epoch": 0.14448877805486285, + "grad_norm": 0.6528838142070443, + "learning_rate": 4.830304164389844e-05, + "loss": 0.9776, + "step": 2897 + }, + { + "epoch": 0.14453865336658353, + "grad_norm": 0.485751363051031, + "learning_rate": 4.830157883351901e-05, + "loss": 0.9549, + "step": 2898 + }, + { + "epoch": 0.14458852867830424, + "grad_norm": 0.45365574946043313, + "learning_rate": 4.8300115415095295e-05, + "loss": 0.9508, + "step": 2899 + }, + { + "epoch": 0.14463840399002495, + "grad_norm": 0.660936241877917, + "learning_rate": 4.82986513886655e-05, + "loss": 0.9342, + "step": 2900 + }, + { + "epoch": 0.14468827930174563, + "grad_norm": 0.789978674190257, + "learning_rate": 4.829718675426783e-05, + "loss": 0.9206, + "step": 2901 + }, + { + "epoch": 0.14473815461346634, + "grad_norm": 0.5766043716319145, + "learning_rate": 4.829572151194049e-05, + "loss": 0.9354, + "step": 2902 + }, + { + "epoch": 0.14478802992518702, + "grad_norm": 0.5854276260625715, + "learning_rate": 4.829425566172172e-05, + "loss": 0.9808, + "step": 2903 + }, + { + "epoch": 0.14483790523690773, + "grad_norm": 1.0446990777960823, + "learning_rate": 4.829278920364978e-05, + "loss": 0.9208, + "step": 2904 + }, + { + "epoch": 0.14488778054862844, + "grad_norm": 2.587029689944655, + "learning_rate": 4.829132213776293e-05, + "loss": 0.9457, + "step": 2905 + }, + { + "epoch": 0.14493765586034912, + "grad_norm": 0.4857529513623996, + "learning_rate": 4.828985446409946e-05, + "loss": 0.9505, + "step": 2906 + }, + { + "epoch": 0.14498753117206983, + "grad_norm": 0.6219118951713412, + "learning_rate": 4.828838618269765e-05, + "loss": 0.9438, + "step": 2907 + }, + { + "epoch": 0.14503740648379052, + "grad_norm": 0.5574820692712968, + "learning_rate": 4.828691729359583e-05, + "loss": 0.965, + "step": 2908 + }, + { + "epoch": 0.14508728179551122, + "grad_norm": 0.5313667510175313, + "learning_rate": 4.8285447796832325e-05, + "loss": 0.945, + "step": 2909 + }, + { + "epoch": 0.1451371571072319, + "grad_norm": 0.5521879063413457, + "learning_rate": 4.8283977692445475e-05, + "loss": 0.9669, + "step": 2910 + }, + { + "epoch": 0.14518703241895262, + "grad_norm": 0.4936021318123712, + "learning_rate": 4.828250698047365e-05, + "loss": 0.9639, + "step": 2911 + }, + { + "epoch": 0.14523690773067333, + "grad_norm": 0.5734585007969986, + "learning_rate": 4.8281035660955235e-05, + "loss": 1.0034, + "step": 2912 + }, + { + "epoch": 0.145286783042394, + "grad_norm": 0.6064217859662142, + "learning_rate": 4.82795637339286e-05, + "loss": 0.995, + "step": 2913 + }, + { + "epoch": 0.14533665835411472, + "grad_norm": 0.6161646094046405, + "learning_rate": 4.8278091199432176e-05, + "loss": 0.9442, + "step": 2914 + }, + { + "epoch": 0.1453865336658354, + "grad_norm": 0.5144103886852833, + "learning_rate": 4.8276618057504376e-05, + "loss": 0.9423, + "step": 2915 + }, + { + "epoch": 0.1454364089775561, + "grad_norm": 0.5216048498048045, + "learning_rate": 4.827514430818365e-05, + "loss": 0.9052, + "step": 2916 + }, + { + "epoch": 0.14548628428927682, + "grad_norm": 0.5071355567286879, + "learning_rate": 4.827366995150845e-05, + "loss": 0.9032, + "step": 2917 + }, + { + "epoch": 0.1455361596009975, + "grad_norm": 0.5349961368346563, + "learning_rate": 4.8272194987517236e-05, + "loss": 0.926, + "step": 2918 + }, + { + "epoch": 0.1455860349127182, + "grad_norm": 0.5724459177406938, + "learning_rate": 4.827071941624852e-05, + "loss": 0.9194, + "step": 2919 + }, + { + "epoch": 0.1456359102244389, + "grad_norm": 0.432133229581853, + "learning_rate": 4.8269243237740796e-05, + "loss": 0.9156, + "step": 2920 + }, + { + "epoch": 0.1456857855361596, + "grad_norm": 4.092098201403542, + "learning_rate": 4.826776645203258e-05, + "loss": 0.9547, + "step": 2921 + }, + { + "epoch": 0.1457356608478803, + "grad_norm": 0.4900156826980323, + "learning_rate": 4.826628905916242e-05, + "loss": 0.9289, + "step": 2922 + }, + { + "epoch": 0.145785536159601, + "grad_norm": 0.5731158491785697, + "learning_rate": 4.8264811059168845e-05, + "loss": 0.9331, + "step": 2923 + }, + { + "epoch": 0.1458354114713217, + "grad_norm": 0.5177749434040875, + "learning_rate": 4.826333245209045e-05, + "loss": 0.9324, + "step": 2924 + }, + { + "epoch": 0.14588528678304238, + "grad_norm": 0.4690980473016511, + "learning_rate": 4.8261853237965806e-05, + "loss": 0.9725, + "step": 2925 + }, + { + "epoch": 0.1459351620947631, + "grad_norm": 0.6277150939091708, + "learning_rate": 4.82603734168335e-05, + "loss": 0.8678, + "step": 2926 + }, + { + "epoch": 0.1459850374064838, + "grad_norm": 0.5656359385477178, + "learning_rate": 4.825889298873217e-05, + "loss": 0.9514, + "step": 2927 + }, + { + "epoch": 0.14603491271820448, + "grad_norm": 0.5216558682305856, + "learning_rate": 4.8257411953700436e-05, + "loss": 0.9, + "step": 2928 + }, + { + "epoch": 0.1460847880299252, + "grad_norm": 0.5106183649894291, + "learning_rate": 4.825593031177694e-05, + "loss": 0.9652, + "step": 2929 + }, + { + "epoch": 0.14613466334164588, + "grad_norm": 0.4140013377239435, + "learning_rate": 4.825444806300036e-05, + "loss": 0.9023, + "step": 2930 + }, + { + "epoch": 0.14618453865336659, + "grad_norm": 0.4347300272806549, + "learning_rate": 4.825296520740936e-05, + "loss": 0.9783, + "step": 2931 + }, + { + "epoch": 0.1462344139650873, + "grad_norm": 0.4555097861418168, + "learning_rate": 4.825148174504264e-05, + "loss": 0.8981, + "step": 2932 + }, + { + "epoch": 0.14628428927680798, + "grad_norm": 0.5640407781485178, + "learning_rate": 4.8249997675938904e-05, + "loss": 0.9373, + "step": 2933 + }, + { + "epoch": 0.1463341645885287, + "grad_norm": 0.4824834006590057, + "learning_rate": 4.824851300013689e-05, + "loss": 0.9203, + "step": 2934 + }, + { + "epoch": 0.14638403990024937, + "grad_norm": 0.48529779591103894, + "learning_rate": 4.824702771767533e-05, + "loss": 0.9179, + "step": 2935 + }, + { + "epoch": 0.14643391521197008, + "grad_norm": 0.4912395590864068, + "learning_rate": 4.824554182859299e-05, + "loss": 0.8937, + "step": 2936 + }, + { + "epoch": 0.14648379052369076, + "grad_norm": 0.43644748709688036, + "learning_rate": 4.824405533292864e-05, + "loss": 0.932, + "step": 2937 + }, + { + "epoch": 0.14653366583541147, + "grad_norm": 0.5764853729886575, + "learning_rate": 4.8242568230721065e-05, + "loss": 0.9928, + "step": 2938 + }, + { + "epoch": 0.14658354114713218, + "grad_norm": 0.4969649871766227, + "learning_rate": 4.824108052200907e-05, + "loss": 1.0067, + "step": 2939 + }, + { + "epoch": 0.14663341645885286, + "grad_norm": 0.49246548159596704, + "learning_rate": 4.823959220683149e-05, + "loss": 1.0072, + "step": 2940 + }, + { + "epoch": 0.14668329177057357, + "grad_norm": 0.5167036900527922, + "learning_rate": 4.823810328522715e-05, + "loss": 0.9721, + "step": 2941 + }, + { + "epoch": 0.14673316708229425, + "grad_norm": 0.7776042504260714, + "learning_rate": 4.8236613757234906e-05, + "loss": 0.9495, + "step": 2942 + }, + { + "epoch": 0.14678304239401496, + "grad_norm": 0.47537563349735434, + "learning_rate": 4.823512362289362e-05, + "loss": 0.9303, + "step": 2943 + }, + { + "epoch": 0.14683291770573567, + "grad_norm": 0.43690530797850186, + "learning_rate": 4.823363288224219e-05, + "loss": 0.9487, + "step": 2944 + }, + { + "epoch": 0.14688279301745635, + "grad_norm": 0.5945829464808444, + "learning_rate": 4.82321415353195e-05, + "loss": 0.9161, + "step": 2945 + }, + { + "epoch": 0.14693266832917706, + "grad_norm": 0.41621316432301814, + "learning_rate": 4.823064958216446e-05, + "loss": 0.907, + "step": 2946 + }, + { + "epoch": 0.14698254364089774, + "grad_norm": 0.513558561955935, + "learning_rate": 4.822915702281604e-05, + "loss": 0.971, + "step": 2947 + }, + { + "epoch": 0.14703241895261845, + "grad_norm": 0.47608247509412926, + "learning_rate": 4.8227663857313154e-05, + "loss": 0.9364, + "step": 2948 + }, + { + "epoch": 0.14708229426433916, + "grad_norm": 0.3993879217096743, + "learning_rate": 4.822617008569478e-05, + "loss": 0.9454, + "step": 2949 + }, + { + "epoch": 0.14713216957605985, + "grad_norm": 0.45095358808089403, + "learning_rate": 4.8224675707999886e-05, + "loss": 0.9578, + "step": 2950 + }, + { + "epoch": 0.14718204488778056, + "grad_norm": 0.8643115588553869, + "learning_rate": 4.822318072426748e-05, + "loss": 0.9481, + "step": 2951 + }, + { + "epoch": 0.14723192019950124, + "grad_norm": 0.7056821599687602, + "learning_rate": 4.822168513453656e-05, + "loss": 0.9427, + "step": 2952 + }, + { + "epoch": 0.14728179551122195, + "grad_norm": 0.5066300199228688, + "learning_rate": 4.822018893884616e-05, + "loss": 0.9338, + "step": 2953 + }, + { + "epoch": 0.14733167082294266, + "grad_norm": 0.4785453847647429, + "learning_rate": 4.821869213723532e-05, + "loss": 0.9332, + "step": 2954 + }, + { + "epoch": 0.14738154613466334, + "grad_norm": 0.44031552470020624, + "learning_rate": 4.821719472974311e-05, + "loss": 0.9564, + "step": 2955 + }, + { + "epoch": 0.14743142144638405, + "grad_norm": 0.4374586147378847, + "learning_rate": 4.8215696716408583e-05, + "loss": 0.9482, + "step": 2956 + }, + { + "epoch": 0.14748129675810473, + "grad_norm": 0.5416591088771799, + "learning_rate": 4.821419809727085e-05, + "loss": 0.9642, + "step": 2957 + }, + { + "epoch": 0.14753117206982544, + "grad_norm": 0.4071861482946785, + "learning_rate": 4.821269887236899e-05, + "loss": 0.9391, + "step": 2958 + }, + { + "epoch": 0.14758104738154612, + "grad_norm": 0.4891242006976979, + "learning_rate": 4.8211199041742164e-05, + "loss": 0.9803, + "step": 2959 + }, + { + "epoch": 0.14763092269326683, + "grad_norm": 0.46051882697605173, + "learning_rate": 4.820969860542947e-05, + "loss": 0.9035, + "step": 2960 + }, + { + "epoch": 0.14768079800498754, + "grad_norm": 0.4964279433490147, + "learning_rate": 4.820819756347008e-05, + "loss": 0.994, + "step": 2961 + }, + { + "epoch": 0.14773067331670822, + "grad_norm": 0.5006905943777975, + "learning_rate": 4.8206695915903163e-05, + "loss": 0.9487, + "step": 2962 + }, + { + "epoch": 0.14778054862842893, + "grad_norm": 0.5159600242863935, + "learning_rate": 4.8205193662767914e-05, + "loss": 0.9558, + "step": 2963 + }, + { + "epoch": 0.1478304239401496, + "grad_norm": 0.5114387955532362, + "learning_rate": 4.820369080410351e-05, + "loss": 0.9424, + "step": 2964 + }, + { + "epoch": 0.14788029925187032, + "grad_norm": 0.7802412616818651, + "learning_rate": 4.820218733994918e-05, + "loss": 0.9121, + "step": 2965 + }, + { + "epoch": 0.14793017456359103, + "grad_norm": 1.5383106978218155, + "learning_rate": 4.820068327034415e-05, + "loss": 0.9174, + "step": 2966 + }, + { + "epoch": 0.14798004987531171, + "grad_norm": 1.1040084399730277, + "learning_rate": 4.8199178595327685e-05, + "loss": 0.8927, + "step": 2967 + }, + { + "epoch": 0.14802992518703242, + "grad_norm": 0.6493763202124953, + "learning_rate": 4.819767331493903e-05, + "loss": 0.9363, + "step": 2968 + }, + { + "epoch": 0.1480798004987531, + "grad_norm": 0.4561128277518504, + "learning_rate": 4.8196167429217474e-05, + "loss": 0.926, + "step": 2969 + }, + { + "epoch": 0.14812967581047382, + "grad_norm": 0.5856195801799745, + "learning_rate": 4.819466093820231e-05, + "loss": 0.9744, + "step": 2970 + }, + { + "epoch": 0.14817955112219452, + "grad_norm": 0.7478348399804692, + "learning_rate": 4.819315384193285e-05, + "loss": 1.0214, + "step": 2971 + }, + { + "epoch": 0.1482294264339152, + "grad_norm": 0.4534659434873685, + "learning_rate": 4.8191646140448424e-05, + "loss": 0.9475, + "step": 2972 + }, + { + "epoch": 0.14827930174563592, + "grad_norm": 0.4625832792657016, + "learning_rate": 4.819013783378836e-05, + "loss": 0.9823, + "step": 2973 + }, + { + "epoch": 0.1483291770573566, + "grad_norm": 0.4960693100409521, + "learning_rate": 4.8188628921992034e-05, + "loss": 0.9267, + "step": 2974 + }, + { + "epoch": 0.1483790523690773, + "grad_norm": 0.5170908436671978, + "learning_rate": 4.818711940509881e-05, + "loss": 0.9057, + "step": 2975 + }, + { + "epoch": 0.14842892768079802, + "grad_norm": 0.5051506148936301, + "learning_rate": 4.818560928314809e-05, + "loss": 0.993, + "step": 2976 + }, + { + "epoch": 0.1484788029925187, + "grad_norm": 0.4973808797468511, + "learning_rate": 4.818409855617927e-05, + "loss": 0.9418, + "step": 2977 + }, + { + "epoch": 0.1485286783042394, + "grad_norm": 0.8209327048203017, + "learning_rate": 4.818258722423178e-05, + "loss": 0.9638, + "step": 2978 + }, + { + "epoch": 0.1485785536159601, + "grad_norm": 0.5965184224888107, + "learning_rate": 4.818107528734504e-05, + "loss": 0.9414, + "step": 2979 + }, + { + "epoch": 0.1486284289276808, + "grad_norm": 0.4790951716023054, + "learning_rate": 4.817956274555851e-05, + "loss": 0.9365, + "step": 2980 + }, + { + "epoch": 0.1486783042394015, + "grad_norm": 0.5314924999658773, + "learning_rate": 4.817804959891168e-05, + "loss": 0.9107, + "step": 2981 + }, + { + "epoch": 0.1487281795511222, + "grad_norm": 0.7484648611727636, + "learning_rate": 4.817653584744401e-05, + "loss": 0.9338, + "step": 2982 + }, + { + "epoch": 0.1487780548628429, + "grad_norm": 1.5655216520315818, + "learning_rate": 4.817502149119502e-05, + "loss": 0.9545, + "step": 2983 + }, + { + "epoch": 0.14882793017456358, + "grad_norm": 0.5010333630854695, + "learning_rate": 4.81735065302042e-05, + "loss": 0.9687, + "step": 2984 + }, + { + "epoch": 0.1488778054862843, + "grad_norm": 0.7743284675147001, + "learning_rate": 4.817199096451111e-05, + "loss": 0.9218, + "step": 2985 + }, + { + "epoch": 0.14892768079800497, + "grad_norm": 0.46068513139162653, + "learning_rate": 4.817047479415529e-05, + "loss": 0.941, + "step": 2986 + }, + { + "epoch": 0.14897755610972568, + "grad_norm": 0.5981715686998272, + "learning_rate": 4.816895801917629e-05, + "loss": 0.9607, + "step": 2987 + }, + { + "epoch": 0.1490274314214464, + "grad_norm": 0.6099587896191018, + "learning_rate": 4.81674406396137e-05, + "loss": 0.9394, + "step": 2988 + }, + { + "epoch": 0.14907730673316708, + "grad_norm": 0.49365479709696974, + "learning_rate": 4.8165922655507115e-05, + "loss": 0.9476, + "step": 2989 + }, + { + "epoch": 0.14912718204488778, + "grad_norm": 0.539187300637791, + "learning_rate": 4.816440406689615e-05, + "loss": 0.9051, + "step": 2990 + }, + { + "epoch": 0.14917705735660847, + "grad_norm": 0.48088260651272047, + "learning_rate": 4.816288487382043e-05, + "loss": 0.9737, + "step": 2991 + }, + { + "epoch": 0.14922693266832918, + "grad_norm": 0.4939796302635002, + "learning_rate": 4.8161365076319587e-05, + "loss": 0.9528, + "step": 2992 + }, + { + "epoch": 0.14927680798004989, + "grad_norm": 0.5299991699552986, + "learning_rate": 4.815984467443329e-05, + "loss": 0.9519, + "step": 2993 + }, + { + "epoch": 0.14932668329177057, + "grad_norm": 0.6022897107711248, + "learning_rate": 4.815832366820121e-05, + "loss": 0.9124, + "step": 2994 + }, + { + "epoch": 0.14937655860349128, + "grad_norm": 0.6064067866519235, + "learning_rate": 4.815680205766304e-05, + "loss": 0.9752, + "step": 2995 + }, + { + "epoch": 0.14942643391521196, + "grad_norm": 0.43571089622147113, + "learning_rate": 4.815527984285849e-05, + "loss": 0.965, + "step": 2996 + }, + { + "epoch": 0.14947630922693267, + "grad_norm": 0.517778673836042, + "learning_rate": 4.8153757023827276e-05, + "loss": 0.9233, + "step": 2997 + }, + { + "epoch": 0.14952618453865338, + "grad_norm": 0.4747081708012721, + "learning_rate": 4.815223360060913e-05, + "loss": 0.9559, + "step": 2998 + }, + { + "epoch": 0.14957605985037406, + "grad_norm": 0.5516579857527812, + "learning_rate": 4.81507095732438e-05, + "loss": 0.9349, + "step": 2999 + }, + { + "epoch": 0.14962593516209477, + "grad_norm": 0.48833256216495535, + "learning_rate": 4.8149184941771074e-05, + "loss": 0.9133, + "step": 3000 + }, + { + "epoch": 0.14967581047381545, + "grad_norm": 0.4705557301743397, + "learning_rate": 4.814765970623073e-05, + "loss": 0.9581, + "step": 3001 + }, + { + "epoch": 0.14972568578553616, + "grad_norm": 0.45780585499850945, + "learning_rate": 4.814613386666256e-05, + "loss": 0.9276, + "step": 3002 + }, + { + "epoch": 0.14977556109725687, + "grad_norm": 0.47804195413202966, + "learning_rate": 4.8144607423106393e-05, + "loss": 0.8989, + "step": 3003 + }, + { + "epoch": 0.14982543640897755, + "grad_norm": 0.5651118687029251, + "learning_rate": 4.8143080375602044e-05, + "loss": 0.9799, + "step": 3004 + }, + { + "epoch": 0.14987531172069826, + "grad_norm": 0.5271941211601816, + "learning_rate": 4.8141552724189374e-05, + "loss": 0.9523, + "step": 3005 + }, + { + "epoch": 0.14992518703241894, + "grad_norm": 0.49728207472679214, + "learning_rate": 4.814002446890825e-05, + "loss": 0.9257, + "step": 3006 + }, + { + "epoch": 0.14997506234413965, + "grad_norm": 0.715898985012498, + "learning_rate": 4.813849560979854e-05, + "loss": 0.9107, + "step": 3007 + }, + { + "epoch": 0.15002493765586034, + "grad_norm": 1.6792451050912018, + "learning_rate": 4.8136966146900135e-05, + "loss": 0.9424, + "step": 3008 + }, + { + "epoch": 0.15007481296758104, + "grad_norm": 0.6165632862785049, + "learning_rate": 4.813543608025296e-05, + "loss": 0.9614, + "step": 3009 + }, + { + "epoch": 0.15012468827930175, + "grad_norm": 0.41483884677241634, + "learning_rate": 4.8133905409896926e-05, + "loss": 0.9385, + "step": 3010 + }, + { + "epoch": 0.15017456359102244, + "grad_norm": 0.7118595158297831, + "learning_rate": 4.8132374135871995e-05, + "loss": 0.9314, + "step": 3011 + }, + { + "epoch": 0.15022443890274315, + "grad_norm": 0.5275608108405051, + "learning_rate": 4.813084225821811e-05, + "loss": 0.9528, + "step": 3012 + }, + { + "epoch": 0.15027431421446383, + "grad_norm": 0.46022723640521923, + "learning_rate": 4.812930977697524e-05, + "loss": 0.9077, + "step": 3013 + }, + { + "epoch": 0.15032418952618454, + "grad_norm": 0.5101496036654933, + "learning_rate": 4.812777669218339e-05, + "loss": 0.9754, + "step": 3014 + }, + { + "epoch": 0.15037406483790525, + "grad_norm": 0.6393497527156263, + "learning_rate": 4.8126243003882566e-05, + "loss": 0.9507, + "step": 3015 + }, + { + "epoch": 0.15042394014962593, + "grad_norm": 0.6504204932768507, + "learning_rate": 4.812470871211277e-05, + "loss": 0.9892, + "step": 3016 + }, + { + "epoch": 0.15047381546134664, + "grad_norm": 0.6028389502068138, + "learning_rate": 4.812317381691405e-05, + "loss": 0.9544, + "step": 3017 + }, + { + "epoch": 0.15052369077306732, + "grad_norm": 0.49198959619193683, + "learning_rate": 4.8121638318326455e-05, + "loss": 0.9225, + "step": 3018 + }, + { + "epoch": 0.15057356608478803, + "grad_norm": 0.5125393354420196, + "learning_rate": 4.812010221639006e-05, + "loss": 0.9783, + "step": 3019 + }, + { + "epoch": 0.15062344139650874, + "grad_norm": 0.7553034099757004, + "learning_rate": 4.8118565511144954e-05, + "loss": 0.953, + "step": 3020 + }, + { + "epoch": 0.15067331670822942, + "grad_norm": 0.5606196230081727, + "learning_rate": 4.8117028202631216e-05, + "loss": 0.9271, + "step": 3021 + }, + { + "epoch": 0.15072319201995013, + "grad_norm": 0.4254614357474461, + "learning_rate": 4.811549029088897e-05, + "loss": 0.955, + "step": 3022 + }, + { + "epoch": 0.1507730673316708, + "grad_norm": 0.5568909301643777, + "learning_rate": 4.811395177595836e-05, + "loss": 0.9691, + "step": 3023 + }, + { + "epoch": 0.15082294264339152, + "grad_norm": 0.5103284497774618, + "learning_rate": 4.8112412657879516e-05, + "loss": 0.9201, + "step": 3024 + }, + { + "epoch": 0.15087281795511223, + "grad_norm": 3.287527182904549, + "learning_rate": 4.811087293669261e-05, + "loss": 0.9467, + "step": 3025 + }, + { + "epoch": 0.1509226932668329, + "grad_norm": 0.549192710317879, + "learning_rate": 4.810933261243782e-05, + "loss": 0.9552, + "step": 3026 + }, + { + "epoch": 0.15097256857855362, + "grad_norm": 0.5076163944820984, + "learning_rate": 4.810779168515533e-05, + "loss": 0.9037, + "step": 3027 + }, + { + "epoch": 0.1510224438902743, + "grad_norm": 0.49805450608169627, + "learning_rate": 4.810625015488537e-05, + "loss": 0.939, + "step": 3028 + }, + { + "epoch": 0.15107231920199501, + "grad_norm": 0.46176526232632864, + "learning_rate": 4.810470802166814e-05, + "loss": 0.9376, + "step": 3029 + }, + { + "epoch": 0.15112219451371572, + "grad_norm": 0.5806094359236852, + "learning_rate": 4.810316528554391e-05, + "loss": 0.8901, + "step": 3030 + }, + { + "epoch": 0.1511720698254364, + "grad_norm": 0.5223042016587897, + "learning_rate": 4.810162194655291e-05, + "loss": 0.9162, + "step": 3031 + }, + { + "epoch": 0.15122194513715712, + "grad_norm": 0.5450968959865025, + "learning_rate": 4.8100078004735426e-05, + "loss": 0.998, + "step": 3032 + }, + { + "epoch": 0.1512718204488778, + "grad_norm": 0.5025423541643407, + "learning_rate": 4.809853346013175e-05, + "loss": 0.9597, + "step": 3033 + }, + { + "epoch": 0.1513216957605985, + "grad_norm": 0.595939174662407, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.9363, + "step": 3034 + }, + { + "epoch": 0.1513715710723192, + "grad_norm": 0.5123603782639353, + "learning_rate": 4.8095442562727025e-05, + "loss": 0.9412, + "step": 3035 + }, + { + "epoch": 0.1514214463840399, + "grad_norm": 0.5056958090520174, + "learning_rate": 4.809389621000664e-05, + "loss": 0.9243, + "step": 3036 + }, + { + "epoch": 0.1514713216957606, + "grad_norm": 0.5190693149861763, + "learning_rate": 4.809234925466137e-05, + "loss": 0.9435, + "step": 3037 + }, + { + "epoch": 0.1515211970074813, + "grad_norm": 0.47259840768396755, + "learning_rate": 4.809080169673158e-05, + "loss": 0.9353, + "step": 3038 + }, + { + "epoch": 0.151571072319202, + "grad_norm": 0.4891779181435965, + "learning_rate": 4.8089253536257655e-05, + "loss": 0.9677, + "step": 3039 + }, + { + "epoch": 0.15162094763092268, + "grad_norm": 0.4778837315865626, + "learning_rate": 4.8087704773279994e-05, + "loss": 0.9702, + "step": 3040 + }, + { + "epoch": 0.1516708229426434, + "grad_norm": 0.44005338853703607, + "learning_rate": 4.808615540783901e-05, + "loss": 0.9788, + "step": 3041 + }, + { + "epoch": 0.1517206982543641, + "grad_norm": 0.4638234386642065, + "learning_rate": 4.808460543997513e-05, + "loss": 0.92, + "step": 3042 + }, + { + "epoch": 0.15177057356608478, + "grad_norm": 0.43807575388001135, + "learning_rate": 4.8083054869728797e-05, + "loss": 0.9211, + "step": 3043 + }, + { + "epoch": 0.1518204488778055, + "grad_norm": 0.4752204035385739, + "learning_rate": 4.8081503697140484e-05, + "loss": 0.9413, + "step": 3044 + }, + { + "epoch": 0.15187032418952617, + "grad_norm": 0.5550095411448059, + "learning_rate": 4.807995192225066e-05, + "loss": 0.9304, + "step": 3045 + }, + { + "epoch": 0.15192019950124688, + "grad_norm": 0.45526741674592547, + "learning_rate": 4.807839954509982e-05, + "loss": 0.9632, + "step": 3046 + }, + { + "epoch": 0.1519700748129676, + "grad_norm": 0.5501865442229792, + "learning_rate": 4.807684656572847e-05, + "loss": 0.9767, + "step": 3047 + }, + { + "epoch": 0.15201995012468827, + "grad_norm": 0.6025070624438115, + "learning_rate": 4.8075292984177145e-05, + "loss": 0.9265, + "step": 3048 + }, + { + "epoch": 0.15206982543640898, + "grad_norm": 0.4133309449358915, + "learning_rate": 4.807373880048637e-05, + "loss": 0.9735, + "step": 3049 + }, + { + "epoch": 0.15211970074812967, + "grad_norm": 0.5471010422264495, + "learning_rate": 4.8072184014696706e-05, + "loss": 0.9355, + "step": 3050 + }, + { + "epoch": 0.15216957605985038, + "grad_norm": 0.4080086825278622, + "learning_rate": 4.8070628626848735e-05, + "loss": 0.8921, + "step": 3051 + }, + { + "epoch": 0.15221945137157109, + "grad_norm": 0.5228842083242591, + "learning_rate": 4.806907263698303e-05, + "loss": 0.9722, + "step": 3052 + }, + { + "epoch": 0.15226932668329177, + "grad_norm": 0.5833257814150004, + "learning_rate": 4.80675160451402e-05, + "loss": 0.9734, + "step": 3053 + }, + { + "epoch": 0.15231920199501248, + "grad_norm": 0.498178259901224, + "learning_rate": 4.806595885136086e-05, + "loss": 0.9253, + "step": 3054 + }, + { + "epoch": 0.15236907730673316, + "grad_norm": 0.46697128934445525, + "learning_rate": 4.806440105568565e-05, + "loss": 0.9686, + "step": 3055 + }, + { + "epoch": 0.15241895261845387, + "grad_norm": 0.4759038222900404, + "learning_rate": 4.806284265815521e-05, + "loss": 0.9597, + "step": 3056 + }, + { + "epoch": 0.15246882793017458, + "grad_norm": 0.8147589538780524, + "learning_rate": 4.806128365881022e-05, + "loss": 0.9409, + "step": 3057 + }, + { + "epoch": 0.15251870324189526, + "grad_norm": 0.4921574595996018, + "learning_rate": 4.805972405769135e-05, + "loss": 0.9448, + "step": 3058 + }, + { + "epoch": 0.15256857855361597, + "grad_norm": 3.134505160446644, + "learning_rate": 4.80581638548393e-05, + "loss": 0.9494, + "step": 3059 + }, + { + "epoch": 0.15261845386533665, + "grad_norm": 0.5723535585864449, + "learning_rate": 4.805660305029478e-05, + "loss": 0.9856, + "step": 3060 + }, + { + "epoch": 0.15266832917705736, + "grad_norm": 0.5033351401402332, + "learning_rate": 4.805504164409853e-05, + "loss": 0.9558, + "step": 3061 + }, + { + "epoch": 0.15271820448877804, + "grad_norm": 0.5008604979585346, + "learning_rate": 4.8053479636291285e-05, + "loss": 0.928, + "step": 3062 + }, + { + "epoch": 0.15276807980049875, + "grad_norm": 0.5152103815978487, + "learning_rate": 4.80519170269138e-05, + "loss": 0.9592, + "step": 3063 + }, + { + "epoch": 0.15281795511221946, + "grad_norm": 0.4773129735369589, + "learning_rate": 4.8050353816006864e-05, + "loss": 0.9224, + "step": 3064 + }, + { + "epoch": 0.15286783042394014, + "grad_norm": 0.56528085950792, + "learning_rate": 4.804879000361126e-05, + "loss": 0.967, + "step": 3065 + }, + { + "epoch": 0.15291770573566085, + "grad_norm": 0.6305902001545473, + "learning_rate": 4.8047225589767796e-05, + "loss": 0.8886, + "step": 3066 + }, + { + "epoch": 0.15296758104738153, + "grad_norm": 0.42077048019839464, + "learning_rate": 4.804566057451729e-05, + "loss": 0.9523, + "step": 3067 + }, + { + "epoch": 0.15301745635910224, + "grad_norm": 3.30426645429864, + "learning_rate": 4.804409495790059e-05, + "loss": 0.9874, + "step": 3068 + }, + { + "epoch": 0.15306733167082295, + "grad_norm": 0.5141194856845666, + "learning_rate": 4.8042528739958546e-05, + "loss": 0.9418, + "step": 3069 + }, + { + "epoch": 0.15311720698254364, + "grad_norm": 0.43216242145057676, + "learning_rate": 4.804096192073202e-05, + "loss": 0.9577, + "step": 3070 + }, + { + "epoch": 0.15316708229426435, + "grad_norm": 0.44813725380554886, + "learning_rate": 4.803939450026191e-05, + "loss": 0.9328, + "step": 3071 + }, + { + "epoch": 0.15321695760598503, + "grad_norm": 0.5191877177766026, + "learning_rate": 4.803782647858911e-05, + "loss": 0.9455, + "step": 3072 + }, + { + "epoch": 0.15326683291770574, + "grad_norm": 0.5060100012632738, + "learning_rate": 4.803625785575454e-05, + "loss": 0.9702, + "step": 3073 + }, + { + "epoch": 0.15331670822942645, + "grad_norm": 0.48047740142093953, + "learning_rate": 4.8034688631799134e-05, + "loss": 0.9288, + "step": 3074 + }, + { + "epoch": 0.15336658354114713, + "grad_norm": 0.41721069316861065, + "learning_rate": 4.803311880676383e-05, + "loss": 0.9285, + "step": 3075 + }, + { + "epoch": 0.15341645885286784, + "grad_norm": 0.46066187109199447, + "learning_rate": 4.803154838068961e-05, + "loss": 0.9427, + "step": 3076 + }, + { + "epoch": 0.15346633416458852, + "grad_norm": 0.454986615474184, + "learning_rate": 4.8029977353617434e-05, + "loss": 0.9689, + "step": 3077 + }, + { + "epoch": 0.15351620947630923, + "grad_norm": 0.5290723939882906, + "learning_rate": 4.802840572558831e-05, + "loss": 0.9138, + "step": 3078 + }, + { + "epoch": 0.15356608478802994, + "grad_norm": 0.49537381784258655, + "learning_rate": 4.802683349664324e-05, + "loss": 0.9376, + "step": 3079 + }, + { + "epoch": 0.15361596009975062, + "grad_norm": 0.5680759575698122, + "learning_rate": 4.802526066682326e-05, + "loss": 0.9262, + "step": 3080 + }, + { + "epoch": 0.15366583541147133, + "grad_norm": 0.5098998461091141, + "learning_rate": 4.8023687236169405e-05, + "loss": 0.9619, + "step": 3081 + }, + { + "epoch": 0.153715710723192, + "grad_norm": 0.45568878916558203, + "learning_rate": 4.802211320472273e-05, + "loss": 0.9287, + "step": 3082 + }, + { + "epoch": 0.15376558603491272, + "grad_norm": 0.46852930035057067, + "learning_rate": 4.8020538572524324e-05, + "loss": 0.9277, + "step": 3083 + }, + { + "epoch": 0.1538154613466334, + "grad_norm": 0.5397870160856764, + "learning_rate": 4.801896333961526e-05, + "loss": 0.9405, + "step": 3084 + }, + { + "epoch": 0.1538653366583541, + "grad_norm": 0.9008599753145937, + "learning_rate": 4.801738750603665e-05, + "loss": 0.9102, + "step": 3085 + }, + { + "epoch": 0.15391521197007482, + "grad_norm": 0.5037657427373318, + "learning_rate": 4.801581107182962e-05, + "loss": 0.9794, + "step": 3086 + }, + { + "epoch": 0.1539650872817955, + "grad_norm": 0.45651304867498105, + "learning_rate": 4.8014234037035294e-05, + "loss": 0.9568, + "step": 3087 + }, + { + "epoch": 0.1540149625935162, + "grad_norm": 1.067954854621938, + "learning_rate": 4.801265640169483e-05, + "loss": 0.9794, + "step": 3088 + }, + { + "epoch": 0.1540648379052369, + "grad_norm": 0.4784773783046725, + "learning_rate": 4.80110781658494e-05, + "loss": 0.9207, + "step": 3089 + }, + { + "epoch": 0.1541147132169576, + "grad_norm": 0.5612967591382302, + "learning_rate": 4.8009499329540186e-05, + "loss": 0.9376, + "step": 3090 + }, + { + "epoch": 0.15416458852867831, + "grad_norm": 0.4148117342276845, + "learning_rate": 4.800791989280839e-05, + "loss": 0.9081, + "step": 3091 + }, + { + "epoch": 0.154214463840399, + "grad_norm": 0.7184761280597592, + "learning_rate": 4.800633985569521e-05, + "loss": 0.9595, + "step": 3092 + }, + { + "epoch": 0.1542643391521197, + "grad_norm": 0.4864579911533606, + "learning_rate": 4.800475921824189e-05, + "loss": 0.939, + "step": 3093 + }, + { + "epoch": 0.1543142144638404, + "grad_norm": 0.9911572285619037, + "learning_rate": 4.8003177980489675e-05, + "loss": 0.9504, + "step": 3094 + }, + { + "epoch": 0.1543640897755611, + "grad_norm": 0.43115415976205923, + "learning_rate": 4.8001596142479826e-05, + "loss": 0.9304, + "step": 3095 + }, + { + "epoch": 0.1544139650872818, + "grad_norm": 0.48115340793528416, + "learning_rate": 4.800001370425362e-05, + "loss": 0.9481, + "step": 3096 + }, + { + "epoch": 0.1544638403990025, + "grad_norm": 0.6361340627741057, + "learning_rate": 4.799843066585234e-05, + "loss": 0.9744, + "step": 3097 + }, + { + "epoch": 0.1545137157107232, + "grad_norm": 0.49826580774288315, + "learning_rate": 4.7996847027317324e-05, + "loss": 0.9919, + "step": 3098 + }, + { + "epoch": 0.15456359102244388, + "grad_norm": 0.5344303533362916, + "learning_rate": 4.799526278868987e-05, + "loss": 1.009, + "step": 3099 + }, + { + "epoch": 0.1546134663341646, + "grad_norm": 0.545622647073518, + "learning_rate": 4.7993677950011315e-05, + "loss": 0.9738, + "step": 3100 + }, + { + "epoch": 0.1546633416458853, + "grad_norm": 0.44842598625559604, + "learning_rate": 4.7992092511323035e-05, + "loss": 0.8913, + "step": 3101 + }, + { + "epoch": 0.15471321695760598, + "grad_norm": 0.5347762047863416, + "learning_rate": 4.7990506472666384e-05, + "loss": 0.9233, + "step": 3102 + }, + { + "epoch": 0.1547630922693267, + "grad_norm": 0.5028250209359066, + "learning_rate": 4.798891983408276e-05, + "loss": 0.9257, + "step": 3103 + }, + { + "epoch": 0.15481296758104737, + "grad_norm": 0.597405160299651, + "learning_rate": 4.798733259561356e-05, + "loss": 0.9742, + "step": 3104 + }, + { + "epoch": 0.15486284289276808, + "grad_norm": 0.4736700771654389, + "learning_rate": 4.7985744757300205e-05, + "loss": 0.9745, + "step": 3105 + }, + { + "epoch": 0.1549127182044888, + "grad_norm": 0.5811784422456958, + "learning_rate": 4.7984156319184126e-05, + "loss": 0.9748, + "step": 3106 + }, + { + "epoch": 0.15496259351620947, + "grad_norm": 0.6917702721348599, + "learning_rate": 4.798256728130677e-05, + "loss": 0.9956, + "step": 3107 + }, + { + "epoch": 0.15501246882793018, + "grad_norm": 0.4924301623725297, + "learning_rate": 4.798097764370961e-05, + "loss": 0.934, + "step": 3108 + }, + { + "epoch": 0.15506234413965087, + "grad_norm": 0.43599428085085495, + "learning_rate": 4.797938740643413e-05, + "loss": 0.8991, + "step": 3109 + }, + { + "epoch": 0.15511221945137157, + "grad_norm": 0.5275902136053361, + "learning_rate": 4.7977796569521815e-05, + "loss": 0.9289, + "step": 3110 + }, + { + "epoch": 0.15516209476309226, + "grad_norm": 0.6854434032312503, + "learning_rate": 4.797620513301418e-05, + "loss": 0.9628, + "step": 3111 + }, + { + "epoch": 0.15521197007481297, + "grad_norm": 0.5938950800553691, + "learning_rate": 4.7974613096952755e-05, + "loss": 0.9272, + "step": 3112 + }, + { + "epoch": 0.15526184538653368, + "grad_norm": 0.4734451983629633, + "learning_rate": 4.797302046137909e-05, + "loss": 0.9682, + "step": 3113 + }, + { + "epoch": 0.15531172069825436, + "grad_norm": 0.4681290927999438, + "learning_rate": 4.797142722633473e-05, + "loss": 0.9409, + "step": 3114 + }, + { + "epoch": 0.15536159600997507, + "grad_norm": 0.5373976007202327, + "learning_rate": 4.796983339186126e-05, + "loss": 0.9066, + "step": 3115 + }, + { + "epoch": 0.15541147132169575, + "grad_norm": 0.5017719454855991, + "learning_rate": 4.7968238958000264e-05, + "loss": 0.9263, + "step": 3116 + }, + { + "epoch": 0.15546134663341646, + "grad_norm": 0.6467556498998476, + "learning_rate": 4.796664392479335e-05, + "loss": 0.9368, + "step": 3117 + }, + { + "epoch": 0.15551122194513717, + "grad_norm": 0.5756325934838201, + "learning_rate": 4.796504829228215e-05, + "loss": 0.919, + "step": 3118 + }, + { + "epoch": 0.15556109725685785, + "grad_norm": 0.7608841633814517, + "learning_rate": 4.796345206050829e-05, + "loss": 0.952, + "step": 3119 + }, + { + "epoch": 0.15561097256857856, + "grad_norm": 0.6794525176429023, + "learning_rate": 4.7961855229513426e-05, + "loss": 0.933, + "step": 3120 + }, + { + "epoch": 0.15566084788029924, + "grad_norm": 0.5060209663215065, + "learning_rate": 4.7960257799339224e-05, + "loss": 0.9096, + "step": 3121 + }, + { + "epoch": 0.15571072319201995, + "grad_norm": 0.8403139331084409, + "learning_rate": 4.795865977002737e-05, + "loss": 0.9211, + "step": 3122 + }, + { + "epoch": 0.15576059850374066, + "grad_norm": 0.42450342424231874, + "learning_rate": 4.795706114161956e-05, + "loss": 0.9308, + "step": 3123 + }, + { + "epoch": 0.15581047381546134, + "grad_norm": 0.4990579585730063, + "learning_rate": 4.795546191415752e-05, + "loss": 0.9543, + "step": 3124 + }, + { + "epoch": 0.15586034912718205, + "grad_norm": 0.5598852653736438, + "learning_rate": 4.795386208768298e-05, + "loss": 0.9759, + "step": 3125 + }, + { + "epoch": 0.15591022443890273, + "grad_norm": 0.5087201519672961, + "learning_rate": 4.795226166223767e-05, + "loss": 0.9468, + "step": 3126 + }, + { + "epoch": 0.15596009975062344, + "grad_norm": 0.44372874536561424, + "learning_rate": 4.795066063786337e-05, + "loss": 0.9443, + "step": 3127 + }, + { + "epoch": 0.15600997506234415, + "grad_norm": 0.543564877171898, + "learning_rate": 4.794905901460185e-05, + "loss": 0.9582, + "step": 3128 + }, + { + "epoch": 0.15605985037406483, + "grad_norm": 0.47691192354653794, + "learning_rate": 4.79474567924949e-05, + "loss": 0.985, + "step": 3129 + }, + { + "epoch": 0.15610972568578554, + "grad_norm": 0.6849695610017332, + "learning_rate": 4.794585397158435e-05, + "loss": 0.9464, + "step": 3130 + }, + { + "epoch": 0.15615960099750623, + "grad_norm": 0.548437565615722, + "learning_rate": 4.7944250551912e-05, + "loss": 0.9432, + "step": 3131 + }, + { + "epoch": 0.15620947630922694, + "grad_norm": 0.5195960119666782, + "learning_rate": 4.7942646533519704e-05, + "loss": 0.9728, + "step": 3132 + }, + { + "epoch": 0.15625935162094762, + "grad_norm": 0.5132559784169425, + "learning_rate": 4.7941041916449316e-05, + "loss": 0.9342, + "step": 3133 + }, + { + "epoch": 0.15630922693266833, + "grad_norm": 0.5522785555737153, + "learning_rate": 4.7939436700742696e-05, + "loss": 0.9497, + "step": 3134 + }, + { + "epoch": 0.15635910224438904, + "grad_norm": 0.5602805229880814, + "learning_rate": 4.793783088644175e-05, + "loss": 0.9376, + "step": 3135 + }, + { + "epoch": 0.15640897755610972, + "grad_norm": 0.5137614509136031, + "learning_rate": 4.793622447358837e-05, + "loss": 0.8947, + "step": 3136 + }, + { + "epoch": 0.15645885286783043, + "grad_norm": 0.6218199264423625, + "learning_rate": 4.7934617462224477e-05, + "loss": 0.8887, + "step": 3137 + }, + { + "epoch": 0.1565087281795511, + "grad_norm": 0.5035126902217508, + "learning_rate": 4.793300985239201e-05, + "loss": 0.9581, + "step": 3138 + }, + { + "epoch": 0.15655860349127182, + "grad_norm": 0.5363148686048043, + "learning_rate": 4.793140164413291e-05, + "loss": 0.9627, + "step": 3139 + }, + { + "epoch": 0.15660847880299253, + "grad_norm": 0.44949615242641616, + "learning_rate": 4.792979283748915e-05, + "loss": 0.9417, + "step": 3140 + }, + { + "epoch": 0.1566583541147132, + "grad_norm": 0.47770461380036505, + "learning_rate": 4.792818343250271e-05, + "loss": 0.9885, + "step": 3141 + }, + { + "epoch": 0.15670822942643392, + "grad_norm": 0.42970764281025897, + "learning_rate": 4.792657342921557e-05, + "loss": 0.9524, + "step": 3142 + }, + { + "epoch": 0.1567581047381546, + "grad_norm": 0.891534293498914, + "learning_rate": 4.792496282766977e-05, + "loss": 0.9304, + "step": 3143 + }, + { + "epoch": 0.1568079800498753, + "grad_norm": 0.6261472438646073, + "learning_rate": 4.792335162790732e-05, + "loss": 0.9803, + "step": 3144 + }, + { + "epoch": 0.15685785536159602, + "grad_norm": 0.4634696851324137, + "learning_rate": 4.7921739829970274e-05, + "loss": 0.9593, + "step": 3145 + }, + { + "epoch": 0.1569077306733167, + "grad_norm": 0.6217110945882846, + "learning_rate": 4.792012743390069e-05, + "loss": 0.9667, + "step": 3146 + }, + { + "epoch": 0.1569576059850374, + "grad_norm": 0.642673448517825, + "learning_rate": 4.791851443974063e-05, + "loss": 0.9246, + "step": 3147 + }, + { + "epoch": 0.1570074812967581, + "grad_norm": 0.7157733945496292, + "learning_rate": 4.7916900847532184e-05, + "loss": 0.9331, + "step": 3148 + }, + { + "epoch": 0.1570573566084788, + "grad_norm": 0.5077438398649795, + "learning_rate": 4.791528665731748e-05, + "loss": 0.9346, + "step": 3149 + }, + { + "epoch": 0.1571072319201995, + "grad_norm": 0.473637397705214, + "learning_rate": 4.791367186913862e-05, + "loss": 0.9559, + "step": 3150 + }, + { + "epoch": 0.1571571072319202, + "grad_norm": 0.6487118605576738, + "learning_rate": 4.791205648303775e-05, + "loss": 0.9491, + "step": 3151 + }, + { + "epoch": 0.1572069825436409, + "grad_norm": 0.8043559615790744, + "learning_rate": 4.791044049905702e-05, + "loss": 0.9504, + "step": 3152 + }, + { + "epoch": 0.1572568578553616, + "grad_norm": 0.534987165003789, + "learning_rate": 4.7908823917238596e-05, + "loss": 1.0004, + "step": 3153 + }, + { + "epoch": 0.1573067331670823, + "grad_norm": 0.5936485828596227, + "learning_rate": 4.7907206737624674e-05, + "loss": 0.9802, + "step": 3154 + }, + { + "epoch": 0.157356608478803, + "grad_norm": 0.4892321112991103, + "learning_rate": 4.790558896025743e-05, + "loss": 0.9329, + "step": 3155 + }, + { + "epoch": 0.1574064837905237, + "grad_norm": 0.6294417472910392, + "learning_rate": 4.79039705851791e-05, + "loss": 0.9607, + "step": 3156 + }, + { + "epoch": 0.1574563591022444, + "grad_norm": 1.7472886071305092, + "learning_rate": 4.790235161243191e-05, + "loss": 0.9231, + "step": 3157 + }, + { + "epoch": 0.15750623441396508, + "grad_norm": 0.722162300155533, + "learning_rate": 4.790073204205811e-05, + "loss": 0.9663, + "step": 3158 + }, + { + "epoch": 0.1575561097256858, + "grad_norm": 0.4554014657351532, + "learning_rate": 4.7899111874099946e-05, + "loss": 0.9239, + "step": 3159 + }, + { + "epoch": 0.15760598503740647, + "grad_norm": 0.6581224900335375, + "learning_rate": 4.789749110859971e-05, + "loss": 0.9485, + "step": 3160 + }, + { + "epoch": 0.15765586034912718, + "grad_norm": 0.6064199851413343, + "learning_rate": 4.7895869745599686e-05, + "loss": 0.9409, + "step": 3161 + }, + { + "epoch": 0.1577057356608479, + "grad_norm": 0.5106733458617855, + "learning_rate": 4.7894247785142196e-05, + "loss": 0.9735, + "step": 3162 + }, + { + "epoch": 0.15775561097256857, + "grad_norm": 0.494977741998703, + "learning_rate": 4.789262522726955e-05, + "loss": 0.9699, + "step": 3163 + }, + { + "epoch": 0.15780548628428928, + "grad_norm": 1.1179328028152247, + "learning_rate": 4.789100207202409e-05, + "loss": 0.9734, + "step": 3164 + }, + { + "epoch": 0.15785536159600996, + "grad_norm": 0.5170129092545833, + "learning_rate": 4.7889378319448185e-05, + "loss": 0.931, + "step": 3165 + }, + { + "epoch": 0.15790523690773067, + "grad_norm": 0.6784844016529372, + "learning_rate": 4.788775396958419e-05, + "loss": 0.9342, + "step": 3166 + }, + { + "epoch": 0.15795511221945138, + "grad_norm": 1.1697948353382779, + "learning_rate": 4.788612902247451e-05, + "loss": 0.9454, + "step": 3167 + }, + { + "epoch": 0.15800498753117206, + "grad_norm": 0.5192522440405908, + "learning_rate": 4.7884503478161524e-05, + "loss": 0.8954, + "step": 3168 + }, + { + "epoch": 0.15805486284289277, + "grad_norm": 0.6318572826920335, + "learning_rate": 4.788287733668767e-05, + "loss": 0.9164, + "step": 3169 + }, + { + "epoch": 0.15810473815461346, + "grad_norm": 0.6580014924266114, + "learning_rate": 4.788125059809536e-05, + "loss": 0.9429, + "step": 3170 + }, + { + "epoch": 0.15815461346633417, + "grad_norm": 0.4596276545852102, + "learning_rate": 4.787962326242707e-05, + "loss": 0.9166, + "step": 3171 + }, + { + "epoch": 0.15820448877805487, + "grad_norm": 0.45393076109300307, + "learning_rate": 4.7877995329725245e-05, + "loss": 0.9507, + "step": 3172 + }, + { + "epoch": 0.15825436408977556, + "grad_norm": 0.46865408147423265, + "learning_rate": 4.787636680003237e-05, + "loss": 0.932, + "step": 3173 + }, + { + "epoch": 0.15830423940149627, + "grad_norm": 0.5524381107317909, + "learning_rate": 4.787473767339094e-05, + "loss": 0.9344, + "step": 3174 + }, + { + "epoch": 0.15835411471321695, + "grad_norm": 0.8669055880810601, + "learning_rate": 4.787310794984347e-05, + "loss": 0.934, + "step": 3175 + }, + { + "epoch": 0.15840399002493766, + "grad_norm": 0.46748933527037373, + "learning_rate": 4.787147762943248e-05, + "loss": 0.968, + "step": 3176 + }, + { + "epoch": 0.15845386533665837, + "grad_norm": 0.3908576788874074, + "learning_rate": 4.786984671220053e-05, + "loss": 0.9049, + "step": 3177 + }, + { + "epoch": 0.15850374064837905, + "grad_norm": 0.4894726822443591, + "learning_rate": 4.7868215198190156e-05, + "loss": 0.9276, + "step": 3178 + }, + { + "epoch": 0.15855361596009976, + "grad_norm": 0.4346919709066145, + "learning_rate": 4.786658308744394e-05, + "loss": 0.949, + "step": 3179 + }, + { + "epoch": 0.15860349127182044, + "grad_norm": 0.5865581675306311, + "learning_rate": 4.786495038000447e-05, + "loss": 0.9751, + "step": 3180 + }, + { + "epoch": 0.15865336658354115, + "grad_norm": 0.4884476595197893, + "learning_rate": 4.786331707591436e-05, + "loss": 0.942, + "step": 3181 + }, + { + "epoch": 0.15870324189526183, + "grad_norm": 0.4716652911991379, + "learning_rate": 4.786168317521622e-05, + "loss": 0.9444, + "step": 3182 + }, + { + "epoch": 0.15875311720698254, + "grad_norm": 0.7257194138787928, + "learning_rate": 4.7860048677952686e-05, + "loss": 0.9562, + "step": 3183 + }, + { + "epoch": 0.15880299251870325, + "grad_norm": 0.5625620976882983, + "learning_rate": 4.7858413584166414e-05, + "loss": 0.957, + "step": 3184 + }, + { + "epoch": 0.15885286783042393, + "grad_norm": 0.49227626312363676, + "learning_rate": 4.7856777893900076e-05, + "loss": 0.9298, + "step": 3185 + }, + { + "epoch": 0.15890274314214464, + "grad_norm": 0.5894915068951094, + "learning_rate": 4.785514160719634e-05, + "loss": 0.9688, + "step": 3186 + }, + { + "epoch": 0.15895261845386532, + "grad_norm": 0.5287831111211763, + "learning_rate": 4.785350472409792e-05, + "loss": 0.924, + "step": 3187 + }, + { + "epoch": 0.15900249376558603, + "grad_norm": 0.456977287810731, + "learning_rate": 4.785186724464752e-05, + "loss": 0.9, + "step": 3188 + }, + { + "epoch": 0.15905236907730674, + "grad_norm": 0.5834571888726257, + "learning_rate": 4.7850229168887864e-05, + "loss": 0.9459, + "step": 3189 + }, + { + "epoch": 0.15910224438902743, + "grad_norm": 0.5601651432669816, + "learning_rate": 4.7848590496861714e-05, + "loss": 0.9545, + "step": 3190 + }, + { + "epoch": 0.15915211970074813, + "grad_norm": 0.5124633670064654, + "learning_rate": 4.784695122861181e-05, + "loss": 0.937, + "step": 3191 + }, + { + "epoch": 0.15920199501246882, + "grad_norm": 0.469851597321015, + "learning_rate": 4.7845311364180946e-05, + "loss": 0.9173, + "step": 3192 + }, + { + "epoch": 0.15925187032418953, + "grad_norm": 0.546064725418742, + "learning_rate": 4.7843670903611904e-05, + "loss": 0.9329, + "step": 3193 + }, + { + "epoch": 0.15930174563591024, + "grad_norm": 0.5777074533577974, + "learning_rate": 4.784202984694749e-05, + "loss": 0.9495, + "step": 3194 + }, + { + "epoch": 0.15935162094763092, + "grad_norm": 0.5535556960147259, + "learning_rate": 4.784038819423054e-05, + "loss": 0.9859, + "step": 3195 + }, + { + "epoch": 0.15940149625935163, + "grad_norm": 0.7598044157705233, + "learning_rate": 4.783874594550388e-05, + "loss": 0.9422, + "step": 3196 + }, + { + "epoch": 0.1594513715710723, + "grad_norm": 0.6108994140232661, + "learning_rate": 4.7837103100810356e-05, + "loss": 0.9665, + "step": 3197 + }, + { + "epoch": 0.15950124688279302, + "grad_norm": 0.5368655766897186, + "learning_rate": 4.7835459660192855e-05, + "loss": 0.9334, + "step": 3198 + }, + { + "epoch": 0.15955112219451373, + "grad_norm": 1.244656850159054, + "learning_rate": 4.7833815623694256e-05, + "loss": 0.9214, + "step": 3199 + }, + { + "epoch": 0.1596009975062344, + "grad_norm": 0.6148671541308915, + "learning_rate": 4.783217099135745e-05, + "loss": 0.9459, + "step": 3200 + }, + { + "epoch": 0.15965087281795512, + "grad_norm": 0.5917753314743414, + "learning_rate": 4.783052576322536e-05, + "loss": 0.9592, + "step": 3201 + }, + { + "epoch": 0.1597007481296758, + "grad_norm": 0.4830496252832452, + "learning_rate": 4.7828879939340925e-05, + "loss": 0.9751, + "step": 3202 + }, + { + "epoch": 0.1597506234413965, + "grad_norm": 0.5508673644139911, + "learning_rate": 4.7827233519747086e-05, + "loss": 0.9314, + "step": 3203 + }, + { + "epoch": 0.15980049875311722, + "grad_norm": 0.6261727976342749, + "learning_rate": 4.78255865044868e-05, + "loss": 0.9648, + "step": 3204 + }, + { + "epoch": 0.1598503740648379, + "grad_norm": 0.49040606576469437, + "learning_rate": 4.782393889360305e-05, + "loss": 0.9049, + "step": 3205 + }, + { + "epoch": 0.1599002493765586, + "grad_norm": 0.4943054828253562, + "learning_rate": 4.7822290687138826e-05, + "loss": 0.8866, + "step": 3206 + }, + { + "epoch": 0.1599501246882793, + "grad_norm": 0.5178754562796485, + "learning_rate": 4.782064188513714e-05, + "loss": 0.9255, + "step": 3207 + }, + { + "epoch": 0.16, + "grad_norm": 0.4883969806903344, + "learning_rate": 4.7818992487641024e-05, + "loss": 0.946, + "step": 3208 + }, + { + "epoch": 0.16004987531172069, + "grad_norm": 0.5567669224939301, + "learning_rate": 4.781734249469351e-05, + "loss": 0.9098, + "step": 3209 + }, + { + "epoch": 0.1600997506234414, + "grad_norm": 0.5794096209146514, + "learning_rate": 4.7815691906337646e-05, + "loss": 0.913, + "step": 3210 + }, + { + "epoch": 0.1601496259351621, + "grad_norm": 0.5251799340646834, + "learning_rate": 4.781404072261652e-05, + "loss": 0.9692, + "step": 3211 + }, + { + "epoch": 0.1601995012468828, + "grad_norm": 0.7270514347839658, + "learning_rate": 4.781238894357322e-05, + "loss": 0.938, + "step": 3212 + }, + { + "epoch": 0.1602493765586035, + "grad_norm": 0.41130560451344655, + "learning_rate": 4.781073656925083e-05, + "loss": 0.9001, + "step": 3213 + }, + { + "epoch": 0.16029925187032418, + "grad_norm": 0.47906796761154824, + "learning_rate": 4.780908359969247e-05, + "loss": 0.9187, + "step": 3214 + }, + { + "epoch": 0.1603491271820449, + "grad_norm": 0.542323659306033, + "learning_rate": 4.780743003494129e-05, + "loss": 0.9083, + "step": 3215 + }, + { + "epoch": 0.1603990024937656, + "grad_norm": 0.5132262598341563, + "learning_rate": 4.780577587504044e-05, + "loss": 0.9639, + "step": 3216 + }, + { + "epoch": 0.16044887780548628, + "grad_norm": 0.46253442338137685, + "learning_rate": 4.780412112003306e-05, + "loss": 0.9512, + "step": 3217 + }, + { + "epoch": 0.160498753117207, + "grad_norm": 0.46947412403235833, + "learning_rate": 4.7802465769962354e-05, + "loss": 0.9886, + "step": 3218 + }, + { + "epoch": 0.16054862842892767, + "grad_norm": 0.5827124831844043, + "learning_rate": 4.7800809824871505e-05, + "loss": 0.8808, + "step": 3219 + }, + { + "epoch": 0.16059850374064838, + "grad_norm": 0.6126633297835384, + "learning_rate": 4.7799153284803735e-05, + "loss": 0.9461, + "step": 3220 + }, + { + "epoch": 0.1606483790523691, + "grad_norm": 1.5553548352144544, + "learning_rate": 4.7797496149802256e-05, + "loss": 0.978, + "step": 3221 + }, + { + "epoch": 0.16069825436408977, + "grad_norm": 0.503920741185038, + "learning_rate": 4.779583841991032e-05, + "loss": 0.9275, + "step": 3222 + }, + { + "epoch": 0.16074812967581048, + "grad_norm": 0.48979935957374104, + "learning_rate": 4.779418009517118e-05, + "loss": 0.9152, + "step": 3223 + }, + { + "epoch": 0.16079800498753116, + "grad_norm": 0.5034402257639589, + "learning_rate": 4.779252117562811e-05, + "loss": 0.904, + "step": 3224 + }, + { + "epoch": 0.16084788029925187, + "grad_norm": 0.5077279749446948, + "learning_rate": 4.779086166132441e-05, + "loss": 0.924, + "step": 3225 + }, + { + "epoch": 0.16089775561097258, + "grad_norm": 0.48740304947293206, + "learning_rate": 4.7789201552303363e-05, + "loss": 0.9523, + "step": 3226 + }, + { + "epoch": 0.16094763092269326, + "grad_norm": 0.6614965560822884, + "learning_rate": 4.778754084860831e-05, + "loss": 0.98, + "step": 3227 + }, + { + "epoch": 0.16099750623441397, + "grad_norm": 0.6448743010431373, + "learning_rate": 4.778587955028257e-05, + "loss": 0.9389, + "step": 3228 + }, + { + "epoch": 0.16104738154613465, + "grad_norm": 0.4926389828185337, + "learning_rate": 4.778421765736951e-05, + "loss": 0.9116, + "step": 3229 + }, + { + "epoch": 0.16109725685785536, + "grad_norm": 0.6235113380150664, + "learning_rate": 4.778255516991248e-05, + "loss": 0.9436, + "step": 3230 + }, + { + "epoch": 0.16114713216957605, + "grad_norm": 0.519867047247555, + "learning_rate": 4.778089208795488e-05, + "loss": 0.9755, + "step": 3231 + }, + { + "epoch": 0.16119700748129676, + "grad_norm": 0.4375135874874566, + "learning_rate": 4.777922841154008e-05, + "loss": 0.9579, + "step": 3232 + }, + { + "epoch": 0.16124688279301747, + "grad_norm": 0.5342616056270373, + "learning_rate": 4.7777564140711515e-05, + "loss": 0.9562, + "step": 3233 + }, + { + "epoch": 0.16129675810473815, + "grad_norm": 0.7506153309702083, + "learning_rate": 4.777589927551261e-05, + "loss": 0.9126, + "step": 3234 + }, + { + "epoch": 0.16134663341645886, + "grad_norm": 0.5546424043333371, + "learning_rate": 4.77742338159868e-05, + "loss": 0.9126, + "step": 3235 + }, + { + "epoch": 0.16139650872817954, + "grad_norm": 0.5224358768643468, + "learning_rate": 4.777256776217756e-05, + "loss": 0.9901, + "step": 3236 + }, + { + "epoch": 0.16144638403990025, + "grad_norm": 0.5671775076261747, + "learning_rate": 4.7770901114128355e-05, + "loss": 0.9465, + "step": 3237 + }, + { + "epoch": 0.16149625935162096, + "grad_norm": 0.5385186750843259, + "learning_rate": 4.776923387188267e-05, + "loss": 0.9564, + "step": 3238 + }, + { + "epoch": 0.16154613466334164, + "grad_norm": 0.49478197457176726, + "learning_rate": 4.7767566035484026e-05, + "loss": 0.9472, + "step": 3239 + }, + { + "epoch": 0.16159600997506235, + "grad_norm": 0.5139873924311466, + "learning_rate": 4.776589760497593e-05, + "loss": 0.9361, + "step": 3240 + }, + { + "epoch": 0.16164588528678303, + "grad_norm": 0.45787584793078956, + "learning_rate": 4.7764228580401926e-05, + "loss": 0.9071, + "step": 3241 + }, + { + "epoch": 0.16169576059850374, + "grad_norm": 0.5799982840782484, + "learning_rate": 4.776255896180557e-05, + "loss": 0.9497, + "step": 3242 + }, + { + "epoch": 0.16174563591022445, + "grad_norm": 0.749493363424969, + "learning_rate": 4.7760888749230416e-05, + "loss": 0.9634, + "step": 3243 + }, + { + "epoch": 0.16179551122194513, + "grad_norm": 0.4538947159551012, + "learning_rate": 4.775921794272006e-05, + "loss": 0.974, + "step": 3244 + }, + { + "epoch": 0.16184538653366584, + "grad_norm": 0.4910666578983544, + "learning_rate": 4.77575465423181e-05, + "loss": 0.9289, + "step": 3245 + }, + { + "epoch": 0.16189526184538652, + "grad_norm": 0.5395539481269483, + "learning_rate": 4.775587454806814e-05, + "loss": 0.9267, + "step": 3246 + }, + { + "epoch": 0.16194513715710723, + "grad_norm": 0.47127201289270737, + "learning_rate": 4.775420196001382e-05, + "loss": 0.9048, + "step": 3247 + }, + { + "epoch": 0.16199501246882794, + "grad_norm": 0.6238494744773859, + "learning_rate": 4.775252877819879e-05, + "loss": 0.9458, + "step": 3248 + }, + { + "epoch": 0.16204488778054862, + "grad_norm": 0.6100975072998754, + "learning_rate": 4.7750855002666704e-05, + "loss": 0.9645, + "step": 3249 + }, + { + "epoch": 0.16209476309226933, + "grad_norm": 0.6206962521632964, + "learning_rate": 4.774918063346123e-05, + "loss": 0.9317, + "step": 3250 + }, + { + "epoch": 0.16214463840399002, + "grad_norm": 0.5646275636563741, + "learning_rate": 4.774750567062608e-05, + "loss": 0.9351, + "step": 3251 + }, + { + "epoch": 0.16219451371571073, + "grad_norm": 0.6789360268391209, + "learning_rate": 4.7745830114204946e-05, + "loss": 0.9633, + "step": 3252 + }, + { + "epoch": 0.16224438902743143, + "grad_norm": 0.47730442135471224, + "learning_rate": 4.774415396424155e-05, + "loss": 0.9561, + "step": 3253 + }, + { + "epoch": 0.16229426433915212, + "grad_norm": 2.138346084134085, + "learning_rate": 4.7742477220779635e-05, + "loss": 0.9238, + "step": 3254 + }, + { + "epoch": 0.16234413965087283, + "grad_norm": 0.49347443032856586, + "learning_rate": 4.774079988386296e-05, + "loss": 0.9612, + "step": 3255 + }, + { + "epoch": 0.1623940149625935, + "grad_norm": 0.9422545366343014, + "learning_rate": 4.7739121953535295e-05, + "loss": 0.9987, + "step": 3256 + }, + { + "epoch": 0.16244389027431422, + "grad_norm": 0.4908309558831111, + "learning_rate": 4.773744342984041e-05, + "loss": 0.9149, + "step": 3257 + }, + { + "epoch": 0.1624937655860349, + "grad_norm": 0.49178640382317185, + "learning_rate": 4.7735764312822115e-05, + "loss": 0.9536, + "step": 3258 + }, + { + "epoch": 0.1625436408977556, + "grad_norm": 0.711872518960547, + "learning_rate": 4.7734084602524226e-05, + "loss": 0.9543, + "step": 3259 + }, + { + "epoch": 0.16259351620947632, + "grad_norm": 0.4396941125869105, + "learning_rate": 4.773240429899058e-05, + "loss": 0.9404, + "step": 3260 + }, + { + "epoch": 0.162643391521197, + "grad_norm": 0.595426736514318, + "learning_rate": 4.773072340226501e-05, + "loss": 0.9544, + "step": 3261 + }, + { + "epoch": 0.1626932668329177, + "grad_norm": 0.5101045837544977, + "learning_rate": 4.772904191239139e-05, + "loss": 0.9642, + "step": 3262 + }, + { + "epoch": 0.1627431421446384, + "grad_norm": 0.48342060397791503, + "learning_rate": 4.7727359829413586e-05, + "loss": 0.9264, + "step": 3263 + }, + { + "epoch": 0.1627930174563591, + "grad_norm": 0.6032514945633233, + "learning_rate": 4.77256771533755e-05, + "loss": 0.9255, + "step": 3264 + }, + { + "epoch": 0.1628428927680798, + "grad_norm": 0.5278390625937998, + "learning_rate": 4.772399388432105e-05, + "loss": 0.9457, + "step": 3265 + }, + { + "epoch": 0.1628927680798005, + "grad_norm": 0.5593242185800807, + "learning_rate": 4.772231002229415e-05, + "loss": 0.9508, + "step": 3266 + }, + { + "epoch": 0.1629426433915212, + "grad_norm": 0.6031738274197546, + "learning_rate": 4.772062556733873e-05, + "loss": 0.9417, + "step": 3267 + }, + { + "epoch": 0.16299251870324188, + "grad_norm": 0.6288672580244452, + "learning_rate": 4.7718940519498755e-05, + "loss": 0.9563, + "step": 3268 + }, + { + "epoch": 0.1630423940149626, + "grad_norm": 0.4765143469037355, + "learning_rate": 4.7717254878818196e-05, + "loss": 0.9354, + "step": 3269 + }, + { + "epoch": 0.1630922693266833, + "grad_norm": 0.5637315908119991, + "learning_rate": 4.771556864534103e-05, + "loss": 0.9408, + "step": 3270 + }, + { + "epoch": 0.16314214463840399, + "grad_norm": 0.4944693773833278, + "learning_rate": 4.7713881819111284e-05, + "loss": 0.9421, + "step": 3271 + }, + { + "epoch": 0.1631920199501247, + "grad_norm": 0.4947621292540401, + "learning_rate": 4.771219440017295e-05, + "loss": 0.947, + "step": 3272 + }, + { + "epoch": 0.16324189526184538, + "grad_norm": 0.6226568833019929, + "learning_rate": 4.771050638857006e-05, + "loss": 0.9457, + "step": 3273 + }, + { + "epoch": 0.1632917705735661, + "grad_norm": 0.5016653749644338, + "learning_rate": 4.770881778434668e-05, + "loss": 0.949, + "step": 3274 + }, + { + "epoch": 0.1633416458852868, + "grad_norm": 0.5609847619583096, + "learning_rate": 4.7707128587546856e-05, + "loss": 0.9981, + "step": 3275 + }, + { + "epoch": 0.16339152119700748, + "grad_norm": 0.499930588215901, + "learning_rate": 4.7705438798214674e-05, + "loss": 0.9455, + "step": 3276 + }, + { + "epoch": 0.1634413965087282, + "grad_norm": 0.4698100829311777, + "learning_rate": 4.7703748416394234e-05, + "loss": 0.9272, + "step": 3277 + }, + { + "epoch": 0.16349127182044887, + "grad_norm": 0.6532778841440996, + "learning_rate": 4.770205744212964e-05, + "loss": 0.986, + "step": 3278 + }, + { + "epoch": 0.16354114713216958, + "grad_norm": 0.5920173958277984, + "learning_rate": 4.770036587546502e-05, + "loss": 0.9986, + "step": 3279 + }, + { + "epoch": 0.16359102244389026, + "grad_norm": 0.6760294199067848, + "learning_rate": 4.76986737164445e-05, + "loss": 0.9288, + "step": 3280 + }, + { + "epoch": 0.16364089775561097, + "grad_norm": 0.4761034501626329, + "learning_rate": 4.7696980965112253e-05, + "loss": 0.9683, + "step": 3281 + }, + { + "epoch": 0.16369077306733168, + "grad_norm": 0.49563802412543473, + "learning_rate": 4.769528762151244e-05, + "loss": 1.0024, + "step": 3282 + }, + { + "epoch": 0.16374064837905236, + "grad_norm": 0.9486676279535632, + "learning_rate": 4.7693593685689266e-05, + "loss": 0.9317, + "step": 3283 + }, + { + "epoch": 0.16379052369077307, + "grad_norm": 0.8083339603671907, + "learning_rate": 4.769189915768691e-05, + "loss": 0.9323, + "step": 3284 + }, + { + "epoch": 0.16384039900249375, + "grad_norm": 0.5383017226272084, + "learning_rate": 4.769020403754961e-05, + "loss": 0.9178, + "step": 3285 + }, + { + "epoch": 0.16389027431421446, + "grad_norm": 0.5549308552237283, + "learning_rate": 4.768850832532159e-05, + "loss": 0.9297, + "step": 3286 + }, + { + "epoch": 0.16394014962593517, + "grad_norm": 0.5538062289782478, + "learning_rate": 4.7686812021047087e-05, + "loss": 0.9511, + "step": 3287 + }, + { + "epoch": 0.16399002493765585, + "grad_norm": 0.46388297796511646, + "learning_rate": 4.768511512477039e-05, + "loss": 0.9352, + "step": 3288 + }, + { + "epoch": 0.16403990024937656, + "grad_norm": 0.5626417099444585, + "learning_rate": 4.7683417636535764e-05, + "loss": 0.952, + "step": 3289 + }, + { + "epoch": 0.16408977556109725, + "grad_norm": 0.4496570212723223, + "learning_rate": 4.76817195563875e-05, + "loss": 0.8974, + "step": 3290 + }, + { + "epoch": 0.16413965087281795, + "grad_norm": 0.5847140971308563, + "learning_rate": 4.768002088436993e-05, + "loss": 0.9608, + "step": 3291 + }, + { + "epoch": 0.16418952618453866, + "grad_norm": 0.8966677468473322, + "learning_rate": 4.7678321620527357e-05, + "loss": 0.9357, + "step": 3292 + }, + { + "epoch": 0.16423940149625935, + "grad_norm": 0.6893201661044053, + "learning_rate": 4.7676621764904125e-05, + "loss": 0.9111, + "step": 3293 + }, + { + "epoch": 0.16428927680798006, + "grad_norm": 0.6508857471775132, + "learning_rate": 4.76749213175446e-05, + "loss": 0.9531, + "step": 3294 + }, + { + "epoch": 0.16433915211970074, + "grad_norm": 0.6073270268377484, + "learning_rate": 4.7673220278493155e-05, + "loss": 0.9649, + "step": 3295 + }, + { + "epoch": 0.16438902743142145, + "grad_norm": 0.4916311022379739, + "learning_rate": 4.7671518647794164e-05, + "loss": 0.9315, + "step": 3296 + }, + { + "epoch": 0.16443890274314216, + "grad_norm": 0.4236976058921007, + "learning_rate": 4.766981642549205e-05, + "loss": 0.9456, + "step": 3297 + }, + { + "epoch": 0.16448877805486284, + "grad_norm": 0.5121366187875154, + "learning_rate": 4.766811361163123e-05, + "loss": 0.9331, + "step": 3298 + }, + { + "epoch": 0.16453865336658355, + "grad_norm": 0.6681840050934793, + "learning_rate": 4.7666410206256116e-05, + "loss": 0.92, + "step": 3299 + }, + { + "epoch": 0.16458852867830423, + "grad_norm": 0.5205640208290819, + "learning_rate": 4.766470620941118e-05, + "loss": 0.9311, + "step": 3300 + }, + { + "epoch": 0.16463840399002494, + "grad_norm": 0.6416480547926283, + "learning_rate": 4.7663001621140865e-05, + "loss": 0.956, + "step": 3301 + }, + { + "epoch": 0.16468827930174565, + "grad_norm": 0.47523992544589116, + "learning_rate": 4.766129644148968e-05, + "loss": 0.9095, + "step": 3302 + }, + { + "epoch": 0.16473815461346633, + "grad_norm": 0.46518189676171123, + "learning_rate": 4.76595906705021e-05, + "loss": 0.9223, + "step": 3303 + }, + { + "epoch": 0.16478802992518704, + "grad_norm": 0.5736881616721, + "learning_rate": 4.7657884308222644e-05, + "loss": 0.9333, + "step": 3304 + }, + { + "epoch": 0.16483790523690772, + "grad_norm": 0.5706664494948808, + "learning_rate": 4.765617735469584e-05, + "loss": 0.9223, + "step": 3305 + }, + { + "epoch": 0.16488778054862843, + "grad_norm": 3.18281823105167, + "learning_rate": 4.765446980996622e-05, + "loss": 0.9457, + "step": 3306 + }, + { + "epoch": 0.1649376558603491, + "grad_norm": 0.7495961093050653, + "learning_rate": 4.765276167407835e-05, + "loss": 0.9503, + "step": 3307 + }, + { + "epoch": 0.16498753117206982, + "grad_norm": 0.5360285780132383, + "learning_rate": 4.765105294707681e-05, + "loss": 0.922, + "step": 3308 + }, + { + "epoch": 0.16503740648379053, + "grad_norm": 0.45901538546496223, + "learning_rate": 4.764934362900618e-05, + "loss": 0.9391, + "step": 3309 + }, + { + "epoch": 0.16508728179551121, + "grad_norm": 0.5825194368749553, + "learning_rate": 4.764763371991106e-05, + "loss": 0.9436, + "step": 3310 + }, + { + "epoch": 0.16513715710723192, + "grad_norm": 0.48164345839739786, + "learning_rate": 4.764592321983607e-05, + "loss": 0.9216, + "step": 3311 + }, + { + "epoch": 0.1651870324189526, + "grad_norm": 0.4407815083555288, + "learning_rate": 4.764421212882586e-05, + "loss": 0.9051, + "step": 3312 + }, + { + "epoch": 0.16523690773067332, + "grad_norm": 0.5548019249217405, + "learning_rate": 4.7642500446925066e-05, + "loss": 0.9354, + "step": 3313 + }, + { + "epoch": 0.16528678304239403, + "grad_norm": 0.4418716190944449, + "learning_rate": 4.764078817417835e-05, + "loss": 0.9474, + "step": 3314 + }, + { + "epoch": 0.1653366583541147, + "grad_norm": 0.6552903481335496, + "learning_rate": 4.76390753106304e-05, + "loss": 0.9662, + "step": 3315 + }, + { + "epoch": 0.16538653366583542, + "grad_norm": 0.529413317985648, + "learning_rate": 4.7637361856325914e-05, + "loss": 0.893, + "step": 3316 + }, + { + "epoch": 0.1654364089775561, + "grad_norm": 0.5926810140722645, + "learning_rate": 4.76356478113096e-05, + "loss": 0.9331, + "step": 3317 + }, + { + "epoch": 0.1654862842892768, + "grad_norm": 0.4391960697861595, + "learning_rate": 4.763393317562619e-05, + "loss": 0.8919, + "step": 3318 + }, + { + "epoch": 0.16553615960099752, + "grad_norm": 0.48664390735474294, + "learning_rate": 4.763221794932042e-05, + "loss": 0.9191, + "step": 3319 + }, + { + "epoch": 0.1655860349127182, + "grad_norm": 0.5529949970991553, + "learning_rate": 4.7630502132437047e-05, + "loss": 0.9096, + "step": 3320 + }, + { + "epoch": 0.1656359102244389, + "grad_norm": 0.5438150702675104, + "learning_rate": 4.7628785725020866e-05, + "loss": 0.9394, + "step": 3321 + }, + { + "epoch": 0.1656857855361596, + "grad_norm": 0.4930079712199477, + "learning_rate": 4.762706872711663e-05, + "loss": 0.9006, + "step": 3322 + }, + { + "epoch": 0.1657356608478803, + "grad_norm": 0.6176470408243806, + "learning_rate": 4.762535113876917e-05, + "loss": 0.9713, + "step": 3323 + }, + { + "epoch": 0.165785536159601, + "grad_norm": 0.43245726303412696, + "learning_rate": 4.7623632960023304e-05, + "loss": 0.9335, + "step": 3324 + }, + { + "epoch": 0.1658354114713217, + "grad_norm": 0.6461813329612942, + "learning_rate": 4.762191419092385e-05, + "loss": 0.9752, + "step": 3325 + }, + { + "epoch": 0.1658852867830424, + "grad_norm": 0.4753929171928992, + "learning_rate": 4.7620194831515675e-05, + "loss": 0.9566, + "step": 3326 + }, + { + "epoch": 0.16593516209476308, + "grad_norm": 0.5719171297020379, + "learning_rate": 4.761847488184364e-05, + "loss": 0.9169, + "step": 3327 + }, + { + "epoch": 0.1659850374064838, + "grad_norm": 0.42533561884549786, + "learning_rate": 4.761675434195262e-05, + "loss": 0.9452, + "step": 3328 + }, + { + "epoch": 0.1660349127182045, + "grad_norm": 0.8747396157572799, + "learning_rate": 4.761503321188752e-05, + "loss": 0.9745, + "step": 3329 + }, + { + "epoch": 0.16608478802992518, + "grad_norm": 0.6503802715460676, + "learning_rate": 4.761331149169325e-05, + "loss": 0.922, + "step": 3330 + }, + { + "epoch": 0.1661346633416459, + "grad_norm": 0.5286251094206026, + "learning_rate": 4.761158918141474e-05, + "loss": 0.9824, + "step": 3331 + }, + { + "epoch": 0.16618453865336658, + "grad_norm": 0.5097273097587297, + "learning_rate": 4.760986628109693e-05, + "loss": 0.9451, + "step": 3332 + }, + { + "epoch": 0.16623441396508729, + "grad_norm": 0.7403164115789995, + "learning_rate": 4.760814279078477e-05, + "loss": 0.9517, + "step": 3333 + }, + { + "epoch": 0.16628428927680797, + "grad_norm": 0.6123241428037411, + "learning_rate": 4.760641871052325e-05, + "loss": 0.9661, + "step": 3334 + }, + { + "epoch": 0.16633416458852868, + "grad_norm": 0.5644321158097727, + "learning_rate": 4.760469404035735e-05, + "loss": 0.978, + "step": 3335 + }, + { + "epoch": 0.1663840399002494, + "grad_norm": 0.6667006224100506, + "learning_rate": 4.760296878033208e-05, + "loss": 0.882, + "step": 3336 + }, + { + "epoch": 0.16643391521197007, + "grad_norm": 0.5401346477678971, + "learning_rate": 4.760124293049245e-05, + "loss": 0.9871, + "step": 3337 + }, + { + "epoch": 0.16648379052369078, + "grad_norm": 0.6412556242639342, + "learning_rate": 4.75995164908835e-05, + "loss": 0.9898, + "step": 3338 + }, + { + "epoch": 0.16653366583541146, + "grad_norm": 0.6702740197545415, + "learning_rate": 4.759778946155028e-05, + "loss": 0.9124, + "step": 3339 + }, + { + "epoch": 0.16658354114713217, + "grad_norm": 0.5926916325802675, + "learning_rate": 4.759606184253786e-05, + "loss": 0.9186, + "step": 3340 + }, + { + "epoch": 0.16663341645885288, + "grad_norm": 2.9437075495307976, + "learning_rate": 4.759433363389132e-05, + "loss": 0.8973, + "step": 3341 + }, + { + "epoch": 0.16668329177057356, + "grad_norm": 0.5593512018446147, + "learning_rate": 4.759260483565575e-05, + "loss": 0.9482, + "step": 3342 + }, + { + "epoch": 0.16673316708229427, + "grad_norm": 0.861387349357807, + "learning_rate": 4.759087544787627e-05, + "loss": 0.913, + "step": 3343 + }, + { + "epoch": 0.16678304239401495, + "grad_norm": 0.6518270443501254, + "learning_rate": 4.758914547059801e-05, + "loss": 0.9857, + "step": 3344 + }, + { + "epoch": 0.16683291770573566, + "grad_norm": 0.47268427125674706, + "learning_rate": 4.758741490386611e-05, + "loss": 0.929, + "step": 3345 + }, + { + "epoch": 0.16688279301745637, + "grad_norm": 0.5524121594964004, + "learning_rate": 4.7585683747725715e-05, + "loss": 0.966, + "step": 3346 + }, + { + "epoch": 0.16693266832917705, + "grad_norm": 0.49015136477315946, + "learning_rate": 4.7583952002222024e-05, + "loss": 0.9174, + "step": 3347 + }, + { + "epoch": 0.16698254364089776, + "grad_norm": 0.5196889066746513, + "learning_rate": 4.758221966740021e-05, + "loss": 0.9142, + "step": 3348 + }, + { + "epoch": 0.16703241895261844, + "grad_norm": 0.5491671407832757, + "learning_rate": 4.7580486743305473e-05, + "loss": 0.9723, + "step": 3349 + }, + { + "epoch": 0.16708229426433915, + "grad_norm": 0.4725025903901495, + "learning_rate": 4.757875322998305e-05, + "loss": 0.9206, + "step": 3350 + }, + { + "epoch": 0.16713216957605986, + "grad_norm": 0.45494610117577144, + "learning_rate": 4.757701912747816e-05, + "loss": 0.9546, + "step": 3351 + }, + { + "epoch": 0.16718204488778055, + "grad_norm": 0.4411264873317409, + "learning_rate": 4.757528443583606e-05, + "loss": 0.9808, + "step": 3352 + }, + { + "epoch": 0.16723192019950125, + "grad_norm": 0.47488575956742995, + "learning_rate": 4.7573549155102014e-05, + "loss": 0.9308, + "step": 3353 + }, + { + "epoch": 0.16728179551122194, + "grad_norm": 0.5330347889850454, + "learning_rate": 4.7571813285321306e-05, + "loss": 0.9535, + "step": 3354 + }, + { + "epoch": 0.16733167082294265, + "grad_norm": 1.5319131615569326, + "learning_rate": 4.757007682653924e-05, + "loss": 0.9156, + "step": 3355 + }, + { + "epoch": 0.16738154613466333, + "grad_norm": 0.5440688550209798, + "learning_rate": 4.756833977880112e-05, + "loss": 0.9456, + "step": 3356 + }, + { + "epoch": 0.16743142144638404, + "grad_norm": 0.6063155346093674, + "learning_rate": 4.756660214215226e-05, + "loss": 1.0088, + "step": 3357 + }, + { + "epoch": 0.16748129675810475, + "grad_norm": 0.7502969645532928, + "learning_rate": 4.756486391663803e-05, + "loss": 0.9559, + "step": 3358 + }, + { + "epoch": 0.16753117206982543, + "grad_norm": 0.955503795493233, + "learning_rate": 4.7563125102303766e-05, + "loss": 0.9381, + "step": 3359 + }, + { + "epoch": 0.16758104738154614, + "grad_norm": 0.7724249964821869, + "learning_rate": 4.756138569919486e-05, + "loss": 0.9465, + "step": 3360 + }, + { + "epoch": 0.16763092269326682, + "grad_norm": 0.5387836583042938, + "learning_rate": 4.755964570735668e-05, + "loss": 0.9296, + "step": 3361 + }, + { + "epoch": 0.16768079800498753, + "grad_norm": 0.8139503332049017, + "learning_rate": 4.7557905126834654e-05, + "loss": 0.9657, + "step": 3362 + }, + { + "epoch": 0.16773067331670824, + "grad_norm": 0.6520874780204411, + "learning_rate": 4.755616395767418e-05, + "loss": 0.9392, + "step": 3363 + }, + { + "epoch": 0.16778054862842892, + "grad_norm": 0.7330975819694993, + "learning_rate": 4.755442219992071e-05, + "loss": 0.9422, + "step": 3364 + }, + { + "epoch": 0.16783042394014963, + "grad_norm": 0.9492347314215435, + "learning_rate": 4.755267985361968e-05, + "loss": 0.9756, + "step": 3365 + }, + { + "epoch": 0.1678802992518703, + "grad_norm": 0.9572958406450998, + "learning_rate": 4.755093691881656e-05, + "loss": 0.9684, + "step": 3366 + }, + { + "epoch": 0.16793017456359102, + "grad_norm": 12.131516468827243, + "learning_rate": 4.754919339555684e-05, + "loss": 0.9878, + "step": 3367 + }, + { + "epoch": 0.16798004987531173, + "grad_norm": 0.7884691952475206, + "learning_rate": 4.754744928388601e-05, + "loss": 0.9248, + "step": 3368 + }, + { + "epoch": 0.16802992518703241, + "grad_norm": 1.5256489173598438, + "learning_rate": 4.7545704583849584e-05, + "loss": 0.9823, + "step": 3369 + }, + { + "epoch": 0.16807980049875312, + "grad_norm": 1.1792495648572325, + "learning_rate": 4.754395929549308e-05, + "loss": 0.9314, + "step": 3370 + }, + { + "epoch": 0.1681296758104738, + "grad_norm": 0.5885568516130234, + "learning_rate": 4.754221341886205e-05, + "loss": 0.9154, + "step": 3371 + }, + { + "epoch": 0.16817955112219451, + "grad_norm": 0.6136742424292013, + "learning_rate": 4.754046695400205e-05, + "loss": 0.9578, + "step": 3372 + }, + { + "epoch": 0.16822942643391522, + "grad_norm": 0.4276755107863189, + "learning_rate": 4.753871990095865e-05, + "loss": 0.9523, + "step": 3373 + }, + { + "epoch": 0.1682793017456359, + "grad_norm": 0.5304036608819587, + "learning_rate": 4.753697225977745e-05, + "loss": 1.03, + "step": 3374 + }, + { + "epoch": 0.16832917705735662, + "grad_norm": 0.531647646845061, + "learning_rate": 4.753522403050403e-05, + "loss": 0.9551, + "step": 3375 + }, + { + "epoch": 0.1683790523690773, + "grad_norm": 0.5574637056854513, + "learning_rate": 4.753347521318404e-05, + "loss": 0.9571, + "step": 3376 + }, + { + "epoch": 0.168428927680798, + "grad_norm": 0.5441825118759638, + "learning_rate": 4.7531725807863094e-05, + "loss": 0.9267, + "step": 3377 + }, + { + "epoch": 0.16847880299251872, + "grad_norm": 0.6070217037952553, + "learning_rate": 4.752997581458684e-05, + "loss": 0.9779, + "step": 3378 + }, + { + "epoch": 0.1685286783042394, + "grad_norm": 0.5907759821744986, + "learning_rate": 4.752822523340096e-05, + "loss": 0.9619, + "step": 3379 + }, + { + "epoch": 0.1685785536159601, + "grad_norm": 0.5243970219984602, + "learning_rate": 4.7526474064351114e-05, + "loss": 0.9089, + "step": 3380 + }, + { + "epoch": 0.1686284289276808, + "grad_norm": 0.4523355083707051, + "learning_rate": 4.752472230748302e-05, + "loss": 0.9417, + "step": 3381 + }, + { + "epoch": 0.1686783042394015, + "grad_norm": 0.4730537769620766, + "learning_rate": 4.752296996284237e-05, + "loss": 0.9759, + "step": 3382 + }, + { + "epoch": 0.16872817955112218, + "grad_norm": 0.9592290070811603, + "learning_rate": 4.75212170304749e-05, + "loss": 0.9529, + "step": 3383 + }, + { + "epoch": 0.1687780548628429, + "grad_norm": 0.5269260499061532, + "learning_rate": 4.7519463510426354e-05, + "loss": 0.9278, + "step": 3384 + }, + { + "epoch": 0.1688279301745636, + "grad_norm": 0.646005506101507, + "learning_rate": 4.751770940274248e-05, + "loss": 0.9116, + "step": 3385 + }, + { + "epoch": 0.16887780548628428, + "grad_norm": 0.5466896401390569, + "learning_rate": 4.751595470746907e-05, + "loss": 0.9227, + "step": 3386 + }, + { + "epoch": 0.168927680798005, + "grad_norm": 0.47134842482370065, + "learning_rate": 4.751419942465188e-05, + "loss": 0.9756, + "step": 3387 + }, + { + "epoch": 0.16897755610972567, + "grad_norm": 0.491280588562218, + "learning_rate": 4.751244355433675e-05, + "loss": 0.9219, + "step": 3388 + }, + { + "epoch": 0.16902743142144638, + "grad_norm": 0.5137643092696738, + "learning_rate": 4.751068709656947e-05, + "loss": 0.9591, + "step": 3389 + }, + { + "epoch": 0.1690773067331671, + "grad_norm": 0.5927594824811859, + "learning_rate": 4.7508930051395896e-05, + "loss": 0.9294, + "step": 3390 + }, + { + "epoch": 0.16912718204488777, + "grad_norm": 0.5585283291657286, + "learning_rate": 4.750717241886185e-05, + "loss": 0.9295, + "step": 3391 + }, + { + "epoch": 0.16917705735660848, + "grad_norm": 0.5921489199523817, + "learning_rate": 4.750541419901322e-05, + "loss": 0.9568, + "step": 3392 + }, + { + "epoch": 0.16922693266832917, + "grad_norm": 0.5804692817528486, + "learning_rate": 4.750365539189589e-05, + "loss": 0.9427, + "step": 3393 + }, + { + "epoch": 0.16927680798004988, + "grad_norm": 0.7250972003395234, + "learning_rate": 4.750189599755572e-05, + "loss": 0.9469, + "step": 3394 + }, + { + "epoch": 0.16932668329177059, + "grad_norm": 0.5307081990182579, + "learning_rate": 4.7500136016038665e-05, + "loss": 0.9009, + "step": 3395 + }, + { + "epoch": 0.16937655860349127, + "grad_norm": 0.6209799949743506, + "learning_rate": 4.7498375447390616e-05, + "loss": 0.9511, + "step": 3396 + }, + { + "epoch": 0.16942643391521198, + "grad_norm": 0.4821514332934894, + "learning_rate": 4.749661429165754e-05, + "loss": 0.9815, + "step": 3397 + }, + { + "epoch": 0.16947630922693266, + "grad_norm": 0.5228632050145159, + "learning_rate": 4.749485254888538e-05, + "loss": 0.9478, + "step": 3398 + }, + { + "epoch": 0.16952618453865337, + "grad_norm": 0.4527424306084498, + "learning_rate": 4.7493090219120105e-05, + "loss": 0.9466, + "step": 3399 + }, + { + "epoch": 0.16957605985037408, + "grad_norm": 0.8066481705155716, + "learning_rate": 4.749132730240771e-05, + "loss": 0.9767, + "step": 3400 + }, + { + "epoch": 0.16962593516209476, + "grad_norm": 0.5269386350533537, + "learning_rate": 4.7489563798794196e-05, + "loss": 0.9709, + "step": 3401 + }, + { + "epoch": 0.16967581047381547, + "grad_norm": 0.5355309581330744, + "learning_rate": 4.7487799708325576e-05, + "loss": 0.9573, + "step": 3402 + }, + { + "epoch": 0.16972568578553615, + "grad_norm": 0.5812549718570311, + "learning_rate": 4.748603503104789e-05, + "loss": 0.9053, + "step": 3403 + }, + { + "epoch": 0.16977556109725686, + "grad_norm": 0.6658531959346723, + "learning_rate": 4.748426976700718e-05, + "loss": 0.8914, + "step": 3404 + }, + { + "epoch": 0.16982543640897754, + "grad_norm": 0.5687551258865949, + "learning_rate": 4.748250391624952e-05, + "loss": 0.9413, + "step": 3405 + }, + { + "epoch": 0.16987531172069825, + "grad_norm": 0.5103647799477228, + "learning_rate": 4.748073747882097e-05, + "loss": 0.9356, + "step": 3406 + }, + { + "epoch": 0.16992518703241896, + "grad_norm": 0.5043124602949316, + "learning_rate": 4.747897045476765e-05, + "loss": 0.9449, + "step": 3407 + }, + { + "epoch": 0.16997506234413964, + "grad_norm": 0.5412306197226712, + "learning_rate": 4.7477202844135646e-05, + "loss": 0.9323, + "step": 3408 + }, + { + "epoch": 0.17002493765586035, + "grad_norm": 0.5397927056190058, + "learning_rate": 4.7475434646971106e-05, + "loss": 0.9328, + "step": 3409 + }, + { + "epoch": 0.17007481296758103, + "grad_norm": 0.45662845485138503, + "learning_rate": 4.747366586332015e-05, + "loss": 0.9672, + "step": 3410 + }, + { + "epoch": 0.17012468827930174, + "grad_norm": 0.49955510798309927, + "learning_rate": 4.747189649322894e-05, + "loss": 0.927, + "step": 3411 + }, + { + "epoch": 0.17017456359102245, + "grad_norm": 0.48788562838675037, + "learning_rate": 4.7470126536743646e-05, + "loss": 0.9667, + "step": 3412 + }, + { + "epoch": 0.17022443890274314, + "grad_norm": 0.5428765472832617, + "learning_rate": 4.7468355993910464e-05, + "loss": 0.9478, + "step": 3413 + }, + { + "epoch": 0.17027431421446385, + "grad_norm": 0.46756776030577135, + "learning_rate": 4.746658486477558e-05, + "loss": 0.8775, + "step": 3414 + }, + { + "epoch": 0.17032418952618453, + "grad_norm": 0.5770765096275482, + "learning_rate": 4.7464813149385226e-05, + "loss": 0.949, + "step": 3415 + }, + { + "epoch": 0.17037406483790524, + "grad_norm": 0.627734524967987, + "learning_rate": 4.746304084778562e-05, + "loss": 0.9082, + "step": 3416 + }, + { + "epoch": 0.17042394014962595, + "grad_norm": 0.4770774164170825, + "learning_rate": 4.746126796002302e-05, + "loss": 0.9456, + "step": 3417 + }, + { + "epoch": 0.17047381546134663, + "grad_norm": 0.4568904410571068, + "learning_rate": 4.745949448614368e-05, + "loss": 0.937, + "step": 3418 + }, + { + "epoch": 0.17052369077306734, + "grad_norm": 0.6205716169172742, + "learning_rate": 4.745772042619388e-05, + "loss": 0.9135, + "step": 3419 + }, + { + "epoch": 0.17057356608478802, + "grad_norm": 0.5426109895445447, + "learning_rate": 4.745594578021993e-05, + "loss": 0.9751, + "step": 3420 + }, + { + "epoch": 0.17062344139650873, + "grad_norm": 0.47995173218947906, + "learning_rate": 4.745417054826812e-05, + "loss": 0.9534, + "step": 3421 + }, + { + "epoch": 0.17067331670822944, + "grad_norm": 0.46797977688718967, + "learning_rate": 4.745239473038478e-05, + "loss": 0.9025, + "step": 3422 + }, + { + "epoch": 0.17072319201995012, + "grad_norm": 0.5485127795515742, + "learning_rate": 4.7450618326616235e-05, + "loss": 0.9664, + "step": 3423 + }, + { + "epoch": 0.17077306733167083, + "grad_norm": 0.5239022311391064, + "learning_rate": 4.744884133700887e-05, + "loss": 0.9489, + "step": 3424 + }, + { + "epoch": 0.1708229426433915, + "grad_norm": 0.4480264078000952, + "learning_rate": 4.7447063761609025e-05, + "loss": 0.9405, + "step": 3425 + }, + { + "epoch": 0.17087281795511222, + "grad_norm": 1.2790026555254292, + "learning_rate": 4.7445285600463096e-05, + "loss": 0.9632, + "step": 3426 + }, + { + "epoch": 0.17092269326683293, + "grad_norm": 0.4060905748881268, + "learning_rate": 4.7443506853617494e-05, + "loss": 0.9163, + "step": 3427 + }, + { + "epoch": 0.1709725685785536, + "grad_norm": 0.5376712410372271, + "learning_rate": 4.744172752111862e-05, + "loss": 0.9539, + "step": 3428 + }, + { + "epoch": 0.17102244389027432, + "grad_norm": 1.5670617224833723, + "learning_rate": 4.743994760301291e-05, + "loss": 0.9292, + "step": 3429 + }, + { + "epoch": 0.171072319201995, + "grad_norm": 0.5638057413809532, + "learning_rate": 4.7438167099346806e-05, + "loss": 0.9265, + "step": 3430 + }, + { + "epoch": 0.17112219451371571, + "grad_norm": 0.6412567638414428, + "learning_rate": 4.743638601016678e-05, + "loss": 0.9809, + "step": 3431 + }, + { + "epoch": 0.1711720698254364, + "grad_norm": 0.5598680386431575, + "learning_rate": 4.7434604335519304e-05, + "loss": 0.9514, + "step": 3432 + }, + { + "epoch": 0.1712219451371571, + "grad_norm": 0.4827701432577284, + "learning_rate": 4.743282207545086e-05, + "loss": 0.9322, + "step": 3433 + }, + { + "epoch": 0.17127182044887782, + "grad_norm": 0.6058400089314634, + "learning_rate": 4.743103923000797e-05, + "loss": 0.9212, + "step": 3434 + }, + { + "epoch": 0.1713216957605985, + "grad_norm": 0.46262242777963936, + "learning_rate": 4.742925579923715e-05, + "loss": 0.9468, + "step": 3435 + }, + { + "epoch": 0.1713715710723192, + "grad_norm": 0.49224093347336556, + "learning_rate": 4.742747178318493e-05, + "loss": 0.9535, + "step": 3436 + }, + { + "epoch": 0.1714214463840399, + "grad_norm": 0.461182420934645, + "learning_rate": 4.742568718189788e-05, + "loss": 0.9679, + "step": 3437 + }, + { + "epoch": 0.1714713216957606, + "grad_norm": 0.5611037621073236, + "learning_rate": 4.7423901995422557e-05, + "loss": 0.9857, + "step": 3438 + }, + { + "epoch": 0.1715211970074813, + "grad_norm": 0.43602423684761105, + "learning_rate": 4.7422116223805544e-05, + "loss": 0.9371, + "step": 3439 + }, + { + "epoch": 0.171571072319202, + "grad_norm": 0.5219135258734832, + "learning_rate": 4.742032986709345e-05, + "loss": 0.9854, + "step": 3440 + }, + { + "epoch": 0.1716209476309227, + "grad_norm": 0.5247239434060239, + "learning_rate": 4.741854292533288e-05, + "loss": 0.9645, + "step": 3441 + }, + { + "epoch": 0.17167082294264338, + "grad_norm": 0.5639404326580436, + "learning_rate": 4.741675539857046e-05, + "loss": 0.9732, + "step": 3442 + }, + { + "epoch": 0.1717206982543641, + "grad_norm": 0.560678195105035, + "learning_rate": 4.741496728685284e-05, + "loss": 0.9637, + "step": 3443 + }, + { + "epoch": 0.1717705735660848, + "grad_norm": 0.4625687897655851, + "learning_rate": 4.7413178590226695e-05, + "loss": 0.9243, + "step": 3444 + }, + { + "epoch": 0.17182044887780548, + "grad_norm": 0.4879847942903789, + "learning_rate": 4.7411389308738666e-05, + "loss": 0.9266, + "step": 3445 + }, + { + "epoch": 0.1718703241895262, + "grad_norm": 0.46552888941078024, + "learning_rate": 4.740959944243547e-05, + "loss": 0.9561, + "step": 3446 + }, + { + "epoch": 0.17192019950124687, + "grad_norm": 0.5007868063988059, + "learning_rate": 4.740780899136381e-05, + "loss": 0.9479, + "step": 3447 + }, + { + "epoch": 0.17197007481296758, + "grad_norm": 0.5694995985970202, + "learning_rate": 4.74060179555704e-05, + "loss": 0.8985, + "step": 3448 + }, + { + "epoch": 0.1720199501246883, + "grad_norm": 0.567872757854144, + "learning_rate": 4.740422633510198e-05, + "loss": 0.9566, + "step": 3449 + }, + { + "epoch": 0.17206982543640897, + "grad_norm": 0.4724037834009252, + "learning_rate": 4.7402434130005305e-05, + "loss": 0.9463, + "step": 3450 + }, + { + "epoch": 0.17211970074812968, + "grad_norm": 0.4731734698137713, + "learning_rate": 4.7400641340327125e-05, + "loss": 0.8743, + "step": 3451 + }, + { + "epoch": 0.17216957605985037, + "grad_norm": 0.3721853753259487, + "learning_rate": 4.7398847966114246e-05, + "loss": 0.9238, + "step": 3452 + }, + { + "epoch": 0.17221945137157108, + "grad_norm": 0.531501100972084, + "learning_rate": 4.7397054007413444e-05, + "loss": 0.9139, + "step": 3453 + }, + { + "epoch": 0.17226932668329176, + "grad_norm": 0.5451838926626947, + "learning_rate": 4.739525946427154e-05, + "loss": 0.929, + "step": 3454 + }, + { + "epoch": 0.17231920199501247, + "grad_norm": 0.4369720251748746, + "learning_rate": 4.739346433673538e-05, + "loss": 0.9375, + "step": 3455 + }, + { + "epoch": 0.17236907730673318, + "grad_norm": 0.4745678343063723, + "learning_rate": 4.739166862485178e-05, + "loss": 0.9588, + "step": 3456 + }, + { + "epoch": 0.17241895261845386, + "grad_norm": 0.5442555389331117, + "learning_rate": 4.73898723286676e-05, + "loss": 0.9711, + "step": 3457 + }, + { + "epoch": 0.17246882793017457, + "grad_norm": 0.46020443938091815, + "learning_rate": 4.7388075448229744e-05, + "loss": 0.9224, + "step": 3458 + }, + { + "epoch": 0.17251870324189525, + "grad_norm": 0.5005034721356577, + "learning_rate": 4.738627798358506e-05, + "loss": 0.9518, + "step": 3459 + }, + { + "epoch": 0.17256857855361596, + "grad_norm": 0.5035332958359123, + "learning_rate": 4.738447993478048e-05, + "loss": 0.926, + "step": 3460 + }, + { + "epoch": 0.17261845386533667, + "grad_norm": 0.52073297519499, + "learning_rate": 4.738268130186291e-05, + "loss": 0.9738, + "step": 3461 + }, + { + "epoch": 0.17266832917705735, + "grad_norm": 0.4786782315224472, + "learning_rate": 4.7380882084879284e-05, + "loss": 0.978, + "step": 3462 + }, + { + "epoch": 0.17271820448877806, + "grad_norm": 0.6878480763180038, + "learning_rate": 4.7379082283876566e-05, + "loss": 0.9293, + "step": 3463 + }, + { + "epoch": 0.17276807980049874, + "grad_norm": 0.4645881048307959, + "learning_rate": 4.737728189890171e-05, + "loss": 0.9608, + "step": 3464 + }, + { + "epoch": 0.17281795511221945, + "grad_norm": 0.46417636831555914, + "learning_rate": 4.737548093000168e-05, + "loss": 0.9316, + "step": 3465 + }, + { + "epoch": 0.17286783042394016, + "grad_norm": 0.49633047526596125, + "learning_rate": 4.737367937722351e-05, + "loss": 0.9264, + "step": 3466 + }, + { + "epoch": 0.17291770573566084, + "grad_norm": 0.4745874634330205, + "learning_rate": 4.737187724061418e-05, + "loss": 0.903, + "step": 3467 + }, + { + "epoch": 0.17296758104738155, + "grad_norm": 0.6896227765646423, + "learning_rate": 4.737007452022073e-05, + "loss": 0.9487, + "step": 3468 + }, + { + "epoch": 0.17301745635910223, + "grad_norm": 0.5070067923928593, + "learning_rate": 4.73682712160902e-05, + "loss": 0.9569, + "step": 3469 + }, + { + "epoch": 0.17306733167082294, + "grad_norm": 0.4353583878285219, + "learning_rate": 4.7366467328269636e-05, + "loss": 0.936, + "step": 3470 + }, + { + "epoch": 0.17311720698254365, + "grad_norm": 0.5475777155731804, + "learning_rate": 4.736466285680612e-05, + "loss": 0.9375, + "step": 3471 + }, + { + "epoch": 0.17316708229426434, + "grad_norm": 0.46972709314882777, + "learning_rate": 4.7362857801746726e-05, + "loss": 0.9596, + "step": 3472 + }, + { + "epoch": 0.17321695760598504, + "grad_norm": 0.4828847961846954, + "learning_rate": 4.7361052163138585e-05, + "loss": 0.9069, + "step": 3473 + }, + { + "epoch": 0.17326683291770573, + "grad_norm": 0.4405078356985549, + "learning_rate": 4.7359245941028785e-05, + "loss": 0.8728, + "step": 3474 + }, + { + "epoch": 0.17331670822942644, + "grad_norm": 0.47420921487337175, + "learning_rate": 4.735743913546446e-05, + "loss": 0.9198, + "step": 3475 + }, + { + "epoch": 0.17336658354114715, + "grad_norm": 0.4947166876104259, + "learning_rate": 4.735563174649278e-05, + "loss": 0.9017, + "step": 3476 + }, + { + "epoch": 0.17341645885286783, + "grad_norm": 0.5166117793904027, + "learning_rate": 4.73538237741609e-05, + "loss": 0.8952, + "step": 3477 + }, + { + "epoch": 0.17346633416458854, + "grad_norm": 0.4649336374853929, + "learning_rate": 4.735201521851598e-05, + "loss": 0.9254, + "step": 3478 + }, + { + "epoch": 0.17351620947630922, + "grad_norm": 0.4447539436386934, + "learning_rate": 4.735020607960523e-05, + "loss": 0.9066, + "step": 3479 + }, + { + "epoch": 0.17356608478802993, + "grad_norm": 0.7401286658409589, + "learning_rate": 4.7348396357475854e-05, + "loss": 0.9395, + "step": 3480 + }, + { + "epoch": 0.1736159600997506, + "grad_norm": 0.5514183645754844, + "learning_rate": 4.734658605217508e-05, + "loss": 0.902, + "step": 3481 + }, + { + "epoch": 0.17366583541147132, + "grad_norm": 0.49151365993616597, + "learning_rate": 4.7344775163750143e-05, + "loss": 0.9234, + "step": 3482 + }, + { + "epoch": 0.17371571072319203, + "grad_norm": 0.4715502234064768, + "learning_rate": 4.73429636922483e-05, + "loss": 0.97, + "step": 3483 + }, + { + "epoch": 0.1737655860349127, + "grad_norm": 0.47185203051155933, + "learning_rate": 4.7341151637716814e-05, + "loss": 0.9084, + "step": 3484 + }, + { + "epoch": 0.17381546134663342, + "grad_norm": 0.47797962204861394, + "learning_rate": 4.733933900020298e-05, + "loss": 0.8984, + "step": 3485 + }, + { + "epoch": 0.1738653366583541, + "grad_norm": 0.5807790551476539, + "learning_rate": 4.733752577975409e-05, + "loss": 0.9059, + "step": 3486 + }, + { + "epoch": 0.1739152119700748, + "grad_norm": 0.4712789787696541, + "learning_rate": 4.733571197641746e-05, + "loss": 0.9017, + "step": 3487 + }, + { + "epoch": 0.17396508728179552, + "grad_norm": 0.46875939406722844, + "learning_rate": 4.733389759024042e-05, + "loss": 0.967, + "step": 3488 + }, + { + "epoch": 0.1740149625935162, + "grad_norm": 0.43695608766454447, + "learning_rate": 4.7332082621270326e-05, + "loss": 0.9091, + "step": 3489 + }, + { + "epoch": 0.1740648379052369, + "grad_norm": 0.4913888170905293, + "learning_rate": 4.733026706955452e-05, + "loss": 0.9584, + "step": 3490 + }, + { + "epoch": 0.1741147132169576, + "grad_norm": 0.4087531079692387, + "learning_rate": 4.7328450935140395e-05, + "loss": 0.9162, + "step": 3491 + }, + { + "epoch": 0.1741645885286783, + "grad_norm": 0.49033641295053837, + "learning_rate": 4.7326634218075336e-05, + "loss": 1.0, + "step": 3492 + }, + { + "epoch": 0.17421446384039901, + "grad_norm": 0.5086106659667475, + "learning_rate": 4.7324816918406744e-05, + "loss": 0.972, + "step": 3493 + }, + { + "epoch": 0.1742643391521197, + "grad_norm": 0.5454264786605136, + "learning_rate": 4.7322999036182054e-05, + "loss": 0.9539, + "step": 3494 + }, + { + "epoch": 0.1743142144638404, + "grad_norm": 0.4742720401861537, + "learning_rate": 4.7321180571448686e-05, + "loss": 0.9947, + "step": 3495 + }, + { + "epoch": 0.1743640897755611, + "grad_norm": 0.4831248491314462, + "learning_rate": 4.731936152425411e-05, + "loss": 0.9367, + "step": 3496 + }, + { + "epoch": 0.1744139650872818, + "grad_norm": 0.5109475391981504, + "learning_rate": 4.731754189464578e-05, + "loss": 0.9507, + "step": 3497 + }, + { + "epoch": 0.1744638403990025, + "grad_norm": 0.44761017943016623, + "learning_rate": 4.731572168267118e-05, + "loss": 0.9283, + "step": 3498 + }, + { + "epoch": 0.1745137157107232, + "grad_norm": 0.508265892748976, + "learning_rate": 4.731390088837781e-05, + "loss": 0.9403, + "step": 3499 + }, + { + "epoch": 0.1745635910224439, + "grad_norm": 0.4921701193089659, + "learning_rate": 4.731207951181319e-05, + "loss": 0.923, + "step": 3500 + }, + { + "epoch": 0.17461346633416458, + "grad_norm": 0.5492546861869407, + "learning_rate": 4.7310257553024825e-05, + "loss": 0.9458, + "step": 3501 + }, + { + "epoch": 0.1746633416458853, + "grad_norm": 0.620981833651238, + "learning_rate": 4.7308435012060284e-05, + "loss": 0.9169, + "step": 3502 + }, + { + "epoch": 0.17471321695760597, + "grad_norm": 0.6220371545680121, + "learning_rate": 4.730661188896712e-05, + "loss": 0.9631, + "step": 3503 + }, + { + "epoch": 0.17476309226932668, + "grad_norm": 0.4441392628749588, + "learning_rate": 4.730478818379289e-05, + "loss": 0.9845, + "step": 3504 + }, + { + "epoch": 0.1748129675810474, + "grad_norm": 0.6278544919533369, + "learning_rate": 4.7302963896585204e-05, + "loss": 0.9703, + "step": 3505 + }, + { + "epoch": 0.17486284289276807, + "grad_norm": 0.4311683804586257, + "learning_rate": 4.7301139027391657e-05, + "loss": 0.9219, + "step": 3506 + }, + { + "epoch": 0.17491271820448878, + "grad_norm": 0.4456496873184498, + "learning_rate": 4.7299313576259864e-05, + "loss": 0.9366, + "step": 3507 + }, + { + "epoch": 0.17496259351620946, + "grad_norm": 0.4858661822906994, + "learning_rate": 4.729748754323746e-05, + "loss": 0.8982, + "step": 3508 + }, + { + "epoch": 0.17501246882793017, + "grad_norm": 0.4141217128327441, + "learning_rate": 4.7295660928372104e-05, + "loss": 0.9123, + "step": 3509 + }, + { + "epoch": 0.17506234413965088, + "grad_norm": 0.47957557006539886, + "learning_rate": 4.729383373171146e-05, + "loss": 0.9472, + "step": 3510 + }, + { + "epoch": 0.17511221945137156, + "grad_norm": 0.4609404221098705, + "learning_rate": 4.7292005953303186e-05, + "loss": 0.9131, + "step": 3511 + }, + { + "epoch": 0.17516209476309227, + "grad_norm": 0.4834685578254818, + "learning_rate": 4.729017759319501e-05, + "loss": 0.888, + "step": 3512 + }, + { + "epoch": 0.17521197007481296, + "grad_norm": 0.4661197714868243, + "learning_rate": 4.728834865143461e-05, + "loss": 0.9617, + "step": 3513 + }, + { + "epoch": 0.17526184538653367, + "grad_norm": 0.9017817588755981, + "learning_rate": 4.7286519128069736e-05, + "loss": 0.9252, + "step": 3514 + }, + { + "epoch": 0.17531172069825438, + "grad_norm": 0.4832711851108824, + "learning_rate": 4.728468902314812e-05, + "loss": 0.9102, + "step": 3515 + }, + { + "epoch": 0.17536159600997506, + "grad_norm": 0.44254089219375625, + "learning_rate": 4.7282858336717504e-05, + "loss": 0.9661, + "step": 3516 + }, + { + "epoch": 0.17541147132169577, + "grad_norm": 0.43397464403320485, + "learning_rate": 4.7281027068825687e-05, + "loss": 0.9209, + "step": 3517 + }, + { + "epoch": 0.17546134663341645, + "grad_norm": 0.45926280429329974, + "learning_rate": 4.727919521952043e-05, + "loss": 0.9658, + "step": 3518 + }, + { + "epoch": 0.17551122194513716, + "grad_norm": 0.43827277125263503, + "learning_rate": 4.7277362788849555e-05, + "loss": 0.8984, + "step": 3519 + }, + { + "epoch": 0.17556109725685787, + "grad_norm": 0.45296086968476496, + "learning_rate": 4.7275529776860866e-05, + "loss": 0.9462, + "step": 3520 + }, + { + "epoch": 0.17561097256857855, + "grad_norm": 0.48689815339917547, + "learning_rate": 4.7273696183602194e-05, + "loss": 0.9625, + "step": 3521 + }, + { + "epoch": 0.17566084788029926, + "grad_norm": 0.4853590048176056, + "learning_rate": 4.727186200912139e-05, + "loss": 0.922, + "step": 3522 + }, + { + "epoch": 0.17571072319201994, + "grad_norm": 0.47369101294453897, + "learning_rate": 4.727002725346631e-05, + "loss": 0.9325, + "step": 3523 + }, + { + "epoch": 0.17576059850374065, + "grad_norm": 0.4608713827738709, + "learning_rate": 4.726819191668484e-05, + "loss": 1.0191, + "step": 3524 + }, + { + "epoch": 0.17581047381546136, + "grad_norm": 0.47140227203844753, + "learning_rate": 4.7266355998824865e-05, + "loss": 0.9101, + "step": 3525 + }, + { + "epoch": 0.17586034912718204, + "grad_norm": 0.48805557013797507, + "learning_rate": 4.72645194999343e-05, + "loss": 0.9449, + "step": 3526 + }, + { + "epoch": 0.17591022443890275, + "grad_norm": 0.41984470725925366, + "learning_rate": 4.726268242006106e-05, + "loss": 0.9415, + "step": 3527 + }, + { + "epoch": 0.17596009975062343, + "grad_norm": 0.48276850313865927, + "learning_rate": 4.726084475925309e-05, + "loss": 0.9243, + "step": 3528 + }, + { + "epoch": 0.17600997506234414, + "grad_norm": 0.6532867443162214, + "learning_rate": 4.725900651755833e-05, + "loss": 0.9505, + "step": 3529 + }, + { + "epoch": 0.17605985037406482, + "grad_norm": 0.5061459414215882, + "learning_rate": 4.725716769502476e-05, + "loss": 0.9188, + "step": 3530 + }, + { + "epoch": 0.17610972568578553, + "grad_norm": 0.4991984145853956, + "learning_rate": 4.7255328291700364e-05, + "loss": 0.9803, + "step": 3531 + }, + { + "epoch": 0.17615960099750624, + "grad_norm": 0.40683724822384065, + "learning_rate": 4.725348830763313e-05, + "loss": 0.9185, + "step": 3532 + }, + { + "epoch": 0.17620947630922693, + "grad_norm": 0.41278811377791197, + "learning_rate": 4.7251647742871085e-05, + "loss": 0.9233, + "step": 3533 + }, + { + "epoch": 0.17625935162094764, + "grad_norm": 1.0426477244210015, + "learning_rate": 4.7249806597462244e-05, + "loss": 0.9444, + "step": 3534 + }, + { + "epoch": 0.17630922693266832, + "grad_norm": 2.3055513930502243, + "learning_rate": 4.724796487145466e-05, + "loss": 0.8921, + "step": 3535 + }, + { + "epoch": 0.17635910224438903, + "grad_norm": 0.9858191813525171, + "learning_rate": 4.724612256489639e-05, + "loss": 0.9203, + "step": 3536 + }, + { + "epoch": 0.17640897755610974, + "grad_norm": 1.5044811129310167, + "learning_rate": 4.7244279677835514e-05, + "loss": 0.9385, + "step": 3537 + }, + { + "epoch": 0.17645885286783042, + "grad_norm": 0.6215338344596262, + "learning_rate": 4.724243621032011e-05, + "loss": 0.9714, + "step": 3538 + }, + { + "epoch": 0.17650872817955113, + "grad_norm": 1.0611683328488404, + "learning_rate": 4.724059216239828e-05, + "loss": 0.9847, + "step": 3539 + }, + { + "epoch": 0.1765586034912718, + "grad_norm": 0.8781324631042298, + "learning_rate": 4.723874753411816e-05, + "loss": 0.9236, + "step": 3540 + }, + { + "epoch": 0.17660847880299252, + "grad_norm": 0.6139057003101142, + "learning_rate": 4.7236902325527874e-05, + "loss": 0.8912, + "step": 3541 + }, + { + "epoch": 0.17665835411471323, + "grad_norm": 0.4773668179281427, + "learning_rate": 4.723505653667557e-05, + "loss": 0.9383, + "step": 3542 + }, + { + "epoch": 0.1767082294264339, + "grad_norm": 0.6342027135865248, + "learning_rate": 4.7233210167609424e-05, + "loss": 0.9597, + "step": 3543 + }, + { + "epoch": 0.17675810473815462, + "grad_norm": 0.48037755519311515, + "learning_rate": 4.7231363218377605e-05, + "loss": 0.9568, + "step": 3544 + }, + { + "epoch": 0.1768079800498753, + "grad_norm": 0.5199587989646502, + "learning_rate": 4.7229515689028316e-05, + "loss": 0.9475, + "step": 3545 + }, + { + "epoch": 0.176857855361596, + "grad_norm": 0.4287633056673414, + "learning_rate": 4.7227667579609757e-05, + "loss": 0.8971, + "step": 3546 + }, + { + "epoch": 0.17690773067331672, + "grad_norm": 0.4980178729122796, + "learning_rate": 4.722581889017016e-05, + "loss": 0.9407, + "step": 3547 + }, + { + "epoch": 0.1769576059850374, + "grad_norm": 0.46020696875224604, + "learning_rate": 4.722396962075777e-05, + "loss": 0.9453, + "step": 3548 + }, + { + "epoch": 0.1770074812967581, + "grad_norm": 0.4271535054948437, + "learning_rate": 4.722211977142084e-05, + "loss": 0.9206, + "step": 3549 + }, + { + "epoch": 0.1770573566084788, + "grad_norm": 0.40206141961509906, + "learning_rate": 4.722026934220764e-05, + "loss": 0.9456, + "step": 3550 + }, + { + "epoch": 0.1771072319201995, + "grad_norm": 0.4912842890490653, + "learning_rate": 4.7218418333166455e-05, + "loss": 0.9075, + "step": 3551 + }, + { + "epoch": 0.1771571072319202, + "grad_norm": 0.4640020492729843, + "learning_rate": 4.721656674434558e-05, + "loss": 0.9055, + "step": 3552 + }, + { + "epoch": 0.1772069825436409, + "grad_norm": 0.43472317269235344, + "learning_rate": 4.721471457579335e-05, + "loss": 0.9304, + "step": 3553 + }, + { + "epoch": 0.1772568578553616, + "grad_norm": 0.9058953586198297, + "learning_rate": 4.721286182755808e-05, + "loss": 0.9593, + "step": 3554 + }, + { + "epoch": 0.1773067331670823, + "grad_norm": 0.3895285663780234, + "learning_rate": 4.721100849968813e-05, + "loss": 0.9144, + "step": 3555 + }, + { + "epoch": 0.177356608478803, + "grad_norm": 0.45571742184115355, + "learning_rate": 4.720915459223184e-05, + "loss": 0.926, + "step": 3556 + }, + { + "epoch": 0.17740648379052368, + "grad_norm": 0.4634886029230789, + "learning_rate": 4.720730010523761e-05, + "loss": 0.9634, + "step": 3557 + }, + { + "epoch": 0.1774563591022444, + "grad_norm": 0.4740075255648729, + "learning_rate": 4.7205445038753814e-05, + "loss": 0.9517, + "step": 3558 + }, + { + "epoch": 0.1775062344139651, + "grad_norm": 0.40760512363428997, + "learning_rate": 4.7203589392828876e-05, + "loss": 0.8726, + "step": 3559 + }, + { + "epoch": 0.17755610972568578, + "grad_norm": 0.40484372701894145, + "learning_rate": 4.72017331675112e-05, + "loss": 0.887, + "step": 3560 + }, + { + "epoch": 0.1776059850374065, + "grad_norm": 0.6170278690457104, + "learning_rate": 4.719987636284924e-05, + "loss": 0.9284, + "step": 3561 + }, + { + "epoch": 0.17765586034912717, + "grad_norm": 0.4816418566682398, + "learning_rate": 4.719801897889144e-05, + "loss": 0.9497, + "step": 3562 + }, + { + "epoch": 0.17770573566084788, + "grad_norm": 0.43361623124461585, + "learning_rate": 4.719616101568627e-05, + "loss": 0.9436, + "step": 3563 + }, + { + "epoch": 0.1777556109725686, + "grad_norm": 0.5568736852285144, + "learning_rate": 4.7194302473282216e-05, + "loss": 0.9616, + "step": 3564 + }, + { + "epoch": 0.17780548628428927, + "grad_norm": 0.49272879644602896, + "learning_rate": 4.7192443351727765e-05, + "loss": 0.9505, + "step": 3565 + }, + { + "epoch": 0.17785536159600998, + "grad_norm": 0.4819637083763125, + "learning_rate": 4.719058365107144e-05, + "loss": 0.9514, + "step": 3566 + }, + { + "epoch": 0.17790523690773066, + "grad_norm": 0.5213112742133557, + "learning_rate": 4.718872337136176e-05, + "loss": 0.8976, + "step": 3567 + }, + { + "epoch": 0.17795511221945137, + "grad_norm": 0.514750349395406, + "learning_rate": 4.718686251264729e-05, + "loss": 0.9542, + "step": 3568 + }, + { + "epoch": 0.17800498753117208, + "grad_norm": 0.5618066394622196, + "learning_rate": 4.718500107497656e-05, + "loss": 0.9663, + "step": 3569 + }, + { + "epoch": 0.17805486284289276, + "grad_norm": 0.5522145951334088, + "learning_rate": 4.718313905839815e-05, + "loss": 0.9378, + "step": 3570 + }, + { + "epoch": 0.17810473815461347, + "grad_norm": 0.4425688148815319, + "learning_rate": 4.7181276462960666e-05, + "loss": 0.9003, + "step": 3571 + }, + { + "epoch": 0.17815461346633416, + "grad_norm": 0.45295429223097256, + "learning_rate": 4.717941328871269e-05, + "loss": 0.9452, + "step": 3572 + }, + { + "epoch": 0.17820448877805486, + "grad_norm": 0.42643511998572553, + "learning_rate": 4.717754953570286e-05, + "loss": 0.8893, + "step": 3573 + }, + { + "epoch": 0.17825436408977557, + "grad_norm": 0.4240081545810552, + "learning_rate": 4.71756852039798e-05, + "loss": 0.9323, + "step": 3574 + }, + { + "epoch": 0.17830423940149626, + "grad_norm": 0.6524085614794439, + "learning_rate": 4.717382029359215e-05, + "loss": 0.9716, + "step": 3575 + }, + { + "epoch": 0.17835411471321697, + "grad_norm": 0.435327837666722, + "learning_rate": 4.717195480458859e-05, + "loss": 0.9183, + "step": 3576 + }, + { + "epoch": 0.17840399002493765, + "grad_norm": 0.6432658444801477, + "learning_rate": 4.717008873701779e-05, + "loss": 0.9455, + "step": 3577 + }, + { + "epoch": 0.17845386533665836, + "grad_norm": 0.45812539180219863, + "learning_rate": 4.716822209092845e-05, + "loss": 0.8866, + "step": 3578 + }, + { + "epoch": 0.17850374064837904, + "grad_norm": 0.6188256692013663, + "learning_rate": 4.716635486636927e-05, + "loss": 0.9471, + "step": 3579 + }, + { + "epoch": 0.17855361596009975, + "grad_norm": 0.5166433597583018, + "learning_rate": 4.716448706338899e-05, + "loss": 0.9673, + "step": 3580 + }, + { + "epoch": 0.17860349127182046, + "grad_norm": 0.4387286244484767, + "learning_rate": 4.716261868203633e-05, + "loss": 0.9327, + "step": 3581 + }, + { + "epoch": 0.17865336658354114, + "grad_norm": 0.44636336771996427, + "learning_rate": 4.7160749722360056e-05, + "loss": 0.9409, + "step": 3582 + }, + { + "epoch": 0.17870324189526185, + "grad_norm": 0.5011457596229043, + "learning_rate": 4.715888018440893e-05, + "loss": 0.9299, + "step": 3583 + }, + { + "epoch": 0.17875311720698253, + "grad_norm": 0.5017442352953808, + "learning_rate": 4.7157010068231746e-05, + "loss": 0.9765, + "step": 3584 + }, + { + "epoch": 0.17880299251870324, + "grad_norm": 0.6522712477696155, + "learning_rate": 4.715513937387731e-05, + "loss": 0.9229, + "step": 3585 + }, + { + "epoch": 0.17885286783042395, + "grad_norm": 0.4672907107692788, + "learning_rate": 4.715326810139442e-05, + "loss": 0.9124, + "step": 3586 + }, + { + "epoch": 0.17890274314214463, + "grad_norm": 0.4188565353823381, + "learning_rate": 4.7151396250831914e-05, + "loss": 0.9313, + "step": 3587 + }, + { + "epoch": 0.17895261845386534, + "grad_norm": 0.5778422511793295, + "learning_rate": 4.714952382223864e-05, + "loss": 0.9403, + "step": 3588 + }, + { + "epoch": 0.17900249376558602, + "grad_norm": 0.40231589729736406, + "learning_rate": 4.714765081566345e-05, + "loss": 0.9124, + "step": 3589 + }, + { + "epoch": 0.17905236907730673, + "grad_norm": 0.4231080693904879, + "learning_rate": 4.714577723115522e-05, + "loss": 0.9706, + "step": 3590 + }, + { + "epoch": 0.17910224438902744, + "grad_norm": 0.48686221381790773, + "learning_rate": 4.714390306876285e-05, + "loss": 0.9767, + "step": 3591 + }, + { + "epoch": 0.17915211970074812, + "grad_norm": 0.419980881873754, + "learning_rate": 4.714202832853524e-05, + "loss": 0.9439, + "step": 3592 + }, + { + "epoch": 0.17920199501246883, + "grad_norm": 0.43833674125791716, + "learning_rate": 4.714015301052131e-05, + "loss": 0.9246, + "step": 3593 + }, + { + "epoch": 0.17925187032418952, + "grad_norm": 0.48511527861513365, + "learning_rate": 4.7138277114769996e-05, + "loss": 0.9527, + "step": 3594 + }, + { + "epoch": 0.17930174563591023, + "grad_norm": 0.47044477968669096, + "learning_rate": 4.713640064133025e-05, + "loss": 0.9267, + "step": 3595 + }, + { + "epoch": 0.17935162094763094, + "grad_norm": 0.5506877078357086, + "learning_rate": 4.713452359025104e-05, + "loss": 0.93, + "step": 3596 + }, + { + "epoch": 0.17940149625935162, + "grad_norm": 0.43289728013132583, + "learning_rate": 4.713264596158133e-05, + "loss": 0.9262, + "step": 3597 + }, + { + "epoch": 0.17945137157107233, + "grad_norm": 0.39818554047152405, + "learning_rate": 4.7130767755370134e-05, + "loss": 0.892, + "step": 3598 + }, + { + "epoch": 0.179501246882793, + "grad_norm": 0.4353346305875516, + "learning_rate": 4.712888897166646e-05, + "loss": 0.9537, + "step": 3599 + }, + { + "epoch": 0.17955112219451372, + "grad_norm": 0.516020353353938, + "learning_rate": 4.7127009610519335e-05, + "loss": 0.9341, + "step": 3600 + }, + { + "epoch": 0.17960099750623443, + "grad_norm": 0.42819816187707155, + "learning_rate": 4.71251296719778e-05, + "loss": 0.9802, + "step": 3601 + }, + { + "epoch": 0.1796508728179551, + "grad_norm": 0.4676555069743348, + "learning_rate": 4.7123249156090897e-05, + "loss": 0.9503, + "step": 3602 + }, + { + "epoch": 0.17970074812967582, + "grad_norm": 0.45478967130568393, + "learning_rate": 4.712136806290771e-05, + "loss": 0.9395, + "step": 3603 + }, + { + "epoch": 0.1797506234413965, + "grad_norm": 0.5191700279103135, + "learning_rate": 4.711948639247733e-05, + "loss": 0.9256, + "step": 3604 + }, + { + "epoch": 0.1798004987531172, + "grad_norm": 0.47559088153601925, + "learning_rate": 4.711760414484885e-05, + "loss": 0.9459, + "step": 3605 + }, + { + "epoch": 0.1798503740648379, + "grad_norm": 0.3943445229075489, + "learning_rate": 4.711572132007139e-05, + "loss": 0.9295, + "step": 3606 + }, + { + "epoch": 0.1799002493765586, + "grad_norm": 0.42892664558414695, + "learning_rate": 4.711383791819407e-05, + "loss": 0.9144, + "step": 3607 + }, + { + "epoch": 0.1799501246882793, + "grad_norm": 0.5346213375131793, + "learning_rate": 4.711195393926606e-05, + "loss": 0.9728, + "step": 3608 + }, + { + "epoch": 0.18, + "grad_norm": 0.6263509999093423, + "learning_rate": 4.71100693833365e-05, + "loss": 0.9594, + "step": 3609 + }, + { + "epoch": 0.1800498753117207, + "grad_norm": 0.45400093870599495, + "learning_rate": 4.710818425045458e-05, + "loss": 0.949, + "step": 3610 + }, + { + "epoch": 0.18009975062344138, + "grad_norm": 0.4334342203699316, + "learning_rate": 4.710629854066948e-05, + "loss": 0.954, + "step": 3611 + }, + { + "epoch": 0.1801496259351621, + "grad_norm": 0.3914960639975155, + "learning_rate": 4.7104412254030414e-05, + "loss": 0.9006, + "step": 3612 + }, + { + "epoch": 0.1801995012468828, + "grad_norm": 0.4359146058711904, + "learning_rate": 4.7102525390586604e-05, + "loss": 0.9224, + "step": 3613 + }, + { + "epoch": 0.18024937655860349, + "grad_norm": 0.4501658556897117, + "learning_rate": 4.7100637950387286e-05, + "loss": 0.9331, + "step": 3614 + }, + { + "epoch": 0.1802992518703242, + "grad_norm": 0.43314342844106307, + "learning_rate": 4.709874993348171e-05, + "loss": 0.9392, + "step": 3615 + }, + { + "epoch": 0.18034912718204488, + "grad_norm": 0.40926742104553016, + "learning_rate": 4.709686133991915e-05, + "loss": 0.9118, + "step": 3616 + }, + { + "epoch": 0.1803990024937656, + "grad_norm": 0.41781240549857246, + "learning_rate": 4.709497216974887e-05, + "loss": 0.9225, + "step": 3617 + }, + { + "epoch": 0.1804488778054863, + "grad_norm": 0.37285434215869806, + "learning_rate": 4.7093082423020196e-05, + "loss": 0.9294, + "step": 3618 + }, + { + "epoch": 0.18049875311720698, + "grad_norm": 0.4806824051149181, + "learning_rate": 4.709119209978242e-05, + "loss": 0.9588, + "step": 3619 + }, + { + "epoch": 0.1805486284289277, + "grad_norm": 0.4405344319116416, + "learning_rate": 4.708930120008487e-05, + "loss": 0.9681, + "step": 3620 + }, + { + "epoch": 0.18059850374064837, + "grad_norm": 0.48432921700164305, + "learning_rate": 4.7087409723976885e-05, + "loss": 0.986, + "step": 3621 + }, + { + "epoch": 0.18064837905236908, + "grad_norm": 0.45412569221866067, + "learning_rate": 4.708551767150784e-05, + "loss": 0.9233, + "step": 3622 + }, + { + "epoch": 0.1806982543640898, + "grad_norm": 0.5284655534227438, + "learning_rate": 4.7083625042727086e-05, + "loss": 0.9161, + "step": 3623 + }, + { + "epoch": 0.18074812967581047, + "grad_norm": 0.4291946326029825, + "learning_rate": 4.708173183768402e-05, + "loss": 0.9289, + "step": 3624 + }, + { + "epoch": 0.18079800498753118, + "grad_norm": 0.39226342294962735, + "learning_rate": 4.707983805642805e-05, + "loss": 0.925, + "step": 3625 + }, + { + "epoch": 0.18084788029925186, + "grad_norm": 0.4442631027875872, + "learning_rate": 4.707794369900859e-05, + "loss": 0.8925, + "step": 3626 + }, + { + "epoch": 0.18089775561097257, + "grad_norm": 0.48609945222579765, + "learning_rate": 4.707604876547506e-05, + "loss": 0.9717, + "step": 3627 + }, + { + "epoch": 0.18094763092269325, + "grad_norm": 0.4397730216089402, + "learning_rate": 4.707415325587692e-05, + "loss": 0.9281, + "step": 3628 + }, + { + "epoch": 0.18099750623441396, + "grad_norm": 0.41011643269292125, + "learning_rate": 4.707225717026363e-05, + "loss": 0.923, + "step": 3629 + }, + { + "epoch": 0.18104738154613467, + "grad_norm": 0.4937813424128699, + "learning_rate": 4.7070360508684674e-05, + "loss": 0.9381, + "step": 3630 + }, + { + "epoch": 0.18109725685785535, + "grad_norm": 0.39805652882729464, + "learning_rate": 4.7068463271189535e-05, + "loss": 0.9444, + "step": 3631 + }, + { + "epoch": 0.18114713216957606, + "grad_norm": 0.6328381978099824, + "learning_rate": 4.706656545782772e-05, + "loss": 0.9263, + "step": 3632 + }, + { + "epoch": 0.18119700748129675, + "grad_norm": 0.48889192989203445, + "learning_rate": 4.7064667068648754e-05, + "loss": 0.9893, + "step": 3633 + }, + { + "epoch": 0.18124688279301746, + "grad_norm": 0.4482615519546659, + "learning_rate": 4.706276810370218e-05, + "loss": 0.9476, + "step": 3634 + }, + { + "epoch": 0.18129675810473816, + "grad_norm": 0.44401666924250166, + "learning_rate": 4.706086856303754e-05, + "loss": 0.9404, + "step": 3635 + }, + { + "epoch": 0.18134663341645885, + "grad_norm": 0.40426569319822137, + "learning_rate": 4.7058968446704424e-05, + "loss": 0.891, + "step": 3636 + }, + { + "epoch": 0.18139650872817956, + "grad_norm": 0.422937723500113, + "learning_rate": 4.7057067754752384e-05, + "loss": 0.9444, + "step": 3637 + }, + { + "epoch": 0.18144638403990024, + "grad_norm": 0.40333123138544114, + "learning_rate": 4.7055166487231034e-05, + "loss": 0.9094, + "step": 3638 + }, + { + "epoch": 0.18149625935162095, + "grad_norm": 0.4608193918904808, + "learning_rate": 4.705326464418999e-05, + "loss": 0.9801, + "step": 3639 + }, + { + "epoch": 0.18154613466334166, + "grad_norm": 0.415412082410826, + "learning_rate": 4.705136222567887e-05, + "loss": 0.9754, + "step": 3640 + }, + { + "epoch": 0.18159600997506234, + "grad_norm": 0.433465245519195, + "learning_rate": 4.7049459231747325e-05, + "loss": 0.9215, + "step": 3641 + }, + { + "epoch": 0.18164588528678305, + "grad_norm": 0.5839392864993413, + "learning_rate": 4.704755566244501e-05, + "loss": 0.9765, + "step": 3642 + }, + { + "epoch": 0.18169576059850373, + "grad_norm": 0.5382347538823964, + "learning_rate": 4.704565151782159e-05, + "loss": 0.9376, + "step": 3643 + }, + { + "epoch": 0.18174563591022444, + "grad_norm": 0.4454162334975465, + "learning_rate": 4.704374679792677e-05, + "loss": 0.9281, + "step": 3644 + }, + { + "epoch": 0.18179551122194515, + "grad_norm": 0.39984822548912197, + "learning_rate": 4.7041841502810236e-05, + "loss": 0.9536, + "step": 3645 + }, + { + "epoch": 0.18184538653366583, + "grad_norm": 0.42728846475225674, + "learning_rate": 4.703993563252172e-05, + "loss": 0.932, + "step": 3646 + }, + { + "epoch": 0.18189526184538654, + "grad_norm": 0.47206321396672024, + "learning_rate": 4.703802918711093e-05, + "loss": 0.944, + "step": 3647 + }, + { + "epoch": 0.18194513715710722, + "grad_norm": 0.4377274857962956, + "learning_rate": 4.703612216662765e-05, + "loss": 0.9479, + "step": 3648 + }, + { + "epoch": 0.18199501246882793, + "grad_norm": 0.4223237719965276, + "learning_rate": 4.703421457112162e-05, + "loss": 0.9247, + "step": 3649 + }, + { + "epoch": 0.18204488778054864, + "grad_norm": 0.43398824522694, + "learning_rate": 4.703230640064261e-05, + "loss": 0.9241, + "step": 3650 + }, + { + "epoch": 0.18209476309226932, + "grad_norm": 0.45401580418540893, + "learning_rate": 4.7030397655240444e-05, + "loss": 0.9493, + "step": 3651 + }, + { + "epoch": 0.18214463840399003, + "grad_norm": 0.4426273502806563, + "learning_rate": 4.702848833496489e-05, + "loss": 0.9709, + "step": 3652 + }, + { + "epoch": 0.18219451371571072, + "grad_norm": 0.565272504898061, + "learning_rate": 4.702657843986581e-05, + "loss": 0.914, + "step": 3653 + }, + { + "epoch": 0.18224438902743142, + "grad_norm": 0.49254472771879587, + "learning_rate": 4.702466796999302e-05, + "loss": 0.8952, + "step": 3654 + }, + { + "epoch": 0.1822942643391521, + "grad_norm": 0.4099648339465115, + "learning_rate": 4.702275692539637e-05, + "loss": 0.9753, + "step": 3655 + }, + { + "epoch": 0.18234413965087282, + "grad_norm": 0.43774714961119465, + "learning_rate": 4.7020845306125736e-05, + "loss": 0.8895, + "step": 3656 + }, + { + "epoch": 0.18239401496259353, + "grad_norm": 0.44237442830905743, + "learning_rate": 4.7018933112231e-05, + "loss": 0.9566, + "step": 3657 + }, + { + "epoch": 0.1824438902743142, + "grad_norm": 0.4556458906142468, + "learning_rate": 4.7017020343762056e-05, + "loss": 0.9816, + "step": 3658 + }, + { + "epoch": 0.18249376558603492, + "grad_norm": 0.4101955064681425, + "learning_rate": 4.7015107000768824e-05, + "loss": 0.964, + "step": 3659 + }, + { + "epoch": 0.1825436408977556, + "grad_norm": 0.4018156201096474, + "learning_rate": 4.701319308330122e-05, + "loss": 0.9349, + "step": 3660 + }, + { + "epoch": 0.1825935162094763, + "grad_norm": 0.4539306181637044, + "learning_rate": 4.70112785914092e-05, + "loss": 0.9398, + "step": 3661 + }, + { + "epoch": 0.18264339152119702, + "grad_norm": 0.4944979210971281, + "learning_rate": 4.700936352514272e-05, + "loss": 0.9557, + "step": 3662 + }, + { + "epoch": 0.1826932668329177, + "grad_norm": 0.41896715577412275, + "learning_rate": 4.7007447884551745e-05, + "loss": 0.9352, + "step": 3663 + }, + { + "epoch": 0.1827431421446384, + "grad_norm": 0.4147302532748941, + "learning_rate": 4.7005531669686265e-05, + "loss": 0.9027, + "step": 3664 + }, + { + "epoch": 0.1827930174563591, + "grad_norm": 0.5746520816450351, + "learning_rate": 4.7003614880596286e-05, + "loss": 0.9587, + "step": 3665 + }, + { + "epoch": 0.1828428927680798, + "grad_norm": 0.4033629215782818, + "learning_rate": 4.700169751733182e-05, + "loss": 0.9265, + "step": 3666 + }, + { + "epoch": 0.1828927680798005, + "grad_norm": 0.519220837221208, + "learning_rate": 4.699977957994291e-05, + "loss": 0.9144, + "step": 3667 + }, + { + "epoch": 0.1829426433915212, + "grad_norm": 0.4701422443132331, + "learning_rate": 4.69978610684796e-05, + "loss": 0.9227, + "step": 3668 + }, + { + "epoch": 0.1829925187032419, + "grad_norm": 0.4553762115121509, + "learning_rate": 4.699594198299194e-05, + "loss": 0.9694, + "step": 3669 + }, + { + "epoch": 0.18304239401496258, + "grad_norm": 0.5195237186930417, + "learning_rate": 4.699402232353003e-05, + "loss": 0.9054, + "step": 3670 + }, + { + "epoch": 0.1830922693266833, + "grad_norm": 0.4728556799397468, + "learning_rate": 4.699210209014394e-05, + "loss": 0.9785, + "step": 3671 + }, + { + "epoch": 0.183142144638404, + "grad_norm": 0.4797959778387307, + "learning_rate": 4.6990181282883794e-05, + "loss": 0.9232, + "step": 3672 + }, + { + "epoch": 0.18319201995012468, + "grad_norm": 0.44012143739244053, + "learning_rate": 4.698825990179971e-05, + "loss": 0.9352, + "step": 3673 + }, + { + "epoch": 0.1832418952618454, + "grad_norm": 0.46640428811110274, + "learning_rate": 4.698633794694183e-05, + "loss": 0.9417, + "step": 3674 + }, + { + "epoch": 0.18329177057356608, + "grad_norm": 0.4375978703522529, + "learning_rate": 4.698441541836029e-05, + "loss": 0.9278, + "step": 3675 + }, + { + "epoch": 0.18334164588528679, + "grad_norm": 0.435626083812428, + "learning_rate": 4.698249231610527e-05, + "loss": 0.9078, + "step": 3676 + }, + { + "epoch": 0.18339152119700747, + "grad_norm": 0.4066528266844408, + "learning_rate": 4.6980568640226954e-05, + "loss": 0.9646, + "step": 3677 + }, + { + "epoch": 0.18344139650872818, + "grad_norm": 0.8435245343887099, + "learning_rate": 4.697864439077554e-05, + "loss": 0.9721, + "step": 3678 + }, + { + "epoch": 0.1834912718204489, + "grad_norm": 0.4030286525725514, + "learning_rate": 4.697671956780123e-05, + "loss": 0.9132, + "step": 3679 + }, + { + "epoch": 0.18354114713216957, + "grad_norm": 3.4816725064258045, + "learning_rate": 4.6974794171354264e-05, + "loss": 0.9619, + "step": 3680 + }, + { + "epoch": 0.18359102244389028, + "grad_norm": 0.5329478617128516, + "learning_rate": 4.697286820148488e-05, + "loss": 0.9505, + "step": 3681 + }, + { + "epoch": 0.18364089775561096, + "grad_norm": 0.4186859286156811, + "learning_rate": 4.6970941658243323e-05, + "loss": 0.9393, + "step": 3682 + }, + { + "epoch": 0.18369077306733167, + "grad_norm": 0.4574917240080894, + "learning_rate": 4.696901454167988e-05, + "loss": 0.9644, + "step": 3683 + }, + { + "epoch": 0.18374064837905238, + "grad_norm": 0.43246138951725693, + "learning_rate": 4.696708685184484e-05, + "loss": 0.9416, + "step": 3684 + }, + { + "epoch": 0.18379052369077306, + "grad_norm": 0.3746368324334739, + "learning_rate": 4.69651585887885e-05, + "loss": 0.904, + "step": 3685 + }, + { + "epoch": 0.18384039900249377, + "grad_norm": 0.41489011425702, + "learning_rate": 4.696322975256118e-05, + "loss": 0.9314, + "step": 3686 + }, + { + "epoch": 0.18389027431421445, + "grad_norm": 0.46022509344503276, + "learning_rate": 4.69613003432132e-05, + "loss": 0.9451, + "step": 3687 + }, + { + "epoch": 0.18394014962593516, + "grad_norm": 0.4402406089417169, + "learning_rate": 4.695937036079492e-05, + "loss": 0.9401, + "step": 3688 + }, + { + "epoch": 0.18399002493765587, + "grad_norm": 0.41978600820268613, + "learning_rate": 4.6957439805356696e-05, + "loss": 0.9278, + "step": 3689 + }, + { + "epoch": 0.18403990024937655, + "grad_norm": 0.440869213246086, + "learning_rate": 4.695550867694891e-05, + "loss": 0.9422, + "step": 3690 + }, + { + "epoch": 0.18408977556109726, + "grad_norm": 0.42123490549982806, + "learning_rate": 4.695357697562195e-05, + "loss": 0.9239, + "step": 3691 + }, + { + "epoch": 0.18413965087281794, + "grad_norm": 0.40547547157353225, + "learning_rate": 4.695164470142622e-05, + "loss": 0.9031, + "step": 3692 + }, + { + "epoch": 0.18418952618453865, + "grad_norm": 0.4787571380139965, + "learning_rate": 4.6949711854412155e-05, + "loss": 0.9829, + "step": 3693 + }, + { + "epoch": 0.18423940149625936, + "grad_norm": 0.4199317102671499, + "learning_rate": 4.6947778434630174e-05, + "loss": 0.8703, + "step": 3694 + }, + { + "epoch": 0.18428927680798005, + "grad_norm": 0.430625871934591, + "learning_rate": 4.694584444213075e-05, + "loss": 0.9462, + "step": 3695 + }, + { + "epoch": 0.18433915211970076, + "grad_norm": 0.453890743637178, + "learning_rate": 4.694390987696432e-05, + "loss": 0.954, + "step": 3696 + }, + { + "epoch": 0.18438902743142144, + "grad_norm": 0.4617285494223413, + "learning_rate": 4.6941974739181395e-05, + "loss": 0.9794, + "step": 3697 + }, + { + "epoch": 0.18443890274314215, + "grad_norm": 0.4809754685023201, + "learning_rate": 4.694003902883245e-05, + "loss": 0.9841, + "step": 3698 + }, + { + "epoch": 0.18448877805486286, + "grad_norm": 0.4027899687673411, + "learning_rate": 4.6938102745968014e-05, + "loss": 0.9386, + "step": 3699 + }, + { + "epoch": 0.18453865336658354, + "grad_norm": 0.39460431785893163, + "learning_rate": 4.69361658906386e-05, + "loss": 0.9872, + "step": 3700 + }, + { + "epoch": 0.18458852867830425, + "grad_norm": 0.3882886847541606, + "learning_rate": 4.6934228462894756e-05, + "loss": 0.9212, + "step": 3701 + }, + { + "epoch": 0.18463840399002493, + "grad_norm": 0.4178712452421348, + "learning_rate": 4.693229046278703e-05, + "loss": 0.9182, + "step": 3702 + }, + { + "epoch": 0.18468827930174564, + "grad_norm": 0.4327188373268287, + "learning_rate": 4.693035189036601e-05, + "loss": 0.9083, + "step": 3703 + }, + { + "epoch": 0.18473815461346632, + "grad_norm": 0.4421510293737025, + "learning_rate": 4.692841274568227e-05, + "loss": 0.9704, + "step": 3704 + }, + { + "epoch": 0.18478802992518703, + "grad_norm": 0.43368649283126387, + "learning_rate": 4.692647302878641e-05, + "loss": 0.998, + "step": 3705 + }, + { + "epoch": 0.18483790523690774, + "grad_norm": 0.416675265585359, + "learning_rate": 4.6924532739729054e-05, + "loss": 0.953, + "step": 3706 + }, + { + "epoch": 0.18488778054862842, + "grad_norm": 0.4689110845871804, + "learning_rate": 4.692259187856083e-05, + "loss": 0.92, + "step": 3707 + }, + { + "epoch": 0.18493765586034913, + "grad_norm": 0.567706921990688, + "learning_rate": 4.692065044533237e-05, + "loss": 0.9328, + "step": 3708 + }, + { + "epoch": 0.1849875311720698, + "grad_norm": 0.41128263846815905, + "learning_rate": 4.691870844009436e-05, + "loss": 0.9382, + "step": 3709 + }, + { + "epoch": 0.18503740648379052, + "grad_norm": 0.448259358383023, + "learning_rate": 4.691676586289746e-05, + "loss": 0.9358, + "step": 3710 + }, + { + "epoch": 0.18508728179551123, + "grad_norm": 0.4526707240456426, + "learning_rate": 4.691482271379236e-05, + "loss": 0.9277, + "step": 3711 + }, + { + "epoch": 0.18513715710723191, + "grad_norm": 0.4514881928981078, + "learning_rate": 4.691287899282977e-05, + "loss": 0.975, + "step": 3712 + }, + { + "epoch": 0.18518703241895262, + "grad_norm": 0.41418085317833925, + "learning_rate": 4.69109347000604e-05, + "loss": 0.9198, + "step": 3713 + }, + { + "epoch": 0.1852369077306733, + "grad_norm": 0.3912368786878459, + "learning_rate": 4.690898983553501e-05, + "loss": 0.9745, + "step": 3714 + }, + { + "epoch": 0.18528678304239402, + "grad_norm": 0.4393431074289878, + "learning_rate": 4.690704439930433e-05, + "loss": 0.9147, + "step": 3715 + }, + { + "epoch": 0.18533665835411473, + "grad_norm": 0.5204228420287512, + "learning_rate": 4.6905098391419136e-05, + "loss": 0.9215, + "step": 3716 + }, + { + "epoch": 0.1853865336658354, + "grad_norm": 0.7881596103782595, + "learning_rate": 4.6903151811930195e-05, + "loss": 0.947, + "step": 3717 + }, + { + "epoch": 0.18543640897755612, + "grad_norm": 0.41681878573197867, + "learning_rate": 4.690120466088831e-05, + "loss": 0.9286, + "step": 3718 + }, + { + "epoch": 0.1854862842892768, + "grad_norm": 0.43634235885872685, + "learning_rate": 4.689925693834429e-05, + "loss": 0.9265, + "step": 3719 + }, + { + "epoch": 0.1855361596009975, + "grad_norm": 0.5125250408977542, + "learning_rate": 4.689730864434897e-05, + "loss": 0.9181, + "step": 3720 + }, + { + "epoch": 0.18558603491271822, + "grad_norm": 0.3964567721628635, + "learning_rate": 4.6895359778953175e-05, + "loss": 0.9752, + "step": 3721 + }, + { + "epoch": 0.1856359102244389, + "grad_norm": 0.47534662998959915, + "learning_rate": 4.689341034220776e-05, + "loss": 0.9686, + "step": 3722 + }, + { + "epoch": 0.1856857855361596, + "grad_norm": 0.40910698740296053, + "learning_rate": 4.689146033416362e-05, + "loss": 0.9407, + "step": 3723 + }, + { + "epoch": 0.1857356608478803, + "grad_norm": 0.39940769930302994, + "learning_rate": 4.6889509754871606e-05, + "loss": 0.9652, + "step": 3724 + }, + { + "epoch": 0.185785536159601, + "grad_norm": 0.4130593583081343, + "learning_rate": 4.688755860438263e-05, + "loss": 0.8934, + "step": 3725 + }, + { + "epoch": 0.18583541147132168, + "grad_norm": 0.4501220748246152, + "learning_rate": 4.688560688274761e-05, + "loss": 0.9204, + "step": 3726 + }, + { + "epoch": 0.1858852867830424, + "grad_norm": 0.386949145456655, + "learning_rate": 4.688365459001747e-05, + "loss": 0.9276, + "step": 3727 + }, + { + "epoch": 0.1859351620947631, + "grad_norm": 0.3951186986228053, + "learning_rate": 4.688170172624317e-05, + "loss": 0.9578, + "step": 3728 + }, + { + "epoch": 0.18598503740648378, + "grad_norm": 0.4721822904306454, + "learning_rate": 4.687974829147564e-05, + "loss": 0.9567, + "step": 3729 + }, + { + "epoch": 0.1860349127182045, + "grad_norm": 0.47573472229674724, + "learning_rate": 4.6877794285765874e-05, + "loss": 0.9649, + "step": 3730 + }, + { + "epoch": 0.18608478802992517, + "grad_norm": 0.49579286018554214, + "learning_rate": 4.687583970916487e-05, + "loss": 0.9023, + "step": 3731 + }, + { + "epoch": 0.18613466334164588, + "grad_norm": 0.4284294088728979, + "learning_rate": 4.68738845617236e-05, + "loss": 0.9362, + "step": 3732 + }, + { + "epoch": 0.1861845386533666, + "grad_norm": 0.4510889555637803, + "learning_rate": 4.687192884349312e-05, + "loss": 0.9248, + "step": 3733 + }, + { + "epoch": 0.18623441396508728, + "grad_norm": 0.4451326681199047, + "learning_rate": 4.686997255452443e-05, + "loss": 0.8806, + "step": 3734 + }, + { + "epoch": 0.18628428927680798, + "grad_norm": 0.4329736228448575, + "learning_rate": 4.686801569486861e-05, + "loss": 0.9168, + "step": 3735 + }, + { + "epoch": 0.18633416458852867, + "grad_norm": 0.4577576657322781, + "learning_rate": 4.686605826457669e-05, + "loss": 0.9312, + "step": 3736 + }, + { + "epoch": 0.18638403990024938, + "grad_norm": 1.0555620954302276, + "learning_rate": 4.686410026369977e-05, + "loss": 0.9478, + "step": 3737 + }, + { + "epoch": 0.18643391521197009, + "grad_norm": 0.38838798686299314, + "learning_rate": 4.6862141692288944e-05, + "loss": 0.9238, + "step": 3738 + }, + { + "epoch": 0.18648379052369077, + "grad_norm": 0.4320893490887145, + "learning_rate": 4.6860182550395314e-05, + "loss": 0.9592, + "step": 3739 + }, + { + "epoch": 0.18653366583541148, + "grad_norm": 0.4542500067229636, + "learning_rate": 4.685822283807e-05, + "loss": 0.9469, + "step": 3740 + }, + { + "epoch": 0.18658354114713216, + "grad_norm": 0.42624033639070924, + "learning_rate": 4.685626255536415e-05, + "loss": 0.9699, + "step": 3741 + }, + { + "epoch": 0.18663341645885287, + "grad_norm": 0.4457742696940867, + "learning_rate": 4.685430170232891e-05, + "loss": 0.8744, + "step": 3742 + }, + { + "epoch": 0.18668329177057358, + "grad_norm": 0.5506916995181178, + "learning_rate": 4.685234027901544e-05, + "loss": 0.9383, + "step": 3743 + }, + { + "epoch": 0.18673316708229426, + "grad_norm": 0.44085252098640026, + "learning_rate": 4.685037828547494e-05, + "loss": 0.9445, + "step": 3744 + }, + { + "epoch": 0.18678304239401497, + "grad_norm": 0.44695537232426585, + "learning_rate": 4.684841572175859e-05, + "loss": 0.9002, + "step": 3745 + }, + { + "epoch": 0.18683291770573565, + "grad_norm": 1.3294568385480243, + "learning_rate": 4.684645258791761e-05, + "loss": 0.9373, + "step": 3746 + }, + { + "epoch": 0.18688279301745636, + "grad_norm": 0.5275080912311438, + "learning_rate": 4.684448888400323e-05, + "loss": 0.9345, + "step": 3747 + }, + { + "epoch": 0.18693266832917707, + "grad_norm": 0.5417391453640189, + "learning_rate": 4.684252461006669e-05, + "loss": 0.9335, + "step": 3748 + }, + { + "epoch": 0.18698254364089775, + "grad_norm": 0.4643224450774762, + "learning_rate": 4.684055976615924e-05, + "loss": 0.9787, + "step": 3749 + }, + { + "epoch": 0.18703241895261846, + "grad_norm": 0.6905350476178593, + "learning_rate": 4.683859435233216e-05, + "loss": 0.9424, + "step": 3750 + }, + { + "epoch": 0.18708229426433914, + "grad_norm": 0.45156052898249743, + "learning_rate": 4.6836628368636736e-05, + "loss": 0.9525, + "step": 3751 + }, + { + "epoch": 0.18713216957605985, + "grad_norm": 0.4360003992802237, + "learning_rate": 4.683466181512427e-05, + "loss": 0.946, + "step": 3752 + }, + { + "epoch": 0.18718204488778054, + "grad_norm": 0.4560129957156734, + "learning_rate": 4.683269469184607e-05, + "loss": 0.9251, + "step": 3753 + }, + { + "epoch": 0.18723192019950124, + "grad_norm": 0.5047649361317352, + "learning_rate": 4.6830726998853476e-05, + "loss": 0.8915, + "step": 3754 + }, + { + "epoch": 0.18728179551122195, + "grad_norm": 0.45810226527794135, + "learning_rate": 4.682875873619783e-05, + "loss": 0.924, + "step": 3755 + }, + { + "epoch": 0.18733167082294264, + "grad_norm": 0.49715763567840426, + "learning_rate": 4.682678990393049e-05, + "loss": 0.9452, + "step": 3756 + }, + { + "epoch": 0.18738154613466335, + "grad_norm": 0.4144524109125169, + "learning_rate": 4.682482050210284e-05, + "loss": 0.8862, + "step": 3757 + }, + { + "epoch": 0.18743142144638403, + "grad_norm": 0.4603975957331739, + "learning_rate": 4.682285053076627e-05, + "loss": 0.9951, + "step": 3758 + }, + { + "epoch": 0.18748129675810474, + "grad_norm": 0.42659606952304757, + "learning_rate": 4.6820879989972176e-05, + "loss": 0.964, + "step": 3759 + }, + { + "epoch": 0.18753117206982545, + "grad_norm": 0.48126474026120664, + "learning_rate": 4.681890887977199e-05, + "loss": 0.9286, + "step": 3760 + }, + { + "epoch": 0.18758104738154613, + "grad_norm": 0.42731355899785295, + "learning_rate": 4.6816937200217145e-05, + "loss": 0.9062, + "step": 3761 + }, + { + "epoch": 0.18763092269326684, + "grad_norm": 0.4304575293111906, + "learning_rate": 4.6814964951359075e-05, + "loss": 0.9638, + "step": 3762 + }, + { + "epoch": 0.18768079800498752, + "grad_norm": 0.49667882394211593, + "learning_rate": 4.6812992133249266e-05, + "loss": 0.9862, + "step": 3763 + }, + { + "epoch": 0.18773067331670823, + "grad_norm": 0.47889934570695736, + "learning_rate": 4.68110187459392e-05, + "loss": 0.8892, + "step": 3764 + }, + { + "epoch": 0.18778054862842894, + "grad_norm": 0.41382626537824935, + "learning_rate": 4.680904478948034e-05, + "loss": 0.9338, + "step": 3765 + }, + { + "epoch": 0.18783042394014962, + "grad_norm": 0.4437868818493754, + "learning_rate": 4.6807070263924224e-05, + "loss": 0.9403, + "step": 3766 + }, + { + "epoch": 0.18788029925187033, + "grad_norm": 0.44456608715148865, + "learning_rate": 4.680509516932238e-05, + "loss": 0.9595, + "step": 3767 + }, + { + "epoch": 0.187930174563591, + "grad_norm": 0.46873660598990374, + "learning_rate": 4.6803119505726325e-05, + "loss": 0.9113, + "step": 3768 + }, + { + "epoch": 0.18798004987531172, + "grad_norm": 0.4432399888839951, + "learning_rate": 4.680114327318762e-05, + "loss": 0.9444, + "step": 3769 + }, + { + "epoch": 0.18802992518703243, + "grad_norm": 1.0364166919860296, + "learning_rate": 4.679916647175785e-05, + "loss": 0.9354, + "step": 3770 + }, + { + "epoch": 0.1880798004987531, + "grad_norm": 0.449159150310164, + "learning_rate": 4.6797189101488584e-05, + "loss": 0.9181, + "step": 3771 + }, + { + "epoch": 0.18812967581047382, + "grad_norm": 0.45309368401828193, + "learning_rate": 4.679521116243142e-05, + "loss": 0.9461, + "step": 3772 + }, + { + "epoch": 0.1881795511221945, + "grad_norm": 0.41321612382517514, + "learning_rate": 4.679323265463797e-05, + "loss": 0.9177, + "step": 3773 + }, + { + "epoch": 0.18822942643391521, + "grad_norm": 0.41479246279179616, + "learning_rate": 4.679125357815988e-05, + "loss": 0.9581, + "step": 3774 + }, + { + "epoch": 0.1882793017456359, + "grad_norm": 0.464763453305539, + "learning_rate": 4.6789273933048766e-05, + "loss": 0.9005, + "step": 3775 + }, + { + "epoch": 0.1883291770573566, + "grad_norm": 0.4419974439460371, + "learning_rate": 4.678729371935631e-05, + "loss": 0.9833, + "step": 3776 + }, + { + "epoch": 0.18837905236907732, + "grad_norm": 0.4438956639363457, + "learning_rate": 4.678531293713417e-05, + "loss": 0.924, + "step": 3777 + }, + { + "epoch": 0.188428927680798, + "grad_norm": 0.49932003292291355, + "learning_rate": 4.678333158643404e-05, + "loss": 0.9564, + "step": 3778 + }, + { + "epoch": 0.1884788029925187, + "grad_norm": 0.5136018678344249, + "learning_rate": 4.6781349667307626e-05, + "loss": 0.937, + "step": 3779 + }, + { + "epoch": 0.1885286783042394, + "grad_norm": 0.4713677635311256, + "learning_rate": 4.677936717980664e-05, + "loss": 0.9403, + "step": 3780 + }, + { + "epoch": 0.1885785536159601, + "grad_norm": 0.4232894389401609, + "learning_rate": 4.6777384123982815e-05, + "loss": 0.9412, + "step": 3781 + }, + { + "epoch": 0.1886284289276808, + "grad_norm": 2.087711942179732, + "learning_rate": 4.677540049988789e-05, + "loss": 0.964, + "step": 3782 + }, + { + "epoch": 0.1886783042394015, + "grad_norm": 0.4078265369088294, + "learning_rate": 4.677341630757364e-05, + "loss": 0.9156, + "step": 3783 + }, + { + "epoch": 0.1887281795511222, + "grad_norm": 0.3876023093987886, + "learning_rate": 4.677143154709184e-05, + "loss": 0.9151, + "step": 3784 + }, + { + "epoch": 0.18877805486284288, + "grad_norm": 0.7099551902515381, + "learning_rate": 4.676944621849427e-05, + "loss": 0.9608, + "step": 3785 + }, + { + "epoch": 0.1888279301745636, + "grad_norm": 0.4038064176066437, + "learning_rate": 4.676746032183274e-05, + "loss": 0.92, + "step": 3786 + }, + { + "epoch": 0.1888778054862843, + "grad_norm": 0.5261693845140564, + "learning_rate": 4.676547385715909e-05, + "loss": 0.9665, + "step": 3787 + }, + { + "epoch": 0.18892768079800498, + "grad_norm": 0.41360034603385465, + "learning_rate": 4.676348682452513e-05, + "loss": 0.959, + "step": 3788 + }, + { + "epoch": 0.1889775561097257, + "grad_norm": 0.41461606646364335, + "learning_rate": 4.676149922398272e-05, + "loss": 0.9406, + "step": 3789 + }, + { + "epoch": 0.18902743142144637, + "grad_norm": 0.4755653420641178, + "learning_rate": 4.6759511055583734e-05, + "loss": 0.923, + "step": 3790 + }, + { + "epoch": 0.18907730673316708, + "grad_norm": 0.440410764346283, + "learning_rate": 4.6757522319380035e-05, + "loss": 0.9117, + "step": 3791 + }, + { + "epoch": 0.1891271820448878, + "grad_norm": 0.42033646111330053, + "learning_rate": 4.6755533015423535e-05, + "loss": 0.9312, + "step": 3792 + }, + { + "epoch": 0.18917705735660847, + "grad_norm": 0.5084290589576451, + "learning_rate": 4.675354314376614e-05, + "loss": 0.9508, + "step": 3793 + }, + { + "epoch": 0.18922693266832918, + "grad_norm": 0.44655608646912254, + "learning_rate": 4.675155270445977e-05, + "loss": 0.957, + "step": 3794 + }, + { + "epoch": 0.18927680798004987, + "grad_norm": 0.44886946042620013, + "learning_rate": 4.674956169755636e-05, + "loss": 0.9153, + "step": 3795 + }, + { + "epoch": 0.18932668329177058, + "grad_norm": 0.44699955727712, + "learning_rate": 4.674757012310788e-05, + "loss": 0.9012, + "step": 3796 + }, + { + "epoch": 0.18937655860349129, + "grad_norm": 0.46976181790975613, + "learning_rate": 4.674557798116629e-05, + "loss": 0.9624, + "step": 3797 + }, + { + "epoch": 0.18942643391521197, + "grad_norm": 0.43396693239983836, + "learning_rate": 4.6743585271783575e-05, + "loss": 0.9158, + "step": 3798 + }, + { + "epoch": 0.18947630922693268, + "grad_norm": 0.6400458455300342, + "learning_rate": 4.674159199501173e-05, + "loss": 0.9478, + "step": 3799 + }, + { + "epoch": 0.18952618453865336, + "grad_norm": 0.5764107738528403, + "learning_rate": 4.673959815090277e-05, + "loss": 0.979, + "step": 3800 + }, + { + "epoch": 0.18957605985037407, + "grad_norm": 0.4728723116399539, + "learning_rate": 4.673760373950873e-05, + "loss": 0.9502, + "step": 3801 + }, + { + "epoch": 0.18962593516209475, + "grad_norm": 0.46902582979689783, + "learning_rate": 4.673560876088165e-05, + "loss": 0.9588, + "step": 3802 + }, + { + "epoch": 0.18967581047381546, + "grad_norm": 0.4882924974829065, + "learning_rate": 4.673361321507358e-05, + "loss": 0.9567, + "step": 3803 + }, + { + "epoch": 0.18972568578553617, + "grad_norm": 0.4052634172911017, + "learning_rate": 4.673161710213661e-05, + "loss": 0.8891, + "step": 3804 + }, + { + "epoch": 0.18977556109725685, + "grad_norm": 0.4490246777539319, + "learning_rate": 4.672962042212281e-05, + "loss": 0.8977, + "step": 3805 + }, + { + "epoch": 0.18982543640897756, + "grad_norm": 0.6028134747966237, + "learning_rate": 4.67276231750843e-05, + "loss": 0.9781, + "step": 3806 + }, + { + "epoch": 0.18987531172069824, + "grad_norm": 0.42340118964395224, + "learning_rate": 4.672562536107318e-05, + "loss": 0.9478, + "step": 3807 + }, + { + "epoch": 0.18992518703241895, + "grad_norm": 0.618501764553898, + "learning_rate": 4.6723626980141585e-05, + "loss": 0.9071, + "step": 3808 + }, + { + "epoch": 0.18997506234413966, + "grad_norm": 0.4123961405777523, + "learning_rate": 4.672162803234167e-05, + "loss": 0.9174, + "step": 3809 + }, + { + "epoch": 0.19002493765586034, + "grad_norm": 0.49558796731240307, + "learning_rate": 4.671962851772559e-05, + "loss": 0.9508, + "step": 3810 + }, + { + "epoch": 0.19007481296758105, + "grad_norm": 0.45648765855513596, + "learning_rate": 4.671762843634553e-05, + "loss": 0.9537, + "step": 3811 + }, + { + "epoch": 0.19012468827930173, + "grad_norm": 0.50142732903737, + "learning_rate": 4.671562778825367e-05, + "loss": 0.8928, + "step": 3812 + }, + { + "epoch": 0.19017456359102244, + "grad_norm": 0.44591367685556116, + "learning_rate": 4.671362657350222e-05, + "loss": 0.9933, + "step": 3813 + }, + { + "epoch": 0.19022443890274315, + "grad_norm": 0.45951117888126997, + "learning_rate": 4.671162479214341e-05, + "loss": 0.9282, + "step": 3814 + }, + { + "epoch": 0.19027431421446384, + "grad_norm": 0.45710095216524255, + "learning_rate": 4.670962244422946e-05, + "loss": 0.9198, + "step": 3815 + }, + { + "epoch": 0.19032418952618455, + "grad_norm": 0.696078353079456, + "learning_rate": 4.670761952981264e-05, + "loss": 0.9386, + "step": 3816 + }, + { + "epoch": 0.19037406483790523, + "grad_norm": 0.43576887777800005, + "learning_rate": 4.6705616048945195e-05, + "loss": 0.9714, + "step": 3817 + }, + { + "epoch": 0.19042394014962594, + "grad_norm": 0.4474050772285474, + "learning_rate": 4.670361200167941e-05, + "loss": 0.9481, + "step": 3818 + }, + { + "epoch": 0.19047381546134665, + "grad_norm": 0.48041939921941224, + "learning_rate": 4.6701607388067595e-05, + "loss": 0.9528, + "step": 3819 + }, + { + "epoch": 0.19052369077306733, + "grad_norm": 0.436956490151742, + "learning_rate": 4.6699602208162034e-05, + "loss": 0.9179, + "step": 3820 + }, + { + "epoch": 0.19057356608478804, + "grad_norm": 0.4608332748216118, + "learning_rate": 4.669759646201507e-05, + "loss": 0.897, + "step": 3821 + }, + { + "epoch": 0.19062344139650872, + "grad_norm": 0.37542184203440576, + "learning_rate": 4.669559014967904e-05, + "loss": 0.8865, + "step": 3822 + }, + { + "epoch": 0.19067331670822943, + "grad_norm": 0.46113386023089015, + "learning_rate": 4.66935832712063e-05, + "loss": 0.9619, + "step": 3823 + }, + { + "epoch": 0.19072319201995014, + "grad_norm": 0.6292256068444977, + "learning_rate": 4.6691575826649205e-05, + "loss": 0.9783, + "step": 3824 + }, + { + "epoch": 0.19077306733167082, + "grad_norm": 0.45209711681720827, + "learning_rate": 4.668956781606014e-05, + "loss": 0.9394, + "step": 3825 + }, + { + "epoch": 0.19082294264339153, + "grad_norm": 0.4815587445711195, + "learning_rate": 4.6687559239491516e-05, + "loss": 0.9507, + "step": 3826 + }, + { + "epoch": 0.1908728179551122, + "grad_norm": 0.4624385482416898, + "learning_rate": 4.6685550096995744e-05, + "loss": 0.9776, + "step": 3827 + }, + { + "epoch": 0.19092269326683292, + "grad_norm": 0.42712759111644544, + "learning_rate": 4.6683540388625246e-05, + "loss": 0.9122, + "step": 3828 + }, + { + "epoch": 0.1909725685785536, + "grad_norm": 0.45468506025290734, + "learning_rate": 4.6681530114432456e-05, + "loss": 0.9036, + "step": 3829 + }, + { + "epoch": 0.1910224438902743, + "grad_norm": 0.4806892913994025, + "learning_rate": 4.667951927446986e-05, + "loss": 0.9152, + "step": 3830 + }, + { + "epoch": 0.19107231920199502, + "grad_norm": 0.5099439703537826, + "learning_rate": 4.667750786878989e-05, + "loss": 0.9589, + "step": 3831 + }, + { + "epoch": 0.1911221945137157, + "grad_norm": 0.44020215953638453, + "learning_rate": 4.667549589744507e-05, + "loss": 0.9261, + "step": 3832 + }, + { + "epoch": 0.1911720698254364, + "grad_norm": 0.4466171177126281, + "learning_rate": 4.667348336048788e-05, + "loss": 0.9409, + "step": 3833 + }, + { + "epoch": 0.1912219451371571, + "grad_norm": 0.4640472296368419, + "learning_rate": 4.6671470257970836e-05, + "loss": 0.9322, + "step": 3834 + }, + { + "epoch": 0.1912718204488778, + "grad_norm": 0.3972578320729183, + "learning_rate": 4.6669456589946485e-05, + "loss": 0.9109, + "step": 3835 + }, + { + "epoch": 0.19132169576059851, + "grad_norm": 0.5990009905794901, + "learning_rate": 4.666744235646736e-05, + "loss": 0.9598, + "step": 3836 + }, + { + "epoch": 0.1913715710723192, + "grad_norm": 0.47805139021339466, + "learning_rate": 4.666542755758602e-05, + "loss": 0.9785, + "step": 3837 + }, + { + "epoch": 0.1914214463840399, + "grad_norm": 0.4862969720516297, + "learning_rate": 4.666341219335504e-05, + "loss": 0.9462, + "step": 3838 + }, + { + "epoch": 0.1914713216957606, + "grad_norm": 0.4618927949811877, + "learning_rate": 4.6661396263827015e-05, + "loss": 0.9631, + "step": 3839 + }, + { + "epoch": 0.1915211970074813, + "grad_norm": 0.48793995482032687, + "learning_rate": 4.665937976905456e-05, + "loss": 0.922, + "step": 3840 + }, + { + "epoch": 0.191571072319202, + "grad_norm": 0.48471260726075727, + "learning_rate": 4.665736270909027e-05, + "loss": 0.8887, + "step": 3841 + }, + { + "epoch": 0.1916209476309227, + "grad_norm": 0.4449018602906407, + "learning_rate": 4.66553450839868e-05, + "loss": 0.914, + "step": 3842 + }, + { + "epoch": 0.1916708229426434, + "grad_norm": 0.48345692334816004, + "learning_rate": 4.665332689379679e-05, + "loss": 0.9831, + "step": 3843 + }, + { + "epoch": 0.19172069825436408, + "grad_norm": 0.6256242147242155, + "learning_rate": 4.665130813857291e-05, + "loss": 0.9276, + "step": 3844 + }, + { + "epoch": 0.1917705735660848, + "grad_norm": 0.49114992579570493, + "learning_rate": 4.664928881836783e-05, + "loss": 0.9272, + "step": 3845 + }, + { + "epoch": 0.1918204488778055, + "grad_norm": 0.40242054069253763, + "learning_rate": 4.6647268933234246e-05, + "loss": 0.8718, + "step": 3846 + }, + { + "epoch": 0.19187032418952618, + "grad_norm": 0.43424719400244594, + "learning_rate": 4.6645248483224866e-05, + "loss": 0.9481, + "step": 3847 + }, + { + "epoch": 0.1919201995012469, + "grad_norm": 0.47507807030400906, + "learning_rate": 4.6643227468392416e-05, + "loss": 0.9655, + "step": 3848 + }, + { + "epoch": 0.19197007481296757, + "grad_norm": 0.4641831071382356, + "learning_rate": 4.664120588878963e-05, + "loss": 0.9634, + "step": 3849 + }, + { + "epoch": 0.19201995012468828, + "grad_norm": 0.5363505961490639, + "learning_rate": 4.6639183744469264e-05, + "loss": 0.9258, + "step": 3850 + }, + { + "epoch": 0.19206982543640896, + "grad_norm": 0.6246659515409169, + "learning_rate": 4.663716103548408e-05, + "loss": 0.985, + "step": 3851 + }, + { + "epoch": 0.19211970074812967, + "grad_norm": 0.49962123644321793, + "learning_rate": 4.663513776188687e-05, + "loss": 0.9157, + "step": 3852 + }, + { + "epoch": 0.19216957605985038, + "grad_norm": 0.4937541151590045, + "learning_rate": 4.663311392373041e-05, + "loss": 0.9899, + "step": 3853 + }, + { + "epoch": 0.19221945137157107, + "grad_norm": 0.4539705741079468, + "learning_rate": 4.663108952106754e-05, + "loss": 0.9603, + "step": 3854 + }, + { + "epoch": 0.19226932668329177, + "grad_norm": 0.4726091860882575, + "learning_rate": 4.6629064553951056e-05, + "loss": 0.9133, + "step": 3855 + }, + { + "epoch": 0.19231920199501246, + "grad_norm": 0.4294496496497705, + "learning_rate": 4.6627039022433816e-05, + "loss": 0.9169, + "step": 3856 + }, + { + "epoch": 0.19236907730673317, + "grad_norm": 0.49994346537191514, + "learning_rate": 4.662501292656867e-05, + "loss": 0.9361, + "step": 3857 + }, + { + "epoch": 0.19241895261845388, + "grad_norm": 0.4397351721892924, + "learning_rate": 4.66229862664085e-05, + "loss": 0.9062, + "step": 3858 + }, + { + "epoch": 0.19246882793017456, + "grad_norm": 0.4037590854819271, + "learning_rate": 4.6620959042006174e-05, + "loss": 0.9016, + "step": 3859 + }, + { + "epoch": 0.19251870324189527, + "grad_norm": 0.41969039562094207, + "learning_rate": 4.661893125341459e-05, + "loss": 0.8685, + "step": 3860 + }, + { + "epoch": 0.19256857855361595, + "grad_norm": 0.5234954393522493, + "learning_rate": 4.661690290068668e-05, + "loss": 0.965, + "step": 3861 + }, + { + "epoch": 0.19261845386533666, + "grad_norm": 0.49649168856771636, + "learning_rate": 4.661487398387536e-05, + "loss": 0.9583, + "step": 3862 + }, + { + "epoch": 0.19266832917705737, + "grad_norm": 0.4315104350243032, + "learning_rate": 4.6612844503033585e-05, + "loss": 0.9227, + "step": 3863 + }, + { + "epoch": 0.19271820448877805, + "grad_norm": 0.564106418040004, + "learning_rate": 4.661081445821429e-05, + "loss": 0.9329, + "step": 3864 + }, + { + "epoch": 0.19276807980049876, + "grad_norm": 0.42439455618434735, + "learning_rate": 4.660878384947047e-05, + "loss": 0.9229, + "step": 3865 + }, + { + "epoch": 0.19281795511221944, + "grad_norm": 0.41491376856379886, + "learning_rate": 4.6606752676855115e-05, + "loss": 0.9303, + "step": 3866 + }, + { + "epoch": 0.19286783042394015, + "grad_norm": 0.41794966515886106, + "learning_rate": 4.660472094042121e-05, + "loss": 0.9165, + "step": 3867 + }, + { + "epoch": 0.19291770573566086, + "grad_norm": 0.39932586500931194, + "learning_rate": 4.660268864022178e-05, + "loss": 0.9873, + "step": 3868 + }, + { + "epoch": 0.19296758104738154, + "grad_norm": 0.4780196417212367, + "learning_rate": 4.6600655776309856e-05, + "loss": 0.9312, + "step": 3869 + }, + { + "epoch": 0.19301745635910225, + "grad_norm": 0.4946356612407521, + "learning_rate": 4.65986223487385e-05, + "loss": 0.9493, + "step": 3870 + }, + { + "epoch": 0.19306733167082293, + "grad_norm": 0.5103516909748184, + "learning_rate": 4.659658835756074e-05, + "loss": 0.9536, + "step": 3871 + }, + { + "epoch": 0.19311720698254364, + "grad_norm": 0.44683128040163245, + "learning_rate": 4.659455380282969e-05, + "loss": 0.9957, + "step": 3872 + }, + { + "epoch": 0.19316708229426435, + "grad_norm": 0.4207739014359983, + "learning_rate": 4.6592518684598416e-05, + "loss": 0.8806, + "step": 3873 + }, + { + "epoch": 0.19321695760598503, + "grad_norm": 0.5824453361888372, + "learning_rate": 4.6590483002920035e-05, + "loss": 0.92, + "step": 3874 + }, + { + "epoch": 0.19326683291770574, + "grad_norm": 0.4549741469694947, + "learning_rate": 4.658844675784766e-05, + "loss": 0.9259, + "step": 3875 + }, + { + "epoch": 0.19331670822942643, + "grad_norm": 0.530610218693694, + "learning_rate": 4.658640994943442e-05, + "loss": 0.9423, + "step": 3876 + }, + { + "epoch": 0.19336658354114714, + "grad_norm": 0.5048326697499484, + "learning_rate": 4.658437257773348e-05, + "loss": 0.9378, + "step": 3877 + }, + { + "epoch": 0.19341645885286782, + "grad_norm": 0.5707917034308668, + "learning_rate": 4.6582334642798e-05, + "loss": 0.8876, + "step": 3878 + }, + { + "epoch": 0.19346633416458853, + "grad_norm": 0.4976999459818786, + "learning_rate": 4.6580296144681157e-05, + "loss": 0.9259, + "step": 3879 + }, + { + "epoch": 0.19351620947630924, + "grad_norm": 0.3956316338935707, + "learning_rate": 4.657825708343614e-05, + "loss": 0.9433, + "step": 3880 + }, + { + "epoch": 0.19356608478802992, + "grad_norm": 0.42267502388191136, + "learning_rate": 4.657621745911617e-05, + "loss": 0.9427, + "step": 3881 + }, + { + "epoch": 0.19361596009975063, + "grad_norm": 0.49616632713631986, + "learning_rate": 4.657417727177445e-05, + "loss": 0.9472, + "step": 3882 + }, + { + "epoch": 0.1936658354114713, + "grad_norm": 0.4860684552901263, + "learning_rate": 4.657213652146424e-05, + "loss": 0.9421, + "step": 3883 + }, + { + "epoch": 0.19371571072319202, + "grad_norm": 0.4355732960345289, + "learning_rate": 4.657009520823877e-05, + "loss": 0.9655, + "step": 3884 + }, + { + "epoch": 0.19376558603491273, + "grad_norm": 0.4329329637908094, + "learning_rate": 4.656805333215132e-05, + "loss": 0.931, + "step": 3885 + }, + { + "epoch": 0.1938154613466334, + "grad_norm": 0.5850026224002411, + "learning_rate": 4.656601089325518e-05, + "loss": 0.9338, + "step": 3886 + }, + { + "epoch": 0.19386533665835412, + "grad_norm": 0.42379078215621313, + "learning_rate": 4.656396789160363e-05, + "loss": 0.9578, + "step": 3887 + }, + { + "epoch": 0.1939152119700748, + "grad_norm": 0.5265305784635577, + "learning_rate": 4.656192432725e-05, + "loss": 0.95, + "step": 3888 + }, + { + "epoch": 0.1939650872817955, + "grad_norm": 0.49241932475327355, + "learning_rate": 4.6559880200247594e-05, + "loss": 0.9195, + "step": 3889 + }, + { + "epoch": 0.19401496259351622, + "grad_norm": 0.9278007197356019, + "learning_rate": 4.655783551064976e-05, + "loss": 0.9232, + "step": 3890 + }, + { + "epoch": 0.1940648379052369, + "grad_norm": 0.39974257278651876, + "learning_rate": 4.6555790258509866e-05, + "loss": 0.9297, + "step": 3891 + }, + { + "epoch": 0.1941147132169576, + "grad_norm": 0.5329021174452402, + "learning_rate": 4.655374444388127e-05, + "loss": 0.9454, + "step": 3892 + }, + { + "epoch": 0.1941645885286783, + "grad_norm": 0.42286921348063267, + "learning_rate": 4.655169806681736e-05, + "loss": 0.9334, + "step": 3893 + }, + { + "epoch": 0.194214463840399, + "grad_norm": 0.8157801622845573, + "learning_rate": 4.654965112737153e-05, + "loss": 0.9174, + "step": 3894 + }, + { + "epoch": 0.1942643391521197, + "grad_norm": 0.44730912312067833, + "learning_rate": 4.65476036255972e-05, + "loss": 0.9394, + "step": 3895 + }, + { + "epoch": 0.1943142144638404, + "grad_norm": 0.4037319984819019, + "learning_rate": 4.654555556154779e-05, + "loss": 0.8834, + "step": 3896 + }, + { + "epoch": 0.1943640897755611, + "grad_norm": 0.4625082543167065, + "learning_rate": 4.654350693527676e-05, + "loss": 0.937, + "step": 3897 + }, + { + "epoch": 0.1944139650872818, + "grad_norm": 0.47771371302988724, + "learning_rate": 4.654145774683756e-05, + "loss": 0.9576, + "step": 3898 + }, + { + "epoch": 0.1944638403990025, + "grad_norm": 0.5496897048978276, + "learning_rate": 4.653940799628366e-05, + "loss": 1.0083, + "step": 3899 + }, + { + "epoch": 0.19451371571072318, + "grad_norm": 0.46673505862459636, + "learning_rate": 4.653735768366854e-05, + "loss": 0.9435, + "step": 3900 + }, + { + "epoch": 0.1945635910224439, + "grad_norm": 0.49258201823207615, + "learning_rate": 4.6535306809045706e-05, + "loss": 0.9666, + "step": 3901 + }, + { + "epoch": 0.1946134663341646, + "grad_norm": 0.8226264706796239, + "learning_rate": 4.653325537246869e-05, + "loss": 0.96, + "step": 3902 + }, + { + "epoch": 0.19466334164588528, + "grad_norm": 0.4084731201192741, + "learning_rate": 4.6531203373991014e-05, + "loss": 0.9401, + "step": 3903 + }, + { + "epoch": 0.194713216957606, + "grad_norm": 0.5013240374378365, + "learning_rate": 4.652915081366621e-05, + "loss": 0.9918, + "step": 3904 + }, + { + "epoch": 0.19476309226932667, + "grad_norm": 0.44410983748892197, + "learning_rate": 4.652709769154786e-05, + "loss": 0.9405, + "step": 3905 + }, + { + "epoch": 0.19481296758104738, + "grad_norm": 0.6297134628928268, + "learning_rate": 4.652504400768953e-05, + "loss": 0.9475, + "step": 3906 + }, + { + "epoch": 0.1948628428927681, + "grad_norm": 0.40990539969791645, + "learning_rate": 4.652298976214481e-05, + "loss": 0.9447, + "step": 3907 + }, + { + "epoch": 0.19491271820448877, + "grad_norm": 0.48424367628950243, + "learning_rate": 4.6520934954967296e-05, + "loss": 0.932, + "step": 3908 + }, + { + "epoch": 0.19496259351620948, + "grad_norm": 0.6461583097388128, + "learning_rate": 4.651887958621063e-05, + "loss": 0.9653, + "step": 3909 + }, + { + "epoch": 0.19501246882793016, + "grad_norm": 0.48393205424521313, + "learning_rate": 4.651682365592842e-05, + "loss": 0.9102, + "step": 3910 + }, + { + "epoch": 0.19506234413965087, + "grad_norm": 0.46872373019217733, + "learning_rate": 4.6514767164174325e-05, + "loss": 0.9006, + "step": 3911 + }, + { + "epoch": 0.19511221945137158, + "grad_norm": 0.4182830004368735, + "learning_rate": 4.6512710111002014e-05, + "loss": 0.8986, + "step": 3912 + }, + { + "epoch": 0.19516209476309226, + "grad_norm": 0.4922261025556701, + "learning_rate": 4.651065249646517e-05, + "loss": 0.8952, + "step": 3913 + }, + { + "epoch": 0.19521197007481297, + "grad_norm": 0.8759096345029508, + "learning_rate": 4.650859432061746e-05, + "loss": 0.9635, + "step": 3914 + }, + { + "epoch": 0.19526184538653366, + "grad_norm": 0.5079922250035731, + "learning_rate": 4.650653558351262e-05, + "loss": 0.9475, + "step": 3915 + }, + { + "epoch": 0.19531172069825437, + "grad_norm": 0.47579712880676606, + "learning_rate": 4.650447628520435e-05, + "loss": 0.9535, + "step": 3916 + }, + { + "epoch": 0.19536159600997507, + "grad_norm": 0.41403054784226045, + "learning_rate": 4.65024164257464e-05, + "loss": 0.9253, + "step": 3917 + }, + { + "epoch": 0.19541147132169576, + "grad_norm": 0.42177081340960015, + "learning_rate": 4.6500356005192514e-05, + "loss": 0.9246, + "step": 3918 + }, + { + "epoch": 0.19546134663341647, + "grad_norm": 0.5249034456689174, + "learning_rate": 4.649829502359646e-05, + "loss": 0.9453, + "step": 3919 + }, + { + "epoch": 0.19551122194513715, + "grad_norm": 0.4261239008651081, + "learning_rate": 4.649623348101203e-05, + "loss": 0.9381, + "step": 3920 + }, + { + "epoch": 0.19556109725685786, + "grad_norm": 0.4117236879008326, + "learning_rate": 4.649417137749299e-05, + "loss": 0.9441, + "step": 3921 + }, + { + "epoch": 0.19561097256857857, + "grad_norm": 0.5907724010140025, + "learning_rate": 4.6492108713093176e-05, + "loss": 0.9275, + "step": 3922 + }, + { + "epoch": 0.19566084788029925, + "grad_norm": 0.6890405214837546, + "learning_rate": 4.649004548786641e-05, + "loss": 0.909, + "step": 3923 + }, + { + "epoch": 0.19571072319201996, + "grad_norm": 0.6067079044496712, + "learning_rate": 4.648798170186651e-05, + "loss": 0.956, + "step": 3924 + }, + { + "epoch": 0.19576059850374064, + "grad_norm": 0.5352264495023449, + "learning_rate": 4.648591735514736e-05, + "loss": 0.9245, + "step": 3925 + }, + { + "epoch": 0.19581047381546135, + "grad_norm": 0.46928233379052525, + "learning_rate": 4.64838524477628e-05, + "loss": 0.9411, + "step": 3926 + }, + { + "epoch": 0.19586034912718203, + "grad_norm": 0.5191984191607446, + "learning_rate": 4.648178697976673e-05, + "loss": 0.9169, + "step": 3927 + }, + { + "epoch": 0.19591022443890274, + "grad_norm": 0.513191007090705, + "learning_rate": 4.647972095121305e-05, + "loss": 0.8813, + "step": 3928 + }, + { + "epoch": 0.19596009975062345, + "grad_norm": 0.497495911052122, + "learning_rate": 4.647765436215566e-05, + "loss": 0.9501, + "step": 3929 + }, + { + "epoch": 0.19600997506234413, + "grad_norm": 0.4901139963167155, + "learning_rate": 4.6475587212648486e-05, + "loss": 0.9941, + "step": 3930 + }, + { + "epoch": 0.19605985037406484, + "grad_norm": 0.454594115141728, + "learning_rate": 4.6473519502745476e-05, + "loss": 0.9424, + "step": 3931 + }, + { + "epoch": 0.19610972568578552, + "grad_norm": 0.47161413282892517, + "learning_rate": 4.6471451232500586e-05, + "loss": 0.952, + "step": 3932 + }, + { + "epoch": 0.19615960099750623, + "grad_norm": 0.7149956114424042, + "learning_rate": 4.646938240196779e-05, + "loss": 0.9568, + "step": 3933 + }, + { + "epoch": 0.19620947630922694, + "grad_norm": 0.4982109185459218, + "learning_rate": 4.6467313011201066e-05, + "loss": 0.9687, + "step": 3934 + }, + { + "epoch": 0.19625935162094763, + "grad_norm": 0.4742667484333068, + "learning_rate": 4.6465243060254415e-05, + "loss": 0.9291, + "step": 3935 + }, + { + "epoch": 0.19630922693266833, + "grad_norm": 2.5725289078667855, + "learning_rate": 4.646317254918186e-05, + "loss": 0.9234, + "step": 3936 + }, + { + "epoch": 0.19635910224438902, + "grad_norm": 0.5813627413323438, + "learning_rate": 4.646110147803741e-05, + "loss": 0.9049, + "step": 3937 + }, + { + "epoch": 0.19640897755610973, + "grad_norm": 0.5996563655357513, + "learning_rate": 4.6459029846875124e-05, + "loss": 0.9807, + "step": 3938 + }, + { + "epoch": 0.19645885286783044, + "grad_norm": 0.4339403908769471, + "learning_rate": 4.6456957655749064e-05, + "loss": 0.9731, + "step": 3939 + }, + { + "epoch": 0.19650872817955112, + "grad_norm": 0.5941352962217921, + "learning_rate": 4.64548849047133e-05, + "loss": 0.9725, + "step": 3940 + }, + { + "epoch": 0.19655860349127183, + "grad_norm": 0.45343661798395396, + "learning_rate": 4.645281159382191e-05, + "loss": 0.9375, + "step": 3941 + }, + { + "epoch": 0.1966084788029925, + "grad_norm": 0.48416164094411096, + "learning_rate": 4.6450737723129e-05, + "loss": 0.9014, + "step": 3942 + }, + { + "epoch": 0.19665835411471322, + "grad_norm": 0.588598037504591, + "learning_rate": 4.64486632926887e-05, + "loss": 0.957, + "step": 3943 + }, + { + "epoch": 0.19670822942643393, + "grad_norm": 0.4287940517577355, + "learning_rate": 4.644658830255512e-05, + "loss": 0.9064, + "step": 3944 + }, + { + "epoch": 0.1967581047381546, + "grad_norm": 0.47910676207424985, + "learning_rate": 4.6444512752782424e-05, + "loss": 0.9727, + "step": 3945 + }, + { + "epoch": 0.19680798004987532, + "grad_norm": 0.5577865313267015, + "learning_rate": 4.644243664342476e-05, + "loss": 0.9406, + "step": 3946 + }, + { + "epoch": 0.196857855361596, + "grad_norm": 0.5144243365107684, + "learning_rate": 4.6440359974536306e-05, + "loss": 0.9393, + "step": 3947 + }, + { + "epoch": 0.1969077306733167, + "grad_norm": 0.5051514910452578, + "learning_rate": 4.6438282746171255e-05, + "loss": 0.9446, + "step": 3948 + }, + { + "epoch": 0.1969576059850374, + "grad_norm": 0.4579034282717349, + "learning_rate": 4.643620495838381e-05, + "loss": 0.937, + "step": 3949 + }, + { + "epoch": 0.1970074812967581, + "grad_norm": 0.4273077467476881, + "learning_rate": 4.64341266112282e-05, + "loss": 0.9583, + "step": 3950 + }, + { + "epoch": 0.1970573566084788, + "grad_norm": 0.42074699752092337, + "learning_rate": 4.643204770475864e-05, + "loss": 0.9305, + "step": 3951 + }, + { + "epoch": 0.1971072319201995, + "grad_norm": 0.4804456563277724, + "learning_rate": 4.6429968239029384e-05, + "loss": 0.9604, + "step": 3952 + }, + { + "epoch": 0.1971571072319202, + "grad_norm": 0.6946331425427285, + "learning_rate": 4.64278882140947e-05, + "loss": 0.9581, + "step": 3953 + }, + { + "epoch": 0.19720698254364089, + "grad_norm": 1.026448555115763, + "learning_rate": 4.6425807630008874e-05, + "loss": 0.9557, + "step": 3954 + }, + { + "epoch": 0.1972568578553616, + "grad_norm": 0.46365856514471465, + "learning_rate": 4.642372648682618e-05, + "loss": 0.9297, + "step": 3955 + }, + { + "epoch": 0.1973067331670823, + "grad_norm": 0.5032840478892971, + "learning_rate": 4.6421644784600923e-05, + "loss": 0.9326, + "step": 3956 + }, + { + "epoch": 0.197356608478803, + "grad_norm": 0.47088484948058845, + "learning_rate": 4.641956252338744e-05, + "loss": 0.9565, + "step": 3957 + }, + { + "epoch": 0.1974064837905237, + "grad_norm": 0.4837582247602778, + "learning_rate": 4.6417479703240055e-05, + "loss": 0.9315, + "step": 3958 + }, + { + "epoch": 0.19745635910224438, + "grad_norm": 0.42340635939354837, + "learning_rate": 4.641539632421313e-05, + "loss": 0.92, + "step": 3959 + }, + { + "epoch": 0.1975062344139651, + "grad_norm": 0.5504481526009002, + "learning_rate": 4.641331238636101e-05, + "loss": 0.9052, + "step": 3960 + }, + { + "epoch": 0.1975561097256858, + "grad_norm": 0.4866623204389158, + "learning_rate": 4.641122788973809e-05, + "loss": 0.9648, + "step": 3961 + }, + { + "epoch": 0.19760598503740648, + "grad_norm": 0.9013258262438468, + "learning_rate": 4.640914283439877e-05, + "loss": 0.9426, + "step": 3962 + }, + { + "epoch": 0.1976558603491272, + "grad_norm": 0.45658018138352424, + "learning_rate": 4.6407057220397445e-05, + "loss": 0.9063, + "step": 3963 + }, + { + "epoch": 0.19770573566084787, + "grad_norm": 0.5647291317490869, + "learning_rate": 4.640497104778854e-05, + "loss": 0.9503, + "step": 3964 + }, + { + "epoch": 0.19775561097256858, + "grad_norm": 0.5235938325718046, + "learning_rate": 4.64028843166265e-05, + "loss": 0.988, + "step": 3965 + }, + { + "epoch": 0.1978054862842893, + "grad_norm": 0.49502359857842937, + "learning_rate": 4.640079702696577e-05, + "loss": 0.8907, + "step": 3966 + }, + { + "epoch": 0.19785536159600997, + "grad_norm": 0.5829879624076496, + "learning_rate": 4.639870917886082e-05, + "loss": 0.9308, + "step": 3967 + }, + { + "epoch": 0.19790523690773068, + "grad_norm": 0.47106640328334437, + "learning_rate": 4.639662077236613e-05, + "loss": 0.9083, + "step": 3968 + }, + { + "epoch": 0.19795511221945136, + "grad_norm": 0.5312331349612857, + "learning_rate": 4.639453180753619e-05, + "loss": 0.9372, + "step": 3969 + }, + { + "epoch": 0.19800498753117207, + "grad_norm": 0.4100729992547825, + "learning_rate": 4.639244228442553e-05, + "loss": 0.9647, + "step": 3970 + }, + { + "epoch": 0.19805486284289278, + "grad_norm": 0.47315052554972625, + "learning_rate": 4.639035220308865e-05, + "loss": 0.9277, + "step": 3971 + }, + { + "epoch": 0.19810473815461346, + "grad_norm": 0.4835637152179287, + "learning_rate": 4.6388261563580105e-05, + "loss": 1.0157, + "step": 3972 + }, + { + "epoch": 0.19815461346633417, + "grad_norm": 0.5310109374771378, + "learning_rate": 4.638617036595445e-05, + "loss": 0.8974, + "step": 3973 + }, + { + "epoch": 0.19820448877805485, + "grad_norm": 0.5497018480465555, + "learning_rate": 4.6384078610266254e-05, + "loss": 0.9505, + "step": 3974 + }, + { + "epoch": 0.19825436408977556, + "grad_norm": 0.4170634701796219, + "learning_rate": 4.638198629657009e-05, + "loss": 0.9082, + "step": 3975 + }, + { + "epoch": 0.19830423940149625, + "grad_norm": 0.49791257004955747, + "learning_rate": 4.637989342492057e-05, + "loss": 0.9832, + "step": 3976 + }, + { + "epoch": 0.19835411471321696, + "grad_norm": 0.8362320730912418, + "learning_rate": 4.6377799995372295e-05, + "loss": 0.9276, + "step": 3977 + }, + { + "epoch": 0.19840399002493767, + "grad_norm": 0.5720557757465384, + "learning_rate": 4.637570600797989e-05, + "loss": 0.9335, + "step": 3978 + }, + { + "epoch": 0.19845386533665835, + "grad_norm": 0.5598652075832294, + "learning_rate": 4.6373611462798016e-05, + "loss": 0.9914, + "step": 3979 + }, + { + "epoch": 0.19850374064837906, + "grad_norm": 0.5503809480332162, + "learning_rate": 4.6371516359881306e-05, + "loss": 0.9296, + "step": 3980 + }, + { + "epoch": 0.19855361596009974, + "grad_norm": 0.43494033729985276, + "learning_rate": 4.6369420699284446e-05, + "loss": 0.9483, + "step": 3981 + }, + { + "epoch": 0.19860349127182045, + "grad_norm": 0.4784098234465367, + "learning_rate": 4.636732448106212e-05, + "loss": 0.9525, + "step": 3982 + }, + { + "epoch": 0.19865336658354116, + "grad_norm": 0.4259893532604471, + "learning_rate": 4.636522770526902e-05, + "loss": 0.9489, + "step": 3983 + }, + { + "epoch": 0.19870324189526184, + "grad_norm": 0.41889691918539257, + "learning_rate": 4.636313037195986e-05, + "loss": 0.9457, + "step": 3984 + }, + { + "epoch": 0.19875311720698255, + "grad_norm": 0.5140484332572002, + "learning_rate": 4.636103248118938e-05, + "loss": 0.915, + "step": 3985 + }, + { + "epoch": 0.19880299251870323, + "grad_norm": 0.43510991446714253, + "learning_rate": 4.6358934033012324e-05, + "loss": 0.9273, + "step": 3986 + }, + { + "epoch": 0.19885286783042394, + "grad_norm": 0.5307904198287163, + "learning_rate": 4.635683502748343e-05, + "loss": 0.9739, + "step": 3987 + }, + { + "epoch": 0.19890274314214465, + "grad_norm": 0.9068203925257883, + "learning_rate": 4.6354735464657495e-05, + "loss": 0.9577, + "step": 3988 + }, + { + "epoch": 0.19895261845386533, + "grad_norm": 0.4655328519216688, + "learning_rate": 4.635263534458929e-05, + "loss": 0.9448, + "step": 3989 + }, + { + "epoch": 0.19900249376558604, + "grad_norm": 0.5284788810457901, + "learning_rate": 4.635053466733362e-05, + "loss": 0.9259, + "step": 3990 + }, + { + "epoch": 0.19905236907730672, + "grad_norm": 0.7107428483454813, + "learning_rate": 4.634843343294531e-05, + "loss": 0.9427, + "step": 3991 + }, + { + "epoch": 0.19910224438902743, + "grad_norm": 0.4906472088607391, + "learning_rate": 4.634633164147918e-05, + "loss": 0.9666, + "step": 3992 + }, + { + "epoch": 0.19915211970074814, + "grad_norm": 0.3958213331779895, + "learning_rate": 4.6344229292990074e-05, + "loss": 0.9198, + "step": 3993 + }, + { + "epoch": 0.19920199501246882, + "grad_norm": 0.44980411397742476, + "learning_rate": 4.634212638753286e-05, + "loss": 0.9003, + "step": 3994 + }, + { + "epoch": 0.19925187032418953, + "grad_norm": 0.45618239764748736, + "learning_rate": 4.634002292516242e-05, + "loss": 0.9434, + "step": 3995 + }, + { + "epoch": 0.19930174563591022, + "grad_norm": 0.6139816671085628, + "learning_rate": 4.633791890593362e-05, + "loss": 0.9656, + "step": 3996 + }, + { + "epoch": 0.19935162094763093, + "grad_norm": 0.4650568678108267, + "learning_rate": 4.633581432990138e-05, + "loss": 0.9491, + "step": 3997 + }, + { + "epoch": 0.1994014962593516, + "grad_norm": 0.49757189936926627, + "learning_rate": 4.633370919712061e-05, + "loss": 0.9368, + "step": 3998 + }, + { + "epoch": 0.19945137157107232, + "grad_norm": 0.5373396012136574, + "learning_rate": 4.633160350764625e-05, + "loss": 0.9427, + "step": 3999 + }, + { + "epoch": 0.19950124688279303, + "grad_norm": 0.9428576722688772, + "learning_rate": 4.6329497261533236e-05, + "loss": 0.9649, + "step": 4000 + }, + { + "epoch": 0.1995511221945137, + "grad_norm": 0.47901842957521445, + "learning_rate": 4.632739045883655e-05, + "loss": 0.9591, + "step": 4001 + }, + { + "epoch": 0.19960099750623442, + "grad_norm": 0.4568716124011764, + "learning_rate": 4.632528309961115e-05, + "loss": 0.9286, + "step": 4002 + }, + { + "epoch": 0.1996508728179551, + "grad_norm": 0.9510143361301486, + "learning_rate": 4.6323175183912024e-05, + "loss": 0.9454, + "step": 4003 + }, + { + "epoch": 0.1997007481296758, + "grad_norm": 0.41407185110319716, + "learning_rate": 4.632106671179419e-05, + "loss": 0.9087, + "step": 4004 + }, + { + "epoch": 0.19975062344139652, + "grad_norm": 0.4681187368292013, + "learning_rate": 4.631895768331266e-05, + "loss": 0.9497, + "step": 4005 + }, + { + "epoch": 0.1998004987531172, + "grad_norm": 0.5791611120975213, + "learning_rate": 4.631684809852248e-05, + "loss": 0.943, + "step": 4006 + }, + { + "epoch": 0.1998503740648379, + "grad_norm": 0.46005252784388767, + "learning_rate": 4.6314737957478685e-05, + "loss": 0.9354, + "step": 4007 + }, + { + "epoch": 0.1999002493765586, + "grad_norm": 0.477393505101588, + "learning_rate": 4.631262726023633e-05, + "loss": 0.9057, + "step": 4008 + }, + { + "epoch": 0.1999501246882793, + "grad_norm": 0.45545167627057026, + "learning_rate": 4.631051600685051e-05, + "loss": 0.9636, + "step": 4009 + }, + { + "epoch": 0.2, + "grad_norm": 0.48493612100814437, + "learning_rate": 4.630840419737632e-05, + "loss": 0.9062, + "step": 4010 + }, + { + "epoch": 0.2000498753117207, + "grad_norm": 0.45555901648481373, + "learning_rate": 4.630629183186885e-05, + "loss": 0.9379, + "step": 4011 + }, + { + "epoch": 0.2000997506234414, + "grad_norm": 0.4899972346566807, + "learning_rate": 4.630417891038324e-05, + "loss": 0.9376, + "step": 4012 + }, + { + "epoch": 0.20014962593516208, + "grad_norm": 0.4551148236857176, + "learning_rate": 4.6302065432974616e-05, + "loss": 0.8898, + "step": 4013 + }, + { + "epoch": 0.2001995012468828, + "grad_norm": 0.45907505884570105, + "learning_rate": 4.6299951399698124e-05, + "loss": 0.8945, + "step": 4014 + }, + { + "epoch": 0.2002493765586035, + "grad_norm": 0.426835604277975, + "learning_rate": 4.629783681060893e-05, + "loss": 0.9098, + "step": 4015 + }, + { + "epoch": 0.20029925187032419, + "grad_norm": 0.4240668803292635, + "learning_rate": 4.629572166576221e-05, + "loss": 0.9211, + "step": 4016 + }, + { + "epoch": 0.2003491271820449, + "grad_norm": 0.4182961561041454, + "learning_rate": 4.629360596521318e-05, + "loss": 0.9158, + "step": 4017 + }, + { + "epoch": 0.20039900249376558, + "grad_norm": 0.45950783548358537, + "learning_rate": 4.6291489709017025e-05, + "loss": 0.9162, + "step": 4018 + }, + { + "epoch": 0.2004488778054863, + "grad_norm": 0.41640470192443413, + "learning_rate": 4.628937289722897e-05, + "loss": 0.906, + "step": 4019 + }, + { + "epoch": 0.200498753117207, + "grad_norm": 0.4457289645119661, + "learning_rate": 4.628725552990426e-05, + "loss": 0.9695, + "step": 4020 + }, + { + "epoch": 0.20054862842892768, + "grad_norm": 0.5295049292503292, + "learning_rate": 4.628513760709814e-05, + "loss": 0.9768, + "step": 4021 + }, + { + "epoch": 0.2005985037406484, + "grad_norm": 0.5047194858876841, + "learning_rate": 4.628301912886589e-05, + "loss": 0.9337, + "step": 4022 + }, + { + "epoch": 0.20064837905236907, + "grad_norm": 0.4206305576719781, + "learning_rate": 4.628090009526277e-05, + "loss": 0.9708, + "step": 4023 + }, + { + "epoch": 0.20069825436408978, + "grad_norm": 0.4718686059744529, + "learning_rate": 4.62787805063441e-05, + "loss": 0.929, + "step": 4024 + }, + { + "epoch": 0.20074812967581046, + "grad_norm": 0.45671034554870316, + "learning_rate": 4.627666036216517e-05, + "loss": 0.9528, + "step": 4025 + }, + { + "epoch": 0.20079800498753117, + "grad_norm": 0.42353845249500277, + "learning_rate": 4.6274539662781305e-05, + "loss": 0.9393, + "step": 4026 + }, + { + "epoch": 0.20084788029925188, + "grad_norm": 0.49810027125392975, + "learning_rate": 4.627241840824785e-05, + "loss": 0.9195, + "step": 4027 + }, + { + "epoch": 0.20089775561097256, + "grad_norm": 0.4140012073224336, + "learning_rate": 4.627029659862016e-05, + "loss": 0.9076, + "step": 4028 + }, + { + "epoch": 0.20094763092269327, + "grad_norm": 0.7592748141071028, + "learning_rate": 4.626817423395359e-05, + "loss": 0.9455, + "step": 4029 + }, + { + "epoch": 0.20099750623441395, + "grad_norm": 0.4777869950067755, + "learning_rate": 4.626605131430354e-05, + "loss": 0.9339, + "step": 4030 + }, + { + "epoch": 0.20104738154613466, + "grad_norm": 0.4774167917886214, + "learning_rate": 4.62639278397254e-05, + "loss": 0.9203, + "step": 4031 + }, + { + "epoch": 0.20109725685785537, + "grad_norm": 0.4619428284222109, + "learning_rate": 4.6261803810274577e-05, + "loss": 0.9568, + "step": 4032 + }, + { + "epoch": 0.20114713216957605, + "grad_norm": 0.457896751524354, + "learning_rate": 4.62596792260065e-05, + "loss": 0.9244, + "step": 4033 + }, + { + "epoch": 0.20119700748129676, + "grad_norm": 0.43477377876519363, + "learning_rate": 4.6257554086976606e-05, + "loss": 0.9338, + "step": 4034 + }, + { + "epoch": 0.20124688279301745, + "grad_norm": 0.49018562480549466, + "learning_rate": 4.625542839324036e-05, + "loss": 0.9051, + "step": 4035 + }, + { + "epoch": 0.20129675810473815, + "grad_norm": 0.4319530731354804, + "learning_rate": 4.625330214485322e-05, + "loss": 0.9632, + "step": 4036 + }, + { + "epoch": 0.20134663341645886, + "grad_norm": 0.4488914836354358, + "learning_rate": 4.625117534187066e-05, + "loss": 0.9248, + "step": 4037 + }, + { + "epoch": 0.20139650872817955, + "grad_norm": 0.9412836075459907, + "learning_rate": 4.62490479843482e-05, + "loss": 0.9415, + "step": 4038 + }, + { + "epoch": 0.20144638403990026, + "grad_norm": 0.45085744009615947, + "learning_rate": 4.624692007234134e-05, + "loss": 0.8886, + "step": 4039 + }, + { + "epoch": 0.20149625935162094, + "grad_norm": 0.4455328924457226, + "learning_rate": 4.6244791605905615e-05, + "loss": 0.982, + "step": 4040 + }, + { + "epoch": 0.20154613466334165, + "grad_norm": 0.5197395641454711, + "learning_rate": 4.624266258509655e-05, + "loss": 0.9009, + "step": 4041 + }, + { + "epoch": 0.20159600997506236, + "grad_norm": 0.4742181304860337, + "learning_rate": 4.624053300996972e-05, + "loss": 0.9249, + "step": 4042 + }, + { + "epoch": 0.20164588528678304, + "grad_norm": 0.5217584352842233, + "learning_rate": 4.623840288058069e-05, + "loss": 0.9036, + "step": 4043 + }, + { + "epoch": 0.20169576059850375, + "grad_norm": 0.3947460791158185, + "learning_rate": 4.6236272196985033e-05, + "loss": 0.9063, + "step": 4044 + }, + { + "epoch": 0.20174563591022443, + "grad_norm": 0.4338843329460968, + "learning_rate": 4.6234140959238366e-05, + "loss": 0.9278, + "step": 4045 + }, + { + "epoch": 0.20179551122194514, + "grad_norm": 0.4789970453079851, + "learning_rate": 4.623200916739629e-05, + "loss": 0.9611, + "step": 4046 + }, + { + "epoch": 0.20184538653366585, + "grad_norm": 0.382197486624897, + "learning_rate": 4.622987682151444e-05, + "loss": 0.9052, + "step": 4047 + }, + { + "epoch": 0.20189526184538653, + "grad_norm": 0.4125016201398112, + "learning_rate": 4.622774392164845e-05, + "loss": 0.9038, + "step": 4048 + }, + { + "epoch": 0.20194513715710724, + "grad_norm": 0.46521769363621535, + "learning_rate": 4.622561046785399e-05, + "loss": 0.8865, + "step": 4049 + }, + { + "epoch": 0.20199501246882792, + "grad_norm": 0.49706401365871233, + "learning_rate": 4.6223476460186715e-05, + "loss": 0.9534, + "step": 4050 + }, + { + "epoch": 0.20204488778054863, + "grad_norm": 0.41245567033551755, + "learning_rate": 4.622134189870232e-05, + "loss": 0.9371, + "step": 4051 + }, + { + "epoch": 0.20209476309226931, + "grad_norm": 0.5236886669382022, + "learning_rate": 4.6219206783456514e-05, + "loss": 0.9527, + "step": 4052 + }, + { + "epoch": 0.20214463840399002, + "grad_norm": 0.42498268224655855, + "learning_rate": 4.621707111450499e-05, + "loss": 0.9359, + "step": 4053 + }, + { + "epoch": 0.20219451371571073, + "grad_norm": 0.4828623975736826, + "learning_rate": 4.6214934891903505e-05, + "loss": 0.9772, + "step": 4054 + }, + { + "epoch": 0.20224438902743141, + "grad_norm": 0.6356535204874358, + "learning_rate": 4.621279811570779e-05, + "loss": 0.938, + "step": 4055 + }, + { + "epoch": 0.20229426433915212, + "grad_norm": 0.478490863104379, + "learning_rate": 4.621066078597359e-05, + "loss": 0.92, + "step": 4056 + }, + { + "epoch": 0.2023441396508728, + "grad_norm": 0.549597956755822, + "learning_rate": 4.6208522902756694e-05, + "loss": 0.9531, + "step": 4057 + }, + { + "epoch": 0.20239401496259352, + "grad_norm": 0.5585367837247689, + "learning_rate": 4.620638446611288e-05, + "loss": 0.9716, + "step": 4058 + }, + { + "epoch": 0.20244389027431423, + "grad_norm": 0.4680671124887836, + "learning_rate": 4.6204245476097965e-05, + "loss": 0.9808, + "step": 4059 + }, + { + "epoch": 0.2024937655860349, + "grad_norm": 0.4618981534513314, + "learning_rate": 4.6202105932767755e-05, + "loss": 0.9194, + "step": 4060 + }, + { + "epoch": 0.20254364089775562, + "grad_norm": 0.5369035658174717, + "learning_rate": 4.6199965836178073e-05, + "loss": 0.9435, + "step": 4061 + }, + { + "epoch": 0.2025935162094763, + "grad_norm": 0.4334529853550308, + "learning_rate": 4.619782518638477e-05, + "loss": 0.9151, + "step": 4062 + }, + { + "epoch": 0.202643391521197, + "grad_norm": 0.43498615239728444, + "learning_rate": 4.61956839834437e-05, + "loss": 0.9322, + "step": 4063 + }, + { + "epoch": 0.20269326683291772, + "grad_norm": 0.5331325374096912, + "learning_rate": 4.6193542227410754e-05, + "loss": 0.9386, + "step": 4064 + }, + { + "epoch": 0.2027431421446384, + "grad_norm": 1.3092760037924513, + "learning_rate": 4.61913999183418e-05, + "loss": 0.9178, + "step": 4065 + }, + { + "epoch": 0.2027930174563591, + "grad_norm": 0.5045276788910924, + "learning_rate": 4.618925705629276e-05, + "loss": 0.9274, + "step": 4066 + }, + { + "epoch": 0.2028428927680798, + "grad_norm": 0.4229943199526402, + "learning_rate": 4.618711364131952e-05, + "loss": 0.8896, + "step": 4067 + }, + { + "epoch": 0.2028927680798005, + "grad_norm": 0.39094571636333114, + "learning_rate": 4.618496967347805e-05, + "loss": 0.9048, + "step": 4068 + }, + { + "epoch": 0.2029426433915212, + "grad_norm": 0.5458119682080885, + "learning_rate": 4.618282515282427e-05, + "loss": 0.9578, + "step": 4069 + }, + { + "epoch": 0.2029925187032419, + "grad_norm": 0.45985315176429875, + "learning_rate": 4.618068007941415e-05, + "loss": 0.9464, + "step": 4070 + }, + { + "epoch": 0.2030423940149626, + "grad_norm": 0.40895129407067504, + "learning_rate": 4.6178534453303666e-05, + "loss": 0.9458, + "step": 4071 + }, + { + "epoch": 0.20309226932668328, + "grad_norm": 0.4458912644243382, + "learning_rate": 4.61763882745488e-05, + "loss": 0.9771, + "step": 4072 + }, + { + "epoch": 0.203142144638404, + "grad_norm": 0.44689125725984474, + "learning_rate": 4.617424154320556e-05, + "loss": 0.9207, + "step": 4073 + }, + { + "epoch": 0.20319201995012467, + "grad_norm": 0.4804572420420655, + "learning_rate": 4.617209425932995e-05, + "loss": 0.9268, + "step": 4074 + }, + { + "epoch": 0.20324189526184538, + "grad_norm": 0.4087704598460752, + "learning_rate": 4.616994642297804e-05, + "loss": 0.9437, + "step": 4075 + }, + { + "epoch": 0.2032917705735661, + "grad_norm": 0.5105307116469697, + "learning_rate": 4.6167798034205836e-05, + "loss": 0.9562, + "step": 4076 + }, + { + "epoch": 0.20334164588528678, + "grad_norm": 0.5365719509579994, + "learning_rate": 4.6165649093069424e-05, + "loss": 0.9498, + "step": 4077 + }, + { + "epoch": 0.20339152119700749, + "grad_norm": 0.5633761444244583, + "learning_rate": 4.616349959962487e-05, + "loss": 0.9377, + "step": 4078 + }, + { + "epoch": 0.20344139650872817, + "grad_norm": 0.49139828839132416, + "learning_rate": 4.616134955392826e-05, + "loss": 0.9513, + "step": 4079 + }, + { + "epoch": 0.20349127182044888, + "grad_norm": 0.3954416489833556, + "learning_rate": 4.615919895603571e-05, + "loss": 0.9371, + "step": 4080 + }, + { + "epoch": 0.2035411471321696, + "grad_norm": 0.40368954222053094, + "learning_rate": 4.615704780600333e-05, + "loss": 0.9192, + "step": 4081 + }, + { + "epoch": 0.20359102244389027, + "grad_norm": 0.41296863257078886, + "learning_rate": 4.615489610388726e-05, + "loss": 0.9232, + "step": 4082 + }, + { + "epoch": 0.20364089775561098, + "grad_norm": 0.3953705773547376, + "learning_rate": 4.6152743849743636e-05, + "loss": 0.9249, + "step": 4083 + }, + { + "epoch": 0.20369077306733166, + "grad_norm": 0.4611600385224362, + "learning_rate": 4.6150591043628635e-05, + "loss": 0.9415, + "step": 4084 + }, + { + "epoch": 0.20374064837905237, + "grad_norm": 0.4795807809084308, + "learning_rate": 4.614843768559842e-05, + "loss": 0.9023, + "step": 4085 + }, + { + "epoch": 0.20379052369077308, + "grad_norm": 0.423802457431077, + "learning_rate": 4.614628377570919e-05, + "loss": 0.9499, + "step": 4086 + }, + { + "epoch": 0.20384039900249376, + "grad_norm": 0.41754022601799223, + "learning_rate": 4.6144129314017154e-05, + "loss": 0.8682, + "step": 4087 + }, + { + "epoch": 0.20389027431421447, + "grad_norm": 0.5434850864042584, + "learning_rate": 4.614197430057852e-05, + "loss": 0.9319, + "step": 4088 + }, + { + "epoch": 0.20394014962593515, + "grad_norm": 0.46597189597079064, + "learning_rate": 4.6139818735449534e-05, + "loss": 0.9251, + "step": 4089 + }, + { + "epoch": 0.20399002493765586, + "grad_norm": 0.5352234730703024, + "learning_rate": 4.613766261868644e-05, + "loss": 0.9956, + "step": 4090 + }, + { + "epoch": 0.20403990024937657, + "grad_norm": 0.4322765233199365, + "learning_rate": 4.6135505950345494e-05, + "loss": 0.9325, + "step": 4091 + }, + { + "epoch": 0.20408977556109725, + "grad_norm": 0.7082231164196915, + "learning_rate": 4.613334873048298e-05, + "loss": 0.9538, + "step": 4092 + }, + { + "epoch": 0.20413965087281796, + "grad_norm": 0.46651966546611345, + "learning_rate": 4.613119095915519e-05, + "loss": 0.9435, + "step": 4093 + }, + { + "epoch": 0.20418952618453864, + "grad_norm": 0.3977685363998353, + "learning_rate": 4.612903263641843e-05, + "loss": 0.9419, + "step": 4094 + }, + { + "epoch": 0.20423940149625935, + "grad_norm": 0.651363675970193, + "learning_rate": 4.612687376232902e-05, + "loss": 0.9688, + "step": 4095 + }, + { + "epoch": 0.20428927680798006, + "grad_norm": 0.5330801377691229, + "learning_rate": 4.61247143369433e-05, + "loss": 0.9233, + "step": 4096 + }, + { + "epoch": 0.20433915211970075, + "grad_norm": 0.5259683666072922, + "learning_rate": 4.61225543603176e-05, + "loss": 0.976, + "step": 4097 + }, + { + "epoch": 0.20438902743142146, + "grad_norm": 0.43920279942013457, + "learning_rate": 4.6120393832508305e-05, + "loss": 0.9562, + "step": 4098 + }, + { + "epoch": 0.20443890274314214, + "grad_norm": 0.40686225362351186, + "learning_rate": 4.611823275357179e-05, + "loss": 0.901, + "step": 4099 + }, + { + "epoch": 0.20448877805486285, + "grad_norm": 0.5187579082063597, + "learning_rate": 4.611607112356443e-05, + "loss": 0.9172, + "step": 4100 + }, + { + "epoch": 0.20453865336658353, + "grad_norm": 0.42807171437708114, + "learning_rate": 4.6113908942542655e-05, + "loss": 0.9354, + "step": 4101 + }, + { + "epoch": 0.20458852867830424, + "grad_norm": 0.43859115878796695, + "learning_rate": 4.611174621056287e-05, + "loss": 0.8776, + "step": 4102 + }, + { + "epoch": 0.20463840399002495, + "grad_norm": 0.5354430322851089, + "learning_rate": 4.6109582927681516e-05, + "loss": 0.9886, + "step": 4103 + }, + { + "epoch": 0.20468827930174563, + "grad_norm": 0.3918964998978008, + "learning_rate": 4.610741909395505e-05, + "loss": 0.9574, + "step": 4104 + }, + { + "epoch": 0.20473815461346634, + "grad_norm": 0.4300737370732144, + "learning_rate": 4.610525470943992e-05, + "loss": 0.9122, + "step": 4105 + }, + { + "epoch": 0.20478802992518702, + "grad_norm": 0.6605963455193372, + "learning_rate": 4.610308977419262e-05, + "loss": 0.9809, + "step": 4106 + }, + { + "epoch": 0.20483790523690773, + "grad_norm": 0.42717435646094665, + "learning_rate": 4.610092428826963e-05, + "loss": 1.0213, + "step": 4107 + }, + { + "epoch": 0.20488778054862844, + "grad_norm": 0.48588196225365166, + "learning_rate": 4.609875825172747e-05, + "loss": 0.9058, + "step": 4108 + }, + { + "epoch": 0.20493765586034912, + "grad_norm": 0.4385645820992903, + "learning_rate": 4.609659166462266e-05, + "loss": 0.9675, + "step": 4109 + }, + { + "epoch": 0.20498753117206983, + "grad_norm": 0.4816041944318111, + "learning_rate": 4.6094424527011726e-05, + "loss": 0.9657, + "step": 4110 + }, + { + "epoch": 0.2050374064837905, + "grad_norm": 0.42369463718129663, + "learning_rate": 4.609225683895123e-05, + "loss": 0.9601, + "step": 4111 + }, + { + "epoch": 0.20508728179551122, + "grad_norm": 0.5209643842702132, + "learning_rate": 4.609008860049772e-05, + "loss": 0.9602, + "step": 4112 + }, + { + "epoch": 0.20513715710723193, + "grad_norm": 0.5232434220808254, + "learning_rate": 4.60879198117078e-05, + "loss": 0.9568, + "step": 4113 + }, + { + "epoch": 0.20518703241895261, + "grad_norm": 0.4495641594295419, + "learning_rate": 4.608575047263804e-05, + "loss": 0.9709, + "step": 4114 + }, + { + "epoch": 0.20523690773067332, + "grad_norm": 0.4396338197415318, + "learning_rate": 4.6083580583345064e-05, + "loss": 0.9528, + "step": 4115 + }, + { + "epoch": 0.205286783042394, + "grad_norm": 0.44619516282449384, + "learning_rate": 4.608141014388548e-05, + "loss": 0.9194, + "step": 4116 + }, + { + "epoch": 0.20533665835411472, + "grad_norm": 0.8037816400788135, + "learning_rate": 4.607923915431594e-05, + "loss": 0.9467, + "step": 4117 + }, + { + "epoch": 0.20538653366583542, + "grad_norm": 0.4512371602419317, + "learning_rate": 4.607706761469309e-05, + "loss": 0.926, + "step": 4118 + }, + { + "epoch": 0.2054364089775561, + "grad_norm": 0.41732286545281755, + "learning_rate": 4.607489552507359e-05, + "loss": 0.9275, + "step": 4119 + }, + { + "epoch": 0.20548628428927682, + "grad_norm": 0.4647031090719133, + "learning_rate": 4.607272288551413e-05, + "loss": 0.9651, + "step": 4120 + }, + { + "epoch": 0.2055361596009975, + "grad_norm": 0.3773039831758, + "learning_rate": 4.6070549696071394e-05, + "loss": 0.91, + "step": 4121 + }, + { + "epoch": 0.2055860349127182, + "grad_norm": 0.6400039256196715, + "learning_rate": 4.606837595680209e-05, + "loss": 0.9804, + "step": 4122 + }, + { + "epoch": 0.2056359102244389, + "grad_norm": 0.5535903743629078, + "learning_rate": 4.606620166776294e-05, + "loss": 0.9574, + "step": 4123 + }, + { + "epoch": 0.2056857855361596, + "grad_norm": 0.41733313317269377, + "learning_rate": 4.60640268290107e-05, + "loss": 0.9154, + "step": 4124 + }, + { + "epoch": 0.2057356608478803, + "grad_norm": 0.41379045611717113, + "learning_rate": 4.60618514406021e-05, + "loss": 0.9022, + "step": 4125 + }, + { + "epoch": 0.205785536159601, + "grad_norm": 0.3929106737754247, + "learning_rate": 4.6059675502593905e-05, + "loss": 0.9042, + "step": 4126 + }, + { + "epoch": 0.2058354114713217, + "grad_norm": 0.43403674735720876, + "learning_rate": 4.605749901504291e-05, + "loss": 0.944, + "step": 4127 + }, + { + "epoch": 0.20588528678304238, + "grad_norm": 0.4280699939229216, + "learning_rate": 4.60553219780059e-05, + "loss": 0.9256, + "step": 4128 + }, + { + "epoch": 0.2059351620947631, + "grad_norm": 0.5497424526092061, + "learning_rate": 4.605314439153969e-05, + "loss": 0.9481, + "step": 4129 + }, + { + "epoch": 0.2059850374064838, + "grad_norm": 0.4151555630971527, + "learning_rate": 4.605096625570109e-05, + "loss": 0.9615, + "step": 4130 + }, + { + "epoch": 0.20603491271820448, + "grad_norm": 0.4571763229764869, + "learning_rate": 4.6048787570546956e-05, + "loss": 0.9403, + "step": 4131 + }, + { + "epoch": 0.2060847880299252, + "grad_norm": 0.48336190921067157, + "learning_rate": 4.604660833613413e-05, + "loss": 0.9713, + "step": 4132 + }, + { + "epoch": 0.20613466334164587, + "grad_norm": 0.6515269348548528, + "learning_rate": 4.6044428552519475e-05, + "loss": 0.9545, + "step": 4133 + }, + { + "epoch": 0.20618453865336658, + "grad_norm": 0.44330030763640565, + "learning_rate": 4.6042248219759875e-05, + "loss": 0.9709, + "step": 4134 + }, + { + "epoch": 0.2062344139650873, + "grad_norm": 0.49228123852523914, + "learning_rate": 4.604006733791223e-05, + "loss": 0.9896, + "step": 4135 + }, + { + "epoch": 0.20628428927680797, + "grad_norm": 0.45979591513985657, + "learning_rate": 4.603788590703344e-05, + "loss": 0.9343, + "step": 4136 + }, + { + "epoch": 0.20633416458852868, + "grad_norm": 0.453899835814691, + "learning_rate": 4.603570392718043e-05, + "loss": 0.9039, + "step": 4137 + }, + { + "epoch": 0.20638403990024937, + "grad_norm": 0.42533673630787633, + "learning_rate": 4.603352139841014e-05, + "loss": 0.9366, + "step": 4138 + }, + { + "epoch": 0.20643391521197008, + "grad_norm": 0.5319634736000657, + "learning_rate": 4.6031338320779534e-05, + "loss": 0.9496, + "step": 4139 + }, + { + "epoch": 0.20648379052369079, + "grad_norm": 0.4317167369202225, + "learning_rate": 4.602915469434555e-05, + "loss": 0.9008, + "step": 4140 + }, + { + "epoch": 0.20653366583541147, + "grad_norm": 0.4269549107696156, + "learning_rate": 4.6026970519165206e-05, + "loss": 0.8969, + "step": 4141 + }, + { + "epoch": 0.20658354114713218, + "grad_norm": 0.47476715808950287, + "learning_rate": 4.602478579529546e-05, + "loss": 0.9545, + "step": 4142 + }, + { + "epoch": 0.20663341645885286, + "grad_norm": 0.5178665612493912, + "learning_rate": 4.6022600522793345e-05, + "loss": 0.9449, + "step": 4143 + }, + { + "epoch": 0.20668329177057357, + "grad_norm": 0.4482668675589407, + "learning_rate": 4.6020414701715876e-05, + "loss": 0.9052, + "step": 4144 + }, + { + "epoch": 0.20673316708229428, + "grad_norm": 0.4385510549412322, + "learning_rate": 4.6018228332120097e-05, + "loss": 0.9378, + "step": 4145 + }, + { + "epoch": 0.20678304239401496, + "grad_norm": 0.5741448146569462, + "learning_rate": 4.601604141406305e-05, + "loss": 0.9186, + "step": 4146 + }, + { + "epoch": 0.20683291770573567, + "grad_norm": 0.41699642817426524, + "learning_rate": 4.601385394760181e-05, + "loss": 0.9116, + "step": 4147 + }, + { + "epoch": 0.20688279301745635, + "grad_norm": 0.5758903495242446, + "learning_rate": 4.601166593279346e-05, + "loss": 0.954, + "step": 4148 + }, + { + "epoch": 0.20693266832917706, + "grad_norm": 0.6701872771700148, + "learning_rate": 4.600947736969508e-05, + "loss": 0.9742, + "step": 4149 + }, + { + "epoch": 0.20698254364089774, + "grad_norm": 0.5127243642893363, + "learning_rate": 4.60072882583638e-05, + "loss": 0.9506, + "step": 4150 + }, + { + "epoch": 0.20703241895261845, + "grad_norm": 0.40552833978535013, + "learning_rate": 4.600509859885673e-05, + "loss": 0.9134, + "step": 4151 + }, + { + "epoch": 0.20708229426433916, + "grad_norm": 0.4042682158568646, + "learning_rate": 4.6002908391231015e-05, + "loss": 0.9147, + "step": 4152 + }, + { + "epoch": 0.20713216957605984, + "grad_norm": 0.4825436387990607, + "learning_rate": 4.600071763554381e-05, + "loss": 0.9065, + "step": 4153 + }, + { + "epoch": 0.20718204488778055, + "grad_norm": 0.3912808532788235, + "learning_rate": 4.599852633185227e-05, + "loss": 0.9314, + "step": 4154 + }, + { + "epoch": 0.20723192019950123, + "grad_norm": 0.5131462965275667, + "learning_rate": 4.5996334480213585e-05, + "loss": 0.9562, + "step": 4155 + }, + { + "epoch": 0.20728179551122194, + "grad_norm": 0.439602048132326, + "learning_rate": 4.599414208068495e-05, + "loss": 0.9614, + "step": 4156 + }, + { + "epoch": 0.20733167082294265, + "grad_norm": 0.47683703221274465, + "learning_rate": 4.599194913332357e-05, + "loss": 0.9437, + "step": 4157 + }, + { + "epoch": 0.20738154613466334, + "grad_norm": 0.7578123555258754, + "learning_rate": 4.5989755638186676e-05, + "loss": 0.8977, + "step": 4158 + }, + { + "epoch": 0.20743142144638405, + "grad_norm": 0.4547975078796237, + "learning_rate": 4.59875615953315e-05, + "loss": 0.9541, + "step": 4159 + }, + { + "epoch": 0.20748129675810473, + "grad_norm": 0.42034739698230483, + "learning_rate": 4.59853670048153e-05, + "loss": 0.9181, + "step": 4160 + }, + { + "epoch": 0.20753117206982544, + "grad_norm": 0.41812431008544093, + "learning_rate": 4.598317186669534e-05, + "loss": 0.9072, + "step": 4161 + }, + { + "epoch": 0.20758104738154615, + "grad_norm": 0.4222318532923496, + "learning_rate": 4.59809761810289e-05, + "loss": 0.9844, + "step": 4162 + }, + { + "epoch": 0.20763092269326683, + "grad_norm": 0.4367706559121619, + "learning_rate": 4.597877994787327e-05, + "loss": 0.9687, + "step": 4163 + }, + { + "epoch": 0.20768079800498754, + "grad_norm": 0.5339014287659166, + "learning_rate": 4.597658316728577e-05, + "loss": 0.9821, + "step": 4164 + }, + { + "epoch": 0.20773067331670822, + "grad_norm": 0.4312706430315768, + "learning_rate": 4.597438583932372e-05, + "loss": 0.9252, + "step": 4165 + }, + { + "epoch": 0.20778054862842893, + "grad_norm": 0.5954061270999917, + "learning_rate": 4.597218796404446e-05, + "loss": 0.947, + "step": 4166 + }, + { + "epoch": 0.20783042394014964, + "grad_norm": 0.44719255144355724, + "learning_rate": 4.596998954150534e-05, + "loss": 0.9401, + "step": 4167 + }, + { + "epoch": 0.20788029925187032, + "grad_norm": 0.4948629286654582, + "learning_rate": 4.596779057176373e-05, + "loss": 0.9486, + "step": 4168 + }, + { + "epoch": 0.20793017456359103, + "grad_norm": 0.44072940356056317, + "learning_rate": 4.596559105487701e-05, + "loss": 0.9427, + "step": 4169 + }, + { + "epoch": 0.2079800498753117, + "grad_norm": 0.4656444072762086, + "learning_rate": 4.5963390990902564e-05, + "loss": 0.9575, + "step": 4170 + }, + { + "epoch": 0.20802992518703242, + "grad_norm": 0.4925822351140443, + "learning_rate": 4.596119037989782e-05, + "loss": 0.9282, + "step": 4171 + }, + { + "epoch": 0.2080798004987531, + "grad_norm": 0.4400598413696448, + "learning_rate": 4.59589892219202e-05, + "loss": 0.9108, + "step": 4172 + }, + { + "epoch": 0.2081296758104738, + "grad_norm": 0.4107643770770961, + "learning_rate": 4.595678751702713e-05, + "loss": 0.9269, + "step": 4173 + }, + { + "epoch": 0.20817955112219452, + "grad_norm": 0.41829878680897975, + "learning_rate": 4.595458526527607e-05, + "loss": 0.9319, + "step": 4174 + }, + { + "epoch": 0.2082294264339152, + "grad_norm": 0.44377416501330225, + "learning_rate": 4.595238246672448e-05, + "loss": 0.8962, + "step": 4175 + }, + { + "epoch": 0.20827930174563591, + "grad_norm": 1.0613811102214255, + "learning_rate": 4.5950179121429846e-05, + "loss": 0.972, + "step": 4176 + }, + { + "epoch": 0.2083291770573566, + "grad_norm": 0.4934171604862857, + "learning_rate": 4.594797522944967e-05, + "loss": 0.8655, + "step": 4177 + }, + { + "epoch": 0.2083790523690773, + "grad_norm": 0.6702387303427104, + "learning_rate": 4.594577079084146e-05, + "loss": 0.9352, + "step": 4178 + }, + { + "epoch": 0.20842892768079802, + "grad_norm": 0.4494185146437955, + "learning_rate": 4.594356580566273e-05, + "loss": 0.9198, + "step": 4179 + }, + { + "epoch": 0.2084788029925187, + "grad_norm": 0.4970091358159442, + "learning_rate": 4.594136027397102e-05, + "loss": 0.8709, + "step": 4180 + }, + { + "epoch": 0.2085286783042394, + "grad_norm": 0.4750592230722175, + "learning_rate": 4.59391541958239e-05, + "loss": 0.951, + "step": 4181 + }, + { + "epoch": 0.2085785536159601, + "grad_norm": 0.4235356756908157, + "learning_rate": 4.5936947571278904e-05, + "loss": 0.906, + "step": 4182 + }, + { + "epoch": 0.2086284289276808, + "grad_norm": 0.5166897512269021, + "learning_rate": 4.593474040039364e-05, + "loss": 0.9788, + "step": 4183 + }, + { + "epoch": 0.2086783042394015, + "grad_norm": 0.46229237076933893, + "learning_rate": 4.5932532683225695e-05, + "loss": 0.9534, + "step": 4184 + }, + { + "epoch": 0.2087281795511222, + "grad_norm": 0.48234526245644843, + "learning_rate": 4.593032441983267e-05, + "loss": 0.9328, + "step": 4185 + }, + { + "epoch": 0.2087780548628429, + "grad_norm": 0.5162975974360777, + "learning_rate": 4.59281156102722e-05, + "loss": 0.9312, + "step": 4186 + }, + { + "epoch": 0.20882793017456358, + "grad_norm": 0.4455244681510487, + "learning_rate": 4.5925906254601926e-05, + "loss": 0.9635, + "step": 4187 + }, + { + "epoch": 0.2088778054862843, + "grad_norm": 0.4262884730932651, + "learning_rate": 4.592369635287949e-05, + "loss": 0.9397, + "step": 4188 + }, + { + "epoch": 0.208927680798005, + "grad_norm": 0.6871084369688052, + "learning_rate": 4.592148590516256e-05, + "loss": 0.9426, + "step": 4189 + }, + { + "epoch": 0.20897755610972568, + "grad_norm": 0.5718464650715865, + "learning_rate": 4.591927491150882e-05, + "loss": 0.9703, + "step": 4190 + }, + { + "epoch": 0.2090274314214464, + "grad_norm": 0.4989315105978188, + "learning_rate": 4.591706337197597e-05, + "loss": 0.8994, + "step": 4191 + }, + { + "epoch": 0.20907730673316707, + "grad_norm": 0.49441890042149106, + "learning_rate": 4.5914851286621695e-05, + "loss": 0.9468, + "step": 4192 + }, + { + "epoch": 0.20912718204488778, + "grad_norm": 0.4599454432705738, + "learning_rate": 4.591263865550375e-05, + "loss": 0.9559, + "step": 4193 + }, + { + "epoch": 0.2091770573566085, + "grad_norm": 0.68646067214234, + "learning_rate": 4.5910425478679855e-05, + "loss": 0.8885, + "step": 4194 + }, + { + "epoch": 0.20922693266832917, + "grad_norm": 0.4911726483704634, + "learning_rate": 4.5908211756207765e-05, + "loss": 0.9467, + "step": 4195 + }, + { + "epoch": 0.20927680798004988, + "grad_norm": 0.5080757206641685, + "learning_rate": 4.5905997488145246e-05, + "loss": 0.9292, + "step": 4196 + }, + { + "epoch": 0.20932668329177057, + "grad_norm": 0.4059841904376933, + "learning_rate": 4.590378267455008e-05, + "loss": 0.973, + "step": 4197 + }, + { + "epoch": 0.20937655860349128, + "grad_norm": 0.5051933484261587, + "learning_rate": 4.590156731548007e-05, + "loss": 0.996, + "step": 4198 + }, + { + "epoch": 0.20942643391521196, + "grad_norm": 0.5138590323473342, + "learning_rate": 4.5899351410993e-05, + "loss": 0.9174, + "step": 4199 + }, + { + "epoch": 0.20947630922693267, + "grad_norm": 0.4809187438910795, + "learning_rate": 4.589713496114671e-05, + "loss": 0.9174, + "step": 4200 + }, + { + "epoch": 0.20952618453865338, + "grad_norm": 0.4054450445448433, + "learning_rate": 4.589491796599904e-05, + "loss": 0.9912, + "step": 4201 + }, + { + "epoch": 0.20957605985037406, + "grad_norm": 0.40257519224299604, + "learning_rate": 4.589270042560784e-05, + "loss": 0.905, + "step": 4202 + }, + { + "epoch": 0.20962593516209477, + "grad_norm": 0.4529223909711398, + "learning_rate": 4.589048234003096e-05, + "loss": 0.9636, + "step": 4203 + }, + { + "epoch": 0.20967581047381545, + "grad_norm": 0.44988302340120134, + "learning_rate": 4.5888263709326306e-05, + "loss": 0.9251, + "step": 4204 + }, + { + "epoch": 0.20972568578553616, + "grad_norm": 0.43557172332903255, + "learning_rate": 4.5886044533551753e-05, + "loss": 0.9327, + "step": 4205 + }, + { + "epoch": 0.20977556109725687, + "grad_norm": 0.63888435612175, + "learning_rate": 4.588382481276522e-05, + "loss": 0.9508, + "step": 4206 + }, + { + "epoch": 0.20982543640897755, + "grad_norm": 0.38844851527571644, + "learning_rate": 4.588160454702462e-05, + "loss": 0.9172, + "step": 4207 + }, + { + "epoch": 0.20987531172069826, + "grad_norm": 0.40545288467342927, + "learning_rate": 4.587938373638789e-05, + "loss": 0.9212, + "step": 4208 + }, + { + "epoch": 0.20992518703241894, + "grad_norm": 0.6161863453126579, + "learning_rate": 4.5877162380913e-05, + "loss": 0.9283, + "step": 4209 + }, + { + "epoch": 0.20997506234413965, + "grad_norm": 0.5733051071494742, + "learning_rate": 4.587494048065789e-05, + "loss": 0.9777, + "step": 4210 + }, + { + "epoch": 0.21002493765586036, + "grad_norm": 0.5378131881109073, + "learning_rate": 4.587271803568056e-05, + "loss": 0.8939, + "step": 4211 + }, + { + "epoch": 0.21007481296758104, + "grad_norm": 0.39062307128375306, + "learning_rate": 4.5870495046038985e-05, + "loss": 0.9213, + "step": 4212 + }, + { + "epoch": 0.21012468827930175, + "grad_norm": 0.3979411658128, + "learning_rate": 4.586827151179118e-05, + "loss": 0.9639, + "step": 4213 + }, + { + "epoch": 0.21017456359102243, + "grad_norm": 0.43562372137104494, + "learning_rate": 4.586604743299518e-05, + "loss": 0.9403, + "step": 4214 + }, + { + "epoch": 0.21022443890274314, + "grad_norm": 0.5390982236430212, + "learning_rate": 4.586382280970901e-05, + "loss": 0.9033, + "step": 4215 + }, + { + "epoch": 0.21027431421446385, + "grad_norm": 0.5357754031610164, + "learning_rate": 4.586159764199071e-05, + "loss": 0.9177, + "step": 4216 + }, + { + "epoch": 0.21032418952618454, + "grad_norm": 0.4236495432066593, + "learning_rate": 4.585937192989837e-05, + "loss": 0.9, + "step": 4217 + }, + { + "epoch": 0.21037406483790524, + "grad_norm": 0.44070971911432055, + "learning_rate": 4.585714567349004e-05, + "loss": 0.9467, + "step": 4218 + }, + { + "epoch": 0.21042394014962593, + "grad_norm": 0.49979980170665406, + "learning_rate": 4.5854918872823836e-05, + "loss": 0.9241, + "step": 4219 + }, + { + "epoch": 0.21047381546134664, + "grad_norm": 0.37094272203833567, + "learning_rate": 4.5852691527957856e-05, + "loss": 0.9023, + "step": 4220 + }, + { + "epoch": 0.21052369077306732, + "grad_norm": 0.4243019817169038, + "learning_rate": 4.5850463638950227e-05, + "loss": 0.9659, + "step": 4221 + }, + { + "epoch": 0.21057356608478803, + "grad_norm": 0.39792802143068756, + "learning_rate": 4.584823520585908e-05, + "loss": 0.9162, + "step": 4222 + }, + { + "epoch": 0.21062344139650874, + "grad_norm": 0.43371559610133176, + "learning_rate": 4.584600622874255e-05, + "loss": 0.956, + "step": 4223 + }, + { + "epoch": 0.21067331670822942, + "grad_norm": 0.39801431934743436, + "learning_rate": 4.584377670765884e-05, + "loss": 0.9465, + "step": 4224 + }, + { + "epoch": 0.21072319201995013, + "grad_norm": 0.5270673583213954, + "learning_rate": 4.584154664266609e-05, + "loss": 0.9926, + "step": 4225 + }, + { + "epoch": 0.2107730673316708, + "grad_norm": 0.42623858214997107, + "learning_rate": 4.5839316033822524e-05, + "loss": 0.9391, + "step": 4226 + }, + { + "epoch": 0.21082294264339152, + "grad_norm": 0.5891644585734774, + "learning_rate": 4.5837084881186324e-05, + "loss": 0.9088, + "step": 4227 + }, + { + "epoch": 0.21087281795511223, + "grad_norm": 0.3988283408868504, + "learning_rate": 4.583485318481571e-05, + "loss": 0.9379, + "step": 4228 + }, + { + "epoch": 0.2109226932668329, + "grad_norm": 0.4791534968122386, + "learning_rate": 4.5832620944768935e-05, + "loss": 0.9464, + "step": 4229 + }, + { + "epoch": 0.21097256857855362, + "grad_norm": 0.4077617192507459, + "learning_rate": 4.5830388161104245e-05, + "loss": 0.9303, + "step": 4230 + }, + { + "epoch": 0.2110224438902743, + "grad_norm": 0.512718831332777, + "learning_rate": 4.582815483387989e-05, + "loss": 0.8941, + "step": 4231 + }, + { + "epoch": 0.211072319201995, + "grad_norm": 0.46581591053309623, + "learning_rate": 4.582592096315416e-05, + "loss": 0.9379, + "step": 4232 + }, + { + "epoch": 0.21112219451371572, + "grad_norm": 0.42404006760044777, + "learning_rate": 4.582368654898533e-05, + "loss": 0.9086, + "step": 4233 + }, + { + "epoch": 0.2111720698254364, + "grad_norm": 0.9872540567596386, + "learning_rate": 4.5821451591431744e-05, + "loss": 0.9388, + "step": 4234 + }, + { + "epoch": 0.2112219451371571, + "grad_norm": 0.45084575497592677, + "learning_rate": 4.581921609055168e-05, + "loss": 0.9321, + "step": 4235 + }, + { + "epoch": 0.2112718204488778, + "grad_norm": 0.4657353194844088, + "learning_rate": 4.581698004640349e-05, + "loss": 0.9818, + "step": 4236 + }, + { + "epoch": 0.2113216957605985, + "grad_norm": 0.6438038101021043, + "learning_rate": 4.5814743459045526e-05, + "loss": 0.9403, + "step": 4237 + }, + { + "epoch": 0.21137157107231921, + "grad_norm": 0.5187661292545448, + "learning_rate": 4.581250632853615e-05, + "loss": 0.9458, + "step": 4238 + }, + { + "epoch": 0.2114214463840399, + "grad_norm": 0.40952737032900616, + "learning_rate": 4.581026865493373e-05, + "loss": 0.9301, + "step": 4239 + }, + { + "epoch": 0.2114713216957606, + "grad_norm": 0.516842376395114, + "learning_rate": 4.5808030438296664e-05, + "loss": 0.9632, + "step": 4240 + }, + { + "epoch": 0.2115211970074813, + "grad_norm": 0.43798435542066083, + "learning_rate": 4.5805791678683356e-05, + "loss": 0.8963, + "step": 4241 + }, + { + "epoch": 0.211571072319202, + "grad_norm": 0.43703627983871235, + "learning_rate": 4.5803552376152226e-05, + "loss": 0.9414, + "step": 4242 + }, + { + "epoch": 0.2116209476309227, + "grad_norm": 0.518521505162066, + "learning_rate": 4.580131253076171e-05, + "loss": 0.9283, + "step": 4243 + }, + { + "epoch": 0.2116708229426434, + "grad_norm": 0.42935686032293857, + "learning_rate": 4.579907214257025e-05, + "loss": 0.9157, + "step": 4244 + }, + { + "epoch": 0.2117206982543641, + "grad_norm": 0.46157414810855807, + "learning_rate": 4.579683121163632e-05, + "loss": 0.9123, + "step": 4245 + }, + { + "epoch": 0.21177057356608478, + "grad_norm": 0.48420119267514244, + "learning_rate": 4.579458973801837e-05, + "loss": 0.9271, + "step": 4246 + }, + { + "epoch": 0.2118204488778055, + "grad_norm": 0.45188000011069795, + "learning_rate": 4.579234772177492e-05, + "loss": 0.9201, + "step": 4247 + }, + { + "epoch": 0.21187032418952617, + "grad_norm": 0.4399175331859613, + "learning_rate": 4.579010516296445e-05, + "loss": 0.9516, + "step": 4248 + }, + { + "epoch": 0.21192019950124688, + "grad_norm": 0.40178274849970447, + "learning_rate": 4.5787862061645505e-05, + "loss": 0.8969, + "step": 4249 + }, + { + "epoch": 0.2119700748129676, + "grad_norm": 0.49295518632915725, + "learning_rate": 4.5785618417876596e-05, + "loss": 0.9571, + "step": 4250 + }, + { + "epoch": 0.21201995012468827, + "grad_norm": 0.48821774317739736, + "learning_rate": 4.578337423171627e-05, + "loss": 0.9444, + "step": 4251 + }, + { + "epoch": 0.21206982543640898, + "grad_norm": 0.41093553791152554, + "learning_rate": 4.578112950322311e-05, + "loss": 0.9508, + "step": 4252 + }, + { + "epoch": 0.21211970074812966, + "grad_norm": 0.41170949664010537, + "learning_rate": 4.577888423245566e-05, + "loss": 0.9584, + "step": 4253 + }, + { + "epoch": 0.21216957605985037, + "grad_norm": 0.4081088456469548, + "learning_rate": 4.577663841947254e-05, + "loss": 0.8969, + "step": 4254 + }, + { + "epoch": 0.21221945137157108, + "grad_norm": 0.46321679294066925, + "learning_rate": 4.577439206433233e-05, + "loss": 0.9097, + "step": 4255 + }, + { + "epoch": 0.21226932668329176, + "grad_norm": 0.48510413211384, + "learning_rate": 4.577214516709365e-05, + "loss": 0.9498, + "step": 4256 + }, + { + "epoch": 0.21231920199501247, + "grad_norm": 0.5506566332929085, + "learning_rate": 4.5769897727815156e-05, + "loss": 0.911, + "step": 4257 + }, + { + "epoch": 0.21236907730673316, + "grad_norm": 0.46998437908473273, + "learning_rate": 4.5767649746555466e-05, + "loss": 0.9194, + "step": 4258 + }, + { + "epoch": 0.21241895261845387, + "grad_norm": 0.45117237002341964, + "learning_rate": 4.576540122337325e-05, + "loss": 0.8803, + "step": 4259 + }, + { + "epoch": 0.21246882793017458, + "grad_norm": 0.8050709243406461, + "learning_rate": 4.576315215832718e-05, + "loss": 0.9217, + "step": 4260 + }, + { + "epoch": 0.21251870324189526, + "grad_norm": 0.41425693362413146, + "learning_rate": 4.576090255147595e-05, + "loss": 0.9493, + "step": 4261 + }, + { + "epoch": 0.21256857855361597, + "grad_norm": 0.46515401258369776, + "learning_rate": 4.575865240287826e-05, + "loss": 0.8972, + "step": 4262 + }, + { + "epoch": 0.21261845386533665, + "grad_norm": 0.44878928401345, + "learning_rate": 4.5756401712592824e-05, + "loss": 0.9748, + "step": 4263 + }, + { + "epoch": 0.21266832917705736, + "grad_norm": 0.44638826100836104, + "learning_rate": 4.575415048067838e-05, + "loss": 0.9352, + "step": 4264 + }, + { + "epoch": 0.21271820448877807, + "grad_norm": 0.4420658783613262, + "learning_rate": 4.575189870719366e-05, + "loss": 0.9088, + "step": 4265 + }, + { + "epoch": 0.21276807980049875, + "grad_norm": 0.4653098235351562, + "learning_rate": 4.574964639219743e-05, + "loss": 0.9361, + "step": 4266 + }, + { + "epoch": 0.21281795511221946, + "grad_norm": 0.42828014676630577, + "learning_rate": 4.574739353574847e-05, + "loss": 0.9308, + "step": 4267 + }, + { + "epoch": 0.21286783042394014, + "grad_norm": 0.40213286297007705, + "learning_rate": 4.574514013790556e-05, + "loss": 0.8693, + "step": 4268 + }, + { + "epoch": 0.21291770573566085, + "grad_norm": 0.5288816652960647, + "learning_rate": 4.57428861987275e-05, + "loss": 0.9408, + "step": 4269 + }, + { + "epoch": 0.21296758104738153, + "grad_norm": 0.5450278898625277, + "learning_rate": 4.574063171827311e-05, + "loss": 0.946, + "step": 4270 + }, + { + "epoch": 0.21301745635910224, + "grad_norm": 0.46369360988521696, + "learning_rate": 4.5738376696601214e-05, + "loss": 0.9124, + "step": 4271 + }, + { + "epoch": 0.21306733167082295, + "grad_norm": 0.4808534557184007, + "learning_rate": 4.573612113377067e-05, + "loss": 0.9251, + "step": 4272 + }, + { + "epoch": 0.21311720698254363, + "grad_norm": 0.6701707692520935, + "learning_rate": 4.573386502984032e-05, + "loss": 0.8976, + "step": 4273 + }, + { + "epoch": 0.21316708229426434, + "grad_norm": 0.43683741881735644, + "learning_rate": 4.573160838486903e-05, + "loss": 0.9077, + "step": 4274 + }, + { + "epoch": 0.21321695760598502, + "grad_norm": 0.42071306858199625, + "learning_rate": 4.572935119891571e-05, + "loss": 0.9277, + "step": 4275 + }, + { + "epoch": 0.21326683291770573, + "grad_norm": 0.45488402735369576, + "learning_rate": 4.572709347203924e-05, + "loss": 0.9291, + "step": 4276 + }, + { + "epoch": 0.21331670822942644, + "grad_norm": 0.5330238481605291, + "learning_rate": 4.572483520429855e-05, + "loss": 0.9014, + "step": 4277 + }, + { + "epoch": 0.21336658354114713, + "grad_norm": 0.4573485535507722, + "learning_rate": 4.572257639575256e-05, + "loss": 0.9357, + "step": 4278 + }, + { + "epoch": 0.21341645885286784, + "grad_norm": 0.5527991832503123, + "learning_rate": 4.57203170464602e-05, + "loss": 0.9797, + "step": 4279 + }, + { + "epoch": 0.21346633416458852, + "grad_norm": 0.4286146022349695, + "learning_rate": 4.5718057156480454e-05, + "loss": 0.9183, + "step": 4280 + }, + { + "epoch": 0.21351620947630923, + "grad_norm": 0.5262464461761891, + "learning_rate": 4.571579672587227e-05, + "loss": 0.9611, + "step": 4281 + }, + { + "epoch": 0.21356608478802994, + "grad_norm": 0.4031602458666826, + "learning_rate": 4.5713535754694644e-05, + "loss": 0.9381, + "step": 4282 + }, + { + "epoch": 0.21361596009975062, + "grad_norm": 0.45642833637663816, + "learning_rate": 4.571127424300658e-05, + "loss": 0.9159, + "step": 4283 + }, + { + "epoch": 0.21366583541147133, + "grad_norm": 0.4491278113565909, + "learning_rate": 4.5709012190867076e-05, + "loss": 0.9605, + "step": 4284 + }, + { + "epoch": 0.213715710723192, + "grad_norm": 0.44280317406020225, + "learning_rate": 4.570674959833517e-05, + "loss": 0.9378, + "step": 4285 + }, + { + "epoch": 0.21376558603491272, + "grad_norm": 0.4438162676105786, + "learning_rate": 4.5704486465469905e-05, + "loss": 0.9018, + "step": 4286 + }, + { + "epoch": 0.21381546134663343, + "grad_norm": 0.46918686793896913, + "learning_rate": 4.570222279233033e-05, + "loss": 0.9632, + "step": 4287 + }, + { + "epoch": 0.2138653366583541, + "grad_norm": 0.44526477681833665, + "learning_rate": 4.569995857897551e-05, + "loss": 0.931, + "step": 4288 + }, + { + "epoch": 0.21391521197007482, + "grad_norm": 0.41847109403838717, + "learning_rate": 4.5697693825464536e-05, + "loss": 0.9711, + "step": 4289 + }, + { + "epoch": 0.2139650872817955, + "grad_norm": 0.5557985239750575, + "learning_rate": 4.569542853185651e-05, + "loss": 0.8918, + "step": 4290 + }, + { + "epoch": 0.2140149625935162, + "grad_norm": 0.4455500554834785, + "learning_rate": 4.569316269821053e-05, + "loss": 0.8837, + "step": 4291 + }, + { + "epoch": 0.21406483790523692, + "grad_norm": 0.502321079998615, + "learning_rate": 4.569089632458574e-05, + "loss": 0.9282, + "step": 4292 + }, + { + "epoch": 0.2141147132169576, + "grad_norm": 0.41294019007158433, + "learning_rate": 4.5688629411041265e-05, + "loss": 0.938, + "step": 4293 + }, + { + "epoch": 0.2141645885286783, + "grad_norm": 0.503995111437273, + "learning_rate": 4.568636195763626e-05, + "loss": 0.946, + "step": 4294 + }, + { + "epoch": 0.214214463840399, + "grad_norm": 0.4444824436117272, + "learning_rate": 4.568409396442991e-05, + "loss": 0.9308, + "step": 4295 + }, + { + "epoch": 0.2142643391521197, + "grad_norm": 0.7010847371505131, + "learning_rate": 4.568182543148137e-05, + "loss": 0.9783, + "step": 4296 + }, + { + "epoch": 0.21431421446384039, + "grad_norm": 0.47974360208476324, + "learning_rate": 4.567955635884986e-05, + "loss": 0.9202, + "step": 4297 + }, + { + "epoch": 0.2143640897755611, + "grad_norm": 0.7470414135345532, + "learning_rate": 4.5677286746594585e-05, + "loss": 0.9235, + "step": 4298 + }, + { + "epoch": 0.2144139650872818, + "grad_norm": 0.450374947947033, + "learning_rate": 4.567501659477477e-05, + "loss": 0.9272, + "step": 4299 + }, + { + "epoch": 0.2144638403990025, + "grad_norm": 0.42936571939435947, + "learning_rate": 4.5672745903449635e-05, + "loss": 0.9261, + "step": 4300 + }, + { + "epoch": 0.2145137157107232, + "grad_norm": 0.46514021300766384, + "learning_rate": 4.567047467267846e-05, + "loss": 0.9006, + "step": 4301 + }, + { + "epoch": 0.21456359102244388, + "grad_norm": 0.4109493521260494, + "learning_rate": 4.566820290252049e-05, + "loss": 0.9767, + "step": 4302 + }, + { + "epoch": 0.2146134663341646, + "grad_norm": 0.4859456708137527, + "learning_rate": 4.566593059303502e-05, + "loss": 0.888, + "step": 4303 + }, + { + "epoch": 0.2146633416458853, + "grad_norm": 0.5419431655412574, + "learning_rate": 4.566365774428134e-05, + "loss": 0.9371, + "step": 4304 + }, + { + "epoch": 0.21471321695760598, + "grad_norm": 0.6191283778459388, + "learning_rate": 4.566138435631876e-05, + "loss": 0.907, + "step": 4305 + }, + { + "epoch": 0.2147630922693267, + "grad_norm": 0.425821236494463, + "learning_rate": 4.5659110429206596e-05, + "loss": 0.91, + "step": 4306 + }, + { + "epoch": 0.21481296758104737, + "grad_norm": 0.580586884055858, + "learning_rate": 4.5656835963004194e-05, + "loss": 0.9849, + "step": 4307 + }, + { + "epoch": 0.21486284289276808, + "grad_norm": 0.39438675568862297, + "learning_rate": 4.565456095777091e-05, + "loss": 0.8983, + "step": 4308 + }, + { + "epoch": 0.2149127182044888, + "grad_norm": 0.6765376051709493, + "learning_rate": 4.5652285413566095e-05, + "loss": 0.9763, + "step": 4309 + }, + { + "epoch": 0.21496259351620947, + "grad_norm": 0.4608634870313268, + "learning_rate": 4.565000933044914e-05, + "loss": 0.9209, + "step": 4310 + }, + { + "epoch": 0.21501246882793018, + "grad_norm": 0.413362572532137, + "learning_rate": 4.564773270847943e-05, + "loss": 0.95, + "step": 4311 + }, + { + "epoch": 0.21506234413965086, + "grad_norm": 0.46633973180062427, + "learning_rate": 4.564545554771637e-05, + "loss": 0.9806, + "step": 4312 + }, + { + "epoch": 0.21511221945137157, + "grad_norm": 0.422050228633862, + "learning_rate": 4.564317784821939e-05, + "loss": 0.929, + "step": 4313 + }, + { + "epoch": 0.21516209476309228, + "grad_norm": 0.7623917832785165, + "learning_rate": 4.5640899610047924e-05, + "loss": 0.963, + "step": 4314 + }, + { + "epoch": 0.21521197007481296, + "grad_norm": 0.548135885556672, + "learning_rate": 4.563862083326143e-05, + "loss": 0.9241, + "step": 4315 + }, + { + "epoch": 0.21526184538653367, + "grad_norm": 0.5891284519814394, + "learning_rate": 4.563634151791935e-05, + "loss": 0.9475, + "step": 4316 + }, + { + "epoch": 0.21531172069825436, + "grad_norm": 0.4816276979005306, + "learning_rate": 4.5634061664081173e-05, + "loss": 0.9193, + "step": 4317 + }, + { + "epoch": 0.21536159600997506, + "grad_norm": 0.5122747000847596, + "learning_rate": 4.5631781271806395e-05, + "loss": 0.9319, + "step": 4318 + }, + { + "epoch": 0.21541147132169577, + "grad_norm": 0.5244881218809079, + "learning_rate": 4.562950034115452e-05, + "loss": 0.9676, + "step": 4319 + }, + { + "epoch": 0.21546134663341646, + "grad_norm": 0.40483727028323685, + "learning_rate": 4.562721887218506e-05, + "loss": 0.91, + "step": 4320 + }, + { + "epoch": 0.21551122194513717, + "grad_norm": 0.558848677649534, + "learning_rate": 4.5624936864957556e-05, + "loss": 0.995, + "step": 4321 + }, + { + "epoch": 0.21556109725685785, + "grad_norm": 0.6718282472250905, + "learning_rate": 4.5622654319531565e-05, + "loss": 0.9485, + "step": 4322 + }, + { + "epoch": 0.21561097256857856, + "grad_norm": 0.39633840364082645, + "learning_rate": 4.562037123596663e-05, + "loss": 0.952, + "step": 4323 + }, + { + "epoch": 0.21566084788029924, + "grad_norm": 0.5247855982422742, + "learning_rate": 4.5618087614322335e-05, + "loss": 0.9432, + "step": 4324 + }, + { + "epoch": 0.21571072319201995, + "grad_norm": 0.6596900311616808, + "learning_rate": 4.5615803454658276e-05, + "loss": 0.9793, + "step": 4325 + }, + { + "epoch": 0.21576059850374066, + "grad_norm": 0.4137834125317839, + "learning_rate": 4.561351875703406e-05, + "loss": 0.903, + "step": 4326 + }, + { + "epoch": 0.21581047381546134, + "grad_norm": 0.45881502291046083, + "learning_rate": 4.5611233521509287e-05, + "loss": 0.968, + "step": 4327 + }, + { + "epoch": 0.21586034912718205, + "grad_norm": 0.5423800500502827, + "learning_rate": 4.560894774814359e-05, + "loss": 0.9558, + "step": 4328 + }, + { + "epoch": 0.21591022443890273, + "grad_norm": 1.0960634739592114, + "learning_rate": 4.560666143699664e-05, + "loss": 0.9691, + "step": 4329 + }, + { + "epoch": 0.21596009975062344, + "grad_norm": 0.4889075118488954, + "learning_rate": 4.560437458812808e-05, + "loss": 0.9223, + "step": 4330 + }, + { + "epoch": 0.21600997506234415, + "grad_norm": 0.4773935844282473, + "learning_rate": 4.560208720159759e-05, + "loss": 0.935, + "step": 4331 + }, + { + "epoch": 0.21605985037406483, + "grad_norm": 1.0290668605872642, + "learning_rate": 4.5599799277464846e-05, + "loss": 0.9211, + "step": 4332 + }, + { + "epoch": 0.21610972568578554, + "grad_norm": 0.45021189380463694, + "learning_rate": 4.559751081578956e-05, + "loss": 0.9756, + "step": 4333 + }, + { + "epoch": 0.21615960099750622, + "grad_norm": 0.46117095049014584, + "learning_rate": 4.559522181663145e-05, + "loss": 0.9781, + "step": 4334 + }, + { + "epoch": 0.21620947630922693, + "grad_norm": 0.46440983061443203, + "learning_rate": 4.5592932280050244e-05, + "loss": 0.9283, + "step": 4335 + }, + { + "epoch": 0.21625935162094764, + "grad_norm": 0.479582372008093, + "learning_rate": 4.559064220610568e-05, + "loss": 0.9321, + "step": 4336 + }, + { + "epoch": 0.21630922693266832, + "grad_norm": 0.42407796142504117, + "learning_rate": 4.5588351594857535e-05, + "loss": 0.9379, + "step": 4337 + }, + { + "epoch": 0.21635910224438903, + "grad_norm": 0.5948626977650523, + "learning_rate": 4.5586060446365565e-05, + "loss": 0.8788, + "step": 4338 + }, + { + "epoch": 0.21640897755610972, + "grad_norm": 0.44614372184444, + "learning_rate": 4.558376876068956e-05, + "loss": 0.9735, + "step": 4339 + }, + { + "epoch": 0.21645885286783043, + "grad_norm": 0.4287080548224908, + "learning_rate": 4.558147653788932e-05, + "loss": 0.9407, + "step": 4340 + }, + { + "epoch": 0.21650872817955114, + "grad_norm": 0.4947561477726004, + "learning_rate": 4.557918377802466e-05, + "loss": 0.963, + "step": 4341 + }, + { + "epoch": 0.21655860349127182, + "grad_norm": 0.5981003751609573, + "learning_rate": 4.557689048115541e-05, + "loss": 0.9606, + "step": 4342 + }, + { + "epoch": 0.21660847880299253, + "grad_norm": 0.5154369836575444, + "learning_rate": 4.557459664734141e-05, + "loss": 0.9365, + "step": 4343 + }, + { + "epoch": 0.2166583541147132, + "grad_norm": 0.4188277843461909, + "learning_rate": 4.557230227664252e-05, + "loss": 0.9729, + "step": 4344 + }, + { + "epoch": 0.21670822942643392, + "grad_norm": 0.4244623738823197, + "learning_rate": 4.557000736911862e-05, + "loss": 0.9585, + "step": 4345 + }, + { + "epoch": 0.2167581047381546, + "grad_norm": 0.42631076492007397, + "learning_rate": 4.5567711924829573e-05, + "loss": 0.9153, + "step": 4346 + }, + { + "epoch": 0.2168079800498753, + "grad_norm": 0.4598322409719337, + "learning_rate": 4.5565415943835285e-05, + "loss": 0.9061, + "step": 4347 + }, + { + "epoch": 0.21685785536159602, + "grad_norm": 0.4739407247841169, + "learning_rate": 4.5563119426195676e-05, + "loss": 0.9417, + "step": 4348 + }, + { + "epoch": 0.2169077306733167, + "grad_norm": 0.4603086390613469, + "learning_rate": 4.556082237197067e-05, + "loss": 0.9438, + "step": 4349 + }, + { + "epoch": 0.2169576059850374, + "grad_norm": 0.4651411528790603, + "learning_rate": 4.55585247812202e-05, + "loss": 0.9481, + "step": 4350 + }, + { + "epoch": 0.2170074812967581, + "grad_norm": 0.526685882978859, + "learning_rate": 4.555622665400423e-05, + "loss": 0.971, + "step": 4351 + }, + { + "epoch": 0.2170573566084788, + "grad_norm": 0.4690571836907775, + "learning_rate": 4.555392799038273e-05, + "loss": 0.9483, + "step": 4352 + }, + { + "epoch": 0.2171072319201995, + "grad_norm": 0.5139386107221531, + "learning_rate": 4.555162879041567e-05, + "loss": 0.9484, + "step": 4353 + }, + { + "epoch": 0.2171571072319202, + "grad_norm": 0.4455028094653584, + "learning_rate": 4.554932905416306e-05, + "loss": 0.8845, + "step": 4354 + }, + { + "epoch": 0.2172069825436409, + "grad_norm": 0.4497286649948538, + "learning_rate": 4.55470287816849e-05, + "loss": 0.9486, + "step": 4355 + }, + { + "epoch": 0.21725685785536158, + "grad_norm": 0.5464539762709129, + "learning_rate": 4.5544727973041226e-05, + "loss": 0.9678, + "step": 4356 + }, + { + "epoch": 0.2173067331670823, + "grad_norm": 0.415676544996099, + "learning_rate": 4.5542426628292066e-05, + "loss": 0.8901, + "step": 4357 + }, + { + "epoch": 0.217356608478803, + "grad_norm": 0.6532737151689866, + "learning_rate": 4.5540124747497476e-05, + "loss": 0.9477, + "step": 4358 + }, + { + "epoch": 0.21740648379052369, + "grad_norm": 0.548654644252957, + "learning_rate": 4.5537822330717526e-05, + "loss": 0.9654, + "step": 4359 + }, + { + "epoch": 0.2174563591022444, + "grad_norm": 0.5513997182954843, + "learning_rate": 4.5535519378012295e-05, + "loss": 0.9409, + "step": 4360 + }, + { + "epoch": 0.21750623441396508, + "grad_norm": 0.40365505960943593, + "learning_rate": 4.553321588944187e-05, + "loss": 0.9109, + "step": 4361 + }, + { + "epoch": 0.2175561097256858, + "grad_norm": 0.43735727882938075, + "learning_rate": 4.553091186506637e-05, + "loss": 0.9467, + "step": 4362 + }, + { + "epoch": 0.2176059850374065, + "grad_norm": 0.4621249018594084, + "learning_rate": 4.552860730494591e-05, + "loss": 0.9587, + "step": 4363 + }, + { + "epoch": 0.21765586034912718, + "grad_norm": 0.5091173885949247, + "learning_rate": 4.552630220914064e-05, + "loss": 0.9426, + "step": 4364 + }, + { + "epoch": 0.2177057356608479, + "grad_norm": 0.39018969560630995, + "learning_rate": 4.552399657771069e-05, + "loss": 0.9414, + "step": 4365 + }, + { + "epoch": 0.21775561097256857, + "grad_norm": 0.4808976566181806, + "learning_rate": 4.5521690410716235e-05, + "loss": 0.927, + "step": 4366 + }, + { + "epoch": 0.21780548628428928, + "grad_norm": 0.5473612907066283, + "learning_rate": 4.551938370821746e-05, + "loss": 0.9593, + "step": 4367 + }, + { + "epoch": 0.21785536159601, + "grad_norm": 0.4335442844945524, + "learning_rate": 4.551707647027454e-05, + "loss": 0.942, + "step": 4368 + }, + { + "epoch": 0.21790523690773067, + "grad_norm": 0.43389817162492894, + "learning_rate": 4.551476869694771e-05, + "loss": 0.9592, + "step": 4369 + }, + { + "epoch": 0.21795511221945138, + "grad_norm": 0.4564368256142703, + "learning_rate": 4.5512460388297154e-05, + "loss": 0.9641, + "step": 4370 + }, + { + "epoch": 0.21800498753117206, + "grad_norm": 0.5627303948043944, + "learning_rate": 4.551015154438313e-05, + "loss": 0.9239, + "step": 4371 + }, + { + "epoch": 0.21805486284289277, + "grad_norm": 0.4516675673313573, + "learning_rate": 4.550784216526588e-05, + "loss": 0.9739, + "step": 4372 + }, + { + "epoch": 0.21810473815461345, + "grad_norm": 0.4256038344529344, + "learning_rate": 4.5505532251005665e-05, + "loss": 0.9501, + "step": 4373 + }, + { + "epoch": 0.21815461346633416, + "grad_norm": 0.43847216999283073, + "learning_rate": 4.5503221801662765e-05, + "loss": 0.8921, + "step": 4374 + }, + { + "epoch": 0.21820448877805487, + "grad_norm": 0.393791574672812, + "learning_rate": 4.5500910817297474e-05, + "loss": 0.9221, + "step": 4375 + }, + { + "epoch": 0.21825436408977555, + "grad_norm": 0.47179465429466944, + "learning_rate": 4.5498599297970086e-05, + "loss": 0.9615, + "step": 4376 + }, + { + "epoch": 0.21830423940149626, + "grad_norm": 0.47545764568736465, + "learning_rate": 4.549628724374092e-05, + "loss": 0.886, + "step": 4377 + }, + { + "epoch": 0.21835411471321695, + "grad_norm": 0.4427171350844482, + "learning_rate": 4.549397465467031e-05, + "loss": 0.9498, + "step": 4378 + }, + { + "epoch": 0.21840399002493766, + "grad_norm": 0.7149681405174608, + "learning_rate": 4.549166153081861e-05, + "loss": 0.9657, + "step": 4379 + }, + { + "epoch": 0.21845386533665836, + "grad_norm": 0.5531585992120085, + "learning_rate": 4.548934787224617e-05, + "loss": 0.9032, + "step": 4380 + }, + { + "epoch": 0.21850374064837905, + "grad_norm": 0.5088655230041327, + "learning_rate": 4.548703367901337e-05, + "loss": 0.9253, + "step": 4381 + }, + { + "epoch": 0.21855361596009976, + "grad_norm": 0.4485940425525059, + "learning_rate": 4.548471895118059e-05, + "loss": 0.9276, + "step": 4382 + }, + { + "epoch": 0.21860349127182044, + "grad_norm": 0.41303148661880357, + "learning_rate": 4.548240368880824e-05, + "loss": 0.9167, + "step": 4383 + }, + { + "epoch": 0.21865336658354115, + "grad_norm": 0.421570377350384, + "learning_rate": 4.548008789195674e-05, + "loss": 0.939, + "step": 4384 + }, + { + "epoch": 0.21870324189526186, + "grad_norm": 0.44671923461154356, + "learning_rate": 4.54777715606865e-05, + "loss": 0.9079, + "step": 4385 + }, + { + "epoch": 0.21875311720698254, + "grad_norm": 0.5486763092177319, + "learning_rate": 4.547545469505797e-05, + "loss": 0.9606, + "step": 4386 + }, + { + "epoch": 0.21880299251870325, + "grad_norm": 0.45948261849542754, + "learning_rate": 4.547313729513163e-05, + "loss": 0.9204, + "step": 4387 + }, + { + "epoch": 0.21885286783042393, + "grad_norm": 0.4520204978979354, + "learning_rate": 4.5470819360967926e-05, + "loss": 0.9157, + "step": 4388 + }, + { + "epoch": 0.21890274314214464, + "grad_norm": 0.5173192032642128, + "learning_rate": 4.5468500892627355e-05, + "loss": 0.9455, + "step": 4389 + }, + { + "epoch": 0.21895261845386535, + "grad_norm": 0.48401352750649246, + "learning_rate": 4.5466181890170414e-05, + "loss": 0.9298, + "step": 4390 + }, + { + "epoch": 0.21900249376558603, + "grad_norm": 0.4466137989407745, + "learning_rate": 4.5463862353657614e-05, + "loss": 0.9255, + "step": 4391 + }, + { + "epoch": 0.21905236907730674, + "grad_norm": 0.5840361938906921, + "learning_rate": 4.546154228314948e-05, + "loss": 0.9632, + "step": 4392 + }, + { + "epoch": 0.21910224438902742, + "grad_norm": 0.43515537712724006, + "learning_rate": 4.5459221678706566e-05, + "loss": 0.9387, + "step": 4393 + }, + { + "epoch": 0.21915211970074813, + "grad_norm": 0.703677165061387, + "learning_rate": 4.545690054038941e-05, + "loss": 0.9673, + "step": 4394 + }, + { + "epoch": 0.21920199501246881, + "grad_norm": 0.43256595621416727, + "learning_rate": 4.54545788682586e-05, + "loss": 0.9277, + "step": 4395 + }, + { + "epoch": 0.21925187032418952, + "grad_norm": 0.43408895799949787, + "learning_rate": 4.54522566623747e-05, + "loss": 0.9157, + "step": 4396 + }, + { + "epoch": 0.21930174563591023, + "grad_norm": 0.6116540350178591, + "learning_rate": 4.544993392279833e-05, + "loss": 0.9388, + "step": 4397 + }, + { + "epoch": 0.21935162094763092, + "grad_norm": 0.4627343677666659, + "learning_rate": 4.5447610649590064e-05, + "loss": 0.9573, + "step": 4398 + }, + { + "epoch": 0.21940149625935162, + "grad_norm": 0.49936343088529356, + "learning_rate": 4.544528684281056e-05, + "loss": 0.9575, + "step": 4399 + }, + { + "epoch": 0.2194513715710723, + "grad_norm": 0.4126451505003239, + "learning_rate": 4.544296250252045e-05, + "loss": 0.9236, + "step": 4400 + }, + { + "epoch": 0.21950124688279302, + "grad_norm": 0.5791729249976201, + "learning_rate": 4.544063762878038e-05, + "loss": 0.9574, + "step": 4401 + }, + { + "epoch": 0.21955112219451373, + "grad_norm": 0.49392606446601295, + "learning_rate": 4.543831222165103e-05, + "loss": 0.9191, + "step": 4402 + }, + { + "epoch": 0.2196009975062344, + "grad_norm": 0.5928328056144527, + "learning_rate": 4.543598628119305e-05, + "loss": 0.9035, + "step": 4403 + }, + { + "epoch": 0.21965087281795512, + "grad_norm": 0.4785314911849246, + "learning_rate": 4.5433659807467164e-05, + "loss": 0.935, + "step": 4404 + }, + { + "epoch": 0.2197007481296758, + "grad_norm": 0.4469784226496395, + "learning_rate": 4.5431332800534065e-05, + "loss": 0.898, + "step": 4405 + }, + { + "epoch": 0.2197506234413965, + "grad_norm": 0.5136437902826496, + "learning_rate": 4.542900526045448e-05, + "loss": 0.9422, + "step": 4406 + }, + { + "epoch": 0.21980049875311722, + "grad_norm": 0.5652582835653431, + "learning_rate": 4.5426677187289156e-05, + "loss": 0.9644, + "step": 4407 + }, + { + "epoch": 0.2198503740648379, + "grad_norm": 0.45162370985432176, + "learning_rate": 4.5424348581098825e-05, + "loss": 0.8881, + "step": 4408 + }, + { + "epoch": 0.2199002493765586, + "grad_norm": 0.4036032169445329, + "learning_rate": 4.5422019441944257e-05, + "loss": 0.8869, + "step": 4409 + }, + { + "epoch": 0.2199501246882793, + "grad_norm": 0.46193054195558053, + "learning_rate": 4.541968976988623e-05, + "loss": 0.9562, + "step": 4410 + }, + { + "epoch": 0.22, + "grad_norm": 0.5291792516498818, + "learning_rate": 4.541735956498554e-05, + "loss": 0.9853, + "step": 4411 + }, + { + "epoch": 0.2200498753117207, + "grad_norm": 0.4778705036328966, + "learning_rate": 4.5415028827302997e-05, + "loss": 0.9019, + "step": 4412 + }, + { + "epoch": 0.2200997506234414, + "grad_norm": 0.7920316778345364, + "learning_rate": 4.541269755689941e-05, + "loss": 0.9165, + "step": 4413 + }, + { + "epoch": 0.2201496259351621, + "grad_norm": 0.45250350565436476, + "learning_rate": 4.541036575383561e-05, + "loss": 0.9292, + "step": 4414 + }, + { + "epoch": 0.22019950124688278, + "grad_norm": 0.5606227511104827, + "learning_rate": 4.5408033418172455e-05, + "loss": 0.9079, + "step": 4415 + }, + { + "epoch": 0.2202493765586035, + "grad_norm": 0.48618037067755465, + "learning_rate": 4.54057005499708e-05, + "loss": 0.9529, + "step": 4416 + }, + { + "epoch": 0.2202992518703242, + "grad_norm": 0.518517488431199, + "learning_rate": 4.540336714929152e-05, + "loss": 0.9982, + "step": 4417 + }, + { + "epoch": 0.22034912718204488, + "grad_norm": 0.4187003887911755, + "learning_rate": 4.54010332161955e-05, + "loss": 0.9257, + "step": 4418 + }, + { + "epoch": 0.2203990024937656, + "grad_norm": 0.4686610968990627, + "learning_rate": 4.539869875074366e-05, + "loss": 0.959, + "step": 4419 + }, + { + "epoch": 0.22044887780548628, + "grad_norm": 0.45862192795258844, + "learning_rate": 4.53963637529969e-05, + "loss": 0.984, + "step": 4420 + }, + { + "epoch": 0.22049875311720699, + "grad_norm": 0.47186488511357416, + "learning_rate": 4.539402822301616e-05, + "loss": 0.9411, + "step": 4421 + }, + { + "epoch": 0.22054862842892767, + "grad_norm": 0.42748226306821024, + "learning_rate": 4.539169216086238e-05, + "loss": 0.9655, + "step": 4422 + }, + { + "epoch": 0.22059850374064838, + "grad_norm": 0.5048873402177982, + "learning_rate": 4.538935556659651e-05, + "loss": 0.9416, + "step": 4423 + }, + { + "epoch": 0.2206483790523691, + "grad_norm": 0.3877921516492808, + "learning_rate": 4.538701844027954e-05, + "loss": 0.926, + "step": 4424 + }, + { + "epoch": 0.22069825436408977, + "grad_norm": 0.49526204387548484, + "learning_rate": 4.5384680781972446e-05, + "loss": 0.9549, + "step": 4425 + }, + { + "epoch": 0.22074812967581048, + "grad_norm": 0.5771910194212572, + "learning_rate": 4.538234259173622e-05, + "loss": 0.9789, + "step": 4426 + }, + { + "epoch": 0.22079800498753116, + "grad_norm": 0.4833033967137469, + "learning_rate": 4.5380003869631906e-05, + "loss": 0.9463, + "step": 4427 + }, + { + "epoch": 0.22084788029925187, + "grad_norm": 0.43757404849429304, + "learning_rate": 4.53776646157205e-05, + "loss": 0.9034, + "step": 4428 + }, + { + "epoch": 0.22089775561097258, + "grad_norm": 0.42545708400003246, + "learning_rate": 4.537532483006306e-05, + "loss": 0.9189, + "step": 4429 + }, + { + "epoch": 0.22094763092269326, + "grad_norm": 0.4683984593284979, + "learning_rate": 4.537298451272063e-05, + "loss": 0.8967, + "step": 4430 + }, + { + "epoch": 0.22099750623441397, + "grad_norm": 0.4817734878680919, + "learning_rate": 4.537064366375429e-05, + "loss": 0.9223, + "step": 4431 + }, + { + "epoch": 0.22104738154613465, + "grad_norm": 0.45249359352798885, + "learning_rate": 4.536830228322513e-05, + "loss": 0.9064, + "step": 4432 + }, + { + "epoch": 0.22109725685785536, + "grad_norm": 0.49608452462903463, + "learning_rate": 4.536596037119423e-05, + "loss": 0.9838, + "step": 4433 + }, + { + "epoch": 0.22114713216957607, + "grad_norm": 0.6775429087675676, + "learning_rate": 4.53636179277227e-05, + "loss": 0.9035, + "step": 4434 + }, + { + "epoch": 0.22119700748129675, + "grad_norm": 0.5054213143329956, + "learning_rate": 4.5361274952871684e-05, + "loss": 0.9578, + "step": 4435 + }, + { + "epoch": 0.22124688279301746, + "grad_norm": 0.477177255861649, + "learning_rate": 4.5358931446702315e-05, + "loss": 0.9278, + "step": 4436 + }, + { + "epoch": 0.22129675810473814, + "grad_norm": 0.5403010553669256, + "learning_rate": 4.535658740927573e-05, + "loss": 0.9298, + "step": 4437 + }, + { + "epoch": 0.22134663341645885, + "grad_norm": 0.5393631238645687, + "learning_rate": 4.535424284065311e-05, + "loss": 0.9408, + "step": 4438 + }, + { + "epoch": 0.22139650872817956, + "grad_norm": 0.49325835861694817, + "learning_rate": 4.535189774089563e-05, + "loss": 0.8693, + "step": 4439 + }, + { + "epoch": 0.22144638403990025, + "grad_norm": 0.6565551470624283, + "learning_rate": 4.53495521100645e-05, + "loss": 0.937, + "step": 4440 + }, + { + "epoch": 0.22149625935162096, + "grad_norm": 0.48381786929388915, + "learning_rate": 4.5347205948220905e-05, + "loss": 0.969, + "step": 4441 + }, + { + "epoch": 0.22154613466334164, + "grad_norm": 0.490568307200594, + "learning_rate": 4.534485925542607e-05, + "loss": 0.9492, + "step": 4442 + }, + { + "epoch": 0.22159600997506235, + "grad_norm": 0.4961816015033156, + "learning_rate": 4.534251203174125e-05, + "loss": 0.8987, + "step": 4443 + }, + { + "epoch": 0.22164588528678303, + "grad_norm": 0.6242201285378002, + "learning_rate": 4.534016427722768e-05, + "loss": 0.929, + "step": 4444 + }, + { + "epoch": 0.22169576059850374, + "grad_norm": 0.5349273967975187, + "learning_rate": 4.533781599194663e-05, + "loss": 0.95, + "step": 4445 + }, + { + "epoch": 0.22174563591022445, + "grad_norm": 0.4231480106481833, + "learning_rate": 4.533546717595937e-05, + "loss": 0.9378, + "step": 4446 + }, + { + "epoch": 0.22179551122194513, + "grad_norm": 0.48957984025535894, + "learning_rate": 4.533311782932719e-05, + "loss": 0.949, + "step": 4447 + }, + { + "epoch": 0.22184538653366584, + "grad_norm": 0.44037586446414073, + "learning_rate": 4.53307679521114e-05, + "loss": 0.8846, + "step": 4448 + }, + { + "epoch": 0.22189526184538652, + "grad_norm": 0.4679162210970334, + "learning_rate": 4.532841754437333e-05, + "loss": 0.9085, + "step": 4449 + }, + { + "epoch": 0.22194513715710723, + "grad_norm": 0.42982522801542367, + "learning_rate": 4.532606660617429e-05, + "loss": 0.9601, + "step": 4450 + }, + { + "epoch": 0.22199501246882794, + "grad_norm": 0.45181528446475855, + "learning_rate": 4.532371513757564e-05, + "loss": 0.9161, + "step": 4451 + }, + { + "epoch": 0.22204488778054862, + "grad_norm": 0.4787299021673891, + "learning_rate": 4.532136313863875e-05, + "loss": 0.8999, + "step": 4452 + }, + { + "epoch": 0.22209476309226933, + "grad_norm": 0.939888872533707, + "learning_rate": 4.531901060942497e-05, + "loss": 0.9489, + "step": 4453 + }, + { + "epoch": 0.22214463840399, + "grad_norm": 0.450316002972556, + "learning_rate": 4.531665754999571e-05, + "loss": 0.9566, + "step": 4454 + }, + { + "epoch": 0.22219451371571072, + "grad_norm": 0.4345413241024383, + "learning_rate": 4.531430396041236e-05, + "loss": 0.9084, + "step": 4455 + }, + { + "epoch": 0.22224438902743143, + "grad_norm": 0.4996825173430485, + "learning_rate": 4.5311949840736345e-05, + "loss": 0.9033, + "step": 4456 + }, + { + "epoch": 0.22229426433915211, + "grad_norm": 0.5583421480462285, + "learning_rate": 4.530959519102909e-05, + "loss": 0.9332, + "step": 4457 + }, + { + "epoch": 0.22234413965087282, + "grad_norm": 0.7538009679686211, + "learning_rate": 4.5307240011352025e-05, + "loss": 0.9187, + "step": 4458 + }, + { + "epoch": 0.2223940149625935, + "grad_norm": 0.4741902134463721, + "learning_rate": 4.530488430176664e-05, + "loss": 0.8915, + "step": 4459 + }, + { + "epoch": 0.22244389027431422, + "grad_norm": 0.46617806263763545, + "learning_rate": 4.530252806233437e-05, + "loss": 0.9293, + "step": 4460 + }, + { + "epoch": 0.22249376558603493, + "grad_norm": 0.48973894175842037, + "learning_rate": 4.5300171293116725e-05, + "loss": 0.938, + "step": 4461 + }, + { + "epoch": 0.2225436408977556, + "grad_norm": 0.450995927573271, + "learning_rate": 4.529781399417519e-05, + "loss": 0.9488, + "step": 4462 + }, + { + "epoch": 0.22259351620947632, + "grad_norm": 0.5583813342950508, + "learning_rate": 4.529545616557128e-05, + "loss": 1.0, + "step": 4463 + }, + { + "epoch": 0.222643391521197, + "grad_norm": 0.43287169071922615, + "learning_rate": 4.529309780736654e-05, + "loss": 0.9297, + "step": 4464 + }, + { + "epoch": 0.2226932668329177, + "grad_norm": 0.4683441404731378, + "learning_rate": 4.529073891962248e-05, + "loss": 0.921, + "step": 4465 + }, + { + "epoch": 0.22274314214463842, + "grad_norm": 0.4470006918051519, + "learning_rate": 4.5288379502400666e-05, + "loss": 0.9156, + "step": 4466 + }, + { + "epoch": 0.2227930174563591, + "grad_norm": 2.967525886260455, + "learning_rate": 4.528601955576268e-05, + "loss": 0.9713, + "step": 4467 + }, + { + "epoch": 0.2228428927680798, + "grad_norm": 0.5188147067913663, + "learning_rate": 4.528365907977009e-05, + "loss": 0.9435, + "step": 4468 + }, + { + "epoch": 0.2228927680798005, + "grad_norm": 0.4119721778734567, + "learning_rate": 4.528129807448449e-05, + "loss": 0.8935, + "step": 4469 + }, + { + "epoch": 0.2229426433915212, + "grad_norm": 0.5311645838953984, + "learning_rate": 4.52789365399675e-05, + "loss": 0.9471, + "step": 4470 + }, + { + "epoch": 0.22299251870324188, + "grad_norm": 0.5279780236003874, + "learning_rate": 4.527657447628073e-05, + "loss": 0.92, + "step": 4471 + }, + { + "epoch": 0.2230423940149626, + "grad_norm": 0.4370736213813588, + "learning_rate": 4.527421188348582e-05, + "loss": 0.9489, + "step": 4472 + }, + { + "epoch": 0.2230922693266833, + "grad_norm": 0.4809458775085037, + "learning_rate": 4.527184876164443e-05, + "loss": 0.9087, + "step": 4473 + }, + { + "epoch": 0.22314214463840398, + "grad_norm": 0.39390369789474233, + "learning_rate": 4.5269485110818225e-05, + "loss": 0.8963, + "step": 4474 + }, + { + "epoch": 0.2231920199501247, + "grad_norm": 0.5502169067839405, + "learning_rate": 4.5267120931068875e-05, + "loss": 0.9122, + "step": 4475 + }, + { + "epoch": 0.22324189526184537, + "grad_norm": 0.478132405829049, + "learning_rate": 4.526475622245807e-05, + "loss": 0.9139, + "step": 4476 + }, + { + "epoch": 0.22329177057356608, + "grad_norm": 0.49555035041171336, + "learning_rate": 4.526239098504752e-05, + "loss": 0.9188, + "step": 4477 + }, + { + "epoch": 0.2233416458852868, + "grad_norm": 0.5276873118226775, + "learning_rate": 4.526002521889895e-05, + "loss": 0.9247, + "step": 4478 + }, + { + "epoch": 0.22339152119700748, + "grad_norm": 0.44132473356825586, + "learning_rate": 4.525765892407409e-05, + "loss": 0.9362, + "step": 4479 + }, + { + "epoch": 0.22344139650872819, + "grad_norm": 0.5008433216585713, + "learning_rate": 4.525529210063468e-05, + "loss": 0.9312, + "step": 4480 + }, + { + "epoch": 0.22349127182044887, + "grad_norm": 0.4092259209028909, + "learning_rate": 4.525292474864249e-05, + "loss": 0.9495, + "step": 4481 + }, + { + "epoch": 0.22354114713216958, + "grad_norm": 0.5524372372426258, + "learning_rate": 4.52505568681593e-05, + "loss": 0.916, + "step": 4482 + }, + { + "epoch": 0.22359102244389029, + "grad_norm": 0.39958702483059716, + "learning_rate": 4.5248188459246883e-05, + "loss": 0.8656, + "step": 4483 + }, + { + "epoch": 0.22364089775561097, + "grad_norm": 0.5011784205736379, + "learning_rate": 4.524581952196705e-05, + "loss": 0.9294, + "step": 4484 + }, + { + "epoch": 0.22369077306733168, + "grad_norm": 0.4386265583501846, + "learning_rate": 4.524345005638162e-05, + "loss": 0.9147, + "step": 4485 + }, + { + "epoch": 0.22374064837905236, + "grad_norm": 0.37913383835631137, + "learning_rate": 4.524108006255242e-05, + "loss": 0.8903, + "step": 4486 + }, + { + "epoch": 0.22379052369077307, + "grad_norm": 0.4943758249704032, + "learning_rate": 4.52387095405413e-05, + "loss": 0.9407, + "step": 4487 + }, + { + "epoch": 0.22384039900249378, + "grad_norm": 0.42252024965763096, + "learning_rate": 4.5236338490410104e-05, + "loss": 0.9259, + "step": 4488 + }, + { + "epoch": 0.22389027431421446, + "grad_norm": 0.4398414487370559, + "learning_rate": 4.523396691222071e-05, + "loss": 0.9521, + "step": 4489 + }, + { + "epoch": 0.22394014962593517, + "grad_norm": 0.43228077657830466, + "learning_rate": 4.5231594806035014e-05, + "loss": 0.9308, + "step": 4490 + }, + { + "epoch": 0.22399002493765585, + "grad_norm": 0.6409249207935056, + "learning_rate": 4.52292221719149e-05, + "loss": 0.9383, + "step": 4491 + }, + { + "epoch": 0.22403990024937656, + "grad_norm": 0.5075616587177175, + "learning_rate": 4.522684900992229e-05, + "loss": 0.9646, + "step": 4492 + }, + { + "epoch": 0.22408977556109724, + "grad_norm": 0.5475317995182334, + "learning_rate": 4.5224475320119106e-05, + "loss": 0.9339, + "step": 4493 + }, + { + "epoch": 0.22413965087281795, + "grad_norm": 0.4603626066659591, + "learning_rate": 4.522210110256729e-05, + "loss": 0.932, + "step": 4494 + }, + { + "epoch": 0.22418952618453866, + "grad_norm": 0.45776161061261883, + "learning_rate": 4.52197263573288e-05, + "loss": 0.9074, + "step": 4495 + }, + { + "epoch": 0.22423940149625934, + "grad_norm": 0.420948238403378, + "learning_rate": 4.5217351084465595e-05, + "loss": 0.9166, + "step": 4496 + }, + { + "epoch": 0.22428927680798005, + "grad_norm": 0.40326343868646636, + "learning_rate": 4.521497528403966e-05, + "loss": 0.9373, + "step": 4497 + }, + { + "epoch": 0.22433915211970074, + "grad_norm": 0.7260943938198063, + "learning_rate": 4.521259895611299e-05, + "loss": 0.9272, + "step": 4498 + }, + { + "epoch": 0.22438902743142145, + "grad_norm": 0.38973822526561486, + "learning_rate": 4.521022210074761e-05, + "loss": 0.9504, + "step": 4499 + }, + { + "epoch": 0.22443890274314215, + "grad_norm": 0.4490705695454286, + "learning_rate": 4.520784471800552e-05, + "loss": 0.9347, + "step": 4500 + }, + { + "epoch": 0.22448877805486284, + "grad_norm": 0.40895947039122976, + "learning_rate": 4.5205466807948767e-05, + "loss": 0.922, + "step": 4501 + }, + { + "epoch": 0.22453865336658355, + "grad_norm": 0.41728538860286785, + "learning_rate": 4.520308837063939e-05, + "loss": 0.865, + "step": 4502 + }, + { + "epoch": 0.22458852867830423, + "grad_norm": 0.40304914606363834, + "learning_rate": 4.520070940613948e-05, + "loss": 0.9478, + "step": 4503 + }, + { + "epoch": 0.22463840399002494, + "grad_norm": 0.5274072422674049, + "learning_rate": 4.519832991451109e-05, + "loss": 0.9265, + "step": 4504 + }, + { + "epoch": 0.22468827930174565, + "grad_norm": 0.41473516595789567, + "learning_rate": 4.5195949895816325e-05, + "loss": 0.9485, + "step": 4505 + }, + { + "epoch": 0.22473815461346633, + "grad_norm": 0.4433686317034449, + "learning_rate": 4.519356935011728e-05, + "loss": 0.9285, + "step": 4506 + }, + { + "epoch": 0.22478802992518704, + "grad_norm": 0.5301554765615856, + "learning_rate": 4.519118827747608e-05, + "loss": 0.8917, + "step": 4507 + }, + { + "epoch": 0.22483790523690772, + "grad_norm": 0.4117299862577609, + "learning_rate": 4.5188806677954864e-05, + "loss": 0.9078, + "step": 4508 + }, + { + "epoch": 0.22488778054862843, + "grad_norm": 0.4798850465394991, + "learning_rate": 4.518642455161577e-05, + "loss": 0.9079, + "step": 4509 + }, + { + "epoch": 0.22493765586034914, + "grad_norm": 0.4680535415123048, + "learning_rate": 4.518404189852097e-05, + "loss": 0.941, + "step": 4510 + }, + { + "epoch": 0.22498753117206982, + "grad_norm": 0.5219155624653053, + "learning_rate": 4.518165871873262e-05, + "loss": 0.9506, + "step": 4511 + }, + { + "epoch": 0.22503740648379053, + "grad_norm": 0.39310691613304355, + "learning_rate": 4.517927501231292e-05, + "loss": 0.9064, + "step": 4512 + }, + { + "epoch": 0.2250872817955112, + "grad_norm": 0.4551433448282818, + "learning_rate": 4.5176890779324065e-05, + "loss": 0.9545, + "step": 4513 + }, + { + "epoch": 0.22513715710723192, + "grad_norm": 0.546249727523388, + "learning_rate": 4.5174506019828285e-05, + "loss": 0.9433, + "step": 4514 + }, + { + "epoch": 0.22518703241895263, + "grad_norm": 0.43584333454988733, + "learning_rate": 4.517212073388779e-05, + "loss": 0.9228, + "step": 4515 + }, + { + "epoch": 0.2252369077306733, + "grad_norm": 0.42725995018888707, + "learning_rate": 4.516973492156484e-05, + "loss": 0.9378, + "step": 4516 + }, + { + "epoch": 0.22528678304239402, + "grad_norm": 0.6400035068259635, + "learning_rate": 4.516734858292168e-05, + "loss": 0.9796, + "step": 4517 + }, + { + "epoch": 0.2253366583541147, + "grad_norm": 0.4399221897108341, + "learning_rate": 4.5164961718020585e-05, + "loss": 0.9185, + "step": 4518 + }, + { + "epoch": 0.22538653366583541, + "grad_norm": 0.4440445566918009, + "learning_rate": 4.5162574326923834e-05, + "loss": 0.8849, + "step": 4519 + }, + { + "epoch": 0.2254364089775561, + "grad_norm": 0.47816014855230016, + "learning_rate": 4.516018640969374e-05, + "loss": 0.9167, + "step": 4520 + }, + { + "epoch": 0.2254862842892768, + "grad_norm": 0.6515622536228634, + "learning_rate": 4.51577979663926e-05, + "loss": 0.9243, + "step": 4521 + }, + { + "epoch": 0.22553615960099752, + "grad_norm": 0.5154371832029828, + "learning_rate": 4.515540899708274e-05, + "loss": 0.9066, + "step": 4522 + }, + { + "epoch": 0.2255860349127182, + "grad_norm": 0.41970597697193823, + "learning_rate": 4.51530195018265e-05, + "loss": 0.9541, + "step": 4523 + }, + { + "epoch": 0.2256359102244389, + "grad_norm": 0.6121518809732003, + "learning_rate": 4.5150629480686244e-05, + "loss": 0.919, + "step": 4524 + }, + { + "epoch": 0.2256857855361596, + "grad_norm": 0.5609954358369588, + "learning_rate": 4.514823893372432e-05, + "loss": 0.9123, + "step": 4525 + }, + { + "epoch": 0.2257356608478803, + "grad_norm": 0.44919265120906965, + "learning_rate": 4.5145847861003135e-05, + "loss": 0.9263, + "step": 4526 + }, + { + "epoch": 0.225785536159601, + "grad_norm": 0.4732905397470889, + "learning_rate": 4.514345626258505e-05, + "loss": 0.9419, + "step": 4527 + }, + { + "epoch": 0.2258354114713217, + "grad_norm": 0.5424583799168728, + "learning_rate": 4.51410641385325e-05, + "loss": 0.9335, + "step": 4528 + }, + { + "epoch": 0.2258852867830424, + "grad_norm": 0.4940063590786666, + "learning_rate": 4.513867148890788e-05, + "loss": 0.9629, + "step": 4529 + }, + { + "epoch": 0.22593516209476308, + "grad_norm": 0.7045830229533458, + "learning_rate": 4.513627831377365e-05, + "loss": 0.8895, + "step": 4530 + }, + { + "epoch": 0.2259850374064838, + "grad_norm": 0.5832235011638229, + "learning_rate": 4.5133884613192244e-05, + "loss": 0.9221, + "step": 4531 + }, + { + "epoch": 0.2260349127182045, + "grad_norm": 0.4572651269914723, + "learning_rate": 4.513149038722614e-05, + "loss": 0.9742, + "step": 4532 + }, + { + "epoch": 0.22608478802992518, + "grad_norm": 0.4815980498041376, + "learning_rate": 4.51290956359378e-05, + "loss": 0.924, + "step": 4533 + }, + { + "epoch": 0.2261346633416459, + "grad_norm": 0.5329927797289901, + "learning_rate": 4.512670035938972e-05, + "loss": 0.9192, + "step": 4534 + }, + { + "epoch": 0.22618453865336657, + "grad_norm": 0.5977811631916747, + "learning_rate": 4.5124304557644393e-05, + "loss": 0.9341, + "step": 4535 + }, + { + "epoch": 0.22623441396508728, + "grad_norm": 0.4393655266621339, + "learning_rate": 4.5121908230764356e-05, + "loss": 0.9105, + "step": 4536 + }, + { + "epoch": 0.226284289276808, + "grad_norm": 0.48278231695793145, + "learning_rate": 4.511951137881212e-05, + "loss": 0.9482, + "step": 4537 + }, + { + "epoch": 0.22633416458852867, + "grad_norm": 0.5556895437531458, + "learning_rate": 4.511711400185024e-05, + "loss": 0.9468, + "step": 4538 + }, + { + "epoch": 0.22638403990024938, + "grad_norm": 0.48739490006802133, + "learning_rate": 4.5114716099941276e-05, + "loss": 0.9617, + "step": 4539 + }, + { + "epoch": 0.22643391521197007, + "grad_norm": 0.4573543143975015, + "learning_rate": 4.5112317673147795e-05, + "loss": 0.9352, + "step": 4540 + }, + { + "epoch": 0.22648379052369078, + "grad_norm": 0.47185253041378644, + "learning_rate": 4.510991872153239e-05, + "loss": 0.914, + "step": 4541 + }, + { + "epoch": 0.22653366583541149, + "grad_norm": 0.5872003645127886, + "learning_rate": 4.5107519245157654e-05, + "loss": 0.9672, + "step": 4542 + }, + { + "epoch": 0.22658354114713217, + "grad_norm": 0.510694114080915, + "learning_rate": 4.51051192440862e-05, + "loss": 0.9203, + "step": 4543 + }, + { + "epoch": 0.22663341645885288, + "grad_norm": 0.48813763508844166, + "learning_rate": 4.510271871838066e-05, + "loss": 0.933, + "step": 4544 + }, + { + "epoch": 0.22668329177057356, + "grad_norm": 0.49562579398992485, + "learning_rate": 4.510031766810367e-05, + "loss": 0.9131, + "step": 4545 + }, + { + "epoch": 0.22673316708229427, + "grad_norm": 0.7412517590095619, + "learning_rate": 4.5097916093317895e-05, + "loss": 0.9499, + "step": 4546 + }, + { + "epoch": 0.22678304239401495, + "grad_norm": 0.46246756436889497, + "learning_rate": 4.509551399408598e-05, + "loss": 0.928, + "step": 4547 + }, + { + "epoch": 0.22683291770573566, + "grad_norm": 1.0274646425404124, + "learning_rate": 4.509311137047063e-05, + "loss": 0.9165, + "step": 4548 + }, + { + "epoch": 0.22688279301745637, + "grad_norm": 0.42943161080835346, + "learning_rate": 4.509070822253453e-05, + "loss": 0.921, + "step": 4549 + }, + { + "epoch": 0.22693266832917705, + "grad_norm": 0.43120950159464566, + "learning_rate": 4.508830455034039e-05, + "loss": 0.9684, + "step": 4550 + }, + { + "epoch": 0.22698254364089776, + "grad_norm": 0.46065704131018287, + "learning_rate": 4.5085900353950926e-05, + "loss": 0.9764, + "step": 4551 + }, + { + "epoch": 0.22703241895261844, + "grad_norm": 0.4962702806991681, + "learning_rate": 4.5083495633428886e-05, + "loss": 0.9242, + "step": 4552 + }, + { + "epoch": 0.22708229426433915, + "grad_norm": 0.4892879540743114, + "learning_rate": 4.5081090388837024e-05, + "loss": 0.9053, + "step": 4553 + }, + { + "epoch": 0.22713216957605986, + "grad_norm": 0.46674124584912835, + "learning_rate": 4.507868462023809e-05, + "loss": 0.9305, + "step": 4554 + }, + { + "epoch": 0.22718204488778054, + "grad_norm": 1.5458907409951912, + "learning_rate": 4.507627832769486e-05, + "loss": 0.902, + "step": 4555 + }, + { + "epoch": 0.22723192019950125, + "grad_norm": 0.45932726445783234, + "learning_rate": 4.507387151127014e-05, + "loss": 0.959, + "step": 4556 + }, + { + "epoch": 0.22728179551122193, + "grad_norm": 0.5565096624139794, + "learning_rate": 4.5071464171026725e-05, + "loss": 0.9023, + "step": 4557 + }, + { + "epoch": 0.22733167082294264, + "grad_norm": 0.4656954063468958, + "learning_rate": 4.506905630702743e-05, + "loss": 0.924, + "step": 4558 + }, + { + "epoch": 0.22738154613466335, + "grad_norm": 0.541731009825457, + "learning_rate": 4.50666479193351e-05, + "loss": 0.9039, + "step": 4559 + }, + { + "epoch": 0.22743142144638404, + "grad_norm": 0.6459117653554521, + "learning_rate": 4.506423900801257e-05, + "loss": 0.9728, + "step": 4560 + }, + { + "epoch": 0.22748129675810475, + "grad_norm": 0.5421505446036609, + "learning_rate": 4.50618295731227e-05, + "loss": 0.9386, + "step": 4561 + }, + { + "epoch": 0.22753117206982543, + "grad_norm": 0.46864921321463426, + "learning_rate": 4.505941961472837e-05, + "loss": 0.941, + "step": 4562 + }, + { + "epoch": 0.22758104738154614, + "grad_norm": 0.9860154722959188, + "learning_rate": 4.505700913289246e-05, + "loss": 0.9353, + "step": 4563 + }, + { + "epoch": 0.22763092269326685, + "grad_norm": 0.6354071302586961, + "learning_rate": 4.505459812767787e-05, + "loss": 0.951, + "step": 4564 + }, + { + "epoch": 0.22768079800498753, + "grad_norm": 0.7081248553666478, + "learning_rate": 4.5052186599147526e-05, + "loss": 0.9147, + "step": 4565 + }, + { + "epoch": 0.22773067331670824, + "grad_norm": 1.3897160527901729, + "learning_rate": 4.5049774547364345e-05, + "loss": 0.9867, + "step": 4566 + }, + { + "epoch": 0.22778054862842892, + "grad_norm": 0.4510783655099806, + "learning_rate": 4.504736197239127e-05, + "loss": 0.9711, + "step": 4567 + }, + { + "epoch": 0.22783042394014963, + "grad_norm": 0.4683446391345248, + "learning_rate": 4.504494887429125e-05, + "loss": 0.9201, + "step": 4568 + }, + { + "epoch": 0.2278802992518703, + "grad_norm": 0.45014399876612265, + "learning_rate": 4.504253525312726e-05, + "loss": 0.9187, + "step": 4569 + }, + { + "epoch": 0.22793017456359102, + "grad_norm": 0.4433594589127217, + "learning_rate": 4.5040121108962285e-05, + "loss": 0.9255, + "step": 4570 + }, + { + "epoch": 0.22798004987531173, + "grad_norm": 0.4619095408177242, + "learning_rate": 4.5037706441859325e-05, + "loss": 0.9443, + "step": 4571 + }, + { + "epoch": 0.2280299251870324, + "grad_norm": 0.6636780447598637, + "learning_rate": 4.5035291251881375e-05, + "loss": 0.9342, + "step": 4572 + }, + { + "epoch": 0.22807980049875312, + "grad_norm": 0.5505826299562231, + "learning_rate": 4.503287553909147e-05, + "loss": 0.965, + "step": 4573 + }, + { + "epoch": 0.2281296758104738, + "grad_norm": 0.5140026897622227, + "learning_rate": 4.5030459303552644e-05, + "loss": 0.8849, + "step": 4574 + }, + { + "epoch": 0.2281795511221945, + "grad_norm": 0.47367395664121004, + "learning_rate": 4.502804254532794e-05, + "loss": 0.93, + "step": 4575 + }, + { + "epoch": 0.22822942643391522, + "grad_norm": 0.4374537897993507, + "learning_rate": 4.502562526448044e-05, + "loss": 0.9865, + "step": 4576 + }, + { + "epoch": 0.2282793017456359, + "grad_norm": 0.49505004399135255, + "learning_rate": 4.50232074610732e-05, + "loss": 0.9346, + "step": 4577 + }, + { + "epoch": 0.2283291770573566, + "grad_norm": 0.4654298509402335, + "learning_rate": 4.502078913516933e-05, + "loss": 0.9286, + "step": 4578 + }, + { + "epoch": 0.2283790523690773, + "grad_norm": 0.5245591799296717, + "learning_rate": 4.5018370286831926e-05, + "loss": 0.9287, + "step": 4579 + }, + { + "epoch": 0.228428927680798, + "grad_norm": 0.4429226960560329, + "learning_rate": 4.501595091612411e-05, + "loss": 0.9003, + "step": 4580 + }, + { + "epoch": 0.22847880299251871, + "grad_norm": 0.5175136930927584, + "learning_rate": 4.5013531023109014e-05, + "loss": 0.8841, + "step": 4581 + }, + { + "epoch": 0.2285286783042394, + "grad_norm": 0.36156945859479056, + "learning_rate": 4.501111060784978e-05, + "loss": 0.9285, + "step": 4582 + }, + { + "epoch": 0.2285785536159601, + "grad_norm": 0.43754953052027035, + "learning_rate": 4.500868967040957e-05, + "loss": 0.9893, + "step": 4583 + }, + { + "epoch": 0.2286284289276808, + "grad_norm": 0.4859237733413433, + "learning_rate": 4.5006268210851555e-05, + "loss": 0.9369, + "step": 4584 + }, + { + "epoch": 0.2286783042394015, + "grad_norm": 0.4850683173580175, + "learning_rate": 4.5003846229238934e-05, + "loss": 0.9602, + "step": 4585 + }, + { + "epoch": 0.2287281795511222, + "grad_norm": 0.44499665638255886, + "learning_rate": 4.5001423725634886e-05, + "loss": 0.9364, + "step": 4586 + }, + { + "epoch": 0.2287780548628429, + "grad_norm": 0.42354021436555234, + "learning_rate": 4.499900070010264e-05, + "loss": 0.932, + "step": 4587 + }, + { + "epoch": 0.2288279301745636, + "grad_norm": 0.5060020912525306, + "learning_rate": 4.499657715270543e-05, + "loss": 0.9174, + "step": 4588 + }, + { + "epoch": 0.22887780548628428, + "grad_norm": 0.41710827668972206, + "learning_rate": 4.4994153083506486e-05, + "loss": 0.9419, + "step": 4589 + }, + { + "epoch": 0.228927680798005, + "grad_norm": 0.45147042686961364, + "learning_rate": 4.499172849256906e-05, + "loss": 1.0163, + "step": 4590 + }, + { + "epoch": 0.2289775561097257, + "grad_norm": 0.5149668907833388, + "learning_rate": 4.498930337995643e-05, + "loss": 1.0058, + "step": 4591 + }, + { + "epoch": 0.22902743142144638, + "grad_norm": 0.4989605140866577, + "learning_rate": 4.498687774573187e-05, + "loss": 0.9244, + "step": 4592 + }, + { + "epoch": 0.2290773067331671, + "grad_norm": 0.39196075561120364, + "learning_rate": 4.498445158995869e-05, + "loss": 0.9177, + "step": 4593 + }, + { + "epoch": 0.22912718204488777, + "grad_norm": 0.5635299214843916, + "learning_rate": 4.4982024912700176e-05, + "loss": 0.949, + "step": 4594 + }, + { + "epoch": 0.22917705735660848, + "grad_norm": 0.4618628463221768, + "learning_rate": 4.4979597714019674e-05, + "loss": 0.953, + "step": 4595 + }, + { + "epoch": 0.22922693266832916, + "grad_norm": 0.4278374555650952, + "learning_rate": 4.497716999398051e-05, + "loss": 0.9412, + "step": 4596 + }, + { + "epoch": 0.22927680798004987, + "grad_norm": 0.44591498378854816, + "learning_rate": 4.4974741752646033e-05, + "loss": 0.9675, + "step": 4597 + }, + { + "epoch": 0.22932668329177058, + "grad_norm": 0.4550748770004, + "learning_rate": 4.497231299007961e-05, + "loss": 0.9549, + "step": 4598 + }, + { + "epoch": 0.22937655860349127, + "grad_norm": 3.187115949374362, + "learning_rate": 4.4969883706344615e-05, + "loss": 0.9639, + "step": 4599 + }, + { + "epoch": 0.22942643391521197, + "grad_norm": 0.4258151398451808, + "learning_rate": 4.4967453901504454e-05, + "loss": 0.9578, + "step": 4600 + }, + { + "epoch": 0.22947630922693266, + "grad_norm": 0.46459288589865344, + "learning_rate": 4.4965023575622514e-05, + "loss": 0.8901, + "step": 4601 + }, + { + "epoch": 0.22952618453865337, + "grad_norm": 0.3773201306770444, + "learning_rate": 4.496259272876221e-05, + "loss": 0.9094, + "step": 4602 + }, + { + "epoch": 0.22957605985037408, + "grad_norm": 0.41655678058935786, + "learning_rate": 4.4960161360986994e-05, + "loss": 0.9299, + "step": 4603 + }, + { + "epoch": 0.22962593516209476, + "grad_norm": 0.4738961373178491, + "learning_rate": 4.495772947236029e-05, + "loss": 0.9201, + "step": 4604 + }, + { + "epoch": 0.22967581047381547, + "grad_norm": 0.42846157643113736, + "learning_rate": 4.495529706294558e-05, + "loss": 0.9508, + "step": 4605 + }, + { + "epoch": 0.22972568578553615, + "grad_norm": 0.45603039253535715, + "learning_rate": 4.495286413280632e-05, + "loss": 0.9651, + "step": 4606 + }, + { + "epoch": 0.22977556109725686, + "grad_norm": 0.5087779020536635, + "learning_rate": 4.4950430682006e-05, + "loss": 0.9301, + "step": 4607 + }, + { + "epoch": 0.22982543640897757, + "grad_norm": 0.4063302292778174, + "learning_rate": 4.4947996710608106e-05, + "loss": 0.9221, + "step": 4608 + }, + { + "epoch": 0.22987531172069825, + "grad_norm": 0.43176896799629544, + "learning_rate": 4.494556221867618e-05, + "loss": 0.9229, + "step": 4609 + }, + { + "epoch": 0.22992518703241896, + "grad_norm": 0.5891169375582042, + "learning_rate": 4.4943127206273726e-05, + "loss": 0.9101, + "step": 4610 + }, + { + "epoch": 0.22997506234413964, + "grad_norm": 0.5204934230412974, + "learning_rate": 4.4940691673464306e-05, + "loss": 0.9136, + "step": 4611 + }, + { + "epoch": 0.23002493765586035, + "grad_norm": 0.4456382156086398, + "learning_rate": 4.4938255620311446e-05, + "loss": 0.9052, + "step": 4612 + }, + { + "epoch": 0.23007481296758106, + "grad_norm": 0.651802525090218, + "learning_rate": 4.4935819046878737e-05, + "loss": 0.9162, + "step": 4613 + }, + { + "epoch": 0.23012468827930174, + "grad_norm": 0.49788913146435926, + "learning_rate": 4.493338195322975e-05, + "loss": 0.9088, + "step": 4614 + }, + { + "epoch": 0.23017456359102245, + "grad_norm": 0.3765792933916561, + "learning_rate": 4.4930944339428085e-05, + "loss": 0.9299, + "step": 4615 + }, + { + "epoch": 0.23022443890274313, + "grad_norm": 0.4158682047208107, + "learning_rate": 4.492850620553735e-05, + "loss": 0.9652, + "step": 4616 + }, + { + "epoch": 0.23027431421446384, + "grad_norm": 0.4388096402281968, + "learning_rate": 4.4926067551621154e-05, + "loss": 0.8875, + "step": 4617 + }, + { + "epoch": 0.23032418952618453, + "grad_norm": 0.404384075213067, + "learning_rate": 4.4923628377743146e-05, + "loss": 0.9149, + "step": 4618 + }, + { + "epoch": 0.23037406483790523, + "grad_norm": 0.46003193252637553, + "learning_rate": 4.4921188683966975e-05, + "loss": 0.9254, + "step": 4619 + }, + { + "epoch": 0.23042394014962594, + "grad_norm": 0.4294248635605676, + "learning_rate": 4.49187484703563e-05, + "loss": 0.942, + "step": 4620 + }, + { + "epoch": 0.23047381546134663, + "grad_norm": 0.46489493473793775, + "learning_rate": 4.491630773697481e-05, + "loss": 0.9366, + "step": 4621 + }, + { + "epoch": 0.23052369077306734, + "grad_norm": 0.4270160423726999, + "learning_rate": 4.491386648388616e-05, + "loss": 0.9353, + "step": 4622 + }, + { + "epoch": 0.23057356608478802, + "grad_norm": 0.43280255178175564, + "learning_rate": 4.4911424711154094e-05, + "loss": 0.9118, + "step": 4623 + }, + { + "epoch": 0.23062344139650873, + "grad_norm": 0.47970340794503796, + "learning_rate": 4.4908982418842306e-05, + "loss": 0.9137, + "step": 4624 + }, + { + "epoch": 0.23067331670822944, + "grad_norm": 0.3891113042755766, + "learning_rate": 4.490653960701453e-05, + "loss": 0.9314, + "step": 4625 + }, + { + "epoch": 0.23072319201995012, + "grad_norm": 0.43656205594757025, + "learning_rate": 4.490409627573451e-05, + "loss": 0.96, + "step": 4626 + }, + { + "epoch": 0.23077306733167083, + "grad_norm": 0.42019967770666156, + "learning_rate": 4.490165242506601e-05, + "loss": 0.9452, + "step": 4627 + }, + { + "epoch": 0.2308229426433915, + "grad_norm": 0.42138446491192716, + "learning_rate": 4.4899208055072795e-05, + "loss": 0.9554, + "step": 4628 + }, + { + "epoch": 0.23087281795511222, + "grad_norm": 0.41702800146024394, + "learning_rate": 4.489676316581866e-05, + "loss": 0.8759, + "step": 4629 + }, + { + "epoch": 0.23092269326683293, + "grad_norm": 0.44636359431271483, + "learning_rate": 4.489431775736738e-05, + "loss": 0.9375, + "step": 4630 + }, + { + "epoch": 0.2309725685785536, + "grad_norm": 0.5088008701156618, + "learning_rate": 4.489187182978279e-05, + "loss": 0.976, + "step": 4631 + }, + { + "epoch": 0.23102244389027432, + "grad_norm": 0.4894249120535126, + "learning_rate": 4.48894253831287e-05, + "loss": 0.9584, + "step": 4632 + }, + { + "epoch": 0.231072319201995, + "grad_norm": 0.4720995605491744, + "learning_rate": 4.488697841746897e-05, + "loss": 0.9606, + "step": 4633 + }, + { + "epoch": 0.2311221945137157, + "grad_norm": 0.4537125620227464, + "learning_rate": 4.488453093286743e-05, + "loss": 0.9466, + "step": 4634 + }, + { + "epoch": 0.23117206982543642, + "grad_norm": 0.43282119611090103, + "learning_rate": 4.488208292938795e-05, + "loss": 0.9369, + "step": 4635 + }, + { + "epoch": 0.2312219451371571, + "grad_norm": 0.43239649062784175, + "learning_rate": 4.487963440709442e-05, + "loss": 0.9018, + "step": 4636 + }, + { + "epoch": 0.2312718204488778, + "grad_norm": 0.3960647801398345, + "learning_rate": 4.487718536605072e-05, + "loss": 0.9227, + "step": 4637 + }, + { + "epoch": 0.2313216957605985, + "grad_norm": 0.604935344583413, + "learning_rate": 4.487473580632078e-05, + "loss": 0.9716, + "step": 4638 + }, + { + "epoch": 0.2313715710723192, + "grad_norm": 0.45991105593166776, + "learning_rate": 4.487228572796849e-05, + "loss": 0.927, + "step": 4639 + }, + { + "epoch": 0.2314214463840399, + "grad_norm": 0.5159503907348105, + "learning_rate": 4.48698351310578e-05, + "loss": 0.9091, + "step": 4640 + }, + { + "epoch": 0.2314713216957606, + "grad_norm": 0.4500248027695649, + "learning_rate": 4.486738401565266e-05, + "loss": 0.9369, + "step": 4641 + }, + { + "epoch": 0.2315211970074813, + "grad_norm": 0.5912428834541665, + "learning_rate": 4.4864932381817013e-05, + "loss": 0.9224, + "step": 4642 + }, + { + "epoch": 0.231571072319202, + "grad_norm": 0.4585195344191064, + "learning_rate": 4.4862480229614855e-05, + "loss": 0.9287, + "step": 4643 + }, + { + "epoch": 0.2316209476309227, + "grad_norm": 0.41368447914016604, + "learning_rate": 4.4860027559110166e-05, + "loss": 0.9095, + "step": 4644 + }, + { + "epoch": 0.23167082294264338, + "grad_norm": 0.5389316400196958, + "learning_rate": 4.485757437036694e-05, + "loss": 0.8879, + "step": 4645 + }, + { + "epoch": 0.2317206982543641, + "grad_norm": 0.7547741552955558, + "learning_rate": 4.4855120663449206e-05, + "loss": 0.9105, + "step": 4646 + }, + { + "epoch": 0.2317705735660848, + "grad_norm": 0.46777967480851945, + "learning_rate": 4.485266643842098e-05, + "loss": 0.9011, + "step": 4647 + }, + { + "epoch": 0.23182044887780548, + "grad_norm": 0.43940249373195006, + "learning_rate": 4.485021169534631e-05, + "loss": 0.9279, + "step": 4648 + }, + { + "epoch": 0.2318703241895262, + "grad_norm": 0.420690637351595, + "learning_rate": 4.484775643428924e-05, + "loss": 0.9522, + "step": 4649 + }, + { + "epoch": 0.23192019950124687, + "grad_norm": 0.4353014423608398, + "learning_rate": 4.484530065531386e-05, + "loss": 0.8874, + "step": 4650 + }, + { + "epoch": 0.23197007481296758, + "grad_norm": 0.44143149041838264, + "learning_rate": 4.4842844358484236e-05, + "loss": 0.9037, + "step": 4651 + }, + { + "epoch": 0.2320199501246883, + "grad_norm": 0.4448999232962538, + "learning_rate": 4.4840387543864476e-05, + "loss": 0.955, + "step": 4652 + }, + { + "epoch": 0.23206982543640897, + "grad_norm": 0.38541443735046077, + "learning_rate": 4.483793021151868e-05, + "loss": 0.9154, + "step": 4653 + }, + { + "epoch": 0.23211970074812968, + "grad_norm": 0.40153845354948037, + "learning_rate": 4.483547236151096e-05, + "loss": 0.9215, + "step": 4654 + }, + { + "epoch": 0.23216957605985036, + "grad_norm": 0.4709773814575879, + "learning_rate": 4.4833013993905484e-05, + "loss": 0.9335, + "step": 4655 + }, + { + "epoch": 0.23221945137157107, + "grad_norm": 0.5602636661217325, + "learning_rate": 4.483055510876638e-05, + "loss": 0.9111, + "step": 4656 + }, + { + "epoch": 0.23226932668329178, + "grad_norm": 0.40687264145184643, + "learning_rate": 4.482809570615781e-05, + "loss": 0.9272, + "step": 4657 + }, + { + "epoch": 0.23231920199501246, + "grad_norm": 0.6858192302649484, + "learning_rate": 4.482563578614395e-05, + "loss": 0.9222, + "step": 4658 + }, + { + "epoch": 0.23236907730673317, + "grad_norm": 0.4428809936826459, + "learning_rate": 4.482317534878901e-05, + "loss": 0.8917, + "step": 4659 + }, + { + "epoch": 0.23241895261845386, + "grad_norm": 0.43107597750047766, + "learning_rate": 4.4820714394157174e-05, + "loss": 0.9244, + "step": 4660 + }, + { + "epoch": 0.23246882793017457, + "grad_norm": 0.37924283356808186, + "learning_rate": 4.481825292231267e-05, + "loss": 0.9123, + "step": 4661 + }, + { + "epoch": 0.23251870324189527, + "grad_norm": 0.4611133520280925, + "learning_rate": 4.4815790933319724e-05, + "loss": 0.9399, + "step": 4662 + }, + { + "epoch": 0.23256857855361596, + "grad_norm": 0.4715355071011691, + "learning_rate": 4.481332842724258e-05, + "loss": 0.9454, + "step": 4663 + }, + { + "epoch": 0.23261845386533667, + "grad_norm": 0.3981145547346519, + "learning_rate": 4.48108654041455e-05, + "loss": 0.949, + "step": 4664 + }, + { + "epoch": 0.23266832917705735, + "grad_norm": 0.4318649887603125, + "learning_rate": 4.480840186409275e-05, + "loss": 0.9815, + "step": 4665 + }, + { + "epoch": 0.23271820448877806, + "grad_norm": 0.5042777389972615, + "learning_rate": 4.4805937807148624e-05, + "loss": 0.9528, + "step": 4666 + }, + { + "epoch": 0.23276807980049874, + "grad_norm": 0.5053139544329611, + "learning_rate": 4.4803473233377416e-05, + "loss": 0.9343, + "step": 4667 + }, + { + "epoch": 0.23281795511221945, + "grad_norm": 0.45411870059663617, + "learning_rate": 4.4801008142843435e-05, + "loss": 0.9766, + "step": 4668 + }, + { + "epoch": 0.23286783042394016, + "grad_norm": 0.45378629129507514, + "learning_rate": 4.479854253561101e-05, + "loss": 0.9192, + "step": 4669 + }, + { + "epoch": 0.23291770573566084, + "grad_norm": 0.5083611976335581, + "learning_rate": 4.479607641174447e-05, + "loss": 0.9272, + "step": 4670 + }, + { + "epoch": 0.23296758104738155, + "grad_norm": 0.40241099699476923, + "learning_rate": 4.479360977130818e-05, + "loss": 0.8993, + "step": 4671 + }, + { + "epoch": 0.23301745635910223, + "grad_norm": 0.7132369298196315, + "learning_rate": 4.47911426143665e-05, + "loss": 0.9397, + "step": 4672 + }, + { + "epoch": 0.23306733167082294, + "grad_norm": 0.4845232361678231, + "learning_rate": 4.478867494098381e-05, + "loss": 0.8988, + "step": 4673 + }, + { + "epoch": 0.23311720698254365, + "grad_norm": 0.4897778869328839, + "learning_rate": 4.478620675122451e-05, + "loss": 0.9406, + "step": 4674 + }, + { + "epoch": 0.23316708229426433, + "grad_norm": 0.5385906224554163, + "learning_rate": 4.4783738045153e-05, + "loss": 0.8854, + "step": 4675 + }, + { + "epoch": 0.23321695760598504, + "grad_norm": 0.4645904252906364, + "learning_rate": 4.4781268822833694e-05, + "loss": 0.9298, + "step": 4676 + }, + { + "epoch": 0.23326683291770572, + "grad_norm": 0.5065994124558116, + "learning_rate": 4.477879908433104e-05, + "loss": 0.9564, + "step": 4677 + }, + { + "epoch": 0.23331670822942643, + "grad_norm": 0.41110926638563694, + "learning_rate": 4.477632882970946e-05, + "loss": 0.8867, + "step": 4678 + }, + { + "epoch": 0.23336658354114714, + "grad_norm": 0.5035986031724395, + "learning_rate": 4.477385805903344e-05, + "loss": 0.8918, + "step": 4679 + }, + { + "epoch": 0.23341645885286783, + "grad_norm": 0.5194573874272491, + "learning_rate": 4.477138677236744e-05, + "loss": 0.9181, + "step": 4680 + }, + { + "epoch": 0.23346633416458853, + "grad_norm": 0.460137606598329, + "learning_rate": 4.476891496977595e-05, + "loss": 0.9042, + "step": 4681 + }, + { + "epoch": 0.23351620947630922, + "grad_norm": 0.5094122068059326, + "learning_rate": 4.476644265132348e-05, + "loss": 0.9508, + "step": 4682 + }, + { + "epoch": 0.23356608478802993, + "grad_norm": 0.44239815852754405, + "learning_rate": 4.476396981707453e-05, + "loss": 0.9037, + "step": 4683 + }, + { + "epoch": 0.23361596009975064, + "grad_norm": 0.43671667032253525, + "learning_rate": 4.476149646709363e-05, + "loss": 0.9326, + "step": 4684 + }, + { + "epoch": 0.23366583541147132, + "grad_norm": 0.4350521211291734, + "learning_rate": 4.4759022601445336e-05, + "loss": 0.9162, + "step": 4685 + }, + { + "epoch": 0.23371571072319203, + "grad_norm": 0.449271893895196, + "learning_rate": 4.475654822019417e-05, + "loss": 0.872, + "step": 4686 + }, + { + "epoch": 0.2337655860349127, + "grad_norm": 0.46496810584075526, + "learning_rate": 4.475407332340473e-05, + "loss": 0.8816, + "step": 4687 + }, + { + "epoch": 0.23381546134663342, + "grad_norm": 0.41696067161497674, + "learning_rate": 4.47515979111416e-05, + "loss": 0.9238, + "step": 4688 + }, + { + "epoch": 0.23386533665835413, + "grad_norm": 0.6638112471580404, + "learning_rate": 4.474912198346935e-05, + "loss": 0.9302, + "step": 4689 + }, + { + "epoch": 0.2339152119700748, + "grad_norm": 0.5475603173923668, + "learning_rate": 4.474664554045259e-05, + "loss": 0.9293, + "step": 4690 + }, + { + "epoch": 0.23396508728179552, + "grad_norm": 0.7988699958117563, + "learning_rate": 4.474416858215596e-05, + "loss": 0.9344, + "step": 4691 + }, + { + "epoch": 0.2340149625935162, + "grad_norm": 0.43494326248625476, + "learning_rate": 4.47416911086441e-05, + "loss": 0.9149, + "step": 4692 + }, + { + "epoch": 0.2340648379052369, + "grad_norm": 0.7597186689485648, + "learning_rate": 4.473921311998163e-05, + "loss": 0.9139, + "step": 4693 + }, + { + "epoch": 0.2341147132169576, + "grad_norm": 0.6788073153740471, + "learning_rate": 4.4736734616233235e-05, + "loss": 0.9357, + "step": 4694 + }, + { + "epoch": 0.2341645885286783, + "grad_norm": 0.5344707613323282, + "learning_rate": 4.473425559746358e-05, + "loss": 0.9627, + "step": 4695 + }, + { + "epoch": 0.234214463840399, + "grad_norm": 0.5416356275998376, + "learning_rate": 4.4731776063737364e-05, + "loss": 0.8693, + "step": 4696 + }, + { + "epoch": 0.2342643391521197, + "grad_norm": 0.4028537598918806, + "learning_rate": 4.472929601511927e-05, + "loss": 0.9246, + "step": 4697 + }, + { + "epoch": 0.2343142144638404, + "grad_norm": 0.5372259070393358, + "learning_rate": 4.472681545167404e-05, + "loss": 0.9303, + "step": 4698 + }, + { + "epoch": 0.23436408977556109, + "grad_norm": 0.5800034698568073, + "learning_rate": 4.472433437346638e-05, + "loss": 0.9621, + "step": 4699 + }, + { + "epoch": 0.2344139650872818, + "grad_norm": 0.5892773809499093, + "learning_rate": 4.4721852780561046e-05, + "loss": 0.9967, + "step": 4700 + }, + { + "epoch": 0.2344638403990025, + "grad_norm": 0.5406285423835991, + "learning_rate": 4.471937067302279e-05, + "loss": 0.9346, + "step": 4701 + }, + { + "epoch": 0.2345137157107232, + "grad_norm": 0.62993690405214, + "learning_rate": 4.471688805091638e-05, + "loss": 0.9606, + "step": 4702 + }, + { + "epoch": 0.2345635910224439, + "grad_norm": 0.5002177268865898, + "learning_rate": 4.47144049143066e-05, + "loss": 0.9342, + "step": 4703 + }, + { + "epoch": 0.23461346633416458, + "grad_norm": 0.43460264656789344, + "learning_rate": 4.471192126325825e-05, + "loss": 0.9091, + "step": 4704 + }, + { + "epoch": 0.2346633416458853, + "grad_norm": 0.4683958797380057, + "learning_rate": 4.470943709783613e-05, + "loss": 0.9492, + "step": 4705 + }, + { + "epoch": 0.234713216957606, + "grad_norm": 0.43458203878691926, + "learning_rate": 4.470695241810508e-05, + "loss": 0.9485, + "step": 4706 + }, + { + "epoch": 0.23476309226932668, + "grad_norm": 0.44255506169182995, + "learning_rate": 4.4704467224129914e-05, + "loss": 1.0016, + "step": 4707 + }, + { + "epoch": 0.2348129675810474, + "grad_norm": 0.5852971673028887, + "learning_rate": 4.47019815159755e-05, + "loss": 0.9953, + "step": 4708 + }, + { + "epoch": 0.23486284289276807, + "grad_norm": 0.37777725986036254, + "learning_rate": 4.46994952937067e-05, + "loss": 0.9341, + "step": 4709 + }, + { + "epoch": 0.23491271820448878, + "grad_norm": 0.5003589208805609, + "learning_rate": 4.4697008557388375e-05, + "loss": 0.9388, + "step": 4710 + }, + { + "epoch": 0.2349625935162095, + "grad_norm": 0.5625008252845527, + "learning_rate": 4.469452130708543e-05, + "loss": 0.9149, + "step": 4711 + }, + { + "epoch": 0.23501246882793017, + "grad_norm": 0.8012885984106826, + "learning_rate": 4.4692033542862774e-05, + "loss": 0.9211, + "step": 4712 + }, + { + "epoch": 0.23506234413965088, + "grad_norm": 0.3917940045855744, + "learning_rate": 4.4689545264785314e-05, + "loss": 0.9596, + "step": 4713 + }, + { + "epoch": 0.23511221945137156, + "grad_norm": 0.5083365242990535, + "learning_rate": 4.468705647291799e-05, + "loss": 0.9508, + "step": 4714 + }, + { + "epoch": 0.23516209476309227, + "grad_norm": 0.4515502207771032, + "learning_rate": 4.468456716732572e-05, + "loss": 0.9378, + "step": 4715 + }, + { + "epoch": 0.23521197007481295, + "grad_norm": 0.42335802741516076, + "learning_rate": 4.468207734807349e-05, + "loss": 0.8814, + "step": 4716 + }, + { + "epoch": 0.23526184538653366, + "grad_norm": 0.5633760049843829, + "learning_rate": 4.4679587015226253e-05, + "loss": 0.9635, + "step": 4717 + }, + { + "epoch": 0.23531172069825437, + "grad_norm": 0.5887836200789989, + "learning_rate": 4.4677096168849007e-05, + "loss": 0.9295, + "step": 4718 + }, + { + "epoch": 0.23536159600997505, + "grad_norm": 0.42179949945368667, + "learning_rate": 4.467460480900674e-05, + "loss": 0.9194, + "step": 4719 + }, + { + "epoch": 0.23541147132169576, + "grad_norm": 0.5850596144633811, + "learning_rate": 4.467211293576447e-05, + "loss": 0.9444, + "step": 4720 + }, + { + "epoch": 0.23546134663341645, + "grad_norm": 0.6886402170823123, + "learning_rate": 4.466962054918721e-05, + "loss": 0.9407, + "step": 4721 + }, + { + "epoch": 0.23551122194513716, + "grad_norm": 0.4068509869384463, + "learning_rate": 4.466712764934001e-05, + "loss": 0.9552, + "step": 4722 + }, + { + "epoch": 0.23556109725685787, + "grad_norm": 0.4324727948636383, + "learning_rate": 4.466463423628792e-05, + "loss": 0.9312, + "step": 4723 + }, + { + "epoch": 0.23561097256857855, + "grad_norm": 0.44907109729254036, + "learning_rate": 4.466214031009599e-05, + "loss": 0.9269, + "step": 4724 + }, + { + "epoch": 0.23566084788029926, + "grad_norm": 0.4548088386448253, + "learning_rate": 4.465964587082931e-05, + "loss": 0.9452, + "step": 4725 + }, + { + "epoch": 0.23571072319201994, + "grad_norm": 0.446382118566187, + "learning_rate": 4.4657150918552974e-05, + "loss": 0.9726, + "step": 4726 + }, + { + "epoch": 0.23576059850374065, + "grad_norm": 0.39938757922215623, + "learning_rate": 4.465465545333208e-05, + "loss": 0.9353, + "step": 4727 + }, + { + "epoch": 0.23581047381546136, + "grad_norm": 0.5789033194851548, + "learning_rate": 4.4652159475231745e-05, + "loss": 0.9239, + "step": 4728 + }, + { + "epoch": 0.23586034912718204, + "grad_norm": 0.44265294041852676, + "learning_rate": 4.4649662984317116e-05, + "loss": 0.9339, + "step": 4729 + }, + { + "epoch": 0.23591022443890275, + "grad_norm": 0.42306853215634743, + "learning_rate": 4.4647165980653316e-05, + "loss": 0.9395, + "step": 4730 + }, + { + "epoch": 0.23596009975062343, + "grad_norm": 0.6427872024927269, + "learning_rate": 4.464466846430551e-05, + "loss": 0.9066, + "step": 4731 + }, + { + "epoch": 0.23600997506234414, + "grad_norm": 0.41372996852542204, + "learning_rate": 4.4642170435338874e-05, + "loss": 0.969, + "step": 4732 + }, + { + "epoch": 0.23605985037406485, + "grad_norm": 0.45279372397649814, + "learning_rate": 4.4639671893818595e-05, + "loss": 1.0003, + "step": 4733 + }, + { + "epoch": 0.23610972568578553, + "grad_norm": 1.683118517105203, + "learning_rate": 4.463717283980987e-05, + "loss": 0.9563, + "step": 4734 + }, + { + "epoch": 0.23615960099750624, + "grad_norm": 2.2620193342067845, + "learning_rate": 4.4634673273377906e-05, + "loss": 0.8995, + "step": 4735 + }, + { + "epoch": 0.23620947630922692, + "grad_norm": 0.4202506271022545, + "learning_rate": 4.4632173194587926e-05, + "loss": 0.8868, + "step": 4736 + }, + { + "epoch": 0.23625935162094763, + "grad_norm": 0.46602053741779587, + "learning_rate": 4.462967260350517e-05, + "loss": 0.9313, + "step": 4737 + }, + { + "epoch": 0.23630922693266834, + "grad_norm": 0.6367190828411097, + "learning_rate": 4.4627171500194896e-05, + "loss": 0.9134, + "step": 4738 + }, + { + "epoch": 0.23635910224438902, + "grad_norm": 0.4095927401394604, + "learning_rate": 4.462466988472237e-05, + "loss": 0.9279, + "step": 4739 + }, + { + "epoch": 0.23640897755610973, + "grad_norm": 0.521253086488995, + "learning_rate": 4.462216775715287e-05, + "loss": 0.9009, + "step": 4740 + }, + { + "epoch": 0.23645885286783042, + "grad_norm": 0.399830491791554, + "learning_rate": 4.4619665117551666e-05, + "loss": 0.9387, + "step": 4741 + }, + { + "epoch": 0.23650872817955113, + "grad_norm": 0.4268488879418979, + "learning_rate": 4.46171619659841e-05, + "loss": 0.9191, + "step": 4742 + }, + { + "epoch": 0.2365586034912718, + "grad_norm": 0.6917368286337474, + "learning_rate": 4.461465830251546e-05, + "loss": 0.9934, + "step": 4743 + }, + { + "epoch": 0.23660847880299252, + "grad_norm": 0.42064994812333095, + "learning_rate": 4.4612154127211095e-05, + "loss": 0.9043, + "step": 4744 + }, + { + "epoch": 0.23665835411471323, + "grad_norm": 0.39634162762135966, + "learning_rate": 4.4609649440136346e-05, + "loss": 0.9506, + "step": 4745 + }, + { + "epoch": 0.2367082294264339, + "grad_norm": 0.44650032868732126, + "learning_rate": 4.460714424135657e-05, + "loss": 0.9367, + "step": 4746 + }, + { + "epoch": 0.23675810473815462, + "grad_norm": 0.38609872382158955, + "learning_rate": 4.460463853093714e-05, + "loss": 0.8988, + "step": 4747 + }, + { + "epoch": 0.2368079800498753, + "grad_norm": 0.44542374678805313, + "learning_rate": 4.460213230894344e-05, + "loss": 0.9727, + "step": 4748 + }, + { + "epoch": 0.236857855361596, + "grad_norm": 0.42127197077762746, + "learning_rate": 4.459962557544087e-05, + "loss": 0.9735, + "step": 4749 + }, + { + "epoch": 0.23690773067331672, + "grad_norm": 0.4540503983536716, + "learning_rate": 4.4597118330494844e-05, + "loss": 0.9333, + "step": 4750 + }, + { + "epoch": 0.2369576059850374, + "grad_norm": 0.592448602221519, + "learning_rate": 4.459461057417078e-05, + "loss": 0.9583, + "step": 4751 + }, + { + "epoch": 0.2370074812967581, + "grad_norm": 0.4514429832176672, + "learning_rate": 4.459210230653413e-05, + "loss": 0.9039, + "step": 4752 + }, + { + "epoch": 0.2370573566084788, + "grad_norm": 0.3867521557400877, + "learning_rate": 4.458959352765034e-05, + "loss": 0.9445, + "step": 4753 + }, + { + "epoch": 0.2371072319201995, + "grad_norm": 0.3809102590719449, + "learning_rate": 4.458708423758487e-05, + "loss": 0.9048, + "step": 4754 + }, + { + "epoch": 0.2371571072319202, + "grad_norm": 0.9701977308649837, + "learning_rate": 4.45845744364032e-05, + "loss": 0.8894, + "step": 4755 + }, + { + "epoch": 0.2372069825436409, + "grad_norm": 0.4212864135931351, + "learning_rate": 4.458206412417083e-05, + "loss": 0.9261, + "step": 4756 + }, + { + "epoch": 0.2372568578553616, + "grad_norm": 0.4225005587055448, + "learning_rate": 4.457955330095326e-05, + "loss": 0.9504, + "step": 4757 + }, + { + "epoch": 0.23730673316708228, + "grad_norm": 0.46176195068314385, + "learning_rate": 4.4577041966816006e-05, + "loss": 0.9794, + "step": 4758 + }, + { + "epoch": 0.237356608478803, + "grad_norm": 0.4013098969749989, + "learning_rate": 4.457453012182461e-05, + "loss": 0.9191, + "step": 4759 + }, + { + "epoch": 0.2374064837905237, + "grad_norm": 0.6938975079955253, + "learning_rate": 4.45720177660446e-05, + "loss": 0.9441, + "step": 4760 + }, + { + "epoch": 0.23745635910224439, + "grad_norm": 0.791973775422155, + "learning_rate": 4.456950489954156e-05, + "loss": 0.9181, + "step": 4761 + }, + { + "epoch": 0.2375062344139651, + "grad_norm": 0.49742093083241956, + "learning_rate": 4.456699152238104e-05, + "loss": 0.9357, + "step": 4762 + }, + { + "epoch": 0.23755610972568578, + "grad_norm": 0.5261071162233988, + "learning_rate": 4.456447763462863e-05, + "loss": 1.0139, + "step": 4763 + }, + { + "epoch": 0.2376059850374065, + "grad_norm": 0.42979745629377086, + "learning_rate": 4.456196323634994e-05, + "loss": 0.9516, + "step": 4764 + }, + { + "epoch": 0.23765586034912717, + "grad_norm": 0.4415118638298988, + "learning_rate": 4.455944832761056e-05, + "loss": 0.9317, + "step": 4765 + }, + { + "epoch": 0.23770573566084788, + "grad_norm": 0.46492283108651034, + "learning_rate": 4.455693290847614e-05, + "loss": 0.9471, + "step": 4766 + }, + { + "epoch": 0.2377556109725686, + "grad_norm": 0.4114098440293427, + "learning_rate": 4.45544169790123e-05, + "loss": 0.9034, + "step": 4767 + }, + { + "epoch": 0.23780548628428927, + "grad_norm": 0.4385511864841873, + "learning_rate": 4.455190053928471e-05, + "loss": 0.9234, + "step": 4768 + }, + { + "epoch": 0.23785536159600998, + "grad_norm": 0.45555800932665513, + "learning_rate": 4.454938358935903e-05, + "loss": 0.8757, + "step": 4769 + }, + { + "epoch": 0.23790523690773066, + "grad_norm": 0.43694420486150903, + "learning_rate": 4.454686612930093e-05, + "loss": 0.9225, + "step": 4770 + }, + { + "epoch": 0.23795511221945137, + "grad_norm": 0.4745734171903028, + "learning_rate": 4.4544348159176096e-05, + "loss": 0.9617, + "step": 4771 + }, + { + "epoch": 0.23800498753117208, + "grad_norm": 0.43379217416881627, + "learning_rate": 4.4541829679050254e-05, + "loss": 0.9351, + "step": 4772 + }, + { + "epoch": 0.23805486284289276, + "grad_norm": 0.41461267421541204, + "learning_rate": 4.453931068898911e-05, + "loss": 0.9252, + "step": 4773 + }, + { + "epoch": 0.23810473815461347, + "grad_norm": 0.4057887991093593, + "learning_rate": 4.45367911890584e-05, + "loss": 0.9603, + "step": 4774 + }, + { + "epoch": 0.23815461346633415, + "grad_norm": 0.504219160425858, + "learning_rate": 4.453427117932387e-05, + "loss": 0.9404, + "step": 4775 + }, + { + "epoch": 0.23820448877805486, + "grad_norm": 0.4460225108769904, + "learning_rate": 4.4531750659851266e-05, + "loss": 0.9081, + "step": 4776 + }, + { + "epoch": 0.23825436408977557, + "grad_norm": 0.7646829761687711, + "learning_rate": 4.452922963070638e-05, + "loss": 0.933, + "step": 4777 + }, + { + "epoch": 0.23830423940149625, + "grad_norm": 0.4457842788132677, + "learning_rate": 4.452670809195498e-05, + "loss": 0.9235, + "step": 4778 + }, + { + "epoch": 0.23835411471321696, + "grad_norm": 0.4002818635519948, + "learning_rate": 4.452418604366287e-05, + "loss": 0.9388, + "step": 4779 + }, + { + "epoch": 0.23840399002493765, + "grad_norm": 0.43872268932439973, + "learning_rate": 4.452166348589587e-05, + "loss": 0.9302, + "step": 4780 + }, + { + "epoch": 0.23845386533665835, + "grad_norm": 0.4943356964384764, + "learning_rate": 4.4519140418719795e-05, + "loss": 0.9797, + "step": 4781 + }, + { + "epoch": 0.23850374064837906, + "grad_norm": 0.6041745063929603, + "learning_rate": 4.4516616842200486e-05, + "loss": 0.9144, + "step": 4782 + }, + { + "epoch": 0.23855361596009975, + "grad_norm": 0.6693289147496384, + "learning_rate": 4.451409275640379e-05, + "loss": 0.9157, + "step": 4783 + }, + { + "epoch": 0.23860349127182046, + "grad_norm": 0.4483147974536965, + "learning_rate": 4.451156816139559e-05, + "loss": 0.948, + "step": 4784 + }, + { + "epoch": 0.23865336658354114, + "grad_norm": 0.4824218246794317, + "learning_rate": 4.450904305724174e-05, + "loss": 0.896, + "step": 4785 + }, + { + "epoch": 0.23870324189526185, + "grad_norm": 0.37761603756337175, + "learning_rate": 4.4506517444008143e-05, + "loss": 0.9108, + "step": 4786 + }, + { + "epoch": 0.23875311720698256, + "grad_norm": 0.43897324983951413, + "learning_rate": 4.4503991321760706e-05, + "loss": 0.9012, + "step": 4787 + }, + { + "epoch": 0.23880299251870324, + "grad_norm": 0.5112399438694465, + "learning_rate": 4.4501464690565344e-05, + "loss": 0.9238, + "step": 4788 + }, + { + "epoch": 0.23885286783042395, + "grad_norm": 0.3829901241147315, + "learning_rate": 4.449893755048799e-05, + "loss": 0.937, + "step": 4789 + }, + { + "epoch": 0.23890274314214463, + "grad_norm": 0.44591913634407065, + "learning_rate": 4.449640990159458e-05, + "loss": 0.9572, + "step": 4790 + }, + { + "epoch": 0.23895261845386534, + "grad_norm": 0.4841519501675972, + "learning_rate": 4.4493881743951097e-05, + "loss": 0.8854, + "step": 4791 + }, + { + "epoch": 0.23900249376558602, + "grad_norm": 0.5873398908098634, + "learning_rate": 4.449135307762348e-05, + "loss": 0.939, + "step": 4792 + }, + { + "epoch": 0.23905236907730673, + "grad_norm": 0.5322486646409488, + "learning_rate": 4.448882390267772e-05, + "loss": 0.9142, + "step": 4793 + }, + { + "epoch": 0.23910224438902744, + "grad_norm": 2.0360764344177475, + "learning_rate": 4.4486294219179836e-05, + "loss": 0.909, + "step": 4794 + }, + { + "epoch": 0.23915211970074812, + "grad_norm": 0.45520161353323163, + "learning_rate": 4.4483764027195827e-05, + "loss": 0.8766, + "step": 4795 + }, + { + "epoch": 0.23920199501246883, + "grad_norm": 0.5351141098149478, + "learning_rate": 4.44812333267917e-05, + "loss": 0.9386, + "step": 4796 + }, + { + "epoch": 0.23925187032418951, + "grad_norm": 0.4277874608729358, + "learning_rate": 4.447870211803353e-05, + "loss": 0.9268, + "step": 4797 + }, + { + "epoch": 0.23930174563591022, + "grad_norm": 0.5231058577269895, + "learning_rate": 4.4476170400987326e-05, + "loss": 0.8987, + "step": 4798 + }, + { + "epoch": 0.23935162094763093, + "grad_norm": 0.4974058530636332, + "learning_rate": 4.4473638175719185e-05, + "loss": 0.9601, + "step": 4799 + }, + { + "epoch": 0.23940149625935161, + "grad_norm": 0.3973609544830391, + "learning_rate": 4.447110544229517e-05, + "loss": 0.9073, + "step": 4800 + }, + { + "epoch": 0.23945137157107232, + "grad_norm": 0.4854262370922368, + "learning_rate": 4.446857220078137e-05, + "loss": 0.9209, + "step": 4801 + }, + { + "epoch": 0.239501246882793, + "grad_norm": 0.4426559757328729, + "learning_rate": 4.446603845124388e-05, + "loss": 0.9181, + "step": 4802 + }, + { + "epoch": 0.23955112219451372, + "grad_norm": 0.6140369557792204, + "learning_rate": 4.446350419374884e-05, + "loss": 0.9512, + "step": 4803 + }, + { + "epoch": 0.23960099750623443, + "grad_norm": 0.4701570602925321, + "learning_rate": 4.4460969428362365e-05, + "loss": 0.9525, + "step": 4804 + }, + { + "epoch": 0.2396508728179551, + "grad_norm": 0.4550274790551922, + "learning_rate": 4.4458434155150606e-05, + "loss": 0.9124, + "step": 4805 + }, + { + "epoch": 0.23970074812967582, + "grad_norm": 0.6676582242989376, + "learning_rate": 4.4455898374179715e-05, + "loss": 0.9384, + "step": 4806 + }, + { + "epoch": 0.2397506234413965, + "grad_norm": 0.5156713402943917, + "learning_rate": 4.4453362085515857e-05, + "loss": 0.8553, + "step": 4807 + }, + { + "epoch": 0.2398004987531172, + "grad_norm": 0.7292976927297438, + "learning_rate": 4.4450825289225234e-05, + "loss": 0.9621, + "step": 4808 + }, + { + "epoch": 0.23985037406483792, + "grad_norm": 0.4524103308790866, + "learning_rate": 4.4448287985374024e-05, + "loss": 0.9405, + "step": 4809 + }, + { + "epoch": 0.2399002493765586, + "grad_norm": 0.45045084058394474, + "learning_rate": 4.4445750174028446e-05, + "loss": 0.9455, + "step": 4810 + }, + { + "epoch": 0.2399501246882793, + "grad_norm": 0.5083625983379054, + "learning_rate": 4.4443211855254715e-05, + "loss": 0.958, + "step": 4811 + }, + { + "epoch": 0.24, + "grad_norm": 0.40830489745420584, + "learning_rate": 4.444067302911907e-05, + "loss": 0.9155, + "step": 4812 + }, + { + "epoch": 0.2400498753117207, + "grad_norm": 0.574297046573671, + "learning_rate": 4.4438133695687764e-05, + "loss": 0.972, + "step": 4813 + }, + { + "epoch": 0.2400997506234414, + "grad_norm": 0.5216656551999467, + "learning_rate": 4.443559385502706e-05, + "loss": 0.9003, + "step": 4814 + }, + { + "epoch": 0.2401496259351621, + "grad_norm": 0.42555065214356197, + "learning_rate": 4.443305350720324e-05, + "loss": 0.9491, + "step": 4815 + }, + { + "epoch": 0.2401995012468828, + "grad_norm": 0.49146560248109183, + "learning_rate": 4.443051265228258e-05, + "loss": 0.9565, + "step": 4816 + }, + { + "epoch": 0.24024937655860348, + "grad_norm": 0.5289013219163856, + "learning_rate": 4.442797129033138e-05, + "loss": 0.9109, + "step": 4817 + }, + { + "epoch": 0.2402992518703242, + "grad_norm": 0.4389157430813737, + "learning_rate": 4.442542942141598e-05, + "loss": 0.9162, + "step": 4818 + }, + { + "epoch": 0.24034912718204487, + "grad_norm": 0.8135683079356304, + "learning_rate": 4.442288704560268e-05, + "loss": 0.9295, + "step": 4819 + }, + { + "epoch": 0.24039900249376558, + "grad_norm": 0.6427568284403034, + "learning_rate": 4.4420344162957835e-05, + "loss": 0.9373, + "step": 4820 + }, + { + "epoch": 0.2404488778054863, + "grad_norm": 0.5316103128018269, + "learning_rate": 4.441780077354781e-05, + "loss": 0.9315, + "step": 4821 + }, + { + "epoch": 0.24049875311720698, + "grad_norm": 0.39876096439061415, + "learning_rate": 4.441525687743896e-05, + "loss": 0.9526, + "step": 4822 + }, + { + "epoch": 0.24054862842892769, + "grad_norm": 0.53110374114505, + "learning_rate": 4.441271247469767e-05, + "loss": 0.9112, + "step": 4823 + }, + { + "epoch": 0.24059850374064837, + "grad_norm": 0.4281928347431523, + "learning_rate": 4.441016756539034e-05, + "loss": 0.8843, + "step": 4824 + }, + { + "epoch": 0.24064837905236908, + "grad_norm": 0.469108825471924, + "learning_rate": 4.440762214958337e-05, + "loss": 0.9268, + "step": 4825 + }, + { + "epoch": 0.2406982543640898, + "grad_norm": 0.41070669591183745, + "learning_rate": 4.4405076227343176e-05, + "loss": 0.8913, + "step": 4826 + }, + { + "epoch": 0.24074812967581047, + "grad_norm": 0.9103434482621012, + "learning_rate": 4.440252979873622e-05, + "loss": 0.8754, + "step": 4827 + }, + { + "epoch": 0.24079800498753118, + "grad_norm": 0.5143877469444569, + "learning_rate": 4.4399982863828925e-05, + "loss": 0.9208, + "step": 4828 + }, + { + "epoch": 0.24084788029925186, + "grad_norm": 0.45684700888269225, + "learning_rate": 4.439743542268776e-05, + "loss": 0.967, + "step": 4829 + }, + { + "epoch": 0.24089775561097257, + "grad_norm": 0.46051148566696226, + "learning_rate": 4.43948874753792e-05, + "loss": 0.9277, + "step": 4830 + }, + { + "epoch": 0.24094763092269328, + "grad_norm": 0.5000231951228666, + "learning_rate": 4.439233902196973e-05, + "loss": 0.9253, + "step": 4831 + }, + { + "epoch": 0.24099750623441396, + "grad_norm": 0.6055617735973466, + "learning_rate": 4.438979006252585e-05, + "loss": 0.9771, + "step": 4832 + }, + { + "epoch": 0.24104738154613467, + "grad_norm": 0.4269210600029276, + "learning_rate": 4.438724059711408e-05, + "loss": 0.9472, + "step": 4833 + }, + { + "epoch": 0.24109725685785535, + "grad_norm": 0.4738762081853972, + "learning_rate": 4.438469062580094e-05, + "loss": 0.991, + "step": 4834 + }, + { + "epoch": 0.24114713216957606, + "grad_norm": 0.43712062581415984, + "learning_rate": 4.4382140148652974e-05, + "loss": 0.9538, + "step": 4835 + }, + { + "epoch": 0.24119700748129677, + "grad_norm": 0.5175846208196547, + "learning_rate": 4.4379589165736736e-05, + "loss": 0.9386, + "step": 4836 + }, + { + "epoch": 0.24124688279301745, + "grad_norm": 0.4810220607897248, + "learning_rate": 4.4377037677118795e-05, + "loss": 0.9233, + "step": 4837 + }, + { + "epoch": 0.24129675810473816, + "grad_norm": 0.5107577221491376, + "learning_rate": 4.4374485682865716e-05, + "loss": 0.9189, + "step": 4838 + }, + { + "epoch": 0.24134663341645884, + "grad_norm": 0.4379832910786765, + "learning_rate": 4.437193318304411e-05, + "loss": 0.9348, + "step": 4839 + }, + { + "epoch": 0.24139650872817955, + "grad_norm": 0.45064032630645523, + "learning_rate": 4.4369380177720585e-05, + "loss": 0.931, + "step": 4840 + }, + { + "epoch": 0.24144638403990024, + "grad_norm": 0.43825401056282903, + "learning_rate": 4.436682666696175e-05, + "loss": 0.9854, + "step": 4841 + }, + { + "epoch": 0.24149625935162095, + "grad_norm": 0.4280484125366703, + "learning_rate": 4.436427265083424e-05, + "loss": 0.9224, + "step": 4842 + }, + { + "epoch": 0.24154613466334166, + "grad_norm": 0.4460588989161124, + "learning_rate": 4.4361718129404695e-05, + "loss": 0.857, + "step": 4843 + }, + { + "epoch": 0.24159600997506234, + "grad_norm": 0.43174643303391974, + "learning_rate": 4.435916310273979e-05, + "loss": 0.9191, + "step": 4844 + }, + { + "epoch": 0.24164588528678305, + "grad_norm": 0.4383152477594814, + "learning_rate": 4.435660757090618e-05, + "loss": 0.915, + "step": 4845 + }, + { + "epoch": 0.24169576059850373, + "grad_norm": 0.4326552691951432, + "learning_rate": 4.435405153397056e-05, + "loss": 0.9266, + "step": 4846 + }, + { + "epoch": 0.24174563591022444, + "grad_norm": 0.3785924863228501, + "learning_rate": 4.435149499199963e-05, + "loss": 0.898, + "step": 4847 + }, + { + "epoch": 0.24179551122194515, + "grad_norm": 0.4369902162504215, + "learning_rate": 4.43489379450601e-05, + "loss": 0.915, + "step": 4848 + }, + { + "epoch": 0.24184538653366583, + "grad_norm": 0.6898314297145547, + "learning_rate": 4.434638039321869e-05, + "loss": 0.9685, + "step": 4849 + }, + { + "epoch": 0.24189526184538654, + "grad_norm": 0.7212402779698509, + "learning_rate": 4.434382233654214e-05, + "loss": 0.9498, + "step": 4850 + }, + { + "epoch": 0.24194513715710722, + "grad_norm": 0.4147411626149288, + "learning_rate": 4.434126377509721e-05, + "loss": 0.9159, + "step": 4851 + }, + { + "epoch": 0.24199501246882793, + "grad_norm": 0.420496169575764, + "learning_rate": 4.433870470895065e-05, + "loss": 0.9585, + "step": 4852 + }, + { + "epoch": 0.24204488778054864, + "grad_norm": 0.406852767896325, + "learning_rate": 4.433614513816925e-05, + "loss": 0.9206, + "step": 4853 + }, + { + "epoch": 0.24209476309226932, + "grad_norm": 0.4523486984836215, + "learning_rate": 4.4333585062819796e-05, + "loss": 0.9492, + "step": 4854 + }, + { + "epoch": 0.24214463840399003, + "grad_norm": 0.4401971048161647, + "learning_rate": 4.433102448296909e-05, + "loss": 0.9712, + "step": 4855 + }, + { + "epoch": 0.2421945137157107, + "grad_norm": 0.47861166989342896, + "learning_rate": 4.432846339868395e-05, + "loss": 0.9234, + "step": 4856 + }, + { + "epoch": 0.24224438902743142, + "grad_norm": 0.48827516522557357, + "learning_rate": 4.432590181003121e-05, + "loss": 0.937, + "step": 4857 + }, + { + "epoch": 0.24229426433915213, + "grad_norm": 0.6359309125814291, + "learning_rate": 4.432333971707771e-05, + "loss": 0.9192, + "step": 4858 + }, + { + "epoch": 0.24234413965087281, + "grad_norm": 0.4466110687853386, + "learning_rate": 4.4320777119890325e-05, + "loss": 0.9569, + "step": 4859 + }, + { + "epoch": 0.24239401496259352, + "grad_norm": 0.5814145658962545, + "learning_rate": 4.431821401853588e-05, + "loss": 0.9397, + "step": 4860 + }, + { + "epoch": 0.2424438902743142, + "grad_norm": 0.4711716733911774, + "learning_rate": 4.431565041308131e-05, + "loss": 0.9695, + "step": 4861 + }, + { + "epoch": 0.24249376558603492, + "grad_norm": 0.4260573698928175, + "learning_rate": 4.431308630359348e-05, + "loss": 0.8866, + "step": 4862 + }, + { + "epoch": 0.24254364089775562, + "grad_norm": 0.4790162558617486, + "learning_rate": 4.43105216901393e-05, + "loss": 0.9083, + "step": 4863 + }, + { + "epoch": 0.2425935162094763, + "grad_norm": 0.42551320939534576, + "learning_rate": 4.4307956572785705e-05, + "loss": 0.9092, + "step": 4864 + }, + { + "epoch": 0.24264339152119702, + "grad_norm": 0.49895902753945576, + "learning_rate": 4.4305390951599616e-05, + "loss": 0.931, + "step": 4865 + }, + { + "epoch": 0.2426932668329177, + "grad_norm": 0.47362088191511265, + "learning_rate": 4.430282482664799e-05, + "loss": 0.8993, + "step": 4866 + }, + { + "epoch": 0.2427431421446384, + "grad_norm": 0.40654750993150274, + "learning_rate": 4.43002581979978e-05, + "loss": 0.9263, + "step": 4867 + }, + { + "epoch": 0.2427930174563591, + "grad_norm": 0.46247760373838304, + "learning_rate": 4.4297691065716004e-05, + "loss": 0.913, + "step": 4868 + }, + { + "epoch": 0.2428428927680798, + "grad_norm": 0.4201985893279862, + "learning_rate": 4.42951234298696e-05, + "loss": 0.9487, + "step": 4869 + }, + { + "epoch": 0.2428927680798005, + "grad_norm": 0.43650885743521106, + "learning_rate": 4.4292555290525586e-05, + "loss": 0.9147, + "step": 4870 + }, + { + "epoch": 0.2429426433915212, + "grad_norm": 0.425434535505076, + "learning_rate": 4.428998664775097e-05, + "loss": 0.8708, + "step": 4871 + }, + { + "epoch": 0.2429925187032419, + "grad_norm": 0.411495617571008, + "learning_rate": 4.428741750161278e-05, + "loss": 0.9263, + "step": 4872 + }, + { + "epoch": 0.24304239401496258, + "grad_norm": 0.5263070263113302, + "learning_rate": 4.428484785217807e-05, + "loss": 0.891, + "step": 4873 + }, + { + "epoch": 0.2430922693266833, + "grad_norm": 0.5053820198271476, + "learning_rate": 4.428227769951389e-05, + "loss": 0.9499, + "step": 4874 + }, + { + "epoch": 0.243142144638404, + "grad_norm": 0.4460599313219946, + "learning_rate": 4.4279707043687305e-05, + "loss": 0.9094, + "step": 4875 + }, + { + "epoch": 0.24319201995012468, + "grad_norm": 0.41079934303012094, + "learning_rate": 4.427713588476538e-05, + "loss": 0.9294, + "step": 4876 + }, + { + "epoch": 0.2432418952618454, + "grad_norm": 0.5758303393957163, + "learning_rate": 4.427456422281523e-05, + "loss": 0.9091, + "step": 4877 + }, + { + "epoch": 0.24329177057356607, + "grad_norm": 0.48472430929622795, + "learning_rate": 4.427199205790395e-05, + "loss": 0.9295, + "step": 4878 + }, + { + "epoch": 0.24334164588528678, + "grad_norm": 0.5421670902878158, + "learning_rate": 4.4269419390098664e-05, + "loss": 0.9545, + "step": 4879 + }, + { + "epoch": 0.2433915211970075, + "grad_norm": 0.49433105210436395, + "learning_rate": 4.42668462194665e-05, + "loss": 0.9391, + "step": 4880 + }, + { + "epoch": 0.24344139650872818, + "grad_norm": 0.47087525500529587, + "learning_rate": 4.426427254607461e-05, + "loss": 0.8321, + "step": 4881 + }, + { + "epoch": 0.24349127182044888, + "grad_norm": 0.4133507368840623, + "learning_rate": 4.426169836999015e-05, + "loss": 0.9005, + "step": 4882 + }, + { + "epoch": 0.24354114713216957, + "grad_norm": 0.5308075027944724, + "learning_rate": 4.425912369128029e-05, + "loss": 0.9053, + "step": 4883 + }, + { + "epoch": 0.24359102244389028, + "grad_norm": 0.4836055299489339, + "learning_rate": 4.4256548510012216e-05, + "loss": 0.9788, + "step": 4884 + }, + { + "epoch": 0.24364089775561099, + "grad_norm": 0.5060411755043481, + "learning_rate": 4.4253972826253125e-05, + "loss": 0.9398, + "step": 4885 + }, + { + "epoch": 0.24369077306733167, + "grad_norm": 0.5146246614513029, + "learning_rate": 4.425139664007023e-05, + "loss": 0.9466, + "step": 4886 + }, + { + "epoch": 0.24374064837905238, + "grad_norm": 0.506469789115326, + "learning_rate": 4.424881995153076e-05, + "loss": 0.9701, + "step": 4887 + }, + { + "epoch": 0.24379052369077306, + "grad_norm": 0.536141078005277, + "learning_rate": 4.4246242760701945e-05, + "loss": 0.8954, + "step": 4888 + }, + { + "epoch": 0.24384039900249377, + "grad_norm": 0.4895499078823134, + "learning_rate": 4.424366506765104e-05, + "loss": 0.9506, + "step": 4889 + }, + { + "epoch": 0.24389027431421445, + "grad_norm": 0.5106549248310127, + "learning_rate": 4.424108687244531e-05, + "loss": 0.9309, + "step": 4890 + }, + { + "epoch": 0.24394014962593516, + "grad_norm": 0.40563884594617156, + "learning_rate": 4.423850817515203e-05, + "loss": 0.956, + "step": 4891 + }, + { + "epoch": 0.24399002493765587, + "grad_norm": 0.3561802277604091, + "learning_rate": 4.423592897583848e-05, + "loss": 0.8977, + "step": 4892 + }, + { + "epoch": 0.24403990024937655, + "grad_norm": 0.4277417908534999, + "learning_rate": 4.423334927457198e-05, + "loss": 0.9641, + "step": 4893 + }, + { + "epoch": 0.24408977556109726, + "grad_norm": 0.5814667293763653, + "learning_rate": 4.423076907141983e-05, + "loss": 0.9212, + "step": 4894 + }, + { + "epoch": 0.24413965087281794, + "grad_norm": 0.4638931116238933, + "learning_rate": 4.4228188366449375e-05, + "loss": 0.9229, + "step": 4895 + }, + { + "epoch": 0.24418952618453865, + "grad_norm": 0.6075855452074254, + "learning_rate": 4.422560715972794e-05, + "loss": 0.9115, + "step": 4896 + }, + { + "epoch": 0.24423940149625936, + "grad_norm": 0.4664159317730351, + "learning_rate": 4.422302545132291e-05, + "loss": 0.9017, + "step": 4897 + }, + { + "epoch": 0.24428927680798004, + "grad_norm": 0.48685960218923197, + "learning_rate": 4.422044324130162e-05, + "loss": 0.9, + "step": 4898 + }, + { + "epoch": 0.24433915211970075, + "grad_norm": 0.4433294751217204, + "learning_rate": 4.421786052973147e-05, + "loss": 0.9393, + "step": 4899 + }, + { + "epoch": 0.24438902743142144, + "grad_norm": 0.48065871735773047, + "learning_rate": 4.421527731667985e-05, + "loss": 0.9376, + "step": 4900 + }, + { + "epoch": 0.24443890274314214, + "grad_norm": 0.512273597568778, + "learning_rate": 4.421269360221416e-05, + "loss": 0.9496, + "step": 4901 + }, + { + "epoch": 0.24448877805486285, + "grad_norm": 0.5060742746637096, + "learning_rate": 4.421010938640184e-05, + "loss": 0.9452, + "step": 4902 + }, + { + "epoch": 0.24453865336658354, + "grad_norm": 0.5023911938574591, + "learning_rate": 4.420752466931031e-05, + "loss": 0.9127, + "step": 4903 + }, + { + "epoch": 0.24458852867830425, + "grad_norm": 0.5090453820107071, + "learning_rate": 4.420493945100702e-05, + "loss": 0.9515, + "step": 4904 + }, + { + "epoch": 0.24463840399002493, + "grad_norm": 0.5227467494316022, + "learning_rate": 4.420235373155942e-05, + "loss": 0.9013, + "step": 4905 + }, + { + "epoch": 0.24468827930174564, + "grad_norm": 0.8921108833447599, + "learning_rate": 4.419976751103501e-05, + "loss": 0.9267, + "step": 4906 + }, + { + "epoch": 0.24473815461346635, + "grad_norm": 0.6220388107798559, + "learning_rate": 4.4197180789501246e-05, + "loss": 0.9094, + "step": 4907 + }, + { + "epoch": 0.24478802992518703, + "grad_norm": 0.49981054943530784, + "learning_rate": 4.419459356702565e-05, + "loss": 0.9189, + "step": 4908 + }, + { + "epoch": 0.24483790523690774, + "grad_norm": 0.5046203576694048, + "learning_rate": 4.419200584367572e-05, + "loss": 0.9362, + "step": 4909 + }, + { + "epoch": 0.24488778054862842, + "grad_norm": 0.5106028229129217, + "learning_rate": 4.4189417619518983e-05, + "loss": 0.9112, + "step": 4910 + }, + { + "epoch": 0.24493765586034913, + "grad_norm": 0.42106694936334377, + "learning_rate": 4.418682889462299e-05, + "loss": 0.9102, + "step": 4911 + }, + { + "epoch": 0.24498753117206984, + "grad_norm": 0.43769006908140656, + "learning_rate": 4.418423966905528e-05, + "loss": 0.9167, + "step": 4912 + }, + { + "epoch": 0.24503740648379052, + "grad_norm": 0.4157876551251142, + "learning_rate": 4.418164994288342e-05, + "loss": 0.929, + "step": 4913 + }, + { + "epoch": 0.24508728179551123, + "grad_norm": 0.5874344462478902, + "learning_rate": 4.417905971617499e-05, + "loss": 0.9377, + "step": 4914 + }, + { + "epoch": 0.2451371571072319, + "grad_norm": 0.7816896523707754, + "learning_rate": 4.4176468988997586e-05, + "loss": 0.9165, + "step": 4915 + }, + { + "epoch": 0.24518703241895262, + "grad_norm": 0.5150285604591242, + "learning_rate": 4.41738777614188e-05, + "loss": 0.9102, + "step": 4916 + }, + { + "epoch": 0.2452369077306733, + "grad_norm": 0.5334823349962498, + "learning_rate": 4.4171286033506255e-05, + "loss": 0.9879, + "step": 4917 + }, + { + "epoch": 0.245286783042394, + "grad_norm": 0.7281188895488394, + "learning_rate": 4.416869380532758e-05, + "loss": 0.9383, + "step": 4918 + }, + { + "epoch": 0.24533665835411472, + "grad_norm": 0.5069386008923211, + "learning_rate": 4.416610107695042e-05, + "loss": 0.9644, + "step": 4919 + }, + { + "epoch": 0.2453865336658354, + "grad_norm": 0.603577844763177, + "learning_rate": 4.4163507848442435e-05, + "loss": 0.9412, + "step": 4920 + }, + { + "epoch": 0.24543640897755611, + "grad_norm": 0.47497311250013724, + "learning_rate": 4.416091411987128e-05, + "loss": 0.9074, + "step": 4921 + }, + { + "epoch": 0.2454862842892768, + "grad_norm": 0.6501218052870212, + "learning_rate": 4.4158319891304655e-05, + "loss": 0.9813, + "step": 4922 + }, + { + "epoch": 0.2455361596009975, + "grad_norm": 0.42588394594287055, + "learning_rate": 4.415572516281024e-05, + "loss": 0.889, + "step": 4923 + }, + { + "epoch": 0.24558603491271822, + "grad_norm": 0.4279221662557979, + "learning_rate": 4.4153129934455755e-05, + "loss": 0.9135, + "step": 4924 + }, + { + "epoch": 0.2456359102244389, + "grad_norm": 0.47338298340397794, + "learning_rate": 4.415053420630891e-05, + "loss": 0.9216, + "step": 4925 + }, + { + "epoch": 0.2456857855361596, + "grad_norm": 0.6746497360858785, + "learning_rate": 4.414793797843746e-05, + "loss": 0.8871, + "step": 4926 + }, + { + "epoch": 0.2457356608478803, + "grad_norm": 0.4380173376696699, + "learning_rate": 4.414534125090912e-05, + "loss": 0.9055, + "step": 4927 + }, + { + "epoch": 0.245785536159601, + "grad_norm": 0.45218426073529827, + "learning_rate": 4.4142744023791685e-05, + "loss": 0.9526, + "step": 4928 + }, + { + "epoch": 0.2458354114713217, + "grad_norm": 0.5521546108526112, + "learning_rate": 4.4140146297152894e-05, + "loss": 0.9106, + "step": 4929 + }, + { + "epoch": 0.2458852867830424, + "grad_norm": 0.6519632466161532, + "learning_rate": 4.413754807106056e-05, + "loss": 0.9403, + "step": 4930 + }, + { + "epoch": 0.2459351620947631, + "grad_norm": 0.513036644281313, + "learning_rate": 4.4134949345582474e-05, + "loss": 0.9324, + "step": 4931 + }, + { + "epoch": 0.24598503740648378, + "grad_norm": 0.48102126802904294, + "learning_rate": 4.4132350120786445e-05, + "loss": 0.9691, + "step": 4932 + }, + { + "epoch": 0.2460349127182045, + "grad_norm": 0.40958144772573307, + "learning_rate": 4.412975039674031e-05, + "loss": 0.9076, + "step": 4933 + }, + { + "epoch": 0.2460847880299252, + "grad_norm": 0.4488567113357257, + "learning_rate": 4.41271501735119e-05, + "loss": 0.9276, + "step": 4934 + }, + { + "epoch": 0.24613466334164588, + "grad_norm": 0.4661014674105385, + "learning_rate": 4.412454945116906e-05, + "loss": 0.9633, + "step": 4935 + }, + { + "epoch": 0.2461845386533666, + "grad_norm": 0.47186228925034723, + "learning_rate": 4.412194822977966e-05, + "loss": 0.9325, + "step": 4936 + }, + { + "epoch": 0.24623441396508727, + "grad_norm": 0.5677270007539909, + "learning_rate": 4.4119346509411586e-05, + "loss": 0.8812, + "step": 4937 + }, + { + "epoch": 0.24628428927680798, + "grad_norm": 0.409179837115503, + "learning_rate": 4.4116744290132716e-05, + "loss": 0.9418, + "step": 4938 + }, + { + "epoch": 0.24633416458852866, + "grad_norm": 0.4143401471216747, + "learning_rate": 4.411414157201096e-05, + "loss": 0.9131, + "step": 4939 + }, + { + "epoch": 0.24638403990024937, + "grad_norm": 0.576236003094204, + "learning_rate": 4.4111538355114237e-05, + "loss": 0.9476, + "step": 4940 + }, + { + "epoch": 0.24643391521197008, + "grad_norm": 0.48644506617931343, + "learning_rate": 4.410893463951047e-05, + "loss": 0.9328, + "step": 4941 + }, + { + "epoch": 0.24648379052369077, + "grad_norm": 0.47151204383529327, + "learning_rate": 4.4106330425267606e-05, + "loss": 0.9954, + "step": 4942 + }, + { + "epoch": 0.24653366583541148, + "grad_norm": 0.4164580759518826, + "learning_rate": 4.4103725712453606e-05, + "loss": 0.9225, + "step": 4943 + }, + { + "epoch": 0.24658354114713216, + "grad_norm": 0.4444498107505891, + "learning_rate": 4.410112050113643e-05, + "loss": 0.9493, + "step": 4944 + }, + { + "epoch": 0.24663341645885287, + "grad_norm": 0.5419012335145377, + "learning_rate": 4.409851479138406e-05, + "loss": 0.9398, + "step": 4945 + }, + { + "epoch": 0.24668329177057358, + "grad_norm": 0.48767822585403414, + "learning_rate": 4.409590858326449e-05, + "loss": 0.9548, + "step": 4946 + }, + { + "epoch": 0.24673316708229426, + "grad_norm": 0.5577732365914635, + "learning_rate": 4.409330187684574e-05, + "loss": 0.9593, + "step": 4947 + }, + { + "epoch": 0.24678304239401497, + "grad_norm": 0.44264184540725754, + "learning_rate": 4.409069467219582e-05, + "loss": 0.8833, + "step": 4948 + }, + { + "epoch": 0.24683291770573565, + "grad_norm": 0.43965392684030186, + "learning_rate": 4.408808696938277e-05, + "loss": 0.9138, + "step": 4949 + }, + { + "epoch": 0.24688279301745636, + "grad_norm": 0.41698611598987134, + "learning_rate": 4.408547876847462e-05, + "loss": 1.001, + "step": 4950 + }, + { + "epoch": 0.24693266832917707, + "grad_norm": 0.45121563004127047, + "learning_rate": 4.408287006953946e-05, + "loss": 0.9325, + "step": 4951 + }, + { + "epoch": 0.24698254364089775, + "grad_norm": 0.4612710888917779, + "learning_rate": 4.408026087264534e-05, + "loss": 0.9489, + "step": 4952 + }, + { + "epoch": 0.24703241895261846, + "grad_norm": 0.41580359298906033, + "learning_rate": 4.407765117786034e-05, + "loss": 0.8929, + "step": 4953 + }, + { + "epoch": 0.24708229426433914, + "grad_norm": 0.43705946695513703, + "learning_rate": 4.4075040985252593e-05, + "loss": 0.9643, + "step": 4954 + }, + { + "epoch": 0.24713216957605985, + "grad_norm": 0.5012317377432449, + "learning_rate": 4.4072430294890174e-05, + "loss": 0.9151, + "step": 4955 + }, + { + "epoch": 0.24718204488778056, + "grad_norm": 0.43017944102231337, + "learning_rate": 4.406981910684123e-05, + "loss": 0.9516, + "step": 4956 + }, + { + "epoch": 0.24723192019950124, + "grad_norm": 1.0085729138178021, + "learning_rate": 4.406720742117388e-05, + "loss": 0.9213, + "step": 4957 + }, + { + "epoch": 0.24728179551122195, + "grad_norm": 0.42878533411348513, + "learning_rate": 4.4064595237956286e-05, + "loss": 0.9212, + "step": 4958 + }, + { + "epoch": 0.24733167082294263, + "grad_norm": 0.4289612122912134, + "learning_rate": 4.406198255725662e-05, + "loss": 0.9454, + "step": 4959 + }, + { + "epoch": 0.24738154613466334, + "grad_norm": 0.5352799441997742, + "learning_rate": 4.405936937914304e-05, + "loss": 0.9355, + "step": 4960 + }, + { + "epoch": 0.24743142144638405, + "grad_norm": 0.4562213125739544, + "learning_rate": 4.405675570368376e-05, + "loss": 0.9046, + "step": 4961 + }, + { + "epoch": 0.24748129675810474, + "grad_norm": 2.777644014072255, + "learning_rate": 4.4054141530946964e-05, + "loss": 0.8983, + "step": 4962 + }, + { + "epoch": 0.24753117206982544, + "grad_norm": 0.45964430977566445, + "learning_rate": 4.405152686100087e-05, + "loss": 0.9395, + "step": 4963 + }, + { + "epoch": 0.24758104738154613, + "grad_norm": 0.4937127538080164, + "learning_rate": 4.4048911693913705e-05, + "loss": 0.9421, + "step": 4964 + }, + { + "epoch": 0.24763092269326684, + "grad_norm": 0.511758617008909, + "learning_rate": 4.404629602975372e-05, + "loss": 0.9056, + "step": 4965 + }, + { + "epoch": 0.24768079800498752, + "grad_norm": 0.41944315711714003, + "learning_rate": 4.404367986858916e-05, + "loss": 0.9213, + "step": 4966 + }, + { + "epoch": 0.24773067331670823, + "grad_norm": 0.4988092173477627, + "learning_rate": 4.40410632104883e-05, + "loss": 0.9556, + "step": 4967 + }, + { + "epoch": 0.24778054862842894, + "grad_norm": 0.5735247205828313, + "learning_rate": 4.403844605551942e-05, + "loss": 0.9559, + "step": 4968 + }, + { + "epoch": 0.24783042394014962, + "grad_norm": 0.6397195150782587, + "learning_rate": 4.40358284037508e-05, + "loss": 0.9549, + "step": 4969 + }, + { + "epoch": 0.24788029925187033, + "grad_norm": 0.460930720500737, + "learning_rate": 4.403321025525077e-05, + "loss": 0.8781, + "step": 4970 + }, + { + "epoch": 0.247930174563591, + "grad_norm": 0.42519879552815215, + "learning_rate": 4.4030591610087624e-05, + "loss": 0.9697, + "step": 4971 + }, + { + "epoch": 0.24798004987531172, + "grad_norm": 0.582344201441721, + "learning_rate": 4.402797246832972e-05, + "loss": 0.9017, + "step": 4972 + }, + { + "epoch": 0.24802992518703243, + "grad_norm": 0.42500357139865863, + "learning_rate": 4.402535283004538e-05, + "loss": 0.9007, + "step": 4973 + }, + { + "epoch": 0.2480798004987531, + "grad_norm": 0.5945721561984685, + "learning_rate": 4.4022732695302975e-05, + "loss": 0.9358, + "step": 4974 + }, + { + "epoch": 0.24812967581047382, + "grad_norm": 0.499787000568483, + "learning_rate": 4.402011206417088e-05, + "loss": 0.9486, + "step": 4975 + }, + { + "epoch": 0.2481795511221945, + "grad_norm": 0.49450512952287684, + "learning_rate": 4.401749093671746e-05, + "loss": 0.9277, + "step": 4976 + }, + { + "epoch": 0.2482294264339152, + "grad_norm": 0.5102779843173414, + "learning_rate": 4.401486931301114e-05, + "loss": 0.8993, + "step": 4977 + }, + { + "epoch": 0.24827930174563592, + "grad_norm": 0.5041953609254891, + "learning_rate": 4.401224719312031e-05, + "loss": 0.9886, + "step": 4978 + }, + { + "epoch": 0.2483291770573566, + "grad_norm": 0.4650637383993718, + "learning_rate": 4.4009624577113395e-05, + "loss": 0.9231, + "step": 4979 + }, + { + "epoch": 0.2483790523690773, + "grad_norm": 0.49363101757495836, + "learning_rate": 4.400700146505883e-05, + "loss": 0.9609, + "step": 4980 + }, + { + "epoch": 0.248428927680798, + "grad_norm": 0.45414053173897934, + "learning_rate": 4.4004377857025075e-05, + "loss": 0.891, + "step": 4981 + }, + { + "epoch": 0.2484788029925187, + "grad_norm": 0.4711080502402516, + "learning_rate": 4.400175375308059e-05, + "loss": 0.8863, + "step": 4982 + }, + { + "epoch": 0.24852867830423941, + "grad_norm": 0.45774705314391634, + "learning_rate": 4.399912915329383e-05, + "loss": 0.931, + "step": 4983 + }, + { + "epoch": 0.2485785536159601, + "grad_norm": 2.35664369431704, + "learning_rate": 4.3996504057733314e-05, + "loss": 0.9384, + "step": 4984 + }, + { + "epoch": 0.2486284289276808, + "grad_norm": 0.7667707648061631, + "learning_rate": 4.399387846646752e-05, + "loss": 0.9212, + "step": 4985 + }, + { + "epoch": 0.2486783042394015, + "grad_norm": 0.4660459888537406, + "learning_rate": 4.399125237956496e-05, + "loss": 0.9557, + "step": 4986 + }, + { + "epoch": 0.2487281795511222, + "grad_norm": 0.8253614767252289, + "learning_rate": 4.398862579709417e-05, + "loss": 0.9635, + "step": 4987 + }, + { + "epoch": 0.24877805486284288, + "grad_norm": 0.5401490027316826, + "learning_rate": 4.39859987191237e-05, + "loss": 0.9224, + "step": 4988 + }, + { + "epoch": 0.2488279301745636, + "grad_norm": 0.5172211918114541, + "learning_rate": 4.398337114572209e-05, + "loss": 0.9439, + "step": 4989 + }, + { + "epoch": 0.2488778054862843, + "grad_norm": 0.5108204284868505, + "learning_rate": 4.39807430769579e-05, + "loss": 0.9097, + "step": 4990 + }, + { + "epoch": 0.24892768079800498, + "grad_norm": 0.9056336100497575, + "learning_rate": 4.39781145128997e-05, + "loss": 0.8915, + "step": 4991 + }, + { + "epoch": 0.2489775561097257, + "grad_norm": 0.4927999706608488, + "learning_rate": 4.397548545361612e-05, + "loss": 0.9303, + "step": 4992 + }, + { + "epoch": 0.24902743142144637, + "grad_norm": 0.4628639848814641, + "learning_rate": 4.3972855899175724e-05, + "loss": 0.9437, + "step": 4993 + }, + { + "epoch": 0.24907730673316708, + "grad_norm": 0.49265479815433016, + "learning_rate": 4.397022584964715e-05, + "loss": 0.9519, + "step": 4994 + }, + { + "epoch": 0.2491271820448878, + "grad_norm": 0.47316063344478954, + "learning_rate": 4.396759530509902e-05, + "loss": 0.9461, + "step": 4995 + }, + { + "epoch": 0.24917705735660847, + "grad_norm": 0.5991316409341699, + "learning_rate": 4.3964964265599986e-05, + "loss": 0.9575, + "step": 4996 + }, + { + "epoch": 0.24922693266832918, + "grad_norm": 0.5416419995062902, + "learning_rate": 4.396233273121869e-05, + "loss": 0.9284, + "step": 4997 + }, + { + "epoch": 0.24927680798004986, + "grad_norm": 1.2943205358237297, + "learning_rate": 4.395970070202381e-05, + "loss": 0.9175, + "step": 4998 + }, + { + "epoch": 0.24932668329177057, + "grad_norm": 0.37770639519876686, + "learning_rate": 4.395706817808403e-05, + "loss": 0.9012, + "step": 4999 + }, + { + "epoch": 0.24937655860349128, + "grad_norm": 0.44158508883667796, + "learning_rate": 4.395443515946803e-05, + "loss": 0.945, + "step": 5000 + }, + { + "epoch": 0.24942643391521196, + "grad_norm": 0.43439757434491794, + "learning_rate": 4.395180164624453e-05, + "loss": 0.9471, + "step": 5001 + }, + { + "epoch": 0.24947630922693267, + "grad_norm": 0.48138606200012535, + "learning_rate": 4.394916763848225e-05, + "loss": 0.9523, + "step": 5002 + }, + { + "epoch": 0.24952618453865336, + "grad_norm": 0.500585315601016, + "learning_rate": 4.3946533136249926e-05, + "loss": 0.9854, + "step": 5003 + }, + { + "epoch": 0.24957605985037407, + "grad_norm": 0.44314024422501164, + "learning_rate": 4.394389813961629e-05, + "loss": 0.9241, + "step": 5004 + }, + { + "epoch": 0.24962593516209478, + "grad_norm": 0.44609049854430916, + "learning_rate": 4.3941262648650114e-05, + "loss": 0.9687, + "step": 5005 + }, + { + "epoch": 0.24967581047381546, + "grad_norm": 0.5561774748089906, + "learning_rate": 4.393862666342017e-05, + "loss": 0.921, + "step": 5006 + }, + { + "epoch": 0.24972568578553617, + "grad_norm": 0.5601386251820756, + "learning_rate": 4.3935990183995235e-05, + "loss": 0.9272, + "step": 5007 + }, + { + "epoch": 0.24977556109725685, + "grad_norm": 0.44595355486543165, + "learning_rate": 4.393335321044411e-05, + "loss": 0.896, + "step": 5008 + }, + { + "epoch": 0.24982543640897756, + "grad_norm": 0.5120089134200579, + "learning_rate": 4.3930715742835606e-05, + "loss": 0.9697, + "step": 5009 + }, + { + "epoch": 0.24987531172069827, + "grad_norm": 0.452043565752897, + "learning_rate": 4.3928077781238546e-05, + "loss": 0.9137, + "step": 5010 + }, + { + "epoch": 0.24992518703241895, + "grad_norm": 0.541288431232321, + "learning_rate": 4.3925439325721774e-05, + "loss": 0.9146, + "step": 5011 + }, + { + "epoch": 0.24997506234413966, + "grad_norm": 0.46039941773486737, + "learning_rate": 4.3922800376354125e-05, + "loss": 0.9378, + "step": 5012 + }, + { + "epoch": 0.25002493765586037, + "grad_norm": 0.5064676934908706, + "learning_rate": 4.392016093320447e-05, + "loss": 0.9162, + "step": 5013 + }, + { + "epoch": 0.25007481296758105, + "grad_norm": 0.5164874119225665, + "learning_rate": 4.391752099634169e-05, + "loss": 0.97, + "step": 5014 + }, + { + "epoch": 0.25012468827930173, + "grad_norm": 0.4742251641656763, + "learning_rate": 4.391488056583466e-05, + "loss": 0.9179, + "step": 5015 + }, + { + "epoch": 0.25017456359102247, + "grad_norm": 0.5225378212677698, + "learning_rate": 4.391223964175229e-05, + "loss": 0.9335, + "step": 5016 + }, + { + "epoch": 0.25022443890274315, + "grad_norm": 0.5030196922883119, + "learning_rate": 4.390959822416349e-05, + "loss": 0.9185, + "step": 5017 + }, + { + "epoch": 0.25027431421446383, + "grad_norm": 0.4075505811641377, + "learning_rate": 4.390695631313719e-05, + "loss": 0.901, + "step": 5018 + }, + { + "epoch": 0.2503241895261845, + "grad_norm": 0.4429635210520074, + "learning_rate": 4.390431390874232e-05, + "loss": 0.921, + "step": 5019 + }, + { + "epoch": 0.25037406483790525, + "grad_norm": 0.44013494016505383, + "learning_rate": 4.390167101104784e-05, + "loss": 0.9155, + "step": 5020 + }, + { + "epoch": 0.25042394014962593, + "grad_norm": 0.6422410999155245, + "learning_rate": 4.389902762012272e-05, + "loss": 0.8927, + "step": 5021 + }, + { + "epoch": 0.2504738154613466, + "grad_norm": 0.42845969362065134, + "learning_rate": 4.389638373603593e-05, + "loss": 0.9447, + "step": 5022 + }, + { + "epoch": 0.25052369077306735, + "grad_norm": 0.5049931583757241, + "learning_rate": 4.389373935885646e-05, + "loss": 0.9109, + "step": 5023 + }, + { + "epoch": 0.25057356608478804, + "grad_norm": 0.480260242112738, + "learning_rate": 4.3891094488653315e-05, + "loss": 0.909, + "step": 5024 + }, + { + "epoch": 0.2506234413965087, + "grad_norm": 0.4287976186462047, + "learning_rate": 4.3888449125495526e-05, + "loss": 0.8751, + "step": 5025 + }, + { + "epoch": 0.2506733167082294, + "grad_norm": 0.4435975101724546, + "learning_rate": 4.38858032694521e-05, + "loss": 0.9299, + "step": 5026 + }, + { + "epoch": 0.25072319201995014, + "grad_norm": 0.5146382783353203, + "learning_rate": 4.388315692059209e-05, + "loss": 0.9264, + "step": 5027 + }, + { + "epoch": 0.2507730673316708, + "grad_norm": 0.751650593417561, + "learning_rate": 4.388051007898456e-05, + "loss": 0.9385, + "step": 5028 + }, + { + "epoch": 0.2508229426433915, + "grad_norm": 0.4550853984136689, + "learning_rate": 4.387786274469856e-05, + "loss": 0.8959, + "step": 5029 + }, + { + "epoch": 0.25087281795511224, + "grad_norm": 0.4734659552478587, + "learning_rate": 4.3875214917803175e-05, + "loss": 0.957, + "step": 5030 + }, + { + "epoch": 0.2509226932668329, + "grad_norm": 0.5778025792606714, + "learning_rate": 4.387256659836752e-05, + "loss": 0.9229, + "step": 5031 + }, + { + "epoch": 0.2509725685785536, + "grad_norm": 0.5195544974706319, + "learning_rate": 4.386991778646067e-05, + "loss": 0.934, + "step": 5032 + }, + { + "epoch": 0.25102244389027434, + "grad_norm": 0.4557966349498291, + "learning_rate": 4.386726848215177e-05, + "loss": 0.8897, + "step": 5033 + }, + { + "epoch": 0.251072319201995, + "grad_norm": 0.4661906818105312, + "learning_rate": 4.386461868550995e-05, + "loss": 0.9184, + "step": 5034 + }, + { + "epoch": 0.2511221945137157, + "grad_norm": 1.5784187559716323, + "learning_rate": 4.386196839660433e-05, + "loss": 0.9079, + "step": 5035 + }, + { + "epoch": 0.2511720698254364, + "grad_norm": 0.5169978748463686, + "learning_rate": 4.38593176155041e-05, + "loss": 0.9029, + "step": 5036 + }, + { + "epoch": 0.2512219451371571, + "grad_norm": 0.41189451910724006, + "learning_rate": 4.3856666342278414e-05, + "loss": 0.9124, + "step": 5037 + }, + { + "epoch": 0.2512718204488778, + "grad_norm": 0.44167431744710794, + "learning_rate": 4.385401457699646e-05, + "loss": 0.9797, + "step": 5038 + }, + { + "epoch": 0.2513216957605985, + "grad_norm": 0.4491370052280698, + "learning_rate": 4.385136231972743e-05, + "loss": 0.9839, + "step": 5039 + }, + { + "epoch": 0.2513715710723192, + "grad_norm": 0.47634154701261133, + "learning_rate": 4.384870957054053e-05, + "loss": 0.9476, + "step": 5040 + }, + { + "epoch": 0.2514214463840399, + "grad_norm": 0.519487330473531, + "learning_rate": 4.3846056329505005e-05, + "loss": 0.9165, + "step": 5041 + }, + { + "epoch": 0.2514713216957606, + "grad_norm": 0.5091582807330484, + "learning_rate": 4.384340259669006e-05, + "loss": 0.9264, + "step": 5042 + }, + { + "epoch": 0.2515211970074813, + "grad_norm": 0.6919998639032344, + "learning_rate": 4.384074837216497e-05, + "loss": 0.9021, + "step": 5043 + }, + { + "epoch": 0.251571072319202, + "grad_norm": 1.831578990955952, + "learning_rate": 4.383809365599898e-05, + "loss": 0.95, + "step": 5044 + }, + { + "epoch": 0.2516209476309227, + "grad_norm": 0.6061981784951789, + "learning_rate": 4.383543844826137e-05, + "loss": 0.9254, + "step": 5045 + }, + { + "epoch": 0.25167082294264337, + "grad_norm": 0.5451537841507894, + "learning_rate": 4.383278274902141e-05, + "loss": 0.9475, + "step": 5046 + }, + { + "epoch": 0.2517206982543641, + "grad_norm": 0.4753224210979838, + "learning_rate": 4.383012655834842e-05, + "loss": 0.854, + "step": 5047 + }, + { + "epoch": 0.2517705735660848, + "grad_norm": 0.48722858851571893, + "learning_rate": 4.3827469876311714e-05, + "loss": 0.8777, + "step": 5048 + }, + { + "epoch": 0.25182044887780547, + "grad_norm": 0.47397674086480307, + "learning_rate": 4.3824812702980595e-05, + "loss": 0.9229, + "step": 5049 + }, + { + "epoch": 0.2518703241895262, + "grad_norm": 0.46152760167171397, + "learning_rate": 4.3822155038424415e-05, + "loss": 0.9163, + "step": 5050 + }, + { + "epoch": 0.2519201995012469, + "grad_norm": 0.5194795411998848, + "learning_rate": 4.381949688271253e-05, + "loss": 0.9105, + "step": 5051 + }, + { + "epoch": 0.25197007481296757, + "grad_norm": 0.4799257288042047, + "learning_rate": 4.381683823591429e-05, + "loss": 0.9437, + "step": 5052 + }, + { + "epoch": 0.25201995012468825, + "grad_norm": 0.6471123683105162, + "learning_rate": 4.381417909809908e-05, + "loss": 0.9427, + "step": 5053 + }, + { + "epoch": 0.252069825436409, + "grad_norm": 0.585607317672154, + "learning_rate": 4.3811519469336285e-05, + "loss": 0.9154, + "step": 5054 + }, + { + "epoch": 0.25211970074812967, + "grad_norm": 0.44202146843312967, + "learning_rate": 4.380885934969531e-05, + "loss": 0.9757, + "step": 5055 + }, + { + "epoch": 0.25216957605985035, + "grad_norm": 0.53529518924925, + "learning_rate": 4.380619873924556e-05, + "loss": 0.9214, + "step": 5056 + }, + { + "epoch": 0.2522194513715711, + "grad_norm": 0.4798874670129561, + "learning_rate": 4.3803537638056485e-05, + "loss": 0.8959, + "step": 5057 + }, + { + "epoch": 0.2522693266832918, + "grad_norm": 1.1382999917310594, + "learning_rate": 4.3800876046197506e-05, + "loss": 0.9725, + "step": 5058 + }, + { + "epoch": 0.25231920199501245, + "grad_norm": 0.6434704888976611, + "learning_rate": 4.3798213963738074e-05, + "loss": 0.9345, + "step": 5059 + }, + { + "epoch": 0.2523690773067332, + "grad_norm": 0.7048122991397208, + "learning_rate": 4.3795551390747666e-05, + "loss": 0.8957, + "step": 5060 + }, + { + "epoch": 0.2524189526184539, + "grad_norm": 0.6389917836043802, + "learning_rate": 4.379288832729575e-05, + "loss": 0.9169, + "step": 5061 + }, + { + "epoch": 0.25246882793017456, + "grad_norm": 0.46267839265017374, + "learning_rate": 4.379022477345184e-05, + "loss": 0.9315, + "step": 5062 + }, + { + "epoch": 0.25251870324189524, + "grad_norm": 0.6288401350263932, + "learning_rate": 4.378756072928541e-05, + "loss": 0.9223, + "step": 5063 + }, + { + "epoch": 0.252568578553616, + "grad_norm": 0.5162137183625524, + "learning_rate": 4.378489619486599e-05, + "loss": 0.9506, + "step": 5064 + }, + { + "epoch": 0.25261845386533666, + "grad_norm": 0.45655963938594757, + "learning_rate": 4.378223117026312e-05, + "loss": 0.8957, + "step": 5065 + }, + { + "epoch": 0.25266832917705734, + "grad_norm": 0.6642498416266847, + "learning_rate": 4.377956565554633e-05, + "loss": 0.9049, + "step": 5066 + }, + { + "epoch": 0.2527182044887781, + "grad_norm": 0.4981084919855754, + "learning_rate": 4.377689965078517e-05, + "loss": 0.9427, + "step": 5067 + }, + { + "epoch": 0.25276807980049876, + "grad_norm": 0.4278549368357907, + "learning_rate": 4.3774233156049234e-05, + "loss": 0.9275, + "step": 5068 + }, + { + "epoch": 0.25281795511221944, + "grad_norm": 0.479665747271264, + "learning_rate": 4.3771566171408074e-05, + "loss": 0.8768, + "step": 5069 + }, + { + "epoch": 0.2528678304239402, + "grad_norm": 0.44541264928001606, + "learning_rate": 4.37688986969313e-05, + "loss": 0.9062, + "step": 5070 + }, + { + "epoch": 0.25291770573566086, + "grad_norm": 0.5133434027888852, + "learning_rate": 4.376623073268851e-05, + "loss": 0.9426, + "step": 5071 + }, + { + "epoch": 0.25296758104738154, + "grad_norm": 0.5112094528734475, + "learning_rate": 4.376356227874934e-05, + "loss": 0.9586, + "step": 5072 + }, + { + "epoch": 0.2530174563591022, + "grad_norm": 1.2163478554755696, + "learning_rate": 4.3760893335183396e-05, + "loss": 0.9208, + "step": 5073 + }, + { + "epoch": 0.25306733167082296, + "grad_norm": 0.4995808629081953, + "learning_rate": 4.3758223902060334e-05, + "loss": 0.9391, + "step": 5074 + }, + { + "epoch": 0.25311720698254364, + "grad_norm": 0.7218629529320195, + "learning_rate": 4.375555397944983e-05, + "loss": 0.9601, + "step": 5075 + }, + { + "epoch": 0.2531670822942643, + "grad_norm": 0.5052629370739611, + "learning_rate": 4.375288356742153e-05, + "loss": 0.9421, + "step": 5076 + }, + { + "epoch": 0.25321695760598506, + "grad_norm": 0.5607341457876783, + "learning_rate": 4.375021266604512e-05, + "loss": 0.9437, + "step": 5077 + }, + { + "epoch": 0.25326683291770574, + "grad_norm": 0.4319570877251317, + "learning_rate": 4.374754127539031e-05, + "loss": 0.9261, + "step": 5078 + }, + { + "epoch": 0.2533167082294264, + "grad_norm": 0.6023952907916693, + "learning_rate": 4.37448693955268e-05, + "loss": 0.9572, + "step": 5079 + }, + { + "epoch": 0.2533665835411471, + "grad_norm": 0.4353116399919066, + "learning_rate": 4.374219702652431e-05, + "loss": 0.9201, + "step": 5080 + }, + { + "epoch": 0.25341645885286784, + "grad_norm": 0.49451761188372745, + "learning_rate": 4.373952416845258e-05, + "loss": 0.9229, + "step": 5081 + }, + { + "epoch": 0.2534663341645885, + "grad_norm": 0.5094165825043997, + "learning_rate": 4.3736850821381355e-05, + "loss": 0.8961, + "step": 5082 + }, + { + "epoch": 0.2535162094763092, + "grad_norm": 0.5367057439555064, + "learning_rate": 4.373417698538039e-05, + "loss": 0.897, + "step": 5083 + }, + { + "epoch": 0.25356608478802994, + "grad_norm": 1.2873625537999227, + "learning_rate": 4.3731502660519455e-05, + "loss": 0.954, + "step": 5084 + }, + { + "epoch": 0.2536159600997506, + "grad_norm": 0.4211009841202291, + "learning_rate": 4.3728827846868346e-05, + "loss": 0.9775, + "step": 5085 + }, + { + "epoch": 0.2536658354114713, + "grad_norm": 0.38110017407085434, + "learning_rate": 4.372615254449686e-05, + "loss": 0.9259, + "step": 5086 + }, + { + "epoch": 0.25371571072319204, + "grad_norm": 0.4743453265564132, + "learning_rate": 4.37234767534748e-05, + "loss": 0.9211, + "step": 5087 + }, + { + "epoch": 0.2537655860349127, + "grad_norm": 0.4522849530144511, + "learning_rate": 4.372080047387199e-05, + "loss": 0.9273, + "step": 5088 + }, + { + "epoch": 0.2538154613466334, + "grad_norm": 0.5636850019059184, + "learning_rate": 4.371812370575829e-05, + "loss": 0.9794, + "step": 5089 + }, + { + "epoch": 0.2538653366583541, + "grad_norm": 0.45744879406835515, + "learning_rate": 4.3715446449203504e-05, + "loss": 0.9275, + "step": 5090 + }, + { + "epoch": 0.25391521197007483, + "grad_norm": 0.5089161339430329, + "learning_rate": 4.371276870427753e-05, + "loss": 0.9342, + "step": 5091 + }, + { + "epoch": 0.2539650872817955, + "grad_norm": 0.5694016626049176, + "learning_rate": 4.371009047105023e-05, + "loss": 0.857, + "step": 5092 + }, + { + "epoch": 0.2540149625935162, + "grad_norm": 0.7174448560735281, + "learning_rate": 4.3707411749591494e-05, + "loss": 0.9234, + "step": 5093 + }, + { + "epoch": 0.25406483790523693, + "grad_norm": 0.4579035072010172, + "learning_rate": 4.3704732539971214e-05, + "loss": 0.892, + "step": 5094 + }, + { + "epoch": 0.2541147132169576, + "grad_norm": 0.4819521402667113, + "learning_rate": 4.3702052842259325e-05, + "loss": 0.9439, + "step": 5095 + }, + { + "epoch": 0.2541645885286783, + "grad_norm": 0.435087457636271, + "learning_rate": 4.3699372656525724e-05, + "loss": 0.9105, + "step": 5096 + }, + { + "epoch": 0.25421446384039903, + "grad_norm": 0.4651754599825268, + "learning_rate": 4.369669198284036e-05, + "loss": 0.9284, + "step": 5097 + }, + { + "epoch": 0.2542643391521197, + "grad_norm": 0.46618040746294304, + "learning_rate": 4.36940108212732e-05, + "loss": 0.9373, + "step": 5098 + }, + { + "epoch": 0.2543142144638404, + "grad_norm": 0.44271196971307086, + "learning_rate": 4.369132917189418e-05, + "loss": 0.8894, + "step": 5099 + }, + { + "epoch": 0.2543640897755611, + "grad_norm": 0.44538371644090435, + "learning_rate": 4.36886470347733e-05, + "loss": 0.9499, + "step": 5100 + }, + { + "epoch": 0.2544139650872818, + "grad_norm": 0.5827312837386964, + "learning_rate": 4.368596440998053e-05, + "loss": 0.9599, + "step": 5101 + }, + { + "epoch": 0.2544638403990025, + "grad_norm": 0.48412027116519346, + "learning_rate": 4.368328129758589e-05, + "loss": 0.9404, + "step": 5102 + }, + { + "epoch": 0.2545137157107232, + "grad_norm": 0.451794201440933, + "learning_rate": 4.368059769765938e-05, + "loss": 0.9229, + "step": 5103 + }, + { + "epoch": 0.2545635910224439, + "grad_norm": 0.4397127169602025, + "learning_rate": 4.3677913610271036e-05, + "loss": 0.9517, + "step": 5104 + }, + { + "epoch": 0.2546134663341646, + "grad_norm": 0.453774847010883, + "learning_rate": 4.3675229035490895e-05, + "loss": 0.9396, + "step": 5105 + }, + { + "epoch": 0.2546633416458853, + "grad_norm": 0.5911807449980567, + "learning_rate": 4.3672543973389016e-05, + "loss": 0.9392, + "step": 5106 + }, + { + "epoch": 0.25471321695760596, + "grad_norm": 0.6809055739232702, + "learning_rate": 4.366985842403545e-05, + "loss": 0.933, + "step": 5107 + }, + { + "epoch": 0.2547630922693267, + "grad_norm": 0.4467345823663764, + "learning_rate": 4.3667172387500285e-05, + "loss": 0.8707, + "step": 5108 + }, + { + "epoch": 0.2548129675810474, + "grad_norm": 0.49376511148826285, + "learning_rate": 4.366448586385362e-05, + "loss": 0.9202, + "step": 5109 + }, + { + "epoch": 0.25486284289276806, + "grad_norm": 0.46711145986922675, + "learning_rate": 4.3661798853165544e-05, + "loss": 0.9236, + "step": 5110 + }, + { + "epoch": 0.2549127182044888, + "grad_norm": 0.45151388669313947, + "learning_rate": 4.365911135550618e-05, + "loss": 0.9388, + "step": 5111 + }, + { + "epoch": 0.2549625935162095, + "grad_norm": 0.45207828917257464, + "learning_rate": 4.365642337094565e-05, + "loss": 0.9153, + "step": 5112 + }, + { + "epoch": 0.25501246882793016, + "grad_norm": 0.528290042877939, + "learning_rate": 4.365373489955411e-05, + "loss": 0.9079, + "step": 5113 + }, + { + "epoch": 0.2550623441396509, + "grad_norm": 0.4389130268433281, + "learning_rate": 4.3651045941401704e-05, + "loss": 0.9029, + "step": 5114 + }, + { + "epoch": 0.2551122194513716, + "grad_norm": 0.7481780883799439, + "learning_rate": 4.36483564965586e-05, + "loss": 0.9247, + "step": 5115 + }, + { + "epoch": 0.25516209476309226, + "grad_norm": 0.4879991707113302, + "learning_rate": 4.364566656509497e-05, + "loss": 0.9178, + "step": 5116 + }, + { + "epoch": 0.25521197007481294, + "grad_norm": 0.6159084533630118, + "learning_rate": 4.364297614708103e-05, + "loss": 0.909, + "step": 5117 + }, + { + "epoch": 0.2552618453865337, + "grad_norm": 0.46121244637015574, + "learning_rate": 4.3640285242586965e-05, + "loss": 0.9672, + "step": 5118 + }, + { + "epoch": 0.25531172069825436, + "grad_norm": 0.4167042998506399, + "learning_rate": 4.3637593851682993e-05, + "loss": 0.8785, + "step": 5119 + }, + { + "epoch": 0.25536159600997504, + "grad_norm": 0.4794277412009543, + "learning_rate": 4.3634901974439354e-05, + "loss": 0.9701, + "step": 5120 + }, + { + "epoch": 0.2554114713216958, + "grad_norm": 0.42539150002929965, + "learning_rate": 4.36322096109263e-05, + "loss": 0.8801, + "step": 5121 + }, + { + "epoch": 0.25546134663341646, + "grad_norm": 0.4241337961969821, + "learning_rate": 4.362951676121406e-05, + "loss": 0.9319, + "step": 5122 + }, + { + "epoch": 0.25551122194513715, + "grad_norm": 0.5012644765386661, + "learning_rate": 4.362682342537292e-05, + "loss": 0.8822, + "step": 5123 + }, + { + "epoch": 0.2555610972568578, + "grad_norm": 0.4422397209029006, + "learning_rate": 4.3624129603473164e-05, + "loss": 0.9417, + "step": 5124 + }, + { + "epoch": 0.25561097256857856, + "grad_norm": 0.45864994438069556, + "learning_rate": 4.362143529558508e-05, + "loss": 0.9764, + "step": 5125 + }, + { + "epoch": 0.25566084788029925, + "grad_norm": 0.4922620128018074, + "learning_rate": 4.3618740501778976e-05, + "loss": 0.9237, + "step": 5126 + }, + { + "epoch": 0.25571072319201993, + "grad_norm": 0.4241267178201165, + "learning_rate": 4.361604522212517e-05, + "loss": 0.9361, + "step": 5127 + }, + { + "epoch": 0.25576059850374067, + "grad_norm": 0.48464416120496717, + "learning_rate": 4.3613349456693994e-05, + "loss": 0.87, + "step": 5128 + }, + { + "epoch": 0.25581047381546135, + "grad_norm": 0.4284513466201117, + "learning_rate": 4.36106532055558e-05, + "loss": 0.8886, + "step": 5129 + }, + { + "epoch": 0.25586034912718203, + "grad_norm": 0.5372559475900867, + "learning_rate": 4.360795646878093e-05, + "loss": 0.9543, + "step": 5130 + }, + { + "epoch": 0.25591022443890277, + "grad_norm": 0.4955814861016248, + "learning_rate": 4.3605259246439766e-05, + "loss": 0.9299, + "step": 5131 + }, + { + "epoch": 0.25596009975062345, + "grad_norm": 0.5389338020540214, + "learning_rate": 4.360256153860269e-05, + "loss": 0.8687, + "step": 5132 + }, + { + "epoch": 0.25600997506234413, + "grad_norm": 0.5399005366999239, + "learning_rate": 4.3599863345340104e-05, + "loss": 0.9662, + "step": 5133 + }, + { + "epoch": 0.2560598503740648, + "grad_norm": 0.5684944969847233, + "learning_rate": 4.35971646667224e-05, + "loss": 0.901, + "step": 5134 + }, + { + "epoch": 0.25610972568578555, + "grad_norm": 0.44903283770927194, + "learning_rate": 4.359446550282001e-05, + "loss": 0.9617, + "step": 5135 + }, + { + "epoch": 0.25615960099750623, + "grad_norm": 0.7691973511173379, + "learning_rate": 4.359176585370336e-05, + "loss": 0.9287, + "step": 5136 + }, + { + "epoch": 0.2562094763092269, + "grad_norm": 0.49010290624541486, + "learning_rate": 4.358906571944291e-05, + "loss": 0.9458, + "step": 5137 + }, + { + "epoch": 0.25625935162094765, + "grad_norm": 0.5515186242192669, + "learning_rate": 4.35863651001091e-05, + "loss": 0.9545, + "step": 5138 + }, + { + "epoch": 0.25630922693266833, + "grad_norm": 0.6596459305291192, + "learning_rate": 4.3583663995772413e-05, + "loss": 0.9029, + "step": 5139 + }, + { + "epoch": 0.256359102244389, + "grad_norm": 0.4917743368896919, + "learning_rate": 4.3580962406503333e-05, + "loss": 0.9695, + "step": 5140 + }, + { + "epoch": 0.25640897755610975, + "grad_norm": 0.4547952054722989, + "learning_rate": 4.3578260332372354e-05, + "loss": 0.9704, + "step": 5141 + }, + { + "epoch": 0.25645885286783043, + "grad_norm": 0.4303427452535654, + "learning_rate": 4.357555777344998e-05, + "loss": 0.9384, + "step": 5142 + }, + { + "epoch": 0.2565087281795511, + "grad_norm": 0.4779732449365015, + "learning_rate": 4.357285472980675e-05, + "loss": 0.9483, + "step": 5143 + }, + { + "epoch": 0.2565586034912718, + "grad_norm": 0.4148605092287715, + "learning_rate": 4.357015120151319e-05, + "loss": 0.8735, + "step": 5144 + }, + { + "epoch": 0.25660847880299253, + "grad_norm": 0.5018316420576292, + "learning_rate": 4.356744718863983e-05, + "loss": 0.9241, + "step": 5145 + }, + { + "epoch": 0.2566583541147132, + "grad_norm": 0.4353765551007673, + "learning_rate": 4.3564742691257254e-05, + "loss": 0.9112, + "step": 5146 + }, + { + "epoch": 0.2567082294264339, + "grad_norm": 0.4542985742604429, + "learning_rate": 4.3562037709436025e-05, + "loss": 0.9328, + "step": 5147 + }, + { + "epoch": 0.25675810473815464, + "grad_norm": 0.4665370323813187, + "learning_rate": 4.355933224324673e-05, + "loss": 0.9805, + "step": 5148 + }, + { + "epoch": 0.2568079800498753, + "grad_norm": 0.4623722397156483, + "learning_rate": 4.355662629275997e-05, + "loss": 0.9228, + "step": 5149 + }, + { + "epoch": 0.256857855361596, + "grad_norm": 0.453956101420795, + "learning_rate": 4.355391985804634e-05, + "loss": 0.9328, + "step": 5150 + }, + { + "epoch": 0.2569077306733167, + "grad_norm": 0.44928389904236526, + "learning_rate": 4.355121293917649e-05, + "loss": 0.9254, + "step": 5151 + }, + { + "epoch": 0.2569576059850374, + "grad_norm": 0.6308113428110509, + "learning_rate": 4.3548505536221036e-05, + "loss": 0.9269, + "step": 5152 + }, + { + "epoch": 0.2570074812967581, + "grad_norm": 0.4478005156649027, + "learning_rate": 4.354579764925063e-05, + "loss": 0.9748, + "step": 5153 + }, + { + "epoch": 0.2570573566084788, + "grad_norm": 0.45426446369151746, + "learning_rate": 4.354308927833594e-05, + "loss": 0.878, + "step": 5154 + }, + { + "epoch": 0.2571072319201995, + "grad_norm": 0.9716688048588663, + "learning_rate": 4.3540380423547623e-05, + "loss": 0.9269, + "step": 5155 + }, + { + "epoch": 0.2571571072319202, + "grad_norm": 0.5127745065192139, + "learning_rate": 4.353767108495638e-05, + "loss": 0.8989, + "step": 5156 + }, + { + "epoch": 0.2572069825436409, + "grad_norm": 0.44039764960132977, + "learning_rate": 4.35349612626329e-05, + "loss": 0.8801, + "step": 5157 + }, + { + "epoch": 0.2572568578553616, + "grad_norm": 0.6230532737779717, + "learning_rate": 4.3532250956647914e-05, + "loss": 0.9211, + "step": 5158 + }, + { + "epoch": 0.2573067331670823, + "grad_norm": 0.5076911392942672, + "learning_rate": 4.3529540167072126e-05, + "loss": 0.9795, + "step": 5159 + }, + { + "epoch": 0.257356608478803, + "grad_norm": 0.5206877773664261, + "learning_rate": 4.352682889397628e-05, + "loss": 0.9463, + "step": 5160 + }, + { + "epoch": 0.25740648379052367, + "grad_norm": 0.44723511925032866, + "learning_rate": 4.352411713743113e-05, + "loss": 0.9586, + "step": 5161 + }, + { + "epoch": 0.2574563591022444, + "grad_norm": 0.5673417241365917, + "learning_rate": 4.352140489750743e-05, + "loss": 0.9277, + "step": 5162 + }, + { + "epoch": 0.2575062344139651, + "grad_norm": 0.3916683594970271, + "learning_rate": 4.351869217427596e-05, + "loss": 0.8938, + "step": 5163 + }, + { + "epoch": 0.25755610972568577, + "grad_norm": 0.5267148348274884, + "learning_rate": 4.35159789678075e-05, + "loss": 0.938, + "step": 5164 + }, + { + "epoch": 0.2576059850374065, + "grad_norm": 0.49168203199278226, + "learning_rate": 4.3513265278172864e-05, + "loss": 0.9528, + "step": 5165 + }, + { + "epoch": 0.2576558603491272, + "grad_norm": 0.40413485634660673, + "learning_rate": 4.351055110544285e-05, + "loss": 0.9158, + "step": 5166 + }, + { + "epoch": 0.25770573566084787, + "grad_norm": 0.7380312199787612, + "learning_rate": 4.3507836449688296e-05, + "loss": 0.9019, + "step": 5167 + }, + { + "epoch": 0.2577556109725686, + "grad_norm": 0.5222399976120702, + "learning_rate": 4.3505121310980024e-05, + "loss": 0.9194, + "step": 5168 + }, + { + "epoch": 0.2578054862842893, + "grad_norm": 0.43307346475831066, + "learning_rate": 4.35024056893889e-05, + "loss": 0.9051, + "step": 5169 + }, + { + "epoch": 0.25785536159600997, + "grad_norm": 2.5392065188928994, + "learning_rate": 4.349968958498578e-05, + "loss": 0.9013, + "step": 5170 + }, + { + "epoch": 0.25790523690773065, + "grad_norm": 0.3999600188946885, + "learning_rate": 4.3496972997841536e-05, + "loss": 0.922, + "step": 5171 + }, + { + "epoch": 0.2579551122194514, + "grad_norm": 0.5386924384800694, + "learning_rate": 4.349425592802706e-05, + "loss": 0.8973, + "step": 5172 + }, + { + "epoch": 0.25800498753117207, + "grad_norm": 0.5337082868600013, + "learning_rate": 4.349153837561325e-05, + "loss": 0.9462, + "step": 5173 + }, + { + "epoch": 0.25805486284289275, + "grad_norm": 6.282204192041986, + "learning_rate": 4.348882034067103e-05, + "loss": 0.9644, + "step": 5174 + }, + { + "epoch": 0.2581047381546135, + "grad_norm": 0.4636158766375883, + "learning_rate": 4.348610182327131e-05, + "loss": 0.9045, + "step": 5175 + }, + { + "epoch": 0.25815461346633417, + "grad_norm": 0.46913364540998775, + "learning_rate": 4.348338282348504e-05, + "loss": 0.9157, + "step": 5176 + }, + { + "epoch": 0.25820448877805485, + "grad_norm": 0.6305472449055884, + "learning_rate": 4.348066334138316e-05, + "loss": 0.9418, + "step": 5177 + }, + { + "epoch": 0.25825436408977553, + "grad_norm": 0.4088437453463952, + "learning_rate": 4.3477943377036656e-05, + "loss": 0.9021, + "step": 5178 + }, + { + "epoch": 0.25830423940149627, + "grad_norm": 0.4677741628906918, + "learning_rate": 4.347522293051648e-05, + "loss": 0.9248, + "step": 5179 + }, + { + "epoch": 0.25835411471321695, + "grad_norm": 0.45465389491384467, + "learning_rate": 4.347250200189363e-05, + "loss": 0.9437, + "step": 5180 + }, + { + "epoch": 0.25840399002493764, + "grad_norm": 0.4571506739025416, + "learning_rate": 4.346978059123911e-05, + "loss": 0.8753, + "step": 5181 + }, + { + "epoch": 0.2584538653366584, + "grad_norm": 0.4848363977790122, + "learning_rate": 4.346705869862393e-05, + "loss": 0.919, + "step": 5182 + }, + { + "epoch": 0.25850374064837905, + "grad_norm": 0.4281631291788667, + "learning_rate": 4.346433632411912e-05, + "loss": 0.9097, + "step": 5183 + }, + { + "epoch": 0.25855361596009974, + "grad_norm": 0.4116857340265833, + "learning_rate": 4.346161346779572e-05, + "loss": 0.8966, + "step": 5184 + }, + { + "epoch": 0.2586034912718205, + "grad_norm": 0.4383569566005578, + "learning_rate": 4.345889012972477e-05, + "loss": 0.9129, + "step": 5185 + }, + { + "epoch": 0.25865336658354116, + "grad_norm": 0.5295883195189229, + "learning_rate": 4.3456166309977344e-05, + "loss": 0.9316, + "step": 5186 + }, + { + "epoch": 0.25870324189526184, + "grad_norm": 0.4828428238521228, + "learning_rate": 4.3453442008624525e-05, + "loss": 0.8909, + "step": 5187 + }, + { + "epoch": 0.2587531172069825, + "grad_norm": 0.4381064081371777, + "learning_rate": 4.345071722573739e-05, + "loss": 0.9497, + "step": 5188 + }, + { + "epoch": 0.25880299251870326, + "grad_norm": 0.5609255720862237, + "learning_rate": 4.344799196138705e-05, + "loss": 0.9707, + "step": 5189 + }, + { + "epoch": 0.25885286783042394, + "grad_norm": 0.4696197223889566, + "learning_rate": 4.344526621564461e-05, + "loss": 0.9667, + "step": 5190 + }, + { + "epoch": 0.2589027431421446, + "grad_norm": 0.5152100885594109, + "learning_rate": 4.344253998858121e-05, + "loss": 0.8793, + "step": 5191 + }, + { + "epoch": 0.25895261845386536, + "grad_norm": 0.4401232398143273, + "learning_rate": 4.3439813280267975e-05, + "loss": 0.9404, + "step": 5192 + }, + { + "epoch": 0.25900249376558604, + "grad_norm": 0.5189695835994563, + "learning_rate": 4.3437086090776067e-05, + "loss": 0.9346, + "step": 5193 + }, + { + "epoch": 0.2590523690773067, + "grad_norm": 0.762265341083126, + "learning_rate": 4.343435842017666e-05, + "loss": 0.9358, + "step": 5194 + }, + { + "epoch": 0.25910224438902746, + "grad_norm": 0.4577377123744238, + "learning_rate": 4.343163026854091e-05, + "loss": 0.9006, + "step": 5195 + }, + { + "epoch": 0.25915211970074814, + "grad_norm": 0.3894879600369034, + "learning_rate": 4.3428901635940014e-05, + "loss": 0.9168, + "step": 5196 + }, + { + "epoch": 0.2592019950124688, + "grad_norm": 0.45592649634643184, + "learning_rate": 4.342617252244518e-05, + "loss": 0.9369, + "step": 5197 + }, + { + "epoch": 0.2592518703241895, + "grad_norm": 0.5070008980936335, + "learning_rate": 4.342344292812763e-05, + "loss": 0.9547, + "step": 5198 + }, + { + "epoch": 0.25930174563591024, + "grad_norm": 0.45726341113128, + "learning_rate": 4.342071285305858e-05, + "loss": 0.9268, + "step": 5199 + }, + { + "epoch": 0.2593516209476309, + "grad_norm": 0.4875701769713452, + "learning_rate": 4.341798229730926e-05, + "loss": 0.9906, + "step": 5200 + }, + { + "epoch": 0.2594014962593516, + "grad_norm": 0.44476974697336824, + "learning_rate": 4.3415251260950946e-05, + "loss": 0.9783, + "step": 5201 + }, + { + "epoch": 0.25945137157107234, + "grad_norm": 0.3807438068519703, + "learning_rate": 4.3412519744054886e-05, + "loss": 0.8988, + "step": 5202 + }, + { + "epoch": 0.259501246882793, + "grad_norm": 0.4498736575045121, + "learning_rate": 4.3409787746692364e-05, + "loss": 0.9386, + "step": 5203 + }, + { + "epoch": 0.2595511221945137, + "grad_norm": 0.4379051411359776, + "learning_rate": 4.340705526893467e-05, + "loss": 0.8937, + "step": 5204 + }, + { + "epoch": 0.2596009975062344, + "grad_norm": 0.5372371169775607, + "learning_rate": 4.340432231085311e-05, + "loss": 0.9568, + "step": 5205 + }, + { + "epoch": 0.2596508728179551, + "grad_norm": 0.4641829166491047, + "learning_rate": 4.340158887251899e-05, + "loss": 0.9602, + "step": 5206 + }, + { + "epoch": 0.2597007481296758, + "grad_norm": 0.48643298714862837, + "learning_rate": 4.339885495400366e-05, + "loss": 0.9177, + "step": 5207 + }, + { + "epoch": 0.2597506234413965, + "grad_norm": 0.456846870088375, + "learning_rate": 4.339612055537843e-05, + "loss": 0.9723, + "step": 5208 + }, + { + "epoch": 0.2598004987531172, + "grad_norm": 0.7414162922695233, + "learning_rate": 4.339338567671467e-05, + "loss": 0.8931, + "step": 5209 + }, + { + "epoch": 0.2598503740648379, + "grad_norm": 0.6274537340630214, + "learning_rate": 4.3390650318083736e-05, + "loss": 0.9252, + "step": 5210 + }, + { + "epoch": 0.2599002493765586, + "grad_norm": 0.47930810744173125, + "learning_rate": 4.3387914479557024e-05, + "loss": 0.9633, + "step": 5211 + }, + { + "epoch": 0.2599501246882793, + "grad_norm": 0.40789453381063767, + "learning_rate": 4.33851781612059e-05, + "loss": 0.9232, + "step": 5212 + }, + { + "epoch": 0.26, + "grad_norm": 3.0451952188447455, + "learning_rate": 4.3382441363101786e-05, + "loss": 0.8832, + "step": 5213 + }, + { + "epoch": 0.2600498753117207, + "grad_norm": 0.47589290435903603, + "learning_rate": 4.33797040853161e-05, + "loss": 0.9398, + "step": 5214 + }, + { + "epoch": 0.2600997506234414, + "grad_norm": 0.4647871650885013, + "learning_rate": 4.3376966327920256e-05, + "loss": 0.9627, + "step": 5215 + }, + { + "epoch": 0.2601496259351621, + "grad_norm": 0.5620563362684673, + "learning_rate": 4.337422809098569e-05, + "loss": 0.9203, + "step": 5216 + }, + { + "epoch": 0.2601995012468828, + "grad_norm": 0.4234956053137903, + "learning_rate": 4.337148937458388e-05, + "loss": 0.9268, + "step": 5217 + }, + { + "epoch": 0.2602493765586035, + "grad_norm": 0.50984753225194, + "learning_rate": 4.336875017878627e-05, + "loss": 0.9865, + "step": 5218 + }, + { + "epoch": 0.2602992518703242, + "grad_norm": 0.47965149217846426, + "learning_rate": 4.336601050366434e-05, + "loss": 0.9194, + "step": 5219 + }, + { + "epoch": 0.2603491271820449, + "grad_norm": 0.47070804644546793, + "learning_rate": 4.33632703492896e-05, + "loss": 0.9018, + "step": 5220 + }, + { + "epoch": 0.2603990024937656, + "grad_norm": 0.45863050281500806, + "learning_rate": 4.336052971573352e-05, + "loss": 0.9341, + "step": 5221 + }, + { + "epoch": 0.2604488778054863, + "grad_norm": 0.44297623549926984, + "learning_rate": 4.335778860306765e-05, + "loss": 0.8964, + "step": 5222 + }, + { + "epoch": 0.260498753117207, + "grad_norm": 0.440050626428573, + "learning_rate": 4.33550470113635e-05, + "loss": 0.9403, + "step": 5223 + }, + { + "epoch": 0.2605486284289277, + "grad_norm": 0.4602450012753675, + "learning_rate": 4.3352304940692605e-05, + "loss": 0.9555, + "step": 5224 + }, + { + "epoch": 0.26059850374064836, + "grad_norm": 0.3918225853098414, + "learning_rate": 4.334956239112653e-05, + "loss": 0.9147, + "step": 5225 + }, + { + "epoch": 0.2606483790523691, + "grad_norm": 0.4290220954984515, + "learning_rate": 4.3346819362736833e-05, + "loss": 0.9324, + "step": 5226 + }, + { + "epoch": 0.2606982543640898, + "grad_norm": 0.4276594881666595, + "learning_rate": 4.3344075855595104e-05, + "loss": 0.9531, + "step": 5227 + }, + { + "epoch": 0.26074812967581046, + "grad_norm": 0.42618491909901735, + "learning_rate": 4.334133186977292e-05, + "loss": 0.9605, + "step": 5228 + }, + { + "epoch": 0.2607980049875312, + "grad_norm": 0.45305963541145855, + "learning_rate": 4.333858740534189e-05, + "loss": 0.9201, + "step": 5229 + }, + { + "epoch": 0.2608478802992519, + "grad_norm": 1.0200947388256976, + "learning_rate": 4.333584246237363e-05, + "loss": 0.9734, + "step": 5230 + }, + { + "epoch": 0.26089775561097256, + "grad_norm": 0.4199886141921677, + "learning_rate": 4.3333097040939764e-05, + "loss": 0.9266, + "step": 5231 + }, + { + "epoch": 0.26094763092269324, + "grad_norm": 0.4774600670093554, + "learning_rate": 4.3330351141111935e-05, + "loss": 0.9293, + "step": 5232 + }, + { + "epoch": 0.260997506234414, + "grad_norm": 0.41255408807522853, + "learning_rate": 4.3327604762961796e-05, + "loss": 0.9514, + "step": 5233 + }, + { + "epoch": 0.26104738154613466, + "grad_norm": 0.41629782106537544, + "learning_rate": 4.3324857906561025e-05, + "loss": 0.9256, + "step": 5234 + }, + { + "epoch": 0.26109725685785534, + "grad_norm": 0.40761140690957354, + "learning_rate": 4.3322110571981275e-05, + "loss": 0.9085, + "step": 5235 + }, + { + "epoch": 0.2611471321695761, + "grad_norm": 0.4422200287803934, + "learning_rate": 4.331936275929426e-05, + "loss": 0.9123, + "step": 5236 + }, + { + "epoch": 0.26119700748129676, + "grad_norm": 0.3831416355432962, + "learning_rate": 4.3316614468571657e-05, + "loss": 0.9271, + "step": 5237 + }, + { + "epoch": 0.26124688279301744, + "grad_norm": 0.6539827284536532, + "learning_rate": 4.331386569988521e-05, + "loss": 0.926, + "step": 5238 + }, + { + "epoch": 0.2612967581047382, + "grad_norm": 0.4326088348548877, + "learning_rate": 4.331111645330663e-05, + "loss": 0.9242, + "step": 5239 + }, + { + "epoch": 0.26134663341645886, + "grad_norm": 0.4553063608857658, + "learning_rate": 4.330836672890766e-05, + "loss": 0.944, + "step": 5240 + }, + { + "epoch": 0.26139650872817954, + "grad_norm": 0.3796929964978295, + "learning_rate": 4.330561652676005e-05, + "loss": 0.9251, + "step": 5241 + }, + { + "epoch": 0.2614463840399002, + "grad_norm": 0.44721831263157474, + "learning_rate": 4.330286584693558e-05, + "loss": 0.9659, + "step": 5242 + }, + { + "epoch": 0.26149625935162096, + "grad_norm": 0.45359273608496764, + "learning_rate": 4.330011468950601e-05, + "loss": 0.9591, + "step": 5243 + }, + { + "epoch": 0.26154613466334165, + "grad_norm": 0.42684085175869246, + "learning_rate": 4.329736305454314e-05, + "loss": 0.9158, + "step": 5244 + }, + { + "epoch": 0.2615960099750623, + "grad_norm": 1.1897036350227626, + "learning_rate": 4.329461094211876e-05, + "loss": 0.8656, + "step": 5245 + }, + { + "epoch": 0.26164588528678306, + "grad_norm": 0.5265412673384507, + "learning_rate": 4.3291858352304716e-05, + "loss": 0.9296, + "step": 5246 + }, + { + "epoch": 0.26169576059850375, + "grad_norm": 0.6462803976644391, + "learning_rate": 4.32891052851728e-05, + "loss": 0.9329, + "step": 5247 + }, + { + "epoch": 0.26174563591022443, + "grad_norm": 0.43461419631479203, + "learning_rate": 4.328635174079487e-05, + "loss": 0.9302, + "step": 5248 + }, + { + "epoch": 0.2617955112219451, + "grad_norm": 1.5956747727238378, + "learning_rate": 4.3283597719242766e-05, + "loss": 0.8807, + "step": 5249 + }, + { + "epoch": 0.26184538653366585, + "grad_norm": 0.47732917807158837, + "learning_rate": 4.328084322058838e-05, + "loss": 0.9395, + "step": 5250 + }, + { + "epoch": 0.26189526184538653, + "grad_norm": 0.5768721314147215, + "learning_rate": 4.327808824490355e-05, + "loss": 0.8863, + "step": 5251 + }, + { + "epoch": 0.2619451371571072, + "grad_norm": 0.41275598953328796, + "learning_rate": 4.32753327922602e-05, + "loss": 0.9464, + "step": 5252 + }, + { + "epoch": 0.26199501246882795, + "grad_norm": 0.41826622351122744, + "learning_rate": 4.327257686273022e-05, + "loss": 0.9538, + "step": 5253 + }, + { + "epoch": 0.26204488778054863, + "grad_norm": 0.41090049431512893, + "learning_rate": 4.3269820456385524e-05, + "loss": 0.9032, + "step": 5254 + }, + { + "epoch": 0.2620947630922693, + "grad_norm": 0.411499251074949, + "learning_rate": 4.326706357329804e-05, + "loss": 0.9551, + "step": 5255 + }, + { + "epoch": 0.26214463840399005, + "grad_norm": 0.452284148587631, + "learning_rate": 4.3264306213539705e-05, + "loss": 0.9275, + "step": 5256 + }, + { + "epoch": 0.26219451371571073, + "grad_norm": 0.43664761761488546, + "learning_rate": 4.3261548377182474e-05, + "loss": 0.9127, + "step": 5257 + }, + { + "epoch": 0.2622443890274314, + "grad_norm": 0.5719672254219904, + "learning_rate": 4.325879006429832e-05, + "loss": 0.8805, + "step": 5258 + }, + { + "epoch": 0.2622942643391521, + "grad_norm": 0.4738376384038609, + "learning_rate": 4.32560312749592e-05, + "loss": 0.945, + "step": 5259 + }, + { + "epoch": 0.26234413965087283, + "grad_norm": 0.4364731373058673, + "learning_rate": 4.325327200923711e-05, + "loss": 0.919, + "step": 5260 + }, + { + "epoch": 0.2623940149625935, + "grad_norm": 0.46081387739116814, + "learning_rate": 4.3250512267204066e-05, + "loss": 0.8775, + "step": 5261 + }, + { + "epoch": 0.2624438902743142, + "grad_norm": 0.4378293859900264, + "learning_rate": 4.324775204893207e-05, + "loss": 0.8945, + "step": 5262 + }, + { + "epoch": 0.26249376558603493, + "grad_norm": 0.4361795576216186, + "learning_rate": 4.324499135449315e-05, + "loss": 0.9196, + "step": 5263 + }, + { + "epoch": 0.2625436408977556, + "grad_norm": 0.4406202798862615, + "learning_rate": 4.324223018395934e-05, + "loss": 0.9367, + "step": 5264 + }, + { + "epoch": 0.2625935162094763, + "grad_norm": 0.5749037447866178, + "learning_rate": 4.3239468537402714e-05, + "loss": 0.9341, + "step": 5265 + }, + { + "epoch": 0.26264339152119703, + "grad_norm": 0.45049560927538157, + "learning_rate": 4.32367064148953e-05, + "loss": 0.9053, + "step": 5266 + }, + { + "epoch": 0.2626932668329177, + "grad_norm": 0.5441110087988463, + "learning_rate": 4.3233943816509204e-05, + "loss": 0.9125, + "step": 5267 + }, + { + "epoch": 0.2627431421446384, + "grad_norm": 0.4732602216100123, + "learning_rate": 4.3231180742316506e-05, + "loss": 0.9541, + "step": 5268 + }, + { + "epoch": 0.2627930174563591, + "grad_norm": 0.5041125524958252, + "learning_rate": 4.3228417192389305e-05, + "loss": 0.8983, + "step": 5269 + }, + { + "epoch": 0.2628428927680798, + "grad_norm": 0.40480371864826065, + "learning_rate": 4.322565316679971e-05, + "loss": 0.9777, + "step": 5270 + }, + { + "epoch": 0.2628927680798005, + "grad_norm": 0.4482316537174049, + "learning_rate": 4.322288866561985e-05, + "loss": 0.926, + "step": 5271 + }, + { + "epoch": 0.2629426433915212, + "grad_norm": 0.3962228882394076, + "learning_rate": 4.3220123688921874e-05, + "loss": 0.9209, + "step": 5272 + }, + { + "epoch": 0.2629925187032419, + "grad_norm": 0.7232574594365923, + "learning_rate": 4.3217358236777926e-05, + "loss": 0.9164, + "step": 5273 + }, + { + "epoch": 0.2630423940149626, + "grad_norm": 0.4081004968914771, + "learning_rate": 4.321459230926016e-05, + "loss": 0.9487, + "step": 5274 + }, + { + "epoch": 0.2630922693266833, + "grad_norm": 0.49615440699549046, + "learning_rate": 4.321182590644077e-05, + "loss": 0.9745, + "step": 5275 + }, + { + "epoch": 0.26314214463840396, + "grad_norm": 0.4739475516956883, + "learning_rate": 4.320905902839192e-05, + "loss": 0.9317, + "step": 5276 + }, + { + "epoch": 0.2631920199501247, + "grad_norm": 0.5870393368142374, + "learning_rate": 4.3206291675185834e-05, + "loss": 0.9427, + "step": 5277 + }, + { + "epoch": 0.2632418952618454, + "grad_norm": 0.45168559675125797, + "learning_rate": 4.320352384689471e-05, + "loss": 0.9539, + "step": 5278 + }, + { + "epoch": 0.26329177057356606, + "grad_norm": 0.44765826431281946, + "learning_rate": 4.3200755543590785e-05, + "loss": 0.9422, + "step": 5279 + }, + { + "epoch": 0.2633416458852868, + "grad_norm": 0.450105360694507, + "learning_rate": 4.319798676534628e-05, + "loss": 0.926, + "step": 5280 + }, + { + "epoch": 0.2633915211970075, + "grad_norm": 0.7720436732589212, + "learning_rate": 4.319521751223346e-05, + "loss": 0.8792, + "step": 5281 + }, + { + "epoch": 0.26344139650872817, + "grad_norm": 0.44182904591540034, + "learning_rate": 4.319244778432458e-05, + "loss": 0.9338, + "step": 5282 + }, + { + "epoch": 0.2634912718204489, + "grad_norm": 0.44078190567923625, + "learning_rate": 4.318967758169192e-05, + "loss": 0.8921, + "step": 5283 + }, + { + "epoch": 0.2635411471321696, + "grad_norm": 0.4686458936661657, + "learning_rate": 4.318690690440776e-05, + "loss": 0.9212, + "step": 5284 + }, + { + "epoch": 0.26359102244389027, + "grad_norm": 0.5627675214633744, + "learning_rate": 4.318413575254441e-05, + "loss": 0.9692, + "step": 5285 + }, + { + "epoch": 0.26364089775561095, + "grad_norm": 0.43290349779581455, + "learning_rate": 4.318136412617417e-05, + "loss": 0.9187, + "step": 5286 + }, + { + "epoch": 0.2636907730673317, + "grad_norm": 0.4257116000578227, + "learning_rate": 4.3178592025369374e-05, + "loss": 0.8951, + "step": 5287 + }, + { + "epoch": 0.26374064837905237, + "grad_norm": 0.46216493593337016, + "learning_rate": 4.317581945020235e-05, + "loss": 0.9531, + "step": 5288 + }, + { + "epoch": 0.26379052369077305, + "grad_norm": 0.4180522137892546, + "learning_rate": 4.317304640074546e-05, + "loss": 0.9335, + "step": 5289 + }, + { + "epoch": 0.2638403990024938, + "grad_norm": 0.48544136571280805, + "learning_rate": 4.317027287707105e-05, + "loss": 0.9509, + "step": 5290 + }, + { + "epoch": 0.26389027431421447, + "grad_norm": 0.4968857694606372, + "learning_rate": 4.31674988792515e-05, + "loss": 0.9086, + "step": 5291 + }, + { + "epoch": 0.26394014962593515, + "grad_norm": 0.4397403140267303, + "learning_rate": 4.31647244073592e-05, + "loss": 0.8904, + "step": 5292 + }, + { + "epoch": 0.2639900249376559, + "grad_norm": 0.399839717874064, + "learning_rate": 4.316194946146654e-05, + "loss": 0.9485, + "step": 5293 + }, + { + "epoch": 0.26403990024937657, + "grad_norm": 0.49522216420495546, + "learning_rate": 4.315917404164595e-05, + "loss": 0.9306, + "step": 5294 + }, + { + "epoch": 0.26408977556109725, + "grad_norm": 0.42532210928190567, + "learning_rate": 4.315639814796983e-05, + "loss": 0.9116, + "step": 5295 + }, + { + "epoch": 0.26413965087281793, + "grad_norm": 0.47580493485240144, + "learning_rate": 4.3153621780510636e-05, + "loss": 0.9187, + "step": 5296 + }, + { + "epoch": 0.26418952618453867, + "grad_norm": 0.4139372232300957, + "learning_rate": 4.3150844939340796e-05, + "loss": 0.9047, + "step": 5297 + }, + { + "epoch": 0.26423940149625935, + "grad_norm": 0.609195904510781, + "learning_rate": 4.3148067624532785e-05, + "loss": 0.9432, + "step": 5298 + }, + { + "epoch": 0.26428927680798003, + "grad_norm": 0.5109987572175358, + "learning_rate": 4.3145289836159075e-05, + "loss": 0.9077, + "step": 5299 + }, + { + "epoch": 0.26433915211970077, + "grad_norm": 0.7645980137387658, + "learning_rate": 4.3142511574292146e-05, + "loss": 0.9409, + "step": 5300 + }, + { + "epoch": 0.26438902743142145, + "grad_norm": 0.4713249266831397, + "learning_rate": 4.313973283900449e-05, + "loss": 0.8849, + "step": 5301 + }, + { + "epoch": 0.26443890274314213, + "grad_norm": 0.4879934885711843, + "learning_rate": 4.313695363036862e-05, + "loss": 0.9195, + "step": 5302 + }, + { + "epoch": 0.2644887780548628, + "grad_norm": 0.406400571253887, + "learning_rate": 4.3134173948457076e-05, + "loss": 0.988, + "step": 5303 + }, + { + "epoch": 0.26453865336658355, + "grad_norm": 0.4675305680044415, + "learning_rate": 4.313139379334238e-05, + "loss": 0.9352, + "step": 5304 + }, + { + "epoch": 0.26458852867830424, + "grad_norm": 0.4142719054768242, + "learning_rate": 4.312861316509706e-05, + "loss": 0.8979, + "step": 5305 + }, + { + "epoch": 0.2646384039900249, + "grad_norm": 0.4828601323745444, + "learning_rate": 4.312583206379371e-05, + "loss": 0.9057, + "step": 5306 + }, + { + "epoch": 0.26468827930174565, + "grad_norm": 0.47461574825204794, + "learning_rate": 4.312305048950488e-05, + "loss": 0.9666, + "step": 5307 + }, + { + "epoch": 0.26473815461346634, + "grad_norm": 0.5755931259429405, + "learning_rate": 4.312026844230315e-05, + "loss": 0.8914, + "step": 5308 + }, + { + "epoch": 0.264788029925187, + "grad_norm": 0.40973888371147155, + "learning_rate": 4.3117485922261136e-05, + "loss": 0.9757, + "step": 5309 + }, + { + "epoch": 0.26483790523690776, + "grad_norm": 0.4235863103492461, + "learning_rate": 4.311470292945143e-05, + "loss": 0.8859, + "step": 5310 + }, + { + "epoch": 0.26488778054862844, + "grad_norm": 0.41851107529471937, + "learning_rate": 4.311191946394665e-05, + "loss": 0.9055, + "step": 5311 + }, + { + "epoch": 0.2649376558603491, + "grad_norm": 0.4932161249671512, + "learning_rate": 4.310913552581945e-05, + "loss": 0.8825, + "step": 5312 + }, + { + "epoch": 0.2649875311720698, + "grad_norm": 0.4507321831996277, + "learning_rate": 4.310635111514246e-05, + "loss": 0.9213, + "step": 5313 + }, + { + "epoch": 0.26503740648379054, + "grad_norm": 0.5341181442188816, + "learning_rate": 4.310356623198834e-05, + "loss": 0.9492, + "step": 5314 + }, + { + "epoch": 0.2650872817955112, + "grad_norm": 0.443837041157471, + "learning_rate": 4.310078087642976e-05, + "loss": 0.9397, + "step": 5315 + }, + { + "epoch": 0.2651371571072319, + "grad_norm": 0.444237633666993, + "learning_rate": 4.3097995048539405e-05, + "loss": 0.9244, + "step": 5316 + }, + { + "epoch": 0.26518703241895264, + "grad_norm": 0.41539967106001285, + "learning_rate": 4.309520874838997e-05, + "loss": 0.9265, + "step": 5317 + }, + { + "epoch": 0.2652369077306733, + "grad_norm": 0.5237499538112634, + "learning_rate": 4.3092421976054156e-05, + "loss": 0.9489, + "step": 5318 + }, + { + "epoch": 0.265286783042394, + "grad_norm": 0.6038984249309196, + "learning_rate": 4.3089634731604686e-05, + "loss": 0.9317, + "step": 5319 + }, + { + "epoch": 0.26533665835411474, + "grad_norm": 0.5143589128092384, + "learning_rate": 4.3086847015114304e-05, + "loss": 0.9637, + "step": 5320 + }, + { + "epoch": 0.2653865336658354, + "grad_norm": 0.4540213071742321, + "learning_rate": 4.3084058826655735e-05, + "loss": 0.9251, + "step": 5321 + }, + { + "epoch": 0.2654364089775561, + "grad_norm": 0.4368274828157895, + "learning_rate": 4.3081270166301755e-05, + "loss": 0.8829, + "step": 5322 + }, + { + "epoch": 0.2654862842892768, + "grad_norm": 0.46031360134369614, + "learning_rate": 4.3078481034125107e-05, + "loss": 0.8959, + "step": 5323 + }, + { + "epoch": 0.2655361596009975, + "grad_norm": 0.4837610137032933, + "learning_rate": 4.3075691430198595e-05, + "loss": 0.9467, + "step": 5324 + }, + { + "epoch": 0.2655860349127182, + "grad_norm": 0.5294146775525947, + "learning_rate": 4.307290135459501e-05, + "loss": 0.9206, + "step": 5325 + }, + { + "epoch": 0.2656359102244389, + "grad_norm": 0.44783227361530886, + "learning_rate": 4.3070110807387143e-05, + "loss": 0.9382, + "step": 5326 + }, + { + "epoch": 0.2656857855361596, + "grad_norm": 0.519719086085047, + "learning_rate": 4.3067319788647826e-05, + "loss": 0.9659, + "step": 5327 + }, + { + "epoch": 0.2657356608478803, + "grad_norm": 0.5443915344586384, + "learning_rate": 4.306452829844989e-05, + "loss": 0.8828, + "step": 5328 + }, + { + "epoch": 0.265785536159601, + "grad_norm": 0.45168429585653735, + "learning_rate": 4.306173633686617e-05, + "loss": 0.9551, + "step": 5329 + }, + { + "epoch": 0.26583541147132167, + "grad_norm": 0.4943286903809498, + "learning_rate": 4.305894390396952e-05, + "loss": 0.9646, + "step": 5330 + }, + { + "epoch": 0.2658852867830424, + "grad_norm": 0.5279015528451548, + "learning_rate": 4.305615099983281e-05, + "loss": 0.8945, + "step": 5331 + }, + { + "epoch": 0.2659351620947631, + "grad_norm": 0.4081299909020576, + "learning_rate": 4.3053357624528924e-05, + "loss": 0.896, + "step": 5332 + }, + { + "epoch": 0.26598503740648377, + "grad_norm": 0.5520503709558217, + "learning_rate": 4.305056377813075e-05, + "loss": 0.9601, + "step": 5333 + }, + { + "epoch": 0.2660349127182045, + "grad_norm": 0.4401737586139969, + "learning_rate": 4.304776946071119e-05, + "loss": 0.8992, + "step": 5334 + }, + { + "epoch": 0.2660847880299252, + "grad_norm": 0.4195735616842718, + "learning_rate": 4.3044974672343164e-05, + "loss": 0.9656, + "step": 5335 + }, + { + "epoch": 0.26613466334164587, + "grad_norm": 0.45819538161448153, + "learning_rate": 4.3042179413099597e-05, + "loss": 0.8951, + "step": 5336 + }, + { + "epoch": 0.2661845386533666, + "grad_norm": 0.44777326078566926, + "learning_rate": 4.3039383683053434e-05, + "loss": 0.866, + "step": 5337 + }, + { + "epoch": 0.2662344139650873, + "grad_norm": 0.44772359312261273, + "learning_rate": 4.3036587482277634e-05, + "loss": 0.9594, + "step": 5338 + }, + { + "epoch": 0.266284289276808, + "grad_norm": 0.45214020615229167, + "learning_rate": 4.303379081084515e-05, + "loss": 0.9751, + "step": 5339 + }, + { + "epoch": 0.26633416458852865, + "grad_norm": 0.5831696393019876, + "learning_rate": 4.303099366882897e-05, + "loss": 0.9512, + "step": 5340 + }, + { + "epoch": 0.2663840399002494, + "grad_norm": 0.655135792676832, + "learning_rate": 4.3028196056302075e-05, + "loss": 0.9761, + "step": 5341 + }, + { + "epoch": 0.2664339152119701, + "grad_norm": 0.5411958479048272, + "learning_rate": 4.3025397973337474e-05, + "loss": 0.9554, + "step": 5342 + }, + { + "epoch": 0.26648379052369076, + "grad_norm": 0.5709484049222905, + "learning_rate": 4.302259942000818e-05, + "loss": 0.9, + "step": 5343 + }, + { + "epoch": 0.2665336658354115, + "grad_norm": 0.4383852396940494, + "learning_rate": 4.301980039638722e-05, + "loss": 0.8989, + "step": 5344 + }, + { + "epoch": 0.2665835411471322, + "grad_norm": 0.4213141597851983, + "learning_rate": 4.3017000902547634e-05, + "loss": 0.9168, + "step": 5345 + }, + { + "epoch": 0.26663341645885286, + "grad_norm": 0.5432208649314162, + "learning_rate": 4.301420093856247e-05, + "loss": 0.9126, + "step": 5346 + }, + { + "epoch": 0.26668329177057354, + "grad_norm": 0.4973372650208201, + "learning_rate": 4.3011400504504797e-05, + "loss": 0.9364, + "step": 5347 + }, + { + "epoch": 0.2667331670822943, + "grad_norm": 1.0184034462636713, + "learning_rate": 4.300859960044769e-05, + "loss": 0.9416, + "step": 5348 + }, + { + "epoch": 0.26678304239401496, + "grad_norm": 0.4764347568045593, + "learning_rate": 4.3005798226464235e-05, + "loss": 0.919, + "step": 5349 + }, + { + "epoch": 0.26683291770573564, + "grad_norm": 0.4178183201820482, + "learning_rate": 4.300299638262753e-05, + "loss": 0.8752, + "step": 5350 + }, + { + "epoch": 0.2668827930174564, + "grad_norm": 0.4607752534776532, + "learning_rate": 4.3000194069010694e-05, + "loss": 0.8931, + "step": 5351 + }, + { + "epoch": 0.26693266832917706, + "grad_norm": 0.6030987172118848, + "learning_rate": 4.299739128568685e-05, + "loss": 0.889, + "step": 5352 + }, + { + "epoch": 0.26698254364089774, + "grad_norm": 0.540776290780475, + "learning_rate": 4.299458803272914e-05, + "loss": 0.9331, + "step": 5353 + }, + { + "epoch": 0.2670324189526185, + "grad_norm": 0.6240381226929944, + "learning_rate": 4.2991784310210706e-05, + "loss": 0.9214, + "step": 5354 + }, + { + "epoch": 0.26708229426433916, + "grad_norm": 0.4995463849781814, + "learning_rate": 4.2988980118204705e-05, + "loss": 0.899, + "step": 5355 + }, + { + "epoch": 0.26713216957605984, + "grad_norm": 0.612723210500365, + "learning_rate": 4.2986175456784326e-05, + "loss": 0.9305, + "step": 5356 + }, + { + "epoch": 0.2671820448877805, + "grad_norm": 0.47501630870582506, + "learning_rate": 4.298337032602274e-05, + "loss": 0.9045, + "step": 5357 + }, + { + "epoch": 0.26723192019950126, + "grad_norm": 0.565563580401004, + "learning_rate": 4.298056472599317e-05, + "loss": 0.9312, + "step": 5358 + }, + { + "epoch": 0.26728179551122194, + "grad_norm": 0.36959704948788724, + "learning_rate": 4.2977758656768794e-05, + "loss": 0.9332, + "step": 5359 + }, + { + "epoch": 0.2673316708229426, + "grad_norm": 0.9705227677596551, + "learning_rate": 4.297495211842286e-05, + "loss": 0.9406, + "step": 5360 + }, + { + "epoch": 0.26738154613466336, + "grad_norm": 0.48076732270346306, + "learning_rate": 4.2972145111028595e-05, + "loss": 0.9356, + "step": 5361 + }, + { + "epoch": 0.26743142144638404, + "grad_norm": 0.4458177193729005, + "learning_rate": 4.296933763465924e-05, + "loss": 0.9812, + "step": 5362 + }, + { + "epoch": 0.2674812967581047, + "grad_norm": 0.5110755069386286, + "learning_rate": 4.296652968938807e-05, + "loss": 0.9158, + "step": 5363 + }, + { + "epoch": 0.26753117206982546, + "grad_norm": 0.7439444385840922, + "learning_rate": 4.296372127528834e-05, + "loss": 0.9607, + "step": 5364 + }, + { + "epoch": 0.26758104738154614, + "grad_norm": 0.5015695368497759, + "learning_rate": 4.296091239243335e-05, + "loss": 0.9281, + "step": 5365 + }, + { + "epoch": 0.2676309226932668, + "grad_norm": 0.48297246432422525, + "learning_rate": 4.295810304089638e-05, + "loss": 0.9619, + "step": 5366 + }, + { + "epoch": 0.2676807980049875, + "grad_norm": 1.0482822233067604, + "learning_rate": 4.295529322075076e-05, + "loss": 0.9727, + "step": 5367 + }, + { + "epoch": 0.26773067331670825, + "grad_norm": 0.4303290821967416, + "learning_rate": 4.295248293206979e-05, + "loss": 0.9236, + "step": 5368 + }, + { + "epoch": 0.2677805486284289, + "grad_norm": 0.5293856026988604, + "learning_rate": 4.2949672174926815e-05, + "loss": 0.9482, + "step": 5369 + }, + { + "epoch": 0.2678304239401496, + "grad_norm": 0.5043478795950811, + "learning_rate": 4.294686094939518e-05, + "loss": 0.925, + "step": 5370 + }, + { + "epoch": 0.26788029925187035, + "grad_norm": 0.44516886101885267, + "learning_rate": 4.294404925554824e-05, + "loss": 0.9439, + "step": 5371 + }, + { + "epoch": 0.26793017456359103, + "grad_norm": 0.535183736721789, + "learning_rate": 4.294123709345936e-05, + "loss": 0.9177, + "step": 5372 + }, + { + "epoch": 0.2679800498753117, + "grad_norm": 0.5594482887273247, + "learning_rate": 4.293842446320194e-05, + "loss": 0.9701, + "step": 5373 + }, + { + "epoch": 0.2680299251870324, + "grad_norm": 0.5288487968515156, + "learning_rate": 4.293561136484935e-05, + "loss": 0.9672, + "step": 5374 + }, + { + "epoch": 0.26807980049875313, + "grad_norm": 0.4771484565651411, + "learning_rate": 4.293279779847502e-05, + "loss": 0.939, + "step": 5375 + }, + { + "epoch": 0.2681296758104738, + "grad_norm": 0.4540303125828765, + "learning_rate": 4.292998376415235e-05, + "loss": 0.9282, + "step": 5376 + }, + { + "epoch": 0.2681795511221945, + "grad_norm": 0.43559094452420105, + "learning_rate": 4.292716926195478e-05, + "loss": 0.9829, + "step": 5377 + }, + { + "epoch": 0.26822942643391523, + "grad_norm": 0.5612991259095654, + "learning_rate": 4.292435429195574e-05, + "loss": 0.923, + "step": 5378 + }, + { + "epoch": 0.2682793017456359, + "grad_norm": 0.5289311928124446, + "learning_rate": 4.292153885422871e-05, + "loss": 0.927, + "step": 5379 + }, + { + "epoch": 0.2683291770573566, + "grad_norm": 0.5381858903978051, + "learning_rate": 4.291872294884714e-05, + "loss": 0.8988, + "step": 5380 + }, + { + "epoch": 0.26837905236907733, + "grad_norm": 0.5577633087201614, + "learning_rate": 4.291590657588451e-05, + "loss": 0.9176, + "step": 5381 + }, + { + "epoch": 0.268428927680798, + "grad_norm": 0.481093360882478, + "learning_rate": 4.291308973541432e-05, + "loss": 0.9207, + "step": 5382 + }, + { + "epoch": 0.2684788029925187, + "grad_norm": 0.4437764875604433, + "learning_rate": 4.291027242751007e-05, + "loss": 0.9154, + "step": 5383 + }, + { + "epoch": 0.2685286783042394, + "grad_norm": 0.5584840428425619, + "learning_rate": 4.2907454652245276e-05, + "loss": 0.9141, + "step": 5384 + }, + { + "epoch": 0.2685785536159601, + "grad_norm": 0.423663066389802, + "learning_rate": 4.2904636409693466e-05, + "loss": 0.8721, + "step": 5385 + }, + { + "epoch": 0.2686284289276808, + "grad_norm": 0.7402776456425697, + "learning_rate": 4.290181769992819e-05, + "loss": 0.9513, + "step": 5386 + }, + { + "epoch": 0.2686783042394015, + "grad_norm": 0.5480642218446828, + "learning_rate": 4.289899852302298e-05, + "loss": 0.8985, + "step": 5387 + }, + { + "epoch": 0.2687281795511222, + "grad_norm": 0.6594003305153573, + "learning_rate": 4.289617887905143e-05, + "loss": 0.9349, + "step": 5388 + }, + { + "epoch": 0.2687780548628429, + "grad_norm": 0.8532631688431512, + "learning_rate": 4.289335876808709e-05, + "loss": 0.9051, + "step": 5389 + }, + { + "epoch": 0.2688279301745636, + "grad_norm": 0.6103279636250255, + "learning_rate": 4.289053819020356e-05, + "loss": 0.9238, + "step": 5390 + }, + { + "epoch": 0.2688778054862843, + "grad_norm": 0.4085411686638068, + "learning_rate": 4.288771714547445e-05, + "loss": 0.9358, + "step": 5391 + }, + { + "epoch": 0.268927680798005, + "grad_norm": 1.1756069707246195, + "learning_rate": 4.288489563397336e-05, + "loss": 0.8933, + "step": 5392 + }, + { + "epoch": 0.2689775561097257, + "grad_norm": 0.5165936284642536, + "learning_rate": 4.2882073655773924e-05, + "loss": 0.9609, + "step": 5393 + }, + { + "epoch": 0.26902743142144636, + "grad_norm": 0.4441122190654574, + "learning_rate": 4.2879251210949775e-05, + "loss": 0.9063, + "step": 5394 + }, + { + "epoch": 0.2690773067331671, + "grad_norm": 0.45930035819920817, + "learning_rate": 4.287642829957458e-05, + "loss": 0.9072, + "step": 5395 + }, + { + "epoch": 0.2691271820448878, + "grad_norm": 0.6048369345196521, + "learning_rate": 4.2873604921721975e-05, + "loss": 0.8719, + "step": 5396 + }, + { + "epoch": 0.26917705735660846, + "grad_norm": 0.4514250535174757, + "learning_rate": 4.287078107746566e-05, + "loss": 0.9415, + "step": 5397 + }, + { + "epoch": 0.2692269326683292, + "grad_norm": 0.5119339508792199, + "learning_rate": 4.28679567668793e-05, + "loss": 0.9505, + "step": 5398 + }, + { + "epoch": 0.2692768079800499, + "grad_norm": 0.5010598843972814, + "learning_rate": 4.2865131990036614e-05, + "loss": 0.9055, + "step": 5399 + }, + { + "epoch": 0.26932668329177056, + "grad_norm": 0.8739033493418097, + "learning_rate": 4.286230674701131e-05, + "loss": 0.8854, + "step": 5400 + }, + { + "epoch": 0.26937655860349125, + "grad_norm": 0.4963172807882303, + "learning_rate": 4.285948103787709e-05, + "loss": 0.9341, + "step": 5401 + }, + { + "epoch": 0.269426433915212, + "grad_norm": 0.4319486880025429, + "learning_rate": 4.285665486270771e-05, + "loss": 0.9178, + "step": 5402 + }, + { + "epoch": 0.26947630922693266, + "grad_norm": 0.5451427623003273, + "learning_rate": 4.285382822157692e-05, + "loss": 0.9102, + "step": 5403 + }, + { + "epoch": 0.26952618453865335, + "grad_norm": 0.4027784625007193, + "learning_rate": 4.2851001114558465e-05, + "loss": 0.901, + "step": 5404 + }, + { + "epoch": 0.2695760598503741, + "grad_norm": 0.5632708813384876, + "learning_rate": 4.284817354172612e-05, + "loss": 0.9361, + "step": 5405 + }, + { + "epoch": 0.26962593516209477, + "grad_norm": 0.6416217048024191, + "learning_rate": 4.284534550315369e-05, + "loss": 0.9421, + "step": 5406 + }, + { + "epoch": 0.26967581047381545, + "grad_norm": 0.6246105525506862, + "learning_rate": 4.2842516998914946e-05, + "loss": 0.9325, + "step": 5407 + }, + { + "epoch": 0.2697256857855362, + "grad_norm": 0.5234128399896498, + "learning_rate": 4.28396880290837e-05, + "loss": 0.9165, + "step": 5408 + }, + { + "epoch": 0.26977556109725687, + "grad_norm": 0.689461879732886, + "learning_rate": 4.283685859373379e-05, + "loss": 0.9716, + "step": 5409 + }, + { + "epoch": 0.26982543640897755, + "grad_norm": 0.4337725346247556, + "learning_rate": 4.283402869293903e-05, + "loss": 0.9096, + "step": 5410 + }, + { + "epoch": 0.26987531172069823, + "grad_norm": 0.5011574709777575, + "learning_rate": 4.283119832677328e-05, + "loss": 0.9443, + "step": 5411 + }, + { + "epoch": 0.26992518703241897, + "grad_norm": 0.511685888990518, + "learning_rate": 4.282836749531039e-05, + "loss": 0.9726, + "step": 5412 + }, + { + "epoch": 0.26997506234413965, + "grad_norm": 0.39476642052160743, + "learning_rate": 4.282553619862423e-05, + "loss": 0.9197, + "step": 5413 + }, + { + "epoch": 0.27002493765586033, + "grad_norm": 0.5423512346452722, + "learning_rate": 4.282270443678868e-05, + "loss": 0.9418, + "step": 5414 + }, + { + "epoch": 0.27007481296758107, + "grad_norm": 0.497473872576333, + "learning_rate": 4.2819872209877624e-05, + "loss": 0.9457, + "step": 5415 + }, + { + "epoch": 0.27012468827930175, + "grad_norm": 0.46673585980928184, + "learning_rate": 4.2817039517964984e-05, + "loss": 0.9497, + "step": 5416 + }, + { + "epoch": 0.27017456359102243, + "grad_norm": 0.533435297980006, + "learning_rate": 4.281420636112467e-05, + "loss": 0.9224, + "step": 5417 + }, + { + "epoch": 0.27022443890274317, + "grad_norm": 0.47742681850873, + "learning_rate": 4.2811372739430616e-05, + "loss": 0.9438, + "step": 5418 + }, + { + "epoch": 0.27027431421446385, + "grad_norm": 0.4851949616518499, + "learning_rate": 4.280853865295676e-05, + "loss": 0.977, + "step": 5419 + }, + { + "epoch": 0.27032418952618453, + "grad_norm": 0.7612101631900708, + "learning_rate": 4.280570410177705e-05, + "loss": 0.921, + "step": 5420 + }, + { + "epoch": 0.2703740648379052, + "grad_norm": 0.5327640834873579, + "learning_rate": 4.280286908596548e-05, + "loss": 0.8684, + "step": 5421 + }, + { + "epoch": 0.27042394014962595, + "grad_norm": 0.4372572830235035, + "learning_rate": 4.2800033605596e-05, + "loss": 0.8977, + "step": 5422 + }, + { + "epoch": 0.27047381546134663, + "grad_norm": 0.45563144183457294, + "learning_rate": 4.27971976607426e-05, + "loss": 0.9349, + "step": 5423 + }, + { + "epoch": 0.2705236907730673, + "grad_norm": 1.0670148482889623, + "learning_rate": 4.27943612514793e-05, + "loss": 0.9566, + "step": 5424 + }, + { + "epoch": 0.27057356608478805, + "grad_norm": 0.47448789639033156, + "learning_rate": 4.279152437788011e-05, + "loss": 0.9197, + "step": 5425 + }, + { + "epoch": 0.27062344139650873, + "grad_norm": 0.4923600304865293, + "learning_rate": 4.278868704001905e-05, + "loss": 0.8867, + "step": 5426 + }, + { + "epoch": 0.2706733167082294, + "grad_norm": 0.4255751970559663, + "learning_rate": 4.2785849237970166e-05, + "loss": 0.8885, + "step": 5427 + }, + { + "epoch": 0.2707231920199501, + "grad_norm": 0.49681829476144096, + "learning_rate": 4.27830109718075e-05, + "loss": 0.9385, + "step": 5428 + }, + { + "epoch": 0.27077306733167084, + "grad_norm": 0.47293522908610824, + "learning_rate": 4.278017224160513e-05, + "loss": 0.9167, + "step": 5429 + }, + { + "epoch": 0.2708229426433915, + "grad_norm": 0.49541687379375204, + "learning_rate": 4.277733304743711e-05, + "loss": 0.9104, + "step": 5430 + }, + { + "epoch": 0.2708728179551122, + "grad_norm": 0.6107212397963788, + "learning_rate": 4.2774493389377545e-05, + "loss": 0.9414, + "step": 5431 + }, + { + "epoch": 0.27092269326683294, + "grad_norm": 0.4809098453405734, + "learning_rate": 4.2771653267500534e-05, + "loss": 0.9244, + "step": 5432 + }, + { + "epoch": 0.2709725685785536, + "grad_norm": 0.4522998751273774, + "learning_rate": 4.276881268188019e-05, + "loss": 0.9234, + "step": 5433 + }, + { + "epoch": 0.2710224438902743, + "grad_norm": 0.44559052766611923, + "learning_rate": 4.2765971632590616e-05, + "loss": 0.9388, + "step": 5434 + }, + { + "epoch": 0.27107231920199504, + "grad_norm": 0.517598372842975, + "learning_rate": 4.2763130119705975e-05, + "loss": 0.8739, + "step": 5435 + }, + { + "epoch": 0.2711221945137157, + "grad_norm": 0.44795945641846724, + "learning_rate": 4.2760288143300395e-05, + "loss": 0.9185, + "step": 5436 + }, + { + "epoch": 0.2711720698254364, + "grad_norm": 0.690452856140087, + "learning_rate": 4.275744570344805e-05, + "loss": 0.9347, + "step": 5437 + }, + { + "epoch": 0.2712219451371571, + "grad_norm": 0.4962244420973905, + "learning_rate": 4.275460280022311e-05, + "loss": 0.9392, + "step": 5438 + }, + { + "epoch": 0.2712718204488778, + "grad_norm": 0.5513708985554523, + "learning_rate": 4.275175943369975e-05, + "loss": 0.9278, + "step": 5439 + }, + { + "epoch": 0.2713216957605985, + "grad_norm": 0.49023454594996896, + "learning_rate": 4.274891560395217e-05, + "loss": 0.9229, + "step": 5440 + }, + { + "epoch": 0.2713715710723192, + "grad_norm": 3.1244546838950895, + "learning_rate": 4.274607131105459e-05, + "loss": 0.9611, + "step": 5441 + }, + { + "epoch": 0.2714214463840399, + "grad_norm": 0.7842199483973372, + "learning_rate": 4.2743226555081214e-05, + "loss": 0.9093, + "step": 5442 + }, + { + "epoch": 0.2714713216957606, + "grad_norm": 0.43758022843036526, + "learning_rate": 4.274038133610628e-05, + "loss": 0.9335, + "step": 5443 + }, + { + "epoch": 0.2715211970074813, + "grad_norm": 0.45038603782430714, + "learning_rate": 4.2737535654204045e-05, + "loss": 0.9362, + "step": 5444 + }, + { + "epoch": 0.271571072319202, + "grad_norm": 0.4611898675430214, + "learning_rate": 4.2734689509448745e-05, + "loss": 0.9173, + "step": 5445 + }, + { + "epoch": 0.2716209476309227, + "grad_norm": 0.5960542388488351, + "learning_rate": 4.2731842901914656e-05, + "loss": 0.9678, + "step": 5446 + }, + { + "epoch": 0.2716708229426434, + "grad_norm": 0.41617233044187785, + "learning_rate": 4.2728995831676074e-05, + "loss": 0.9091, + "step": 5447 + }, + { + "epoch": 0.27172069825436407, + "grad_norm": 0.5705838280338331, + "learning_rate": 4.272614829880728e-05, + "loss": 0.9092, + "step": 5448 + }, + { + "epoch": 0.2717705735660848, + "grad_norm": 0.43596317659698075, + "learning_rate": 4.272330030338257e-05, + "loss": 0.9149, + "step": 5449 + }, + { + "epoch": 0.2718204488778055, + "grad_norm": 0.4752271312973879, + "learning_rate": 4.2720451845476275e-05, + "loss": 0.8637, + "step": 5450 + }, + { + "epoch": 0.27187032418952617, + "grad_norm": 0.4689394399632667, + "learning_rate": 4.271760292516272e-05, + "loss": 0.8783, + "step": 5451 + }, + { + "epoch": 0.2719201995012469, + "grad_norm": 0.5023142231678426, + "learning_rate": 4.2714753542516245e-05, + "loss": 0.902, + "step": 5452 + }, + { + "epoch": 0.2719700748129676, + "grad_norm": 0.5038931988129479, + "learning_rate": 4.271190369761121e-05, + "loss": 0.958, + "step": 5453 + }, + { + "epoch": 0.27201995012468827, + "grad_norm": 0.5861579918320639, + "learning_rate": 4.2709053390521964e-05, + "loss": 0.9112, + "step": 5454 + }, + { + "epoch": 0.27206982543640895, + "grad_norm": 0.5022567461142359, + "learning_rate": 4.27062026213229e-05, + "loss": 0.8926, + "step": 5455 + }, + { + "epoch": 0.2721197007481297, + "grad_norm": 0.3871215203121197, + "learning_rate": 4.270335139008841e-05, + "loss": 0.9426, + "step": 5456 + }, + { + "epoch": 0.27216957605985037, + "grad_norm": 0.4795751290298321, + "learning_rate": 4.2700499696892874e-05, + "loss": 0.9251, + "step": 5457 + }, + { + "epoch": 0.27221945137157105, + "grad_norm": 0.5012938740383998, + "learning_rate": 4.2697647541810735e-05, + "loss": 0.9206, + "step": 5458 + }, + { + "epoch": 0.2722693266832918, + "grad_norm": 0.48191252432851694, + "learning_rate": 4.2694794924916394e-05, + "loss": 0.9339, + "step": 5459 + }, + { + "epoch": 0.27231920199501247, + "grad_norm": 0.396101080872675, + "learning_rate": 4.26919418462843e-05, + "loss": 0.9174, + "step": 5460 + }, + { + "epoch": 0.27236907730673315, + "grad_norm": 0.6134183918457412, + "learning_rate": 4.268908830598891e-05, + "loss": 0.9218, + "step": 5461 + }, + { + "epoch": 0.2724189526184539, + "grad_norm": 0.47926911500419106, + "learning_rate": 4.268623430410467e-05, + "loss": 0.9454, + "step": 5462 + }, + { + "epoch": 0.2724688279301746, + "grad_norm": 0.43520818031554176, + "learning_rate": 4.268337984070606e-05, + "loss": 0.9254, + "step": 5463 + }, + { + "epoch": 0.27251870324189525, + "grad_norm": 0.7351707012684526, + "learning_rate": 4.268052491586757e-05, + "loss": 0.9297, + "step": 5464 + }, + { + "epoch": 0.27256857855361594, + "grad_norm": 0.4364922383356326, + "learning_rate": 4.267766952966369e-05, + "loss": 0.9272, + "step": 5465 + }, + { + "epoch": 0.2726184538653367, + "grad_norm": 0.39576893777498273, + "learning_rate": 4.2674813682168944e-05, + "loss": 0.9027, + "step": 5466 + }, + { + "epoch": 0.27266832917705736, + "grad_norm": 0.48971554199495826, + "learning_rate": 4.267195737345784e-05, + "loss": 0.9439, + "step": 5467 + }, + { + "epoch": 0.27271820448877804, + "grad_norm": 0.4259480669337155, + "learning_rate": 4.2669100603604915e-05, + "loss": 0.8923, + "step": 5468 + }, + { + "epoch": 0.2727680798004988, + "grad_norm": 0.518882983402975, + "learning_rate": 4.2666243372684725e-05, + "loss": 0.9185, + "step": 5469 + }, + { + "epoch": 0.27281795511221946, + "grad_norm": 0.5060438944560254, + "learning_rate": 4.266338568077181e-05, + "loss": 0.9417, + "step": 5470 + }, + { + "epoch": 0.27286783042394014, + "grad_norm": 0.6491562851775031, + "learning_rate": 4.266052752794075e-05, + "loss": 0.9391, + "step": 5471 + }, + { + "epoch": 0.2729177057356608, + "grad_norm": 0.4311995858787614, + "learning_rate": 4.265766891426614e-05, + "loss": 0.9447, + "step": 5472 + }, + { + "epoch": 0.27296758104738156, + "grad_norm": 0.6694603405941577, + "learning_rate": 4.265480983982255e-05, + "loss": 0.9469, + "step": 5473 + }, + { + "epoch": 0.27301745635910224, + "grad_norm": 0.4493725842038669, + "learning_rate": 4.265195030468461e-05, + "loss": 0.9522, + "step": 5474 + }, + { + "epoch": 0.2730673316708229, + "grad_norm": 0.4880048984278788, + "learning_rate": 4.264909030892692e-05, + "loss": 0.8919, + "step": 5475 + }, + { + "epoch": 0.27311720698254366, + "grad_norm": 0.584942536093304, + "learning_rate": 4.264622985262412e-05, + "loss": 0.9241, + "step": 5476 + }, + { + "epoch": 0.27316708229426434, + "grad_norm": 0.4942408815874274, + "learning_rate": 4.264336893585085e-05, + "loss": 0.9462, + "step": 5477 + }, + { + "epoch": 0.273216957605985, + "grad_norm": 0.4208759084970573, + "learning_rate": 4.264050755868176e-05, + "loss": 0.9149, + "step": 5478 + }, + { + "epoch": 0.27326683291770576, + "grad_norm": 0.4039260036732146, + "learning_rate": 4.263764572119152e-05, + "loss": 0.941, + "step": 5479 + }, + { + "epoch": 0.27331670822942644, + "grad_norm": 0.47805230662083276, + "learning_rate": 4.263478342345482e-05, + "loss": 0.8899, + "step": 5480 + }, + { + "epoch": 0.2733665835411471, + "grad_norm": 0.4904404244726453, + "learning_rate": 4.263192066554632e-05, + "loss": 0.9301, + "step": 5481 + }, + { + "epoch": 0.2734164588528678, + "grad_norm": 0.40636413414716827, + "learning_rate": 4.262905744754076e-05, + "loss": 0.8352, + "step": 5482 + }, + { + "epoch": 0.27346633416458854, + "grad_norm": 0.40130002422718963, + "learning_rate": 4.262619376951282e-05, + "loss": 0.9422, + "step": 5483 + }, + { + "epoch": 0.2735162094763092, + "grad_norm": 0.47594241651922115, + "learning_rate": 4.2623329631537244e-05, + "loss": 0.9318, + "step": 5484 + }, + { + "epoch": 0.2735660847880299, + "grad_norm": 0.46062403337342966, + "learning_rate": 4.262046503368878e-05, + "loss": 0.9538, + "step": 5485 + }, + { + "epoch": 0.27361596009975064, + "grad_norm": 0.39374033616848075, + "learning_rate": 4.261759997604216e-05, + "loss": 0.8904, + "step": 5486 + }, + { + "epoch": 0.2736658354114713, + "grad_norm": 0.4283446214509043, + "learning_rate": 4.261473445867215e-05, + "loss": 0.9568, + "step": 5487 + }, + { + "epoch": 0.273715710723192, + "grad_norm": 0.4455766447889426, + "learning_rate": 4.2611868481653525e-05, + "loss": 0.902, + "step": 5488 + }, + { + "epoch": 0.27376558603491274, + "grad_norm": 0.5988362126984561, + "learning_rate": 4.2609002045061086e-05, + "loss": 0.9164, + "step": 5489 + }, + { + "epoch": 0.2738154613466334, + "grad_norm": 0.4368004106804469, + "learning_rate": 4.260613514896962e-05, + "loss": 0.9107, + "step": 5490 + }, + { + "epoch": 0.2738653366583541, + "grad_norm": 0.47972928472976084, + "learning_rate": 4.260326779345393e-05, + "loss": 0.9452, + "step": 5491 + }, + { + "epoch": 0.2739152119700748, + "grad_norm": 0.5549801344427154, + "learning_rate": 4.2600399978588836e-05, + "loss": 0.9649, + "step": 5492 + }, + { + "epoch": 0.2739650872817955, + "grad_norm": 0.43972350867143795, + "learning_rate": 4.259753170444919e-05, + "loss": 0.9534, + "step": 5493 + }, + { + "epoch": 0.2740149625935162, + "grad_norm": 0.411829795004243, + "learning_rate": 4.259466297110983e-05, + "loss": 0.8812, + "step": 5494 + }, + { + "epoch": 0.2740648379052369, + "grad_norm": 0.511089064215357, + "learning_rate": 4.2591793778645616e-05, + "loss": 0.8852, + "step": 5495 + }, + { + "epoch": 0.27411471321695763, + "grad_norm": 0.41758959235992443, + "learning_rate": 4.258892412713141e-05, + "loss": 0.9606, + "step": 5496 + }, + { + "epoch": 0.2741645885286783, + "grad_norm": 0.5302715802718551, + "learning_rate": 4.2586054016642107e-05, + "loss": 0.908, + "step": 5497 + }, + { + "epoch": 0.274214463840399, + "grad_norm": 0.4659240936604786, + "learning_rate": 4.2583183447252596e-05, + "loss": 0.9357, + "step": 5498 + }, + { + "epoch": 0.2742643391521197, + "grad_norm": 0.453431633190183, + "learning_rate": 4.258031241903778e-05, + "loss": 0.9082, + "step": 5499 + }, + { + "epoch": 0.2743142144638404, + "grad_norm": 0.4076512732569617, + "learning_rate": 4.2577440932072574e-05, + "loss": 0.9159, + "step": 5500 + }, + { + "epoch": 0.2743640897755611, + "grad_norm": 0.409325144454776, + "learning_rate": 4.257456898643192e-05, + "loss": 0.9085, + "step": 5501 + }, + { + "epoch": 0.2744139650872818, + "grad_norm": 0.461734112029129, + "learning_rate": 4.2571696582190756e-05, + "loss": 0.8992, + "step": 5502 + }, + { + "epoch": 0.2744638403990025, + "grad_norm": 0.5130294740783957, + "learning_rate": 4.256882371942403e-05, + "loss": 0.9545, + "step": 5503 + }, + { + "epoch": 0.2745137157107232, + "grad_norm": 0.5335191312259855, + "learning_rate": 4.256595039820671e-05, + "loss": 0.9524, + "step": 5504 + }, + { + "epoch": 0.2745635910224439, + "grad_norm": 0.3676388265168682, + "learning_rate": 4.2563076618613776e-05, + "loss": 0.8833, + "step": 5505 + }, + { + "epoch": 0.2746134663341646, + "grad_norm": 0.4707243699614275, + "learning_rate": 4.2560202380720214e-05, + "loss": 0.9136, + "step": 5506 + }, + { + "epoch": 0.2746633416458853, + "grad_norm": 0.47512041194737553, + "learning_rate": 4.2557327684601044e-05, + "loss": 0.9248, + "step": 5507 + }, + { + "epoch": 0.274713216957606, + "grad_norm": 0.43516258778469835, + "learning_rate": 4.2554452530331254e-05, + "loss": 0.92, + "step": 5508 + }, + { + "epoch": 0.27476309226932666, + "grad_norm": 0.5148784231605822, + "learning_rate": 4.2551576917985884e-05, + "loss": 0.9639, + "step": 5509 + }, + { + "epoch": 0.2748129675810474, + "grad_norm": 0.4777734198705904, + "learning_rate": 4.2548700847639966e-05, + "loss": 0.9554, + "step": 5510 + }, + { + "epoch": 0.2748628428927681, + "grad_norm": 0.44290566988449226, + "learning_rate": 4.254582431936856e-05, + "loss": 0.9655, + "step": 5511 + }, + { + "epoch": 0.27491271820448876, + "grad_norm": 0.3763567094938602, + "learning_rate": 4.254294733324672e-05, + "loss": 0.9097, + "step": 5512 + }, + { + "epoch": 0.2749625935162095, + "grad_norm": 0.9123137589889069, + "learning_rate": 4.254006988934952e-05, + "loss": 0.9566, + "step": 5513 + }, + { + "epoch": 0.2750124688279302, + "grad_norm": 0.46949132311317854, + "learning_rate": 4.253719198775204e-05, + "loss": 0.9595, + "step": 5514 + }, + { + "epoch": 0.27506234413965086, + "grad_norm": 0.4440659111140316, + "learning_rate": 4.2534313628529386e-05, + "loss": 0.9112, + "step": 5515 + }, + { + "epoch": 0.2751122194513716, + "grad_norm": 0.4221400118169857, + "learning_rate": 4.253143481175667e-05, + "loss": 0.9084, + "step": 5516 + }, + { + "epoch": 0.2751620947630923, + "grad_norm": 0.5727860406719438, + "learning_rate": 4.252855553750901e-05, + "loss": 0.8968, + "step": 5517 + }, + { + "epoch": 0.27521197007481296, + "grad_norm": 1.670683386197059, + "learning_rate": 4.252567580586153e-05, + "loss": 0.9437, + "step": 5518 + }, + { + "epoch": 0.27526184538653364, + "grad_norm": 0.41251365887930547, + "learning_rate": 4.252279561688939e-05, + "loss": 0.9551, + "step": 5519 + }, + { + "epoch": 0.2753117206982544, + "grad_norm": 0.45064877500834144, + "learning_rate": 4.251991497066774e-05, + "loss": 0.9045, + "step": 5520 + }, + { + "epoch": 0.27536159600997506, + "grad_norm": 0.6256162755059915, + "learning_rate": 4.251703386727174e-05, + "loss": 0.8997, + "step": 5521 + }, + { + "epoch": 0.27541147132169574, + "grad_norm": 0.5716302462514669, + "learning_rate": 4.2514152306776586e-05, + "loss": 0.9542, + "step": 5522 + }, + { + "epoch": 0.2754613466334165, + "grad_norm": 0.4200743919101249, + "learning_rate": 4.251127028925747e-05, + "loss": 0.9099, + "step": 5523 + }, + { + "epoch": 0.27551122194513716, + "grad_norm": 0.4533735184669952, + "learning_rate": 4.2508387814789593e-05, + "loss": 0.8926, + "step": 5524 + }, + { + "epoch": 0.27556109725685785, + "grad_norm": 0.5007503076039826, + "learning_rate": 4.2505504883448174e-05, + "loss": 0.8748, + "step": 5525 + }, + { + "epoch": 0.2756109725685785, + "grad_norm": 0.48293866520736406, + "learning_rate": 4.250262149530843e-05, + "loss": 0.8956, + "step": 5526 + }, + { + "epoch": 0.27566084788029926, + "grad_norm": 0.43064810919421936, + "learning_rate": 4.2499737650445616e-05, + "loss": 0.9337, + "step": 5527 + }, + { + "epoch": 0.27571072319201995, + "grad_norm": 0.4656296527319288, + "learning_rate": 4.2496853348934987e-05, + "loss": 0.9063, + "step": 5528 + }, + { + "epoch": 0.27576059850374063, + "grad_norm": 0.5186339946178208, + "learning_rate": 4.2493968590851785e-05, + "loss": 0.9482, + "step": 5529 + }, + { + "epoch": 0.27581047381546137, + "grad_norm": 0.4826810525739992, + "learning_rate": 4.249108337627131e-05, + "loss": 0.9346, + "step": 5530 + }, + { + "epoch": 0.27586034912718205, + "grad_norm": 0.43865066443177003, + "learning_rate": 4.248819770526885e-05, + "loss": 0.9494, + "step": 5531 + }, + { + "epoch": 0.27591022443890273, + "grad_norm": 0.5289546210965479, + "learning_rate": 4.2485311577919685e-05, + "loss": 0.931, + "step": 5532 + }, + { + "epoch": 0.27596009975062347, + "grad_norm": 0.3758348639592463, + "learning_rate": 4.248242499429915e-05, + "loss": 0.9133, + "step": 5533 + }, + { + "epoch": 0.27600997506234415, + "grad_norm": 0.42212105303471237, + "learning_rate": 4.247953795448255e-05, + "loss": 0.8949, + "step": 5534 + }, + { + "epoch": 0.27605985037406483, + "grad_norm": 0.39229834445517164, + "learning_rate": 4.247665045854523e-05, + "loss": 0.9305, + "step": 5535 + }, + { + "epoch": 0.2761097256857855, + "grad_norm": 0.47053156973627797, + "learning_rate": 4.247376250656254e-05, + "loss": 0.9162, + "step": 5536 + }, + { + "epoch": 0.27615960099750625, + "grad_norm": 0.4321205921403977, + "learning_rate": 4.247087409860985e-05, + "loss": 0.9275, + "step": 5537 + }, + { + "epoch": 0.27620947630922693, + "grad_norm": 0.4270172001825791, + "learning_rate": 4.2467985234762506e-05, + "loss": 0.9526, + "step": 5538 + }, + { + "epoch": 0.2762593516209476, + "grad_norm": 0.47428339090342697, + "learning_rate": 4.24650959150959e-05, + "loss": 0.949, + "step": 5539 + }, + { + "epoch": 0.27630922693266835, + "grad_norm": 0.4357123393393775, + "learning_rate": 4.246220613968545e-05, + "loss": 0.9179, + "step": 5540 + }, + { + "epoch": 0.27635910224438903, + "grad_norm": 0.4611212572854002, + "learning_rate": 4.245931590860653e-05, + "loss": 0.9531, + "step": 5541 + }, + { + "epoch": 0.2764089775561097, + "grad_norm": 0.39406437399490063, + "learning_rate": 4.245642522193458e-05, + "loss": 0.9127, + "step": 5542 + }, + { + "epoch": 0.27645885286783045, + "grad_norm": 0.3900892986734937, + "learning_rate": 4.245353407974503e-05, + "loss": 0.9266, + "step": 5543 + }, + { + "epoch": 0.27650872817955113, + "grad_norm": 0.4247827173376918, + "learning_rate": 4.2450642482113316e-05, + "loss": 0.9061, + "step": 5544 + }, + { + "epoch": 0.2765586034912718, + "grad_norm": 0.7979931160441779, + "learning_rate": 4.2447750429114894e-05, + "loss": 0.9396, + "step": 5545 + }, + { + "epoch": 0.2766084788029925, + "grad_norm": 0.47986965334824194, + "learning_rate": 4.2444857920825245e-05, + "loss": 0.9191, + "step": 5546 + }, + { + "epoch": 0.27665835411471323, + "grad_norm": 0.5102274013492047, + "learning_rate": 4.244196495731982e-05, + "loss": 0.9223, + "step": 5547 + }, + { + "epoch": 0.2767082294264339, + "grad_norm": 0.5123227375291847, + "learning_rate": 4.243907153867413e-05, + "loss": 0.8813, + "step": 5548 + }, + { + "epoch": 0.2767581047381546, + "grad_norm": 0.9599453109622055, + "learning_rate": 4.2436177664963685e-05, + "loss": 0.9005, + "step": 5549 + }, + { + "epoch": 0.27680798004987534, + "grad_norm": 0.4306719275961774, + "learning_rate": 4.2433283336263975e-05, + "loss": 0.9441, + "step": 5550 + }, + { + "epoch": 0.276857855361596, + "grad_norm": 0.3936848742842468, + "learning_rate": 4.243038855265054e-05, + "loss": 0.8853, + "step": 5551 + }, + { + "epoch": 0.2769077306733167, + "grad_norm": 0.4218156711657773, + "learning_rate": 4.242749331419892e-05, + "loss": 0.9005, + "step": 5552 + }, + { + "epoch": 0.2769576059850374, + "grad_norm": 0.4008896400840737, + "learning_rate": 4.242459762098466e-05, + "loss": 0.901, + "step": 5553 + }, + { + "epoch": 0.2770074812967581, + "grad_norm": 0.4718452714539576, + "learning_rate": 4.242170147308332e-05, + "loss": 0.9381, + "step": 5554 + }, + { + "epoch": 0.2770573566084788, + "grad_norm": 0.47628803508481227, + "learning_rate": 4.241880487057048e-05, + "loss": 0.929, + "step": 5555 + }, + { + "epoch": 0.2771072319201995, + "grad_norm": 0.4942257187213819, + "learning_rate": 4.2415907813521716e-05, + "loss": 0.9441, + "step": 5556 + }, + { + "epoch": 0.2771571072319202, + "grad_norm": 0.4073710181186397, + "learning_rate": 4.241301030201264e-05, + "loss": 0.9554, + "step": 5557 + }, + { + "epoch": 0.2772069825436409, + "grad_norm": 0.3738306062295323, + "learning_rate": 4.241011233611885e-05, + "loss": 0.8733, + "step": 5558 + }, + { + "epoch": 0.2772568578553616, + "grad_norm": 0.46556930281335474, + "learning_rate": 4.2407213915915976e-05, + "loss": 0.9566, + "step": 5559 + }, + { + "epoch": 0.2773067331670823, + "grad_norm": 0.42999541050895485, + "learning_rate": 4.240431504147964e-05, + "loss": 0.9204, + "step": 5560 + }, + { + "epoch": 0.277356608478803, + "grad_norm": 0.360781666427513, + "learning_rate": 4.24014157128855e-05, + "loss": 0.926, + "step": 5561 + }, + { + "epoch": 0.2774064837905237, + "grad_norm": 0.429400814020898, + "learning_rate": 4.2398515930209196e-05, + "loss": 0.9715, + "step": 5562 + }, + { + "epoch": 0.27745635910224437, + "grad_norm": 1.1136429237308076, + "learning_rate": 4.2395615693526405e-05, + "loss": 0.9149, + "step": 5563 + }, + { + "epoch": 0.2775062344139651, + "grad_norm": 0.5609529044753799, + "learning_rate": 4.239271500291281e-05, + "loss": 0.913, + "step": 5564 + }, + { + "epoch": 0.2775561097256858, + "grad_norm": 0.5850278546931315, + "learning_rate": 4.2389813858444095e-05, + "loss": 0.9586, + "step": 5565 + }, + { + "epoch": 0.27760598503740647, + "grad_norm": 0.677985370649895, + "learning_rate": 4.238691226019598e-05, + "loss": 0.9086, + "step": 5566 + }, + { + "epoch": 0.2776558603491272, + "grad_norm": 0.5110854321130051, + "learning_rate": 4.238401020824416e-05, + "loss": 0.985, + "step": 5567 + }, + { + "epoch": 0.2777057356608479, + "grad_norm": 0.5211737277335154, + "learning_rate": 4.2381107702664386e-05, + "loss": 0.903, + "step": 5568 + }, + { + "epoch": 0.27775561097256857, + "grad_norm": 0.4205816461143999, + "learning_rate": 4.2378204743532377e-05, + "loss": 0.9118, + "step": 5569 + }, + { + "epoch": 0.27780548628428925, + "grad_norm": 0.5155952394328578, + "learning_rate": 4.23753013309239e-05, + "loss": 0.9108, + "step": 5570 + }, + { + "epoch": 0.27785536159601, + "grad_norm": 0.4207963569645503, + "learning_rate": 4.237239746491471e-05, + "loss": 0.938, + "step": 5571 + }, + { + "epoch": 0.27790523690773067, + "grad_norm": 0.47348465903835335, + "learning_rate": 4.236949314558057e-05, + "loss": 0.9327, + "step": 5572 + }, + { + "epoch": 0.27795511221945135, + "grad_norm": 0.4023003725759101, + "learning_rate": 4.2366588372997297e-05, + "loss": 0.9509, + "step": 5573 + }, + { + "epoch": 0.2780049875311721, + "grad_norm": 0.4092287738834331, + "learning_rate": 4.236368314724066e-05, + "loss": 0.9252, + "step": 5574 + }, + { + "epoch": 0.27805486284289277, + "grad_norm": 0.43865756550518087, + "learning_rate": 4.236077746838649e-05, + "loss": 0.9635, + "step": 5575 + }, + { + "epoch": 0.27810473815461345, + "grad_norm": 0.4015347927077507, + "learning_rate": 4.2357871336510595e-05, + "loss": 0.938, + "step": 5576 + }, + { + "epoch": 0.2781546134663342, + "grad_norm": 0.4584562621411253, + "learning_rate": 4.235496475168882e-05, + "loss": 0.9169, + "step": 5577 + }, + { + "epoch": 0.27820448877805487, + "grad_norm": 0.49441325752480736, + "learning_rate": 4.2352057713997004e-05, + "loss": 0.8937, + "step": 5578 + }, + { + "epoch": 0.27825436408977555, + "grad_norm": 0.4097709476271355, + "learning_rate": 4.2349150223511016e-05, + "loss": 0.9397, + "step": 5579 + }, + { + "epoch": 0.27830423940149623, + "grad_norm": 0.4629678235216637, + "learning_rate": 4.2346242280306715e-05, + "loss": 0.946, + "step": 5580 + }, + { + "epoch": 0.27835411471321697, + "grad_norm": 0.4017085816100026, + "learning_rate": 4.234333388445998e-05, + "loss": 0.9661, + "step": 5581 + }, + { + "epoch": 0.27840399002493765, + "grad_norm": 0.4955229546807455, + "learning_rate": 4.234042503604671e-05, + "loss": 0.9233, + "step": 5582 + }, + { + "epoch": 0.27845386533665833, + "grad_norm": 0.41633797576945497, + "learning_rate": 4.233751573514281e-05, + "loss": 0.8942, + "step": 5583 + }, + { + "epoch": 0.2785037406483791, + "grad_norm": 0.42364585102682634, + "learning_rate": 4.2334605981824193e-05, + "loss": 0.8936, + "step": 5584 + }, + { + "epoch": 0.27855361596009975, + "grad_norm": 0.4946615790593248, + "learning_rate": 4.2331695776166796e-05, + "loss": 0.9139, + "step": 5585 + }, + { + "epoch": 0.27860349127182044, + "grad_norm": 0.39914138917085573, + "learning_rate": 4.232878511824655e-05, + "loss": 0.915, + "step": 5586 + }, + { + "epoch": 0.2786533665835412, + "grad_norm": 0.39033020487372294, + "learning_rate": 4.232587400813942e-05, + "loss": 0.9399, + "step": 5587 + }, + { + "epoch": 0.27870324189526186, + "grad_norm": 0.4997554623357988, + "learning_rate": 4.232296244592135e-05, + "loss": 0.9885, + "step": 5588 + }, + { + "epoch": 0.27875311720698254, + "grad_norm": 0.4237856751777778, + "learning_rate": 4.232005043166834e-05, + "loss": 0.9245, + "step": 5589 + }, + { + "epoch": 0.2788029925187032, + "grad_norm": 0.4999553410272211, + "learning_rate": 4.2317137965456356e-05, + "loss": 0.9561, + "step": 5590 + }, + { + "epoch": 0.27885286783042396, + "grad_norm": 0.41890639577749794, + "learning_rate": 4.231422504736141e-05, + "loss": 0.9107, + "step": 5591 + }, + { + "epoch": 0.27890274314214464, + "grad_norm": 0.5158636415965684, + "learning_rate": 4.231131167745951e-05, + "loss": 0.9428, + "step": 5592 + }, + { + "epoch": 0.2789526184538653, + "grad_norm": 0.5221152466508214, + "learning_rate": 4.2308397855826675e-05, + "loss": 0.9629, + "step": 5593 + }, + { + "epoch": 0.27900249376558606, + "grad_norm": 0.45057741272121454, + "learning_rate": 4.230548358253895e-05, + "loss": 0.9266, + "step": 5594 + }, + { + "epoch": 0.27905236907730674, + "grad_norm": 0.5135889757406892, + "learning_rate": 4.2302568857672375e-05, + "loss": 0.8742, + "step": 5595 + }, + { + "epoch": 0.2791022443890274, + "grad_norm": 0.44590962308027354, + "learning_rate": 4.229965368130301e-05, + "loss": 0.9288, + "step": 5596 + }, + { + "epoch": 0.2791521197007481, + "grad_norm": 0.4489471326899853, + "learning_rate": 4.2296738053506924e-05, + "loss": 0.8995, + "step": 5597 + }, + { + "epoch": 0.27920199501246884, + "grad_norm": 0.397060301476908, + "learning_rate": 4.2293821974360196e-05, + "loss": 0.947, + "step": 5598 + }, + { + "epoch": 0.2792518703241895, + "grad_norm": 0.5774627887898697, + "learning_rate": 4.229090544393893e-05, + "loss": 0.8792, + "step": 5599 + }, + { + "epoch": 0.2793017456359102, + "grad_norm": 0.4419418586903851, + "learning_rate": 4.2287988462319215e-05, + "loss": 0.9409, + "step": 5600 + }, + { + "epoch": 0.27935162094763094, + "grad_norm": 0.44190163962267914, + "learning_rate": 4.228507102957719e-05, + "loss": 0.9575, + "step": 5601 + }, + { + "epoch": 0.2794014962593516, + "grad_norm": 0.5073526394622664, + "learning_rate": 4.2282153145788964e-05, + "loss": 0.9695, + "step": 5602 + }, + { + "epoch": 0.2794513715710723, + "grad_norm": 0.7223403811663136, + "learning_rate": 4.2279234811030686e-05, + "loss": 0.9571, + "step": 5603 + }, + { + "epoch": 0.27950124688279304, + "grad_norm": 0.46475296671223215, + "learning_rate": 4.2276316025378514e-05, + "loss": 0.92, + "step": 5604 + }, + { + "epoch": 0.2795511221945137, + "grad_norm": 0.4016150354189802, + "learning_rate": 4.22733967889086e-05, + "loss": 0.941, + "step": 5605 + }, + { + "epoch": 0.2796009975062344, + "grad_norm": 0.47234354741811646, + "learning_rate": 4.227047710169714e-05, + "loss": 0.9309, + "step": 5606 + }, + { + "epoch": 0.2796508728179551, + "grad_norm": 0.5387607752878381, + "learning_rate": 4.22675569638203e-05, + "loss": 0.8893, + "step": 5607 + }, + { + "epoch": 0.2797007481296758, + "grad_norm": 0.42489215797437824, + "learning_rate": 4.226463637535429e-05, + "loss": 0.8844, + "step": 5608 + }, + { + "epoch": 0.2797506234413965, + "grad_norm": 0.5918423564648071, + "learning_rate": 4.226171533637532e-05, + "loss": 0.9744, + "step": 5609 + }, + { + "epoch": 0.2798004987531172, + "grad_norm": 0.3843258526770381, + "learning_rate": 4.225879384695962e-05, + "loss": 0.8976, + "step": 5610 + }, + { + "epoch": 0.2798503740648379, + "grad_norm": 0.4470107864912962, + "learning_rate": 4.225587190718341e-05, + "loss": 0.8687, + "step": 5611 + }, + { + "epoch": 0.2799002493765586, + "grad_norm": 0.5108490725538571, + "learning_rate": 4.225294951712294e-05, + "loss": 0.8962, + "step": 5612 + }, + { + "epoch": 0.2799501246882793, + "grad_norm": 0.49581569444882645, + "learning_rate": 4.225002667685448e-05, + "loss": 0.9012, + "step": 5613 + }, + { + "epoch": 0.28, + "grad_norm": 0.45984275147864234, + "learning_rate": 4.22471033864543e-05, + "loss": 0.8829, + "step": 5614 + }, + { + "epoch": 0.2800498753117207, + "grad_norm": 0.4660453888190922, + "learning_rate": 4.224417964599867e-05, + "loss": 0.9312, + "step": 5615 + }, + { + "epoch": 0.2800997506234414, + "grad_norm": 0.4550062487559833, + "learning_rate": 4.224125545556389e-05, + "loss": 0.9164, + "step": 5616 + }, + { + "epoch": 0.28014962593516207, + "grad_norm": 0.46070099204298615, + "learning_rate": 4.223833081522627e-05, + "loss": 0.9037, + "step": 5617 + }, + { + "epoch": 0.2801995012468828, + "grad_norm": 0.9912401226806187, + "learning_rate": 4.223540572506211e-05, + "loss": 0.8987, + "step": 5618 + }, + { + "epoch": 0.2802493765586035, + "grad_norm": 0.4984347359427463, + "learning_rate": 4.223248018514777e-05, + "loss": 0.9072, + "step": 5619 + }, + { + "epoch": 0.2802992518703242, + "grad_norm": 2.3358074208036887, + "learning_rate": 4.222955419555956e-05, + "loss": 0.9511, + "step": 5620 + }, + { + "epoch": 0.2803491271820449, + "grad_norm": 0.46656845801580116, + "learning_rate": 4.2226627756373846e-05, + "loss": 0.9564, + "step": 5621 + }, + { + "epoch": 0.2803990024937656, + "grad_norm": 0.47333915522565057, + "learning_rate": 4.2223700867666995e-05, + "loss": 0.9399, + "step": 5622 + }, + { + "epoch": 0.2804488778054863, + "grad_norm": 0.4213071837207147, + "learning_rate": 4.222077352951537e-05, + "loss": 0.9606, + "step": 5623 + }, + { + "epoch": 0.28049875311720696, + "grad_norm": 0.3826639748220666, + "learning_rate": 4.2217845741995364e-05, + "loss": 0.9433, + "step": 5624 + }, + { + "epoch": 0.2805486284289277, + "grad_norm": 0.572495032553582, + "learning_rate": 4.2214917505183394e-05, + "loss": 0.9388, + "step": 5625 + }, + { + "epoch": 0.2805985037406484, + "grad_norm": 0.47620657337527234, + "learning_rate": 4.221198881915585e-05, + "loss": 0.9126, + "step": 5626 + }, + { + "epoch": 0.28064837905236906, + "grad_norm": 0.45377814239120984, + "learning_rate": 4.220905968398915e-05, + "loss": 0.9334, + "step": 5627 + }, + { + "epoch": 0.2806982543640898, + "grad_norm": 0.43919780209486725, + "learning_rate": 4.2206130099759744e-05, + "loss": 0.9326, + "step": 5628 + }, + { + "epoch": 0.2807481296758105, + "grad_norm": 0.43811243854141807, + "learning_rate": 4.220320006654408e-05, + "loss": 0.9101, + "step": 5629 + }, + { + "epoch": 0.28079800498753116, + "grad_norm": 0.5324906319141115, + "learning_rate": 4.220026958441861e-05, + "loss": 0.9081, + "step": 5630 + }, + { + "epoch": 0.2808478802992519, + "grad_norm": 0.5274726774225307, + "learning_rate": 4.21973386534598e-05, + "loss": 0.9572, + "step": 5631 + }, + { + "epoch": 0.2808977556109726, + "grad_norm": 0.8639155414302087, + "learning_rate": 4.219440727374413e-05, + "loss": 0.9435, + "step": 5632 + }, + { + "epoch": 0.28094763092269326, + "grad_norm": 0.48402319559253365, + "learning_rate": 4.2191475445348107e-05, + "loss": 0.936, + "step": 5633 + }, + { + "epoch": 0.28099750623441394, + "grad_norm": 0.4644201322072369, + "learning_rate": 4.2188543168348214e-05, + "loss": 0.93, + "step": 5634 + }, + { + "epoch": 0.2810473815461347, + "grad_norm": 0.45184901287379464, + "learning_rate": 4.218561044282099e-05, + "loss": 0.9129, + "step": 5635 + }, + { + "epoch": 0.28109725685785536, + "grad_norm": 0.42341057424470707, + "learning_rate": 4.218267726884295e-05, + "loss": 0.9211, + "step": 5636 + }, + { + "epoch": 0.28114713216957604, + "grad_norm": 0.47010226687188533, + "learning_rate": 4.2179743646490636e-05, + "loss": 0.8752, + "step": 5637 + }, + { + "epoch": 0.2811970074812968, + "grad_norm": 0.3698729258324002, + "learning_rate": 4.2176809575840594e-05, + "loss": 0.9021, + "step": 5638 + }, + { + "epoch": 0.28124688279301746, + "grad_norm": 0.41545664789155745, + "learning_rate": 4.2173875056969394e-05, + "loss": 0.9242, + "step": 5639 + }, + { + "epoch": 0.28129675810473814, + "grad_norm": 0.5523352602810979, + "learning_rate": 4.217094008995363e-05, + "loss": 0.9255, + "step": 5640 + }, + { + "epoch": 0.2813466334164589, + "grad_norm": 0.42183397862401134, + "learning_rate": 4.2168004674869846e-05, + "loss": 0.9288, + "step": 5641 + }, + { + "epoch": 0.28139650872817956, + "grad_norm": 0.4564771774559841, + "learning_rate": 4.2165068811794675e-05, + "loss": 0.9026, + "step": 5642 + }, + { + "epoch": 0.28144638403990024, + "grad_norm": 0.3979939750584609, + "learning_rate": 4.216213250080471e-05, + "loss": 0.9106, + "step": 5643 + }, + { + "epoch": 0.2814962593516209, + "grad_norm": 0.44657357299279976, + "learning_rate": 4.2159195741976576e-05, + "loss": 0.8965, + "step": 5644 + }, + { + "epoch": 0.28154613466334166, + "grad_norm": 0.4348244957843616, + "learning_rate": 4.2156258535386914e-05, + "loss": 0.8865, + "step": 5645 + }, + { + "epoch": 0.28159600997506234, + "grad_norm": 0.4164810492308668, + "learning_rate": 4.2153320881112366e-05, + "loss": 0.9741, + "step": 5646 + }, + { + "epoch": 0.281645885286783, + "grad_norm": 0.40150542162660563, + "learning_rate": 4.215038277922958e-05, + "loss": 0.9045, + "step": 5647 + }, + { + "epoch": 0.28169576059850376, + "grad_norm": 0.4067124990449354, + "learning_rate": 4.2147444229815234e-05, + "loss": 0.8992, + "step": 5648 + }, + { + "epoch": 0.28174563591022445, + "grad_norm": 0.4307149968473845, + "learning_rate": 4.214450523294601e-05, + "loss": 0.9229, + "step": 5649 + }, + { + "epoch": 0.2817955112219451, + "grad_norm": 0.3956254989230281, + "learning_rate": 4.214156578869858e-05, + "loss": 0.9282, + "step": 5650 + }, + { + "epoch": 0.2818453865336658, + "grad_norm": 0.38693348554053847, + "learning_rate": 4.2138625897149676e-05, + "loss": 0.9346, + "step": 5651 + }, + { + "epoch": 0.28189526184538655, + "grad_norm": 0.417823236662685, + "learning_rate": 4.2135685558375994e-05, + "loss": 0.9039, + "step": 5652 + }, + { + "epoch": 0.28194513715710723, + "grad_norm": 0.4478186965081066, + "learning_rate": 4.213274477245427e-05, + "loss": 0.9174, + "step": 5653 + }, + { + "epoch": 0.2819950124688279, + "grad_norm": 0.4143943546952942, + "learning_rate": 4.212980353946123e-05, + "loss": 0.9273, + "step": 5654 + }, + { + "epoch": 0.28204488778054865, + "grad_norm": 0.4954314840291655, + "learning_rate": 4.2126861859473643e-05, + "loss": 0.9047, + "step": 5655 + }, + { + "epoch": 0.28209476309226933, + "grad_norm": 0.4101868225328226, + "learning_rate": 4.212391973256826e-05, + "loss": 0.9034, + "step": 5656 + }, + { + "epoch": 0.28214463840399, + "grad_norm": 0.4070518111687495, + "learning_rate": 4.212097715882185e-05, + "loss": 0.9424, + "step": 5657 + }, + { + "epoch": 0.28219451371571075, + "grad_norm": 0.39895253149021176, + "learning_rate": 4.2118034138311216e-05, + "loss": 0.8935, + "step": 5658 + }, + { + "epoch": 0.28224438902743143, + "grad_norm": 0.44864319218714155, + "learning_rate": 4.211509067111312e-05, + "loss": 0.9299, + "step": 5659 + }, + { + "epoch": 0.2822942643391521, + "grad_norm": 0.45164443140805893, + "learning_rate": 4.211214675730441e-05, + "loss": 0.9411, + "step": 5660 + }, + { + "epoch": 0.2823441396508728, + "grad_norm": 0.40339136561620237, + "learning_rate": 4.210920239696188e-05, + "loss": 0.9387, + "step": 5661 + }, + { + "epoch": 0.28239401496259353, + "grad_norm": 0.464944594956887, + "learning_rate": 4.210625759016238e-05, + "loss": 0.9274, + "step": 5662 + }, + { + "epoch": 0.2824438902743142, + "grad_norm": 0.4439235597467594, + "learning_rate": 4.2103312336982734e-05, + "loss": 0.9167, + "step": 5663 + }, + { + "epoch": 0.2824937655860349, + "grad_norm": 0.47898329556106073, + "learning_rate": 4.210036663749981e-05, + "loss": 0.9336, + "step": 5664 + }, + { + "epoch": 0.28254364089775563, + "grad_norm": 0.4311831129092655, + "learning_rate": 4.2097420491790476e-05, + "loss": 0.9076, + "step": 5665 + }, + { + "epoch": 0.2825935162094763, + "grad_norm": 0.41519617790374397, + "learning_rate": 4.20944738999316e-05, + "loss": 0.9433, + "step": 5666 + }, + { + "epoch": 0.282643391521197, + "grad_norm": 0.46736464025783114, + "learning_rate": 4.209152686200009e-05, + "loss": 0.9264, + "step": 5667 + }, + { + "epoch": 0.28269326683291773, + "grad_norm": 0.5167157471000428, + "learning_rate": 4.2088579378072815e-05, + "loss": 0.9229, + "step": 5668 + }, + { + "epoch": 0.2827431421446384, + "grad_norm": 0.4810815315091388, + "learning_rate": 4.208563144822672e-05, + "loss": 0.9203, + "step": 5669 + }, + { + "epoch": 0.2827930174563591, + "grad_norm": 0.37163762856981775, + "learning_rate": 4.208268307253872e-05, + "loss": 0.892, + "step": 5670 + }, + { + "epoch": 0.2828428927680798, + "grad_norm": 0.45786466821130645, + "learning_rate": 4.2079734251085756e-05, + "loss": 0.982, + "step": 5671 + }, + { + "epoch": 0.2828927680798005, + "grad_norm": 0.5354638848951745, + "learning_rate": 4.2076784983944764e-05, + "loss": 0.9562, + "step": 5672 + }, + { + "epoch": 0.2829426433915212, + "grad_norm": 0.5354411781119393, + "learning_rate": 4.207383527119271e-05, + "loss": 0.9535, + "step": 5673 + }, + { + "epoch": 0.2829925187032419, + "grad_norm": 0.4104208578990666, + "learning_rate": 4.207088511290657e-05, + "loss": 0.9529, + "step": 5674 + }, + { + "epoch": 0.2830423940149626, + "grad_norm": 0.5826616786926276, + "learning_rate": 4.206793450916332e-05, + "loss": 0.944, + "step": 5675 + }, + { + "epoch": 0.2830922693266833, + "grad_norm": 0.4164878068520964, + "learning_rate": 4.206498346003995e-05, + "loss": 0.9411, + "step": 5676 + }, + { + "epoch": 0.283142144638404, + "grad_norm": 0.4587224692226421, + "learning_rate": 4.2062031965613484e-05, + "loss": 0.8979, + "step": 5677 + }, + { + "epoch": 0.28319201995012466, + "grad_norm": 0.4274194093185006, + "learning_rate": 4.205908002596093e-05, + "loss": 0.9767, + "step": 5678 + }, + { + "epoch": 0.2832418952618454, + "grad_norm": 0.5189094946160332, + "learning_rate": 4.205612764115932e-05, + "loss": 0.8981, + "step": 5679 + }, + { + "epoch": 0.2832917705735661, + "grad_norm": 0.44741678781406535, + "learning_rate": 4.205317481128568e-05, + "loss": 0.9723, + "step": 5680 + }, + { + "epoch": 0.28334164588528676, + "grad_norm": 0.45812245269318935, + "learning_rate": 4.205022153641708e-05, + "loss": 0.9368, + "step": 5681 + }, + { + "epoch": 0.2833915211970075, + "grad_norm": 0.37638188073331, + "learning_rate": 4.2047267816630586e-05, + "loss": 0.9463, + "step": 5682 + }, + { + "epoch": 0.2834413965087282, + "grad_norm": 0.4165029022827586, + "learning_rate": 4.204431365200326e-05, + "loss": 0.9096, + "step": 5683 + }, + { + "epoch": 0.28349127182044886, + "grad_norm": 0.41782380264180685, + "learning_rate": 4.20413590426122e-05, + "loss": 0.8848, + "step": 5684 + }, + { + "epoch": 0.2835411471321696, + "grad_norm": 0.4629911081334172, + "learning_rate": 4.2038403988534506e-05, + "loss": 0.912, + "step": 5685 + }, + { + "epoch": 0.2835910224438903, + "grad_norm": 0.40395846299577276, + "learning_rate": 4.2035448489847284e-05, + "loss": 0.8596, + "step": 5686 + }, + { + "epoch": 0.28364089775561097, + "grad_norm": 0.44754057172875, + "learning_rate": 4.203249254662765e-05, + "loss": 0.8711, + "step": 5687 + }, + { + "epoch": 0.28369077306733165, + "grad_norm": 0.40040710420426073, + "learning_rate": 4.202953615895275e-05, + "loss": 0.9323, + "step": 5688 + }, + { + "epoch": 0.2837406483790524, + "grad_norm": 0.40540750877843884, + "learning_rate": 4.2026579326899726e-05, + "loss": 0.9591, + "step": 5689 + }, + { + "epoch": 0.28379052369077307, + "grad_norm": 0.4685023287934028, + "learning_rate": 4.202362205054574e-05, + "loss": 0.9384, + "step": 5690 + }, + { + "epoch": 0.28384039900249375, + "grad_norm": 0.4362139610585948, + "learning_rate": 4.202066432996795e-05, + "loss": 0.9239, + "step": 5691 + }, + { + "epoch": 0.2838902743142145, + "grad_norm": 0.46370375003632786, + "learning_rate": 4.201770616524355e-05, + "loss": 0.9386, + "step": 5692 + }, + { + "epoch": 0.28394014962593517, + "grad_norm": 0.43679121201561455, + "learning_rate": 4.201474755644972e-05, + "loss": 0.9033, + "step": 5693 + }, + { + "epoch": 0.28399002493765585, + "grad_norm": 0.5100798637147568, + "learning_rate": 4.2011788503663656e-05, + "loss": 0.9107, + "step": 5694 + }, + { + "epoch": 0.28403990024937653, + "grad_norm": 0.47526466917930965, + "learning_rate": 4.20088290069626e-05, + "loss": 0.8851, + "step": 5695 + }, + { + "epoch": 0.28408977556109727, + "grad_norm": 0.38072555094072097, + "learning_rate": 4.2005869066423753e-05, + "loss": 0.9168, + "step": 5696 + }, + { + "epoch": 0.28413965087281795, + "grad_norm": 0.41668046445449197, + "learning_rate": 4.2002908682124366e-05, + "loss": 0.9226, + "step": 5697 + }, + { + "epoch": 0.28418952618453863, + "grad_norm": 0.43555827798392943, + "learning_rate": 4.199994785414169e-05, + "loss": 0.934, + "step": 5698 + }, + { + "epoch": 0.28423940149625937, + "grad_norm": 0.4367375989890103, + "learning_rate": 4.199698658255298e-05, + "loss": 0.9354, + "step": 5699 + }, + { + "epoch": 0.28428927680798005, + "grad_norm": 0.8334180036359612, + "learning_rate": 4.199402486743551e-05, + "loss": 0.9312, + "step": 5700 + }, + { + "epoch": 0.28433915211970073, + "grad_norm": 0.42408932060303217, + "learning_rate": 4.199106270886656e-05, + "loss": 0.9036, + "step": 5701 + }, + { + "epoch": 0.28438902743142147, + "grad_norm": 0.4027646066220741, + "learning_rate": 4.198810010692345e-05, + "loss": 0.9181, + "step": 5702 + }, + { + "epoch": 0.28443890274314215, + "grad_norm": 0.4143705855551398, + "learning_rate": 4.198513706168345e-05, + "loss": 0.9282, + "step": 5703 + }, + { + "epoch": 0.28448877805486283, + "grad_norm": 0.39978559979521816, + "learning_rate": 4.198217357322392e-05, + "loss": 0.8984, + "step": 5704 + }, + { + "epoch": 0.2845386533665835, + "grad_norm": 0.4177204673404019, + "learning_rate": 4.197920964162216e-05, + "loss": 0.9123, + "step": 5705 + }, + { + "epoch": 0.28458852867830425, + "grad_norm": 0.7692018693556475, + "learning_rate": 4.197624526695553e-05, + "loss": 0.9394, + "step": 5706 + }, + { + "epoch": 0.28463840399002494, + "grad_norm": 0.45181009753273604, + "learning_rate": 4.197328044930137e-05, + "loss": 0.8851, + "step": 5707 + }, + { + "epoch": 0.2846882793017456, + "grad_norm": 0.5097110628985254, + "learning_rate": 4.1970315188737055e-05, + "loss": 0.8793, + "step": 5708 + }, + { + "epoch": 0.28473815461346635, + "grad_norm": 0.3973209284899444, + "learning_rate": 4.196734948533996e-05, + "loss": 0.8736, + "step": 5709 + }, + { + "epoch": 0.28478802992518704, + "grad_norm": 0.4938387919635227, + "learning_rate": 4.196438333918748e-05, + "loss": 0.9485, + "step": 5710 + }, + { + "epoch": 0.2848379052369077, + "grad_norm": 0.47225818021873156, + "learning_rate": 4.1961416750357e-05, + "loss": 0.9328, + "step": 5711 + }, + { + "epoch": 0.28488778054862846, + "grad_norm": 0.4890100746530778, + "learning_rate": 4.195844971892595e-05, + "loss": 0.9025, + "step": 5712 + }, + { + "epoch": 0.28493765586034914, + "grad_norm": 0.36860282803268163, + "learning_rate": 4.195548224497173e-05, + "loss": 0.8955, + "step": 5713 + }, + { + "epoch": 0.2849875311720698, + "grad_norm": 0.4621236121197841, + "learning_rate": 4.19525143285718e-05, + "loss": 0.9173, + "step": 5714 + }, + { + "epoch": 0.2850374064837905, + "grad_norm": 0.4467550170994473, + "learning_rate": 4.19495459698036e-05, + "loss": 0.9123, + "step": 5715 + }, + { + "epoch": 0.28508728179551124, + "grad_norm": 0.42144974546073793, + "learning_rate": 4.1946577168744575e-05, + "loss": 0.9283, + "step": 5716 + }, + { + "epoch": 0.2851371571072319, + "grad_norm": 0.3879390678580457, + "learning_rate": 4.1943607925472205e-05, + "loss": 0.8841, + "step": 5717 + }, + { + "epoch": 0.2851870324189526, + "grad_norm": 0.38145925086174926, + "learning_rate": 4.194063824006397e-05, + "loss": 0.9194, + "step": 5718 + }, + { + "epoch": 0.28523690773067334, + "grad_norm": 0.4950071808645345, + "learning_rate": 4.193766811259736e-05, + "loss": 0.9451, + "step": 5719 + }, + { + "epoch": 0.285286783042394, + "grad_norm": 0.44203458239938676, + "learning_rate": 4.1934697543149884e-05, + "loss": 0.9184, + "step": 5720 + }, + { + "epoch": 0.2853366583541147, + "grad_norm": 0.40796842548218326, + "learning_rate": 4.193172653179905e-05, + "loss": 0.9017, + "step": 5721 + }, + { + "epoch": 0.2853865336658354, + "grad_norm": 0.5392730654521829, + "learning_rate": 4.192875507862239e-05, + "loss": 0.9049, + "step": 5722 + }, + { + "epoch": 0.2854364089775561, + "grad_norm": 0.42733703515456956, + "learning_rate": 4.1925783183697456e-05, + "loss": 0.9292, + "step": 5723 + }, + { + "epoch": 0.2854862842892768, + "grad_norm": 0.46805858283118706, + "learning_rate": 4.1922810847101765e-05, + "loss": 0.9586, + "step": 5724 + }, + { + "epoch": 0.2855361596009975, + "grad_norm": 0.531380137117147, + "learning_rate": 4.191983806891291e-05, + "loss": 0.9683, + "step": 5725 + }, + { + "epoch": 0.2855860349127182, + "grad_norm": 0.5263419954813806, + "learning_rate": 4.191686484920845e-05, + "loss": 0.8859, + "step": 5726 + }, + { + "epoch": 0.2856359102244389, + "grad_norm": 0.4714709583795157, + "learning_rate": 4.1913891188065974e-05, + "loss": 0.9103, + "step": 5727 + }, + { + "epoch": 0.2856857855361596, + "grad_norm": 0.39973131424809943, + "learning_rate": 4.191091708556308e-05, + "loss": 0.9083, + "step": 5728 + }, + { + "epoch": 0.2857356608478803, + "grad_norm": 0.5117297524478411, + "learning_rate": 4.1907942541777364e-05, + "loss": 0.955, + "step": 5729 + }, + { + "epoch": 0.285785536159601, + "grad_norm": 0.5093374935456982, + "learning_rate": 4.1904967556786465e-05, + "loss": 0.9577, + "step": 5730 + }, + { + "epoch": 0.2858354114713217, + "grad_norm": 0.4039304977873907, + "learning_rate": 4.1901992130667996e-05, + "loss": 0.9005, + "step": 5731 + }, + { + "epoch": 0.28588528678304237, + "grad_norm": 0.4601516743556823, + "learning_rate": 4.189901626349961e-05, + "loss": 0.8917, + "step": 5732 + }, + { + "epoch": 0.2859351620947631, + "grad_norm": 0.48213070050454493, + "learning_rate": 4.1896039955358954e-05, + "loss": 0.9261, + "step": 5733 + }, + { + "epoch": 0.2859850374064838, + "grad_norm": 0.5319433549035731, + "learning_rate": 4.18930632063237e-05, + "loss": 0.8977, + "step": 5734 + }, + { + "epoch": 0.28603491271820447, + "grad_norm": 0.44513188735068315, + "learning_rate": 4.189008601647152e-05, + "loss": 0.9497, + "step": 5735 + }, + { + "epoch": 0.2860847880299252, + "grad_norm": 0.48098989243855866, + "learning_rate": 4.1887108385880106e-05, + "loss": 0.911, + "step": 5736 + }, + { + "epoch": 0.2861346633416459, + "grad_norm": 0.4381232112512385, + "learning_rate": 4.188413031462716e-05, + "loss": 0.9545, + "step": 5737 + }, + { + "epoch": 0.28618453865336657, + "grad_norm": 0.5630898622117164, + "learning_rate": 4.188115180279038e-05, + "loss": 0.8879, + "step": 5738 + }, + { + "epoch": 0.2862344139650873, + "grad_norm": 1.1364733481700946, + "learning_rate": 4.1878172850447505e-05, + "loss": 0.9549, + "step": 5739 + }, + { + "epoch": 0.286284289276808, + "grad_norm": 0.5985903078224796, + "learning_rate": 4.1875193457676265e-05, + "loss": 0.9207, + "step": 5740 + }, + { + "epoch": 0.2863341645885287, + "grad_norm": 0.4624032305506857, + "learning_rate": 4.18722136245544e-05, + "loss": 0.9827, + "step": 5741 + }, + { + "epoch": 0.28638403990024935, + "grad_norm": 0.44021666063131853, + "learning_rate": 4.186923335115967e-05, + "loss": 0.9034, + "step": 5742 + }, + { + "epoch": 0.2864339152119701, + "grad_norm": 0.5147780258067896, + "learning_rate": 4.1866252637569856e-05, + "loss": 0.923, + "step": 5743 + }, + { + "epoch": 0.2864837905236908, + "grad_norm": 0.4856245747296861, + "learning_rate": 4.1863271483862716e-05, + "loss": 0.9421, + "step": 5744 + }, + { + "epoch": 0.28653366583541146, + "grad_norm": 0.4297630894962179, + "learning_rate": 4.186028989011606e-05, + "loss": 0.9217, + "step": 5745 + }, + { + "epoch": 0.2865835411471322, + "grad_norm": 0.6080752948823043, + "learning_rate": 4.185730785640768e-05, + "loss": 0.9212, + "step": 5746 + }, + { + "epoch": 0.2866334164588529, + "grad_norm": 0.430016243751405, + "learning_rate": 4.18543253828154e-05, + "loss": 0.9109, + "step": 5747 + }, + { + "epoch": 0.28668329177057356, + "grad_norm": 0.43713876986270084, + "learning_rate": 4.1851342469417045e-05, + "loss": 0.9045, + "step": 5748 + }, + { + "epoch": 0.28673316708229424, + "grad_norm": 0.4066473039353164, + "learning_rate": 4.184835911629044e-05, + "loss": 0.8761, + "step": 5749 + }, + { + "epoch": 0.286783042394015, + "grad_norm": 0.4897483355510326, + "learning_rate": 4.1845375323513456e-05, + "loss": 0.9341, + "step": 5750 + }, + { + "epoch": 0.28683291770573566, + "grad_norm": 0.43899585089941934, + "learning_rate": 4.184239109116393e-05, + "loss": 0.9438, + "step": 5751 + }, + { + "epoch": 0.28688279301745634, + "grad_norm": 0.47024014177749446, + "learning_rate": 4.183940641931975e-05, + "loss": 0.9273, + "step": 5752 + }, + { + "epoch": 0.2869326683291771, + "grad_norm": 0.4156312832813036, + "learning_rate": 4.1836421308058804e-05, + "loss": 0.9201, + "step": 5753 + }, + { + "epoch": 0.28698254364089776, + "grad_norm": 0.4071619372612389, + "learning_rate": 4.183343575745897e-05, + "loss": 0.9062, + "step": 5754 + }, + { + "epoch": 0.28703241895261844, + "grad_norm": 0.4092983065978306, + "learning_rate": 4.183044976759817e-05, + "loss": 0.937, + "step": 5755 + }, + { + "epoch": 0.2870822942643392, + "grad_norm": 0.4641372382907025, + "learning_rate": 4.182746333855431e-05, + "loss": 0.9565, + "step": 5756 + }, + { + "epoch": 0.28713216957605986, + "grad_norm": 0.42818159266567524, + "learning_rate": 4.1824476470405324e-05, + "loss": 0.9082, + "step": 5757 + }, + { + "epoch": 0.28718204488778054, + "grad_norm": 0.3929115896552977, + "learning_rate": 4.182148916322917e-05, + "loss": 0.9169, + "step": 5758 + }, + { + "epoch": 0.2872319201995012, + "grad_norm": 0.4606952098099507, + "learning_rate": 4.181850141710376e-05, + "loss": 0.9806, + "step": 5759 + }, + { + "epoch": 0.28728179551122196, + "grad_norm": 0.4693226328029697, + "learning_rate": 4.1815513232107094e-05, + "loss": 0.8627, + "step": 5760 + }, + { + "epoch": 0.28733167082294264, + "grad_norm": 0.4140751644174547, + "learning_rate": 4.181252460831714e-05, + "loss": 0.9177, + "step": 5761 + }, + { + "epoch": 0.2873815461346633, + "grad_norm": 0.4132407014700118, + "learning_rate": 4.180953554581188e-05, + "loss": 0.8739, + "step": 5762 + }, + { + "epoch": 0.28743142144638406, + "grad_norm": 0.5108858764323302, + "learning_rate": 4.180654604466931e-05, + "loss": 0.9257, + "step": 5763 + }, + { + "epoch": 0.28748129675810474, + "grad_norm": 0.483019972063766, + "learning_rate": 4.180355610496744e-05, + "loss": 0.923, + "step": 5764 + }, + { + "epoch": 0.2875311720698254, + "grad_norm": 0.4700291889309274, + "learning_rate": 4.18005657267843e-05, + "loss": 0.9273, + "step": 5765 + }, + { + "epoch": 0.28758104738154616, + "grad_norm": 0.4811622607271214, + "learning_rate": 4.179757491019791e-05, + "loss": 0.952, + "step": 5766 + }, + { + "epoch": 0.28763092269326684, + "grad_norm": 0.39669712610088886, + "learning_rate": 4.1794583655286326e-05, + "loss": 0.9078, + "step": 5767 + }, + { + "epoch": 0.2876807980049875, + "grad_norm": 0.3935175451537328, + "learning_rate": 4.179159196212759e-05, + "loss": 0.9474, + "step": 5768 + }, + { + "epoch": 0.2877306733167082, + "grad_norm": 0.39538432973309806, + "learning_rate": 4.178859983079978e-05, + "loss": 0.9258, + "step": 5769 + }, + { + "epoch": 0.28778054862842894, + "grad_norm": 0.4654187780025725, + "learning_rate": 4.178560726138098e-05, + "loss": 0.8989, + "step": 5770 + }, + { + "epoch": 0.2878304239401496, + "grad_norm": 0.43926656631148825, + "learning_rate": 4.178261425394926e-05, + "loss": 0.945, + "step": 5771 + }, + { + "epoch": 0.2878802992518703, + "grad_norm": 0.9048674825387019, + "learning_rate": 4.177962080858273e-05, + "loss": 0.8821, + "step": 5772 + }, + { + "epoch": 0.28793017456359105, + "grad_norm": 0.38198945392839384, + "learning_rate": 4.177662692535952e-05, + "loss": 0.8929, + "step": 5773 + }, + { + "epoch": 0.2879800498753117, + "grad_norm": 0.4374209862262633, + "learning_rate": 4.177363260435773e-05, + "loss": 0.924, + "step": 5774 + }, + { + "epoch": 0.2880299251870324, + "grad_norm": 0.40720595311538, + "learning_rate": 4.177063784565549e-05, + "loss": 0.9017, + "step": 5775 + }, + { + "epoch": 0.2880798004987531, + "grad_norm": 0.48609682589636805, + "learning_rate": 4.1767642649330976e-05, + "loss": 0.9544, + "step": 5776 + }, + { + "epoch": 0.28812967581047383, + "grad_norm": 0.3728804903469673, + "learning_rate": 4.1764647015462334e-05, + "loss": 0.898, + "step": 5777 + }, + { + "epoch": 0.2881795511221945, + "grad_norm": 0.41267202210264975, + "learning_rate": 4.176165094412773e-05, + "loss": 0.9814, + "step": 5778 + }, + { + "epoch": 0.2882294264339152, + "grad_norm": 0.542338068429505, + "learning_rate": 4.175865443540533e-05, + "loss": 0.9128, + "step": 5779 + }, + { + "epoch": 0.28827930174563593, + "grad_norm": 0.39926679427685385, + "learning_rate": 4.175565748937336e-05, + "loss": 0.9069, + "step": 5780 + }, + { + "epoch": 0.2883291770573566, + "grad_norm": 0.37311178253961425, + "learning_rate": 4.175266010611e-05, + "loss": 0.9156, + "step": 5781 + }, + { + "epoch": 0.2883790523690773, + "grad_norm": 0.40686010740654144, + "learning_rate": 4.174966228569347e-05, + "loss": 0.912, + "step": 5782 + }, + { + "epoch": 0.28842892768079803, + "grad_norm": 0.4278752816430191, + "learning_rate": 4.1746664028202e-05, + "loss": 0.9423, + "step": 5783 + }, + { + "epoch": 0.2884788029925187, + "grad_norm": 0.40269819945744495, + "learning_rate": 4.174366533371383e-05, + "loss": 0.8753, + "step": 5784 + }, + { + "epoch": 0.2885286783042394, + "grad_norm": 0.4416968403825364, + "learning_rate": 4.174066620230721e-05, + "loss": 0.9189, + "step": 5785 + }, + { + "epoch": 0.2885785536159601, + "grad_norm": 0.3941537595379544, + "learning_rate": 4.1737666634060396e-05, + "loss": 0.9054, + "step": 5786 + }, + { + "epoch": 0.2886284289276808, + "grad_norm": 0.40577823853817463, + "learning_rate": 4.173466662905166e-05, + "loss": 0.9103, + "step": 5787 + }, + { + "epoch": 0.2886783042394015, + "grad_norm": 0.6002882485031358, + "learning_rate": 4.173166618735929e-05, + "loss": 0.9219, + "step": 5788 + }, + { + "epoch": 0.2887281795511222, + "grad_norm": 0.3982956129551691, + "learning_rate": 4.172866530906158e-05, + "loss": 0.9231, + "step": 5789 + }, + { + "epoch": 0.2887780548628429, + "grad_norm": 0.465635428918904, + "learning_rate": 4.172566399423682e-05, + "loss": 0.9512, + "step": 5790 + }, + { + "epoch": 0.2888279301745636, + "grad_norm": 0.5161235532000767, + "learning_rate": 4.172266224296336e-05, + "loss": 0.9564, + "step": 5791 + }, + { + "epoch": 0.2888778054862843, + "grad_norm": 0.44224283065194875, + "learning_rate": 4.171966005531951e-05, + "loss": 0.9075, + "step": 5792 + }, + { + "epoch": 0.28892768079800496, + "grad_norm": 0.40975082126278706, + "learning_rate": 4.171665743138361e-05, + "loss": 0.9106, + "step": 5793 + }, + { + "epoch": 0.2889775561097257, + "grad_norm": 0.44917152255561404, + "learning_rate": 4.171365437123401e-05, + "loss": 0.9171, + "step": 5794 + }, + { + "epoch": 0.2890274314214464, + "grad_norm": 0.464224422236642, + "learning_rate": 4.171065087494909e-05, + "loss": 0.8983, + "step": 5795 + }, + { + "epoch": 0.28907730673316706, + "grad_norm": 0.44582278848459655, + "learning_rate": 4.170764694260721e-05, + "loss": 0.9462, + "step": 5796 + }, + { + "epoch": 0.2891271820448878, + "grad_norm": 0.46360763880755285, + "learning_rate": 4.170464257428676e-05, + "loss": 0.8969, + "step": 5797 + }, + { + "epoch": 0.2891770573566085, + "grad_norm": 0.5796912794147903, + "learning_rate": 4.170163777006614e-05, + "loss": 0.9757, + "step": 5798 + }, + { + "epoch": 0.28922693266832916, + "grad_norm": 0.43852561300836596, + "learning_rate": 4.1698632530023753e-05, + "loss": 0.9219, + "step": 5799 + }, + { + "epoch": 0.2892768079800499, + "grad_norm": 0.5482080214113707, + "learning_rate": 4.169562685423803e-05, + "loss": 0.9073, + "step": 5800 + }, + { + "epoch": 0.2893266832917706, + "grad_norm": 0.5716592956258876, + "learning_rate": 4.169262074278739e-05, + "loss": 0.9512, + "step": 5801 + }, + { + "epoch": 0.28937655860349126, + "grad_norm": 0.40837255206551226, + "learning_rate": 4.168961419575029e-05, + "loss": 0.9244, + "step": 5802 + }, + { + "epoch": 0.28942643391521194, + "grad_norm": 0.3815390644960111, + "learning_rate": 4.168660721320517e-05, + "loss": 0.874, + "step": 5803 + }, + { + "epoch": 0.2894763092269327, + "grad_norm": 0.44271932223338945, + "learning_rate": 4.168359979523051e-05, + "loss": 0.9231, + "step": 5804 + }, + { + "epoch": 0.28952618453865336, + "grad_norm": 0.35718939213548934, + "learning_rate": 4.168059194190477e-05, + "loss": 0.8734, + "step": 5805 + }, + { + "epoch": 0.28957605985037405, + "grad_norm": 0.41974704307024846, + "learning_rate": 4.167758365330646e-05, + "loss": 0.9234, + "step": 5806 + }, + { + "epoch": 0.2896259351620948, + "grad_norm": 0.46088179240478516, + "learning_rate": 4.167457492951405e-05, + "loss": 0.92, + "step": 5807 + }, + { + "epoch": 0.28967581047381546, + "grad_norm": 0.41623843402242633, + "learning_rate": 4.167156577060609e-05, + "loss": 0.9393, + "step": 5808 + }, + { + "epoch": 0.28972568578553615, + "grad_norm": 0.4485842998241851, + "learning_rate": 4.166855617666107e-05, + "loss": 0.9165, + "step": 5809 + }, + { + "epoch": 0.2897755610972569, + "grad_norm": 0.49865405315733946, + "learning_rate": 4.166554614775755e-05, + "loss": 0.9315, + "step": 5810 + }, + { + "epoch": 0.28982543640897757, + "grad_norm": 0.5299534720941592, + "learning_rate": 4.166253568397405e-05, + "loss": 0.902, + "step": 5811 + }, + { + "epoch": 0.28987531172069825, + "grad_norm": 0.3822112461672413, + "learning_rate": 4.165952478538914e-05, + "loss": 0.9405, + "step": 5812 + }, + { + "epoch": 0.28992518703241893, + "grad_norm": 0.43428075160748064, + "learning_rate": 4.165651345208138e-05, + "loss": 0.8813, + "step": 5813 + }, + { + "epoch": 0.28997506234413967, + "grad_norm": 0.413511740776289, + "learning_rate": 4.165350168412936e-05, + "loss": 0.9191, + "step": 5814 + }, + { + "epoch": 0.29002493765586035, + "grad_norm": 0.46913068516547085, + "learning_rate": 4.165048948161168e-05, + "loss": 0.9137, + "step": 5815 + }, + { + "epoch": 0.29007481296758103, + "grad_norm": 0.4387120972725653, + "learning_rate": 4.1647476844606914e-05, + "loss": 0.875, + "step": 5816 + }, + { + "epoch": 0.29012468827930177, + "grad_norm": 0.45741813693214706, + "learning_rate": 4.16444637731937e-05, + "loss": 0.8783, + "step": 5817 + }, + { + "epoch": 0.29017456359102245, + "grad_norm": 0.39830883932505196, + "learning_rate": 4.164145026745064e-05, + "loss": 0.847, + "step": 5818 + }, + { + "epoch": 0.29022443890274313, + "grad_norm": 0.42325997325053993, + "learning_rate": 4.16384363274564e-05, + "loss": 0.9179, + "step": 5819 + }, + { + "epoch": 0.2902743142144638, + "grad_norm": 0.36634280265901864, + "learning_rate": 4.1635421953289594e-05, + "loss": 0.8916, + "step": 5820 + }, + { + "epoch": 0.29032418952618455, + "grad_norm": 0.5423913913647368, + "learning_rate": 4.1632407145028904e-05, + "loss": 0.9393, + "step": 5821 + }, + { + "epoch": 0.29037406483790523, + "grad_norm": 0.4369593440898769, + "learning_rate": 4.1629391902753004e-05, + "loss": 0.9636, + "step": 5822 + }, + { + "epoch": 0.2904239401496259, + "grad_norm": 0.42371950066242026, + "learning_rate": 4.162637622654054e-05, + "loss": 0.8964, + "step": 5823 + }, + { + "epoch": 0.29047381546134665, + "grad_norm": 0.41862244115945957, + "learning_rate": 4.162336011647025e-05, + "loss": 0.9396, + "step": 5824 + }, + { + "epoch": 0.29052369077306733, + "grad_norm": 0.4155963397047919, + "learning_rate": 4.162034357262081e-05, + "loss": 0.9545, + "step": 5825 + }, + { + "epoch": 0.290573566084788, + "grad_norm": 0.4348838884802156, + "learning_rate": 4.161732659507095e-05, + "loss": 0.9094, + "step": 5826 + }, + { + "epoch": 0.29062344139650875, + "grad_norm": 0.46960428212105554, + "learning_rate": 4.161430918389938e-05, + "loss": 0.9287, + "step": 5827 + }, + { + "epoch": 0.29067331670822943, + "grad_norm": 0.5001296899311971, + "learning_rate": 4.161129133918485e-05, + "loss": 0.9459, + "step": 5828 + }, + { + "epoch": 0.2907231920199501, + "grad_norm": 0.7467499428365683, + "learning_rate": 4.160827306100611e-05, + "loss": 0.9667, + "step": 5829 + }, + { + "epoch": 0.2907730673316708, + "grad_norm": 0.439996492081079, + "learning_rate": 4.160525434944191e-05, + "loss": 0.9011, + "step": 5830 + }, + { + "epoch": 0.29082294264339154, + "grad_norm": 0.5343435147010355, + "learning_rate": 4.1602235204571036e-05, + "loss": 0.8964, + "step": 5831 + }, + { + "epoch": 0.2908728179551122, + "grad_norm": 0.39774654291486083, + "learning_rate": 4.159921562647226e-05, + "loss": 0.9403, + "step": 5832 + }, + { + "epoch": 0.2909226932668329, + "grad_norm": 0.40555908788797096, + "learning_rate": 4.159619561522439e-05, + "loss": 0.9021, + "step": 5833 + }, + { + "epoch": 0.29097256857855364, + "grad_norm": 0.3959798151948959, + "learning_rate": 4.159317517090621e-05, + "loss": 0.9275, + "step": 5834 + }, + { + "epoch": 0.2910224438902743, + "grad_norm": 0.4377049740694384, + "learning_rate": 4.159015429359656e-05, + "loss": 0.9093, + "step": 5835 + }, + { + "epoch": 0.291072319201995, + "grad_norm": 0.42062529215030064, + "learning_rate": 4.158713298337426e-05, + "loss": 0.9719, + "step": 5836 + }, + { + "epoch": 0.29112219451371574, + "grad_norm": 0.4494391048388199, + "learning_rate": 4.158411124031814e-05, + "loss": 0.9486, + "step": 5837 + }, + { + "epoch": 0.2911720698254364, + "grad_norm": 0.43918881788080927, + "learning_rate": 4.158108906450706e-05, + "loss": 0.9023, + "step": 5838 + }, + { + "epoch": 0.2912219451371571, + "grad_norm": 0.519514835629964, + "learning_rate": 4.157806645601988e-05, + "loss": 0.9269, + "step": 5839 + }, + { + "epoch": 0.2912718204488778, + "grad_norm": 0.6173512368517361, + "learning_rate": 4.157504341493548e-05, + "loss": 0.9, + "step": 5840 + }, + { + "epoch": 0.2913216957605985, + "grad_norm": 0.42412433402832445, + "learning_rate": 4.157201994133274e-05, + "loss": 0.9066, + "step": 5841 + }, + { + "epoch": 0.2913715710723192, + "grad_norm": 0.5178304903062337, + "learning_rate": 4.1568996035290554e-05, + "loss": 0.9072, + "step": 5842 + }, + { + "epoch": 0.2914214463840399, + "grad_norm": 0.5188540516065848, + "learning_rate": 4.156597169688784e-05, + "loss": 0.9381, + "step": 5843 + }, + { + "epoch": 0.2914713216957606, + "grad_norm": 0.3979079631449659, + "learning_rate": 4.156294692620349e-05, + "loss": 0.9682, + "step": 5844 + }, + { + "epoch": 0.2915211970074813, + "grad_norm": 0.41481900791826765, + "learning_rate": 4.155992172331647e-05, + "loss": 0.9244, + "step": 5845 + }, + { + "epoch": 0.291571072319202, + "grad_norm": 0.5129507253400043, + "learning_rate": 4.15568960883057e-05, + "loss": 0.9476, + "step": 5846 + }, + { + "epoch": 0.29162094763092267, + "grad_norm": 0.4387362620547511, + "learning_rate": 4.155387002125014e-05, + "loss": 0.9653, + "step": 5847 + }, + { + "epoch": 0.2916708229426434, + "grad_norm": 0.42995346572979776, + "learning_rate": 4.1550843522228744e-05, + "loss": 0.9416, + "step": 5848 + }, + { + "epoch": 0.2917206982543641, + "grad_norm": 0.46799316718223605, + "learning_rate": 4.1547816591320496e-05, + "loss": 0.8836, + "step": 5849 + }, + { + "epoch": 0.29177057356608477, + "grad_norm": 0.38278838193578746, + "learning_rate": 4.154478922860437e-05, + "loss": 0.9225, + "step": 5850 + }, + { + "epoch": 0.2918204488778055, + "grad_norm": 0.5093639121555075, + "learning_rate": 4.1541761434159384e-05, + "loss": 0.9512, + "step": 5851 + }, + { + "epoch": 0.2918703241895262, + "grad_norm": 0.4509836004898546, + "learning_rate": 4.153873320806454e-05, + "loss": 0.9527, + "step": 5852 + }, + { + "epoch": 0.29192019950124687, + "grad_norm": 0.560968862897168, + "learning_rate": 4.1535704550398845e-05, + "loss": 0.932, + "step": 5853 + }, + { + "epoch": 0.2919700748129676, + "grad_norm": 0.4735152672943757, + "learning_rate": 4.153267546124135e-05, + "loss": 0.9014, + "step": 5854 + }, + { + "epoch": 0.2920199501246883, + "grad_norm": 0.38918307737329183, + "learning_rate": 4.152964594067108e-05, + "loss": 0.9255, + "step": 5855 + }, + { + "epoch": 0.29206982543640897, + "grad_norm": 0.5103253558661998, + "learning_rate": 4.15266159887671e-05, + "loss": 0.9148, + "step": 5856 + }, + { + "epoch": 0.29211970074812965, + "grad_norm": 0.6540441150083741, + "learning_rate": 4.1523585605608475e-05, + "loss": 0.8696, + "step": 5857 + }, + { + "epoch": 0.2921695760598504, + "grad_norm": 0.4255156523475048, + "learning_rate": 4.152055479127427e-05, + "loss": 0.9189, + "step": 5858 + }, + { + "epoch": 0.29221945137157107, + "grad_norm": 0.5268470151597435, + "learning_rate": 4.151752354584359e-05, + "loss": 0.9632, + "step": 5859 + }, + { + "epoch": 0.29226932668329175, + "grad_norm": 0.601094857343354, + "learning_rate": 4.1514491869395524e-05, + "loss": 0.9031, + "step": 5860 + }, + { + "epoch": 0.2923192019950125, + "grad_norm": 0.38215907705374597, + "learning_rate": 4.151145976200918e-05, + "loss": 0.8895, + "step": 5861 + }, + { + "epoch": 0.29236907730673317, + "grad_norm": 0.41325844703112224, + "learning_rate": 4.150842722376368e-05, + "loss": 0.9493, + "step": 5862 + }, + { + "epoch": 0.29241895261845385, + "grad_norm": 0.38461839721832297, + "learning_rate": 4.150539425473817e-05, + "loss": 0.8729, + "step": 5863 + }, + { + "epoch": 0.2924688279301746, + "grad_norm": 0.46810360385284394, + "learning_rate": 4.1502360855011776e-05, + "loss": 0.9463, + "step": 5864 + }, + { + "epoch": 0.2925187032418953, + "grad_norm": 0.4412844464152121, + "learning_rate": 4.1499327024663666e-05, + "loss": 0.9282, + "step": 5865 + }, + { + "epoch": 0.29256857855361595, + "grad_norm": 0.4763078366345036, + "learning_rate": 4.149629276377299e-05, + "loss": 0.9122, + "step": 5866 + }, + { + "epoch": 0.29261845386533664, + "grad_norm": 0.3860282828478854, + "learning_rate": 4.149325807241895e-05, + "loss": 0.9225, + "step": 5867 + }, + { + "epoch": 0.2926683291770574, + "grad_norm": 0.426687210416166, + "learning_rate": 4.1490222950680715e-05, + "loss": 0.9195, + "step": 5868 + }, + { + "epoch": 0.29271820448877806, + "grad_norm": 0.42770021837101585, + "learning_rate": 4.148718739863749e-05, + "loss": 0.8921, + "step": 5869 + }, + { + "epoch": 0.29276807980049874, + "grad_norm": 0.4272935430662366, + "learning_rate": 4.14841514163685e-05, + "loss": 0.9167, + "step": 5870 + }, + { + "epoch": 0.2928179551122195, + "grad_norm": 0.6089171871631878, + "learning_rate": 4.148111500395295e-05, + "loss": 1.0021, + "step": 5871 + }, + { + "epoch": 0.29286783042394016, + "grad_norm": 0.45189406127411325, + "learning_rate": 4.147807816147008e-05, + "loss": 0.8984, + "step": 5872 + }, + { + "epoch": 0.29291770573566084, + "grad_norm": 0.5677752226712459, + "learning_rate": 4.1475040888999135e-05, + "loss": 0.9126, + "step": 5873 + }, + { + "epoch": 0.2929675810473815, + "grad_norm": 0.46991936085512626, + "learning_rate": 4.147200318661937e-05, + "loss": 0.9667, + "step": 5874 + }, + { + "epoch": 0.29301745635910226, + "grad_norm": 0.41976724846603225, + "learning_rate": 4.146896505441005e-05, + "loss": 0.916, + "step": 5875 + }, + { + "epoch": 0.29306733167082294, + "grad_norm": 0.42718544983287193, + "learning_rate": 4.146592649245047e-05, + "loss": 0.9245, + "step": 5876 + }, + { + "epoch": 0.2931172069825436, + "grad_norm": 0.40030784984486956, + "learning_rate": 4.146288750081989e-05, + "loss": 0.9497, + "step": 5877 + }, + { + "epoch": 0.29316708229426436, + "grad_norm": 0.43293141737343926, + "learning_rate": 4.145984807959764e-05, + "loss": 0.926, + "step": 5878 + }, + { + "epoch": 0.29321695760598504, + "grad_norm": 0.5036280900644685, + "learning_rate": 4.145680822886302e-05, + "loss": 0.9573, + "step": 5879 + }, + { + "epoch": 0.2932668329177057, + "grad_norm": 0.42258836800211835, + "learning_rate": 4.1453767948695355e-05, + "loss": 0.8981, + "step": 5880 + }, + { + "epoch": 0.29331670822942646, + "grad_norm": 0.41135978188480016, + "learning_rate": 4.145072723917398e-05, + "loss": 0.9289, + "step": 5881 + }, + { + "epoch": 0.29336658354114714, + "grad_norm": 0.4364890081541929, + "learning_rate": 4.144768610037824e-05, + "loss": 0.9652, + "step": 5882 + }, + { + "epoch": 0.2934164588528678, + "grad_norm": 0.4547282099192081, + "learning_rate": 4.144464453238748e-05, + "loss": 0.909, + "step": 5883 + }, + { + "epoch": 0.2934663341645885, + "grad_norm": 0.42442696465181634, + "learning_rate": 4.1441602535281094e-05, + "loss": 0.9019, + "step": 5884 + }, + { + "epoch": 0.29351620947630924, + "grad_norm": 0.47902606856650126, + "learning_rate": 4.1438560109138444e-05, + "loss": 0.9449, + "step": 5885 + }, + { + "epoch": 0.2935660847880299, + "grad_norm": 0.5517711989495322, + "learning_rate": 4.143551725403892e-05, + "loss": 0.8734, + "step": 5886 + }, + { + "epoch": 0.2936159600997506, + "grad_norm": 0.4187004967930953, + "learning_rate": 4.1432473970061935e-05, + "loss": 0.9055, + "step": 5887 + }, + { + "epoch": 0.29366583541147134, + "grad_norm": 0.9067753240524415, + "learning_rate": 4.14294302572869e-05, + "loss": 0.9725, + "step": 5888 + }, + { + "epoch": 0.293715710723192, + "grad_norm": 0.394901817252907, + "learning_rate": 4.142638611579323e-05, + "loss": 0.8958, + "step": 5889 + }, + { + "epoch": 0.2937655860349127, + "grad_norm": 0.6517091626621807, + "learning_rate": 4.142334154566036e-05, + "loss": 0.9313, + "step": 5890 + }, + { + "epoch": 0.29381546134663344, + "grad_norm": 0.6202142084964668, + "learning_rate": 4.1420296546967754e-05, + "loss": 0.9281, + "step": 5891 + }, + { + "epoch": 0.2938653366583541, + "grad_norm": 3.972311549789805, + "learning_rate": 4.141725111979485e-05, + "loss": 0.9321, + "step": 5892 + }, + { + "epoch": 0.2939152119700748, + "grad_norm": 0.4252333710704815, + "learning_rate": 4.141420526422113e-05, + "loss": 0.9523, + "step": 5893 + }, + { + "epoch": 0.2939650872817955, + "grad_norm": 0.440705136970261, + "learning_rate": 4.141115898032607e-05, + "loss": 0.9403, + "step": 5894 + }, + { + "epoch": 0.2940149625935162, + "grad_norm": 0.4593542902063393, + "learning_rate": 4.140811226818916e-05, + "loss": 0.9325, + "step": 5895 + }, + { + "epoch": 0.2940648379052369, + "grad_norm": 0.4138625342661316, + "learning_rate": 4.14050651278899e-05, + "loss": 0.9302, + "step": 5896 + }, + { + "epoch": 0.2941147132169576, + "grad_norm": 0.47379305060105104, + "learning_rate": 4.140201755950781e-05, + "loss": 0.927, + "step": 5897 + }, + { + "epoch": 0.29416458852867833, + "grad_norm": 0.39156076458800254, + "learning_rate": 4.1398969563122414e-05, + "loss": 0.9433, + "step": 5898 + }, + { + "epoch": 0.294214463840399, + "grad_norm": 0.430479402971734, + "learning_rate": 4.139592113881324e-05, + "loss": 0.8815, + "step": 5899 + }, + { + "epoch": 0.2942643391521197, + "grad_norm": 0.3809863528242505, + "learning_rate": 4.139287228665985e-05, + "loss": 0.9379, + "step": 5900 + }, + { + "epoch": 0.2943142144638404, + "grad_norm": 0.39889963200265555, + "learning_rate": 4.138982300674179e-05, + "loss": 0.9285, + "step": 5901 + }, + { + "epoch": 0.2943640897755611, + "grad_norm": 0.4185720649176044, + "learning_rate": 4.138677329913864e-05, + "loss": 0.9089, + "step": 5902 + }, + { + "epoch": 0.2944139650872818, + "grad_norm": 0.4551075303217924, + "learning_rate": 4.1383723163929964e-05, + "loss": 0.9288, + "step": 5903 + }, + { + "epoch": 0.2944638403990025, + "grad_norm": 0.36611505657956844, + "learning_rate": 4.138067260119537e-05, + "loss": 0.8738, + "step": 5904 + }, + { + "epoch": 0.2945137157107232, + "grad_norm": 0.49909076438321154, + "learning_rate": 4.137762161101445e-05, + "loss": 0.9981, + "step": 5905 + }, + { + "epoch": 0.2945635910224439, + "grad_norm": 0.528898052294006, + "learning_rate": 4.137457019346683e-05, + "loss": 0.9307, + "step": 5906 + }, + { + "epoch": 0.2946134663341646, + "grad_norm": 0.4394366545558136, + "learning_rate": 4.137151834863213e-05, + "loss": 0.9202, + "step": 5907 + }, + { + "epoch": 0.2946633416458853, + "grad_norm": 0.4401344764552296, + "learning_rate": 4.136846607658998e-05, + "loss": 1.0105, + "step": 5908 + }, + { + "epoch": 0.294713216957606, + "grad_norm": 0.44480958023027656, + "learning_rate": 4.136541337742004e-05, + "loss": 0.9272, + "step": 5909 + }, + { + "epoch": 0.2947630922693267, + "grad_norm": 0.42982655078934506, + "learning_rate": 4.136236025120196e-05, + "loss": 0.9205, + "step": 5910 + }, + { + "epoch": 0.29481296758104736, + "grad_norm": 0.39108845166297146, + "learning_rate": 4.135930669801541e-05, + "loss": 0.8621, + "step": 5911 + }, + { + "epoch": 0.2948628428927681, + "grad_norm": 0.4553305669788639, + "learning_rate": 4.135625271794007e-05, + "loss": 0.9331, + "step": 5912 + }, + { + "epoch": 0.2949127182044888, + "grad_norm": 0.4093921536129464, + "learning_rate": 4.135319831105564e-05, + "loss": 0.8921, + "step": 5913 + }, + { + "epoch": 0.29496259351620946, + "grad_norm": 0.42947340841147313, + "learning_rate": 4.1350143477441817e-05, + "loss": 0.9045, + "step": 5914 + }, + { + "epoch": 0.2950124688279302, + "grad_norm": 0.4666548797021733, + "learning_rate": 4.134708821717832e-05, + "loss": 0.9434, + "step": 5915 + }, + { + "epoch": 0.2950623441396509, + "grad_norm": 0.4352516853193398, + "learning_rate": 4.1344032530344876e-05, + "loss": 0.9581, + "step": 5916 + }, + { + "epoch": 0.29511221945137156, + "grad_norm": 0.48490414552709016, + "learning_rate": 4.134097641702121e-05, + "loss": 0.9019, + "step": 5917 + }, + { + "epoch": 0.29516209476309224, + "grad_norm": 0.5438382477741696, + "learning_rate": 4.133791987728708e-05, + "loss": 0.8929, + "step": 5918 + }, + { + "epoch": 0.295211970074813, + "grad_norm": 0.6834602373503916, + "learning_rate": 4.133486291122224e-05, + "loss": 0.9291, + "step": 5919 + }, + { + "epoch": 0.29526184538653366, + "grad_norm": 0.40564633994564614, + "learning_rate": 4.1331805518906475e-05, + "loss": 0.8911, + "step": 5920 + }, + { + "epoch": 0.29531172069825434, + "grad_norm": 0.39920288853238406, + "learning_rate": 4.132874770041953e-05, + "loss": 0.9379, + "step": 5921 + }, + { + "epoch": 0.2953615960099751, + "grad_norm": 0.43064228968169593, + "learning_rate": 4.132568945584124e-05, + "loss": 0.9354, + "step": 5922 + }, + { + "epoch": 0.29541147132169576, + "grad_norm": 0.4258709150560165, + "learning_rate": 4.132263078525139e-05, + "loss": 0.9177, + "step": 5923 + }, + { + "epoch": 0.29546134663341644, + "grad_norm": 0.5616771352872192, + "learning_rate": 4.131957168872979e-05, + "loss": 0.911, + "step": 5924 + }, + { + "epoch": 0.2955112219451372, + "grad_norm": 0.48574390503325376, + "learning_rate": 4.131651216635627e-05, + "loss": 0.936, + "step": 5925 + }, + { + "epoch": 0.29556109725685786, + "grad_norm": 0.44479420039416656, + "learning_rate": 4.131345221821067e-05, + "loss": 0.9181, + "step": 5926 + }, + { + "epoch": 0.29561097256857854, + "grad_norm": 0.3876232962916041, + "learning_rate": 4.131039184437283e-05, + "loss": 0.9281, + "step": 5927 + }, + { + "epoch": 0.2956608478802992, + "grad_norm": 0.3781301388531601, + "learning_rate": 4.1307331044922624e-05, + "loss": 0.9081, + "step": 5928 + }, + { + "epoch": 0.29571072319201996, + "grad_norm": 0.47458925178700734, + "learning_rate": 4.130426981993991e-05, + "loss": 0.9565, + "step": 5929 + }, + { + "epoch": 0.29576059850374065, + "grad_norm": 0.3946062810584408, + "learning_rate": 4.1301208169504566e-05, + "loss": 0.9424, + "step": 5930 + }, + { + "epoch": 0.29581047381546133, + "grad_norm": 0.44200515161292475, + "learning_rate": 4.12981460936965e-05, + "loss": 0.9303, + "step": 5931 + }, + { + "epoch": 0.29586034912718207, + "grad_norm": 0.3792284316361379, + "learning_rate": 4.12950835925956e-05, + "loss": 0.895, + "step": 5932 + }, + { + "epoch": 0.29591022443890275, + "grad_norm": 0.4049943211018395, + "learning_rate": 4.1292020666281786e-05, + "loss": 0.8985, + "step": 5933 + }, + { + "epoch": 0.29596009975062343, + "grad_norm": 0.45309578605873985, + "learning_rate": 4.128895731483498e-05, + "loss": 0.9069, + "step": 5934 + }, + { + "epoch": 0.29600997506234417, + "grad_norm": 0.46702776580483, + "learning_rate": 4.128589353833512e-05, + "loss": 0.9204, + "step": 5935 + }, + { + "epoch": 0.29605985037406485, + "grad_norm": 0.40451937605691046, + "learning_rate": 4.128282933686217e-05, + "loss": 0.8941, + "step": 5936 + }, + { + "epoch": 0.29610972568578553, + "grad_norm": 0.48060576408991224, + "learning_rate": 4.1279764710496067e-05, + "loss": 0.9164, + "step": 5937 + }, + { + "epoch": 0.2961596009975062, + "grad_norm": 0.43478487269887317, + "learning_rate": 4.1276699659316787e-05, + "loss": 0.9588, + "step": 5938 + }, + { + "epoch": 0.29620947630922695, + "grad_norm": 0.4878197482445599, + "learning_rate": 4.127363418340432e-05, + "loss": 0.9232, + "step": 5939 + }, + { + "epoch": 0.29625935162094763, + "grad_norm": 0.5418285505459066, + "learning_rate": 4.127056828283865e-05, + "loss": 0.9178, + "step": 5940 + }, + { + "epoch": 0.2963092269326683, + "grad_norm": 0.41026046509388314, + "learning_rate": 4.126750195769978e-05, + "loss": 0.8962, + "step": 5941 + }, + { + "epoch": 0.29635910224438905, + "grad_norm": 0.40561433386552853, + "learning_rate": 4.1264435208067735e-05, + "loss": 0.9188, + "step": 5942 + }, + { + "epoch": 0.29640897755610973, + "grad_norm": 0.4359670461491195, + "learning_rate": 4.126136803402253e-05, + "loss": 0.8934, + "step": 5943 + }, + { + "epoch": 0.2964588528678304, + "grad_norm": 0.3935715119857157, + "learning_rate": 4.12583004356442e-05, + "loss": 0.9071, + "step": 5944 + }, + { + "epoch": 0.2965087281795511, + "grad_norm": 0.5056042666163924, + "learning_rate": 4.1255232413012796e-05, + "loss": 0.932, + "step": 5945 + }, + { + "epoch": 0.29655860349127183, + "grad_norm": 0.4003867444207868, + "learning_rate": 4.125216396620838e-05, + "loss": 0.9131, + "step": 5946 + }, + { + "epoch": 0.2966084788029925, + "grad_norm": 0.47172529788534, + "learning_rate": 4.124909509531101e-05, + "loss": 0.9642, + "step": 5947 + }, + { + "epoch": 0.2966583541147132, + "grad_norm": 0.45427557591669965, + "learning_rate": 4.124602580040079e-05, + "loss": 0.9137, + "step": 5948 + }, + { + "epoch": 0.29670822942643393, + "grad_norm": 0.39770706935395717, + "learning_rate": 4.1242956081557785e-05, + "loss": 0.8995, + "step": 5949 + }, + { + "epoch": 0.2967581047381546, + "grad_norm": 0.4326816833491421, + "learning_rate": 4.1239885938862124e-05, + "loss": 0.933, + "step": 5950 + }, + { + "epoch": 0.2968079800498753, + "grad_norm": 0.4945118790497173, + "learning_rate": 4.123681537239389e-05, + "loss": 0.9436, + "step": 5951 + }, + { + "epoch": 0.29685785536159603, + "grad_norm": 0.4369246085556956, + "learning_rate": 4.1233744382233245e-05, + "loss": 0.8933, + "step": 5952 + }, + { + "epoch": 0.2969077306733167, + "grad_norm": 0.420428470467461, + "learning_rate": 4.123067296846029e-05, + "loss": 0.9335, + "step": 5953 + }, + { + "epoch": 0.2969576059850374, + "grad_norm": 0.421121172762288, + "learning_rate": 4.122760113115519e-05, + "loss": 0.9468, + "step": 5954 + }, + { + "epoch": 0.2970074812967581, + "grad_norm": 0.41906011320460956, + "learning_rate": 4.122452887039811e-05, + "loss": 0.8845, + "step": 5955 + }, + { + "epoch": 0.2970573566084788, + "grad_norm": 0.5965947073524671, + "learning_rate": 4.1221456186269204e-05, + "loss": 0.897, + "step": 5956 + }, + { + "epoch": 0.2971072319201995, + "grad_norm": 0.5295940901072121, + "learning_rate": 4.1218383078848656e-05, + "loss": 0.9163, + "step": 5957 + }, + { + "epoch": 0.2971571072319202, + "grad_norm": 0.503184693281462, + "learning_rate": 4.121530954821666e-05, + "loss": 0.8891, + "step": 5958 + }, + { + "epoch": 0.2972069825436409, + "grad_norm": 0.604821206700733, + "learning_rate": 4.121223559445343e-05, + "loss": 0.9409, + "step": 5959 + }, + { + "epoch": 0.2972568578553616, + "grad_norm": 0.42627837363973475, + "learning_rate": 4.120916121763915e-05, + "loss": 0.9231, + "step": 5960 + }, + { + "epoch": 0.2973067331670823, + "grad_norm": 0.4917728787055283, + "learning_rate": 4.120608641785407e-05, + "loss": 0.912, + "step": 5961 + }, + { + "epoch": 0.297356608478803, + "grad_norm": 0.4603695016884036, + "learning_rate": 4.120301119517842e-05, + "loss": 0.9304, + "step": 5962 + }, + { + "epoch": 0.2974064837905237, + "grad_norm": 0.4645525997828119, + "learning_rate": 4.119993554969245e-05, + "loss": 0.8783, + "step": 5963 + }, + { + "epoch": 0.2974563591022444, + "grad_norm": 0.513560153215546, + "learning_rate": 4.1196859481476404e-05, + "loss": 0.9391, + "step": 5964 + }, + { + "epoch": 0.29750623441396506, + "grad_norm": 0.45435059496556834, + "learning_rate": 4.1193782990610564e-05, + "loss": 0.8882, + "step": 5965 + }, + { + "epoch": 0.2975561097256858, + "grad_norm": 0.6670002607661191, + "learning_rate": 4.11907060771752e-05, + "loss": 0.912, + "step": 5966 + }, + { + "epoch": 0.2976059850374065, + "grad_norm": 0.5946747138266625, + "learning_rate": 4.11876287412506e-05, + "loss": 0.9391, + "step": 5967 + }, + { + "epoch": 0.29765586034912717, + "grad_norm": 0.45086081501692726, + "learning_rate": 4.1184550982917085e-05, + "loss": 0.9027, + "step": 5968 + }, + { + "epoch": 0.2977057356608479, + "grad_norm": 0.5022062837103619, + "learning_rate": 4.118147280225495e-05, + "loss": 0.9549, + "step": 5969 + }, + { + "epoch": 0.2977556109725686, + "grad_norm": 0.4693562549158594, + "learning_rate": 4.117839419934453e-05, + "loss": 0.9077, + "step": 5970 + }, + { + "epoch": 0.29780548628428927, + "grad_norm": 0.447782891745204, + "learning_rate": 4.117531517426614e-05, + "loss": 0.9232, + "step": 5971 + }, + { + "epoch": 0.29785536159600995, + "grad_norm": 0.48393153733690686, + "learning_rate": 4.117223572710015e-05, + "loss": 0.9258, + "step": 5972 + }, + { + "epoch": 0.2979052369077307, + "grad_norm": 0.7111823552867109, + "learning_rate": 4.11691558579269e-05, + "loss": 0.9275, + "step": 5973 + }, + { + "epoch": 0.29795511221945137, + "grad_norm": 0.4917113503794101, + "learning_rate": 4.116607556682678e-05, + "loss": 0.9022, + "step": 5974 + }, + { + "epoch": 0.29800498753117205, + "grad_norm": 0.4898187233420212, + "learning_rate": 4.116299485388014e-05, + "loss": 0.9378, + "step": 5975 + }, + { + "epoch": 0.2980548628428928, + "grad_norm": 0.4516577502260686, + "learning_rate": 4.115991371916738e-05, + "loss": 0.9309, + "step": 5976 + }, + { + "epoch": 0.29810473815461347, + "grad_norm": 0.48580470238700896, + "learning_rate": 4.115683216276891e-05, + "loss": 0.8749, + "step": 5977 + }, + { + "epoch": 0.29815461346633415, + "grad_norm": 0.5096364265351222, + "learning_rate": 4.1153750184765135e-05, + "loss": 0.9205, + "step": 5978 + }, + { + "epoch": 0.2982044887780549, + "grad_norm": 0.39891586844380417, + "learning_rate": 4.1150667785236475e-05, + "loss": 0.9263, + "step": 5979 + }, + { + "epoch": 0.29825436408977557, + "grad_norm": 0.45096330087955444, + "learning_rate": 4.1147584964263366e-05, + "loss": 0.9563, + "step": 5980 + }, + { + "epoch": 0.29830423940149625, + "grad_norm": 0.419846631583294, + "learning_rate": 4.1144501721926264e-05, + "loss": 0.9117, + "step": 5981 + }, + { + "epoch": 0.29835411471321693, + "grad_norm": 1.4621780727308933, + "learning_rate": 4.11414180583056e-05, + "loss": 0.8954, + "step": 5982 + }, + { + "epoch": 0.29840399002493767, + "grad_norm": 0.5984282427504813, + "learning_rate": 4.113833397348187e-05, + "loss": 0.9476, + "step": 5983 + }, + { + "epoch": 0.29845386533665835, + "grad_norm": 0.42076439661061105, + "learning_rate": 4.113524946753553e-05, + "loss": 0.9017, + "step": 5984 + }, + { + "epoch": 0.29850374064837903, + "grad_norm": 0.48637605095666425, + "learning_rate": 4.113216454054708e-05, + "loss": 0.8995, + "step": 5985 + }, + { + "epoch": 0.29855361596009977, + "grad_norm": 0.44913821493170064, + "learning_rate": 4.112907919259701e-05, + "loss": 0.9448, + "step": 5986 + }, + { + "epoch": 0.29860349127182045, + "grad_norm": 0.43222883133703494, + "learning_rate": 4.112599342376585e-05, + "loss": 0.9294, + "step": 5987 + }, + { + "epoch": 0.29865336658354114, + "grad_norm": 0.41274605357236716, + "learning_rate": 4.11229072341341e-05, + "loss": 0.9498, + "step": 5988 + }, + { + "epoch": 0.2987032418952619, + "grad_norm": 0.42533909692326355, + "learning_rate": 4.1119820623782296e-05, + "loss": 0.9153, + "step": 5989 + }, + { + "epoch": 0.29875311720698255, + "grad_norm": 0.4718709377430599, + "learning_rate": 4.1116733592790996e-05, + "loss": 0.9018, + "step": 5990 + }, + { + "epoch": 0.29880299251870324, + "grad_norm": 0.4636682019606917, + "learning_rate": 4.1113646141240736e-05, + "loss": 0.9213, + "step": 5991 + }, + { + "epoch": 0.2988528678304239, + "grad_norm": 0.4171830002988182, + "learning_rate": 4.1110558269212106e-05, + "loss": 0.9024, + "step": 5992 + }, + { + "epoch": 0.29890274314214466, + "grad_norm": 0.36987874043608115, + "learning_rate": 4.110746997678565e-05, + "loss": 0.9018, + "step": 5993 + }, + { + "epoch": 0.29895261845386534, + "grad_norm": 0.5178088746821605, + "learning_rate": 4.110438126404199e-05, + "loss": 0.8946, + "step": 5994 + }, + { + "epoch": 0.299002493765586, + "grad_norm": 0.3889544819516725, + "learning_rate": 4.1101292131061696e-05, + "loss": 0.9417, + "step": 5995 + }, + { + "epoch": 0.29905236907730676, + "grad_norm": 0.4086539963881995, + "learning_rate": 4.109820257792541e-05, + "loss": 0.9351, + "step": 5996 + }, + { + "epoch": 0.29910224438902744, + "grad_norm": 0.4257833091782285, + "learning_rate": 4.1095112604713706e-05, + "loss": 0.9255, + "step": 5997 + }, + { + "epoch": 0.2991521197007481, + "grad_norm": 0.43822330823809963, + "learning_rate": 4.109202221150725e-05, + "loss": 0.959, + "step": 5998 + }, + { + "epoch": 0.2992019950124688, + "grad_norm": 0.3981848112791333, + "learning_rate": 4.108893139838669e-05, + "loss": 0.937, + "step": 5999 + }, + { + "epoch": 0.29925187032418954, + "grad_norm": 0.5006293258146421, + "learning_rate": 4.108584016543265e-05, + "loss": 0.9004, + "step": 6000 + }, + { + "epoch": 0.2993017456359102, + "grad_norm": 0.452045152474858, + "learning_rate": 4.1082748512725825e-05, + "loss": 0.9334, + "step": 6001 + }, + { + "epoch": 0.2993516209476309, + "grad_norm": 0.44333886787102333, + "learning_rate": 4.107965644034686e-05, + "loss": 0.9116, + "step": 6002 + }, + { + "epoch": 0.29940149625935164, + "grad_norm": 0.4708456876927094, + "learning_rate": 4.107656394837646e-05, + "loss": 0.9142, + "step": 6003 + }, + { + "epoch": 0.2994513715710723, + "grad_norm": 0.3932620952780933, + "learning_rate": 4.107347103689532e-05, + "loss": 0.9615, + "step": 6004 + }, + { + "epoch": 0.299501246882793, + "grad_norm": 0.422259314850987, + "learning_rate": 4.1070377705984156e-05, + "loss": 0.9118, + "step": 6005 + }, + { + "epoch": 0.29955112219451374, + "grad_norm": 0.469150397815512, + "learning_rate": 4.1067283955723665e-05, + "loss": 0.8864, + "step": 6006 + }, + { + "epoch": 0.2996009975062344, + "grad_norm": 0.7531528562043435, + "learning_rate": 4.10641897861946e-05, + "loss": 0.8688, + "step": 6007 + }, + { + "epoch": 0.2996508728179551, + "grad_norm": 0.39407663268637516, + "learning_rate": 4.106109519747768e-05, + "loss": 0.9073, + "step": 6008 + }, + { + "epoch": 0.2997007481296758, + "grad_norm": 0.6452681971564226, + "learning_rate": 4.105800018965368e-05, + "loss": 0.946, + "step": 6009 + }, + { + "epoch": 0.2997506234413965, + "grad_norm": 0.43444673262288347, + "learning_rate": 4.1054904762803344e-05, + "loss": 0.8774, + "step": 6010 + }, + { + "epoch": 0.2998004987531172, + "grad_norm": 0.4061035986136193, + "learning_rate": 4.105180891700746e-05, + "loss": 0.9184, + "step": 6011 + }, + { + "epoch": 0.2998503740648379, + "grad_norm": 0.6457553398106491, + "learning_rate": 4.10487126523468e-05, + "loss": 0.9293, + "step": 6012 + }, + { + "epoch": 0.2999002493765586, + "grad_norm": 0.4358582279929002, + "learning_rate": 4.104561596890217e-05, + "loss": 0.9477, + "step": 6013 + }, + { + "epoch": 0.2999501246882793, + "grad_norm": 0.39701572320129086, + "learning_rate": 4.104251886675436e-05, + "loss": 0.8928, + "step": 6014 + }, + { + "epoch": 0.3, + "grad_norm": 0.4119719975366719, + "learning_rate": 4.103942134598422e-05, + "loss": 0.8724, + "step": 6015 + }, + { + "epoch": 0.30004987531172067, + "grad_norm": 0.4737311064371359, + "learning_rate": 4.103632340667255e-05, + "loss": 0.9023, + "step": 6016 + }, + { + "epoch": 0.3000997506234414, + "grad_norm": 1.4800405837735549, + "learning_rate": 4.1033225048900195e-05, + "loss": 0.9191, + "step": 6017 + }, + { + "epoch": 0.3001496259351621, + "grad_norm": 0.43735306959274034, + "learning_rate": 4.103012627274801e-05, + "loss": 0.9157, + "step": 6018 + }, + { + "epoch": 0.30019950124688277, + "grad_norm": 0.5531012711378877, + "learning_rate": 4.102702707829685e-05, + "loss": 0.9061, + "step": 6019 + }, + { + "epoch": 0.3002493765586035, + "grad_norm": 0.49134935210527286, + "learning_rate": 4.10239274656276e-05, + "loss": 0.9014, + "step": 6020 + }, + { + "epoch": 0.3002992518703242, + "grad_norm": 0.5406103194866432, + "learning_rate": 4.1020827434821124e-05, + "loss": 0.936, + "step": 6021 + }, + { + "epoch": 0.3003491271820449, + "grad_norm": 0.538001176690988, + "learning_rate": 4.1017726985958334e-05, + "loss": 0.9265, + "step": 6022 + }, + { + "epoch": 0.3003990024937656, + "grad_norm": 0.42929251891997844, + "learning_rate": 4.1014626119120124e-05, + "loss": 0.9087, + "step": 6023 + }, + { + "epoch": 0.3004488778054863, + "grad_norm": 0.4175975558461141, + "learning_rate": 4.1011524834387414e-05, + "loss": 0.9299, + "step": 6024 + }, + { + "epoch": 0.300498753117207, + "grad_norm": 0.3850231079781877, + "learning_rate": 4.100842313184113e-05, + "loss": 0.8874, + "step": 6025 + }, + { + "epoch": 0.30054862842892766, + "grad_norm": 0.41571229642098223, + "learning_rate": 4.1005321011562206e-05, + "loss": 0.9166, + "step": 6026 + }, + { + "epoch": 0.3005985037406484, + "grad_norm": 0.48357759386424326, + "learning_rate": 4.100221847363159e-05, + "loss": 0.9117, + "step": 6027 + }, + { + "epoch": 0.3006483790523691, + "grad_norm": 0.5005626146226877, + "learning_rate": 4.099911551813025e-05, + "loss": 0.9138, + "step": 6028 + }, + { + "epoch": 0.30069825436408976, + "grad_norm": 0.4313704725357647, + "learning_rate": 4.099601214513915e-05, + "loss": 0.8825, + "step": 6029 + }, + { + "epoch": 0.3007481296758105, + "grad_norm": 0.9500181902164164, + "learning_rate": 4.099290835473927e-05, + "loss": 0.9111, + "step": 6030 + }, + { + "epoch": 0.3007980049875312, + "grad_norm": 0.48234591792515186, + "learning_rate": 4.0989804147011604e-05, + "loss": 0.8989, + "step": 6031 + }, + { + "epoch": 0.30084788029925186, + "grad_norm": 0.45741785370726434, + "learning_rate": 4.0986699522037155e-05, + "loss": 0.976, + "step": 6032 + }, + { + "epoch": 0.3008977556109726, + "grad_norm": 0.5005733944484029, + "learning_rate": 4.098359447989694e-05, + "loss": 0.922, + "step": 6033 + }, + { + "epoch": 0.3009476309226933, + "grad_norm": 0.4757545859197697, + "learning_rate": 4.0980489020671976e-05, + "loss": 0.9021, + "step": 6034 + }, + { + "epoch": 0.30099750623441396, + "grad_norm": 0.4259448710538415, + "learning_rate": 4.097738314444331e-05, + "loss": 0.8992, + "step": 6035 + }, + { + "epoch": 0.30104738154613464, + "grad_norm": 0.46480076069344783, + "learning_rate": 4.097427685129198e-05, + "loss": 0.892, + "step": 6036 + }, + { + "epoch": 0.3010972568578554, + "grad_norm": 0.4572030237430165, + "learning_rate": 4.097117014129903e-05, + "loss": 0.8462, + "step": 6037 + }, + { + "epoch": 0.30114713216957606, + "grad_norm": 0.606870766212275, + "learning_rate": 4.096806301454556e-05, + "loss": 0.9631, + "step": 6038 + }, + { + "epoch": 0.30119700748129674, + "grad_norm": 0.4783872626441416, + "learning_rate": 4.096495547111262e-05, + "loss": 0.9428, + "step": 6039 + }, + { + "epoch": 0.3012468827930175, + "grad_norm": 0.4913745393879039, + "learning_rate": 4.0961847511081323e-05, + "loss": 0.9167, + "step": 6040 + }, + { + "epoch": 0.30129675810473816, + "grad_norm": 0.495240328910603, + "learning_rate": 4.095873913453275e-05, + "loss": 0.9473, + "step": 6041 + }, + { + "epoch": 0.30134663341645884, + "grad_norm": 0.4986873532642866, + "learning_rate": 4.095563034154803e-05, + "loss": 0.9582, + "step": 6042 + }, + { + "epoch": 0.3013965087281795, + "grad_norm": 0.4466328944915748, + "learning_rate": 4.095252113220827e-05, + "loss": 0.9469, + "step": 6043 + }, + { + "epoch": 0.30144638403990026, + "grad_norm": 0.49313160525211613, + "learning_rate": 4.094941150659461e-05, + "loss": 0.9461, + "step": 6044 + }, + { + "epoch": 0.30149625935162094, + "grad_norm": 0.4628148117618573, + "learning_rate": 4.09463014647882e-05, + "loss": 0.9509, + "step": 6045 + }, + { + "epoch": 0.3015461346633416, + "grad_norm": 0.43344538472989824, + "learning_rate": 4.094319100687019e-05, + "loss": 0.9292, + "step": 6046 + }, + { + "epoch": 0.30159600997506236, + "grad_norm": 0.7994149120143038, + "learning_rate": 4.0940080132921746e-05, + "loss": 0.9364, + "step": 6047 + }, + { + "epoch": 0.30164588528678304, + "grad_norm": 0.4031619301330941, + "learning_rate": 4.093696884302404e-05, + "loss": 0.8908, + "step": 6048 + }, + { + "epoch": 0.3016957605985037, + "grad_norm": 0.4442979716471606, + "learning_rate": 4.093385713725827e-05, + "loss": 0.9354, + "step": 6049 + }, + { + "epoch": 0.30174563591022446, + "grad_norm": 0.6436870688131882, + "learning_rate": 4.0930745015705625e-05, + "loss": 0.9436, + "step": 6050 + }, + { + "epoch": 0.30179551122194515, + "grad_norm": 0.5053729054963746, + "learning_rate": 4.092763247844732e-05, + "loss": 0.9285, + "step": 6051 + }, + { + "epoch": 0.3018453865336658, + "grad_norm": 0.4791065121086217, + "learning_rate": 4.0924519525564567e-05, + "loss": 0.921, + "step": 6052 + }, + { + "epoch": 0.3018952618453865, + "grad_norm": 0.40872463265395936, + "learning_rate": 4.092140615713861e-05, + "loss": 0.9156, + "step": 6053 + }, + { + "epoch": 0.30194513715710725, + "grad_norm": 0.4366917408834576, + "learning_rate": 4.091829237325069e-05, + "loss": 0.9015, + "step": 6054 + }, + { + "epoch": 0.30199501246882793, + "grad_norm": 0.5287240007094806, + "learning_rate": 4.091517817398205e-05, + "loss": 0.9477, + "step": 6055 + }, + { + "epoch": 0.3020448877805486, + "grad_norm": 0.504318042345179, + "learning_rate": 4.0912063559413955e-05, + "loss": 0.9365, + "step": 6056 + }, + { + "epoch": 0.30209476309226935, + "grad_norm": 0.4732400904690992, + "learning_rate": 4.090894852962769e-05, + "loss": 0.9102, + "step": 6057 + }, + { + "epoch": 0.30214463840399003, + "grad_norm": 0.42889421621341356, + "learning_rate": 4.090583308470453e-05, + "loss": 0.8955, + "step": 6058 + }, + { + "epoch": 0.3021945137157107, + "grad_norm": 0.5780391381325395, + "learning_rate": 4.0902717224725766e-05, + "loss": 0.9128, + "step": 6059 + }, + { + "epoch": 0.30224438902743145, + "grad_norm": 0.41808627014579575, + "learning_rate": 4.089960094977272e-05, + "loss": 0.8995, + "step": 6060 + }, + { + "epoch": 0.30229426433915213, + "grad_norm": 0.4883429902880531, + "learning_rate": 4.0896484259926703e-05, + "loss": 0.8876, + "step": 6061 + }, + { + "epoch": 0.3023441396508728, + "grad_norm": 0.5056243295982524, + "learning_rate": 4.089336715526905e-05, + "loss": 0.8641, + "step": 6062 + }, + { + "epoch": 0.3023940149625935, + "grad_norm": 0.41531146266629965, + "learning_rate": 4.089024963588108e-05, + "loss": 0.9613, + "step": 6063 + }, + { + "epoch": 0.30244389027431423, + "grad_norm": 0.414119913197586, + "learning_rate": 4.088713170184417e-05, + "loss": 0.9352, + "step": 6064 + }, + { + "epoch": 0.3024937655860349, + "grad_norm": 1.71706393468692, + "learning_rate": 4.088401335323966e-05, + "loss": 0.9185, + "step": 6065 + }, + { + "epoch": 0.3025436408977556, + "grad_norm": 0.4139571122966364, + "learning_rate": 4.0880894590148936e-05, + "loss": 0.904, + "step": 6066 + }, + { + "epoch": 0.30259351620947633, + "grad_norm": 0.5503912941949908, + "learning_rate": 4.0877775412653366e-05, + "loss": 0.9366, + "step": 6067 + }, + { + "epoch": 0.302643391521197, + "grad_norm": 0.4807735344861686, + "learning_rate": 4.087465582083436e-05, + "loss": 0.9343, + "step": 6068 + }, + { + "epoch": 0.3026932668329177, + "grad_norm": 0.43277189275195577, + "learning_rate": 4.087153581477331e-05, + "loss": 0.8976, + "step": 6069 + }, + { + "epoch": 0.3027431421446384, + "grad_norm": 0.4922518260521993, + "learning_rate": 4.0868415394551644e-05, + "loss": 0.912, + "step": 6070 + }, + { + "epoch": 0.3027930174563591, + "grad_norm": 0.4043042599905781, + "learning_rate": 4.086529456025078e-05, + "loss": 0.8966, + "step": 6071 + }, + { + "epoch": 0.3028428927680798, + "grad_norm": 0.4640877750313705, + "learning_rate": 4.086217331195215e-05, + "loss": 0.9139, + "step": 6072 + }, + { + "epoch": 0.3028927680798005, + "grad_norm": 0.4280696146969614, + "learning_rate": 4.0859051649737204e-05, + "loss": 0.944, + "step": 6073 + }, + { + "epoch": 0.3029426433915212, + "grad_norm": 0.6081935168556444, + "learning_rate": 4.085592957368741e-05, + "loss": 0.929, + "step": 6074 + }, + { + "epoch": 0.3029925187032419, + "grad_norm": 0.5503469860874396, + "learning_rate": 4.085280708388422e-05, + "loss": 0.9488, + "step": 6075 + }, + { + "epoch": 0.3030423940149626, + "grad_norm": 0.5555626341407888, + "learning_rate": 4.084968418040913e-05, + "loss": 0.8807, + "step": 6076 + }, + { + "epoch": 0.3030922693266833, + "grad_norm": 0.4339579090588012, + "learning_rate": 4.084656086334363e-05, + "loss": 0.9125, + "step": 6077 + }, + { + "epoch": 0.303142144638404, + "grad_norm": 0.5867937179560587, + "learning_rate": 4.08434371327692e-05, + "loss": 1.003, + "step": 6078 + }, + { + "epoch": 0.3031920199501247, + "grad_norm": 0.4994457398540402, + "learning_rate": 4.0840312988767386e-05, + "loss": 0.8875, + "step": 6079 + }, + { + "epoch": 0.30324189526184536, + "grad_norm": 0.4064956907850942, + "learning_rate": 4.083718843141969e-05, + "loss": 0.9261, + "step": 6080 + }, + { + "epoch": 0.3032917705735661, + "grad_norm": 0.5814259685896405, + "learning_rate": 4.083406346080765e-05, + "loss": 0.9226, + "step": 6081 + }, + { + "epoch": 0.3033416458852868, + "grad_norm": 0.45339288105746234, + "learning_rate": 4.0830938077012804e-05, + "loss": 0.9237, + "step": 6082 + }, + { + "epoch": 0.30339152119700746, + "grad_norm": 0.4667331870249291, + "learning_rate": 4.082781228011672e-05, + "loss": 0.9601, + "step": 6083 + }, + { + "epoch": 0.3034413965087282, + "grad_norm": 0.4979643930109153, + "learning_rate": 4.082468607020095e-05, + "loss": 0.9399, + "step": 6084 + }, + { + "epoch": 0.3034912718204489, + "grad_norm": 0.3853703122159309, + "learning_rate": 4.0821559447347085e-05, + "loss": 0.9242, + "step": 6085 + }, + { + "epoch": 0.30354114713216956, + "grad_norm": 0.5157799547626912, + "learning_rate": 4.081843241163671e-05, + "loss": 0.9294, + "step": 6086 + }, + { + "epoch": 0.3035910224438903, + "grad_norm": 0.4717013130260783, + "learning_rate": 4.081530496315141e-05, + "loss": 0.8955, + "step": 6087 + }, + { + "epoch": 0.303640897755611, + "grad_norm": 0.4376794412485712, + "learning_rate": 4.081217710197281e-05, + "loss": 0.9209, + "step": 6088 + }, + { + "epoch": 0.30369077306733167, + "grad_norm": 0.4269854214470251, + "learning_rate": 4.0809048828182534e-05, + "loss": 0.9055, + "step": 6089 + }, + { + "epoch": 0.30374064837905235, + "grad_norm": 0.4247218290860702, + "learning_rate": 4.08059201418622e-05, + "loss": 0.9409, + "step": 6090 + }, + { + "epoch": 0.3037905236907731, + "grad_norm": 0.4068279110432387, + "learning_rate": 4.080279104309345e-05, + "loss": 0.9479, + "step": 6091 + }, + { + "epoch": 0.30384039900249377, + "grad_norm": 0.4744094659569346, + "learning_rate": 4.0799661531957936e-05, + "loss": 0.8991, + "step": 6092 + }, + { + "epoch": 0.30389027431421445, + "grad_norm": 0.39873562763952836, + "learning_rate": 4.0796531608537336e-05, + "loss": 0.9098, + "step": 6093 + }, + { + "epoch": 0.3039401496259352, + "grad_norm": 0.3855143511196882, + "learning_rate": 4.07934012729133e-05, + "loss": 0.9251, + "step": 6094 + }, + { + "epoch": 0.30399002493765587, + "grad_norm": 0.47174749348701184, + "learning_rate": 4.079027052516754e-05, + "loss": 0.9085, + "step": 6095 + }, + { + "epoch": 0.30403990024937655, + "grad_norm": 0.4909354559759538, + "learning_rate": 4.078713936538173e-05, + "loss": 0.9202, + "step": 6096 + }, + { + "epoch": 0.30408977556109723, + "grad_norm": 0.42697860696064616, + "learning_rate": 4.078400779363758e-05, + "loss": 0.8578, + "step": 6097 + }, + { + "epoch": 0.30413965087281797, + "grad_norm": 0.39628300159844426, + "learning_rate": 4.0780875810016814e-05, + "loss": 0.8801, + "step": 6098 + }, + { + "epoch": 0.30418952618453865, + "grad_norm": 0.7253423144212056, + "learning_rate": 4.0777743414601166e-05, + "loss": 0.9419, + "step": 6099 + }, + { + "epoch": 0.30423940149625933, + "grad_norm": 0.525274645054494, + "learning_rate": 4.077461060747235e-05, + "loss": 0.8809, + "step": 6100 + }, + { + "epoch": 0.30428927680798007, + "grad_norm": 0.4686601284766734, + "learning_rate": 4.0771477388712145e-05, + "loss": 0.9148, + "step": 6101 + }, + { + "epoch": 0.30433915211970075, + "grad_norm": 0.4208977903282031, + "learning_rate": 4.0768343758402284e-05, + "loss": 0.9483, + "step": 6102 + }, + { + "epoch": 0.30438902743142143, + "grad_norm": 0.5604218513100654, + "learning_rate": 4.076520971662455e-05, + "loss": 0.9197, + "step": 6103 + }, + { + "epoch": 0.30443890274314217, + "grad_norm": 0.49845574250066166, + "learning_rate": 4.0762075263460734e-05, + "loss": 0.9078, + "step": 6104 + }, + { + "epoch": 0.30448877805486285, + "grad_norm": 0.416848861464122, + "learning_rate": 4.075894039899261e-05, + "loss": 0.8986, + "step": 6105 + }, + { + "epoch": 0.30453865336658353, + "grad_norm": 0.46510673216930737, + "learning_rate": 4.075580512330199e-05, + "loss": 0.9716, + "step": 6106 + }, + { + "epoch": 0.3045885286783042, + "grad_norm": 0.39729715481189704, + "learning_rate": 4.0752669436470686e-05, + "loss": 0.9312, + "step": 6107 + }, + { + "epoch": 0.30463840399002495, + "grad_norm": 0.45881313421001757, + "learning_rate": 4.0749533338580526e-05, + "loss": 0.9021, + "step": 6108 + }, + { + "epoch": 0.30468827930174563, + "grad_norm": 0.6952875586286696, + "learning_rate": 4.074639682971334e-05, + "loss": 0.9303, + "step": 6109 + }, + { + "epoch": 0.3047381546134663, + "grad_norm": 0.6403264345898048, + "learning_rate": 4.074325990995097e-05, + "loss": 0.9364, + "step": 6110 + }, + { + "epoch": 0.30478802992518705, + "grad_norm": 0.3908387786318842, + "learning_rate": 4.0740122579375286e-05, + "loss": 0.9031, + "step": 6111 + }, + { + "epoch": 0.30483790523690774, + "grad_norm": 0.4551464603276825, + "learning_rate": 4.073698483806815e-05, + "loss": 0.9511, + "step": 6112 + }, + { + "epoch": 0.3048877805486284, + "grad_norm": 0.47308365343544834, + "learning_rate": 4.073384668611143e-05, + "loss": 0.935, + "step": 6113 + }, + { + "epoch": 0.30493765586034915, + "grad_norm": 0.47032312922970776, + "learning_rate": 4.073070812358703e-05, + "loss": 0.9194, + "step": 6114 + }, + { + "epoch": 0.30498753117206984, + "grad_norm": 0.5910512296876339, + "learning_rate": 4.072756915057683e-05, + "loss": 0.9161, + "step": 6115 + }, + { + "epoch": 0.3050374064837905, + "grad_norm": 0.7024219002718328, + "learning_rate": 4.072442976716277e-05, + "loss": 0.943, + "step": 6116 + }, + { + "epoch": 0.3050872817955112, + "grad_norm": 0.9120380415435279, + "learning_rate": 4.072128997342673e-05, + "loss": 0.9459, + "step": 6117 + }, + { + "epoch": 0.30513715710723194, + "grad_norm": 0.37445895090263204, + "learning_rate": 4.0718149769450685e-05, + "loss": 0.9104, + "step": 6118 + }, + { + "epoch": 0.3051870324189526, + "grad_norm": 0.42848885311216217, + "learning_rate": 4.0715009155316546e-05, + "loss": 0.9135, + "step": 6119 + }, + { + "epoch": 0.3052369077306733, + "grad_norm": 0.5686484783027611, + "learning_rate": 4.071186813110628e-05, + "loss": 0.9138, + "step": 6120 + }, + { + "epoch": 0.30528678304239404, + "grad_norm": 0.6715140717430561, + "learning_rate": 4.070872669690184e-05, + "loss": 0.9055, + "step": 6121 + }, + { + "epoch": 0.3053366583541147, + "grad_norm": 0.4627582035792053, + "learning_rate": 4.070558485278521e-05, + "loss": 0.899, + "step": 6122 + }, + { + "epoch": 0.3053865336658354, + "grad_norm": 0.5012717492974182, + "learning_rate": 4.0702442598838375e-05, + "loss": 0.9113, + "step": 6123 + }, + { + "epoch": 0.3054364089775561, + "grad_norm": 0.616264370579399, + "learning_rate": 4.069929993514333e-05, + "loss": 0.88, + "step": 6124 + }, + { + "epoch": 0.3054862842892768, + "grad_norm": 0.6569868309759451, + "learning_rate": 4.069615686178207e-05, + "loss": 0.8869, + "step": 6125 + }, + { + "epoch": 0.3055361596009975, + "grad_norm": 0.5580539062097308, + "learning_rate": 4.069301337883663e-05, + "loss": 0.89, + "step": 6126 + }, + { + "epoch": 0.3055860349127182, + "grad_norm": 0.7058604922866294, + "learning_rate": 4.068986948638903e-05, + "loss": 0.8845, + "step": 6127 + }, + { + "epoch": 0.3056359102244389, + "grad_norm": 0.48785228591817825, + "learning_rate": 4.06867251845213e-05, + "loss": 0.924, + "step": 6128 + }, + { + "epoch": 0.3056857855361596, + "grad_norm": 0.5876214898461684, + "learning_rate": 4.06835804733155e-05, + "loss": 0.9171, + "step": 6129 + }, + { + "epoch": 0.3057356608478803, + "grad_norm": 0.5009840754214133, + "learning_rate": 4.068043535285369e-05, + "loss": 0.8955, + "step": 6130 + }, + { + "epoch": 0.305785536159601, + "grad_norm": 0.5782360211886716, + "learning_rate": 4.067728982321793e-05, + "loss": 0.9553, + "step": 6131 + }, + { + "epoch": 0.3058354114713217, + "grad_norm": 0.8531619719321545, + "learning_rate": 4.067414388449032e-05, + "loss": 0.9424, + "step": 6132 + }, + { + "epoch": 0.3058852867830424, + "grad_norm": 0.7454091177878598, + "learning_rate": 4.067099753675293e-05, + "loss": 0.9409, + "step": 6133 + }, + { + "epoch": 0.30593516209476307, + "grad_norm": 0.6338429051967696, + "learning_rate": 4.066785078008788e-05, + "loss": 0.9311, + "step": 6134 + }, + { + "epoch": 0.3059850374064838, + "grad_norm": 0.477618051506546, + "learning_rate": 4.066470361457726e-05, + "loss": 0.9506, + "step": 6135 + }, + { + "epoch": 0.3060349127182045, + "grad_norm": 0.4950808760302034, + "learning_rate": 4.066155604030323e-05, + "loss": 0.9157, + "step": 6136 + }, + { + "epoch": 0.30608478802992517, + "grad_norm": 0.4397909947977637, + "learning_rate": 4.06584080573479e-05, + "loss": 0.9258, + "step": 6137 + }, + { + "epoch": 0.3061346633416459, + "grad_norm": 0.43631912244969456, + "learning_rate": 4.065525966579341e-05, + "loss": 0.9338, + "step": 6138 + }, + { + "epoch": 0.3061845386533666, + "grad_norm": 0.5036753082313836, + "learning_rate": 4.065211086572193e-05, + "loss": 0.9376, + "step": 6139 + }, + { + "epoch": 0.30623441396508727, + "grad_norm": 0.5716067927296011, + "learning_rate": 4.0648961657215625e-05, + "loss": 0.9098, + "step": 6140 + }, + { + "epoch": 0.30628428927680795, + "grad_norm": 0.4253101786177321, + "learning_rate": 4.064581204035667e-05, + "loss": 0.8807, + "step": 6141 + }, + { + "epoch": 0.3063341645885287, + "grad_norm": 0.5018209050128163, + "learning_rate": 4.0642662015227246e-05, + "loss": 0.9457, + "step": 6142 + }, + { + "epoch": 0.30638403990024937, + "grad_norm": 0.7797779787988138, + "learning_rate": 4.063951158190956e-05, + "loss": 0.8625, + "step": 6143 + }, + { + "epoch": 0.30643391521197005, + "grad_norm": 0.5038274285692429, + "learning_rate": 4.063636074048581e-05, + "loss": 0.9778, + "step": 6144 + }, + { + "epoch": 0.3064837905236908, + "grad_norm": 0.5893605220239464, + "learning_rate": 4.063320949103824e-05, + "loss": 0.8967, + "step": 6145 + }, + { + "epoch": 0.3065336658354115, + "grad_norm": 0.5281860665997362, + "learning_rate": 4.063005783364906e-05, + "loss": 0.9121, + "step": 6146 + }, + { + "epoch": 0.30658354114713215, + "grad_norm": 1.3800003743939546, + "learning_rate": 4.0626905768400516e-05, + "loss": 0.9248, + "step": 6147 + }, + { + "epoch": 0.3066334164588529, + "grad_norm": 0.5365889884523707, + "learning_rate": 4.0623753295374856e-05, + "loss": 0.9014, + "step": 6148 + }, + { + "epoch": 0.3066832917705736, + "grad_norm": 0.4994741552379272, + "learning_rate": 4.062060041465434e-05, + "loss": 0.9227, + "step": 6149 + }, + { + "epoch": 0.30673316708229426, + "grad_norm": 0.47247092088182036, + "learning_rate": 4.061744712632126e-05, + "loss": 0.891, + "step": 6150 + }, + { + "epoch": 0.30678304239401494, + "grad_norm": 0.48981815108791005, + "learning_rate": 4.0614293430457884e-05, + "loss": 0.9285, + "step": 6151 + }, + { + "epoch": 0.3068329177057357, + "grad_norm": 0.5021862812006168, + "learning_rate": 4.0611139327146494e-05, + "loss": 0.9054, + "step": 6152 + }, + { + "epoch": 0.30688279301745636, + "grad_norm": 0.5143799801574528, + "learning_rate": 4.060798481646943e-05, + "loss": 0.8616, + "step": 6153 + }, + { + "epoch": 0.30693266832917704, + "grad_norm": 0.5386001124686521, + "learning_rate": 4.0604829898508976e-05, + "loss": 0.9184, + "step": 6154 + }, + { + "epoch": 0.3069825436408978, + "grad_norm": 0.5153541099561118, + "learning_rate": 4.060167457334748e-05, + "loss": 0.9235, + "step": 6155 + }, + { + "epoch": 0.30703241895261846, + "grad_norm": 0.5223692549855526, + "learning_rate": 4.0598518841067264e-05, + "loss": 0.9627, + "step": 6156 + }, + { + "epoch": 0.30708229426433914, + "grad_norm": 0.5442664761589284, + "learning_rate": 4.059536270175067e-05, + "loss": 0.9729, + "step": 6157 + }, + { + "epoch": 0.3071321695760599, + "grad_norm": 0.5019642706147299, + "learning_rate": 4.059220615548008e-05, + "loss": 0.9258, + "step": 6158 + }, + { + "epoch": 0.30718204488778056, + "grad_norm": 0.731038424785169, + "learning_rate": 4.0589049202337836e-05, + "loss": 0.913, + "step": 6159 + }, + { + "epoch": 0.30723192019950124, + "grad_norm": 0.5745286295815556, + "learning_rate": 4.058589184240634e-05, + "loss": 0.935, + "step": 6160 + }, + { + "epoch": 0.3072817955112219, + "grad_norm": 0.5318844209819479, + "learning_rate": 4.0582734075767964e-05, + "loss": 0.9216, + "step": 6161 + }, + { + "epoch": 0.30733167082294266, + "grad_norm": 0.5343597296951166, + "learning_rate": 4.057957590250512e-05, + "loss": 0.8563, + "step": 6162 + }, + { + "epoch": 0.30738154613466334, + "grad_norm": 0.43688750952064415, + "learning_rate": 4.057641732270021e-05, + "loss": 0.8899, + "step": 6163 + }, + { + "epoch": 0.307431421446384, + "grad_norm": 0.714065166180672, + "learning_rate": 4.057325833643567e-05, + "loss": 0.9209, + "step": 6164 + }, + { + "epoch": 0.30748129675810476, + "grad_norm": 0.5793185938486027, + "learning_rate": 4.057009894379392e-05, + "loss": 0.9114, + "step": 6165 + }, + { + "epoch": 0.30753117206982544, + "grad_norm": 0.5271714808829389, + "learning_rate": 4.0566939144857406e-05, + "loss": 0.9544, + "step": 6166 + }, + { + "epoch": 0.3075810473815461, + "grad_norm": 1.2567122947594203, + "learning_rate": 4.056377893970858e-05, + "loss": 0.8895, + "step": 6167 + }, + { + "epoch": 0.3076309226932668, + "grad_norm": 0.6174064205682116, + "learning_rate": 4.056061832842991e-05, + "loss": 0.9298, + "step": 6168 + }, + { + "epoch": 0.30768079800498754, + "grad_norm": 0.4624251816926701, + "learning_rate": 4.055745731110387e-05, + "loss": 0.9129, + "step": 6169 + }, + { + "epoch": 0.3077306733167082, + "grad_norm": 0.6085186904022614, + "learning_rate": 4.055429588781294e-05, + "loss": 0.9349, + "step": 6170 + }, + { + "epoch": 0.3077805486284289, + "grad_norm": 0.8012982965504462, + "learning_rate": 4.055113405863963e-05, + "loss": 0.9104, + "step": 6171 + }, + { + "epoch": 0.30783042394014964, + "grad_norm": 0.5830039640170381, + "learning_rate": 4.054797182366642e-05, + "loss": 0.8875, + "step": 6172 + }, + { + "epoch": 0.3078802992518703, + "grad_norm": 0.6860384130283255, + "learning_rate": 4.054480918297586e-05, + "loss": 0.9146, + "step": 6173 + }, + { + "epoch": 0.307930174563591, + "grad_norm": 0.6666698328470018, + "learning_rate": 4.054164613665045e-05, + "loss": 0.9086, + "step": 6174 + }, + { + "epoch": 0.30798004987531175, + "grad_norm": 0.5240307347014321, + "learning_rate": 4.053848268477274e-05, + "loss": 0.8989, + "step": 6175 + }, + { + "epoch": 0.3080299251870324, + "grad_norm": 0.5686725837782333, + "learning_rate": 4.053531882742529e-05, + "loss": 0.8702, + "step": 6176 + }, + { + "epoch": 0.3080798004987531, + "grad_norm": 0.8801785510674763, + "learning_rate": 4.053215456469064e-05, + "loss": 0.9059, + "step": 6177 + }, + { + "epoch": 0.3081296758104738, + "grad_norm": 0.9349182325080061, + "learning_rate": 4.0528989896651366e-05, + "loss": 0.9384, + "step": 6178 + }, + { + "epoch": 0.30817955112219453, + "grad_norm": 1.907229572532599, + "learning_rate": 4.0525824823390045e-05, + "loss": 0.9442, + "step": 6179 + }, + { + "epoch": 0.3082294264339152, + "grad_norm": 0.6545442858615584, + "learning_rate": 4.052265934498929e-05, + "loss": 0.8988, + "step": 6180 + }, + { + "epoch": 0.3082793017456359, + "grad_norm": 0.7800313110964741, + "learning_rate": 4.051949346153167e-05, + "loss": 0.9395, + "step": 6181 + }, + { + "epoch": 0.30832917705735663, + "grad_norm": 0.7183917341701279, + "learning_rate": 4.051632717309982e-05, + "loss": 0.9271, + "step": 6182 + }, + { + "epoch": 0.3083790523690773, + "grad_norm": 0.8437172336298071, + "learning_rate": 4.0513160479776355e-05, + "loss": 0.8395, + "step": 6183 + }, + { + "epoch": 0.308428927680798, + "grad_norm": 1.0628634580996559, + "learning_rate": 4.050999338164392e-05, + "loss": 0.892, + "step": 6184 + }, + { + "epoch": 0.30847880299251873, + "grad_norm": 0.6221163498461184, + "learning_rate": 4.0506825878785134e-05, + "loss": 0.9008, + "step": 6185 + }, + { + "epoch": 0.3085286783042394, + "grad_norm": 0.775635967133276, + "learning_rate": 4.0503657971282684e-05, + "loss": 0.9224, + "step": 6186 + }, + { + "epoch": 0.3085785536159601, + "grad_norm": 0.515185115691144, + "learning_rate": 4.0500489659219206e-05, + "loss": 0.9291, + "step": 6187 + }, + { + "epoch": 0.3086284289276808, + "grad_norm": 0.6420191652641094, + "learning_rate": 4.0497320942677384e-05, + "loss": 0.9293, + "step": 6188 + }, + { + "epoch": 0.3086783042394015, + "grad_norm": 0.7661452536262915, + "learning_rate": 4.049415182173992e-05, + "loss": 0.9828, + "step": 6189 + }, + { + "epoch": 0.3087281795511222, + "grad_norm": 0.5166034194793515, + "learning_rate": 4.049098229648949e-05, + "loss": 0.9653, + "step": 6190 + }, + { + "epoch": 0.3087780548628429, + "grad_norm": 0.47240033776212026, + "learning_rate": 4.0487812367008814e-05, + "loss": 0.9379, + "step": 6191 + }, + { + "epoch": 0.3088279301745636, + "grad_norm": 0.6631323703437995, + "learning_rate": 4.04846420333806e-05, + "loss": 0.9579, + "step": 6192 + }, + { + "epoch": 0.3088778054862843, + "grad_norm": 0.5702701053713745, + "learning_rate": 4.048147129568759e-05, + "loss": 0.8901, + "step": 6193 + }, + { + "epoch": 0.308927680798005, + "grad_norm": 0.5291877673258095, + "learning_rate": 4.0478300154012506e-05, + "loss": 0.9817, + "step": 6194 + }, + { + "epoch": 0.30897755610972566, + "grad_norm": 0.5000252373641402, + "learning_rate": 4.0475128608438116e-05, + "loss": 0.8805, + "step": 6195 + }, + { + "epoch": 0.3090274314214464, + "grad_norm": 0.6603427351758939, + "learning_rate": 4.047195665904716e-05, + "loss": 0.9087, + "step": 6196 + }, + { + "epoch": 0.3090773067331671, + "grad_norm": 1.0911030618921145, + "learning_rate": 4.046878430592243e-05, + "loss": 0.945, + "step": 6197 + }, + { + "epoch": 0.30912718204488776, + "grad_norm": 0.6317981778271188, + "learning_rate": 4.0465611549146684e-05, + "loss": 0.8896, + "step": 6198 + }, + { + "epoch": 0.3091770573566085, + "grad_norm": 0.5224754772659019, + "learning_rate": 4.0462438388802744e-05, + "loss": 0.8767, + "step": 6199 + }, + { + "epoch": 0.3092269326683292, + "grad_norm": 0.6752692464410094, + "learning_rate": 4.045926482497338e-05, + "loss": 0.9467, + "step": 6200 + }, + { + "epoch": 0.30927680798004986, + "grad_norm": 0.9888573973609365, + "learning_rate": 4.045609085774142e-05, + "loss": 0.9181, + "step": 6201 + }, + { + "epoch": 0.3093266832917706, + "grad_norm": 0.9573694280619335, + "learning_rate": 4.045291648718969e-05, + "loss": 0.8827, + "step": 6202 + }, + { + "epoch": 0.3093765586034913, + "grad_norm": 0.9033278148707142, + "learning_rate": 4.0449741713401024e-05, + "loss": 0.9136, + "step": 6203 + }, + { + "epoch": 0.30942643391521196, + "grad_norm": 0.4738119054921708, + "learning_rate": 4.0446566536458255e-05, + "loss": 0.9034, + "step": 6204 + }, + { + "epoch": 0.30947630922693264, + "grad_norm": 0.8699095793434324, + "learning_rate": 4.0443390956444246e-05, + "loss": 0.9251, + "step": 6205 + }, + { + "epoch": 0.3095261845386534, + "grad_norm": 0.747868259860259, + "learning_rate": 4.044021497344187e-05, + "loss": 0.9323, + "step": 6206 + }, + { + "epoch": 0.30957605985037406, + "grad_norm": 0.7232704149904398, + "learning_rate": 4.0437038587533985e-05, + "loss": 0.9039, + "step": 6207 + }, + { + "epoch": 0.30962593516209475, + "grad_norm": 0.6295971296373575, + "learning_rate": 4.0433861798803494e-05, + "loss": 0.9152, + "step": 6208 + }, + { + "epoch": 0.3096758104738155, + "grad_norm": 0.7849011396758241, + "learning_rate": 4.043068460733328e-05, + "loss": 0.9077, + "step": 6209 + }, + { + "epoch": 0.30972568578553616, + "grad_norm": 0.6478754681582475, + "learning_rate": 4.0427507013206265e-05, + "loss": 0.8612, + "step": 6210 + }, + { + "epoch": 0.30977556109725685, + "grad_norm": 0.7105446640182058, + "learning_rate": 4.042432901650535e-05, + "loss": 0.9162, + "step": 6211 + }, + { + "epoch": 0.3098254364089776, + "grad_norm": 0.6218728141886525, + "learning_rate": 4.0421150617313473e-05, + "loss": 0.9672, + "step": 6212 + }, + { + "epoch": 0.30987531172069827, + "grad_norm": 0.6209235780416265, + "learning_rate": 4.0417971815713584e-05, + "loss": 0.9151, + "step": 6213 + }, + { + "epoch": 0.30992518703241895, + "grad_norm": 1.0868625101755336, + "learning_rate": 4.041479261178861e-05, + "loss": 0.8619, + "step": 6214 + }, + { + "epoch": 0.30997506234413963, + "grad_norm": 0.5817332342173287, + "learning_rate": 4.041161300562152e-05, + "loss": 0.9627, + "step": 6215 + }, + { + "epoch": 0.31002493765586037, + "grad_norm": 0.548758753166275, + "learning_rate": 4.040843299729529e-05, + "loss": 0.9436, + "step": 6216 + }, + { + "epoch": 0.31007481296758105, + "grad_norm": 0.6570556867490369, + "learning_rate": 4.040525258689289e-05, + "loss": 0.913, + "step": 6217 + }, + { + "epoch": 0.31012468827930173, + "grad_norm": 0.7589399653344611, + "learning_rate": 4.040207177449733e-05, + "loss": 0.9111, + "step": 6218 + }, + { + "epoch": 0.31017456359102247, + "grad_norm": 0.5435636466458615, + "learning_rate": 4.039889056019159e-05, + "loss": 0.8848, + "step": 6219 + }, + { + "epoch": 0.31022443890274315, + "grad_norm": 0.643297690888915, + "learning_rate": 4.03957089440587e-05, + "loss": 0.9069, + "step": 6220 + }, + { + "epoch": 0.31027431421446383, + "grad_norm": 0.5966569857245484, + "learning_rate": 4.039252692618167e-05, + "loss": 0.9202, + "step": 6221 + }, + { + "epoch": 0.3103241895261845, + "grad_norm": 0.7892144606924862, + "learning_rate": 4.0389344506643545e-05, + "loss": 0.9431, + "step": 6222 + }, + { + "epoch": 0.31037406483790525, + "grad_norm": 0.5267869792310997, + "learning_rate": 4.0386161685527355e-05, + "loss": 0.901, + "step": 6223 + }, + { + "epoch": 0.31042394014962593, + "grad_norm": 0.8397046880732322, + "learning_rate": 4.038297846291617e-05, + "loss": 0.9304, + "step": 6224 + }, + { + "epoch": 0.3104738154613466, + "grad_norm": 0.7586383359844013, + "learning_rate": 4.037979483889304e-05, + "loss": 0.9173, + "step": 6225 + }, + { + "epoch": 0.31052369077306735, + "grad_norm": 0.7101632421536418, + "learning_rate": 4.037661081354105e-05, + "loss": 0.9122, + "step": 6226 + }, + { + "epoch": 0.31057356608478803, + "grad_norm": 1.0232352098547528, + "learning_rate": 4.037342638694328e-05, + "loss": 0.9011, + "step": 6227 + }, + { + "epoch": 0.3106234413965087, + "grad_norm": 0.7279694661820418, + "learning_rate": 4.037024155918283e-05, + "loss": 1.0013, + "step": 6228 + }, + { + "epoch": 0.31067331670822945, + "grad_norm": 0.5928224387588751, + "learning_rate": 4.0367056330342803e-05, + "loss": 0.9248, + "step": 6229 + }, + { + "epoch": 0.31072319201995013, + "grad_norm": 1.2109901979329998, + "learning_rate": 4.036387070050632e-05, + "loss": 0.9634, + "step": 6230 + }, + { + "epoch": 0.3107730673316708, + "grad_norm": 0.580708099019875, + "learning_rate": 4.036068466975651e-05, + "loss": 0.8949, + "step": 6231 + }, + { + "epoch": 0.3108229426433915, + "grad_norm": 0.6736147619709945, + "learning_rate": 4.035749823817651e-05, + "loss": 0.8833, + "step": 6232 + }, + { + "epoch": 0.31087281795511224, + "grad_norm": 0.5924202527429901, + "learning_rate": 4.035431140584946e-05, + "loss": 0.9291, + "step": 6233 + }, + { + "epoch": 0.3109226932668329, + "grad_norm": 1.2428204786613506, + "learning_rate": 4.0351124172858526e-05, + "loss": 0.9059, + "step": 6234 + }, + { + "epoch": 0.3109725685785536, + "grad_norm": 0.6938041471700048, + "learning_rate": 4.0347936539286876e-05, + "loss": 0.8854, + "step": 6235 + }, + { + "epoch": 0.31102244389027434, + "grad_norm": 0.643135242396645, + "learning_rate": 4.03447485052177e-05, + "loss": 0.8543, + "step": 6236 + }, + { + "epoch": 0.311072319201995, + "grad_norm": 0.5452552613639947, + "learning_rate": 4.0341560070734174e-05, + "loss": 0.9592, + "step": 6237 + }, + { + "epoch": 0.3111221945137157, + "grad_norm": 0.48389426726202994, + "learning_rate": 4.03383712359195e-05, + "loss": 0.8988, + "step": 6238 + }, + { + "epoch": 0.3111720698254364, + "grad_norm": 0.6938095845947098, + "learning_rate": 4.0335182000856896e-05, + "loss": 0.9149, + "step": 6239 + }, + { + "epoch": 0.3112219451371571, + "grad_norm": 0.6516981066629076, + "learning_rate": 4.033199236562958e-05, + "loss": 0.9106, + "step": 6240 + }, + { + "epoch": 0.3112718204488778, + "grad_norm": 0.6632354743465554, + "learning_rate": 4.032880233032079e-05, + "loss": 0.905, + "step": 6241 + }, + { + "epoch": 0.3113216957605985, + "grad_norm": 0.6536739905488583, + "learning_rate": 4.032561189501376e-05, + "loss": 0.9005, + "step": 6242 + }, + { + "epoch": 0.3113715710723192, + "grad_norm": 0.6648004547328004, + "learning_rate": 4.032242105979174e-05, + "loss": 0.9113, + "step": 6243 + }, + { + "epoch": 0.3114214463840399, + "grad_norm": 0.662444691525182, + "learning_rate": 4.0319229824738e-05, + "loss": 0.9105, + "step": 6244 + }, + { + "epoch": 0.3114713216957606, + "grad_norm": 0.7411250690355655, + "learning_rate": 4.031603818993583e-05, + "loss": 0.9071, + "step": 6245 + }, + { + "epoch": 0.3115211970074813, + "grad_norm": 0.581726602367653, + "learning_rate": 4.031284615546849e-05, + "loss": 0.9243, + "step": 6246 + }, + { + "epoch": 0.311571072319202, + "grad_norm": 0.8774503539159696, + "learning_rate": 4.030965372141927e-05, + "loss": 0.8751, + "step": 6247 + }, + { + "epoch": 0.3116209476309227, + "grad_norm": 0.8096344346819254, + "learning_rate": 4.0306460887871504e-05, + "loss": 0.9265, + "step": 6248 + }, + { + "epoch": 0.31167082294264337, + "grad_norm": 0.7125679118062691, + "learning_rate": 4.030326765490848e-05, + "loss": 0.8793, + "step": 6249 + }, + { + "epoch": 0.3117206982543641, + "grad_norm": 0.6105251052533223, + "learning_rate": 4.030007402261354e-05, + "loss": 0.9189, + "step": 6250 + }, + { + "epoch": 0.3117705735660848, + "grad_norm": 1.0146750526400725, + "learning_rate": 4.0296879991070025e-05, + "loss": 0.9452, + "step": 6251 + }, + { + "epoch": 0.31182044887780547, + "grad_norm": 0.820037795874171, + "learning_rate": 4.029368556036126e-05, + "loss": 0.9193, + "step": 6252 + }, + { + "epoch": 0.3118703241895262, + "grad_norm": 0.514619983616843, + "learning_rate": 4.0290490730570616e-05, + "loss": 0.9344, + "step": 6253 + }, + { + "epoch": 0.3119201995012469, + "grad_norm": 0.905430245859716, + "learning_rate": 4.028729550178147e-05, + "loss": 0.9135, + "step": 6254 + }, + { + "epoch": 0.31197007481296757, + "grad_norm": 0.728453497649336, + "learning_rate": 4.028409987407718e-05, + "loss": 0.906, + "step": 6255 + }, + { + "epoch": 0.3120199501246883, + "grad_norm": 0.606580435884689, + "learning_rate": 4.0280903847541144e-05, + "loss": 0.9505, + "step": 6256 + }, + { + "epoch": 0.312069825436409, + "grad_norm": 0.5702787335118892, + "learning_rate": 4.027770742225676e-05, + "loss": 0.902, + "step": 6257 + }, + { + "epoch": 0.31211970074812967, + "grad_norm": 0.7530959347600369, + "learning_rate": 4.0274510598307444e-05, + "loss": 0.956, + "step": 6258 + }, + { + "epoch": 0.31216957605985035, + "grad_norm": 0.7058080286378806, + "learning_rate": 4.027131337577661e-05, + "loss": 0.9273, + "step": 6259 + }, + { + "epoch": 0.3122194513715711, + "grad_norm": 0.7117167243420186, + "learning_rate": 4.026811575474768e-05, + "loss": 0.9459, + "step": 6260 + }, + { + "epoch": 0.31226932668329177, + "grad_norm": 0.5880889252834578, + "learning_rate": 4.0264917735304105e-05, + "loss": 0.8718, + "step": 6261 + }, + { + "epoch": 0.31231920199501245, + "grad_norm": 0.944968053230748, + "learning_rate": 4.026171931752934e-05, + "loss": 0.9331, + "step": 6262 + }, + { + "epoch": 0.3123690773067332, + "grad_norm": 0.6390474839757412, + "learning_rate": 4.025852050150684e-05, + "loss": 0.9488, + "step": 6263 + }, + { + "epoch": 0.31241895261845387, + "grad_norm": 0.6569440355156737, + "learning_rate": 4.025532128732007e-05, + "loss": 0.8654, + "step": 6264 + }, + { + "epoch": 0.31246882793017455, + "grad_norm": 1.4848131993455713, + "learning_rate": 4.025212167505252e-05, + "loss": 0.9221, + "step": 6265 + }, + { + "epoch": 0.31251870324189523, + "grad_norm": 0.557776013183083, + "learning_rate": 4.0248921664787674e-05, + "loss": 0.8826, + "step": 6266 + }, + { + "epoch": 0.31256857855361597, + "grad_norm": 1.2273719256216444, + "learning_rate": 4.024572125660906e-05, + "loss": 0.9128, + "step": 6267 + }, + { + "epoch": 0.31261845386533665, + "grad_norm": 0.9838931030009909, + "learning_rate": 4.024252045060015e-05, + "loss": 0.9829, + "step": 6268 + }, + { + "epoch": 0.31266832917705734, + "grad_norm": 0.8774951436825135, + "learning_rate": 4.023931924684451e-05, + "loss": 0.9294, + "step": 6269 + }, + { + "epoch": 0.3127182044887781, + "grad_norm": 0.959020733069667, + "learning_rate": 4.023611764542564e-05, + "loss": 0.9209, + "step": 6270 + }, + { + "epoch": 0.31276807980049876, + "grad_norm": 0.8417207623751345, + "learning_rate": 4.023291564642711e-05, + "loss": 0.9061, + "step": 6271 + }, + { + "epoch": 0.31281795511221944, + "grad_norm": 0.5723101277696323, + "learning_rate": 4.022971324993247e-05, + "loss": 0.9284, + "step": 6272 + }, + { + "epoch": 0.3128678304239402, + "grad_norm": 0.7449482940531265, + "learning_rate": 4.022651045602526e-05, + "loss": 0.9462, + "step": 6273 + }, + { + "epoch": 0.31291770573566086, + "grad_norm": 0.48546075943894657, + "learning_rate": 4.022330726478908e-05, + "loss": 0.9348, + "step": 6274 + }, + { + "epoch": 0.31296758104738154, + "grad_norm": 0.6556985997084634, + "learning_rate": 4.0220103676307514e-05, + "loss": 0.9551, + "step": 6275 + }, + { + "epoch": 0.3130174563591022, + "grad_norm": 0.8220842925713843, + "learning_rate": 4.0216899690664155e-05, + "loss": 0.9513, + "step": 6276 + }, + { + "epoch": 0.31306733167082296, + "grad_norm": 0.49135117062038125, + "learning_rate": 4.02136953079426e-05, + "loss": 0.9284, + "step": 6277 + }, + { + "epoch": 0.31311720698254364, + "grad_norm": 0.6181672936076232, + "learning_rate": 4.021049052822649e-05, + "loss": 0.9434, + "step": 6278 + }, + { + "epoch": 0.3131670822942643, + "grad_norm": 1.1148072770571338, + "learning_rate": 4.020728535159942e-05, + "loss": 0.9443, + "step": 6279 + }, + { + "epoch": 0.31321695760598506, + "grad_norm": 0.5620535443537703, + "learning_rate": 4.020407977814505e-05, + "loss": 0.9318, + "step": 6280 + }, + { + "epoch": 0.31326683291770574, + "grad_norm": 0.7316241714058418, + "learning_rate": 4.020087380794703e-05, + "loss": 0.9196, + "step": 6281 + }, + { + "epoch": 0.3133167082294264, + "grad_norm": 0.5408485661935923, + "learning_rate": 4.0197667441089005e-05, + "loss": 0.934, + "step": 6282 + }, + { + "epoch": 0.31336658354114716, + "grad_norm": 0.714928301775922, + "learning_rate": 4.019446067765464e-05, + "loss": 0.9142, + "step": 6283 + }, + { + "epoch": 0.31341645885286784, + "grad_norm": 0.6760462778619518, + "learning_rate": 4.0191253517727646e-05, + "loss": 0.9598, + "step": 6284 + }, + { + "epoch": 0.3134663341645885, + "grad_norm": 1.522014805170653, + "learning_rate": 4.018804596139168e-05, + "loss": 0.9519, + "step": 6285 + }, + { + "epoch": 0.3135162094763092, + "grad_norm": 0.8999806254858244, + "learning_rate": 4.018483800873045e-05, + "loss": 0.883, + "step": 6286 + }, + { + "epoch": 0.31356608478802994, + "grad_norm": 0.49794666590305353, + "learning_rate": 4.018162965982767e-05, + "loss": 0.8811, + "step": 6287 + }, + { + "epoch": 0.3136159600997506, + "grad_norm": 0.6487082375534176, + "learning_rate": 4.017842091476705e-05, + "loss": 0.944, + "step": 6288 + }, + { + "epoch": 0.3136658354114713, + "grad_norm": 0.5853574136649001, + "learning_rate": 4.0175211773632336e-05, + "loss": 0.9372, + "step": 6289 + }, + { + "epoch": 0.31371571072319204, + "grad_norm": 0.5892560218191555, + "learning_rate": 4.0172002236507265e-05, + "loss": 0.967, + "step": 6290 + }, + { + "epoch": 0.3137655860349127, + "grad_norm": 0.5181795427640126, + "learning_rate": 4.016879230347558e-05, + "loss": 0.9555, + "step": 6291 + }, + { + "epoch": 0.3138154613466334, + "grad_norm": 0.5751000311418507, + "learning_rate": 4.0165581974621054e-05, + "loss": 0.893, + "step": 6292 + }, + { + "epoch": 0.3138653366583541, + "grad_norm": 0.567498930888003, + "learning_rate": 4.016237125002745e-05, + "loss": 0.9322, + "step": 6293 + }, + { + "epoch": 0.3139152119700748, + "grad_norm": 0.7424750282912145, + "learning_rate": 4.015916012977856e-05, + "loss": 0.9146, + "step": 6294 + }, + { + "epoch": 0.3139650872817955, + "grad_norm": 1.5116197590875105, + "learning_rate": 4.015594861395816e-05, + "loss": 0.8878, + "step": 6295 + }, + { + "epoch": 0.3140149625935162, + "grad_norm": 1.2419022631687726, + "learning_rate": 4.0152736702650074e-05, + "loss": 0.9054, + "step": 6296 + }, + { + "epoch": 0.3140648379052369, + "grad_norm": 0.4906391445875428, + "learning_rate": 4.0149524395938095e-05, + "loss": 0.8991, + "step": 6297 + }, + { + "epoch": 0.3141147132169576, + "grad_norm": 0.8842693764251548, + "learning_rate": 4.014631169390607e-05, + "loss": 0.9315, + "step": 6298 + }, + { + "epoch": 0.3141645885286783, + "grad_norm": 0.5415234574652839, + "learning_rate": 4.0143098596637804e-05, + "loss": 0.948, + "step": 6299 + }, + { + "epoch": 0.314214463840399, + "grad_norm": 0.6365999930887236, + "learning_rate": 4.0139885104217176e-05, + "loss": 0.9305, + "step": 6300 + }, + { + "epoch": 0.3142643391521197, + "grad_norm": 0.5876491329053036, + "learning_rate": 4.013667121672801e-05, + "loss": 0.9281, + "step": 6301 + }, + { + "epoch": 0.3143142144638404, + "grad_norm": 0.5480422184958479, + "learning_rate": 4.013345693425419e-05, + "loss": 0.9019, + "step": 6302 + }, + { + "epoch": 0.3143640897755611, + "grad_norm": 0.8658938046796864, + "learning_rate": 4.013024225687957e-05, + "loss": 0.9326, + "step": 6303 + }, + { + "epoch": 0.3144139650872818, + "grad_norm": 0.5395904783914575, + "learning_rate": 4.0127027184688064e-05, + "loss": 0.9248, + "step": 6304 + }, + { + "epoch": 0.3144638403990025, + "grad_norm": 0.7838664044322806, + "learning_rate": 4.012381171776355e-05, + "loss": 0.9045, + "step": 6305 + }, + { + "epoch": 0.3145137157107232, + "grad_norm": 0.8334373821337946, + "learning_rate": 4.012059585618993e-05, + "loss": 0.9598, + "step": 6306 + }, + { + "epoch": 0.3145635910224439, + "grad_norm": 0.5184591046073241, + "learning_rate": 4.011737960005114e-05, + "loss": 0.9414, + "step": 6307 + }, + { + "epoch": 0.3146134663341646, + "grad_norm": 0.7443388090596443, + "learning_rate": 4.011416294943109e-05, + "loss": 0.9149, + "step": 6308 + }, + { + "epoch": 0.3146633416458853, + "grad_norm": 0.5505683970547605, + "learning_rate": 4.011094590441372e-05, + "loss": 0.9566, + "step": 6309 + }, + { + "epoch": 0.314713216957606, + "grad_norm": 0.585111431879072, + "learning_rate": 4.010772846508298e-05, + "loss": 0.9164, + "step": 6310 + }, + { + "epoch": 0.3147630922693267, + "grad_norm": 0.6488285181878118, + "learning_rate": 4.010451063152283e-05, + "loss": 0.9029, + "step": 6311 + }, + { + "epoch": 0.3148129675810474, + "grad_norm": 0.6045063135690343, + "learning_rate": 4.010129240381724e-05, + "loss": 0.9751, + "step": 6312 + }, + { + "epoch": 0.31486284289276806, + "grad_norm": 0.5480148211990415, + "learning_rate": 4.009807378205017e-05, + "loss": 0.9156, + "step": 6313 + }, + { + "epoch": 0.3149127182044888, + "grad_norm": 0.4929551842571157, + "learning_rate": 4.009485476630562e-05, + "loss": 0.942, + "step": 6314 + }, + { + "epoch": 0.3149625935162095, + "grad_norm": 0.6444952561794356, + "learning_rate": 4.009163535666761e-05, + "loss": 0.9315, + "step": 6315 + }, + { + "epoch": 0.31501246882793016, + "grad_norm": 0.5261910578537272, + "learning_rate": 4.0088415553220116e-05, + "loss": 0.877, + "step": 6316 + }, + { + "epoch": 0.3150623441396509, + "grad_norm": 0.5386862779083826, + "learning_rate": 4.008519535604717e-05, + "loss": 0.9435, + "step": 6317 + }, + { + "epoch": 0.3151122194513716, + "grad_norm": 0.5436828514587433, + "learning_rate": 4.00819747652328e-05, + "loss": 0.9473, + "step": 6318 + }, + { + "epoch": 0.31516209476309226, + "grad_norm": 0.7227574661399344, + "learning_rate": 4.007875378086106e-05, + "loss": 0.9147, + "step": 6319 + }, + { + "epoch": 0.31521197007481294, + "grad_norm": 0.4279090177618827, + "learning_rate": 4.007553240301598e-05, + "loss": 0.968, + "step": 6320 + }, + { + "epoch": 0.3152618453865337, + "grad_norm": 0.5219168499832554, + "learning_rate": 4.007231063178163e-05, + "loss": 0.9268, + "step": 6321 + }, + { + "epoch": 0.31531172069825436, + "grad_norm": 0.8573320710049571, + "learning_rate": 4.006908846724208e-05, + "loss": 0.8856, + "step": 6322 + }, + { + "epoch": 0.31536159600997504, + "grad_norm": 0.8133206714543121, + "learning_rate": 4.0065865909481417e-05, + "loss": 0.8873, + "step": 6323 + }, + { + "epoch": 0.3154114713216958, + "grad_norm": 0.6208071058253627, + "learning_rate": 4.006264295858372e-05, + "loss": 0.9452, + "step": 6324 + }, + { + "epoch": 0.31546134663341646, + "grad_norm": 0.624508327256139, + "learning_rate": 4.005941961463309e-05, + "loss": 0.9252, + "step": 6325 + }, + { + "epoch": 0.31551122194513714, + "grad_norm": 0.6713844103285818, + "learning_rate": 4.005619587771365e-05, + "loss": 0.8884, + "step": 6326 + }, + { + "epoch": 0.3155610972568579, + "grad_norm": 0.6164398559460214, + "learning_rate": 4.005297174790952e-05, + "loss": 0.9165, + "step": 6327 + }, + { + "epoch": 0.31561097256857856, + "grad_norm": 0.8013258083520205, + "learning_rate": 4.004974722530482e-05, + "loss": 0.9389, + "step": 6328 + }, + { + "epoch": 0.31566084788029924, + "grad_norm": 0.5526618022726177, + "learning_rate": 4.004652230998371e-05, + "loss": 0.9052, + "step": 6329 + }, + { + "epoch": 0.3157107231920199, + "grad_norm": 0.5847475783225144, + "learning_rate": 4.004329700203033e-05, + "loss": 0.9548, + "step": 6330 + }, + { + "epoch": 0.31576059850374066, + "grad_norm": 2.4840758270219054, + "learning_rate": 4.004007130152886e-05, + "loss": 0.9325, + "step": 6331 + }, + { + "epoch": 0.31581047381546135, + "grad_norm": 0.9459552216597215, + "learning_rate": 4.0036845208563444e-05, + "loss": 0.943, + "step": 6332 + }, + { + "epoch": 0.315860349127182, + "grad_norm": 0.4163688074294696, + "learning_rate": 4.003361872321829e-05, + "loss": 0.8706, + "step": 6333 + }, + { + "epoch": 0.31591022443890276, + "grad_norm": 0.5317307022206756, + "learning_rate": 4.003039184557758e-05, + "loss": 0.9017, + "step": 6334 + }, + { + "epoch": 0.31596009975062345, + "grad_norm": 0.6905910946426999, + "learning_rate": 4.0027164575725526e-05, + "loss": 0.9163, + "step": 6335 + }, + { + "epoch": 0.31600997506234413, + "grad_norm": 0.7973778363085089, + "learning_rate": 4.002393691374633e-05, + "loss": 0.8818, + "step": 6336 + }, + { + "epoch": 0.3160598503740648, + "grad_norm": 0.6102582644194215, + "learning_rate": 4.0020708859724235e-05, + "loss": 0.938, + "step": 6337 + }, + { + "epoch": 0.31610972568578555, + "grad_norm": 0.8675860197476313, + "learning_rate": 4.001748041374345e-05, + "loss": 0.9673, + "step": 6338 + }, + { + "epoch": 0.31615960099750623, + "grad_norm": 0.6124951683367664, + "learning_rate": 4.001425157588825e-05, + "loss": 0.9186, + "step": 6339 + }, + { + "epoch": 0.3162094763092269, + "grad_norm": 0.5361588630936425, + "learning_rate": 4.001102234624286e-05, + "loss": 0.9034, + "step": 6340 + }, + { + "epoch": 0.31625935162094765, + "grad_norm": 0.5923249443083772, + "learning_rate": 4.000779272489157e-05, + "loss": 0.9024, + "step": 6341 + }, + { + "epoch": 0.31630922693266833, + "grad_norm": 0.5583215257115076, + "learning_rate": 4.000456271191864e-05, + "loss": 0.9578, + "step": 6342 + }, + { + "epoch": 0.316359102244389, + "grad_norm": 0.8301606179346188, + "learning_rate": 4.000133230740837e-05, + "loss": 0.9283, + "step": 6343 + }, + { + "epoch": 0.31640897755610975, + "grad_norm": 0.9083914948049557, + "learning_rate": 3.999810151144504e-05, + "loss": 0.9503, + "step": 6344 + }, + { + "epoch": 0.31645885286783043, + "grad_norm": 0.665143061832062, + "learning_rate": 3.9994870324112966e-05, + "loss": 0.9105, + "step": 6345 + }, + { + "epoch": 0.3165087281795511, + "grad_norm": 0.6112706284548822, + "learning_rate": 3.999163874549646e-05, + "loss": 0.9328, + "step": 6346 + }, + { + "epoch": 0.3165586034912718, + "grad_norm": 0.5955842239218812, + "learning_rate": 3.998840677567985e-05, + "loss": 0.928, + "step": 6347 + }, + { + "epoch": 0.31660847880299253, + "grad_norm": 0.5889514365739261, + "learning_rate": 3.9985174414747474e-05, + "loss": 0.9484, + "step": 6348 + }, + { + "epoch": 0.3166583541147132, + "grad_norm": 0.6226286398759308, + "learning_rate": 3.9981941662783674e-05, + "loss": 0.9541, + "step": 6349 + }, + { + "epoch": 0.3167082294264339, + "grad_norm": 0.7121356686081698, + "learning_rate": 3.997870851987282e-05, + "loss": 0.8891, + "step": 6350 + }, + { + "epoch": 0.31675810473815463, + "grad_norm": 0.8355251811425561, + "learning_rate": 3.9975474986099256e-05, + "loss": 0.9061, + "step": 6351 + }, + { + "epoch": 0.3168079800498753, + "grad_norm": 0.5439382710459435, + "learning_rate": 3.9972241061547395e-05, + "loss": 0.8911, + "step": 6352 + }, + { + "epoch": 0.316857855361596, + "grad_norm": 0.6359981018780487, + "learning_rate": 3.996900674630159e-05, + "loss": 0.9106, + "step": 6353 + }, + { + "epoch": 0.31690773067331673, + "grad_norm": 0.5176836443809679, + "learning_rate": 3.9965772040446256e-05, + "loss": 0.9093, + "step": 6354 + }, + { + "epoch": 0.3169576059850374, + "grad_norm": 0.62330394368304, + "learning_rate": 3.99625369440658e-05, + "loss": 0.9492, + "step": 6355 + }, + { + "epoch": 0.3170074812967581, + "grad_norm": 0.6062536330081094, + "learning_rate": 3.995930145724463e-05, + "loss": 0.9027, + "step": 6356 + }, + { + "epoch": 0.3170573566084788, + "grad_norm": 0.5279004315140308, + "learning_rate": 3.99560655800672e-05, + "loss": 0.9309, + "step": 6357 + }, + { + "epoch": 0.3171072319201995, + "grad_norm": 0.550536319568698, + "learning_rate": 3.995282931261792e-05, + "loss": 0.8679, + "step": 6358 + }, + { + "epoch": 0.3171571072319202, + "grad_norm": 0.6450262341725914, + "learning_rate": 3.994959265498126e-05, + "loss": 0.9219, + "step": 6359 + }, + { + "epoch": 0.3172069825436409, + "grad_norm": 0.5741584774613995, + "learning_rate": 3.994635560724166e-05, + "loss": 0.9296, + "step": 6360 + }, + { + "epoch": 0.3172568578553616, + "grad_norm": 0.5999684263006821, + "learning_rate": 3.9943118169483605e-05, + "loss": 0.9406, + "step": 6361 + }, + { + "epoch": 0.3173067331670823, + "grad_norm": 0.5916384575979806, + "learning_rate": 3.993988034179157e-05, + "loss": 0.9119, + "step": 6362 + }, + { + "epoch": 0.317356608478803, + "grad_norm": 0.529883162039827, + "learning_rate": 3.9936642124250045e-05, + "loss": 0.8717, + "step": 6363 + }, + { + "epoch": 0.31740648379052366, + "grad_norm": 0.9524676068560045, + "learning_rate": 3.9933403516943524e-05, + "loss": 0.9159, + "step": 6364 + }, + { + "epoch": 0.3174563591022444, + "grad_norm": 0.8205339968560458, + "learning_rate": 3.993016451995653e-05, + "loss": 0.9344, + "step": 6365 + }, + { + "epoch": 0.3175062344139651, + "grad_norm": 1.2207727354099653, + "learning_rate": 3.992692513337356e-05, + "loss": 0.9249, + "step": 6366 + }, + { + "epoch": 0.31755610972568576, + "grad_norm": 0.5442703557298224, + "learning_rate": 3.9923685357279166e-05, + "loss": 0.9077, + "step": 6367 + }, + { + "epoch": 0.3176059850374065, + "grad_norm": 0.8304222953180667, + "learning_rate": 3.992044519175788e-05, + "loss": 0.9477, + "step": 6368 + }, + { + "epoch": 0.3176558603491272, + "grad_norm": 0.8663037483062298, + "learning_rate": 3.991720463689426e-05, + "loss": 0.9446, + "step": 6369 + }, + { + "epoch": 0.31770573566084787, + "grad_norm": 0.8371568805517642, + "learning_rate": 3.991396369277286e-05, + "loss": 0.9004, + "step": 6370 + }, + { + "epoch": 0.3177556109725686, + "grad_norm": 0.9695688355623802, + "learning_rate": 3.9910722359478246e-05, + "loss": 0.9196, + "step": 6371 + }, + { + "epoch": 0.3178054862842893, + "grad_norm": 0.6042052361253245, + "learning_rate": 3.9907480637095014e-05, + "loss": 0.9052, + "step": 6372 + }, + { + "epoch": 0.31785536159600997, + "grad_norm": 0.682760334601552, + "learning_rate": 3.990423852570774e-05, + "loss": 0.9146, + "step": 6373 + }, + { + "epoch": 0.31790523690773065, + "grad_norm": 0.6311146294852591, + "learning_rate": 3.990099602540103e-05, + "loss": 0.9227, + "step": 6374 + }, + { + "epoch": 0.3179551122194514, + "grad_norm": 0.9782329372140667, + "learning_rate": 3.98977531362595e-05, + "loss": 0.9149, + "step": 6375 + }, + { + "epoch": 0.31800498753117207, + "grad_norm": 0.5326357250841821, + "learning_rate": 3.989450985836777e-05, + "loss": 0.9694, + "step": 6376 + }, + { + "epoch": 0.31805486284289275, + "grad_norm": 0.5172995393547799, + "learning_rate": 3.989126619181047e-05, + "loss": 0.8986, + "step": 6377 + }, + { + "epoch": 0.3181047381546135, + "grad_norm": 0.5324147634964252, + "learning_rate": 3.988802213667225e-05, + "loss": 0.8997, + "step": 6378 + }, + { + "epoch": 0.31815461346633417, + "grad_norm": 0.921474517162547, + "learning_rate": 3.9884777693037755e-05, + "loss": 0.8672, + "step": 6379 + }, + { + "epoch": 0.31820448877805485, + "grad_norm": 0.8857896316920195, + "learning_rate": 3.988153286099164e-05, + "loss": 0.8821, + "step": 6380 + }, + { + "epoch": 0.3182543640897756, + "grad_norm": 0.6151395384246406, + "learning_rate": 3.9878287640618586e-05, + "loss": 0.9394, + "step": 6381 + }, + { + "epoch": 0.31830423940149627, + "grad_norm": 0.5712663433615267, + "learning_rate": 3.987504203200327e-05, + "loss": 0.9036, + "step": 6382 + }, + { + "epoch": 0.31835411471321695, + "grad_norm": 0.4171131734081916, + "learning_rate": 3.98717960352304e-05, + "loss": 0.9668, + "step": 6383 + }, + { + "epoch": 0.31840399002493763, + "grad_norm": 0.6794624472812084, + "learning_rate": 3.986854965038467e-05, + "loss": 0.9369, + "step": 6384 + }, + { + "epoch": 0.31845386533665837, + "grad_norm": 0.6756862348697072, + "learning_rate": 3.9865302877550783e-05, + "loss": 0.885, + "step": 6385 + }, + { + "epoch": 0.31850374064837905, + "grad_norm": 0.5197995122820823, + "learning_rate": 3.986205571681347e-05, + "loss": 0.8922, + "step": 6386 + }, + { + "epoch": 0.31855361596009973, + "grad_norm": 0.5257220823799352, + "learning_rate": 3.9858808168257464e-05, + "loss": 0.8942, + "step": 6387 + }, + { + "epoch": 0.31860349127182047, + "grad_norm": 0.8457521684352342, + "learning_rate": 3.985556023196752e-05, + "loss": 0.9061, + "step": 6388 + }, + { + "epoch": 0.31865336658354115, + "grad_norm": 0.49864340616324765, + "learning_rate": 3.9852311908028374e-05, + "loss": 0.8701, + "step": 6389 + }, + { + "epoch": 0.31870324189526184, + "grad_norm": 0.47257204897678096, + "learning_rate": 3.9849063196524786e-05, + "loss": 0.919, + "step": 6390 + }, + { + "epoch": 0.3187531172069825, + "grad_norm": 0.5129114702372484, + "learning_rate": 3.9845814097541545e-05, + "loss": 0.925, + "step": 6391 + }, + { + "epoch": 0.31880299251870325, + "grad_norm": 0.8307131335977767, + "learning_rate": 3.9842564611163435e-05, + "loss": 0.8942, + "step": 6392 + }, + { + "epoch": 0.31885286783042394, + "grad_norm": 0.4542694369711236, + "learning_rate": 3.983931473747524e-05, + "loss": 0.9231, + "step": 6393 + }, + { + "epoch": 0.3189027431421446, + "grad_norm": 0.6583460281520609, + "learning_rate": 3.983606447656177e-05, + "loss": 0.9388, + "step": 6394 + }, + { + "epoch": 0.31895261845386536, + "grad_norm": 0.7320763355471462, + "learning_rate": 3.983281382850783e-05, + "loss": 0.9345, + "step": 6395 + }, + { + "epoch": 0.31900249376558604, + "grad_norm": 0.6981719104167938, + "learning_rate": 3.9829562793398264e-05, + "loss": 0.9437, + "step": 6396 + }, + { + "epoch": 0.3190523690773067, + "grad_norm": 1.06770987473396, + "learning_rate": 3.9826311371317884e-05, + "loss": 0.9573, + "step": 6397 + }, + { + "epoch": 0.31910224438902746, + "grad_norm": 0.5121568465364145, + "learning_rate": 3.982305956235155e-05, + "loss": 0.912, + "step": 6398 + }, + { + "epoch": 0.31915211970074814, + "grad_norm": 0.6241679077554728, + "learning_rate": 3.98198073665841e-05, + "loss": 0.9451, + "step": 6399 + }, + { + "epoch": 0.3192019950124688, + "grad_norm": 0.4958979762385495, + "learning_rate": 3.981655478410043e-05, + "loss": 0.8745, + "step": 6400 + }, + { + "epoch": 0.3192518703241895, + "grad_norm": 0.7423712077252544, + "learning_rate": 3.981330181498537e-05, + "loss": 0.9602, + "step": 6401 + }, + { + "epoch": 0.31930174563591024, + "grad_norm": 0.5478262268040466, + "learning_rate": 3.981004845932385e-05, + "loss": 0.941, + "step": 6402 + }, + { + "epoch": 0.3193516209476309, + "grad_norm": 1.1948546579239652, + "learning_rate": 3.980679471720073e-05, + "loss": 0.9148, + "step": 6403 + }, + { + "epoch": 0.3194014962593516, + "grad_norm": 0.5181992782167634, + "learning_rate": 3.9803540588700924e-05, + "loss": 0.9321, + "step": 6404 + }, + { + "epoch": 0.31945137157107234, + "grad_norm": 0.694925978908787, + "learning_rate": 3.9800286073909365e-05, + "loss": 0.9018, + "step": 6405 + }, + { + "epoch": 0.319501246882793, + "grad_norm": 0.527481103785679, + "learning_rate": 3.979703117291096e-05, + "loss": 0.8843, + "step": 6406 + }, + { + "epoch": 0.3195511221945137, + "grad_norm": 0.7953461882392939, + "learning_rate": 3.979377588579065e-05, + "loss": 0.9704, + "step": 6407 + }, + { + "epoch": 0.31960099750623444, + "grad_norm": 0.7921612352944123, + "learning_rate": 3.979052021263338e-05, + "loss": 0.9033, + "step": 6408 + }, + { + "epoch": 0.3196508728179551, + "grad_norm": 0.7888725395583168, + "learning_rate": 3.97872641535241e-05, + "loss": 0.9015, + "step": 6409 + }, + { + "epoch": 0.3197007481296758, + "grad_norm": 0.5706865611361853, + "learning_rate": 3.978400770854778e-05, + "loss": 0.9309, + "step": 6410 + }, + { + "epoch": 0.3197506234413965, + "grad_norm": 0.6618655053150309, + "learning_rate": 3.97807508777894e-05, + "loss": 0.9977, + "step": 6411 + }, + { + "epoch": 0.3198004987531172, + "grad_norm": 0.6748495178331366, + "learning_rate": 3.9777493661333934e-05, + "loss": 0.9321, + "step": 6412 + }, + { + "epoch": 0.3198503740648379, + "grad_norm": 0.5903686461797343, + "learning_rate": 3.977423605926638e-05, + "loss": 0.9038, + "step": 6413 + }, + { + "epoch": 0.3199002493765586, + "grad_norm": 0.5208736562377523, + "learning_rate": 3.977097807167176e-05, + "loss": 0.9507, + "step": 6414 + }, + { + "epoch": 0.3199501246882793, + "grad_norm": 0.7046134198547084, + "learning_rate": 3.976771969863507e-05, + "loss": 0.9734, + "step": 6415 + }, + { + "epoch": 0.32, + "grad_norm": 0.5565403127251405, + "learning_rate": 3.976446094024135e-05, + "loss": 0.9581, + "step": 6416 + }, + { + "epoch": 0.3200498753117207, + "grad_norm": 0.583227355639153, + "learning_rate": 3.976120179657563e-05, + "loss": 0.9104, + "step": 6417 + }, + { + "epoch": 0.32009975062344137, + "grad_norm": 0.5725282831261381, + "learning_rate": 3.975794226772295e-05, + "loss": 0.9029, + "step": 6418 + }, + { + "epoch": 0.3201496259351621, + "grad_norm": 0.6575399991046881, + "learning_rate": 3.975468235376838e-05, + "loss": 0.9301, + "step": 6419 + }, + { + "epoch": 0.3201995012468828, + "grad_norm": 0.6798156722274581, + "learning_rate": 3.975142205479697e-05, + "loss": 0.9005, + "step": 6420 + }, + { + "epoch": 0.32024937655860347, + "grad_norm": 2.295494629256834, + "learning_rate": 3.97481613708938e-05, + "loss": 0.9088, + "step": 6421 + }, + { + "epoch": 0.3202992518703242, + "grad_norm": 0.7863139699757687, + "learning_rate": 3.9744900302143963e-05, + "loss": 0.8909, + "step": 6422 + }, + { + "epoch": 0.3203491271820449, + "grad_norm": 0.4810816487392829, + "learning_rate": 3.974163884863255e-05, + "loss": 0.9142, + "step": 6423 + }, + { + "epoch": 0.3203990024937656, + "grad_norm": 0.4636829923819985, + "learning_rate": 3.973837701044467e-05, + "loss": 0.9065, + "step": 6424 + }, + { + "epoch": 0.3204488778054863, + "grad_norm": 0.479687688862479, + "learning_rate": 3.973511478766544e-05, + "loss": 0.9177, + "step": 6425 + }, + { + "epoch": 0.320498753117207, + "grad_norm": 0.7030083609102779, + "learning_rate": 3.973185218037998e-05, + "loss": 0.9393, + "step": 6426 + }, + { + "epoch": 0.3205486284289277, + "grad_norm": 0.5735405336280033, + "learning_rate": 3.972858918867344e-05, + "loss": 0.9202, + "step": 6427 + }, + { + "epoch": 0.32059850374064836, + "grad_norm": 0.5219931529389191, + "learning_rate": 3.9725325812630954e-05, + "loss": 0.902, + "step": 6428 + }, + { + "epoch": 0.3206483790523691, + "grad_norm": 0.8996009984669721, + "learning_rate": 3.972206205233767e-05, + "loss": 0.8662, + "step": 6429 + }, + { + "epoch": 0.3206982543640898, + "grad_norm": 0.4980372045846335, + "learning_rate": 3.9718797907878775e-05, + "loss": 0.9362, + "step": 6430 + }, + { + "epoch": 0.32074812967581046, + "grad_norm": 0.6111505016338993, + "learning_rate": 3.971553337933943e-05, + "loss": 0.9066, + "step": 6431 + }, + { + "epoch": 0.3207980049875312, + "grad_norm": 0.7953072933621708, + "learning_rate": 3.971226846680484e-05, + "loss": 0.9101, + "step": 6432 + }, + { + "epoch": 0.3208478802992519, + "grad_norm": 0.7401623548926688, + "learning_rate": 3.9709003170360176e-05, + "loss": 0.8642, + "step": 6433 + }, + { + "epoch": 0.32089775561097256, + "grad_norm": 0.532849866710904, + "learning_rate": 3.970573749009066e-05, + "loss": 0.9232, + "step": 6434 + }, + { + "epoch": 0.3209476309226933, + "grad_norm": 0.6822756121070542, + "learning_rate": 3.970247142608151e-05, + "loss": 0.8704, + "step": 6435 + }, + { + "epoch": 0.320997506234414, + "grad_norm": 0.5333111848973989, + "learning_rate": 3.9699204978417934e-05, + "loss": 0.92, + "step": 6436 + }, + { + "epoch": 0.32104738154613466, + "grad_norm": 0.4319116370850239, + "learning_rate": 3.96959381471852e-05, + "loss": 0.8699, + "step": 6437 + }, + { + "epoch": 0.32109725685785534, + "grad_norm": 0.5831696981422858, + "learning_rate": 3.969267093246854e-05, + "loss": 0.8955, + "step": 6438 + }, + { + "epoch": 0.3211471321695761, + "grad_norm": 0.4644993807521269, + "learning_rate": 3.968940333435319e-05, + "loss": 0.9073, + "step": 6439 + }, + { + "epoch": 0.32119700748129676, + "grad_norm": 0.5134478644734404, + "learning_rate": 3.9686135352924445e-05, + "loss": 0.9158, + "step": 6440 + }, + { + "epoch": 0.32124688279301744, + "grad_norm": 0.5856859177320028, + "learning_rate": 3.9682866988267566e-05, + "loss": 0.9407, + "step": 6441 + }, + { + "epoch": 0.3212967581047382, + "grad_norm": 0.8826085017037649, + "learning_rate": 3.967959824046785e-05, + "loss": 0.9227, + "step": 6442 + }, + { + "epoch": 0.32134663341645886, + "grad_norm": 0.5602701017472149, + "learning_rate": 3.9676329109610574e-05, + "loss": 0.8941, + "step": 6443 + }, + { + "epoch": 0.32139650872817954, + "grad_norm": 0.5774950936063017, + "learning_rate": 3.9673059595781073e-05, + "loss": 0.9399, + "step": 6444 + }, + { + "epoch": 0.3214463840399002, + "grad_norm": 0.6891004957751666, + "learning_rate": 3.966978969906464e-05, + "loss": 0.9415, + "step": 6445 + }, + { + "epoch": 0.32149625935162096, + "grad_norm": 0.5504123148073911, + "learning_rate": 3.9666519419546616e-05, + "loss": 0.9256, + "step": 6446 + }, + { + "epoch": 0.32154613466334164, + "grad_norm": 0.4855396267778736, + "learning_rate": 3.9663248757312325e-05, + "loss": 0.9344, + "step": 6447 + }, + { + "epoch": 0.3215960099750623, + "grad_norm": 0.561566463171975, + "learning_rate": 3.965997771244713e-05, + "loss": 0.9088, + "step": 6448 + }, + { + "epoch": 0.32164588528678306, + "grad_norm": 0.5795831979136017, + "learning_rate": 3.965670628503637e-05, + "loss": 0.9021, + "step": 6449 + }, + { + "epoch": 0.32169576059850374, + "grad_norm": 1.6746025559506663, + "learning_rate": 3.965343447516542e-05, + "loss": 0.8983, + "step": 6450 + }, + { + "epoch": 0.3217456359102244, + "grad_norm": 0.7625620324985766, + "learning_rate": 3.9650162282919655e-05, + "loss": 0.9465, + "step": 6451 + }, + { + "epoch": 0.32179551122194516, + "grad_norm": 0.7305932082386076, + "learning_rate": 3.964688970838446e-05, + "loss": 0.939, + "step": 6452 + }, + { + "epoch": 0.32184538653366584, + "grad_norm": 0.5779981893929271, + "learning_rate": 3.964361675164524e-05, + "loss": 0.9448, + "step": 6453 + }, + { + "epoch": 0.3218952618453865, + "grad_norm": 0.6272713894135176, + "learning_rate": 3.964034341278739e-05, + "loss": 0.9509, + "step": 6454 + }, + { + "epoch": 0.3219451371571072, + "grad_norm": 0.6498016304503399, + "learning_rate": 3.963706969189633e-05, + "loss": 0.9063, + "step": 6455 + }, + { + "epoch": 0.32199501246882795, + "grad_norm": 0.5840537216343539, + "learning_rate": 3.963379558905749e-05, + "loss": 0.9443, + "step": 6456 + }, + { + "epoch": 0.3220448877805486, + "grad_norm": 1.1904321319108548, + "learning_rate": 3.96305211043563e-05, + "loss": 0.9273, + "step": 6457 + }, + { + "epoch": 0.3220947630922693, + "grad_norm": 0.86890829908889, + "learning_rate": 3.962724623787821e-05, + "loss": 0.9211, + "step": 6458 + }, + { + "epoch": 0.32214463840399005, + "grad_norm": 0.5154297544933982, + "learning_rate": 3.962397098970868e-05, + "loss": 0.9398, + "step": 6459 + }, + { + "epoch": 0.32219451371571073, + "grad_norm": 0.5915125894831448, + "learning_rate": 3.962069535993317e-05, + "loss": 0.9352, + "step": 6460 + }, + { + "epoch": 0.3222443890274314, + "grad_norm": 0.5008082772841062, + "learning_rate": 3.961741934863715e-05, + "loss": 0.9007, + "step": 6461 + }, + { + "epoch": 0.3222942643391521, + "grad_norm": 0.4732244265974375, + "learning_rate": 3.961414295590613e-05, + "loss": 0.9041, + "step": 6462 + }, + { + "epoch": 0.32234413965087283, + "grad_norm": 0.5924603642362509, + "learning_rate": 3.961086618182558e-05, + "loss": 0.9287, + "step": 6463 + }, + { + "epoch": 0.3223940149625935, + "grad_norm": 0.6526497585662833, + "learning_rate": 3.9607589026481015e-05, + "loss": 0.9516, + "step": 6464 + }, + { + "epoch": 0.3224438902743142, + "grad_norm": 1.7394570077748643, + "learning_rate": 3.9604311489957955e-05, + "loss": 0.9088, + "step": 6465 + }, + { + "epoch": 0.32249376558603493, + "grad_norm": 0.7597410356418212, + "learning_rate": 3.960103357234192e-05, + "loss": 0.9134, + "step": 6466 + }, + { + "epoch": 0.3225436408977556, + "grad_norm": 0.47084465655710167, + "learning_rate": 3.959775527371845e-05, + "loss": 0.9043, + "step": 6467 + }, + { + "epoch": 0.3225935162094763, + "grad_norm": 0.7829887515055748, + "learning_rate": 3.959447659417309e-05, + "loss": 0.9151, + "step": 6468 + }, + { + "epoch": 0.32264339152119703, + "grad_norm": 0.6532981145581153, + "learning_rate": 3.959119753379139e-05, + "loss": 0.9001, + "step": 6469 + }, + { + "epoch": 0.3226932668329177, + "grad_norm": 0.8778120789424945, + "learning_rate": 3.958791809265893e-05, + "loss": 0.9513, + "step": 6470 + }, + { + "epoch": 0.3227431421446384, + "grad_norm": 0.5295222761982646, + "learning_rate": 3.9584638270861266e-05, + "loss": 0.9025, + "step": 6471 + }, + { + "epoch": 0.3227930174563591, + "grad_norm": 0.4845854668527379, + "learning_rate": 3.9581358068483995e-05, + "loss": 0.9621, + "step": 6472 + }, + { + "epoch": 0.3228428927680798, + "grad_norm": 1.4316963798357476, + "learning_rate": 3.957807748561272e-05, + "loss": 0.865, + "step": 6473 + }, + { + "epoch": 0.3228927680798005, + "grad_norm": 0.6250325049665197, + "learning_rate": 3.957479652233303e-05, + "loss": 0.9662, + "step": 6474 + }, + { + "epoch": 0.3229426433915212, + "grad_norm": 0.43651224318803894, + "learning_rate": 3.957151517873055e-05, + "loss": 0.9469, + "step": 6475 + }, + { + "epoch": 0.3229925187032419, + "grad_norm": 0.5142957213724957, + "learning_rate": 3.95682334548909e-05, + "loss": 0.9685, + "step": 6476 + }, + { + "epoch": 0.3230423940149626, + "grad_norm": 0.44150418936011077, + "learning_rate": 3.956495135089972e-05, + "loss": 0.92, + "step": 6477 + }, + { + "epoch": 0.3230922693266833, + "grad_norm": 0.5195050678217192, + "learning_rate": 3.956166886684265e-05, + "loss": 0.9156, + "step": 6478 + }, + { + "epoch": 0.323142144638404, + "grad_norm": 0.6132715440185517, + "learning_rate": 3.955838600280535e-05, + "loss": 0.8786, + "step": 6479 + }, + { + "epoch": 0.3231920199501247, + "grad_norm": 0.4844090110480178, + "learning_rate": 3.955510275887348e-05, + "loss": 0.9362, + "step": 6480 + }, + { + "epoch": 0.3232418952618454, + "grad_norm": 0.46072572925445765, + "learning_rate": 3.955181913513273e-05, + "loss": 0.9274, + "step": 6481 + }, + { + "epoch": 0.32329177057356606, + "grad_norm": 0.46886034633957796, + "learning_rate": 3.9548535131668756e-05, + "loss": 0.9266, + "step": 6482 + }, + { + "epoch": 0.3233416458852868, + "grad_norm": 0.4504942506388042, + "learning_rate": 3.9545250748567284e-05, + "loss": 0.8954, + "step": 6483 + }, + { + "epoch": 0.3233915211970075, + "grad_norm": 0.5297819738732789, + "learning_rate": 3.954196598591399e-05, + "loss": 0.965, + "step": 6484 + }, + { + "epoch": 0.32344139650872816, + "grad_norm": 0.5809165095149971, + "learning_rate": 3.9538680843794605e-05, + "loss": 0.9521, + "step": 6485 + }, + { + "epoch": 0.3234912718204489, + "grad_norm": 0.4524693155087699, + "learning_rate": 3.953539532229486e-05, + "loss": 0.9358, + "step": 6486 + }, + { + "epoch": 0.3235411471321696, + "grad_norm": 0.5955071412175686, + "learning_rate": 3.9532109421500466e-05, + "loss": 0.9479, + "step": 6487 + }, + { + "epoch": 0.32359102244389026, + "grad_norm": 0.509680704787009, + "learning_rate": 3.952882314149718e-05, + "loss": 0.8916, + "step": 6488 + }, + { + "epoch": 0.32364089775561095, + "grad_norm": 0.4512538605087398, + "learning_rate": 3.952553648237076e-05, + "loss": 0.9142, + "step": 6489 + }, + { + "epoch": 0.3236907730673317, + "grad_norm": 0.4895477381497226, + "learning_rate": 3.9522249444206974e-05, + "loss": 0.9145, + "step": 6490 + }, + { + "epoch": 0.32374064837905236, + "grad_norm": 0.48096552887332467, + "learning_rate": 3.951896202709158e-05, + "loss": 0.8829, + "step": 6491 + }, + { + "epoch": 0.32379052369077305, + "grad_norm": 0.9857711863886687, + "learning_rate": 3.951567423111038e-05, + "loss": 0.8801, + "step": 6492 + }, + { + "epoch": 0.3238403990024938, + "grad_norm": 0.7824981707751525, + "learning_rate": 3.951238605634914e-05, + "loss": 0.9303, + "step": 6493 + }, + { + "epoch": 0.32389027431421447, + "grad_norm": 0.48109221444284095, + "learning_rate": 3.9509097502893696e-05, + "loss": 0.8977, + "step": 6494 + }, + { + "epoch": 0.32394014962593515, + "grad_norm": 0.5118334827039872, + "learning_rate": 3.950580857082984e-05, + "loss": 0.9397, + "step": 6495 + }, + { + "epoch": 0.3239900249376559, + "grad_norm": 1.5229379094076136, + "learning_rate": 3.95025192602434e-05, + "loss": 0.9121, + "step": 6496 + }, + { + "epoch": 0.32403990024937657, + "grad_norm": 0.94582700759407, + "learning_rate": 3.949922957122021e-05, + "loss": 0.9303, + "step": 6497 + }, + { + "epoch": 0.32408977556109725, + "grad_norm": 0.672726480103223, + "learning_rate": 3.949593950384611e-05, + "loss": 0.9288, + "step": 6498 + }, + { + "epoch": 0.32413965087281793, + "grad_norm": 0.44110142558987653, + "learning_rate": 3.949264905820697e-05, + "loss": 0.9187, + "step": 6499 + }, + { + "epoch": 0.32418952618453867, + "grad_norm": 0.7443744226202993, + "learning_rate": 3.9489358234388626e-05, + "loss": 0.9478, + "step": 6500 + }, + { + "epoch": 0.32423940149625935, + "grad_norm": 0.5110865462487204, + "learning_rate": 3.948606703247697e-05, + "loss": 0.9021, + "step": 6501 + }, + { + "epoch": 0.32428927680798003, + "grad_norm": 0.45417067116725507, + "learning_rate": 3.948277545255787e-05, + "loss": 0.8815, + "step": 6502 + }, + { + "epoch": 0.32433915211970077, + "grad_norm": 2.6089746443489044, + "learning_rate": 3.947948349471724e-05, + "loss": 0.9355, + "step": 6503 + }, + { + "epoch": 0.32438902743142145, + "grad_norm": 0.537233643542576, + "learning_rate": 3.9476191159040955e-05, + "loss": 0.918, + "step": 6504 + }, + { + "epoch": 0.32443890274314213, + "grad_norm": 0.5515545622097937, + "learning_rate": 3.947289844561495e-05, + "loss": 0.9173, + "step": 6505 + }, + { + "epoch": 0.32448877805486287, + "grad_norm": 0.6933092587473941, + "learning_rate": 3.9469605354525124e-05, + "loss": 0.8777, + "step": 6506 + }, + { + "epoch": 0.32453865336658355, + "grad_norm": 0.43091384659171, + "learning_rate": 3.946631188585743e-05, + "loss": 0.9262, + "step": 6507 + }, + { + "epoch": 0.32458852867830423, + "grad_norm": 0.5121791018152932, + "learning_rate": 3.9463018039697806e-05, + "loss": 0.9098, + "step": 6508 + }, + { + "epoch": 0.3246384039900249, + "grad_norm": 0.45686137789743586, + "learning_rate": 3.9459723816132197e-05, + "loss": 0.9195, + "step": 6509 + }, + { + "epoch": 0.32468827930174565, + "grad_norm": 0.5975697359703536, + "learning_rate": 3.9456429215246566e-05, + "loss": 0.9176, + "step": 6510 + }, + { + "epoch": 0.32473815461346633, + "grad_norm": 0.6874673455858519, + "learning_rate": 3.945313423712688e-05, + "loss": 0.9326, + "step": 6511 + }, + { + "epoch": 0.324788029925187, + "grad_norm": 0.44592212830083133, + "learning_rate": 3.944983888185913e-05, + "loss": 0.8947, + "step": 6512 + }, + { + "epoch": 0.32483790523690775, + "grad_norm": 0.5465964339639541, + "learning_rate": 3.9446543149529295e-05, + "loss": 0.899, + "step": 6513 + }, + { + "epoch": 0.32488778054862844, + "grad_norm": 0.43625216213038837, + "learning_rate": 3.944324704022339e-05, + "loss": 0.8607, + "step": 6514 + }, + { + "epoch": 0.3249376558603491, + "grad_norm": 0.6703684735780999, + "learning_rate": 3.943995055402741e-05, + "loss": 0.8876, + "step": 6515 + }, + { + "epoch": 0.3249875311720698, + "grad_norm": 0.4527165460647803, + "learning_rate": 3.9436653691027394e-05, + "loss": 0.9173, + "step": 6516 + }, + { + "epoch": 0.32503740648379054, + "grad_norm": 0.4974864251293146, + "learning_rate": 3.9433356451309346e-05, + "loss": 0.9115, + "step": 6517 + }, + { + "epoch": 0.3250872817955112, + "grad_norm": 0.5567407989751669, + "learning_rate": 3.9430058834959325e-05, + "loss": 0.8762, + "step": 6518 + }, + { + "epoch": 0.3251371571072319, + "grad_norm": 0.5043915612032546, + "learning_rate": 3.942676084206338e-05, + "loss": 0.8664, + "step": 6519 + }, + { + "epoch": 0.32518703241895264, + "grad_norm": 0.5037257227135331, + "learning_rate": 3.9423462472707576e-05, + "loss": 0.8633, + "step": 6520 + }, + { + "epoch": 0.3252369077306733, + "grad_norm": 0.7130970987512759, + "learning_rate": 3.942016372697796e-05, + "loss": 0.9616, + "step": 6521 + }, + { + "epoch": 0.325286783042394, + "grad_norm": 0.7490912865668559, + "learning_rate": 3.9416864604960635e-05, + "loss": 0.903, + "step": 6522 + }, + { + "epoch": 0.32533665835411474, + "grad_norm": 0.4339286756908458, + "learning_rate": 3.9413565106741665e-05, + "loss": 0.8902, + "step": 6523 + }, + { + "epoch": 0.3253865336658354, + "grad_norm": 0.38401090345715627, + "learning_rate": 3.941026523240718e-05, + "loss": 0.8717, + "step": 6524 + }, + { + "epoch": 0.3254364089775561, + "grad_norm": 0.4970834284774617, + "learning_rate": 3.940696498204327e-05, + "loss": 0.9444, + "step": 6525 + }, + { + "epoch": 0.3254862842892768, + "grad_norm": 0.6068280268009443, + "learning_rate": 3.940366435573605e-05, + "loss": 0.8875, + "step": 6526 + }, + { + "epoch": 0.3255361596009975, + "grad_norm": 0.5798937888773927, + "learning_rate": 3.940036335357166e-05, + "loss": 0.9245, + "step": 6527 + }, + { + "epoch": 0.3255860349127182, + "grad_norm": 0.6023861839570777, + "learning_rate": 3.939706197563623e-05, + "loss": 0.8751, + "step": 6528 + }, + { + "epoch": 0.3256359102244389, + "grad_norm": 0.746577259439972, + "learning_rate": 3.939376022201592e-05, + "loss": 0.9327, + "step": 6529 + }, + { + "epoch": 0.3256857855361596, + "grad_norm": 0.5145133920569526, + "learning_rate": 3.9390458092796864e-05, + "loss": 0.8773, + "step": 6530 + }, + { + "epoch": 0.3257356608478803, + "grad_norm": 0.4850684497921781, + "learning_rate": 3.938715558806525e-05, + "loss": 0.949, + "step": 6531 + }, + { + "epoch": 0.325785536159601, + "grad_norm": 0.5160155433379856, + "learning_rate": 3.938385270790725e-05, + "loss": 0.9128, + "step": 6532 + }, + { + "epoch": 0.3258354114713217, + "grad_norm": 0.5565288040900166, + "learning_rate": 3.9380549452409066e-05, + "loss": 0.9119, + "step": 6533 + }, + { + "epoch": 0.3258852867830424, + "grad_norm": 0.5554190536678363, + "learning_rate": 3.937724582165686e-05, + "loss": 0.9706, + "step": 6534 + }, + { + "epoch": 0.3259351620947631, + "grad_norm": 0.5364863957424143, + "learning_rate": 3.937394181573687e-05, + "loss": 0.8601, + "step": 6535 + }, + { + "epoch": 0.32598503740648377, + "grad_norm": 0.5487906131721436, + "learning_rate": 3.9370637434735296e-05, + "loss": 0.961, + "step": 6536 + }, + { + "epoch": 0.3260349127182045, + "grad_norm": 0.5136986423544816, + "learning_rate": 3.936733267873837e-05, + "loss": 0.9214, + "step": 6537 + }, + { + "epoch": 0.3260847880299252, + "grad_norm": 1.2915044017233817, + "learning_rate": 3.9364027547832326e-05, + "loss": 0.9368, + "step": 6538 + }, + { + "epoch": 0.32613466334164587, + "grad_norm": 0.7130653265325784, + "learning_rate": 3.936072204210342e-05, + "loss": 0.838, + "step": 6539 + }, + { + "epoch": 0.3261845386533666, + "grad_norm": 0.405073376118472, + "learning_rate": 3.935741616163789e-05, + "loss": 0.9131, + "step": 6540 + }, + { + "epoch": 0.3262344139650873, + "grad_norm": 0.5294853113230575, + "learning_rate": 3.935410990652201e-05, + "loss": 0.9341, + "step": 6541 + }, + { + "epoch": 0.32628428927680797, + "grad_norm": 0.5931026413353041, + "learning_rate": 3.9350803276842064e-05, + "loss": 0.8807, + "step": 6542 + }, + { + "epoch": 0.32633416458852865, + "grad_norm": 2.3836767490669595, + "learning_rate": 3.9347496272684325e-05, + "loss": 0.9382, + "step": 6543 + }, + { + "epoch": 0.3263840399002494, + "grad_norm": 0.6053971131119033, + "learning_rate": 3.934418889413509e-05, + "loss": 0.8885, + "step": 6544 + }, + { + "epoch": 0.32643391521197007, + "grad_norm": 0.5985523942346429, + "learning_rate": 3.9340881141280673e-05, + "loss": 0.9818, + "step": 6545 + }, + { + "epoch": 0.32648379052369075, + "grad_norm": 0.6577790251425014, + "learning_rate": 3.933757301420737e-05, + "loss": 0.895, + "step": 6546 + }, + { + "epoch": 0.3265336658354115, + "grad_norm": 0.4541413026894826, + "learning_rate": 3.933426451300152e-05, + "loss": 0.9414, + "step": 6547 + }, + { + "epoch": 0.3265835411471322, + "grad_norm": 0.6409704890779493, + "learning_rate": 3.933095563774945e-05, + "loss": 0.9419, + "step": 6548 + }, + { + "epoch": 0.32663341645885285, + "grad_norm": 0.6022963068686343, + "learning_rate": 3.932764638853752e-05, + "loss": 0.9841, + "step": 6549 + }, + { + "epoch": 0.3266832917705736, + "grad_norm": 0.5916299316569057, + "learning_rate": 3.932433676545205e-05, + "loss": 0.9377, + "step": 6550 + }, + { + "epoch": 0.3267331670822943, + "grad_norm": 0.6306653409900499, + "learning_rate": 3.932102676857944e-05, + "loss": 0.9142, + "step": 6551 + }, + { + "epoch": 0.32678304239401496, + "grad_norm": 0.38320762422434784, + "learning_rate": 3.931771639800603e-05, + "loss": 0.914, + "step": 6552 + }, + { + "epoch": 0.32683291770573564, + "grad_norm": 0.49723342899253886, + "learning_rate": 3.931440565381823e-05, + "loss": 0.9662, + "step": 6553 + }, + { + "epoch": 0.3268827930174564, + "grad_norm": 0.8945189232497001, + "learning_rate": 3.931109453610242e-05, + "loss": 0.9255, + "step": 6554 + }, + { + "epoch": 0.32693266832917706, + "grad_norm": 0.7051698654332025, + "learning_rate": 3.9307783044945e-05, + "loss": 0.9773, + "step": 6555 + }, + { + "epoch": 0.32698254364089774, + "grad_norm": 0.4230915286409951, + "learning_rate": 3.930447118043239e-05, + "loss": 0.9091, + "step": 6556 + }, + { + "epoch": 0.3270324189526185, + "grad_norm": 0.5941095332730797, + "learning_rate": 3.9301158942650994e-05, + "loss": 0.9257, + "step": 6557 + }, + { + "epoch": 0.32708229426433916, + "grad_norm": 0.4994359432949944, + "learning_rate": 3.9297846331687275e-05, + "loss": 0.9292, + "step": 6558 + }, + { + "epoch": 0.32713216957605984, + "grad_norm": 0.43251967075532416, + "learning_rate": 3.929453334762764e-05, + "loss": 0.8993, + "step": 6559 + }, + { + "epoch": 0.3271820448877805, + "grad_norm": 0.7942954993376146, + "learning_rate": 3.929121999055855e-05, + "loss": 0.8912, + "step": 6560 + }, + { + "epoch": 0.32723192019950126, + "grad_norm": 0.4258231557550881, + "learning_rate": 3.9287906260566474e-05, + "loss": 0.9212, + "step": 6561 + }, + { + "epoch": 0.32728179551122194, + "grad_norm": 0.5617575037980219, + "learning_rate": 3.928459215773789e-05, + "loss": 0.9015, + "step": 6562 + }, + { + "epoch": 0.3273316708229426, + "grad_norm": 0.567954131597833, + "learning_rate": 3.9281277682159255e-05, + "loss": 0.9344, + "step": 6563 + }, + { + "epoch": 0.32738154613466336, + "grad_norm": 0.47358534811503705, + "learning_rate": 3.9277962833917084e-05, + "loss": 0.9196, + "step": 6564 + }, + { + "epoch": 0.32743142144638404, + "grad_norm": 0.47339593991288714, + "learning_rate": 3.927464761309785e-05, + "loss": 0.9327, + "step": 6565 + }, + { + "epoch": 0.3274812967581047, + "grad_norm": 1.3144037722916018, + "learning_rate": 3.927133201978808e-05, + "loss": 0.8948, + "step": 6566 + }, + { + "epoch": 0.32753117206982546, + "grad_norm": 0.49856543554849114, + "learning_rate": 3.926801605407428e-05, + "loss": 0.8968, + "step": 6567 + }, + { + "epoch": 0.32758104738154614, + "grad_norm": 0.4433722333230415, + "learning_rate": 3.9264699716042996e-05, + "loss": 0.939, + "step": 6568 + }, + { + "epoch": 0.3276309226932668, + "grad_norm": 0.4903374373015369, + "learning_rate": 3.926138300578075e-05, + "loss": 0.9245, + "step": 6569 + }, + { + "epoch": 0.3276807980049875, + "grad_norm": 0.5427981412670039, + "learning_rate": 3.92580659233741e-05, + "loss": 0.977, + "step": 6570 + }, + { + "epoch": 0.32773067331670824, + "grad_norm": 0.4342532380937421, + "learning_rate": 3.9254748468909606e-05, + "loss": 0.8659, + "step": 6571 + }, + { + "epoch": 0.3277805486284289, + "grad_norm": 0.6291424250850826, + "learning_rate": 3.9251430642473823e-05, + "loss": 0.9482, + "step": 6572 + }, + { + "epoch": 0.3278304239401496, + "grad_norm": 0.5030425793437484, + "learning_rate": 3.924811244415334e-05, + "loss": 0.968, + "step": 6573 + }, + { + "epoch": 0.32788029925187034, + "grad_norm": 2.42256720844296, + "learning_rate": 3.924479387403473e-05, + "loss": 0.9457, + "step": 6574 + }, + { + "epoch": 0.327930174563591, + "grad_norm": 0.6815480515801855, + "learning_rate": 3.924147493220462e-05, + "loss": 0.9223, + "step": 6575 + }, + { + "epoch": 0.3279800498753117, + "grad_norm": 0.8468912363629608, + "learning_rate": 3.9238155618749575e-05, + "loss": 0.9149, + "step": 6576 + }, + { + "epoch": 0.32802992518703245, + "grad_norm": 0.5363425328621707, + "learning_rate": 3.923483593375624e-05, + "loss": 0.9122, + "step": 6577 + }, + { + "epoch": 0.3280798004987531, + "grad_norm": 0.43808279206460726, + "learning_rate": 3.923151587731123e-05, + "loss": 0.9251, + "step": 6578 + }, + { + "epoch": 0.3281296758104738, + "grad_norm": 0.41762630441741533, + "learning_rate": 3.922819544950118e-05, + "loss": 0.925, + "step": 6579 + }, + { + "epoch": 0.3281795511221945, + "grad_norm": 0.6071900956100109, + "learning_rate": 3.922487465041275e-05, + "loss": 0.9653, + "step": 6580 + }, + { + "epoch": 0.32822942643391523, + "grad_norm": 0.5272182681305214, + "learning_rate": 3.9221553480132565e-05, + "loss": 0.9059, + "step": 6581 + }, + { + "epoch": 0.3282793017456359, + "grad_norm": 0.5796101184699491, + "learning_rate": 3.921823193874732e-05, + "loss": 0.9047, + "step": 6582 + }, + { + "epoch": 0.3283291770573566, + "grad_norm": 0.4567978380733284, + "learning_rate": 3.9214910026343663e-05, + "loss": 0.8819, + "step": 6583 + }, + { + "epoch": 0.32837905236907733, + "grad_norm": 0.492301964464915, + "learning_rate": 3.92115877430083e-05, + "loss": 0.9594, + "step": 6584 + }, + { + "epoch": 0.328428927680798, + "grad_norm": 0.4433495266527863, + "learning_rate": 3.920826508882791e-05, + "loss": 0.8702, + "step": 6585 + }, + { + "epoch": 0.3284788029925187, + "grad_norm": 0.47608800611972124, + "learning_rate": 3.920494206388921e-05, + "loss": 0.9842, + "step": 6586 + }, + { + "epoch": 0.3285286783042394, + "grad_norm": 0.6434521638869986, + "learning_rate": 3.920161866827889e-05, + "loss": 0.9092, + "step": 6587 + }, + { + "epoch": 0.3285785536159601, + "grad_norm": 0.4537293432285869, + "learning_rate": 3.91982949020837e-05, + "loss": 0.9234, + "step": 6588 + }, + { + "epoch": 0.3286284289276808, + "grad_norm": 0.47821156139183807, + "learning_rate": 3.919497076539035e-05, + "loss": 0.9111, + "step": 6589 + }, + { + "epoch": 0.3286783042394015, + "grad_norm": 0.6839118498226641, + "learning_rate": 3.91916462582856e-05, + "loss": 0.915, + "step": 6590 + }, + { + "epoch": 0.3287281795511222, + "grad_norm": 0.5926865417887208, + "learning_rate": 3.9188321380856184e-05, + "loss": 0.9567, + "step": 6591 + }, + { + "epoch": 0.3287780548628429, + "grad_norm": 0.6290601715835572, + "learning_rate": 3.918499613318888e-05, + "loss": 0.9541, + "step": 6592 + }, + { + "epoch": 0.3288279301745636, + "grad_norm": 0.4246563320783706, + "learning_rate": 3.918167051537044e-05, + "loss": 0.9122, + "step": 6593 + }, + { + "epoch": 0.3288778054862843, + "grad_norm": 0.5658910955107691, + "learning_rate": 3.9178344527487665e-05, + "loss": 0.9153, + "step": 6594 + }, + { + "epoch": 0.328927680798005, + "grad_norm": 0.49189474355922275, + "learning_rate": 3.9175018169627326e-05, + "loss": 0.9107, + "step": 6595 + }, + { + "epoch": 0.3289775561097257, + "grad_norm": 0.4636547755755999, + "learning_rate": 3.9171691441876236e-05, + "loss": 0.8944, + "step": 6596 + }, + { + "epoch": 0.32902743142144636, + "grad_norm": 0.6440483042834225, + "learning_rate": 3.9168364344321194e-05, + "loss": 0.9039, + "step": 6597 + }, + { + "epoch": 0.3290773067331671, + "grad_norm": 0.5405015914989568, + "learning_rate": 3.916503687704903e-05, + "loss": 0.8656, + "step": 6598 + }, + { + "epoch": 0.3291271820448878, + "grad_norm": 0.8368659080621894, + "learning_rate": 3.9161709040146576e-05, + "loss": 0.9295, + "step": 6599 + }, + { + "epoch": 0.32917705735660846, + "grad_norm": 0.46772699911921506, + "learning_rate": 3.915838083370065e-05, + "loss": 0.9075, + "step": 6600 + }, + { + "epoch": 0.3292269326683292, + "grad_norm": 0.4422792091295753, + "learning_rate": 3.915505225779812e-05, + "loss": 0.9449, + "step": 6601 + }, + { + "epoch": 0.3292768079800499, + "grad_norm": 0.8649515825754772, + "learning_rate": 3.915172331252583e-05, + "loss": 0.8736, + "step": 6602 + }, + { + "epoch": 0.32932668329177056, + "grad_norm": 0.4963760634833785, + "learning_rate": 3.914839399797066e-05, + "loss": 0.9218, + "step": 6603 + }, + { + "epoch": 0.3293765586034913, + "grad_norm": 0.6396100860419609, + "learning_rate": 3.914506431421948e-05, + "loss": 0.856, + "step": 6604 + }, + { + "epoch": 0.329426433915212, + "grad_norm": 0.6022350258710462, + "learning_rate": 3.9141734261359176e-05, + "loss": 0.8918, + "step": 6605 + }, + { + "epoch": 0.32947630922693266, + "grad_norm": 0.5387372515218676, + "learning_rate": 3.913840383947665e-05, + "loss": 0.961, + "step": 6606 + }, + { + "epoch": 0.32952618453865334, + "grad_norm": 0.5580330187706773, + "learning_rate": 3.9135073048658794e-05, + "loss": 0.9119, + "step": 6607 + }, + { + "epoch": 0.3295760598503741, + "grad_norm": 0.6062177576887614, + "learning_rate": 3.9131741888992536e-05, + "loss": 0.8935, + "step": 6608 + }, + { + "epoch": 0.32962593516209476, + "grad_norm": 0.584088630036947, + "learning_rate": 3.91284103605648e-05, + "loss": 0.9182, + "step": 6609 + }, + { + "epoch": 0.32967581047381544, + "grad_norm": 0.5934348384481394, + "learning_rate": 3.9125078463462516e-05, + "loss": 0.9562, + "step": 6610 + }, + { + "epoch": 0.3297256857855362, + "grad_norm": 0.4634211921631931, + "learning_rate": 3.912174619777263e-05, + "loss": 0.8696, + "step": 6611 + }, + { + "epoch": 0.32977556109725686, + "grad_norm": 0.47556933071631524, + "learning_rate": 3.911841356358211e-05, + "loss": 0.9152, + "step": 6612 + }, + { + "epoch": 0.32982543640897755, + "grad_norm": 0.9532650666962469, + "learning_rate": 3.911508056097789e-05, + "loss": 0.8908, + "step": 6613 + }, + { + "epoch": 0.3298753117206982, + "grad_norm": 0.5079773182022865, + "learning_rate": 3.911174719004697e-05, + "loss": 0.9101, + "step": 6614 + }, + { + "epoch": 0.32992518703241897, + "grad_norm": 0.503253954801868, + "learning_rate": 3.910841345087631e-05, + "loss": 0.9547, + "step": 6615 + }, + { + "epoch": 0.32997506234413965, + "grad_norm": 0.6627902431682936, + "learning_rate": 3.910507934355293e-05, + "loss": 0.9556, + "step": 6616 + }, + { + "epoch": 0.33002493765586033, + "grad_norm": 0.5849769929483606, + "learning_rate": 3.9101744868163816e-05, + "loss": 0.9291, + "step": 6617 + }, + { + "epoch": 0.33007481296758107, + "grad_norm": 0.676918416683894, + "learning_rate": 3.9098410024795974e-05, + "loss": 0.9337, + "step": 6618 + }, + { + "epoch": 0.33012468827930175, + "grad_norm": 0.4557568880049336, + "learning_rate": 3.909507481353643e-05, + "loss": 0.9513, + "step": 6619 + }, + { + "epoch": 0.33017456359102243, + "grad_norm": 0.5131754723370984, + "learning_rate": 3.909173923447222e-05, + "loss": 0.9203, + "step": 6620 + }, + { + "epoch": 0.33022443890274317, + "grad_norm": 0.5294605509352507, + "learning_rate": 3.908840328769039e-05, + "loss": 0.9187, + "step": 6621 + }, + { + "epoch": 0.33027431421446385, + "grad_norm": 0.5516084277935698, + "learning_rate": 3.9085066973277966e-05, + "loss": 0.9093, + "step": 6622 + }, + { + "epoch": 0.33032418952618453, + "grad_norm": 0.5537847154457199, + "learning_rate": 3.908173029132204e-05, + "loss": 0.8696, + "step": 6623 + }, + { + "epoch": 0.3303740648379052, + "grad_norm": 0.6388784804022198, + "learning_rate": 3.907839324190965e-05, + "loss": 0.9176, + "step": 6624 + }, + { + "epoch": 0.33042394014962595, + "grad_norm": 0.5895798036250968, + "learning_rate": 3.9075055825127894e-05, + "loss": 0.915, + "step": 6625 + }, + { + "epoch": 0.33047381546134663, + "grad_norm": 0.4902531509391588, + "learning_rate": 3.9071718041063856e-05, + "loss": 0.9277, + "step": 6626 + }, + { + "epoch": 0.3305236907730673, + "grad_norm": 0.7970050937046489, + "learning_rate": 3.906837988980463e-05, + "loss": 0.997, + "step": 6627 + }, + { + "epoch": 0.33057356608478805, + "grad_norm": 0.5825120038251035, + "learning_rate": 3.9065041371437336e-05, + "loss": 0.9393, + "step": 6628 + }, + { + "epoch": 0.33062344139650873, + "grad_norm": 0.48393260810260297, + "learning_rate": 3.9061702486049065e-05, + "loss": 0.8811, + "step": 6629 + }, + { + "epoch": 0.3306733167082294, + "grad_norm": 0.6240898644043654, + "learning_rate": 3.905836323372698e-05, + "loss": 0.9399, + "step": 6630 + }, + { + "epoch": 0.33072319201995015, + "grad_norm": 0.858938857115376, + "learning_rate": 3.905502361455818e-05, + "loss": 0.9575, + "step": 6631 + }, + { + "epoch": 0.33077306733167083, + "grad_norm": 0.4799678841342806, + "learning_rate": 3.905168362862984e-05, + "loss": 0.8792, + "step": 6632 + }, + { + "epoch": 0.3308229426433915, + "grad_norm": 0.5040340831940253, + "learning_rate": 3.9048343276029106e-05, + "loss": 0.9565, + "step": 6633 + }, + { + "epoch": 0.3308728179551122, + "grad_norm": 0.547258946837136, + "learning_rate": 3.904500255684314e-05, + "loss": 0.9062, + "step": 6634 + }, + { + "epoch": 0.33092269326683293, + "grad_norm": 0.4843142694062808, + "learning_rate": 3.904166147115912e-05, + "loss": 0.9515, + "step": 6635 + }, + { + "epoch": 0.3309725685785536, + "grad_norm": 0.6861297396997327, + "learning_rate": 3.903832001906422e-05, + "loss": 0.9122, + "step": 6636 + }, + { + "epoch": 0.3310224438902743, + "grad_norm": 0.6606612958512266, + "learning_rate": 3.903497820064565e-05, + "loss": 0.9358, + "step": 6637 + }, + { + "epoch": 0.33107231920199504, + "grad_norm": 1.1639813328335, + "learning_rate": 3.9031636015990606e-05, + "loss": 0.9143, + "step": 6638 + }, + { + "epoch": 0.3311221945137157, + "grad_norm": 0.6388740786641398, + "learning_rate": 3.9028293465186295e-05, + "loss": 0.913, + "step": 6639 + }, + { + "epoch": 0.3311720698254364, + "grad_norm": 0.47599891914822584, + "learning_rate": 3.902495054831994e-05, + "loss": 0.9024, + "step": 6640 + }, + { + "epoch": 0.3312219451371571, + "grad_norm": 0.651415043563272, + "learning_rate": 3.9021607265478786e-05, + "loss": 0.947, + "step": 6641 + }, + { + "epoch": 0.3312718204488778, + "grad_norm": 0.4436197076845519, + "learning_rate": 3.9018263616750064e-05, + "loss": 0.9, + "step": 6642 + }, + { + "epoch": 0.3313216957605985, + "grad_norm": 0.5990446197876763, + "learning_rate": 3.901491960222102e-05, + "loss": 0.936, + "step": 6643 + }, + { + "epoch": 0.3313715710723192, + "grad_norm": 0.7067466339993802, + "learning_rate": 3.9011575221978925e-05, + "loss": 0.9511, + "step": 6644 + }, + { + "epoch": 0.3314214463840399, + "grad_norm": 0.46598006506759465, + "learning_rate": 3.9008230476111046e-05, + "loss": 0.8743, + "step": 6645 + }, + { + "epoch": 0.3314713216957606, + "grad_norm": 0.6463982847313435, + "learning_rate": 3.900488536470466e-05, + "loss": 0.8886, + "step": 6646 + }, + { + "epoch": 0.3315211970074813, + "grad_norm": 0.5280061454455606, + "learning_rate": 3.900153988784706e-05, + "loss": 0.9968, + "step": 6647 + }, + { + "epoch": 0.331571072319202, + "grad_norm": 0.535861438586933, + "learning_rate": 3.8998194045625544e-05, + "loss": 0.8913, + "step": 6648 + }, + { + "epoch": 0.3316209476309227, + "grad_norm": 0.5088443192239144, + "learning_rate": 3.8994847838127416e-05, + "loss": 0.9073, + "step": 6649 + }, + { + "epoch": 0.3316708229426434, + "grad_norm": 0.4571809918497867, + "learning_rate": 3.899150126544e-05, + "loss": 0.8777, + "step": 6650 + }, + { + "epoch": 0.33172069825436407, + "grad_norm": 0.5332022295760329, + "learning_rate": 3.898815432765062e-05, + "loss": 0.8856, + "step": 6651 + }, + { + "epoch": 0.3317705735660848, + "grad_norm": 0.49867904790085016, + "learning_rate": 3.898480702484661e-05, + "loss": 0.8677, + "step": 6652 + }, + { + "epoch": 0.3318204488778055, + "grad_norm": 0.5286638736235688, + "learning_rate": 3.8981459357115325e-05, + "loss": 0.9162, + "step": 6653 + }, + { + "epoch": 0.33187032418952617, + "grad_norm": 0.5053491261762184, + "learning_rate": 3.897811132454411e-05, + "loss": 0.915, + "step": 6654 + }, + { + "epoch": 0.3319201995012469, + "grad_norm": 0.5217912718358104, + "learning_rate": 3.897476292722034e-05, + "loss": 0.938, + "step": 6655 + }, + { + "epoch": 0.3319700748129676, + "grad_norm": 1.0547363914050079, + "learning_rate": 3.8971414165231384e-05, + "loss": 0.8986, + "step": 6656 + }, + { + "epoch": 0.33201995012468827, + "grad_norm": 0.5304691949873197, + "learning_rate": 3.896806503866462e-05, + "loss": 0.9323, + "step": 6657 + }, + { + "epoch": 0.332069825436409, + "grad_norm": 1.1110385810976073, + "learning_rate": 3.896471554760747e-05, + "loss": 0.9013, + "step": 6658 + }, + { + "epoch": 0.3321197007481297, + "grad_norm": 0.566535149426473, + "learning_rate": 3.89613656921473e-05, + "loss": 0.8772, + "step": 6659 + }, + { + "epoch": 0.33216957605985037, + "grad_norm": 1.141568977795702, + "learning_rate": 3.895801547237155e-05, + "loss": 0.9458, + "step": 6660 + }, + { + "epoch": 0.33221945137157105, + "grad_norm": 0.7415784935309202, + "learning_rate": 3.895466488836762e-05, + "loss": 0.9521, + "step": 6661 + }, + { + "epoch": 0.3322693266832918, + "grad_norm": 0.8233116419136706, + "learning_rate": 3.895131394022297e-05, + "loss": 0.9069, + "step": 6662 + }, + { + "epoch": 0.33231920199501247, + "grad_norm": 0.5122336233913525, + "learning_rate": 3.8947962628025026e-05, + "loss": 0.926, + "step": 6663 + }, + { + "epoch": 0.33236907730673315, + "grad_norm": 0.47045748078878336, + "learning_rate": 3.8944610951861224e-05, + "loss": 0.9472, + "step": 6664 + }, + { + "epoch": 0.3324189526184539, + "grad_norm": 0.4596398691541692, + "learning_rate": 3.894125891181906e-05, + "loss": 0.9265, + "step": 6665 + }, + { + "epoch": 0.33246882793017457, + "grad_norm": 0.5472776353925195, + "learning_rate": 3.893790650798597e-05, + "loss": 0.9843, + "step": 6666 + }, + { + "epoch": 0.33251870324189525, + "grad_norm": 0.4651362266991972, + "learning_rate": 3.893455374044945e-05, + "loss": 0.8653, + "step": 6667 + }, + { + "epoch": 0.33256857855361593, + "grad_norm": 0.5746083771271904, + "learning_rate": 3.8931200609296994e-05, + "loss": 0.91, + "step": 6668 + }, + { + "epoch": 0.33261845386533667, + "grad_norm": 0.5459541108268949, + "learning_rate": 3.8927847114616086e-05, + "loss": 0.8984, + "step": 6669 + }, + { + "epoch": 0.33266832917705735, + "grad_norm": 0.45219650740866135, + "learning_rate": 3.892449325649424e-05, + "loss": 0.8937, + "step": 6670 + }, + { + "epoch": 0.33271820448877804, + "grad_norm": 0.5592633226858461, + "learning_rate": 3.8921139035018974e-05, + "loss": 0.8982, + "step": 6671 + }, + { + "epoch": 0.3327680798004988, + "grad_norm": 0.5712729183667107, + "learning_rate": 3.8917784450277814e-05, + "loss": 0.9179, + "step": 6672 + }, + { + "epoch": 0.33281795511221945, + "grad_norm": 0.813119367783145, + "learning_rate": 3.891442950235831e-05, + "loss": 0.9077, + "step": 6673 + }, + { + "epoch": 0.33286783042394014, + "grad_norm": 0.631055344324506, + "learning_rate": 3.891107419134798e-05, + "loss": 0.9114, + "step": 6674 + }, + { + "epoch": 0.3329177057356609, + "grad_norm": 0.5242122311527141, + "learning_rate": 3.89077185173344e-05, + "loss": 0.9365, + "step": 6675 + }, + { + "epoch": 0.33296758104738156, + "grad_norm": 0.5233206677935386, + "learning_rate": 3.890436248040513e-05, + "loss": 0.9259, + "step": 6676 + }, + { + "epoch": 0.33301745635910224, + "grad_norm": 0.45943606378399326, + "learning_rate": 3.8901006080647744e-05, + "loss": 0.8828, + "step": 6677 + }, + { + "epoch": 0.3330673316708229, + "grad_norm": 0.4953002422283316, + "learning_rate": 3.889764931814983e-05, + "loss": 0.9203, + "step": 6678 + }, + { + "epoch": 0.33311720698254366, + "grad_norm": 0.8316389070125454, + "learning_rate": 3.8894292192998974e-05, + "loss": 0.9606, + "step": 6679 + }, + { + "epoch": 0.33316708229426434, + "grad_norm": 0.8528251938461213, + "learning_rate": 3.889093470528278e-05, + "loss": 0.9064, + "step": 6680 + }, + { + "epoch": 0.333216957605985, + "grad_norm": 0.674960929719157, + "learning_rate": 3.888757685508886e-05, + "loss": 0.8818, + "step": 6681 + }, + { + "epoch": 0.33326683291770576, + "grad_norm": 0.6301075696450406, + "learning_rate": 3.888421864250485e-05, + "loss": 0.9386, + "step": 6682 + }, + { + "epoch": 0.33331670822942644, + "grad_norm": 0.5235582102568344, + "learning_rate": 3.888086006761835e-05, + "loss": 0.8787, + "step": 6683 + }, + { + "epoch": 0.3333665835411471, + "grad_norm": 0.538535339916254, + "learning_rate": 3.887750113051703e-05, + "loss": 0.9339, + "step": 6684 + }, + { + "epoch": 0.3334164588528678, + "grad_norm": 0.8236914334647234, + "learning_rate": 3.8874141831288516e-05, + "loss": 0.8871, + "step": 6685 + }, + { + "epoch": 0.33346633416458854, + "grad_norm": 0.6323237895338826, + "learning_rate": 3.887078217002049e-05, + "loss": 0.9447, + "step": 6686 + }, + { + "epoch": 0.3335162094763092, + "grad_norm": 0.651817311734799, + "learning_rate": 3.8867422146800615e-05, + "loss": 0.8971, + "step": 6687 + }, + { + "epoch": 0.3335660847880299, + "grad_norm": 0.6319885208722159, + "learning_rate": 3.8864061761716556e-05, + "loss": 0.9081, + "step": 6688 + }, + { + "epoch": 0.33361596009975064, + "grad_norm": 1.1087131089113733, + "learning_rate": 3.886070101485602e-05, + "loss": 0.9044, + "step": 6689 + }, + { + "epoch": 0.3336658354114713, + "grad_norm": 0.7187517644507034, + "learning_rate": 3.8857339906306685e-05, + "loss": 0.9429, + "step": 6690 + }, + { + "epoch": 0.333715710723192, + "grad_norm": 0.660382981036101, + "learning_rate": 3.885397843615627e-05, + "loss": 0.8822, + "step": 6691 + }, + { + "epoch": 0.33376558603491274, + "grad_norm": 0.5515283739062485, + "learning_rate": 3.8850616604492484e-05, + "loss": 0.8995, + "step": 6692 + }, + { + "epoch": 0.3338154613466334, + "grad_norm": 0.8757276947418293, + "learning_rate": 3.8847254411403066e-05, + "loss": 0.9617, + "step": 6693 + }, + { + "epoch": 0.3338653366583541, + "grad_norm": 0.4897020990837167, + "learning_rate": 3.8843891856975736e-05, + "loss": 0.8866, + "step": 6694 + }, + { + "epoch": 0.3339152119700748, + "grad_norm": 0.9102200170210256, + "learning_rate": 3.8840528941298246e-05, + "loss": 0.9496, + "step": 6695 + }, + { + "epoch": 0.3339650872817955, + "grad_norm": 0.610010182014294, + "learning_rate": 3.883716566445834e-05, + "loss": 0.8809, + "step": 6696 + }, + { + "epoch": 0.3340149625935162, + "grad_norm": 0.6643023523146316, + "learning_rate": 3.8833802026543796e-05, + "loss": 0.9325, + "step": 6697 + }, + { + "epoch": 0.3340648379052369, + "grad_norm": 0.5172940041822804, + "learning_rate": 3.883043802764238e-05, + "loss": 0.9195, + "step": 6698 + }, + { + "epoch": 0.3341147132169576, + "grad_norm": 0.7605986817774703, + "learning_rate": 3.882707366784187e-05, + "loss": 0.9286, + "step": 6699 + }, + { + "epoch": 0.3341645885286783, + "grad_norm": 0.4675696196683097, + "learning_rate": 3.882370894723006e-05, + "loss": 0.9165, + "step": 6700 + }, + { + "epoch": 0.334214463840399, + "grad_norm": 0.46684755894976565, + "learning_rate": 3.882034386589476e-05, + "loss": 0.8534, + "step": 6701 + }, + { + "epoch": 0.3342643391521197, + "grad_norm": 0.5320476130231399, + "learning_rate": 3.8816978423923765e-05, + "loss": 0.9272, + "step": 6702 + }, + { + "epoch": 0.3343142144638404, + "grad_norm": 1.0472710424663052, + "learning_rate": 3.88136126214049e-05, + "loss": 0.901, + "step": 6703 + }, + { + "epoch": 0.3343640897755611, + "grad_norm": 0.5136932428109929, + "learning_rate": 3.8810246458426004e-05, + "loss": 0.9446, + "step": 6704 + }, + { + "epoch": 0.3344139650872818, + "grad_norm": 0.8956752195076901, + "learning_rate": 3.880687993507489e-05, + "loss": 0.9115, + "step": 6705 + }, + { + "epoch": 0.3344638403990025, + "grad_norm": 1.0491364206582907, + "learning_rate": 3.880351305143944e-05, + "loss": 0.8873, + "step": 6706 + }, + { + "epoch": 0.3345137157107232, + "grad_norm": 0.4972357683364793, + "learning_rate": 3.880014580760749e-05, + "loss": 0.8978, + "step": 6707 + }, + { + "epoch": 0.3345635910224439, + "grad_norm": 0.6392841228137667, + "learning_rate": 3.8796778203666905e-05, + "loss": 0.9006, + "step": 6708 + }, + { + "epoch": 0.3346134663341646, + "grad_norm": 0.5275265926990017, + "learning_rate": 3.8793410239705575e-05, + "loss": 0.9487, + "step": 6709 + }, + { + "epoch": 0.3346633416458853, + "grad_norm": 0.7836164104043739, + "learning_rate": 3.8790041915811384e-05, + "loss": 0.9235, + "step": 6710 + }, + { + "epoch": 0.334713216957606, + "grad_norm": 0.7850960189410398, + "learning_rate": 3.8786673232072206e-05, + "loss": 0.9266, + "step": 6711 + }, + { + "epoch": 0.33476309226932666, + "grad_norm": 0.5914946874951063, + "learning_rate": 3.8783304188575964e-05, + "loss": 0.9358, + "step": 6712 + }, + { + "epoch": 0.3348129675810474, + "grad_norm": 0.7012711030034396, + "learning_rate": 3.877993478541058e-05, + "loss": 0.8972, + "step": 6713 + }, + { + "epoch": 0.3348628428927681, + "grad_norm": 0.9376852625784556, + "learning_rate": 3.877656502266395e-05, + "loss": 0.9408, + "step": 6714 + }, + { + "epoch": 0.33491271820448876, + "grad_norm": 0.5716112954078723, + "learning_rate": 3.877319490042402e-05, + "loss": 0.9398, + "step": 6715 + }, + { + "epoch": 0.3349625935162095, + "grad_norm": 1.0462992668855062, + "learning_rate": 3.876982441877874e-05, + "loss": 0.9106, + "step": 6716 + }, + { + "epoch": 0.3350124688279302, + "grad_norm": 0.5792824086669546, + "learning_rate": 3.876645357781606e-05, + "loss": 0.8595, + "step": 6717 + }, + { + "epoch": 0.33506234413965086, + "grad_norm": 0.470233753433031, + "learning_rate": 3.876308237762393e-05, + "loss": 0.8949, + "step": 6718 + }, + { + "epoch": 0.3351122194513716, + "grad_norm": 0.724385808152499, + "learning_rate": 3.875971081829033e-05, + "loss": 0.958, + "step": 6719 + }, + { + "epoch": 0.3351620947630923, + "grad_norm": 0.7343534547559778, + "learning_rate": 3.8756338899903223e-05, + "loss": 0.9718, + "step": 6720 + }, + { + "epoch": 0.33521197007481296, + "grad_norm": 0.5933646296438798, + "learning_rate": 3.8752966622550616e-05, + "loss": 0.9188, + "step": 6721 + }, + { + "epoch": 0.33526184538653364, + "grad_norm": 0.9126130830754636, + "learning_rate": 3.8749593986320495e-05, + "loss": 0.9503, + "step": 6722 + }, + { + "epoch": 0.3353117206982544, + "grad_norm": 0.584978614546823, + "learning_rate": 3.874622099130087e-05, + "loss": 0.8965, + "step": 6723 + }, + { + "epoch": 0.33536159600997506, + "grad_norm": 0.5827574544664962, + "learning_rate": 3.874284763757977e-05, + "loss": 0.9462, + "step": 6724 + }, + { + "epoch": 0.33541147132169574, + "grad_norm": 0.5760988973918612, + "learning_rate": 3.873947392524521e-05, + "loss": 0.9061, + "step": 6725 + }, + { + "epoch": 0.3354613466334165, + "grad_norm": 0.5134795045629543, + "learning_rate": 3.873609985438522e-05, + "loss": 0.9303, + "step": 6726 + }, + { + "epoch": 0.33551122194513716, + "grad_norm": 0.7616159127599549, + "learning_rate": 3.873272542508786e-05, + "loss": 0.9056, + "step": 6727 + }, + { + "epoch": 0.33556109725685784, + "grad_norm": 0.6005892772507079, + "learning_rate": 3.8729350637441176e-05, + "loss": 0.9233, + "step": 6728 + }, + { + "epoch": 0.3356109725685786, + "grad_norm": 1.072965058304705, + "learning_rate": 3.8725975491533226e-05, + "loss": 0.9237, + "step": 6729 + }, + { + "epoch": 0.33566084788029926, + "grad_norm": 0.790823902411691, + "learning_rate": 3.872259998745209e-05, + "loss": 0.9147, + "step": 6730 + }, + { + "epoch": 0.33571072319201994, + "grad_norm": 0.5672903237902427, + "learning_rate": 3.871922412528585e-05, + "loss": 0.9046, + "step": 6731 + }, + { + "epoch": 0.3357605985037406, + "grad_norm": 0.5946010496334618, + "learning_rate": 3.871584790512259e-05, + "loss": 0.9193, + "step": 6732 + }, + { + "epoch": 0.33581047381546136, + "grad_norm": 0.5053936154443647, + "learning_rate": 3.8712471327050425e-05, + "loss": 0.8949, + "step": 6733 + }, + { + "epoch": 0.33586034912718205, + "grad_norm": 0.477077242464607, + "learning_rate": 3.870909439115745e-05, + "loss": 0.8668, + "step": 6734 + }, + { + "epoch": 0.3359102244389027, + "grad_norm": 0.6617508304013684, + "learning_rate": 3.87057170975318e-05, + "loss": 0.9616, + "step": 6735 + }, + { + "epoch": 0.33596009975062346, + "grad_norm": 0.4513275998718864, + "learning_rate": 3.8702339446261596e-05, + "loss": 0.9057, + "step": 6736 + }, + { + "epoch": 0.33600997506234415, + "grad_norm": 0.6065492319633424, + "learning_rate": 3.8698961437434977e-05, + "loss": 0.8932, + "step": 6737 + }, + { + "epoch": 0.33605985037406483, + "grad_norm": 0.5101917196824233, + "learning_rate": 3.8695583071140084e-05, + "loss": 0.8787, + "step": 6738 + }, + { + "epoch": 0.3361097256857855, + "grad_norm": 1.090528457427924, + "learning_rate": 3.869220434746509e-05, + "loss": 0.9091, + "step": 6739 + }, + { + "epoch": 0.33615960099750625, + "grad_norm": 0.5484393379296616, + "learning_rate": 3.8688825266498144e-05, + "loss": 0.8996, + "step": 6740 + }, + { + "epoch": 0.33620947630922693, + "grad_norm": 0.6935005483775581, + "learning_rate": 3.868544582832743e-05, + "loss": 0.897, + "step": 6741 + }, + { + "epoch": 0.3362593516209476, + "grad_norm": 0.5083149874610599, + "learning_rate": 3.868206603304112e-05, + "loss": 0.9385, + "step": 6742 + }, + { + "epoch": 0.33630922693266835, + "grad_norm": 0.4798549803928636, + "learning_rate": 3.867868588072744e-05, + "loss": 0.8723, + "step": 6743 + }, + { + "epoch": 0.33635910224438903, + "grad_norm": 0.5270326194071758, + "learning_rate": 3.8675305371474565e-05, + "loss": 0.9169, + "step": 6744 + }, + { + "epoch": 0.3364089775561097, + "grad_norm": 0.5515082697492732, + "learning_rate": 3.867192450537071e-05, + "loss": 0.9252, + "step": 6745 + }, + { + "epoch": 0.33645885286783045, + "grad_norm": 0.4542584422918443, + "learning_rate": 3.8668543282504124e-05, + "loss": 0.9067, + "step": 6746 + }, + { + "epoch": 0.33650872817955113, + "grad_norm": 0.49333338417944056, + "learning_rate": 3.8665161702963e-05, + "loss": 0.912, + "step": 6747 + }, + { + "epoch": 0.3365586034912718, + "grad_norm": 1.4714282042508422, + "learning_rate": 3.86617797668356e-05, + "loss": 0.9202, + "step": 6748 + }, + { + "epoch": 0.3366084788029925, + "grad_norm": 0.5158860501823944, + "learning_rate": 3.865839747421017e-05, + "loss": 0.8597, + "step": 6749 + }, + { + "epoch": 0.33665835411471323, + "grad_norm": 0.5783126206418114, + "learning_rate": 3.865501482517498e-05, + "loss": 0.9072, + "step": 6750 + }, + { + "epoch": 0.3367082294264339, + "grad_norm": 0.6015682233762005, + "learning_rate": 3.865163181981828e-05, + "loss": 0.9371, + "step": 6751 + }, + { + "epoch": 0.3367581047381546, + "grad_norm": 0.664790417664731, + "learning_rate": 3.864824845822836e-05, + "loss": 0.9511, + "step": 6752 + }, + { + "epoch": 0.33680798004987533, + "grad_norm": 0.755643774000235, + "learning_rate": 3.864486474049351e-05, + "loss": 0.8703, + "step": 6753 + }, + { + "epoch": 0.336857855361596, + "grad_norm": 1.7823670051004523, + "learning_rate": 3.864148066670201e-05, + "loss": 0.8988, + "step": 6754 + }, + { + "epoch": 0.3369077306733167, + "grad_norm": 0.9318385907058481, + "learning_rate": 3.863809623694219e-05, + "loss": 0.8842, + "step": 6755 + }, + { + "epoch": 0.33695760598503743, + "grad_norm": 0.5531500394600283, + "learning_rate": 3.863471145130235e-05, + "loss": 0.8813, + "step": 6756 + }, + { + "epoch": 0.3370074812967581, + "grad_norm": 0.6391123873713255, + "learning_rate": 3.863132630987081e-05, + "loss": 0.9383, + "step": 6757 + }, + { + "epoch": 0.3370573566084788, + "grad_norm": 0.5802918453573996, + "learning_rate": 3.862794081273591e-05, + "loss": 0.9202, + "step": 6758 + }, + { + "epoch": 0.3371072319201995, + "grad_norm": 0.5179770683647742, + "learning_rate": 3.8624554959986e-05, + "loss": 0.9489, + "step": 6759 + }, + { + "epoch": 0.3371571072319202, + "grad_norm": 0.718412970112207, + "learning_rate": 3.8621168751709426e-05, + "loss": 0.8894, + "step": 6760 + }, + { + "epoch": 0.3372069825436409, + "grad_norm": 0.6956726104412206, + "learning_rate": 3.8617782187994543e-05, + "loss": 0.9131, + "step": 6761 + }, + { + "epoch": 0.3372568578553616, + "grad_norm": 0.5849795609982825, + "learning_rate": 3.861439526892973e-05, + "loss": 0.9365, + "step": 6762 + }, + { + "epoch": 0.3373067331670823, + "grad_norm": 0.5571319801919237, + "learning_rate": 3.8611007994603365e-05, + "loss": 0.9396, + "step": 6763 + }, + { + "epoch": 0.337356608478803, + "grad_norm": 0.8151254185504371, + "learning_rate": 3.860762036510384e-05, + "loss": 0.9713, + "step": 6764 + }, + { + "epoch": 0.3374064837905237, + "grad_norm": 0.5502517139376489, + "learning_rate": 3.8604232380519556e-05, + "loss": 0.9228, + "step": 6765 + }, + { + "epoch": 0.33745635910224436, + "grad_norm": 0.6370903149831223, + "learning_rate": 3.860084404093891e-05, + "loss": 0.9087, + "step": 6766 + }, + { + "epoch": 0.3375062344139651, + "grad_norm": 0.6230476286477334, + "learning_rate": 3.859745534645033e-05, + "loss": 0.9485, + "step": 6767 + }, + { + "epoch": 0.3375561097256858, + "grad_norm": 0.6113290904122226, + "learning_rate": 3.859406629714223e-05, + "loss": 0.913, + "step": 6768 + }, + { + "epoch": 0.33760598503740646, + "grad_norm": 0.7237020043547371, + "learning_rate": 3.859067689310306e-05, + "loss": 0.8993, + "step": 6769 + }, + { + "epoch": 0.3376558603491272, + "grad_norm": 0.5937921882632078, + "learning_rate": 3.858728713442127e-05, + "loss": 0.9164, + "step": 6770 + }, + { + "epoch": 0.3377057356608479, + "grad_norm": 0.7995757246546885, + "learning_rate": 3.8583897021185283e-05, + "loss": 0.8571, + "step": 6771 + }, + { + "epoch": 0.33775561097256857, + "grad_norm": 0.5508679094904556, + "learning_rate": 3.85805065534836e-05, + "loss": 0.9034, + "step": 6772 + }, + { + "epoch": 0.3378054862842893, + "grad_norm": 2.614509265215495, + "learning_rate": 3.857711573140466e-05, + "loss": 0.9308, + "step": 6773 + }, + { + "epoch": 0.33785536159601, + "grad_norm": 1.4148879271928234, + "learning_rate": 3.857372455503697e-05, + "loss": 0.9, + "step": 6774 + }, + { + "epoch": 0.33790523690773067, + "grad_norm": 0.7558475895875013, + "learning_rate": 3.857033302446901e-05, + "loss": 0.9383, + "step": 6775 + }, + { + "epoch": 0.33795511221945135, + "grad_norm": 0.7171911197371166, + "learning_rate": 3.856694113978929e-05, + "loss": 0.979, + "step": 6776 + }, + { + "epoch": 0.3380049875311721, + "grad_norm": 0.9501271196849599, + "learning_rate": 3.8563548901086296e-05, + "loss": 0.9301, + "step": 6777 + }, + { + "epoch": 0.33805486284289277, + "grad_norm": 0.7793336183731823, + "learning_rate": 3.856015630844858e-05, + "loss": 0.8806, + "step": 6778 + }, + { + "epoch": 0.33810473815461345, + "grad_norm": 0.7822342677755649, + "learning_rate": 3.855676336196465e-05, + "loss": 0.9502, + "step": 6779 + }, + { + "epoch": 0.3381546134663342, + "grad_norm": 0.5977753448174377, + "learning_rate": 3.855337006172305e-05, + "loss": 0.8964, + "step": 6780 + }, + { + "epoch": 0.33820448877805487, + "grad_norm": 0.6408188764947067, + "learning_rate": 3.8549976407812316e-05, + "loss": 0.9035, + "step": 6781 + }, + { + "epoch": 0.33825436408977555, + "grad_norm": 0.7969384686537009, + "learning_rate": 3.854658240032102e-05, + "loss": 0.8392, + "step": 6782 + }, + { + "epoch": 0.33830423940149623, + "grad_norm": 0.7586189989820445, + "learning_rate": 3.8543188039337715e-05, + "loss": 0.9, + "step": 6783 + }, + { + "epoch": 0.33835411471321697, + "grad_norm": 0.7070148178677543, + "learning_rate": 3.853979332495098e-05, + "loss": 0.9033, + "step": 6784 + }, + { + "epoch": 0.33840399002493765, + "grad_norm": 0.5828451029324618, + "learning_rate": 3.8536398257249395e-05, + "loss": 0.8952, + "step": 6785 + }, + { + "epoch": 0.33845386533665833, + "grad_norm": 0.7000453165997609, + "learning_rate": 3.8533002836321556e-05, + "loss": 0.9447, + "step": 6786 + }, + { + "epoch": 0.33850374064837907, + "grad_norm": 0.6863576068160874, + "learning_rate": 3.8529607062256065e-05, + "loss": 0.9055, + "step": 6787 + }, + { + "epoch": 0.33855361596009975, + "grad_norm": 0.6299303137519221, + "learning_rate": 3.852621093514153e-05, + "loss": 0.8649, + "step": 6788 + }, + { + "epoch": 0.33860349127182043, + "grad_norm": 1.0572925507213564, + "learning_rate": 3.852281445506658e-05, + "loss": 0.9119, + "step": 6789 + }, + { + "epoch": 0.33865336658354117, + "grad_norm": 0.770102493191299, + "learning_rate": 3.851941762211984e-05, + "loss": 0.9208, + "step": 6790 + }, + { + "epoch": 0.33870324189526185, + "grad_norm": 0.4994568603549484, + "learning_rate": 3.851602043638994e-05, + "loss": 0.9015, + "step": 6791 + }, + { + "epoch": 0.33875311720698253, + "grad_norm": 0.48460916079378114, + "learning_rate": 3.851262289796554e-05, + "loss": 0.9066, + "step": 6792 + }, + { + "epoch": 0.3388029925187032, + "grad_norm": 0.5622006607750243, + "learning_rate": 3.85092250069353e-05, + "loss": 0.9404, + "step": 6793 + }, + { + "epoch": 0.33885286783042395, + "grad_norm": 0.6123300829547382, + "learning_rate": 3.8505826763387865e-05, + "loss": 0.9414, + "step": 6794 + }, + { + "epoch": 0.33890274314214464, + "grad_norm": 0.5064649900832572, + "learning_rate": 3.850242816741193e-05, + "loss": 0.8927, + "step": 6795 + }, + { + "epoch": 0.3389526184538653, + "grad_norm": 0.4661234646120823, + "learning_rate": 3.849902921909618e-05, + "loss": 0.8856, + "step": 6796 + }, + { + "epoch": 0.33900249376558605, + "grad_norm": 0.6661230794606908, + "learning_rate": 3.84956299185293e-05, + "loss": 0.9076, + "step": 6797 + }, + { + "epoch": 0.33905236907730674, + "grad_norm": 0.5041678900044392, + "learning_rate": 3.84922302658e-05, + "loss": 0.9055, + "step": 6798 + }, + { + "epoch": 0.3391022443890274, + "grad_norm": 0.6957760291437949, + "learning_rate": 3.8488830260996986e-05, + "loss": 0.8973, + "step": 6799 + }, + { + "epoch": 0.33915211970074816, + "grad_norm": 0.5578930605639911, + "learning_rate": 3.8485429904208994e-05, + "loss": 0.9264, + "step": 6800 + }, + { + "epoch": 0.33920199501246884, + "grad_norm": 0.7059872260731004, + "learning_rate": 3.848202919552473e-05, + "loss": 0.9484, + "step": 6801 + }, + { + "epoch": 0.3392518703241895, + "grad_norm": 0.5453629969289489, + "learning_rate": 3.8478628135032953e-05, + "loss": 0.902, + "step": 6802 + }, + { + "epoch": 0.3393017456359102, + "grad_norm": 0.5414624551956237, + "learning_rate": 3.847522672282241e-05, + "loss": 0.9309, + "step": 6803 + }, + { + "epoch": 0.33935162094763094, + "grad_norm": 2.3293324050972153, + "learning_rate": 3.847182495898185e-05, + "loss": 1.01, + "step": 6804 + }, + { + "epoch": 0.3394014962593516, + "grad_norm": 0.5313579167368769, + "learning_rate": 3.846842284360005e-05, + "loss": 0.9471, + "step": 6805 + }, + { + "epoch": 0.3394513715710723, + "grad_norm": 0.5634552347875872, + "learning_rate": 3.8465020376765784e-05, + "loss": 0.8899, + "step": 6806 + }, + { + "epoch": 0.33950124688279304, + "grad_norm": 0.5190310812939593, + "learning_rate": 3.8461617558567845e-05, + "loss": 0.8954, + "step": 6807 + }, + { + "epoch": 0.3395511221945137, + "grad_norm": 0.665843041602518, + "learning_rate": 3.8458214389095e-05, + "loss": 0.9317, + "step": 6808 + }, + { + "epoch": 0.3396009975062344, + "grad_norm": 0.54877126652685, + "learning_rate": 3.845481086843609e-05, + "loss": 0.9029, + "step": 6809 + }, + { + "epoch": 0.3396508728179551, + "grad_norm": 0.49450829827519993, + "learning_rate": 3.8451406996679904e-05, + "loss": 0.8884, + "step": 6810 + }, + { + "epoch": 0.3397007481296758, + "grad_norm": 0.6931580681729008, + "learning_rate": 3.844800277391528e-05, + "loss": 0.9088, + "step": 6811 + }, + { + "epoch": 0.3397506234413965, + "grad_norm": 0.5599046111003045, + "learning_rate": 3.844459820023104e-05, + "loss": 0.8694, + "step": 6812 + }, + { + "epoch": 0.3398004987531172, + "grad_norm": 0.5817798864927352, + "learning_rate": 3.844119327571602e-05, + "loss": 0.8653, + "step": 6813 + }, + { + "epoch": 0.3398503740648379, + "grad_norm": 0.5874281985233097, + "learning_rate": 3.8437788000459085e-05, + "loss": 0.9337, + "step": 6814 + }, + { + "epoch": 0.3399002493765586, + "grad_norm": 0.48597045085765095, + "learning_rate": 3.843438237454909e-05, + "loss": 0.8756, + "step": 6815 + }, + { + "epoch": 0.3399501246882793, + "grad_norm": 0.604854116554776, + "learning_rate": 3.843097639807489e-05, + "loss": 0.9093, + "step": 6816 + }, + { + "epoch": 0.34, + "grad_norm": 0.6445216518680504, + "learning_rate": 3.842757007112537e-05, + "loss": 0.9153, + "step": 6817 + }, + { + "epoch": 0.3400498753117207, + "grad_norm": 0.5135775748050915, + "learning_rate": 3.8424163393789435e-05, + "loss": 0.8758, + "step": 6818 + }, + { + "epoch": 0.3400997506234414, + "grad_norm": 0.9511368068225206, + "learning_rate": 3.8420756366155954e-05, + "loss": 0.9395, + "step": 6819 + }, + { + "epoch": 0.34014962593516207, + "grad_norm": 0.6076250291440647, + "learning_rate": 3.841734898831384e-05, + "loss": 0.9222, + "step": 6820 + }, + { + "epoch": 0.3401995012468828, + "grad_norm": 0.5086252668904614, + "learning_rate": 3.8413941260352e-05, + "loss": 0.9309, + "step": 6821 + }, + { + "epoch": 0.3402493765586035, + "grad_norm": 0.6278432090505035, + "learning_rate": 3.841053318235938e-05, + "loss": 0.9466, + "step": 6822 + }, + { + "epoch": 0.34029925187032417, + "grad_norm": 0.6422481877439571, + "learning_rate": 3.8407124754424896e-05, + "loss": 0.8986, + "step": 6823 + }, + { + "epoch": 0.3403491271820449, + "grad_norm": 1.0822189784923664, + "learning_rate": 3.8403715976637486e-05, + "loss": 0.9288, + "step": 6824 + }, + { + "epoch": 0.3403990024937656, + "grad_norm": 0.6921531790907879, + "learning_rate": 3.840030684908611e-05, + "loss": 0.8542, + "step": 6825 + }, + { + "epoch": 0.34044887780548627, + "grad_norm": 0.7366586623286268, + "learning_rate": 3.839689737185973e-05, + "loss": 0.9185, + "step": 6826 + }, + { + "epoch": 0.340498753117207, + "grad_norm": 0.7620166159175668, + "learning_rate": 3.83934875450473e-05, + "loss": 0.9485, + "step": 6827 + }, + { + "epoch": 0.3405486284289277, + "grad_norm": 0.4866370103427365, + "learning_rate": 3.839007736873781e-05, + "loss": 0.8809, + "step": 6828 + }, + { + "epoch": 0.3405985037406484, + "grad_norm": 0.5149644179303393, + "learning_rate": 3.8386666843020244e-05, + "loss": 0.9274, + "step": 6829 + }, + { + "epoch": 0.34064837905236905, + "grad_norm": 0.5559587220876839, + "learning_rate": 3.83832559679836e-05, + "loss": 0.9162, + "step": 6830 + }, + { + "epoch": 0.3406982543640898, + "grad_norm": 0.5550222640042647, + "learning_rate": 3.837984474371688e-05, + "loss": 0.9297, + "step": 6831 + }, + { + "epoch": 0.3407481296758105, + "grad_norm": 0.8012747028406919, + "learning_rate": 3.837643317030909e-05, + "loss": 0.963, + "step": 6832 + }, + { + "epoch": 0.34079800498753116, + "grad_norm": 0.6762007977317328, + "learning_rate": 3.837302124784927e-05, + "loss": 0.8976, + "step": 6833 + }, + { + "epoch": 0.3408478802992519, + "grad_norm": 0.8447297220275789, + "learning_rate": 3.836960897642644e-05, + "loss": 0.9016, + "step": 6834 + }, + { + "epoch": 0.3408977556109726, + "grad_norm": 0.7114552953615322, + "learning_rate": 3.836619635612966e-05, + "loss": 0.9131, + "step": 6835 + }, + { + "epoch": 0.34094763092269326, + "grad_norm": 0.8261556649446231, + "learning_rate": 3.836278338704796e-05, + "loss": 0.9207, + "step": 6836 + }, + { + "epoch": 0.34099750623441394, + "grad_norm": 0.6021514053555003, + "learning_rate": 3.8359370069270404e-05, + "loss": 0.9268, + "step": 6837 + }, + { + "epoch": 0.3410473815461347, + "grad_norm": 0.5658248097361748, + "learning_rate": 3.8355956402886074e-05, + "loss": 0.8742, + "step": 6838 + }, + { + "epoch": 0.34109725685785536, + "grad_norm": 0.6272168772026843, + "learning_rate": 3.8352542387984035e-05, + "loss": 0.8975, + "step": 6839 + }, + { + "epoch": 0.34114713216957604, + "grad_norm": 0.7539032002162109, + "learning_rate": 3.834912802465338e-05, + "loss": 0.9194, + "step": 6840 + }, + { + "epoch": 0.3411970074812968, + "grad_norm": 0.762683443525638, + "learning_rate": 3.83457133129832e-05, + "loss": 0.8819, + "step": 6841 + }, + { + "epoch": 0.34124688279301746, + "grad_norm": 0.7937561436869446, + "learning_rate": 3.834229825306261e-05, + "loss": 0.9172, + "step": 6842 + }, + { + "epoch": 0.34129675810473814, + "grad_norm": 0.6263853910857251, + "learning_rate": 3.833888284498071e-05, + "loss": 0.889, + "step": 6843 + }, + { + "epoch": 0.3413466334164589, + "grad_norm": 0.542433378717047, + "learning_rate": 3.833546708882664e-05, + "loss": 0.8566, + "step": 6844 + }, + { + "epoch": 0.34139650872817956, + "grad_norm": 0.6804193935781201, + "learning_rate": 3.833205098468952e-05, + "loss": 0.8851, + "step": 6845 + }, + { + "epoch": 0.34144638403990024, + "grad_norm": 1.0435403267854948, + "learning_rate": 3.8328634532658496e-05, + "loss": 0.9063, + "step": 6846 + }, + { + "epoch": 0.3414962593516209, + "grad_norm": 1.4793077931485425, + "learning_rate": 3.8325217732822714e-05, + "loss": 0.9262, + "step": 6847 + }, + { + "epoch": 0.34154613466334166, + "grad_norm": 0.6344369813619128, + "learning_rate": 3.832180058527135e-05, + "loss": 0.8455, + "step": 6848 + }, + { + "epoch": 0.34159600997506234, + "grad_norm": 0.5595623603339418, + "learning_rate": 3.831838309009356e-05, + "loss": 0.9287, + "step": 6849 + }, + { + "epoch": 0.341645885286783, + "grad_norm": 0.5874989052365048, + "learning_rate": 3.831496524737851e-05, + "loss": 0.9547, + "step": 6850 + }, + { + "epoch": 0.34169576059850376, + "grad_norm": 0.63514800462256, + "learning_rate": 3.831154705721541e-05, + "loss": 0.9502, + "step": 6851 + }, + { + "epoch": 0.34174563591022444, + "grad_norm": 0.9081292464648768, + "learning_rate": 3.8308128519693456e-05, + "loss": 0.897, + "step": 6852 + }, + { + "epoch": 0.3417955112219451, + "grad_norm": 0.5828434050878573, + "learning_rate": 3.830470963490184e-05, + "loss": 0.8795, + "step": 6853 + }, + { + "epoch": 0.34184538653366586, + "grad_norm": 0.5767132201867279, + "learning_rate": 3.830129040292977e-05, + "loss": 0.9606, + "step": 6854 + }, + { + "epoch": 0.34189526184538654, + "grad_norm": 0.4585616386620965, + "learning_rate": 3.829787082386649e-05, + "loss": 0.8431, + "step": 6855 + }, + { + "epoch": 0.3419451371571072, + "grad_norm": 0.46871040204665004, + "learning_rate": 3.8294450897801216e-05, + "loss": 0.8854, + "step": 6856 + }, + { + "epoch": 0.3419950124688279, + "grad_norm": 0.760164981759605, + "learning_rate": 3.8291030624823197e-05, + "loss": 0.9036, + "step": 6857 + }, + { + "epoch": 0.34204488778054865, + "grad_norm": 0.5599709186165215, + "learning_rate": 3.828761000502168e-05, + "loss": 0.8878, + "step": 6858 + }, + { + "epoch": 0.3420947630922693, + "grad_norm": 1.0409859648028856, + "learning_rate": 3.8284189038485936e-05, + "loss": 0.9206, + "step": 6859 + }, + { + "epoch": 0.34214463840399, + "grad_norm": 0.6741282357150638, + "learning_rate": 3.828076772530521e-05, + "loss": 0.9137, + "step": 6860 + }, + { + "epoch": 0.34219451371571075, + "grad_norm": 0.47804361937527495, + "learning_rate": 3.8277346065568806e-05, + "loss": 0.9394, + "step": 6861 + }, + { + "epoch": 0.34224438902743143, + "grad_norm": 0.9467456429153962, + "learning_rate": 3.8273924059366e-05, + "loss": 0.9315, + "step": 6862 + }, + { + "epoch": 0.3422942643391521, + "grad_norm": 0.4774784657862278, + "learning_rate": 3.827050170678608e-05, + "loss": 0.9017, + "step": 6863 + }, + { + "epoch": 0.3423441396508728, + "grad_norm": 0.7410208200310489, + "learning_rate": 3.8267079007918356e-05, + "loss": 0.8949, + "step": 6864 + }, + { + "epoch": 0.34239401496259353, + "grad_norm": 0.5634916040295229, + "learning_rate": 3.826365596285215e-05, + "loss": 0.9279, + "step": 6865 + }, + { + "epoch": 0.3424438902743142, + "grad_norm": 0.5015505540716612, + "learning_rate": 3.826023257167677e-05, + "loss": 0.9313, + "step": 6866 + }, + { + "epoch": 0.3424937655860349, + "grad_norm": 0.716385374673028, + "learning_rate": 3.825680883448156e-05, + "loss": 0.8809, + "step": 6867 + }, + { + "epoch": 0.34254364089775563, + "grad_norm": 0.7156596582224553, + "learning_rate": 3.8253384751355844e-05, + "loss": 0.9439, + "step": 6868 + }, + { + "epoch": 0.3425935162094763, + "grad_norm": 0.7657601585454894, + "learning_rate": 3.8249960322389e-05, + "loss": 0.9241, + "step": 6869 + }, + { + "epoch": 0.342643391521197, + "grad_norm": 2.3359035087129407, + "learning_rate": 3.8246535547670365e-05, + "loss": 0.9081, + "step": 6870 + }, + { + "epoch": 0.34269326683291773, + "grad_norm": 0.7178135570682125, + "learning_rate": 3.8243110427289315e-05, + "loss": 0.9069, + "step": 6871 + }, + { + "epoch": 0.3427431421446384, + "grad_norm": 0.6734609577240988, + "learning_rate": 3.823968496133523e-05, + "loss": 0.9, + "step": 6872 + }, + { + "epoch": 0.3427930174563591, + "grad_norm": 0.48914836955944263, + "learning_rate": 3.823625914989748e-05, + "loss": 0.9212, + "step": 6873 + }, + { + "epoch": 0.3428428927680798, + "grad_norm": 0.5795272827951986, + "learning_rate": 3.8232832993065485e-05, + "loss": 0.9399, + "step": 6874 + }, + { + "epoch": 0.3428927680798005, + "grad_norm": 0.5418848232896417, + "learning_rate": 3.822940649092862e-05, + "loss": 0.9419, + "step": 6875 + }, + { + "epoch": 0.3429426433915212, + "grad_norm": 0.8815495161129374, + "learning_rate": 3.822597964357632e-05, + "loss": 0.9106, + "step": 6876 + }, + { + "epoch": 0.3429925187032419, + "grad_norm": 0.4950612337995704, + "learning_rate": 3.8222552451098005e-05, + "loss": 0.9181, + "step": 6877 + }, + { + "epoch": 0.3430423940149626, + "grad_norm": 0.8005343530835504, + "learning_rate": 3.821912491358309e-05, + "loss": 0.9301, + "step": 6878 + }, + { + "epoch": 0.3430922693266833, + "grad_norm": 0.7603985467166282, + "learning_rate": 3.821569703112105e-05, + "loss": 0.9075, + "step": 6879 + }, + { + "epoch": 0.343142144638404, + "grad_norm": 0.4787395675518577, + "learning_rate": 3.8212268803801286e-05, + "loss": 0.9036, + "step": 6880 + }, + { + "epoch": 0.3431920199501247, + "grad_norm": 1.0131529393311138, + "learning_rate": 3.820884023171329e-05, + "loss": 0.9359, + "step": 6881 + }, + { + "epoch": 0.3432418952618454, + "grad_norm": 0.584601254413632, + "learning_rate": 3.8205411314946516e-05, + "loss": 0.9566, + "step": 6882 + }, + { + "epoch": 0.3432917705735661, + "grad_norm": 0.6255017125235499, + "learning_rate": 3.820198205359045e-05, + "loss": 0.9206, + "step": 6883 + }, + { + "epoch": 0.34334164588528676, + "grad_norm": 1.0270686993378653, + "learning_rate": 3.819855244773457e-05, + "loss": 0.9361, + "step": 6884 + }, + { + "epoch": 0.3433915211970075, + "grad_norm": 0.5698536850557548, + "learning_rate": 3.819512249746836e-05, + "loss": 0.9382, + "step": 6885 + }, + { + "epoch": 0.3434413965087282, + "grad_norm": 1.4333693981903075, + "learning_rate": 3.8191692202881345e-05, + "loss": 0.8772, + "step": 6886 + }, + { + "epoch": 0.34349127182044886, + "grad_norm": 0.5378955555546882, + "learning_rate": 3.818826156406302e-05, + "loss": 0.8489, + "step": 6887 + }, + { + "epoch": 0.3435411471321696, + "grad_norm": 0.6774417662321598, + "learning_rate": 3.818483058110292e-05, + "loss": 0.8969, + "step": 6888 + }, + { + "epoch": 0.3435910224438903, + "grad_norm": 0.6430526039614007, + "learning_rate": 3.818139925409056e-05, + "loss": 0.9416, + "step": 6889 + }, + { + "epoch": 0.34364089775561096, + "grad_norm": 0.5948254915186455, + "learning_rate": 3.8177967583115495e-05, + "loss": 0.917, + "step": 6890 + }, + { + "epoch": 0.34369077306733165, + "grad_norm": 0.4168185548337818, + "learning_rate": 3.817453556826724e-05, + "loss": 0.9185, + "step": 6891 + }, + { + "epoch": 0.3437406483790524, + "grad_norm": 0.5126668881018205, + "learning_rate": 3.81711032096354e-05, + "loss": 0.8969, + "step": 6892 + }, + { + "epoch": 0.34379052369077306, + "grad_norm": 0.5045187863820649, + "learning_rate": 3.816767050730951e-05, + "loss": 0.8683, + "step": 6893 + }, + { + "epoch": 0.34384039900249375, + "grad_norm": 0.5247898999724564, + "learning_rate": 3.816423746137915e-05, + "loss": 0.9206, + "step": 6894 + }, + { + "epoch": 0.3438902743142145, + "grad_norm": 0.569540966036075, + "learning_rate": 3.81608040719339e-05, + "loss": 0.9405, + "step": 6895 + }, + { + "epoch": 0.34394014962593517, + "grad_norm": 0.7234802743550319, + "learning_rate": 3.815737033906336e-05, + "loss": 0.9389, + "step": 6896 + }, + { + "epoch": 0.34399002493765585, + "grad_norm": 0.8307250119080731, + "learning_rate": 3.8153936262857125e-05, + "loss": 0.9495, + "step": 6897 + }, + { + "epoch": 0.3440399002493766, + "grad_norm": 1.4703190406438504, + "learning_rate": 3.815050184340482e-05, + "loss": 0.8872, + "step": 6898 + }, + { + "epoch": 0.34408977556109727, + "grad_norm": 0.46083667478395557, + "learning_rate": 3.814706708079605e-05, + "loss": 0.9248, + "step": 6899 + }, + { + "epoch": 0.34413965087281795, + "grad_norm": 1.3903559821228924, + "learning_rate": 3.814363197512045e-05, + "loss": 0.8862, + "step": 6900 + }, + { + "epoch": 0.34418952618453863, + "grad_norm": 0.5560958050174193, + "learning_rate": 3.814019652646766e-05, + "loss": 0.8879, + "step": 6901 + }, + { + "epoch": 0.34423940149625937, + "grad_norm": 0.6935464171830107, + "learning_rate": 3.813676073492731e-05, + "loss": 0.9223, + "step": 6902 + }, + { + "epoch": 0.34428927680798005, + "grad_norm": 0.6653997805542222, + "learning_rate": 3.813332460058908e-05, + "loss": 0.8987, + "step": 6903 + }, + { + "epoch": 0.34433915211970073, + "grad_norm": 1.4913907698027964, + "learning_rate": 3.812988812354262e-05, + "loss": 0.9668, + "step": 6904 + }, + { + "epoch": 0.34438902743142147, + "grad_norm": 0.7443800456574237, + "learning_rate": 3.81264513038776e-05, + "loss": 0.8947, + "step": 6905 + }, + { + "epoch": 0.34443890274314215, + "grad_norm": 0.7610076231670718, + "learning_rate": 3.812301414168371e-05, + "loss": 0.9036, + "step": 6906 + }, + { + "epoch": 0.34448877805486283, + "grad_norm": 0.7042733243288781, + "learning_rate": 3.811957663705064e-05, + "loss": 0.8831, + "step": 6907 + }, + { + "epoch": 0.3445386533665835, + "grad_norm": 0.765372905189411, + "learning_rate": 3.811613879006809e-05, + "loss": 0.8633, + "step": 6908 + }, + { + "epoch": 0.34458852867830425, + "grad_norm": 0.4505354922496608, + "learning_rate": 3.811270060082577e-05, + "loss": 0.8765, + "step": 6909 + }, + { + "epoch": 0.34463840399002493, + "grad_norm": 0.642803112635053, + "learning_rate": 3.810926206941339e-05, + "loss": 0.8552, + "step": 6910 + }, + { + "epoch": 0.3446882793017456, + "grad_norm": 0.7760505561084058, + "learning_rate": 3.8105823195920686e-05, + "loss": 0.8939, + "step": 6911 + }, + { + "epoch": 0.34473815461346635, + "grad_norm": 0.5500363313598361, + "learning_rate": 3.8102383980437394e-05, + "loss": 0.8748, + "step": 6912 + }, + { + "epoch": 0.34478802992518703, + "grad_norm": 0.5826503986493701, + "learning_rate": 3.809894442305325e-05, + "loss": 0.9153, + "step": 6913 + }, + { + "epoch": 0.3448379052369077, + "grad_norm": 0.5285687946659644, + "learning_rate": 3.809550452385803e-05, + "loss": 0.8755, + "step": 6914 + }, + { + "epoch": 0.34488778054862845, + "grad_norm": 0.5644560892299226, + "learning_rate": 3.809206428294146e-05, + "loss": 0.8906, + "step": 6915 + }, + { + "epoch": 0.34493765586034913, + "grad_norm": 0.5235026425742767, + "learning_rate": 3.8088623700393346e-05, + "loss": 0.9214, + "step": 6916 + }, + { + "epoch": 0.3449875311720698, + "grad_norm": 0.9220896498700742, + "learning_rate": 3.808518277630344e-05, + "loss": 0.9093, + "step": 6917 + }, + { + "epoch": 0.3450374064837905, + "grad_norm": 0.5036134704667408, + "learning_rate": 3.808174151076156e-05, + "loss": 0.9059, + "step": 6918 + }, + { + "epoch": 0.34508728179551124, + "grad_norm": 0.6336972214923531, + "learning_rate": 3.8078299903857475e-05, + "loss": 0.9326, + "step": 6919 + }, + { + "epoch": 0.3451371571072319, + "grad_norm": 0.5105006088326535, + "learning_rate": 3.807485795568102e-05, + "loss": 0.9265, + "step": 6920 + }, + { + "epoch": 0.3451870324189526, + "grad_norm": 0.6286320462074902, + "learning_rate": 3.8071415666321995e-05, + "loss": 0.9108, + "step": 6921 + }, + { + "epoch": 0.34523690773067334, + "grad_norm": 0.8260560491859076, + "learning_rate": 3.806797303587023e-05, + "loss": 0.9222, + "step": 6922 + }, + { + "epoch": 0.345286783042394, + "grad_norm": 0.5055820944461923, + "learning_rate": 3.806453006441555e-05, + "loss": 0.9413, + "step": 6923 + }, + { + "epoch": 0.3453366583541147, + "grad_norm": 0.5537628719017551, + "learning_rate": 3.806108675204781e-05, + "loss": 0.9152, + "step": 6924 + }, + { + "epoch": 0.34538653366583544, + "grad_norm": 0.8162448719171255, + "learning_rate": 3.805764309885685e-05, + "loss": 0.8982, + "step": 6925 + }, + { + "epoch": 0.3454364089775561, + "grad_norm": 0.973285386198153, + "learning_rate": 3.8054199104932545e-05, + "loss": 0.8938, + "step": 6926 + }, + { + "epoch": 0.3454862842892768, + "grad_norm": 0.6541247071771554, + "learning_rate": 3.805075477036476e-05, + "loss": 0.8702, + "step": 6927 + }, + { + "epoch": 0.3455361596009975, + "grad_norm": 0.602379377096662, + "learning_rate": 3.804731009524336e-05, + "loss": 0.9175, + "step": 6928 + }, + { + "epoch": 0.3455860349127182, + "grad_norm": 1.3713349502694403, + "learning_rate": 3.8043865079658246e-05, + "loss": 0.8879, + "step": 6929 + }, + { + "epoch": 0.3456359102244389, + "grad_norm": 0.5122988950749225, + "learning_rate": 3.8040419723699313e-05, + "loss": 0.9402, + "step": 6930 + }, + { + "epoch": 0.3456857855361596, + "grad_norm": 0.5938127263378401, + "learning_rate": 3.803697402745646e-05, + "loss": 0.9289, + "step": 6931 + }, + { + "epoch": 0.3457356608478803, + "grad_norm": 0.5304132984956711, + "learning_rate": 3.8033527991019605e-05, + "loss": 0.94, + "step": 6932 + }, + { + "epoch": 0.345785536159601, + "grad_norm": 0.5763194488601205, + "learning_rate": 3.803008161447867e-05, + "loss": 0.8767, + "step": 6933 + }, + { + "epoch": 0.3458354114713217, + "grad_norm": 0.6921498567216653, + "learning_rate": 3.8026634897923594e-05, + "loss": 0.9304, + "step": 6934 + }, + { + "epoch": 0.34588528678304237, + "grad_norm": 1.0858525600594033, + "learning_rate": 3.802318784144431e-05, + "loss": 0.9357, + "step": 6935 + }, + { + "epoch": 0.3459351620947631, + "grad_norm": 0.6633109175051608, + "learning_rate": 3.8019740445130756e-05, + "loss": 0.9138, + "step": 6936 + }, + { + "epoch": 0.3459850374064838, + "grad_norm": 0.4035365330200821, + "learning_rate": 3.801629270907291e-05, + "loss": 0.889, + "step": 6937 + }, + { + "epoch": 0.34603491271820447, + "grad_norm": 0.8193241234157146, + "learning_rate": 3.801284463336073e-05, + "loss": 0.9955, + "step": 6938 + }, + { + "epoch": 0.3460847880299252, + "grad_norm": 1.4252434013990483, + "learning_rate": 3.8009396218084195e-05, + "loss": 0.9139, + "step": 6939 + }, + { + "epoch": 0.3461346633416459, + "grad_norm": 0.5044226058490032, + "learning_rate": 3.800594746333327e-05, + "loss": 0.8737, + "step": 6940 + }, + { + "epoch": 0.34618453865336657, + "grad_norm": 0.7590534094807591, + "learning_rate": 3.8002498369197994e-05, + "loss": 0.9199, + "step": 6941 + }, + { + "epoch": 0.3462344139650873, + "grad_norm": 0.5654894629639144, + "learning_rate": 3.799904893576832e-05, + "loss": 0.8903, + "step": 6942 + }, + { + "epoch": 0.346284289276808, + "grad_norm": 0.6382523320802591, + "learning_rate": 3.79955991631343e-05, + "loss": 0.9259, + "step": 6943 + }, + { + "epoch": 0.34633416458852867, + "grad_norm": 0.47287280416286065, + "learning_rate": 3.7992149051385924e-05, + "loss": 0.9238, + "step": 6944 + }, + { + "epoch": 0.34638403990024935, + "grad_norm": 0.8667319339760798, + "learning_rate": 3.798869860061324e-05, + "loss": 0.9164, + "step": 6945 + }, + { + "epoch": 0.3464339152119701, + "grad_norm": 0.5503833262474644, + "learning_rate": 3.7985247810906266e-05, + "loss": 0.9688, + "step": 6946 + }, + { + "epoch": 0.34648379052369077, + "grad_norm": 0.5302887623378969, + "learning_rate": 3.798179668235508e-05, + "loss": 0.872, + "step": 6947 + }, + { + "epoch": 0.34653366583541145, + "grad_norm": 1.2667021928266766, + "learning_rate": 3.7978345215049704e-05, + "loss": 0.8997, + "step": 6948 + }, + { + "epoch": 0.3465835411471322, + "grad_norm": 0.5558564046056196, + "learning_rate": 3.797489340908022e-05, + "loss": 0.9102, + "step": 6949 + }, + { + "epoch": 0.34663341645885287, + "grad_norm": 1.3918066117846997, + "learning_rate": 3.797144126453671e-05, + "loss": 0.8826, + "step": 6950 + }, + { + "epoch": 0.34668329177057355, + "grad_norm": 0.5000777464659317, + "learning_rate": 3.7967988781509237e-05, + "loss": 0.9023, + "step": 6951 + }, + { + "epoch": 0.3467331670822943, + "grad_norm": 0.5395226526884451, + "learning_rate": 3.7964535960087906e-05, + "loss": 0.8689, + "step": 6952 + }, + { + "epoch": 0.346783042394015, + "grad_norm": 0.4660757937937083, + "learning_rate": 3.796108280036281e-05, + "loss": 0.952, + "step": 6953 + }, + { + "epoch": 0.34683291770573565, + "grad_norm": 0.6440438386235058, + "learning_rate": 3.795762930242405e-05, + "loss": 0.9329, + "step": 6954 + }, + { + "epoch": 0.34688279301745634, + "grad_norm": 0.7177852587159058, + "learning_rate": 3.7954175466361767e-05, + "loss": 0.9087, + "step": 6955 + }, + { + "epoch": 0.3469326683291771, + "grad_norm": 0.7315901823023733, + "learning_rate": 3.7950721292266055e-05, + "loss": 0.932, + "step": 6956 + }, + { + "epoch": 0.34698254364089776, + "grad_norm": 0.5118316610038492, + "learning_rate": 3.794726678022709e-05, + "loss": 0.9362, + "step": 6957 + }, + { + "epoch": 0.34703241895261844, + "grad_norm": 0.6622671618729915, + "learning_rate": 3.794381193033497e-05, + "loss": 0.9091, + "step": 6958 + }, + { + "epoch": 0.3470822942643392, + "grad_norm": 0.6099085272304747, + "learning_rate": 3.7940356742679884e-05, + "loss": 0.9116, + "step": 6959 + }, + { + "epoch": 0.34713216957605986, + "grad_norm": 0.6304434719364106, + "learning_rate": 3.793690121735198e-05, + "loss": 0.9386, + "step": 6960 + }, + { + "epoch": 0.34718204488778054, + "grad_norm": 0.8183968669793135, + "learning_rate": 3.793344535444142e-05, + "loss": 0.8658, + "step": 6961 + }, + { + "epoch": 0.3472319201995012, + "grad_norm": 0.595607823445528, + "learning_rate": 3.7929989154038404e-05, + "loss": 0.9145, + "step": 6962 + }, + { + "epoch": 0.34728179551122196, + "grad_norm": 0.6572395750464889, + "learning_rate": 3.7926532616233096e-05, + "loss": 0.9023, + "step": 6963 + }, + { + "epoch": 0.34733167082294264, + "grad_norm": 0.5987298415345116, + "learning_rate": 3.792307574111571e-05, + "loss": 0.9312, + "step": 6964 + }, + { + "epoch": 0.3473815461346633, + "grad_norm": 1.1388073201013682, + "learning_rate": 3.791961852877645e-05, + "loss": 0.9013, + "step": 6965 + }, + { + "epoch": 0.34743142144638406, + "grad_norm": 0.5717786758822703, + "learning_rate": 3.7916160979305516e-05, + "loss": 0.9876, + "step": 6966 + }, + { + "epoch": 0.34748129675810474, + "grad_norm": 0.5653161619115357, + "learning_rate": 3.7912703092793146e-05, + "loss": 0.891, + "step": 6967 + }, + { + "epoch": 0.3475311720698254, + "grad_norm": 0.4950924660295998, + "learning_rate": 3.790924486932957e-05, + "loss": 0.8815, + "step": 6968 + }, + { + "epoch": 0.34758104738154616, + "grad_norm": 1.9127296699709284, + "learning_rate": 3.7905786309005026e-05, + "loss": 0.9079, + "step": 6969 + }, + { + "epoch": 0.34763092269326684, + "grad_norm": 0.8666314630837259, + "learning_rate": 3.790232741190977e-05, + "loss": 0.9143, + "step": 6970 + }, + { + "epoch": 0.3476807980049875, + "grad_norm": 0.9329571184370278, + "learning_rate": 3.789886817813404e-05, + "loss": 0.9422, + "step": 6971 + }, + { + "epoch": 0.3477306733167082, + "grad_norm": 0.6920345758064436, + "learning_rate": 3.789540860776813e-05, + "loss": 0.9119, + "step": 6972 + }, + { + "epoch": 0.34778054862842894, + "grad_norm": 0.5793846124261051, + "learning_rate": 3.78919487009023e-05, + "loss": 0.9395, + "step": 6973 + }, + { + "epoch": 0.3478304239401496, + "grad_norm": 0.5547398933494792, + "learning_rate": 3.788848845762684e-05, + "loss": 0.9048, + "step": 6974 + }, + { + "epoch": 0.3478802992518703, + "grad_norm": 2.546140909809689, + "learning_rate": 3.7885027878032044e-05, + "loss": 0.8848, + "step": 6975 + }, + { + "epoch": 0.34793017456359104, + "grad_norm": 0.5760724151069373, + "learning_rate": 3.7881566962208205e-05, + "loss": 0.9309, + "step": 6976 + }, + { + "epoch": 0.3479800498753117, + "grad_norm": 0.7286368041574294, + "learning_rate": 3.7878105710245643e-05, + "loss": 0.9225, + "step": 6977 + }, + { + "epoch": 0.3480299251870324, + "grad_norm": 0.49971794255678537, + "learning_rate": 3.787464412223467e-05, + "loss": 0.8858, + "step": 6978 + }, + { + "epoch": 0.34807980049875314, + "grad_norm": 0.6875301491501211, + "learning_rate": 3.7871182198265634e-05, + "loss": 0.8851, + "step": 6979 + }, + { + "epoch": 0.3481296758104738, + "grad_norm": 0.5264420306773959, + "learning_rate": 3.786771993842885e-05, + "loss": 0.9163, + "step": 6980 + }, + { + "epoch": 0.3481795511221945, + "grad_norm": 0.585497598273754, + "learning_rate": 3.7864257342814676e-05, + "loss": 0.8799, + "step": 6981 + }, + { + "epoch": 0.3482294264339152, + "grad_norm": 0.4530190404286834, + "learning_rate": 3.7860794411513467e-05, + "loss": 0.9032, + "step": 6982 + }, + { + "epoch": 0.3482793017456359, + "grad_norm": 0.7391866025044107, + "learning_rate": 3.7857331144615574e-05, + "loss": 0.8857, + "step": 6983 + }, + { + "epoch": 0.3483291770573566, + "grad_norm": 0.5822802539595103, + "learning_rate": 3.785386754221139e-05, + "loss": 0.8945, + "step": 6984 + }, + { + "epoch": 0.3483790523690773, + "grad_norm": 0.8394962913218689, + "learning_rate": 3.785040360439128e-05, + "loss": 0.9019, + "step": 6985 + }, + { + "epoch": 0.34842892768079803, + "grad_norm": 0.513920092416475, + "learning_rate": 3.7846939331245636e-05, + "loss": 0.9014, + "step": 6986 + }, + { + "epoch": 0.3484788029925187, + "grad_norm": 0.5350171854571829, + "learning_rate": 3.784347472286486e-05, + "loss": 0.9143, + "step": 6987 + }, + { + "epoch": 0.3485286783042394, + "grad_norm": 0.7224959488758859, + "learning_rate": 3.784000977933937e-05, + "loss": 0.8968, + "step": 6988 + }, + { + "epoch": 0.3485785536159601, + "grad_norm": 0.5795859224298169, + "learning_rate": 3.783654450075956e-05, + "loss": 0.939, + "step": 6989 + }, + { + "epoch": 0.3486284289276808, + "grad_norm": 0.58834849311744, + "learning_rate": 3.783307888721587e-05, + "loss": 0.9031, + "step": 6990 + }, + { + "epoch": 0.3486783042394015, + "grad_norm": 0.6924043337680619, + "learning_rate": 3.782961293879873e-05, + "loss": 0.959, + "step": 6991 + }, + { + "epoch": 0.3487281795511222, + "grad_norm": 0.5125164153880188, + "learning_rate": 3.782614665559858e-05, + "loss": 0.9085, + "step": 6992 + }, + { + "epoch": 0.3487780548628429, + "grad_norm": 0.4722902862882919, + "learning_rate": 3.782268003770587e-05, + "loss": 0.8996, + "step": 6993 + }, + { + "epoch": 0.3488279301745636, + "grad_norm": 1.049848568176727, + "learning_rate": 3.781921308521107e-05, + "loss": 0.9211, + "step": 6994 + }, + { + "epoch": 0.3488778054862843, + "grad_norm": 0.5179190866306317, + "learning_rate": 3.781574579820464e-05, + "loss": 0.8712, + "step": 6995 + }, + { + "epoch": 0.348927680798005, + "grad_norm": 0.7845331861746581, + "learning_rate": 3.781227817677706e-05, + "loss": 0.8968, + "step": 6996 + }, + { + "epoch": 0.3489775561097257, + "grad_norm": 0.4984375225239195, + "learning_rate": 3.780881022101882e-05, + "loss": 0.9458, + "step": 6997 + }, + { + "epoch": 0.3490274314214464, + "grad_norm": 0.7479268458609277, + "learning_rate": 3.78053419310204e-05, + "loss": 0.9239, + "step": 6998 + }, + { + "epoch": 0.34907730673316706, + "grad_norm": 0.4769531069693376, + "learning_rate": 3.780187330687231e-05, + "loss": 0.8868, + "step": 6999 + }, + { + "epoch": 0.3491271820448878, + "grad_norm": 0.5056407206725964, + "learning_rate": 3.779840434866507e-05, + "loss": 0.8813, + "step": 7000 + }, + { + "epoch": 0.3491770573566085, + "grad_norm": 0.5145329302442688, + "learning_rate": 3.7794935056489205e-05, + "loss": 0.8855, + "step": 7001 + }, + { + "epoch": 0.34922693266832916, + "grad_norm": 0.6807386264847628, + "learning_rate": 3.7791465430435225e-05, + "loss": 0.8594, + "step": 7002 + }, + { + "epoch": 0.3492768079800499, + "grad_norm": 0.4657288290130868, + "learning_rate": 3.778799547059369e-05, + "loss": 0.8883, + "step": 7003 + }, + { + "epoch": 0.3493266832917706, + "grad_norm": 0.6107468849883976, + "learning_rate": 3.778452517705513e-05, + "loss": 0.8914, + "step": 7004 + }, + { + "epoch": 0.34937655860349126, + "grad_norm": 0.753598567316706, + "learning_rate": 3.7781054549910104e-05, + "loss": 0.9243, + "step": 7005 + }, + { + "epoch": 0.34942643391521194, + "grad_norm": 1.8357418249329718, + "learning_rate": 3.7777583589249186e-05, + "loss": 0.8709, + "step": 7006 + }, + { + "epoch": 0.3494763092269327, + "grad_norm": 0.5204452667002258, + "learning_rate": 3.7774112295162936e-05, + "loss": 0.9458, + "step": 7007 + }, + { + "epoch": 0.34952618453865336, + "grad_norm": 0.39698660553281245, + "learning_rate": 3.7770640667741954e-05, + "loss": 0.8808, + "step": 7008 + }, + { + "epoch": 0.34957605985037404, + "grad_norm": 0.485882952451727, + "learning_rate": 3.7767168707076805e-05, + "loss": 0.8919, + "step": 7009 + }, + { + "epoch": 0.3496259351620948, + "grad_norm": 0.7703753742554931, + "learning_rate": 3.7763696413258106e-05, + "loss": 0.9128, + "step": 7010 + }, + { + "epoch": 0.34967581047381546, + "grad_norm": 0.5118063236379687, + "learning_rate": 3.7760223786376465e-05, + "loss": 0.9175, + "step": 7011 + }, + { + "epoch": 0.34972568578553614, + "grad_norm": 0.5083126531248822, + "learning_rate": 3.77567508265225e-05, + "loss": 0.9075, + "step": 7012 + }, + { + "epoch": 0.3497755610972569, + "grad_norm": 0.5107518259768646, + "learning_rate": 3.775327753378682e-05, + "loss": 0.9416, + "step": 7013 + }, + { + "epoch": 0.34982543640897756, + "grad_norm": 0.800930362483166, + "learning_rate": 3.774980390826007e-05, + "loss": 0.9186, + "step": 7014 + }, + { + "epoch": 0.34987531172069825, + "grad_norm": 0.527113409329017, + "learning_rate": 3.77463299500329e-05, + "loss": 0.9575, + "step": 7015 + }, + { + "epoch": 0.3499251870324189, + "grad_norm": 1.0431942828749436, + "learning_rate": 3.7742855659195955e-05, + "loss": 0.9487, + "step": 7016 + }, + { + "epoch": 0.34997506234413966, + "grad_norm": 0.4724557543167335, + "learning_rate": 3.7739381035839896e-05, + "loss": 0.9197, + "step": 7017 + }, + { + "epoch": 0.35002493765586035, + "grad_norm": 1.6445951988308445, + "learning_rate": 3.773590608005538e-05, + "loss": 0.9414, + "step": 7018 + }, + { + "epoch": 0.35007481296758103, + "grad_norm": 0.4836192990128269, + "learning_rate": 3.77324307919331e-05, + "loss": 0.907, + "step": 7019 + }, + { + "epoch": 0.35012468827930177, + "grad_norm": 0.42034404852464574, + "learning_rate": 3.772895517156374e-05, + "loss": 0.922, + "step": 7020 + }, + { + "epoch": 0.35017456359102245, + "grad_norm": 0.6628438862031277, + "learning_rate": 3.7725479219037996e-05, + "loss": 0.939, + "step": 7021 + }, + { + "epoch": 0.35022443890274313, + "grad_norm": 0.4602142644639976, + "learning_rate": 3.772200293444655e-05, + "loss": 0.8897, + "step": 7022 + }, + { + "epoch": 0.35027431421446387, + "grad_norm": 0.5291270717026306, + "learning_rate": 3.7718526317880155e-05, + "loss": 0.8867, + "step": 7023 + }, + { + "epoch": 0.35032418952618455, + "grad_norm": 0.43910040067144396, + "learning_rate": 3.771504936942949e-05, + "loss": 0.8742, + "step": 7024 + }, + { + "epoch": 0.35037406483790523, + "grad_norm": 0.4851852280975375, + "learning_rate": 3.771157208918531e-05, + "loss": 0.9628, + "step": 7025 + }, + { + "epoch": 0.3504239401496259, + "grad_norm": 0.5562139116406161, + "learning_rate": 3.7708094477238335e-05, + "loss": 0.9008, + "step": 7026 + }, + { + "epoch": 0.35047381546134665, + "grad_norm": 0.49586800746625037, + "learning_rate": 3.770461653367934e-05, + "loss": 0.8923, + "step": 7027 + }, + { + "epoch": 0.35052369077306733, + "grad_norm": 0.5932792954718403, + "learning_rate": 3.7701138258599045e-05, + "loss": 0.9391, + "step": 7028 + }, + { + "epoch": 0.350573566084788, + "grad_norm": 0.4404262767491707, + "learning_rate": 3.769765965208824e-05, + "loss": 0.9281, + "step": 7029 + }, + { + "epoch": 0.35062344139650875, + "grad_norm": 0.45401109415876445, + "learning_rate": 3.7694180714237684e-05, + "loss": 0.9097, + "step": 7030 + }, + { + "epoch": 0.35067331670822943, + "grad_norm": 0.6041131454431126, + "learning_rate": 3.7690701445138166e-05, + "loss": 0.8743, + "step": 7031 + }, + { + "epoch": 0.3507231920199501, + "grad_norm": 0.444297176754411, + "learning_rate": 3.7687221844880485e-05, + "loss": 0.8917, + "step": 7032 + }, + { + "epoch": 0.3507730673316708, + "grad_norm": 0.47706763995653323, + "learning_rate": 3.768374191355541e-05, + "loss": 0.963, + "step": 7033 + }, + { + "epoch": 0.35082294264339153, + "grad_norm": 1.094674179483138, + "learning_rate": 3.7680261651253776e-05, + "loss": 0.9103, + "step": 7034 + }, + { + "epoch": 0.3508728179551122, + "grad_norm": 0.547464289557109, + "learning_rate": 3.767678105806638e-05, + "loss": 0.9397, + "step": 7035 + }, + { + "epoch": 0.3509226932668329, + "grad_norm": 0.47692565572041346, + "learning_rate": 3.7673300134084066e-05, + "loss": 0.8847, + "step": 7036 + }, + { + "epoch": 0.35097256857855363, + "grad_norm": 0.5014651939167782, + "learning_rate": 3.766981887939765e-05, + "loss": 0.9148, + "step": 7037 + }, + { + "epoch": 0.3510224438902743, + "grad_norm": 0.6227132145962916, + "learning_rate": 3.766633729409799e-05, + "loss": 0.9302, + "step": 7038 + }, + { + "epoch": 0.351072319201995, + "grad_norm": 0.4686434407148166, + "learning_rate": 3.766285537827592e-05, + "loss": 0.8984, + "step": 7039 + }, + { + "epoch": 0.35112219451371574, + "grad_norm": 0.4126211339988897, + "learning_rate": 3.76593731320223e-05, + "loss": 0.9146, + "step": 7040 + }, + { + "epoch": 0.3511720698254364, + "grad_norm": 0.4854701484277489, + "learning_rate": 3.765589055542801e-05, + "loss": 0.9583, + "step": 7041 + }, + { + "epoch": 0.3512219451371571, + "grad_norm": 0.4853496716709199, + "learning_rate": 3.765240764858392e-05, + "loss": 0.9016, + "step": 7042 + }, + { + "epoch": 0.3512718204488778, + "grad_norm": 0.47797399121280537, + "learning_rate": 3.764892441158092e-05, + "loss": 0.9114, + "step": 7043 + }, + { + "epoch": 0.3513216957605985, + "grad_norm": 1.771073409966815, + "learning_rate": 3.7645440844509893e-05, + "loss": 0.965, + "step": 7044 + }, + { + "epoch": 0.3513715710723192, + "grad_norm": 0.510041698018867, + "learning_rate": 3.764195694746175e-05, + "loss": 0.8891, + "step": 7045 + }, + { + "epoch": 0.3514214463840399, + "grad_norm": 0.4558113476237561, + "learning_rate": 3.7638472720527396e-05, + "loss": 0.8931, + "step": 7046 + }, + { + "epoch": 0.3514713216957606, + "grad_norm": 0.4525962464728247, + "learning_rate": 3.763498816379775e-05, + "loss": 0.9237, + "step": 7047 + }, + { + "epoch": 0.3515211970074813, + "grad_norm": 0.4137273945996539, + "learning_rate": 3.7631503277363746e-05, + "loss": 0.9125, + "step": 7048 + }, + { + "epoch": 0.351571072319202, + "grad_norm": 0.43850023085527134, + "learning_rate": 3.762801806131632e-05, + "loss": 0.9078, + "step": 7049 + }, + { + "epoch": 0.3516209476309227, + "grad_norm": 0.4805231167593407, + "learning_rate": 3.76245325157464e-05, + "loss": 0.8618, + "step": 7050 + }, + { + "epoch": 0.3516708229426434, + "grad_norm": 0.437662675680961, + "learning_rate": 3.7621046640744975e-05, + "loss": 0.9231, + "step": 7051 + }, + { + "epoch": 0.3517206982543641, + "grad_norm": 0.42561541983543816, + "learning_rate": 3.761756043640296e-05, + "loss": 0.9142, + "step": 7052 + }, + { + "epoch": 0.35177057356608477, + "grad_norm": 0.3673386400872669, + "learning_rate": 3.761407390281138e-05, + "loss": 0.8884, + "step": 7053 + }, + { + "epoch": 0.3518204488778055, + "grad_norm": 0.6411258256401671, + "learning_rate": 3.761058704006117e-05, + "loss": 0.907, + "step": 7054 + }, + { + "epoch": 0.3518703241895262, + "grad_norm": 0.5822356185600613, + "learning_rate": 3.7607099848243335e-05, + "loss": 0.9639, + "step": 7055 + }, + { + "epoch": 0.35192019950124687, + "grad_norm": 0.5152117212432321, + "learning_rate": 3.7603612327448876e-05, + "loss": 0.9754, + "step": 7056 + }, + { + "epoch": 0.3519700748129676, + "grad_norm": 0.493538901132602, + "learning_rate": 3.76001244777688e-05, + "loss": 0.9352, + "step": 7057 + }, + { + "epoch": 0.3520199501246883, + "grad_norm": 0.5750234225514399, + "learning_rate": 3.759663629929411e-05, + "loss": 0.9192, + "step": 7058 + }, + { + "epoch": 0.35206982543640897, + "grad_norm": 0.48769219610718095, + "learning_rate": 3.759314779211582e-05, + "loss": 0.8495, + "step": 7059 + }, + { + "epoch": 0.35211970074812965, + "grad_norm": 0.6549645227506591, + "learning_rate": 3.7589658956324994e-05, + "loss": 0.9052, + "step": 7060 + }, + { + "epoch": 0.3521695760598504, + "grad_norm": 0.5580975587791552, + "learning_rate": 3.7586169792012646e-05, + "loss": 0.8808, + "step": 7061 + }, + { + "epoch": 0.35221945137157107, + "grad_norm": 0.5092433779733817, + "learning_rate": 3.7582680299269826e-05, + "loss": 0.9608, + "step": 7062 + }, + { + "epoch": 0.35226932668329175, + "grad_norm": 0.4642545030522986, + "learning_rate": 3.75791904781876e-05, + "loss": 0.8994, + "step": 7063 + }, + { + "epoch": 0.3523192019950125, + "grad_norm": 0.6042209965184347, + "learning_rate": 3.7575700328857025e-05, + "loss": 0.8914, + "step": 7064 + }, + { + "epoch": 0.35236907730673317, + "grad_norm": 1.0107932375522093, + "learning_rate": 3.757220985136918e-05, + "loss": 0.8897, + "step": 7065 + }, + { + "epoch": 0.35241895261845385, + "grad_norm": 0.4985565549048883, + "learning_rate": 3.7568719045815135e-05, + "loss": 0.9271, + "step": 7066 + }, + { + "epoch": 0.3524688279301746, + "grad_norm": 0.7771884809471966, + "learning_rate": 3.7565227912286005e-05, + "loss": 0.9628, + "step": 7067 + }, + { + "epoch": 0.35251870324189527, + "grad_norm": 0.43724956476050303, + "learning_rate": 3.7561736450872875e-05, + "loss": 0.9084, + "step": 7068 + }, + { + "epoch": 0.35256857855361595, + "grad_norm": 0.4580517236111488, + "learning_rate": 3.755824466166685e-05, + "loss": 0.9469, + "step": 7069 + }, + { + "epoch": 0.35261845386533663, + "grad_norm": 0.6768506581327395, + "learning_rate": 3.755475254475905e-05, + "loss": 0.9303, + "step": 7070 + }, + { + "epoch": 0.35266832917705737, + "grad_norm": 0.5545314443231801, + "learning_rate": 3.75512601002406e-05, + "loss": 0.9079, + "step": 7071 + }, + { + "epoch": 0.35271820448877805, + "grad_norm": 0.3811702585761318, + "learning_rate": 3.7547767328202644e-05, + "loss": 0.8649, + "step": 7072 + }, + { + "epoch": 0.35276807980049874, + "grad_norm": 0.4941502653210704, + "learning_rate": 3.754427422873631e-05, + "loss": 0.9433, + "step": 7073 + }, + { + "epoch": 0.3528179551122195, + "grad_norm": 0.39788509800847355, + "learning_rate": 3.754078080193275e-05, + "loss": 0.8938, + "step": 7074 + }, + { + "epoch": 0.35286783042394015, + "grad_norm": 0.4032714491608019, + "learning_rate": 3.7537287047883127e-05, + "loss": 0.9252, + "step": 7075 + }, + { + "epoch": 0.35291770573566084, + "grad_norm": 0.45934131645199744, + "learning_rate": 3.7533792966678615e-05, + "loss": 0.8813, + "step": 7076 + }, + { + "epoch": 0.3529675810473816, + "grad_norm": 0.44730399018865413, + "learning_rate": 3.753029855841039e-05, + "loss": 0.9029, + "step": 7077 + }, + { + "epoch": 0.35301745635910226, + "grad_norm": 0.5479900271789236, + "learning_rate": 3.752680382316962e-05, + "loss": 0.9106, + "step": 7078 + }, + { + "epoch": 0.35306733167082294, + "grad_norm": 0.47710619144298255, + "learning_rate": 3.752330876104752e-05, + "loss": 0.9104, + "step": 7079 + }, + { + "epoch": 0.3531172069825436, + "grad_norm": 0.4460733214503945, + "learning_rate": 3.7519813372135274e-05, + "loss": 0.935, + "step": 7080 + }, + { + "epoch": 0.35316708229426436, + "grad_norm": 0.47595431044959935, + "learning_rate": 3.75163176565241e-05, + "loss": 0.8826, + "step": 7081 + }, + { + "epoch": 0.35321695760598504, + "grad_norm": 0.6547470716563684, + "learning_rate": 3.751282161430523e-05, + "loss": 0.9202, + "step": 7082 + }, + { + "epoch": 0.3532668329177057, + "grad_norm": 0.5527180630846256, + "learning_rate": 3.750932524556987e-05, + "loss": 0.9064, + "step": 7083 + }, + { + "epoch": 0.35331670822942646, + "grad_norm": 0.5781855249349029, + "learning_rate": 3.750582855040927e-05, + "loss": 0.8869, + "step": 7084 + }, + { + "epoch": 0.35336658354114714, + "grad_norm": 0.5397619790151188, + "learning_rate": 3.750233152891467e-05, + "loss": 0.9475, + "step": 7085 + }, + { + "epoch": 0.3534164588528678, + "grad_norm": 0.6208936927898999, + "learning_rate": 3.7498834181177326e-05, + "loss": 0.9291, + "step": 7086 + }, + { + "epoch": 0.3534663341645885, + "grad_norm": 0.46912905090508916, + "learning_rate": 3.74953365072885e-05, + "loss": 0.9071, + "step": 7087 + }, + { + "epoch": 0.35351620947630924, + "grad_norm": 0.4327706398972698, + "learning_rate": 3.7491838507339453e-05, + "loss": 0.9183, + "step": 7088 + }, + { + "epoch": 0.3535660847880299, + "grad_norm": 0.44680586783969817, + "learning_rate": 3.7488340181421475e-05, + "loss": 0.8782, + "step": 7089 + }, + { + "epoch": 0.3536159600997506, + "grad_norm": 0.45641496671226894, + "learning_rate": 3.7484841529625846e-05, + "loss": 0.9022, + "step": 7090 + }, + { + "epoch": 0.35366583541147134, + "grad_norm": 0.48615869649286964, + "learning_rate": 3.748134255204387e-05, + "loss": 0.8666, + "step": 7091 + }, + { + "epoch": 0.353715710723192, + "grad_norm": 0.42971212663330516, + "learning_rate": 3.747784324876684e-05, + "loss": 0.9171, + "step": 7092 + }, + { + "epoch": 0.3537655860349127, + "grad_norm": 0.5679419207984263, + "learning_rate": 3.747434361988608e-05, + "loss": 0.9045, + "step": 7093 + }, + { + "epoch": 0.35381546134663344, + "grad_norm": 0.44876369468358596, + "learning_rate": 3.74708436654929e-05, + "loss": 0.9428, + "step": 7094 + }, + { + "epoch": 0.3538653366583541, + "grad_norm": 0.5322481277403033, + "learning_rate": 3.746734338567864e-05, + "loss": 0.934, + "step": 7095 + }, + { + "epoch": 0.3539152119700748, + "grad_norm": 0.46575567181410915, + "learning_rate": 3.7463842780534634e-05, + "loss": 0.8885, + "step": 7096 + }, + { + "epoch": 0.3539650872817955, + "grad_norm": 0.4346571008817384, + "learning_rate": 3.7460341850152234e-05, + "loss": 0.9229, + "step": 7097 + }, + { + "epoch": 0.3540149625935162, + "grad_norm": 0.553322706547113, + "learning_rate": 3.745684059462278e-05, + "loss": 0.8495, + "step": 7098 + }, + { + "epoch": 0.3540648379052369, + "grad_norm": 0.41085923128653523, + "learning_rate": 3.745333901403765e-05, + "loss": 0.8636, + "step": 7099 + }, + { + "epoch": 0.3541147132169576, + "grad_norm": 0.6122706328164559, + "learning_rate": 3.7449837108488207e-05, + "loss": 0.9177, + "step": 7100 + }, + { + "epoch": 0.3541645885286783, + "grad_norm": 0.503469214170716, + "learning_rate": 3.744633487806585e-05, + "loss": 0.9439, + "step": 7101 + }, + { + "epoch": 0.354214463840399, + "grad_norm": 0.5571510369312743, + "learning_rate": 3.7442832322861943e-05, + "loss": 0.9842, + "step": 7102 + }, + { + "epoch": 0.3542643391521197, + "grad_norm": 0.4887860878585815, + "learning_rate": 3.74393294429679e-05, + "loss": 0.9191, + "step": 7103 + }, + { + "epoch": 0.3543142144638404, + "grad_norm": 0.44365283287932683, + "learning_rate": 3.743582623847512e-05, + "loss": 0.9227, + "step": 7104 + }, + { + "epoch": 0.3543640897755611, + "grad_norm": 0.42232124590624837, + "learning_rate": 3.743232270947502e-05, + "loss": 0.9206, + "step": 7105 + }, + { + "epoch": 0.3544139650872818, + "grad_norm": 0.605610007083121, + "learning_rate": 3.7428818856059024e-05, + "loss": 0.8677, + "step": 7106 + }, + { + "epoch": 0.35446384039900247, + "grad_norm": 0.5405162081965386, + "learning_rate": 3.7425314678318554e-05, + "loss": 0.9401, + "step": 7107 + }, + { + "epoch": 0.3545137157107232, + "grad_norm": 0.5900175193695157, + "learning_rate": 3.742181017634507e-05, + "loss": 0.8607, + "step": 7108 + }, + { + "epoch": 0.3545635910224439, + "grad_norm": 0.5096866787316413, + "learning_rate": 3.7418305350230005e-05, + "loss": 0.9132, + "step": 7109 + }, + { + "epoch": 0.3546134663341646, + "grad_norm": 0.5509037293774428, + "learning_rate": 3.741480020006482e-05, + "loss": 0.8912, + "step": 7110 + }, + { + "epoch": 0.3546633416458853, + "grad_norm": 0.6790329255618162, + "learning_rate": 3.741129472594098e-05, + "loss": 0.857, + "step": 7111 + }, + { + "epoch": 0.354713216957606, + "grad_norm": 0.4807030948484688, + "learning_rate": 3.7407788927949956e-05, + "loss": 0.9176, + "step": 7112 + }, + { + "epoch": 0.3547630922693267, + "grad_norm": 0.4677724285312459, + "learning_rate": 3.740428280618323e-05, + "loss": 0.9119, + "step": 7113 + }, + { + "epoch": 0.35481296758104736, + "grad_norm": 0.5085950918142664, + "learning_rate": 3.74007763607323e-05, + "loss": 0.976, + "step": 7114 + }, + { + "epoch": 0.3548628428927681, + "grad_norm": 0.5085747961639441, + "learning_rate": 3.739726959168867e-05, + "loss": 0.9546, + "step": 7115 + }, + { + "epoch": 0.3549127182044888, + "grad_norm": 0.42297245104979975, + "learning_rate": 3.739376249914383e-05, + "loss": 0.9307, + "step": 7116 + }, + { + "epoch": 0.35496259351620946, + "grad_norm": 0.750494951722983, + "learning_rate": 3.73902550831893e-05, + "loss": 0.8927, + "step": 7117 + }, + { + "epoch": 0.3550124688279302, + "grad_norm": 0.7647961379472223, + "learning_rate": 3.738674734391661e-05, + "loss": 0.8739, + "step": 7118 + }, + { + "epoch": 0.3550623441396509, + "grad_norm": 0.46503310165020184, + "learning_rate": 3.738323928141729e-05, + "loss": 0.9074, + "step": 7119 + }, + { + "epoch": 0.35511221945137156, + "grad_norm": 0.4663271610631718, + "learning_rate": 3.737973089578289e-05, + "loss": 0.8643, + "step": 7120 + }, + { + "epoch": 0.3551620947630923, + "grad_norm": 0.6436968645647733, + "learning_rate": 3.737622218710495e-05, + "loss": 0.9259, + "step": 7121 + }, + { + "epoch": 0.355211970074813, + "grad_norm": 1.1539032314430497, + "learning_rate": 3.737271315547503e-05, + "loss": 0.8801, + "step": 7122 + }, + { + "epoch": 0.35526184538653366, + "grad_norm": 0.45549614026277885, + "learning_rate": 3.7369203800984695e-05, + "loss": 0.8863, + "step": 7123 + }, + { + "epoch": 0.35531172069825434, + "grad_norm": 0.7728356520042774, + "learning_rate": 3.7365694123725524e-05, + "loss": 0.9165, + "step": 7124 + }, + { + "epoch": 0.3553615960099751, + "grad_norm": 0.7982911793750016, + "learning_rate": 3.7362184123789104e-05, + "loss": 0.9126, + "step": 7125 + }, + { + "epoch": 0.35541147132169576, + "grad_norm": 0.5097888369378663, + "learning_rate": 3.7358673801267017e-05, + "loss": 0.9196, + "step": 7126 + }, + { + "epoch": 0.35546134663341644, + "grad_norm": 0.4555725734302904, + "learning_rate": 3.735516315625087e-05, + "loss": 0.8698, + "step": 7127 + }, + { + "epoch": 0.3555112219451372, + "grad_norm": 0.9095564433684017, + "learning_rate": 3.735165218883227e-05, + "loss": 0.9377, + "step": 7128 + }, + { + "epoch": 0.35556109725685786, + "grad_norm": 0.40793876285176006, + "learning_rate": 3.734814089910283e-05, + "loss": 0.8906, + "step": 7129 + }, + { + "epoch": 0.35561097256857854, + "grad_norm": 0.5465647071528633, + "learning_rate": 3.7344629287154185e-05, + "loss": 0.8778, + "step": 7130 + }, + { + "epoch": 0.3556608478802992, + "grad_norm": 0.45116116427353037, + "learning_rate": 3.7341117353077966e-05, + "loss": 0.9074, + "step": 7131 + }, + { + "epoch": 0.35571072319201996, + "grad_norm": 0.5353752979353781, + "learning_rate": 3.7337605096965804e-05, + "loss": 0.9121, + "step": 7132 + }, + { + "epoch": 0.35576059850374064, + "grad_norm": 0.46355644797977913, + "learning_rate": 3.733409251890936e-05, + "loss": 0.9008, + "step": 7133 + }, + { + "epoch": 0.3558104738154613, + "grad_norm": 0.4137895048249916, + "learning_rate": 3.73305796190003e-05, + "loss": 0.8742, + "step": 7134 + }, + { + "epoch": 0.35586034912718206, + "grad_norm": 0.44398843838093993, + "learning_rate": 3.732706639733027e-05, + "loss": 0.9013, + "step": 7135 + }, + { + "epoch": 0.35591022443890274, + "grad_norm": 0.6085970919458759, + "learning_rate": 3.7323552853990976e-05, + "loss": 0.8856, + "step": 7136 + }, + { + "epoch": 0.3559600997506234, + "grad_norm": 0.6763516881653681, + "learning_rate": 3.7320038989074076e-05, + "loss": 0.9547, + "step": 7137 + }, + { + "epoch": 0.35600997506234416, + "grad_norm": 0.552026173174626, + "learning_rate": 3.731652480267127e-05, + "loss": 0.9223, + "step": 7138 + }, + { + "epoch": 0.35605985037406485, + "grad_norm": 0.4571899192735815, + "learning_rate": 3.731301029487427e-05, + "loss": 0.914, + "step": 7139 + }, + { + "epoch": 0.3561097256857855, + "grad_norm": 0.46490424729268115, + "learning_rate": 3.730949546577477e-05, + "loss": 0.9735, + "step": 7140 + }, + { + "epoch": 0.3561596009975062, + "grad_norm": 0.5811812005983351, + "learning_rate": 3.7305980315464496e-05, + "loss": 0.9348, + "step": 7141 + }, + { + "epoch": 0.35620947630922695, + "grad_norm": 0.663116427147063, + "learning_rate": 3.730246484403517e-05, + "loss": 0.9418, + "step": 7142 + }, + { + "epoch": 0.35625935162094763, + "grad_norm": 0.616556909906337, + "learning_rate": 3.729894905157854e-05, + "loss": 0.9196, + "step": 7143 + }, + { + "epoch": 0.3563092269326683, + "grad_norm": 0.48884690534919584, + "learning_rate": 3.7295432938186326e-05, + "loss": 0.8999, + "step": 7144 + }, + { + "epoch": 0.35635910224438905, + "grad_norm": 0.4521647718736931, + "learning_rate": 3.72919165039503e-05, + "loss": 0.9543, + "step": 7145 + }, + { + "epoch": 0.35640897755610973, + "grad_norm": 0.4422770703436805, + "learning_rate": 3.728839974896221e-05, + "loss": 0.8796, + "step": 7146 + }, + { + "epoch": 0.3564588528678304, + "grad_norm": 0.4757151181578678, + "learning_rate": 3.728488267331383e-05, + "loss": 0.964, + "step": 7147 + }, + { + "epoch": 0.35650872817955115, + "grad_norm": 0.7110281314039802, + "learning_rate": 3.728136527709694e-05, + "loss": 0.8803, + "step": 7148 + }, + { + "epoch": 0.35655860349127183, + "grad_norm": 0.47122267243962895, + "learning_rate": 3.727784756040331e-05, + "loss": 0.9184, + "step": 7149 + }, + { + "epoch": 0.3566084788029925, + "grad_norm": 0.47419209297556447, + "learning_rate": 3.7274329523324754e-05, + "loss": 0.8868, + "step": 7150 + }, + { + "epoch": 0.3566583541147132, + "grad_norm": 0.5334374230954791, + "learning_rate": 3.727081116595305e-05, + "loss": 0.8867, + "step": 7151 + }, + { + "epoch": 0.35670822942643393, + "grad_norm": 0.4654395276663712, + "learning_rate": 3.726729248838003e-05, + "loss": 0.9101, + "step": 7152 + }, + { + "epoch": 0.3567581047381546, + "grad_norm": 0.9728075269379489, + "learning_rate": 3.7263773490697496e-05, + "loss": 0.9172, + "step": 7153 + }, + { + "epoch": 0.3568079800498753, + "grad_norm": 0.40812376449183346, + "learning_rate": 3.7260254172997285e-05, + "loss": 0.8683, + "step": 7154 + }, + { + "epoch": 0.35685785536159603, + "grad_norm": 0.48193002081336644, + "learning_rate": 3.7256734535371225e-05, + "loss": 0.9119, + "step": 7155 + }, + { + "epoch": 0.3569077306733167, + "grad_norm": 0.6328877911437369, + "learning_rate": 3.725321457791117e-05, + "loss": 0.8999, + "step": 7156 + }, + { + "epoch": 0.3569576059850374, + "grad_norm": 1.4571220501449094, + "learning_rate": 3.7249694300708955e-05, + "loss": 0.9068, + "step": 7157 + }, + { + "epoch": 0.3570074812967581, + "grad_norm": 0.6769097828996566, + "learning_rate": 3.724617370385645e-05, + "loss": 0.9337, + "step": 7158 + }, + { + "epoch": 0.3570573566084788, + "grad_norm": 0.4921557128340003, + "learning_rate": 3.7242652787445525e-05, + "loss": 0.9135, + "step": 7159 + }, + { + "epoch": 0.3571072319201995, + "grad_norm": 0.49887582400039193, + "learning_rate": 3.723913155156806e-05, + "loss": 0.919, + "step": 7160 + }, + { + "epoch": 0.3571571072319202, + "grad_norm": 0.6818115946093345, + "learning_rate": 3.7235609996315936e-05, + "loss": 0.9199, + "step": 7161 + }, + { + "epoch": 0.3572069825436409, + "grad_norm": 0.4802934083697557, + "learning_rate": 3.7232088121781036e-05, + "loss": 0.9305, + "step": 7162 + }, + { + "epoch": 0.3572568578553616, + "grad_norm": 0.6106903041602303, + "learning_rate": 3.7228565928055276e-05, + "loss": 0.9331, + "step": 7163 + }, + { + "epoch": 0.3573067331670823, + "grad_norm": 0.5040007195577468, + "learning_rate": 3.722504341523056e-05, + "loss": 0.9239, + "step": 7164 + }, + { + "epoch": 0.357356608478803, + "grad_norm": 0.5645292659258837, + "learning_rate": 3.72215205833988e-05, + "loss": 0.9375, + "step": 7165 + }, + { + "epoch": 0.3574064837905237, + "grad_norm": 0.4393472507711414, + "learning_rate": 3.721799743265194e-05, + "loss": 0.9178, + "step": 7166 + }, + { + "epoch": 0.3574563591022444, + "grad_norm": 0.5837090124824645, + "learning_rate": 3.72144739630819e-05, + "loss": 0.9532, + "step": 7167 + }, + { + "epoch": 0.35750623441396506, + "grad_norm": 0.47781849828325146, + "learning_rate": 3.721095017478064e-05, + "loss": 0.8698, + "step": 7168 + }, + { + "epoch": 0.3575561097256858, + "grad_norm": 0.46738339901224624, + "learning_rate": 3.720742606784008e-05, + "loss": 0.9191, + "step": 7169 + }, + { + "epoch": 0.3576059850374065, + "grad_norm": 0.5065459965750196, + "learning_rate": 3.7203901642352214e-05, + "loss": 0.9213, + "step": 7170 + }, + { + "epoch": 0.35765586034912716, + "grad_norm": 0.5709243184432264, + "learning_rate": 3.720037689840899e-05, + "loss": 0.9338, + "step": 7171 + }, + { + "epoch": 0.3577057356608479, + "grad_norm": 0.5585236886034712, + "learning_rate": 3.719685183610239e-05, + "loss": 0.9379, + "step": 7172 + }, + { + "epoch": 0.3577556109725686, + "grad_norm": 0.46122853746850945, + "learning_rate": 3.7193326455524405e-05, + "loss": 0.9262, + "step": 7173 + }, + { + "epoch": 0.35780548628428926, + "grad_norm": 0.5737224702718371, + "learning_rate": 3.718980075676703e-05, + "loss": 0.9308, + "step": 7174 + }, + { + "epoch": 0.35785536159601, + "grad_norm": 0.5056832921562144, + "learning_rate": 3.7186274739922245e-05, + "loss": 0.8699, + "step": 7175 + }, + { + "epoch": 0.3579052369077307, + "grad_norm": 0.538119394363166, + "learning_rate": 3.7182748405082086e-05, + "loss": 0.9119, + "step": 7176 + }, + { + "epoch": 0.35795511221945137, + "grad_norm": 0.47188978081364075, + "learning_rate": 3.7179221752338556e-05, + "loss": 0.9093, + "step": 7177 + }, + { + "epoch": 0.35800498753117205, + "grad_norm": 0.4800885108648131, + "learning_rate": 3.717569478178368e-05, + "loss": 0.9606, + "step": 7178 + }, + { + "epoch": 0.3580548628428928, + "grad_norm": 0.6595632462644335, + "learning_rate": 3.71721674935095e-05, + "loss": 0.8963, + "step": 7179 + }, + { + "epoch": 0.35810473815461347, + "grad_norm": 0.4653228579416519, + "learning_rate": 3.7168639887608063e-05, + "loss": 0.8916, + "step": 7180 + }, + { + "epoch": 0.35815461346633415, + "grad_norm": 0.42980872510675805, + "learning_rate": 3.716511196417141e-05, + "loss": 0.8928, + "step": 7181 + }, + { + "epoch": 0.3582044887780549, + "grad_norm": 0.463914363387363, + "learning_rate": 3.7161583723291605e-05, + "loss": 0.8939, + "step": 7182 + }, + { + "epoch": 0.35825436408977557, + "grad_norm": 0.6185657366009224, + "learning_rate": 3.715805516506071e-05, + "loss": 0.8657, + "step": 7183 + }, + { + "epoch": 0.35830423940149625, + "grad_norm": 0.546397057145292, + "learning_rate": 3.715452628957082e-05, + "loss": 0.9388, + "step": 7184 + }, + { + "epoch": 0.35835411471321693, + "grad_norm": 0.4183642839012359, + "learning_rate": 3.7150997096913995e-05, + "loss": 0.9017, + "step": 7185 + }, + { + "epoch": 0.35840399002493767, + "grad_norm": 0.5316282946062687, + "learning_rate": 3.714746758718234e-05, + "loss": 0.9258, + "step": 7186 + }, + { + "epoch": 0.35845386533665835, + "grad_norm": 0.645801601075498, + "learning_rate": 3.714393776046796e-05, + "loss": 0.8847, + "step": 7187 + }, + { + "epoch": 0.35850374064837903, + "grad_norm": 0.4638052294122996, + "learning_rate": 3.714040761686295e-05, + "loss": 0.8855, + "step": 7188 + }, + { + "epoch": 0.35855361596009977, + "grad_norm": 0.5685182236864986, + "learning_rate": 3.713687715645945e-05, + "loss": 0.9231, + "step": 7189 + }, + { + "epoch": 0.35860349127182045, + "grad_norm": 0.4644948092028616, + "learning_rate": 3.713334637934956e-05, + "loss": 0.9108, + "step": 7190 + }, + { + "epoch": 0.35865336658354113, + "grad_norm": 0.4311092532463791, + "learning_rate": 3.712981528562543e-05, + "loss": 0.9303, + "step": 7191 + }, + { + "epoch": 0.35870324189526187, + "grad_norm": 0.393245663714643, + "learning_rate": 3.71262838753792e-05, + "loss": 0.9082, + "step": 7192 + }, + { + "epoch": 0.35875311720698255, + "grad_norm": 0.5910736272816506, + "learning_rate": 3.712275214870302e-05, + "loss": 0.9458, + "step": 7193 + }, + { + "epoch": 0.35880299251870323, + "grad_norm": 0.5717402123592061, + "learning_rate": 3.711922010568905e-05, + "loss": 0.9498, + "step": 7194 + }, + { + "epoch": 0.3588528678304239, + "grad_norm": 0.5696714533845624, + "learning_rate": 3.7115687746429454e-05, + "loss": 0.9298, + "step": 7195 + }, + { + "epoch": 0.35890274314214465, + "grad_norm": 1.1943697865854233, + "learning_rate": 3.71121550710164e-05, + "loss": 0.8467, + "step": 7196 + }, + { + "epoch": 0.35895261845386534, + "grad_norm": 0.9077587760495768, + "learning_rate": 3.7108622079542085e-05, + "loss": 0.8894, + "step": 7197 + }, + { + "epoch": 0.359002493765586, + "grad_norm": 0.4631377323324387, + "learning_rate": 3.7105088772098706e-05, + "loss": 0.9346, + "step": 7198 + }, + { + "epoch": 0.35905236907730675, + "grad_norm": 0.43970985722545786, + "learning_rate": 3.710155514877844e-05, + "loss": 0.9193, + "step": 7199 + }, + { + "epoch": 0.35910224438902744, + "grad_norm": 0.6533776977121113, + "learning_rate": 3.709802120967352e-05, + "loss": 0.891, + "step": 7200 + }, + { + "epoch": 0.3591521197007481, + "grad_norm": 0.560052756529815, + "learning_rate": 3.709448695487614e-05, + "loss": 0.9099, + "step": 7201 + }, + { + "epoch": 0.35920199501246886, + "grad_norm": 0.4601536066721886, + "learning_rate": 3.709095238447854e-05, + "loss": 0.9203, + "step": 7202 + }, + { + "epoch": 0.35925187032418954, + "grad_norm": 0.6298991154141331, + "learning_rate": 3.7087417498572944e-05, + "loss": 0.913, + "step": 7203 + }, + { + "epoch": 0.3593017456359102, + "grad_norm": 0.49258355765765, + "learning_rate": 3.70838822972516e-05, + "loss": 0.8938, + "step": 7204 + }, + { + "epoch": 0.3593516209476309, + "grad_norm": 0.5372632492554896, + "learning_rate": 3.7080346780606764e-05, + "loss": 0.8986, + "step": 7205 + }, + { + "epoch": 0.35940149625935164, + "grad_norm": 0.6709294566239573, + "learning_rate": 3.7076810948730675e-05, + "loss": 0.9261, + "step": 7206 + }, + { + "epoch": 0.3594513715710723, + "grad_norm": 0.5024280302745734, + "learning_rate": 3.707327480171561e-05, + "loss": 0.9101, + "step": 7207 + }, + { + "epoch": 0.359501246882793, + "grad_norm": 0.4464872772188817, + "learning_rate": 3.706973833965385e-05, + "loss": 0.9612, + "step": 7208 + }, + { + "epoch": 0.35955112219451374, + "grad_norm": 0.39986257996305413, + "learning_rate": 3.706620156263766e-05, + "loss": 0.8773, + "step": 7209 + }, + { + "epoch": 0.3596009975062344, + "grad_norm": 0.48286728365062737, + "learning_rate": 3.7062664470759345e-05, + "loss": 0.8887, + "step": 7210 + }, + { + "epoch": 0.3596508728179551, + "grad_norm": 0.8738390968566679, + "learning_rate": 3.7059127064111205e-05, + "loss": 0.9476, + "step": 7211 + }, + { + "epoch": 0.3597007481296758, + "grad_norm": 0.7298911163356886, + "learning_rate": 3.7055589342785534e-05, + "loss": 0.9442, + "step": 7212 + }, + { + "epoch": 0.3597506234413965, + "grad_norm": 0.46849112445861985, + "learning_rate": 3.705205130687466e-05, + "loss": 0.9067, + "step": 7213 + }, + { + "epoch": 0.3598004987531172, + "grad_norm": 0.650053955926421, + "learning_rate": 3.7048512956470894e-05, + "loss": 0.9251, + "step": 7214 + }, + { + "epoch": 0.3598503740648379, + "grad_norm": 0.5355120877896827, + "learning_rate": 3.704497429166658e-05, + "loss": 0.904, + "step": 7215 + }, + { + "epoch": 0.3599002493765586, + "grad_norm": 0.6843752829979838, + "learning_rate": 3.704143531255405e-05, + "loss": 0.9245, + "step": 7216 + }, + { + "epoch": 0.3599501246882793, + "grad_norm": 0.4117878627607844, + "learning_rate": 3.703789601922566e-05, + "loss": 0.8972, + "step": 7217 + }, + { + "epoch": 0.36, + "grad_norm": 0.4517725468526783, + "learning_rate": 3.703435641177375e-05, + "loss": 0.9233, + "step": 7218 + }, + { + "epoch": 0.3600498753117207, + "grad_norm": 0.5857414097874084, + "learning_rate": 3.7030816490290705e-05, + "loss": 0.9205, + "step": 7219 + }, + { + "epoch": 0.3600997506234414, + "grad_norm": 0.6515070155943923, + "learning_rate": 3.702727625486888e-05, + "loss": 0.905, + "step": 7220 + }, + { + "epoch": 0.3601496259351621, + "grad_norm": 0.5023629001348707, + "learning_rate": 3.702373570560067e-05, + "loss": 0.9041, + "step": 7221 + }, + { + "epoch": 0.36019950124688277, + "grad_norm": 0.5418595034775152, + "learning_rate": 3.702019484257846e-05, + "loss": 0.9197, + "step": 7222 + }, + { + "epoch": 0.3602493765586035, + "grad_norm": 0.4472087958313742, + "learning_rate": 3.701665366589464e-05, + "loss": 0.8584, + "step": 7223 + }, + { + "epoch": 0.3602992518703242, + "grad_norm": 0.630408882893293, + "learning_rate": 3.7013112175641625e-05, + "loss": 0.9289, + "step": 7224 + }, + { + "epoch": 0.36034912718204487, + "grad_norm": 0.48992283275613246, + "learning_rate": 3.7009570371911815e-05, + "loss": 0.945, + "step": 7225 + }, + { + "epoch": 0.3603990024937656, + "grad_norm": 0.5844411021520615, + "learning_rate": 3.7006028254797656e-05, + "loss": 0.9038, + "step": 7226 + }, + { + "epoch": 0.3604488778054863, + "grad_norm": 0.4708066193885911, + "learning_rate": 3.700248582439155e-05, + "loss": 0.8998, + "step": 7227 + }, + { + "epoch": 0.36049875311720697, + "grad_norm": 0.4690053518976276, + "learning_rate": 3.699894308078595e-05, + "loss": 0.9137, + "step": 7228 + }, + { + "epoch": 0.36054862842892765, + "grad_norm": 0.5687577410902497, + "learning_rate": 3.69954000240733e-05, + "loss": 0.8937, + "step": 7229 + }, + { + "epoch": 0.3605985037406484, + "grad_norm": 0.44924159980342226, + "learning_rate": 3.699185665434606e-05, + "loss": 0.9042, + "step": 7230 + }, + { + "epoch": 0.3606483790523691, + "grad_norm": 2.528949698917051, + "learning_rate": 3.698831297169668e-05, + "loss": 0.8826, + "step": 7231 + }, + { + "epoch": 0.36069825436408975, + "grad_norm": 0.48746753841616386, + "learning_rate": 3.698476897621764e-05, + "loss": 0.8543, + "step": 7232 + }, + { + "epoch": 0.3607481296758105, + "grad_norm": 0.4653360711118383, + "learning_rate": 3.6981224668001424e-05, + "loss": 0.9068, + "step": 7233 + }, + { + "epoch": 0.3607980049875312, + "grad_norm": 0.5934277695645916, + "learning_rate": 3.697768004714051e-05, + "loss": 0.8682, + "step": 7234 + }, + { + "epoch": 0.36084788029925186, + "grad_norm": 0.6529292575472725, + "learning_rate": 3.697413511372739e-05, + "loss": 0.9462, + "step": 7235 + }, + { + "epoch": 0.3608977556109726, + "grad_norm": 0.5821871879941497, + "learning_rate": 3.697058986785458e-05, + "loss": 0.8796, + "step": 7236 + }, + { + "epoch": 0.3609476309226933, + "grad_norm": 0.7561354949217404, + "learning_rate": 3.6967044309614585e-05, + "loss": 0.9006, + "step": 7237 + }, + { + "epoch": 0.36099750623441396, + "grad_norm": 0.5662336935810344, + "learning_rate": 3.696349843909992e-05, + "loss": 0.9079, + "step": 7238 + }, + { + "epoch": 0.36104738154613464, + "grad_norm": 0.6447099084935476, + "learning_rate": 3.695995225640312e-05, + "loss": 0.9179, + "step": 7239 + }, + { + "epoch": 0.3610972568578554, + "grad_norm": 0.4807405517384597, + "learning_rate": 3.695640576161671e-05, + "loss": 0.9134, + "step": 7240 + }, + { + "epoch": 0.36114713216957606, + "grad_norm": 0.5231543911039438, + "learning_rate": 3.6952858954833246e-05, + "loss": 0.9249, + "step": 7241 + }, + { + "epoch": 0.36119700748129674, + "grad_norm": 0.7623755333418206, + "learning_rate": 3.6949311836145286e-05, + "loss": 0.8696, + "step": 7242 + }, + { + "epoch": 0.3612468827930175, + "grad_norm": 0.6490212796034611, + "learning_rate": 3.6945764405645375e-05, + "loss": 0.9424, + "step": 7243 + }, + { + "epoch": 0.36129675810473816, + "grad_norm": 0.5412521451549205, + "learning_rate": 3.6942216663426085e-05, + "loss": 0.9132, + "step": 7244 + }, + { + "epoch": 0.36134663341645884, + "grad_norm": 0.5448333663618495, + "learning_rate": 3.6938668609580005e-05, + "loss": 0.8967, + "step": 7245 + }, + { + "epoch": 0.3613965087281796, + "grad_norm": 0.47355665003270925, + "learning_rate": 3.693512024419971e-05, + "loss": 0.9155, + "step": 7246 + }, + { + "epoch": 0.36144638403990026, + "grad_norm": 0.4660143771764125, + "learning_rate": 3.693157156737778e-05, + "loss": 0.9247, + "step": 7247 + }, + { + "epoch": 0.36149625935162094, + "grad_norm": 0.4801349284885998, + "learning_rate": 3.6928022579206845e-05, + "loss": 0.9226, + "step": 7248 + }, + { + "epoch": 0.3615461346633416, + "grad_norm": 0.4704781033510201, + "learning_rate": 3.692447327977949e-05, + "loss": 0.9152, + "step": 7249 + }, + { + "epoch": 0.36159600997506236, + "grad_norm": 0.655042600859562, + "learning_rate": 3.692092366918835e-05, + "loss": 0.9225, + "step": 7250 + }, + { + "epoch": 0.36164588528678304, + "grad_norm": 0.3932843076171626, + "learning_rate": 3.691737374752603e-05, + "loss": 0.9007, + "step": 7251 + }, + { + "epoch": 0.3616957605985037, + "grad_norm": 0.45722471607474685, + "learning_rate": 3.691382351488519e-05, + "loss": 0.9395, + "step": 7252 + }, + { + "epoch": 0.36174563591022446, + "grad_norm": 0.5885422238428458, + "learning_rate": 3.6910272971358446e-05, + "loss": 0.8791, + "step": 7253 + }, + { + "epoch": 0.36179551122194514, + "grad_norm": 0.44387417530449547, + "learning_rate": 3.690672211703846e-05, + "loss": 0.9143, + "step": 7254 + }, + { + "epoch": 0.3618453865336658, + "grad_norm": 0.5056971207660544, + "learning_rate": 3.69031709520179e-05, + "loss": 0.8552, + "step": 7255 + }, + { + "epoch": 0.3618952618453865, + "grad_norm": 0.48066133189026305, + "learning_rate": 3.689961947638941e-05, + "loss": 0.8976, + "step": 7256 + }, + { + "epoch": 0.36194513715710724, + "grad_norm": 0.5052849164463002, + "learning_rate": 3.6896067690245675e-05, + "loss": 0.9473, + "step": 7257 + }, + { + "epoch": 0.3619950124688279, + "grad_norm": 0.4640934599824683, + "learning_rate": 3.689251559367938e-05, + "loss": 0.9377, + "step": 7258 + }, + { + "epoch": 0.3620448877805486, + "grad_norm": 0.37266703114738436, + "learning_rate": 3.6888963186783224e-05, + "loss": 0.8801, + "step": 7259 + }, + { + "epoch": 0.36209476309226934, + "grad_norm": 0.447918248461984, + "learning_rate": 3.688541046964988e-05, + "loss": 0.9005, + "step": 7260 + }, + { + "epoch": 0.36214463840399, + "grad_norm": 0.3950513916980697, + "learning_rate": 3.688185744237207e-05, + "loss": 0.8733, + "step": 7261 + }, + { + "epoch": 0.3621945137157107, + "grad_norm": 0.5060144670178068, + "learning_rate": 3.687830410504251e-05, + "loss": 0.9439, + "step": 7262 + }, + { + "epoch": 0.36224438902743145, + "grad_norm": 0.4279770070066592, + "learning_rate": 3.687475045775392e-05, + "loss": 0.9552, + "step": 7263 + }, + { + "epoch": 0.36229426433915213, + "grad_norm": 0.6972878529505904, + "learning_rate": 3.6871196500599036e-05, + "loss": 0.8997, + "step": 7264 + }, + { + "epoch": 0.3623441396508728, + "grad_norm": 0.5312787935827411, + "learning_rate": 3.6867642233670594e-05, + "loss": 0.9319, + "step": 7265 + }, + { + "epoch": 0.3623940149625935, + "grad_norm": 0.42522316501924984, + "learning_rate": 3.686408765706133e-05, + "loss": 0.8563, + "step": 7266 + }, + { + "epoch": 0.36244389027431423, + "grad_norm": 1.0959383699554177, + "learning_rate": 3.686053277086401e-05, + "loss": 0.9173, + "step": 7267 + }, + { + "epoch": 0.3624937655860349, + "grad_norm": 0.42849902940382567, + "learning_rate": 3.68569775751714e-05, + "loss": 0.9493, + "step": 7268 + }, + { + "epoch": 0.3625436408977556, + "grad_norm": 0.49842343278528317, + "learning_rate": 3.6853422070076264e-05, + "loss": 0.9266, + "step": 7269 + }, + { + "epoch": 0.36259351620947633, + "grad_norm": 0.4689906272949596, + "learning_rate": 3.684986625567138e-05, + "loss": 0.9299, + "step": 7270 + }, + { + "epoch": 0.362643391521197, + "grad_norm": 0.48826895500079254, + "learning_rate": 3.684631013204955e-05, + "loss": 0.9229, + "step": 7271 + }, + { + "epoch": 0.3626932668329177, + "grad_norm": 0.4960132701971301, + "learning_rate": 3.6842753699303554e-05, + "loss": 0.9023, + "step": 7272 + }, + { + "epoch": 0.36274314214463843, + "grad_norm": 0.5978730908018088, + "learning_rate": 3.683919695752619e-05, + "loss": 0.9563, + "step": 7273 + }, + { + "epoch": 0.3627930174563591, + "grad_norm": 2.51130890133921, + "learning_rate": 3.6835639906810295e-05, + "loss": 0.8817, + "step": 7274 + }, + { + "epoch": 0.3628428927680798, + "grad_norm": 0.5876483843805979, + "learning_rate": 3.683208254724867e-05, + "loss": 0.8906, + "step": 7275 + }, + { + "epoch": 0.3628927680798005, + "grad_norm": 0.44007302125973075, + "learning_rate": 3.6828524878934147e-05, + "loss": 0.9154, + "step": 7276 + }, + { + "epoch": 0.3629426433915212, + "grad_norm": 0.5057168753546806, + "learning_rate": 3.682496690195955e-05, + "loss": 0.8552, + "step": 7277 + }, + { + "epoch": 0.3629925187032419, + "grad_norm": 0.4551959410362022, + "learning_rate": 3.6821408616417746e-05, + "loss": 0.8949, + "step": 7278 + }, + { + "epoch": 0.3630423940149626, + "grad_norm": 0.41794623479040854, + "learning_rate": 3.681785002240157e-05, + "loss": 0.9059, + "step": 7279 + }, + { + "epoch": 0.3630922693266833, + "grad_norm": 0.4199388468391524, + "learning_rate": 3.681429112000388e-05, + "loss": 0.8785, + "step": 7280 + }, + { + "epoch": 0.363142144638404, + "grad_norm": 0.44405251579405225, + "learning_rate": 3.6810731909317565e-05, + "loss": 0.9103, + "step": 7281 + }, + { + "epoch": 0.3631920199501247, + "grad_norm": 0.4980621314233865, + "learning_rate": 3.680717239043547e-05, + "loss": 0.9087, + "step": 7282 + }, + { + "epoch": 0.36324189526184536, + "grad_norm": 0.4600969830338176, + "learning_rate": 3.6803612563450505e-05, + "loss": 0.9133, + "step": 7283 + }, + { + "epoch": 0.3632917705735661, + "grad_norm": 0.46824818144613534, + "learning_rate": 3.680005242845556e-05, + "loss": 0.8765, + "step": 7284 + }, + { + "epoch": 0.3633416458852868, + "grad_norm": 0.45899495448831645, + "learning_rate": 3.6796491985543516e-05, + "loss": 0.9003, + "step": 7285 + }, + { + "epoch": 0.36339152119700746, + "grad_norm": 0.39986669715766326, + "learning_rate": 3.67929312348073e-05, + "loss": 0.914, + "step": 7286 + }, + { + "epoch": 0.3634413965087282, + "grad_norm": 0.5465566634155096, + "learning_rate": 3.678937017633982e-05, + "loss": 0.8517, + "step": 7287 + }, + { + "epoch": 0.3634912718204489, + "grad_norm": 0.4566746635811048, + "learning_rate": 3.6785808810234e-05, + "loss": 0.8858, + "step": 7288 + }, + { + "epoch": 0.36354114713216956, + "grad_norm": 0.4444809171769912, + "learning_rate": 3.678224713658277e-05, + "loss": 0.9095, + "step": 7289 + }, + { + "epoch": 0.3635910224438903, + "grad_norm": 0.4296868915215436, + "learning_rate": 3.677868515547908e-05, + "loss": 0.9045, + "step": 7290 + }, + { + "epoch": 0.363640897755611, + "grad_norm": 0.45162891846465414, + "learning_rate": 3.677512286701587e-05, + "loss": 0.9073, + "step": 7291 + }, + { + "epoch": 0.36369077306733166, + "grad_norm": 0.4684620229489641, + "learning_rate": 3.67715602712861e-05, + "loss": 0.8828, + "step": 7292 + }, + { + "epoch": 0.36374064837905234, + "grad_norm": 0.3674067549009555, + "learning_rate": 3.6767997368382736e-05, + "loss": 0.8699, + "step": 7293 + }, + { + "epoch": 0.3637905236907731, + "grad_norm": 1.2285245067100767, + "learning_rate": 3.676443415839874e-05, + "loss": 0.914, + "step": 7294 + }, + { + "epoch": 0.36384039900249376, + "grad_norm": 0.47012898887458054, + "learning_rate": 3.6760870641427104e-05, + "loss": 0.9079, + "step": 7295 + }, + { + "epoch": 0.36389027431421445, + "grad_norm": 0.4226118393268435, + "learning_rate": 3.675730681756081e-05, + "loss": 0.8546, + "step": 7296 + }, + { + "epoch": 0.3639401496259352, + "grad_norm": 0.48957053045478427, + "learning_rate": 3.675374268689285e-05, + "loss": 0.8663, + "step": 7297 + }, + { + "epoch": 0.36399002493765586, + "grad_norm": 0.4252121565276298, + "learning_rate": 3.6750178249516254e-05, + "loss": 0.9095, + "step": 7298 + }, + { + "epoch": 0.36403990024937655, + "grad_norm": 0.4229457610330843, + "learning_rate": 3.6746613505523995e-05, + "loss": 0.9583, + "step": 7299 + }, + { + "epoch": 0.3640897755610973, + "grad_norm": 0.5929439841413822, + "learning_rate": 3.6743048455009126e-05, + "loss": 0.9312, + "step": 7300 + }, + { + "epoch": 0.36413965087281797, + "grad_norm": 0.42940954353956756, + "learning_rate": 3.673948309806466e-05, + "loss": 0.8885, + "step": 7301 + }, + { + "epoch": 0.36418952618453865, + "grad_norm": 0.4269243739606626, + "learning_rate": 3.673591743478364e-05, + "loss": 0.882, + "step": 7302 + }, + { + "epoch": 0.36423940149625933, + "grad_norm": 0.4703253500835138, + "learning_rate": 3.67323514652591e-05, + "loss": 0.8824, + "step": 7303 + }, + { + "epoch": 0.36428927680798007, + "grad_norm": 0.3936357051926627, + "learning_rate": 3.6728785189584106e-05, + "loss": 0.8769, + "step": 7304 + }, + { + "epoch": 0.36433915211970075, + "grad_norm": 0.44388260160522874, + "learning_rate": 3.672521860785171e-05, + "loss": 0.91, + "step": 7305 + }, + { + "epoch": 0.36438902743142143, + "grad_norm": 0.42350325637530695, + "learning_rate": 3.672165172015497e-05, + "loss": 0.8914, + "step": 7306 + }, + { + "epoch": 0.36443890274314217, + "grad_norm": 0.43397493363915, + "learning_rate": 3.6718084526586986e-05, + "loss": 0.9272, + "step": 7307 + }, + { + "epoch": 0.36448877805486285, + "grad_norm": 1.187397877976607, + "learning_rate": 3.671451702724083e-05, + "loss": 0.8875, + "step": 7308 + }, + { + "epoch": 0.36453865336658353, + "grad_norm": 4.007431171625352, + "learning_rate": 3.6710949222209595e-05, + "loss": 0.9273, + "step": 7309 + }, + { + "epoch": 0.3645885286783042, + "grad_norm": 0.44415465585436553, + "learning_rate": 3.6707381111586374e-05, + "loss": 0.9061, + "step": 7310 + }, + { + "epoch": 0.36463840399002495, + "grad_norm": 0.43943065922079116, + "learning_rate": 3.670381269546429e-05, + "loss": 0.8754, + "step": 7311 + }, + { + "epoch": 0.36468827930174563, + "grad_norm": 0.5454303684645153, + "learning_rate": 3.670024397393645e-05, + "loss": 0.8769, + "step": 7312 + }, + { + "epoch": 0.3647381546134663, + "grad_norm": 0.4014611962267405, + "learning_rate": 3.6696674947095984e-05, + "loss": 0.9295, + "step": 7313 + }, + { + "epoch": 0.36478802992518705, + "grad_norm": 0.39854543113379803, + "learning_rate": 3.669310561503601e-05, + "loss": 0.9104, + "step": 7314 + }, + { + "epoch": 0.36483790523690773, + "grad_norm": 0.531693063534026, + "learning_rate": 3.668953597784969e-05, + "loss": 0.8901, + "step": 7315 + }, + { + "epoch": 0.3648877805486284, + "grad_norm": 0.8377369486975663, + "learning_rate": 3.668596603563015e-05, + "loss": 0.8965, + "step": 7316 + }, + { + "epoch": 0.36493765586034915, + "grad_norm": 0.41060179416222553, + "learning_rate": 3.668239578847056e-05, + "loss": 0.8903, + "step": 7317 + }, + { + "epoch": 0.36498753117206983, + "grad_norm": 0.4974184218165088, + "learning_rate": 3.6678825236464074e-05, + "loss": 0.9041, + "step": 7318 + }, + { + "epoch": 0.3650374064837905, + "grad_norm": 0.4131060451777719, + "learning_rate": 3.667525437970388e-05, + "loss": 0.87, + "step": 7319 + }, + { + "epoch": 0.3650872817955112, + "grad_norm": 1.5792214216843876, + "learning_rate": 3.6671683218283145e-05, + "loss": 0.9412, + "step": 7320 + }, + { + "epoch": 0.36513715710723194, + "grad_norm": 0.45720841040526444, + "learning_rate": 3.666811175229506e-05, + "loss": 0.9042, + "step": 7321 + }, + { + "epoch": 0.3651870324189526, + "grad_norm": 0.5552530022537623, + "learning_rate": 3.666453998183282e-05, + "loss": 0.9199, + "step": 7322 + }, + { + "epoch": 0.3652369077306733, + "grad_norm": 0.7672997351098934, + "learning_rate": 3.666096790698963e-05, + "loss": 0.9268, + "step": 7323 + }, + { + "epoch": 0.36528678304239404, + "grad_norm": 0.4800652783512787, + "learning_rate": 3.66573955278587e-05, + "loss": 0.8771, + "step": 7324 + }, + { + "epoch": 0.3653366583541147, + "grad_norm": 0.4606712391631918, + "learning_rate": 3.6653822844533256e-05, + "loss": 0.9145, + "step": 7325 + }, + { + "epoch": 0.3653865336658354, + "grad_norm": 0.37611608108948946, + "learning_rate": 3.665024985710652e-05, + "loss": 0.8783, + "step": 7326 + }, + { + "epoch": 0.3654364089775561, + "grad_norm": 0.4401257504365252, + "learning_rate": 3.664667656567172e-05, + "loss": 0.878, + "step": 7327 + }, + { + "epoch": 0.3654862842892768, + "grad_norm": 0.5269357935519994, + "learning_rate": 3.664310297032211e-05, + "loss": 0.9309, + "step": 7328 + }, + { + "epoch": 0.3655361596009975, + "grad_norm": 0.4533126277740037, + "learning_rate": 3.6639529071150944e-05, + "loss": 0.9261, + "step": 7329 + }, + { + "epoch": 0.3655860349127182, + "grad_norm": 0.6245033220093573, + "learning_rate": 3.663595486825147e-05, + "loss": 0.9846, + "step": 7330 + }, + { + "epoch": 0.3656359102244389, + "grad_norm": 0.5155866374842464, + "learning_rate": 3.663238036171697e-05, + "loss": 0.9088, + "step": 7331 + }, + { + "epoch": 0.3656857855361596, + "grad_norm": 0.41770669057006005, + "learning_rate": 3.66288055516407e-05, + "loss": 0.9401, + "step": 7332 + }, + { + "epoch": 0.3657356608478803, + "grad_norm": 0.5637848131313848, + "learning_rate": 3.662523043811596e-05, + "loss": 0.8532, + "step": 7333 + }, + { + "epoch": 0.365785536159601, + "grad_norm": 0.6217135784623804, + "learning_rate": 3.662165502123603e-05, + "loss": 0.9256, + "step": 7334 + }, + { + "epoch": 0.3658354114713217, + "grad_norm": 0.5402312265322782, + "learning_rate": 3.6618079301094216e-05, + "loss": 0.9009, + "step": 7335 + }, + { + "epoch": 0.3658852867830424, + "grad_norm": 0.9617603295050842, + "learning_rate": 3.661450327778382e-05, + "loss": 0.8838, + "step": 7336 + }, + { + "epoch": 0.36593516209476307, + "grad_norm": 0.4594971010749264, + "learning_rate": 3.6610926951398164e-05, + "loss": 0.9408, + "step": 7337 + }, + { + "epoch": 0.3659850374064838, + "grad_norm": 0.4126740556062608, + "learning_rate": 3.660735032203056e-05, + "loss": 0.8932, + "step": 7338 + }, + { + "epoch": 0.3660349127182045, + "grad_norm": 0.46359107892771023, + "learning_rate": 3.660377338977435e-05, + "loss": 0.8749, + "step": 7339 + }, + { + "epoch": 0.36608478802992517, + "grad_norm": 0.47479336558000684, + "learning_rate": 3.6600196154722865e-05, + "loss": 0.93, + "step": 7340 + }, + { + "epoch": 0.3661346633416459, + "grad_norm": 0.382174404975483, + "learning_rate": 3.6596618616969445e-05, + "loss": 0.8998, + "step": 7341 + }, + { + "epoch": 0.3661845386533666, + "grad_norm": 0.5185597489476111, + "learning_rate": 3.659304077660746e-05, + "loss": 0.8762, + "step": 7342 + }, + { + "epoch": 0.36623441396508727, + "grad_norm": 0.4024770555584192, + "learning_rate": 3.6589462633730266e-05, + "loss": 0.8429, + "step": 7343 + }, + { + "epoch": 0.366284289276808, + "grad_norm": 0.4354106123506261, + "learning_rate": 3.658588418843123e-05, + "loss": 0.9155, + "step": 7344 + }, + { + "epoch": 0.3663341645885287, + "grad_norm": 1.1247043963200078, + "learning_rate": 3.658230544080373e-05, + "loss": 0.9588, + "step": 7345 + }, + { + "epoch": 0.36638403990024937, + "grad_norm": 0.5930328751218455, + "learning_rate": 3.6578726390941145e-05, + "loss": 0.9084, + "step": 7346 + }, + { + "epoch": 0.36643391521197005, + "grad_norm": 0.40616829540399063, + "learning_rate": 3.657514703893689e-05, + "loss": 0.8961, + "step": 7347 + }, + { + "epoch": 0.3664837905236908, + "grad_norm": 0.5598504009972691, + "learning_rate": 3.657156738488435e-05, + "loss": 0.8641, + "step": 7348 + }, + { + "epoch": 0.36653366583541147, + "grad_norm": 1.219690619838182, + "learning_rate": 3.656798742887693e-05, + "loss": 0.9127, + "step": 7349 + }, + { + "epoch": 0.36658354114713215, + "grad_norm": 0.44213443632900784, + "learning_rate": 3.656440717100806e-05, + "loss": 0.8837, + "step": 7350 + }, + { + "epoch": 0.3666334164588529, + "grad_norm": 0.5827188529398349, + "learning_rate": 3.656082661137116e-05, + "loss": 0.8991, + "step": 7351 + }, + { + "epoch": 0.36668329177057357, + "grad_norm": 0.4275646163679122, + "learning_rate": 3.655724575005967e-05, + "loss": 0.9393, + "step": 7352 + }, + { + "epoch": 0.36673316708229425, + "grad_norm": 0.48128481901771014, + "learning_rate": 3.655366458716701e-05, + "loss": 0.9711, + "step": 7353 + }, + { + "epoch": 0.36678304239401494, + "grad_norm": 0.39377393073431805, + "learning_rate": 3.655008312278666e-05, + "loss": 0.9182, + "step": 7354 + }, + { + "epoch": 0.3668329177057357, + "grad_norm": 0.4402106732040636, + "learning_rate": 3.654650135701205e-05, + "loss": 0.9005, + "step": 7355 + }, + { + "epoch": 0.36688279301745635, + "grad_norm": 0.46196925017407253, + "learning_rate": 3.654291928993665e-05, + "loss": 0.9318, + "step": 7356 + }, + { + "epoch": 0.36693266832917704, + "grad_norm": 0.5077167917729863, + "learning_rate": 3.653933692165395e-05, + "loss": 0.8844, + "step": 7357 + }, + { + "epoch": 0.3669825436408978, + "grad_norm": 0.521640263326839, + "learning_rate": 3.653575425225741e-05, + "loss": 0.916, + "step": 7358 + }, + { + "epoch": 0.36703241895261846, + "grad_norm": 0.45089213430366454, + "learning_rate": 3.6532171281840526e-05, + "loss": 0.9169, + "step": 7359 + }, + { + "epoch": 0.36708229426433914, + "grad_norm": 3.169675498545526, + "learning_rate": 3.6528588010496786e-05, + "loss": 0.9237, + "step": 7360 + }, + { + "epoch": 0.3671321695760599, + "grad_norm": 0.4276412473138962, + "learning_rate": 3.652500443831972e-05, + "loss": 0.9177, + "step": 7361 + }, + { + "epoch": 0.36718204488778056, + "grad_norm": 0.43198553399830436, + "learning_rate": 3.652142056540281e-05, + "loss": 0.8942, + "step": 7362 + }, + { + "epoch": 0.36723192019950124, + "grad_norm": 0.4296440440615401, + "learning_rate": 3.6517836391839586e-05, + "loss": 0.905, + "step": 7363 + }, + { + "epoch": 0.3672817955112219, + "grad_norm": 0.3859501439093978, + "learning_rate": 3.651425191772358e-05, + "loss": 0.8921, + "step": 7364 + }, + { + "epoch": 0.36733167082294266, + "grad_norm": 0.49127163532739865, + "learning_rate": 3.6510667143148316e-05, + "loss": 0.8872, + "step": 7365 + }, + { + "epoch": 0.36738154613466334, + "grad_norm": 0.38717750646903615, + "learning_rate": 3.6507082068207354e-05, + "loss": 0.8602, + "step": 7366 + }, + { + "epoch": 0.367431421446384, + "grad_norm": 0.5142511805457392, + "learning_rate": 3.6503496692994235e-05, + "loss": 0.8973, + "step": 7367 + }, + { + "epoch": 0.36748129675810476, + "grad_norm": 0.4887277260202464, + "learning_rate": 3.649991101760251e-05, + "loss": 0.8556, + "step": 7368 + }, + { + "epoch": 0.36753117206982544, + "grad_norm": 0.5073324351496341, + "learning_rate": 3.6496325042125754e-05, + "loss": 0.9382, + "step": 7369 + }, + { + "epoch": 0.3675810473815461, + "grad_norm": 1.043701291101615, + "learning_rate": 3.6492738766657545e-05, + "loss": 0.9007, + "step": 7370 + }, + { + "epoch": 0.36763092269326686, + "grad_norm": 0.409853002848159, + "learning_rate": 3.6489152191291466e-05, + "loss": 0.9213, + "step": 7371 + }, + { + "epoch": 0.36768079800498754, + "grad_norm": 0.442239635159448, + "learning_rate": 3.64855653161211e-05, + "loss": 0.9166, + "step": 7372 + }, + { + "epoch": 0.3677306733167082, + "grad_norm": 0.46273913147990187, + "learning_rate": 3.6481978141240034e-05, + "loss": 0.868, + "step": 7373 + }, + { + "epoch": 0.3677805486284289, + "grad_norm": 0.4058561470109276, + "learning_rate": 3.64783906667419e-05, + "loss": 0.9011, + "step": 7374 + }, + { + "epoch": 0.36783042394014964, + "grad_norm": 0.49733773814681714, + "learning_rate": 3.64748028927203e-05, + "loss": 0.9212, + "step": 7375 + }, + { + "epoch": 0.3678802992518703, + "grad_norm": 0.4493328070269688, + "learning_rate": 3.647121481926885e-05, + "loss": 0.9372, + "step": 7376 + }, + { + "epoch": 0.367930174563591, + "grad_norm": 0.44236447105062054, + "learning_rate": 3.6467626446481184e-05, + "loss": 0.9136, + "step": 7377 + }, + { + "epoch": 0.36798004987531174, + "grad_norm": 0.9169294793414627, + "learning_rate": 3.6464037774450934e-05, + "loss": 0.8541, + "step": 7378 + }, + { + "epoch": 0.3680299251870324, + "grad_norm": 0.48444611845032065, + "learning_rate": 3.6460448803271754e-05, + "loss": 0.899, + "step": 7379 + }, + { + "epoch": 0.3680798004987531, + "grad_norm": 0.44827347919615895, + "learning_rate": 3.645685953303729e-05, + "loss": 0.9119, + "step": 7380 + }, + { + "epoch": 0.3681296758104738, + "grad_norm": 0.5685509569265361, + "learning_rate": 3.645326996384121e-05, + "loss": 0.9231, + "step": 7381 + }, + { + "epoch": 0.3681795511221945, + "grad_norm": 0.39201480100158426, + "learning_rate": 3.644968009577717e-05, + "loss": 0.8941, + "step": 7382 + }, + { + "epoch": 0.3682294264339152, + "grad_norm": 0.5329378518510813, + "learning_rate": 3.644608992893885e-05, + "loss": 0.9229, + "step": 7383 + }, + { + "epoch": 0.3682793017456359, + "grad_norm": 1.1965272142592975, + "learning_rate": 3.644249946341993e-05, + "loss": 0.8974, + "step": 7384 + }, + { + "epoch": 0.3683291770573566, + "grad_norm": 0.43021599116481374, + "learning_rate": 3.6438908699314124e-05, + "loss": 0.9338, + "step": 7385 + }, + { + "epoch": 0.3683790523690773, + "grad_norm": 0.3881632931869718, + "learning_rate": 3.643531763671511e-05, + "loss": 0.8817, + "step": 7386 + }, + { + "epoch": 0.368428927680798, + "grad_norm": 0.38842303962825614, + "learning_rate": 3.6431726275716593e-05, + "loss": 0.9106, + "step": 7387 + }, + { + "epoch": 0.36847880299251873, + "grad_norm": 0.39817397017081047, + "learning_rate": 3.6428134616412294e-05, + "loss": 0.9277, + "step": 7388 + }, + { + "epoch": 0.3685286783042394, + "grad_norm": 0.4393780361883469, + "learning_rate": 3.6424542658895944e-05, + "loss": 0.9106, + "step": 7389 + }, + { + "epoch": 0.3685785536159601, + "grad_norm": 0.47555707850051665, + "learning_rate": 3.642095040326126e-05, + "loss": 0.895, + "step": 7390 + }, + { + "epoch": 0.3686284289276808, + "grad_norm": 0.37012270056348134, + "learning_rate": 3.6417357849601995e-05, + "loss": 0.8586, + "step": 7391 + }, + { + "epoch": 0.3686783042394015, + "grad_norm": 0.4275803291689794, + "learning_rate": 3.6413764998011876e-05, + "loss": 0.9186, + "step": 7392 + }, + { + "epoch": 0.3687281795511222, + "grad_norm": 0.47138876461890117, + "learning_rate": 3.641017184858468e-05, + "loss": 0.8749, + "step": 7393 + }, + { + "epoch": 0.3687780548628429, + "grad_norm": 0.4617094514304468, + "learning_rate": 3.640657840141414e-05, + "loss": 0.8868, + "step": 7394 + }, + { + "epoch": 0.3688279301745636, + "grad_norm": 0.6375097517681129, + "learning_rate": 3.6402984656594055e-05, + "loss": 0.9198, + "step": 7395 + }, + { + "epoch": 0.3688778054862843, + "grad_norm": 3.3054792467074936, + "learning_rate": 3.6399390614218185e-05, + "loss": 0.9156, + "step": 7396 + }, + { + "epoch": 0.368927680798005, + "grad_norm": 0.4477539279044634, + "learning_rate": 3.6395796274380315e-05, + "loss": 0.918, + "step": 7397 + }, + { + "epoch": 0.3689775561097257, + "grad_norm": 0.4010298592690996, + "learning_rate": 3.639220163717425e-05, + "loss": 0.8595, + "step": 7398 + }, + { + "epoch": 0.3690274314214464, + "grad_norm": 0.5644082018128729, + "learning_rate": 3.638860670269378e-05, + "loss": 0.8981, + "step": 7399 + }, + { + "epoch": 0.3690773067331671, + "grad_norm": 0.4964867337901055, + "learning_rate": 3.6385011471032706e-05, + "loss": 0.9175, + "step": 7400 + }, + { + "epoch": 0.36912718204488776, + "grad_norm": 0.42398174946239664, + "learning_rate": 3.6381415942284856e-05, + "loss": 0.9191, + "step": 7401 + }, + { + "epoch": 0.3691770573566085, + "grad_norm": 0.4440489225167097, + "learning_rate": 3.637782011654405e-05, + "loss": 0.9131, + "step": 7402 + }, + { + "epoch": 0.3692269326683292, + "grad_norm": 0.39865205417931326, + "learning_rate": 3.637422399390413e-05, + "loss": 0.9386, + "step": 7403 + }, + { + "epoch": 0.36927680798004986, + "grad_norm": 0.5002367822695319, + "learning_rate": 3.637062757445892e-05, + "loss": 0.8844, + "step": 7404 + }, + { + "epoch": 0.3693266832917706, + "grad_norm": 0.4893545036196562, + "learning_rate": 3.636703085830227e-05, + "loss": 0.9376, + "step": 7405 + }, + { + "epoch": 0.3693765586034913, + "grad_norm": 0.7345303278658143, + "learning_rate": 3.6363433845528036e-05, + "loss": 0.8863, + "step": 7406 + }, + { + "epoch": 0.36942643391521196, + "grad_norm": 0.448052191363129, + "learning_rate": 3.6359836536230086e-05, + "loss": 0.8913, + "step": 7407 + }, + { + "epoch": 0.36947630922693264, + "grad_norm": 0.5724082562579998, + "learning_rate": 3.635623893050227e-05, + "loss": 0.9097, + "step": 7408 + }, + { + "epoch": 0.3695261845386534, + "grad_norm": 0.4447915242927888, + "learning_rate": 3.635264102843851e-05, + "loss": 0.8933, + "step": 7409 + }, + { + "epoch": 0.36957605985037406, + "grad_norm": 0.4193989758545179, + "learning_rate": 3.634904283013264e-05, + "loss": 0.8938, + "step": 7410 + }, + { + "epoch": 0.36962593516209474, + "grad_norm": 0.5297315134642738, + "learning_rate": 3.634544433567858e-05, + "loss": 0.9017, + "step": 7411 + }, + { + "epoch": 0.3696758104738155, + "grad_norm": 0.4737797551664827, + "learning_rate": 3.634184554517024e-05, + "loss": 0.9103, + "step": 7412 + }, + { + "epoch": 0.36972568578553616, + "grad_norm": 0.4228445735377448, + "learning_rate": 3.63382464587015e-05, + "loss": 0.8975, + "step": 7413 + }, + { + "epoch": 0.36977556109725684, + "grad_norm": 0.42183588597872435, + "learning_rate": 3.6334647076366306e-05, + "loss": 0.8972, + "step": 7414 + }, + { + "epoch": 0.3698254364089776, + "grad_norm": 0.5766211274287004, + "learning_rate": 3.633104739825856e-05, + "loss": 0.9465, + "step": 7415 + }, + { + "epoch": 0.36987531172069826, + "grad_norm": 0.40598356423246634, + "learning_rate": 3.6327447424472214e-05, + "loss": 0.9218, + "step": 7416 + }, + { + "epoch": 0.36992518703241895, + "grad_norm": 0.39803736382383625, + "learning_rate": 3.632384715510119e-05, + "loss": 0.8708, + "step": 7417 + }, + { + "epoch": 0.3699750623441396, + "grad_norm": 0.4078951900618489, + "learning_rate": 3.6320246590239437e-05, + "loss": 0.9196, + "step": 7418 + }, + { + "epoch": 0.37002493765586036, + "grad_norm": 0.4472586265438602, + "learning_rate": 3.631664572998092e-05, + "loss": 0.9005, + "step": 7419 + }, + { + "epoch": 0.37007481296758105, + "grad_norm": 0.6485222095702519, + "learning_rate": 3.6313044574419604e-05, + "loss": 0.9179, + "step": 7420 + }, + { + "epoch": 0.37012468827930173, + "grad_norm": 0.4039340217134378, + "learning_rate": 3.630944312364944e-05, + "loss": 0.9185, + "step": 7421 + }, + { + "epoch": 0.37017456359102247, + "grad_norm": 0.39484183644077087, + "learning_rate": 3.6305841377764436e-05, + "loss": 0.9443, + "step": 7422 + }, + { + "epoch": 0.37022443890274315, + "grad_norm": 0.4038600248387727, + "learning_rate": 3.6302239336858545e-05, + "loss": 0.8938, + "step": 7423 + }, + { + "epoch": 0.37027431421446383, + "grad_norm": 0.3719735708097626, + "learning_rate": 3.6298637001025784e-05, + "loss": 0.9282, + "step": 7424 + }, + { + "epoch": 0.37032418952618457, + "grad_norm": 0.41185048952772707, + "learning_rate": 3.6295034370360145e-05, + "loss": 0.9052, + "step": 7425 + }, + { + "epoch": 0.37037406483790525, + "grad_norm": 0.537172511984436, + "learning_rate": 3.6291431444955644e-05, + "loss": 0.9199, + "step": 7426 + }, + { + "epoch": 0.37042394014962593, + "grad_norm": 0.3856340563865064, + "learning_rate": 3.628782822490629e-05, + "loss": 0.8936, + "step": 7427 + }, + { + "epoch": 0.3704738154613466, + "grad_norm": 0.5731021037569718, + "learning_rate": 3.628422471030611e-05, + "loss": 0.9353, + "step": 7428 + }, + { + "epoch": 0.37052369077306735, + "grad_norm": 0.40993713417853006, + "learning_rate": 3.628062090124913e-05, + "loss": 0.8658, + "step": 7429 + }, + { + "epoch": 0.37057356608478803, + "grad_norm": 0.48986193178598664, + "learning_rate": 3.62770167978294e-05, + "loss": 0.8729, + "step": 7430 + }, + { + "epoch": 0.3706234413965087, + "grad_norm": 0.5261264680380956, + "learning_rate": 3.627341240014096e-05, + "loss": 0.9524, + "step": 7431 + }, + { + "epoch": 0.37067331670822945, + "grad_norm": 0.41923719137190646, + "learning_rate": 3.626980770827788e-05, + "loss": 0.8915, + "step": 7432 + }, + { + "epoch": 0.37072319201995013, + "grad_norm": 0.4674558349269555, + "learning_rate": 3.626620272233421e-05, + "loss": 0.8875, + "step": 7433 + }, + { + "epoch": 0.3707730673316708, + "grad_norm": 0.7936050174820176, + "learning_rate": 3.626259744240402e-05, + "loss": 0.9086, + "step": 7434 + }, + { + "epoch": 0.3708229426433915, + "grad_norm": 0.4232235364082093, + "learning_rate": 3.625899186858138e-05, + "loss": 0.8833, + "step": 7435 + }, + { + "epoch": 0.37087281795511223, + "grad_norm": 0.7333575681954888, + "learning_rate": 3.62553860009604e-05, + "loss": 0.9362, + "step": 7436 + }, + { + "epoch": 0.3709226932668329, + "grad_norm": 0.49754187368704733, + "learning_rate": 3.625177983963515e-05, + "loss": 0.9687, + "step": 7437 + }, + { + "epoch": 0.3709725685785536, + "grad_norm": 0.39935710734256924, + "learning_rate": 3.624817338469975e-05, + "loss": 0.8779, + "step": 7438 + }, + { + "epoch": 0.37102244389027433, + "grad_norm": 0.463415557215784, + "learning_rate": 3.624456663624829e-05, + "loss": 0.9425, + "step": 7439 + }, + { + "epoch": 0.371072319201995, + "grad_norm": 0.5221539917312827, + "learning_rate": 3.62409595943749e-05, + "loss": 0.9451, + "step": 7440 + }, + { + "epoch": 0.3711221945137157, + "grad_norm": 0.6526988940634066, + "learning_rate": 3.62373522591737e-05, + "loss": 0.8903, + "step": 7441 + }, + { + "epoch": 0.37117206982543643, + "grad_norm": 0.47079434106381995, + "learning_rate": 3.623374463073883e-05, + "loss": 0.8687, + "step": 7442 + }, + { + "epoch": 0.3712219451371571, + "grad_norm": 0.4338291301121307, + "learning_rate": 3.6230136709164406e-05, + "loss": 0.9012, + "step": 7443 + }, + { + "epoch": 0.3712718204488778, + "grad_norm": 0.4356214920764799, + "learning_rate": 3.62265284945446e-05, + "loss": 0.8922, + "step": 7444 + }, + { + "epoch": 0.3713216957605985, + "grad_norm": 0.6139246498380277, + "learning_rate": 3.622291998697355e-05, + "loss": 0.8674, + "step": 7445 + }, + { + "epoch": 0.3713715710723192, + "grad_norm": 0.4917457045104076, + "learning_rate": 3.621931118654544e-05, + "loss": 0.8932, + "step": 7446 + }, + { + "epoch": 0.3714214463840399, + "grad_norm": 0.4516392409236676, + "learning_rate": 3.621570209335441e-05, + "loss": 0.9086, + "step": 7447 + }, + { + "epoch": 0.3714713216957606, + "grad_norm": 1.3406934635501857, + "learning_rate": 3.621209270749466e-05, + "loss": 0.9324, + "step": 7448 + }, + { + "epoch": 0.3715211970074813, + "grad_norm": 0.4982654655407972, + "learning_rate": 3.6208483029060377e-05, + "loss": 0.9141, + "step": 7449 + }, + { + "epoch": 0.371571072319202, + "grad_norm": 0.5139955347742402, + "learning_rate": 3.620487305814573e-05, + "loss": 0.9264, + "step": 7450 + }, + { + "epoch": 0.3716209476309227, + "grad_norm": 0.43316508332202863, + "learning_rate": 3.620126279484495e-05, + "loss": 0.8933, + "step": 7451 + }, + { + "epoch": 0.37167082294264336, + "grad_norm": 0.5053681003177614, + "learning_rate": 3.619765223925222e-05, + "loss": 0.9435, + "step": 7452 + }, + { + "epoch": 0.3717206982543641, + "grad_norm": 0.7987785723277118, + "learning_rate": 3.619404139146177e-05, + "loss": 0.9354, + "step": 7453 + }, + { + "epoch": 0.3717705735660848, + "grad_norm": 0.5545156246399846, + "learning_rate": 3.619043025156782e-05, + "loss": 0.9122, + "step": 7454 + }, + { + "epoch": 0.37182044887780547, + "grad_norm": 0.45721148419261315, + "learning_rate": 3.6186818819664594e-05, + "loss": 0.8906, + "step": 7455 + }, + { + "epoch": 0.3718703241895262, + "grad_norm": 0.4272196910587848, + "learning_rate": 3.618320709584635e-05, + "loss": 0.8924, + "step": 7456 + }, + { + "epoch": 0.3719201995012469, + "grad_norm": 0.4061194998166612, + "learning_rate": 3.617959508020731e-05, + "loss": 0.9202, + "step": 7457 + }, + { + "epoch": 0.37197007481296757, + "grad_norm": 0.47184139620261306, + "learning_rate": 3.617598277284175e-05, + "loss": 0.9075, + "step": 7458 + }, + { + "epoch": 0.3720199501246883, + "grad_norm": 0.4374356536990036, + "learning_rate": 3.6172370173843915e-05, + "loss": 0.9096, + "step": 7459 + }, + { + "epoch": 0.372069825436409, + "grad_norm": 0.4028268664063798, + "learning_rate": 3.616875728330808e-05, + "loss": 0.9319, + "step": 7460 + }, + { + "epoch": 0.37211970074812967, + "grad_norm": 0.47544021428285604, + "learning_rate": 3.6165144101328516e-05, + "loss": 0.9208, + "step": 7461 + }, + { + "epoch": 0.37216957605985035, + "grad_norm": 0.44325407886302914, + "learning_rate": 3.616153062799953e-05, + "loss": 0.9438, + "step": 7462 + }, + { + "epoch": 0.3722194513715711, + "grad_norm": 0.568070599903023, + "learning_rate": 3.615791686341539e-05, + "loss": 0.958, + "step": 7463 + }, + { + "epoch": 0.37226932668329177, + "grad_norm": 0.7217778141988321, + "learning_rate": 3.61543028076704e-05, + "loss": 0.9289, + "step": 7464 + }, + { + "epoch": 0.37231920199501245, + "grad_norm": 0.665978185131312, + "learning_rate": 3.615068846085887e-05, + "loss": 0.8876, + "step": 7465 + }, + { + "epoch": 0.3723690773067332, + "grad_norm": 0.5366040990702091, + "learning_rate": 3.614707382307511e-05, + "loss": 0.9153, + "step": 7466 + }, + { + "epoch": 0.37241895261845387, + "grad_norm": 0.6356109747939769, + "learning_rate": 3.6143458894413465e-05, + "loss": 0.9652, + "step": 7467 + }, + { + "epoch": 0.37246882793017455, + "grad_norm": 0.47307947742543033, + "learning_rate": 3.613984367496823e-05, + "loss": 0.9455, + "step": 7468 + }, + { + "epoch": 0.3725187032418953, + "grad_norm": 0.44822699401105615, + "learning_rate": 3.613622816483377e-05, + "loss": 0.9695, + "step": 7469 + }, + { + "epoch": 0.37256857855361597, + "grad_norm": 0.6828905072805455, + "learning_rate": 3.613261236410441e-05, + "loss": 0.9264, + "step": 7470 + }, + { + "epoch": 0.37261845386533665, + "grad_norm": 0.4553647216886043, + "learning_rate": 3.612899627287452e-05, + "loss": 0.8947, + "step": 7471 + }, + { + "epoch": 0.37266832917705733, + "grad_norm": 0.7626832110494737, + "learning_rate": 3.612537989123845e-05, + "loss": 0.8963, + "step": 7472 + }, + { + "epoch": 0.37271820448877807, + "grad_norm": 0.5752846195618073, + "learning_rate": 3.612176321929058e-05, + "loss": 0.9294, + "step": 7473 + }, + { + "epoch": 0.37276807980049875, + "grad_norm": 0.44300419323305285, + "learning_rate": 3.611814625712526e-05, + "loss": 0.9299, + "step": 7474 + }, + { + "epoch": 0.37281795511221943, + "grad_norm": 0.4368208912587628, + "learning_rate": 3.6114529004836904e-05, + "loss": 0.8875, + "step": 7475 + }, + { + "epoch": 0.37286783042394017, + "grad_norm": 0.5772407120864667, + "learning_rate": 3.611091146251988e-05, + "loss": 0.9, + "step": 7476 + }, + { + "epoch": 0.37291770573566085, + "grad_norm": 0.5012628059443088, + "learning_rate": 3.61072936302686e-05, + "loss": 0.9696, + "step": 7477 + }, + { + "epoch": 0.37296758104738154, + "grad_norm": 1.7741898082236018, + "learning_rate": 3.610367550817746e-05, + "loss": 0.886, + "step": 7478 + }, + { + "epoch": 0.3730174563591022, + "grad_norm": 0.4701879797504189, + "learning_rate": 3.6100057096340876e-05, + "loss": 0.9387, + "step": 7479 + }, + { + "epoch": 0.37306733167082295, + "grad_norm": 0.4619969370601642, + "learning_rate": 3.6096438394853264e-05, + "loss": 0.9689, + "step": 7480 + }, + { + "epoch": 0.37311720698254364, + "grad_norm": 0.41231915145836373, + "learning_rate": 3.609281940380907e-05, + "loss": 0.9064, + "step": 7481 + }, + { + "epoch": 0.3731670822942643, + "grad_norm": 0.42517389224915864, + "learning_rate": 3.6089200123302715e-05, + "loss": 0.9197, + "step": 7482 + }, + { + "epoch": 0.37321695760598506, + "grad_norm": 0.43705503556961783, + "learning_rate": 3.6085580553428645e-05, + "loss": 0.9568, + "step": 7483 + }, + { + "epoch": 0.37326683291770574, + "grad_norm": 0.44553710350135955, + "learning_rate": 3.608196069428131e-05, + "loss": 0.8879, + "step": 7484 + }, + { + "epoch": 0.3733167082294264, + "grad_norm": 0.5285228942591691, + "learning_rate": 3.607834054595517e-05, + "loss": 0.8875, + "step": 7485 + }, + { + "epoch": 0.37336658354114716, + "grad_norm": 0.3721082092491375, + "learning_rate": 3.6074720108544694e-05, + "loss": 0.8883, + "step": 7486 + }, + { + "epoch": 0.37341645885286784, + "grad_norm": 0.3771832159592455, + "learning_rate": 3.607109938214435e-05, + "loss": 0.9318, + "step": 7487 + }, + { + "epoch": 0.3734663341645885, + "grad_norm": 0.5116564934601157, + "learning_rate": 3.606747836684862e-05, + "loss": 0.9124, + "step": 7488 + }, + { + "epoch": 0.3735162094763092, + "grad_norm": 0.41491717551102314, + "learning_rate": 3.6063857062751997e-05, + "loss": 0.8767, + "step": 7489 + }, + { + "epoch": 0.37356608478802994, + "grad_norm": 0.4025819382005369, + "learning_rate": 3.606023546994898e-05, + "loss": 0.9021, + "step": 7490 + }, + { + "epoch": 0.3736159600997506, + "grad_norm": 0.40869191536828076, + "learning_rate": 3.605661358853407e-05, + "loss": 0.9162, + "step": 7491 + }, + { + "epoch": 0.3736658354114713, + "grad_norm": 0.4011891147761566, + "learning_rate": 3.6052991418601776e-05, + "loss": 0.9092, + "step": 7492 + }, + { + "epoch": 0.37371571072319204, + "grad_norm": 0.4201743330708467, + "learning_rate": 3.604936896024661e-05, + "loss": 0.9257, + "step": 7493 + }, + { + "epoch": 0.3737655860349127, + "grad_norm": 0.6103952211788299, + "learning_rate": 3.604574621356311e-05, + "loss": 0.891, + "step": 7494 + }, + { + "epoch": 0.3738154613466334, + "grad_norm": 0.424716432648776, + "learning_rate": 3.60421231786458e-05, + "loss": 0.9307, + "step": 7495 + }, + { + "epoch": 0.37386533665835414, + "grad_norm": 0.4743161165511837, + "learning_rate": 3.6038499855589235e-05, + "loss": 0.9173, + "step": 7496 + }, + { + "epoch": 0.3739152119700748, + "grad_norm": 2.910427470339495, + "learning_rate": 3.603487624448796e-05, + "loss": 0.9096, + "step": 7497 + }, + { + "epoch": 0.3739650872817955, + "grad_norm": 0.5434238008130683, + "learning_rate": 3.603125234543652e-05, + "loss": 0.8996, + "step": 7498 + }, + { + "epoch": 0.3740149625935162, + "grad_norm": 0.9344230215021508, + "learning_rate": 3.602762815852949e-05, + "loss": 0.8873, + "step": 7499 + }, + { + "epoch": 0.3740648379052369, + "grad_norm": 0.6159848627417781, + "learning_rate": 3.602400368386144e-05, + "loss": 0.9079, + "step": 7500 + }, + { + "epoch": 0.3741147132169576, + "grad_norm": 0.49444274236198316, + "learning_rate": 3.6020378921526945e-05, + "loss": 0.8884, + "step": 7501 + }, + { + "epoch": 0.3741645885286783, + "grad_norm": 0.4779948142878575, + "learning_rate": 3.6016753871620594e-05, + "loss": 0.8876, + "step": 7502 + }, + { + "epoch": 0.374214463840399, + "grad_norm": 0.5322274160612634, + "learning_rate": 3.601312853423698e-05, + "loss": 0.8912, + "step": 7503 + }, + { + "epoch": 0.3742643391521197, + "grad_norm": 0.41163743518919294, + "learning_rate": 3.600950290947071e-05, + "loss": 0.9061, + "step": 7504 + }, + { + "epoch": 0.3743142144638404, + "grad_norm": 0.4703932160700142, + "learning_rate": 3.600587699741639e-05, + "loss": 0.9061, + "step": 7505 + }, + { + "epoch": 0.37436408977556107, + "grad_norm": 0.4891772616587531, + "learning_rate": 3.600225079816863e-05, + "loss": 0.9261, + "step": 7506 + }, + { + "epoch": 0.3744139650872818, + "grad_norm": 0.5558069680061939, + "learning_rate": 3.599862431182206e-05, + "loss": 0.9326, + "step": 7507 + }, + { + "epoch": 0.3744638403990025, + "grad_norm": 0.539699541940608, + "learning_rate": 3.5994997538471315e-05, + "loss": 0.954, + "step": 7508 + }, + { + "epoch": 0.37451371571072317, + "grad_norm": 0.4101292739853346, + "learning_rate": 3.5991370478211024e-05, + "loss": 0.8921, + "step": 7509 + }, + { + "epoch": 0.3745635910224439, + "grad_norm": 0.42233374706652543, + "learning_rate": 3.5987743131135844e-05, + "loss": 0.893, + "step": 7510 + }, + { + "epoch": 0.3746134663341646, + "grad_norm": 0.5153971337237688, + "learning_rate": 3.598411549734042e-05, + "loss": 0.9473, + "step": 7511 + }, + { + "epoch": 0.3746633416458853, + "grad_norm": 0.4205127737776349, + "learning_rate": 3.598048757691942e-05, + "loss": 0.9387, + "step": 7512 + }, + { + "epoch": 0.374713216957606, + "grad_norm": 0.4947047894729554, + "learning_rate": 3.597685936996752e-05, + "loss": 0.9402, + "step": 7513 + }, + { + "epoch": 0.3747630922693267, + "grad_norm": 0.7906642183109509, + "learning_rate": 3.597323087657938e-05, + "loss": 0.9049, + "step": 7514 + }, + { + "epoch": 0.3748129675810474, + "grad_norm": 0.5057946201785174, + "learning_rate": 3.596960209684969e-05, + "loss": 0.9507, + "step": 7515 + }, + { + "epoch": 0.37486284289276806, + "grad_norm": 0.5041346032611386, + "learning_rate": 3.596597303087314e-05, + "loss": 0.9099, + "step": 7516 + }, + { + "epoch": 0.3749127182044888, + "grad_norm": 0.49572378535733, + "learning_rate": 3.596234367874444e-05, + "loss": 0.8773, + "step": 7517 + }, + { + "epoch": 0.3749625935162095, + "grad_norm": 0.6717016457644571, + "learning_rate": 3.595871404055828e-05, + "loss": 0.9116, + "step": 7518 + }, + { + "epoch": 0.37501246882793016, + "grad_norm": 0.48209357847741713, + "learning_rate": 3.5955084116409385e-05, + "loss": 0.9517, + "step": 7519 + }, + { + "epoch": 0.3750623441396509, + "grad_norm": 0.47564834493031644, + "learning_rate": 3.595145390639247e-05, + "loss": 0.8734, + "step": 7520 + }, + { + "epoch": 0.3751122194513716, + "grad_norm": 0.4753994412720646, + "learning_rate": 3.594782341060227e-05, + "loss": 0.9179, + "step": 7521 + }, + { + "epoch": 0.37516209476309226, + "grad_norm": 0.402762206704835, + "learning_rate": 3.594419262913351e-05, + "loss": 0.9302, + "step": 7522 + }, + { + "epoch": 0.375211970074813, + "grad_norm": 0.3777986238029225, + "learning_rate": 3.594056156208095e-05, + "loss": 0.8969, + "step": 7523 + }, + { + "epoch": 0.3752618453865337, + "grad_norm": 0.5028793403644415, + "learning_rate": 3.593693020953933e-05, + "loss": 0.9068, + "step": 7524 + }, + { + "epoch": 0.37531172069825436, + "grad_norm": 0.534693294114266, + "learning_rate": 3.593329857160341e-05, + "loss": 0.8892, + "step": 7525 + }, + { + "epoch": 0.37536159600997504, + "grad_norm": 0.5129432853613158, + "learning_rate": 3.592966664836796e-05, + "loss": 0.8888, + "step": 7526 + }, + { + "epoch": 0.3754114713216958, + "grad_norm": 0.4925711121053064, + "learning_rate": 3.5926034439927745e-05, + "loss": 0.9168, + "step": 7527 + }, + { + "epoch": 0.37546134663341646, + "grad_norm": 0.9029363654848138, + "learning_rate": 3.592240194637756e-05, + "loss": 0.9179, + "step": 7528 + }, + { + "epoch": 0.37551122194513714, + "grad_norm": 0.46437510206141563, + "learning_rate": 3.5918769167812176e-05, + "loss": 0.928, + "step": 7529 + }, + { + "epoch": 0.3755610972568579, + "grad_norm": 0.487218694651295, + "learning_rate": 3.591513610432641e-05, + "loss": 0.8855, + "step": 7530 + }, + { + "epoch": 0.37561097256857856, + "grad_norm": 0.4058517597923856, + "learning_rate": 3.591150275601504e-05, + "loss": 0.9099, + "step": 7531 + }, + { + "epoch": 0.37566084788029924, + "grad_norm": 0.5620612217856171, + "learning_rate": 3.5907869122972884e-05, + "loss": 0.8905, + "step": 7532 + }, + { + "epoch": 0.3757107231920199, + "grad_norm": 0.6400919672929599, + "learning_rate": 3.5904235205294776e-05, + "loss": 0.9125, + "step": 7533 + }, + { + "epoch": 0.37576059850374066, + "grad_norm": 0.3867574463707369, + "learning_rate": 3.590060100307553e-05, + "loss": 0.8998, + "step": 7534 + }, + { + "epoch": 0.37581047381546134, + "grad_norm": 0.4598755959409322, + "learning_rate": 3.5896966516409966e-05, + "loss": 0.8988, + "step": 7535 + }, + { + "epoch": 0.375860349127182, + "grad_norm": 0.45682368407430324, + "learning_rate": 3.589333174539295e-05, + "loss": 0.867, + "step": 7536 + }, + { + "epoch": 0.37591022443890276, + "grad_norm": 0.4273395902366483, + "learning_rate": 3.588969669011932e-05, + "loss": 0.882, + "step": 7537 + }, + { + "epoch": 0.37596009975062344, + "grad_norm": 0.4760071446961548, + "learning_rate": 3.588606135068392e-05, + "loss": 0.9102, + "step": 7538 + }, + { + "epoch": 0.3760099750623441, + "grad_norm": 0.38652620875055, + "learning_rate": 3.588242572718162e-05, + "loss": 0.8989, + "step": 7539 + }, + { + "epoch": 0.37605985037406486, + "grad_norm": 0.4495698394573406, + "learning_rate": 3.58787898197073e-05, + "loss": 0.9495, + "step": 7540 + }, + { + "epoch": 0.37610972568578555, + "grad_norm": 0.5298707447404993, + "learning_rate": 3.587515362835582e-05, + "loss": 0.9281, + "step": 7541 + }, + { + "epoch": 0.3761596009975062, + "grad_norm": 0.5116780170688209, + "learning_rate": 3.587151715322208e-05, + "loss": 0.9258, + "step": 7542 + }, + { + "epoch": 0.3762094763092269, + "grad_norm": 0.4506311405853, + "learning_rate": 3.586788039440096e-05, + "loss": 0.9049, + "step": 7543 + }, + { + "epoch": 0.37625935162094765, + "grad_norm": 0.42637880561450975, + "learning_rate": 3.5864243351987366e-05, + "loss": 0.9139, + "step": 7544 + }, + { + "epoch": 0.37630922693266833, + "grad_norm": 0.4826403042713888, + "learning_rate": 3.5860606026076204e-05, + "loss": 0.8998, + "step": 7545 + }, + { + "epoch": 0.376359102244389, + "grad_norm": 0.5731505228087787, + "learning_rate": 3.585696841676239e-05, + "loss": 0.8924, + "step": 7546 + }, + { + "epoch": 0.37640897755610975, + "grad_norm": 0.4104540521778151, + "learning_rate": 3.5853330524140843e-05, + "loss": 0.8537, + "step": 7547 + }, + { + "epoch": 0.37645885286783043, + "grad_norm": 0.5437885102833124, + "learning_rate": 3.5849692348306495e-05, + "loss": 0.9514, + "step": 7548 + }, + { + "epoch": 0.3765087281795511, + "grad_norm": 2.7626923406041017, + "learning_rate": 3.584605388935428e-05, + "loss": 0.917, + "step": 7549 + }, + { + "epoch": 0.3765586034912718, + "grad_norm": 0.5016365542030262, + "learning_rate": 3.584241514737914e-05, + "loss": 0.9103, + "step": 7550 + }, + { + "epoch": 0.37660847880299253, + "grad_norm": 0.4694629183506955, + "learning_rate": 3.583877612247604e-05, + "loss": 0.9158, + "step": 7551 + }, + { + "epoch": 0.3766583541147132, + "grad_norm": 0.4809442328464469, + "learning_rate": 3.5835136814739915e-05, + "loss": 0.9148, + "step": 7552 + }, + { + "epoch": 0.3767082294264339, + "grad_norm": 0.5258998973337164, + "learning_rate": 3.5831497224265745e-05, + "loss": 0.8802, + "step": 7553 + }, + { + "epoch": 0.37675810473815463, + "grad_norm": 0.47633390687411953, + "learning_rate": 3.582785735114851e-05, + "loss": 0.9275, + "step": 7554 + }, + { + "epoch": 0.3768079800498753, + "grad_norm": 0.5496664924916101, + "learning_rate": 3.582421719548318e-05, + "loss": 0.9125, + "step": 7555 + }, + { + "epoch": 0.376857855361596, + "grad_norm": 0.4653617257082065, + "learning_rate": 3.582057675736474e-05, + "loss": 0.9211, + "step": 7556 + }, + { + "epoch": 0.37690773067331673, + "grad_norm": 1.8598499494440592, + "learning_rate": 3.58169360368882e-05, + "loss": 0.9322, + "step": 7557 + }, + { + "epoch": 0.3769576059850374, + "grad_norm": 0.42656645414677824, + "learning_rate": 3.581329503414855e-05, + "loss": 0.9128, + "step": 7558 + }, + { + "epoch": 0.3770074812967581, + "grad_norm": 0.4688577289798579, + "learning_rate": 3.580965374924081e-05, + "loss": 0.9193, + "step": 7559 + }, + { + "epoch": 0.3770573566084788, + "grad_norm": 0.41609525846915035, + "learning_rate": 3.580601218226e-05, + "loss": 0.887, + "step": 7560 + }, + { + "epoch": 0.3771072319201995, + "grad_norm": 0.4120250395941616, + "learning_rate": 3.580237033330112e-05, + "loss": 0.9267, + "step": 7561 + }, + { + "epoch": 0.3771571072319202, + "grad_norm": 0.8032223279355867, + "learning_rate": 3.5798728202459235e-05, + "loss": 0.8686, + "step": 7562 + }, + { + "epoch": 0.3772069825436409, + "grad_norm": 0.419291684834441, + "learning_rate": 3.579508578982936e-05, + "loss": 0.9288, + "step": 7563 + }, + { + "epoch": 0.3772568578553616, + "grad_norm": 0.4533722251582927, + "learning_rate": 3.5791443095506565e-05, + "loss": 0.8939, + "step": 7564 + }, + { + "epoch": 0.3773067331670823, + "grad_norm": 0.45415101562901167, + "learning_rate": 3.578780011958589e-05, + "loss": 0.8822, + "step": 7565 + }, + { + "epoch": 0.377356608478803, + "grad_norm": 0.5459984144425434, + "learning_rate": 3.578415686216239e-05, + "loss": 0.9357, + "step": 7566 + }, + { + "epoch": 0.3774064837905237, + "grad_norm": 0.43605383797839903, + "learning_rate": 3.578051332333115e-05, + "loss": 0.8583, + "step": 7567 + }, + { + "epoch": 0.3774563591022444, + "grad_norm": 0.47330176223197135, + "learning_rate": 3.5776869503187235e-05, + "loss": 0.9244, + "step": 7568 + }, + { + "epoch": 0.3775062344139651, + "grad_norm": 0.5096131568937176, + "learning_rate": 3.5773225401825735e-05, + "loss": 0.9035, + "step": 7569 + }, + { + "epoch": 0.37755610972568576, + "grad_norm": 0.4767941487136807, + "learning_rate": 3.576958101934174e-05, + "loss": 0.9156, + "step": 7570 + }, + { + "epoch": 0.3776059850374065, + "grad_norm": 0.440342638357943, + "learning_rate": 3.5765936355830356e-05, + "loss": 0.8794, + "step": 7571 + }, + { + "epoch": 0.3776558603491272, + "grad_norm": 0.4889561367977096, + "learning_rate": 3.576229141138667e-05, + "loss": 0.9108, + "step": 7572 + }, + { + "epoch": 0.37770573566084786, + "grad_norm": 0.40607336164229063, + "learning_rate": 3.575864618610581e-05, + "loss": 0.8991, + "step": 7573 + }, + { + "epoch": 0.3777556109725686, + "grad_norm": 0.43837386219162383, + "learning_rate": 3.575500068008289e-05, + "loss": 0.9501, + "step": 7574 + }, + { + "epoch": 0.3778054862842893, + "grad_norm": 0.5743480195272401, + "learning_rate": 3.575135489341304e-05, + "loss": 0.8478, + "step": 7575 + }, + { + "epoch": 0.37785536159600996, + "grad_norm": 0.5252385886634247, + "learning_rate": 3.57477088261914e-05, + "loss": 0.9038, + "step": 7576 + }, + { + "epoch": 0.37790523690773065, + "grad_norm": 0.41852778461502216, + "learning_rate": 3.574406247851311e-05, + "loss": 0.8964, + "step": 7577 + }, + { + "epoch": 0.3779551122194514, + "grad_norm": 0.4359343762009539, + "learning_rate": 3.5740415850473306e-05, + "loss": 0.9316, + "step": 7578 + }, + { + "epoch": 0.37800498753117207, + "grad_norm": 0.41534894621833895, + "learning_rate": 3.5736768942167174e-05, + "loss": 0.9, + "step": 7579 + }, + { + "epoch": 0.37805486284289275, + "grad_norm": 0.6313264327850632, + "learning_rate": 3.5733121753689845e-05, + "loss": 0.8717, + "step": 7580 + }, + { + "epoch": 0.3781047381546135, + "grad_norm": 0.4419862726742067, + "learning_rate": 3.572947428513652e-05, + "loss": 0.9611, + "step": 7581 + }, + { + "epoch": 0.37815461346633417, + "grad_norm": 0.6676908603704313, + "learning_rate": 3.572582653660236e-05, + "loss": 0.8554, + "step": 7582 + }, + { + "epoch": 0.37820448877805485, + "grad_norm": 0.4130006496559805, + "learning_rate": 3.5722178508182555e-05, + "loss": 0.9118, + "step": 7583 + }, + { + "epoch": 0.3782543640897756, + "grad_norm": 0.4633993151520197, + "learning_rate": 3.571853019997231e-05, + "loss": 0.9212, + "step": 7584 + }, + { + "epoch": 0.37830423940149627, + "grad_norm": 0.501594830446081, + "learning_rate": 3.571488161206681e-05, + "loss": 0.8882, + "step": 7585 + }, + { + "epoch": 0.37835411471321695, + "grad_norm": 0.5531482297614051, + "learning_rate": 3.5711232744561265e-05, + "loss": 0.9039, + "step": 7586 + }, + { + "epoch": 0.37840399002493763, + "grad_norm": 0.49822404316447166, + "learning_rate": 3.5707583597550896e-05, + "loss": 0.9239, + "step": 7587 + }, + { + "epoch": 0.37845386533665837, + "grad_norm": 2.7981374593601966, + "learning_rate": 3.570393417113094e-05, + "loss": 0.9144, + "step": 7588 + }, + { + "epoch": 0.37850374064837905, + "grad_norm": 0.47793234211729585, + "learning_rate": 3.57002844653966e-05, + "loss": 0.8879, + "step": 7589 + }, + { + "epoch": 0.37855361596009973, + "grad_norm": 1.0264492005645294, + "learning_rate": 3.5696634480443125e-05, + "loss": 0.8987, + "step": 7590 + }, + { + "epoch": 0.37860349127182047, + "grad_norm": 0.9108342523148293, + "learning_rate": 3.5692984216365766e-05, + "loss": 0.8765, + "step": 7591 + }, + { + "epoch": 0.37865336658354115, + "grad_norm": 0.6370726247920292, + "learning_rate": 3.568933367325976e-05, + "loss": 0.923, + "step": 7592 + }, + { + "epoch": 0.37870324189526183, + "grad_norm": 0.4789422470774794, + "learning_rate": 3.568568285122038e-05, + "loss": 0.9032, + "step": 7593 + }, + { + "epoch": 0.37875311720698257, + "grad_norm": 0.4393910639768705, + "learning_rate": 3.568203175034289e-05, + "loss": 0.8956, + "step": 7594 + }, + { + "epoch": 0.37880299251870325, + "grad_norm": 0.5679761941176145, + "learning_rate": 3.567838037072256e-05, + "loss": 0.935, + "step": 7595 + }, + { + "epoch": 0.37885286783042393, + "grad_norm": 0.672221276605373, + "learning_rate": 3.567472871245468e-05, + "loss": 0.9151, + "step": 7596 + }, + { + "epoch": 0.3789027431421446, + "grad_norm": 0.43702704190969266, + "learning_rate": 3.567107677563453e-05, + "loss": 0.8754, + "step": 7597 + }, + { + "epoch": 0.37895261845386535, + "grad_norm": 0.4663691967952516, + "learning_rate": 3.56674245603574e-05, + "loss": 0.9215, + "step": 7598 + }, + { + "epoch": 0.37900249376558603, + "grad_norm": 0.5302383658208152, + "learning_rate": 3.56637720667186e-05, + "loss": 0.8614, + "step": 7599 + }, + { + "epoch": 0.3790523690773067, + "grad_norm": 0.4415215597443179, + "learning_rate": 3.5660119294813456e-05, + "loss": 0.9109, + "step": 7600 + }, + { + "epoch": 0.37910224438902745, + "grad_norm": 0.6299138288992245, + "learning_rate": 3.5656466244737255e-05, + "loss": 0.9129, + "step": 7601 + }, + { + "epoch": 0.37915211970074814, + "grad_norm": 0.7225056605214869, + "learning_rate": 3.565281291658534e-05, + "loss": 0.8518, + "step": 7602 + }, + { + "epoch": 0.3792019950124688, + "grad_norm": 0.5005090859497777, + "learning_rate": 3.564915931045304e-05, + "loss": 0.9287, + "step": 7603 + }, + { + "epoch": 0.3792518703241895, + "grad_norm": 0.3979515371115753, + "learning_rate": 3.564550542643569e-05, + "loss": 0.8823, + "step": 7604 + }, + { + "epoch": 0.37930174563591024, + "grad_norm": 0.470454076311759, + "learning_rate": 3.5641851264628645e-05, + "loss": 0.9466, + "step": 7605 + }, + { + "epoch": 0.3793516209476309, + "grad_norm": 0.537610406260188, + "learning_rate": 3.5638196825127255e-05, + "loss": 0.8865, + "step": 7606 + }, + { + "epoch": 0.3794014962593516, + "grad_norm": 0.4919636722531776, + "learning_rate": 3.5634542108026876e-05, + "loss": 0.9023, + "step": 7607 + }, + { + "epoch": 0.37945137157107234, + "grad_norm": 0.5428685567700571, + "learning_rate": 3.5630887113422884e-05, + "loss": 0.9085, + "step": 7608 + }, + { + "epoch": 0.379501246882793, + "grad_norm": 0.5012123751077305, + "learning_rate": 3.562723184141065e-05, + "loss": 0.9452, + "step": 7609 + }, + { + "epoch": 0.3795511221945137, + "grad_norm": 0.4804297004282459, + "learning_rate": 3.562357629208555e-05, + "loss": 0.9308, + "step": 7610 + }, + { + "epoch": 0.37960099750623444, + "grad_norm": 0.4715417476028287, + "learning_rate": 3.5619920465542985e-05, + "loss": 0.9054, + "step": 7611 + }, + { + "epoch": 0.3796508728179551, + "grad_norm": 0.41194277518427297, + "learning_rate": 3.5616264361878355e-05, + "loss": 0.8744, + "step": 7612 + }, + { + "epoch": 0.3797007481296758, + "grad_norm": 0.39250862244893076, + "learning_rate": 3.5612607981187046e-05, + "loss": 0.888, + "step": 7613 + }, + { + "epoch": 0.3797506234413965, + "grad_norm": 0.3552218661412801, + "learning_rate": 3.560895132356449e-05, + "loss": 0.8675, + "step": 7614 + }, + { + "epoch": 0.3798004987531172, + "grad_norm": 0.5194477655670562, + "learning_rate": 3.560529438910609e-05, + "loss": 0.9106, + "step": 7615 + }, + { + "epoch": 0.3798503740648379, + "grad_norm": 0.5098434131216896, + "learning_rate": 3.5601637177907286e-05, + "loss": 0.9142, + "step": 7616 + }, + { + "epoch": 0.3799002493765586, + "grad_norm": 0.4148321308987668, + "learning_rate": 3.55979796900635e-05, + "loss": 0.9144, + "step": 7617 + }, + { + "epoch": 0.3799501246882793, + "grad_norm": 0.40982908469085594, + "learning_rate": 3.559432192567018e-05, + "loss": 0.8792, + "step": 7618 + }, + { + "epoch": 0.38, + "grad_norm": 0.891350177554024, + "learning_rate": 3.559066388482277e-05, + "loss": 0.9184, + "step": 7619 + }, + { + "epoch": 0.3800498753117207, + "grad_norm": 0.6133318959537465, + "learning_rate": 3.5587005567616724e-05, + "loss": 0.8651, + "step": 7620 + }, + { + "epoch": 0.3800997506234414, + "grad_norm": 0.5458290118850194, + "learning_rate": 3.5583346974147515e-05, + "loss": 0.9029, + "step": 7621 + }, + { + "epoch": 0.3801496259351621, + "grad_norm": 0.48117187909354214, + "learning_rate": 3.5579688104510586e-05, + "loss": 0.9324, + "step": 7622 + }, + { + "epoch": 0.3801995012468828, + "grad_norm": 0.4513962517365845, + "learning_rate": 3.557602895880145e-05, + "loss": 0.8817, + "step": 7623 + }, + { + "epoch": 0.38024937655860347, + "grad_norm": 0.5276962992038838, + "learning_rate": 3.5572369537115566e-05, + "loss": 0.895, + "step": 7624 + }, + { + "epoch": 0.3802992518703242, + "grad_norm": 0.9368604932141895, + "learning_rate": 3.556870983954842e-05, + "loss": 0.9115, + "step": 7625 + }, + { + "epoch": 0.3803491271820449, + "grad_norm": 0.4668297400440289, + "learning_rate": 3.556504986619553e-05, + "loss": 0.9079, + "step": 7626 + }, + { + "epoch": 0.38039900249376557, + "grad_norm": 0.6177155154163625, + "learning_rate": 3.556138961715239e-05, + "loss": 0.9065, + "step": 7627 + }, + { + "epoch": 0.3804488778054863, + "grad_norm": 0.4075050972245948, + "learning_rate": 3.555772909251452e-05, + "loss": 0.9258, + "step": 7628 + }, + { + "epoch": 0.380498753117207, + "grad_norm": 0.5301343902685866, + "learning_rate": 3.555406829237742e-05, + "loss": 0.9041, + "step": 7629 + }, + { + "epoch": 0.38054862842892767, + "grad_norm": 0.5718614595951781, + "learning_rate": 3.555040721683664e-05, + "loss": 0.9048, + "step": 7630 + }, + { + "epoch": 0.38059850374064835, + "grad_norm": 0.4805861469637541, + "learning_rate": 3.554674586598771e-05, + "loss": 0.9351, + "step": 7631 + }, + { + "epoch": 0.3806483790523691, + "grad_norm": 0.5962725692720298, + "learning_rate": 3.554308423992616e-05, + "loss": 0.9009, + "step": 7632 + }, + { + "epoch": 0.38069825436408977, + "grad_norm": 0.446087645357582, + "learning_rate": 3.5539422338747545e-05, + "loss": 0.9495, + "step": 7633 + }, + { + "epoch": 0.38074812967581045, + "grad_norm": 0.5103211829607648, + "learning_rate": 3.553576016254743e-05, + "loss": 0.9098, + "step": 7634 + }, + { + "epoch": 0.3807980049875312, + "grad_norm": 0.48396235355266204, + "learning_rate": 3.553209771142135e-05, + "loss": 0.9177, + "step": 7635 + }, + { + "epoch": 0.3808478802992519, + "grad_norm": 0.45751969455114866, + "learning_rate": 3.5528434985464914e-05, + "loss": 0.9416, + "step": 7636 + }, + { + "epoch": 0.38089775561097255, + "grad_norm": 0.7206621642561889, + "learning_rate": 3.5524771984773674e-05, + "loss": 0.9246, + "step": 7637 + }, + { + "epoch": 0.3809476309226933, + "grad_norm": 2.002886325259425, + "learning_rate": 3.5521108709443216e-05, + "loss": 0.912, + "step": 7638 + }, + { + "epoch": 0.380997506234414, + "grad_norm": 0.5011466602187413, + "learning_rate": 3.551744515956914e-05, + "loss": 0.9155, + "step": 7639 + }, + { + "epoch": 0.38104738154613466, + "grad_norm": 0.49516495725397364, + "learning_rate": 3.551378133524703e-05, + "loss": 0.9596, + "step": 7640 + }, + { + "epoch": 0.38109725685785534, + "grad_norm": 0.4802196923211094, + "learning_rate": 3.5510117236572505e-05, + "loss": 0.895, + "step": 7641 + }, + { + "epoch": 0.3811471321695761, + "grad_norm": 0.7510144975288583, + "learning_rate": 3.550645286364117e-05, + "loss": 0.8775, + "step": 7642 + }, + { + "epoch": 0.38119700748129676, + "grad_norm": 0.5773007148002037, + "learning_rate": 3.550278821654866e-05, + "loss": 0.9106, + "step": 7643 + }, + { + "epoch": 0.38124688279301744, + "grad_norm": 0.5458700706381479, + "learning_rate": 3.549912329539058e-05, + "loss": 0.8818, + "step": 7644 + }, + { + "epoch": 0.3812967581047382, + "grad_norm": 0.7937393985315788, + "learning_rate": 3.549545810026259e-05, + "loss": 0.9449, + "step": 7645 + }, + { + "epoch": 0.38134663341645886, + "grad_norm": 1.8193765143843743, + "learning_rate": 3.5491792631260306e-05, + "loss": 0.9093, + "step": 7646 + }, + { + "epoch": 0.38139650872817954, + "grad_norm": 0.4005193800898788, + "learning_rate": 3.54881268884794e-05, + "loss": 0.9288, + "step": 7647 + }, + { + "epoch": 0.3814463840399003, + "grad_norm": 0.5481471811844904, + "learning_rate": 3.548446087201551e-05, + "loss": 0.9352, + "step": 7648 + }, + { + "epoch": 0.38149625935162096, + "grad_norm": 0.46228689022562536, + "learning_rate": 3.5480794581964304e-05, + "loss": 0.923, + "step": 7649 + }, + { + "epoch": 0.38154613466334164, + "grad_norm": 1.010730202045148, + "learning_rate": 3.547712801842146e-05, + "loss": 0.8485, + "step": 7650 + }, + { + "epoch": 0.3815960099750623, + "grad_norm": 0.5063624675754211, + "learning_rate": 3.547346118148264e-05, + "loss": 0.9297, + "step": 7651 + }, + { + "epoch": 0.38164588528678306, + "grad_norm": 0.42905357015764767, + "learning_rate": 3.546979407124354e-05, + "loss": 0.9056, + "step": 7652 + }, + { + "epoch": 0.38169576059850374, + "grad_norm": 0.5933591737316405, + "learning_rate": 3.546612668779986e-05, + "loss": 0.9209, + "step": 7653 + }, + { + "epoch": 0.3817456359102244, + "grad_norm": 0.47868009419291935, + "learning_rate": 3.546245903124727e-05, + "loss": 0.9724, + "step": 7654 + }, + { + "epoch": 0.38179551122194516, + "grad_norm": 0.8552892686961339, + "learning_rate": 3.545879110168152e-05, + "loss": 0.9003, + "step": 7655 + }, + { + "epoch": 0.38184538653366584, + "grad_norm": 0.4876773368026181, + "learning_rate": 3.545512289919827e-05, + "loss": 0.8526, + "step": 7656 + }, + { + "epoch": 0.3818952618453865, + "grad_norm": 0.44577958150139096, + "learning_rate": 3.545145442389328e-05, + "loss": 0.9324, + "step": 7657 + }, + { + "epoch": 0.3819451371571072, + "grad_norm": 0.4897768637778463, + "learning_rate": 3.544778567586227e-05, + "loss": 0.8726, + "step": 7658 + }, + { + "epoch": 0.38199501246882794, + "grad_norm": 0.4314578883638481, + "learning_rate": 3.544411665520096e-05, + "loss": 0.8948, + "step": 7659 + }, + { + "epoch": 0.3820448877805486, + "grad_norm": 0.6207940110151445, + "learning_rate": 3.54404473620051e-05, + "loss": 0.9065, + "step": 7660 + }, + { + "epoch": 0.3820947630922693, + "grad_norm": 0.4889849462687344, + "learning_rate": 3.543677779637044e-05, + "loss": 0.9276, + "step": 7661 + }, + { + "epoch": 0.38214463840399004, + "grad_norm": 0.387038657479995, + "learning_rate": 3.543310795839273e-05, + "loss": 0.8763, + "step": 7662 + }, + { + "epoch": 0.3821945137157107, + "grad_norm": 0.5309810812713495, + "learning_rate": 3.5429437848167746e-05, + "loss": 0.891, + "step": 7663 + }, + { + "epoch": 0.3822443890274314, + "grad_norm": 0.7807897788194691, + "learning_rate": 3.542576746579124e-05, + "loss": 0.9083, + "step": 7664 + }, + { + "epoch": 0.38229426433915215, + "grad_norm": 0.5402823026328283, + "learning_rate": 3.5422096811359006e-05, + "loss": 0.8897, + "step": 7665 + }, + { + "epoch": 0.3823441396508728, + "grad_norm": 0.6878019325654116, + "learning_rate": 3.541842588496682e-05, + "loss": 0.9286, + "step": 7666 + }, + { + "epoch": 0.3823940149625935, + "grad_norm": 0.4662616335599622, + "learning_rate": 3.5414754686710474e-05, + "loss": 0.9267, + "step": 7667 + }, + { + "epoch": 0.3824438902743142, + "grad_norm": 0.4701538218845545, + "learning_rate": 3.541108321668575e-05, + "loss": 0.9012, + "step": 7668 + }, + { + "epoch": 0.38249376558603493, + "grad_norm": 0.3900849203499215, + "learning_rate": 3.540741147498849e-05, + "loss": 0.8739, + "step": 7669 + }, + { + "epoch": 0.3825436408977556, + "grad_norm": 0.5472410980347175, + "learning_rate": 3.540373946171447e-05, + "loss": 0.9223, + "step": 7670 + }, + { + "epoch": 0.3825935162094763, + "grad_norm": 0.49249170773478396, + "learning_rate": 3.540006717695953e-05, + "loss": 0.8984, + "step": 7671 + }, + { + "epoch": 0.38264339152119703, + "grad_norm": 0.6407662518136817, + "learning_rate": 3.539639462081949e-05, + "loss": 0.952, + "step": 7672 + }, + { + "epoch": 0.3826932668329177, + "grad_norm": 0.7609915093731656, + "learning_rate": 3.5392721793390196e-05, + "loss": 0.9233, + "step": 7673 + }, + { + "epoch": 0.3827431421446384, + "grad_norm": 0.8082519208245724, + "learning_rate": 3.5389048694767466e-05, + "loss": 0.8921, + "step": 7674 + }, + { + "epoch": 0.3827930174563591, + "grad_norm": 0.9747829405304371, + "learning_rate": 3.5385375325047166e-05, + "loss": 0.9342, + "step": 7675 + }, + { + "epoch": 0.3828428927680798, + "grad_norm": 0.42688538709501656, + "learning_rate": 3.5381701684325145e-05, + "loss": 0.9061, + "step": 7676 + }, + { + "epoch": 0.3828927680798005, + "grad_norm": 0.447730511289481, + "learning_rate": 3.5378027772697254e-05, + "loss": 0.9053, + "step": 7677 + }, + { + "epoch": 0.3829426433915212, + "grad_norm": 0.38149052407925776, + "learning_rate": 3.5374353590259384e-05, + "loss": 0.865, + "step": 7678 + }, + { + "epoch": 0.3829925187032419, + "grad_norm": 0.5981591376460444, + "learning_rate": 3.537067913710739e-05, + "loss": 0.9076, + "step": 7679 + }, + { + "epoch": 0.3830423940149626, + "grad_norm": 0.476598473655538, + "learning_rate": 3.536700441333718e-05, + "loss": 0.9823, + "step": 7680 + }, + { + "epoch": 0.3830922693266833, + "grad_norm": 0.5956134967842499, + "learning_rate": 3.5363329419044614e-05, + "loss": 0.941, + "step": 7681 + }, + { + "epoch": 0.383142144638404, + "grad_norm": 0.4002605534105718, + "learning_rate": 3.5359654154325616e-05, + "loss": 0.9303, + "step": 7682 + }, + { + "epoch": 0.3831920199501247, + "grad_norm": 0.9119626166831635, + "learning_rate": 3.535597861927607e-05, + "loss": 0.8764, + "step": 7683 + }, + { + "epoch": 0.3832418952618454, + "grad_norm": 0.37796039224223454, + "learning_rate": 3.535230281399191e-05, + "loss": 0.8835, + "step": 7684 + }, + { + "epoch": 0.38329177057356606, + "grad_norm": 0.5012736598188912, + "learning_rate": 3.534862673856904e-05, + "loss": 0.9238, + "step": 7685 + }, + { + "epoch": 0.3833416458852868, + "grad_norm": 0.5138853225255471, + "learning_rate": 3.5344950393103374e-05, + "loss": 0.9337, + "step": 7686 + }, + { + "epoch": 0.3833915211970075, + "grad_norm": 0.4687642154867479, + "learning_rate": 3.5341273777690865e-05, + "loss": 0.8765, + "step": 7687 + }, + { + "epoch": 0.38344139650872816, + "grad_norm": 0.44821397382847034, + "learning_rate": 3.5337596892427436e-05, + "loss": 0.8817, + "step": 7688 + }, + { + "epoch": 0.3834912718204489, + "grad_norm": 0.4584201684163594, + "learning_rate": 3.533391973740905e-05, + "loss": 0.9094, + "step": 7689 + }, + { + "epoch": 0.3835411471321696, + "grad_norm": 0.8853541927847778, + "learning_rate": 3.5330242312731656e-05, + "loss": 0.8842, + "step": 7690 + }, + { + "epoch": 0.38359102244389026, + "grad_norm": 0.9695581661614826, + "learning_rate": 3.53265646184912e-05, + "loss": 0.8808, + "step": 7691 + }, + { + "epoch": 0.383640897755611, + "grad_norm": 0.3980194608163111, + "learning_rate": 3.532288665478368e-05, + "loss": 0.838, + "step": 7692 + }, + { + "epoch": 0.3836907730673317, + "grad_norm": 0.627695417988406, + "learning_rate": 3.531920842170504e-05, + "loss": 0.9108, + "step": 7693 + }, + { + "epoch": 0.38374064837905236, + "grad_norm": 0.4750524938994931, + "learning_rate": 3.5315529919351275e-05, + "loss": 0.9114, + "step": 7694 + }, + { + "epoch": 0.38379052369077304, + "grad_norm": 0.491497956666157, + "learning_rate": 3.531185114781838e-05, + "loss": 0.8921, + "step": 7695 + }, + { + "epoch": 0.3838403990024938, + "grad_norm": 0.41487634465961276, + "learning_rate": 3.5308172107202345e-05, + "loss": 0.8895, + "step": 7696 + }, + { + "epoch": 0.38389027431421446, + "grad_norm": 0.4629765125800178, + "learning_rate": 3.530449279759916e-05, + "loss": 0.8995, + "step": 7697 + }, + { + "epoch": 0.38394014962593515, + "grad_norm": 0.7107824892500172, + "learning_rate": 3.530081321910485e-05, + "loss": 0.886, + "step": 7698 + }, + { + "epoch": 0.3839900249376559, + "grad_norm": 0.4469615342846758, + "learning_rate": 3.529713337181544e-05, + "loss": 0.916, + "step": 7699 + }, + { + "epoch": 0.38403990024937656, + "grad_norm": 0.6831949217979285, + "learning_rate": 3.529345325582694e-05, + "loss": 0.9136, + "step": 7700 + }, + { + "epoch": 0.38408977556109725, + "grad_norm": 0.5596409388243571, + "learning_rate": 3.5289772871235374e-05, + "loss": 0.8837, + "step": 7701 + }, + { + "epoch": 0.38413965087281793, + "grad_norm": 1.2603653973974014, + "learning_rate": 3.52860922181368e-05, + "loss": 0.8918, + "step": 7702 + }, + { + "epoch": 0.38418952618453867, + "grad_norm": 0.5038240637835917, + "learning_rate": 3.528241129662725e-05, + "loss": 0.8932, + "step": 7703 + }, + { + "epoch": 0.38423940149625935, + "grad_norm": 0.4845443347952588, + "learning_rate": 3.5278730106802784e-05, + "loss": 0.9357, + "step": 7704 + }, + { + "epoch": 0.38428927680798003, + "grad_norm": 0.48627871551646545, + "learning_rate": 3.527504864875944e-05, + "loss": 0.8666, + "step": 7705 + }, + { + "epoch": 0.38433915211970077, + "grad_norm": 0.5380171449366369, + "learning_rate": 3.527136692259332e-05, + "loss": 0.9439, + "step": 7706 + }, + { + "epoch": 0.38438902743142145, + "grad_norm": 0.4551967895834469, + "learning_rate": 3.526768492840047e-05, + "loss": 0.864, + "step": 7707 + }, + { + "epoch": 0.38443890274314213, + "grad_norm": 0.5992777145705034, + "learning_rate": 3.5264002666276976e-05, + "loss": 0.8713, + "step": 7708 + }, + { + "epoch": 0.38448877805486287, + "grad_norm": 0.4748162841976902, + "learning_rate": 3.526032013631893e-05, + "loss": 0.9379, + "step": 7709 + }, + { + "epoch": 0.38453865336658355, + "grad_norm": 0.6439189956513408, + "learning_rate": 3.525663733862241e-05, + "loss": 0.8959, + "step": 7710 + }, + { + "epoch": 0.38458852867830423, + "grad_norm": 0.5273070278407518, + "learning_rate": 3.525295427328354e-05, + "loss": 0.9196, + "step": 7711 + }, + { + "epoch": 0.3846384039900249, + "grad_norm": 0.43172275213104566, + "learning_rate": 3.524927094039841e-05, + "loss": 0.9167, + "step": 7712 + }, + { + "epoch": 0.38468827930174565, + "grad_norm": 0.6222257546025877, + "learning_rate": 3.524558734006315e-05, + "loss": 0.857, + "step": 7713 + }, + { + "epoch": 0.38473815461346633, + "grad_norm": 0.5256118382508396, + "learning_rate": 3.524190347237386e-05, + "loss": 0.9094, + "step": 7714 + }, + { + "epoch": 0.384788029925187, + "grad_norm": 0.4299539450394566, + "learning_rate": 3.523821933742669e-05, + "loss": 0.884, + "step": 7715 + }, + { + "epoch": 0.38483790523690775, + "grad_norm": 0.43366030656317855, + "learning_rate": 3.523453493531777e-05, + "loss": 0.88, + "step": 7716 + }, + { + "epoch": 0.38488778054862843, + "grad_norm": 0.4511231586330221, + "learning_rate": 3.5230850266143246e-05, + "loss": 0.8826, + "step": 7717 + }, + { + "epoch": 0.3849376558603491, + "grad_norm": 0.49009897379512146, + "learning_rate": 3.522716532999926e-05, + "loss": 0.865, + "step": 7718 + }, + { + "epoch": 0.38498753117206985, + "grad_norm": 0.6706830497858167, + "learning_rate": 3.5223480126981966e-05, + "loss": 0.9194, + "step": 7719 + }, + { + "epoch": 0.38503740648379053, + "grad_norm": 0.6037562778960904, + "learning_rate": 3.5219794657187536e-05, + "loss": 0.8832, + "step": 7720 + }, + { + "epoch": 0.3850872817955112, + "grad_norm": 0.39624930493433086, + "learning_rate": 3.521610892071214e-05, + "loss": 0.8802, + "step": 7721 + }, + { + "epoch": 0.3851371571072319, + "grad_norm": 0.38302447124110744, + "learning_rate": 3.5212422917651956e-05, + "loss": 0.8834, + "step": 7722 + }, + { + "epoch": 0.38518703241895264, + "grad_norm": 0.5573849458133312, + "learning_rate": 3.520873664810316e-05, + "loss": 0.8956, + "step": 7723 + }, + { + "epoch": 0.3852369077306733, + "grad_norm": 0.48184627394728974, + "learning_rate": 3.520505011216195e-05, + "loss": 0.8888, + "step": 7724 + }, + { + "epoch": 0.385286783042394, + "grad_norm": 0.638629182929604, + "learning_rate": 3.520136330992453e-05, + "loss": 0.962, + "step": 7725 + }, + { + "epoch": 0.38533665835411474, + "grad_norm": 0.5481703772774859, + "learning_rate": 3.51976762414871e-05, + "loss": 0.9205, + "step": 7726 + }, + { + "epoch": 0.3853865336658354, + "grad_norm": 0.5499986737889427, + "learning_rate": 3.519398890694587e-05, + "loss": 0.8704, + "step": 7727 + }, + { + "epoch": 0.3854364089775561, + "grad_norm": 0.44889060948755916, + "learning_rate": 3.5190301306397055e-05, + "loss": 0.9486, + "step": 7728 + }, + { + "epoch": 0.3854862842892768, + "grad_norm": 0.42846889285092277, + "learning_rate": 3.51866134399369e-05, + "loss": 0.9026, + "step": 7729 + }, + { + "epoch": 0.3855361596009975, + "grad_norm": 0.704875185940396, + "learning_rate": 3.518292530766162e-05, + "loss": 0.9233, + "step": 7730 + }, + { + "epoch": 0.3855860349127182, + "grad_norm": 0.5071376960360985, + "learning_rate": 3.5179236909667465e-05, + "loss": 0.9343, + "step": 7731 + }, + { + "epoch": 0.3856359102244389, + "grad_norm": 0.4443763238666033, + "learning_rate": 3.517554824605067e-05, + "loss": 0.9214, + "step": 7732 + }, + { + "epoch": 0.3856857855361596, + "grad_norm": 0.3910806245266449, + "learning_rate": 3.5171859316907507e-05, + "loss": 0.8941, + "step": 7733 + }, + { + "epoch": 0.3857356608478803, + "grad_norm": 0.3803839444676375, + "learning_rate": 3.516817012233422e-05, + "loss": 0.9094, + "step": 7734 + }, + { + "epoch": 0.385785536159601, + "grad_norm": 0.5041200220862203, + "learning_rate": 3.5164480662427094e-05, + "loss": 0.9288, + "step": 7735 + }, + { + "epoch": 0.3858354114713217, + "grad_norm": 0.42046566173176253, + "learning_rate": 3.5160790937282387e-05, + "loss": 0.9167, + "step": 7736 + }, + { + "epoch": 0.3858852867830424, + "grad_norm": 0.7286041554103514, + "learning_rate": 3.515710094699639e-05, + "loss": 0.8686, + "step": 7737 + }, + { + "epoch": 0.3859351620947631, + "grad_norm": 0.45615782592306575, + "learning_rate": 3.515341069166539e-05, + "loss": 0.9234, + "step": 7738 + }, + { + "epoch": 0.38598503740648377, + "grad_norm": 0.48457142333339964, + "learning_rate": 3.514972017138568e-05, + "loss": 0.8809, + "step": 7739 + }, + { + "epoch": 0.3860349127182045, + "grad_norm": 0.6406584648851101, + "learning_rate": 3.514602938625357e-05, + "loss": 0.9228, + "step": 7740 + }, + { + "epoch": 0.3860847880299252, + "grad_norm": 0.43767540410938555, + "learning_rate": 3.5142338336365357e-05, + "loss": 0.883, + "step": 7741 + }, + { + "epoch": 0.38613466334164587, + "grad_norm": 0.4623836631939168, + "learning_rate": 3.5138647021817364e-05, + "loss": 0.9074, + "step": 7742 + }, + { + "epoch": 0.3861845386533666, + "grad_norm": 0.6858431312419271, + "learning_rate": 3.513495544270592e-05, + "loss": 0.8827, + "step": 7743 + }, + { + "epoch": 0.3862344139650873, + "grad_norm": 0.5329303785642363, + "learning_rate": 3.513126359912735e-05, + "loss": 0.9237, + "step": 7744 + }, + { + "epoch": 0.38628428927680797, + "grad_norm": 0.5628708034244233, + "learning_rate": 3.512757149117798e-05, + "loss": 0.9031, + "step": 7745 + }, + { + "epoch": 0.3863341645885287, + "grad_norm": 0.6377551740497927, + "learning_rate": 3.512387911895418e-05, + "loss": 0.9059, + "step": 7746 + }, + { + "epoch": 0.3863840399002494, + "grad_norm": 0.43557306604395607, + "learning_rate": 3.512018648255227e-05, + "loss": 0.9368, + "step": 7747 + }, + { + "epoch": 0.38643391521197007, + "grad_norm": 0.7786914169929616, + "learning_rate": 3.511649358206863e-05, + "loss": 0.9626, + "step": 7748 + }, + { + "epoch": 0.38648379052369075, + "grad_norm": 0.42117540013815546, + "learning_rate": 3.511280041759962e-05, + "loss": 0.8932, + "step": 7749 + }, + { + "epoch": 0.3865336658354115, + "grad_norm": 0.5040563382675711, + "learning_rate": 3.510910698924161e-05, + "loss": 0.9634, + "step": 7750 + }, + { + "epoch": 0.38658354114713217, + "grad_norm": 0.4996233120860435, + "learning_rate": 3.5105413297090965e-05, + "loss": 0.9534, + "step": 7751 + }, + { + "epoch": 0.38663341645885285, + "grad_norm": 0.42332574478344837, + "learning_rate": 3.51017193412441e-05, + "loss": 0.888, + "step": 7752 + }, + { + "epoch": 0.3866832917705736, + "grad_norm": 0.5821687071042542, + "learning_rate": 3.509802512179737e-05, + "loss": 0.9189, + "step": 7753 + }, + { + "epoch": 0.38673316708229427, + "grad_norm": 0.510034887687832, + "learning_rate": 3.509433063884721e-05, + "loss": 0.8858, + "step": 7754 + }, + { + "epoch": 0.38678304239401495, + "grad_norm": 0.39449739538091955, + "learning_rate": 3.509063589249001e-05, + "loss": 0.8651, + "step": 7755 + }, + { + "epoch": 0.38683291770573563, + "grad_norm": 0.5265528944828572, + "learning_rate": 3.508694088282217e-05, + "loss": 0.9004, + "step": 7756 + }, + { + "epoch": 0.3868827930174564, + "grad_norm": 0.6626727709045266, + "learning_rate": 3.508324560994014e-05, + "loss": 0.8955, + "step": 7757 + }, + { + "epoch": 0.38693266832917705, + "grad_norm": 0.5041219790131269, + "learning_rate": 3.507955007394031e-05, + "loss": 0.8858, + "step": 7758 + }, + { + "epoch": 0.38698254364089774, + "grad_norm": 0.5063809769454589, + "learning_rate": 3.507585427491914e-05, + "loss": 0.907, + "step": 7759 + }, + { + "epoch": 0.3870324189526185, + "grad_norm": 0.41392158320640177, + "learning_rate": 3.507215821297306e-05, + "loss": 0.905, + "step": 7760 + }, + { + "epoch": 0.38708229426433916, + "grad_norm": 0.46285683423965235, + "learning_rate": 3.506846188819852e-05, + "loss": 0.9448, + "step": 7761 + }, + { + "epoch": 0.38713216957605984, + "grad_norm": 0.4597711815840222, + "learning_rate": 3.506476530069197e-05, + "loss": 0.9028, + "step": 7762 + }, + { + "epoch": 0.3871820448877806, + "grad_norm": 0.4112970239395498, + "learning_rate": 3.5061068450549884e-05, + "loss": 0.9263, + "step": 7763 + }, + { + "epoch": 0.38723192019950126, + "grad_norm": 0.5187718470964906, + "learning_rate": 3.505737133786871e-05, + "loss": 0.8858, + "step": 7764 + }, + { + "epoch": 0.38728179551122194, + "grad_norm": 0.4641176169178072, + "learning_rate": 3.505367396274493e-05, + "loss": 0.8649, + "step": 7765 + }, + { + "epoch": 0.3873316708229426, + "grad_norm": 0.5155070904006331, + "learning_rate": 3.504997632527504e-05, + "loss": 0.9066, + "step": 7766 + }, + { + "epoch": 0.38738154613466336, + "grad_norm": 0.49778487323663634, + "learning_rate": 3.50462784255555e-05, + "loss": 0.8886, + "step": 7767 + }, + { + "epoch": 0.38743142144638404, + "grad_norm": 0.5388175641253319, + "learning_rate": 3.5042580263682835e-05, + "loss": 0.8941, + "step": 7768 + }, + { + "epoch": 0.3874812967581047, + "grad_norm": 0.4050434484507021, + "learning_rate": 3.5038881839753515e-05, + "loss": 0.9112, + "step": 7769 + }, + { + "epoch": 0.38753117206982546, + "grad_norm": 0.40008759735375626, + "learning_rate": 3.5035183153864083e-05, + "loss": 0.8701, + "step": 7770 + }, + { + "epoch": 0.38758104738154614, + "grad_norm": 0.6629507473193157, + "learning_rate": 3.503148420611103e-05, + "loss": 0.9391, + "step": 7771 + }, + { + "epoch": 0.3876309226932668, + "grad_norm": 0.6085842438574254, + "learning_rate": 3.502778499659089e-05, + "loss": 0.857, + "step": 7772 + }, + { + "epoch": 0.3876807980049875, + "grad_norm": 0.4001899203477149, + "learning_rate": 3.5024085525400186e-05, + "loss": 0.906, + "step": 7773 + }, + { + "epoch": 0.38773067331670824, + "grad_norm": 0.9173367260139165, + "learning_rate": 3.5020385792635455e-05, + "loss": 0.8962, + "step": 7774 + }, + { + "epoch": 0.3877805486284289, + "grad_norm": 0.4199494829314291, + "learning_rate": 3.5016685798393244e-05, + "loss": 0.8692, + "step": 7775 + }, + { + "epoch": 0.3878304239401496, + "grad_norm": 0.45990620188856784, + "learning_rate": 3.50129855427701e-05, + "loss": 0.9189, + "step": 7776 + }, + { + "epoch": 0.38788029925187034, + "grad_norm": 0.5127940487585486, + "learning_rate": 3.5009285025862575e-05, + "loss": 0.9204, + "step": 7777 + }, + { + "epoch": 0.387930174563591, + "grad_norm": 0.4619841366610934, + "learning_rate": 3.500558424776724e-05, + "loss": 0.9106, + "step": 7778 + }, + { + "epoch": 0.3879800498753117, + "grad_norm": 0.46725381272537014, + "learning_rate": 3.5001883208580665e-05, + "loss": 0.893, + "step": 7779 + }, + { + "epoch": 0.38802992518703244, + "grad_norm": 0.4529473956164958, + "learning_rate": 3.499818190839942e-05, + "loss": 0.866, + "step": 7780 + }, + { + "epoch": 0.3880798004987531, + "grad_norm": 0.5142239682425369, + "learning_rate": 3.4994480347320104e-05, + "loss": 0.898, + "step": 7781 + }, + { + "epoch": 0.3881296758104738, + "grad_norm": 0.5068328990387958, + "learning_rate": 3.4990778525439274e-05, + "loss": 0.8989, + "step": 7782 + }, + { + "epoch": 0.3881795511221945, + "grad_norm": 0.45778334650105373, + "learning_rate": 3.498707644285357e-05, + "loss": 0.9036, + "step": 7783 + }, + { + "epoch": 0.3882294264339152, + "grad_norm": 0.42559291412056593, + "learning_rate": 3.498337409965956e-05, + "loss": 0.907, + "step": 7784 + }, + { + "epoch": 0.3882793017456359, + "grad_norm": 0.43823574055098563, + "learning_rate": 3.4979671495953885e-05, + "loss": 0.8805, + "step": 7785 + }, + { + "epoch": 0.3883291770573566, + "grad_norm": 0.6177825333441834, + "learning_rate": 3.497596863183314e-05, + "loss": 0.9059, + "step": 7786 + }, + { + "epoch": 0.3883790523690773, + "grad_norm": 0.48395721541345976, + "learning_rate": 3.497226550739397e-05, + "loss": 0.9095, + "step": 7787 + }, + { + "epoch": 0.388428927680798, + "grad_norm": 0.4270323603986851, + "learning_rate": 3.496856212273297e-05, + "loss": 0.928, + "step": 7788 + }, + { + "epoch": 0.3884788029925187, + "grad_norm": 0.5672177908102923, + "learning_rate": 3.4964858477946824e-05, + "loss": 0.9548, + "step": 7789 + }, + { + "epoch": 0.3885286783042394, + "grad_norm": 0.4120931514186008, + "learning_rate": 3.496115457313215e-05, + "loss": 0.909, + "step": 7790 + }, + { + "epoch": 0.3885785536159601, + "grad_norm": 0.5036115926450584, + "learning_rate": 3.495745040838561e-05, + "loss": 0.9211, + "step": 7791 + }, + { + "epoch": 0.3886284289276808, + "grad_norm": 0.5272006818382255, + "learning_rate": 3.4953745983803845e-05, + "loss": 0.8761, + "step": 7792 + }, + { + "epoch": 0.3886783042394015, + "grad_norm": 0.4153472465773315, + "learning_rate": 3.495004129948354e-05, + "loss": 0.9074, + "step": 7793 + }, + { + "epoch": 0.3887281795511222, + "grad_norm": 0.5105564827158608, + "learning_rate": 3.494633635552137e-05, + "loss": 0.919, + "step": 7794 + }, + { + "epoch": 0.3887780548628429, + "grad_norm": 0.48362877273006616, + "learning_rate": 3.494263115201399e-05, + "loss": 0.885, + "step": 7795 + }, + { + "epoch": 0.3888279301745636, + "grad_norm": 0.3996102363469281, + "learning_rate": 3.493892568905811e-05, + "loss": 0.9066, + "step": 7796 + }, + { + "epoch": 0.3888778054862843, + "grad_norm": 0.5209357162734464, + "learning_rate": 3.4935219966750405e-05, + "loss": 0.8869, + "step": 7797 + }, + { + "epoch": 0.388927680798005, + "grad_norm": 0.4196444343945335, + "learning_rate": 3.493151398518758e-05, + "loss": 0.8779, + "step": 7798 + }, + { + "epoch": 0.3889775561097257, + "grad_norm": 0.441482235819324, + "learning_rate": 3.492780774446635e-05, + "loss": 0.8892, + "step": 7799 + }, + { + "epoch": 0.38902743142144636, + "grad_norm": 0.4074701531188117, + "learning_rate": 3.492410124468341e-05, + "loss": 0.9075, + "step": 7800 + }, + { + "epoch": 0.3890773067331671, + "grad_norm": 0.48888000784685515, + "learning_rate": 3.4920394485935506e-05, + "loss": 0.9347, + "step": 7801 + }, + { + "epoch": 0.3891271820448878, + "grad_norm": 0.45602226732324386, + "learning_rate": 3.4916687468319334e-05, + "loss": 0.8726, + "step": 7802 + }, + { + "epoch": 0.38917705735660846, + "grad_norm": 0.4905697832688855, + "learning_rate": 3.491298019193164e-05, + "loss": 0.866, + "step": 7803 + }, + { + "epoch": 0.3892269326683292, + "grad_norm": 0.4841731785430228, + "learning_rate": 3.490927265686917e-05, + "loss": 0.925, + "step": 7804 + }, + { + "epoch": 0.3892768079800499, + "grad_norm": 0.46263498456405666, + "learning_rate": 3.4905564863228666e-05, + "loss": 0.9236, + "step": 7805 + }, + { + "epoch": 0.38932668329177056, + "grad_norm": 0.4475042462487565, + "learning_rate": 3.4901856811106877e-05, + "loss": 0.9067, + "step": 7806 + }, + { + "epoch": 0.3893765586034913, + "grad_norm": 0.4255326433364455, + "learning_rate": 3.4898148500600567e-05, + "loss": 0.894, + "step": 7807 + }, + { + "epoch": 0.389426433915212, + "grad_norm": 0.44272439945664094, + "learning_rate": 3.48944399318065e-05, + "loss": 0.9047, + "step": 7808 + }, + { + "epoch": 0.38947630922693266, + "grad_norm": 0.4572385216553857, + "learning_rate": 3.489073110482145e-05, + "loss": 0.9221, + "step": 7809 + }, + { + "epoch": 0.38952618453865334, + "grad_norm": 0.47454405839342473, + "learning_rate": 3.48870220197422e-05, + "loss": 0.8962, + "step": 7810 + }, + { + "epoch": 0.3895760598503741, + "grad_norm": 0.5161781095406569, + "learning_rate": 3.4883312676665536e-05, + "loss": 0.8958, + "step": 7811 + }, + { + "epoch": 0.38962593516209476, + "grad_norm": 0.41636409104062905, + "learning_rate": 3.487960307568825e-05, + "loss": 0.9018, + "step": 7812 + }, + { + "epoch": 0.38967581047381544, + "grad_norm": 2.6375145142213543, + "learning_rate": 3.4875893216907144e-05, + "loss": 0.8877, + "step": 7813 + }, + { + "epoch": 0.3897256857855362, + "grad_norm": 0.403889113536152, + "learning_rate": 3.487218310041903e-05, + "loss": 0.8991, + "step": 7814 + }, + { + "epoch": 0.38977556109725686, + "grad_norm": 0.40188778269259123, + "learning_rate": 3.486847272632071e-05, + "loss": 0.901, + "step": 7815 + }, + { + "epoch": 0.38982543640897754, + "grad_norm": 0.9976611725602927, + "learning_rate": 3.486476209470901e-05, + "loss": 0.8918, + "step": 7816 + }, + { + "epoch": 0.3898753117206983, + "grad_norm": 0.43924416239367, + "learning_rate": 3.486105120568076e-05, + "loss": 0.9043, + "step": 7817 + }, + { + "epoch": 0.38992518703241896, + "grad_norm": 0.3938714333711506, + "learning_rate": 3.485734005933279e-05, + "loss": 0.9029, + "step": 7818 + }, + { + "epoch": 0.38997506234413964, + "grad_norm": 0.6318208558402856, + "learning_rate": 3.485362865576194e-05, + "loss": 0.85, + "step": 7819 + }, + { + "epoch": 0.3900249376558603, + "grad_norm": 0.45374659933168565, + "learning_rate": 3.484991699506507e-05, + "loss": 0.9319, + "step": 7820 + }, + { + "epoch": 0.39007481296758106, + "grad_norm": 0.4461816965448094, + "learning_rate": 3.484620507733902e-05, + "loss": 0.9074, + "step": 7821 + }, + { + "epoch": 0.39012468827930175, + "grad_norm": 0.4267095034521416, + "learning_rate": 3.484249290268065e-05, + "loss": 0.8897, + "step": 7822 + }, + { + "epoch": 0.3901745635910224, + "grad_norm": 0.528960353801197, + "learning_rate": 3.483878047118684e-05, + "loss": 0.922, + "step": 7823 + }, + { + "epoch": 0.39022443890274316, + "grad_norm": 0.6420026116209582, + "learning_rate": 3.4835067782954455e-05, + "loss": 0.9195, + "step": 7824 + }, + { + "epoch": 0.39027431421446385, + "grad_norm": 0.4947366167087951, + "learning_rate": 3.483135483808038e-05, + "loss": 0.896, + "step": 7825 + }, + { + "epoch": 0.39032418952618453, + "grad_norm": 0.5139130220236752, + "learning_rate": 3.482764163666149e-05, + "loss": 0.9517, + "step": 7826 + }, + { + "epoch": 0.3903740648379052, + "grad_norm": 1.2159621064595234, + "learning_rate": 3.4823928178794696e-05, + "loss": 0.9019, + "step": 7827 + }, + { + "epoch": 0.39042394014962595, + "grad_norm": 0.4725510892789101, + "learning_rate": 3.48202144645769e-05, + "loss": 0.9333, + "step": 7828 + }, + { + "epoch": 0.39047381546134663, + "grad_norm": 0.5248488384977952, + "learning_rate": 3.4816500494104995e-05, + "loss": 0.892, + "step": 7829 + }, + { + "epoch": 0.3905236907730673, + "grad_norm": 0.43871198070878903, + "learning_rate": 3.4812786267475896e-05, + "loss": 0.92, + "step": 7830 + }, + { + "epoch": 0.39057356608478805, + "grad_norm": 0.412627974120808, + "learning_rate": 3.480907178478654e-05, + "loss": 0.8813, + "step": 7831 + }, + { + "epoch": 0.39062344139650873, + "grad_norm": 0.704552929469781, + "learning_rate": 3.480535704613384e-05, + "loss": 0.914, + "step": 7832 + }, + { + "epoch": 0.3906733167082294, + "grad_norm": 0.49092208728728415, + "learning_rate": 3.480164205161475e-05, + "loss": 0.9029, + "step": 7833 + }, + { + "epoch": 0.39072319201995015, + "grad_norm": 0.5735716032459476, + "learning_rate": 3.479792680132618e-05, + "loss": 0.9213, + "step": 7834 + }, + { + "epoch": 0.39077306733167083, + "grad_norm": 2.264405379109913, + "learning_rate": 3.479421129536511e-05, + "loss": 0.9064, + "step": 7835 + }, + { + "epoch": 0.3908229426433915, + "grad_norm": 0.5421828874299812, + "learning_rate": 3.4790495533828466e-05, + "loss": 0.8819, + "step": 7836 + }, + { + "epoch": 0.3908728179551122, + "grad_norm": 0.4345604592145548, + "learning_rate": 3.478677951681323e-05, + "loss": 0.8989, + "step": 7837 + }, + { + "epoch": 0.39092269326683293, + "grad_norm": 0.5790650613102605, + "learning_rate": 3.478306324441636e-05, + "loss": 0.9028, + "step": 7838 + }, + { + "epoch": 0.3909725685785536, + "grad_norm": 0.4092386188133454, + "learning_rate": 3.477934671673484e-05, + "loss": 0.9046, + "step": 7839 + }, + { + "epoch": 0.3910224438902743, + "grad_norm": 0.4994708818491165, + "learning_rate": 3.4775629933865634e-05, + "loss": 0.9047, + "step": 7840 + }, + { + "epoch": 0.39107231920199503, + "grad_norm": 0.44099134884381475, + "learning_rate": 3.477191289590574e-05, + "loss": 0.9213, + "step": 7841 + }, + { + "epoch": 0.3911221945137157, + "grad_norm": 0.6085128070534109, + "learning_rate": 3.476819560295216e-05, + "loss": 0.9011, + "step": 7842 + }, + { + "epoch": 0.3911720698254364, + "grad_norm": 0.5147687817225307, + "learning_rate": 3.476447805510188e-05, + "loss": 0.9249, + "step": 7843 + }, + { + "epoch": 0.39122194513715713, + "grad_norm": 0.39911435459830746, + "learning_rate": 3.476076025245193e-05, + "loss": 0.8716, + "step": 7844 + }, + { + "epoch": 0.3912718204488778, + "grad_norm": 0.4089889330656026, + "learning_rate": 3.475704219509929e-05, + "loss": 0.9321, + "step": 7845 + }, + { + "epoch": 0.3913216957605985, + "grad_norm": 0.3849331948838422, + "learning_rate": 3.475332388314101e-05, + "loss": 0.886, + "step": 7846 + }, + { + "epoch": 0.3913715710723192, + "grad_norm": 0.43114862748833127, + "learning_rate": 3.4749605316674106e-05, + "loss": 0.8851, + "step": 7847 + }, + { + "epoch": 0.3914214463840399, + "grad_norm": 0.5943765406268489, + "learning_rate": 3.474588649579562e-05, + "loss": 0.9343, + "step": 7848 + }, + { + "epoch": 0.3914713216957606, + "grad_norm": 0.500967913444396, + "learning_rate": 3.474216742060258e-05, + "loss": 0.9372, + "step": 7849 + }, + { + "epoch": 0.3915211970074813, + "grad_norm": 0.44728118607777745, + "learning_rate": 3.4738448091192045e-05, + "loss": 0.9186, + "step": 7850 + }, + { + "epoch": 0.391571072319202, + "grad_norm": 0.4298402702738899, + "learning_rate": 3.473472850766107e-05, + "loss": 0.9147, + "step": 7851 + }, + { + "epoch": 0.3916209476309227, + "grad_norm": 0.9720776795413169, + "learning_rate": 3.47310086701067e-05, + "loss": 0.9534, + "step": 7852 + }, + { + "epoch": 0.3916708229426434, + "grad_norm": 0.41870226324006415, + "learning_rate": 3.472728857862603e-05, + "loss": 0.91, + "step": 7853 + }, + { + "epoch": 0.39172069825436406, + "grad_norm": 1.9968515655502148, + "learning_rate": 3.4723568233316106e-05, + "loss": 0.9159, + "step": 7854 + }, + { + "epoch": 0.3917705735660848, + "grad_norm": 0.5861974514014097, + "learning_rate": 3.471984763427402e-05, + "loss": 0.937, + "step": 7855 + }, + { + "epoch": 0.3918204488778055, + "grad_norm": 0.44068669103055547, + "learning_rate": 3.471612678159686e-05, + "loss": 0.943, + "step": 7856 + }, + { + "epoch": 0.39187032418952616, + "grad_norm": 0.8297399298321322, + "learning_rate": 3.471240567538173e-05, + "loss": 0.9476, + "step": 7857 + }, + { + "epoch": 0.3919201995012469, + "grad_norm": 1.5803491677321055, + "learning_rate": 3.470868431572571e-05, + "loss": 0.8954, + "step": 7858 + }, + { + "epoch": 0.3919700748129676, + "grad_norm": 0.45183815915013753, + "learning_rate": 3.4704962702725926e-05, + "loss": 0.8912, + "step": 7859 + }, + { + "epoch": 0.39201995012468827, + "grad_norm": 0.4134600185457102, + "learning_rate": 3.4701240836479474e-05, + "loss": 0.9203, + "step": 7860 + }, + { + "epoch": 0.392069825436409, + "grad_norm": 0.4572745587322646, + "learning_rate": 3.469751871708349e-05, + "loss": 0.9192, + "step": 7861 + }, + { + "epoch": 0.3921197007481297, + "grad_norm": 0.4803279980391784, + "learning_rate": 3.46937963446351e-05, + "loss": 0.9296, + "step": 7862 + }, + { + "epoch": 0.39216957605985037, + "grad_norm": 0.4416822776743039, + "learning_rate": 3.4690073719231425e-05, + "loss": 0.9003, + "step": 7863 + }, + { + "epoch": 0.39221945137157105, + "grad_norm": 0.4520761732428947, + "learning_rate": 3.468635084096962e-05, + "loss": 0.9515, + "step": 7864 + }, + { + "epoch": 0.3922693266832918, + "grad_norm": 0.5767913895733391, + "learning_rate": 3.468262770994681e-05, + "loss": 0.8856, + "step": 7865 + }, + { + "epoch": 0.39231920199501247, + "grad_norm": 1.4436385924368254, + "learning_rate": 3.467890432626018e-05, + "loss": 0.9254, + "step": 7866 + }, + { + "epoch": 0.39236907730673315, + "grad_norm": 0.4255959601817161, + "learning_rate": 3.4675180690006864e-05, + "loss": 0.9353, + "step": 7867 + }, + { + "epoch": 0.3924189526184539, + "grad_norm": 0.5154424389673654, + "learning_rate": 3.467145680128404e-05, + "loss": 0.8904, + "step": 7868 + }, + { + "epoch": 0.39246882793017457, + "grad_norm": 0.4399537409737274, + "learning_rate": 3.466773266018888e-05, + "loss": 0.8945, + "step": 7869 + }, + { + "epoch": 0.39251870324189525, + "grad_norm": 0.6195972412761644, + "learning_rate": 3.4664008266818564e-05, + "loss": 0.9101, + "step": 7870 + }, + { + "epoch": 0.392568578553616, + "grad_norm": 0.4563667724055507, + "learning_rate": 3.466028362127027e-05, + "loss": 0.9526, + "step": 7871 + }, + { + "epoch": 0.39261845386533667, + "grad_norm": 0.49164311976710434, + "learning_rate": 3.465655872364121e-05, + "loss": 0.9073, + "step": 7872 + }, + { + "epoch": 0.39266832917705735, + "grad_norm": 0.5561020148280337, + "learning_rate": 3.465283357402857e-05, + "loss": 0.9129, + "step": 7873 + }, + { + "epoch": 0.39271820448877803, + "grad_norm": 0.3748295395347255, + "learning_rate": 3.4649108172529554e-05, + "loss": 0.8323, + "step": 7874 + }, + { + "epoch": 0.39276807980049877, + "grad_norm": 0.49254725990103526, + "learning_rate": 3.464538251924138e-05, + "loss": 0.9016, + "step": 7875 + }, + { + "epoch": 0.39281795511221945, + "grad_norm": 0.4922844231166661, + "learning_rate": 3.4641656614261257e-05, + "loss": 0.8894, + "step": 7876 + }, + { + "epoch": 0.39286783042394013, + "grad_norm": 0.38399054442089015, + "learning_rate": 3.463793045768643e-05, + "loss": 0.8778, + "step": 7877 + }, + { + "epoch": 0.39291770573566087, + "grad_norm": 0.4414248784795501, + "learning_rate": 3.4634204049614113e-05, + "loss": 0.8897, + "step": 7878 + }, + { + "epoch": 0.39296758104738155, + "grad_norm": 0.5658782284856111, + "learning_rate": 3.4630477390141556e-05, + "loss": 0.9324, + "step": 7879 + }, + { + "epoch": 0.39301745635910224, + "grad_norm": 0.4041395405944319, + "learning_rate": 3.4626750479366006e-05, + "loss": 0.9327, + "step": 7880 + }, + { + "epoch": 0.3930673316708229, + "grad_norm": 0.5830210200235912, + "learning_rate": 3.462302331738471e-05, + "loss": 0.8581, + "step": 7881 + }, + { + "epoch": 0.39311720698254365, + "grad_norm": 0.46725278604402815, + "learning_rate": 3.4619295904294924e-05, + "loss": 0.9241, + "step": 7882 + }, + { + "epoch": 0.39316708229426434, + "grad_norm": 0.4150435994422176, + "learning_rate": 3.461556824019392e-05, + "loss": 0.8812, + "step": 7883 + }, + { + "epoch": 0.393216957605985, + "grad_norm": 0.47499739497059584, + "learning_rate": 3.461184032517896e-05, + "loss": 0.9283, + "step": 7884 + }, + { + "epoch": 0.39326683291770576, + "grad_norm": 0.537038915483923, + "learning_rate": 3.460811215934734e-05, + "loss": 0.8761, + "step": 7885 + }, + { + "epoch": 0.39331670822942644, + "grad_norm": 0.4116141172534491, + "learning_rate": 3.4604383742796324e-05, + "loss": 0.8853, + "step": 7886 + }, + { + "epoch": 0.3933665835411471, + "grad_norm": 0.43887503148997037, + "learning_rate": 3.460065507562321e-05, + "loss": 0.9007, + "step": 7887 + }, + { + "epoch": 0.39341645885286786, + "grad_norm": 0.48557709225032947, + "learning_rate": 3.459692615792531e-05, + "loss": 0.8873, + "step": 7888 + }, + { + "epoch": 0.39346633416458854, + "grad_norm": 0.3878368843902676, + "learning_rate": 3.459319698979991e-05, + "loss": 0.9548, + "step": 7889 + }, + { + "epoch": 0.3935162094763092, + "grad_norm": 0.4757795051498241, + "learning_rate": 3.4589467571344324e-05, + "loss": 0.9006, + "step": 7890 + }, + { + "epoch": 0.3935660847880299, + "grad_norm": 0.48773940680396216, + "learning_rate": 3.458573790265588e-05, + "loss": 0.9112, + "step": 7891 + }, + { + "epoch": 0.39361596009975064, + "grad_norm": 0.546675421248906, + "learning_rate": 3.458200798383189e-05, + "loss": 0.8569, + "step": 7892 + }, + { + "epoch": 0.3936658354114713, + "grad_norm": 0.7875665551000388, + "learning_rate": 3.457827781496969e-05, + "loss": 0.9096, + "step": 7893 + }, + { + "epoch": 0.393715710723192, + "grad_norm": 1.1899734935284603, + "learning_rate": 3.457454739616662e-05, + "loss": 0.892, + "step": 7894 + }, + { + "epoch": 0.39376558603491274, + "grad_norm": 0.7683416898364499, + "learning_rate": 3.457081672752002e-05, + "loss": 0.8965, + "step": 7895 + }, + { + "epoch": 0.3938154613466334, + "grad_norm": 0.475873913920165, + "learning_rate": 3.456708580912725e-05, + "loss": 0.9115, + "step": 7896 + }, + { + "epoch": 0.3938653366583541, + "grad_norm": 0.4388766935857258, + "learning_rate": 3.456335464108565e-05, + "loss": 0.895, + "step": 7897 + }, + { + "epoch": 0.3939152119700748, + "grad_norm": 0.5402743330739138, + "learning_rate": 3.455962322349259e-05, + "loss": 0.9075, + "step": 7898 + }, + { + "epoch": 0.3939650872817955, + "grad_norm": 0.46754229730862573, + "learning_rate": 3.4555891556445434e-05, + "loss": 0.8949, + "step": 7899 + }, + { + "epoch": 0.3940149625935162, + "grad_norm": 0.46622910113953586, + "learning_rate": 3.4552159640041573e-05, + "loss": 0.9021, + "step": 7900 + }, + { + "epoch": 0.3940648379052369, + "grad_norm": 0.5196160416549129, + "learning_rate": 3.454842747437838e-05, + "loss": 0.9248, + "step": 7901 + }, + { + "epoch": 0.3941147132169576, + "grad_norm": 0.6332225926429781, + "learning_rate": 3.454469505955324e-05, + "loss": 0.9027, + "step": 7902 + }, + { + "epoch": 0.3941645885286783, + "grad_norm": 0.47375074555372476, + "learning_rate": 3.454096239566356e-05, + "loss": 0.8759, + "step": 7903 + }, + { + "epoch": 0.394214463840399, + "grad_norm": 0.7214361116389724, + "learning_rate": 3.453722948280673e-05, + "loss": 0.8984, + "step": 7904 + }, + { + "epoch": 0.3942643391521197, + "grad_norm": 0.44725330151103887, + "learning_rate": 3.453349632108017e-05, + "loss": 0.9047, + "step": 7905 + }, + { + "epoch": 0.3943142144638404, + "grad_norm": 0.4300596715250996, + "learning_rate": 3.452976291058129e-05, + "loss": 0.9071, + "step": 7906 + }, + { + "epoch": 0.3943640897755611, + "grad_norm": 0.4483741670200443, + "learning_rate": 3.452602925140751e-05, + "loss": 0.8777, + "step": 7907 + }, + { + "epoch": 0.39441396508728177, + "grad_norm": 0.47072106221290433, + "learning_rate": 3.452229534365627e-05, + "loss": 0.9175, + "step": 7908 + }, + { + "epoch": 0.3944638403990025, + "grad_norm": 0.4789417673928335, + "learning_rate": 3.451856118742498e-05, + "loss": 0.8986, + "step": 7909 + }, + { + "epoch": 0.3945137157107232, + "grad_norm": 0.40772744622542, + "learning_rate": 3.451482678281111e-05, + "loss": 0.8978, + "step": 7910 + }, + { + "epoch": 0.39456359102244387, + "grad_norm": 0.5590616652579276, + "learning_rate": 3.451109212991207e-05, + "loss": 0.9046, + "step": 7911 + }, + { + "epoch": 0.3946134663341646, + "grad_norm": 0.548269801898336, + "learning_rate": 3.450735722882536e-05, + "loss": 0.936, + "step": 7912 + }, + { + "epoch": 0.3946633416458853, + "grad_norm": 0.430631590897758, + "learning_rate": 3.4503622079648405e-05, + "loss": 0.8921, + "step": 7913 + }, + { + "epoch": 0.394713216957606, + "grad_norm": 0.4222194276250044, + "learning_rate": 3.449988668247869e-05, + "loss": 0.8728, + "step": 7914 + }, + { + "epoch": 0.3947630922693267, + "grad_norm": 0.6168120579659474, + "learning_rate": 3.4496151037413685e-05, + "loss": 0.9401, + "step": 7915 + }, + { + "epoch": 0.3948129675810474, + "grad_norm": 0.47616891094594715, + "learning_rate": 3.4492415144550865e-05, + "loss": 0.9032, + "step": 7916 + }, + { + "epoch": 0.3948628428927681, + "grad_norm": 0.5241942596782174, + "learning_rate": 3.4488679003987726e-05, + "loss": 0.9238, + "step": 7917 + }, + { + "epoch": 0.39491271820448876, + "grad_norm": 0.5398355617789249, + "learning_rate": 3.448494261582175e-05, + "loss": 0.8841, + "step": 7918 + }, + { + "epoch": 0.3949625935162095, + "grad_norm": 0.5603616953490955, + "learning_rate": 3.448120598015044e-05, + "loss": 0.8921, + "step": 7919 + }, + { + "epoch": 0.3950124688279302, + "grad_norm": 0.4479360996278005, + "learning_rate": 3.4477469097071316e-05, + "loss": 0.9158, + "step": 7920 + }, + { + "epoch": 0.39506234413965086, + "grad_norm": 0.6537339945994776, + "learning_rate": 3.447373196668187e-05, + "loss": 0.9216, + "step": 7921 + }, + { + "epoch": 0.3951122194513716, + "grad_norm": 0.5045363837346358, + "learning_rate": 3.446999458907962e-05, + "loss": 0.9148, + "step": 7922 + }, + { + "epoch": 0.3951620947630923, + "grad_norm": 0.4996509764266929, + "learning_rate": 3.4466256964362107e-05, + "loss": 0.9371, + "step": 7923 + }, + { + "epoch": 0.39521197007481296, + "grad_norm": 0.5199053630690817, + "learning_rate": 3.446251909262685e-05, + "loss": 0.9039, + "step": 7924 + }, + { + "epoch": 0.39526184538653364, + "grad_norm": 0.49440384940005494, + "learning_rate": 3.445878097397139e-05, + "loss": 0.9152, + "step": 7925 + }, + { + "epoch": 0.3953117206982544, + "grad_norm": 0.5440589108818779, + "learning_rate": 3.445504260849328e-05, + "loss": 0.9199, + "step": 7926 + }, + { + "epoch": 0.39536159600997506, + "grad_norm": 0.43384444325492494, + "learning_rate": 3.445130399629007e-05, + "loss": 0.9116, + "step": 7927 + }, + { + "epoch": 0.39541147132169574, + "grad_norm": 0.5141184201124597, + "learning_rate": 3.44475651374593e-05, + "loss": 0.856, + "step": 7928 + }, + { + "epoch": 0.3954613466334165, + "grad_norm": 0.5356304020850813, + "learning_rate": 3.444382603209856e-05, + "loss": 0.8873, + "step": 7929 + }, + { + "epoch": 0.39551122194513716, + "grad_norm": 0.4757626173572635, + "learning_rate": 3.444008668030539e-05, + "loss": 0.8824, + "step": 7930 + }, + { + "epoch": 0.39556109725685784, + "grad_norm": 0.4569224069850221, + "learning_rate": 3.44363470821774e-05, + "loss": 0.9015, + "step": 7931 + }, + { + "epoch": 0.3956109725685786, + "grad_norm": 0.5183027775371872, + "learning_rate": 3.443260723781214e-05, + "loss": 0.9145, + "step": 7932 + }, + { + "epoch": 0.39566084788029926, + "grad_norm": 0.44947841414260875, + "learning_rate": 3.4428867147307223e-05, + "loss": 0.8976, + "step": 7933 + }, + { + "epoch": 0.39571072319201994, + "grad_norm": 0.394896475671188, + "learning_rate": 3.4425126810760247e-05, + "loss": 0.8805, + "step": 7934 + }, + { + "epoch": 0.3957605985037406, + "grad_norm": 0.41600560046843765, + "learning_rate": 3.442138622826879e-05, + "loss": 0.9034, + "step": 7935 + }, + { + "epoch": 0.39581047381546136, + "grad_norm": 0.5489968178812291, + "learning_rate": 3.441764539993048e-05, + "loss": 0.8961, + "step": 7936 + }, + { + "epoch": 0.39586034912718204, + "grad_norm": 0.4312613023853069, + "learning_rate": 3.4413904325842926e-05, + "loss": 0.937, + "step": 7937 + }, + { + "epoch": 0.3959102244389027, + "grad_norm": 0.4291994642827579, + "learning_rate": 3.441016300610376e-05, + "loss": 0.8813, + "step": 7938 + }, + { + "epoch": 0.39596009975062346, + "grad_norm": 0.4369999855000122, + "learning_rate": 3.4406421440810597e-05, + "loss": 0.9022, + "step": 7939 + }, + { + "epoch": 0.39600997506234414, + "grad_norm": 0.4766510149988699, + "learning_rate": 3.440267963006107e-05, + "loss": 0.8939, + "step": 7940 + }, + { + "epoch": 0.3960598503740648, + "grad_norm": 0.4562704562758714, + "learning_rate": 3.439893757395283e-05, + "loss": 0.9133, + "step": 7941 + }, + { + "epoch": 0.39610972568578556, + "grad_norm": 0.6076506072256178, + "learning_rate": 3.439519527258353e-05, + "loss": 0.901, + "step": 7942 + }, + { + "epoch": 0.39615960099750624, + "grad_norm": 0.48798269900556507, + "learning_rate": 3.43914527260508e-05, + "loss": 0.9412, + "step": 7943 + }, + { + "epoch": 0.3962094763092269, + "grad_norm": 0.38433909546181866, + "learning_rate": 3.4387709934452325e-05, + "loss": 0.8638, + "step": 7944 + }, + { + "epoch": 0.3962593516209476, + "grad_norm": 0.525913982518703, + "learning_rate": 3.438396689788576e-05, + "loss": 0.9287, + "step": 7945 + }, + { + "epoch": 0.39630922693266835, + "grad_norm": 0.5355414750922708, + "learning_rate": 3.438022361644877e-05, + "loss": 0.8845, + "step": 7946 + }, + { + "epoch": 0.396359102244389, + "grad_norm": 0.5356803221069868, + "learning_rate": 3.437648009023905e-05, + "loss": 0.9344, + "step": 7947 + }, + { + "epoch": 0.3964089775561097, + "grad_norm": 0.4279774380351375, + "learning_rate": 3.437273631935427e-05, + "loss": 0.8994, + "step": 7948 + }, + { + "epoch": 0.39645885286783045, + "grad_norm": 0.5236063164519204, + "learning_rate": 3.4368992303892146e-05, + "loss": 0.8745, + "step": 7949 + }, + { + "epoch": 0.39650872817955113, + "grad_norm": 0.417586585442857, + "learning_rate": 3.436524804395035e-05, + "loss": 0.9102, + "step": 7950 + }, + { + "epoch": 0.3965586034912718, + "grad_norm": 0.48542461521050345, + "learning_rate": 3.436150353962659e-05, + "loss": 0.9161, + "step": 7951 + }, + { + "epoch": 0.3966084788029925, + "grad_norm": 0.5343472755818374, + "learning_rate": 3.43577587910186e-05, + "loss": 0.8937, + "step": 7952 + }, + { + "epoch": 0.39665835411471323, + "grad_norm": 2.7482292706934652, + "learning_rate": 3.435401379822407e-05, + "loss": 0.94, + "step": 7953 + }, + { + "epoch": 0.3967082294264339, + "grad_norm": 0.5077478982623724, + "learning_rate": 3.435026856134075e-05, + "loss": 0.9374, + "step": 7954 + }, + { + "epoch": 0.3967581047381546, + "grad_norm": 0.4164912104335147, + "learning_rate": 3.4346523080466345e-05, + "loss": 0.9066, + "step": 7955 + }, + { + "epoch": 0.39680798004987533, + "grad_norm": 0.5710221757309901, + "learning_rate": 3.434277735569861e-05, + "loss": 0.8601, + "step": 7956 + }, + { + "epoch": 0.396857855361596, + "grad_norm": 0.5118462927655724, + "learning_rate": 3.433903138713527e-05, + "loss": 0.9407, + "step": 7957 + }, + { + "epoch": 0.3969077306733167, + "grad_norm": 0.4754939233519486, + "learning_rate": 3.43352851748741e-05, + "loss": 0.9215, + "step": 7958 + }, + { + "epoch": 0.39695760598503743, + "grad_norm": 0.43250456699998713, + "learning_rate": 3.4331538719012834e-05, + "loss": 0.9025, + "step": 7959 + }, + { + "epoch": 0.3970074812967581, + "grad_norm": 0.45201527068611386, + "learning_rate": 3.432779201964924e-05, + "loss": 0.9154, + "step": 7960 + }, + { + "epoch": 0.3970573566084788, + "grad_norm": 0.4580755361938983, + "learning_rate": 3.432404507688109e-05, + "loss": 0.8768, + "step": 7961 + }, + { + "epoch": 0.3971072319201995, + "grad_norm": 0.45772632453792156, + "learning_rate": 3.432029789080616e-05, + "loss": 0.8897, + "step": 7962 + }, + { + "epoch": 0.3971571072319202, + "grad_norm": 0.44117455924488497, + "learning_rate": 3.4316550461522225e-05, + "loss": 0.9033, + "step": 7963 + }, + { + "epoch": 0.3972069825436409, + "grad_norm": 0.4188910086405368, + "learning_rate": 3.431280278912707e-05, + "loss": 0.8739, + "step": 7964 + }, + { + "epoch": 0.3972568578553616, + "grad_norm": 0.44139327281645424, + "learning_rate": 3.4309054873718504e-05, + "loss": 0.8916, + "step": 7965 + }, + { + "epoch": 0.3973067331670823, + "grad_norm": 0.4961092976310501, + "learning_rate": 3.430530671539431e-05, + "loss": 0.8662, + "step": 7966 + }, + { + "epoch": 0.397356608478803, + "grad_norm": 0.5659139195505083, + "learning_rate": 3.43015583142523e-05, + "loss": 0.939, + "step": 7967 + }, + { + "epoch": 0.3974064837905237, + "grad_norm": 0.6122660390713468, + "learning_rate": 3.4297809670390305e-05, + "loss": 0.8915, + "step": 7968 + }, + { + "epoch": 0.3974563591022444, + "grad_norm": 0.4358925087678149, + "learning_rate": 3.4294060783906114e-05, + "loss": 0.8777, + "step": 7969 + }, + { + "epoch": 0.3975062344139651, + "grad_norm": 0.4883174628594094, + "learning_rate": 3.429031165489756e-05, + "loss": 0.9604, + "step": 7970 + }, + { + "epoch": 0.3975561097256858, + "grad_norm": 0.44468009886822146, + "learning_rate": 3.428656228346249e-05, + "loss": 0.9446, + "step": 7971 + }, + { + "epoch": 0.39760598503740646, + "grad_norm": 0.7219010030881811, + "learning_rate": 3.4282812669698735e-05, + "loss": 0.8772, + "step": 7972 + }, + { + "epoch": 0.3976558603491272, + "grad_norm": 0.44731647122641954, + "learning_rate": 3.427906281370414e-05, + "loss": 0.9228, + "step": 7973 + }, + { + "epoch": 0.3977057356608479, + "grad_norm": 0.44054108653135626, + "learning_rate": 3.4275312715576544e-05, + "loss": 0.8731, + "step": 7974 + }, + { + "epoch": 0.39775561097256856, + "grad_norm": 0.46926859647369495, + "learning_rate": 3.427156237541382e-05, + "loss": 0.9192, + "step": 7975 + }, + { + "epoch": 0.3978054862842893, + "grad_norm": 0.4515870176554284, + "learning_rate": 3.426781179331382e-05, + "loss": 0.9236, + "step": 7976 + }, + { + "epoch": 0.39785536159601, + "grad_norm": 0.4995612928367672, + "learning_rate": 3.426406096937442e-05, + "loss": 0.9167, + "step": 7977 + }, + { + "epoch": 0.39790523690773066, + "grad_norm": 0.534941749223683, + "learning_rate": 3.42603099036935e-05, + "loss": 0.9305, + "step": 7978 + }, + { + "epoch": 0.39795511221945135, + "grad_norm": 0.41346318469665644, + "learning_rate": 3.425655859636894e-05, + "loss": 0.8917, + "step": 7979 + }, + { + "epoch": 0.3980049875311721, + "grad_norm": 0.4280732455292514, + "learning_rate": 3.4252807047498624e-05, + "loss": 0.903, + "step": 7980 + }, + { + "epoch": 0.39805486284289276, + "grad_norm": 0.472025741258737, + "learning_rate": 3.424905525718044e-05, + "loss": 0.9599, + "step": 7981 + }, + { + "epoch": 0.39810473815461345, + "grad_norm": 0.4810842269611108, + "learning_rate": 3.424530322551231e-05, + "loss": 0.9384, + "step": 7982 + }, + { + "epoch": 0.3981546134663342, + "grad_norm": 0.498749296829087, + "learning_rate": 3.424155095259212e-05, + "loss": 0.9003, + "step": 7983 + }, + { + "epoch": 0.39820448877805487, + "grad_norm": 0.38305541123794046, + "learning_rate": 3.423779843851781e-05, + "loss": 0.879, + "step": 7984 + }, + { + "epoch": 0.39825436408977555, + "grad_norm": 0.5404605764078693, + "learning_rate": 3.423404568338726e-05, + "loss": 0.9173, + "step": 7985 + }, + { + "epoch": 0.3983042394014963, + "grad_norm": 0.5081004462700217, + "learning_rate": 3.423029268729844e-05, + "loss": 0.9553, + "step": 7986 + }, + { + "epoch": 0.39835411471321697, + "grad_norm": 0.37727255405905297, + "learning_rate": 3.422653945034925e-05, + "loss": 0.8788, + "step": 7987 + }, + { + "epoch": 0.39840399002493765, + "grad_norm": 0.5650514764754769, + "learning_rate": 3.422278597263765e-05, + "loss": 0.8452, + "step": 7988 + }, + { + "epoch": 0.39845386533665833, + "grad_norm": 0.8954683574182656, + "learning_rate": 3.421903225426158e-05, + "loss": 0.8924, + "step": 7989 + }, + { + "epoch": 0.39850374064837907, + "grad_norm": 0.4657311309765746, + "learning_rate": 3.421527829531898e-05, + "loss": 0.8723, + "step": 7990 + }, + { + "epoch": 0.39855361596009975, + "grad_norm": 0.41713630188410666, + "learning_rate": 3.421152409590783e-05, + "loss": 0.903, + "step": 7991 + }, + { + "epoch": 0.39860349127182043, + "grad_norm": 0.7151280081604573, + "learning_rate": 3.420776965612606e-05, + "loss": 0.9388, + "step": 7992 + }, + { + "epoch": 0.39865336658354117, + "grad_norm": 0.6605176338214056, + "learning_rate": 3.4204014976071686e-05, + "loss": 0.947, + "step": 7993 + }, + { + "epoch": 0.39870324189526185, + "grad_norm": 0.500904891244006, + "learning_rate": 3.420026005584265e-05, + "loss": 0.9471, + "step": 7994 + }, + { + "epoch": 0.39875311720698253, + "grad_norm": 0.4380337639722819, + "learning_rate": 3.419650489553695e-05, + "loss": 0.9244, + "step": 7995 + }, + { + "epoch": 0.3988029925187032, + "grad_norm": 0.4991360281197537, + "learning_rate": 3.419274949525256e-05, + "loss": 0.9102, + "step": 7996 + }, + { + "epoch": 0.39885286783042395, + "grad_norm": 0.8914781134988702, + "learning_rate": 3.4188993855087494e-05, + "loss": 0.8778, + "step": 7997 + }, + { + "epoch": 0.39890274314214463, + "grad_norm": 0.5080468983646584, + "learning_rate": 3.418523797513974e-05, + "loss": 0.8879, + "step": 7998 + }, + { + "epoch": 0.3989526184538653, + "grad_norm": 0.5029119118327201, + "learning_rate": 3.418148185550732e-05, + "loss": 0.9651, + "step": 7999 + }, + { + "epoch": 0.39900249376558605, + "grad_norm": 0.5893928241840191, + "learning_rate": 3.417772549628823e-05, + "loss": 0.9272, + "step": 8000 + }, + { + "epoch": 0.39905236907730673, + "grad_norm": 0.4740027378166851, + "learning_rate": 3.4173968897580513e-05, + "loss": 0.9126, + "step": 8001 + }, + { + "epoch": 0.3991022443890274, + "grad_norm": 0.5096820271804949, + "learning_rate": 3.417021205948217e-05, + "loss": 0.9416, + "step": 8002 + }, + { + "epoch": 0.39915211970074815, + "grad_norm": 0.44173311502804125, + "learning_rate": 3.416645498209125e-05, + "loss": 0.8821, + "step": 8003 + }, + { + "epoch": 0.39920199501246884, + "grad_norm": 0.45703909183520836, + "learning_rate": 3.416269766550581e-05, + "loss": 0.8954, + "step": 8004 + }, + { + "epoch": 0.3992518703241895, + "grad_norm": 0.5557210870211038, + "learning_rate": 3.415894010982385e-05, + "loss": 0.8903, + "step": 8005 + }, + { + "epoch": 0.3993017456359102, + "grad_norm": 0.49341640021895417, + "learning_rate": 3.415518231514346e-05, + "loss": 0.885, + "step": 8006 + }, + { + "epoch": 0.39935162094763094, + "grad_norm": 0.42231917820991943, + "learning_rate": 3.4151424281562675e-05, + "loss": 0.9111, + "step": 8007 + }, + { + "epoch": 0.3994014962593516, + "grad_norm": 0.4265558860814456, + "learning_rate": 3.414766600917958e-05, + "loss": 0.8937, + "step": 8008 + }, + { + "epoch": 0.3994513715710723, + "grad_norm": 0.4242628309330586, + "learning_rate": 3.4143907498092225e-05, + "loss": 0.8975, + "step": 8009 + }, + { + "epoch": 0.39950124688279304, + "grad_norm": 0.7225617126416471, + "learning_rate": 3.4140148748398705e-05, + "loss": 0.8964, + "step": 8010 + }, + { + "epoch": 0.3995511221945137, + "grad_norm": 0.4552552066009713, + "learning_rate": 3.4136389760197085e-05, + "loss": 0.8964, + "step": 8011 + }, + { + "epoch": 0.3996009975062344, + "grad_norm": 0.5558047558858392, + "learning_rate": 3.413263053358547e-05, + "loss": 0.8896, + "step": 8012 + }, + { + "epoch": 0.39965087281795514, + "grad_norm": 0.4240998606327764, + "learning_rate": 3.412887106866194e-05, + "loss": 0.9036, + "step": 8013 + }, + { + "epoch": 0.3997007481296758, + "grad_norm": 0.48295268801342445, + "learning_rate": 3.412511136552461e-05, + "loss": 0.8891, + "step": 8014 + }, + { + "epoch": 0.3997506234413965, + "grad_norm": 0.39308103930042554, + "learning_rate": 3.4121351424271594e-05, + "loss": 0.8837, + "step": 8015 + }, + { + "epoch": 0.3998004987531172, + "grad_norm": 0.42374529101806874, + "learning_rate": 3.411759124500098e-05, + "loss": 0.8782, + "step": 8016 + }, + { + "epoch": 0.3998503740648379, + "grad_norm": 0.4526213246073477, + "learning_rate": 3.4113830827810904e-05, + "loss": 0.9145, + "step": 8017 + }, + { + "epoch": 0.3999002493765586, + "grad_norm": 0.45785406484186153, + "learning_rate": 3.4110070172799494e-05, + "loss": 0.8885, + "step": 8018 + }, + { + "epoch": 0.3999501246882793, + "grad_norm": 0.6853054004716724, + "learning_rate": 3.4106309280064885e-05, + "loss": 0.9291, + "step": 8019 + }, + { + "epoch": 0.4, + "grad_norm": 0.4971668997637325, + "learning_rate": 3.4102548149705205e-05, + "loss": 0.9142, + "step": 8020 + }, + { + "epoch": 0.4000498753117207, + "grad_norm": 0.43030179286041415, + "learning_rate": 3.40987867818186e-05, + "loss": 0.8931, + "step": 8021 + }, + { + "epoch": 0.4000997506234414, + "grad_norm": 1.2040087717476329, + "learning_rate": 3.4095025176503234e-05, + "loss": 0.9146, + "step": 8022 + }, + { + "epoch": 0.40014962593516207, + "grad_norm": 0.47397891520140567, + "learning_rate": 3.409126333385726e-05, + "loss": 0.935, + "step": 8023 + }, + { + "epoch": 0.4001995012468828, + "grad_norm": 0.5864682195146584, + "learning_rate": 3.408750125397883e-05, + "loss": 0.9513, + "step": 8024 + }, + { + "epoch": 0.4002493765586035, + "grad_norm": 0.4406365628363469, + "learning_rate": 3.4083738936966126e-05, + "loss": 0.8896, + "step": 8025 + }, + { + "epoch": 0.40029925187032417, + "grad_norm": 0.44322851372508476, + "learning_rate": 3.407997638291731e-05, + "loss": 0.9446, + "step": 8026 + }, + { + "epoch": 0.4003491271820449, + "grad_norm": 0.4711866596405232, + "learning_rate": 3.407621359193059e-05, + "loss": 0.9182, + "step": 8027 + }, + { + "epoch": 0.4003990024937656, + "grad_norm": 0.3551900788098352, + "learning_rate": 3.407245056410412e-05, + "loss": 0.8393, + "step": 8028 + }, + { + "epoch": 0.40044887780548627, + "grad_norm": 0.5168515169501993, + "learning_rate": 3.406868729953613e-05, + "loss": 0.9067, + "step": 8029 + }, + { + "epoch": 0.400498753117207, + "grad_norm": 0.4550402205318753, + "learning_rate": 3.4064923798324795e-05, + "loss": 0.9244, + "step": 8030 + }, + { + "epoch": 0.4005486284289277, + "grad_norm": 0.4214902924369306, + "learning_rate": 3.406116006056833e-05, + "loss": 0.8872, + "step": 8031 + }, + { + "epoch": 0.40059850374064837, + "grad_norm": 0.8167667833385573, + "learning_rate": 3.4057396086364956e-05, + "loss": 0.9559, + "step": 8032 + }, + { + "epoch": 0.40064837905236905, + "grad_norm": 0.41146688509839063, + "learning_rate": 3.405363187581288e-05, + "loss": 0.9121, + "step": 8033 + }, + { + "epoch": 0.4006982543640898, + "grad_norm": 0.36304000234985273, + "learning_rate": 3.4049867429010326e-05, + "loss": 0.899, + "step": 8034 + }, + { + "epoch": 0.40074812967581047, + "grad_norm": 0.41365172907311865, + "learning_rate": 3.404610274605554e-05, + "loss": 0.8977, + "step": 8035 + }, + { + "epoch": 0.40079800498753115, + "grad_norm": 0.4002376124028223, + "learning_rate": 3.404233782704674e-05, + "loss": 0.8591, + "step": 8036 + }, + { + "epoch": 0.4008478802992519, + "grad_norm": 0.46766903362891205, + "learning_rate": 3.4038572672082195e-05, + "loss": 0.8791, + "step": 8037 + }, + { + "epoch": 0.4008977556109726, + "grad_norm": 0.4632130867857419, + "learning_rate": 3.403480728126014e-05, + "loss": 0.9315, + "step": 8038 + }, + { + "epoch": 0.40094763092269325, + "grad_norm": 0.5549978106232016, + "learning_rate": 3.403104165467883e-05, + "loss": 0.8913, + "step": 8039 + }, + { + "epoch": 0.400997506234414, + "grad_norm": 0.3713399109547182, + "learning_rate": 3.402727579243653e-05, + "loss": 0.8907, + "step": 8040 + }, + { + "epoch": 0.4010473815461347, + "grad_norm": 0.43270599548051225, + "learning_rate": 3.40235096946315e-05, + "loss": 0.9211, + "step": 8041 + }, + { + "epoch": 0.40109725685785536, + "grad_norm": 0.3882979090793143, + "learning_rate": 3.401974336136203e-05, + "loss": 0.9351, + "step": 8042 + }, + { + "epoch": 0.40114713216957604, + "grad_norm": 0.3845558323647559, + "learning_rate": 3.4015976792726394e-05, + "loss": 0.9305, + "step": 8043 + }, + { + "epoch": 0.4011970074812968, + "grad_norm": 0.4247570188442145, + "learning_rate": 3.401220998882287e-05, + "loss": 0.9177, + "step": 8044 + }, + { + "epoch": 0.40124688279301746, + "grad_norm": 0.9329151393409475, + "learning_rate": 3.400844294974977e-05, + "loss": 0.9326, + "step": 8045 + }, + { + "epoch": 0.40129675810473814, + "grad_norm": 0.4626846061042043, + "learning_rate": 3.400467567560538e-05, + "loss": 0.8917, + "step": 8046 + }, + { + "epoch": 0.4013466334164589, + "grad_norm": 0.4994348689030482, + "learning_rate": 3.400090816648801e-05, + "loss": 0.9013, + "step": 8047 + }, + { + "epoch": 0.40139650872817956, + "grad_norm": 0.45214901702818106, + "learning_rate": 3.3997140422495965e-05, + "loss": 0.9014, + "step": 8048 + }, + { + "epoch": 0.40144638403990024, + "grad_norm": 0.6896539636897236, + "learning_rate": 3.3993372443727576e-05, + "loss": 0.9016, + "step": 8049 + }, + { + "epoch": 0.4014962593516209, + "grad_norm": 0.7575164184907144, + "learning_rate": 3.398960423028115e-05, + "loss": 0.9302, + "step": 8050 + }, + { + "epoch": 0.40154613466334166, + "grad_norm": 0.4957405591222772, + "learning_rate": 3.398583578225502e-05, + "loss": 0.9112, + "step": 8051 + }, + { + "epoch": 0.40159600997506234, + "grad_norm": 0.46074915081675055, + "learning_rate": 3.3982067099747546e-05, + "loss": 0.8921, + "step": 8052 + }, + { + "epoch": 0.401645885286783, + "grad_norm": 0.4481838523910595, + "learning_rate": 3.397829818285704e-05, + "loss": 0.8741, + "step": 8053 + }, + { + "epoch": 0.40169576059850376, + "grad_norm": 0.4273328685933678, + "learning_rate": 3.3974529031681855e-05, + "loss": 0.886, + "step": 8054 + }, + { + "epoch": 0.40174563591022444, + "grad_norm": 0.3724576826598911, + "learning_rate": 3.397075964632036e-05, + "loss": 0.9018, + "step": 8055 + }, + { + "epoch": 0.4017955112219451, + "grad_norm": 0.3791974732482526, + "learning_rate": 3.39669900268709e-05, + "loss": 0.8752, + "step": 8056 + }, + { + "epoch": 0.40184538653366586, + "grad_norm": 0.442222541557268, + "learning_rate": 3.3963220173431854e-05, + "loss": 0.8795, + "step": 8057 + }, + { + "epoch": 0.40189526184538654, + "grad_norm": 0.46569585683187265, + "learning_rate": 3.3959450086101586e-05, + "loss": 0.9184, + "step": 8058 + }, + { + "epoch": 0.4019451371571072, + "grad_norm": 0.5873839977073281, + "learning_rate": 3.3955679764978474e-05, + "loss": 0.8864, + "step": 8059 + }, + { + "epoch": 0.4019950124688279, + "grad_norm": 0.4241632063709943, + "learning_rate": 3.3951909210160916e-05, + "loss": 0.9397, + "step": 8060 + }, + { + "epoch": 0.40204488778054864, + "grad_norm": 0.47045958763385587, + "learning_rate": 3.3948138421747284e-05, + "loss": 0.9197, + "step": 8061 + }, + { + "epoch": 0.4020947630922693, + "grad_norm": 0.3557829407406406, + "learning_rate": 3.394436739983599e-05, + "loss": 0.8615, + "step": 8062 + }, + { + "epoch": 0.40214463840399, + "grad_norm": 0.41386765231570005, + "learning_rate": 3.3940596144525424e-05, + "loss": 0.894, + "step": 8063 + }, + { + "epoch": 0.40219451371571074, + "grad_norm": 0.42380514649161044, + "learning_rate": 3.393682465591402e-05, + "loss": 0.8897, + "step": 8064 + }, + { + "epoch": 0.4022443890274314, + "grad_norm": 0.45088528822068774, + "learning_rate": 3.393305293410016e-05, + "loss": 0.9522, + "step": 8065 + }, + { + "epoch": 0.4022942643391521, + "grad_norm": 0.44022997953564647, + "learning_rate": 3.392928097918229e-05, + "loss": 0.8774, + "step": 8066 + }, + { + "epoch": 0.40234413965087285, + "grad_norm": 0.47345534029946673, + "learning_rate": 3.392550879125882e-05, + "loss": 0.9462, + "step": 8067 + }, + { + "epoch": 0.4023940149625935, + "grad_norm": 0.367952102122364, + "learning_rate": 3.39217363704282e-05, + "loss": 0.8925, + "step": 8068 + }, + { + "epoch": 0.4024438902743142, + "grad_norm": 0.41320797350717386, + "learning_rate": 3.391796371678887e-05, + "loss": 0.8898, + "step": 8069 + }, + { + "epoch": 0.4024937655860349, + "grad_norm": 0.6390010480303991, + "learning_rate": 3.3914190830439254e-05, + "loss": 0.9257, + "step": 8070 + }, + { + "epoch": 0.40254364089775563, + "grad_norm": 0.3944091845864582, + "learning_rate": 3.391041771147782e-05, + "loss": 0.871, + "step": 8071 + }, + { + "epoch": 0.4025935162094763, + "grad_norm": 0.4505481760132356, + "learning_rate": 3.390664436000303e-05, + "loss": 0.9669, + "step": 8072 + }, + { + "epoch": 0.402643391521197, + "grad_norm": 0.5194211834230307, + "learning_rate": 3.390287077611334e-05, + "loss": 0.8978, + "step": 8073 + }, + { + "epoch": 0.40269326683291773, + "grad_norm": 0.45965310310444657, + "learning_rate": 3.3899096959907214e-05, + "loss": 0.9208, + "step": 8074 + }, + { + "epoch": 0.4027431421446384, + "grad_norm": 0.5364256125621455, + "learning_rate": 3.389532291148314e-05, + "loss": 0.9116, + "step": 8075 + }, + { + "epoch": 0.4027930174563591, + "grad_norm": 0.5126851934047202, + "learning_rate": 3.38915486309396e-05, + "loss": 0.8668, + "step": 8076 + }, + { + "epoch": 0.4028428927680798, + "grad_norm": 0.4774226300080874, + "learning_rate": 3.388777411837507e-05, + "loss": 0.8683, + "step": 8077 + }, + { + "epoch": 0.4028927680798005, + "grad_norm": 0.6457952050205321, + "learning_rate": 3.388399937388806e-05, + "loss": 0.9325, + "step": 8078 + }, + { + "epoch": 0.4029426433915212, + "grad_norm": 0.7075310279990771, + "learning_rate": 3.388022439757705e-05, + "loss": 0.9623, + "step": 8079 + }, + { + "epoch": 0.4029925187032419, + "grad_norm": 0.5352587057062896, + "learning_rate": 3.387644918954058e-05, + "loss": 0.897, + "step": 8080 + }, + { + "epoch": 0.4030423940149626, + "grad_norm": 0.4658798969164236, + "learning_rate": 3.387267374987712e-05, + "loss": 0.9268, + "step": 8081 + }, + { + "epoch": 0.4030922693266833, + "grad_norm": 0.5540196264576058, + "learning_rate": 3.3868898078685225e-05, + "loss": 0.8565, + "step": 8082 + }, + { + "epoch": 0.403142144638404, + "grad_norm": 0.38336130351360054, + "learning_rate": 3.386512217606339e-05, + "loss": 0.8966, + "step": 8083 + }, + { + "epoch": 0.4031920199501247, + "grad_norm": 0.5170752636657202, + "learning_rate": 3.386134604211017e-05, + "loss": 0.9258, + "step": 8084 + }, + { + "epoch": 0.4032418952618454, + "grad_norm": 0.5454211276387768, + "learning_rate": 3.385756967692408e-05, + "loss": 0.9096, + "step": 8085 + }, + { + "epoch": 0.4032917705735661, + "grad_norm": 0.5512316590834309, + "learning_rate": 3.385379308060369e-05, + "loss": 0.8555, + "step": 8086 + }, + { + "epoch": 0.40334164588528676, + "grad_norm": 0.4033161119548014, + "learning_rate": 3.3850016253247516e-05, + "loss": 0.8663, + "step": 8087 + }, + { + "epoch": 0.4033915211970075, + "grad_norm": 0.5004107326024617, + "learning_rate": 3.384623919495414e-05, + "loss": 0.9602, + "step": 8088 + }, + { + "epoch": 0.4034413965087282, + "grad_norm": 0.49222798590946704, + "learning_rate": 3.384246190582211e-05, + "loss": 0.9101, + "step": 8089 + }, + { + "epoch": 0.40349127182044886, + "grad_norm": 0.453365135974323, + "learning_rate": 3.383868438594998e-05, + "loss": 0.8678, + "step": 8090 + }, + { + "epoch": 0.4035411471321696, + "grad_norm": 0.46564115502007963, + "learning_rate": 3.383490663543635e-05, + "loss": 0.9263, + "step": 8091 + }, + { + "epoch": 0.4035910224438903, + "grad_norm": 0.4184948620035723, + "learning_rate": 3.383112865437979e-05, + "loss": 0.8723, + "step": 8092 + }, + { + "epoch": 0.40364089775561096, + "grad_norm": 0.43119903110492325, + "learning_rate": 3.382735044287887e-05, + "loss": 0.893, + "step": 8093 + }, + { + "epoch": 0.4036907730673317, + "grad_norm": 0.4006605031390638, + "learning_rate": 3.382357200103219e-05, + "loss": 0.8995, + "step": 8094 + }, + { + "epoch": 0.4037406483790524, + "grad_norm": 0.4055397338878644, + "learning_rate": 3.3819793328938345e-05, + "loss": 0.9309, + "step": 8095 + }, + { + "epoch": 0.40379052369077306, + "grad_norm": 0.41565374092732793, + "learning_rate": 3.3816014426695945e-05, + "loss": 0.9318, + "step": 8096 + }, + { + "epoch": 0.40384039900249374, + "grad_norm": 0.4864994108710663, + "learning_rate": 3.38122352944036e-05, + "loss": 0.8996, + "step": 8097 + }, + { + "epoch": 0.4038902743142145, + "grad_norm": 0.4518414607068196, + "learning_rate": 3.380845593215991e-05, + "loss": 0.94, + "step": 8098 + }, + { + "epoch": 0.40394014962593516, + "grad_norm": 0.47854380223181314, + "learning_rate": 3.3804676340063514e-05, + "loss": 0.9141, + "step": 8099 + }, + { + "epoch": 0.40399002493765584, + "grad_norm": 0.4579382155189333, + "learning_rate": 3.380089651821302e-05, + "loss": 0.9331, + "step": 8100 + }, + { + "epoch": 0.4040399002493766, + "grad_norm": 0.5463049852220723, + "learning_rate": 3.3797116466707076e-05, + "loss": 0.9283, + "step": 8101 + }, + { + "epoch": 0.40408977556109726, + "grad_norm": 0.41717687541688075, + "learning_rate": 3.379333618564431e-05, + "loss": 0.9019, + "step": 8102 + }, + { + "epoch": 0.40413965087281795, + "grad_norm": 1.3715122884686866, + "learning_rate": 3.378955567512337e-05, + "loss": 0.9329, + "step": 8103 + }, + { + "epoch": 0.40418952618453863, + "grad_norm": 0.4450959589700969, + "learning_rate": 3.378577493524292e-05, + "loss": 0.9271, + "step": 8104 + }, + { + "epoch": 0.40423940149625937, + "grad_norm": 0.5345060497580576, + "learning_rate": 3.3781993966101585e-05, + "loss": 0.8899, + "step": 8105 + }, + { + "epoch": 0.40428927680798005, + "grad_norm": 0.42879077114196007, + "learning_rate": 3.377821276779807e-05, + "loss": 0.8702, + "step": 8106 + }, + { + "epoch": 0.40433915211970073, + "grad_norm": 0.45723768462584885, + "learning_rate": 3.377443134043101e-05, + "loss": 0.9077, + "step": 8107 + }, + { + "epoch": 0.40438902743142147, + "grad_norm": 0.7889562299531235, + "learning_rate": 3.377064968409909e-05, + "loss": 0.9218, + "step": 8108 + }, + { + "epoch": 0.40443890274314215, + "grad_norm": 0.4132697827150373, + "learning_rate": 3.376686779890099e-05, + "loss": 0.9017, + "step": 8109 + }, + { + "epoch": 0.40448877805486283, + "grad_norm": 0.5591904797835384, + "learning_rate": 3.3763085684935406e-05, + "loss": 0.898, + "step": 8110 + }, + { + "epoch": 0.40453865336658357, + "grad_norm": 0.4437852030616408, + "learning_rate": 3.3759303342301025e-05, + "loss": 0.8956, + "step": 8111 + }, + { + "epoch": 0.40458852867830425, + "grad_norm": 0.3929712318793074, + "learning_rate": 3.375552077109654e-05, + "loss": 0.8946, + "step": 8112 + }, + { + "epoch": 0.40463840399002493, + "grad_norm": 0.38895508670615664, + "learning_rate": 3.3751737971420656e-05, + "loss": 0.9213, + "step": 8113 + }, + { + "epoch": 0.4046882793017456, + "grad_norm": 0.5660573528857794, + "learning_rate": 3.3747954943372086e-05, + "loss": 0.9062, + "step": 8114 + }, + { + "epoch": 0.40473815461346635, + "grad_norm": 0.4418301257488022, + "learning_rate": 3.374417168704955e-05, + "loss": 0.892, + "step": 8115 + }, + { + "epoch": 0.40478802992518703, + "grad_norm": 0.41805227715224325, + "learning_rate": 3.374038820255177e-05, + "loss": 0.9041, + "step": 8116 + }, + { + "epoch": 0.4048379052369077, + "grad_norm": 0.43799035303703054, + "learning_rate": 3.3736604489977466e-05, + "loss": 0.9269, + "step": 8117 + }, + { + "epoch": 0.40488778054862845, + "grad_norm": 0.4126876983120558, + "learning_rate": 3.373282054942537e-05, + "loss": 0.8945, + "step": 8118 + }, + { + "epoch": 0.40493765586034913, + "grad_norm": 0.38164070116468957, + "learning_rate": 3.372903638099425e-05, + "loss": 0.8976, + "step": 8119 + }, + { + "epoch": 0.4049875311720698, + "grad_norm": 0.4750371516221315, + "learning_rate": 3.372525198478281e-05, + "loss": 0.9167, + "step": 8120 + }, + { + "epoch": 0.4050374064837905, + "grad_norm": 0.4163708606727219, + "learning_rate": 3.372146736088984e-05, + "loss": 0.9234, + "step": 8121 + }, + { + "epoch": 0.40508728179551123, + "grad_norm": 0.4696172395737719, + "learning_rate": 3.3717682509414086e-05, + "loss": 0.9261, + "step": 8122 + }, + { + "epoch": 0.4051371571072319, + "grad_norm": 0.44390046642305075, + "learning_rate": 3.371389743045429e-05, + "loss": 0.8724, + "step": 8123 + }, + { + "epoch": 0.4051870324189526, + "grad_norm": 0.3918987380705564, + "learning_rate": 3.3710112124109253e-05, + "loss": 0.8812, + "step": 8124 + }, + { + "epoch": 0.40523690773067333, + "grad_norm": 0.6637308869685064, + "learning_rate": 3.370632659047774e-05, + "loss": 0.9251, + "step": 8125 + }, + { + "epoch": 0.405286783042394, + "grad_norm": 0.4128006577574358, + "learning_rate": 3.370254082965853e-05, + "loss": 0.9162, + "step": 8126 + }, + { + "epoch": 0.4053366583541147, + "grad_norm": 0.46087371518510667, + "learning_rate": 3.36987548417504e-05, + "loss": 0.8877, + "step": 8127 + }, + { + "epoch": 0.40538653366583544, + "grad_norm": 0.43807539408383006, + "learning_rate": 3.369496862685217e-05, + "loss": 0.8953, + "step": 8128 + }, + { + "epoch": 0.4054364089775561, + "grad_norm": 0.49178160841584995, + "learning_rate": 3.369118218506262e-05, + "loss": 0.8845, + "step": 8129 + }, + { + "epoch": 0.4054862842892768, + "grad_norm": 0.5067113419903753, + "learning_rate": 3.368739551648056e-05, + "loss": 0.9072, + "step": 8130 + }, + { + "epoch": 0.4055361596009975, + "grad_norm": 0.4296847128786572, + "learning_rate": 3.36836086212048e-05, + "loss": 0.9163, + "step": 8131 + }, + { + "epoch": 0.4055860349127182, + "grad_norm": 0.4291113171266457, + "learning_rate": 3.367982149933416e-05, + "loss": 0.9112, + "step": 8132 + }, + { + "epoch": 0.4056359102244389, + "grad_norm": 0.5559771707885043, + "learning_rate": 3.367603415096746e-05, + "loss": 0.9344, + "step": 8133 + }, + { + "epoch": 0.4056857855361596, + "grad_norm": 0.4588483242297336, + "learning_rate": 3.367224657620354e-05, + "loss": 0.8815, + "step": 8134 + }, + { + "epoch": 0.4057356608478803, + "grad_norm": 0.5405521542885415, + "learning_rate": 3.3668458775141225e-05, + "loss": 0.8836, + "step": 8135 + }, + { + "epoch": 0.405785536159601, + "grad_norm": 0.7741717587625709, + "learning_rate": 3.366467074787936e-05, + "loss": 0.888, + "step": 8136 + }, + { + "epoch": 0.4058354114713217, + "grad_norm": 0.7555358080062644, + "learning_rate": 3.366088249451678e-05, + "loss": 0.8822, + "step": 8137 + }, + { + "epoch": 0.4058852867830424, + "grad_norm": 0.675615169841202, + "learning_rate": 3.365709401515235e-05, + "loss": 0.8678, + "step": 8138 + }, + { + "epoch": 0.4059351620947631, + "grad_norm": 0.4353777008069755, + "learning_rate": 3.365330530988492e-05, + "loss": 0.9356, + "step": 8139 + }, + { + "epoch": 0.4059850374064838, + "grad_norm": 0.3955812063623212, + "learning_rate": 3.3649516378813375e-05, + "loss": 0.8829, + "step": 8140 + }, + { + "epoch": 0.40603491271820447, + "grad_norm": 0.4482873521299646, + "learning_rate": 3.3645727222036564e-05, + "loss": 0.9067, + "step": 8141 + }, + { + "epoch": 0.4060847880299252, + "grad_norm": 0.5623541683294053, + "learning_rate": 3.3641937839653374e-05, + "loss": 0.9149, + "step": 8142 + }, + { + "epoch": 0.4061346633416459, + "grad_norm": 0.5046441609996182, + "learning_rate": 3.363814823176268e-05, + "loss": 0.9175, + "step": 8143 + }, + { + "epoch": 0.40618453865336657, + "grad_norm": 0.4142079718592696, + "learning_rate": 3.363435839846337e-05, + "loss": 0.8558, + "step": 8144 + }, + { + "epoch": 0.4062344139650873, + "grad_norm": 1.157515315271292, + "learning_rate": 3.3630568339854354e-05, + "loss": 0.9033, + "step": 8145 + }, + { + "epoch": 0.406284289276808, + "grad_norm": 0.7363302574865446, + "learning_rate": 3.362677805603451e-05, + "loss": 0.8992, + "step": 8146 + }, + { + "epoch": 0.40633416458852867, + "grad_norm": 0.4216323787848744, + "learning_rate": 3.362298754710276e-05, + "loss": 0.9288, + "step": 8147 + }, + { + "epoch": 0.40638403990024935, + "grad_norm": 0.5549154844678328, + "learning_rate": 3.3619196813158006e-05, + "loss": 0.9107, + "step": 8148 + }, + { + "epoch": 0.4064339152119701, + "grad_norm": 0.430906959703551, + "learning_rate": 3.3615405854299166e-05, + "loss": 0.883, + "step": 8149 + }, + { + "epoch": 0.40648379052369077, + "grad_norm": 0.47027400562622557, + "learning_rate": 3.361161467062517e-05, + "loss": 0.8875, + "step": 8150 + }, + { + "epoch": 0.40653366583541145, + "grad_norm": 1.5676622848510469, + "learning_rate": 3.360782326223493e-05, + "loss": 0.878, + "step": 8151 + }, + { + "epoch": 0.4065835411471322, + "grad_norm": 0.4734744537427727, + "learning_rate": 3.3604031629227414e-05, + "loss": 0.8943, + "step": 8152 + }, + { + "epoch": 0.40663341645885287, + "grad_norm": 0.4756604767399322, + "learning_rate": 3.360023977170153e-05, + "loss": 0.8934, + "step": 8153 + }, + { + "epoch": 0.40668329177057355, + "grad_norm": 0.48919447514496206, + "learning_rate": 3.359644768975625e-05, + "loss": 0.8986, + "step": 8154 + }, + { + "epoch": 0.4067331670822943, + "grad_norm": 0.45459158398419586, + "learning_rate": 3.35926553834905e-05, + "loss": 0.8811, + "step": 8155 + }, + { + "epoch": 0.40678304239401497, + "grad_norm": 0.507186946632113, + "learning_rate": 3.3588862853003265e-05, + "loss": 0.8666, + "step": 8156 + }, + { + "epoch": 0.40683291770573565, + "grad_norm": 0.5204917467293468, + "learning_rate": 3.35850700983935e-05, + "loss": 0.8792, + "step": 8157 + }, + { + "epoch": 0.40688279301745633, + "grad_norm": 0.5420326683424921, + "learning_rate": 3.358127711976016e-05, + "loss": 0.8903, + "step": 8158 + }, + { + "epoch": 0.40693266832917707, + "grad_norm": 0.5801640992041504, + "learning_rate": 3.357748391720224e-05, + "loss": 0.9072, + "step": 8159 + }, + { + "epoch": 0.40698254364089775, + "grad_norm": 0.4439192445106743, + "learning_rate": 3.3573690490818716e-05, + "loss": 0.9502, + "step": 8160 + }, + { + "epoch": 0.40703241895261844, + "grad_norm": 0.6666204975300396, + "learning_rate": 3.356989684070858e-05, + "loss": 0.9064, + "step": 8161 + }, + { + "epoch": 0.4070822942643392, + "grad_norm": 0.5193857223951782, + "learning_rate": 3.3566102966970816e-05, + "loss": 0.8892, + "step": 8162 + }, + { + "epoch": 0.40713216957605985, + "grad_norm": 0.5220513152215921, + "learning_rate": 3.3562308869704425e-05, + "loss": 0.9176, + "step": 8163 + }, + { + "epoch": 0.40718204488778054, + "grad_norm": 1.2499609786388448, + "learning_rate": 3.355851454900842e-05, + "loss": 0.8727, + "step": 8164 + }, + { + "epoch": 0.4072319201995013, + "grad_norm": 0.7500139330356655, + "learning_rate": 3.3554720004981815e-05, + "loss": 0.8951, + "step": 8165 + }, + { + "epoch": 0.40728179551122196, + "grad_norm": 0.5294371554606969, + "learning_rate": 3.355092523772361e-05, + "loss": 0.9115, + "step": 8166 + }, + { + "epoch": 0.40733167082294264, + "grad_norm": 0.4602252171255416, + "learning_rate": 3.3547130247332846e-05, + "loss": 0.9017, + "step": 8167 + }, + { + "epoch": 0.4073815461346633, + "grad_norm": 0.4689722406732571, + "learning_rate": 3.354333503390854e-05, + "loss": 0.894, + "step": 8168 + }, + { + "epoch": 0.40743142144638406, + "grad_norm": 0.6811644804802546, + "learning_rate": 3.353953959754973e-05, + "loss": 0.8929, + "step": 8169 + }, + { + "epoch": 0.40748129675810474, + "grad_norm": 0.7514233231905784, + "learning_rate": 3.3535743938355455e-05, + "loss": 0.9386, + "step": 8170 + }, + { + "epoch": 0.4075311720698254, + "grad_norm": 0.49677672587973054, + "learning_rate": 3.353194805642477e-05, + "loss": 0.9422, + "step": 8171 + }, + { + "epoch": 0.40758104738154616, + "grad_norm": 0.5451402214844967, + "learning_rate": 3.3528151951856705e-05, + "loss": 0.8714, + "step": 8172 + }, + { + "epoch": 0.40763092269326684, + "grad_norm": 0.5481744674378218, + "learning_rate": 3.352435562475034e-05, + "loss": 0.9134, + "step": 8173 + }, + { + "epoch": 0.4076807980049875, + "grad_norm": 0.4677431037051818, + "learning_rate": 3.352055907520473e-05, + "loss": 0.909, + "step": 8174 + }, + { + "epoch": 0.4077306733167082, + "grad_norm": 0.39842122112013484, + "learning_rate": 3.351676230331894e-05, + "loss": 0.8921, + "step": 8175 + }, + { + "epoch": 0.40778054862842894, + "grad_norm": 0.5159032455562245, + "learning_rate": 3.3512965309192044e-05, + "loss": 0.8742, + "step": 8176 + }, + { + "epoch": 0.4078304239401496, + "grad_norm": 1.0271519972667946, + "learning_rate": 3.3509168092923146e-05, + "loss": 0.9263, + "step": 8177 + }, + { + "epoch": 0.4078802992518703, + "grad_norm": 0.4201816717144256, + "learning_rate": 3.35053706546113e-05, + "loss": 0.8928, + "step": 8178 + }, + { + "epoch": 0.40793017456359104, + "grad_norm": 0.42440042206259243, + "learning_rate": 3.350157299435562e-05, + "loss": 0.9013, + "step": 8179 + }, + { + "epoch": 0.4079800498753117, + "grad_norm": 0.43215779710386854, + "learning_rate": 3.349777511225519e-05, + "loss": 0.881, + "step": 8180 + }, + { + "epoch": 0.4080299251870324, + "grad_norm": 0.4314392463021442, + "learning_rate": 3.3493977008409125e-05, + "loss": 0.9355, + "step": 8181 + }, + { + "epoch": 0.40807980049875314, + "grad_norm": 0.7174359259027324, + "learning_rate": 3.349017868291653e-05, + "loss": 0.9511, + "step": 8182 + }, + { + "epoch": 0.4081296758104738, + "grad_norm": 0.495131088672335, + "learning_rate": 3.348638013587653e-05, + "loss": 0.8766, + "step": 8183 + }, + { + "epoch": 0.4081795511221945, + "grad_norm": 0.5634441067925356, + "learning_rate": 3.3482581367388233e-05, + "loss": 0.9174, + "step": 8184 + }, + { + "epoch": 0.4082294264339152, + "grad_norm": 0.4581733457543488, + "learning_rate": 3.347878237755078e-05, + "loss": 0.9082, + "step": 8185 + }, + { + "epoch": 0.4082793017456359, + "grad_norm": 0.4460828228116429, + "learning_rate": 3.347498316646328e-05, + "loss": 0.8795, + "step": 8186 + }, + { + "epoch": 0.4083291770573566, + "grad_norm": 0.6426299135644813, + "learning_rate": 3.3471183734224895e-05, + "loss": 0.925, + "step": 8187 + }, + { + "epoch": 0.4083790523690773, + "grad_norm": 0.596421002615259, + "learning_rate": 3.346738408093476e-05, + "loss": 0.9299, + "step": 8188 + }, + { + "epoch": 0.408428927680798, + "grad_norm": 0.43812742395027415, + "learning_rate": 3.346358420669204e-05, + "loss": 0.81, + "step": 8189 + }, + { + "epoch": 0.4084788029925187, + "grad_norm": 0.5189668636452415, + "learning_rate": 3.3459784111595856e-05, + "loss": 0.9326, + "step": 8190 + }, + { + "epoch": 0.4085286783042394, + "grad_norm": 0.5260024426003393, + "learning_rate": 3.345598379574541e-05, + "loss": 0.8974, + "step": 8191 + }, + { + "epoch": 0.4085785536159601, + "grad_norm": 0.4059903695504116, + "learning_rate": 3.345218325923983e-05, + "loss": 0.8766, + "step": 8192 + }, + { + "epoch": 0.4086284289276808, + "grad_norm": 0.5375149069765506, + "learning_rate": 3.344838250217833e-05, + "loss": 0.9487, + "step": 8193 + }, + { + "epoch": 0.4086783042394015, + "grad_norm": 0.9074697973261872, + "learning_rate": 3.344458152466007e-05, + "loss": 0.9109, + "step": 8194 + }, + { + "epoch": 0.4087281795511222, + "grad_norm": 0.4962289883124921, + "learning_rate": 3.344078032678422e-05, + "loss": 0.8937, + "step": 8195 + }, + { + "epoch": 0.4087780548628429, + "grad_norm": 0.47058288543592114, + "learning_rate": 3.343697890865e-05, + "loss": 0.9256, + "step": 8196 + }, + { + "epoch": 0.4088279301745636, + "grad_norm": 0.5252182118317507, + "learning_rate": 3.343317727035657e-05, + "loss": 0.9137, + "step": 8197 + }, + { + "epoch": 0.4088778054862843, + "grad_norm": 0.4223891303974414, + "learning_rate": 3.3429375412003175e-05, + "loss": 0.8882, + "step": 8198 + }, + { + "epoch": 0.408927680798005, + "grad_norm": 0.45710609991880935, + "learning_rate": 3.342557333368899e-05, + "loss": 0.8438, + "step": 8199 + }, + { + "epoch": 0.4089775561097257, + "grad_norm": 0.8596348944482666, + "learning_rate": 3.342177103551325e-05, + "loss": 0.8624, + "step": 8200 + }, + { + "epoch": 0.4090274314214464, + "grad_norm": 0.44942659226152, + "learning_rate": 3.3417968517575156e-05, + "loss": 0.8915, + "step": 8201 + }, + { + "epoch": 0.40907730673316706, + "grad_norm": 0.4937034668527082, + "learning_rate": 3.3414165779973945e-05, + "loss": 0.8934, + "step": 8202 + }, + { + "epoch": 0.4091271820448878, + "grad_norm": 0.5160971859228447, + "learning_rate": 3.341036282280883e-05, + "loss": 0.8856, + "step": 8203 + }, + { + "epoch": 0.4091770573566085, + "grad_norm": 0.5292860862995598, + "learning_rate": 3.340655964617908e-05, + "loss": 0.8982, + "step": 8204 + }, + { + "epoch": 0.40922693266832916, + "grad_norm": 0.8349461651119228, + "learning_rate": 3.340275625018391e-05, + "loss": 0.964, + "step": 8205 + }, + { + "epoch": 0.4092768079800499, + "grad_norm": 0.416257213971288, + "learning_rate": 3.3398952634922574e-05, + "loss": 0.8886, + "step": 8206 + }, + { + "epoch": 0.4093266832917706, + "grad_norm": 0.4779011688410685, + "learning_rate": 3.3395148800494334e-05, + "loss": 0.972, + "step": 8207 + }, + { + "epoch": 0.40937655860349126, + "grad_norm": 0.4934194561693345, + "learning_rate": 3.339134474699844e-05, + "loss": 0.9347, + "step": 8208 + }, + { + "epoch": 0.409426433915212, + "grad_norm": 0.40470588430693766, + "learning_rate": 3.338754047453416e-05, + "loss": 0.8853, + "step": 8209 + }, + { + "epoch": 0.4094763092269327, + "grad_norm": 0.46909885404545576, + "learning_rate": 3.338373598320076e-05, + "loss": 0.8655, + "step": 8210 + }, + { + "epoch": 0.40952618453865336, + "grad_norm": 0.5042972144025775, + "learning_rate": 3.337993127309753e-05, + "loss": 0.949, + "step": 8211 + }, + { + "epoch": 0.40957605985037404, + "grad_norm": 0.41078452866450843, + "learning_rate": 3.3376126344323736e-05, + "loss": 0.8986, + "step": 8212 + }, + { + "epoch": 0.4096259351620948, + "grad_norm": 0.5135624211948959, + "learning_rate": 3.3372321196978674e-05, + "loss": 0.9182, + "step": 8213 + }, + { + "epoch": 0.40967581047381546, + "grad_norm": 0.4444237478173653, + "learning_rate": 3.336851583116164e-05, + "loss": 0.9192, + "step": 8214 + }, + { + "epoch": 0.40972568578553614, + "grad_norm": 0.48278309852810025, + "learning_rate": 3.3364710246971934e-05, + "loss": 0.868, + "step": 8215 + }, + { + "epoch": 0.4097755610972569, + "grad_norm": 0.508812615029072, + "learning_rate": 3.3360904444508856e-05, + "loss": 0.9081, + "step": 8216 + }, + { + "epoch": 0.40982543640897756, + "grad_norm": 0.4930640469938743, + "learning_rate": 3.335709842387172e-05, + "loss": 0.9194, + "step": 8217 + }, + { + "epoch": 0.40987531172069824, + "grad_norm": 0.5225049176060478, + "learning_rate": 3.335329218515984e-05, + "loss": 0.9066, + "step": 8218 + }, + { + "epoch": 0.4099251870324189, + "grad_norm": 0.4253520639285169, + "learning_rate": 3.3349485728472535e-05, + "loss": 0.917, + "step": 8219 + }, + { + "epoch": 0.40997506234413966, + "grad_norm": 1.38267467143808, + "learning_rate": 3.334567905390914e-05, + "loss": 0.8811, + "step": 8220 + }, + { + "epoch": 0.41002493765586034, + "grad_norm": 0.6680255808413451, + "learning_rate": 3.3341872161568984e-05, + "loss": 0.8954, + "step": 8221 + }, + { + "epoch": 0.410074812967581, + "grad_norm": 0.4300586281785461, + "learning_rate": 3.333806505155141e-05, + "loss": 0.8873, + "step": 8222 + }, + { + "epoch": 0.41012468827930176, + "grad_norm": 0.4416476556950333, + "learning_rate": 3.333425772395575e-05, + "loss": 0.8872, + "step": 8223 + }, + { + "epoch": 0.41017456359102245, + "grad_norm": 0.44903933096552695, + "learning_rate": 3.333045017888138e-05, + "loss": 0.9036, + "step": 8224 + }, + { + "epoch": 0.4102244389027431, + "grad_norm": 0.5653884079456021, + "learning_rate": 3.332664241642763e-05, + "loss": 0.9471, + "step": 8225 + }, + { + "epoch": 0.41027431421446386, + "grad_norm": 0.4420211170932008, + "learning_rate": 3.332283443669388e-05, + "loss": 0.8855, + "step": 8226 + }, + { + "epoch": 0.41032418952618455, + "grad_norm": 0.6579166872725168, + "learning_rate": 3.3319026239779484e-05, + "loss": 0.8813, + "step": 8227 + }, + { + "epoch": 0.41037406483790523, + "grad_norm": 0.6065785068446127, + "learning_rate": 3.331521782578382e-05, + "loss": 0.8661, + "step": 8228 + }, + { + "epoch": 0.4104239401496259, + "grad_norm": 0.47978761366998673, + "learning_rate": 3.3311409194806274e-05, + "loss": 0.9221, + "step": 8229 + }, + { + "epoch": 0.41047381546134665, + "grad_norm": 0.4217268005433133, + "learning_rate": 3.330760034694622e-05, + "loss": 0.8919, + "step": 8230 + }, + { + "epoch": 0.41052369077306733, + "grad_norm": 0.5497478628480249, + "learning_rate": 3.330379128230306e-05, + "loss": 0.8365, + "step": 8231 + }, + { + "epoch": 0.410573566084788, + "grad_norm": 0.40678050948548444, + "learning_rate": 3.3299982000976175e-05, + "loss": 0.9059, + "step": 8232 + }, + { + "epoch": 0.41062344139650875, + "grad_norm": 0.5198202491191781, + "learning_rate": 3.329617250306498e-05, + "loss": 0.8798, + "step": 8233 + }, + { + "epoch": 0.41067331670822943, + "grad_norm": 0.9516070695692752, + "learning_rate": 3.3292362788668877e-05, + "loss": 0.9024, + "step": 8234 + }, + { + "epoch": 0.4107231920199501, + "grad_norm": 0.5979451391456327, + "learning_rate": 3.3288552857887275e-05, + "loss": 0.886, + "step": 8235 + }, + { + "epoch": 0.41077306733167085, + "grad_norm": 0.7225139040817331, + "learning_rate": 3.328474271081959e-05, + "loss": 0.8863, + "step": 8236 + }, + { + "epoch": 0.41082294264339153, + "grad_norm": 0.4181884064543115, + "learning_rate": 3.3280932347565255e-05, + "loss": 0.9344, + "step": 8237 + }, + { + "epoch": 0.4108728179551122, + "grad_norm": 0.4491452092408224, + "learning_rate": 3.32771217682237e-05, + "loss": 0.9064, + "step": 8238 + }, + { + "epoch": 0.4109226932668329, + "grad_norm": 0.3852806475957013, + "learning_rate": 3.327331097289436e-05, + "loss": 0.9125, + "step": 8239 + }, + { + "epoch": 0.41097256857855363, + "grad_norm": 0.4961145693953434, + "learning_rate": 3.326949996167667e-05, + "loss": 0.9022, + "step": 8240 + }, + { + "epoch": 0.4110224438902743, + "grad_norm": 0.4857758724151839, + "learning_rate": 3.326568873467007e-05, + "loss": 0.907, + "step": 8241 + }, + { + "epoch": 0.411072319201995, + "grad_norm": 0.4008543369588129, + "learning_rate": 3.326187729197404e-05, + "loss": 0.8777, + "step": 8242 + }, + { + "epoch": 0.41112219451371573, + "grad_norm": 0.42960006155296887, + "learning_rate": 3.3258065633688004e-05, + "loss": 0.8981, + "step": 8243 + }, + { + "epoch": 0.4111720698254364, + "grad_norm": 0.43954811535348187, + "learning_rate": 3.3254253759911444e-05, + "loss": 0.8582, + "step": 8244 + }, + { + "epoch": 0.4112219451371571, + "grad_norm": 0.46449986466753296, + "learning_rate": 3.325044167074383e-05, + "loss": 0.8568, + "step": 8245 + }, + { + "epoch": 0.4112718204488778, + "grad_norm": 0.4486028001334387, + "learning_rate": 3.324662936628463e-05, + "loss": 0.9133, + "step": 8246 + }, + { + "epoch": 0.4113216957605985, + "grad_norm": 0.49250954446283224, + "learning_rate": 3.324281684663333e-05, + "loss": 0.9611, + "step": 8247 + }, + { + "epoch": 0.4113715710723192, + "grad_norm": 0.48680564756917427, + "learning_rate": 3.3239004111889416e-05, + "loss": 0.8834, + "step": 8248 + }, + { + "epoch": 0.4114214463840399, + "grad_norm": 0.4138717733065339, + "learning_rate": 3.323519116215237e-05, + "loss": 0.9153, + "step": 8249 + }, + { + "epoch": 0.4114713216957606, + "grad_norm": 0.4178320910963361, + "learning_rate": 3.32313779975217e-05, + "loss": 0.9385, + "step": 8250 + }, + { + "epoch": 0.4115211970074813, + "grad_norm": 0.48154233313258976, + "learning_rate": 3.3227564618096904e-05, + "loss": 0.8599, + "step": 8251 + }, + { + "epoch": 0.411571072319202, + "grad_norm": 0.40332571925052046, + "learning_rate": 3.322375102397749e-05, + "loss": 0.8821, + "step": 8252 + }, + { + "epoch": 0.4116209476309227, + "grad_norm": 0.40477040920300017, + "learning_rate": 3.321993721526297e-05, + "loss": 0.8773, + "step": 8253 + }, + { + "epoch": 0.4116708229426434, + "grad_norm": 0.5404493364751537, + "learning_rate": 3.321612319205288e-05, + "loss": 0.8443, + "step": 8254 + }, + { + "epoch": 0.4117206982543641, + "grad_norm": 0.44360787917925215, + "learning_rate": 3.321230895444672e-05, + "loss": 0.9022, + "step": 8255 + }, + { + "epoch": 0.41177057356608476, + "grad_norm": 0.580748968909822, + "learning_rate": 3.320849450254404e-05, + "loss": 0.9049, + "step": 8256 + }, + { + "epoch": 0.4118204488778055, + "grad_norm": 0.44519557778387026, + "learning_rate": 3.320467983644437e-05, + "loss": 0.9143, + "step": 8257 + }, + { + "epoch": 0.4118703241895262, + "grad_norm": 1.2758473355158886, + "learning_rate": 3.3200864956247245e-05, + "loss": 0.8869, + "step": 8258 + }, + { + "epoch": 0.41192019950124686, + "grad_norm": 0.522236797294267, + "learning_rate": 3.319704986205223e-05, + "loss": 0.9393, + "step": 8259 + }, + { + "epoch": 0.4119700748129676, + "grad_norm": 0.4524701649236483, + "learning_rate": 3.3193234553958855e-05, + "loss": 0.8817, + "step": 8260 + }, + { + "epoch": 0.4120199501246883, + "grad_norm": 0.6096965402604346, + "learning_rate": 3.3189419032066705e-05, + "loss": 0.8849, + "step": 8261 + }, + { + "epoch": 0.41206982543640897, + "grad_norm": 0.5113707489059169, + "learning_rate": 3.318560329647531e-05, + "loss": 0.9018, + "step": 8262 + }, + { + "epoch": 0.4121197007481297, + "grad_norm": 0.5848424434691527, + "learning_rate": 3.318178734728428e-05, + "loss": 0.8809, + "step": 8263 + }, + { + "epoch": 0.4121695760598504, + "grad_norm": 0.4738224484561046, + "learning_rate": 3.317797118459317e-05, + "loss": 0.8795, + "step": 8264 + }, + { + "epoch": 0.41221945137157107, + "grad_norm": 0.4153434649205176, + "learning_rate": 3.317415480850155e-05, + "loss": 0.8643, + "step": 8265 + }, + { + "epoch": 0.41226932668329175, + "grad_norm": 0.47987169407360963, + "learning_rate": 3.3170338219109026e-05, + "loss": 0.9404, + "step": 8266 + }, + { + "epoch": 0.4123192019950125, + "grad_norm": 0.6160998205618446, + "learning_rate": 3.3166521416515176e-05, + "loss": 0.9129, + "step": 8267 + }, + { + "epoch": 0.41236907730673317, + "grad_norm": 0.5111235417199869, + "learning_rate": 3.3162704400819614e-05, + "loss": 0.9059, + "step": 8268 + }, + { + "epoch": 0.41241895261845385, + "grad_norm": 0.44799529060513466, + "learning_rate": 3.315888717212193e-05, + "loss": 0.9143, + "step": 8269 + }, + { + "epoch": 0.4124688279301746, + "grad_norm": 0.40011012380695316, + "learning_rate": 3.3155069730521735e-05, + "loss": 0.8673, + "step": 8270 + }, + { + "epoch": 0.41251870324189527, + "grad_norm": 0.39914427342363645, + "learning_rate": 3.315125207611865e-05, + "loss": 0.8756, + "step": 8271 + }, + { + "epoch": 0.41256857855361595, + "grad_norm": 0.6299506780726989, + "learning_rate": 3.3147434209012285e-05, + "loss": 0.9542, + "step": 8272 + }, + { + "epoch": 0.41261845386533663, + "grad_norm": 0.42743572398126173, + "learning_rate": 3.3143616129302266e-05, + "loss": 0.8791, + "step": 8273 + }, + { + "epoch": 0.41266832917705737, + "grad_norm": 0.4584763554181492, + "learning_rate": 3.313979783708824e-05, + "loss": 0.9225, + "step": 8274 + }, + { + "epoch": 0.41271820448877805, + "grad_norm": 0.5303658830297654, + "learning_rate": 3.313597933246982e-05, + "loss": 0.9463, + "step": 8275 + }, + { + "epoch": 0.41276807980049873, + "grad_norm": 0.4306230436482396, + "learning_rate": 3.3132160615546674e-05, + "loss": 0.885, + "step": 8276 + }, + { + "epoch": 0.41281795511221947, + "grad_norm": 0.74732414046025, + "learning_rate": 3.312834168641843e-05, + "loss": 0.88, + "step": 8277 + }, + { + "epoch": 0.41286783042394015, + "grad_norm": 0.44742340165298994, + "learning_rate": 3.3124522545184745e-05, + "loss": 0.9218, + "step": 8278 + }, + { + "epoch": 0.41291770573566083, + "grad_norm": 0.43319322687215256, + "learning_rate": 3.312070319194529e-05, + "loss": 0.9263, + "step": 8279 + }, + { + "epoch": 0.41296758104738157, + "grad_norm": 0.44767894692520205, + "learning_rate": 3.311688362679971e-05, + "loss": 0.9241, + "step": 8280 + }, + { + "epoch": 0.41301745635910225, + "grad_norm": 0.4324032511157433, + "learning_rate": 3.3113063849847684e-05, + "loss": 0.8974, + "step": 8281 + }, + { + "epoch": 0.41306733167082293, + "grad_norm": 0.4164972687510526, + "learning_rate": 3.310924386118889e-05, + "loss": 0.882, + "step": 8282 + }, + { + "epoch": 0.4131172069825436, + "grad_norm": 0.43318706471490914, + "learning_rate": 3.310542366092301e-05, + "loss": 0.9128, + "step": 8283 + }, + { + "epoch": 0.41316708229426435, + "grad_norm": 0.39071423047429693, + "learning_rate": 3.310160324914972e-05, + "loss": 0.8871, + "step": 8284 + }, + { + "epoch": 0.41321695760598504, + "grad_norm": 0.44768831434659584, + "learning_rate": 3.309778262596872e-05, + "loss": 0.9643, + "step": 8285 + }, + { + "epoch": 0.4132668329177057, + "grad_norm": 0.4631038446832746, + "learning_rate": 3.3093961791479714e-05, + "loss": 0.9286, + "step": 8286 + }, + { + "epoch": 0.41331670822942645, + "grad_norm": 0.4257234594590319, + "learning_rate": 3.3090140745782396e-05, + "loss": 0.9007, + "step": 8287 + }, + { + "epoch": 0.41336658354114714, + "grad_norm": 0.3874162318665473, + "learning_rate": 3.308631948897646e-05, + "loss": 0.8823, + "step": 8288 + }, + { + "epoch": 0.4134164588528678, + "grad_norm": 0.4347768333314009, + "learning_rate": 3.3082498021161656e-05, + "loss": 0.8953, + "step": 8289 + }, + { + "epoch": 0.41346633416458856, + "grad_norm": 0.5068143336395918, + "learning_rate": 3.3078676342437673e-05, + "loss": 0.8754, + "step": 8290 + }, + { + "epoch": 0.41351620947630924, + "grad_norm": 0.47770553135683863, + "learning_rate": 3.3074854452904246e-05, + "loss": 0.9205, + "step": 8291 + }, + { + "epoch": 0.4135660847880299, + "grad_norm": 0.45016164346900417, + "learning_rate": 3.3071032352661114e-05, + "loss": 0.9126, + "step": 8292 + }, + { + "epoch": 0.4136159600997506, + "grad_norm": 1.0116849258185712, + "learning_rate": 3.3067210041807996e-05, + "loss": 0.9203, + "step": 8293 + }, + { + "epoch": 0.41366583541147134, + "grad_norm": 0.6110501393826439, + "learning_rate": 3.306338752044465e-05, + "loss": 0.8981, + "step": 8294 + }, + { + "epoch": 0.413715710723192, + "grad_norm": 0.6060735070393479, + "learning_rate": 3.3059564788670804e-05, + "loss": 0.9058, + "step": 8295 + }, + { + "epoch": 0.4137655860349127, + "grad_norm": 0.49161034135526965, + "learning_rate": 3.305574184658624e-05, + "loss": 0.9043, + "step": 8296 + }, + { + "epoch": 0.41381546134663344, + "grad_norm": 0.4381991046999959, + "learning_rate": 3.3051918694290684e-05, + "loss": 0.9558, + "step": 8297 + }, + { + "epoch": 0.4138653366583541, + "grad_norm": 0.5378308728840552, + "learning_rate": 3.304809533188391e-05, + "loss": 0.8828, + "step": 8298 + }, + { + "epoch": 0.4139152119700748, + "grad_norm": 0.4884313360223316, + "learning_rate": 3.30442717594657e-05, + "loss": 0.9292, + "step": 8299 + }, + { + "epoch": 0.4139650872817955, + "grad_norm": 0.4805453338270729, + "learning_rate": 3.304044797713582e-05, + "loss": 0.937, + "step": 8300 + }, + { + "epoch": 0.4140149625935162, + "grad_norm": 0.5523968887826788, + "learning_rate": 3.303662398499404e-05, + "loss": 0.8918, + "step": 8301 + }, + { + "epoch": 0.4140648379052369, + "grad_norm": 0.3644724591336974, + "learning_rate": 3.303279978314016e-05, + "loss": 0.9283, + "step": 8302 + }, + { + "epoch": 0.4141147132169576, + "grad_norm": 0.5651963640267039, + "learning_rate": 3.302897537167397e-05, + "loss": 0.9536, + "step": 8303 + }, + { + "epoch": 0.4141645885286783, + "grad_norm": 0.49585568972937466, + "learning_rate": 3.302515075069524e-05, + "loss": 0.9334, + "step": 8304 + }, + { + "epoch": 0.414214463840399, + "grad_norm": 0.5577851528770923, + "learning_rate": 3.3021325920303815e-05, + "loss": 0.8982, + "step": 8305 + }, + { + "epoch": 0.4142643391521197, + "grad_norm": 0.5221598889287613, + "learning_rate": 3.301750088059947e-05, + "loss": 0.9265, + "step": 8306 + }, + { + "epoch": 0.4143142144638404, + "grad_norm": 0.5164512974354647, + "learning_rate": 3.301367563168203e-05, + "loss": 0.9118, + "step": 8307 + }, + { + "epoch": 0.4143640897755611, + "grad_norm": 0.7110633225596485, + "learning_rate": 3.300985017365131e-05, + "loss": 0.9273, + "step": 8308 + }, + { + "epoch": 0.4144139650872818, + "grad_norm": 0.5470526667955274, + "learning_rate": 3.300602450660713e-05, + "loss": 0.9282, + "step": 8309 + }, + { + "epoch": 0.41446384039900247, + "grad_norm": 0.8708811781489303, + "learning_rate": 3.300219863064933e-05, + "loss": 0.8683, + "step": 8310 + }, + { + "epoch": 0.4145137157107232, + "grad_norm": 0.5185738412796819, + "learning_rate": 3.299837254587774e-05, + "loss": 0.9174, + "step": 8311 + }, + { + "epoch": 0.4145635910224439, + "grad_norm": 0.6598064203644989, + "learning_rate": 3.2994546252392186e-05, + "loss": 0.8969, + "step": 8312 + }, + { + "epoch": 0.41461346633416457, + "grad_norm": 0.42626574886290386, + "learning_rate": 3.2990719750292536e-05, + "loss": 0.8972, + "step": 8313 + }, + { + "epoch": 0.4146633416458853, + "grad_norm": 0.43619138742860664, + "learning_rate": 3.298689303967863e-05, + "loss": 0.9252, + "step": 8314 + }, + { + "epoch": 0.414713216957606, + "grad_norm": 0.5412273397780053, + "learning_rate": 3.2983066120650316e-05, + "loss": 0.9209, + "step": 8315 + }, + { + "epoch": 0.41476309226932667, + "grad_norm": 0.6456614917101894, + "learning_rate": 3.2979238993307475e-05, + "loss": 0.8899, + "step": 8316 + }, + { + "epoch": 0.4148129675810474, + "grad_norm": 0.4791815704438539, + "learning_rate": 3.297541165774996e-05, + "loss": 0.9538, + "step": 8317 + }, + { + "epoch": 0.4148628428927681, + "grad_norm": 0.519721445384032, + "learning_rate": 3.297158411407765e-05, + "loss": 0.922, + "step": 8318 + }, + { + "epoch": 0.4149127182044888, + "grad_norm": 0.38975331024252824, + "learning_rate": 3.2967756362390415e-05, + "loss": 0.9194, + "step": 8319 + }, + { + "epoch": 0.41496259351620945, + "grad_norm": 0.4431617330152998, + "learning_rate": 3.296392840278814e-05, + "loss": 0.9283, + "step": 8320 + }, + { + "epoch": 0.4150124688279302, + "grad_norm": 0.5402223548850934, + "learning_rate": 3.296010023537072e-05, + "loss": 0.9065, + "step": 8321 + }, + { + "epoch": 0.4150623441396509, + "grad_norm": 0.52524747371948, + "learning_rate": 3.295627186023805e-05, + "loss": 0.9518, + "step": 8322 + }, + { + "epoch": 0.41511221945137156, + "grad_norm": 0.9660541079730969, + "learning_rate": 3.2952443277490024e-05, + "loss": 0.9163, + "step": 8323 + }, + { + "epoch": 0.4151620947630923, + "grad_norm": 0.4432558091761934, + "learning_rate": 3.294861448722656e-05, + "loss": 0.8944, + "step": 8324 + }, + { + "epoch": 0.415211970074813, + "grad_norm": 0.5541288737514396, + "learning_rate": 3.294478548954754e-05, + "loss": 0.9519, + "step": 8325 + }, + { + "epoch": 0.41526184538653366, + "grad_norm": 0.4658019363810868, + "learning_rate": 3.2940956284552914e-05, + "loss": 0.8757, + "step": 8326 + }, + { + "epoch": 0.41531172069825434, + "grad_norm": 0.4467716902654359, + "learning_rate": 3.2937126872342574e-05, + "loss": 0.8986, + "step": 8327 + }, + { + "epoch": 0.4153615960099751, + "grad_norm": 0.5157987857956847, + "learning_rate": 3.293329725301647e-05, + "loss": 0.8951, + "step": 8328 + }, + { + "epoch": 0.41541147132169576, + "grad_norm": 0.55023978247326, + "learning_rate": 3.2929467426674524e-05, + "loss": 0.8948, + "step": 8329 + }, + { + "epoch": 0.41546134663341644, + "grad_norm": 0.86573366180729, + "learning_rate": 3.2925637393416666e-05, + "loss": 0.9085, + "step": 8330 + }, + { + "epoch": 0.4155112219451372, + "grad_norm": 0.6347441785054428, + "learning_rate": 3.292180715334287e-05, + "loss": 0.8941, + "step": 8331 + }, + { + "epoch": 0.41556109725685786, + "grad_norm": 0.5510296751943922, + "learning_rate": 3.2917976706553034e-05, + "loss": 0.8749, + "step": 8332 + }, + { + "epoch": 0.41561097256857854, + "grad_norm": 0.5374934501833311, + "learning_rate": 3.291414605314715e-05, + "loss": 0.9089, + "step": 8333 + }, + { + "epoch": 0.4156608478802993, + "grad_norm": 0.44373239256739333, + "learning_rate": 3.291031519322517e-05, + "loss": 0.8436, + "step": 8334 + }, + { + "epoch": 0.41571072319201996, + "grad_norm": 0.44006260392546104, + "learning_rate": 3.290648412688706e-05, + "loss": 0.8529, + "step": 8335 + }, + { + "epoch": 0.41576059850374064, + "grad_norm": 0.5252051576163022, + "learning_rate": 3.2902652854232776e-05, + "loss": 0.8818, + "step": 8336 + }, + { + "epoch": 0.4158104738154613, + "grad_norm": 0.4146398870289952, + "learning_rate": 3.289882137536231e-05, + "loss": 0.9004, + "step": 8337 + }, + { + "epoch": 0.41586034912718206, + "grad_norm": 0.8985770610200164, + "learning_rate": 3.2894989690375626e-05, + "loss": 0.8997, + "step": 8338 + }, + { + "epoch": 0.41591022443890274, + "grad_norm": 0.7423977881552493, + "learning_rate": 3.289115779937272e-05, + "loss": 0.9413, + "step": 8339 + }, + { + "epoch": 0.4159600997506234, + "grad_norm": 0.515977865252577, + "learning_rate": 3.288732570245359e-05, + "loss": 0.8688, + "step": 8340 + }, + { + "epoch": 0.41600997506234416, + "grad_norm": 1.0297937290777122, + "learning_rate": 3.288349339971822e-05, + "loss": 0.9385, + "step": 8341 + }, + { + "epoch": 0.41605985037406484, + "grad_norm": 0.4604317345960697, + "learning_rate": 3.287966089126662e-05, + "loss": 0.8832, + "step": 8342 + }, + { + "epoch": 0.4161097256857855, + "grad_norm": 0.8386704214080619, + "learning_rate": 3.2875828177198796e-05, + "loss": 0.9547, + "step": 8343 + }, + { + "epoch": 0.4161596009975062, + "grad_norm": 0.49999178225844687, + "learning_rate": 3.2871995257614764e-05, + "loss": 0.9464, + "step": 8344 + }, + { + "epoch": 0.41620947630922694, + "grad_norm": 0.6397535748685504, + "learning_rate": 3.2868162132614535e-05, + "loss": 0.8891, + "step": 8345 + }, + { + "epoch": 0.4162593516209476, + "grad_norm": 0.5824835392612047, + "learning_rate": 3.2864328802298135e-05, + "loss": 0.8564, + "step": 8346 + }, + { + "epoch": 0.4163092269326683, + "grad_norm": 0.6264549365296951, + "learning_rate": 3.28604952667656e-05, + "loss": 0.9199, + "step": 8347 + }, + { + "epoch": 0.41635910224438905, + "grad_norm": 0.6756445882669903, + "learning_rate": 3.285666152611695e-05, + "loss": 0.9115, + "step": 8348 + }, + { + "epoch": 0.4164089775561097, + "grad_norm": 0.5292085634958964, + "learning_rate": 3.2852827580452236e-05, + "loss": 0.9281, + "step": 8349 + }, + { + "epoch": 0.4164588528678304, + "grad_norm": 0.9913370875592378, + "learning_rate": 3.28489934298715e-05, + "loss": 0.8987, + "step": 8350 + }, + { + "epoch": 0.41650872817955115, + "grad_norm": 0.6527434935355164, + "learning_rate": 3.2845159074474806e-05, + "loss": 0.8662, + "step": 8351 + }, + { + "epoch": 0.41655860349127183, + "grad_norm": 0.4783287912949683, + "learning_rate": 3.284132451436218e-05, + "loss": 0.8707, + "step": 8352 + }, + { + "epoch": 0.4166084788029925, + "grad_norm": 0.49634725727837237, + "learning_rate": 3.2837489749633715e-05, + "loss": 0.957, + "step": 8353 + }, + { + "epoch": 0.4166583541147132, + "grad_norm": 0.5644615515800423, + "learning_rate": 3.283365478038945e-05, + "loss": 0.9018, + "step": 8354 + }, + { + "epoch": 0.41670822942643393, + "grad_norm": 0.4760523392326328, + "learning_rate": 3.282981960672948e-05, + "loss": 0.867, + "step": 8355 + }, + { + "epoch": 0.4167581047381546, + "grad_norm": 0.5035652133597348, + "learning_rate": 3.282598422875386e-05, + "loss": 0.8791, + "step": 8356 + }, + { + "epoch": 0.4168079800498753, + "grad_norm": 0.5525985848669678, + "learning_rate": 3.2822148646562695e-05, + "loss": 0.9317, + "step": 8357 + }, + { + "epoch": 0.41685785536159603, + "grad_norm": 0.9568771006606757, + "learning_rate": 3.281831286025605e-05, + "loss": 0.8843, + "step": 8358 + }, + { + "epoch": 0.4169077306733167, + "grad_norm": 0.5228272867052217, + "learning_rate": 3.281447686993404e-05, + "loss": 0.9285, + "step": 8359 + }, + { + "epoch": 0.4169576059850374, + "grad_norm": 0.3950885450834702, + "learning_rate": 3.281064067569676e-05, + "loss": 0.8726, + "step": 8360 + }, + { + "epoch": 0.41700748129675813, + "grad_norm": 0.7642841148421567, + "learning_rate": 3.2806804277644285e-05, + "loss": 0.8551, + "step": 8361 + }, + { + "epoch": 0.4170573566084788, + "grad_norm": 0.6885991784210881, + "learning_rate": 3.280296767587676e-05, + "loss": 0.9815, + "step": 8362 + }, + { + "epoch": 0.4171072319201995, + "grad_norm": 0.5050806171970602, + "learning_rate": 3.279913087049429e-05, + "loss": 0.8927, + "step": 8363 + }, + { + "epoch": 0.4171571072319202, + "grad_norm": 0.689444807151678, + "learning_rate": 3.2795293861596976e-05, + "loss": 0.8776, + "step": 8364 + }, + { + "epoch": 0.4172069825436409, + "grad_norm": 0.7069181399883849, + "learning_rate": 3.2791456649284966e-05, + "loss": 0.8718, + "step": 8365 + }, + { + "epoch": 0.4172568578553616, + "grad_norm": 0.6811211881627074, + "learning_rate": 3.2787619233658385e-05, + "loss": 0.8845, + "step": 8366 + }, + { + "epoch": 0.4173067331670823, + "grad_norm": 0.4525155566577988, + "learning_rate": 3.278378161481736e-05, + "loss": 0.856, + "step": 8367 + }, + { + "epoch": 0.417356608478803, + "grad_norm": 0.6909664375127732, + "learning_rate": 3.277994379286204e-05, + "loss": 0.9095, + "step": 8368 + }, + { + "epoch": 0.4174064837905237, + "grad_norm": 0.6123918225951327, + "learning_rate": 3.2776105767892565e-05, + "loss": 0.9247, + "step": 8369 + }, + { + "epoch": 0.4174563591022444, + "grad_norm": 0.555197490236487, + "learning_rate": 3.277226754000909e-05, + "loss": 0.8814, + "step": 8370 + }, + { + "epoch": 0.41750623441396506, + "grad_norm": 0.7538629237482084, + "learning_rate": 3.276842910931178e-05, + "loss": 0.8829, + "step": 8371 + }, + { + "epoch": 0.4175561097256858, + "grad_norm": 0.5386137908340537, + "learning_rate": 3.2764590475900786e-05, + "loss": 0.9184, + "step": 8372 + }, + { + "epoch": 0.4176059850374065, + "grad_norm": 0.48867842830959163, + "learning_rate": 3.276075163987628e-05, + "loss": 0.899, + "step": 8373 + }, + { + "epoch": 0.41765586034912716, + "grad_norm": 0.5440937110233236, + "learning_rate": 3.275691260133842e-05, + "loss": 0.8772, + "step": 8374 + }, + { + "epoch": 0.4177057356608479, + "grad_norm": 0.4943296402764294, + "learning_rate": 3.2753073360387416e-05, + "loss": 0.8995, + "step": 8375 + }, + { + "epoch": 0.4177556109725686, + "grad_norm": 0.506280925583741, + "learning_rate": 3.2749233917123424e-05, + "loss": 0.8882, + "step": 8376 + }, + { + "epoch": 0.41780548628428926, + "grad_norm": 0.5073743865459502, + "learning_rate": 3.274539427164664e-05, + "loss": 0.8813, + "step": 8377 + }, + { + "epoch": 0.41785536159601, + "grad_norm": 0.5433370774314003, + "learning_rate": 3.274155442405726e-05, + "loss": 0.8812, + "step": 8378 + }, + { + "epoch": 0.4179052369077307, + "grad_norm": 0.500470421532542, + "learning_rate": 3.273771437445548e-05, + "loss": 0.892, + "step": 8379 + }, + { + "epoch": 0.41795511221945136, + "grad_norm": 0.5289141324490219, + "learning_rate": 3.2733874122941503e-05, + "loss": 0.9223, + "step": 8380 + }, + { + "epoch": 0.41800498753117205, + "grad_norm": 0.38513956901730195, + "learning_rate": 3.273003366961555e-05, + "loss": 0.8229, + "step": 8381 + }, + { + "epoch": 0.4180548628428928, + "grad_norm": 0.5108793092025754, + "learning_rate": 3.272619301457782e-05, + "loss": 0.8875, + "step": 8382 + }, + { + "epoch": 0.41810473815461346, + "grad_norm": 1.341353495997012, + "learning_rate": 3.272235215792855e-05, + "loss": 0.9031, + "step": 8383 + }, + { + "epoch": 0.41815461346633415, + "grad_norm": 0.5439927186279139, + "learning_rate": 3.271851109976796e-05, + "loss": 0.9267, + "step": 8384 + }, + { + "epoch": 0.4182044887780549, + "grad_norm": 0.5307560814391714, + "learning_rate": 3.271466984019627e-05, + "loss": 0.8626, + "step": 8385 + }, + { + "epoch": 0.41825436408977557, + "grad_norm": 0.43664683990822956, + "learning_rate": 3.2710828379313716e-05, + "loss": 0.9549, + "step": 8386 + }, + { + "epoch": 0.41830423940149625, + "grad_norm": 0.45302840485798124, + "learning_rate": 3.270698671722055e-05, + "loss": 0.9123, + "step": 8387 + }, + { + "epoch": 0.418354114713217, + "grad_norm": 0.5431915773154893, + "learning_rate": 3.270314485401702e-05, + "loss": 0.8657, + "step": 8388 + }, + { + "epoch": 0.41840399002493767, + "grad_norm": 0.866792163972022, + "learning_rate": 3.269930278980337e-05, + "loss": 0.8704, + "step": 8389 + }, + { + "epoch": 0.41845386533665835, + "grad_norm": 0.5980800800961972, + "learning_rate": 3.2695460524679864e-05, + "loss": 0.9394, + "step": 8390 + }, + { + "epoch": 0.41850374064837903, + "grad_norm": 0.4586477919210777, + "learning_rate": 3.2691618058746756e-05, + "loss": 0.8991, + "step": 8391 + }, + { + "epoch": 0.41855361596009977, + "grad_norm": 0.775118728574814, + "learning_rate": 3.268777539210431e-05, + "loss": 0.8707, + "step": 8392 + }, + { + "epoch": 0.41860349127182045, + "grad_norm": 0.5349263343466947, + "learning_rate": 3.2683932524852814e-05, + "loss": 0.8801, + "step": 8393 + }, + { + "epoch": 0.41865336658354113, + "grad_norm": 0.546178071634342, + "learning_rate": 3.268008945709254e-05, + "loss": 0.9243, + "step": 8394 + }, + { + "epoch": 0.41870324189526187, + "grad_norm": 0.43901147928547524, + "learning_rate": 3.267624618892376e-05, + "loss": 0.8934, + "step": 8395 + }, + { + "epoch": 0.41875311720698255, + "grad_norm": 0.45147014608400576, + "learning_rate": 3.267240272044678e-05, + "loss": 0.8726, + "step": 8396 + }, + { + "epoch": 0.41880299251870323, + "grad_norm": 0.4429610036781778, + "learning_rate": 3.266855905176188e-05, + "loss": 0.8648, + "step": 8397 + }, + { + "epoch": 0.4188528678304239, + "grad_norm": 1.4700153824062396, + "learning_rate": 3.266471518296936e-05, + "loss": 0.8966, + "step": 8398 + }, + { + "epoch": 0.41890274314214465, + "grad_norm": 0.634505731014548, + "learning_rate": 3.2660871114169536e-05, + "loss": 0.8997, + "step": 8399 + }, + { + "epoch": 0.41895261845386533, + "grad_norm": 0.6147423348347909, + "learning_rate": 3.2657026845462704e-05, + "loss": 0.9003, + "step": 8400 + }, + { + "epoch": 0.419002493765586, + "grad_norm": 0.41777632526626013, + "learning_rate": 3.265318237694918e-05, + "loss": 0.8808, + "step": 8401 + }, + { + "epoch": 0.41905236907730675, + "grad_norm": 0.6132589706544288, + "learning_rate": 3.264933770872929e-05, + "loss": 0.8848, + "step": 8402 + }, + { + "epoch": 0.41910224438902743, + "grad_norm": 0.7267031175592197, + "learning_rate": 3.264549284090336e-05, + "loss": 0.8997, + "step": 8403 + }, + { + "epoch": 0.4191521197007481, + "grad_norm": 0.44298051644116027, + "learning_rate": 3.2641647773571716e-05, + "loss": 0.9178, + "step": 8404 + }, + { + "epoch": 0.41920199501246885, + "grad_norm": 0.4634557043668073, + "learning_rate": 3.2637802506834686e-05, + "loss": 0.8596, + "step": 8405 + }, + { + "epoch": 0.41925187032418954, + "grad_norm": 0.5670271865707033, + "learning_rate": 3.263395704079262e-05, + "loss": 0.8502, + "step": 8406 + }, + { + "epoch": 0.4193017456359102, + "grad_norm": 0.5658034639509241, + "learning_rate": 3.263011137554586e-05, + "loss": 0.9244, + "step": 8407 + }, + { + "epoch": 0.4193516209476309, + "grad_norm": 0.39963483338797456, + "learning_rate": 3.2626265511194765e-05, + "loss": 0.9103, + "step": 8408 + }, + { + "epoch": 0.41940149625935164, + "grad_norm": 0.4378805734219531, + "learning_rate": 3.262241944783968e-05, + "loss": 0.8529, + "step": 8409 + }, + { + "epoch": 0.4194513715710723, + "grad_norm": 0.45307347111004703, + "learning_rate": 3.2618573185580974e-05, + "loss": 0.912, + "step": 8410 + }, + { + "epoch": 0.419501246882793, + "grad_norm": 0.4675195606427179, + "learning_rate": 3.2614726724519005e-05, + "loss": 0.9044, + "step": 8411 + }, + { + "epoch": 0.41955112219451374, + "grad_norm": 0.5322575161066739, + "learning_rate": 3.261088006475416e-05, + "loss": 0.8836, + "step": 8412 + }, + { + "epoch": 0.4196009975062344, + "grad_norm": 0.5911569113943258, + "learning_rate": 3.260703320638679e-05, + "loss": 0.9138, + "step": 8413 + }, + { + "epoch": 0.4196508728179551, + "grad_norm": 0.5073343220040403, + "learning_rate": 3.260318614951731e-05, + "loss": 0.8988, + "step": 8414 + }, + { + "epoch": 0.41970074812967584, + "grad_norm": 0.4758775633519127, + "learning_rate": 3.259933889424608e-05, + "loss": 0.8739, + "step": 8415 + }, + { + "epoch": 0.4197506234413965, + "grad_norm": 0.4240357412210988, + "learning_rate": 3.259549144067351e-05, + "loss": 0.8824, + "step": 8416 + }, + { + "epoch": 0.4198004987531172, + "grad_norm": 0.5032462473051483, + "learning_rate": 3.2591643788899984e-05, + "loss": 0.8572, + "step": 8417 + }, + { + "epoch": 0.4198503740648379, + "grad_norm": 0.47860763187112015, + "learning_rate": 3.258779593902591e-05, + "loss": 0.918, + "step": 8418 + }, + { + "epoch": 0.4199002493765586, + "grad_norm": 0.41953724915817797, + "learning_rate": 3.25839478911517e-05, + "loss": 0.8713, + "step": 8419 + }, + { + "epoch": 0.4199501246882793, + "grad_norm": 0.3994200393939449, + "learning_rate": 3.258009964537776e-05, + "loss": 0.9056, + "step": 8420 + }, + { + "epoch": 0.42, + "grad_norm": 0.5082721875969876, + "learning_rate": 3.2576251201804517e-05, + "loss": 0.8975, + "step": 8421 + }, + { + "epoch": 0.4200498753117207, + "grad_norm": 0.4552773617591677, + "learning_rate": 3.2572402560532386e-05, + "loss": 0.9382, + "step": 8422 + }, + { + "epoch": 0.4200997506234414, + "grad_norm": 0.49987224828293997, + "learning_rate": 3.25685537216618e-05, + "loss": 0.9053, + "step": 8423 + }, + { + "epoch": 0.4201496259351621, + "grad_norm": 0.4477003057558162, + "learning_rate": 3.256470468529319e-05, + "loss": 0.8616, + "step": 8424 + }, + { + "epoch": 0.42019950124688277, + "grad_norm": 0.4503997174879386, + "learning_rate": 3.2560855451527005e-05, + "loss": 0.9169, + "step": 8425 + }, + { + "epoch": 0.4202493765586035, + "grad_norm": 0.681909345102017, + "learning_rate": 3.2557006020463676e-05, + "loss": 0.8951, + "step": 8426 + }, + { + "epoch": 0.4202992518703242, + "grad_norm": 0.9075288122765364, + "learning_rate": 3.255315639220366e-05, + "loss": 0.8921, + "step": 8427 + }, + { + "epoch": 0.42034912718204487, + "grad_norm": 0.5771739367920328, + "learning_rate": 3.25493065668474e-05, + "loss": 0.8794, + "step": 8428 + }, + { + "epoch": 0.4203990024937656, + "grad_norm": 0.45691046689738773, + "learning_rate": 3.2545456544495365e-05, + "loss": 0.9057, + "step": 8429 + }, + { + "epoch": 0.4204488778054863, + "grad_norm": 0.5510366595118709, + "learning_rate": 3.254160632524803e-05, + "loss": 0.8677, + "step": 8430 + }, + { + "epoch": 0.42049875311720697, + "grad_norm": 0.5016614618380628, + "learning_rate": 3.253775590920584e-05, + "loss": 0.9332, + "step": 8431 + }, + { + "epoch": 0.4205486284289277, + "grad_norm": 0.45348843737812466, + "learning_rate": 3.2533905296469294e-05, + "loss": 0.8986, + "step": 8432 + }, + { + "epoch": 0.4205985037406484, + "grad_norm": 0.5095072445894795, + "learning_rate": 3.253005448713885e-05, + "loss": 0.946, + "step": 8433 + }, + { + "epoch": 0.42064837905236907, + "grad_norm": 0.4796149225544742, + "learning_rate": 3.252620348131501e-05, + "loss": 0.9032, + "step": 8434 + }, + { + "epoch": 0.42069825436408975, + "grad_norm": 1.0965115928374296, + "learning_rate": 3.252235227909826e-05, + "loss": 0.9078, + "step": 8435 + }, + { + "epoch": 0.4207481296758105, + "grad_norm": 0.5189235190493007, + "learning_rate": 3.251850088058909e-05, + "loss": 0.878, + "step": 8436 + }, + { + "epoch": 0.42079800498753117, + "grad_norm": 0.4671565532621709, + "learning_rate": 3.2514649285888e-05, + "loss": 0.8548, + "step": 8437 + }, + { + "epoch": 0.42084788029925185, + "grad_norm": 0.511870053006887, + "learning_rate": 3.251079749509551e-05, + "loss": 0.9059, + "step": 8438 + }, + { + "epoch": 0.4208977556109726, + "grad_norm": 0.7030346004261336, + "learning_rate": 3.250694550831212e-05, + "loss": 0.8775, + "step": 8439 + }, + { + "epoch": 0.42094763092269327, + "grad_norm": 0.5615344382425893, + "learning_rate": 3.250309332563833e-05, + "loss": 0.9474, + "step": 8440 + }, + { + "epoch": 0.42099750623441395, + "grad_norm": 0.4322178638105696, + "learning_rate": 3.2499240947174696e-05, + "loss": 0.9085, + "step": 8441 + }, + { + "epoch": 0.42104738154613464, + "grad_norm": 0.5148090840726687, + "learning_rate": 3.249538837302172e-05, + "loss": 0.8871, + "step": 8442 + }, + { + "epoch": 0.4210972568578554, + "grad_norm": 0.4535189129257466, + "learning_rate": 3.2491535603279946e-05, + "loss": 0.9204, + "step": 8443 + }, + { + "epoch": 0.42114713216957606, + "grad_norm": 0.44828534875412207, + "learning_rate": 3.248768263804989e-05, + "loss": 0.88, + "step": 8444 + }, + { + "epoch": 0.42119700748129674, + "grad_norm": 0.5076497466418415, + "learning_rate": 3.2483829477432115e-05, + "loss": 0.9125, + "step": 8445 + }, + { + "epoch": 0.4212468827930175, + "grad_norm": 0.3875178844254606, + "learning_rate": 3.247997612152715e-05, + "loss": 0.9167, + "step": 8446 + }, + { + "epoch": 0.42129675810473816, + "grad_norm": 0.47514247065252635, + "learning_rate": 3.247612257043557e-05, + "loss": 0.8736, + "step": 8447 + }, + { + "epoch": 0.42134663341645884, + "grad_norm": 0.4499339874249109, + "learning_rate": 3.247226882425791e-05, + "loss": 0.9113, + "step": 8448 + }, + { + "epoch": 0.4213965087281796, + "grad_norm": 1.341273999611278, + "learning_rate": 3.246841488309474e-05, + "loss": 0.9585, + "step": 8449 + }, + { + "epoch": 0.42144638403990026, + "grad_norm": 0.5093541872464963, + "learning_rate": 3.246456074704662e-05, + "loss": 0.8984, + "step": 8450 + }, + { + "epoch": 0.42149625935162094, + "grad_norm": 0.9133293074096959, + "learning_rate": 3.246070641621414e-05, + "loss": 0.8622, + "step": 8451 + }, + { + "epoch": 0.4215461346633416, + "grad_norm": 0.46802664435237545, + "learning_rate": 3.245685189069785e-05, + "loss": 0.8807, + "step": 8452 + }, + { + "epoch": 0.42159600997506236, + "grad_norm": 0.42759070901305896, + "learning_rate": 3.2452997170598366e-05, + "loss": 0.9023, + "step": 8453 + }, + { + "epoch": 0.42164588528678304, + "grad_norm": 0.42802466259270355, + "learning_rate": 3.2449142256016246e-05, + "loss": 0.9074, + "step": 8454 + }, + { + "epoch": 0.4216957605985037, + "grad_norm": 0.5002865888016343, + "learning_rate": 3.2445287147052086e-05, + "loss": 0.8949, + "step": 8455 + }, + { + "epoch": 0.42174563591022446, + "grad_norm": 0.5482421212821347, + "learning_rate": 3.24414318438065e-05, + "loss": 0.8903, + "step": 8456 + }, + { + "epoch": 0.42179551122194514, + "grad_norm": 0.7231792357073757, + "learning_rate": 3.243757634638008e-05, + "loss": 0.849, + "step": 8457 + }, + { + "epoch": 0.4218453865336658, + "grad_norm": 0.49840465476100626, + "learning_rate": 3.243372065487343e-05, + "loss": 0.8993, + "step": 8458 + }, + { + "epoch": 0.42189526184538656, + "grad_norm": 0.44195766017136145, + "learning_rate": 3.2429864769387166e-05, + "loss": 0.9533, + "step": 8459 + }, + { + "epoch": 0.42194513715710724, + "grad_norm": 0.8761166655668986, + "learning_rate": 3.2426008690021905e-05, + "loss": 0.9134, + "step": 8460 + }, + { + "epoch": 0.4219950124688279, + "grad_norm": 0.511897558376717, + "learning_rate": 3.242215241687827e-05, + "loss": 0.9029, + "step": 8461 + }, + { + "epoch": 0.4220448877805486, + "grad_norm": 0.45658208722101096, + "learning_rate": 3.24182959500569e-05, + "loss": 0.9647, + "step": 8462 + }, + { + "epoch": 0.42209476309226934, + "grad_norm": 0.43322666322960646, + "learning_rate": 3.2414439289658406e-05, + "loss": 0.8958, + "step": 8463 + }, + { + "epoch": 0.42214463840399, + "grad_norm": 0.4172853951238001, + "learning_rate": 3.2410582435783444e-05, + "loss": 0.9191, + "step": 8464 + }, + { + "epoch": 0.4221945137157107, + "grad_norm": 0.41384326276216177, + "learning_rate": 3.240672538853264e-05, + "loss": 0.9186, + "step": 8465 + }, + { + "epoch": 0.42224438902743144, + "grad_norm": 0.7900490630072959, + "learning_rate": 3.240286814800666e-05, + "loss": 0.8923, + "step": 8466 + }, + { + "epoch": 0.4222942643391521, + "grad_norm": 0.523020522827579, + "learning_rate": 3.239901071430616e-05, + "loss": 0.91, + "step": 8467 + }, + { + "epoch": 0.4223441396508728, + "grad_norm": 0.567591703766565, + "learning_rate": 3.2395153087531764e-05, + "loss": 0.9066, + "step": 8468 + }, + { + "epoch": 0.4223940149625935, + "grad_norm": 0.4092417660431948, + "learning_rate": 3.239129526778417e-05, + "loss": 0.9086, + "step": 8469 + }, + { + "epoch": 0.4224438902743142, + "grad_norm": 0.4976136854406583, + "learning_rate": 3.238743725516403e-05, + "loss": 0.8728, + "step": 8470 + }, + { + "epoch": 0.4224937655860349, + "grad_norm": 0.6523197526327735, + "learning_rate": 3.2383579049772024e-05, + "loss": 0.9031, + "step": 8471 + }, + { + "epoch": 0.4225436408977556, + "grad_norm": 0.4276329704448688, + "learning_rate": 3.237972065170882e-05, + "loss": 0.9086, + "step": 8472 + }, + { + "epoch": 0.4225935162094763, + "grad_norm": 1.5329356501556508, + "learning_rate": 3.237586206107512e-05, + "loss": 0.9075, + "step": 8473 + }, + { + "epoch": 0.422643391521197, + "grad_norm": 0.4175574612051936, + "learning_rate": 3.237200327797159e-05, + "loss": 0.8811, + "step": 8474 + }, + { + "epoch": 0.4226932668329177, + "grad_norm": 0.8668191939262541, + "learning_rate": 3.2368144302498945e-05, + "loss": 0.9077, + "step": 8475 + }, + { + "epoch": 0.42274314214463843, + "grad_norm": 0.4390689131839121, + "learning_rate": 3.236428513475786e-05, + "loss": 0.8999, + "step": 8476 + }, + { + "epoch": 0.4227930174563591, + "grad_norm": 0.4287819584934051, + "learning_rate": 3.2360425774849055e-05, + "loss": 0.9308, + "step": 8477 + }, + { + "epoch": 0.4228428927680798, + "grad_norm": 0.5147944311533839, + "learning_rate": 3.2356566222873235e-05, + "loss": 0.9254, + "step": 8478 + }, + { + "epoch": 0.4228927680798005, + "grad_norm": 0.4837080030764692, + "learning_rate": 3.2352706478931106e-05, + "loss": 0.9002, + "step": 8479 + }, + { + "epoch": 0.4229426433915212, + "grad_norm": 0.4913516243054069, + "learning_rate": 3.2348846543123404e-05, + "loss": 0.9049, + "step": 8480 + }, + { + "epoch": 0.4229925187032419, + "grad_norm": 0.5055140431783645, + "learning_rate": 3.2344986415550825e-05, + "loss": 0.93, + "step": 8481 + }, + { + "epoch": 0.4230423940149626, + "grad_norm": 0.6523364039533286, + "learning_rate": 3.234112609631412e-05, + "loss": 0.8944, + "step": 8482 + }, + { + "epoch": 0.4230922693266833, + "grad_norm": 0.45667343298965285, + "learning_rate": 3.233726558551402e-05, + "loss": 0.92, + "step": 8483 + }, + { + "epoch": 0.423142144638404, + "grad_norm": 0.6474802648992835, + "learning_rate": 3.233340488325126e-05, + "loss": 0.9359, + "step": 8484 + }, + { + "epoch": 0.4231920199501247, + "grad_norm": 0.52243222644853, + "learning_rate": 3.2329543989626574e-05, + "loss": 0.887, + "step": 8485 + }, + { + "epoch": 0.4232418952618454, + "grad_norm": 0.3918351564900964, + "learning_rate": 3.232568290474072e-05, + "loss": 0.8699, + "step": 8486 + }, + { + "epoch": 0.4232917705735661, + "grad_norm": 0.5525917124279489, + "learning_rate": 3.232182162869444e-05, + "loss": 0.9316, + "step": 8487 + }, + { + "epoch": 0.4233416458852868, + "grad_norm": 0.4887005598480787, + "learning_rate": 3.231796016158852e-05, + "loss": 0.889, + "step": 8488 + }, + { + "epoch": 0.42339152119700746, + "grad_norm": 0.8696531499350307, + "learning_rate": 3.231409850352369e-05, + "loss": 0.8873, + "step": 8489 + }, + { + "epoch": 0.4234413965087282, + "grad_norm": 0.5894120082234005, + "learning_rate": 3.231023665460074e-05, + "loss": 0.9011, + "step": 8490 + }, + { + "epoch": 0.4234912718204489, + "grad_norm": 0.5141834210308025, + "learning_rate": 3.230637461492043e-05, + "loss": 0.8857, + "step": 8491 + }, + { + "epoch": 0.42354114713216956, + "grad_norm": 0.7417361356577797, + "learning_rate": 3.230251238458355e-05, + "loss": 0.9236, + "step": 8492 + }, + { + "epoch": 0.4235910224438903, + "grad_norm": 0.7756979658736334, + "learning_rate": 3.229864996369088e-05, + "loss": 0.896, + "step": 8493 + }, + { + "epoch": 0.423640897755611, + "grad_norm": 1.7700570760018501, + "learning_rate": 3.2294787352343194e-05, + "loss": 0.892, + "step": 8494 + }, + { + "epoch": 0.42369077306733166, + "grad_norm": 0.5401588821600409, + "learning_rate": 3.229092455064131e-05, + "loss": 0.9024, + "step": 8495 + }, + { + "epoch": 0.42374064837905234, + "grad_norm": 0.4795041494344257, + "learning_rate": 3.2287061558686e-05, + "loss": 0.8717, + "step": 8496 + }, + { + "epoch": 0.4237905236907731, + "grad_norm": 0.40864475978601716, + "learning_rate": 3.228319837657809e-05, + "loss": 0.8496, + "step": 8497 + }, + { + "epoch": 0.42384039900249376, + "grad_norm": 0.4114712733496171, + "learning_rate": 3.227933500441838e-05, + "loss": 0.9243, + "step": 8498 + }, + { + "epoch": 0.42389027431421444, + "grad_norm": 0.5851246882659231, + "learning_rate": 3.227547144230767e-05, + "loss": 0.9106, + "step": 8499 + }, + { + "epoch": 0.4239401496259352, + "grad_norm": 0.787394869286654, + "learning_rate": 3.22716076903468e-05, + "loss": 0.969, + "step": 8500 + }, + { + "epoch": 0.42399002493765586, + "grad_norm": 0.5179821710876151, + "learning_rate": 3.226774374863657e-05, + "loss": 0.9203, + "step": 8501 + }, + { + "epoch": 0.42403990024937654, + "grad_norm": 0.44928511513860026, + "learning_rate": 3.226387961727783e-05, + "loss": 0.9251, + "step": 8502 + }, + { + "epoch": 0.4240897755610973, + "grad_norm": 0.42490850915443673, + "learning_rate": 3.22600152963714e-05, + "loss": 0.8811, + "step": 8503 + }, + { + "epoch": 0.42413965087281796, + "grad_norm": 0.4944487186287153, + "learning_rate": 3.2256150786018116e-05, + "loss": 0.9004, + "step": 8504 + }, + { + "epoch": 0.42418952618453865, + "grad_norm": 0.440191726270011, + "learning_rate": 3.225228608631883e-05, + "loss": 0.9394, + "step": 8505 + }, + { + "epoch": 0.4242394014962593, + "grad_norm": 0.5012398181398313, + "learning_rate": 3.2248421197374376e-05, + "loss": 0.8376, + "step": 8506 + }, + { + "epoch": 0.42428927680798006, + "grad_norm": 0.5906748395361818, + "learning_rate": 3.2244556119285616e-05, + "loss": 0.9042, + "step": 8507 + }, + { + "epoch": 0.42433915211970075, + "grad_norm": 0.9236480965789329, + "learning_rate": 3.2240690852153414e-05, + "loss": 0.9062, + "step": 8508 + }, + { + "epoch": 0.42438902743142143, + "grad_norm": 0.4516016815067994, + "learning_rate": 3.223682539607862e-05, + "loss": 0.965, + "step": 8509 + }, + { + "epoch": 0.42443890274314217, + "grad_norm": 0.6325464381049765, + "learning_rate": 3.223295975116211e-05, + "loss": 0.875, + "step": 8510 + }, + { + "epoch": 0.42448877805486285, + "grad_norm": 0.47286030734032725, + "learning_rate": 3.222909391750474e-05, + "loss": 0.9041, + "step": 8511 + }, + { + "epoch": 0.42453865336658353, + "grad_norm": 0.6181688974615825, + "learning_rate": 3.2225227895207416e-05, + "loss": 0.8946, + "step": 8512 + }, + { + "epoch": 0.42458852867830427, + "grad_norm": 0.8067302594855176, + "learning_rate": 3.222136168437099e-05, + "loss": 0.8909, + "step": 8513 + }, + { + "epoch": 0.42463840399002495, + "grad_norm": 0.5564713897125493, + "learning_rate": 3.221749528509637e-05, + "loss": 0.9088, + "step": 8514 + }, + { + "epoch": 0.42468827930174563, + "grad_norm": 0.4942299073768183, + "learning_rate": 3.2213628697484445e-05, + "loss": 0.9265, + "step": 8515 + }, + { + "epoch": 0.4247381546134663, + "grad_norm": 0.5043627365761303, + "learning_rate": 3.22097619216361e-05, + "loss": 0.8747, + "step": 8516 + }, + { + "epoch": 0.42478802992518705, + "grad_norm": 0.7989931380546649, + "learning_rate": 3.2205894957652245e-05, + "loss": 0.9057, + "step": 8517 + }, + { + "epoch": 0.42483790523690773, + "grad_norm": 0.5005605514684799, + "learning_rate": 3.220202780563378e-05, + "loss": 0.9662, + "step": 8518 + }, + { + "epoch": 0.4248877805486284, + "grad_norm": 0.5777786198388083, + "learning_rate": 3.2198160465681635e-05, + "loss": 0.8884, + "step": 8519 + }, + { + "epoch": 0.42493765586034915, + "grad_norm": 0.7072053920375878, + "learning_rate": 3.219429293789671e-05, + "loss": 0.944, + "step": 8520 + }, + { + "epoch": 0.42498753117206983, + "grad_norm": 0.47702421500887704, + "learning_rate": 3.219042522237993e-05, + "loss": 0.8631, + "step": 8521 + }, + { + "epoch": 0.4250374064837905, + "grad_norm": 0.9113252916565651, + "learning_rate": 3.218655731923222e-05, + "loss": 0.8882, + "step": 8522 + }, + { + "epoch": 0.4250872817955112, + "grad_norm": 0.5603926579971069, + "learning_rate": 3.2182689228554517e-05, + "loss": 0.9209, + "step": 8523 + }, + { + "epoch": 0.42513715710723193, + "grad_norm": 0.5300090187074743, + "learning_rate": 3.2178820950447754e-05, + "loss": 0.9197, + "step": 8524 + }, + { + "epoch": 0.4251870324189526, + "grad_norm": 1.1616740586200986, + "learning_rate": 3.2174952485012866e-05, + "loss": 0.9617, + "step": 8525 + }, + { + "epoch": 0.4252369077306733, + "grad_norm": 0.43746459022232037, + "learning_rate": 3.2171083832350805e-05, + "loss": 0.9123, + "step": 8526 + }, + { + "epoch": 0.42528678304239403, + "grad_norm": 0.47714818212121357, + "learning_rate": 3.216721499256252e-05, + "loss": 0.8999, + "step": 8527 + }, + { + "epoch": 0.4253366583541147, + "grad_norm": 0.4172894836673184, + "learning_rate": 3.216334596574897e-05, + "loss": 0.8596, + "step": 8528 + }, + { + "epoch": 0.4253865336658354, + "grad_norm": 0.688074817725996, + "learning_rate": 3.2159476752011114e-05, + "loss": 0.9048, + "step": 8529 + }, + { + "epoch": 0.42543640897755614, + "grad_norm": 0.5016199991944162, + "learning_rate": 3.2155607351449914e-05, + "loss": 0.9097, + "step": 8530 + }, + { + "epoch": 0.4254862842892768, + "grad_norm": 0.5455969972813016, + "learning_rate": 3.2151737764166346e-05, + "loss": 0.9455, + "step": 8531 + }, + { + "epoch": 0.4255361596009975, + "grad_norm": 0.4387484023556311, + "learning_rate": 3.2147867990261374e-05, + "loss": 0.866, + "step": 8532 + }, + { + "epoch": 0.4255860349127182, + "grad_norm": 0.5613553027236727, + "learning_rate": 3.2143998029835994e-05, + "loss": 0.9137, + "step": 8533 + }, + { + "epoch": 0.4256359102244389, + "grad_norm": 0.544468995252468, + "learning_rate": 3.214012788299118e-05, + "loss": 0.9187, + "step": 8534 + }, + { + "epoch": 0.4256857855361596, + "grad_norm": 0.44486904141764233, + "learning_rate": 3.213625754982792e-05, + "loss": 0.851, + "step": 8535 + }, + { + "epoch": 0.4257356608478803, + "grad_norm": 0.5493674067068728, + "learning_rate": 3.213238703044722e-05, + "loss": 0.9504, + "step": 8536 + }, + { + "epoch": 0.425785536159601, + "grad_norm": 1.4024230716456847, + "learning_rate": 3.212851632495007e-05, + "loss": 0.9027, + "step": 8537 + }, + { + "epoch": 0.4258354114713217, + "grad_norm": 0.48699765909132503, + "learning_rate": 3.212464543343747e-05, + "loss": 0.9144, + "step": 8538 + }, + { + "epoch": 0.4258852867830424, + "grad_norm": 0.4797916515026168, + "learning_rate": 3.212077435601045e-05, + "loss": 0.8776, + "step": 8539 + }, + { + "epoch": 0.42593516209476306, + "grad_norm": 0.5023569376873589, + "learning_rate": 3.2116903092769995e-05, + "loss": 0.9073, + "step": 8540 + }, + { + "epoch": 0.4259850374064838, + "grad_norm": 0.4880717451386041, + "learning_rate": 3.211303164381715e-05, + "loss": 0.8878, + "step": 8541 + }, + { + "epoch": 0.4260349127182045, + "grad_norm": 0.5532197643444543, + "learning_rate": 3.210916000925292e-05, + "loss": 0.9588, + "step": 8542 + }, + { + "epoch": 0.42608478802992517, + "grad_norm": 0.423929844927819, + "learning_rate": 3.2105288189178344e-05, + "loss": 0.9086, + "step": 8543 + }, + { + "epoch": 0.4261346633416459, + "grad_norm": 0.5319176869072876, + "learning_rate": 3.210141618369446e-05, + "loss": 0.914, + "step": 8544 + }, + { + "epoch": 0.4261845386533666, + "grad_norm": 0.47112341506691124, + "learning_rate": 3.2097543992902293e-05, + "loss": 0.887, + "step": 8545 + }, + { + "epoch": 0.42623441396508727, + "grad_norm": 0.47714193399651034, + "learning_rate": 3.209367161690288e-05, + "loss": 0.8703, + "step": 8546 + }, + { + "epoch": 0.426284289276808, + "grad_norm": 0.5675404928656098, + "learning_rate": 3.2089799055797295e-05, + "loss": 0.9265, + "step": 8547 + }, + { + "epoch": 0.4263341645885287, + "grad_norm": 0.7086106622826757, + "learning_rate": 3.208592630968658e-05, + "loss": 0.9073, + "step": 8548 + }, + { + "epoch": 0.42638403990024937, + "grad_norm": 0.49819888085573916, + "learning_rate": 3.208205337867178e-05, + "loss": 0.9272, + "step": 8549 + }, + { + "epoch": 0.42643391521197005, + "grad_norm": 0.4256177392530872, + "learning_rate": 3.207818026285396e-05, + "loss": 0.8527, + "step": 8550 + }, + { + "epoch": 0.4264837905236908, + "grad_norm": 0.4933815409914395, + "learning_rate": 3.207430696233421e-05, + "loss": 0.8937, + "step": 8551 + }, + { + "epoch": 0.42653366583541147, + "grad_norm": 0.5025057869062068, + "learning_rate": 3.207043347721357e-05, + "loss": 0.8847, + "step": 8552 + }, + { + "epoch": 0.42658354114713215, + "grad_norm": 0.6463337290012952, + "learning_rate": 3.2066559807593145e-05, + "loss": 0.8549, + "step": 8553 + }, + { + "epoch": 0.4266334164588529, + "grad_norm": 0.4688891454549792, + "learning_rate": 3.2062685953574e-05, + "loss": 0.9442, + "step": 8554 + }, + { + "epoch": 0.42668329177057357, + "grad_norm": 0.4556537124715502, + "learning_rate": 3.2058811915257225e-05, + "loss": 0.9153, + "step": 8555 + }, + { + "epoch": 0.42673316708229425, + "grad_norm": 0.47613452309484366, + "learning_rate": 3.205493769274392e-05, + "loss": 0.899, + "step": 8556 + }, + { + "epoch": 0.426783042394015, + "grad_norm": 0.5459534060088365, + "learning_rate": 3.2051063286135164e-05, + "loss": 0.8834, + "step": 8557 + }, + { + "epoch": 0.42683291770573567, + "grad_norm": 0.5133750802858839, + "learning_rate": 3.204718869553207e-05, + "loss": 0.871, + "step": 8558 + }, + { + "epoch": 0.42688279301745635, + "grad_norm": 0.46697231546201706, + "learning_rate": 3.2043313921035743e-05, + "loss": 0.9352, + "step": 8559 + }, + { + "epoch": 0.42693266832917703, + "grad_norm": 0.4711895145769806, + "learning_rate": 3.203943896274729e-05, + "loss": 0.8962, + "step": 8560 + }, + { + "epoch": 0.42698254364089777, + "grad_norm": 0.47853099068469046, + "learning_rate": 3.203556382076783e-05, + "loss": 0.9044, + "step": 8561 + }, + { + "epoch": 0.42703241895261845, + "grad_norm": 0.4402048888944123, + "learning_rate": 3.2031688495198474e-05, + "loss": 0.8914, + "step": 8562 + }, + { + "epoch": 0.42708229426433914, + "grad_norm": 0.6265385661746956, + "learning_rate": 3.202781298614036e-05, + "loss": 0.9576, + "step": 8563 + }, + { + "epoch": 0.4271321695760599, + "grad_norm": 0.5550364742620755, + "learning_rate": 3.202393729369461e-05, + "loss": 0.9252, + "step": 8564 + }, + { + "epoch": 0.42718204488778055, + "grad_norm": 0.42487353739665923, + "learning_rate": 3.202006141796236e-05, + "loss": 0.8729, + "step": 8565 + }, + { + "epoch": 0.42723192019950124, + "grad_norm": 0.526060536118556, + "learning_rate": 3.201618535904475e-05, + "loss": 0.8848, + "step": 8566 + }, + { + "epoch": 0.4272817955112219, + "grad_norm": 0.4934064587285436, + "learning_rate": 3.201230911704292e-05, + "loss": 0.8764, + "step": 8567 + }, + { + "epoch": 0.42733167082294266, + "grad_norm": 0.5237443319267564, + "learning_rate": 3.200843269205802e-05, + "loss": 0.9341, + "step": 8568 + }, + { + "epoch": 0.42738154613466334, + "grad_norm": 0.49361701100060423, + "learning_rate": 3.2004556084191214e-05, + "loss": 0.9807, + "step": 8569 + }, + { + "epoch": 0.427431421446384, + "grad_norm": 0.44085533421107775, + "learning_rate": 3.200067929354365e-05, + "loss": 0.8855, + "step": 8570 + }, + { + "epoch": 0.42748129675810476, + "grad_norm": 2.031638436185728, + "learning_rate": 3.1996802320216486e-05, + "loss": 0.9408, + "step": 8571 + }, + { + "epoch": 0.42753117206982544, + "grad_norm": 0.5444705612226926, + "learning_rate": 3.19929251643109e-05, + "loss": 0.8744, + "step": 8572 + }, + { + "epoch": 0.4275810473815461, + "grad_norm": 0.48750618689908387, + "learning_rate": 3.198904782592806e-05, + "loss": 0.8944, + "step": 8573 + }, + { + "epoch": 0.42763092269326686, + "grad_norm": 0.5242606130103948, + "learning_rate": 3.198517030516915e-05, + "loss": 0.9407, + "step": 8574 + }, + { + "epoch": 0.42768079800498754, + "grad_norm": 0.5537098147707226, + "learning_rate": 3.1981292602135335e-05, + "loss": 0.8993, + "step": 8575 + }, + { + "epoch": 0.4277306733167082, + "grad_norm": 0.5299786032899935, + "learning_rate": 3.1977414716927834e-05, + "loss": 0.9241, + "step": 8576 + }, + { + "epoch": 0.4277805486284289, + "grad_norm": 0.4995101727269242, + "learning_rate": 3.19735366496478e-05, + "loss": 0.8811, + "step": 8577 + }, + { + "epoch": 0.42783042394014964, + "grad_norm": 0.5166314921517068, + "learning_rate": 3.1969658400396464e-05, + "loss": 0.9161, + "step": 8578 + }, + { + "epoch": 0.4278802992518703, + "grad_norm": 0.5293830194914646, + "learning_rate": 3.1965779969274995e-05, + "loss": 0.8885, + "step": 8579 + }, + { + "epoch": 0.427930174563591, + "grad_norm": 0.4626206703475937, + "learning_rate": 3.1961901356384626e-05, + "loss": 0.9193, + "step": 8580 + }, + { + "epoch": 0.42798004987531174, + "grad_norm": 0.6472712077288466, + "learning_rate": 3.1958022561826554e-05, + "loss": 0.9444, + "step": 8581 + }, + { + "epoch": 0.4280299251870324, + "grad_norm": 0.46524363777098365, + "learning_rate": 3.1954143585701996e-05, + "loss": 0.8749, + "step": 8582 + }, + { + "epoch": 0.4280798004987531, + "grad_norm": 0.7735624054928228, + "learning_rate": 3.195026442811217e-05, + "loss": 0.9018, + "step": 8583 + }, + { + "epoch": 0.42812967581047384, + "grad_norm": 0.4626082325062449, + "learning_rate": 3.194638508915832e-05, + "loss": 0.9114, + "step": 8584 + }, + { + "epoch": 0.4281795511221945, + "grad_norm": 0.5845183772659518, + "learning_rate": 3.194250556894164e-05, + "loss": 0.906, + "step": 8585 + }, + { + "epoch": 0.4282294264339152, + "grad_norm": 0.5743736632154626, + "learning_rate": 3.1938625867563394e-05, + "loss": 0.9104, + "step": 8586 + }, + { + "epoch": 0.4282793017456359, + "grad_norm": 0.5360453173177879, + "learning_rate": 3.193474598512481e-05, + "loss": 0.9102, + "step": 8587 + }, + { + "epoch": 0.4283291770573566, + "grad_norm": 0.6321592367516684, + "learning_rate": 3.193086592172713e-05, + "loss": 0.9606, + "step": 8588 + }, + { + "epoch": 0.4283790523690773, + "grad_norm": 0.5603422911858605, + "learning_rate": 3.192698567747161e-05, + "loss": 0.9052, + "step": 8589 + }, + { + "epoch": 0.428428927680798, + "grad_norm": 0.5045384970573835, + "learning_rate": 3.19231052524595e-05, + "loss": 0.8828, + "step": 8590 + }, + { + "epoch": 0.4284788029925187, + "grad_norm": 0.5165417965651455, + "learning_rate": 3.191922464679205e-05, + "loss": 0.8995, + "step": 8591 + }, + { + "epoch": 0.4285286783042394, + "grad_norm": 0.566711096396206, + "learning_rate": 3.191534386057053e-05, + "loss": 0.8573, + "step": 8592 + }, + { + "epoch": 0.4285785536159601, + "grad_norm": 0.5103121023138896, + "learning_rate": 3.1911462893896214e-05, + "loss": 0.8746, + "step": 8593 + }, + { + "epoch": 0.42862842892768077, + "grad_norm": 0.8065530202067709, + "learning_rate": 3.190758174687036e-05, + "loss": 0.8705, + "step": 8594 + }, + { + "epoch": 0.4286783042394015, + "grad_norm": 0.46506184194841704, + "learning_rate": 3.190370041959426e-05, + "loss": 0.9231, + "step": 8595 + }, + { + "epoch": 0.4287281795511222, + "grad_norm": 0.6007559389218039, + "learning_rate": 3.189981891216919e-05, + "loss": 0.8607, + "step": 8596 + }, + { + "epoch": 0.4287780548628429, + "grad_norm": 0.43108796424779167, + "learning_rate": 3.1895937224696416e-05, + "loss": 0.8842, + "step": 8597 + }, + { + "epoch": 0.4288279301745636, + "grad_norm": 0.43964868138915886, + "learning_rate": 3.189205535727726e-05, + "loss": 0.8545, + "step": 8598 + }, + { + "epoch": 0.4288778054862843, + "grad_norm": 0.4873548508228008, + "learning_rate": 3.1888173310013e-05, + "loss": 0.9161, + "step": 8599 + }, + { + "epoch": 0.428927680798005, + "grad_norm": 0.6456388109266721, + "learning_rate": 3.1884291083004944e-05, + "loss": 0.865, + "step": 8600 + }, + { + "epoch": 0.4289775561097257, + "grad_norm": 0.4765093490189755, + "learning_rate": 3.188040867635438e-05, + "loss": 0.9033, + "step": 8601 + }, + { + "epoch": 0.4290274314214464, + "grad_norm": 0.41621044971058735, + "learning_rate": 3.187652609016265e-05, + "loss": 0.8781, + "step": 8602 + }, + { + "epoch": 0.4290773067331671, + "grad_norm": 0.44523937225696075, + "learning_rate": 3.187264332453104e-05, + "loss": 0.8754, + "step": 8603 + }, + { + "epoch": 0.42912718204488776, + "grad_norm": 0.48574887742445266, + "learning_rate": 3.186876037956088e-05, + "loss": 0.9149, + "step": 8604 + }, + { + "epoch": 0.4291770573566085, + "grad_norm": 0.6564640199541886, + "learning_rate": 3.186487725535349e-05, + "loss": 0.9539, + "step": 8605 + }, + { + "epoch": 0.4292269326683292, + "grad_norm": 0.5855960976587697, + "learning_rate": 3.18609939520102e-05, + "loss": 0.871, + "step": 8606 + }, + { + "epoch": 0.42927680798004986, + "grad_norm": 0.5173610669237796, + "learning_rate": 3.185711046963234e-05, + "loss": 0.8874, + "step": 8607 + }, + { + "epoch": 0.4293266832917706, + "grad_norm": 0.5270517020040044, + "learning_rate": 3.185322680832126e-05, + "loss": 0.8965, + "step": 8608 + }, + { + "epoch": 0.4293765586034913, + "grad_norm": 0.4984657287396873, + "learning_rate": 3.1849342968178285e-05, + "loss": 0.9169, + "step": 8609 + }, + { + "epoch": 0.42942643391521196, + "grad_norm": 0.7718266257568418, + "learning_rate": 3.184545894930477e-05, + "loss": 0.8648, + "step": 8610 + }, + { + "epoch": 0.4294763092269327, + "grad_norm": 0.46639928999545155, + "learning_rate": 3.1841574751802076e-05, + "loss": 0.8785, + "step": 8611 + }, + { + "epoch": 0.4295261845386534, + "grad_norm": 0.4686625045941696, + "learning_rate": 3.1837690375771545e-05, + "loss": 0.9006, + "step": 8612 + }, + { + "epoch": 0.42957605985037406, + "grad_norm": 0.5007591969491972, + "learning_rate": 3.183380582131455e-05, + "loss": 0.9434, + "step": 8613 + }, + { + "epoch": 0.42962593516209474, + "grad_norm": 0.8734848584522305, + "learning_rate": 3.182992108853244e-05, + "loss": 0.9095, + "step": 8614 + }, + { + "epoch": 0.4296758104738155, + "grad_norm": 0.4160248014533724, + "learning_rate": 3.18260361775266e-05, + "loss": 0.8895, + "step": 8615 + }, + { + "epoch": 0.42972568578553616, + "grad_norm": 0.4308718034412691, + "learning_rate": 3.18221510883984e-05, + "loss": 0.8821, + "step": 8616 + }, + { + "epoch": 0.42977556109725684, + "grad_norm": 0.7494016183522558, + "learning_rate": 3.181826582124923e-05, + "loss": 0.8295, + "step": 8617 + }, + { + "epoch": 0.4298254364089776, + "grad_norm": 0.6040353027999573, + "learning_rate": 3.1814380376180454e-05, + "loss": 0.9007, + "step": 8618 + }, + { + "epoch": 0.42987531172069826, + "grad_norm": 0.5195419700908609, + "learning_rate": 3.181049475329347e-05, + "loss": 0.8837, + "step": 8619 + }, + { + "epoch": 0.42992518703241894, + "grad_norm": 0.47484232396341536, + "learning_rate": 3.1806608952689686e-05, + "loss": 0.964, + "step": 8620 + }, + { + "epoch": 0.4299750623441396, + "grad_norm": 0.9472382630044193, + "learning_rate": 3.180272297447048e-05, + "loss": 0.9204, + "step": 8621 + }, + { + "epoch": 0.43002493765586036, + "grad_norm": 0.6142630147113733, + "learning_rate": 3.1798836818737266e-05, + "loss": 0.9076, + "step": 8622 + }, + { + "epoch": 0.43007481296758104, + "grad_norm": 0.5424921992817122, + "learning_rate": 3.179495048559144e-05, + "loss": 0.9308, + "step": 8623 + }, + { + "epoch": 0.4301246882793017, + "grad_norm": 0.609411898096565, + "learning_rate": 3.179106397513443e-05, + "loss": 0.9376, + "step": 8624 + }, + { + "epoch": 0.43017456359102246, + "grad_norm": 0.6759064771059927, + "learning_rate": 3.178717728746764e-05, + "loss": 0.9329, + "step": 8625 + }, + { + "epoch": 0.43022443890274314, + "grad_norm": 0.45604253960990826, + "learning_rate": 3.17832904226925e-05, + "loss": 0.9118, + "step": 8626 + }, + { + "epoch": 0.4302743142144638, + "grad_norm": 0.5822923068765512, + "learning_rate": 3.177940338091043e-05, + "loss": 0.8985, + "step": 8627 + }, + { + "epoch": 0.43032418952618456, + "grad_norm": 0.4453220916863593, + "learning_rate": 3.177551616222286e-05, + "loss": 0.9058, + "step": 8628 + }, + { + "epoch": 0.43037406483790525, + "grad_norm": 0.4697407241973631, + "learning_rate": 3.1771628766731224e-05, + "loss": 0.9022, + "step": 8629 + }, + { + "epoch": 0.4304239401496259, + "grad_norm": 0.5160815957530905, + "learning_rate": 3.176774119453698e-05, + "loss": 0.9135, + "step": 8630 + }, + { + "epoch": 0.4304738154613466, + "grad_norm": 0.6082583954505475, + "learning_rate": 3.176385344574155e-05, + "loss": 0.9026, + "step": 8631 + }, + { + "epoch": 0.43052369077306735, + "grad_norm": 0.467143302368384, + "learning_rate": 3.1759965520446396e-05, + "loss": 0.9271, + "step": 8632 + }, + { + "epoch": 0.43057356608478803, + "grad_norm": 0.9380784476673838, + "learning_rate": 3.1756077418752967e-05, + "loss": 0.9375, + "step": 8633 + }, + { + "epoch": 0.4306234413965087, + "grad_norm": 0.4562611789877471, + "learning_rate": 3.1752189140762714e-05, + "loss": 0.8647, + "step": 8634 + }, + { + "epoch": 0.43067331670822945, + "grad_norm": 0.45800327086937564, + "learning_rate": 3.174830068657711e-05, + "loss": 0.9253, + "step": 8635 + }, + { + "epoch": 0.43072319201995013, + "grad_norm": 0.5665290974361032, + "learning_rate": 3.1744412056297625e-05, + "loss": 0.8695, + "step": 8636 + }, + { + "epoch": 0.4307730673316708, + "grad_norm": 0.4602263354479999, + "learning_rate": 3.1740523250025723e-05, + "loss": 0.9057, + "step": 8637 + }, + { + "epoch": 0.43082294264339155, + "grad_norm": 0.9469924926663258, + "learning_rate": 3.173663426786288e-05, + "loss": 0.9001, + "step": 8638 + }, + { + "epoch": 0.43087281795511223, + "grad_norm": 0.5817788752931915, + "learning_rate": 3.173274510991058e-05, + "loss": 0.9132, + "step": 8639 + }, + { + "epoch": 0.4309226932668329, + "grad_norm": 0.48142715732522084, + "learning_rate": 3.1728855776270315e-05, + "loss": 0.8945, + "step": 8640 + }, + { + "epoch": 0.4309725685785536, + "grad_norm": 0.3944632773576847, + "learning_rate": 3.172496626704357e-05, + "loss": 0.8485, + "step": 8641 + }, + { + "epoch": 0.43102244389027433, + "grad_norm": 0.9675742317306609, + "learning_rate": 3.1721076582331836e-05, + "loss": 0.9264, + "step": 8642 + }, + { + "epoch": 0.431072319201995, + "grad_norm": 0.7876784382407751, + "learning_rate": 3.171718672223662e-05, + "loss": 0.9246, + "step": 8643 + }, + { + "epoch": 0.4311221945137157, + "grad_norm": 0.6911040117449583, + "learning_rate": 3.1713296686859426e-05, + "loss": 0.8935, + "step": 8644 + }, + { + "epoch": 0.43117206982543643, + "grad_norm": 0.5524465407043259, + "learning_rate": 3.170940647630175e-05, + "loss": 0.9141, + "step": 8645 + }, + { + "epoch": 0.4312219451371571, + "grad_norm": 0.5715863539184769, + "learning_rate": 3.170551609066512e-05, + "loss": 0.887, + "step": 8646 + }, + { + "epoch": 0.4312718204488778, + "grad_norm": 0.48121380648679435, + "learning_rate": 3.170162553005105e-05, + "loss": 0.9441, + "step": 8647 + }, + { + "epoch": 0.4313216957605985, + "grad_norm": 0.48189320916030187, + "learning_rate": 3.169773479456106e-05, + "loss": 0.8993, + "step": 8648 + }, + { + "epoch": 0.4313715710723192, + "grad_norm": 0.7810838294034386, + "learning_rate": 3.169384388429667e-05, + "loss": 0.8978, + "step": 8649 + }, + { + "epoch": 0.4314214463840399, + "grad_norm": 0.4921559124305264, + "learning_rate": 3.168995279935944e-05, + "loss": 0.894, + "step": 8650 + }, + { + "epoch": 0.4314713216957606, + "grad_norm": 0.4310225260294227, + "learning_rate": 3.168606153985087e-05, + "loss": 0.8683, + "step": 8651 + }, + { + "epoch": 0.4315211970074813, + "grad_norm": 0.3955031826909621, + "learning_rate": 3.168217010587251e-05, + "loss": 0.8808, + "step": 8652 + }, + { + "epoch": 0.431571072319202, + "grad_norm": 0.4532724528842469, + "learning_rate": 3.167827849752592e-05, + "loss": 0.9652, + "step": 8653 + }, + { + "epoch": 0.4316209476309227, + "grad_norm": 0.5584713215719267, + "learning_rate": 3.1674386714912644e-05, + "loss": 0.8784, + "step": 8654 + }, + { + "epoch": 0.4316708229426434, + "grad_norm": 0.47050348803954734, + "learning_rate": 3.1670494758134235e-05, + "loss": 0.8357, + "step": 8655 + }, + { + "epoch": 0.4317206982543641, + "grad_norm": 0.5087427852482109, + "learning_rate": 3.166660262729224e-05, + "loss": 0.9056, + "step": 8656 + }, + { + "epoch": 0.4317705735660848, + "grad_norm": 0.45872686747901353, + "learning_rate": 3.166271032248825e-05, + "loss": 0.8182, + "step": 8657 + }, + { + "epoch": 0.43182044887780546, + "grad_norm": 0.5272254006970788, + "learning_rate": 3.16588178438238e-05, + "loss": 0.9008, + "step": 8658 + }, + { + "epoch": 0.4318703241895262, + "grad_norm": 0.4380038578187677, + "learning_rate": 3.165492519140049e-05, + "loss": 0.9037, + "step": 8659 + }, + { + "epoch": 0.4319201995012469, + "grad_norm": 0.49827187413027885, + "learning_rate": 3.1651032365319875e-05, + "loss": 0.8804, + "step": 8660 + }, + { + "epoch": 0.43197007481296756, + "grad_norm": 0.431514880322989, + "learning_rate": 3.164713936568355e-05, + "loss": 0.939, + "step": 8661 + }, + { + "epoch": 0.4320199501246883, + "grad_norm": 1.3567699475478467, + "learning_rate": 3.16432461925931e-05, + "loss": 0.9398, + "step": 8662 + }, + { + "epoch": 0.432069825436409, + "grad_norm": 0.4411654408295756, + "learning_rate": 3.163935284615011e-05, + "loss": 0.8526, + "step": 8663 + }, + { + "epoch": 0.43211970074812966, + "grad_norm": 0.4857807174881964, + "learning_rate": 3.163545932645619e-05, + "loss": 0.9055, + "step": 8664 + }, + { + "epoch": 0.43216957605985035, + "grad_norm": 0.44539803997548505, + "learning_rate": 3.1631565633612915e-05, + "loss": 0.9175, + "step": 8665 + }, + { + "epoch": 0.4322194513715711, + "grad_norm": 0.48364325776298633, + "learning_rate": 3.1627671767721915e-05, + "loss": 0.8963, + "step": 8666 + }, + { + "epoch": 0.43226932668329177, + "grad_norm": 0.41260060900468826, + "learning_rate": 3.162377772888477e-05, + "loss": 0.8612, + "step": 8667 + }, + { + "epoch": 0.43231920199501245, + "grad_norm": 0.42107754759928134, + "learning_rate": 3.161988351720313e-05, + "loss": 0.8817, + "step": 8668 + }, + { + "epoch": 0.4323690773067332, + "grad_norm": 0.6001843156250245, + "learning_rate": 3.161598913277858e-05, + "loss": 0.9569, + "step": 8669 + }, + { + "epoch": 0.43241895261845387, + "grad_norm": 0.4765033449252742, + "learning_rate": 3.161209457571276e-05, + "loss": 0.9206, + "step": 8670 + }, + { + "epoch": 0.43246882793017455, + "grad_norm": 0.453947017188435, + "learning_rate": 3.1608199846107287e-05, + "loss": 0.8956, + "step": 8671 + }, + { + "epoch": 0.4325187032418953, + "grad_norm": 0.5713350832301444, + "learning_rate": 3.160430494406379e-05, + "loss": 0.9165, + "step": 8672 + }, + { + "epoch": 0.43256857855361597, + "grad_norm": 0.9616226977575449, + "learning_rate": 3.1600409869683924e-05, + "loss": 0.9023, + "step": 8673 + }, + { + "epoch": 0.43261845386533665, + "grad_norm": 0.4659475522216996, + "learning_rate": 3.159651462306931e-05, + "loss": 0.9165, + "step": 8674 + }, + { + "epoch": 0.43266832917705733, + "grad_norm": 0.41850331329969254, + "learning_rate": 3.15926192043216e-05, + "loss": 0.8685, + "step": 8675 + }, + { + "epoch": 0.43271820448877807, + "grad_norm": 0.5113719296064103, + "learning_rate": 3.158872361354244e-05, + "loss": 0.8266, + "step": 8676 + }, + { + "epoch": 0.43276807980049875, + "grad_norm": 0.4702278144131224, + "learning_rate": 3.1584827850833486e-05, + "loss": 0.8284, + "step": 8677 + }, + { + "epoch": 0.43281795511221943, + "grad_norm": 0.46658371991230546, + "learning_rate": 3.158093191629641e-05, + "loss": 0.9105, + "step": 8678 + }, + { + "epoch": 0.43286783042394017, + "grad_norm": 0.40861012721429935, + "learning_rate": 3.157703581003284e-05, + "loss": 0.8777, + "step": 8679 + }, + { + "epoch": 0.43291770573566085, + "grad_norm": 0.48557940692778023, + "learning_rate": 3.157313953214448e-05, + "loss": 0.9096, + "step": 8680 + }, + { + "epoch": 0.43296758104738153, + "grad_norm": 0.42684225405794685, + "learning_rate": 3.156924308273298e-05, + "loss": 0.9201, + "step": 8681 + }, + { + "epoch": 0.43301745635910227, + "grad_norm": 0.4632744959676543, + "learning_rate": 3.156534646190002e-05, + "loss": 0.8661, + "step": 8682 + }, + { + "epoch": 0.43306733167082295, + "grad_norm": 0.38733674142569413, + "learning_rate": 3.1561449669747284e-05, + "loss": 0.875, + "step": 8683 + }, + { + "epoch": 0.43311720698254363, + "grad_norm": 0.44072469134550546, + "learning_rate": 3.155755270637645e-05, + "loss": 0.9093, + "step": 8684 + }, + { + "epoch": 0.4331670822942643, + "grad_norm": 0.4090015635257612, + "learning_rate": 3.155365557188922e-05, + "loss": 0.895, + "step": 8685 + }, + { + "epoch": 0.43321695760598505, + "grad_norm": 0.4666008341808923, + "learning_rate": 3.154975826638728e-05, + "loss": 0.9035, + "step": 8686 + }, + { + "epoch": 0.43326683291770574, + "grad_norm": 0.43056250635985277, + "learning_rate": 3.154586078997233e-05, + "loss": 0.8881, + "step": 8687 + }, + { + "epoch": 0.4333167082294264, + "grad_norm": 0.4300266399526118, + "learning_rate": 3.154196314274607e-05, + "loss": 0.8783, + "step": 8688 + }, + { + "epoch": 0.43336658354114715, + "grad_norm": 0.44590344501044016, + "learning_rate": 3.1538065324810206e-05, + "loss": 0.8931, + "step": 8689 + }, + { + "epoch": 0.43341645885286784, + "grad_norm": 0.4530245209984315, + "learning_rate": 3.153416733626646e-05, + "loss": 0.9071, + "step": 8690 + }, + { + "epoch": 0.4334663341645885, + "grad_norm": 0.7962341866333738, + "learning_rate": 3.1530269177216545e-05, + "loss": 0.9173, + "step": 8691 + }, + { + "epoch": 0.4335162094763092, + "grad_norm": 0.4325167925816781, + "learning_rate": 3.152637084776217e-05, + "loss": 0.9391, + "step": 8692 + }, + { + "epoch": 0.43356608478802994, + "grad_norm": 0.4999023255851049, + "learning_rate": 3.1522472348005064e-05, + "loss": 0.8495, + "step": 8693 + }, + { + "epoch": 0.4336159600997506, + "grad_norm": 0.439316550295652, + "learning_rate": 3.1518573678046974e-05, + "loss": 0.8972, + "step": 8694 + }, + { + "epoch": 0.4336658354114713, + "grad_norm": 0.5842611190642242, + "learning_rate": 3.151467483798961e-05, + "loss": 0.9009, + "step": 8695 + }, + { + "epoch": 0.43371571072319204, + "grad_norm": 0.6242109949215077, + "learning_rate": 3.1510775827934734e-05, + "loss": 0.9469, + "step": 8696 + }, + { + "epoch": 0.4337655860349127, + "grad_norm": 0.48412010710773556, + "learning_rate": 3.150687664798406e-05, + "loss": 0.8952, + "step": 8697 + }, + { + "epoch": 0.4338154613466334, + "grad_norm": 0.5488542069074751, + "learning_rate": 3.150297729823937e-05, + "loss": 0.9186, + "step": 8698 + }, + { + "epoch": 0.43386533665835414, + "grad_norm": 0.576531600727369, + "learning_rate": 3.1499077778802385e-05, + "loss": 0.9287, + "step": 8699 + }, + { + "epoch": 0.4339152119700748, + "grad_norm": 0.44801420685572774, + "learning_rate": 3.149517808977488e-05, + "loss": 0.8595, + "step": 8700 + }, + { + "epoch": 0.4339650872817955, + "grad_norm": 0.4616378660531886, + "learning_rate": 3.14912782312586e-05, + "loss": 0.8947, + "step": 8701 + }, + { + "epoch": 0.4340149625935162, + "grad_norm": 0.4199958200240461, + "learning_rate": 3.148737820335533e-05, + "loss": 0.8737, + "step": 8702 + }, + { + "epoch": 0.4340648379052369, + "grad_norm": 0.6913203901792587, + "learning_rate": 3.148347800616683e-05, + "loss": 0.8436, + "step": 8703 + }, + { + "epoch": 0.4341147132169576, + "grad_norm": 0.46202461989124816, + "learning_rate": 3.1479577639794864e-05, + "loss": 0.8667, + "step": 8704 + }, + { + "epoch": 0.4341645885286783, + "grad_norm": 0.5446866550082172, + "learning_rate": 3.147567710434123e-05, + "loss": 0.9036, + "step": 8705 + }, + { + "epoch": 0.434214463840399, + "grad_norm": 0.6772563639740016, + "learning_rate": 3.1471776399907694e-05, + "loss": 0.9074, + "step": 8706 + }, + { + "epoch": 0.4342643391521197, + "grad_norm": 1.140153238942299, + "learning_rate": 3.1467875526596056e-05, + "loss": 0.9485, + "step": 8707 + }, + { + "epoch": 0.4343142144638404, + "grad_norm": 0.4670920172170074, + "learning_rate": 3.146397448450809e-05, + "loss": 0.9142, + "step": 8708 + }, + { + "epoch": 0.4343640897755611, + "grad_norm": 0.4609879309776971, + "learning_rate": 3.146007327374561e-05, + "loss": 0.8985, + "step": 8709 + }, + { + "epoch": 0.4344139650872818, + "grad_norm": 0.6259622476955717, + "learning_rate": 3.14561718944104e-05, + "loss": 0.8722, + "step": 8710 + }, + { + "epoch": 0.4344638403990025, + "grad_norm": 0.5802903781578356, + "learning_rate": 3.1452270346604284e-05, + "loss": 0.8964, + "step": 8711 + }, + { + "epoch": 0.43451371571072317, + "grad_norm": 0.47099368878488146, + "learning_rate": 3.144836863042906e-05, + "loss": 0.9141, + "step": 8712 + }, + { + "epoch": 0.4345635910224439, + "grad_norm": 0.41490906635295144, + "learning_rate": 3.144446674598654e-05, + "loss": 0.8745, + "step": 8713 + }, + { + "epoch": 0.4346134663341646, + "grad_norm": 0.4501924503247352, + "learning_rate": 3.144056469337854e-05, + "loss": 0.9034, + "step": 8714 + }, + { + "epoch": 0.43466334164588527, + "grad_norm": 0.46329075338173586, + "learning_rate": 3.1436662472706895e-05, + "loss": 0.8888, + "step": 8715 + }, + { + "epoch": 0.434713216957606, + "grad_norm": 0.4781095410175048, + "learning_rate": 3.143276008407342e-05, + "loss": 0.8831, + "step": 8716 + }, + { + "epoch": 0.4347630922693267, + "grad_norm": 0.7061917623089692, + "learning_rate": 3.142885752757995e-05, + "loss": 0.8941, + "step": 8717 + }, + { + "epoch": 0.43481296758104737, + "grad_norm": 0.6804338117395995, + "learning_rate": 3.142495480332832e-05, + "loss": 0.8916, + "step": 8718 + }, + { + "epoch": 0.43486284289276805, + "grad_norm": 0.4025633920485976, + "learning_rate": 3.142105191142037e-05, + "loss": 0.9315, + "step": 8719 + }, + { + "epoch": 0.4349127182044888, + "grad_norm": 0.4494077366569893, + "learning_rate": 3.141714885195795e-05, + "loss": 0.9156, + "step": 8720 + }, + { + "epoch": 0.4349625935162095, + "grad_norm": 0.4713701129904792, + "learning_rate": 3.141324562504289e-05, + "loss": 0.9223, + "step": 8721 + }, + { + "epoch": 0.43501246882793015, + "grad_norm": 0.5520325395362728, + "learning_rate": 3.140934223077706e-05, + "loss": 0.8948, + "step": 8722 + }, + { + "epoch": 0.4350623441396509, + "grad_norm": 0.6137589425874763, + "learning_rate": 3.140543866926231e-05, + "loss": 0.9188, + "step": 8723 + }, + { + "epoch": 0.4351122194513716, + "grad_norm": 0.4450562155753263, + "learning_rate": 3.140153494060051e-05, + "loss": 0.8849, + "step": 8724 + }, + { + "epoch": 0.43516209476309226, + "grad_norm": 0.3937506441286304, + "learning_rate": 3.139763104489352e-05, + "loss": 0.8773, + "step": 8725 + }, + { + "epoch": 0.435211970074813, + "grad_norm": 0.4605370522828055, + "learning_rate": 3.139372698224321e-05, + "loss": 0.8838, + "step": 8726 + }, + { + "epoch": 0.4352618453865337, + "grad_norm": 0.390827877965297, + "learning_rate": 3.138982275275146e-05, + "loss": 0.9075, + "step": 8727 + }, + { + "epoch": 0.43531172069825436, + "grad_norm": 0.5002457836334792, + "learning_rate": 3.138591835652014e-05, + "loss": 0.8993, + "step": 8728 + }, + { + "epoch": 0.43536159600997504, + "grad_norm": 0.6579081581648858, + "learning_rate": 3.138201379365114e-05, + "loss": 0.9131, + "step": 8729 + }, + { + "epoch": 0.4354114713216958, + "grad_norm": 0.4046236137472451, + "learning_rate": 3.137810906424635e-05, + "loss": 0.8671, + "step": 8730 + }, + { + "epoch": 0.43546134663341646, + "grad_norm": 0.4470715194455778, + "learning_rate": 3.137420416840766e-05, + "loss": 0.8776, + "step": 8731 + }, + { + "epoch": 0.43551122194513714, + "grad_norm": 0.5423374258017234, + "learning_rate": 3.137029910623696e-05, + "loss": 0.8843, + "step": 8732 + }, + { + "epoch": 0.4355610972568579, + "grad_norm": 0.5572529907406536, + "learning_rate": 3.1366393877836153e-05, + "loss": 0.9071, + "step": 8733 + }, + { + "epoch": 0.43561097256857856, + "grad_norm": 0.680891854298057, + "learning_rate": 3.1362488483307144e-05, + "loss": 0.9154, + "step": 8734 + }, + { + "epoch": 0.43566084788029924, + "grad_norm": 0.48644620447983894, + "learning_rate": 3.135858292275186e-05, + "loss": 0.9128, + "step": 8735 + }, + { + "epoch": 0.43571072319202, + "grad_norm": 0.5119817823879284, + "learning_rate": 3.135467719627219e-05, + "loss": 0.8969, + "step": 8736 + }, + { + "epoch": 0.43576059850374066, + "grad_norm": 0.40958531948515464, + "learning_rate": 3.1350771303970064e-05, + "loss": 0.8855, + "step": 8737 + }, + { + "epoch": 0.43581047381546134, + "grad_norm": 0.4353169004798644, + "learning_rate": 3.13468652459474e-05, + "loss": 0.9086, + "step": 8738 + }, + { + "epoch": 0.435860349127182, + "grad_norm": 0.4564412524242704, + "learning_rate": 3.134295902230613e-05, + "loss": 0.8264, + "step": 8739 + }, + { + "epoch": 0.43591022443890276, + "grad_norm": 0.5208427664512874, + "learning_rate": 3.133905263314818e-05, + "loss": 0.9419, + "step": 8740 + }, + { + "epoch": 0.43596009975062344, + "grad_norm": 0.4909533533483032, + "learning_rate": 3.1335146078575496e-05, + "loss": 0.9178, + "step": 8741 + }, + { + "epoch": 0.4360099750623441, + "grad_norm": 0.6143700598291638, + "learning_rate": 3.133123935869001e-05, + "loss": 0.8839, + "step": 8742 + }, + { + "epoch": 0.43605985037406486, + "grad_norm": 0.6066753361046859, + "learning_rate": 3.132733247359366e-05, + "loss": 0.8539, + "step": 8743 + }, + { + "epoch": 0.43610972568578554, + "grad_norm": 0.5316568016920031, + "learning_rate": 3.13234254233884e-05, + "loss": 0.9354, + "step": 8744 + }, + { + "epoch": 0.4361596009975062, + "grad_norm": 0.4861235912139522, + "learning_rate": 3.131951820817619e-05, + "loss": 0.8949, + "step": 8745 + }, + { + "epoch": 0.4362094763092269, + "grad_norm": 0.434498082955382, + "learning_rate": 3.131561082805898e-05, + "loss": 0.9189, + "step": 8746 + }, + { + "epoch": 0.43625935162094764, + "grad_norm": 0.4368751071034494, + "learning_rate": 3.131170328313873e-05, + "loss": 0.8994, + "step": 8747 + }, + { + "epoch": 0.4363092269326683, + "grad_norm": 0.6041751809772858, + "learning_rate": 3.130779557351741e-05, + "loss": 0.8643, + "step": 8748 + }, + { + "epoch": 0.436359102244389, + "grad_norm": 0.47847286703449504, + "learning_rate": 3.130388769929698e-05, + "loss": 0.907, + "step": 8749 + }, + { + "epoch": 0.43640897755610975, + "grad_norm": 0.519588546925484, + "learning_rate": 3.1299979660579424e-05, + "loss": 0.9761, + "step": 8750 + }, + { + "epoch": 0.4364588528678304, + "grad_norm": 0.45369003141286207, + "learning_rate": 3.129607145746673e-05, + "loss": 0.9163, + "step": 8751 + }, + { + "epoch": 0.4365087281795511, + "grad_norm": 0.5533788494217664, + "learning_rate": 3.1292163090060857e-05, + "loss": 0.9165, + "step": 8752 + }, + { + "epoch": 0.43655860349127185, + "grad_norm": 0.44230320225082465, + "learning_rate": 3.128825455846381e-05, + "loss": 0.8986, + "step": 8753 + }, + { + "epoch": 0.43660847880299253, + "grad_norm": 0.4637099387899527, + "learning_rate": 3.128434586277757e-05, + "loss": 0.8937, + "step": 8754 + }, + { + "epoch": 0.4366583541147132, + "grad_norm": 0.5706821843912813, + "learning_rate": 3.128043700310414e-05, + "loss": 0.9457, + "step": 8755 + }, + { + "epoch": 0.4367082294264339, + "grad_norm": 0.4763250607125208, + "learning_rate": 3.127652797954551e-05, + "loss": 0.8834, + "step": 8756 + }, + { + "epoch": 0.43675810473815463, + "grad_norm": 0.44269180574093636, + "learning_rate": 3.1272618792203704e-05, + "loss": 0.8538, + "step": 8757 + }, + { + "epoch": 0.4368079800498753, + "grad_norm": 0.4061301662153884, + "learning_rate": 3.126870944118071e-05, + "loss": 0.9095, + "step": 8758 + }, + { + "epoch": 0.436857855361596, + "grad_norm": 0.6991539970353575, + "learning_rate": 3.126479992657855e-05, + "loss": 0.9104, + "step": 8759 + }, + { + "epoch": 0.43690773067331673, + "grad_norm": 0.4712808160095744, + "learning_rate": 3.126089024849924e-05, + "loss": 0.8718, + "step": 8760 + }, + { + "epoch": 0.4369576059850374, + "grad_norm": 0.4218833430405433, + "learning_rate": 3.12569804070448e-05, + "loss": 0.9352, + "step": 8761 + }, + { + "epoch": 0.4370074812967581, + "grad_norm": 0.46095465113721695, + "learning_rate": 3.1253070402317264e-05, + "loss": 0.8852, + "step": 8762 + }, + { + "epoch": 0.4370573566084788, + "grad_norm": 0.46051365619406653, + "learning_rate": 3.124916023441865e-05, + "loss": 0.9437, + "step": 8763 + }, + { + "epoch": 0.4371072319201995, + "grad_norm": 0.5830774279629573, + "learning_rate": 3.124524990345099e-05, + "loss": 0.8652, + "step": 8764 + }, + { + "epoch": 0.4371571072319202, + "grad_norm": 0.3645796629503962, + "learning_rate": 3.1241339409516336e-05, + "loss": 0.8706, + "step": 8765 + }, + { + "epoch": 0.4372069825436409, + "grad_norm": 0.4201299324580417, + "learning_rate": 3.123742875271672e-05, + "loss": 0.9019, + "step": 8766 + }, + { + "epoch": 0.4372568578553616, + "grad_norm": 0.4255236476804205, + "learning_rate": 3.123351793315419e-05, + "loss": 0.9345, + "step": 8767 + }, + { + "epoch": 0.4373067331670823, + "grad_norm": 0.4204613862346924, + "learning_rate": 3.12296069509308e-05, + "loss": 0.9099, + "step": 8768 + }, + { + "epoch": 0.437356608478803, + "grad_norm": 0.5909521552035333, + "learning_rate": 3.1225695806148606e-05, + "loss": 0.9498, + "step": 8769 + }, + { + "epoch": 0.4374064837905237, + "grad_norm": 0.46202582673822473, + "learning_rate": 3.1221784498909666e-05, + "loss": 0.8942, + "step": 8770 + }, + { + "epoch": 0.4374563591022444, + "grad_norm": 0.4175680780368786, + "learning_rate": 3.1217873029316045e-05, + "loss": 0.8448, + "step": 8771 + }, + { + "epoch": 0.4375062344139651, + "grad_norm": 0.4605461584183189, + "learning_rate": 3.1213961397469814e-05, + "loss": 0.8911, + "step": 8772 + }, + { + "epoch": 0.43755610972568576, + "grad_norm": 0.5783353974442093, + "learning_rate": 3.1210049603473036e-05, + "loss": 0.8779, + "step": 8773 + }, + { + "epoch": 0.4376059850374065, + "grad_norm": 0.5102935835003951, + "learning_rate": 3.12061376474278e-05, + "loss": 0.8923, + "step": 8774 + }, + { + "epoch": 0.4376558603491272, + "grad_norm": 0.453810504454544, + "learning_rate": 3.120222552943617e-05, + "loss": 0.8898, + "step": 8775 + }, + { + "epoch": 0.43770573566084786, + "grad_norm": 0.5380066429305465, + "learning_rate": 3.119831324960024e-05, + "loss": 0.8673, + "step": 8776 + }, + { + "epoch": 0.4377556109725686, + "grad_norm": 0.6431728596070451, + "learning_rate": 3.119440080802211e-05, + "loss": 0.8919, + "step": 8777 + }, + { + "epoch": 0.4378054862842893, + "grad_norm": 0.4345729605328858, + "learning_rate": 3.1190488204803845e-05, + "loss": 0.8856, + "step": 8778 + }, + { + "epoch": 0.43785536159600996, + "grad_norm": 0.5567379626230837, + "learning_rate": 3.118657544004758e-05, + "loss": 0.9102, + "step": 8779 + }, + { + "epoch": 0.4379052369077307, + "grad_norm": 0.45456465558634235, + "learning_rate": 3.118266251385539e-05, + "loss": 0.9194, + "step": 8780 + }, + { + "epoch": 0.4379551122194514, + "grad_norm": 0.48688501854677607, + "learning_rate": 3.117874942632939e-05, + "loss": 0.9022, + "step": 8781 + }, + { + "epoch": 0.43800498753117206, + "grad_norm": 0.5491442236968036, + "learning_rate": 3.1174836177571686e-05, + "loss": 0.9023, + "step": 8782 + }, + { + "epoch": 0.43805486284289274, + "grad_norm": 0.5714434686498199, + "learning_rate": 3.1170922767684395e-05, + "loss": 0.8825, + "step": 8783 + }, + { + "epoch": 0.4381047381546135, + "grad_norm": 0.46970290912243484, + "learning_rate": 3.1167009196769636e-05, + "loss": 0.9072, + "step": 8784 + }, + { + "epoch": 0.43815461346633416, + "grad_norm": 0.48606793398007675, + "learning_rate": 3.116309546492954e-05, + "loss": 0.8929, + "step": 8785 + }, + { + "epoch": 0.43820448877805485, + "grad_norm": 0.5158360901215908, + "learning_rate": 3.1159181572266226e-05, + "loss": 0.8861, + "step": 8786 + }, + { + "epoch": 0.4382543640897756, + "grad_norm": 0.45786988337771045, + "learning_rate": 3.115526751888181e-05, + "loss": 0.8788, + "step": 8787 + }, + { + "epoch": 0.43830423940149627, + "grad_norm": 0.6108720079831805, + "learning_rate": 3.1151353304878463e-05, + "loss": 0.9708, + "step": 8788 + }, + { + "epoch": 0.43835411471321695, + "grad_norm": 0.4666395292254847, + "learning_rate": 3.114743893035829e-05, + "loss": 0.9206, + "step": 8789 + }, + { + "epoch": 0.43840399002493763, + "grad_norm": 0.4498193296318143, + "learning_rate": 3.114352439542346e-05, + "loss": 0.8939, + "step": 8790 + }, + { + "epoch": 0.43845386533665837, + "grad_norm": 0.5247605481307828, + "learning_rate": 3.1139609700176096e-05, + "loss": 0.9137, + "step": 8791 + }, + { + "epoch": 0.43850374064837905, + "grad_norm": 0.6629930489731568, + "learning_rate": 3.1135694844718377e-05, + "loss": 0.9089, + "step": 8792 + }, + { + "epoch": 0.43855361596009973, + "grad_norm": 0.5765376120132545, + "learning_rate": 3.113177982915244e-05, + "loss": 0.8354, + "step": 8793 + }, + { + "epoch": 0.43860349127182047, + "grad_norm": 0.47486904535576213, + "learning_rate": 3.112786465358046e-05, + "loss": 0.8956, + "step": 8794 + }, + { + "epoch": 0.43865336658354115, + "grad_norm": 0.6447766459114349, + "learning_rate": 3.112394931810458e-05, + "loss": 0.8824, + "step": 8795 + }, + { + "epoch": 0.43870324189526183, + "grad_norm": 0.5109737059458068, + "learning_rate": 3.1120033822827e-05, + "loss": 0.8591, + "step": 8796 + }, + { + "epoch": 0.43875311720698257, + "grad_norm": 1.1611723558768634, + "learning_rate": 3.1116118167849866e-05, + "loss": 0.9039, + "step": 8797 + }, + { + "epoch": 0.43880299251870325, + "grad_norm": 0.45439744524271425, + "learning_rate": 3.1112202353275374e-05, + "loss": 0.9009, + "step": 8798 + }, + { + "epoch": 0.43885286783042393, + "grad_norm": 1.5102733479560748, + "learning_rate": 3.11082863792057e-05, + "loss": 0.9101, + "step": 8799 + }, + { + "epoch": 0.4389027431421446, + "grad_norm": 0.5337606324753476, + "learning_rate": 3.1104370245743006e-05, + "loss": 0.9027, + "step": 8800 + }, + { + "epoch": 0.43895261845386535, + "grad_norm": 0.45666674775930494, + "learning_rate": 3.110045395298952e-05, + "loss": 0.884, + "step": 8801 + }, + { + "epoch": 0.43900249376558603, + "grad_norm": 0.44908253092562567, + "learning_rate": 3.109653750104741e-05, + "loss": 0.8597, + "step": 8802 + }, + { + "epoch": 0.4390523690773067, + "grad_norm": 0.5093064095225213, + "learning_rate": 3.109262089001889e-05, + "loss": 0.8776, + "step": 8803 + }, + { + "epoch": 0.43910224438902745, + "grad_norm": 4.872217415855934, + "learning_rate": 3.1088704120006154e-05, + "loss": 0.8749, + "step": 8804 + }, + { + "epoch": 0.43915211970074813, + "grad_norm": 0.47894860182303156, + "learning_rate": 3.1084787191111404e-05, + "loss": 0.9094, + "step": 8805 + }, + { + "epoch": 0.4392019950124688, + "grad_norm": 0.8248122298802557, + "learning_rate": 3.108087010343685e-05, + "loss": 0.9112, + "step": 8806 + }, + { + "epoch": 0.43925187032418955, + "grad_norm": 0.4342005561068265, + "learning_rate": 3.1076952857084724e-05, + "loss": 0.8499, + "step": 8807 + }, + { + "epoch": 0.43930174563591023, + "grad_norm": 0.6113181787399496, + "learning_rate": 3.107303545215723e-05, + "loss": 0.9108, + "step": 8808 + }, + { + "epoch": 0.4393516209476309, + "grad_norm": 1.3948411638049074, + "learning_rate": 3.1069117888756594e-05, + "loss": 0.8977, + "step": 8809 + }, + { + "epoch": 0.4394014962593516, + "grad_norm": 0.4509542766615732, + "learning_rate": 3.106520016698504e-05, + "loss": 0.8519, + "step": 8810 + }, + { + "epoch": 0.43945137157107234, + "grad_norm": 0.5328055526999657, + "learning_rate": 3.1061282286944796e-05, + "loss": 0.8994, + "step": 8811 + }, + { + "epoch": 0.439501246882793, + "grad_norm": 0.4185073751772443, + "learning_rate": 3.105736424873811e-05, + "loss": 0.9328, + "step": 8812 + }, + { + "epoch": 0.4395511221945137, + "grad_norm": 0.39350973451763827, + "learning_rate": 3.1053446052467215e-05, + "loss": 0.9418, + "step": 8813 + }, + { + "epoch": 0.43960099750623444, + "grad_norm": 0.4618168079761572, + "learning_rate": 3.104952769823435e-05, + "loss": 0.8902, + "step": 8814 + }, + { + "epoch": 0.4396508728179551, + "grad_norm": 0.5229397621632894, + "learning_rate": 3.104560918614177e-05, + "loss": 0.9089, + "step": 8815 + }, + { + "epoch": 0.4397007481296758, + "grad_norm": 0.7370050397057484, + "learning_rate": 3.104169051629173e-05, + "loss": 0.8662, + "step": 8816 + }, + { + "epoch": 0.4397506234413965, + "grad_norm": 0.43938306267691607, + "learning_rate": 3.1037771688786474e-05, + "loss": 0.9326, + "step": 8817 + }, + { + "epoch": 0.4398004987531172, + "grad_norm": 0.49130938550494896, + "learning_rate": 3.103385270372827e-05, + "loss": 0.9041, + "step": 8818 + }, + { + "epoch": 0.4398503740648379, + "grad_norm": 0.47886145593990265, + "learning_rate": 3.1029933561219375e-05, + "loss": 0.9128, + "step": 8819 + }, + { + "epoch": 0.4399002493765586, + "grad_norm": 0.5155894070031656, + "learning_rate": 3.1026014261362074e-05, + "loss": 0.8969, + "step": 8820 + }, + { + "epoch": 0.4399501246882793, + "grad_norm": 0.5034320266287938, + "learning_rate": 3.102209480425862e-05, + "loss": 0.8984, + "step": 8821 + }, + { + "epoch": 0.44, + "grad_norm": 0.4704571851344461, + "learning_rate": 3.1018175190011295e-05, + "loss": 0.9336, + "step": 8822 + }, + { + "epoch": 0.4400498753117207, + "grad_norm": 0.5097833413750666, + "learning_rate": 3.1014255418722385e-05, + "loss": 0.908, + "step": 8823 + }, + { + "epoch": 0.4400997506234414, + "grad_norm": 0.47604921330797084, + "learning_rate": 3.101033549049417e-05, + "loss": 0.9314, + "step": 8824 + }, + { + "epoch": 0.4401496259351621, + "grad_norm": 0.4830050899439509, + "learning_rate": 3.100641540542895e-05, + "loss": 0.9174, + "step": 8825 + }, + { + "epoch": 0.4401995012468828, + "grad_norm": 0.4513542598491111, + "learning_rate": 3.100249516362899e-05, + "loss": 0.8851, + "step": 8826 + }, + { + "epoch": 0.44024937655860347, + "grad_norm": 0.45825313847310367, + "learning_rate": 3.099857476519662e-05, + "loss": 0.899, + "step": 8827 + }, + { + "epoch": 0.4402992518703242, + "grad_norm": 0.5196396885135952, + "learning_rate": 3.099465421023412e-05, + "loss": 0.8803, + "step": 8828 + }, + { + "epoch": 0.4403491271820449, + "grad_norm": 0.6139207701860303, + "learning_rate": 3.0990733498843796e-05, + "loss": 0.889, + "step": 8829 + }, + { + "epoch": 0.44039900249376557, + "grad_norm": 0.6304002081532548, + "learning_rate": 3.098681263112797e-05, + "loss": 0.8912, + "step": 8830 + }, + { + "epoch": 0.4404488778054863, + "grad_norm": 0.4973399242730324, + "learning_rate": 3.098289160718895e-05, + "loss": 0.9178, + "step": 8831 + }, + { + "epoch": 0.440498753117207, + "grad_norm": 0.4351500556804644, + "learning_rate": 3.097897042712904e-05, + "loss": 0.8815, + "step": 8832 + }, + { + "epoch": 0.44054862842892767, + "grad_norm": 0.43938226760468024, + "learning_rate": 3.097504909105058e-05, + "loss": 0.891, + "step": 8833 + }, + { + "epoch": 0.4405985037406484, + "grad_norm": 0.41638896896967786, + "learning_rate": 3.097112759905589e-05, + "loss": 0.9007, + "step": 8834 + }, + { + "epoch": 0.4406483790523691, + "grad_norm": 0.47734941140036463, + "learning_rate": 3.09672059512473e-05, + "loss": 0.9657, + "step": 8835 + }, + { + "epoch": 0.44069825436408977, + "grad_norm": 0.4222581779439872, + "learning_rate": 3.096328414772713e-05, + "loss": 0.8848, + "step": 8836 + }, + { + "epoch": 0.44074812967581045, + "grad_norm": 0.47528151982434885, + "learning_rate": 3.0959362188597744e-05, + "loss": 0.9228, + "step": 8837 + }, + { + "epoch": 0.4407980049875312, + "grad_norm": 0.6071078346907397, + "learning_rate": 3.0955440073961455e-05, + "loss": 0.8962, + "step": 8838 + }, + { + "epoch": 0.44084788029925187, + "grad_norm": 0.424432058331985, + "learning_rate": 3.095151780392063e-05, + "loss": 0.9294, + "step": 8839 + }, + { + "epoch": 0.44089775561097255, + "grad_norm": 0.47318473912902104, + "learning_rate": 3.094759537857761e-05, + "loss": 0.9043, + "step": 8840 + }, + { + "epoch": 0.4409476309226933, + "grad_norm": 0.49311254637039, + "learning_rate": 3.0943672798034744e-05, + "loss": 0.8537, + "step": 8841 + }, + { + "epoch": 0.44099750623441397, + "grad_norm": 0.45742774480445914, + "learning_rate": 3.0939750062394407e-05, + "loss": 0.8722, + "step": 8842 + }, + { + "epoch": 0.44104738154613465, + "grad_norm": 0.6049715873813785, + "learning_rate": 3.093582717175894e-05, + "loss": 0.9285, + "step": 8843 + }, + { + "epoch": 0.44109725685785534, + "grad_norm": 0.4690817327301224, + "learning_rate": 3.0931904126230723e-05, + "loss": 0.8708, + "step": 8844 + }, + { + "epoch": 0.4411471321695761, + "grad_norm": 0.5327457873542893, + "learning_rate": 3.092798092591213e-05, + "loss": 0.8529, + "step": 8845 + }, + { + "epoch": 0.44119700748129675, + "grad_norm": 0.38260709400284076, + "learning_rate": 3.0924057570905526e-05, + "loss": 0.8616, + "step": 8846 + }, + { + "epoch": 0.44124688279301744, + "grad_norm": 0.41859583455978416, + "learning_rate": 3.0920134061313275e-05, + "loss": 0.8511, + "step": 8847 + }, + { + "epoch": 0.4412967581047382, + "grad_norm": 0.4557978810928964, + "learning_rate": 3.091621039723779e-05, + "loss": 0.8916, + "step": 8848 + }, + { + "epoch": 0.44134663341645886, + "grad_norm": 0.4537651792011041, + "learning_rate": 3.091228657878144e-05, + "loss": 0.9038, + "step": 8849 + }, + { + "epoch": 0.44139650872817954, + "grad_norm": 0.5307045829163832, + "learning_rate": 3.090836260604662e-05, + "loss": 0.8781, + "step": 8850 + }, + { + "epoch": 0.4414463840399003, + "grad_norm": 0.4695072751841848, + "learning_rate": 3.090443847913572e-05, + "loss": 0.8441, + "step": 8851 + }, + { + "epoch": 0.44149625935162096, + "grad_norm": 0.45481258101123556, + "learning_rate": 3.0900514198151145e-05, + "loss": 0.9153, + "step": 8852 + }, + { + "epoch": 0.44154613466334164, + "grad_norm": 0.4694032694642776, + "learning_rate": 3.089658976319528e-05, + "loss": 0.8637, + "step": 8853 + }, + { + "epoch": 0.4415960099750623, + "grad_norm": 1.013998575579568, + "learning_rate": 3.089266517437055e-05, + "loss": 0.9236, + "step": 8854 + }, + { + "epoch": 0.44164588528678306, + "grad_norm": 0.6663933112413091, + "learning_rate": 3.088874043177936e-05, + "loss": 0.8991, + "step": 8855 + }, + { + "epoch": 0.44169576059850374, + "grad_norm": 0.41359438986340735, + "learning_rate": 3.088481553552413e-05, + "loss": 0.8791, + "step": 8856 + }, + { + "epoch": 0.4417456359102244, + "grad_norm": 0.4255232154019434, + "learning_rate": 3.088089048570726e-05, + "loss": 0.9197, + "step": 8857 + }, + { + "epoch": 0.44179551122194516, + "grad_norm": 0.3701345307826944, + "learning_rate": 3.0876965282431204e-05, + "loss": 0.8802, + "step": 8858 + }, + { + "epoch": 0.44184538653366584, + "grad_norm": 0.5186204727170979, + "learning_rate": 3.087303992579835e-05, + "loss": 0.8905, + "step": 8859 + }, + { + "epoch": 0.4418952618453865, + "grad_norm": 0.46878435194691104, + "learning_rate": 3.086911441591116e-05, + "loss": 0.9425, + "step": 8860 + }, + { + "epoch": 0.44194513715710726, + "grad_norm": 0.5770459511569098, + "learning_rate": 3.0865188752872046e-05, + "loss": 0.9016, + "step": 8861 + }, + { + "epoch": 0.44199501246882794, + "grad_norm": 0.5646738493364754, + "learning_rate": 3.0861262936783466e-05, + "loss": 0.8914, + "step": 8862 + }, + { + "epoch": 0.4420448877805486, + "grad_norm": 0.48600252902112856, + "learning_rate": 3.085733696774784e-05, + "loss": 0.8655, + "step": 8863 + }, + { + "epoch": 0.4420947630922693, + "grad_norm": 0.4819777535848297, + "learning_rate": 3.085341084586764e-05, + "loss": 0.9211, + "step": 8864 + }, + { + "epoch": 0.44214463840399004, + "grad_norm": 0.4544981021576665, + "learning_rate": 3.0849484571245294e-05, + "loss": 0.945, + "step": 8865 + }, + { + "epoch": 0.4421945137157107, + "grad_norm": 0.4995314564663312, + "learning_rate": 3.0845558143983275e-05, + "loss": 0.879, + "step": 8866 + }, + { + "epoch": 0.4422443890274314, + "grad_norm": 0.7106466673310775, + "learning_rate": 3.0841631564184026e-05, + "loss": 0.9344, + "step": 8867 + }, + { + "epoch": 0.44229426433915214, + "grad_norm": 0.5179881656429706, + "learning_rate": 3.0837704831950023e-05, + "loss": 0.9384, + "step": 8868 + }, + { + "epoch": 0.4423441396508728, + "grad_norm": 0.48310481827688234, + "learning_rate": 3.0833777947383725e-05, + "loss": 0.8944, + "step": 8869 + }, + { + "epoch": 0.4423940149625935, + "grad_norm": 0.7940924619012988, + "learning_rate": 3.0829850910587595e-05, + "loss": 0.8752, + "step": 8870 + }, + { + "epoch": 0.4424438902743142, + "grad_norm": 0.49613933131298893, + "learning_rate": 3.082592372166412e-05, + "loss": 0.853, + "step": 8871 + }, + { + "epoch": 0.4424937655860349, + "grad_norm": 0.46744444056122253, + "learning_rate": 3.082199638071576e-05, + "loss": 0.8945, + "step": 8872 + }, + { + "epoch": 0.4425436408977556, + "grad_norm": 0.49030696620230296, + "learning_rate": 3.081806888784502e-05, + "loss": 0.8953, + "step": 8873 + }, + { + "epoch": 0.4425935162094763, + "grad_norm": 0.5162852080374194, + "learning_rate": 3.081414124315438e-05, + "loss": 0.8907, + "step": 8874 + }, + { + "epoch": 0.442643391521197, + "grad_norm": 0.44863849424119284, + "learning_rate": 3.081021344674632e-05, + "loss": 0.9033, + "step": 8875 + }, + { + "epoch": 0.4426932668329177, + "grad_norm": 0.4844973015817233, + "learning_rate": 3.080628549872334e-05, + "loss": 0.8768, + "step": 8876 + }, + { + "epoch": 0.4427431421446384, + "grad_norm": 0.48406004704699285, + "learning_rate": 3.080235739918795e-05, + "loss": 0.9771, + "step": 8877 + }, + { + "epoch": 0.44279301745635913, + "grad_norm": 0.4945134513420552, + "learning_rate": 3.079842914824262e-05, + "loss": 0.9249, + "step": 8878 + }, + { + "epoch": 0.4428428927680798, + "grad_norm": 0.42265682418812334, + "learning_rate": 3.07945007459899e-05, + "loss": 0.8952, + "step": 8879 + }, + { + "epoch": 0.4428927680798005, + "grad_norm": 0.5246525830005688, + "learning_rate": 3.0790572192532256e-05, + "loss": 0.9471, + "step": 8880 + }, + { + "epoch": 0.4429426433915212, + "grad_norm": 0.5188411832834233, + "learning_rate": 3.078664348797223e-05, + "loss": 0.894, + "step": 8881 + }, + { + "epoch": 0.4429925187032419, + "grad_norm": 0.4278834900152691, + "learning_rate": 3.078271463241234e-05, + "loss": 0.9132, + "step": 8882 + }, + { + "epoch": 0.4430423940149626, + "grad_norm": 0.4454381992919993, + "learning_rate": 3.077878562595509e-05, + "loss": 0.9459, + "step": 8883 + }, + { + "epoch": 0.4430922693266833, + "grad_norm": 0.5489457052308812, + "learning_rate": 3.077485646870302e-05, + "loss": 0.9265, + "step": 8884 + }, + { + "epoch": 0.443142144638404, + "grad_norm": 0.4387970612133543, + "learning_rate": 3.077092716075865e-05, + "loss": 0.8925, + "step": 8885 + }, + { + "epoch": 0.4431920199501247, + "grad_norm": 0.4281173800868987, + "learning_rate": 3.076699770222452e-05, + "loss": 0.913, + "step": 8886 + }, + { + "epoch": 0.4432418952618454, + "grad_norm": 0.4259809053286912, + "learning_rate": 3.0763068093203165e-05, + "loss": 0.9138, + "step": 8887 + }, + { + "epoch": 0.44329177057356606, + "grad_norm": 0.5521656427183022, + "learning_rate": 3.075913833379713e-05, + "loss": 0.9291, + "step": 8888 + }, + { + "epoch": 0.4433416458852868, + "grad_norm": 0.48975376978617197, + "learning_rate": 3.075520842410896e-05, + "loss": 0.8917, + "step": 8889 + }, + { + "epoch": 0.4433915211970075, + "grad_norm": 0.4864485453176737, + "learning_rate": 3.0751278364241194e-05, + "loss": 0.9242, + "step": 8890 + }, + { + "epoch": 0.44344139650872816, + "grad_norm": 0.4254722652532415, + "learning_rate": 3.07473481542964e-05, + "loss": 0.8876, + "step": 8891 + }, + { + "epoch": 0.4434912718204489, + "grad_norm": 0.39202847900986304, + "learning_rate": 3.0743417794377125e-05, + "loss": 0.895, + "step": 8892 + }, + { + "epoch": 0.4435411471321696, + "grad_norm": 0.4254575113709352, + "learning_rate": 3.073948728458593e-05, + "loss": 0.8638, + "step": 8893 + }, + { + "epoch": 0.44359102244389026, + "grad_norm": 0.5937995545866349, + "learning_rate": 3.0735556625025385e-05, + "loss": 0.8929, + "step": 8894 + }, + { + "epoch": 0.443640897755611, + "grad_norm": 0.5432991303074981, + "learning_rate": 3.073162581579806e-05, + "loss": 0.9322, + "step": 8895 + }, + { + "epoch": 0.4436907730673317, + "grad_norm": 0.45850663192773994, + "learning_rate": 3.072769485700652e-05, + "loss": 0.8673, + "step": 8896 + }, + { + "epoch": 0.44374064837905236, + "grad_norm": 0.45957412427545646, + "learning_rate": 3.072376374875335e-05, + "loss": 0.9149, + "step": 8897 + }, + { + "epoch": 0.44379052369077304, + "grad_norm": 1.1241150005606046, + "learning_rate": 3.071983249114113e-05, + "loss": 0.9086, + "step": 8898 + }, + { + "epoch": 0.4438403990024938, + "grad_norm": 0.48296599433193244, + "learning_rate": 3.071590108427244e-05, + "loss": 0.9246, + "step": 8899 + }, + { + "epoch": 0.44389027431421446, + "grad_norm": 0.4041782715415703, + "learning_rate": 3.071196952824986e-05, + "loss": 0.8608, + "step": 8900 + }, + { + "epoch": 0.44394014962593514, + "grad_norm": 0.40056706633703504, + "learning_rate": 3.0708037823176e-05, + "loss": 0.921, + "step": 8901 + }, + { + "epoch": 0.4439900249376559, + "grad_norm": 0.42520187363185286, + "learning_rate": 3.070410596915344e-05, + "loss": 0.9271, + "step": 8902 + }, + { + "epoch": 0.44403990024937656, + "grad_norm": 0.4722500619064021, + "learning_rate": 3.070017396628479e-05, + "loss": 0.9056, + "step": 8903 + }, + { + "epoch": 0.44408977556109724, + "grad_norm": 0.3700577178061859, + "learning_rate": 3.069624181467267e-05, + "loss": 0.8629, + "step": 8904 + }, + { + "epoch": 0.444139650872818, + "grad_norm": 0.5319106465136512, + "learning_rate": 3.0692309514419645e-05, + "loss": 0.9051, + "step": 8905 + }, + { + "epoch": 0.44418952618453866, + "grad_norm": 0.43146767552564325, + "learning_rate": 3.068837706562836e-05, + "loss": 0.8672, + "step": 8906 + }, + { + "epoch": 0.44423940149625935, + "grad_norm": 0.4332063053290535, + "learning_rate": 3.068444446840142e-05, + "loss": 0.8877, + "step": 8907 + }, + { + "epoch": 0.44428927680798, + "grad_norm": 0.45772422747165636, + "learning_rate": 3.0680511722841446e-05, + "loss": 0.9149, + "step": 8908 + }, + { + "epoch": 0.44433915211970076, + "grad_norm": 0.4161345405370944, + "learning_rate": 3.0676578829051065e-05, + "loss": 0.8811, + "step": 8909 + }, + { + "epoch": 0.44438902743142145, + "grad_norm": 0.49068777394062096, + "learning_rate": 3.067264578713289e-05, + "loss": 0.8895, + "step": 8910 + }, + { + "epoch": 0.44443890274314213, + "grad_norm": 0.670234173915878, + "learning_rate": 3.066871259718957e-05, + "loss": 0.9271, + "step": 8911 + }, + { + "epoch": 0.44448877805486287, + "grad_norm": 0.5101434230828226, + "learning_rate": 3.0664779259323725e-05, + "loss": 0.9264, + "step": 8912 + }, + { + "epoch": 0.44453865336658355, + "grad_norm": 0.4333449439619816, + "learning_rate": 3.066084577363801e-05, + "loss": 0.9124, + "step": 8913 + }, + { + "epoch": 0.44458852867830423, + "grad_norm": 0.4811970834512949, + "learning_rate": 3.065691214023505e-05, + "loss": 0.9198, + "step": 8914 + }, + { + "epoch": 0.4446384039900249, + "grad_norm": 0.5268017862662885, + "learning_rate": 3.06529783592175e-05, + "loss": 0.9033, + "step": 8915 + }, + { + "epoch": 0.44468827930174565, + "grad_norm": 0.4710360277718421, + "learning_rate": 3.0649044430688015e-05, + "loss": 0.9105, + "step": 8916 + }, + { + "epoch": 0.44473815461346633, + "grad_norm": 0.4831850377148537, + "learning_rate": 3.0645110354749246e-05, + "loss": 0.8543, + "step": 8917 + }, + { + "epoch": 0.444788029925187, + "grad_norm": 0.4945090691313995, + "learning_rate": 3.064117613150384e-05, + "loss": 0.8299, + "step": 8918 + }, + { + "epoch": 0.44483790523690775, + "grad_norm": 0.4229786424711317, + "learning_rate": 3.063724176105447e-05, + "loss": 0.9017, + "step": 8919 + }, + { + "epoch": 0.44488778054862843, + "grad_norm": 0.4815758635627675, + "learning_rate": 3.06333072435038e-05, + "loss": 0.9026, + "step": 8920 + }, + { + "epoch": 0.4449376558603491, + "grad_norm": 0.5110866807416121, + "learning_rate": 3.06293725789545e-05, + "loss": 0.8531, + "step": 8921 + }, + { + "epoch": 0.44498753117206985, + "grad_norm": 0.4716942011736572, + "learning_rate": 3.062543776750924e-05, + "loss": 0.8694, + "step": 8922 + }, + { + "epoch": 0.44503740648379053, + "grad_norm": 0.4085079764730015, + "learning_rate": 3.06215028092707e-05, + "loss": 0.8859, + "step": 8923 + }, + { + "epoch": 0.4450872817955112, + "grad_norm": 0.4024035973020176, + "learning_rate": 3.061756770434156e-05, + "loss": 0.89, + "step": 8924 + }, + { + "epoch": 0.4451371571072319, + "grad_norm": 0.7371577549481293, + "learning_rate": 3.06136324528245e-05, + "loss": 0.9141, + "step": 8925 + }, + { + "epoch": 0.44518703241895263, + "grad_norm": 0.5804188583434465, + "learning_rate": 3.060969705482222e-05, + "loss": 0.8713, + "step": 8926 + }, + { + "epoch": 0.4452369077306733, + "grad_norm": 0.43185703824540383, + "learning_rate": 3.06057615104374e-05, + "loss": 0.8981, + "step": 8927 + }, + { + "epoch": 0.445286783042394, + "grad_norm": 0.4436372747161028, + "learning_rate": 3.060182581977275e-05, + "loss": 0.9069, + "step": 8928 + }, + { + "epoch": 0.44533665835411473, + "grad_norm": 0.44109952864137186, + "learning_rate": 3.059788998293096e-05, + "loss": 0.8835, + "step": 8929 + }, + { + "epoch": 0.4453865336658354, + "grad_norm": 0.791820145924806, + "learning_rate": 3.059395400001474e-05, + "loss": 0.8697, + "step": 8930 + }, + { + "epoch": 0.4454364089775561, + "grad_norm": 0.5184227973927196, + "learning_rate": 3.059001787112678e-05, + "loss": 0.8809, + "step": 8931 + }, + { + "epoch": 0.44548628428927683, + "grad_norm": 0.4805880006702156, + "learning_rate": 3.058608159636981e-05, + "loss": 0.9459, + "step": 8932 + }, + { + "epoch": 0.4455361596009975, + "grad_norm": 0.42464794202409534, + "learning_rate": 3.058214517584654e-05, + "loss": 0.9185, + "step": 8933 + }, + { + "epoch": 0.4455860349127182, + "grad_norm": 0.42810186277668394, + "learning_rate": 3.05782086096597e-05, + "loss": 0.8956, + "step": 8934 + }, + { + "epoch": 0.4456359102244389, + "grad_norm": 0.4368789951736817, + "learning_rate": 3.0574271897912e-05, + "loss": 0.9025, + "step": 8935 + }, + { + "epoch": 0.4456857855361596, + "grad_norm": 0.43319518946131097, + "learning_rate": 3.0570335040706164e-05, + "loss": 0.8689, + "step": 8936 + }, + { + "epoch": 0.4457356608478803, + "grad_norm": 0.4698329496653696, + "learning_rate": 3.056639803814494e-05, + "loss": 0.9197, + "step": 8937 + }, + { + "epoch": 0.445785536159601, + "grad_norm": 0.38014320409613833, + "learning_rate": 3.056246089033104e-05, + "loss": 0.8316, + "step": 8938 + }, + { + "epoch": 0.4458354114713217, + "grad_norm": 0.4203219563762035, + "learning_rate": 3.0558523597367206e-05, + "loss": 0.9368, + "step": 8939 + }, + { + "epoch": 0.4458852867830424, + "grad_norm": 0.4623258019045109, + "learning_rate": 3.0554586159356193e-05, + "loss": 0.8742, + "step": 8940 + }, + { + "epoch": 0.4459351620947631, + "grad_norm": 0.44230506764507094, + "learning_rate": 3.055064857640075e-05, + "loss": 0.9161, + "step": 8941 + }, + { + "epoch": 0.44598503740648376, + "grad_norm": 0.47621786276408534, + "learning_rate": 3.054671084860361e-05, + "loss": 0.903, + "step": 8942 + }, + { + "epoch": 0.4460349127182045, + "grad_norm": 0.46282694756738685, + "learning_rate": 3.054277297606753e-05, + "loss": 0.882, + "step": 8943 + }, + { + "epoch": 0.4460847880299252, + "grad_norm": 0.49402588024503574, + "learning_rate": 3.053883495889526e-05, + "loss": 0.91, + "step": 8944 + }, + { + "epoch": 0.44613466334164587, + "grad_norm": 0.3904882058487075, + "learning_rate": 3.0534896797189585e-05, + "loss": 0.8234, + "step": 8945 + }, + { + "epoch": 0.4461845386533666, + "grad_norm": 0.4488615392455002, + "learning_rate": 3.053095849105325e-05, + "loss": 0.9285, + "step": 8946 + }, + { + "epoch": 0.4462344139650873, + "grad_norm": 0.45403757028891367, + "learning_rate": 3.0527020040589035e-05, + "loss": 0.943, + "step": 8947 + }, + { + "epoch": 0.44628428927680797, + "grad_norm": 0.7824351604267364, + "learning_rate": 3.05230814458997e-05, + "loss": 0.9104, + "step": 8948 + }, + { + "epoch": 0.4463341645885287, + "grad_norm": 0.46265676882115603, + "learning_rate": 3.0519142707088026e-05, + "loss": 0.8706, + "step": 8949 + }, + { + "epoch": 0.4463840399002494, + "grad_norm": 0.46521940386139643, + "learning_rate": 3.0515203824256794e-05, + "loss": 0.8985, + "step": 8950 + }, + { + "epoch": 0.44643391521197007, + "grad_norm": 0.4100989248213038, + "learning_rate": 3.0511264797508786e-05, + "loss": 0.8765, + "step": 8951 + }, + { + "epoch": 0.44648379052369075, + "grad_norm": 0.4409123411521726, + "learning_rate": 3.050732562694679e-05, + "loss": 0.8728, + "step": 8952 + }, + { + "epoch": 0.4465336658354115, + "grad_norm": 0.5124168806423106, + "learning_rate": 3.0503386312673598e-05, + "loss": 0.8758, + "step": 8953 + }, + { + "epoch": 0.44658354114713217, + "grad_norm": 0.562818958162973, + "learning_rate": 3.0499446854792003e-05, + "loss": 0.9125, + "step": 8954 + }, + { + "epoch": 0.44663341645885285, + "grad_norm": 0.49696305188958484, + "learning_rate": 3.04955072534048e-05, + "loss": 0.8998, + "step": 8955 + }, + { + "epoch": 0.4466832917705736, + "grad_norm": 0.4086433406241599, + "learning_rate": 3.04915675086148e-05, + "loss": 0.9294, + "step": 8956 + }, + { + "epoch": 0.44673316708229427, + "grad_norm": 0.48438972577059514, + "learning_rate": 3.0487627620524794e-05, + "loss": 0.8541, + "step": 8957 + }, + { + "epoch": 0.44678304239401495, + "grad_norm": 0.42276142709576897, + "learning_rate": 3.0483687589237615e-05, + "loss": 0.9323, + "step": 8958 + }, + { + "epoch": 0.4468329177057357, + "grad_norm": 0.3911037906387756, + "learning_rate": 3.0479747414856054e-05, + "loss": 0.8744, + "step": 8959 + }, + { + "epoch": 0.44688279301745637, + "grad_norm": 0.411271956424256, + "learning_rate": 3.0475807097482938e-05, + "loss": 0.8861, + "step": 8960 + }, + { + "epoch": 0.44693266832917705, + "grad_norm": 0.6841029974327495, + "learning_rate": 3.047186663722108e-05, + "loss": 0.9197, + "step": 8961 + }, + { + "epoch": 0.44698254364089773, + "grad_norm": 0.37613412799155954, + "learning_rate": 3.0467926034173315e-05, + "loss": 0.8411, + "step": 8962 + }, + { + "epoch": 0.44703241895261847, + "grad_norm": 0.857791033164924, + "learning_rate": 3.0463985288442475e-05, + "loss": 0.867, + "step": 8963 + }, + { + "epoch": 0.44708229426433915, + "grad_norm": 0.43330078037836156, + "learning_rate": 3.0460044400131378e-05, + "loss": 0.8354, + "step": 8964 + }, + { + "epoch": 0.44713216957605983, + "grad_norm": 0.47067495537639936, + "learning_rate": 3.0456103369342868e-05, + "loss": 0.9135, + "step": 8965 + }, + { + "epoch": 0.44718204488778057, + "grad_norm": 0.4064675922870724, + "learning_rate": 3.045216219617978e-05, + "loss": 0.8864, + "step": 8966 + }, + { + "epoch": 0.44723192019950125, + "grad_norm": 0.4242549285486106, + "learning_rate": 3.044822088074496e-05, + "loss": 0.924, + "step": 8967 + }, + { + "epoch": 0.44728179551122194, + "grad_norm": 0.6973672275579765, + "learning_rate": 3.0444279423141253e-05, + "loss": 0.8709, + "step": 8968 + }, + { + "epoch": 0.4473316708229426, + "grad_norm": 0.876742048804466, + "learning_rate": 3.0440337823471515e-05, + "loss": 0.8621, + "step": 8969 + }, + { + "epoch": 0.44738154613466335, + "grad_norm": 0.44502445833974574, + "learning_rate": 3.0436396081838593e-05, + "loss": 0.888, + "step": 8970 + }, + { + "epoch": 0.44743142144638404, + "grad_norm": 0.45402506601045645, + "learning_rate": 3.043245419834535e-05, + "loss": 0.8754, + "step": 8971 + }, + { + "epoch": 0.4474812967581047, + "grad_norm": 0.5090870565799385, + "learning_rate": 3.0428512173094647e-05, + "loss": 0.9003, + "step": 8972 + }, + { + "epoch": 0.44753117206982546, + "grad_norm": 0.6428237795097699, + "learning_rate": 3.0424570006189346e-05, + "loss": 0.9295, + "step": 8973 + }, + { + "epoch": 0.44758104738154614, + "grad_norm": 0.4825536690912761, + "learning_rate": 3.0420627697732316e-05, + "loss": 0.878, + "step": 8974 + }, + { + "epoch": 0.4476309226932668, + "grad_norm": 0.5370567407260133, + "learning_rate": 3.0416685247826443e-05, + "loss": 0.9093, + "step": 8975 + }, + { + "epoch": 0.44768079800498756, + "grad_norm": 0.5115597204063289, + "learning_rate": 3.041274265657459e-05, + "loss": 0.9071, + "step": 8976 + }, + { + "epoch": 0.44773067331670824, + "grad_norm": 0.3873393840304193, + "learning_rate": 3.040879992407963e-05, + "loss": 0.9044, + "step": 8977 + }, + { + "epoch": 0.4477805486284289, + "grad_norm": 0.47029309787537016, + "learning_rate": 3.0404857050444464e-05, + "loss": 0.924, + "step": 8978 + }, + { + "epoch": 0.4478304239401496, + "grad_norm": 0.47749023511328986, + "learning_rate": 3.0400914035771967e-05, + "loss": 0.8858, + "step": 8979 + }, + { + "epoch": 0.44788029925187034, + "grad_norm": 0.41495108233087324, + "learning_rate": 3.039697088016504e-05, + "loss": 0.8592, + "step": 8980 + }, + { + "epoch": 0.447930174563591, + "grad_norm": 0.4991719212486798, + "learning_rate": 3.0393027583726568e-05, + "loss": 0.9799, + "step": 8981 + }, + { + "epoch": 0.4479800498753117, + "grad_norm": 0.4414651031024474, + "learning_rate": 3.038908414655946e-05, + "loss": 0.9171, + "step": 8982 + }, + { + "epoch": 0.44802992518703244, + "grad_norm": 0.35639477851102647, + "learning_rate": 3.0385140568766613e-05, + "loss": 0.8536, + "step": 8983 + }, + { + "epoch": 0.4480798004987531, + "grad_norm": 0.5015888526127923, + "learning_rate": 3.0381196850450934e-05, + "loss": 0.8939, + "step": 8984 + }, + { + "epoch": 0.4481296758104738, + "grad_norm": 0.4183723038392907, + "learning_rate": 3.0377252991715323e-05, + "loss": 0.8788, + "step": 8985 + }, + { + "epoch": 0.4481795511221945, + "grad_norm": 0.4834192184972552, + "learning_rate": 3.0373308992662707e-05, + "loss": 0.8821, + "step": 8986 + }, + { + "epoch": 0.4482294264339152, + "grad_norm": 0.38746696203549924, + "learning_rate": 3.0369364853396004e-05, + "loss": 0.8543, + "step": 8987 + }, + { + "epoch": 0.4482793017456359, + "grad_norm": 0.4448413428593303, + "learning_rate": 3.036542057401812e-05, + "loss": 0.917, + "step": 8988 + }, + { + "epoch": 0.4483291770573566, + "grad_norm": 0.4844451318294028, + "learning_rate": 3.0361476154631992e-05, + "loss": 0.923, + "step": 8989 + }, + { + "epoch": 0.4483790523690773, + "grad_norm": 0.4349516372991446, + "learning_rate": 3.035753159534054e-05, + "loss": 0.8634, + "step": 8990 + }, + { + "epoch": 0.448428927680798, + "grad_norm": 0.5922771597522046, + "learning_rate": 3.03535868962467e-05, + "loss": 0.9103, + "step": 8991 + }, + { + "epoch": 0.4484788029925187, + "grad_norm": 0.532564884575055, + "learning_rate": 3.0349642057453404e-05, + "loss": 0.875, + "step": 8992 + }, + { + "epoch": 0.4485286783042394, + "grad_norm": 0.5275106959565636, + "learning_rate": 3.0345697079063595e-05, + "loss": 0.9024, + "step": 8993 + }, + { + "epoch": 0.4485785536159601, + "grad_norm": 0.39704821604197943, + "learning_rate": 3.034175196118021e-05, + "loss": 0.9014, + "step": 8994 + }, + { + "epoch": 0.4486284289276808, + "grad_norm": 0.5746274604885184, + "learning_rate": 3.03378067039062e-05, + "loss": 0.8616, + "step": 8995 + }, + { + "epoch": 0.44867830423940147, + "grad_norm": 0.6393905146829512, + "learning_rate": 3.0333861307344513e-05, + "loss": 0.8913, + "step": 8996 + }, + { + "epoch": 0.4487281795511222, + "grad_norm": 0.5308792767172058, + "learning_rate": 3.0329915771598104e-05, + "loss": 0.885, + "step": 8997 + }, + { + "epoch": 0.4487780548628429, + "grad_norm": 0.6409589276422392, + "learning_rate": 3.0325970096769928e-05, + "loss": 0.9161, + "step": 8998 + }, + { + "epoch": 0.44882793017456357, + "grad_norm": 0.5450294907149787, + "learning_rate": 3.0322024282962948e-05, + "loss": 0.8402, + "step": 8999 + }, + { + "epoch": 0.4488778054862843, + "grad_norm": 0.4007821181196251, + "learning_rate": 3.0318078330280118e-05, + "loss": 0.9079, + "step": 9000 + }, + { + "epoch": 0.448927680798005, + "grad_norm": 0.44278488543693845, + "learning_rate": 3.0314132238824415e-05, + "loss": 0.9355, + "step": 9001 + }, + { + "epoch": 0.4489775561097257, + "grad_norm": 0.4397359092985846, + "learning_rate": 3.0310186008698817e-05, + "loss": 0.8949, + "step": 9002 + }, + { + "epoch": 0.4490274314214464, + "grad_norm": 0.8518071442582029, + "learning_rate": 3.0306239640006285e-05, + "loss": 0.9162, + "step": 9003 + }, + { + "epoch": 0.4490773067331671, + "grad_norm": 0.4204908597103052, + "learning_rate": 3.0302293132849813e-05, + "loss": 0.8932, + "step": 9004 + }, + { + "epoch": 0.4491271820448878, + "grad_norm": 0.3921074033296131, + "learning_rate": 3.0298346487332363e-05, + "loss": 0.8305, + "step": 9005 + }, + { + "epoch": 0.44917705735660846, + "grad_norm": 0.4654426512867412, + "learning_rate": 3.0294399703556943e-05, + "loss": 0.9089, + "step": 9006 + }, + { + "epoch": 0.4492269326683292, + "grad_norm": 0.46508312960222237, + "learning_rate": 3.0290452781626526e-05, + "loss": 0.8938, + "step": 9007 + }, + { + "epoch": 0.4492768079800499, + "grad_norm": 0.457043050989491, + "learning_rate": 3.0286505721644115e-05, + "loss": 0.887, + "step": 9008 + }, + { + "epoch": 0.44932668329177056, + "grad_norm": 1.3695772520747977, + "learning_rate": 3.02825585237127e-05, + "loss": 0.9227, + "step": 9009 + }, + { + "epoch": 0.4493765586034913, + "grad_norm": 0.5781361220876174, + "learning_rate": 3.027861118793529e-05, + "loss": 0.9106, + "step": 9010 + }, + { + "epoch": 0.449426433915212, + "grad_norm": 1.0681318309773662, + "learning_rate": 3.027466371441488e-05, + "loss": 0.9121, + "step": 9011 + }, + { + "epoch": 0.44947630922693266, + "grad_norm": 0.5506936066852566, + "learning_rate": 3.027071610325448e-05, + "loss": 0.9099, + "step": 9012 + }, + { + "epoch": 0.44952618453865334, + "grad_norm": 0.4680187958768523, + "learning_rate": 3.0266768354557108e-05, + "loss": 0.9001, + "step": 9013 + }, + { + "epoch": 0.4495760598503741, + "grad_norm": 0.46605254514212197, + "learning_rate": 3.0262820468425763e-05, + "loss": 0.8596, + "step": 9014 + }, + { + "epoch": 0.44962593516209476, + "grad_norm": 0.4710967740054271, + "learning_rate": 3.0258872444963483e-05, + "loss": 0.8664, + "step": 9015 + }, + { + "epoch": 0.44967581047381544, + "grad_norm": 0.44145058345973937, + "learning_rate": 3.0254924284273266e-05, + "loss": 0.8729, + "step": 9016 + }, + { + "epoch": 0.4497256857855362, + "grad_norm": 0.4810640512184121, + "learning_rate": 3.025097598645817e-05, + "loss": 0.8465, + "step": 9017 + }, + { + "epoch": 0.44977556109725686, + "grad_norm": 0.5316204460762417, + "learning_rate": 3.024702755162119e-05, + "loss": 0.9064, + "step": 9018 + }, + { + "epoch": 0.44982543640897754, + "grad_norm": 0.4218300062398897, + "learning_rate": 3.0243078979865384e-05, + "loss": 0.8832, + "step": 9019 + }, + { + "epoch": 0.4498753117206983, + "grad_norm": 0.4487209191986102, + "learning_rate": 3.023913027129377e-05, + "loss": 0.9412, + "step": 9020 + }, + { + "epoch": 0.44992518703241896, + "grad_norm": 0.5101828068609222, + "learning_rate": 3.0235181426009406e-05, + "loss": 0.8794, + "step": 9021 + }, + { + "epoch": 0.44997506234413964, + "grad_norm": 0.4301277422343177, + "learning_rate": 3.023123244411532e-05, + "loss": 0.8947, + "step": 9022 + }, + { + "epoch": 0.4500249376558603, + "grad_norm": 0.45900490252698045, + "learning_rate": 3.0227283325714568e-05, + "loss": 0.8968, + "step": 9023 + }, + { + "epoch": 0.45007481296758106, + "grad_norm": 0.4685965499982172, + "learning_rate": 3.02233340709102e-05, + "loss": 0.9215, + "step": 9024 + }, + { + "epoch": 0.45012468827930174, + "grad_norm": 0.39867883098564383, + "learning_rate": 3.0219384679805255e-05, + "loss": 0.8827, + "step": 9025 + }, + { + "epoch": 0.4501745635910224, + "grad_norm": 0.4281843106341111, + "learning_rate": 3.0215435152502815e-05, + "loss": 0.8657, + "step": 9026 + }, + { + "epoch": 0.45022443890274316, + "grad_norm": 0.7366351510188451, + "learning_rate": 3.0211485489105917e-05, + "loss": 0.9142, + "step": 9027 + }, + { + "epoch": 0.45027431421446384, + "grad_norm": 0.41853958412340736, + "learning_rate": 3.020753568971765e-05, + "loss": 0.893, + "step": 9028 + }, + { + "epoch": 0.4503241895261845, + "grad_norm": 0.5959781540123004, + "learning_rate": 3.0203585754441054e-05, + "loss": 0.9004, + "step": 9029 + }, + { + "epoch": 0.45037406483790526, + "grad_norm": 0.41881474412473796, + "learning_rate": 3.0199635683379233e-05, + "loss": 0.9159, + "step": 9030 + }, + { + "epoch": 0.45042394014962595, + "grad_norm": 0.39172555554015964, + "learning_rate": 3.019568547663524e-05, + "loss": 0.8503, + "step": 9031 + }, + { + "epoch": 0.4504738154613466, + "grad_norm": 0.4845105050037053, + "learning_rate": 3.0191735134312154e-05, + "loss": 0.9012, + "step": 9032 + }, + { + "epoch": 0.4505236907730673, + "grad_norm": 0.5609977014549135, + "learning_rate": 3.0187784656513068e-05, + "loss": 0.9004, + "step": 9033 + }, + { + "epoch": 0.45057356608478805, + "grad_norm": 0.40210660685249255, + "learning_rate": 3.0183834043341063e-05, + "loss": 0.9079, + "step": 9034 + }, + { + "epoch": 0.45062344139650873, + "grad_norm": 0.49898854135315834, + "learning_rate": 3.017988329489923e-05, + "loss": 0.9002, + "step": 9035 + }, + { + "epoch": 0.4506733167082294, + "grad_norm": 0.4492107417479033, + "learning_rate": 3.017593241129066e-05, + "loss": 0.8939, + "step": 9036 + }, + { + "epoch": 0.45072319201995015, + "grad_norm": 0.6106050833386415, + "learning_rate": 3.017198139261845e-05, + "loss": 0.8975, + "step": 9037 + }, + { + "epoch": 0.45077306733167083, + "grad_norm": 0.47116179024811977, + "learning_rate": 3.0168030238985696e-05, + "loss": 0.925, + "step": 9038 + }, + { + "epoch": 0.4508229426433915, + "grad_norm": 0.46730115810555656, + "learning_rate": 3.0164078950495512e-05, + "loss": 0.9266, + "step": 9039 + }, + { + "epoch": 0.4508728179551122, + "grad_norm": 0.6200589036469336, + "learning_rate": 3.0160127527250993e-05, + "loss": 0.8998, + "step": 9040 + }, + { + "epoch": 0.45092269326683293, + "grad_norm": 0.5714610230456133, + "learning_rate": 3.015617596935526e-05, + "loss": 0.9013, + "step": 9041 + }, + { + "epoch": 0.4509725685785536, + "grad_norm": 0.5161401845439868, + "learning_rate": 3.0152224276911423e-05, + "loss": 0.9067, + "step": 9042 + }, + { + "epoch": 0.4510224438902743, + "grad_norm": 0.5194472476287891, + "learning_rate": 3.0148272450022595e-05, + "loss": 0.8889, + "step": 9043 + }, + { + "epoch": 0.45107231920199503, + "grad_norm": 0.4744996556121442, + "learning_rate": 3.0144320488791906e-05, + "loss": 0.8637, + "step": 9044 + }, + { + "epoch": 0.4511221945137157, + "grad_norm": 0.4663444031670349, + "learning_rate": 3.0140368393322476e-05, + "loss": 0.8622, + "step": 9045 + }, + { + "epoch": 0.4511720698254364, + "grad_norm": 0.40327624097693, + "learning_rate": 3.013641616371743e-05, + "loss": 0.8959, + "step": 9046 + }, + { + "epoch": 0.45122194513715713, + "grad_norm": 0.4665016844095698, + "learning_rate": 3.0132463800079907e-05, + "loss": 0.8985, + "step": 9047 + }, + { + "epoch": 0.4512718204488778, + "grad_norm": 0.4395810898352632, + "learning_rate": 3.012851130251304e-05, + "loss": 0.8764, + "step": 9048 + }, + { + "epoch": 0.4513216957605985, + "grad_norm": 0.4354518521013911, + "learning_rate": 3.0124558671119957e-05, + "loss": 0.9123, + "step": 9049 + }, + { + "epoch": 0.4513715710723192, + "grad_norm": 0.39102256614550396, + "learning_rate": 3.012060590600382e-05, + "loss": 0.9011, + "step": 9050 + }, + { + "epoch": 0.4514214463840399, + "grad_norm": 0.44888580085511726, + "learning_rate": 3.0116653007267753e-05, + "loss": 0.905, + "step": 9051 + }, + { + "epoch": 0.4514713216957606, + "grad_norm": 0.3942068659608525, + "learning_rate": 3.0112699975014917e-05, + "loss": 0.8544, + "step": 9052 + }, + { + "epoch": 0.4515211970074813, + "grad_norm": 0.4473656192655416, + "learning_rate": 3.0108746809348466e-05, + "loss": 0.8848, + "step": 9053 + }, + { + "epoch": 0.451571072319202, + "grad_norm": 0.5353390431740392, + "learning_rate": 3.010479351037156e-05, + "loss": 0.8977, + "step": 9054 + }, + { + "epoch": 0.4516209476309227, + "grad_norm": 0.40942507861429117, + "learning_rate": 3.0100840078187342e-05, + "loss": 0.8695, + "step": 9055 + }, + { + "epoch": 0.4516708229426434, + "grad_norm": 0.44473507993011197, + "learning_rate": 3.0096886512898993e-05, + "loss": 0.9052, + "step": 9056 + }, + { + "epoch": 0.4517206982543641, + "grad_norm": 0.42561029191341965, + "learning_rate": 3.009293281460966e-05, + "loss": 0.899, + "step": 9057 + }, + { + "epoch": 0.4517705735660848, + "grad_norm": 0.4452468444155401, + "learning_rate": 3.0088978983422534e-05, + "loss": 0.9157, + "step": 9058 + }, + { + "epoch": 0.4518204488778055, + "grad_norm": 0.4625777050952931, + "learning_rate": 3.008502501944078e-05, + "loss": 0.8884, + "step": 9059 + }, + { + "epoch": 0.45187032418952616, + "grad_norm": 0.4178752133638569, + "learning_rate": 3.008107092276757e-05, + "loss": 0.852, + "step": 9060 + }, + { + "epoch": 0.4519201995012469, + "grad_norm": 0.5706648334749368, + "learning_rate": 3.0077116693506092e-05, + "loss": 0.9004, + "step": 9061 + }, + { + "epoch": 0.4519700748129676, + "grad_norm": 0.5179671851814786, + "learning_rate": 3.007316233175952e-05, + "loss": 0.901, + "step": 9062 + }, + { + "epoch": 0.45201995012468826, + "grad_norm": 0.39457460902247865, + "learning_rate": 3.0069207837631053e-05, + "loss": 0.8554, + "step": 9063 + }, + { + "epoch": 0.452069825436409, + "grad_norm": 0.42720460801249727, + "learning_rate": 3.006525321122387e-05, + "loss": 0.8653, + "step": 9064 + }, + { + "epoch": 0.4521197007481297, + "grad_norm": 0.4185414191127357, + "learning_rate": 3.0061298452641174e-05, + "loss": 0.8963, + "step": 9065 + }, + { + "epoch": 0.45216957605985036, + "grad_norm": 0.4799839853812283, + "learning_rate": 3.0057343561986162e-05, + "loss": 0.9029, + "step": 9066 + }, + { + "epoch": 0.45221945137157105, + "grad_norm": 0.4797015658081105, + "learning_rate": 3.0053388539362032e-05, + "loss": 0.9073, + "step": 9067 + }, + { + "epoch": 0.4522693266832918, + "grad_norm": 0.5988223637128518, + "learning_rate": 3.004943338487199e-05, + "loss": 0.8624, + "step": 9068 + }, + { + "epoch": 0.45231920199501247, + "grad_norm": 0.49972394435401374, + "learning_rate": 3.0045478098619245e-05, + "loss": 0.8632, + "step": 9069 + }, + { + "epoch": 0.45236907730673315, + "grad_norm": 0.529846872443558, + "learning_rate": 3.0041522680707007e-05, + "loss": 0.9015, + "step": 9070 + }, + { + "epoch": 0.4524189526184539, + "grad_norm": 0.4263589746560181, + "learning_rate": 3.0037567131238487e-05, + "loss": 0.9005, + "step": 9071 + }, + { + "epoch": 0.45246882793017457, + "grad_norm": 0.4674480782256333, + "learning_rate": 3.0033611450316908e-05, + "loss": 0.9054, + "step": 9072 + }, + { + "epoch": 0.45251870324189525, + "grad_norm": 1.0639864023514072, + "learning_rate": 3.0029655638045496e-05, + "loss": 0.9491, + "step": 9073 + }, + { + "epoch": 0.452568578553616, + "grad_norm": 0.9605159371553007, + "learning_rate": 3.0025699694527464e-05, + "loss": 0.8721, + "step": 9074 + }, + { + "epoch": 0.45261845386533667, + "grad_norm": 0.5685194174827101, + "learning_rate": 3.0021743619866048e-05, + "loss": 0.9213, + "step": 9075 + }, + { + "epoch": 0.45266832917705735, + "grad_norm": 0.42021849712604065, + "learning_rate": 3.0017787414164485e-05, + "loss": 0.8838, + "step": 9076 + }, + { + "epoch": 0.45271820448877803, + "grad_norm": 0.4216608193274595, + "learning_rate": 3.0013831077526e-05, + "loss": 0.8722, + "step": 9077 + }, + { + "epoch": 0.45276807980049877, + "grad_norm": 0.4799769943135886, + "learning_rate": 3.000987461005384e-05, + "loss": 0.9073, + "step": 9078 + }, + { + "epoch": 0.45281795511221945, + "grad_norm": 0.5527520709778169, + "learning_rate": 3.000591801185124e-05, + "loss": 0.8613, + "step": 9079 + }, + { + "epoch": 0.45286783042394013, + "grad_norm": 0.671678838047405, + "learning_rate": 3.000196128302146e-05, + "loss": 0.9336, + "step": 9080 + }, + { + "epoch": 0.45291770573566087, + "grad_norm": 0.7856545807253899, + "learning_rate": 2.999800442366773e-05, + "loss": 0.8916, + "step": 9081 + }, + { + "epoch": 0.45296758104738155, + "grad_norm": 0.5921907208820735, + "learning_rate": 2.9994047433893318e-05, + "loss": 0.9026, + "step": 9082 + }, + { + "epoch": 0.45301745635910223, + "grad_norm": 0.3874124636421812, + "learning_rate": 2.9990090313801474e-05, + "loss": 0.8642, + "step": 9083 + }, + { + "epoch": 0.45306733167082297, + "grad_norm": 0.5933191543936203, + "learning_rate": 2.9986133063495447e-05, + "loss": 0.9076, + "step": 9084 + }, + { + "epoch": 0.45311720698254365, + "grad_norm": 0.4637612460665819, + "learning_rate": 2.9982175683078516e-05, + "loss": 0.8647, + "step": 9085 + }, + { + "epoch": 0.45316708229426433, + "grad_norm": 0.569177080179613, + "learning_rate": 2.9978218172653937e-05, + "loss": 0.9156, + "step": 9086 + }, + { + "epoch": 0.453216957605985, + "grad_norm": 0.44857003064430545, + "learning_rate": 2.9974260532324987e-05, + "loss": 0.8944, + "step": 9087 + }, + { + "epoch": 0.45326683291770575, + "grad_norm": 0.4706944772064155, + "learning_rate": 2.997030276219493e-05, + "loss": 0.8783, + "step": 9088 + }, + { + "epoch": 0.45331670822942643, + "grad_norm": 0.4179664481849821, + "learning_rate": 2.9966344862367047e-05, + "loss": 0.9269, + "step": 9089 + }, + { + "epoch": 0.4533665835411471, + "grad_norm": 0.7569221094551991, + "learning_rate": 2.9962386832944616e-05, + "loss": 0.8716, + "step": 9090 + }, + { + "epoch": 0.45341645885286785, + "grad_norm": 0.41917412115567015, + "learning_rate": 2.995842867403093e-05, + "loss": 0.8688, + "step": 9091 + }, + { + "epoch": 0.45346633416458854, + "grad_norm": 0.4536724360989202, + "learning_rate": 2.995447038572926e-05, + "loss": 0.8685, + "step": 9092 + }, + { + "epoch": 0.4535162094763092, + "grad_norm": 0.6985615074596675, + "learning_rate": 2.99505119681429e-05, + "loss": 0.8668, + "step": 9093 + }, + { + "epoch": 0.4535660847880299, + "grad_norm": 0.555854782483581, + "learning_rate": 2.9946553421375158e-05, + "loss": 0.947, + "step": 9094 + }, + { + "epoch": 0.45361596009975064, + "grad_norm": 0.46804842853591816, + "learning_rate": 2.9942594745529307e-05, + "loss": 0.8933, + "step": 9095 + }, + { + "epoch": 0.4536658354114713, + "grad_norm": 0.49192963346019675, + "learning_rate": 2.9938635940708666e-05, + "loss": 0.9312, + "step": 9096 + }, + { + "epoch": 0.453715710723192, + "grad_norm": 0.5413921288961183, + "learning_rate": 2.993467700701652e-05, + "loss": 0.8973, + "step": 9097 + }, + { + "epoch": 0.45376558603491274, + "grad_norm": 0.4708872511155608, + "learning_rate": 2.99307179445562e-05, + "loss": 0.8633, + "step": 9098 + }, + { + "epoch": 0.4538154613466334, + "grad_norm": 0.5402769989445009, + "learning_rate": 2.9926758753430983e-05, + "loss": 0.8921, + "step": 9099 + }, + { + "epoch": 0.4538653366583541, + "grad_norm": 0.7877489546572818, + "learning_rate": 2.992279943374422e-05, + "loss": 0.8888, + "step": 9100 + }, + { + "epoch": 0.45391521197007484, + "grad_norm": 0.5008524419928431, + "learning_rate": 2.9918839985599195e-05, + "loss": 0.8579, + "step": 9101 + }, + { + "epoch": 0.4539650872817955, + "grad_norm": 0.4742399773016712, + "learning_rate": 2.991488040909925e-05, + "loss": 0.9002, + "step": 9102 + }, + { + "epoch": 0.4540149625935162, + "grad_norm": 0.4680260501691857, + "learning_rate": 2.9910920704347696e-05, + "loss": 0.9269, + "step": 9103 + }, + { + "epoch": 0.4540648379052369, + "grad_norm": 0.5002051238644345, + "learning_rate": 2.990696087144787e-05, + "loss": 0.9257, + "step": 9104 + }, + { + "epoch": 0.4541147132169576, + "grad_norm": 0.3916087975762558, + "learning_rate": 2.9903000910503086e-05, + "loss": 0.8937, + "step": 9105 + }, + { + "epoch": 0.4541645885286783, + "grad_norm": 0.4391105982103171, + "learning_rate": 2.9899040821616696e-05, + "loss": 0.9001, + "step": 9106 + }, + { + "epoch": 0.454214463840399, + "grad_norm": 0.4562894555396237, + "learning_rate": 2.9895080604892035e-05, + "loss": 0.8178, + "step": 9107 + }, + { + "epoch": 0.4542643391521197, + "grad_norm": 0.49841525524371805, + "learning_rate": 2.989112026043242e-05, + "loss": 0.9099, + "step": 9108 + }, + { + "epoch": 0.4543142144638404, + "grad_norm": 0.5802168223689576, + "learning_rate": 2.9887159788341216e-05, + "loss": 0.8934, + "step": 9109 + }, + { + "epoch": 0.4543640897755611, + "grad_norm": 0.4548246211341015, + "learning_rate": 2.9883199188721766e-05, + "loss": 0.9092, + "step": 9110 + }, + { + "epoch": 0.45441396508728177, + "grad_norm": 0.44564384169972604, + "learning_rate": 2.9879238461677423e-05, + "loss": 0.9133, + "step": 9111 + }, + { + "epoch": 0.4544638403990025, + "grad_norm": 0.4057044585802795, + "learning_rate": 2.987527760731153e-05, + "loss": 0.8683, + "step": 9112 + }, + { + "epoch": 0.4545137157107232, + "grad_norm": 0.41106654387894687, + "learning_rate": 2.9871316625727453e-05, + "loss": 0.8875, + "step": 9113 + }, + { + "epoch": 0.45456359102244387, + "grad_norm": 0.5525316940163092, + "learning_rate": 2.986735551702855e-05, + "loss": 0.8784, + "step": 9114 + }, + { + "epoch": 0.4546134663341646, + "grad_norm": 0.4074343661634833, + "learning_rate": 2.986339428131818e-05, + "loss": 0.8623, + "step": 9115 + }, + { + "epoch": 0.4546633416458853, + "grad_norm": 0.4377202539572844, + "learning_rate": 2.985943291869971e-05, + "loss": 0.8973, + "step": 9116 + }, + { + "epoch": 0.45471321695760597, + "grad_norm": 0.5382109558843071, + "learning_rate": 2.9855471429276514e-05, + "loss": 0.9151, + "step": 9117 + }, + { + "epoch": 0.4547630922693267, + "grad_norm": 0.4154971210104352, + "learning_rate": 2.9851509813151973e-05, + "loss": 0.8797, + "step": 9118 + }, + { + "epoch": 0.4548129675810474, + "grad_norm": 0.47875584535122456, + "learning_rate": 2.9847548070429444e-05, + "loss": 0.8848, + "step": 9119 + }, + { + "epoch": 0.45486284289276807, + "grad_norm": 0.6559032493728858, + "learning_rate": 2.9843586201212326e-05, + "loss": 0.8501, + "step": 9120 + }, + { + "epoch": 0.45491271820448875, + "grad_norm": 0.404856947859667, + "learning_rate": 2.9839624205603987e-05, + "loss": 0.9461, + "step": 9121 + }, + { + "epoch": 0.4549625935162095, + "grad_norm": 0.48117635392705715, + "learning_rate": 2.9835662083707827e-05, + "loss": 0.9068, + "step": 9122 + }, + { + "epoch": 0.45501246882793017, + "grad_norm": 0.6009424988698013, + "learning_rate": 2.983169983562722e-05, + "loss": 0.8547, + "step": 9123 + }, + { + "epoch": 0.45506234413965085, + "grad_norm": 0.4637975776358827, + "learning_rate": 2.9827737461465576e-05, + "loss": 0.9362, + "step": 9124 + }, + { + "epoch": 0.4551122194513716, + "grad_norm": 0.4678415435087388, + "learning_rate": 2.9823774961326277e-05, + "loss": 0.897, + "step": 9125 + }, + { + "epoch": 0.4551620947630923, + "grad_norm": 0.5940031122430662, + "learning_rate": 2.9819812335312735e-05, + "loss": 0.9169, + "step": 9126 + }, + { + "epoch": 0.45521197007481295, + "grad_norm": 0.5543851270018256, + "learning_rate": 2.9815849583528344e-05, + "loss": 0.866, + "step": 9127 + }, + { + "epoch": 0.4552618453865337, + "grad_norm": 0.43821962688874755, + "learning_rate": 2.981188670607652e-05, + "loss": 0.8812, + "step": 9128 + }, + { + "epoch": 0.4553117206982544, + "grad_norm": 0.5404301965436377, + "learning_rate": 2.9807923703060652e-05, + "loss": 0.9043, + "step": 9129 + }, + { + "epoch": 0.45536159600997506, + "grad_norm": 0.4988833911361145, + "learning_rate": 2.980396057458418e-05, + "loss": 0.93, + "step": 9130 + }, + { + "epoch": 0.45541147132169574, + "grad_norm": 0.47884269276988467, + "learning_rate": 2.9799997320750507e-05, + "loss": 0.9481, + "step": 9131 + }, + { + "epoch": 0.4554613466334165, + "grad_norm": 0.4146309311952351, + "learning_rate": 2.9796033941663042e-05, + "loss": 0.9107, + "step": 9132 + }, + { + "epoch": 0.45551122194513716, + "grad_norm": 0.4744778760846666, + "learning_rate": 2.9792070437425222e-05, + "loss": 0.9052, + "step": 9133 + }, + { + "epoch": 0.45556109725685784, + "grad_norm": 0.4531197972970809, + "learning_rate": 2.978810680814047e-05, + "loss": 0.8359, + "step": 9134 + }, + { + "epoch": 0.4556109725685786, + "grad_norm": 0.43611353924898055, + "learning_rate": 2.9784143053912217e-05, + "loss": 0.8831, + "step": 9135 + }, + { + "epoch": 0.45566084788029926, + "grad_norm": 0.5443628388851387, + "learning_rate": 2.9780179174843886e-05, + "loss": 0.9529, + "step": 9136 + }, + { + "epoch": 0.45571072319201994, + "grad_norm": 0.49875767713729957, + "learning_rate": 2.9776215171038923e-05, + "loss": 0.8943, + "step": 9137 + }, + { + "epoch": 0.4557605985037406, + "grad_norm": 0.6772734304441279, + "learning_rate": 2.977225104260076e-05, + "loss": 0.9043, + "step": 9138 + }, + { + "epoch": 0.45581047381546136, + "grad_norm": 0.5797320579892338, + "learning_rate": 2.9768286789632844e-05, + "loss": 0.8471, + "step": 9139 + }, + { + "epoch": 0.45586034912718204, + "grad_norm": 0.40798546046904866, + "learning_rate": 2.976432241223861e-05, + "loss": 0.8825, + "step": 9140 + }, + { + "epoch": 0.4559102244389027, + "grad_norm": 0.6501251325518965, + "learning_rate": 2.9760357910521526e-05, + "loss": 0.8641, + "step": 9141 + }, + { + "epoch": 0.45596009975062346, + "grad_norm": 0.5171670314654078, + "learning_rate": 2.975639328458503e-05, + "loss": 0.9006, + "step": 9142 + }, + { + "epoch": 0.45600997506234414, + "grad_norm": 0.4070575472185593, + "learning_rate": 2.975242853453258e-05, + "loss": 0.8668, + "step": 9143 + }, + { + "epoch": 0.4560598503740648, + "grad_norm": 0.3986408077972672, + "learning_rate": 2.9748463660467636e-05, + "loss": 0.8142, + "step": 9144 + }, + { + "epoch": 0.45610972568578556, + "grad_norm": 0.44218594228898744, + "learning_rate": 2.9744498662493647e-05, + "loss": 0.9057, + "step": 9145 + }, + { + "epoch": 0.45615960099750624, + "grad_norm": 0.4197189381494946, + "learning_rate": 2.9740533540714098e-05, + "loss": 0.8981, + "step": 9146 + }, + { + "epoch": 0.4562094763092269, + "grad_norm": 0.43752802394643336, + "learning_rate": 2.9736568295232448e-05, + "loss": 0.9112, + "step": 9147 + }, + { + "epoch": 0.4562593516209476, + "grad_norm": 0.455968767186431, + "learning_rate": 2.973260292615216e-05, + "loss": 0.8811, + "step": 9148 + }, + { + "epoch": 0.45630922693266834, + "grad_norm": 0.41663560005160655, + "learning_rate": 2.9728637433576728e-05, + "loss": 0.9186, + "step": 9149 + }, + { + "epoch": 0.456359102244389, + "grad_norm": 0.5729286095975658, + "learning_rate": 2.9724671817609612e-05, + "loss": 0.925, + "step": 9150 + }, + { + "epoch": 0.4564089775561097, + "grad_norm": 0.541847540040847, + "learning_rate": 2.97207060783543e-05, + "loss": 0.8992, + "step": 9151 + }, + { + "epoch": 0.45645885286783044, + "grad_norm": 0.41369139734354093, + "learning_rate": 2.9716740215914274e-05, + "loss": 0.8931, + "step": 9152 + }, + { + "epoch": 0.4565087281795511, + "grad_norm": 0.47396405074499576, + "learning_rate": 2.9712774230393026e-05, + "loss": 0.9555, + "step": 9153 + }, + { + "epoch": 0.4565586034912718, + "grad_norm": 0.4045368774981937, + "learning_rate": 2.9708808121894043e-05, + "loss": 0.8776, + "step": 9154 + }, + { + "epoch": 0.45660847880299255, + "grad_norm": 0.4423304676508903, + "learning_rate": 2.9704841890520824e-05, + "loss": 0.8973, + "step": 9155 + }, + { + "epoch": 0.4566583541147132, + "grad_norm": 0.3928840621435755, + "learning_rate": 2.9700875536376855e-05, + "loss": 0.9122, + "step": 9156 + }, + { + "epoch": 0.4567082294264339, + "grad_norm": 0.530785766331819, + "learning_rate": 2.9696909059565642e-05, + "loss": 0.9485, + "step": 9157 + }, + { + "epoch": 0.4567581047381546, + "grad_norm": 0.37319470059501425, + "learning_rate": 2.9692942460190693e-05, + "loss": 0.8799, + "step": 9158 + }, + { + "epoch": 0.45680798004987533, + "grad_norm": 0.5700763015797304, + "learning_rate": 2.968897573835551e-05, + "loss": 0.9057, + "step": 9159 + }, + { + "epoch": 0.456857855361596, + "grad_norm": 0.5329475243874003, + "learning_rate": 2.9685008894163602e-05, + "loss": 0.9381, + "step": 9160 + }, + { + "epoch": 0.4569077306733167, + "grad_norm": 0.3830608215274655, + "learning_rate": 2.968104192771849e-05, + "loss": 0.907, + "step": 9161 + }, + { + "epoch": 0.45695760598503743, + "grad_norm": 0.42146427488091975, + "learning_rate": 2.9677074839123675e-05, + "loss": 0.896, + "step": 9162 + }, + { + "epoch": 0.4570074812967581, + "grad_norm": 0.5158432473735788, + "learning_rate": 2.9673107628482687e-05, + "loss": 0.8989, + "step": 9163 + }, + { + "epoch": 0.4570573566084788, + "grad_norm": 0.37404777691726254, + "learning_rate": 2.9669140295899044e-05, + "loss": 0.8622, + "step": 9164 + }, + { + "epoch": 0.4571072319201995, + "grad_norm": 0.5137960154442012, + "learning_rate": 2.9665172841476284e-05, + "loss": 0.8511, + "step": 9165 + }, + { + "epoch": 0.4571571072319202, + "grad_norm": 0.5751054814913257, + "learning_rate": 2.9661205265317926e-05, + "loss": 0.9344, + "step": 9166 + }, + { + "epoch": 0.4572069825436409, + "grad_norm": 0.46327084577104644, + "learning_rate": 2.9657237567527497e-05, + "loss": 0.8657, + "step": 9167 + }, + { + "epoch": 0.4572568578553616, + "grad_norm": 0.4429563291020661, + "learning_rate": 2.965326974820854e-05, + "loss": 0.8709, + "step": 9168 + }, + { + "epoch": 0.4573067331670823, + "grad_norm": 0.5310875410438549, + "learning_rate": 2.9649301807464593e-05, + "loss": 0.8716, + "step": 9169 + }, + { + "epoch": 0.457356608478803, + "grad_norm": 0.3980522790125319, + "learning_rate": 2.9645333745399194e-05, + "loss": 0.858, + "step": 9170 + }, + { + "epoch": 0.4574064837905237, + "grad_norm": 0.596748241712654, + "learning_rate": 2.9641365562115887e-05, + "loss": 0.9173, + "step": 9171 + }, + { + "epoch": 0.4574563591022444, + "grad_norm": 0.8941008789254264, + "learning_rate": 2.963739725771823e-05, + "loss": 0.8886, + "step": 9172 + }, + { + "epoch": 0.4575062344139651, + "grad_norm": 0.519905638032443, + "learning_rate": 2.9633428832309763e-05, + "loss": 0.8911, + "step": 9173 + }, + { + "epoch": 0.4575561097256858, + "grad_norm": 0.6700790442210245, + "learning_rate": 2.9629460285994043e-05, + "loss": 0.8937, + "step": 9174 + }, + { + "epoch": 0.45760598503740646, + "grad_norm": 0.4749738853310106, + "learning_rate": 2.962549161887463e-05, + "loss": 0.8874, + "step": 9175 + }, + { + "epoch": 0.4576558603491272, + "grad_norm": 0.47633327743647624, + "learning_rate": 2.962152283105508e-05, + "loss": 0.8906, + "step": 9176 + }, + { + "epoch": 0.4577057356608479, + "grad_norm": 0.40667460333512256, + "learning_rate": 2.961755392263897e-05, + "loss": 0.8064, + "step": 9177 + }, + { + "epoch": 0.45775561097256856, + "grad_norm": 0.5487882285678203, + "learning_rate": 2.9613584893729847e-05, + "loss": 0.8943, + "step": 9178 + }, + { + "epoch": 0.4578054862842893, + "grad_norm": 0.5217461652112515, + "learning_rate": 2.9609615744431297e-05, + "loss": 0.8627, + "step": 9179 + }, + { + "epoch": 0.45785536159601, + "grad_norm": 0.44774556724978076, + "learning_rate": 2.9605646474846883e-05, + "loss": 0.9466, + "step": 9180 + }, + { + "epoch": 0.45790523690773066, + "grad_norm": 0.5493346825566964, + "learning_rate": 2.9601677085080186e-05, + "loss": 0.8871, + "step": 9181 + }, + { + "epoch": 0.4579551122194514, + "grad_norm": 0.5737493378816739, + "learning_rate": 2.9597707575234784e-05, + "loss": 0.9276, + "step": 9182 + }, + { + "epoch": 0.4580049875311721, + "grad_norm": 0.4149941291234489, + "learning_rate": 2.9593737945414264e-05, + "loss": 0.8637, + "step": 9183 + }, + { + "epoch": 0.45805486284289276, + "grad_norm": 0.41177302741401905, + "learning_rate": 2.9589768195722207e-05, + "loss": 0.9011, + "step": 9184 + }, + { + "epoch": 0.45810473815461344, + "grad_norm": 0.44226378596923466, + "learning_rate": 2.9585798326262206e-05, + "loss": 0.8647, + "step": 9185 + }, + { + "epoch": 0.4581546134663342, + "grad_norm": 0.5564794792485468, + "learning_rate": 2.958182833713784e-05, + "loss": 0.8917, + "step": 9186 + }, + { + "epoch": 0.45820448877805486, + "grad_norm": 0.5437237734081998, + "learning_rate": 2.9577858228452716e-05, + "loss": 0.8502, + "step": 9187 + }, + { + "epoch": 0.45825436408977555, + "grad_norm": 0.505932051462236, + "learning_rate": 2.957388800031044e-05, + "loss": 0.9337, + "step": 9188 + }, + { + "epoch": 0.4583042394014963, + "grad_norm": 0.45492108382801333, + "learning_rate": 2.9569917652814594e-05, + "loss": 0.8607, + "step": 9189 + }, + { + "epoch": 0.45835411471321696, + "grad_norm": 0.4980249618375542, + "learning_rate": 2.9565947186068798e-05, + "loss": 0.881, + "step": 9190 + }, + { + "epoch": 0.45840399002493765, + "grad_norm": 0.5120395432369493, + "learning_rate": 2.956197660017665e-05, + "loss": 0.9415, + "step": 9191 + }, + { + "epoch": 0.45845386533665833, + "grad_norm": 0.4048546184413327, + "learning_rate": 2.955800589524177e-05, + "loss": 0.899, + "step": 9192 + }, + { + "epoch": 0.45850374064837907, + "grad_norm": 0.6688317875580557, + "learning_rate": 2.9554035071367758e-05, + "loss": 0.8713, + "step": 9193 + }, + { + "epoch": 0.45855361596009975, + "grad_norm": 0.4575424930104621, + "learning_rate": 2.9550064128658252e-05, + "loss": 0.8684, + "step": 9194 + }, + { + "epoch": 0.45860349127182043, + "grad_norm": 0.4686158132269844, + "learning_rate": 2.9546093067216844e-05, + "loss": 0.8715, + "step": 9195 + }, + { + "epoch": 0.45865336658354117, + "grad_norm": 0.4124620123010231, + "learning_rate": 2.9542121887147178e-05, + "loss": 0.9001, + "step": 9196 + }, + { + "epoch": 0.45870324189526185, + "grad_norm": 0.4460925983229465, + "learning_rate": 2.9538150588552876e-05, + "loss": 0.8987, + "step": 9197 + }, + { + "epoch": 0.45875311720698253, + "grad_norm": 0.47719442486964075, + "learning_rate": 2.9534179171537567e-05, + "loss": 0.9099, + "step": 9198 + }, + { + "epoch": 0.45880299251870327, + "grad_norm": 0.43244775470662955, + "learning_rate": 2.9530207636204878e-05, + "loss": 0.8751, + "step": 9199 + }, + { + "epoch": 0.45885286783042395, + "grad_norm": 0.43126134296480717, + "learning_rate": 2.9526235982658444e-05, + "loss": 0.8973, + "step": 9200 + }, + { + "epoch": 0.45890274314214463, + "grad_norm": 0.48527355215908935, + "learning_rate": 2.952226421100192e-05, + "loss": 0.797, + "step": 9201 + }, + { + "epoch": 0.4589526184538653, + "grad_norm": 1.5496954272942152, + "learning_rate": 2.9518292321338926e-05, + "loss": 0.8943, + "step": 9202 + }, + { + "epoch": 0.45900249376558605, + "grad_norm": 0.40620024785739994, + "learning_rate": 2.951432031377312e-05, + "loss": 0.8712, + "step": 9203 + }, + { + "epoch": 0.45905236907730673, + "grad_norm": 0.4426039426122178, + "learning_rate": 2.951034818840814e-05, + "loss": 0.8695, + "step": 9204 + }, + { + "epoch": 0.4591022443890274, + "grad_norm": 0.48307764912877754, + "learning_rate": 2.950637594534765e-05, + "loss": 0.8821, + "step": 9205 + }, + { + "epoch": 0.45915211970074815, + "grad_norm": 0.4507157359671486, + "learning_rate": 2.950240358469529e-05, + "loss": 0.8754, + "step": 9206 + }, + { + "epoch": 0.45920199501246883, + "grad_norm": 0.4844856096593076, + "learning_rate": 2.9498431106554726e-05, + "loss": 0.9054, + "step": 9207 + }, + { + "epoch": 0.4592518703241895, + "grad_norm": 0.6388683532992322, + "learning_rate": 2.9494458511029616e-05, + "loss": 0.9484, + "step": 9208 + }, + { + "epoch": 0.4593017456359102, + "grad_norm": 0.5101875670441413, + "learning_rate": 2.9490485798223623e-05, + "loss": 0.9077, + "step": 9209 + }, + { + "epoch": 0.45935162094763093, + "grad_norm": 0.5063575925526401, + "learning_rate": 2.948651296824041e-05, + "loss": 0.9464, + "step": 9210 + }, + { + "epoch": 0.4594014962593516, + "grad_norm": 0.37166832395501037, + "learning_rate": 2.948254002118365e-05, + "loss": 0.8475, + "step": 9211 + }, + { + "epoch": 0.4594513715710723, + "grad_norm": 0.4379118936313444, + "learning_rate": 2.947856695715702e-05, + "loss": 0.9168, + "step": 9212 + }, + { + "epoch": 0.45950124688279304, + "grad_norm": 0.47581506024568193, + "learning_rate": 2.947459377626418e-05, + "loss": 0.8903, + "step": 9213 + }, + { + "epoch": 0.4595511221945137, + "grad_norm": 0.43370136130279635, + "learning_rate": 2.9470620478608828e-05, + "loss": 0.9075, + "step": 9214 + }, + { + "epoch": 0.4596009975062344, + "grad_norm": 0.5461759658822751, + "learning_rate": 2.9466647064294627e-05, + "loss": 0.9415, + "step": 9215 + }, + { + "epoch": 0.45965087281795514, + "grad_norm": 0.4944889434640519, + "learning_rate": 2.9462673533425278e-05, + "loss": 0.8685, + "step": 9216 + }, + { + "epoch": 0.4597007481296758, + "grad_norm": 0.4199443250976052, + "learning_rate": 2.9458699886104453e-05, + "loss": 0.8483, + "step": 9217 + }, + { + "epoch": 0.4597506234413965, + "grad_norm": 0.45480168030307666, + "learning_rate": 2.9454726122435853e-05, + "loss": 0.8739, + "step": 9218 + }, + { + "epoch": 0.4598004987531172, + "grad_norm": 0.5855114846776113, + "learning_rate": 2.9450752242523166e-05, + "loss": 0.8783, + "step": 9219 + }, + { + "epoch": 0.4598503740648379, + "grad_norm": 0.7855621930159805, + "learning_rate": 2.9446778246470098e-05, + "loss": 0.8936, + "step": 9220 + }, + { + "epoch": 0.4599002493765586, + "grad_norm": 0.505297442600404, + "learning_rate": 2.944280413438033e-05, + "loss": 0.8648, + "step": 9221 + }, + { + "epoch": 0.4599501246882793, + "grad_norm": 0.4193528421061783, + "learning_rate": 2.943882990635759e-05, + "loss": 0.8892, + "step": 9222 + }, + { + "epoch": 0.46, + "grad_norm": 0.6591883326194544, + "learning_rate": 2.943485556250556e-05, + "loss": 0.9127, + "step": 9223 + }, + { + "epoch": 0.4600498753117207, + "grad_norm": 0.5437653864102782, + "learning_rate": 2.943088110292796e-05, + "loss": 0.8788, + "step": 9224 + }, + { + "epoch": 0.4600997506234414, + "grad_norm": 0.5444403362192289, + "learning_rate": 2.9426906527728503e-05, + "loss": 0.9436, + "step": 9225 + }, + { + "epoch": 0.4601496259351621, + "grad_norm": 0.5486198718505474, + "learning_rate": 2.9422931837010898e-05, + "loss": 0.8768, + "step": 9226 + }, + { + "epoch": 0.4601995012468828, + "grad_norm": 0.4402379771976963, + "learning_rate": 2.9418957030878874e-05, + "loss": 0.9351, + "step": 9227 + }, + { + "epoch": 0.4602493765586035, + "grad_norm": 0.37999339252277897, + "learning_rate": 2.9414982109436135e-05, + "loss": 0.8582, + "step": 9228 + }, + { + "epoch": 0.46029925187032417, + "grad_norm": 0.40395469720788335, + "learning_rate": 2.941100707278642e-05, + "loss": 0.8808, + "step": 9229 + }, + { + "epoch": 0.4603491271820449, + "grad_norm": 0.46154854463744305, + "learning_rate": 2.9407031921033445e-05, + "loss": 0.841, + "step": 9230 + }, + { + "epoch": 0.4603990024937656, + "grad_norm": 0.4720739936083993, + "learning_rate": 2.9403056654280953e-05, + "loss": 0.9436, + "step": 9231 + }, + { + "epoch": 0.46044887780548627, + "grad_norm": 0.4448254873544409, + "learning_rate": 2.9399081272632658e-05, + "loss": 0.8976, + "step": 9232 + }, + { + "epoch": 0.460498753117207, + "grad_norm": 0.3941919361473474, + "learning_rate": 2.9395105776192312e-05, + "loss": 0.9125, + "step": 9233 + }, + { + "epoch": 0.4605486284289277, + "grad_norm": 0.45579245491051873, + "learning_rate": 2.9391130165063647e-05, + "loss": 0.9173, + "step": 9234 + }, + { + "epoch": 0.46059850374064837, + "grad_norm": 0.7508887963790588, + "learning_rate": 2.9387154439350406e-05, + "loss": 0.9233, + "step": 9235 + }, + { + "epoch": 0.46064837905236905, + "grad_norm": 0.4513904897876316, + "learning_rate": 2.9383178599156324e-05, + "loss": 0.9151, + "step": 9236 + }, + { + "epoch": 0.4606982543640898, + "grad_norm": 0.4534378206112924, + "learning_rate": 2.9379202644585175e-05, + "loss": 0.9271, + "step": 9237 + }, + { + "epoch": 0.46074812967581047, + "grad_norm": 0.5481574569503965, + "learning_rate": 2.9375226575740682e-05, + "loss": 0.8751, + "step": 9238 + }, + { + "epoch": 0.46079800498753115, + "grad_norm": 0.5652221625083187, + "learning_rate": 2.9371250392726614e-05, + "loss": 0.8415, + "step": 9239 + }, + { + "epoch": 0.4608478802992519, + "grad_norm": 0.44493034526618686, + "learning_rate": 2.9367274095646718e-05, + "loss": 0.9066, + "step": 9240 + }, + { + "epoch": 0.46089775561097257, + "grad_norm": 0.45276165342419933, + "learning_rate": 2.9363297684604763e-05, + "loss": 0.8269, + "step": 9241 + }, + { + "epoch": 0.46094763092269325, + "grad_norm": 0.5819319400333872, + "learning_rate": 2.9359321159704507e-05, + "loss": 0.8659, + "step": 9242 + }, + { + "epoch": 0.460997506234414, + "grad_norm": 0.4290698763763918, + "learning_rate": 2.935534452104971e-05, + "loss": 0.9129, + "step": 9243 + }, + { + "epoch": 0.46104738154613467, + "grad_norm": 0.4628294373975906, + "learning_rate": 2.935136776874416e-05, + "loss": 0.8964, + "step": 9244 + }, + { + "epoch": 0.46109725685785535, + "grad_norm": 0.43687753568911164, + "learning_rate": 2.93473909028916e-05, + "loss": 0.9132, + "step": 9245 + }, + { + "epoch": 0.46114713216957604, + "grad_norm": 0.4074626604203022, + "learning_rate": 2.9343413923595835e-05, + "loss": 0.8706, + "step": 9246 + }, + { + "epoch": 0.4611970074812968, + "grad_norm": 0.4052720492426224, + "learning_rate": 2.933943683096062e-05, + "loss": 0.8959, + "step": 9247 + }, + { + "epoch": 0.46124688279301745, + "grad_norm": 0.6914980704854965, + "learning_rate": 2.9335459625089744e-05, + "loss": 0.8881, + "step": 9248 + }, + { + "epoch": 0.46129675810473814, + "grad_norm": 0.5471050121151521, + "learning_rate": 2.9331482306086993e-05, + "loss": 0.8895, + "step": 9249 + }, + { + "epoch": 0.4613466334164589, + "grad_norm": 0.4659714680470149, + "learning_rate": 2.9327504874056145e-05, + "loss": 0.8839, + "step": 9250 + }, + { + "epoch": 0.46139650872817956, + "grad_norm": 0.4333927696586506, + "learning_rate": 2.9323527329100996e-05, + "loss": 0.8872, + "step": 9251 + }, + { + "epoch": 0.46144638403990024, + "grad_norm": 0.4899741869335587, + "learning_rate": 2.931954967132533e-05, + "loss": 0.9102, + "step": 9252 + }, + { + "epoch": 0.461496259351621, + "grad_norm": 0.5194092538079731, + "learning_rate": 2.9315571900832955e-05, + "loss": 0.8862, + "step": 9253 + }, + { + "epoch": 0.46154613466334166, + "grad_norm": 0.7028401566704023, + "learning_rate": 2.931159401772766e-05, + "loss": 0.8831, + "step": 9254 + }, + { + "epoch": 0.46159600997506234, + "grad_norm": 0.4324722982800881, + "learning_rate": 2.9307616022113253e-05, + "loss": 0.8606, + "step": 9255 + }, + { + "epoch": 0.461645885286783, + "grad_norm": 0.42467689138604603, + "learning_rate": 2.9303637914093528e-05, + "loss": 0.8811, + "step": 9256 + }, + { + "epoch": 0.46169576059850376, + "grad_norm": 0.5123602526620872, + "learning_rate": 2.92996596937723e-05, + "loss": 0.8986, + "step": 9257 + }, + { + "epoch": 0.46174563591022444, + "grad_norm": 0.4296533101929125, + "learning_rate": 2.9295681361253378e-05, + "loss": 0.8369, + "step": 9258 + }, + { + "epoch": 0.4617955112219451, + "grad_norm": 0.48935259656999935, + "learning_rate": 2.9291702916640575e-05, + "loss": 0.8895, + "step": 9259 + }, + { + "epoch": 0.46184538653366586, + "grad_norm": 0.43466548060456517, + "learning_rate": 2.92877243600377e-05, + "loss": 0.9146, + "step": 9260 + }, + { + "epoch": 0.46189526184538654, + "grad_norm": 0.5223343051026116, + "learning_rate": 2.928374569154858e-05, + "loss": 0.875, + "step": 9261 + }, + { + "epoch": 0.4619451371571072, + "grad_norm": 0.49521371276487547, + "learning_rate": 2.927976691127703e-05, + "loss": 0.8961, + "step": 9262 + }, + { + "epoch": 0.4619950124688279, + "grad_norm": 0.614184598555956, + "learning_rate": 2.927578801932688e-05, + "loss": 0.9313, + "step": 9263 + }, + { + "epoch": 0.46204488778054864, + "grad_norm": 0.46290814806014063, + "learning_rate": 2.9271809015801953e-05, + "loss": 0.8903, + "step": 9264 + }, + { + "epoch": 0.4620947630922693, + "grad_norm": 0.5783889646221354, + "learning_rate": 2.926782990080609e-05, + "loss": 0.8795, + "step": 9265 + }, + { + "epoch": 0.46214463840399, + "grad_norm": 0.46443193183936476, + "learning_rate": 2.926385067444311e-05, + "loss": 0.8821, + "step": 9266 + }, + { + "epoch": 0.46219451371571074, + "grad_norm": 0.4361260569499862, + "learning_rate": 2.9259871336816853e-05, + "loss": 0.8381, + "step": 9267 + }, + { + "epoch": 0.4622443890274314, + "grad_norm": 0.41807297063673543, + "learning_rate": 2.9255891888031162e-05, + "loss": 0.8841, + "step": 9268 + }, + { + "epoch": 0.4622942643391521, + "grad_norm": 0.4255054533754829, + "learning_rate": 2.9251912328189874e-05, + "loss": 0.9341, + "step": 9269 + }, + { + "epoch": 0.46234413965087284, + "grad_norm": 0.4663803993610059, + "learning_rate": 2.9247932657396842e-05, + "loss": 0.8575, + "step": 9270 + }, + { + "epoch": 0.4623940149625935, + "grad_norm": 1.217349407860585, + "learning_rate": 2.9243952875755904e-05, + "loss": 0.8929, + "step": 9271 + }, + { + "epoch": 0.4624438902743142, + "grad_norm": 0.4897027601298749, + "learning_rate": 2.9239972983370918e-05, + "loss": 0.9073, + "step": 9272 + }, + { + "epoch": 0.4624937655860349, + "grad_norm": 0.45744143606520593, + "learning_rate": 2.923599298034574e-05, + "loss": 0.8896, + "step": 9273 + }, + { + "epoch": 0.4625436408977556, + "grad_norm": 0.457184162978005, + "learning_rate": 2.9232012866784207e-05, + "loss": 0.9269, + "step": 9274 + }, + { + "epoch": 0.4625935162094763, + "grad_norm": 0.39067258663928206, + "learning_rate": 2.9228032642790203e-05, + "loss": 0.859, + "step": 9275 + }, + { + "epoch": 0.462643391521197, + "grad_norm": 0.416392085597164, + "learning_rate": 2.9224052308467575e-05, + "loss": 0.9156, + "step": 9276 + }, + { + "epoch": 0.4626932668329177, + "grad_norm": 0.5644249948966735, + "learning_rate": 2.9220071863920195e-05, + "loss": 0.8988, + "step": 9277 + }, + { + "epoch": 0.4627431421446384, + "grad_norm": 0.8686800919260316, + "learning_rate": 2.9216091309251924e-05, + "loss": 0.9198, + "step": 9278 + }, + { + "epoch": 0.4627930174563591, + "grad_norm": 0.5542310546680526, + "learning_rate": 2.9212110644566643e-05, + "loss": 0.8839, + "step": 9279 + }, + { + "epoch": 0.4628428927680798, + "grad_norm": 0.5571140295841679, + "learning_rate": 2.9208129869968216e-05, + "loss": 0.9211, + "step": 9280 + }, + { + "epoch": 0.4628927680798005, + "grad_norm": 0.410270638579908, + "learning_rate": 2.9204148985560526e-05, + "loss": 0.8613, + "step": 9281 + }, + { + "epoch": 0.4629426433915212, + "grad_norm": 0.4219950393713308, + "learning_rate": 2.920016799144745e-05, + "loss": 0.9107, + "step": 9282 + }, + { + "epoch": 0.4629925187032419, + "grad_norm": 0.42567661893745706, + "learning_rate": 2.9196186887732862e-05, + "loss": 0.8583, + "step": 9283 + }, + { + "epoch": 0.4630423940149626, + "grad_norm": 0.4596646078203043, + "learning_rate": 2.9192205674520666e-05, + "loss": 0.9164, + "step": 9284 + }, + { + "epoch": 0.4630922693266833, + "grad_norm": 0.48176208750264043, + "learning_rate": 2.9188224351914734e-05, + "loss": 0.8934, + "step": 9285 + }, + { + "epoch": 0.463142144638404, + "grad_norm": 0.40848930699565944, + "learning_rate": 2.918424292001896e-05, + "loss": 0.8934, + "step": 9286 + }, + { + "epoch": 0.4631920199501247, + "grad_norm": 0.5367829956618105, + "learning_rate": 2.918026137893724e-05, + "loss": 0.8673, + "step": 9287 + }, + { + "epoch": 0.4632418952618454, + "grad_norm": 0.3985243673230778, + "learning_rate": 2.9176279728773476e-05, + "loss": 0.9071, + "step": 9288 + }, + { + "epoch": 0.4632917705735661, + "grad_norm": 0.39374290937448764, + "learning_rate": 2.9172297969631558e-05, + "loss": 0.8729, + "step": 9289 + }, + { + "epoch": 0.46334164588528676, + "grad_norm": 0.40919001275267697, + "learning_rate": 2.916831610161539e-05, + "loss": 0.8625, + "step": 9290 + }, + { + "epoch": 0.4633915211970075, + "grad_norm": 0.4887261615919741, + "learning_rate": 2.916433412482888e-05, + "loss": 0.9057, + "step": 9291 + }, + { + "epoch": 0.4634413965087282, + "grad_norm": 0.3939032209567905, + "learning_rate": 2.9160352039375937e-05, + "loss": 0.8879, + "step": 9292 + }, + { + "epoch": 0.46349127182044886, + "grad_norm": 0.4221239422841031, + "learning_rate": 2.9156369845360467e-05, + "loss": 0.8697, + "step": 9293 + }, + { + "epoch": 0.4635411471321696, + "grad_norm": 0.41857241748717644, + "learning_rate": 2.9152387542886384e-05, + "loss": 0.8562, + "step": 9294 + }, + { + "epoch": 0.4635910224438903, + "grad_norm": 0.5199825700936456, + "learning_rate": 2.914840513205761e-05, + "loss": 0.9267, + "step": 9295 + }, + { + "epoch": 0.46364089775561096, + "grad_norm": 0.4913423077353502, + "learning_rate": 2.914442261297806e-05, + "loss": 0.9179, + "step": 9296 + }, + { + "epoch": 0.4636907730673317, + "grad_norm": 0.5326957030315109, + "learning_rate": 2.9140439985751655e-05, + "loss": 0.9061, + "step": 9297 + }, + { + "epoch": 0.4637406483790524, + "grad_norm": 0.44712255772991594, + "learning_rate": 2.9136457250482324e-05, + "loss": 0.9242, + "step": 9298 + }, + { + "epoch": 0.46379052369077306, + "grad_norm": 0.43467261991384615, + "learning_rate": 2.9132474407273985e-05, + "loss": 0.9159, + "step": 9299 + }, + { + "epoch": 0.46384039900249374, + "grad_norm": 0.45367258302257807, + "learning_rate": 2.912849145623059e-05, + "loss": 0.8828, + "step": 9300 + }, + { + "epoch": 0.4638902743142145, + "grad_norm": 0.4400465195792595, + "learning_rate": 2.9124508397456042e-05, + "loss": 0.8407, + "step": 9301 + }, + { + "epoch": 0.46394014962593516, + "grad_norm": 0.44587907108732194, + "learning_rate": 2.9120525231054303e-05, + "loss": 0.9212, + "step": 9302 + }, + { + "epoch": 0.46399002493765584, + "grad_norm": 0.43705502508926236, + "learning_rate": 2.91165419571293e-05, + "loss": 0.9086, + "step": 9303 + }, + { + "epoch": 0.4640399002493766, + "grad_norm": 0.439927668765493, + "learning_rate": 2.9112558575784976e-05, + "loss": 0.8774, + "step": 9304 + }, + { + "epoch": 0.46408977556109726, + "grad_norm": 0.5024223365136874, + "learning_rate": 2.910857508712528e-05, + "loss": 0.8979, + "step": 9305 + }, + { + "epoch": 0.46413965087281794, + "grad_norm": 0.36561746523680133, + "learning_rate": 2.910459149125415e-05, + "loss": 0.9206, + "step": 9306 + }, + { + "epoch": 0.4641895261845387, + "grad_norm": 0.4635549992556245, + "learning_rate": 2.9100607788275545e-05, + "loss": 0.8735, + "step": 9307 + }, + { + "epoch": 0.46423940149625936, + "grad_norm": 0.6751612620359856, + "learning_rate": 2.9096623978293426e-05, + "loss": 0.8881, + "step": 9308 + }, + { + "epoch": 0.46428927680798004, + "grad_norm": 0.45092192533191994, + "learning_rate": 2.9092640061411718e-05, + "loss": 0.8977, + "step": 9309 + }, + { + "epoch": 0.4643391521197007, + "grad_norm": 0.44539733339437243, + "learning_rate": 2.9088656037734414e-05, + "loss": 0.923, + "step": 9310 + }, + { + "epoch": 0.46438902743142146, + "grad_norm": 3.0588126469807007, + "learning_rate": 2.908467190736545e-05, + "loss": 0.87, + "step": 9311 + }, + { + "epoch": 0.46443890274314215, + "grad_norm": 0.4659479572654765, + "learning_rate": 2.9080687670408812e-05, + "loss": 0.9565, + "step": 9312 + }, + { + "epoch": 0.4644887780548628, + "grad_norm": 0.4784258476010628, + "learning_rate": 2.9076703326968447e-05, + "loss": 0.9101, + "step": 9313 + }, + { + "epoch": 0.46453865336658356, + "grad_norm": 0.6301856897984784, + "learning_rate": 2.9072718877148336e-05, + "loss": 0.9013, + "step": 9314 + }, + { + "epoch": 0.46458852867830425, + "grad_norm": 0.6264947129990267, + "learning_rate": 2.9068734321052444e-05, + "loss": 0.9058, + "step": 9315 + }, + { + "epoch": 0.46463840399002493, + "grad_norm": 0.5109388821094962, + "learning_rate": 2.9064749658784757e-05, + "loss": 0.89, + "step": 9316 + }, + { + "epoch": 0.4646882793017456, + "grad_norm": 0.41256489422604037, + "learning_rate": 2.9060764890449243e-05, + "loss": 0.8936, + "step": 9317 + }, + { + "epoch": 0.46473815461346635, + "grad_norm": 0.4807583800006507, + "learning_rate": 2.9056780016149894e-05, + "loss": 0.8619, + "step": 9318 + }, + { + "epoch": 0.46478802992518703, + "grad_norm": 0.4685735834661679, + "learning_rate": 2.9052795035990678e-05, + "loss": 0.8929, + "step": 9319 + }, + { + "epoch": 0.4648379052369077, + "grad_norm": 0.5737541327612921, + "learning_rate": 2.904880995007559e-05, + "loss": 0.9196, + "step": 9320 + }, + { + "epoch": 0.46488778054862845, + "grad_norm": 0.7374263420430017, + "learning_rate": 2.9044824758508625e-05, + "loss": 0.9183, + "step": 9321 + }, + { + "epoch": 0.46493765586034913, + "grad_norm": 0.43736862601392384, + "learning_rate": 2.904083946139376e-05, + "loss": 0.8911, + "step": 9322 + }, + { + "epoch": 0.4649875311720698, + "grad_norm": 0.40619431284527274, + "learning_rate": 2.9036854058835e-05, + "loss": 0.8961, + "step": 9323 + }, + { + "epoch": 0.46503740648379055, + "grad_norm": 0.6208402016345191, + "learning_rate": 2.903286855093634e-05, + "loss": 0.8479, + "step": 9324 + }, + { + "epoch": 0.46508728179551123, + "grad_norm": 0.4703788527437842, + "learning_rate": 2.9028882937801784e-05, + "loss": 0.9314, + "step": 9325 + }, + { + "epoch": 0.4651371571072319, + "grad_norm": 0.6871176530334305, + "learning_rate": 2.9024897219535323e-05, + "loss": 0.8799, + "step": 9326 + }, + { + "epoch": 0.4651870324189526, + "grad_norm": 0.4297342691188943, + "learning_rate": 2.9020911396240978e-05, + "loss": 0.8772, + "step": 9327 + }, + { + "epoch": 0.46523690773067333, + "grad_norm": 0.46649858483695567, + "learning_rate": 2.9016925468022744e-05, + "loss": 0.8972, + "step": 9328 + }, + { + "epoch": 0.465286783042394, + "grad_norm": 0.5833172948265, + "learning_rate": 2.9012939434984644e-05, + "loss": 0.895, + "step": 9329 + }, + { + "epoch": 0.4653366583541147, + "grad_norm": 0.4654660493726229, + "learning_rate": 2.900895329723068e-05, + "loss": 0.9162, + "step": 9330 + }, + { + "epoch": 0.46538653366583543, + "grad_norm": 0.45550505599817204, + "learning_rate": 2.900496705486488e-05, + "loss": 0.8903, + "step": 9331 + }, + { + "epoch": 0.4654364089775561, + "grad_norm": 0.440906800506778, + "learning_rate": 2.9000980707991253e-05, + "loss": 0.9078, + "step": 9332 + }, + { + "epoch": 0.4654862842892768, + "grad_norm": 0.5143427275840613, + "learning_rate": 2.8996994256713823e-05, + "loss": 0.8731, + "step": 9333 + }, + { + "epoch": 0.4655361596009975, + "grad_norm": 0.4325843474842115, + "learning_rate": 2.8993007701136615e-05, + "loss": 0.8885, + "step": 9334 + }, + { + "epoch": 0.4655860349127182, + "grad_norm": 0.5020373565776948, + "learning_rate": 2.8989021041363662e-05, + "loss": 0.874, + "step": 9335 + }, + { + "epoch": 0.4656359102244389, + "grad_norm": 0.4508443038457262, + "learning_rate": 2.898503427749899e-05, + "loss": 0.9025, + "step": 9336 + }, + { + "epoch": 0.4656857855361596, + "grad_norm": 0.4488864015377379, + "learning_rate": 2.8981047409646632e-05, + "loss": 0.8982, + "step": 9337 + }, + { + "epoch": 0.4657356608478803, + "grad_norm": 0.5082692663776122, + "learning_rate": 2.897706043791062e-05, + "loss": 0.891, + "step": 9338 + }, + { + "epoch": 0.465785536159601, + "grad_norm": 0.5427976572735752, + "learning_rate": 2.8973073362394998e-05, + "loss": 0.9228, + "step": 9339 + }, + { + "epoch": 0.4658354114713217, + "grad_norm": 0.7901610805224784, + "learning_rate": 2.896908618320381e-05, + "loss": 0.9048, + "step": 9340 + }, + { + "epoch": 0.4658852867830424, + "grad_norm": 0.5405649943923092, + "learning_rate": 2.8965098900441084e-05, + "loss": 0.9612, + "step": 9341 + }, + { + "epoch": 0.4659351620947631, + "grad_norm": 0.5780179491516269, + "learning_rate": 2.8961111514210886e-05, + "loss": 0.884, + "step": 9342 + }, + { + "epoch": 0.4659850374064838, + "grad_norm": 0.686326255516543, + "learning_rate": 2.8957124024617254e-05, + "loss": 0.8871, + "step": 9343 + }, + { + "epoch": 0.46603491271820446, + "grad_norm": 0.44673914816754456, + "learning_rate": 2.8953136431764238e-05, + "loss": 0.8696, + "step": 9344 + }, + { + "epoch": 0.4660847880299252, + "grad_norm": 0.4456750048852317, + "learning_rate": 2.8949148735755903e-05, + "loss": 0.897, + "step": 9345 + }, + { + "epoch": 0.4661346633416459, + "grad_norm": 1.40058944803382, + "learning_rate": 2.8945160936696293e-05, + "loss": 0.9218, + "step": 9346 + }, + { + "epoch": 0.46618453865336656, + "grad_norm": 0.41888447028313114, + "learning_rate": 2.894117303468947e-05, + "loss": 0.8897, + "step": 9347 + }, + { + "epoch": 0.4662344139650873, + "grad_norm": 0.45841277909643624, + "learning_rate": 2.8937185029839507e-05, + "loss": 0.8836, + "step": 9348 + }, + { + "epoch": 0.466284289276808, + "grad_norm": 0.46862961502970263, + "learning_rate": 2.8933196922250468e-05, + "loss": 0.9111, + "step": 9349 + }, + { + "epoch": 0.46633416458852867, + "grad_norm": 0.38693099460733754, + "learning_rate": 2.8929208712026408e-05, + "loss": 0.8876, + "step": 9350 + }, + { + "epoch": 0.4663840399002494, + "grad_norm": 0.373577764363617, + "learning_rate": 2.8925220399271403e-05, + "loss": 0.8861, + "step": 9351 + }, + { + "epoch": 0.4664339152119701, + "grad_norm": 0.7399084510048073, + "learning_rate": 2.892123198408953e-05, + "loss": 0.8876, + "step": 9352 + }, + { + "epoch": 0.46648379052369077, + "grad_norm": 0.443817319753359, + "learning_rate": 2.891724346658487e-05, + "loss": 0.8185, + "step": 9353 + }, + { + "epoch": 0.46653366583541145, + "grad_norm": 0.48688469229650827, + "learning_rate": 2.8913254846861488e-05, + "loss": 0.909, + "step": 9354 + }, + { + "epoch": 0.4665835411471322, + "grad_norm": 0.5135382781212791, + "learning_rate": 2.8909266125023477e-05, + "loss": 0.8801, + "step": 9355 + }, + { + "epoch": 0.46663341645885287, + "grad_norm": 0.6959472037352193, + "learning_rate": 2.8905277301174917e-05, + "loss": 0.8887, + "step": 9356 + }, + { + "epoch": 0.46668329177057355, + "grad_norm": 0.4251172655454505, + "learning_rate": 2.8901288375419884e-05, + "loss": 0.9129, + "step": 9357 + }, + { + "epoch": 0.4667331670822943, + "grad_norm": 0.5014915050771865, + "learning_rate": 2.8897299347862482e-05, + "loss": 0.9166, + "step": 9358 + }, + { + "epoch": 0.46678304239401497, + "grad_norm": 0.44115468898631643, + "learning_rate": 2.8893310218606794e-05, + "loss": 0.8695, + "step": 9359 + }, + { + "epoch": 0.46683291770573565, + "grad_norm": 0.3996802246322243, + "learning_rate": 2.888932098775693e-05, + "loss": 0.8697, + "step": 9360 + }, + { + "epoch": 0.46688279301745633, + "grad_norm": 0.6140808582375888, + "learning_rate": 2.8885331655416965e-05, + "loss": 0.9176, + "step": 9361 + }, + { + "epoch": 0.46693266832917707, + "grad_norm": 0.4940410005333134, + "learning_rate": 2.888134222169101e-05, + "loss": 0.8871, + "step": 9362 + }, + { + "epoch": 0.46698254364089775, + "grad_norm": 0.43444454259971627, + "learning_rate": 2.8877352686683167e-05, + "loss": 0.8758, + "step": 9363 + }, + { + "epoch": 0.46703241895261843, + "grad_norm": 0.49074617997233955, + "learning_rate": 2.887336305049755e-05, + "loss": 0.9427, + "step": 9364 + }, + { + "epoch": 0.46708229426433917, + "grad_norm": 0.3825015183527974, + "learning_rate": 2.8869373313238247e-05, + "loss": 0.9012, + "step": 9365 + }, + { + "epoch": 0.46713216957605985, + "grad_norm": 0.4339547355690907, + "learning_rate": 2.8865383475009382e-05, + "loss": 0.9205, + "step": 9366 + }, + { + "epoch": 0.46718204488778053, + "grad_norm": 0.41019986289555405, + "learning_rate": 2.886139353591507e-05, + "loss": 0.9064, + "step": 9367 + }, + { + "epoch": 0.46723192019950127, + "grad_norm": 0.5067296742299441, + "learning_rate": 2.8857403496059415e-05, + "loss": 0.8364, + "step": 9368 + }, + { + "epoch": 0.46728179551122195, + "grad_norm": 0.4474298835818348, + "learning_rate": 2.8853413355546545e-05, + "loss": 0.9251, + "step": 9369 + }, + { + "epoch": 0.46733167082294264, + "grad_norm": 0.5656115302856434, + "learning_rate": 2.8849423114480577e-05, + "loss": 0.8563, + "step": 9370 + }, + { + "epoch": 0.4673815461346633, + "grad_norm": 0.40651073384742653, + "learning_rate": 2.884543277296563e-05, + "loss": 0.8668, + "step": 9371 + }, + { + "epoch": 0.46743142144638405, + "grad_norm": 0.5234601020485781, + "learning_rate": 2.884144233110585e-05, + "loss": 0.9094, + "step": 9372 + }, + { + "epoch": 0.46748129675810474, + "grad_norm": 0.3925483894862439, + "learning_rate": 2.8837451789005338e-05, + "loss": 0.8759, + "step": 9373 + }, + { + "epoch": 0.4675311720698254, + "grad_norm": 0.3776412215693654, + "learning_rate": 2.883346114676825e-05, + "loss": 0.8849, + "step": 9374 + }, + { + "epoch": 0.46758104738154616, + "grad_norm": 0.48196559643636855, + "learning_rate": 2.8829470404498697e-05, + "loss": 0.9079, + "step": 9375 + }, + { + "epoch": 0.46763092269326684, + "grad_norm": 0.43646785941637234, + "learning_rate": 2.882547956230084e-05, + "loss": 0.8548, + "step": 9376 + }, + { + "epoch": 0.4676807980049875, + "grad_norm": 1.1132343774949096, + "learning_rate": 2.8821488620278797e-05, + "loss": 0.8694, + "step": 9377 + }, + { + "epoch": 0.46773067331670826, + "grad_norm": 0.5516125661268955, + "learning_rate": 2.881749757853672e-05, + "loss": 0.8677, + "step": 9378 + }, + { + "epoch": 0.46778054862842894, + "grad_norm": 0.4200546329795407, + "learning_rate": 2.8813506437178757e-05, + "loss": 0.9026, + "step": 9379 + }, + { + "epoch": 0.4678304239401496, + "grad_norm": 0.4055129081726796, + "learning_rate": 2.8809515196309044e-05, + "loss": 0.8823, + "step": 9380 + }, + { + "epoch": 0.4678802992518703, + "grad_norm": 0.5032687076967245, + "learning_rate": 2.880552385603174e-05, + "loss": 0.9002, + "step": 9381 + }, + { + "epoch": 0.46793017456359104, + "grad_norm": 0.4487940557572808, + "learning_rate": 2.8801532416451e-05, + "loss": 0.9385, + "step": 9382 + }, + { + "epoch": 0.4679800498753117, + "grad_norm": 0.40239878885863767, + "learning_rate": 2.8797540877670965e-05, + "loss": 0.877, + "step": 9383 + }, + { + "epoch": 0.4680299251870324, + "grad_norm": 0.40230860086638054, + "learning_rate": 2.8793549239795802e-05, + "loss": 0.928, + "step": 9384 + }, + { + "epoch": 0.46807980049875314, + "grad_norm": 0.3755701476267659, + "learning_rate": 2.8789557502929663e-05, + "loss": 0.8782, + "step": 9385 + }, + { + "epoch": 0.4681296758104738, + "grad_norm": 0.4352096763985981, + "learning_rate": 2.8785565667176728e-05, + "loss": 0.8922, + "step": 9386 + }, + { + "epoch": 0.4681795511221945, + "grad_norm": 0.4806319816728, + "learning_rate": 2.878157373264115e-05, + "loss": 0.928, + "step": 9387 + }, + { + "epoch": 0.4682294264339152, + "grad_norm": 0.4975334944633831, + "learning_rate": 2.877758169942709e-05, + "loss": 0.8887, + "step": 9388 + }, + { + "epoch": 0.4682793017456359, + "grad_norm": 0.3993588405818988, + "learning_rate": 2.877358956763873e-05, + "loss": 0.8951, + "step": 9389 + }, + { + "epoch": 0.4683291770573566, + "grad_norm": 0.388428649816677, + "learning_rate": 2.8769597337380245e-05, + "loss": 0.8823, + "step": 9390 + }, + { + "epoch": 0.4683790523690773, + "grad_norm": 0.43604349371832296, + "learning_rate": 2.87656050087558e-05, + "loss": 0.8928, + "step": 9391 + }, + { + "epoch": 0.468428927680798, + "grad_norm": 0.7183947793884734, + "learning_rate": 2.876161258186958e-05, + "loss": 0.8922, + "step": 9392 + }, + { + "epoch": 0.4684788029925187, + "grad_norm": 0.7137049420537234, + "learning_rate": 2.8757620056825762e-05, + "loss": 0.9004, + "step": 9393 + }, + { + "epoch": 0.4685286783042394, + "grad_norm": 0.44332539623472117, + "learning_rate": 2.8753627433728526e-05, + "loss": 0.8816, + "step": 9394 + }, + { + "epoch": 0.4685785536159601, + "grad_norm": 0.5067454418747345, + "learning_rate": 2.8749634712682066e-05, + "loss": 0.8891, + "step": 9395 + }, + { + "epoch": 0.4686284289276808, + "grad_norm": 0.4484466934898249, + "learning_rate": 2.8745641893790565e-05, + "loss": 0.9216, + "step": 9396 + }, + { + "epoch": 0.4686783042394015, + "grad_norm": 0.4319262361229473, + "learning_rate": 2.8741648977158215e-05, + "loss": 0.9186, + "step": 9397 + }, + { + "epoch": 0.46872817955112217, + "grad_norm": 0.5236960366094461, + "learning_rate": 2.873765596288921e-05, + "loss": 0.8857, + "step": 9398 + }, + { + "epoch": 0.4687780548628429, + "grad_norm": 0.41077053875781727, + "learning_rate": 2.8733662851087743e-05, + "loss": 0.9048, + "step": 9399 + }, + { + "epoch": 0.4688279301745636, + "grad_norm": 0.394581553768744, + "learning_rate": 2.8729669641858015e-05, + "loss": 0.8671, + "step": 9400 + }, + { + "epoch": 0.46887780548628427, + "grad_norm": 0.7290266994059379, + "learning_rate": 2.872567633530423e-05, + "loss": 0.8901, + "step": 9401 + }, + { + "epoch": 0.468927680798005, + "grad_norm": 0.4609592199380678, + "learning_rate": 2.8721682931530586e-05, + "loss": 0.9098, + "step": 9402 + }, + { + "epoch": 0.4689775561097257, + "grad_norm": 0.46531708581641806, + "learning_rate": 2.8717689430641292e-05, + "loss": 0.9344, + "step": 9403 + }, + { + "epoch": 0.4690274314214464, + "grad_norm": 0.500626197807634, + "learning_rate": 2.8713695832740563e-05, + "loss": 0.9238, + "step": 9404 + }, + { + "epoch": 0.4690773067331671, + "grad_norm": 0.39964342024744476, + "learning_rate": 2.8709702137932593e-05, + "loss": 0.8847, + "step": 9405 + }, + { + "epoch": 0.4691271820448878, + "grad_norm": 0.4166418635318774, + "learning_rate": 2.8705708346321615e-05, + "loss": 0.8954, + "step": 9406 + }, + { + "epoch": 0.4691770573566085, + "grad_norm": 0.39445272853985525, + "learning_rate": 2.8701714458011824e-05, + "loss": 0.9001, + "step": 9407 + }, + { + "epoch": 0.46922693266832916, + "grad_norm": 0.5332230323393407, + "learning_rate": 2.8697720473107464e-05, + "loss": 0.8784, + "step": 9408 + }, + { + "epoch": 0.4692768079800499, + "grad_norm": 0.43442907695206656, + "learning_rate": 2.869372639171273e-05, + "loss": 0.8843, + "step": 9409 + }, + { + "epoch": 0.4693266832917706, + "grad_norm": 0.413614064926857, + "learning_rate": 2.868973221393187e-05, + "loss": 0.8927, + "step": 9410 + }, + { + "epoch": 0.46937655860349126, + "grad_norm": 0.4573494816323312, + "learning_rate": 2.8685737939869097e-05, + "loss": 0.8773, + "step": 9411 + }, + { + "epoch": 0.469426433915212, + "grad_norm": 0.6039243754097988, + "learning_rate": 2.868174356962864e-05, + "loss": 0.9127, + "step": 9412 + }, + { + "epoch": 0.4694763092269327, + "grad_norm": 0.4168285242945092, + "learning_rate": 2.8677749103314732e-05, + "loss": 0.9031, + "step": 9413 + }, + { + "epoch": 0.46952618453865336, + "grad_norm": 0.48420834971927557, + "learning_rate": 2.8673754541031606e-05, + "loss": 0.9164, + "step": 9414 + }, + { + "epoch": 0.46957605985037404, + "grad_norm": 0.38602632183096036, + "learning_rate": 2.86697598828835e-05, + "loss": 0.8742, + "step": 9415 + }, + { + "epoch": 0.4696259351620948, + "grad_norm": 0.4082189176988289, + "learning_rate": 2.8665765128974654e-05, + "loss": 0.912, + "step": 9416 + }, + { + "epoch": 0.46967581047381546, + "grad_norm": 0.466294240894234, + "learning_rate": 2.8661770279409307e-05, + "loss": 0.8428, + "step": 9417 + }, + { + "epoch": 0.46972568578553614, + "grad_norm": 0.42182243926083574, + "learning_rate": 2.8657775334291704e-05, + "loss": 0.8601, + "step": 9418 + }, + { + "epoch": 0.4697755610972569, + "grad_norm": 0.5255702671454581, + "learning_rate": 2.865378029372609e-05, + "loss": 0.8727, + "step": 9419 + }, + { + "epoch": 0.46982543640897756, + "grad_norm": 1.2104334397402843, + "learning_rate": 2.8649785157816713e-05, + "loss": 0.9201, + "step": 9420 + }, + { + "epoch": 0.46987531172069824, + "grad_norm": 0.7328397019546476, + "learning_rate": 2.8645789926667827e-05, + "loss": 0.9025, + "step": 9421 + }, + { + "epoch": 0.469925187032419, + "grad_norm": 1.2287806742451728, + "learning_rate": 2.8641794600383683e-05, + "loss": 0.866, + "step": 9422 + }, + { + "epoch": 0.46997506234413966, + "grad_norm": 0.45971794227772567, + "learning_rate": 2.863779917906854e-05, + "loss": 0.8714, + "step": 9423 + }, + { + "epoch": 0.47002493765586034, + "grad_norm": 0.43661832193070704, + "learning_rate": 2.863380366282665e-05, + "loss": 0.9029, + "step": 9424 + }, + { + "epoch": 0.470074812967581, + "grad_norm": 0.4391309069706433, + "learning_rate": 2.8629808051762292e-05, + "loss": 0.8583, + "step": 9425 + }, + { + "epoch": 0.47012468827930176, + "grad_norm": 0.5104729005473025, + "learning_rate": 2.8625812345979714e-05, + "loss": 0.8566, + "step": 9426 + }, + { + "epoch": 0.47017456359102244, + "grad_norm": 0.438382127538861, + "learning_rate": 2.862181654558318e-05, + "loss": 0.9129, + "step": 9427 + }, + { + "epoch": 0.4702244389027431, + "grad_norm": 0.42754123559222235, + "learning_rate": 2.8617820650676968e-05, + "loss": 0.9039, + "step": 9428 + }, + { + "epoch": 0.47027431421446386, + "grad_norm": 0.44693027503832, + "learning_rate": 2.861382466136534e-05, + "loss": 0.919, + "step": 9429 + }, + { + "epoch": 0.47032418952618454, + "grad_norm": 0.4315395774470326, + "learning_rate": 2.8609828577752583e-05, + "loss": 0.9211, + "step": 9430 + }, + { + "epoch": 0.4703740648379052, + "grad_norm": 0.46137578016335384, + "learning_rate": 2.8605832399942954e-05, + "loss": 0.8868, + "step": 9431 + }, + { + "epoch": 0.4704239401496259, + "grad_norm": 0.3862502563668897, + "learning_rate": 2.8601836128040756e-05, + "loss": 0.8375, + "step": 9432 + }, + { + "epoch": 0.47047381546134664, + "grad_norm": 0.4222591652420574, + "learning_rate": 2.8597839762150247e-05, + "loss": 0.9136, + "step": 9433 + }, + { + "epoch": 0.4705236907730673, + "grad_norm": 0.44973368766832406, + "learning_rate": 2.8593843302375724e-05, + "loss": 0.8821, + "step": 9434 + }, + { + "epoch": 0.470573566084788, + "grad_norm": 0.4483691054356633, + "learning_rate": 2.8589846748821464e-05, + "loss": 0.8746, + "step": 9435 + }, + { + "epoch": 0.47062344139650875, + "grad_norm": 0.46689858124358763, + "learning_rate": 2.858585010159176e-05, + "loss": 0.9054, + "step": 9436 + }, + { + "epoch": 0.47067331670822943, + "grad_norm": 0.4511725505480159, + "learning_rate": 2.8581853360790905e-05, + "loss": 0.9073, + "step": 9437 + }, + { + "epoch": 0.4707231920199501, + "grad_norm": 0.4308783287513358, + "learning_rate": 2.8577856526523187e-05, + "loss": 0.8978, + "step": 9438 + }, + { + "epoch": 0.47077306733167085, + "grad_norm": 0.4455434590021913, + "learning_rate": 2.857385959889291e-05, + "loss": 0.8324, + "step": 9439 + }, + { + "epoch": 0.47082294264339153, + "grad_norm": 0.4628314280259319, + "learning_rate": 2.8569862578004358e-05, + "loss": 0.8398, + "step": 9440 + }, + { + "epoch": 0.4708728179551122, + "grad_norm": 0.4493776972771144, + "learning_rate": 2.856586546396185e-05, + "loss": 0.9058, + "step": 9441 + }, + { + "epoch": 0.4709226932668329, + "grad_norm": 0.6598024954800038, + "learning_rate": 2.8561868256869667e-05, + "loss": 0.8914, + "step": 9442 + }, + { + "epoch": 0.47097256857855363, + "grad_norm": 0.40092804025712925, + "learning_rate": 2.8557870956832132e-05, + "loss": 0.9378, + "step": 9443 + }, + { + "epoch": 0.4710224438902743, + "grad_norm": 0.42043308086395564, + "learning_rate": 2.8553873563953548e-05, + "loss": 0.8797, + "step": 9444 + }, + { + "epoch": 0.471072319201995, + "grad_norm": 0.5185962062610421, + "learning_rate": 2.8549876078338222e-05, + "loss": 0.8902, + "step": 9445 + }, + { + "epoch": 0.47112219451371573, + "grad_norm": 0.5042210593646145, + "learning_rate": 2.8545878500090468e-05, + "loss": 0.9087, + "step": 9446 + }, + { + "epoch": 0.4711720698254364, + "grad_norm": 0.5403440183745478, + "learning_rate": 2.8541880829314604e-05, + "loss": 0.924, + "step": 9447 + }, + { + "epoch": 0.4712219451371571, + "grad_norm": 0.3990914239340673, + "learning_rate": 2.853788306611494e-05, + "loss": 0.8665, + "step": 9448 + }, + { + "epoch": 0.47127182044887783, + "grad_norm": 0.6652472900914096, + "learning_rate": 2.85338852105958e-05, + "loss": 0.9097, + "step": 9449 + }, + { + "epoch": 0.4713216957605985, + "grad_norm": 0.3828573025234425, + "learning_rate": 2.852988726286151e-05, + "loss": 0.8706, + "step": 9450 + }, + { + "epoch": 0.4713715710723192, + "grad_norm": 0.40646493647483173, + "learning_rate": 2.852588922301639e-05, + "loss": 0.8857, + "step": 9451 + }, + { + "epoch": 0.4714214463840399, + "grad_norm": 0.4230513442094099, + "learning_rate": 2.8521891091164778e-05, + "loss": 0.8903, + "step": 9452 + }, + { + "epoch": 0.4714713216957606, + "grad_norm": 0.5688367877142829, + "learning_rate": 2.851789286741098e-05, + "loss": 0.9165, + "step": 9453 + }, + { + "epoch": 0.4715211970074813, + "grad_norm": 0.45377317068323675, + "learning_rate": 2.8513894551859348e-05, + "loss": 0.905, + "step": 9454 + }, + { + "epoch": 0.471571072319202, + "grad_norm": 0.49185866469247125, + "learning_rate": 2.850989614461421e-05, + "loss": 0.9235, + "step": 9455 + }, + { + "epoch": 0.4716209476309227, + "grad_norm": 0.676905460235789, + "learning_rate": 2.8505897645779904e-05, + "loss": 0.8736, + "step": 9456 + }, + { + "epoch": 0.4716708229426434, + "grad_norm": 0.44802558431335227, + "learning_rate": 2.850189905546077e-05, + "loss": 0.8665, + "step": 9457 + }, + { + "epoch": 0.4717206982543641, + "grad_norm": 0.5249133286781411, + "learning_rate": 2.8497900373761145e-05, + "loss": 0.8273, + "step": 9458 + }, + { + "epoch": 0.47177057356608476, + "grad_norm": 0.48308596885650185, + "learning_rate": 2.8493901600785376e-05, + "loss": 0.9116, + "step": 9459 + }, + { + "epoch": 0.4718204488778055, + "grad_norm": 0.4336441913461283, + "learning_rate": 2.8489902736637807e-05, + "loss": 0.8627, + "step": 9460 + }, + { + "epoch": 0.4718703241895262, + "grad_norm": 0.44119446253292716, + "learning_rate": 2.848590378142279e-05, + "loss": 0.9086, + "step": 9461 + }, + { + "epoch": 0.47192019950124686, + "grad_norm": 0.48390119988072844, + "learning_rate": 2.8481904735244673e-05, + "loss": 0.9003, + "step": 9462 + }, + { + "epoch": 0.4719700748129676, + "grad_norm": 1.1492836505981743, + "learning_rate": 2.8477905598207814e-05, + "loss": 0.8334, + "step": 9463 + }, + { + "epoch": 0.4720199501246883, + "grad_norm": 0.43252556559289024, + "learning_rate": 2.847390637041656e-05, + "loss": 0.8769, + "step": 9464 + }, + { + "epoch": 0.47206982543640896, + "grad_norm": 0.550372717580915, + "learning_rate": 2.8469907051975276e-05, + "loss": 0.9135, + "step": 9465 + }, + { + "epoch": 0.4721197007481297, + "grad_norm": 0.5003493200711459, + "learning_rate": 2.8465907642988315e-05, + "loss": 0.9001, + "step": 9466 + }, + { + "epoch": 0.4721695760598504, + "grad_norm": 0.5543778496087223, + "learning_rate": 2.8461908143560058e-05, + "loss": 0.8647, + "step": 9467 + }, + { + "epoch": 0.47221945137157106, + "grad_norm": 0.4712570848039798, + "learning_rate": 2.8457908553794843e-05, + "loss": 0.9256, + "step": 9468 + }, + { + "epoch": 0.47226932668329175, + "grad_norm": 0.7737231113680406, + "learning_rate": 2.8453908873797058e-05, + "loss": 0.9432, + "step": 9469 + }, + { + "epoch": 0.4723192019950125, + "grad_norm": 0.900984768381548, + "learning_rate": 2.8449909103671064e-05, + "loss": 0.8912, + "step": 9470 + }, + { + "epoch": 0.47236907730673316, + "grad_norm": 0.4161302585092151, + "learning_rate": 2.844590924352124e-05, + "loss": 0.8644, + "step": 9471 + }, + { + "epoch": 0.47241895261845385, + "grad_norm": 0.5499190802303139, + "learning_rate": 2.8441909293451953e-05, + "loss": 0.9437, + "step": 9472 + }, + { + "epoch": 0.4724688279301746, + "grad_norm": 0.4569786643517095, + "learning_rate": 2.8437909253567592e-05, + "loss": 0.8738, + "step": 9473 + }, + { + "epoch": 0.47251870324189527, + "grad_norm": 0.4922722688277368, + "learning_rate": 2.8433909123972518e-05, + "loss": 0.8691, + "step": 9474 + }, + { + "epoch": 0.47256857855361595, + "grad_norm": 0.44569452851741315, + "learning_rate": 2.8429908904771123e-05, + "loss": 0.8716, + "step": 9475 + }, + { + "epoch": 0.4726184538653367, + "grad_norm": 0.39839351495319314, + "learning_rate": 2.8425908596067795e-05, + "loss": 0.8763, + "step": 9476 + }, + { + "epoch": 0.47266832917705737, + "grad_norm": 0.632910802823994, + "learning_rate": 2.8421908197966912e-05, + "loss": 0.8899, + "step": 9477 + }, + { + "epoch": 0.47271820448877805, + "grad_norm": 0.5276579435987301, + "learning_rate": 2.8417907710572867e-05, + "loss": 0.8923, + "step": 9478 + }, + { + "epoch": 0.47276807980049873, + "grad_norm": 0.5016863945776919, + "learning_rate": 2.8413907133990046e-05, + "loss": 0.8598, + "step": 9479 + }, + { + "epoch": 0.47281795511221947, + "grad_norm": 0.439648271995836, + "learning_rate": 2.8409906468322857e-05, + "loss": 0.9104, + "step": 9480 + }, + { + "epoch": 0.47286783042394015, + "grad_norm": 0.5374409070735833, + "learning_rate": 2.8405905713675672e-05, + "loss": 0.8439, + "step": 9481 + }, + { + "epoch": 0.47291770573566083, + "grad_norm": 0.5540053875190035, + "learning_rate": 2.8401904870152912e-05, + "loss": 0.8771, + "step": 9482 + }, + { + "epoch": 0.47296758104738157, + "grad_norm": 0.4572369684208894, + "learning_rate": 2.839790393785896e-05, + "loss": 0.9169, + "step": 9483 + }, + { + "epoch": 0.47301745635910225, + "grad_norm": 0.39929493870877447, + "learning_rate": 2.839390291689823e-05, + "loss": 0.8967, + "step": 9484 + }, + { + "epoch": 0.47306733167082293, + "grad_norm": 0.43621562829428245, + "learning_rate": 2.838990180737512e-05, + "loss": 0.8802, + "step": 9485 + }, + { + "epoch": 0.4731172069825436, + "grad_norm": 0.4720863308900259, + "learning_rate": 2.8385900609394034e-05, + "loss": 0.858, + "step": 9486 + }, + { + "epoch": 0.47316708229426435, + "grad_norm": 0.46066503683053517, + "learning_rate": 2.838189932305939e-05, + "loss": 0.9398, + "step": 9487 + }, + { + "epoch": 0.47321695760598503, + "grad_norm": 0.6686572987313638, + "learning_rate": 2.8377897948475602e-05, + "loss": 0.8914, + "step": 9488 + }, + { + "epoch": 0.4732668329177057, + "grad_norm": 0.9134105914769656, + "learning_rate": 2.8373896485747075e-05, + "loss": 0.84, + "step": 9489 + }, + { + "epoch": 0.47331670822942645, + "grad_norm": 0.42513101798923875, + "learning_rate": 2.836989493497823e-05, + "loss": 0.9461, + "step": 9490 + }, + { + "epoch": 0.47336658354114713, + "grad_norm": 0.6657428340730318, + "learning_rate": 2.8365893296273487e-05, + "loss": 0.9025, + "step": 9491 + }, + { + "epoch": 0.4734164588528678, + "grad_norm": 0.43295405862937975, + "learning_rate": 2.836189156973726e-05, + "loss": 0.9115, + "step": 9492 + }, + { + "epoch": 0.47346633416458855, + "grad_norm": 0.4461843388388517, + "learning_rate": 2.8357889755473983e-05, + "loss": 0.8532, + "step": 9493 + }, + { + "epoch": 0.47351620947630924, + "grad_norm": 0.9737251004424934, + "learning_rate": 2.8353887853588068e-05, + "loss": 0.8953, + "step": 9494 + }, + { + "epoch": 0.4735660847880299, + "grad_norm": 0.5998972498932694, + "learning_rate": 2.8349885864183955e-05, + "loss": 0.9085, + "step": 9495 + }, + { + "epoch": 0.4736159600997506, + "grad_norm": 0.5936331746362788, + "learning_rate": 2.834588378736607e-05, + "loss": 0.8914, + "step": 9496 + }, + { + "epoch": 0.47366583541147134, + "grad_norm": 0.6862020256683455, + "learning_rate": 2.834188162323885e-05, + "loss": 0.8667, + "step": 9497 + }, + { + "epoch": 0.473715710723192, + "grad_norm": 0.38861051357765447, + "learning_rate": 2.8337879371906717e-05, + "loss": 0.8611, + "step": 9498 + }, + { + "epoch": 0.4737655860349127, + "grad_norm": 0.4191960562856117, + "learning_rate": 2.8333877033474116e-05, + "loss": 0.8744, + "step": 9499 + }, + { + "epoch": 0.47381546134663344, + "grad_norm": 0.4404329772349016, + "learning_rate": 2.832987460804549e-05, + "loss": 0.9169, + "step": 9500 + }, + { + "epoch": 0.4738653366583541, + "grad_norm": 0.5135615808387372, + "learning_rate": 2.832587209572527e-05, + "loss": 0.9451, + "step": 9501 + }, + { + "epoch": 0.4739152119700748, + "grad_norm": 0.4857340919343942, + "learning_rate": 2.8321869496617913e-05, + "loss": 0.8199, + "step": 9502 + }, + { + "epoch": 0.47396508728179554, + "grad_norm": 0.45190850818335887, + "learning_rate": 2.8317866810827854e-05, + "loss": 0.8781, + "step": 9503 + }, + { + "epoch": 0.4740149625935162, + "grad_norm": 0.4240321183352839, + "learning_rate": 2.8313864038459546e-05, + "loss": 0.8853, + "step": 9504 + }, + { + "epoch": 0.4740648379052369, + "grad_norm": 0.8469049362883072, + "learning_rate": 2.830986117961744e-05, + "loss": 0.8945, + "step": 9505 + }, + { + "epoch": 0.4741147132169576, + "grad_norm": 0.5113178650594224, + "learning_rate": 2.830585823440599e-05, + "loss": 0.9161, + "step": 9506 + }, + { + "epoch": 0.4741645885286783, + "grad_norm": 0.5678451235751901, + "learning_rate": 2.8301855202929644e-05, + "loss": 0.8748, + "step": 9507 + }, + { + "epoch": 0.474214463840399, + "grad_norm": 0.4460615997698369, + "learning_rate": 2.829785208529287e-05, + "loss": 0.885, + "step": 9508 + }, + { + "epoch": 0.4742643391521197, + "grad_norm": 0.39892386460159207, + "learning_rate": 2.8293848881600123e-05, + "loss": 0.9207, + "step": 9509 + }, + { + "epoch": 0.4743142144638404, + "grad_norm": 0.540158301335553, + "learning_rate": 2.8289845591955854e-05, + "loss": 0.8304, + "step": 9510 + }, + { + "epoch": 0.4743640897755611, + "grad_norm": 0.5756363680962706, + "learning_rate": 2.8285842216464543e-05, + "loss": 0.862, + "step": 9511 + }, + { + "epoch": 0.4744139650872818, + "grad_norm": 0.7426662094382422, + "learning_rate": 2.8281838755230644e-05, + "loss": 0.9553, + "step": 9512 + }, + { + "epoch": 0.47446384039900247, + "grad_norm": 0.45381181340697796, + "learning_rate": 2.8277835208358637e-05, + "loss": 0.9099, + "step": 9513 + }, + { + "epoch": 0.4745137157107232, + "grad_norm": 0.4773598524567821, + "learning_rate": 2.8273831575952987e-05, + "loss": 0.8829, + "step": 9514 + }, + { + "epoch": 0.4745635910224439, + "grad_norm": 0.493167019538751, + "learning_rate": 2.826982785811817e-05, + "loss": 0.8692, + "step": 9515 + }, + { + "epoch": 0.47461346633416457, + "grad_norm": 0.45930832950013195, + "learning_rate": 2.826582405495865e-05, + "loss": 0.9024, + "step": 9516 + }, + { + "epoch": 0.4746633416458853, + "grad_norm": 0.4382676108705101, + "learning_rate": 2.826182016657892e-05, + "loss": 0.8715, + "step": 9517 + }, + { + "epoch": 0.474713216957606, + "grad_norm": 0.386271523888561, + "learning_rate": 2.8257816193083447e-05, + "loss": 0.8842, + "step": 9518 + }, + { + "epoch": 0.47476309226932667, + "grad_norm": 0.3748776985534481, + "learning_rate": 2.8253812134576723e-05, + "loss": 0.834, + "step": 9519 + }, + { + "epoch": 0.4748129675810474, + "grad_norm": 0.38680971240198675, + "learning_rate": 2.8249807991163224e-05, + "loss": 0.8818, + "step": 9520 + }, + { + "epoch": 0.4748628428927681, + "grad_norm": 0.47325145740577257, + "learning_rate": 2.824580376294745e-05, + "loss": 0.8678, + "step": 9521 + }, + { + "epoch": 0.47491271820448877, + "grad_norm": 0.716822622467074, + "learning_rate": 2.824179945003387e-05, + "loss": 0.8835, + "step": 9522 + }, + { + "epoch": 0.47496259351620945, + "grad_norm": 0.45075891701642257, + "learning_rate": 2.8237795052526982e-05, + "loss": 0.8707, + "step": 9523 + }, + { + "epoch": 0.4750124688279302, + "grad_norm": 0.5111401728191834, + "learning_rate": 2.8233790570531283e-05, + "loss": 0.9286, + "step": 9524 + }, + { + "epoch": 0.47506234413965087, + "grad_norm": 0.7962580149316051, + "learning_rate": 2.8229786004151265e-05, + "loss": 0.8404, + "step": 9525 + }, + { + "epoch": 0.47511221945137155, + "grad_norm": 0.5290625474475553, + "learning_rate": 2.822578135349143e-05, + "loss": 0.8509, + "step": 9526 + }, + { + "epoch": 0.4751620947630923, + "grad_norm": 0.436616688034821, + "learning_rate": 2.8221776618656275e-05, + "loss": 0.885, + "step": 9527 + }, + { + "epoch": 0.475211970074813, + "grad_norm": 0.489513024324253, + "learning_rate": 2.8217771799750304e-05, + "loss": 0.9436, + "step": 9528 + }, + { + "epoch": 0.47526184538653365, + "grad_norm": 0.4923427222414055, + "learning_rate": 2.8213766896878008e-05, + "loss": 0.8618, + "step": 9529 + }, + { + "epoch": 0.47531172069825434, + "grad_norm": 0.9107713064571239, + "learning_rate": 2.820976191014391e-05, + "loss": 0.8996, + "step": 9530 + }, + { + "epoch": 0.4753615960099751, + "grad_norm": 0.6890190811039704, + "learning_rate": 2.820575683965251e-05, + "loss": 0.9492, + "step": 9531 + }, + { + "epoch": 0.47541147132169576, + "grad_norm": 0.4451705710772446, + "learning_rate": 2.8201751685508326e-05, + "loss": 0.8725, + "step": 9532 + }, + { + "epoch": 0.47546134663341644, + "grad_norm": 0.43231055485586767, + "learning_rate": 2.819774644781586e-05, + "loss": 0.8597, + "step": 9533 + }, + { + "epoch": 0.4755112219451372, + "grad_norm": 0.8022165875091705, + "learning_rate": 2.8193741126679628e-05, + "loss": 0.8546, + "step": 9534 + }, + { + "epoch": 0.47556109725685786, + "grad_norm": 0.5038351520850378, + "learning_rate": 2.8189735722204158e-05, + "loss": 0.8961, + "step": 9535 + }, + { + "epoch": 0.47561097256857854, + "grad_norm": 0.4326450353303426, + "learning_rate": 2.8185730234493955e-05, + "loss": 0.8937, + "step": 9536 + }, + { + "epoch": 0.4756608478802993, + "grad_norm": 0.5322607741411166, + "learning_rate": 2.8181724663653554e-05, + "loss": 0.8905, + "step": 9537 + }, + { + "epoch": 0.47571072319201996, + "grad_norm": 0.4470833106903245, + "learning_rate": 2.817771900978746e-05, + "loss": 0.9289, + "step": 9538 + }, + { + "epoch": 0.47576059850374064, + "grad_norm": 0.6590166148631819, + "learning_rate": 2.8173713273000223e-05, + "loss": 0.8747, + "step": 9539 + }, + { + "epoch": 0.4758104738154613, + "grad_norm": 0.4373031169804182, + "learning_rate": 2.816970745339635e-05, + "loss": 0.8938, + "step": 9540 + }, + { + "epoch": 0.47586034912718206, + "grad_norm": 0.49108196917160035, + "learning_rate": 2.8165701551080387e-05, + "loss": 0.9126, + "step": 9541 + }, + { + "epoch": 0.47591022443890274, + "grad_norm": 0.4283122421888677, + "learning_rate": 2.8161695566156854e-05, + "loss": 0.9081, + "step": 9542 + }, + { + "epoch": 0.4759600997506234, + "grad_norm": 0.45604776410695613, + "learning_rate": 2.815768949873029e-05, + "loss": 0.8912, + "step": 9543 + }, + { + "epoch": 0.47600997506234416, + "grad_norm": 0.5239132746823693, + "learning_rate": 2.815368334890523e-05, + "loss": 0.8511, + "step": 9544 + }, + { + "epoch": 0.47605985037406484, + "grad_norm": 0.5570636473766801, + "learning_rate": 2.8149677116786222e-05, + "loss": 0.8726, + "step": 9545 + }, + { + "epoch": 0.4761097256857855, + "grad_norm": 0.4693047338904059, + "learning_rate": 2.8145670802477796e-05, + "loss": 0.9145, + "step": 9546 + }, + { + "epoch": 0.47615960099750626, + "grad_norm": 0.4143895824922924, + "learning_rate": 2.8141664406084495e-05, + "loss": 0.9172, + "step": 9547 + }, + { + "epoch": 0.47620947630922694, + "grad_norm": 0.3894666176362913, + "learning_rate": 2.813765792771087e-05, + "loss": 0.8448, + "step": 9548 + }, + { + "epoch": 0.4762593516209476, + "grad_norm": 0.5089612130029323, + "learning_rate": 2.8133651367461467e-05, + "loss": 0.8438, + "step": 9549 + }, + { + "epoch": 0.4763092269326683, + "grad_norm": 0.489310585370913, + "learning_rate": 2.812964472544083e-05, + "loss": 0.944, + "step": 9550 + }, + { + "epoch": 0.47635910224438904, + "grad_norm": 0.432000336951835, + "learning_rate": 2.8125638001753512e-05, + "loss": 0.895, + "step": 9551 + }, + { + "epoch": 0.4764089775561097, + "grad_norm": 0.4538040905077986, + "learning_rate": 2.8121631196504078e-05, + "loss": 0.8728, + "step": 9552 + }, + { + "epoch": 0.4764588528678304, + "grad_norm": 0.587054124616177, + "learning_rate": 2.8117624309797068e-05, + "loss": 0.9755, + "step": 9553 + }, + { + "epoch": 0.47650872817955114, + "grad_norm": 0.37411033785618886, + "learning_rate": 2.8113617341737053e-05, + "loss": 0.8901, + "step": 9554 + }, + { + "epoch": 0.4765586034912718, + "grad_norm": 0.45433077876774575, + "learning_rate": 2.8109610292428577e-05, + "loss": 0.8973, + "step": 9555 + }, + { + "epoch": 0.4766084788029925, + "grad_norm": 0.38270646926306834, + "learning_rate": 2.8105603161976222e-05, + "loss": 0.8238, + "step": 9556 + }, + { + "epoch": 0.4766583541147132, + "grad_norm": 0.40100993798811485, + "learning_rate": 2.810159595048454e-05, + "loss": 0.9245, + "step": 9557 + }, + { + "epoch": 0.4767082294264339, + "grad_norm": 0.5202173664325821, + "learning_rate": 2.8097588658058098e-05, + "loss": 0.8364, + "step": 9558 + }, + { + "epoch": 0.4767581047381546, + "grad_norm": 0.42284553041958606, + "learning_rate": 2.809358128480147e-05, + "loss": 0.8827, + "step": 9559 + }, + { + "epoch": 0.4768079800498753, + "grad_norm": 0.4135452798834122, + "learning_rate": 2.8089573830819214e-05, + "loss": 0.9233, + "step": 9560 + }, + { + "epoch": 0.47685785536159603, + "grad_norm": 0.593625606500947, + "learning_rate": 2.808556629621592e-05, + "loss": 0.8448, + "step": 9561 + }, + { + "epoch": 0.4769077306733167, + "grad_norm": 0.5889764579305105, + "learning_rate": 2.8081558681096154e-05, + "loss": 0.8886, + "step": 9562 + }, + { + "epoch": 0.4769576059850374, + "grad_norm": 0.3700480981985883, + "learning_rate": 2.8077550985564488e-05, + "loss": 0.8399, + "step": 9563 + }, + { + "epoch": 0.47700748129675813, + "grad_norm": 0.432688753643478, + "learning_rate": 2.8073543209725506e-05, + "loss": 0.8864, + "step": 9564 + }, + { + "epoch": 0.4770573566084788, + "grad_norm": 0.4424949666057453, + "learning_rate": 2.8069535353683796e-05, + "loss": 0.8913, + "step": 9565 + }, + { + "epoch": 0.4771072319201995, + "grad_norm": 1.1961448723749126, + "learning_rate": 2.806552741754393e-05, + "loss": 0.9204, + "step": 9566 + }, + { + "epoch": 0.4771571072319202, + "grad_norm": 0.4003780236453822, + "learning_rate": 2.8061519401410504e-05, + "loss": 0.8863, + "step": 9567 + }, + { + "epoch": 0.4772069825436409, + "grad_norm": 0.47883282266269694, + "learning_rate": 2.805751130538809e-05, + "loss": 0.8954, + "step": 9568 + }, + { + "epoch": 0.4772568578553616, + "grad_norm": 0.41800269298784415, + "learning_rate": 2.8053503129581292e-05, + "loss": 0.8901, + "step": 9569 + }, + { + "epoch": 0.4773067331670823, + "grad_norm": 0.4898006391693061, + "learning_rate": 2.80494948740947e-05, + "loss": 0.9381, + "step": 9570 + }, + { + "epoch": 0.477356608478803, + "grad_norm": 0.4064100280068391, + "learning_rate": 2.8045486539032895e-05, + "loss": 0.8747, + "step": 9571 + }, + { + "epoch": 0.4774064837905237, + "grad_norm": 0.4149732257507901, + "learning_rate": 2.8041478124500493e-05, + "loss": 0.8776, + "step": 9572 + }, + { + "epoch": 0.4774563591022444, + "grad_norm": 0.3626039710854422, + "learning_rate": 2.803746963060207e-05, + "loss": 0.8573, + "step": 9573 + }, + { + "epoch": 0.4775062344139651, + "grad_norm": 0.4974540306721721, + "learning_rate": 2.8033461057442246e-05, + "loss": 0.914, + "step": 9574 + }, + { + "epoch": 0.4775561097256858, + "grad_norm": 0.37771199445042697, + "learning_rate": 2.8029452405125605e-05, + "loss": 0.8945, + "step": 9575 + }, + { + "epoch": 0.4776059850374065, + "grad_norm": 0.4624898480754764, + "learning_rate": 2.8025443673756764e-05, + "loss": 0.9344, + "step": 9576 + }, + { + "epoch": 0.47765586034912716, + "grad_norm": 0.7982062163947543, + "learning_rate": 2.802143486344032e-05, + "loss": 0.8398, + "step": 9577 + }, + { + "epoch": 0.4777057356608479, + "grad_norm": 0.4929452956470902, + "learning_rate": 2.8017425974280893e-05, + "loss": 0.8816, + "step": 9578 + }, + { + "epoch": 0.4777556109725686, + "grad_norm": 0.6208819995429248, + "learning_rate": 2.8013417006383076e-05, + "loss": 0.8693, + "step": 9579 + }, + { + "epoch": 0.47780548628428926, + "grad_norm": 0.376896183148767, + "learning_rate": 2.8009407959851497e-05, + "loss": 0.841, + "step": 9580 + }, + { + "epoch": 0.47785536159601, + "grad_norm": 0.48550618722280203, + "learning_rate": 2.800539883479077e-05, + "loss": 0.8913, + "step": 9581 + }, + { + "epoch": 0.4779052369077307, + "grad_norm": 0.41577348713698214, + "learning_rate": 2.8001389631305493e-05, + "loss": 0.8965, + "step": 9582 + }, + { + "epoch": 0.47795511221945136, + "grad_norm": 0.3999648751906586, + "learning_rate": 2.7997380349500308e-05, + "loss": 0.887, + "step": 9583 + }, + { + "epoch": 0.47800498753117204, + "grad_norm": 0.3902278469866277, + "learning_rate": 2.7993370989479813e-05, + "loss": 0.9126, + "step": 9584 + }, + { + "epoch": 0.4780548628428928, + "grad_norm": 0.380618222647258, + "learning_rate": 2.7989361551348646e-05, + "loss": 0.8875, + "step": 9585 + }, + { + "epoch": 0.47810473815461346, + "grad_norm": 0.4605227671202773, + "learning_rate": 2.798535203521143e-05, + "loss": 0.8899, + "step": 9586 + }, + { + "epoch": 0.47815461346633414, + "grad_norm": 0.45459756429176257, + "learning_rate": 2.7981342441172788e-05, + "loss": 0.9478, + "step": 9587 + }, + { + "epoch": 0.4782044887780549, + "grad_norm": 0.3780042271822525, + "learning_rate": 2.7977332769337344e-05, + "loss": 0.9363, + "step": 9588 + }, + { + "epoch": 0.47825436408977556, + "grad_norm": 0.457409842275122, + "learning_rate": 2.7973323019809744e-05, + "loss": 0.9513, + "step": 9589 + }, + { + "epoch": 0.47830423940149625, + "grad_norm": 0.4355578863705848, + "learning_rate": 2.79693131926946e-05, + "loss": 0.8856, + "step": 9590 + }, + { + "epoch": 0.478354114713217, + "grad_norm": 0.44057692685691857, + "learning_rate": 2.7965303288096566e-05, + "loss": 0.8943, + "step": 9591 + }, + { + "epoch": 0.47840399002493766, + "grad_norm": 0.9552459589101001, + "learning_rate": 2.796129330612027e-05, + "loss": 0.8407, + "step": 9592 + }, + { + "epoch": 0.47845386533665835, + "grad_norm": 0.4001566941289514, + "learning_rate": 2.7957283246870342e-05, + "loss": 0.8782, + "step": 9593 + }, + { + "epoch": 0.47850374064837903, + "grad_norm": 0.4001265654726537, + "learning_rate": 2.7953273110451438e-05, + "loss": 0.8929, + "step": 9594 + }, + { + "epoch": 0.47855361596009977, + "grad_norm": 0.39407080176273435, + "learning_rate": 2.7949262896968185e-05, + "loss": 0.8581, + "step": 9595 + }, + { + "epoch": 0.47860349127182045, + "grad_norm": 0.36237898082310005, + "learning_rate": 2.794525260652524e-05, + "loss": 0.8748, + "step": 9596 + }, + { + "epoch": 0.47865336658354113, + "grad_norm": 0.3868310828878881, + "learning_rate": 2.7941242239227255e-05, + "loss": 0.8389, + "step": 9597 + }, + { + "epoch": 0.47870324189526187, + "grad_norm": 0.4156719335172132, + "learning_rate": 2.793723179517886e-05, + "loss": 0.9144, + "step": 9598 + }, + { + "epoch": 0.47875311720698255, + "grad_norm": 0.4571061802659266, + "learning_rate": 2.7933221274484723e-05, + "loss": 0.9271, + "step": 9599 + }, + { + "epoch": 0.47880299251870323, + "grad_norm": 0.48986694221586335, + "learning_rate": 2.7929210677249483e-05, + "loss": 0.8816, + "step": 9600 + }, + { + "epoch": 0.47885286783042397, + "grad_norm": 0.4236323105997673, + "learning_rate": 2.79252000035778e-05, + "loss": 0.8331, + "step": 9601 + }, + { + "epoch": 0.47890274314214465, + "grad_norm": 0.8334828263997618, + "learning_rate": 2.7921189253574342e-05, + "loss": 0.8575, + "step": 9602 + }, + { + "epoch": 0.47895261845386533, + "grad_norm": 0.3693318719736381, + "learning_rate": 2.791717842734375e-05, + "loss": 0.8641, + "step": 9603 + }, + { + "epoch": 0.479002493765586, + "grad_norm": 0.4644991814999406, + "learning_rate": 2.7913167524990698e-05, + "loss": 0.8484, + "step": 9604 + }, + { + "epoch": 0.47905236907730675, + "grad_norm": 0.42192602120120776, + "learning_rate": 2.790915654661984e-05, + "loss": 0.9148, + "step": 9605 + }, + { + "epoch": 0.47910224438902743, + "grad_norm": 0.40087906886058294, + "learning_rate": 2.7905145492335842e-05, + "loss": 0.8997, + "step": 9606 + }, + { + "epoch": 0.4791521197007481, + "grad_norm": 0.4327302236205298, + "learning_rate": 2.790113436224338e-05, + "loss": 0.8909, + "step": 9607 + }, + { + "epoch": 0.47920199501246885, + "grad_norm": 0.4080939339871654, + "learning_rate": 2.7897123156447108e-05, + "loss": 0.8288, + "step": 9608 + }, + { + "epoch": 0.47925187032418953, + "grad_norm": 0.6088169681565204, + "learning_rate": 2.789311187505171e-05, + "loss": 0.9158, + "step": 9609 + }, + { + "epoch": 0.4793017456359102, + "grad_norm": 0.4393471001799088, + "learning_rate": 2.788910051816184e-05, + "loss": 0.8532, + "step": 9610 + }, + { + "epoch": 0.4793516209476309, + "grad_norm": 0.391338170923682, + "learning_rate": 2.78850890858822e-05, + "loss": 0.9143, + "step": 9611 + }, + { + "epoch": 0.47940149625935163, + "grad_norm": 0.6045488469332196, + "learning_rate": 2.7881077578317444e-05, + "loss": 0.8632, + "step": 9612 + }, + { + "epoch": 0.4794513715710723, + "grad_norm": 0.4302559955710915, + "learning_rate": 2.787706599557226e-05, + "loss": 0.8815, + "step": 9613 + }, + { + "epoch": 0.479501246882793, + "grad_norm": 0.48662691059459057, + "learning_rate": 2.787305433775132e-05, + "loss": 0.8806, + "step": 9614 + }, + { + "epoch": 0.47955112219451373, + "grad_norm": 0.5207662793358254, + "learning_rate": 2.7869042604959322e-05, + "loss": 0.9464, + "step": 9615 + }, + { + "epoch": 0.4796009975062344, + "grad_norm": 0.40785762673175113, + "learning_rate": 2.786503079730094e-05, + "loss": 0.9239, + "step": 9616 + }, + { + "epoch": 0.4796508728179551, + "grad_norm": 0.41215330667501715, + "learning_rate": 2.7861018914880854e-05, + "loss": 0.912, + "step": 9617 + }, + { + "epoch": 0.47970074812967584, + "grad_norm": 0.41040393015055826, + "learning_rate": 2.7857006957803773e-05, + "loss": 0.886, + "step": 9618 + }, + { + "epoch": 0.4797506234413965, + "grad_norm": 0.4250626483920374, + "learning_rate": 2.785299492617436e-05, + "loss": 0.8925, + "step": 9619 + }, + { + "epoch": 0.4798004987531172, + "grad_norm": 0.42852460300351, + "learning_rate": 2.784898282009733e-05, + "loss": 0.9025, + "step": 9620 + }, + { + "epoch": 0.4798503740648379, + "grad_norm": 0.39185015576011967, + "learning_rate": 2.784497063967736e-05, + "loss": 0.9278, + "step": 9621 + }, + { + "epoch": 0.4799002493765586, + "grad_norm": 0.3847603197123116, + "learning_rate": 2.7840958385019167e-05, + "loss": 0.8746, + "step": 9622 + }, + { + "epoch": 0.4799501246882793, + "grad_norm": 0.3717297216909295, + "learning_rate": 2.7836946056227426e-05, + "loss": 0.8683, + "step": 9623 + }, + { + "epoch": 0.48, + "grad_norm": 0.4144023519438977, + "learning_rate": 2.783293365340685e-05, + "loss": 0.8712, + "step": 9624 + }, + { + "epoch": 0.4800498753117207, + "grad_norm": 0.4448877351136352, + "learning_rate": 2.7828921176662136e-05, + "loss": 0.8711, + "step": 9625 + }, + { + "epoch": 0.4800997506234414, + "grad_norm": 0.5636590656738775, + "learning_rate": 2.7824908626097997e-05, + "loss": 0.8916, + "step": 9626 + }, + { + "epoch": 0.4801496259351621, + "grad_norm": 0.40553876800315464, + "learning_rate": 2.7820896001819124e-05, + "loss": 0.9007, + "step": 9627 + }, + { + "epoch": 0.4801995012468828, + "grad_norm": 0.4597362344378829, + "learning_rate": 2.7816883303930236e-05, + "loss": 0.912, + "step": 9628 + }, + { + "epoch": 0.4802493765586035, + "grad_norm": 0.49975979514060753, + "learning_rate": 2.7812870532536045e-05, + "loss": 0.9106, + "step": 9629 + }, + { + "epoch": 0.4802992518703242, + "grad_norm": 0.4012581139785555, + "learning_rate": 2.7808857687741245e-05, + "loss": 0.8956, + "step": 9630 + }, + { + "epoch": 0.48034912718204487, + "grad_norm": 1.401774393757542, + "learning_rate": 2.7804844769650567e-05, + "loss": 0.9041, + "step": 9631 + }, + { + "epoch": 0.4803990024937656, + "grad_norm": 0.41303985474402516, + "learning_rate": 2.7800831778368717e-05, + "loss": 0.9111, + "step": 9632 + }, + { + "epoch": 0.4804488778054863, + "grad_norm": 0.4710791413565303, + "learning_rate": 2.7796818714000416e-05, + "loss": 0.9028, + "step": 9633 + }, + { + "epoch": 0.48049875311720697, + "grad_norm": 0.4633229045355273, + "learning_rate": 2.7792805576650384e-05, + "loss": 0.8559, + "step": 9634 + }, + { + "epoch": 0.4805486284289277, + "grad_norm": 0.397534641220356, + "learning_rate": 2.7788792366423335e-05, + "loss": 0.8881, + "step": 9635 + }, + { + "epoch": 0.4805985037406484, + "grad_norm": 1.1191771556566323, + "learning_rate": 2.7784779083424005e-05, + "loss": 0.9039, + "step": 9636 + }, + { + "epoch": 0.48064837905236907, + "grad_norm": 0.4148013041763215, + "learning_rate": 2.778076572775711e-05, + "loss": 0.8846, + "step": 9637 + }, + { + "epoch": 0.48069825436408975, + "grad_norm": 0.714540519962874, + "learning_rate": 2.7776752299527374e-05, + "loss": 0.8767, + "step": 9638 + }, + { + "epoch": 0.4807481296758105, + "grad_norm": 0.44741158240838086, + "learning_rate": 2.7772738798839527e-05, + "loss": 0.9071, + "step": 9639 + }, + { + "epoch": 0.48079800498753117, + "grad_norm": 0.7607816892906173, + "learning_rate": 2.776872522579831e-05, + "loss": 0.9201, + "step": 9640 + }, + { + "epoch": 0.48084788029925185, + "grad_norm": 0.7458371413973399, + "learning_rate": 2.776471158050844e-05, + "loss": 0.8948, + "step": 9641 + }, + { + "epoch": 0.4808977556109726, + "grad_norm": 0.3960297210482143, + "learning_rate": 2.7760697863074666e-05, + "loss": 0.9349, + "step": 9642 + }, + { + "epoch": 0.48094763092269327, + "grad_norm": 0.42556631032286396, + "learning_rate": 2.775668407360171e-05, + "loss": 0.9189, + "step": 9643 + }, + { + "epoch": 0.48099750623441395, + "grad_norm": 0.5375806482890668, + "learning_rate": 2.775267021219432e-05, + "loss": 0.8922, + "step": 9644 + }, + { + "epoch": 0.4810473815461347, + "grad_norm": 0.43685409522322366, + "learning_rate": 2.774865627895723e-05, + "loss": 0.9312, + "step": 9645 + }, + { + "epoch": 0.48109725685785537, + "grad_norm": 0.38396191558032194, + "learning_rate": 2.7744642273995192e-05, + "loss": 0.8816, + "step": 9646 + }, + { + "epoch": 0.48114713216957605, + "grad_norm": 0.4767987969592784, + "learning_rate": 2.774062819741293e-05, + "loss": 0.8604, + "step": 9647 + }, + { + "epoch": 0.48119700748129673, + "grad_norm": 0.41111045729216067, + "learning_rate": 2.7736614049315213e-05, + "loss": 0.8357, + "step": 9648 + }, + { + "epoch": 0.48124688279301747, + "grad_norm": 0.5705843012103556, + "learning_rate": 2.773259982980677e-05, + "loss": 0.8394, + "step": 9649 + }, + { + "epoch": 0.48129675810473815, + "grad_norm": 0.4340339606372272, + "learning_rate": 2.7728585538992363e-05, + "loss": 0.8926, + "step": 9650 + }, + { + "epoch": 0.48134663341645884, + "grad_norm": 0.40629384432520455, + "learning_rate": 2.7724571176976732e-05, + "loss": 0.8801, + "step": 9651 + }, + { + "epoch": 0.4813965087281796, + "grad_norm": 0.5020555323896281, + "learning_rate": 2.7720556743864646e-05, + "loss": 0.8954, + "step": 9652 + }, + { + "epoch": 0.48144638403990025, + "grad_norm": 0.39367250921564584, + "learning_rate": 2.7716542239760844e-05, + "loss": 0.8729, + "step": 9653 + }, + { + "epoch": 0.48149625935162094, + "grad_norm": 0.4153399098536559, + "learning_rate": 2.771252766477008e-05, + "loss": 0.9125, + "step": 9654 + }, + { + "epoch": 0.4815461346633416, + "grad_norm": 0.4104619663876804, + "learning_rate": 2.770851301899713e-05, + "loss": 0.913, + "step": 9655 + }, + { + "epoch": 0.48159600997506236, + "grad_norm": 0.40834199235171503, + "learning_rate": 2.7704498302546738e-05, + "loss": 0.857, + "step": 9656 + }, + { + "epoch": 0.48164588528678304, + "grad_norm": 0.44685882734335525, + "learning_rate": 2.770048351552368e-05, + "loss": 0.9081, + "step": 9657 + }, + { + "epoch": 0.4816957605985037, + "grad_norm": 0.4233165015624143, + "learning_rate": 2.7696468658032708e-05, + "loss": 0.8768, + "step": 9658 + }, + { + "epoch": 0.48174563591022446, + "grad_norm": 0.5187325207527923, + "learning_rate": 2.7692453730178597e-05, + "loss": 0.8717, + "step": 9659 + }, + { + "epoch": 0.48179551122194514, + "grad_norm": 0.4734566144549104, + "learning_rate": 2.768843873206611e-05, + "loss": 0.9222, + "step": 9660 + }, + { + "epoch": 0.4818453865336658, + "grad_norm": 0.480537286735882, + "learning_rate": 2.7684423663800014e-05, + "loss": 0.8719, + "step": 9661 + }, + { + "epoch": 0.48189526184538656, + "grad_norm": 0.4272273731299174, + "learning_rate": 2.7680408525485087e-05, + "loss": 0.8888, + "step": 9662 + }, + { + "epoch": 0.48194513715710724, + "grad_norm": 0.4262768923751927, + "learning_rate": 2.7676393317226096e-05, + "loss": 0.89, + "step": 9663 + }, + { + "epoch": 0.4819950124688279, + "grad_norm": 0.4125120742664408, + "learning_rate": 2.767237803912783e-05, + "loss": 0.9258, + "step": 9664 + }, + { + "epoch": 0.4820448877805486, + "grad_norm": 0.4630387722400964, + "learning_rate": 2.766836269129504e-05, + "loss": 0.9048, + "step": 9665 + }, + { + "epoch": 0.48209476309226934, + "grad_norm": 0.463745324995781, + "learning_rate": 2.7664347273832532e-05, + "loss": 0.9187, + "step": 9666 + }, + { + "epoch": 0.48214463840399, + "grad_norm": 0.4106428624046568, + "learning_rate": 2.766033178684506e-05, + "loss": 0.868, + "step": 9667 + }, + { + "epoch": 0.4821945137157107, + "grad_norm": 0.4275988169383625, + "learning_rate": 2.7656316230437436e-05, + "loss": 0.9193, + "step": 9668 + }, + { + "epoch": 0.48224438902743144, + "grad_norm": 0.4691323638487961, + "learning_rate": 2.7652300604714422e-05, + "loss": 0.9125, + "step": 9669 + }, + { + "epoch": 0.4822942643391521, + "grad_norm": 0.4334947887149732, + "learning_rate": 2.764828490978081e-05, + "loss": 0.8844, + "step": 9670 + }, + { + "epoch": 0.4823441396508728, + "grad_norm": 0.4170169786822818, + "learning_rate": 2.7644269145741388e-05, + "loss": 0.9298, + "step": 9671 + }, + { + "epoch": 0.48239401496259354, + "grad_norm": 0.4300247227943529, + "learning_rate": 2.7640253312700944e-05, + "loss": 0.9138, + "step": 9672 + }, + { + "epoch": 0.4824438902743142, + "grad_norm": 0.4863364978040629, + "learning_rate": 2.7636237410764277e-05, + "loss": 0.9053, + "step": 9673 + }, + { + "epoch": 0.4824937655860349, + "grad_norm": 0.42665233151283477, + "learning_rate": 2.7632221440036164e-05, + "loss": 0.9216, + "step": 9674 + }, + { + "epoch": 0.4825436408977556, + "grad_norm": 0.4112690006753609, + "learning_rate": 2.7628205400621425e-05, + "loss": 0.9066, + "step": 9675 + }, + { + "epoch": 0.4825935162094763, + "grad_norm": 0.5574786471099835, + "learning_rate": 2.7624189292624837e-05, + "loss": 0.8537, + "step": 9676 + }, + { + "epoch": 0.482643391521197, + "grad_norm": 0.4147026735294843, + "learning_rate": 2.7620173116151205e-05, + "loss": 0.8968, + "step": 9677 + }, + { + "epoch": 0.4826932668329177, + "grad_norm": 0.4047477827233855, + "learning_rate": 2.761615687130532e-05, + "loss": 0.8644, + "step": 9678 + }, + { + "epoch": 0.4827431421446384, + "grad_norm": 0.5026714390550489, + "learning_rate": 2.7612140558192e-05, + "loss": 0.9155, + "step": 9679 + }, + { + "epoch": 0.4827930174563591, + "grad_norm": 0.5337390586476961, + "learning_rate": 2.7608124176916038e-05, + "loss": 0.901, + "step": 9680 + }, + { + "epoch": 0.4828428927680798, + "grad_norm": 0.45372864036201643, + "learning_rate": 2.760410772758225e-05, + "loss": 0.9341, + "step": 9681 + }, + { + "epoch": 0.48289276807980047, + "grad_norm": 0.42204661953727307, + "learning_rate": 2.7600091210295427e-05, + "loss": 0.9318, + "step": 9682 + }, + { + "epoch": 0.4829426433915212, + "grad_norm": 0.421499149027868, + "learning_rate": 2.7596074625160396e-05, + "loss": 0.8908, + "step": 9683 + }, + { + "epoch": 0.4829925187032419, + "grad_norm": 0.528651603411698, + "learning_rate": 2.759205797228195e-05, + "loss": 0.8677, + "step": 9684 + }, + { + "epoch": 0.4830423940149626, + "grad_norm": 0.43270456378834116, + "learning_rate": 2.7588041251764917e-05, + "loss": 0.8649, + "step": 9685 + }, + { + "epoch": 0.4830922693266833, + "grad_norm": 0.44663594417278757, + "learning_rate": 2.7584024463714104e-05, + "loss": 0.9017, + "step": 9686 + }, + { + "epoch": 0.483142144638404, + "grad_norm": 0.46658987536960833, + "learning_rate": 2.758000760823433e-05, + "loss": 0.8607, + "step": 9687 + }, + { + "epoch": 0.4831920199501247, + "grad_norm": 0.5077202780773311, + "learning_rate": 2.7575990685430415e-05, + "loss": 0.8755, + "step": 9688 + }, + { + "epoch": 0.4832418952618454, + "grad_norm": 0.4618858337272568, + "learning_rate": 2.757197369540717e-05, + "loss": 0.8773, + "step": 9689 + }, + { + "epoch": 0.4832917705735661, + "grad_norm": 0.4376703973381259, + "learning_rate": 2.7567956638269433e-05, + "loss": 0.9266, + "step": 9690 + }, + { + "epoch": 0.4833416458852868, + "grad_norm": 0.4332764978872372, + "learning_rate": 2.7563939514122005e-05, + "loss": 0.9092, + "step": 9691 + }, + { + "epoch": 0.48339152119700746, + "grad_norm": 0.7048436244098567, + "learning_rate": 2.7559922323069727e-05, + "loss": 0.8807, + "step": 9692 + }, + { + "epoch": 0.4834413965087282, + "grad_norm": 1.1698165880795832, + "learning_rate": 2.755590506521742e-05, + "loss": 0.92, + "step": 9693 + }, + { + "epoch": 0.4834912718204489, + "grad_norm": 0.4220841877800839, + "learning_rate": 2.755188774066992e-05, + "loss": 0.9384, + "step": 9694 + }, + { + "epoch": 0.48354114713216956, + "grad_norm": 0.6342659888764841, + "learning_rate": 2.7547870349532044e-05, + "loss": 0.8845, + "step": 9695 + }, + { + "epoch": 0.4835910224438903, + "grad_norm": 0.4046115075077683, + "learning_rate": 2.754385289190864e-05, + "loss": 0.8671, + "step": 9696 + }, + { + "epoch": 0.483640897755611, + "grad_norm": 0.43949649077317704, + "learning_rate": 2.753983536790452e-05, + "loss": 0.929, + "step": 9697 + }, + { + "epoch": 0.48369077306733166, + "grad_norm": 0.4933237469458607, + "learning_rate": 2.7535817777624545e-05, + "loss": 0.8866, + "step": 9698 + }, + { + "epoch": 0.4837406483790524, + "grad_norm": 0.5871155960752408, + "learning_rate": 2.753180012117354e-05, + "loss": 0.927, + "step": 9699 + }, + { + "epoch": 0.4837905236907731, + "grad_norm": 0.46958632573867554, + "learning_rate": 2.7527782398656336e-05, + "loss": 0.8892, + "step": 9700 + }, + { + "epoch": 0.48384039900249376, + "grad_norm": 0.40966229590086095, + "learning_rate": 2.752376461017779e-05, + "loss": 0.942, + "step": 9701 + }, + { + "epoch": 0.48389027431421444, + "grad_norm": 0.450040600055551, + "learning_rate": 2.7519746755842728e-05, + "loss": 0.884, + "step": 9702 + }, + { + "epoch": 0.4839401496259352, + "grad_norm": 0.7696770492990814, + "learning_rate": 2.7515728835756006e-05, + "loss": 0.8776, + "step": 9703 + }, + { + "epoch": 0.48399002493765586, + "grad_norm": 0.46163573846393346, + "learning_rate": 2.7511710850022466e-05, + "loss": 0.8733, + "step": 9704 + }, + { + "epoch": 0.48403990024937654, + "grad_norm": 0.3957032846838247, + "learning_rate": 2.7507692798746952e-05, + "loss": 0.8832, + "step": 9705 + }, + { + "epoch": 0.4840897755610973, + "grad_norm": 0.39948441604136653, + "learning_rate": 2.750367468203432e-05, + "loss": 0.8659, + "step": 9706 + }, + { + "epoch": 0.48413965087281796, + "grad_norm": 0.40235901635576654, + "learning_rate": 2.7499656499989424e-05, + "loss": 0.8824, + "step": 9707 + }, + { + "epoch": 0.48418952618453864, + "grad_norm": 0.4166495533845926, + "learning_rate": 2.74956382527171e-05, + "loss": 0.8556, + "step": 9708 + }, + { + "epoch": 0.4842394014962593, + "grad_norm": 0.3826746593088063, + "learning_rate": 2.7491619940322217e-05, + "loss": 0.8595, + "step": 9709 + }, + { + "epoch": 0.48428927680798006, + "grad_norm": 0.4516239736723229, + "learning_rate": 2.748760156290962e-05, + "loss": 0.8615, + "step": 9710 + }, + { + "epoch": 0.48433915211970074, + "grad_norm": 0.37575463191525577, + "learning_rate": 2.748358312058418e-05, + "loss": 0.8474, + "step": 9711 + }, + { + "epoch": 0.4843890274314214, + "grad_norm": 0.4595386829442084, + "learning_rate": 2.7479564613450753e-05, + "loss": 0.9612, + "step": 9712 + }, + { + "epoch": 0.48443890274314216, + "grad_norm": 0.5002158541452738, + "learning_rate": 2.747554604161419e-05, + "loss": 0.8542, + "step": 9713 + }, + { + "epoch": 0.48448877805486285, + "grad_norm": 0.3778277177805333, + "learning_rate": 2.7471527405179366e-05, + "loss": 0.8535, + "step": 9714 + }, + { + "epoch": 0.4845386533665835, + "grad_norm": 0.39450031845196765, + "learning_rate": 2.7467508704251137e-05, + "loss": 0.8638, + "step": 9715 + }, + { + "epoch": 0.48458852867830426, + "grad_norm": 0.36872558455666843, + "learning_rate": 2.7463489938934378e-05, + "loss": 0.9473, + "step": 9716 + }, + { + "epoch": 0.48463840399002495, + "grad_norm": 0.4093115129403778, + "learning_rate": 2.7459471109333947e-05, + "loss": 0.8458, + "step": 9717 + }, + { + "epoch": 0.48468827930174563, + "grad_norm": 0.46441977307002485, + "learning_rate": 2.7455452215554724e-05, + "loss": 0.9109, + "step": 9718 + }, + { + "epoch": 0.4847381546134663, + "grad_norm": 0.3421967959933839, + "learning_rate": 2.7451433257701564e-05, + "loss": 0.8298, + "step": 9719 + }, + { + "epoch": 0.48478802992518705, + "grad_norm": 0.40663862061936296, + "learning_rate": 2.744741423587936e-05, + "loss": 0.91, + "step": 9720 + }, + { + "epoch": 0.48483790523690773, + "grad_norm": 0.4115468425187992, + "learning_rate": 2.744339515019297e-05, + "loss": 0.8443, + "step": 9721 + }, + { + "epoch": 0.4848877805486284, + "grad_norm": 0.43417820814371805, + "learning_rate": 2.743937600074728e-05, + "loss": 0.8575, + "step": 9722 + }, + { + "epoch": 0.48493765586034915, + "grad_norm": 0.4202495499769075, + "learning_rate": 2.743535678764717e-05, + "loss": 0.9115, + "step": 9723 + }, + { + "epoch": 0.48498753117206983, + "grad_norm": 0.6420775492090313, + "learning_rate": 2.7431337510997505e-05, + "loss": 0.8972, + "step": 9724 + }, + { + "epoch": 0.4850374064837905, + "grad_norm": 0.4790001103609705, + "learning_rate": 2.742731817090318e-05, + "loss": 0.8392, + "step": 9725 + }, + { + "epoch": 0.48508728179551125, + "grad_norm": 0.41121000945438557, + "learning_rate": 2.7423298767469076e-05, + "loss": 0.9025, + "step": 9726 + }, + { + "epoch": 0.48513715710723193, + "grad_norm": 0.3848900490182493, + "learning_rate": 2.7419279300800078e-05, + "loss": 0.8817, + "step": 9727 + }, + { + "epoch": 0.4851870324189526, + "grad_norm": 0.4383892346433354, + "learning_rate": 2.741525977100106e-05, + "loss": 0.9107, + "step": 9728 + }, + { + "epoch": 0.4852369077306733, + "grad_norm": 0.4382915876532596, + "learning_rate": 2.7411240178176927e-05, + "loss": 0.9272, + "step": 9729 + }, + { + "epoch": 0.48528678304239403, + "grad_norm": 0.4182325013551415, + "learning_rate": 2.7407220522432553e-05, + "loss": 0.9174, + "step": 9730 + }, + { + "epoch": 0.4853366583541147, + "grad_norm": 0.4487312302558042, + "learning_rate": 2.7403200803872843e-05, + "loss": 0.8542, + "step": 9731 + }, + { + "epoch": 0.4853865336658354, + "grad_norm": 0.41308114496576365, + "learning_rate": 2.7399181022602683e-05, + "loss": 0.8593, + "step": 9732 + }, + { + "epoch": 0.48543640897755613, + "grad_norm": 0.4357708317216642, + "learning_rate": 2.739516117872697e-05, + "loss": 0.9036, + "step": 9733 + }, + { + "epoch": 0.4854862842892768, + "grad_norm": 0.4719559520590568, + "learning_rate": 2.739114127235059e-05, + "loss": 0.9359, + "step": 9734 + }, + { + "epoch": 0.4855361596009975, + "grad_norm": 0.518805758125185, + "learning_rate": 2.7387121303578456e-05, + "loss": 0.899, + "step": 9735 + }, + { + "epoch": 0.4855860349127182, + "grad_norm": 0.43957700287500906, + "learning_rate": 2.738310127251546e-05, + "loss": 0.9031, + "step": 9736 + }, + { + "epoch": 0.4856359102244389, + "grad_norm": 0.40578850930210814, + "learning_rate": 2.73790811792665e-05, + "loss": 0.8665, + "step": 9737 + }, + { + "epoch": 0.4856857855361596, + "grad_norm": 0.4418199074259627, + "learning_rate": 2.7375061023936482e-05, + "loss": 0.8359, + "step": 9738 + }, + { + "epoch": 0.4857356608478803, + "grad_norm": 0.6479908890275737, + "learning_rate": 2.7371040806630305e-05, + "loss": 0.8989, + "step": 9739 + }, + { + "epoch": 0.485785536159601, + "grad_norm": 0.5255218913281446, + "learning_rate": 2.7367020527452887e-05, + "loss": 0.8985, + "step": 9740 + }, + { + "epoch": 0.4858354114713217, + "grad_norm": 0.6128582461337059, + "learning_rate": 2.736300018650912e-05, + "loss": 0.9227, + "step": 9741 + }, + { + "epoch": 0.4858852867830424, + "grad_norm": 0.39841461810862905, + "learning_rate": 2.7358979783903927e-05, + "loss": 0.8812, + "step": 9742 + }, + { + "epoch": 0.4859351620947631, + "grad_norm": 0.44077349509976343, + "learning_rate": 2.7354959319742208e-05, + "loss": 0.888, + "step": 9743 + }, + { + "epoch": 0.4859850374064838, + "grad_norm": 0.4752478464600547, + "learning_rate": 2.7350938794128882e-05, + "loss": 0.9444, + "step": 9744 + }, + { + "epoch": 0.4860349127182045, + "grad_norm": 0.45293899675494426, + "learning_rate": 2.734691820716886e-05, + "loss": 0.8927, + "step": 9745 + }, + { + "epoch": 0.48608478802992516, + "grad_norm": 0.705766755844972, + "learning_rate": 2.7342897558967058e-05, + "loss": 0.9042, + "step": 9746 + }, + { + "epoch": 0.4861346633416459, + "grad_norm": 0.40653120969555284, + "learning_rate": 2.7338876849628386e-05, + "loss": 0.9202, + "step": 9747 + }, + { + "epoch": 0.4861845386533666, + "grad_norm": 0.4679401006745026, + "learning_rate": 2.7334856079257775e-05, + "loss": 0.885, + "step": 9748 + }, + { + "epoch": 0.48623441396508726, + "grad_norm": 0.4673772392911669, + "learning_rate": 2.7330835247960145e-05, + "loss": 0.8651, + "step": 9749 + }, + { + "epoch": 0.486284289276808, + "grad_norm": 0.5302481723142491, + "learning_rate": 2.7326814355840404e-05, + "loss": 0.8812, + "step": 9750 + }, + { + "epoch": 0.4863341645885287, + "grad_norm": 0.5189811467458489, + "learning_rate": 2.7322793403003487e-05, + "loss": 0.9106, + "step": 9751 + }, + { + "epoch": 0.48638403990024937, + "grad_norm": 0.45929624561880245, + "learning_rate": 2.7318772389554315e-05, + "loss": 0.914, + "step": 9752 + }, + { + "epoch": 0.48643391521197005, + "grad_norm": 0.457172657389415, + "learning_rate": 2.731475131559782e-05, + "loss": 0.8833, + "step": 9753 + }, + { + "epoch": 0.4864837905236908, + "grad_norm": 0.3930065903895728, + "learning_rate": 2.731073018123892e-05, + "loss": 0.8942, + "step": 9754 + }, + { + "epoch": 0.48653366583541147, + "grad_norm": 0.4959946426578077, + "learning_rate": 2.7306708986582553e-05, + "loss": 0.8846, + "step": 9755 + }, + { + "epoch": 0.48658354114713215, + "grad_norm": 0.46891560586331643, + "learning_rate": 2.730268773173364e-05, + "loss": 0.9106, + "step": 9756 + }, + { + "epoch": 0.4866334164588529, + "grad_norm": 0.5911605213605515, + "learning_rate": 2.729866641679713e-05, + "loss": 0.9141, + "step": 9757 + }, + { + "epoch": 0.48668329177057357, + "grad_norm": 0.42578281908378174, + "learning_rate": 2.7294645041877947e-05, + "loss": 0.8595, + "step": 9758 + }, + { + "epoch": 0.48673316708229425, + "grad_norm": 0.4817764977051447, + "learning_rate": 2.7290623607081028e-05, + "loss": 0.8783, + "step": 9759 + }, + { + "epoch": 0.486783042394015, + "grad_norm": 0.4072451996174287, + "learning_rate": 2.728660211251131e-05, + "loss": 0.8863, + "step": 9760 + }, + { + "epoch": 0.48683291770573567, + "grad_norm": 0.4748031825590785, + "learning_rate": 2.7282580558273734e-05, + "loss": 0.8794, + "step": 9761 + }, + { + "epoch": 0.48688279301745635, + "grad_norm": 0.5796350322768021, + "learning_rate": 2.7278558944473244e-05, + "loss": 0.8945, + "step": 9762 + }, + { + "epoch": 0.48693266832917703, + "grad_norm": 0.5991521046816991, + "learning_rate": 2.7274537271214768e-05, + "loss": 0.9063, + "step": 9763 + }, + { + "epoch": 0.48698254364089777, + "grad_norm": 1.725946338961854, + "learning_rate": 2.727051553860327e-05, + "loss": 0.9402, + "step": 9764 + }, + { + "epoch": 0.48703241895261845, + "grad_norm": 0.42756790994715776, + "learning_rate": 2.7266493746743675e-05, + "loss": 0.9144, + "step": 9765 + }, + { + "epoch": 0.48708229426433913, + "grad_norm": 0.6876907881945583, + "learning_rate": 2.726247189574095e-05, + "loss": 0.8574, + "step": 9766 + }, + { + "epoch": 0.48713216957605987, + "grad_norm": 0.47550909021951976, + "learning_rate": 2.7258449985700024e-05, + "loss": 0.8632, + "step": 9767 + }, + { + "epoch": 0.48718204488778055, + "grad_norm": 0.4115760092888273, + "learning_rate": 2.7254428016725863e-05, + "loss": 0.8552, + "step": 9768 + }, + { + "epoch": 0.48723192019950123, + "grad_norm": 0.47423574353869774, + "learning_rate": 2.725040598892341e-05, + "loss": 0.9208, + "step": 9769 + }, + { + "epoch": 0.48728179551122197, + "grad_norm": 0.49822221177800946, + "learning_rate": 2.724638390239762e-05, + "loss": 0.8918, + "step": 9770 + }, + { + "epoch": 0.48733167082294265, + "grad_norm": 0.5114507621626352, + "learning_rate": 2.7242361757253454e-05, + "loss": 0.872, + "step": 9771 + }, + { + "epoch": 0.48738154613466333, + "grad_norm": 0.40069717379578373, + "learning_rate": 2.7238339553595848e-05, + "loss": 0.8775, + "step": 9772 + }, + { + "epoch": 0.487431421446384, + "grad_norm": 0.49569242205870667, + "learning_rate": 2.7234317291529788e-05, + "loss": 0.8935, + "step": 9773 + }, + { + "epoch": 0.48748129675810475, + "grad_norm": 0.4281591367807285, + "learning_rate": 2.7230294971160207e-05, + "loss": 0.8808, + "step": 9774 + }, + { + "epoch": 0.48753117206982544, + "grad_norm": 0.554926544195074, + "learning_rate": 2.7226272592592084e-05, + "loss": 0.9133, + "step": 9775 + }, + { + "epoch": 0.4875810473815461, + "grad_norm": 0.4588617047904649, + "learning_rate": 2.7222250155930368e-05, + "loss": 0.8896, + "step": 9776 + }, + { + "epoch": 0.48763092269326686, + "grad_norm": 0.5542942407980925, + "learning_rate": 2.721822766128003e-05, + "loss": 0.909, + "step": 9777 + }, + { + "epoch": 0.48768079800498754, + "grad_norm": 0.42959471529847676, + "learning_rate": 2.721420510874603e-05, + "loss": 0.8565, + "step": 9778 + }, + { + "epoch": 0.4877306733167082, + "grad_norm": 0.4024050280270112, + "learning_rate": 2.721018249843335e-05, + "loss": 0.8917, + "step": 9779 + }, + { + "epoch": 0.4877805486284289, + "grad_norm": 0.4851040385966828, + "learning_rate": 2.7206159830446936e-05, + "loss": 0.8746, + "step": 9780 + }, + { + "epoch": 0.48783042394014964, + "grad_norm": 0.39568645882006764, + "learning_rate": 2.720213710489178e-05, + "loss": 0.8474, + "step": 9781 + }, + { + "epoch": 0.4878802992518703, + "grad_norm": 0.47060719248742594, + "learning_rate": 2.719811432187283e-05, + "loss": 0.8721, + "step": 9782 + }, + { + "epoch": 0.487930174563591, + "grad_norm": 0.41961109075203845, + "learning_rate": 2.7194091481495076e-05, + "loss": 0.9027, + "step": 9783 + }, + { + "epoch": 0.48798004987531174, + "grad_norm": 0.5256473624001059, + "learning_rate": 2.719006858386348e-05, + "loss": 0.8983, + "step": 9784 + }, + { + "epoch": 0.4880299251870324, + "grad_norm": 0.48548116382168033, + "learning_rate": 2.718604562908304e-05, + "loss": 0.9205, + "step": 9785 + }, + { + "epoch": 0.4880798004987531, + "grad_norm": 0.4180415226674377, + "learning_rate": 2.71820226172587e-05, + "loss": 0.8563, + "step": 9786 + }, + { + "epoch": 0.48812967581047384, + "grad_norm": 0.42586657089847224, + "learning_rate": 2.7177999548495464e-05, + "loss": 0.8782, + "step": 9787 + }, + { + "epoch": 0.4881795511221945, + "grad_norm": 0.4026112313993584, + "learning_rate": 2.7173976422898308e-05, + "loss": 0.8445, + "step": 9788 + }, + { + "epoch": 0.4882294264339152, + "grad_norm": 0.49925162085679364, + "learning_rate": 2.7169953240572204e-05, + "loss": 0.8452, + "step": 9789 + }, + { + "epoch": 0.4882793017456359, + "grad_norm": 0.5516735527865604, + "learning_rate": 2.716593000162215e-05, + "loss": 0.8839, + "step": 9790 + }, + { + "epoch": 0.4883291770573566, + "grad_norm": 0.39977769577750294, + "learning_rate": 2.716190670615311e-05, + "loss": 0.8936, + "step": 9791 + }, + { + "epoch": 0.4883790523690773, + "grad_norm": 0.47554490244657194, + "learning_rate": 2.715788335427009e-05, + "loss": 0.8839, + "step": 9792 + }, + { + "epoch": 0.488428927680798, + "grad_norm": 0.4055141249552206, + "learning_rate": 2.715385994607807e-05, + "loss": 0.8999, + "step": 9793 + }, + { + "epoch": 0.4884788029925187, + "grad_norm": 0.44854394361431055, + "learning_rate": 2.714983648168204e-05, + "loss": 0.942, + "step": 9794 + }, + { + "epoch": 0.4885286783042394, + "grad_norm": 0.49876047682228486, + "learning_rate": 2.7145812961186988e-05, + "loss": 0.8863, + "step": 9795 + }, + { + "epoch": 0.4885785536159601, + "grad_norm": 0.4844762182933438, + "learning_rate": 2.71417893846979e-05, + "loss": 0.9014, + "step": 9796 + }, + { + "epoch": 0.4886284289276808, + "grad_norm": 0.5267177207096821, + "learning_rate": 2.7137765752319787e-05, + "loss": 0.8926, + "step": 9797 + }, + { + "epoch": 0.4886783042394015, + "grad_norm": 0.45964411407685746, + "learning_rate": 2.7133742064157625e-05, + "loss": 0.8907, + "step": 9798 + }, + { + "epoch": 0.4887281795511222, + "grad_norm": 0.8512469427027632, + "learning_rate": 2.712971832031642e-05, + "loss": 0.9385, + "step": 9799 + }, + { + "epoch": 0.48877805486284287, + "grad_norm": 0.3926059713061083, + "learning_rate": 2.7125694520901167e-05, + "loss": 0.8733, + "step": 9800 + }, + { + "epoch": 0.4888279301745636, + "grad_norm": 0.9033429238025362, + "learning_rate": 2.7121670666016873e-05, + "loss": 0.8785, + "step": 9801 + }, + { + "epoch": 0.4888778054862843, + "grad_norm": 0.44524599129337994, + "learning_rate": 2.7117646755768534e-05, + "loss": 0.9171, + "step": 9802 + }, + { + "epoch": 0.48892768079800497, + "grad_norm": 0.4901034183673837, + "learning_rate": 2.7113622790261144e-05, + "loss": 0.8452, + "step": 9803 + }, + { + "epoch": 0.4889775561097257, + "grad_norm": 0.5557519768915281, + "learning_rate": 2.7109598769599716e-05, + "loss": 0.8898, + "step": 9804 + }, + { + "epoch": 0.4890274314214464, + "grad_norm": 0.5584764505274934, + "learning_rate": 2.710557469388925e-05, + "loss": 0.9138, + "step": 9805 + }, + { + "epoch": 0.48907730673316707, + "grad_norm": 0.5051689950136645, + "learning_rate": 2.7101550563234763e-05, + "loss": 0.9255, + "step": 9806 + }, + { + "epoch": 0.48912718204488775, + "grad_norm": 0.45895332775391756, + "learning_rate": 2.7097526377741245e-05, + "loss": 0.9076, + "step": 9807 + }, + { + "epoch": 0.4891770573566085, + "grad_norm": 0.5048077203380253, + "learning_rate": 2.709350213751372e-05, + "loss": 0.8749, + "step": 9808 + }, + { + "epoch": 0.4892269326683292, + "grad_norm": 0.36751096896095814, + "learning_rate": 2.708947784265719e-05, + "loss": 0.8468, + "step": 9809 + }, + { + "epoch": 0.48927680798004985, + "grad_norm": 0.5546127285784394, + "learning_rate": 2.7085453493276676e-05, + "loss": 0.9247, + "step": 9810 + }, + { + "epoch": 0.4893266832917706, + "grad_norm": 0.553774655503055, + "learning_rate": 2.708142908947718e-05, + "loss": 0.9058, + "step": 9811 + }, + { + "epoch": 0.4893765586034913, + "grad_norm": 0.5794252351316656, + "learning_rate": 2.707740463136373e-05, + "loss": 0.915, + "step": 9812 + }, + { + "epoch": 0.48942643391521196, + "grad_norm": 0.41968438136998687, + "learning_rate": 2.7073380119041325e-05, + "loss": 0.8628, + "step": 9813 + }, + { + "epoch": 0.4894763092269327, + "grad_norm": 0.3716262149853638, + "learning_rate": 2.7069355552615007e-05, + "loss": 0.8776, + "step": 9814 + }, + { + "epoch": 0.4895261845386534, + "grad_norm": 0.47882461711211277, + "learning_rate": 2.706533093218977e-05, + "loss": 0.9139, + "step": 9815 + }, + { + "epoch": 0.48957605985037406, + "grad_norm": 0.479898470096798, + "learning_rate": 2.7061306257870654e-05, + "loss": 0.9318, + "step": 9816 + }, + { + "epoch": 0.48962593516209474, + "grad_norm": 0.45516146432095966, + "learning_rate": 2.7057281529762673e-05, + "loss": 0.902, + "step": 9817 + }, + { + "epoch": 0.4896758104738155, + "grad_norm": 0.4715231195366142, + "learning_rate": 2.7053256747970852e-05, + "loss": 0.8603, + "step": 9818 + }, + { + "epoch": 0.48972568578553616, + "grad_norm": 0.5590411252061839, + "learning_rate": 2.704923191260021e-05, + "loss": 0.9063, + "step": 9819 + }, + { + "epoch": 0.48977556109725684, + "grad_norm": 0.6595314957339083, + "learning_rate": 2.7045207023755787e-05, + "loss": 0.9091, + "step": 9820 + }, + { + "epoch": 0.4898254364089776, + "grad_norm": 0.47402749645527714, + "learning_rate": 2.7041182081542593e-05, + "loss": 0.9308, + "step": 9821 + }, + { + "epoch": 0.48987531172069826, + "grad_norm": 0.40424648567931015, + "learning_rate": 2.7037157086065674e-05, + "loss": 0.8908, + "step": 9822 + }, + { + "epoch": 0.48992518703241894, + "grad_norm": 0.4763560682113941, + "learning_rate": 2.703313203743004e-05, + "loss": 0.8845, + "step": 9823 + }, + { + "epoch": 0.4899750623441397, + "grad_norm": 0.5615908196606938, + "learning_rate": 2.702910693574075e-05, + "loss": 0.8841, + "step": 9824 + }, + { + "epoch": 0.49002493765586036, + "grad_norm": 0.5630879839437416, + "learning_rate": 2.7025081781102818e-05, + "loss": 0.902, + "step": 9825 + }, + { + "epoch": 0.49007481296758104, + "grad_norm": 0.3988920197124216, + "learning_rate": 2.7021056573621277e-05, + "loss": 0.9034, + "step": 9826 + }, + { + "epoch": 0.4901246882793017, + "grad_norm": 0.49398116723393015, + "learning_rate": 2.7017031313401175e-05, + "loss": 0.8971, + "step": 9827 + }, + { + "epoch": 0.49017456359102246, + "grad_norm": 0.5711589193921304, + "learning_rate": 2.7013006000547543e-05, + "loss": 0.8871, + "step": 9828 + }, + { + "epoch": 0.49022443890274314, + "grad_norm": 0.4575266543903386, + "learning_rate": 2.700898063516542e-05, + "loss": 0.878, + "step": 9829 + }, + { + "epoch": 0.4902743142144638, + "grad_norm": 0.41591551735631654, + "learning_rate": 2.7004955217359852e-05, + "loss": 0.9232, + "step": 9830 + }, + { + "epoch": 0.49032418952618456, + "grad_norm": 0.8213623729441742, + "learning_rate": 2.7000929747235865e-05, + "loss": 0.9199, + "step": 9831 + }, + { + "epoch": 0.49037406483790524, + "grad_norm": 0.5073171296784419, + "learning_rate": 2.6996904224898516e-05, + "loss": 0.8849, + "step": 9832 + }, + { + "epoch": 0.4904239401496259, + "grad_norm": 0.6060494198559601, + "learning_rate": 2.6992878650452846e-05, + "loss": 0.8599, + "step": 9833 + }, + { + "epoch": 0.4904738154613466, + "grad_norm": 0.5613067021826806, + "learning_rate": 2.69888530240039e-05, + "loss": 0.8886, + "step": 9834 + }, + { + "epoch": 0.49052369077306734, + "grad_norm": 0.8230941887947664, + "learning_rate": 2.698482734565672e-05, + "loss": 0.9298, + "step": 9835 + }, + { + "epoch": 0.490573566084788, + "grad_norm": 0.4173425362065395, + "learning_rate": 2.6980801615516366e-05, + "loss": 0.8788, + "step": 9836 + }, + { + "epoch": 0.4906234413965087, + "grad_norm": 0.5373424705812139, + "learning_rate": 2.6976775833687873e-05, + "loss": 0.9401, + "step": 9837 + }, + { + "epoch": 0.49067331670822945, + "grad_norm": 0.49541552314996656, + "learning_rate": 2.6972750000276302e-05, + "loss": 0.8803, + "step": 9838 + }, + { + "epoch": 0.4907231920199501, + "grad_norm": 0.5112998425792923, + "learning_rate": 2.69687241153867e-05, + "loss": 0.8983, + "step": 9839 + }, + { + "epoch": 0.4907730673316708, + "grad_norm": 0.5734024700842763, + "learning_rate": 2.696469817912413e-05, + "loss": 0.9343, + "step": 9840 + }, + { + "epoch": 0.49082294264339155, + "grad_norm": 0.46372517991682854, + "learning_rate": 2.696067219159364e-05, + "loss": 0.8265, + "step": 9841 + }, + { + "epoch": 0.49087281795511223, + "grad_norm": 0.48988536516316905, + "learning_rate": 2.6956646152900277e-05, + "loss": 0.9163, + "step": 9842 + }, + { + "epoch": 0.4909226932668329, + "grad_norm": 0.601380241822815, + "learning_rate": 2.695262006314912e-05, + "loss": 0.8926, + "step": 9843 + }, + { + "epoch": 0.4909725685785536, + "grad_norm": 0.4353820867235383, + "learning_rate": 2.6948593922445208e-05, + "loss": 0.88, + "step": 9844 + }, + { + "epoch": 0.49102244389027433, + "grad_norm": 0.3953739019756932, + "learning_rate": 2.6944567730893616e-05, + "loss": 0.8914, + "step": 9845 + }, + { + "epoch": 0.491072319201995, + "grad_norm": 0.6412438276764344, + "learning_rate": 2.694054148859939e-05, + "loss": 0.8775, + "step": 9846 + }, + { + "epoch": 0.4911221945137157, + "grad_norm": 0.5225043311653429, + "learning_rate": 2.6936515195667612e-05, + "loss": 0.8685, + "step": 9847 + }, + { + "epoch": 0.49117206982543643, + "grad_norm": 0.643052827822746, + "learning_rate": 2.6932488852203332e-05, + "loss": 0.9018, + "step": 9848 + }, + { + "epoch": 0.4912219451371571, + "grad_norm": 0.8238827237806066, + "learning_rate": 2.6928462458311622e-05, + "loss": 0.8422, + "step": 9849 + }, + { + "epoch": 0.4912718204488778, + "grad_norm": 0.4026206412638441, + "learning_rate": 2.6924436014097543e-05, + "loss": 0.8665, + "step": 9850 + }, + { + "epoch": 0.49132169576059853, + "grad_norm": 0.5684113217665455, + "learning_rate": 2.6920409519666174e-05, + "loss": 0.8939, + "step": 9851 + }, + { + "epoch": 0.4913715710723192, + "grad_norm": 0.46924941906080336, + "learning_rate": 2.6916382975122573e-05, + "loss": 0.9194, + "step": 9852 + }, + { + "epoch": 0.4914214463840399, + "grad_norm": 0.5815736822605806, + "learning_rate": 2.6912356380571818e-05, + "loss": 0.8882, + "step": 9853 + }, + { + "epoch": 0.4914713216957606, + "grad_norm": 1.9787128630339088, + "learning_rate": 2.690832973611898e-05, + "loss": 0.9048, + "step": 9854 + }, + { + "epoch": 0.4915211970074813, + "grad_norm": 0.5703892625325861, + "learning_rate": 2.6904303041869134e-05, + "loss": 0.9074, + "step": 9855 + }, + { + "epoch": 0.491571072319202, + "grad_norm": 0.7496855428049541, + "learning_rate": 2.690027629792735e-05, + "loss": 0.8566, + "step": 9856 + }, + { + "epoch": 0.4916209476309227, + "grad_norm": 0.5044594171854947, + "learning_rate": 2.68962495043987e-05, + "loss": 0.8734, + "step": 9857 + }, + { + "epoch": 0.4916708229426434, + "grad_norm": 0.6208936615867471, + "learning_rate": 2.6892222661388272e-05, + "loss": 0.9079, + "step": 9858 + }, + { + "epoch": 0.4917206982543641, + "grad_norm": 0.5814252698683307, + "learning_rate": 2.6888195769001146e-05, + "loss": 0.9032, + "step": 9859 + }, + { + "epoch": 0.4917705735660848, + "grad_norm": 0.5670842456624382, + "learning_rate": 2.6884168827342387e-05, + "loss": 0.8716, + "step": 9860 + }, + { + "epoch": 0.49182044887780546, + "grad_norm": 0.4457457821693814, + "learning_rate": 2.6880141836517093e-05, + "loss": 0.9457, + "step": 9861 + }, + { + "epoch": 0.4918703241895262, + "grad_norm": 0.470265672123836, + "learning_rate": 2.6876114796630336e-05, + "loss": 0.9482, + "step": 9862 + }, + { + "epoch": 0.4919201995012469, + "grad_norm": 0.4054264653366805, + "learning_rate": 2.6872087707787203e-05, + "loss": 0.8713, + "step": 9863 + }, + { + "epoch": 0.49197007481296756, + "grad_norm": 0.38721236006058035, + "learning_rate": 2.686806057009278e-05, + "loss": 0.8591, + "step": 9864 + }, + { + "epoch": 0.4920199501246883, + "grad_norm": 0.4799397879315422, + "learning_rate": 2.6864033383652155e-05, + "loss": 0.8831, + "step": 9865 + }, + { + "epoch": 0.492069825436409, + "grad_norm": 0.4925370287076167, + "learning_rate": 2.686000614857041e-05, + "loss": 0.9193, + "step": 9866 + }, + { + "epoch": 0.49211970074812966, + "grad_norm": 0.46904439545207816, + "learning_rate": 2.6855978864952636e-05, + "loss": 0.8913, + "step": 9867 + }, + { + "epoch": 0.4921695760598504, + "grad_norm": 0.42245031903371605, + "learning_rate": 2.6851951532903924e-05, + "loss": 0.8519, + "step": 9868 + }, + { + "epoch": 0.4922194513715711, + "grad_norm": 0.3587773580221455, + "learning_rate": 2.684792415252937e-05, + "loss": 0.8507, + "step": 9869 + }, + { + "epoch": 0.49226932668329176, + "grad_norm": 0.7212804193825887, + "learning_rate": 2.684389672393406e-05, + "loss": 0.9703, + "step": 9870 + }, + { + "epoch": 0.49231920199501245, + "grad_norm": 0.49337467561170756, + "learning_rate": 2.6839869247223092e-05, + "loss": 0.8831, + "step": 9871 + }, + { + "epoch": 0.4923690773067332, + "grad_norm": 0.4671024143224166, + "learning_rate": 2.6835841722501554e-05, + "loss": 0.8739, + "step": 9872 + }, + { + "epoch": 0.49241895261845386, + "grad_norm": 0.4128109438956545, + "learning_rate": 2.683181414987455e-05, + "loss": 0.8522, + "step": 9873 + }, + { + "epoch": 0.49246882793017455, + "grad_norm": 0.43219049252207675, + "learning_rate": 2.6827786529447176e-05, + "loss": 0.8741, + "step": 9874 + }, + { + "epoch": 0.4925187032418953, + "grad_norm": 0.38444507757053265, + "learning_rate": 2.682375886132453e-05, + "loss": 0.8762, + "step": 9875 + }, + { + "epoch": 0.49256857855361597, + "grad_norm": 0.5366804615109755, + "learning_rate": 2.6819731145611714e-05, + "loss": 0.8636, + "step": 9876 + }, + { + "epoch": 0.49261845386533665, + "grad_norm": 0.6012395152751772, + "learning_rate": 2.681570338241383e-05, + "loss": 0.9061, + "step": 9877 + }, + { + "epoch": 0.49266832917705733, + "grad_norm": 0.41026820804680864, + "learning_rate": 2.6811675571835982e-05, + "loss": 0.8919, + "step": 9878 + }, + { + "epoch": 0.49271820448877807, + "grad_norm": 0.5861593815566006, + "learning_rate": 2.6807647713983264e-05, + "loss": 0.8814, + "step": 9879 + }, + { + "epoch": 0.49276807980049875, + "grad_norm": 0.4810433954305573, + "learning_rate": 2.680361980896079e-05, + "loss": 0.8385, + "step": 9880 + }, + { + "epoch": 0.49281795511221943, + "grad_norm": 0.5478516774150588, + "learning_rate": 2.6799591856873663e-05, + "loss": 0.8935, + "step": 9881 + }, + { + "epoch": 0.49286783042394017, + "grad_norm": 0.5509468692818138, + "learning_rate": 2.6795563857826993e-05, + "loss": 0.9208, + "step": 9882 + }, + { + "epoch": 0.49291770573566085, + "grad_norm": 0.4593764052967424, + "learning_rate": 2.6791535811925883e-05, + "loss": 0.8833, + "step": 9883 + }, + { + "epoch": 0.49296758104738153, + "grad_norm": 0.46007783010675984, + "learning_rate": 2.6787507719275455e-05, + "loss": 0.8612, + "step": 9884 + }, + { + "epoch": 0.49301745635910227, + "grad_norm": 0.5025598317017494, + "learning_rate": 2.6783479579980807e-05, + "loss": 0.8869, + "step": 9885 + }, + { + "epoch": 0.49306733167082295, + "grad_norm": 0.3887795529883376, + "learning_rate": 2.6779451394147066e-05, + "loss": 0.882, + "step": 9886 + }, + { + "epoch": 0.49311720698254363, + "grad_norm": 0.5410630845250877, + "learning_rate": 2.677542316187933e-05, + "loss": 0.8932, + "step": 9887 + }, + { + "epoch": 0.4931670822942643, + "grad_norm": 0.5666363231138275, + "learning_rate": 2.6771394883282725e-05, + "loss": 0.9184, + "step": 9888 + }, + { + "epoch": 0.49321695760598505, + "grad_norm": 0.6690873065784059, + "learning_rate": 2.6767366558462365e-05, + "loss": 0.8879, + "step": 9889 + }, + { + "epoch": 0.49326683291770573, + "grad_norm": 0.5109793568656458, + "learning_rate": 2.6763338187523356e-05, + "loss": 0.9169, + "step": 9890 + }, + { + "epoch": 0.4933167082294264, + "grad_norm": 0.40883303846907587, + "learning_rate": 2.675930977057084e-05, + "loss": 0.8542, + "step": 9891 + }, + { + "epoch": 0.49336658354114715, + "grad_norm": 0.42859785499599645, + "learning_rate": 2.675528130770991e-05, + "loss": 0.8575, + "step": 9892 + }, + { + "epoch": 0.49341645885286783, + "grad_norm": 0.4869533200272388, + "learning_rate": 2.6751252799045706e-05, + "loss": 0.919, + "step": 9893 + }, + { + "epoch": 0.4934663341645885, + "grad_norm": 0.5485496205697403, + "learning_rate": 2.6747224244683345e-05, + "loss": 0.8556, + "step": 9894 + }, + { + "epoch": 0.49351620947630925, + "grad_norm": 0.6522543505125394, + "learning_rate": 2.674319564472795e-05, + "loss": 0.8985, + "step": 9895 + }, + { + "epoch": 0.49356608478802994, + "grad_norm": 0.4693724918276339, + "learning_rate": 2.6739166999284636e-05, + "loss": 0.8983, + "step": 9896 + }, + { + "epoch": 0.4936159600997506, + "grad_norm": 0.3937589138247389, + "learning_rate": 2.6735138308458542e-05, + "loss": 0.867, + "step": 9897 + }, + { + "epoch": 0.4936658354114713, + "grad_norm": 0.37554789514903364, + "learning_rate": 2.6731109572354796e-05, + "loss": 0.8797, + "step": 9898 + }, + { + "epoch": 0.49371571072319204, + "grad_norm": 0.5331823956724401, + "learning_rate": 2.6727080791078512e-05, + "loss": 0.828, + "step": 9899 + }, + { + "epoch": 0.4937655860349127, + "grad_norm": 0.4700548419561846, + "learning_rate": 2.6723051964734834e-05, + "loss": 0.8921, + "step": 9900 + }, + { + "epoch": 0.4938154613466334, + "grad_norm": 0.4682209011396916, + "learning_rate": 2.6719023093428885e-05, + "loss": 0.8512, + "step": 9901 + }, + { + "epoch": 0.49386533665835414, + "grad_norm": 0.7854402206267832, + "learning_rate": 2.6714994177265796e-05, + "loss": 0.907, + "step": 9902 + }, + { + "epoch": 0.4939152119700748, + "grad_norm": 0.5431331731036624, + "learning_rate": 2.6710965216350704e-05, + "loss": 0.8836, + "step": 9903 + }, + { + "epoch": 0.4939650872817955, + "grad_norm": 0.4592584697619771, + "learning_rate": 2.6706936210788735e-05, + "loss": 0.8755, + "step": 9904 + }, + { + "epoch": 0.4940149625935162, + "grad_norm": 0.4653504966189175, + "learning_rate": 2.670290716068503e-05, + "loss": 0.8893, + "step": 9905 + }, + { + "epoch": 0.4940648379052369, + "grad_norm": 0.5992619786182408, + "learning_rate": 2.6698878066144734e-05, + "loss": 0.893, + "step": 9906 + }, + { + "epoch": 0.4941147132169576, + "grad_norm": 0.5159787974956451, + "learning_rate": 2.669484892727297e-05, + "loss": 0.887, + "step": 9907 + }, + { + "epoch": 0.4941645885286783, + "grad_norm": 0.4294733445393249, + "learning_rate": 2.6690819744174884e-05, + "loss": 0.9364, + "step": 9908 + }, + { + "epoch": 0.494214463840399, + "grad_norm": 0.4244755633415866, + "learning_rate": 2.6686790516955608e-05, + "loss": 0.8972, + "step": 9909 + }, + { + "epoch": 0.4942643391521197, + "grad_norm": 0.4458378876110163, + "learning_rate": 2.668276124572029e-05, + "loss": 0.8525, + "step": 9910 + }, + { + "epoch": 0.4943142144638404, + "grad_norm": 0.40036371919439606, + "learning_rate": 2.667873193057407e-05, + "loss": 0.8977, + "step": 9911 + }, + { + "epoch": 0.4943640897755611, + "grad_norm": 0.4451504637772755, + "learning_rate": 2.6674702571622095e-05, + "loss": 0.885, + "step": 9912 + }, + { + "epoch": 0.4944139650872818, + "grad_norm": 0.49798592531754526, + "learning_rate": 2.6670673168969505e-05, + "loss": 0.9389, + "step": 9913 + }, + { + "epoch": 0.4944638403990025, + "grad_norm": 0.735309699190686, + "learning_rate": 2.666664372272144e-05, + "loss": 0.8678, + "step": 9914 + }, + { + "epoch": 0.49451371571072317, + "grad_norm": 0.5467656418520204, + "learning_rate": 2.6662614232983062e-05, + "loss": 0.8979, + "step": 9915 + }, + { + "epoch": 0.4945635910224439, + "grad_norm": 0.44344676004675787, + "learning_rate": 2.6658584699859508e-05, + "loss": 0.843, + "step": 9916 + }, + { + "epoch": 0.4946134663341646, + "grad_norm": 0.5415824544847363, + "learning_rate": 2.6654555123455927e-05, + "loss": 0.8887, + "step": 9917 + }, + { + "epoch": 0.49466334164588527, + "grad_norm": 0.5005303914795162, + "learning_rate": 2.6650525503877468e-05, + "loss": 0.8965, + "step": 9918 + }, + { + "epoch": 0.494713216957606, + "grad_norm": 0.5457050260836259, + "learning_rate": 2.6646495841229287e-05, + "loss": 0.9449, + "step": 9919 + }, + { + "epoch": 0.4947630922693267, + "grad_norm": 0.5430511194593356, + "learning_rate": 2.664246613561653e-05, + "loss": 0.9019, + "step": 9920 + }, + { + "epoch": 0.49481296758104737, + "grad_norm": 0.44115302768643117, + "learning_rate": 2.663843638714436e-05, + "loss": 0.8837, + "step": 9921 + }, + { + "epoch": 0.4948628428927681, + "grad_norm": 0.5915208945278555, + "learning_rate": 2.663440659591792e-05, + "loss": 0.8913, + "step": 9922 + }, + { + "epoch": 0.4949127182044888, + "grad_norm": 0.49947334199453014, + "learning_rate": 2.663037676204238e-05, + "loss": 0.8854, + "step": 9923 + }, + { + "epoch": 0.49496259351620947, + "grad_norm": 0.45890397404139255, + "learning_rate": 2.6626346885622883e-05, + "loss": 0.8663, + "step": 9924 + }, + { + "epoch": 0.49501246882793015, + "grad_norm": 0.40089142941945943, + "learning_rate": 2.6622316966764593e-05, + "loss": 0.8922, + "step": 9925 + }, + { + "epoch": 0.4950623441396509, + "grad_norm": 0.5084934498742443, + "learning_rate": 2.6618287005572667e-05, + "loss": 0.8669, + "step": 9926 + }, + { + "epoch": 0.49511221945137157, + "grad_norm": 0.46332038626371463, + "learning_rate": 2.661425700215226e-05, + "loss": 0.8865, + "step": 9927 + }, + { + "epoch": 0.49516209476309225, + "grad_norm": 0.466145594747302, + "learning_rate": 2.6610226956608547e-05, + "loss": 0.8552, + "step": 9928 + }, + { + "epoch": 0.495211970074813, + "grad_norm": 0.7682719473230396, + "learning_rate": 2.6606196869046678e-05, + "loss": 0.9116, + "step": 9929 + }, + { + "epoch": 0.4952618453865337, + "grad_norm": 0.5807069207719627, + "learning_rate": 2.6602166739571827e-05, + "loss": 0.9272, + "step": 9930 + }, + { + "epoch": 0.49531172069825435, + "grad_norm": 0.6018400415382269, + "learning_rate": 2.6598136568289143e-05, + "loss": 0.8939, + "step": 9931 + }, + { + "epoch": 0.49536159600997504, + "grad_norm": 0.4961131017173462, + "learning_rate": 2.6594106355303806e-05, + "loss": 0.9174, + "step": 9932 + }, + { + "epoch": 0.4954114713216958, + "grad_norm": 0.6866899238058148, + "learning_rate": 2.659007610072097e-05, + "loss": 0.863, + "step": 9933 + }, + { + "epoch": 0.49546134663341646, + "grad_norm": 0.60566268462179, + "learning_rate": 2.6586045804645815e-05, + "loss": 0.8795, + "step": 9934 + }, + { + "epoch": 0.49551122194513714, + "grad_norm": 0.41899615805412704, + "learning_rate": 2.6582015467183507e-05, + "loss": 0.8763, + "step": 9935 + }, + { + "epoch": 0.4955610972568579, + "grad_norm": 0.4447379203324023, + "learning_rate": 2.6577985088439212e-05, + "loss": 0.8632, + "step": 9936 + }, + { + "epoch": 0.49561097256857856, + "grad_norm": 0.43999677138514826, + "learning_rate": 2.65739546685181e-05, + "loss": 0.9172, + "step": 9937 + }, + { + "epoch": 0.49566084788029924, + "grad_norm": 0.5158548088590644, + "learning_rate": 2.656992420752535e-05, + "loss": 0.8748, + "step": 9938 + }, + { + "epoch": 0.49571072319202, + "grad_norm": 0.46845688696616833, + "learning_rate": 2.6565893705566126e-05, + "loss": 0.8865, + "step": 9939 + }, + { + "epoch": 0.49576059850374066, + "grad_norm": 0.712559663903869, + "learning_rate": 2.6561863162745603e-05, + "loss": 0.8954, + "step": 9940 + }, + { + "epoch": 0.49581047381546134, + "grad_norm": 0.5101317828953826, + "learning_rate": 2.6557832579168972e-05, + "loss": 0.8926, + "step": 9941 + }, + { + "epoch": 0.495860349127182, + "grad_norm": 0.41603139825716845, + "learning_rate": 2.6553801954941386e-05, + "loss": 0.9001, + "step": 9942 + }, + { + "epoch": 0.49591022443890276, + "grad_norm": 0.41529192032069273, + "learning_rate": 2.654977129016804e-05, + "loss": 0.8829, + "step": 9943 + }, + { + "epoch": 0.49596009975062344, + "grad_norm": 0.5442279592111179, + "learning_rate": 2.6545740584954098e-05, + "loss": 0.9053, + "step": 9944 + }, + { + "epoch": 0.4960099750623441, + "grad_norm": 0.4599802171779855, + "learning_rate": 2.6541709839404753e-05, + "loss": 0.8835, + "step": 9945 + }, + { + "epoch": 0.49605985037406486, + "grad_norm": 0.5890373537054069, + "learning_rate": 2.653767905362518e-05, + "loss": 0.8765, + "step": 9946 + }, + { + "epoch": 0.49610972568578554, + "grad_norm": 0.6712810045417315, + "learning_rate": 2.653364822772056e-05, + "loss": 0.8887, + "step": 9947 + }, + { + "epoch": 0.4961596009975062, + "grad_norm": 0.4589276436293018, + "learning_rate": 2.6529617361796077e-05, + "loss": 0.9041, + "step": 9948 + }, + { + "epoch": 0.49620947630922696, + "grad_norm": 0.46709737697335496, + "learning_rate": 2.6525586455956908e-05, + "loss": 0.8762, + "step": 9949 + }, + { + "epoch": 0.49625935162094764, + "grad_norm": 0.6234937250796511, + "learning_rate": 2.6521555510308254e-05, + "loss": 0.8711, + "step": 9950 + }, + { + "epoch": 0.4963092269326683, + "grad_norm": 1.1442594797937282, + "learning_rate": 2.651752452495528e-05, + "loss": 0.8903, + "step": 9951 + }, + { + "epoch": 0.496359102244389, + "grad_norm": 0.40691911909045203, + "learning_rate": 2.6513493500003183e-05, + "loss": 0.8301, + "step": 9952 + }, + { + "epoch": 0.49640897755610974, + "grad_norm": 0.6881774776010174, + "learning_rate": 2.6509462435557152e-05, + "loss": 0.9435, + "step": 9953 + }, + { + "epoch": 0.4964588528678304, + "grad_norm": 0.3846855298059889, + "learning_rate": 2.650543133172238e-05, + "loss": 0.8648, + "step": 9954 + }, + { + "epoch": 0.4965087281795511, + "grad_norm": 0.5096398114538103, + "learning_rate": 2.6501400188604042e-05, + "loss": 0.8825, + "step": 9955 + }, + { + "epoch": 0.49655860349127184, + "grad_norm": 0.44980020698108003, + "learning_rate": 2.6497369006307343e-05, + "loss": 0.8816, + "step": 9956 + }, + { + "epoch": 0.4966084788029925, + "grad_norm": 0.4772149720046788, + "learning_rate": 2.6493337784937467e-05, + "loss": 0.8613, + "step": 9957 + }, + { + "epoch": 0.4966583541147132, + "grad_norm": 0.5108295539233315, + "learning_rate": 2.6489306524599617e-05, + "loss": 0.8919, + "step": 9958 + }, + { + "epoch": 0.4967082294264339, + "grad_norm": 0.5599455268612086, + "learning_rate": 2.648527522539897e-05, + "loss": 0.885, + "step": 9959 + }, + { + "epoch": 0.4967581047381546, + "grad_norm": 0.43620862414593375, + "learning_rate": 2.6481243887440736e-05, + "loss": 0.8692, + "step": 9960 + }, + { + "epoch": 0.4968079800498753, + "grad_norm": 0.5337815359931657, + "learning_rate": 2.647721251083011e-05, + "loss": 0.8612, + "step": 9961 + }, + { + "epoch": 0.496857855361596, + "grad_norm": 0.5527133106504237, + "learning_rate": 2.6473181095672277e-05, + "loss": 0.888, + "step": 9962 + }, + { + "epoch": 0.4969077306733167, + "grad_norm": 0.4516369911716137, + "learning_rate": 2.646914964207245e-05, + "loss": 0.8453, + "step": 9963 + }, + { + "epoch": 0.4969576059850374, + "grad_norm": 0.5569754237737528, + "learning_rate": 2.6465118150135808e-05, + "loss": 0.8659, + "step": 9964 + }, + { + "epoch": 0.4970074812967581, + "grad_norm": 0.6493497412388487, + "learning_rate": 2.646108661996758e-05, + "loss": 0.9124, + "step": 9965 + }, + { + "epoch": 0.49705735660847883, + "grad_norm": 0.4703974584147924, + "learning_rate": 2.6457055051672935e-05, + "loss": 0.8541, + "step": 9966 + }, + { + "epoch": 0.4971072319201995, + "grad_norm": 0.47456331686554964, + "learning_rate": 2.64530234453571e-05, + "loss": 0.8477, + "step": 9967 + }, + { + "epoch": 0.4971571072319202, + "grad_norm": 0.4161441975209904, + "learning_rate": 2.6448991801125268e-05, + "loss": 0.8623, + "step": 9968 + }, + { + "epoch": 0.4972069825436409, + "grad_norm": 0.7408710905836356, + "learning_rate": 2.644496011908264e-05, + "loss": 0.8891, + "step": 9969 + }, + { + "epoch": 0.4972568578553616, + "grad_norm": 0.5785681900967459, + "learning_rate": 2.6440928399334424e-05, + "loss": 0.9377, + "step": 9970 + }, + { + "epoch": 0.4973067331670823, + "grad_norm": 0.5967440453489751, + "learning_rate": 2.643689664198583e-05, + "loss": 0.8996, + "step": 9971 + }, + { + "epoch": 0.497356608478803, + "grad_norm": 0.5000557645475374, + "learning_rate": 2.643286484714206e-05, + "loss": 0.9389, + "step": 9972 + }, + { + "epoch": 0.4974064837905237, + "grad_norm": 0.48458211734866513, + "learning_rate": 2.6428833014908316e-05, + "loss": 0.8647, + "step": 9973 + }, + { + "epoch": 0.4974563591022444, + "grad_norm": 0.46199014739482164, + "learning_rate": 2.642480114538983e-05, + "loss": 0.8876, + "step": 9974 + }, + { + "epoch": 0.4975062344139651, + "grad_norm": 0.41054191447820026, + "learning_rate": 2.642076923869178e-05, + "loss": 0.8255, + "step": 9975 + }, + { + "epoch": 0.49755610972568576, + "grad_norm": 0.5051035972289508, + "learning_rate": 2.64167372949194e-05, + "loss": 0.9129, + "step": 9976 + }, + { + "epoch": 0.4976059850374065, + "grad_norm": 0.46623417453754235, + "learning_rate": 2.641270531417789e-05, + "loss": 0.8952, + "step": 9977 + }, + { + "epoch": 0.4976558603491272, + "grad_norm": 0.5918746244393605, + "learning_rate": 2.6408673296572472e-05, + "loss": 0.8928, + "step": 9978 + }, + { + "epoch": 0.49770573566084786, + "grad_norm": 0.426729011920502, + "learning_rate": 2.640464124220835e-05, + "loss": 0.8602, + "step": 9979 + }, + { + "epoch": 0.4977556109725686, + "grad_norm": 0.44731925419868146, + "learning_rate": 2.6400609151190753e-05, + "loss": 0.8707, + "step": 9980 + }, + { + "epoch": 0.4978054862842893, + "grad_norm": 0.4452063894315011, + "learning_rate": 2.6396577023624875e-05, + "loss": 0.8562, + "step": 9981 + }, + { + "epoch": 0.49785536159600996, + "grad_norm": 0.4839577809782294, + "learning_rate": 2.6392544859615953e-05, + "loss": 0.8861, + "step": 9982 + }, + { + "epoch": 0.4979052369077307, + "grad_norm": 0.9306816008720754, + "learning_rate": 2.6388512659269188e-05, + "loss": 0.882, + "step": 9983 + }, + { + "epoch": 0.4979551122194514, + "grad_norm": 0.5461125836396548, + "learning_rate": 2.6384480422689817e-05, + "loss": 0.9139, + "step": 9984 + }, + { + "epoch": 0.49800498753117206, + "grad_norm": 0.5348755928518325, + "learning_rate": 2.6380448149983044e-05, + "loss": 0.9047, + "step": 9985 + }, + { + "epoch": 0.49805486284289274, + "grad_norm": 1.3651029716191483, + "learning_rate": 2.6376415841254094e-05, + "loss": 0.8564, + "step": 9986 + }, + { + "epoch": 0.4981047381546135, + "grad_norm": 0.4323107831160306, + "learning_rate": 2.637238349660819e-05, + "loss": 0.8994, + "step": 9987 + }, + { + "epoch": 0.49815461346633416, + "grad_norm": 0.48648749369042565, + "learning_rate": 2.6368351116150547e-05, + "loss": 0.9112, + "step": 9988 + }, + { + "epoch": 0.49820448877805484, + "grad_norm": 0.4451821451568433, + "learning_rate": 2.63643186999864e-05, + "loss": 0.8588, + "step": 9989 + }, + { + "epoch": 0.4982543640897756, + "grad_norm": 0.473446972232483, + "learning_rate": 2.6360286248220966e-05, + "loss": 0.8721, + "step": 9990 + }, + { + "epoch": 0.49830423940149626, + "grad_norm": 0.40970656307382347, + "learning_rate": 2.635625376095947e-05, + "loss": 0.9165, + "step": 9991 + }, + { + "epoch": 0.49835411471321694, + "grad_norm": 0.5590869360328795, + "learning_rate": 2.635222123830714e-05, + "loss": 0.8951, + "step": 9992 + }, + { + "epoch": 0.4984039900249377, + "grad_norm": 0.5336722119678174, + "learning_rate": 2.6348188680369206e-05, + "loss": 0.9395, + "step": 9993 + }, + { + "epoch": 0.49845386533665836, + "grad_norm": 0.6384693693898675, + "learning_rate": 2.6344156087250882e-05, + "loss": 0.8943, + "step": 9994 + }, + { + "epoch": 0.49850374064837905, + "grad_norm": 0.4087113139170999, + "learning_rate": 2.6340123459057415e-05, + "loss": 0.861, + "step": 9995 + }, + { + "epoch": 0.4985536159600997, + "grad_norm": 0.6982447639627103, + "learning_rate": 2.6336090795894024e-05, + "loss": 0.8919, + "step": 9996 + }, + { + "epoch": 0.49860349127182046, + "grad_norm": 0.4347835684920884, + "learning_rate": 2.6332058097865943e-05, + "loss": 0.898, + "step": 9997 + }, + { + "epoch": 0.49865336658354115, + "grad_norm": 0.4880580879937952, + "learning_rate": 2.63280253650784e-05, + "loss": 0.931, + "step": 9998 + }, + { + "epoch": 0.49870324189526183, + "grad_norm": 0.3818207398974087, + "learning_rate": 2.6323992597636627e-05, + "loss": 0.857, + "step": 9999 + }, + { + "epoch": 0.49875311720698257, + "grad_norm": 0.4618327974775003, + "learning_rate": 2.6319959795645864e-05, + "loss": 0.8955, + "step": 10000 + }, + { + "epoch": 0.49880299251870325, + "grad_norm": 0.8158174127819849, + "learning_rate": 2.6315926959211336e-05, + "loss": 0.8622, + "step": 10001 + }, + { + "epoch": 0.49885286783042393, + "grad_norm": 0.4089534906411337, + "learning_rate": 2.6311894088438288e-05, + "loss": 0.8803, + "step": 10002 + }, + { + "epoch": 0.4989027431421446, + "grad_norm": 0.5202413384819105, + "learning_rate": 2.6307861183431947e-05, + "loss": 0.9258, + "step": 10003 + }, + { + "epoch": 0.49895261845386535, + "grad_norm": 0.4726935916762666, + "learning_rate": 2.6303828244297557e-05, + "loss": 0.9023, + "step": 10004 + }, + { + "epoch": 0.49900249376558603, + "grad_norm": 0.5717384800534838, + "learning_rate": 2.6299795271140347e-05, + "loss": 0.898, + "step": 10005 + }, + { + "epoch": 0.4990523690773067, + "grad_norm": 0.40383171148211944, + "learning_rate": 2.629576226406557e-05, + "loss": 0.8623, + "step": 10006 + }, + { + "epoch": 0.49910224438902745, + "grad_norm": 0.4852243223415728, + "learning_rate": 2.6291729223178447e-05, + "loss": 0.8652, + "step": 10007 + }, + { + "epoch": 0.49915211970074813, + "grad_norm": 0.6243206167474586, + "learning_rate": 2.6287696148584234e-05, + "loss": 0.8178, + "step": 10008 + }, + { + "epoch": 0.4992019950124688, + "grad_norm": 0.489757457115764, + "learning_rate": 2.6283663040388163e-05, + "loss": 0.9448, + "step": 10009 + }, + { + "epoch": 0.49925187032418955, + "grad_norm": 0.5023927881997963, + "learning_rate": 2.6279629898695483e-05, + "loss": 0.8881, + "step": 10010 + }, + { + "epoch": 0.49930174563591023, + "grad_norm": 0.5244227476364538, + "learning_rate": 2.6275596723611433e-05, + "loss": 0.815, + "step": 10011 + }, + { + "epoch": 0.4993516209476309, + "grad_norm": 0.5389864273451083, + "learning_rate": 2.6271563515241255e-05, + "loss": 0.8423, + "step": 10012 + }, + { + "epoch": 0.4994014962593516, + "grad_norm": 0.42458367120000096, + "learning_rate": 2.62675302736902e-05, + "loss": 0.899, + "step": 10013 + }, + { + "epoch": 0.49945137157107233, + "grad_norm": 0.4435505535383164, + "learning_rate": 2.6263496999063502e-05, + "loss": 0.8935, + "step": 10014 + }, + { + "epoch": 0.499501246882793, + "grad_norm": 0.39166258185819175, + "learning_rate": 2.6259463691466423e-05, + "loss": 0.8762, + "step": 10015 + }, + { + "epoch": 0.4995511221945137, + "grad_norm": 0.5025477794866258, + "learning_rate": 2.6255430351004197e-05, + "loss": 0.922, + "step": 10016 + }, + { + "epoch": 0.49960099750623443, + "grad_norm": 0.5404968869955776, + "learning_rate": 2.6251396977782084e-05, + "loss": 0.8943, + "step": 10017 + }, + { + "epoch": 0.4996508728179551, + "grad_norm": 0.5013472918663111, + "learning_rate": 2.624736357190532e-05, + "loss": 0.9082, + "step": 10018 + }, + { + "epoch": 0.4997007481296758, + "grad_norm": 0.3876372906281424, + "learning_rate": 2.624333013347917e-05, + "loss": 0.8579, + "step": 10019 + }, + { + "epoch": 0.49975062344139654, + "grad_norm": 0.5455724876094149, + "learning_rate": 2.6239296662608874e-05, + "loss": 0.8951, + "step": 10020 + }, + { + "epoch": 0.4998004987531172, + "grad_norm": 0.44516966815202086, + "learning_rate": 2.6235263159399683e-05, + "loss": 0.8208, + "step": 10021 + }, + { + "epoch": 0.4998503740648379, + "grad_norm": 0.4888045968157074, + "learning_rate": 2.6231229623956856e-05, + "loss": 0.9156, + "step": 10022 + }, + { + "epoch": 0.4999002493765586, + "grad_norm": 0.40431925425061516, + "learning_rate": 2.622719605638564e-05, + "loss": 0.8543, + "step": 10023 + }, + { + "epoch": 0.4999501246882793, + "grad_norm": 0.4009553679388997, + "learning_rate": 2.6223162456791295e-05, + "loss": 0.8722, + "step": 10024 + }, + { + "epoch": 0.5, + "grad_norm": 0.5270720548339616, + "learning_rate": 2.6219128825279065e-05, + "loss": 0.8632, + "step": 10025 + }, + { + "epoch": 0.5000498753117207, + "grad_norm": 0.4666474456829487, + "learning_rate": 2.621509516195423e-05, + "loss": 0.8609, + "step": 10026 + }, + { + "epoch": 0.5000997506234414, + "grad_norm": 0.40380121466774654, + "learning_rate": 2.6211061466922017e-05, + "loss": 0.9133, + "step": 10027 + }, + { + "epoch": 0.5001496259351621, + "grad_norm": 0.45763064357056144, + "learning_rate": 2.62070277402877e-05, + "loss": 0.8869, + "step": 10028 + }, + { + "epoch": 0.5001995012468828, + "grad_norm": 0.3958580845040238, + "learning_rate": 2.6202993982156533e-05, + "loss": 0.889, + "step": 10029 + }, + { + "epoch": 0.5002493765586035, + "grad_norm": 0.4754538676438667, + "learning_rate": 2.6198960192633783e-05, + "loss": 0.8647, + "step": 10030 + }, + { + "epoch": 0.5002992518703242, + "grad_norm": 0.39733742113813614, + "learning_rate": 2.6194926371824702e-05, + "loss": 0.8552, + "step": 10031 + }, + { + "epoch": 0.5003491271820449, + "grad_norm": 0.478136413115644, + "learning_rate": 2.6190892519834543e-05, + "loss": 0.8783, + "step": 10032 + }, + { + "epoch": 0.5003990024937656, + "grad_norm": 0.47156697607096343, + "learning_rate": 2.6186858636768585e-05, + "loss": 0.8494, + "step": 10033 + }, + { + "epoch": 0.5004488778054863, + "grad_norm": 0.4251925991610769, + "learning_rate": 2.618282472273208e-05, + "loss": 0.9366, + "step": 10034 + }, + { + "epoch": 0.5004987531172069, + "grad_norm": 0.4377255456785957, + "learning_rate": 2.6178790777830298e-05, + "loss": 0.8762, + "step": 10035 + }, + { + "epoch": 0.5005486284289277, + "grad_norm": 0.4398976822455217, + "learning_rate": 2.6174756802168487e-05, + "loss": 0.894, + "step": 10036 + }, + { + "epoch": 0.5005985037406484, + "grad_norm": 0.45546426787284466, + "learning_rate": 2.617072279585193e-05, + "loss": 0.9004, + "step": 10037 + }, + { + "epoch": 0.500648379052369, + "grad_norm": 0.39622320321349525, + "learning_rate": 2.6166688758985887e-05, + "loss": 0.8949, + "step": 10038 + }, + { + "epoch": 0.5006982543640898, + "grad_norm": 0.6447969003411222, + "learning_rate": 2.616265469167562e-05, + "loss": 0.8697, + "step": 10039 + }, + { + "epoch": 0.5007481296758105, + "grad_norm": 0.41686253167294834, + "learning_rate": 2.61586205940264e-05, + "loss": 0.8782, + "step": 10040 + }, + { + "epoch": 0.5007980049875311, + "grad_norm": 0.6740163031983603, + "learning_rate": 2.6154586466143495e-05, + "loss": 0.8972, + "step": 10041 + }, + { + "epoch": 0.5008478802992519, + "grad_norm": 0.48873182243610536, + "learning_rate": 2.615055230813217e-05, + "loss": 0.8639, + "step": 10042 + }, + { + "epoch": 0.5008977556109726, + "grad_norm": 0.4286393368946651, + "learning_rate": 2.6146518120097702e-05, + "loss": 0.8725, + "step": 10043 + }, + { + "epoch": 0.5009476309226932, + "grad_norm": 0.4177752918105483, + "learning_rate": 2.614248390214535e-05, + "loss": 0.8649, + "step": 10044 + }, + { + "epoch": 0.500997506234414, + "grad_norm": 0.42989204465106523, + "learning_rate": 2.6138449654380397e-05, + "loss": 0.8786, + "step": 10045 + }, + { + "epoch": 0.5010473815461347, + "grad_norm": 0.4524378054703258, + "learning_rate": 2.6134415376908106e-05, + "loss": 0.9102, + "step": 10046 + }, + { + "epoch": 0.5010972568578553, + "grad_norm": 0.5241063606269584, + "learning_rate": 2.6130381069833758e-05, + "loss": 0.898, + "step": 10047 + }, + { + "epoch": 0.5011471321695761, + "grad_norm": 0.4941251733421788, + "learning_rate": 2.612634673326262e-05, + "loss": 0.8851, + "step": 10048 + }, + { + "epoch": 0.5011970074812968, + "grad_norm": 0.4994221846280118, + "learning_rate": 2.6122312367299966e-05, + "loss": 0.9074, + "step": 10049 + }, + { + "epoch": 0.5012468827930174, + "grad_norm": 0.3933095951085783, + "learning_rate": 2.611827797205108e-05, + "loss": 0.8442, + "step": 10050 + }, + { + "epoch": 0.5012967581047382, + "grad_norm": 0.4520384621341498, + "learning_rate": 2.611424354762122e-05, + "loss": 0.8984, + "step": 10051 + }, + { + "epoch": 0.5013466334164588, + "grad_norm": 0.44836236636716714, + "learning_rate": 2.6110209094115685e-05, + "loss": 0.8592, + "step": 10052 + }, + { + "epoch": 0.5013965087281795, + "grad_norm": 0.5726645965017708, + "learning_rate": 2.6106174611639733e-05, + "loss": 0.8777, + "step": 10053 + }, + { + "epoch": 0.5014463840399003, + "grad_norm": 0.4572683060800354, + "learning_rate": 2.610214010029865e-05, + "loss": 0.8824, + "step": 10054 + }, + { + "epoch": 0.5014962593516209, + "grad_norm": 0.4244812529401342, + "learning_rate": 2.6098105560197722e-05, + "loss": 0.8771, + "step": 10055 + }, + { + "epoch": 0.5015461346633416, + "grad_norm": 0.5335225101046356, + "learning_rate": 2.609407099144221e-05, + "loss": 0.9085, + "step": 10056 + }, + { + "epoch": 0.5015960099750624, + "grad_norm": 0.4612431605905334, + "learning_rate": 2.609003639413741e-05, + "loss": 0.8692, + "step": 10057 + }, + { + "epoch": 0.501645885286783, + "grad_norm": 0.39320745089117715, + "learning_rate": 2.6086001768388594e-05, + "loss": 0.9066, + "step": 10058 + }, + { + "epoch": 0.5016957605985037, + "grad_norm": 0.36525092311372787, + "learning_rate": 2.608196711430106e-05, + "loss": 0.8391, + "step": 10059 + }, + { + "epoch": 0.5017456359102245, + "grad_norm": 0.4935004462598798, + "learning_rate": 2.6077932431980068e-05, + "loss": 0.9031, + "step": 10060 + }, + { + "epoch": 0.5017955112219451, + "grad_norm": 0.3919205585251161, + "learning_rate": 2.6073897721530914e-05, + "loss": 0.9016, + "step": 10061 + }, + { + "epoch": 0.5018453865336658, + "grad_norm": 0.43908034974913507, + "learning_rate": 2.6069862983058878e-05, + "loss": 0.9043, + "step": 10062 + }, + { + "epoch": 0.5018952618453866, + "grad_norm": 0.4448944615954335, + "learning_rate": 2.6065828216669253e-05, + "loss": 0.8285, + "step": 10063 + }, + { + "epoch": 0.5019451371571072, + "grad_norm": 0.47065565554360544, + "learning_rate": 2.6061793422467307e-05, + "loss": 0.8871, + "step": 10064 + }, + { + "epoch": 0.5019950124688279, + "grad_norm": 0.503193669729866, + "learning_rate": 2.6057758600558345e-05, + "loss": 0.8343, + "step": 10065 + }, + { + "epoch": 0.5020448877805487, + "grad_norm": 0.4349163805009639, + "learning_rate": 2.605372375104764e-05, + "loss": 0.8997, + "step": 10066 + }, + { + "epoch": 0.5020947630922693, + "grad_norm": 0.4427228327673062, + "learning_rate": 2.604968887404049e-05, + "loss": 0.8834, + "step": 10067 + }, + { + "epoch": 0.50214463840399, + "grad_norm": 0.39824193741941516, + "learning_rate": 2.604565396964218e-05, + "loss": 0.8982, + "step": 10068 + }, + { + "epoch": 0.5021945137157108, + "grad_norm": 0.44638216410490805, + "learning_rate": 2.6041619037957993e-05, + "loss": 0.8853, + "step": 10069 + }, + { + "epoch": 0.5022443890274314, + "grad_norm": 0.5860193014956759, + "learning_rate": 2.6037584079093225e-05, + "loss": 0.8957, + "step": 10070 + }, + { + "epoch": 0.5022942643391521, + "grad_norm": 0.4220917198993744, + "learning_rate": 2.6033549093153165e-05, + "loss": 0.9206, + "step": 10071 + }, + { + "epoch": 0.5023441396508728, + "grad_norm": 0.5031506646519546, + "learning_rate": 2.6029514080243108e-05, + "loss": 0.8775, + "step": 10072 + }, + { + "epoch": 0.5023940149625935, + "grad_norm": 0.41573037448015643, + "learning_rate": 2.602547904046833e-05, + "loss": 0.8829, + "step": 10073 + }, + { + "epoch": 0.5024438902743142, + "grad_norm": 0.39029961698552446, + "learning_rate": 2.6021443973934145e-05, + "loss": 0.8698, + "step": 10074 + }, + { + "epoch": 0.5024937655860349, + "grad_norm": 0.4592924145058225, + "learning_rate": 2.6017408880745832e-05, + "loss": 0.8939, + "step": 10075 + }, + { + "epoch": 0.5025436408977556, + "grad_norm": 0.385019583910037, + "learning_rate": 2.6013373761008698e-05, + "loss": 0.8895, + "step": 10076 + }, + { + "epoch": 0.5025935162094763, + "grad_norm": 0.44108624749592584, + "learning_rate": 2.6009338614828015e-05, + "loss": 0.8803, + "step": 10077 + }, + { + "epoch": 0.502643391521197, + "grad_norm": 0.48000825083575915, + "learning_rate": 2.600530344230911e-05, + "loss": 0.9047, + "step": 10078 + }, + { + "epoch": 0.5026932668329177, + "grad_norm": 0.3961671247801785, + "learning_rate": 2.6001268243557253e-05, + "loss": 0.9105, + "step": 10079 + }, + { + "epoch": 0.5027431421446384, + "grad_norm": 0.4531502334379615, + "learning_rate": 2.5997233018677747e-05, + "loss": 0.8795, + "step": 10080 + }, + { + "epoch": 0.5027930174563591, + "grad_norm": 0.5093809529040781, + "learning_rate": 2.599319776777589e-05, + "loss": 0.8243, + "step": 10081 + }, + { + "epoch": 0.5028428927680798, + "grad_norm": 0.4660407828531328, + "learning_rate": 2.5989162490956987e-05, + "loss": 0.8524, + "step": 10082 + }, + { + "epoch": 0.5028927680798005, + "grad_norm": 0.3759981662394381, + "learning_rate": 2.5985127188326324e-05, + "loss": 0.8545, + "step": 10083 + }, + { + "epoch": 0.5029426433915212, + "grad_norm": 0.504220173460478, + "learning_rate": 2.598109185998922e-05, + "loss": 0.8985, + "step": 10084 + }, + { + "epoch": 0.5029925187032419, + "grad_norm": 0.417327873126701, + "learning_rate": 2.597705650605095e-05, + "loss": 0.8623, + "step": 10085 + }, + { + "epoch": 0.5030423940149626, + "grad_norm": 0.4787530227341276, + "learning_rate": 2.597302112661684e-05, + "loss": 0.8993, + "step": 10086 + }, + { + "epoch": 0.5030922693266833, + "grad_norm": 0.4296931236638636, + "learning_rate": 2.596898572179217e-05, + "loss": 0.8667, + "step": 10087 + }, + { + "epoch": 0.503142144638404, + "grad_norm": 0.6135808013846021, + "learning_rate": 2.5964950291682256e-05, + "loss": 0.848, + "step": 10088 + }, + { + "epoch": 0.5031920199501246, + "grad_norm": 0.4761188507479383, + "learning_rate": 2.5960914836392393e-05, + "loss": 0.8474, + "step": 10089 + }, + { + "epoch": 0.5032418952618454, + "grad_norm": 0.4013390574239413, + "learning_rate": 2.595687935602789e-05, + "loss": 0.8599, + "step": 10090 + }, + { + "epoch": 0.5032917705735661, + "grad_norm": 0.42898149899234567, + "learning_rate": 2.5952843850694048e-05, + "loss": 0.8797, + "step": 10091 + }, + { + "epoch": 0.5033416458852867, + "grad_norm": 0.45092639832724685, + "learning_rate": 2.5948808320496176e-05, + "loss": 0.9254, + "step": 10092 + }, + { + "epoch": 0.5033915211970075, + "grad_norm": 0.4294398753891311, + "learning_rate": 2.5944772765539565e-05, + "loss": 0.8377, + "step": 10093 + }, + { + "epoch": 0.5034413965087282, + "grad_norm": 0.4856237742217048, + "learning_rate": 2.594073718592954e-05, + "loss": 0.8746, + "step": 10094 + }, + { + "epoch": 0.5034912718204488, + "grad_norm": 0.48086073021869474, + "learning_rate": 2.5936701581771395e-05, + "loss": 0.8893, + "step": 10095 + }, + { + "epoch": 0.5035411471321696, + "grad_norm": 0.4211061113190086, + "learning_rate": 2.593266595317045e-05, + "loss": 0.8318, + "step": 10096 + }, + { + "epoch": 0.5035910224438903, + "grad_norm": 0.4189839904963082, + "learning_rate": 2.5928630300231993e-05, + "loss": 0.9029, + "step": 10097 + }, + { + "epoch": 0.5036408977556109, + "grad_norm": 0.42789057779108214, + "learning_rate": 2.592459462306135e-05, + "loss": 0.9519, + "step": 10098 + }, + { + "epoch": 0.5036907730673317, + "grad_norm": 0.42711968623209595, + "learning_rate": 2.592055892176382e-05, + "loss": 0.8809, + "step": 10099 + }, + { + "epoch": 0.5037406483790524, + "grad_norm": 0.48784706375788844, + "learning_rate": 2.591652319644472e-05, + "loss": 0.94, + "step": 10100 + }, + { + "epoch": 0.503790523690773, + "grad_norm": 0.4241305087836519, + "learning_rate": 2.5912487447209355e-05, + "loss": 0.8692, + "step": 10101 + }, + { + "epoch": 0.5038403990024938, + "grad_norm": 0.5376255116592756, + "learning_rate": 2.5908451674163044e-05, + "loss": 0.8131, + "step": 10102 + }, + { + "epoch": 0.5038902743142145, + "grad_norm": 0.5167203421098242, + "learning_rate": 2.5904415877411093e-05, + "loss": 0.8771, + "step": 10103 + }, + { + "epoch": 0.5039401496259351, + "grad_norm": 0.3968334049127893, + "learning_rate": 2.5900380057058808e-05, + "loss": 0.8665, + "step": 10104 + }, + { + "epoch": 0.5039900249376559, + "grad_norm": 0.4149962547950376, + "learning_rate": 2.5896344213211517e-05, + "loss": 0.8393, + "step": 10105 + }, + { + "epoch": 0.5040399002493765, + "grad_norm": 0.5879593637297876, + "learning_rate": 2.5892308345974515e-05, + "loss": 0.8625, + "step": 10106 + }, + { + "epoch": 0.5040897755610972, + "grad_norm": 0.637916091362782, + "learning_rate": 2.5888272455453134e-05, + "loss": 0.8944, + "step": 10107 + }, + { + "epoch": 0.504139650872818, + "grad_norm": 0.4028616448390877, + "learning_rate": 2.588423654175267e-05, + "loss": 0.9051, + "step": 10108 + }, + { + "epoch": 0.5041895261845386, + "grad_norm": 0.4783565669883933, + "learning_rate": 2.588020060497846e-05, + "loss": 0.888, + "step": 10109 + }, + { + "epoch": 0.5042394014962593, + "grad_norm": 0.39389617030624197, + "learning_rate": 2.5876164645235807e-05, + "loss": 0.9016, + "step": 10110 + }, + { + "epoch": 0.5042892768079801, + "grad_norm": 0.44274067705887343, + "learning_rate": 2.5872128662630028e-05, + "loss": 0.9085, + "step": 10111 + }, + { + "epoch": 0.5043391521197007, + "grad_norm": 0.42257197265874513, + "learning_rate": 2.586809265726644e-05, + "loss": 0.8729, + "step": 10112 + }, + { + "epoch": 0.5043890274314214, + "grad_norm": 0.4643965605290211, + "learning_rate": 2.5864056629250366e-05, + "loss": 0.8663, + "step": 10113 + }, + { + "epoch": 0.5044389027431422, + "grad_norm": 0.4431164673868281, + "learning_rate": 2.586002057868712e-05, + "loss": 0.8681, + "step": 10114 + }, + { + "epoch": 0.5044887780548628, + "grad_norm": 0.5297899129079195, + "learning_rate": 2.5855984505682024e-05, + "loss": 0.8786, + "step": 10115 + }, + { + "epoch": 0.5045386533665835, + "grad_norm": 0.37904801496044543, + "learning_rate": 2.5851948410340392e-05, + "loss": 0.8661, + "step": 10116 + }, + { + "epoch": 0.5045885286783043, + "grad_norm": 0.38727688682127626, + "learning_rate": 2.5847912292767546e-05, + "loss": 0.8611, + "step": 10117 + }, + { + "epoch": 0.5046384039900249, + "grad_norm": 0.5580131838189583, + "learning_rate": 2.5843876153068814e-05, + "loss": 0.9067, + "step": 10118 + }, + { + "epoch": 0.5046882793017456, + "grad_norm": 0.5210060312967278, + "learning_rate": 2.5839839991349506e-05, + "loss": 0.9049, + "step": 10119 + }, + { + "epoch": 0.5047381546134664, + "grad_norm": 0.47480184728660524, + "learning_rate": 2.5835803807714947e-05, + "loss": 0.9004, + "step": 10120 + }, + { + "epoch": 0.504788029925187, + "grad_norm": 0.4652823039933021, + "learning_rate": 2.5831767602270467e-05, + "loss": 0.8487, + "step": 10121 + }, + { + "epoch": 0.5048379052369077, + "grad_norm": 0.5479257430135662, + "learning_rate": 2.5827731375121383e-05, + "loss": 0.9378, + "step": 10122 + }, + { + "epoch": 0.5048877805486285, + "grad_norm": 0.48855378838250724, + "learning_rate": 2.582369512637302e-05, + "loss": 0.8775, + "step": 10123 + }, + { + "epoch": 0.5049376558603491, + "grad_norm": 0.43444300803603153, + "learning_rate": 2.5819658856130696e-05, + "loss": 0.8987, + "step": 10124 + }, + { + "epoch": 0.5049875311720698, + "grad_norm": 0.48445206162796944, + "learning_rate": 2.5815622564499752e-05, + "loss": 0.9057, + "step": 10125 + }, + { + "epoch": 0.5050374064837905, + "grad_norm": 0.700182472416773, + "learning_rate": 2.581158625158549e-05, + "loss": 0.8677, + "step": 10126 + }, + { + "epoch": 0.5050872817955112, + "grad_norm": 0.5831731271065815, + "learning_rate": 2.5807549917493258e-05, + "loss": 0.8808, + "step": 10127 + }, + { + "epoch": 0.505137157107232, + "grad_norm": 0.4084492106584195, + "learning_rate": 2.5803513562328362e-05, + "loss": 0.8452, + "step": 10128 + }, + { + "epoch": 0.5051870324189526, + "grad_norm": 0.5419779227479191, + "learning_rate": 2.5799477186196154e-05, + "loss": 0.8685, + "step": 10129 + }, + { + "epoch": 0.5052369077306733, + "grad_norm": 0.581088273112853, + "learning_rate": 2.579544078920193e-05, + "loss": 0.9124, + "step": 10130 + }, + { + "epoch": 0.505286783042394, + "grad_norm": 0.4771838645369845, + "learning_rate": 2.5791404371451045e-05, + "loss": 0.8541, + "step": 10131 + }, + { + "epoch": 0.5053366583541147, + "grad_norm": 0.558160027985764, + "learning_rate": 2.578736793304881e-05, + "loss": 0.8352, + "step": 10132 + }, + { + "epoch": 0.5053865336658354, + "grad_norm": 0.43892286561073834, + "learning_rate": 2.578333147410057e-05, + "loss": 0.8615, + "step": 10133 + }, + { + "epoch": 0.5054364089775562, + "grad_norm": 0.4344522971735811, + "learning_rate": 2.577929499471164e-05, + "loss": 0.8587, + "step": 10134 + }, + { + "epoch": 0.5054862842892768, + "grad_norm": 0.4493286886128816, + "learning_rate": 2.5775258494987357e-05, + "loss": 0.9023, + "step": 10135 + }, + { + "epoch": 0.5055361596009975, + "grad_norm": 0.41311260323959115, + "learning_rate": 2.5771221975033048e-05, + "loss": 0.8818, + "step": 10136 + }, + { + "epoch": 0.5055860349127183, + "grad_norm": 0.5968173051853217, + "learning_rate": 2.5767185434954056e-05, + "loss": 0.8542, + "step": 10137 + }, + { + "epoch": 0.5056359102244389, + "grad_norm": 0.42137403311166177, + "learning_rate": 2.5763148874855698e-05, + "loss": 0.8755, + "step": 10138 + }, + { + "epoch": 0.5056857855361596, + "grad_norm": 0.5105122112484372, + "learning_rate": 2.5759112294843307e-05, + "loss": 0.8796, + "step": 10139 + }, + { + "epoch": 0.5057356608478804, + "grad_norm": 0.4782156684863198, + "learning_rate": 2.5755075695022224e-05, + "loss": 0.8831, + "step": 10140 + }, + { + "epoch": 0.505785536159601, + "grad_norm": 0.4322359865882003, + "learning_rate": 2.575103907549778e-05, + "loss": 0.8723, + "step": 10141 + }, + { + "epoch": 0.5058354114713217, + "grad_norm": 0.5740624411896794, + "learning_rate": 2.5747002436375306e-05, + "loss": 0.9129, + "step": 10142 + }, + { + "epoch": 0.5058852867830423, + "grad_norm": 0.42568535954896347, + "learning_rate": 2.5742965777760135e-05, + "loss": 0.8957, + "step": 10143 + }, + { + "epoch": 0.5059351620947631, + "grad_norm": 0.46192826028856265, + "learning_rate": 2.573892909975761e-05, + "loss": 0.8822, + "step": 10144 + }, + { + "epoch": 0.5059850374064838, + "grad_norm": 0.426918134760228, + "learning_rate": 2.5734892402473053e-05, + "loss": 0.8859, + "step": 10145 + }, + { + "epoch": 0.5060349127182044, + "grad_norm": 0.4901452157828847, + "learning_rate": 2.5730855686011816e-05, + "loss": 0.9276, + "step": 10146 + }, + { + "epoch": 0.5060847880299252, + "grad_norm": 0.4518534750957254, + "learning_rate": 2.572681895047922e-05, + "loss": 0.9367, + "step": 10147 + }, + { + "epoch": 0.5061346633416459, + "grad_norm": 0.489284281065629, + "learning_rate": 2.5722782195980616e-05, + "loss": 0.8688, + "step": 10148 + }, + { + "epoch": 0.5061845386533665, + "grad_norm": 0.7831481784921959, + "learning_rate": 2.571874542262133e-05, + "loss": 0.9028, + "step": 10149 + }, + { + "epoch": 0.5062344139650873, + "grad_norm": 0.43888837974607287, + "learning_rate": 2.5714708630506706e-05, + "loss": 0.9034, + "step": 10150 + }, + { + "epoch": 0.506284289276808, + "grad_norm": 0.6218342026949054, + "learning_rate": 2.5710671819742083e-05, + "loss": 0.9387, + "step": 10151 + }, + { + "epoch": 0.5063341645885286, + "grad_norm": 0.6625421010189877, + "learning_rate": 2.5706634990432788e-05, + "loss": 0.8902, + "step": 10152 + }, + { + "epoch": 0.5063840399002494, + "grad_norm": 0.4133319766550931, + "learning_rate": 2.5702598142684175e-05, + "loss": 0.86, + "step": 10153 + }, + { + "epoch": 0.5064339152119701, + "grad_norm": 0.5635922903762, + "learning_rate": 2.5698561276601575e-05, + "loss": 0.9403, + "step": 10154 + }, + { + "epoch": 0.5064837905236907, + "grad_norm": 0.42358573036438407, + "learning_rate": 2.5694524392290338e-05, + "loss": 0.8864, + "step": 10155 + }, + { + "epoch": 0.5065336658354115, + "grad_norm": 0.4969639052475649, + "learning_rate": 2.5690487489855784e-05, + "loss": 0.8812, + "step": 10156 + }, + { + "epoch": 0.5065835411471322, + "grad_norm": 0.41964189422422776, + "learning_rate": 2.568645056940328e-05, + "loss": 0.8545, + "step": 10157 + }, + { + "epoch": 0.5066334164588528, + "grad_norm": 0.4541218583099461, + "learning_rate": 2.568241363103815e-05, + "loss": 0.8988, + "step": 10158 + }, + { + "epoch": 0.5066832917705736, + "grad_norm": 0.4959250366083207, + "learning_rate": 2.5678376674865744e-05, + "loss": 0.9089, + "step": 10159 + }, + { + "epoch": 0.5067331670822942, + "grad_norm": 0.3930560596858817, + "learning_rate": 2.56743397009914e-05, + "loss": 0.8691, + "step": 10160 + }, + { + "epoch": 0.506783042394015, + "grad_norm": 0.7073286953762732, + "learning_rate": 2.5670302709520465e-05, + "loss": 0.8841, + "step": 10161 + }, + { + "epoch": 0.5068329177057357, + "grad_norm": 0.448696498479769, + "learning_rate": 2.5666265700558284e-05, + "loss": 0.8925, + "step": 10162 + }, + { + "epoch": 0.5068827930174563, + "grad_norm": 0.43804339149721333, + "learning_rate": 2.5662228674210192e-05, + "loss": 0.897, + "step": 10163 + }, + { + "epoch": 0.506932668329177, + "grad_norm": 0.4005154272832978, + "learning_rate": 2.5658191630581545e-05, + "loss": 0.8751, + "step": 10164 + }, + { + "epoch": 0.5069825436408978, + "grad_norm": 0.5621243493439894, + "learning_rate": 2.5654154569777678e-05, + "loss": 0.9026, + "step": 10165 + }, + { + "epoch": 0.5070324189526184, + "grad_norm": 0.4302157291463826, + "learning_rate": 2.5650117491903942e-05, + "loss": 0.8896, + "step": 10166 + }, + { + "epoch": 0.5070822942643392, + "grad_norm": 0.5394697848640821, + "learning_rate": 2.5646080397065675e-05, + "loss": 0.9504, + "step": 10167 + }, + { + "epoch": 0.5071321695760599, + "grad_norm": 1.4094251130210835, + "learning_rate": 2.564204328536824e-05, + "loss": 0.879, + "step": 10168 + }, + { + "epoch": 0.5071820448877805, + "grad_norm": 0.5162096521662538, + "learning_rate": 2.5638006156916967e-05, + "loss": 0.8079, + "step": 10169 + }, + { + "epoch": 0.5072319201995013, + "grad_norm": 0.44863236918575083, + "learning_rate": 2.563396901181721e-05, + "loss": 0.8742, + "step": 10170 + }, + { + "epoch": 0.507281795511222, + "grad_norm": 0.8190548167685955, + "learning_rate": 2.562993185017431e-05, + "loss": 0.8792, + "step": 10171 + }, + { + "epoch": 0.5073316708229426, + "grad_norm": 0.4021160400040642, + "learning_rate": 2.5625894672093625e-05, + "loss": 0.8961, + "step": 10172 + }, + { + "epoch": 0.5073815461346634, + "grad_norm": 0.4769903844885657, + "learning_rate": 2.5621857477680506e-05, + "loss": 0.9315, + "step": 10173 + }, + { + "epoch": 0.5074314214463841, + "grad_norm": 0.42541193050341114, + "learning_rate": 2.561782026704028e-05, + "loss": 0.8187, + "step": 10174 + }, + { + "epoch": 0.5074812967581047, + "grad_norm": 0.3974991916403924, + "learning_rate": 2.5613783040278323e-05, + "loss": 0.8866, + "step": 10175 + }, + { + "epoch": 0.5075311720698255, + "grad_norm": 0.4252414171331499, + "learning_rate": 2.560974579749996e-05, + "loss": 0.9145, + "step": 10176 + }, + { + "epoch": 0.5075810473815461, + "grad_norm": 0.4782689515561361, + "learning_rate": 2.5605708538810563e-05, + "loss": 0.8747, + "step": 10177 + }, + { + "epoch": 0.5076309226932668, + "grad_norm": 0.43826743969675114, + "learning_rate": 2.560167126431547e-05, + "loss": 0.865, + "step": 10178 + }, + { + "epoch": 0.5076807980049876, + "grad_norm": 0.3875816838166756, + "learning_rate": 2.559763397412004e-05, + "loss": 0.915, + "step": 10179 + }, + { + "epoch": 0.5077306733167082, + "grad_norm": 0.7488972820089985, + "learning_rate": 2.5593596668329606e-05, + "loss": 0.9303, + "step": 10180 + }, + { + "epoch": 0.5077805486284289, + "grad_norm": 0.41823846709546875, + "learning_rate": 2.5589559347049542e-05, + "loss": 0.8802, + "step": 10181 + }, + { + "epoch": 0.5078304239401497, + "grad_norm": 0.49015349332600316, + "learning_rate": 2.5585522010385187e-05, + "loss": 0.925, + "step": 10182 + }, + { + "epoch": 0.5078802992518703, + "grad_norm": 0.4857291458277585, + "learning_rate": 2.55814846584419e-05, + "loss": 0.9132, + "step": 10183 + }, + { + "epoch": 0.507930174563591, + "grad_norm": 0.4212343778635187, + "learning_rate": 2.557744729132503e-05, + "loss": 0.9096, + "step": 10184 + }, + { + "epoch": 0.5079800498753118, + "grad_norm": 0.5003351639489881, + "learning_rate": 2.5573409909139933e-05, + "loss": 0.8866, + "step": 10185 + }, + { + "epoch": 0.5080299251870324, + "grad_norm": 0.4199856750549239, + "learning_rate": 2.556937251199196e-05, + "loss": 0.8418, + "step": 10186 + }, + { + "epoch": 0.5080798004987531, + "grad_norm": 0.4587822576994957, + "learning_rate": 2.5565335099986466e-05, + "loss": 0.8642, + "step": 10187 + }, + { + "epoch": 0.5081296758104739, + "grad_norm": 0.5596585841754986, + "learning_rate": 2.5561297673228807e-05, + "loss": 0.9158, + "step": 10188 + }, + { + "epoch": 0.5081795511221945, + "grad_norm": 0.5344388372140808, + "learning_rate": 2.5557260231824333e-05, + "loss": 0.9015, + "step": 10189 + }, + { + "epoch": 0.5082294264339152, + "grad_norm": 0.43051791019349767, + "learning_rate": 2.5553222775878403e-05, + "loss": 0.8614, + "step": 10190 + }, + { + "epoch": 0.508279301745636, + "grad_norm": 0.4337028714013166, + "learning_rate": 2.554918530549637e-05, + "loss": 0.9214, + "step": 10191 + }, + { + "epoch": 0.5083291770573566, + "grad_norm": 0.4458047975753229, + "learning_rate": 2.55451478207836e-05, + "loss": 0.9236, + "step": 10192 + }, + { + "epoch": 0.5083790523690773, + "grad_norm": 0.4718522666360397, + "learning_rate": 2.5541110321845434e-05, + "loss": 0.8866, + "step": 10193 + }, + { + "epoch": 0.5084289276807981, + "grad_norm": 0.40658050155300623, + "learning_rate": 2.5537072808787245e-05, + "loss": 0.8586, + "step": 10194 + }, + { + "epoch": 0.5084788029925187, + "grad_norm": 0.5730125948066984, + "learning_rate": 2.553303528171437e-05, + "loss": 0.8905, + "step": 10195 + }, + { + "epoch": 0.5085286783042394, + "grad_norm": 0.36120736291108657, + "learning_rate": 2.552899774073218e-05, + "loss": 0.8384, + "step": 10196 + }, + { + "epoch": 0.50857855361596, + "grad_norm": 0.4220143070056121, + "learning_rate": 2.552496018594604e-05, + "loss": 0.8898, + "step": 10197 + }, + { + "epoch": 0.5086284289276808, + "grad_norm": 0.4209102165849165, + "learning_rate": 2.552092261746129e-05, + "loss": 0.885, + "step": 10198 + }, + { + "epoch": 0.5086783042394015, + "grad_norm": 0.6534444317227017, + "learning_rate": 2.5516885035383302e-05, + "loss": 0.886, + "step": 10199 + }, + { + "epoch": 0.5087281795511222, + "grad_norm": 0.5753421993715192, + "learning_rate": 2.551284743981743e-05, + "loss": 0.8695, + "step": 10200 + }, + { + "epoch": 0.5087780548628429, + "grad_norm": 0.42630336396312957, + "learning_rate": 2.5508809830869028e-05, + "loss": 0.8815, + "step": 10201 + }, + { + "epoch": 0.5088279301745636, + "grad_norm": 0.4651605355476187, + "learning_rate": 2.5504772208643467e-05, + "loss": 0.8955, + "step": 10202 + }, + { + "epoch": 0.5088778054862843, + "grad_norm": 0.40198027880125414, + "learning_rate": 2.55007345732461e-05, + "loss": 0.8956, + "step": 10203 + }, + { + "epoch": 0.508927680798005, + "grad_norm": 0.43340074085328406, + "learning_rate": 2.5496696924782286e-05, + "loss": 0.8936, + "step": 10204 + }, + { + "epoch": 0.5089775561097257, + "grad_norm": 0.3909826146361352, + "learning_rate": 2.5492659263357392e-05, + "loss": 0.8636, + "step": 10205 + }, + { + "epoch": 0.5090274314214464, + "grad_norm": 0.4237215093341863, + "learning_rate": 2.548862158907677e-05, + "loss": 0.8674, + "step": 10206 + }, + { + "epoch": 0.5090773067331671, + "grad_norm": 0.4429550758392206, + "learning_rate": 2.5484583902045796e-05, + "loss": 0.8739, + "step": 10207 + }, + { + "epoch": 0.5091271820448878, + "grad_norm": 0.39315522081110993, + "learning_rate": 2.548054620236981e-05, + "loss": 0.8773, + "step": 10208 + }, + { + "epoch": 0.5091770573566085, + "grad_norm": 0.41525990212809283, + "learning_rate": 2.5476508490154193e-05, + "loss": 0.8539, + "step": 10209 + }, + { + "epoch": 0.5092269326683292, + "grad_norm": 0.4102217215514551, + "learning_rate": 2.5472470765504303e-05, + "loss": 0.8556, + "step": 10210 + }, + { + "epoch": 0.5092768079800499, + "grad_norm": 0.5055078225270959, + "learning_rate": 2.5468433028525497e-05, + "loss": 0.8848, + "step": 10211 + }, + { + "epoch": 0.5093266832917706, + "grad_norm": 0.4876409234043282, + "learning_rate": 2.5464395279323137e-05, + "loss": 0.8628, + "step": 10212 + }, + { + "epoch": 0.5093765586034913, + "grad_norm": 0.4467787677553884, + "learning_rate": 2.5460357518002592e-05, + "loss": 0.8861, + "step": 10213 + }, + { + "epoch": 0.5094264339152119, + "grad_norm": 0.846396239170372, + "learning_rate": 2.5456319744669226e-05, + "loss": 0.8869, + "step": 10214 + }, + { + "epoch": 0.5094763092269327, + "grad_norm": 0.39335230342588323, + "learning_rate": 2.5452281959428397e-05, + "loss": 0.8854, + "step": 10215 + }, + { + "epoch": 0.5095261845386534, + "grad_norm": 0.512592260581832, + "learning_rate": 2.5448244162385478e-05, + "loss": 0.8669, + "step": 10216 + }, + { + "epoch": 0.509576059850374, + "grad_norm": 0.44485655773732447, + "learning_rate": 2.5444206353645823e-05, + "loss": 0.9051, + "step": 10217 + }, + { + "epoch": 0.5096259351620948, + "grad_norm": 0.6856019477939188, + "learning_rate": 2.54401685333148e-05, + "loss": 0.8159, + "step": 10218 + }, + { + "epoch": 0.5096758104738155, + "grad_norm": 0.43760945048097044, + "learning_rate": 2.543613070149778e-05, + "loss": 0.8738, + "step": 10219 + }, + { + "epoch": 0.5097256857855361, + "grad_norm": 0.41866627250888583, + "learning_rate": 2.5432092858300127e-05, + "loss": 0.8959, + "step": 10220 + }, + { + "epoch": 0.5097755610972569, + "grad_norm": 0.5418976451340098, + "learning_rate": 2.54280550038272e-05, + "loss": 0.9038, + "step": 10221 + }, + { + "epoch": 0.5098254364089776, + "grad_norm": 0.4477104853850135, + "learning_rate": 2.5424017138184368e-05, + "loss": 0.8691, + "step": 10222 + }, + { + "epoch": 0.5098753117206982, + "grad_norm": 0.45377456620232853, + "learning_rate": 2.5419979261477007e-05, + "loss": 0.8615, + "step": 10223 + }, + { + "epoch": 0.509925187032419, + "grad_norm": 0.4991872725606359, + "learning_rate": 2.5415941373810463e-05, + "loss": 0.9169, + "step": 10224 + }, + { + "epoch": 0.5099750623441397, + "grad_norm": 0.4559414902683894, + "learning_rate": 2.5411903475290127e-05, + "loss": 0.8607, + "step": 10225 + }, + { + "epoch": 0.5100249376558603, + "grad_norm": 0.40799375629287665, + "learning_rate": 2.5407865566021343e-05, + "loss": 0.8916, + "step": 10226 + }, + { + "epoch": 0.5100748129675811, + "grad_norm": 0.4432580929423835, + "learning_rate": 2.54038276461095e-05, + "loss": 0.8713, + "step": 10227 + }, + { + "epoch": 0.5101246882793018, + "grad_norm": 0.73635212959335, + "learning_rate": 2.539978971565995e-05, + "loss": 0.8561, + "step": 10228 + }, + { + "epoch": 0.5101745635910224, + "grad_norm": 0.5049531584092244, + "learning_rate": 2.5395751774778066e-05, + "loss": 0.8475, + "step": 10229 + }, + { + "epoch": 0.5102244389027432, + "grad_norm": 0.5593911426795367, + "learning_rate": 2.5391713823569215e-05, + "loss": 0.8746, + "step": 10230 + }, + { + "epoch": 0.5102743142144638, + "grad_norm": 0.420926629529807, + "learning_rate": 2.5387675862138773e-05, + "loss": 0.9438, + "step": 10231 + }, + { + "epoch": 0.5103241895261845, + "grad_norm": 0.48373979108321913, + "learning_rate": 2.5383637890592094e-05, + "loss": 0.8834, + "step": 10232 + }, + { + "epoch": 0.5103740648379053, + "grad_norm": 0.46215178662659984, + "learning_rate": 2.5379599909034564e-05, + "loss": 0.8981, + "step": 10233 + }, + { + "epoch": 0.5104239401496259, + "grad_norm": 0.4724169196941852, + "learning_rate": 2.537556191757154e-05, + "loss": 0.9329, + "step": 10234 + }, + { + "epoch": 0.5104738154613466, + "grad_norm": 0.42149890766199816, + "learning_rate": 2.5371523916308393e-05, + "loss": 0.8861, + "step": 10235 + }, + { + "epoch": 0.5105236907730674, + "grad_norm": 0.5059322264667578, + "learning_rate": 2.53674859053505e-05, + "loss": 0.857, + "step": 10236 + }, + { + "epoch": 0.510573566084788, + "grad_norm": 0.39237511928461205, + "learning_rate": 2.5363447884803228e-05, + "loss": 0.8702, + "step": 10237 + }, + { + "epoch": 0.5106234413965087, + "grad_norm": 0.48447405574392644, + "learning_rate": 2.5359409854771943e-05, + "loss": 0.8888, + "step": 10238 + }, + { + "epoch": 0.5106733167082295, + "grad_norm": 4.2567105532840195, + "learning_rate": 2.5355371815362018e-05, + "loss": 0.875, + "step": 10239 + }, + { + "epoch": 0.5107231920199501, + "grad_norm": 0.4479467805591833, + "learning_rate": 2.535133376667883e-05, + "loss": 0.8783, + "step": 10240 + }, + { + "epoch": 0.5107730673316708, + "grad_norm": 0.48052591705644293, + "learning_rate": 2.5347295708827738e-05, + "loss": 0.9507, + "step": 10241 + }, + { + "epoch": 0.5108229426433916, + "grad_norm": 0.40557799765953123, + "learning_rate": 2.5343257641914125e-05, + "loss": 0.8347, + "step": 10242 + }, + { + "epoch": 0.5108728179551122, + "grad_norm": 0.5433251887828274, + "learning_rate": 2.5339219566043355e-05, + "loss": 0.9204, + "step": 10243 + }, + { + "epoch": 0.5109226932668329, + "grad_norm": 0.6541952154224249, + "learning_rate": 2.5335181481320806e-05, + "loss": 0.8641, + "step": 10244 + }, + { + "epoch": 0.5109725685785537, + "grad_norm": 0.5065401934300183, + "learning_rate": 2.5331143387851846e-05, + "loss": 0.9018, + "step": 10245 + }, + { + "epoch": 0.5110224438902743, + "grad_norm": 0.8084212140562752, + "learning_rate": 2.5327105285741842e-05, + "loss": 0.862, + "step": 10246 + }, + { + "epoch": 0.511072319201995, + "grad_norm": 0.3749522832720817, + "learning_rate": 2.5323067175096175e-05, + "loss": 0.9085, + "step": 10247 + }, + { + "epoch": 0.5111221945137157, + "grad_norm": 0.4200682368351737, + "learning_rate": 2.5319029056020217e-05, + "loss": 0.8902, + "step": 10248 + }, + { + "epoch": 0.5111720698254364, + "grad_norm": 0.4908590235470127, + "learning_rate": 2.5314990928619337e-05, + "loss": 0.8919, + "step": 10249 + }, + { + "epoch": 0.5112219451371571, + "grad_norm": 0.41363205763918914, + "learning_rate": 2.5310952792998904e-05, + "loss": 0.8928, + "step": 10250 + }, + { + "epoch": 0.5112718204488778, + "grad_norm": 0.40125092955657293, + "learning_rate": 2.5306914649264308e-05, + "loss": 0.873, + "step": 10251 + }, + { + "epoch": 0.5113216957605985, + "grad_norm": 0.4449417980913692, + "learning_rate": 2.53028764975209e-05, + "loss": 0.8737, + "step": 10252 + }, + { + "epoch": 0.5113715710723192, + "grad_norm": 0.4068915039306514, + "learning_rate": 2.529883833787407e-05, + "loss": 0.8991, + "step": 10253 + }, + { + "epoch": 0.5114214463840399, + "grad_norm": 0.6030816911217268, + "learning_rate": 2.529480017042919e-05, + "loss": 0.8637, + "step": 10254 + }, + { + "epoch": 0.5114713216957606, + "grad_norm": 0.3819427110263383, + "learning_rate": 2.529076199529163e-05, + "loss": 0.8762, + "step": 10255 + }, + { + "epoch": 0.5115211970074813, + "grad_norm": 0.4256904701306083, + "learning_rate": 2.5286723812566766e-05, + "loss": 0.8546, + "step": 10256 + }, + { + "epoch": 0.511571072319202, + "grad_norm": 0.35844994863489554, + "learning_rate": 2.528268562235997e-05, + "loss": 0.917, + "step": 10257 + }, + { + "epoch": 0.5116209476309227, + "grad_norm": 0.39094738426977566, + "learning_rate": 2.5278647424776626e-05, + "loss": 0.8961, + "step": 10258 + }, + { + "epoch": 0.5116708229426434, + "grad_norm": 0.4242336245396221, + "learning_rate": 2.527460921992209e-05, + "loss": 0.9509, + "step": 10259 + }, + { + "epoch": 0.5117206982543641, + "grad_norm": 0.8471656203052557, + "learning_rate": 2.5270571007901763e-05, + "loss": 0.8977, + "step": 10260 + }, + { + "epoch": 0.5117705735660848, + "grad_norm": 0.44350308365206, + "learning_rate": 2.5266532788820994e-05, + "loss": 0.8868, + "step": 10261 + }, + { + "epoch": 0.5118204488778055, + "grad_norm": 0.40686320948486876, + "learning_rate": 2.526249456278518e-05, + "loss": 0.8311, + "step": 10262 + }, + { + "epoch": 0.5118703241895262, + "grad_norm": 0.3754814551336265, + "learning_rate": 2.5258456329899684e-05, + "loss": 0.8909, + "step": 10263 + }, + { + "epoch": 0.5119201995012469, + "grad_norm": 0.43268427667284004, + "learning_rate": 2.525441809026989e-05, + "loss": 0.8419, + "step": 10264 + }, + { + "epoch": 0.5119700748129676, + "grad_norm": 0.4357524045342433, + "learning_rate": 2.525037984400116e-05, + "loss": 0.8804, + "step": 10265 + }, + { + "epoch": 0.5120199501246883, + "grad_norm": 0.5858611307685885, + "learning_rate": 2.524634159119889e-05, + "loss": 0.825, + "step": 10266 + }, + { + "epoch": 0.512069825436409, + "grad_norm": 0.37040975562373385, + "learning_rate": 2.5242303331968443e-05, + "loss": 0.8689, + "step": 10267 + }, + { + "epoch": 0.5121197007481296, + "grad_norm": 0.6042301774042224, + "learning_rate": 2.5238265066415202e-05, + "loss": 0.924, + "step": 10268 + }, + { + "epoch": 0.5121695760598504, + "grad_norm": 0.44444262645457894, + "learning_rate": 2.5234226794644533e-05, + "loss": 0.8795, + "step": 10269 + }, + { + "epoch": 0.5122194513715711, + "grad_norm": 0.47325925198783503, + "learning_rate": 2.5230188516761826e-05, + "loss": 0.9118, + "step": 10270 + }, + { + "epoch": 0.5122693266832917, + "grad_norm": 2.3513738322125395, + "learning_rate": 2.5226150232872453e-05, + "loss": 0.8495, + "step": 10271 + }, + { + "epoch": 0.5123192019950125, + "grad_norm": 0.5678753736490805, + "learning_rate": 2.5222111943081788e-05, + "loss": 0.8717, + "step": 10272 + }, + { + "epoch": 0.5123690773067332, + "grad_norm": 0.40618668860546797, + "learning_rate": 2.521807364749521e-05, + "loss": 0.8704, + "step": 10273 + }, + { + "epoch": 0.5124189526184538, + "grad_norm": 0.445720446457316, + "learning_rate": 2.5214035346218102e-05, + "loss": 0.8756, + "step": 10274 + }, + { + "epoch": 0.5124688279301746, + "grad_norm": 0.40237703167342836, + "learning_rate": 2.5209997039355837e-05, + "loss": 0.9104, + "step": 10275 + }, + { + "epoch": 0.5125187032418953, + "grad_norm": 0.4625701892795247, + "learning_rate": 2.520595872701379e-05, + "loss": 0.8522, + "step": 10276 + }, + { + "epoch": 0.5125685785536159, + "grad_norm": 0.4784192712575899, + "learning_rate": 2.5201920409297347e-05, + "loss": 0.8608, + "step": 10277 + }, + { + "epoch": 0.5126184538653367, + "grad_norm": 0.4765037844946053, + "learning_rate": 2.519788208631188e-05, + "loss": 0.8835, + "step": 10278 + }, + { + "epoch": 0.5126683291770574, + "grad_norm": 0.43476409967147406, + "learning_rate": 2.5193843758162766e-05, + "loss": 0.9102, + "step": 10279 + }, + { + "epoch": 0.512718204488778, + "grad_norm": 0.46020329561311224, + "learning_rate": 2.5189805424955394e-05, + "loss": 0.8808, + "step": 10280 + }, + { + "epoch": 0.5127680798004988, + "grad_norm": 0.41933946820737644, + "learning_rate": 2.5185767086795126e-05, + "loss": 0.8664, + "step": 10281 + }, + { + "epoch": 0.5128179551122195, + "grad_norm": 0.4483456854266766, + "learning_rate": 2.5181728743787352e-05, + "loss": 0.8979, + "step": 10282 + }, + { + "epoch": 0.5128678304239401, + "grad_norm": 0.3866508693869616, + "learning_rate": 2.517769039603744e-05, + "loss": 0.8805, + "step": 10283 + }, + { + "epoch": 0.5129177057356609, + "grad_norm": 0.5863304278687252, + "learning_rate": 2.5173652043650787e-05, + "loss": 0.896, + "step": 10284 + }, + { + "epoch": 0.5129675810473815, + "grad_norm": 0.4986016538378167, + "learning_rate": 2.5169613686732757e-05, + "loss": 0.8586, + "step": 10285 + }, + { + "epoch": 0.5130174563591022, + "grad_norm": 0.4144942155501702, + "learning_rate": 2.5165575325388733e-05, + "loss": 0.8861, + "step": 10286 + }, + { + "epoch": 0.513067331670823, + "grad_norm": 0.46327611087553, + "learning_rate": 2.5161536959724096e-05, + "loss": 0.8881, + "step": 10287 + }, + { + "epoch": 0.5131172069825436, + "grad_norm": 0.43905637852412477, + "learning_rate": 2.515749858984423e-05, + "loss": 0.8675, + "step": 10288 + }, + { + "epoch": 0.5131670822942643, + "grad_norm": 0.5119774069734652, + "learning_rate": 2.51534602158545e-05, + "loss": 0.8684, + "step": 10289 + }, + { + "epoch": 0.5132169576059851, + "grad_norm": 0.4071621262245925, + "learning_rate": 2.51494218378603e-05, + "loss": 0.8907, + "step": 10290 + }, + { + "epoch": 0.5132668329177057, + "grad_norm": 0.3947810567223891, + "learning_rate": 2.5145383455967e-05, + "loss": 0.8594, + "step": 10291 + }, + { + "epoch": 0.5133167082294264, + "grad_norm": 0.45576939230704083, + "learning_rate": 2.5141345070279986e-05, + "loss": 0.8436, + "step": 10292 + }, + { + "epoch": 0.5133665835411472, + "grad_norm": 0.43692053201616954, + "learning_rate": 2.5137306680904643e-05, + "loss": 0.8811, + "step": 10293 + }, + { + "epoch": 0.5134164588528678, + "grad_norm": 0.5016785851880368, + "learning_rate": 2.513326828794633e-05, + "loss": 0.8432, + "step": 10294 + }, + { + "epoch": 0.5134663341645885, + "grad_norm": 0.445595392084661, + "learning_rate": 2.5129229891510453e-05, + "loss": 0.856, + "step": 10295 + }, + { + "epoch": 0.5135162094763093, + "grad_norm": 0.4755342189932857, + "learning_rate": 2.5125191491702365e-05, + "loss": 0.9189, + "step": 10296 + }, + { + "epoch": 0.5135660847880299, + "grad_norm": 0.4508989905728512, + "learning_rate": 2.5121153088627474e-05, + "loss": 0.9235, + "step": 10297 + }, + { + "epoch": 0.5136159600997506, + "grad_norm": 0.4090096823219194, + "learning_rate": 2.511711468239114e-05, + "loss": 0.8608, + "step": 10298 + }, + { + "epoch": 0.5136658354114714, + "grad_norm": 0.46544857956350827, + "learning_rate": 2.5113076273098756e-05, + "loss": 0.9487, + "step": 10299 + }, + { + "epoch": 0.513715710723192, + "grad_norm": 0.445736489784552, + "learning_rate": 2.5109037860855696e-05, + "loss": 0.8646, + "step": 10300 + }, + { + "epoch": 0.5137655860349127, + "grad_norm": 0.47798976731668263, + "learning_rate": 2.510499944576734e-05, + "loss": 0.8553, + "step": 10301 + }, + { + "epoch": 0.5138154613466334, + "grad_norm": 0.5859682660425938, + "learning_rate": 2.510096102793907e-05, + "loss": 0.8725, + "step": 10302 + }, + { + "epoch": 0.5138653366583541, + "grad_norm": 0.43867008949106334, + "learning_rate": 2.509692260747627e-05, + "loss": 0.9179, + "step": 10303 + }, + { + "epoch": 0.5139152119700748, + "grad_norm": 0.45952779720277687, + "learning_rate": 2.5092884184484323e-05, + "loss": 0.8572, + "step": 10304 + }, + { + "epoch": 0.5139650872817955, + "grad_norm": 0.43110590870483784, + "learning_rate": 2.5088845759068592e-05, + "loss": 0.8728, + "step": 10305 + }, + { + "epoch": 0.5140149625935162, + "grad_norm": 0.41495757993368915, + "learning_rate": 2.5084807331334474e-05, + "loss": 0.8242, + "step": 10306 + }, + { + "epoch": 0.5140648379052369, + "grad_norm": 0.41032815391416044, + "learning_rate": 2.508076890138735e-05, + "loss": 0.8558, + "step": 10307 + }, + { + "epoch": 0.5141147132169576, + "grad_norm": 0.404518175257015, + "learning_rate": 2.50767304693326e-05, + "loss": 0.9341, + "step": 10308 + }, + { + "epoch": 0.5141645885286783, + "grad_norm": 0.43293093623999224, + "learning_rate": 2.50726920352756e-05, + "loss": 0.9234, + "step": 10309 + }, + { + "epoch": 0.514214463840399, + "grad_norm": 0.5790544075006743, + "learning_rate": 2.5068653599321734e-05, + "loss": 0.8841, + "step": 10310 + }, + { + "epoch": 0.5142643391521197, + "grad_norm": 0.4426604425449112, + "learning_rate": 2.5064615161576378e-05, + "loss": 0.9239, + "step": 10311 + }, + { + "epoch": 0.5143142144638404, + "grad_norm": 0.4446755017430959, + "learning_rate": 2.5060576722144925e-05, + "loss": 0.8925, + "step": 10312 + }, + { + "epoch": 0.5143640897755611, + "grad_norm": 0.42086098245671055, + "learning_rate": 2.5056538281132746e-05, + "loss": 0.8904, + "step": 10313 + }, + { + "epoch": 0.5144139650872818, + "grad_norm": 0.3840069815701339, + "learning_rate": 2.5052499838645232e-05, + "loss": 0.8159, + "step": 10314 + }, + { + "epoch": 0.5144638403990025, + "grad_norm": 0.40457986522577816, + "learning_rate": 2.5048461394787748e-05, + "loss": 0.8712, + "step": 10315 + }, + { + "epoch": 0.5145137157107232, + "grad_norm": 0.47579589923729415, + "learning_rate": 2.5044422949665697e-05, + "loss": 0.9182, + "step": 10316 + }, + { + "epoch": 0.5145635910224439, + "grad_norm": 0.43782863208477674, + "learning_rate": 2.5040384503384445e-05, + "loss": 0.8842, + "step": 10317 + }, + { + "epoch": 0.5146134663341646, + "grad_norm": 0.392013315101695, + "learning_rate": 2.503634605604937e-05, + "loss": 0.8666, + "step": 10318 + }, + { + "epoch": 0.5146633416458853, + "grad_norm": 0.45894883446306795, + "learning_rate": 2.503230760776587e-05, + "loss": 0.8594, + "step": 10319 + }, + { + "epoch": 0.514713216957606, + "grad_norm": 0.403777999480608, + "learning_rate": 2.502826915863931e-05, + "loss": 0.8627, + "step": 10320 + }, + { + "epoch": 0.5147630922693267, + "grad_norm": 0.663031042187043, + "learning_rate": 2.502423070877509e-05, + "loss": 0.8997, + "step": 10321 + }, + { + "epoch": 0.5148129675810473, + "grad_norm": 0.9850998557585767, + "learning_rate": 2.502019225827857e-05, + "loss": 0.8983, + "step": 10322 + }, + { + "epoch": 0.5148628428927681, + "grad_norm": 0.4413874318515258, + "learning_rate": 2.501615380725515e-05, + "loss": 0.8979, + "step": 10323 + }, + { + "epoch": 0.5149127182044888, + "grad_norm": 0.5765009039435773, + "learning_rate": 2.5012115355810194e-05, + "loss": 0.8832, + "step": 10324 + }, + { + "epoch": 0.5149625935162094, + "grad_norm": 0.425633048914915, + "learning_rate": 2.50080769040491e-05, + "loss": 0.8598, + "step": 10325 + }, + { + "epoch": 0.5150124688279302, + "grad_norm": 0.47826532255991466, + "learning_rate": 2.500403845207724e-05, + "loss": 0.9448, + "step": 10326 + }, + { + "epoch": 0.5150623441396509, + "grad_norm": 0.3901566307687443, + "learning_rate": 2.5e-05, + "loss": 0.8389, + "step": 10327 + }, + { + "epoch": 0.5151122194513715, + "grad_norm": 0.7235153666901623, + "learning_rate": 2.499596154792276e-05, + "loss": 0.8927, + "step": 10328 + }, + { + "epoch": 0.5151620947630923, + "grad_norm": 0.4042949012710996, + "learning_rate": 2.499192309595091e-05, + "loss": 0.8558, + "step": 10329 + }, + { + "epoch": 0.515211970074813, + "grad_norm": 0.4969696479067274, + "learning_rate": 2.498788464418981e-05, + "loss": 0.8819, + "step": 10330 + }, + { + "epoch": 0.5152618453865336, + "grad_norm": 0.4083512933458416, + "learning_rate": 2.498384619274486e-05, + "loss": 0.8602, + "step": 10331 + }, + { + "epoch": 0.5153117206982544, + "grad_norm": 0.4300350302912043, + "learning_rate": 2.4979807741721432e-05, + "loss": 0.8675, + "step": 10332 + }, + { + "epoch": 0.5153615960099751, + "grad_norm": 0.4175222124964178, + "learning_rate": 2.4975769291224924e-05, + "loss": 0.921, + "step": 10333 + }, + { + "epoch": 0.5154114713216957, + "grad_norm": 0.5137533467035877, + "learning_rate": 2.4971730841360695e-05, + "loss": 0.8275, + "step": 10334 + }, + { + "epoch": 0.5154613466334165, + "grad_norm": 0.37903910784209377, + "learning_rate": 2.4967692392234134e-05, + "loss": 0.8806, + "step": 10335 + }, + { + "epoch": 0.5155112219451372, + "grad_norm": 0.49720354650081966, + "learning_rate": 2.4963653943950627e-05, + "loss": 0.8835, + "step": 10336 + }, + { + "epoch": 0.5155610972568578, + "grad_norm": 0.3949130764177295, + "learning_rate": 2.4959615496615568e-05, + "loss": 0.8821, + "step": 10337 + }, + { + "epoch": 0.5156109725685786, + "grad_norm": 0.4929430557676918, + "learning_rate": 2.4955577050334312e-05, + "loss": 0.8996, + "step": 10338 + }, + { + "epoch": 0.5156608478802992, + "grad_norm": 0.5212310231075266, + "learning_rate": 2.495153860521225e-05, + "loss": 0.8853, + "step": 10339 + }, + { + "epoch": 0.5157107231920199, + "grad_norm": 0.3726905697859985, + "learning_rate": 2.4947500161354777e-05, + "loss": 0.8988, + "step": 10340 + }, + { + "epoch": 0.5157605985037407, + "grad_norm": 0.4384421682725932, + "learning_rate": 2.494346171886726e-05, + "loss": 0.8649, + "step": 10341 + }, + { + "epoch": 0.5158104738154613, + "grad_norm": 0.42637841752094685, + "learning_rate": 2.4939423277855077e-05, + "loss": 0.8855, + "step": 10342 + }, + { + "epoch": 0.515860349127182, + "grad_norm": 0.36723112918882145, + "learning_rate": 2.493538483842362e-05, + "loss": 0.8358, + "step": 10343 + }, + { + "epoch": 0.5159102244389028, + "grad_norm": 0.43362539575163656, + "learning_rate": 2.4931346400678275e-05, + "loss": 0.8961, + "step": 10344 + }, + { + "epoch": 0.5159600997506234, + "grad_norm": 0.4424661529440672, + "learning_rate": 2.4927307964724404e-05, + "loss": 0.8612, + "step": 10345 + }, + { + "epoch": 0.5160099750623441, + "grad_norm": 0.5705750716626958, + "learning_rate": 2.4923269530667406e-05, + "loss": 0.8439, + "step": 10346 + }, + { + "epoch": 0.5160598503740649, + "grad_norm": 0.41745314199775146, + "learning_rate": 2.491923109861265e-05, + "loss": 0.8993, + "step": 10347 + }, + { + "epoch": 0.5161097256857855, + "grad_norm": 0.42594760278185695, + "learning_rate": 2.4915192668665528e-05, + "loss": 0.8717, + "step": 10348 + }, + { + "epoch": 0.5161596009975062, + "grad_norm": 0.7136271223605801, + "learning_rate": 2.491115424093141e-05, + "loss": 0.927, + "step": 10349 + }, + { + "epoch": 0.516209476309227, + "grad_norm": 0.7105804661226958, + "learning_rate": 2.4907115815515683e-05, + "loss": 0.8935, + "step": 10350 + }, + { + "epoch": 0.5162593516209476, + "grad_norm": 0.40641387143706154, + "learning_rate": 2.4903077392523737e-05, + "loss": 0.871, + "step": 10351 + }, + { + "epoch": 0.5163092269326683, + "grad_norm": 0.6906499321007648, + "learning_rate": 2.4899038972060933e-05, + "loss": 0.9272, + "step": 10352 + }, + { + "epoch": 0.5163591022443891, + "grad_norm": 0.6474357241703879, + "learning_rate": 2.4895000554232663e-05, + "loss": 0.8689, + "step": 10353 + }, + { + "epoch": 0.5164089775561097, + "grad_norm": 0.42669633762211917, + "learning_rate": 2.4890962139144307e-05, + "loss": 0.8975, + "step": 10354 + }, + { + "epoch": 0.5164588528678304, + "grad_norm": 0.5384587961387363, + "learning_rate": 2.4886923726901253e-05, + "loss": 0.8656, + "step": 10355 + }, + { + "epoch": 0.5165087281795511, + "grad_norm": 0.44710169719803966, + "learning_rate": 2.4882885317608865e-05, + "loss": 0.9299, + "step": 10356 + }, + { + "epoch": 0.5165586034912718, + "grad_norm": 0.5022908313693922, + "learning_rate": 2.487884691137253e-05, + "loss": 0.864, + "step": 10357 + }, + { + "epoch": 0.5166084788029925, + "grad_norm": 0.41118874517413007, + "learning_rate": 2.487480850829763e-05, + "loss": 0.8652, + "step": 10358 + }, + { + "epoch": 0.5166583541147132, + "grad_norm": 0.43109745122617954, + "learning_rate": 2.487077010848956e-05, + "loss": 0.8551, + "step": 10359 + }, + { + "epoch": 0.5167082294264339, + "grad_norm": 0.4531238667703506, + "learning_rate": 2.486673171205367e-05, + "loss": 0.8694, + "step": 10360 + }, + { + "epoch": 0.5167581047381546, + "grad_norm": 0.48605254015639787, + "learning_rate": 2.4862693319095366e-05, + "loss": 0.8783, + "step": 10361 + }, + { + "epoch": 0.5168079800498753, + "grad_norm": 0.47088438309322317, + "learning_rate": 2.485865492972001e-05, + "loss": 0.9158, + "step": 10362 + }, + { + "epoch": 0.516857855361596, + "grad_norm": 0.4297633138125611, + "learning_rate": 2.4854616544033004e-05, + "loss": 0.8943, + "step": 10363 + }, + { + "epoch": 0.5169077306733167, + "grad_norm": 0.40678537115212554, + "learning_rate": 2.4850578162139703e-05, + "loss": 0.9014, + "step": 10364 + }, + { + "epoch": 0.5169576059850374, + "grad_norm": 0.4305132258349708, + "learning_rate": 2.48465397841455e-05, + "loss": 0.8405, + "step": 10365 + }, + { + "epoch": 0.5170074812967581, + "grad_norm": 0.44784085470421714, + "learning_rate": 2.4842501410155783e-05, + "loss": 0.8884, + "step": 10366 + }, + { + "epoch": 0.5170573566084788, + "grad_norm": 0.4199913896392777, + "learning_rate": 2.4838463040275907e-05, + "loss": 0.8661, + "step": 10367 + }, + { + "epoch": 0.5171072319201995, + "grad_norm": 0.45095064603538265, + "learning_rate": 2.483442467461127e-05, + "loss": 0.8741, + "step": 10368 + }, + { + "epoch": 0.5171571072319202, + "grad_norm": 0.4477306668378164, + "learning_rate": 2.483038631326724e-05, + "loss": 0.894, + "step": 10369 + }, + { + "epoch": 0.517206982543641, + "grad_norm": 0.526486748492841, + "learning_rate": 2.4826347956349222e-05, + "loss": 0.9258, + "step": 10370 + }, + { + "epoch": 0.5172568578553616, + "grad_norm": 0.6274891633158531, + "learning_rate": 2.482230960396256e-05, + "loss": 0.893, + "step": 10371 + }, + { + "epoch": 0.5173067331670823, + "grad_norm": 0.5808525739543671, + "learning_rate": 2.4818271256212654e-05, + "loss": 0.8752, + "step": 10372 + }, + { + "epoch": 0.5173566084788029, + "grad_norm": 0.41441020368712905, + "learning_rate": 2.4814232913204877e-05, + "loss": 0.8685, + "step": 10373 + }, + { + "epoch": 0.5174064837905237, + "grad_norm": 0.45293176963190435, + "learning_rate": 2.481019457504462e-05, + "loss": 0.8826, + "step": 10374 + }, + { + "epoch": 0.5174563591022444, + "grad_norm": 0.4777194862511071, + "learning_rate": 2.480615624183724e-05, + "loss": 0.9154, + "step": 10375 + }, + { + "epoch": 0.517506234413965, + "grad_norm": 0.5058893563495224, + "learning_rate": 2.4802117913688122e-05, + "loss": 0.9001, + "step": 10376 + }, + { + "epoch": 0.5175561097256858, + "grad_norm": 0.4709838637251826, + "learning_rate": 2.479807959070266e-05, + "loss": 0.9423, + "step": 10377 + }, + { + "epoch": 0.5176059850374065, + "grad_norm": 0.4608967673494986, + "learning_rate": 2.4794041272986216e-05, + "loss": 0.8954, + "step": 10378 + }, + { + "epoch": 0.5176558603491271, + "grad_norm": 0.4295808633521556, + "learning_rate": 2.479000296064417e-05, + "loss": 0.8792, + "step": 10379 + }, + { + "epoch": 0.5177057356608479, + "grad_norm": 0.4596873590712769, + "learning_rate": 2.4785964653781897e-05, + "loss": 0.8528, + "step": 10380 + }, + { + "epoch": 0.5177556109725686, + "grad_norm": 0.4515843704041121, + "learning_rate": 2.478192635250479e-05, + "loss": 0.8928, + "step": 10381 + }, + { + "epoch": 0.5178054862842892, + "grad_norm": 0.48859699297830705, + "learning_rate": 2.4777888056918214e-05, + "loss": 0.8854, + "step": 10382 + }, + { + "epoch": 0.51785536159601, + "grad_norm": 0.42883095851558295, + "learning_rate": 2.4773849767127556e-05, + "loss": 0.875, + "step": 10383 + }, + { + "epoch": 0.5179052369077307, + "grad_norm": 0.37890401091881265, + "learning_rate": 2.476981148323818e-05, + "loss": 0.8527, + "step": 10384 + }, + { + "epoch": 0.5179551122194513, + "grad_norm": 0.35427081207407596, + "learning_rate": 2.4765773205355473e-05, + "loss": 0.8143, + "step": 10385 + }, + { + "epoch": 0.5180049875311721, + "grad_norm": 0.3879607737007129, + "learning_rate": 2.4761734933584803e-05, + "loss": 0.8878, + "step": 10386 + }, + { + "epoch": 0.5180548628428928, + "grad_norm": 0.558209405315361, + "learning_rate": 2.4757696668031562e-05, + "loss": 0.8667, + "step": 10387 + }, + { + "epoch": 0.5181047381546134, + "grad_norm": 0.4896212894522814, + "learning_rate": 2.475365840880112e-05, + "loss": 0.8739, + "step": 10388 + }, + { + "epoch": 0.5181546134663342, + "grad_norm": 0.4215268741235377, + "learning_rate": 2.4749620155998843e-05, + "loss": 0.8429, + "step": 10389 + }, + { + "epoch": 0.5182044887780549, + "grad_norm": 0.5106729248590348, + "learning_rate": 2.4745581909730116e-05, + "loss": 0.872, + "step": 10390 + }, + { + "epoch": 0.5182543640897755, + "grad_norm": 0.5172760458385971, + "learning_rate": 2.474154367010032e-05, + "loss": 0.9024, + "step": 10391 + }, + { + "epoch": 0.5183042394014963, + "grad_norm": 0.4433887559335682, + "learning_rate": 2.473750543721483e-05, + "loss": 0.8923, + "step": 10392 + }, + { + "epoch": 0.5183541147132169, + "grad_norm": 0.42342065315373734, + "learning_rate": 2.4733467211179008e-05, + "loss": 0.8816, + "step": 10393 + }, + { + "epoch": 0.5184039900249376, + "grad_norm": 0.4225462435513756, + "learning_rate": 2.4729428992098243e-05, + "loss": 0.8469, + "step": 10394 + }, + { + "epoch": 0.5184538653366584, + "grad_norm": 0.3984133669050193, + "learning_rate": 2.4725390780077908e-05, + "loss": 0.8759, + "step": 10395 + }, + { + "epoch": 0.518503740648379, + "grad_norm": 0.5150145473149695, + "learning_rate": 2.472135257522339e-05, + "loss": 0.8414, + "step": 10396 + }, + { + "epoch": 0.5185536159600997, + "grad_norm": 0.38128247144754, + "learning_rate": 2.4717314377640037e-05, + "loss": 0.899, + "step": 10397 + }, + { + "epoch": 0.5186034912718205, + "grad_norm": 0.42097878843879216, + "learning_rate": 2.4713276187433236e-05, + "loss": 0.922, + "step": 10398 + }, + { + "epoch": 0.5186533665835411, + "grad_norm": 0.3496460351221082, + "learning_rate": 2.4709238004708383e-05, + "loss": 0.8465, + "step": 10399 + }, + { + "epoch": 0.5187032418952618, + "grad_norm": 0.41140496123552034, + "learning_rate": 2.470519982957082e-05, + "loss": 0.8668, + "step": 10400 + }, + { + "epoch": 0.5187531172069826, + "grad_norm": 0.3981096371806248, + "learning_rate": 2.4701161662125933e-05, + "loss": 0.8606, + "step": 10401 + }, + { + "epoch": 0.5188029925187032, + "grad_norm": 0.4489771267928492, + "learning_rate": 2.4697123502479098e-05, + "loss": 0.9341, + "step": 10402 + }, + { + "epoch": 0.518852867830424, + "grad_norm": 0.4141314151016519, + "learning_rate": 2.4693085350735708e-05, + "loss": 0.8562, + "step": 10403 + }, + { + "epoch": 0.5189027431421447, + "grad_norm": 0.5580642392233454, + "learning_rate": 2.4689047207001102e-05, + "loss": 0.8914, + "step": 10404 + }, + { + "epoch": 0.5189526184538653, + "grad_norm": 0.5022629744269008, + "learning_rate": 2.4685009071380672e-05, + "loss": 0.8601, + "step": 10405 + }, + { + "epoch": 0.519002493765586, + "grad_norm": 1.0443542709454563, + "learning_rate": 2.4680970943979786e-05, + "loss": 0.8426, + "step": 10406 + }, + { + "epoch": 0.5190523690773068, + "grad_norm": 0.4479240005577996, + "learning_rate": 2.4676932824903834e-05, + "loss": 0.9081, + "step": 10407 + }, + { + "epoch": 0.5191022443890274, + "grad_norm": 0.4194634041391564, + "learning_rate": 2.4672894714258164e-05, + "loss": 0.8884, + "step": 10408 + }, + { + "epoch": 0.5191521197007481, + "grad_norm": 0.6707035488278407, + "learning_rate": 2.466885661214816e-05, + "loss": 0.9028, + "step": 10409 + }, + { + "epoch": 0.5192019950124688, + "grad_norm": 0.9645425953932664, + "learning_rate": 2.4664818518679196e-05, + "loss": 0.8894, + "step": 10410 + }, + { + "epoch": 0.5192518703241895, + "grad_norm": 0.41703550506124204, + "learning_rate": 2.466078043395665e-05, + "loss": 0.8711, + "step": 10411 + }, + { + "epoch": 0.5193017456359103, + "grad_norm": 0.5506618494718952, + "learning_rate": 2.4656742358085878e-05, + "loss": 0.8969, + "step": 10412 + }, + { + "epoch": 0.5193516209476309, + "grad_norm": 0.4475306497909476, + "learning_rate": 2.465270429117226e-05, + "loss": 0.8415, + "step": 10413 + }, + { + "epoch": 0.5194014962593516, + "grad_norm": 0.4729148134094705, + "learning_rate": 2.4648666233321175e-05, + "loss": 0.8948, + "step": 10414 + }, + { + "epoch": 0.5194513715710724, + "grad_norm": 0.4222684551570662, + "learning_rate": 2.4644628184637984e-05, + "loss": 0.8766, + "step": 10415 + }, + { + "epoch": 0.519501246882793, + "grad_norm": 0.951084586456059, + "learning_rate": 2.464059014522806e-05, + "loss": 0.8516, + "step": 10416 + }, + { + "epoch": 0.5195511221945137, + "grad_norm": 0.4184210027070358, + "learning_rate": 2.4636552115196775e-05, + "loss": 0.8445, + "step": 10417 + }, + { + "epoch": 0.5196009975062345, + "grad_norm": 0.4412047686409412, + "learning_rate": 2.4632514094649505e-05, + "loss": 0.8828, + "step": 10418 + }, + { + "epoch": 0.5196508728179551, + "grad_norm": 0.3970590519824489, + "learning_rate": 2.4628476083691613e-05, + "loss": 0.9072, + "step": 10419 + }, + { + "epoch": 0.5197007481296758, + "grad_norm": 0.5492372331993779, + "learning_rate": 2.4624438082428467e-05, + "loss": 0.8766, + "step": 10420 + }, + { + "epoch": 0.5197506234413966, + "grad_norm": 0.4769216924966473, + "learning_rate": 2.4620400090965442e-05, + "loss": 0.9015, + "step": 10421 + }, + { + "epoch": 0.5198004987531172, + "grad_norm": 0.3756550741366036, + "learning_rate": 2.461636210940791e-05, + "loss": 0.8263, + "step": 10422 + }, + { + "epoch": 0.5198503740648379, + "grad_norm": 0.4437051695082236, + "learning_rate": 2.4612324137861233e-05, + "loss": 0.8842, + "step": 10423 + }, + { + "epoch": 0.5199002493765587, + "grad_norm": 0.41596654709318215, + "learning_rate": 2.460828617643079e-05, + "loss": 0.8752, + "step": 10424 + }, + { + "epoch": 0.5199501246882793, + "grad_norm": 0.37702077981701, + "learning_rate": 2.4604248225221946e-05, + "loss": 0.9138, + "step": 10425 + }, + { + "epoch": 0.52, + "grad_norm": 0.47536093009963526, + "learning_rate": 2.460021028434006e-05, + "loss": 0.8911, + "step": 10426 + }, + { + "epoch": 0.5200498753117206, + "grad_norm": 0.38420602741959253, + "learning_rate": 2.4596172353890508e-05, + "loss": 0.8632, + "step": 10427 + }, + { + "epoch": 0.5200997506234414, + "grad_norm": 0.4504814113062834, + "learning_rate": 2.4592134433978656e-05, + "loss": 0.9121, + "step": 10428 + }, + { + "epoch": 0.5201496259351621, + "grad_norm": 0.4321772166329751, + "learning_rate": 2.4588096524709885e-05, + "loss": 0.87, + "step": 10429 + }, + { + "epoch": 0.5201995012468827, + "grad_norm": 0.436905988645893, + "learning_rate": 2.458405862618954e-05, + "loss": 0.8953, + "step": 10430 + }, + { + "epoch": 0.5202493765586035, + "grad_norm": 0.5035838216069972, + "learning_rate": 2.4580020738523e-05, + "loss": 0.8831, + "step": 10431 + }, + { + "epoch": 0.5202992518703242, + "grad_norm": 0.41355183670207224, + "learning_rate": 2.457598286181563e-05, + "loss": 0.8497, + "step": 10432 + }, + { + "epoch": 0.5203491271820448, + "grad_norm": 0.40715921219725376, + "learning_rate": 2.4571944996172808e-05, + "loss": 0.837, + "step": 10433 + }, + { + "epoch": 0.5203990024937656, + "grad_norm": 0.44059023415335336, + "learning_rate": 2.4567907141699882e-05, + "loss": 0.8923, + "step": 10434 + }, + { + "epoch": 0.5204488778054863, + "grad_norm": 0.49802522852499204, + "learning_rate": 2.456386929850222e-05, + "loss": 0.8903, + "step": 10435 + }, + { + "epoch": 0.520498753117207, + "grad_norm": 0.7225636051702524, + "learning_rate": 2.4559831466685206e-05, + "loss": 0.9258, + "step": 10436 + }, + { + "epoch": 0.5205486284289277, + "grad_norm": 0.5531918353561891, + "learning_rate": 2.4555793646354186e-05, + "loss": 0.8708, + "step": 10437 + }, + { + "epoch": 0.5205985037406484, + "grad_norm": 1.9379090952098328, + "learning_rate": 2.4551755837614528e-05, + "loss": 0.8896, + "step": 10438 + }, + { + "epoch": 0.520648379052369, + "grad_norm": 0.520439552911919, + "learning_rate": 2.4547718040571606e-05, + "loss": 0.9026, + "step": 10439 + }, + { + "epoch": 0.5206982543640898, + "grad_norm": 0.566881750595825, + "learning_rate": 2.4543680255330783e-05, + "loss": 0.8811, + "step": 10440 + }, + { + "epoch": 0.5207481296758105, + "grad_norm": 0.5553941234458709, + "learning_rate": 2.4539642481997413e-05, + "loss": 0.8882, + "step": 10441 + }, + { + "epoch": 0.5207980049875311, + "grad_norm": 0.45339236011199685, + "learning_rate": 2.4535604720676865e-05, + "loss": 0.8835, + "step": 10442 + }, + { + "epoch": 0.5208478802992519, + "grad_norm": 0.4536640638389137, + "learning_rate": 2.4531566971474506e-05, + "loss": 0.8591, + "step": 10443 + }, + { + "epoch": 0.5208977556109726, + "grad_norm": 0.4747916741884543, + "learning_rate": 2.4527529234495706e-05, + "loss": 0.8598, + "step": 10444 + }, + { + "epoch": 0.5209476309226932, + "grad_norm": 0.42783026633058363, + "learning_rate": 2.452349150984581e-05, + "loss": 0.8454, + "step": 10445 + }, + { + "epoch": 0.520997506234414, + "grad_norm": 0.4016569583825267, + "learning_rate": 2.4519453797630187e-05, + "loss": 0.8655, + "step": 10446 + }, + { + "epoch": 0.5210473815461346, + "grad_norm": 0.5121408890707123, + "learning_rate": 2.4515416097954217e-05, + "loss": 0.836, + "step": 10447 + }, + { + "epoch": 0.5210972568578554, + "grad_norm": 0.5085413296969213, + "learning_rate": 2.4511378410923232e-05, + "loss": 0.8518, + "step": 10448 + }, + { + "epoch": 0.5211471321695761, + "grad_norm": 0.4398879058827488, + "learning_rate": 2.450734073664261e-05, + "loss": 0.8683, + "step": 10449 + }, + { + "epoch": 0.5211970074812967, + "grad_norm": 0.47174506783859893, + "learning_rate": 2.4503303075217713e-05, + "loss": 0.9099, + "step": 10450 + }, + { + "epoch": 0.5212468827930175, + "grad_norm": 0.41657941492528305, + "learning_rate": 2.449926542675391e-05, + "loss": 0.9067, + "step": 10451 + }, + { + "epoch": 0.5212967581047382, + "grad_norm": 0.40848104446784544, + "learning_rate": 2.449522779135654e-05, + "loss": 0.8727, + "step": 10452 + }, + { + "epoch": 0.5213466334164588, + "grad_norm": 0.47338046783883914, + "learning_rate": 2.4491190169130975e-05, + "loss": 0.9406, + "step": 10453 + }, + { + "epoch": 0.5213965087281796, + "grad_norm": 0.39521394568794727, + "learning_rate": 2.4487152560182574e-05, + "loss": 0.8646, + "step": 10454 + }, + { + "epoch": 0.5214463840399003, + "grad_norm": 0.4404517948220637, + "learning_rate": 2.4483114964616703e-05, + "loss": 0.8859, + "step": 10455 + }, + { + "epoch": 0.5214962593516209, + "grad_norm": 0.3953347419786104, + "learning_rate": 2.447907738253872e-05, + "loss": 0.8538, + "step": 10456 + }, + { + "epoch": 0.5215461346633417, + "grad_norm": 0.4534625752458199, + "learning_rate": 2.447503981405397e-05, + "loss": 0.8763, + "step": 10457 + }, + { + "epoch": 0.5215960099750624, + "grad_norm": 0.5349213536528149, + "learning_rate": 2.4471002259267826e-05, + "loss": 0.905, + "step": 10458 + }, + { + "epoch": 0.521645885286783, + "grad_norm": 0.5485803103826306, + "learning_rate": 2.4466964718285636e-05, + "loss": 0.8608, + "step": 10459 + }, + { + "epoch": 0.5216957605985038, + "grad_norm": 0.5125894207689551, + "learning_rate": 2.4462927191212765e-05, + "loss": 0.8551, + "step": 10460 + }, + { + "epoch": 0.5217456359102245, + "grad_norm": 0.4916350813803741, + "learning_rate": 2.445888967815457e-05, + "loss": 0.921, + "step": 10461 + }, + { + "epoch": 0.5217955112219451, + "grad_norm": 0.5174156877976771, + "learning_rate": 2.445485217921641e-05, + "loss": 0.8945, + "step": 10462 + }, + { + "epoch": 0.5218453865336659, + "grad_norm": 0.47113204057282004, + "learning_rate": 2.4450814694503636e-05, + "loss": 0.9227, + "step": 10463 + }, + { + "epoch": 0.5218952618453865, + "grad_norm": 0.38315850774531846, + "learning_rate": 2.4446777224121602e-05, + "loss": 0.8493, + "step": 10464 + }, + { + "epoch": 0.5219451371571072, + "grad_norm": 0.4276143744249457, + "learning_rate": 2.444273976817567e-05, + "loss": 0.9172, + "step": 10465 + }, + { + "epoch": 0.521995012468828, + "grad_norm": 0.4578740694701538, + "learning_rate": 2.4438702326771205e-05, + "loss": 0.927, + "step": 10466 + }, + { + "epoch": 0.5220448877805486, + "grad_norm": 0.43253681580875303, + "learning_rate": 2.443466490001354e-05, + "loss": 0.8978, + "step": 10467 + }, + { + "epoch": 0.5220947630922693, + "grad_norm": 0.404499497670237, + "learning_rate": 2.4430627488008044e-05, + "loss": 0.9078, + "step": 10468 + }, + { + "epoch": 0.5221446384039901, + "grad_norm": 0.5265207386793371, + "learning_rate": 2.442659009086007e-05, + "loss": 0.9411, + "step": 10469 + }, + { + "epoch": 0.5221945137157107, + "grad_norm": 0.4336365133825562, + "learning_rate": 2.4422552708674975e-05, + "loss": 0.8506, + "step": 10470 + }, + { + "epoch": 0.5222443890274314, + "grad_norm": 0.3768687322139951, + "learning_rate": 2.4418515341558105e-05, + "loss": 0.8794, + "step": 10471 + }, + { + "epoch": 0.5222942643391522, + "grad_norm": 0.4068262386034962, + "learning_rate": 2.441447798961481e-05, + "loss": 0.8975, + "step": 10472 + }, + { + "epoch": 0.5223441396508728, + "grad_norm": 0.40695610602801807, + "learning_rate": 2.4410440652950467e-05, + "loss": 0.8702, + "step": 10473 + }, + { + "epoch": 0.5223940149625935, + "grad_norm": 0.7472493129778377, + "learning_rate": 2.44064033316704e-05, + "loss": 0.8618, + "step": 10474 + }, + { + "epoch": 0.5224438902743143, + "grad_norm": 0.4626449993948201, + "learning_rate": 2.440236602587997e-05, + "loss": 0.8372, + "step": 10475 + }, + { + "epoch": 0.5224937655860349, + "grad_norm": 0.5653120577271352, + "learning_rate": 2.439832873568453e-05, + "loss": 0.8787, + "step": 10476 + }, + { + "epoch": 0.5225436408977556, + "grad_norm": 0.3997855875781608, + "learning_rate": 2.4394291461189446e-05, + "loss": 0.8295, + "step": 10477 + }, + { + "epoch": 0.5225935162094764, + "grad_norm": 0.44027233248368747, + "learning_rate": 2.439025420250004e-05, + "loss": 0.8901, + "step": 10478 + }, + { + "epoch": 0.522643391521197, + "grad_norm": 0.40246625179414036, + "learning_rate": 2.4386216959721683e-05, + "loss": 0.8909, + "step": 10479 + }, + { + "epoch": 0.5226932668329177, + "grad_norm": 0.6589413699889722, + "learning_rate": 2.4382179732959718e-05, + "loss": 0.8897, + "step": 10480 + }, + { + "epoch": 0.5227431421446384, + "grad_norm": 0.526975293852551, + "learning_rate": 2.4378142522319507e-05, + "loss": 0.9057, + "step": 10481 + }, + { + "epoch": 0.5227930174563591, + "grad_norm": 0.6600128971426424, + "learning_rate": 2.4374105327906377e-05, + "loss": 0.8865, + "step": 10482 + }, + { + "epoch": 0.5228428927680798, + "grad_norm": 0.6850481017808596, + "learning_rate": 2.437006814982569e-05, + "loss": 0.8911, + "step": 10483 + }, + { + "epoch": 0.5228927680798005, + "grad_norm": 0.42771031936664594, + "learning_rate": 2.4366030988182804e-05, + "loss": 0.8889, + "step": 10484 + }, + { + "epoch": 0.5229426433915212, + "grad_norm": 0.6851356237713017, + "learning_rate": 2.4361993843083042e-05, + "loss": 0.9127, + "step": 10485 + }, + { + "epoch": 0.5229925187032419, + "grad_norm": 0.4044148897169973, + "learning_rate": 2.4357956714631766e-05, + "loss": 0.9206, + "step": 10486 + }, + { + "epoch": 0.5230423940149626, + "grad_norm": 0.4012086709095589, + "learning_rate": 2.435391960293432e-05, + "loss": 0.8889, + "step": 10487 + }, + { + "epoch": 0.5230922693266833, + "grad_norm": 0.45997351975759054, + "learning_rate": 2.434988250809607e-05, + "loss": 0.8889, + "step": 10488 + }, + { + "epoch": 0.523142144638404, + "grad_norm": 0.5290507776381951, + "learning_rate": 2.434584543022233e-05, + "loss": 0.8682, + "step": 10489 + }, + { + "epoch": 0.5231920199501247, + "grad_norm": 0.4111083629816332, + "learning_rate": 2.4341808369418458e-05, + "loss": 0.8874, + "step": 10490 + }, + { + "epoch": 0.5232418952618454, + "grad_norm": 0.5595278845084981, + "learning_rate": 2.4337771325789807e-05, + "loss": 0.9057, + "step": 10491 + }, + { + "epoch": 0.5232917705735661, + "grad_norm": 0.5156934449126538, + "learning_rate": 2.4333734299441722e-05, + "loss": 0.9352, + "step": 10492 + }, + { + "epoch": 0.5233416458852868, + "grad_norm": 0.42387176520219977, + "learning_rate": 2.4329697290479538e-05, + "loss": 0.852, + "step": 10493 + }, + { + "epoch": 0.5233915211970075, + "grad_norm": 0.5203927712688144, + "learning_rate": 2.43256602990086e-05, + "loss": 0.9277, + "step": 10494 + }, + { + "epoch": 0.5234413965087282, + "grad_norm": 0.40355523587405895, + "learning_rate": 2.432162332513426e-05, + "loss": 0.8643, + "step": 10495 + }, + { + "epoch": 0.5234912718204489, + "grad_norm": 0.4493492818450957, + "learning_rate": 2.4317586368961854e-05, + "loss": 0.8281, + "step": 10496 + }, + { + "epoch": 0.5235411471321696, + "grad_norm": 0.4913461937477062, + "learning_rate": 2.431354943059672e-05, + "loss": 0.8749, + "step": 10497 + }, + { + "epoch": 0.5235910224438902, + "grad_norm": 0.531508482924516, + "learning_rate": 2.4309512510144218e-05, + "loss": 0.8791, + "step": 10498 + }, + { + "epoch": 0.523640897755611, + "grad_norm": 0.3847371734137606, + "learning_rate": 2.4305475607709674e-05, + "loss": 0.8847, + "step": 10499 + }, + { + "epoch": 0.5236907730673317, + "grad_norm": 0.43069916833703237, + "learning_rate": 2.4301438723398427e-05, + "loss": 0.9071, + "step": 10500 + }, + { + "epoch": 0.5237406483790523, + "grad_norm": 0.5156322372884123, + "learning_rate": 2.429740185731583e-05, + "loss": 0.8897, + "step": 10501 + }, + { + "epoch": 0.5237905236907731, + "grad_norm": 0.4547077991942811, + "learning_rate": 2.4293365009567218e-05, + "loss": 0.8731, + "step": 10502 + }, + { + "epoch": 0.5238403990024938, + "grad_norm": 0.44288048223234755, + "learning_rate": 2.4289328180257926e-05, + "loss": 0.9387, + "step": 10503 + }, + { + "epoch": 0.5238902743142144, + "grad_norm": 0.443897321778398, + "learning_rate": 2.42852913694933e-05, + "loss": 0.885, + "step": 10504 + }, + { + "epoch": 0.5239401496259352, + "grad_norm": 0.371892258801879, + "learning_rate": 2.4281254577378672e-05, + "loss": 0.887, + "step": 10505 + }, + { + "epoch": 0.5239900249376559, + "grad_norm": 0.41452991845844145, + "learning_rate": 2.4277217804019393e-05, + "loss": 0.8926, + "step": 10506 + }, + { + "epoch": 0.5240399002493765, + "grad_norm": 0.45802132564501374, + "learning_rate": 2.4273181049520783e-05, + "loss": 0.8574, + "step": 10507 + }, + { + "epoch": 0.5240897755610973, + "grad_norm": 0.3908212605617012, + "learning_rate": 2.4269144313988186e-05, + "loss": 0.9003, + "step": 10508 + }, + { + "epoch": 0.524139650872818, + "grad_norm": 0.4329972525124501, + "learning_rate": 2.4265107597526946e-05, + "loss": 0.9061, + "step": 10509 + }, + { + "epoch": 0.5241895261845386, + "grad_norm": 0.4878760147913357, + "learning_rate": 2.42610709002424e-05, + "loss": 0.8608, + "step": 10510 + }, + { + "epoch": 0.5242394014962594, + "grad_norm": 0.4022544304416617, + "learning_rate": 2.425703422223987e-05, + "loss": 0.903, + "step": 10511 + }, + { + "epoch": 0.5242892768079801, + "grad_norm": 0.42912723672779773, + "learning_rate": 2.42529975636247e-05, + "loss": 0.9343, + "step": 10512 + }, + { + "epoch": 0.5243391521197007, + "grad_norm": 0.5780017932075533, + "learning_rate": 2.4248960924502223e-05, + "loss": 0.8672, + "step": 10513 + }, + { + "epoch": 0.5243890274314215, + "grad_norm": 0.39351530066320595, + "learning_rate": 2.4244924304977785e-05, + "loss": 0.8676, + "step": 10514 + }, + { + "epoch": 0.5244389027431422, + "grad_norm": 0.49969719032852844, + "learning_rate": 2.42408877051567e-05, + "loss": 0.8785, + "step": 10515 + }, + { + "epoch": 0.5244887780548628, + "grad_norm": 0.9421988447609304, + "learning_rate": 2.4236851125144308e-05, + "loss": 0.8859, + "step": 10516 + }, + { + "epoch": 0.5245386533665836, + "grad_norm": 0.36873957246694156, + "learning_rate": 2.423281456504596e-05, + "loss": 0.9104, + "step": 10517 + }, + { + "epoch": 0.5245885286783042, + "grad_norm": 0.5493263655768434, + "learning_rate": 2.4228778024966954e-05, + "loss": 0.9153, + "step": 10518 + }, + { + "epoch": 0.5246384039900249, + "grad_norm": 0.46935829218354874, + "learning_rate": 2.4224741505012645e-05, + "loss": 0.9422, + "step": 10519 + }, + { + "epoch": 0.5246882793017457, + "grad_norm": 0.4233664732065339, + "learning_rate": 2.422070500528836e-05, + "loss": 0.8597, + "step": 10520 + }, + { + "epoch": 0.5247381546134663, + "grad_norm": 0.40614650589095835, + "learning_rate": 2.421666852589944e-05, + "loss": 0.8716, + "step": 10521 + }, + { + "epoch": 0.524788029925187, + "grad_norm": 0.6877741671813034, + "learning_rate": 2.4212632066951195e-05, + "loss": 0.8894, + "step": 10522 + }, + { + "epoch": 0.5248379052369078, + "grad_norm": 0.5074866610232834, + "learning_rate": 2.420859562854896e-05, + "loss": 0.8617, + "step": 10523 + }, + { + "epoch": 0.5248877805486284, + "grad_norm": 0.423828359766476, + "learning_rate": 2.420455921079807e-05, + "loss": 0.8939, + "step": 10524 + }, + { + "epoch": 0.5249376558603491, + "grad_norm": 0.4445412610892447, + "learning_rate": 2.420052281380386e-05, + "loss": 0.8773, + "step": 10525 + }, + { + "epoch": 0.5249875311720699, + "grad_norm": 0.43471523456658934, + "learning_rate": 2.419648643767164e-05, + "loss": 0.8666, + "step": 10526 + }, + { + "epoch": 0.5250374064837905, + "grad_norm": 0.41056617875518636, + "learning_rate": 2.4192450082506748e-05, + "loss": 0.8519, + "step": 10527 + }, + { + "epoch": 0.5250872817955112, + "grad_norm": 0.6212626645078957, + "learning_rate": 2.418841374841451e-05, + "loss": 0.8411, + "step": 10528 + }, + { + "epoch": 0.525137157107232, + "grad_norm": 0.5206543056516295, + "learning_rate": 2.4184377435500254e-05, + "loss": 0.8701, + "step": 10529 + }, + { + "epoch": 0.5251870324189526, + "grad_norm": 0.4198612635480408, + "learning_rate": 2.4180341143869307e-05, + "loss": 0.911, + "step": 10530 + }, + { + "epoch": 0.5252369077306733, + "grad_norm": 0.8081330347220174, + "learning_rate": 2.4176304873626985e-05, + "loss": 0.8753, + "step": 10531 + }, + { + "epoch": 0.5252867830423941, + "grad_norm": 0.42106135888959334, + "learning_rate": 2.4172268624878623e-05, + "loss": 0.9267, + "step": 10532 + }, + { + "epoch": 0.5253366583541147, + "grad_norm": 0.4246638355001794, + "learning_rate": 2.4168232397729536e-05, + "loss": 0.8576, + "step": 10533 + }, + { + "epoch": 0.5253865336658354, + "grad_norm": 0.47963278570965956, + "learning_rate": 2.416419619228506e-05, + "loss": 0.8562, + "step": 10534 + }, + { + "epoch": 0.5254364089775561, + "grad_norm": 0.4111541108472058, + "learning_rate": 2.41601600086505e-05, + "loss": 0.925, + "step": 10535 + }, + { + "epoch": 0.5254862842892768, + "grad_norm": 0.46257759891423644, + "learning_rate": 2.41561238469312e-05, + "loss": 0.8939, + "step": 10536 + }, + { + "epoch": 0.5255361596009975, + "grad_norm": 0.41617546888593354, + "learning_rate": 2.415208770723246e-05, + "loss": 0.9053, + "step": 10537 + }, + { + "epoch": 0.5255860349127182, + "grad_norm": 0.592483641945592, + "learning_rate": 2.414805158965961e-05, + "loss": 0.8646, + "step": 10538 + }, + { + "epoch": 0.5256359102244389, + "grad_norm": 0.3800344099126492, + "learning_rate": 2.4144015494317985e-05, + "loss": 0.8576, + "step": 10539 + }, + { + "epoch": 0.5256857855361596, + "grad_norm": 0.43065389729735654, + "learning_rate": 2.413997942131289e-05, + "loss": 0.9026, + "step": 10540 + }, + { + "epoch": 0.5257356608478803, + "grad_norm": 0.4097502648498517, + "learning_rate": 2.413594337074964e-05, + "loss": 0.8614, + "step": 10541 + }, + { + "epoch": 0.525785536159601, + "grad_norm": 0.4306289382098348, + "learning_rate": 2.4131907342733563e-05, + "loss": 0.9127, + "step": 10542 + }, + { + "epoch": 0.5258354114713217, + "grad_norm": 0.4744489294134988, + "learning_rate": 2.4127871337369984e-05, + "loss": 0.878, + "step": 10543 + }, + { + "epoch": 0.5258852867830424, + "grad_norm": 0.6061479079812364, + "learning_rate": 2.4123835354764202e-05, + "loss": 0.9028, + "step": 10544 + }, + { + "epoch": 0.5259351620947631, + "grad_norm": 0.44030889631469766, + "learning_rate": 2.4119799395021543e-05, + "loss": 0.8409, + "step": 10545 + }, + { + "epoch": 0.5259850374064838, + "grad_norm": 0.5295980149035739, + "learning_rate": 2.4115763458247327e-05, + "loss": 0.8978, + "step": 10546 + }, + { + "epoch": 0.5260349127182045, + "grad_norm": 0.47753211589319383, + "learning_rate": 2.411172754454688e-05, + "loss": 0.8937, + "step": 10547 + }, + { + "epoch": 0.5260847880299252, + "grad_norm": 0.4902958012954852, + "learning_rate": 2.410769165402549e-05, + "loss": 0.8748, + "step": 10548 + }, + { + "epoch": 0.5261346633416459, + "grad_norm": 0.5035308339468725, + "learning_rate": 2.4103655786788492e-05, + "loss": 0.828, + "step": 10549 + }, + { + "epoch": 0.5261845386533666, + "grad_norm": 0.47802962102382457, + "learning_rate": 2.4099619942941194e-05, + "loss": 0.892, + "step": 10550 + }, + { + "epoch": 0.5262344139650873, + "grad_norm": 0.47445097026871347, + "learning_rate": 2.409558412258892e-05, + "loss": 0.8634, + "step": 10551 + }, + { + "epoch": 0.5262842892768079, + "grad_norm": 0.4681665702464236, + "learning_rate": 2.409154832583696e-05, + "loss": 0.8833, + "step": 10552 + }, + { + "epoch": 0.5263341645885287, + "grad_norm": 0.8308594533234034, + "learning_rate": 2.4087512552790644e-05, + "loss": 0.9039, + "step": 10553 + }, + { + "epoch": 0.5263840399002494, + "grad_norm": 0.5900732424539977, + "learning_rate": 2.4083476803555287e-05, + "loss": 0.8474, + "step": 10554 + }, + { + "epoch": 0.52643391521197, + "grad_norm": 0.3986402614067613, + "learning_rate": 2.4079441078236185e-05, + "loss": 0.8724, + "step": 10555 + }, + { + "epoch": 0.5264837905236908, + "grad_norm": 0.38909622653821224, + "learning_rate": 2.4075405376938657e-05, + "loss": 0.8608, + "step": 10556 + }, + { + "epoch": 0.5265336658354115, + "grad_norm": 0.4813509530661809, + "learning_rate": 2.407136969976801e-05, + "loss": 0.8915, + "step": 10557 + }, + { + "epoch": 0.5265835411471321, + "grad_norm": 0.7582243535915615, + "learning_rate": 2.4067334046829564e-05, + "loss": 0.8839, + "step": 10558 + }, + { + "epoch": 0.5266334164588529, + "grad_norm": 0.4986980449127101, + "learning_rate": 2.406329841822861e-05, + "loss": 0.8419, + "step": 10559 + }, + { + "epoch": 0.5266832917705736, + "grad_norm": 0.45487744128848806, + "learning_rate": 2.405926281407046e-05, + "loss": 0.9062, + "step": 10560 + }, + { + "epoch": 0.5267331670822942, + "grad_norm": 0.4901478117466664, + "learning_rate": 2.4055227234460434e-05, + "loss": 0.9119, + "step": 10561 + }, + { + "epoch": 0.526783042394015, + "grad_norm": 0.5195839717734966, + "learning_rate": 2.4051191679503837e-05, + "loss": 0.8774, + "step": 10562 + }, + { + "epoch": 0.5268329177057357, + "grad_norm": 0.4452229202585445, + "learning_rate": 2.4047156149305958e-05, + "loss": 0.8951, + "step": 10563 + }, + { + "epoch": 0.5268827930174563, + "grad_norm": 1.150998012658575, + "learning_rate": 2.4043120643972114e-05, + "loss": 0.8571, + "step": 10564 + }, + { + "epoch": 0.5269326683291771, + "grad_norm": 0.39767612287623155, + "learning_rate": 2.403908516360761e-05, + "loss": 0.8711, + "step": 10565 + }, + { + "epoch": 0.5269825436408978, + "grad_norm": 0.46456369458374397, + "learning_rate": 2.4035049708317753e-05, + "loss": 0.8746, + "step": 10566 + }, + { + "epoch": 0.5270324189526184, + "grad_norm": 0.4577274964673395, + "learning_rate": 2.4031014278207835e-05, + "loss": 0.8998, + "step": 10567 + }, + { + "epoch": 0.5270822942643392, + "grad_norm": 0.558889855146567, + "learning_rate": 2.4026978873383165e-05, + "loss": 0.8589, + "step": 10568 + }, + { + "epoch": 0.5271321695760599, + "grad_norm": 0.46376646193384446, + "learning_rate": 2.4022943493949053e-05, + "loss": 0.879, + "step": 10569 + }, + { + "epoch": 0.5271820448877805, + "grad_norm": 0.44610186603414714, + "learning_rate": 2.401890814001079e-05, + "loss": 0.8925, + "step": 10570 + }, + { + "epoch": 0.5272319201995013, + "grad_norm": 0.45385795844678456, + "learning_rate": 2.401487281167368e-05, + "loss": 0.9067, + "step": 10571 + }, + { + "epoch": 0.5272817955112219, + "grad_norm": 0.4456417521429601, + "learning_rate": 2.401083750904302e-05, + "loss": 0.8326, + "step": 10572 + }, + { + "epoch": 0.5273316708229426, + "grad_norm": 0.4358156019688218, + "learning_rate": 2.4006802232224115e-05, + "loss": 0.9256, + "step": 10573 + }, + { + "epoch": 0.5273815461346634, + "grad_norm": 0.43802962570925347, + "learning_rate": 2.4002766981322263e-05, + "loss": 0.8919, + "step": 10574 + }, + { + "epoch": 0.527431421446384, + "grad_norm": 0.5039907076698318, + "learning_rate": 2.3998731756442752e-05, + "loss": 0.9149, + "step": 10575 + }, + { + "epoch": 0.5274812967581047, + "grad_norm": 0.42144147880331023, + "learning_rate": 2.39946965576909e-05, + "loss": 0.8519, + "step": 10576 + }, + { + "epoch": 0.5275311720698255, + "grad_norm": 0.45061712572344875, + "learning_rate": 2.3990661385171987e-05, + "loss": 0.919, + "step": 10577 + }, + { + "epoch": 0.5275810473815461, + "grad_norm": 0.7261803343263704, + "learning_rate": 2.3986626238991308e-05, + "loss": 0.8728, + "step": 10578 + }, + { + "epoch": 0.5276309226932668, + "grad_norm": 0.377796316623479, + "learning_rate": 2.3982591119254167e-05, + "loss": 0.8824, + "step": 10579 + }, + { + "epoch": 0.5276807980049876, + "grad_norm": 0.42364530092747255, + "learning_rate": 2.3978556026065864e-05, + "loss": 0.8942, + "step": 10580 + }, + { + "epoch": 0.5277306733167082, + "grad_norm": 0.40275515300803016, + "learning_rate": 2.3974520959531672e-05, + "loss": 0.8726, + "step": 10581 + }, + { + "epoch": 0.5277805486284289, + "grad_norm": 0.45561804587980986, + "learning_rate": 2.39704859197569e-05, + "loss": 0.8231, + "step": 10582 + }, + { + "epoch": 0.5278304239401497, + "grad_norm": 0.45236884690399015, + "learning_rate": 2.3966450906846838e-05, + "loss": 0.9154, + "step": 10583 + }, + { + "epoch": 0.5278802992518703, + "grad_norm": 0.539616130671061, + "learning_rate": 2.3962415920906784e-05, + "loss": 0.8673, + "step": 10584 + }, + { + "epoch": 0.527930174563591, + "grad_norm": 1.7346937541753147, + "learning_rate": 2.3958380962042013e-05, + "loss": 0.8953, + "step": 10585 + }, + { + "epoch": 0.5279800498753118, + "grad_norm": 0.4500700069984541, + "learning_rate": 2.3954346030357825e-05, + "loss": 0.881, + "step": 10586 + }, + { + "epoch": 0.5280299251870324, + "grad_norm": 1.0437008987082583, + "learning_rate": 2.395031112595951e-05, + "loss": 0.9353, + "step": 10587 + }, + { + "epoch": 0.5280798004987531, + "grad_norm": 0.3792204363998203, + "learning_rate": 2.3946276248952364e-05, + "loss": 0.8505, + "step": 10588 + }, + { + "epoch": 0.5281296758104738, + "grad_norm": 0.6609698820503267, + "learning_rate": 2.394224139944166e-05, + "loss": 0.8544, + "step": 10589 + }, + { + "epoch": 0.5281795511221945, + "grad_norm": 0.5728163079550727, + "learning_rate": 2.393820657753269e-05, + "loss": 0.8864, + "step": 10590 + }, + { + "epoch": 0.5282294264339152, + "grad_norm": 2.1336918213575435, + "learning_rate": 2.393417178333076e-05, + "loss": 0.7907, + "step": 10591 + }, + { + "epoch": 0.5282793017456359, + "grad_norm": 0.9915590347254085, + "learning_rate": 2.3930137016941128e-05, + "loss": 0.9113, + "step": 10592 + }, + { + "epoch": 0.5283291770573566, + "grad_norm": 0.9790023282304544, + "learning_rate": 2.3926102278469088e-05, + "loss": 0.8692, + "step": 10593 + }, + { + "epoch": 0.5283790523690773, + "grad_norm": 0.5369986335650487, + "learning_rate": 2.3922067568019934e-05, + "loss": 0.8755, + "step": 10594 + }, + { + "epoch": 0.528428927680798, + "grad_norm": 0.49268182575758057, + "learning_rate": 2.3918032885698952e-05, + "loss": 0.8884, + "step": 10595 + }, + { + "epoch": 0.5284788029925187, + "grad_norm": 0.4482884225352244, + "learning_rate": 2.3913998231611408e-05, + "loss": 0.9303, + "step": 10596 + }, + { + "epoch": 0.5285286783042394, + "grad_norm": 0.4090288849400199, + "learning_rate": 2.390996360586259e-05, + "loss": 0.8681, + "step": 10597 + }, + { + "epoch": 0.5285785536159601, + "grad_norm": 0.4055268211450004, + "learning_rate": 2.390592900855779e-05, + "loss": 0.9271, + "step": 10598 + }, + { + "epoch": 0.5286284289276808, + "grad_norm": 0.46513634273774873, + "learning_rate": 2.390189443980229e-05, + "loss": 0.8696, + "step": 10599 + }, + { + "epoch": 0.5286783042394015, + "grad_norm": 0.4272301390820584, + "learning_rate": 2.3897859899701352e-05, + "loss": 0.8978, + "step": 10600 + }, + { + "epoch": 0.5287281795511222, + "grad_norm": 0.4530650963250826, + "learning_rate": 2.389382538836027e-05, + "loss": 0.892, + "step": 10601 + }, + { + "epoch": 0.5287780548628429, + "grad_norm": 0.5323192810033468, + "learning_rate": 2.388979090588432e-05, + "loss": 0.8558, + "step": 10602 + }, + { + "epoch": 0.5288279301745636, + "grad_norm": 0.6026700859729275, + "learning_rate": 2.3885756452378783e-05, + "loss": 0.9164, + "step": 10603 + }, + { + "epoch": 0.5288778054862843, + "grad_norm": 0.4699617526999256, + "learning_rate": 2.388172202794893e-05, + "loss": 0.8962, + "step": 10604 + }, + { + "epoch": 0.528927680798005, + "grad_norm": 0.4253322326980111, + "learning_rate": 2.3877687632700033e-05, + "loss": 0.8637, + "step": 10605 + }, + { + "epoch": 0.5289775561097256, + "grad_norm": 0.44575494745196464, + "learning_rate": 2.3873653266737383e-05, + "loss": 0.85, + "step": 10606 + }, + { + "epoch": 0.5290274314214464, + "grad_norm": 0.830859690515486, + "learning_rate": 2.3869618930166245e-05, + "loss": 0.8632, + "step": 10607 + }, + { + "epoch": 0.5290773067331671, + "grad_norm": 0.5902005384885558, + "learning_rate": 2.3865584623091897e-05, + "loss": 0.8689, + "step": 10608 + }, + { + "epoch": 0.5291271820448877, + "grad_norm": 0.4360747285500349, + "learning_rate": 2.3861550345619606e-05, + "loss": 0.8259, + "step": 10609 + }, + { + "epoch": 0.5291770573566085, + "grad_norm": 0.4636960134891968, + "learning_rate": 2.3857516097854654e-05, + "loss": 0.8351, + "step": 10610 + }, + { + "epoch": 0.5292269326683292, + "grad_norm": 0.4482393448719917, + "learning_rate": 2.3853481879902304e-05, + "loss": 0.9033, + "step": 10611 + }, + { + "epoch": 0.5292768079800498, + "grad_norm": 0.42254397237470087, + "learning_rate": 2.3849447691867832e-05, + "loss": 0.8961, + "step": 10612 + }, + { + "epoch": 0.5293266832917706, + "grad_norm": 0.4799300146234639, + "learning_rate": 2.3845413533856517e-05, + "loss": 0.8579, + "step": 10613 + }, + { + "epoch": 0.5293765586034913, + "grad_norm": 0.4142291069991478, + "learning_rate": 2.3841379405973606e-05, + "loss": 0.9335, + "step": 10614 + }, + { + "epoch": 0.5294264339152119, + "grad_norm": 0.43763148053241224, + "learning_rate": 2.3837345308324387e-05, + "loss": 0.8816, + "step": 10615 + }, + { + "epoch": 0.5294763092269327, + "grad_norm": 0.5187288878752928, + "learning_rate": 2.3833311241014116e-05, + "loss": 0.8897, + "step": 10616 + }, + { + "epoch": 0.5295261845386534, + "grad_norm": 0.5713726757344143, + "learning_rate": 2.382927720414808e-05, + "loss": 0.9178, + "step": 10617 + }, + { + "epoch": 0.529576059850374, + "grad_norm": 0.5382788404123036, + "learning_rate": 2.382524319783152e-05, + "loss": 0.9074, + "step": 10618 + }, + { + "epoch": 0.5296259351620948, + "grad_norm": 0.4274339523824597, + "learning_rate": 2.382120922216971e-05, + "loss": 0.884, + "step": 10619 + }, + { + "epoch": 0.5296758104738155, + "grad_norm": 0.44940961721706113, + "learning_rate": 2.381717527726792e-05, + "loss": 0.8911, + "step": 10620 + }, + { + "epoch": 0.5297256857855361, + "grad_norm": 0.6632772910037897, + "learning_rate": 2.3813141363231424e-05, + "loss": 0.8507, + "step": 10621 + }, + { + "epoch": 0.5297755610972569, + "grad_norm": 0.7647981993204301, + "learning_rate": 2.380910748016546e-05, + "loss": 0.8886, + "step": 10622 + }, + { + "epoch": 0.5298254364089775, + "grad_norm": 0.4217519224109521, + "learning_rate": 2.3805073628175307e-05, + "loss": 0.8717, + "step": 10623 + }, + { + "epoch": 0.5298753117206982, + "grad_norm": 0.47123802697718864, + "learning_rate": 2.380103980736623e-05, + "loss": 0.9163, + "step": 10624 + }, + { + "epoch": 0.529925187032419, + "grad_norm": 0.3980932510377731, + "learning_rate": 2.3797006017843473e-05, + "loss": 0.8843, + "step": 10625 + }, + { + "epoch": 0.5299750623441396, + "grad_norm": 0.42525446436938713, + "learning_rate": 2.3792972259712303e-05, + "loss": 0.9002, + "step": 10626 + }, + { + "epoch": 0.5300249376558603, + "grad_norm": 0.4700711027337865, + "learning_rate": 2.3788938533077986e-05, + "loss": 0.8816, + "step": 10627 + }, + { + "epoch": 0.5300748129675811, + "grad_norm": 0.41724431985642674, + "learning_rate": 2.3784904838045786e-05, + "loss": 0.894, + "step": 10628 + }, + { + "epoch": 0.5301246882793017, + "grad_norm": 0.4152242954021348, + "learning_rate": 2.3780871174720937e-05, + "loss": 0.8645, + "step": 10629 + }, + { + "epoch": 0.5301745635910224, + "grad_norm": 0.7467796594296059, + "learning_rate": 2.377683754320871e-05, + "loss": 0.8368, + "step": 10630 + }, + { + "epoch": 0.5302244389027432, + "grad_norm": 0.6801523284059116, + "learning_rate": 2.377280394361436e-05, + "loss": 0.9102, + "step": 10631 + }, + { + "epoch": 0.5302743142144638, + "grad_norm": 0.47237476035067866, + "learning_rate": 2.3768770376043157e-05, + "loss": 0.8776, + "step": 10632 + }, + { + "epoch": 0.5303241895261845, + "grad_norm": 0.383791345700311, + "learning_rate": 2.3764736840600323e-05, + "loss": 0.8669, + "step": 10633 + }, + { + "epoch": 0.5303740648379053, + "grad_norm": 0.47497272396190277, + "learning_rate": 2.3760703337391132e-05, + "loss": 0.8876, + "step": 10634 + }, + { + "epoch": 0.5304239401496259, + "grad_norm": 0.5438791079235364, + "learning_rate": 2.3756669866520832e-05, + "loss": 0.8685, + "step": 10635 + }, + { + "epoch": 0.5304738154613466, + "grad_norm": 0.5320375347453848, + "learning_rate": 2.3752636428094685e-05, + "loss": 0.9216, + "step": 10636 + }, + { + "epoch": 0.5305236907730674, + "grad_norm": 0.4628047737560555, + "learning_rate": 2.374860302221792e-05, + "loss": 0.9249, + "step": 10637 + }, + { + "epoch": 0.530573566084788, + "grad_norm": 0.44746888614110725, + "learning_rate": 2.3744569648995802e-05, + "loss": 0.8755, + "step": 10638 + }, + { + "epoch": 0.5306234413965087, + "grad_norm": 0.5014552053795815, + "learning_rate": 2.374053630853358e-05, + "loss": 0.8961, + "step": 10639 + }, + { + "epoch": 0.5306733167082295, + "grad_norm": 0.4953780631794165, + "learning_rate": 2.3736503000936504e-05, + "loss": 0.8829, + "step": 10640 + }, + { + "epoch": 0.5307231920199501, + "grad_norm": 0.4246043120392023, + "learning_rate": 2.373246972630981e-05, + "loss": 0.8712, + "step": 10641 + }, + { + "epoch": 0.5307730673316708, + "grad_norm": 0.38961096904559117, + "learning_rate": 2.3728436484758748e-05, + "loss": 0.8688, + "step": 10642 + }, + { + "epoch": 0.5308229426433915, + "grad_norm": 0.7476746358588772, + "learning_rate": 2.3724403276388573e-05, + "loss": 0.8772, + "step": 10643 + }, + { + "epoch": 0.5308728179551122, + "grad_norm": 1.1095009547842334, + "learning_rate": 2.372037010130452e-05, + "loss": 0.8639, + "step": 10644 + }, + { + "epoch": 0.530922693266833, + "grad_norm": 0.4473057188585736, + "learning_rate": 2.371633695961184e-05, + "loss": 0.8889, + "step": 10645 + }, + { + "epoch": 0.5309725685785536, + "grad_norm": 0.48006553642788496, + "learning_rate": 2.3712303851415768e-05, + "loss": 0.9211, + "step": 10646 + }, + { + "epoch": 0.5310224438902743, + "grad_norm": 0.4595422336863327, + "learning_rate": 2.3708270776821555e-05, + "loss": 0.8554, + "step": 10647 + }, + { + "epoch": 0.531072319201995, + "grad_norm": 0.5587307539545663, + "learning_rate": 2.3704237735934436e-05, + "loss": 0.9049, + "step": 10648 + }, + { + "epoch": 0.5311221945137157, + "grad_norm": 0.44659843413488726, + "learning_rate": 2.3700204728859655e-05, + "loss": 0.8772, + "step": 10649 + }, + { + "epoch": 0.5311720698254364, + "grad_norm": 0.43141754281010775, + "learning_rate": 2.3696171755702452e-05, + "loss": 0.9176, + "step": 10650 + }, + { + "epoch": 0.5312219451371571, + "grad_norm": 0.46961009616789584, + "learning_rate": 2.369213881656806e-05, + "loss": 0.8746, + "step": 10651 + }, + { + "epoch": 0.5312718204488778, + "grad_norm": 0.541351431913527, + "learning_rate": 2.3688105911561718e-05, + "loss": 0.8875, + "step": 10652 + }, + { + "epoch": 0.5313216957605985, + "grad_norm": 0.36675376336420357, + "learning_rate": 2.3684073040788663e-05, + "loss": 0.8687, + "step": 10653 + }, + { + "epoch": 0.5313715710723192, + "grad_norm": 0.3803280998670638, + "learning_rate": 2.368004020435415e-05, + "loss": 0.8674, + "step": 10654 + }, + { + "epoch": 0.5314214463840399, + "grad_norm": 0.42397723644078505, + "learning_rate": 2.367600740236338e-05, + "loss": 0.8798, + "step": 10655 + }, + { + "epoch": 0.5314713216957606, + "grad_norm": 0.43507147730147716, + "learning_rate": 2.3671974634921608e-05, + "loss": 0.8542, + "step": 10656 + }, + { + "epoch": 0.5315211970074813, + "grad_norm": 0.471664094863382, + "learning_rate": 2.366794190213406e-05, + "loss": 0.8277, + "step": 10657 + }, + { + "epoch": 0.531571072319202, + "grad_norm": 0.41891178193064327, + "learning_rate": 2.3663909204105982e-05, + "loss": 0.8954, + "step": 10658 + }, + { + "epoch": 0.5316209476309227, + "grad_norm": 0.4612225170379962, + "learning_rate": 2.3659876540942587e-05, + "loss": 0.8666, + "step": 10659 + }, + { + "epoch": 0.5316708229426433, + "grad_norm": 0.34744650486007655, + "learning_rate": 2.3655843912749117e-05, + "loss": 0.8434, + "step": 10660 + }, + { + "epoch": 0.5317206982543641, + "grad_norm": 0.5004874705946385, + "learning_rate": 2.3651811319630806e-05, + "loss": 0.8959, + "step": 10661 + }, + { + "epoch": 0.5317705735660848, + "grad_norm": 0.4546058309293754, + "learning_rate": 2.3647778761692864e-05, + "loss": 0.9113, + "step": 10662 + }, + { + "epoch": 0.5318204488778054, + "grad_norm": 0.559128677163048, + "learning_rate": 2.3643746239040532e-05, + "loss": 0.882, + "step": 10663 + }, + { + "epoch": 0.5318703241895262, + "grad_norm": 0.4629911293636927, + "learning_rate": 2.3639713751779033e-05, + "loss": 0.9252, + "step": 10664 + }, + { + "epoch": 0.5319201995012469, + "grad_norm": 0.5591799912595659, + "learning_rate": 2.3635681300013608e-05, + "loss": 0.8856, + "step": 10665 + }, + { + "epoch": 0.5319700748129675, + "grad_norm": 0.4203403860342538, + "learning_rate": 2.3631648883849455e-05, + "loss": 0.8918, + "step": 10666 + }, + { + "epoch": 0.5320199501246883, + "grad_norm": 0.4602078742133573, + "learning_rate": 2.3627616503391814e-05, + "loss": 0.8606, + "step": 10667 + }, + { + "epoch": 0.532069825436409, + "grad_norm": 0.5126939234676577, + "learning_rate": 2.3623584158745905e-05, + "loss": 0.8698, + "step": 10668 + }, + { + "epoch": 0.5321197007481296, + "grad_norm": 0.482313684488145, + "learning_rate": 2.3619551850016966e-05, + "loss": 0.88, + "step": 10669 + }, + { + "epoch": 0.5321695760598504, + "grad_norm": 0.4091303290380724, + "learning_rate": 2.361551957731019e-05, + "loss": 0.8897, + "step": 10670 + }, + { + "epoch": 0.5322194513715711, + "grad_norm": 0.4768333823035263, + "learning_rate": 2.3611487340730808e-05, + "loss": 0.8889, + "step": 10671 + }, + { + "epoch": 0.5322693266832917, + "grad_norm": 0.5473828936139447, + "learning_rate": 2.3607455140384057e-05, + "loss": 0.8808, + "step": 10672 + }, + { + "epoch": 0.5323192019950125, + "grad_norm": 0.4493414584061236, + "learning_rate": 2.360342297637513e-05, + "loss": 0.9176, + "step": 10673 + }, + { + "epoch": 0.5323690773067332, + "grad_norm": 0.43454429852294363, + "learning_rate": 2.3599390848809256e-05, + "loss": 0.898, + "step": 10674 + }, + { + "epoch": 0.5324189526184538, + "grad_norm": 0.82463263669512, + "learning_rate": 2.359535875779165e-05, + "loss": 0.9026, + "step": 10675 + }, + { + "epoch": 0.5324688279301746, + "grad_norm": 0.45359218493068726, + "learning_rate": 2.3591326703427537e-05, + "loss": 0.8667, + "step": 10676 + }, + { + "epoch": 0.5325187032418952, + "grad_norm": 0.7459962643849346, + "learning_rate": 2.3587294685822114e-05, + "loss": 0.8828, + "step": 10677 + }, + { + "epoch": 0.532568578553616, + "grad_norm": 0.6356705453349178, + "learning_rate": 2.3583262705080604e-05, + "loss": 0.8705, + "step": 10678 + }, + { + "epoch": 0.5326184538653367, + "grad_norm": 0.4172651080267397, + "learning_rate": 2.357923076130822e-05, + "loss": 0.9618, + "step": 10679 + }, + { + "epoch": 0.5326683291770573, + "grad_norm": 0.7404675264646393, + "learning_rate": 2.357519885461018e-05, + "loss": 0.8636, + "step": 10680 + }, + { + "epoch": 0.532718204488778, + "grad_norm": 0.40684697229796846, + "learning_rate": 2.3571166985091687e-05, + "loss": 0.8985, + "step": 10681 + }, + { + "epoch": 0.5327680798004988, + "grad_norm": 0.5590204794724059, + "learning_rate": 2.3567135152857947e-05, + "loss": 0.9009, + "step": 10682 + }, + { + "epoch": 0.5328179551122194, + "grad_norm": 0.43289241625938657, + "learning_rate": 2.356310335801418e-05, + "loss": 0.8343, + "step": 10683 + }, + { + "epoch": 0.5328678304239401, + "grad_norm": 0.37310121931346313, + "learning_rate": 2.355907160066558e-05, + "loss": 0.8736, + "step": 10684 + }, + { + "epoch": 0.5329177057356609, + "grad_norm": 0.4245371500656292, + "learning_rate": 2.3555039880917363e-05, + "loss": 0.8965, + "step": 10685 + }, + { + "epoch": 0.5329675810473815, + "grad_norm": 0.4628026766528258, + "learning_rate": 2.3551008198874737e-05, + "loss": 0.9255, + "step": 10686 + }, + { + "epoch": 0.5330174563591022, + "grad_norm": 0.42888963397060403, + "learning_rate": 2.3546976554642908e-05, + "loss": 0.8723, + "step": 10687 + }, + { + "epoch": 0.533067331670823, + "grad_norm": 0.4648078566350325, + "learning_rate": 2.3542944948327068e-05, + "loss": 0.9044, + "step": 10688 + }, + { + "epoch": 0.5331172069825436, + "grad_norm": 0.5588928527502973, + "learning_rate": 2.3538913380032426e-05, + "loss": 0.8725, + "step": 10689 + }, + { + "epoch": 0.5331670822942643, + "grad_norm": 0.41033316719091056, + "learning_rate": 2.3534881849864187e-05, + "loss": 0.8863, + "step": 10690 + }, + { + "epoch": 0.5332169576059851, + "grad_norm": 0.4379822723100458, + "learning_rate": 2.353085035792756e-05, + "loss": 0.8889, + "step": 10691 + }, + { + "epoch": 0.5332668329177057, + "grad_norm": 0.43006763095163586, + "learning_rate": 2.352681890432773e-05, + "loss": 0.8982, + "step": 10692 + }, + { + "epoch": 0.5333167082294265, + "grad_norm": 0.5040589676083026, + "learning_rate": 2.3522787489169895e-05, + "loss": 0.8653, + "step": 10693 + }, + { + "epoch": 0.5333665835411471, + "grad_norm": 0.46491839451959555, + "learning_rate": 2.3518756112559263e-05, + "loss": 0.8933, + "step": 10694 + }, + { + "epoch": 0.5334164588528678, + "grad_norm": 0.42010812976897877, + "learning_rate": 2.3514724774601035e-05, + "loss": 0.8433, + "step": 10695 + }, + { + "epoch": 0.5334663341645886, + "grad_norm": 0.549355849232713, + "learning_rate": 2.351069347540039e-05, + "loss": 0.8742, + "step": 10696 + }, + { + "epoch": 0.5335162094763092, + "grad_norm": 0.4710132544233296, + "learning_rate": 2.3506662215062532e-05, + "loss": 0.8668, + "step": 10697 + }, + { + "epoch": 0.5335660847880299, + "grad_norm": 0.8601447137817918, + "learning_rate": 2.3502630993692663e-05, + "loss": 0.8617, + "step": 10698 + }, + { + "epoch": 0.5336159600997507, + "grad_norm": 0.465964926209992, + "learning_rate": 2.3498599811395963e-05, + "loss": 0.8734, + "step": 10699 + }, + { + "epoch": 0.5336658354114713, + "grad_norm": 0.45276965528583046, + "learning_rate": 2.3494568668277628e-05, + "loss": 0.9167, + "step": 10700 + }, + { + "epoch": 0.533715710723192, + "grad_norm": 0.5283726254175019, + "learning_rate": 2.3490537564442847e-05, + "loss": 0.8988, + "step": 10701 + }, + { + "epoch": 0.5337655860349128, + "grad_norm": 0.40072302056500464, + "learning_rate": 2.3486506499996826e-05, + "loss": 0.9088, + "step": 10702 + }, + { + "epoch": 0.5338154613466334, + "grad_norm": 0.4891919014597527, + "learning_rate": 2.3482475475044726e-05, + "loss": 0.8668, + "step": 10703 + }, + { + "epoch": 0.5338653366583541, + "grad_norm": 0.47264902190186825, + "learning_rate": 2.3478444489691752e-05, + "loss": 0.8802, + "step": 10704 + }, + { + "epoch": 0.5339152119700749, + "grad_norm": 0.40961533929421645, + "learning_rate": 2.3474413544043088e-05, + "loss": 0.8585, + "step": 10705 + }, + { + "epoch": 0.5339650872817955, + "grad_norm": 0.38798417611885955, + "learning_rate": 2.3470382638203932e-05, + "loss": 0.8475, + "step": 10706 + }, + { + "epoch": 0.5340149625935162, + "grad_norm": 0.7928910926766706, + "learning_rate": 2.3466351772279442e-05, + "loss": 0.8825, + "step": 10707 + }, + { + "epoch": 0.534064837905237, + "grad_norm": 0.4767557814689379, + "learning_rate": 2.3462320946374823e-05, + "loss": 0.862, + "step": 10708 + }, + { + "epoch": 0.5341147132169576, + "grad_norm": 0.4548523965034525, + "learning_rate": 2.3458290160595253e-05, + "loss": 0.8553, + "step": 10709 + }, + { + "epoch": 0.5341645885286783, + "grad_norm": 0.5682082711789006, + "learning_rate": 2.3454259415045908e-05, + "loss": 0.8639, + "step": 10710 + }, + { + "epoch": 0.5342144638403991, + "grad_norm": 0.5555860050187182, + "learning_rate": 2.3450228709831968e-05, + "loss": 0.8946, + "step": 10711 + }, + { + "epoch": 0.5342643391521197, + "grad_norm": 0.40030150084119015, + "learning_rate": 2.3446198045058616e-05, + "loss": 0.8644, + "step": 10712 + }, + { + "epoch": 0.5343142144638404, + "grad_norm": 0.469181830102029, + "learning_rate": 2.344216742083104e-05, + "loss": 0.8368, + "step": 10713 + }, + { + "epoch": 0.534364089775561, + "grad_norm": 0.4337061407931677, + "learning_rate": 2.34381368372544e-05, + "loss": 0.8488, + "step": 10714 + }, + { + "epoch": 0.5344139650872818, + "grad_norm": 0.4946014000332981, + "learning_rate": 2.3434106294433877e-05, + "loss": 0.8577, + "step": 10715 + }, + { + "epoch": 0.5344638403990025, + "grad_norm": 0.8658801162947303, + "learning_rate": 2.3430075792474653e-05, + "loss": 0.9247, + "step": 10716 + }, + { + "epoch": 0.5345137157107231, + "grad_norm": 0.42005240433971064, + "learning_rate": 2.34260453314819e-05, + "loss": 0.8735, + "step": 10717 + }, + { + "epoch": 0.5345635910224439, + "grad_norm": 0.3740286927449418, + "learning_rate": 2.3422014911560793e-05, + "loss": 0.879, + "step": 10718 + }, + { + "epoch": 0.5346134663341646, + "grad_norm": 0.44418527645042766, + "learning_rate": 2.3417984532816495e-05, + "loss": 0.903, + "step": 10719 + }, + { + "epoch": 0.5346633416458852, + "grad_norm": 0.5238673320526408, + "learning_rate": 2.341395419535419e-05, + "loss": 0.8582, + "step": 10720 + }, + { + "epoch": 0.534713216957606, + "grad_norm": 0.5414624177531404, + "learning_rate": 2.340992389927903e-05, + "loss": 0.9185, + "step": 10721 + }, + { + "epoch": 0.5347630922693267, + "grad_norm": 0.4028059534744311, + "learning_rate": 2.3405893644696197e-05, + "loss": 0.8831, + "step": 10722 + }, + { + "epoch": 0.5348129675810473, + "grad_norm": 0.4489833262290266, + "learning_rate": 2.3401863431710863e-05, + "loss": 0.8825, + "step": 10723 + }, + { + "epoch": 0.5348628428927681, + "grad_norm": 0.6209289943613391, + "learning_rate": 2.3397833260428183e-05, + "loss": 0.8768, + "step": 10724 + }, + { + "epoch": 0.5349127182044888, + "grad_norm": 0.44188673344177554, + "learning_rate": 2.3393803130953328e-05, + "loss": 0.8929, + "step": 10725 + }, + { + "epoch": 0.5349625935162095, + "grad_norm": 0.3972416748234258, + "learning_rate": 2.3389773043391455e-05, + "loss": 0.8509, + "step": 10726 + }, + { + "epoch": 0.5350124688279302, + "grad_norm": 0.5170361575769973, + "learning_rate": 2.3385742997847736e-05, + "loss": 0.8815, + "step": 10727 + }, + { + "epoch": 0.5350623441396509, + "grad_norm": 0.38802737549548577, + "learning_rate": 2.3381712994427342e-05, + "loss": 0.8581, + "step": 10728 + }, + { + "epoch": 0.5351122194513716, + "grad_norm": 0.39293167777863874, + "learning_rate": 2.3377683033235416e-05, + "loss": 0.8695, + "step": 10729 + }, + { + "epoch": 0.5351620947630923, + "grad_norm": 0.4741398356167228, + "learning_rate": 2.337365311437712e-05, + "loss": 0.8566, + "step": 10730 + }, + { + "epoch": 0.5352119700748129, + "grad_norm": 0.425752752559751, + "learning_rate": 2.3369623237957634e-05, + "loss": 0.8516, + "step": 10731 + }, + { + "epoch": 0.5352618453865337, + "grad_norm": 0.6396778954062424, + "learning_rate": 2.3365593404082085e-05, + "loss": 0.8851, + "step": 10732 + }, + { + "epoch": 0.5353117206982544, + "grad_norm": 0.5092978378781523, + "learning_rate": 2.3361563612855645e-05, + "loss": 0.8543, + "step": 10733 + }, + { + "epoch": 0.535361596009975, + "grad_norm": 0.4725981553981762, + "learning_rate": 2.335753386438347e-05, + "loss": 0.8973, + "step": 10734 + }, + { + "epoch": 0.5354114713216958, + "grad_norm": 0.4125517577686178, + "learning_rate": 2.3353504158770722e-05, + "loss": 0.8509, + "step": 10735 + }, + { + "epoch": 0.5354613466334165, + "grad_norm": 0.5984209751140256, + "learning_rate": 2.3349474496122538e-05, + "loss": 0.8734, + "step": 10736 + }, + { + "epoch": 0.5355112219451371, + "grad_norm": 0.6692663274441359, + "learning_rate": 2.334544487654408e-05, + "loss": 0.8976, + "step": 10737 + }, + { + "epoch": 0.5355610972568579, + "grad_norm": 0.42765892292246216, + "learning_rate": 2.3341415300140495e-05, + "loss": 0.8957, + "step": 10738 + }, + { + "epoch": 0.5356109725685786, + "grad_norm": 0.4729739248439162, + "learning_rate": 2.3337385767016947e-05, + "loss": 0.8843, + "step": 10739 + }, + { + "epoch": 0.5356608478802992, + "grad_norm": 0.42763278486761364, + "learning_rate": 2.3333356277278562e-05, + "loss": 0.888, + "step": 10740 + }, + { + "epoch": 0.53571072319202, + "grad_norm": 0.4271043051351126, + "learning_rate": 2.3329326831030497e-05, + "loss": 0.883, + "step": 10741 + }, + { + "epoch": 0.5357605985037407, + "grad_norm": 0.4716053438103238, + "learning_rate": 2.3325297428377908e-05, + "loss": 0.8548, + "step": 10742 + }, + { + "epoch": 0.5358104738154613, + "grad_norm": 0.4266616396849832, + "learning_rate": 2.3321268069425936e-05, + "loss": 0.8542, + "step": 10743 + }, + { + "epoch": 0.5358603491271821, + "grad_norm": 0.4452696490540143, + "learning_rate": 2.3317238754279717e-05, + "loss": 0.9108, + "step": 10744 + }, + { + "epoch": 0.5359102244389028, + "grad_norm": 0.5043312445983815, + "learning_rate": 2.3313209483044394e-05, + "loss": 0.9109, + "step": 10745 + }, + { + "epoch": 0.5359600997506234, + "grad_norm": 0.41391872213074576, + "learning_rate": 2.330918025582513e-05, + "loss": 0.9267, + "step": 10746 + }, + { + "epoch": 0.5360099750623442, + "grad_norm": 0.6909090082138241, + "learning_rate": 2.330515107272704e-05, + "loss": 0.8584, + "step": 10747 + }, + { + "epoch": 0.5360598503740648, + "grad_norm": 0.4709813319625453, + "learning_rate": 2.330112193385527e-05, + "loss": 0.8577, + "step": 10748 + }, + { + "epoch": 0.5361097256857855, + "grad_norm": 0.44683503301463157, + "learning_rate": 2.3297092839314965e-05, + "loss": 0.8904, + "step": 10749 + }, + { + "epoch": 0.5361596009975063, + "grad_norm": 0.47411130742671376, + "learning_rate": 2.329306378921127e-05, + "loss": 0.8796, + "step": 10750 + }, + { + "epoch": 0.5362094763092269, + "grad_norm": 0.5075705445421791, + "learning_rate": 2.3289034783649306e-05, + "loss": 0.8734, + "step": 10751 + }, + { + "epoch": 0.5362593516209476, + "grad_norm": 0.391408891106187, + "learning_rate": 2.3285005822734206e-05, + "loss": 0.888, + "step": 10752 + }, + { + "epoch": 0.5363092269326684, + "grad_norm": 0.4049702370284881, + "learning_rate": 2.3280976906571117e-05, + "loss": 0.8857, + "step": 10753 + }, + { + "epoch": 0.536359102244389, + "grad_norm": 0.46017920529427536, + "learning_rate": 2.327694803526517e-05, + "loss": 0.8609, + "step": 10754 + }, + { + "epoch": 0.5364089775561097, + "grad_norm": 0.4958900515660053, + "learning_rate": 2.3272919208921494e-05, + "loss": 0.9255, + "step": 10755 + }, + { + "epoch": 0.5364588528678305, + "grad_norm": 0.810281562507245, + "learning_rate": 2.3268890427645213e-05, + "loss": 0.9189, + "step": 10756 + }, + { + "epoch": 0.5365087281795511, + "grad_norm": 0.3925803217482451, + "learning_rate": 2.3264861691541464e-05, + "loss": 0.8499, + "step": 10757 + }, + { + "epoch": 0.5365586034912718, + "grad_norm": 0.5669682655790212, + "learning_rate": 2.3260833000715366e-05, + "loss": 0.8704, + "step": 10758 + }, + { + "epoch": 0.5366084788029926, + "grad_norm": 1.0144707455038016, + "learning_rate": 2.3256804355272057e-05, + "loss": 0.8776, + "step": 10759 + }, + { + "epoch": 0.5366583541147132, + "grad_norm": 0.3720005792909516, + "learning_rate": 2.3252775755316664e-05, + "loss": 0.8295, + "step": 10760 + }, + { + "epoch": 0.5367082294264339, + "grad_norm": 0.41798452826440347, + "learning_rate": 2.3248747200954303e-05, + "loss": 0.8321, + "step": 10761 + }, + { + "epoch": 0.5367581047381547, + "grad_norm": 0.4953187846298095, + "learning_rate": 2.3244718692290098e-05, + "loss": 0.8527, + "step": 10762 + }, + { + "epoch": 0.5368079800498753, + "grad_norm": 0.4581558035199629, + "learning_rate": 2.324069022942917e-05, + "loss": 0.8741, + "step": 10763 + }, + { + "epoch": 0.536857855361596, + "grad_norm": 0.44271748868145244, + "learning_rate": 2.3236661812476646e-05, + "loss": 0.8858, + "step": 10764 + }, + { + "epoch": 0.5369077306733168, + "grad_norm": 0.38591459611388534, + "learning_rate": 2.3232633441537647e-05, + "loss": 0.8635, + "step": 10765 + }, + { + "epoch": 0.5369576059850374, + "grad_norm": 0.44489526396091483, + "learning_rate": 2.3228605116717278e-05, + "loss": 0.8941, + "step": 10766 + }, + { + "epoch": 0.5370074812967581, + "grad_norm": 0.4449309731916425, + "learning_rate": 2.3224576838120672e-05, + "loss": 0.8854, + "step": 10767 + }, + { + "epoch": 0.5370573566084788, + "grad_norm": 0.5583271764756841, + "learning_rate": 2.3220548605852943e-05, + "loss": 0.9224, + "step": 10768 + }, + { + "epoch": 0.5371072319201995, + "grad_norm": 0.43280791644994204, + "learning_rate": 2.3216520420019195e-05, + "loss": 0.8671, + "step": 10769 + }, + { + "epoch": 0.5371571072319202, + "grad_norm": 0.4491060361373269, + "learning_rate": 2.321249228072455e-05, + "loss": 0.8934, + "step": 10770 + }, + { + "epoch": 0.5372069825436409, + "grad_norm": 0.3815700873407512, + "learning_rate": 2.3208464188074113e-05, + "loss": 0.8933, + "step": 10771 + }, + { + "epoch": 0.5372568578553616, + "grad_norm": 0.43445957442361355, + "learning_rate": 2.3204436142173016e-05, + "loss": 0.867, + "step": 10772 + }, + { + "epoch": 0.5373067331670823, + "grad_norm": 0.5298828435845482, + "learning_rate": 2.3200408143126343e-05, + "loss": 0.9004, + "step": 10773 + }, + { + "epoch": 0.537356608478803, + "grad_norm": 0.4060854616755215, + "learning_rate": 2.3196380191039214e-05, + "loss": 0.8486, + "step": 10774 + }, + { + "epoch": 0.5374064837905237, + "grad_norm": 0.7365581276670848, + "learning_rate": 2.3192352286016738e-05, + "loss": 0.8702, + "step": 10775 + }, + { + "epoch": 0.5374563591022444, + "grad_norm": 0.4302265685678647, + "learning_rate": 2.318832442816403e-05, + "loss": 0.9011, + "step": 10776 + }, + { + "epoch": 0.5375062344139651, + "grad_norm": 0.3939067358223608, + "learning_rate": 2.3184296617586172e-05, + "loss": 0.8606, + "step": 10777 + }, + { + "epoch": 0.5375561097256858, + "grad_norm": 0.43257201237081866, + "learning_rate": 2.3180268854388285e-05, + "loss": 0.8817, + "step": 10778 + }, + { + "epoch": 0.5376059850374065, + "grad_norm": 0.38434761406445184, + "learning_rate": 2.3176241138675474e-05, + "loss": 0.8333, + "step": 10779 + }, + { + "epoch": 0.5376558603491272, + "grad_norm": 0.6592007060663995, + "learning_rate": 2.3172213470552827e-05, + "loss": 0.875, + "step": 10780 + }, + { + "epoch": 0.5377057356608479, + "grad_norm": 0.48801033758476464, + "learning_rate": 2.3168185850125453e-05, + "loss": 0.8828, + "step": 10781 + }, + { + "epoch": 0.5377556109725686, + "grad_norm": 0.46002225792300855, + "learning_rate": 2.3164158277498445e-05, + "loss": 0.9146, + "step": 10782 + }, + { + "epoch": 0.5378054862842893, + "grad_norm": 0.48176511125785926, + "learning_rate": 2.316013075277692e-05, + "loss": 0.8722, + "step": 10783 + }, + { + "epoch": 0.53785536159601, + "grad_norm": 0.5382540071514205, + "learning_rate": 2.315610327606595e-05, + "loss": 0.8436, + "step": 10784 + }, + { + "epoch": 0.5379052369077306, + "grad_norm": 0.37440565150999466, + "learning_rate": 2.3152075847470633e-05, + "loss": 0.8968, + "step": 10785 + }, + { + "epoch": 0.5379551122194514, + "grad_norm": 0.5080857894374964, + "learning_rate": 2.3148048467096075e-05, + "loss": 0.8835, + "step": 10786 + }, + { + "epoch": 0.5380049875311721, + "grad_norm": 0.4484415898062113, + "learning_rate": 2.3144021135047374e-05, + "loss": 0.876, + "step": 10787 + }, + { + "epoch": 0.5380548628428927, + "grad_norm": 0.3949202116979042, + "learning_rate": 2.3139993851429597e-05, + "loss": 0.8663, + "step": 10788 + }, + { + "epoch": 0.5381047381546135, + "grad_norm": 0.4204972692858841, + "learning_rate": 2.313596661634785e-05, + "loss": 0.9262, + "step": 10789 + }, + { + "epoch": 0.5381546134663342, + "grad_norm": 0.46304714172973066, + "learning_rate": 2.3131939429907227e-05, + "loss": 0.9064, + "step": 10790 + }, + { + "epoch": 0.5382044887780548, + "grad_norm": 0.9026415365618442, + "learning_rate": 2.3127912292212806e-05, + "loss": 0.9241, + "step": 10791 + }, + { + "epoch": 0.5382543640897756, + "grad_norm": 0.4359249213146622, + "learning_rate": 2.3123885203369673e-05, + "loss": 0.9214, + "step": 10792 + }, + { + "epoch": 0.5383042394014963, + "grad_norm": 0.4096423162907974, + "learning_rate": 2.311985816348291e-05, + "loss": 0.8842, + "step": 10793 + }, + { + "epoch": 0.5383541147132169, + "grad_norm": 0.7403802477849786, + "learning_rate": 2.311583117265762e-05, + "loss": 0.9183, + "step": 10794 + }, + { + "epoch": 0.5384039900249377, + "grad_norm": 0.4183783863453003, + "learning_rate": 2.3111804230998863e-05, + "loss": 0.9017, + "step": 10795 + }, + { + "epoch": 0.5384538653366584, + "grad_norm": 0.5963777852064237, + "learning_rate": 2.3107777338611734e-05, + "loss": 0.8729, + "step": 10796 + }, + { + "epoch": 0.538503740648379, + "grad_norm": 0.46500720148459107, + "learning_rate": 2.3103750495601302e-05, + "loss": 0.913, + "step": 10797 + }, + { + "epoch": 0.5385536159600998, + "grad_norm": 0.5111939072065611, + "learning_rate": 2.3099723702072662e-05, + "loss": 0.8996, + "step": 10798 + }, + { + "epoch": 0.5386034912718205, + "grad_norm": 0.6494893416028429, + "learning_rate": 2.3095696958130875e-05, + "loss": 0.8851, + "step": 10799 + }, + { + "epoch": 0.5386533665835411, + "grad_norm": 0.5330641880416972, + "learning_rate": 2.309167026388102e-05, + "loss": 0.8545, + "step": 10800 + }, + { + "epoch": 0.5387032418952619, + "grad_norm": 0.47948531732059885, + "learning_rate": 2.3087643619428184e-05, + "loss": 0.8779, + "step": 10801 + }, + { + "epoch": 0.5387531172069825, + "grad_norm": 0.478998741001853, + "learning_rate": 2.3083617024877432e-05, + "loss": 0.8995, + "step": 10802 + }, + { + "epoch": 0.5388029925187032, + "grad_norm": 0.4610739539520843, + "learning_rate": 2.307959048033383e-05, + "loss": 0.8735, + "step": 10803 + }, + { + "epoch": 0.538852867830424, + "grad_norm": 0.49555649649596156, + "learning_rate": 2.3075563985902453e-05, + "loss": 0.9212, + "step": 10804 + }, + { + "epoch": 0.5389027431421446, + "grad_norm": 0.41082712682431777, + "learning_rate": 2.3071537541688387e-05, + "loss": 0.8847, + "step": 10805 + }, + { + "epoch": 0.5389526184538653, + "grad_norm": 0.4100308600357924, + "learning_rate": 2.3067511147796674e-05, + "loss": 0.8918, + "step": 10806 + }, + { + "epoch": 0.5390024937655861, + "grad_norm": 0.4809733950586797, + "learning_rate": 2.306348480433239e-05, + "loss": 0.8494, + "step": 10807 + }, + { + "epoch": 0.5390523690773067, + "grad_norm": 1.041583520597493, + "learning_rate": 2.3059458511400608e-05, + "loss": 0.8439, + "step": 10808 + }, + { + "epoch": 0.5391022443890274, + "grad_norm": 0.7859544081304322, + "learning_rate": 2.3055432269106393e-05, + "loss": 0.8502, + "step": 10809 + }, + { + "epoch": 0.5391521197007482, + "grad_norm": 0.6157040622369682, + "learning_rate": 2.3051406077554795e-05, + "loss": 0.872, + "step": 10810 + }, + { + "epoch": 0.5392019950124688, + "grad_norm": 0.4452598032436987, + "learning_rate": 2.3047379936850886e-05, + "loss": 0.8951, + "step": 10811 + }, + { + "epoch": 0.5392518703241895, + "grad_norm": 0.47464930105262165, + "learning_rate": 2.304335384709972e-05, + "loss": 0.8874, + "step": 10812 + }, + { + "epoch": 0.5393017456359103, + "grad_norm": 0.48467818843472615, + "learning_rate": 2.3039327808406373e-05, + "loss": 0.8769, + "step": 10813 + }, + { + "epoch": 0.5393516209476309, + "grad_norm": 0.6244234093156846, + "learning_rate": 2.3035301820875878e-05, + "loss": 0.8561, + "step": 10814 + }, + { + "epoch": 0.5394014962593516, + "grad_norm": 0.45927719962212626, + "learning_rate": 2.3031275884613298e-05, + "loss": 0.8784, + "step": 10815 + }, + { + "epoch": 0.5394513715710724, + "grad_norm": 0.42266865382118757, + "learning_rate": 2.3027249999723707e-05, + "loss": 0.9237, + "step": 10816 + }, + { + "epoch": 0.539501246882793, + "grad_norm": 0.41383404932527357, + "learning_rate": 2.3023224166312137e-05, + "loss": 0.8518, + "step": 10817 + }, + { + "epoch": 0.5395511221945137, + "grad_norm": 0.4577744803441836, + "learning_rate": 2.3019198384483643e-05, + "loss": 0.8641, + "step": 10818 + }, + { + "epoch": 0.5396009975062344, + "grad_norm": 0.4403334492574061, + "learning_rate": 2.3015172654343282e-05, + "loss": 0.8457, + "step": 10819 + }, + { + "epoch": 0.5396508728179551, + "grad_norm": 0.47760804422808506, + "learning_rate": 2.3011146975996112e-05, + "loss": 0.8841, + "step": 10820 + }, + { + "epoch": 0.5397007481296758, + "grad_norm": 0.4523298160067642, + "learning_rate": 2.3007121349547163e-05, + "loss": 0.8956, + "step": 10821 + }, + { + "epoch": 0.5397506234413965, + "grad_norm": 0.4283153412299709, + "learning_rate": 2.300309577510149e-05, + "loss": 0.8581, + "step": 10822 + }, + { + "epoch": 0.5398004987531172, + "grad_norm": 0.5427684483902175, + "learning_rate": 2.2999070252764135e-05, + "loss": 0.8796, + "step": 10823 + }, + { + "epoch": 0.5398503740648379, + "grad_norm": 0.6223177576017692, + "learning_rate": 2.299504478264016e-05, + "loss": 0.8965, + "step": 10824 + }, + { + "epoch": 0.5399002493765586, + "grad_norm": 0.4911258065684529, + "learning_rate": 2.2991019364834586e-05, + "loss": 0.9019, + "step": 10825 + }, + { + "epoch": 0.5399501246882793, + "grad_norm": 0.42751482510207056, + "learning_rate": 2.2986993999452456e-05, + "loss": 0.8763, + "step": 10826 + }, + { + "epoch": 0.54, + "grad_norm": 0.3855199349838231, + "learning_rate": 2.298296868659883e-05, + "loss": 0.8671, + "step": 10827 + }, + { + "epoch": 0.5400498753117207, + "grad_norm": 0.5219369596771692, + "learning_rate": 2.297894342637873e-05, + "loss": 0.854, + "step": 10828 + }, + { + "epoch": 0.5400997506234414, + "grad_norm": 0.40975585965664885, + "learning_rate": 2.297491821889719e-05, + "loss": 0.9081, + "step": 10829 + }, + { + "epoch": 0.5401496259351621, + "grad_norm": 0.44488607317101153, + "learning_rate": 2.297089306425925e-05, + "loss": 0.8977, + "step": 10830 + }, + { + "epoch": 0.5401995012468828, + "grad_norm": 0.5766176585537096, + "learning_rate": 2.296686796256996e-05, + "loss": 0.887, + "step": 10831 + }, + { + "epoch": 0.5402493765586035, + "grad_norm": 0.4654177475544338, + "learning_rate": 2.296284291393433e-05, + "loss": 0.9048, + "step": 10832 + }, + { + "epoch": 0.5402992518703242, + "grad_norm": 0.43651841519379414, + "learning_rate": 2.2958817918457412e-05, + "loss": 0.8721, + "step": 10833 + }, + { + "epoch": 0.5403491271820449, + "grad_norm": 0.4188779878228321, + "learning_rate": 2.295479297624422e-05, + "loss": 0.8742, + "step": 10834 + }, + { + "epoch": 0.5403990024937656, + "grad_norm": 0.45402688199591745, + "learning_rate": 2.2950768087399793e-05, + "loss": 0.9236, + "step": 10835 + }, + { + "epoch": 0.5404488778054863, + "grad_norm": 0.48299608436727415, + "learning_rate": 2.294674325202915e-05, + "loss": 0.8262, + "step": 10836 + }, + { + "epoch": 0.540498753117207, + "grad_norm": 0.40504415182129166, + "learning_rate": 2.294271847023733e-05, + "loss": 0.8802, + "step": 10837 + }, + { + "epoch": 0.5405486284289277, + "grad_norm": 0.38306442198168317, + "learning_rate": 2.2938693742129352e-05, + "loss": 0.8592, + "step": 10838 + }, + { + "epoch": 0.5405985037406483, + "grad_norm": 0.39144798033360684, + "learning_rate": 2.2934669067810233e-05, + "loss": 0.9046, + "step": 10839 + }, + { + "epoch": 0.5406483790523691, + "grad_norm": 0.4026568732830095, + "learning_rate": 2.2930644447385e-05, + "loss": 0.8347, + "step": 10840 + }, + { + "epoch": 0.5406982543640898, + "grad_norm": 0.42758242037130556, + "learning_rate": 2.292661988095867e-05, + "loss": 0.8856, + "step": 10841 + }, + { + "epoch": 0.5407481296758104, + "grad_norm": 0.4757797782107204, + "learning_rate": 2.2922595368636283e-05, + "loss": 0.8652, + "step": 10842 + }, + { + "epoch": 0.5407980049875312, + "grad_norm": 0.41962330862050456, + "learning_rate": 2.2918570910522826e-05, + "loss": 0.8446, + "step": 10843 + }, + { + "epoch": 0.5408478802992519, + "grad_norm": 0.4641305825483007, + "learning_rate": 2.291454650672333e-05, + "loss": 0.8504, + "step": 10844 + }, + { + "epoch": 0.5408977556109725, + "grad_norm": 0.5395065812191875, + "learning_rate": 2.291052215734281e-05, + "loss": 0.8948, + "step": 10845 + }, + { + "epoch": 0.5409476309226933, + "grad_norm": 0.5254167933817063, + "learning_rate": 2.290649786248629e-05, + "loss": 0.8729, + "step": 10846 + }, + { + "epoch": 0.540997506234414, + "grad_norm": 0.49942263313351914, + "learning_rate": 2.290247362225876e-05, + "loss": 0.9171, + "step": 10847 + }, + { + "epoch": 0.5410473815461346, + "grad_norm": 0.42759128774234234, + "learning_rate": 2.2898449436765243e-05, + "loss": 0.8783, + "step": 10848 + }, + { + "epoch": 0.5410972568578554, + "grad_norm": 0.4798850427033152, + "learning_rate": 2.289442530611076e-05, + "loss": 0.8696, + "step": 10849 + }, + { + "epoch": 0.5411471321695761, + "grad_norm": 0.44008527495890876, + "learning_rate": 2.289040123040029e-05, + "loss": 0.927, + "step": 10850 + }, + { + "epoch": 0.5411970074812967, + "grad_norm": 0.44327777227764975, + "learning_rate": 2.288637720973886e-05, + "loss": 0.8757, + "step": 10851 + }, + { + "epoch": 0.5412468827930175, + "grad_norm": 0.4302611073420843, + "learning_rate": 2.288235324423147e-05, + "loss": 0.8309, + "step": 10852 + }, + { + "epoch": 0.5412967581047382, + "grad_norm": 0.4488899927669766, + "learning_rate": 2.2878329333983133e-05, + "loss": 0.8926, + "step": 10853 + }, + { + "epoch": 0.5413466334164588, + "grad_norm": 0.572245469707827, + "learning_rate": 2.2874305479098836e-05, + "loss": 0.8904, + "step": 10854 + }, + { + "epoch": 0.5413965087281796, + "grad_norm": 0.48429175902458943, + "learning_rate": 2.2870281679683582e-05, + "loss": 0.8655, + "step": 10855 + }, + { + "epoch": 0.5414463840399002, + "grad_norm": 0.5957180731660782, + "learning_rate": 2.2866257935842377e-05, + "loss": 0.8859, + "step": 10856 + }, + { + "epoch": 0.5414962593516209, + "grad_norm": 0.40295703527189736, + "learning_rate": 2.2862234247680226e-05, + "loss": 0.8319, + "step": 10857 + }, + { + "epoch": 0.5415461346633417, + "grad_norm": 0.42763867692696805, + "learning_rate": 2.2858210615302105e-05, + "loss": 0.8419, + "step": 10858 + }, + { + "epoch": 0.5415960099750623, + "grad_norm": 0.4352125602446766, + "learning_rate": 2.2854187038813018e-05, + "loss": 0.8512, + "step": 10859 + }, + { + "epoch": 0.541645885286783, + "grad_norm": 0.6403958605505036, + "learning_rate": 2.2850163518317962e-05, + "loss": 0.8559, + "step": 10860 + }, + { + "epoch": 0.5416957605985038, + "grad_norm": 0.4326340669135393, + "learning_rate": 2.284614005392194e-05, + "loss": 0.8868, + "step": 10861 + }, + { + "epoch": 0.5417456359102244, + "grad_norm": 0.49138860593370537, + "learning_rate": 2.2842116645729912e-05, + "loss": 0.9023, + "step": 10862 + }, + { + "epoch": 0.5417955112219451, + "grad_norm": 0.44880657427716636, + "learning_rate": 2.2838093293846888e-05, + "loss": 0.8413, + "step": 10863 + }, + { + "epoch": 0.5418453865336659, + "grad_norm": 0.4029056993915293, + "learning_rate": 2.2834069998377857e-05, + "loss": 0.8801, + "step": 10864 + }, + { + "epoch": 0.5418952618453865, + "grad_norm": 0.5706835695374868, + "learning_rate": 2.28300467594278e-05, + "loss": 0.8214, + "step": 10865 + }, + { + "epoch": 0.5419451371571072, + "grad_norm": 0.4388323755578439, + "learning_rate": 2.2826023577101695e-05, + "loss": 0.8481, + "step": 10866 + }, + { + "epoch": 0.541995012468828, + "grad_norm": 0.5242945315069968, + "learning_rate": 2.2822000451504535e-05, + "loss": 0.9128, + "step": 10867 + }, + { + "epoch": 0.5420448877805486, + "grad_norm": 0.5524662795978061, + "learning_rate": 2.28179773827413e-05, + "loss": 0.8781, + "step": 10868 + }, + { + "epoch": 0.5420947630922693, + "grad_norm": 0.4670203794036281, + "learning_rate": 2.2813954370916967e-05, + "loss": 0.8822, + "step": 10869 + }, + { + "epoch": 0.5421446384039901, + "grad_norm": 0.62777270201425, + "learning_rate": 2.280993141613652e-05, + "loss": 0.8732, + "step": 10870 + }, + { + "epoch": 0.5421945137157107, + "grad_norm": 0.4385263522645943, + "learning_rate": 2.280590851850493e-05, + "loss": 0.8727, + "step": 10871 + }, + { + "epoch": 0.5422443890274314, + "grad_norm": 0.5407556295265318, + "learning_rate": 2.2801885678127176e-05, + "loss": 0.8936, + "step": 10872 + }, + { + "epoch": 0.5422942643391521, + "grad_norm": 0.5497981017820109, + "learning_rate": 2.279786289510823e-05, + "loss": 0.8802, + "step": 10873 + }, + { + "epoch": 0.5423441396508728, + "grad_norm": 0.4297836668480151, + "learning_rate": 2.2793840169553063e-05, + "loss": 0.8781, + "step": 10874 + }, + { + "epoch": 0.5423940149625935, + "grad_norm": 0.3671643739676615, + "learning_rate": 2.2789817501566663e-05, + "loss": 0.856, + "step": 10875 + }, + { + "epoch": 0.5424438902743142, + "grad_norm": 0.46264070323733414, + "learning_rate": 2.2785794891253974e-05, + "loss": 0.8719, + "step": 10876 + }, + { + "epoch": 0.5424937655860349, + "grad_norm": 0.42306517798189414, + "learning_rate": 2.2781772338719974e-05, + "loss": 0.836, + "step": 10877 + }, + { + "epoch": 0.5425436408977556, + "grad_norm": 0.5468357705488162, + "learning_rate": 2.2777749844069634e-05, + "loss": 0.9012, + "step": 10878 + }, + { + "epoch": 0.5425935162094763, + "grad_norm": 0.421197531888687, + "learning_rate": 2.277372740740793e-05, + "loss": 0.8369, + "step": 10879 + }, + { + "epoch": 0.542643391521197, + "grad_norm": 0.5521973955030834, + "learning_rate": 2.27697050288398e-05, + "loss": 0.8855, + "step": 10880 + }, + { + "epoch": 0.5426932668329177, + "grad_norm": 0.521746807965276, + "learning_rate": 2.2765682708470218e-05, + "loss": 0.8857, + "step": 10881 + }, + { + "epoch": 0.5427431421446384, + "grad_norm": 0.41230434933364124, + "learning_rate": 2.2761660446404147e-05, + "loss": 0.832, + "step": 10882 + }, + { + "epoch": 0.5427930174563591, + "grad_norm": 0.42750020897996155, + "learning_rate": 2.2757638242746555e-05, + "loss": 0.8677, + "step": 10883 + }, + { + "epoch": 0.5428428927680798, + "grad_norm": 0.40705429599908804, + "learning_rate": 2.2753616097602382e-05, + "loss": 0.881, + "step": 10884 + }, + { + "epoch": 0.5428927680798005, + "grad_norm": 0.4234150849749047, + "learning_rate": 2.274959401107659e-05, + "loss": 0.8843, + "step": 10885 + }, + { + "epoch": 0.5429426433915212, + "grad_norm": 0.6169726950090093, + "learning_rate": 2.2745571983274143e-05, + "loss": 0.9018, + "step": 10886 + }, + { + "epoch": 0.5429925187032419, + "grad_norm": 0.441984708055616, + "learning_rate": 2.2741550014299982e-05, + "loss": 0.8882, + "step": 10887 + }, + { + "epoch": 0.5430423940149626, + "grad_norm": 0.45651976988310805, + "learning_rate": 2.2737528104259056e-05, + "loss": 0.8924, + "step": 10888 + }, + { + "epoch": 0.5430922693266833, + "grad_norm": 0.5048206582339886, + "learning_rate": 2.2733506253256324e-05, + "loss": 0.822, + "step": 10889 + }, + { + "epoch": 0.543142144638404, + "grad_norm": 0.5960837408088699, + "learning_rate": 2.2729484461396745e-05, + "loss": 0.8918, + "step": 10890 + }, + { + "epoch": 0.5431920199501247, + "grad_norm": 0.4786879567563266, + "learning_rate": 2.2725462728785237e-05, + "loss": 0.8732, + "step": 10891 + }, + { + "epoch": 0.5432418952618454, + "grad_norm": 0.47881755562147965, + "learning_rate": 2.2721441055526765e-05, + "loss": 0.8548, + "step": 10892 + }, + { + "epoch": 0.543291770573566, + "grad_norm": 0.44617803745753315, + "learning_rate": 2.2717419441726268e-05, + "loss": 0.8659, + "step": 10893 + }, + { + "epoch": 0.5433416458852868, + "grad_norm": 0.43704243185621017, + "learning_rate": 2.2713397887488698e-05, + "loss": 0.8619, + "step": 10894 + }, + { + "epoch": 0.5433915211970075, + "grad_norm": 1.4008888674994282, + "learning_rate": 2.2709376392918978e-05, + "loss": 0.8316, + "step": 10895 + }, + { + "epoch": 0.5434413965087281, + "grad_norm": 0.42587638024619917, + "learning_rate": 2.2705354958122056e-05, + "loss": 0.8928, + "step": 10896 + }, + { + "epoch": 0.5434912718204489, + "grad_norm": 0.4951677340619744, + "learning_rate": 2.270133358320288e-05, + "loss": 0.8654, + "step": 10897 + }, + { + "epoch": 0.5435411471321696, + "grad_norm": 0.565783820775318, + "learning_rate": 2.2697312268266363e-05, + "loss": 0.846, + "step": 10898 + }, + { + "epoch": 0.5435910224438902, + "grad_norm": 0.5986773429243967, + "learning_rate": 2.2693291013417453e-05, + "loss": 0.8504, + "step": 10899 + }, + { + "epoch": 0.543640897755611, + "grad_norm": 0.8498590594235512, + "learning_rate": 2.2689269818761083e-05, + "loss": 0.852, + "step": 10900 + }, + { + "epoch": 0.5436907730673317, + "grad_norm": 0.4467237214737615, + "learning_rate": 2.2685248684402187e-05, + "loss": 0.9025, + "step": 10901 + }, + { + "epoch": 0.5437406483790523, + "grad_norm": 0.44290962517222565, + "learning_rate": 2.268122761044569e-05, + "loss": 0.8922, + "step": 10902 + }, + { + "epoch": 0.5437905236907731, + "grad_norm": 0.47352244914156844, + "learning_rate": 2.2677206596996516e-05, + "loss": 0.8742, + "step": 10903 + }, + { + "epoch": 0.5438403990024938, + "grad_norm": 0.4672934595985439, + "learning_rate": 2.26731856441596e-05, + "loss": 0.8681, + "step": 10904 + }, + { + "epoch": 0.5438902743142144, + "grad_norm": 0.35918321661347663, + "learning_rate": 2.2669164752039864e-05, + "loss": 0.8235, + "step": 10905 + }, + { + "epoch": 0.5439401496259352, + "grad_norm": 0.4156510189302079, + "learning_rate": 2.266514392074223e-05, + "loss": 0.853, + "step": 10906 + }, + { + "epoch": 0.5439900249376559, + "grad_norm": 0.5406046569940448, + "learning_rate": 2.2661123150371616e-05, + "loss": 0.9329, + "step": 10907 + }, + { + "epoch": 0.5440399002493765, + "grad_norm": 0.44160463390727867, + "learning_rate": 2.2657102441032948e-05, + "loss": 0.881, + "step": 10908 + }, + { + "epoch": 0.5440897755610973, + "grad_norm": 0.45732206311474877, + "learning_rate": 2.265308179283115e-05, + "loss": 0.9293, + "step": 10909 + }, + { + "epoch": 0.5441396508728179, + "grad_norm": 0.4246101299039591, + "learning_rate": 2.2649061205871124e-05, + "loss": 0.9026, + "step": 10910 + }, + { + "epoch": 0.5441895261845386, + "grad_norm": 0.4873614812431459, + "learning_rate": 2.2645040680257798e-05, + "loss": 0.8903, + "step": 10911 + }, + { + "epoch": 0.5442394014962594, + "grad_norm": 0.5110948116540901, + "learning_rate": 2.2641020216096086e-05, + "loss": 0.8658, + "step": 10912 + }, + { + "epoch": 0.54428927680798, + "grad_norm": 0.44506217399448045, + "learning_rate": 2.2636999813490887e-05, + "loss": 0.8728, + "step": 10913 + }, + { + "epoch": 0.5443391521197007, + "grad_norm": 0.4502241189808725, + "learning_rate": 2.263297947254712e-05, + "loss": 0.8443, + "step": 10914 + }, + { + "epoch": 0.5443890274314215, + "grad_norm": 0.42191603829546814, + "learning_rate": 2.2628959193369695e-05, + "loss": 0.8459, + "step": 10915 + }, + { + "epoch": 0.5444389027431421, + "grad_norm": 0.4956180438598676, + "learning_rate": 2.262493897606353e-05, + "loss": 0.9416, + "step": 10916 + }, + { + "epoch": 0.5444887780548628, + "grad_norm": 0.41070191101938636, + "learning_rate": 2.262091882073351e-05, + "loss": 0.8319, + "step": 10917 + }, + { + "epoch": 0.5445386533665836, + "grad_norm": 0.5606432091803218, + "learning_rate": 2.2616898727484548e-05, + "loss": 0.8482, + "step": 10918 + }, + { + "epoch": 0.5445885286783042, + "grad_norm": 0.4754176740773817, + "learning_rate": 2.2612878696421546e-05, + "loss": 0.9079, + "step": 10919 + }, + { + "epoch": 0.5446384039900249, + "grad_norm": 0.5018368990683659, + "learning_rate": 2.2608858727649418e-05, + "loss": 0.8861, + "step": 10920 + }, + { + "epoch": 0.5446882793017457, + "grad_norm": 0.42009919418327907, + "learning_rate": 2.260483882127304e-05, + "loss": 0.8917, + "step": 10921 + }, + { + "epoch": 0.5447381546134663, + "grad_norm": 0.4109885770664145, + "learning_rate": 2.260081897739732e-05, + "loss": 0.8951, + "step": 10922 + }, + { + "epoch": 0.544788029925187, + "grad_norm": 0.5469997470387309, + "learning_rate": 2.2596799196127166e-05, + "loss": 0.8656, + "step": 10923 + }, + { + "epoch": 0.5448379052369078, + "grad_norm": 0.43957022711630445, + "learning_rate": 2.2592779477567453e-05, + "loss": 0.8805, + "step": 10924 + }, + { + "epoch": 0.5448877805486284, + "grad_norm": 0.5464240462689797, + "learning_rate": 2.258875982182308e-05, + "loss": 0.9246, + "step": 10925 + }, + { + "epoch": 0.5449376558603491, + "grad_norm": 0.4455964370733353, + "learning_rate": 2.2584740228998942e-05, + "loss": 0.8685, + "step": 10926 + }, + { + "epoch": 0.5449875311720698, + "grad_norm": 0.4707447549838937, + "learning_rate": 2.2580720699199935e-05, + "loss": 0.8533, + "step": 10927 + }, + { + "epoch": 0.5450374064837905, + "grad_norm": 0.4250473339688777, + "learning_rate": 2.257670123253093e-05, + "loss": 0.8895, + "step": 10928 + }, + { + "epoch": 0.5450872817955112, + "grad_norm": 0.5326539519560773, + "learning_rate": 2.257268182909682e-05, + "loss": 0.8615, + "step": 10929 + }, + { + "epoch": 0.5451371571072319, + "grad_norm": 0.4088502800842054, + "learning_rate": 2.2568662489002494e-05, + "loss": 0.835, + "step": 10930 + }, + { + "epoch": 0.5451870324189526, + "grad_norm": 0.46538012913513455, + "learning_rate": 2.2564643212352842e-05, + "loss": 0.8735, + "step": 10931 + }, + { + "epoch": 0.5452369077306733, + "grad_norm": 0.48992066290021874, + "learning_rate": 2.256062399925272e-05, + "loss": 0.8935, + "step": 10932 + }, + { + "epoch": 0.545286783042394, + "grad_norm": 0.42676813320456936, + "learning_rate": 2.2556604849807032e-05, + "loss": 0.8588, + "step": 10933 + }, + { + "epoch": 0.5453366583541147, + "grad_norm": 0.43526022086103655, + "learning_rate": 2.255258576412065e-05, + "loss": 0.8876, + "step": 10934 + }, + { + "epoch": 0.5453865336658354, + "grad_norm": 0.5043224141648144, + "learning_rate": 2.254856674229844e-05, + "loss": 0.887, + "step": 10935 + }, + { + "epoch": 0.5454364089775561, + "grad_norm": 0.41494919254118884, + "learning_rate": 2.2544547784445286e-05, + "loss": 0.8961, + "step": 10936 + }, + { + "epoch": 0.5454862842892768, + "grad_norm": 0.5676998989949927, + "learning_rate": 2.2540528890666056e-05, + "loss": 0.8651, + "step": 10937 + }, + { + "epoch": 0.5455361596009976, + "grad_norm": 0.45700057928211296, + "learning_rate": 2.253651006106563e-05, + "loss": 0.8515, + "step": 10938 + }, + { + "epoch": 0.5455860349127182, + "grad_norm": 0.41998229949578164, + "learning_rate": 2.253249129574887e-05, + "loss": 0.8605, + "step": 10939 + }, + { + "epoch": 0.5456359102244389, + "grad_norm": 0.6385839745135509, + "learning_rate": 2.2528472594820637e-05, + "loss": 0.8894, + "step": 10940 + }, + { + "epoch": 0.5456857855361597, + "grad_norm": 0.4571077517361672, + "learning_rate": 2.2524453958385812e-05, + "loss": 0.8641, + "step": 10941 + }, + { + "epoch": 0.5457356608478803, + "grad_norm": 0.48136685072191493, + "learning_rate": 2.2520435386549253e-05, + "loss": 0.8595, + "step": 10942 + }, + { + "epoch": 0.545785536159601, + "grad_norm": 0.40470108484622697, + "learning_rate": 2.2516416879415824e-05, + "loss": 0.8794, + "step": 10943 + }, + { + "epoch": 0.5458354114713216, + "grad_norm": 0.3819874230425692, + "learning_rate": 2.2512398437090383e-05, + "loss": 0.8689, + "step": 10944 + }, + { + "epoch": 0.5458852867830424, + "grad_norm": 0.47357043539418703, + "learning_rate": 2.2508380059677796e-05, + "loss": 0.8952, + "step": 10945 + }, + { + "epoch": 0.5459351620947631, + "grad_norm": 0.4565723509789232, + "learning_rate": 2.2504361747282906e-05, + "loss": 0.8932, + "step": 10946 + }, + { + "epoch": 0.5459850374064837, + "grad_norm": 0.45646954683464064, + "learning_rate": 2.2500343500010586e-05, + "loss": 0.9639, + "step": 10947 + }, + { + "epoch": 0.5460349127182045, + "grad_norm": 0.42515383528742023, + "learning_rate": 2.2496325317965684e-05, + "loss": 0.9005, + "step": 10948 + }, + { + "epoch": 0.5460847880299252, + "grad_norm": 0.44177211915441345, + "learning_rate": 2.2492307201253054e-05, + "loss": 0.89, + "step": 10949 + }, + { + "epoch": 0.5461346633416458, + "grad_norm": 0.3727904617911665, + "learning_rate": 2.248828914997754e-05, + "loss": 0.8633, + "step": 10950 + }, + { + "epoch": 0.5461845386533666, + "grad_norm": 0.45369460801095707, + "learning_rate": 2.2484271164243996e-05, + "loss": 0.8447, + "step": 10951 + }, + { + "epoch": 0.5462344139650873, + "grad_norm": 0.3952628317660366, + "learning_rate": 2.2480253244157274e-05, + "loss": 0.8534, + "step": 10952 + }, + { + "epoch": 0.5462842892768079, + "grad_norm": 0.6326639196037714, + "learning_rate": 2.2476235389822224e-05, + "loss": 0.8579, + "step": 10953 + }, + { + "epoch": 0.5463341645885287, + "grad_norm": 0.9895739766266596, + "learning_rate": 2.247221760134367e-05, + "loss": 0.9143, + "step": 10954 + }, + { + "epoch": 0.5463840399002494, + "grad_norm": 0.7173207458344096, + "learning_rate": 2.2468199878826465e-05, + "loss": 0.8363, + "step": 10955 + }, + { + "epoch": 0.54643391521197, + "grad_norm": 0.49101269104880463, + "learning_rate": 2.2464182222375464e-05, + "loss": 0.9215, + "step": 10956 + }, + { + "epoch": 0.5464837905236908, + "grad_norm": 0.41081236816252104, + "learning_rate": 2.246016463209548e-05, + "loss": 0.8465, + "step": 10957 + }, + { + "epoch": 0.5465336658354115, + "grad_norm": 0.37694978628920595, + "learning_rate": 2.2456147108091367e-05, + "loss": 0.8449, + "step": 10958 + }, + { + "epoch": 0.5465835411471321, + "grad_norm": 0.4218790758958268, + "learning_rate": 2.2452129650467958e-05, + "loss": 0.8949, + "step": 10959 + }, + { + "epoch": 0.5466334164588529, + "grad_norm": 0.46448346086137826, + "learning_rate": 2.2448112259330093e-05, + "loss": 0.8953, + "step": 10960 + }, + { + "epoch": 0.5466832917705736, + "grad_norm": 0.48920813241709515, + "learning_rate": 2.2444094934782587e-05, + "loss": 0.9056, + "step": 10961 + }, + { + "epoch": 0.5467331670822942, + "grad_norm": 0.4347531241788078, + "learning_rate": 2.2440077676930276e-05, + "loss": 0.9052, + "step": 10962 + }, + { + "epoch": 0.546783042394015, + "grad_norm": 0.41201737166882924, + "learning_rate": 2.2436060485877998e-05, + "loss": 0.9221, + "step": 10963 + }, + { + "epoch": 0.5468329177057356, + "grad_norm": 0.40433341268483564, + "learning_rate": 2.2432043361730583e-05, + "loss": 0.861, + "step": 10964 + }, + { + "epoch": 0.5468827930174563, + "grad_norm": 1.1277187068003716, + "learning_rate": 2.2428026304592835e-05, + "loss": 0.8554, + "step": 10965 + }, + { + "epoch": 0.5469326683291771, + "grad_norm": 0.4615289838039092, + "learning_rate": 2.2424009314569587e-05, + "loss": 0.8591, + "step": 10966 + }, + { + "epoch": 0.5469825436408977, + "grad_norm": 0.48234357927300664, + "learning_rate": 2.2419992391765667e-05, + "loss": 0.926, + "step": 10967 + }, + { + "epoch": 0.5470324189526184, + "grad_norm": 0.4785817866933527, + "learning_rate": 2.2415975536285902e-05, + "loss": 0.9022, + "step": 10968 + }, + { + "epoch": 0.5470822942643392, + "grad_norm": 0.41100145861644877, + "learning_rate": 2.241195874823509e-05, + "loss": 0.8523, + "step": 10969 + }, + { + "epoch": 0.5471321695760598, + "grad_norm": 0.38803006229886183, + "learning_rate": 2.240794202771805e-05, + "loss": 0.8396, + "step": 10970 + }, + { + "epoch": 0.5471820448877805, + "grad_norm": 0.45617890774764136, + "learning_rate": 2.2403925374839617e-05, + "loss": 0.8509, + "step": 10971 + }, + { + "epoch": 0.5472319201995013, + "grad_norm": 0.36256112410358843, + "learning_rate": 2.239990878970458e-05, + "loss": 0.8353, + "step": 10972 + }, + { + "epoch": 0.5472817955112219, + "grad_norm": 0.8267024660043091, + "learning_rate": 2.2395892272417758e-05, + "loss": 0.8852, + "step": 10973 + }, + { + "epoch": 0.5473316708229427, + "grad_norm": 0.4948522153270917, + "learning_rate": 2.239187582308396e-05, + "loss": 0.8999, + "step": 10974 + }, + { + "epoch": 0.5473815461346634, + "grad_norm": 0.45641390653689706, + "learning_rate": 2.238785944180801e-05, + "loss": 0.909, + "step": 10975 + }, + { + "epoch": 0.547431421446384, + "grad_norm": 0.45106471132734177, + "learning_rate": 2.2383843128694684e-05, + "loss": 0.8872, + "step": 10976 + }, + { + "epoch": 0.5474812967581048, + "grad_norm": 0.4216652454239024, + "learning_rate": 2.2379826883848804e-05, + "loss": 0.8846, + "step": 10977 + }, + { + "epoch": 0.5475311720698255, + "grad_norm": 0.42952526194350477, + "learning_rate": 2.2375810707375165e-05, + "loss": 0.8992, + "step": 10978 + }, + { + "epoch": 0.5475810473815461, + "grad_norm": 0.4323283644761592, + "learning_rate": 2.2371794599378577e-05, + "loss": 0.8438, + "step": 10979 + }, + { + "epoch": 0.5476309226932669, + "grad_norm": 0.44849048307332434, + "learning_rate": 2.2367778559963838e-05, + "loss": 0.8908, + "step": 10980 + }, + { + "epoch": 0.5476807980049875, + "grad_norm": 0.43312934433543965, + "learning_rate": 2.236376258923573e-05, + "loss": 0.8739, + "step": 10981 + }, + { + "epoch": 0.5477306733167082, + "grad_norm": 0.40499940202526014, + "learning_rate": 2.2359746687299062e-05, + "loss": 0.8645, + "step": 10982 + }, + { + "epoch": 0.547780548628429, + "grad_norm": 0.39000820966347405, + "learning_rate": 2.2355730854258618e-05, + "loss": 0.8347, + "step": 10983 + }, + { + "epoch": 0.5478304239401496, + "grad_norm": 0.4780492009094556, + "learning_rate": 2.2351715090219196e-05, + "loss": 0.92, + "step": 10984 + }, + { + "epoch": 0.5478802992518703, + "grad_norm": 0.4609713598108139, + "learning_rate": 2.2347699395285587e-05, + "loss": 0.9021, + "step": 10985 + }, + { + "epoch": 0.547930174563591, + "grad_norm": 0.4527203533421893, + "learning_rate": 2.2343683769562576e-05, + "loss": 0.8612, + "step": 10986 + }, + { + "epoch": 0.5479800498753117, + "grad_norm": 0.4842134044319972, + "learning_rate": 2.2339668213154945e-05, + "loss": 0.8688, + "step": 10987 + }, + { + "epoch": 0.5480299251870324, + "grad_norm": 0.4492906112536703, + "learning_rate": 2.2335652726167477e-05, + "loss": 0.8737, + "step": 10988 + }, + { + "epoch": 0.5480798004987532, + "grad_norm": 0.7664195344023564, + "learning_rate": 2.2331637308704962e-05, + "loss": 0.8812, + "step": 10989 + }, + { + "epoch": 0.5481296758104738, + "grad_norm": 0.5379549333473889, + "learning_rate": 2.2327621960872187e-05, + "loss": 0.8921, + "step": 10990 + }, + { + "epoch": 0.5481795511221945, + "grad_norm": 0.35955262882688643, + "learning_rate": 2.2323606682773906e-05, + "loss": 0.8401, + "step": 10991 + }, + { + "epoch": 0.5482294264339153, + "grad_norm": 0.3916458997227876, + "learning_rate": 2.2319591474514915e-05, + "loss": 0.8479, + "step": 10992 + }, + { + "epoch": 0.5482793017456359, + "grad_norm": 0.5046856073115703, + "learning_rate": 2.2315576336199995e-05, + "loss": 0.8841, + "step": 10993 + }, + { + "epoch": 0.5483291770573566, + "grad_norm": 0.44778933531920984, + "learning_rate": 2.23115612679339e-05, + "loss": 0.8867, + "step": 10994 + }, + { + "epoch": 0.5483790523690774, + "grad_norm": 0.4327017404257947, + "learning_rate": 2.230754626982141e-05, + "loss": 0.8795, + "step": 10995 + }, + { + "epoch": 0.548428927680798, + "grad_norm": 0.4738452819922509, + "learning_rate": 2.2303531341967295e-05, + "loss": 0.8789, + "step": 10996 + }, + { + "epoch": 0.5484788029925187, + "grad_norm": 1.192923157902798, + "learning_rate": 2.229951648447633e-05, + "loss": 0.9039, + "step": 10997 + }, + { + "epoch": 0.5485286783042393, + "grad_norm": 0.4285454318727997, + "learning_rate": 2.2295501697453268e-05, + "loss": 0.8953, + "step": 10998 + }, + { + "epoch": 0.5485785536159601, + "grad_norm": 0.5372725788832681, + "learning_rate": 2.2291486981002873e-05, + "loss": 0.8398, + "step": 10999 + }, + { + "epoch": 0.5486284289276808, + "grad_norm": 0.43485615851612375, + "learning_rate": 2.228747233522992e-05, + "loss": 0.8801, + "step": 11000 + }, + { + "epoch": 0.5486783042394014, + "grad_norm": 0.6026897813255944, + "learning_rate": 2.228345776023917e-05, + "loss": 0.9012, + "step": 11001 + }, + { + "epoch": 0.5487281795511222, + "grad_norm": 0.48016904857324694, + "learning_rate": 2.2279443256135363e-05, + "loss": 0.8669, + "step": 11002 + }, + { + "epoch": 0.5487780548628429, + "grad_norm": 0.46305439275748217, + "learning_rate": 2.2275428823023263e-05, + "loss": 0.8856, + "step": 11003 + }, + { + "epoch": 0.5488279301745635, + "grad_norm": 0.533269325922686, + "learning_rate": 2.2271414461007646e-05, + "loss": 0.859, + "step": 11004 + }, + { + "epoch": 0.5488778054862843, + "grad_norm": 0.4741031533191526, + "learning_rate": 2.2267400170193235e-05, + "loss": 0.8769, + "step": 11005 + }, + { + "epoch": 0.548927680798005, + "grad_norm": 0.4832097149555373, + "learning_rate": 2.226338595068479e-05, + "loss": 0.8236, + "step": 11006 + }, + { + "epoch": 0.5489775561097257, + "grad_norm": 0.43893569851692205, + "learning_rate": 2.2259371802587068e-05, + "loss": 0.9106, + "step": 11007 + }, + { + "epoch": 0.5490274314214464, + "grad_norm": 0.4059925426923445, + "learning_rate": 2.225535772600482e-05, + "loss": 0.8388, + "step": 11008 + }, + { + "epoch": 0.5490773067331671, + "grad_norm": 0.42383787772089737, + "learning_rate": 2.2251343721042776e-05, + "loss": 0.8935, + "step": 11009 + }, + { + "epoch": 0.5491271820448878, + "grad_norm": 0.4488892665339853, + "learning_rate": 2.2247329787805685e-05, + "loss": 0.8831, + "step": 11010 + }, + { + "epoch": 0.5491770573566085, + "grad_norm": 0.7002006792715242, + "learning_rate": 2.224331592639829e-05, + "loss": 0.8646, + "step": 11011 + }, + { + "epoch": 0.5492269326683292, + "grad_norm": 0.4708947701195015, + "learning_rate": 2.2239302136925344e-05, + "loss": 0.8983, + "step": 11012 + }, + { + "epoch": 0.5492768079800499, + "grad_norm": 0.4325471280735621, + "learning_rate": 2.2235288419491564e-05, + "loss": 0.889, + "step": 11013 + }, + { + "epoch": 0.5493266832917706, + "grad_norm": 0.4390986218443554, + "learning_rate": 2.2231274774201697e-05, + "loss": 0.8929, + "step": 11014 + }, + { + "epoch": 0.5493765586034913, + "grad_norm": 0.4353152229657187, + "learning_rate": 2.2227261201160475e-05, + "loss": 0.9061, + "step": 11015 + }, + { + "epoch": 0.549426433915212, + "grad_norm": 0.3947775161881067, + "learning_rate": 2.222324770047263e-05, + "loss": 0.8795, + "step": 11016 + }, + { + "epoch": 0.5494763092269327, + "grad_norm": 0.41835586755004645, + "learning_rate": 2.22192342722429e-05, + "loss": 0.8766, + "step": 11017 + }, + { + "epoch": 0.5495261845386533, + "grad_norm": 0.47526245002658085, + "learning_rate": 2.2215220916576e-05, + "loss": 0.9138, + "step": 11018 + }, + { + "epoch": 0.549576059850374, + "grad_norm": 0.43818850803405934, + "learning_rate": 2.2211207633576668e-05, + "loss": 0.8996, + "step": 11019 + }, + { + "epoch": 0.5496259351620948, + "grad_norm": 0.46268636954103215, + "learning_rate": 2.2207194423349622e-05, + "loss": 0.8804, + "step": 11020 + }, + { + "epoch": 0.5496758104738154, + "grad_norm": 0.37654532269932306, + "learning_rate": 2.2203181285999586e-05, + "loss": 0.8699, + "step": 11021 + }, + { + "epoch": 0.5497256857855362, + "grad_norm": 0.5168149029391974, + "learning_rate": 2.219916822163129e-05, + "loss": 0.8923, + "step": 11022 + }, + { + "epoch": 0.5497755610972569, + "grad_norm": 0.4558351440763162, + "learning_rate": 2.2195155230349442e-05, + "loss": 0.9275, + "step": 11023 + }, + { + "epoch": 0.5498254364089775, + "grad_norm": 0.5501156051118851, + "learning_rate": 2.219114231225876e-05, + "loss": 0.847, + "step": 11024 + }, + { + "epoch": 0.5498753117206983, + "grad_norm": 0.45339479044801295, + "learning_rate": 2.2187129467463964e-05, + "loss": 0.8649, + "step": 11025 + }, + { + "epoch": 0.549925187032419, + "grad_norm": 0.5119973421751002, + "learning_rate": 2.2183116696069767e-05, + "loss": 0.859, + "step": 11026 + }, + { + "epoch": 0.5499750623441396, + "grad_norm": 0.40815917992509965, + "learning_rate": 2.2179103998180885e-05, + "loss": 0.8984, + "step": 11027 + }, + { + "epoch": 0.5500249376558604, + "grad_norm": 0.4991082516806058, + "learning_rate": 2.217509137390201e-05, + "loss": 0.8687, + "step": 11028 + }, + { + "epoch": 0.5500748129675811, + "grad_norm": 0.4896411540411704, + "learning_rate": 2.2171078823337863e-05, + "loss": 0.8396, + "step": 11029 + }, + { + "epoch": 0.5501246882793017, + "grad_norm": 0.4536821263102141, + "learning_rate": 2.216706634659316e-05, + "loss": 0.8484, + "step": 11030 + }, + { + "epoch": 0.5501745635910225, + "grad_norm": 0.5097563742657037, + "learning_rate": 2.2163053943772583e-05, + "loss": 0.8274, + "step": 11031 + }, + { + "epoch": 0.5502244389027432, + "grad_norm": 0.4592755562966114, + "learning_rate": 2.2159041614980842e-05, + "loss": 0.8354, + "step": 11032 + }, + { + "epoch": 0.5502743142144638, + "grad_norm": 0.404337165805939, + "learning_rate": 2.215502936032264e-05, + "loss": 0.9027, + "step": 11033 + }, + { + "epoch": 0.5503241895261846, + "grad_norm": 0.43023524189553813, + "learning_rate": 2.215101717990268e-05, + "loss": 0.831, + "step": 11034 + }, + { + "epoch": 0.5503740648379052, + "grad_norm": 0.5867832494471498, + "learning_rate": 2.2147005073825647e-05, + "loss": 0.9078, + "step": 11035 + }, + { + "epoch": 0.5504239401496259, + "grad_norm": 0.453109925757569, + "learning_rate": 2.2142993042196236e-05, + "loss": 0.9028, + "step": 11036 + }, + { + "epoch": 0.5504738154613467, + "grad_norm": 0.4351542298580351, + "learning_rate": 2.213898108511914e-05, + "loss": 0.8523, + "step": 11037 + }, + { + "epoch": 0.5505236907730673, + "grad_norm": 0.411687241749088, + "learning_rate": 2.2134969202699073e-05, + "loss": 0.9056, + "step": 11038 + }, + { + "epoch": 0.550573566084788, + "grad_norm": 0.4548524024247117, + "learning_rate": 2.2130957395040684e-05, + "loss": 0.8693, + "step": 11039 + }, + { + "epoch": 0.5506234413965088, + "grad_norm": 0.7417264309807808, + "learning_rate": 2.212694566224868e-05, + "loss": 0.8593, + "step": 11040 + }, + { + "epoch": 0.5506733167082294, + "grad_norm": 0.4071294243090034, + "learning_rate": 2.2122934004427754e-05, + "loss": 0.8649, + "step": 11041 + }, + { + "epoch": 0.5507231920199501, + "grad_norm": 0.3851307402029007, + "learning_rate": 2.2118922421682565e-05, + "loss": 0.8685, + "step": 11042 + }, + { + "epoch": 0.5507730673316709, + "grad_norm": 0.4662567433673688, + "learning_rate": 2.2114910914117806e-05, + "loss": 0.8754, + "step": 11043 + }, + { + "epoch": 0.5508229426433915, + "grad_norm": 0.724464646597532, + "learning_rate": 2.2110899481838157e-05, + "loss": 0.9068, + "step": 11044 + }, + { + "epoch": 0.5508728179551122, + "grad_norm": 0.44264505892821177, + "learning_rate": 2.2106888124948304e-05, + "loss": 0.8898, + "step": 11045 + }, + { + "epoch": 0.550922693266833, + "grad_norm": 0.5063678363150599, + "learning_rate": 2.2102876843552898e-05, + "loss": 0.9201, + "step": 11046 + }, + { + "epoch": 0.5509725685785536, + "grad_norm": 0.4312175412352833, + "learning_rate": 2.2098865637756626e-05, + "loss": 0.868, + "step": 11047 + }, + { + "epoch": 0.5510224438902743, + "grad_norm": 0.4440743723131541, + "learning_rate": 2.2094854507664157e-05, + "loss": 0.8868, + "step": 11048 + }, + { + "epoch": 0.5510723192019951, + "grad_norm": 0.5383312116407254, + "learning_rate": 2.2090843453380168e-05, + "loss": 0.8836, + "step": 11049 + }, + { + "epoch": 0.5511221945137157, + "grad_norm": 0.46338952010787204, + "learning_rate": 2.2086832475009308e-05, + "loss": 0.8989, + "step": 11050 + }, + { + "epoch": 0.5511720698254364, + "grad_norm": 0.45526668875295345, + "learning_rate": 2.208282157265625e-05, + "loss": 0.8229, + "step": 11051 + }, + { + "epoch": 0.551221945137157, + "grad_norm": 0.5548404780177654, + "learning_rate": 2.2078810746425664e-05, + "loss": 0.9342, + "step": 11052 + }, + { + "epoch": 0.5512718204488778, + "grad_norm": 0.5724704376795471, + "learning_rate": 2.20747999964222e-05, + "loss": 0.8475, + "step": 11053 + }, + { + "epoch": 0.5513216957605985, + "grad_norm": 0.40740996830828596, + "learning_rate": 2.2070789322750523e-05, + "loss": 0.9089, + "step": 11054 + }, + { + "epoch": 0.5513715710723192, + "grad_norm": 0.4844424836823074, + "learning_rate": 2.2066778725515283e-05, + "loss": 0.9131, + "step": 11055 + }, + { + "epoch": 0.5514214463840399, + "grad_norm": 0.5212586917531103, + "learning_rate": 2.2062768204821146e-05, + "loss": 0.844, + "step": 11056 + }, + { + "epoch": 0.5514713216957606, + "grad_norm": 0.8009033716852698, + "learning_rate": 2.2058757760772754e-05, + "loss": 0.8973, + "step": 11057 + }, + { + "epoch": 0.5515211970074813, + "grad_norm": 0.5385314719292793, + "learning_rate": 2.2054747393474766e-05, + "loss": 0.8756, + "step": 11058 + }, + { + "epoch": 0.551571072319202, + "grad_norm": 0.5152960471376148, + "learning_rate": 2.2050737103031817e-05, + "loss": 0.8646, + "step": 11059 + }, + { + "epoch": 0.5516209476309227, + "grad_norm": 0.45803466205100757, + "learning_rate": 2.2046726889548575e-05, + "loss": 0.894, + "step": 11060 + }, + { + "epoch": 0.5516708229426434, + "grad_norm": 0.40284918034052175, + "learning_rate": 2.2042716753129664e-05, + "loss": 0.8613, + "step": 11061 + }, + { + "epoch": 0.5517206982543641, + "grad_norm": 0.5760069266821151, + "learning_rate": 2.2038706693879737e-05, + "loss": 0.8587, + "step": 11062 + }, + { + "epoch": 0.5517705735660848, + "grad_norm": 0.4504019054762472, + "learning_rate": 2.2034696711903447e-05, + "loss": 0.8668, + "step": 11063 + }, + { + "epoch": 0.5518204488778055, + "grad_norm": 0.4418063082471963, + "learning_rate": 2.2030686807305402e-05, + "loss": 0.8875, + "step": 11064 + }, + { + "epoch": 0.5518703241895262, + "grad_norm": 0.4245520608412205, + "learning_rate": 2.2026676980190265e-05, + "loss": 0.9265, + "step": 11065 + }, + { + "epoch": 0.5519201995012469, + "grad_norm": 0.45808016265327095, + "learning_rate": 2.202266723066265e-05, + "loss": 0.8774, + "step": 11066 + }, + { + "epoch": 0.5519700748129676, + "grad_norm": 0.4059107401080624, + "learning_rate": 2.201865755882722e-05, + "loss": 0.846, + "step": 11067 + }, + { + "epoch": 0.5520199501246883, + "grad_norm": 0.4303334581012796, + "learning_rate": 2.2014647964788575e-05, + "loss": 0.8784, + "step": 11068 + }, + { + "epoch": 0.5520698254364089, + "grad_norm": 0.47095380115729085, + "learning_rate": 2.2010638448651356e-05, + "loss": 0.8551, + "step": 11069 + }, + { + "epoch": 0.5521197007481297, + "grad_norm": 0.47239631726860926, + "learning_rate": 2.200662901052019e-05, + "loss": 0.8639, + "step": 11070 + }, + { + "epoch": 0.5521695760598504, + "grad_norm": 0.44390071859307084, + "learning_rate": 2.2002619650499705e-05, + "loss": 0.8606, + "step": 11071 + }, + { + "epoch": 0.552219451371571, + "grad_norm": 0.5420567087172982, + "learning_rate": 2.199861036869451e-05, + "loss": 0.8968, + "step": 11072 + }, + { + "epoch": 0.5522693266832918, + "grad_norm": 0.5084105179311131, + "learning_rate": 2.199460116520924e-05, + "loss": 0.8655, + "step": 11073 + }, + { + "epoch": 0.5523192019950125, + "grad_norm": 0.4571036590017628, + "learning_rate": 2.19905920401485e-05, + "loss": 0.8475, + "step": 11074 + }, + { + "epoch": 0.5523690773067331, + "grad_norm": 0.5315420310669039, + "learning_rate": 2.1986582993616926e-05, + "loss": 0.9064, + "step": 11075 + }, + { + "epoch": 0.5524189526184539, + "grad_norm": 0.4781847272559926, + "learning_rate": 2.1982574025719113e-05, + "loss": 0.9088, + "step": 11076 + }, + { + "epoch": 0.5524688279301746, + "grad_norm": 0.6321908043938519, + "learning_rate": 2.197856513655968e-05, + "loss": 0.8631, + "step": 11077 + }, + { + "epoch": 0.5525187032418952, + "grad_norm": 0.433493093937465, + "learning_rate": 2.1974556326243245e-05, + "loss": 0.8773, + "step": 11078 + }, + { + "epoch": 0.552568578553616, + "grad_norm": 0.47718029423256164, + "learning_rate": 2.19705475948744e-05, + "loss": 0.8685, + "step": 11079 + }, + { + "epoch": 0.5526184538653367, + "grad_norm": 0.4552978158251265, + "learning_rate": 2.196653894255776e-05, + "loss": 0.9024, + "step": 11080 + }, + { + "epoch": 0.5526683291770573, + "grad_norm": 0.42198264595787033, + "learning_rate": 2.196253036939793e-05, + "loss": 0.8595, + "step": 11081 + }, + { + "epoch": 0.5527182044887781, + "grad_norm": 0.4314972571179938, + "learning_rate": 2.195852187549952e-05, + "loss": 0.8759, + "step": 11082 + }, + { + "epoch": 0.5527680798004988, + "grad_norm": 0.43155713539271506, + "learning_rate": 2.1954513460967107e-05, + "loss": 0.9037, + "step": 11083 + }, + { + "epoch": 0.5528179551122194, + "grad_norm": 0.48868339575310954, + "learning_rate": 2.1950505125905305e-05, + "loss": 0.8875, + "step": 11084 + }, + { + "epoch": 0.5528678304239402, + "grad_norm": 0.49093892974001513, + "learning_rate": 2.1946496870418707e-05, + "loss": 0.8807, + "step": 11085 + }, + { + "epoch": 0.5529177057356609, + "grad_norm": 0.4018026514452772, + "learning_rate": 2.194248869461192e-05, + "loss": 0.854, + "step": 11086 + }, + { + "epoch": 0.5529675810473815, + "grad_norm": 0.47861760780333046, + "learning_rate": 2.1938480598589506e-05, + "loss": 0.8674, + "step": 11087 + }, + { + "epoch": 0.5530174563591023, + "grad_norm": 0.3922504351229757, + "learning_rate": 2.1934472582456072e-05, + "loss": 0.8826, + "step": 11088 + }, + { + "epoch": 0.5530673316708229, + "grad_norm": 0.444357271716501, + "learning_rate": 2.193046464631621e-05, + "loss": 0.8905, + "step": 11089 + }, + { + "epoch": 0.5531172069825436, + "grad_norm": 0.4997430642390481, + "learning_rate": 2.19264567902745e-05, + "loss": 0.8203, + "step": 11090 + }, + { + "epoch": 0.5531670822942644, + "grad_norm": 0.43594817784702067, + "learning_rate": 2.1922449014435518e-05, + "loss": 0.8598, + "step": 11091 + }, + { + "epoch": 0.553216957605985, + "grad_norm": 0.415762206631137, + "learning_rate": 2.1918441318903855e-05, + "loss": 0.8936, + "step": 11092 + }, + { + "epoch": 0.5532668329177057, + "grad_norm": 0.5138992762002144, + "learning_rate": 2.191443370378409e-05, + "loss": 0.851, + "step": 11093 + }, + { + "epoch": 0.5533167082294265, + "grad_norm": 0.4594728656037896, + "learning_rate": 2.1910426169180788e-05, + "loss": 0.9015, + "step": 11094 + }, + { + "epoch": 0.5533665835411471, + "grad_norm": 0.669842771443342, + "learning_rate": 2.190641871519854e-05, + "loss": 0.8554, + "step": 11095 + }, + { + "epoch": 0.5534164588528678, + "grad_norm": 0.45326728134250316, + "learning_rate": 2.1902411341941904e-05, + "loss": 0.8827, + "step": 11096 + }, + { + "epoch": 0.5534663341645886, + "grad_norm": 0.5121587306368242, + "learning_rate": 2.1898404049515467e-05, + "loss": 0.8968, + "step": 11097 + }, + { + "epoch": 0.5535162094763092, + "grad_norm": 1.0141557009739564, + "learning_rate": 2.189439683802378e-05, + "loss": 0.9041, + "step": 11098 + }, + { + "epoch": 0.5535660847880299, + "grad_norm": 0.5577004985707256, + "learning_rate": 2.189038970757142e-05, + "loss": 0.858, + "step": 11099 + }, + { + "epoch": 0.5536159600997507, + "grad_norm": 0.46783090484251705, + "learning_rate": 2.1886382658262956e-05, + "loss": 0.8952, + "step": 11100 + }, + { + "epoch": 0.5536658354114713, + "grad_norm": 0.47026416883537153, + "learning_rate": 2.1882375690202934e-05, + "loss": 0.9139, + "step": 11101 + }, + { + "epoch": 0.553715710723192, + "grad_norm": 0.62052313092796, + "learning_rate": 2.1878368803495928e-05, + "loss": 0.9133, + "step": 11102 + }, + { + "epoch": 0.5537655860349128, + "grad_norm": 0.5588672814375625, + "learning_rate": 2.1874361998246484e-05, + "loss": 0.8426, + "step": 11103 + }, + { + "epoch": 0.5538154613466334, + "grad_norm": 0.42489430459959365, + "learning_rate": 2.1870355274559177e-05, + "loss": 0.8701, + "step": 11104 + }, + { + "epoch": 0.5538653366583541, + "grad_norm": 0.4005152821793071, + "learning_rate": 2.1866348632538542e-05, + "loss": 0.8799, + "step": 11105 + }, + { + "epoch": 0.5539152119700748, + "grad_norm": 0.41662782924457864, + "learning_rate": 2.1862342072289133e-05, + "loss": 0.9012, + "step": 11106 + }, + { + "epoch": 0.5539650872817955, + "grad_norm": 0.4262260025969314, + "learning_rate": 2.1858335593915504e-05, + "loss": 0.8821, + "step": 11107 + }, + { + "epoch": 0.5540149625935162, + "grad_norm": 0.43092022378203126, + "learning_rate": 2.1854329197522213e-05, + "loss": 0.8554, + "step": 11108 + }, + { + "epoch": 0.5540648379052369, + "grad_norm": 0.4440952359687354, + "learning_rate": 2.1850322883213783e-05, + "loss": 0.8603, + "step": 11109 + }, + { + "epoch": 0.5541147132169576, + "grad_norm": 0.48615170649207295, + "learning_rate": 2.1846316651094767e-05, + "loss": 0.8264, + "step": 11110 + }, + { + "epoch": 0.5541645885286783, + "grad_norm": 0.40299393104516834, + "learning_rate": 2.184231050126972e-05, + "loss": 0.8947, + "step": 11111 + }, + { + "epoch": 0.554214463840399, + "grad_norm": 0.4673967133295449, + "learning_rate": 2.1838304433843155e-05, + "loss": 0.9211, + "step": 11112 + }, + { + "epoch": 0.5542643391521197, + "grad_norm": 0.44729308115959115, + "learning_rate": 2.183429844891962e-05, + "loss": 0.865, + "step": 11113 + }, + { + "epoch": 0.5543142144638404, + "grad_norm": 0.4268621376606204, + "learning_rate": 2.183029254660365e-05, + "loss": 0.9179, + "step": 11114 + }, + { + "epoch": 0.5543640897755611, + "grad_norm": 0.5049851219668733, + "learning_rate": 2.182628672699979e-05, + "loss": 0.8728, + "step": 11115 + }, + { + "epoch": 0.5544139650872818, + "grad_norm": 0.4676943279890533, + "learning_rate": 2.1822280990212542e-05, + "loss": 0.8729, + "step": 11116 + }, + { + "epoch": 0.5544638403990025, + "grad_norm": 0.44834643013079006, + "learning_rate": 2.1818275336346455e-05, + "loss": 0.8954, + "step": 11117 + }, + { + "epoch": 0.5545137157107232, + "grad_norm": 0.35736288968874613, + "learning_rate": 2.1814269765506047e-05, + "loss": 0.8706, + "step": 11118 + }, + { + "epoch": 0.5545635910224439, + "grad_norm": 0.4073492085328729, + "learning_rate": 2.1810264277795854e-05, + "loss": 0.8604, + "step": 11119 + }, + { + "epoch": 0.5546134663341646, + "grad_norm": 0.41715432375650996, + "learning_rate": 2.1806258873320378e-05, + "loss": 0.8981, + "step": 11120 + }, + { + "epoch": 0.5546633416458853, + "grad_norm": 0.4406966766287988, + "learning_rate": 2.1802253552184144e-05, + "loss": 0.8695, + "step": 11121 + }, + { + "epoch": 0.554713216957606, + "grad_norm": 0.4536906905478925, + "learning_rate": 2.1798248314491687e-05, + "loss": 0.8969, + "step": 11122 + }, + { + "epoch": 0.5547630922693266, + "grad_norm": 0.4249304762513332, + "learning_rate": 2.1794243160347494e-05, + "loss": 0.8637, + "step": 11123 + }, + { + "epoch": 0.5548129675810474, + "grad_norm": 0.5446976156708997, + "learning_rate": 2.1790238089856092e-05, + "loss": 0.8732, + "step": 11124 + }, + { + "epoch": 0.5548628428927681, + "grad_norm": 0.4773213508908001, + "learning_rate": 2.178623310312199e-05, + "loss": 0.8864, + "step": 11125 + }, + { + "epoch": 0.5549127182044887, + "grad_norm": 0.4103458101276523, + "learning_rate": 2.1782228200249705e-05, + "loss": 0.8729, + "step": 11126 + }, + { + "epoch": 0.5549625935162095, + "grad_norm": 0.4391941713186998, + "learning_rate": 2.177822338134373e-05, + "loss": 0.9183, + "step": 11127 + }, + { + "epoch": 0.5550124688279302, + "grad_norm": 0.43151903081160387, + "learning_rate": 2.177421864650857e-05, + "loss": 0.8844, + "step": 11128 + }, + { + "epoch": 0.5550623441396508, + "grad_norm": 0.39673829855531934, + "learning_rate": 2.1770213995848734e-05, + "loss": 0.8587, + "step": 11129 + }, + { + "epoch": 0.5551122194513716, + "grad_norm": 0.4195592563624402, + "learning_rate": 2.176620942946872e-05, + "loss": 0.9198, + "step": 11130 + }, + { + "epoch": 0.5551620947630923, + "grad_norm": 0.4766863255558913, + "learning_rate": 2.176220494747302e-05, + "loss": 0.8824, + "step": 11131 + }, + { + "epoch": 0.5552119700748129, + "grad_norm": 0.5453919568718842, + "learning_rate": 2.175820054996614e-05, + "loss": 0.8785, + "step": 11132 + }, + { + "epoch": 0.5552618453865337, + "grad_norm": 0.4490603616415472, + "learning_rate": 2.175419623705256e-05, + "loss": 0.8671, + "step": 11133 + }, + { + "epoch": 0.5553117206982544, + "grad_norm": 0.4445095642975841, + "learning_rate": 2.1750192008836778e-05, + "loss": 0.8514, + "step": 11134 + }, + { + "epoch": 0.555361596009975, + "grad_norm": 0.4907841660875028, + "learning_rate": 2.174618786542328e-05, + "loss": 0.9153, + "step": 11135 + }, + { + "epoch": 0.5554114713216958, + "grad_norm": 0.4713771619176942, + "learning_rate": 2.1742183806916552e-05, + "loss": 0.8798, + "step": 11136 + }, + { + "epoch": 0.5554613466334165, + "grad_norm": 0.4938310983734826, + "learning_rate": 2.173817983342109e-05, + "loss": 0.8786, + "step": 11137 + }, + { + "epoch": 0.5555112219451371, + "grad_norm": 0.4774550714519356, + "learning_rate": 2.1734175945041355e-05, + "loss": 0.8885, + "step": 11138 + }, + { + "epoch": 0.5555610972568579, + "grad_norm": 0.4176065548725803, + "learning_rate": 2.1730172141881838e-05, + "loss": 0.8194, + "step": 11139 + }, + { + "epoch": 0.5556109725685785, + "grad_norm": 0.5378260530524759, + "learning_rate": 2.1726168424047012e-05, + "loss": 0.8898, + "step": 11140 + }, + { + "epoch": 0.5556608478802992, + "grad_norm": 0.3657981065786055, + "learning_rate": 2.1722164791641368e-05, + "loss": 0.8552, + "step": 11141 + }, + { + "epoch": 0.55571072319202, + "grad_norm": 0.39165521492417293, + "learning_rate": 2.171816124476936e-05, + "loss": 0.8667, + "step": 11142 + }, + { + "epoch": 0.5557605985037406, + "grad_norm": 0.3976225809768239, + "learning_rate": 2.1714157783535463e-05, + "loss": 0.8917, + "step": 11143 + }, + { + "epoch": 0.5558104738154613, + "grad_norm": 0.4390983336383553, + "learning_rate": 2.1710154408044148e-05, + "loss": 0.8504, + "step": 11144 + }, + { + "epoch": 0.5558603491271821, + "grad_norm": 0.546787170399595, + "learning_rate": 2.1706151118399893e-05, + "loss": 0.9212, + "step": 11145 + }, + { + "epoch": 0.5559102244389027, + "grad_norm": 0.4664935085025795, + "learning_rate": 2.1702147914707138e-05, + "loss": 0.8835, + "step": 11146 + }, + { + "epoch": 0.5559600997506234, + "grad_norm": 0.5793660324507561, + "learning_rate": 2.169814479707036e-05, + "loss": 0.8669, + "step": 11147 + }, + { + "epoch": 0.5560099750623442, + "grad_norm": 0.41097244601268207, + "learning_rate": 2.1694141765594024e-05, + "loss": 0.8962, + "step": 11148 + }, + { + "epoch": 0.5560598503740648, + "grad_norm": 0.6520136802780105, + "learning_rate": 2.1690138820382565e-05, + "loss": 0.8991, + "step": 11149 + }, + { + "epoch": 0.5561097256857855, + "grad_norm": 0.43744619602067375, + "learning_rate": 2.1686135961540456e-05, + "loss": 0.895, + "step": 11150 + }, + { + "epoch": 0.5561596009975063, + "grad_norm": 0.4229226310495051, + "learning_rate": 2.1682133189172148e-05, + "loss": 0.884, + "step": 11151 + }, + { + "epoch": 0.5562094763092269, + "grad_norm": 0.4001749776094885, + "learning_rate": 2.1678130503382096e-05, + "loss": 0.8965, + "step": 11152 + }, + { + "epoch": 0.5562593516209476, + "grad_norm": 0.4371440278258247, + "learning_rate": 2.1674127904274733e-05, + "loss": 0.8682, + "step": 11153 + }, + { + "epoch": 0.5563092269326684, + "grad_norm": 0.7099733123845255, + "learning_rate": 2.1670125391954514e-05, + "loss": 0.873, + "step": 11154 + }, + { + "epoch": 0.556359102244389, + "grad_norm": 0.48841460628889194, + "learning_rate": 2.1666122966525883e-05, + "loss": 0.9072, + "step": 11155 + }, + { + "epoch": 0.5564089775561097, + "grad_norm": 0.6456333399637021, + "learning_rate": 2.1662120628093293e-05, + "loss": 0.8401, + "step": 11156 + }, + { + "epoch": 0.5564588528678305, + "grad_norm": 0.4250975785117596, + "learning_rate": 2.165811837676116e-05, + "loss": 0.8617, + "step": 11157 + }, + { + "epoch": 0.5565087281795511, + "grad_norm": 0.469777972415382, + "learning_rate": 2.165411621263393e-05, + "loss": 0.8952, + "step": 11158 + }, + { + "epoch": 0.5565586034912718, + "grad_norm": 0.5061380788804271, + "learning_rate": 2.165011413581605e-05, + "loss": 0.8602, + "step": 11159 + }, + { + "epoch": 0.5566084788029925, + "grad_norm": 0.4893212303798637, + "learning_rate": 2.1646112146411937e-05, + "loss": 0.8888, + "step": 11160 + }, + { + "epoch": 0.5566583541147132, + "grad_norm": 0.37879275807507323, + "learning_rate": 2.1642110244526023e-05, + "loss": 0.8651, + "step": 11161 + }, + { + "epoch": 0.5567082294264339, + "grad_norm": 0.5486189394140938, + "learning_rate": 2.163810843026274e-05, + "loss": 0.8582, + "step": 11162 + }, + { + "epoch": 0.5567581047381546, + "grad_norm": 0.4849636008717759, + "learning_rate": 2.163410670372652e-05, + "loss": 0.8988, + "step": 11163 + }, + { + "epoch": 0.5568079800498753, + "grad_norm": 0.4521557225820392, + "learning_rate": 2.1630105065021778e-05, + "loss": 0.884, + "step": 11164 + }, + { + "epoch": 0.556857855361596, + "grad_norm": 0.5010582975624354, + "learning_rate": 2.162610351425293e-05, + "loss": 0.8505, + "step": 11165 + }, + { + "epoch": 0.5569077306733167, + "grad_norm": 0.5555791783112926, + "learning_rate": 2.16221020515244e-05, + "loss": 0.9077, + "step": 11166 + }, + { + "epoch": 0.5569576059850374, + "grad_norm": 0.4596456948215885, + "learning_rate": 2.1618100676940612e-05, + "loss": 0.8913, + "step": 11167 + }, + { + "epoch": 0.5570074812967581, + "grad_norm": 0.40840763344793074, + "learning_rate": 2.1614099390605975e-05, + "loss": 0.867, + "step": 11168 + }, + { + "epoch": 0.5570573566084788, + "grad_norm": 0.48150149197061975, + "learning_rate": 2.161009819262489e-05, + "loss": 0.878, + "step": 11169 + }, + { + "epoch": 0.5571072319201995, + "grad_norm": 0.43260669195469736, + "learning_rate": 2.160609708310178e-05, + "loss": 0.8893, + "step": 11170 + }, + { + "epoch": 0.5571571072319202, + "grad_norm": 0.46844523488431977, + "learning_rate": 2.1602096062141048e-05, + "loss": 0.8842, + "step": 11171 + }, + { + "epoch": 0.5572069825436409, + "grad_norm": 0.3715431712271427, + "learning_rate": 2.1598095129847097e-05, + "loss": 0.8327, + "step": 11172 + }, + { + "epoch": 0.5572568578553616, + "grad_norm": 0.5292712473398146, + "learning_rate": 2.159409428632433e-05, + "loss": 0.9207, + "step": 11173 + }, + { + "epoch": 0.5573067331670823, + "grad_norm": 0.470253957091765, + "learning_rate": 2.1590093531677156e-05, + "loss": 0.8865, + "step": 11174 + }, + { + "epoch": 0.557356608478803, + "grad_norm": 0.4374828347644245, + "learning_rate": 2.1586092866009956e-05, + "loss": 0.8681, + "step": 11175 + }, + { + "epoch": 0.5574064837905237, + "grad_norm": 0.5895762185044023, + "learning_rate": 2.1582092289427136e-05, + "loss": 0.8887, + "step": 11176 + }, + { + "epoch": 0.5574563591022443, + "grad_norm": 0.4633763693467902, + "learning_rate": 2.157809180203309e-05, + "loss": 0.8987, + "step": 11177 + }, + { + "epoch": 0.5575062344139651, + "grad_norm": 0.46954569725054524, + "learning_rate": 2.1574091403932214e-05, + "loss": 0.854, + "step": 11178 + }, + { + "epoch": 0.5575561097256858, + "grad_norm": 0.44582412949122324, + "learning_rate": 2.157009109522888e-05, + "loss": 0.8789, + "step": 11179 + }, + { + "epoch": 0.5576059850374064, + "grad_norm": 0.4414642327228909, + "learning_rate": 2.1566090876027484e-05, + "loss": 0.851, + "step": 11180 + }, + { + "epoch": 0.5576558603491272, + "grad_norm": 0.45339200331513213, + "learning_rate": 2.1562090746432424e-05, + "loss": 0.9179, + "step": 11181 + }, + { + "epoch": 0.5577057356608479, + "grad_norm": 0.41211495663469444, + "learning_rate": 2.155809070654805e-05, + "loss": 0.8588, + "step": 11182 + }, + { + "epoch": 0.5577556109725685, + "grad_norm": 0.47192017405739445, + "learning_rate": 2.1554090756478762e-05, + "loss": 0.8852, + "step": 11183 + }, + { + "epoch": 0.5578054862842893, + "grad_norm": 0.45202705022346223, + "learning_rate": 2.1550090896328935e-05, + "loss": 0.8929, + "step": 11184 + }, + { + "epoch": 0.55785536159601, + "grad_norm": 0.41294934152533136, + "learning_rate": 2.154609112620295e-05, + "loss": 0.8588, + "step": 11185 + }, + { + "epoch": 0.5579052369077306, + "grad_norm": 0.527427605876689, + "learning_rate": 2.1542091446205166e-05, + "loss": 0.9064, + "step": 11186 + }, + { + "epoch": 0.5579551122194514, + "grad_norm": 0.49791426069993977, + "learning_rate": 2.153809185643995e-05, + "loss": 0.8482, + "step": 11187 + }, + { + "epoch": 0.5580049875311721, + "grad_norm": 0.43789934578537726, + "learning_rate": 2.1534092357011684e-05, + "loss": 0.8392, + "step": 11188 + }, + { + "epoch": 0.5580548628428927, + "grad_norm": 0.4167572750372065, + "learning_rate": 2.1530092948024737e-05, + "loss": 0.8366, + "step": 11189 + }, + { + "epoch": 0.5581047381546135, + "grad_norm": 0.4859347369760905, + "learning_rate": 2.152609362958345e-05, + "loss": 0.8739, + "step": 11190 + }, + { + "epoch": 0.5581546134663342, + "grad_norm": 0.4970099358601716, + "learning_rate": 2.152209440179219e-05, + "loss": 0.8564, + "step": 11191 + }, + { + "epoch": 0.5582044887780548, + "grad_norm": 0.4441958799507847, + "learning_rate": 2.1518095264755326e-05, + "loss": 0.8462, + "step": 11192 + }, + { + "epoch": 0.5582543640897756, + "grad_norm": 0.41927798133633043, + "learning_rate": 2.1514096218577218e-05, + "loss": 0.8341, + "step": 11193 + }, + { + "epoch": 0.5583042394014962, + "grad_norm": 0.5254632610101526, + "learning_rate": 2.1510097263362196e-05, + "loss": 0.9143, + "step": 11194 + }, + { + "epoch": 0.5583541147132169, + "grad_norm": 0.4022163795404087, + "learning_rate": 2.1506098399214626e-05, + "loss": 0.889, + "step": 11195 + }, + { + "epoch": 0.5584039900249377, + "grad_norm": 0.42880761060761496, + "learning_rate": 2.1502099626238864e-05, + "loss": 0.9442, + "step": 11196 + }, + { + "epoch": 0.5584538653366583, + "grad_norm": 0.4431395765466801, + "learning_rate": 2.1498100944539237e-05, + "loss": 0.8545, + "step": 11197 + }, + { + "epoch": 0.558503740648379, + "grad_norm": 0.46028531018161356, + "learning_rate": 2.14941023542201e-05, + "loss": 0.9044, + "step": 11198 + }, + { + "epoch": 0.5585536159600998, + "grad_norm": 0.5126713202516708, + "learning_rate": 2.149010385538579e-05, + "loss": 0.9183, + "step": 11199 + }, + { + "epoch": 0.5586034912718204, + "grad_norm": 0.5215705175340659, + "learning_rate": 2.1486105448140658e-05, + "loss": 0.9091, + "step": 11200 + }, + { + "epoch": 0.5586533665835411, + "grad_norm": 0.58460191065874, + "learning_rate": 2.1482107132589026e-05, + "loss": 0.8779, + "step": 11201 + }, + { + "epoch": 0.5587032418952619, + "grad_norm": 0.40751291949771634, + "learning_rate": 2.147810890883523e-05, + "loss": 0.885, + "step": 11202 + }, + { + "epoch": 0.5587531172069825, + "grad_norm": 0.6187066663479011, + "learning_rate": 2.147411077698361e-05, + "loss": 0.898, + "step": 11203 + }, + { + "epoch": 0.5588029925187032, + "grad_norm": 0.730481780087423, + "learning_rate": 2.1470112737138495e-05, + "loss": 0.8513, + "step": 11204 + }, + { + "epoch": 0.558852867830424, + "grad_norm": 0.46246922561998577, + "learning_rate": 2.1466114789404206e-05, + "loss": 0.9066, + "step": 11205 + }, + { + "epoch": 0.5589027431421446, + "grad_norm": 0.5040051986988887, + "learning_rate": 2.146211693388507e-05, + "loss": 0.8623, + "step": 11206 + }, + { + "epoch": 0.5589526184538653, + "grad_norm": 0.5584775239509965, + "learning_rate": 2.145811917068541e-05, + "loss": 0.89, + "step": 11207 + }, + { + "epoch": 0.5590024937655861, + "grad_norm": 0.4011347597775405, + "learning_rate": 2.145412149990954e-05, + "loss": 0.875, + "step": 11208 + }, + { + "epoch": 0.5590523690773067, + "grad_norm": 0.5221391778252121, + "learning_rate": 2.1450123921661784e-05, + "loss": 0.8891, + "step": 11209 + }, + { + "epoch": 0.5591022443890274, + "grad_norm": 0.9178692062144238, + "learning_rate": 2.1446126436046458e-05, + "loss": 0.8792, + "step": 11210 + }, + { + "epoch": 0.5591521197007482, + "grad_norm": 0.5753598991503691, + "learning_rate": 2.1442129043167874e-05, + "loss": 0.8484, + "step": 11211 + }, + { + "epoch": 0.5592019950124688, + "grad_norm": 0.4725184448435476, + "learning_rate": 2.1438131743130338e-05, + "loss": 0.8404, + "step": 11212 + }, + { + "epoch": 0.5592518703241895, + "grad_norm": 0.5281442437353404, + "learning_rate": 2.1434134536038158e-05, + "loss": 0.8613, + "step": 11213 + }, + { + "epoch": 0.5593017456359102, + "grad_norm": 0.5084415982573192, + "learning_rate": 2.1430137421995637e-05, + "loss": 0.8949, + "step": 11214 + }, + { + "epoch": 0.5593516209476309, + "grad_norm": 0.48811998359279274, + "learning_rate": 2.14261404011071e-05, + "loss": 0.8857, + "step": 11215 + }, + { + "epoch": 0.5594014962593516, + "grad_norm": 0.49748027103334236, + "learning_rate": 2.1422143473476815e-05, + "loss": 0.8865, + "step": 11216 + }, + { + "epoch": 0.5594513715710723, + "grad_norm": 0.6506194203176914, + "learning_rate": 2.1418146639209097e-05, + "loss": 0.8674, + "step": 11217 + }, + { + "epoch": 0.559501246882793, + "grad_norm": 0.4667705682870087, + "learning_rate": 2.1414149898408246e-05, + "loss": 0.9126, + "step": 11218 + }, + { + "epoch": 0.5595511221945138, + "grad_norm": 0.48158656200780453, + "learning_rate": 2.1410153251178546e-05, + "loss": 0.8984, + "step": 11219 + }, + { + "epoch": 0.5596009975062344, + "grad_norm": 0.7396423443705109, + "learning_rate": 2.1406156697624282e-05, + "loss": 0.8651, + "step": 11220 + }, + { + "epoch": 0.5596508728179551, + "grad_norm": 0.408151376696026, + "learning_rate": 2.1402160237849756e-05, + "loss": 0.8912, + "step": 11221 + }, + { + "epoch": 0.5597007481296759, + "grad_norm": 0.41345117303982204, + "learning_rate": 2.1398163871959257e-05, + "loss": 0.8751, + "step": 11222 + }, + { + "epoch": 0.5597506234413965, + "grad_norm": 0.4722639033058177, + "learning_rate": 2.1394167600057048e-05, + "loss": 0.8551, + "step": 11223 + }, + { + "epoch": 0.5598004987531172, + "grad_norm": 0.7165545172822916, + "learning_rate": 2.139017142224742e-05, + "loss": 0.9056, + "step": 11224 + }, + { + "epoch": 0.559850374064838, + "grad_norm": 0.6214815345216673, + "learning_rate": 2.1386175338634655e-05, + "loss": 0.8638, + "step": 11225 + }, + { + "epoch": 0.5599002493765586, + "grad_norm": 0.4896504130466516, + "learning_rate": 2.138217934932304e-05, + "loss": 0.8633, + "step": 11226 + }, + { + "epoch": 0.5599501246882793, + "grad_norm": 0.5146065938038412, + "learning_rate": 2.1378183454416827e-05, + "loss": 0.8932, + "step": 11227 + }, + { + "epoch": 0.56, + "grad_norm": 0.3944353151807294, + "learning_rate": 2.1374187654020292e-05, + "loss": 0.877, + "step": 11228 + }, + { + "epoch": 0.5600498753117207, + "grad_norm": 0.5268835266178132, + "learning_rate": 2.1370191948237717e-05, + "loss": 0.8914, + "step": 11229 + }, + { + "epoch": 0.5600997506234414, + "grad_norm": 0.6504451367659143, + "learning_rate": 2.136619633717335e-05, + "loss": 0.8651, + "step": 11230 + }, + { + "epoch": 0.560149625935162, + "grad_norm": 0.45336361125290114, + "learning_rate": 2.1362200820931465e-05, + "loss": 0.8739, + "step": 11231 + }, + { + "epoch": 0.5601995012468828, + "grad_norm": 0.4317252223582249, + "learning_rate": 2.135820539961632e-05, + "loss": 0.8706, + "step": 11232 + }, + { + "epoch": 0.5602493765586035, + "grad_norm": 0.4151448354834548, + "learning_rate": 2.1354210073332183e-05, + "loss": 0.8795, + "step": 11233 + }, + { + "epoch": 0.5602992518703241, + "grad_norm": 0.4007291474168359, + "learning_rate": 2.1350214842183293e-05, + "loss": 0.8953, + "step": 11234 + }, + { + "epoch": 0.5603491271820449, + "grad_norm": 0.39452247961052933, + "learning_rate": 2.1346219706273918e-05, + "loss": 0.862, + "step": 11235 + }, + { + "epoch": 0.5603990024937656, + "grad_norm": 0.4346265921395473, + "learning_rate": 2.13422246657083e-05, + "loss": 0.8589, + "step": 11236 + }, + { + "epoch": 0.5604488778054862, + "grad_norm": 0.45830790368085284, + "learning_rate": 2.1338229720590702e-05, + "loss": 0.9088, + "step": 11237 + }, + { + "epoch": 0.560498753117207, + "grad_norm": 0.544844029041964, + "learning_rate": 2.1334234871025355e-05, + "loss": 0.8597, + "step": 11238 + }, + { + "epoch": 0.5605486284289277, + "grad_norm": 0.41894030013691425, + "learning_rate": 2.1330240117116505e-05, + "loss": 0.8261, + "step": 11239 + }, + { + "epoch": 0.5605985037406483, + "grad_norm": 0.40608511572068084, + "learning_rate": 2.1326245458968396e-05, + "loss": 0.8951, + "step": 11240 + }, + { + "epoch": 0.5606483790523691, + "grad_norm": 0.41511404904478855, + "learning_rate": 2.1322250896685274e-05, + "loss": 0.8898, + "step": 11241 + }, + { + "epoch": 0.5606982543640898, + "grad_norm": 0.490627884772585, + "learning_rate": 2.131825643037137e-05, + "loss": 0.8393, + "step": 11242 + }, + { + "epoch": 0.5607481296758104, + "grad_norm": 0.5379436211881051, + "learning_rate": 2.131426206013091e-05, + "loss": 0.8967, + "step": 11243 + }, + { + "epoch": 0.5607980049875312, + "grad_norm": 0.576096955199526, + "learning_rate": 2.131026778606814e-05, + "loss": 0.871, + "step": 11244 + }, + { + "epoch": 0.5608478802992519, + "grad_norm": 0.4948813909689741, + "learning_rate": 2.130627360828727e-05, + "loss": 0.8984, + "step": 11245 + }, + { + "epoch": 0.5608977556109725, + "grad_norm": 0.8159596618931176, + "learning_rate": 2.1302279526892542e-05, + "loss": 0.8733, + "step": 11246 + }, + { + "epoch": 0.5609476309226933, + "grad_norm": 0.4462054825690143, + "learning_rate": 2.1298285541988178e-05, + "loss": 0.9013, + "step": 11247 + }, + { + "epoch": 0.5609975062344139, + "grad_norm": 0.4611865803632763, + "learning_rate": 2.1294291653678398e-05, + "loss": 0.889, + "step": 11248 + }, + { + "epoch": 0.5610473815461346, + "grad_norm": 0.4227317925826738, + "learning_rate": 2.1290297862067413e-05, + "loss": 0.8955, + "step": 11249 + }, + { + "epoch": 0.5610972568578554, + "grad_norm": 0.36817913054897755, + "learning_rate": 2.1286304167259443e-05, + "loss": 0.8611, + "step": 11250 + }, + { + "epoch": 0.561147132169576, + "grad_norm": 0.46186306539506855, + "learning_rate": 2.1282310569358703e-05, + "loss": 0.8791, + "step": 11251 + }, + { + "epoch": 0.5611970074812968, + "grad_norm": 0.38488500002371756, + "learning_rate": 2.127831706846942e-05, + "loss": 0.8481, + "step": 11252 + }, + { + "epoch": 0.5612468827930175, + "grad_norm": 0.43914393797531664, + "learning_rate": 2.1274323664695777e-05, + "loss": 0.8464, + "step": 11253 + }, + { + "epoch": 0.5612967581047381, + "grad_norm": 0.6456193765655676, + "learning_rate": 2.1270330358141984e-05, + "loss": 0.8863, + "step": 11254 + }, + { + "epoch": 0.5613466334164589, + "grad_norm": 0.501118791040499, + "learning_rate": 2.1266337148912266e-05, + "loss": 0.8565, + "step": 11255 + }, + { + "epoch": 0.5613965087281796, + "grad_norm": 0.4819431907601988, + "learning_rate": 2.12623440371108e-05, + "loss": 0.8854, + "step": 11256 + }, + { + "epoch": 0.5614463840399002, + "grad_norm": 0.4294815356617332, + "learning_rate": 2.125835102284179e-05, + "loss": 0.8566, + "step": 11257 + }, + { + "epoch": 0.561496259351621, + "grad_norm": 0.4246573668795635, + "learning_rate": 2.1254358106209437e-05, + "loss": 0.8929, + "step": 11258 + }, + { + "epoch": 0.5615461346633417, + "grad_norm": 0.8311695332648985, + "learning_rate": 2.1250365287317943e-05, + "loss": 0.8678, + "step": 11259 + }, + { + "epoch": 0.5615960099750623, + "grad_norm": 0.40354224117148674, + "learning_rate": 2.124637256627148e-05, + "loss": 0.8571, + "step": 11260 + }, + { + "epoch": 0.561645885286783, + "grad_norm": 0.4053502361871492, + "learning_rate": 2.1242379943174244e-05, + "loss": 0.9108, + "step": 11261 + }, + { + "epoch": 0.5616957605985038, + "grad_norm": 0.4505676925018493, + "learning_rate": 2.1238387418130422e-05, + "loss": 0.9016, + "step": 11262 + }, + { + "epoch": 0.5617456359102244, + "grad_norm": 0.4871948600676405, + "learning_rate": 2.1234394991244208e-05, + "loss": 0.8669, + "step": 11263 + }, + { + "epoch": 0.5617955112219452, + "grad_norm": 0.4734520589156146, + "learning_rate": 2.1230402662619758e-05, + "loss": 0.8638, + "step": 11264 + }, + { + "epoch": 0.5618453865336658, + "grad_norm": 0.42973911781730256, + "learning_rate": 2.1226410432361264e-05, + "loss": 0.8319, + "step": 11265 + }, + { + "epoch": 0.5618952618453865, + "grad_norm": 0.44811901837199475, + "learning_rate": 2.1222418300572915e-05, + "loss": 0.8859, + "step": 11266 + }, + { + "epoch": 0.5619451371571073, + "grad_norm": 0.4904740112389667, + "learning_rate": 2.121842626735886e-05, + "loss": 0.8317, + "step": 11267 + }, + { + "epoch": 0.5619950124688279, + "grad_norm": 0.46713727956546985, + "learning_rate": 2.1214434332823275e-05, + "loss": 0.8747, + "step": 11268 + }, + { + "epoch": 0.5620448877805486, + "grad_norm": 0.535713633302082, + "learning_rate": 2.1210442497070332e-05, + "loss": 0.9132, + "step": 11269 + }, + { + "epoch": 0.5620947630922694, + "grad_norm": 0.46159122206294845, + "learning_rate": 2.1206450760204207e-05, + "loss": 0.8892, + "step": 11270 + }, + { + "epoch": 0.56214463840399, + "grad_norm": 0.4287537273345642, + "learning_rate": 2.1202459122329044e-05, + "loss": 0.8606, + "step": 11271 + }, + { + "epoch": 0.5621945137157107, + "grad_norm": 0.4539579616594276, + "learning_rate": 2.1198467583549007e-05, + "loss": 0.864, + "step": 11272 + }, + { + "epoch": 0.5622443890274315, + "grad_norm": 0.37576715777700676, + "learning_rate": 2.1194476143968258e-05, + "loss": 0.8557, + "step": 11273 + }, + { + "epoch": 0.5622942643391521, + "grad_norm": 0.3956164227806819, + "learning_rate": 2.1190484803690962e-05, + "loss": 0.8724, + "step": 11274 + }, + { + "epoch": 0.5623441396508728, + "grad_norm": 0.510621666356315, + "learning_rate": 2.1186493562821252e-05, + "loss": 0.8949, + "step": 11275 + }, + { + "epoch": 0.5623940149625936, + "grad_norm": 0.45758151343678327, + "learning_rate": 2.118250242146328e-05, + "loss": 0.9014, + "step": 11276 + }, + { + "epoch": 0.5624438902743142, + "grad_norm": 0.4490560609521399, + "learning_rate": 2.1178511379721212e-05, + "loss": 0.9175, + "step": 11277 + }, + { + "epoch": 0.5624937655860349, + "grad_norm": 0.45220427958810466, + "learning_rate": 2.1174520437699166e-05, + "loss": 0.8926, + "step": 11278 + }, + { + "epoch": 0.5625436408977557, + "grad_norm": 0.4481376528215922, + "learning_rate": 2.1170529595501305e-05, + "loss": 0.8427, + "step": 11279 + }, + { + "epoch": 0.5625935162094763, + "grad_norm": 0.4369985309362744, + "learning_rate": 2.1166538853231757e-05, + "loss": 0.8748, + "step": 11280 + }, + { + "epoch": 0.562643391521197, + "grad_norm": 0.40963388117075855, + "learning_rate": 2.1162548210994668e-05, + "loss": 0.8567, + "step": 11281 + }, + { + "epoch": 0.5626932668329178, + "grad_norm": 0.4783676558193178, + "learning_rate": 2.1158557668894156e-05, + "loss": 0.8948, + "step": 11282 + }, + { + "epoch": 0.5627431421446384, + "grad_norm": 0.8317834597355604, + "learning_rate": 2.1154567227034373e-05, + "loss": 0.9094, + "step": 11283 + }, + { + "epoch": 0.5627930174563591, + "grad_norm": 0.39577002603379047, + "learning_rate": 2.115057688551943e-05, + "loss": 0.8745, + "step": 11284 + }, + { + "epoch": 0.5628428927680797, + "grad_norm": 0.5105424160978499, + "learning_rate": 2.1146586644453465e-05, + "loss": 0.8972, + "step": 11285 + }, + { + "epoch": 0.5628927680798005, + "grad_norm": 0.4039453691876978, + "learning_rate": 2.1142596503940594e-05, + "loss": 0.8691, + "step": 11286 + }, + { + "epoch": 0.5629426433915212, + "grad_norm": 0.5648134072062829, + "learning_rate": 2.1138606464084938e-05, + "loss": 0.8822, + "step": 11287 + }, + { + "epoch": 0.5629925187032419, + "grad_norm": 0.47152524155268843, + "learning_rate": 2.1134616524990627e-05, + "loss": 0.8943, + "step": 11288 + }, + { + "epoch": 0.5630423940149626, + "grad_norm": 0.4066507134120402, + "learning_rate": 2.1130626686761762e-05, + "loss": 0.8512, + "step": 11289 + }, + { + "epoch": 0.5630922693266833, + "grad_norm": 0.46801473088674256, + "learning_rate": 2.112663694950246e-05, + "loss": 0.8751, + "step": 11290 + }, + { + "epoch": 0.563142144638404, + "grad_norm": 0.37894843112044235, + "learning_rate": 2.1122647313316832e-05, + "loss": 0.8422, + "step": 11291 + }, + { + "epoch": 0.5631920199501247, + "grad_norm": 0.44681264223432243, + "learning_rate": 2.1118657778308997e-05, + "loss": 0.9149, + "step": 11292 + }, + { + "epoch": 0.5632418952618454, + "grad_norm": 0.3866332833722801, + "learning_rate": 2.1114668344583044e-05, + "loss": 0.8274, + "step": 11293 + }, + { + "epoch": 0.563291770573566, + "grad_norm": 0.4515225516495124, + "learning_rate": 2.111067901224308e-05, + "loss": 0.9218, + "step": 11294 + }, + { + "epoch": 0.5633416458852868, + "grad_norm": 0.4049191021504779, + "learning_rate": 2.1106689781393202e-05, + "loss": 0.8861, + "step": 11295 + }, + { + "epoch": 0.5633915211970075, + "grad_norm": 0.4066043913926381, + "learning_rate": 2.1102700652137527e-05, + "loss": 0.9147, + "step": 11296 + }, + { + "epoch": 0.5634413965087282, + "grad_norm": 0.5311067343756266, + "learning_rate": 2.1098711624580122e-05, + "loss": 0.8602, + "step": 11297 + }, + { + "epoch": 0.5634912718204489, + "grad_norm": 0.4322564237382181, + "learning_rate": 2.1094722698825092e-05, + "loss": 0.8601, + "step": 11298 + }, + { + "epoch": 0.5635411471321696, + "grad_norm": 0.4354970946542322, + "learning_rate": 2.1090733874976525e-05, + "loss": 0.8921, + "step": 11299 + }, + { + "epoch": 0.5635910224438903, + "grad_norm": 0.4206776184983278, + "learning_rate": 2.108674515313852e-05, + "loss": 0.9184, + "step": 11300 + }, + { + "epoch": 0.563640897755611, + "grad_norm": 0.4550869063848961, + "learning_rate": 2.1082756533415137e-05, + "loss": 0.8464, + "step": 11301 + }, + { + "epoch": 0.5636907730673316, + "grad_norm": 0.4589538080818166, + "learning_rate": 2.107876801591047e-05, + "loss": 0.9041, + "step": 11302 + }, + { + "epoch": 0.5637406483790524, + "grad_norm": 0.3986382879335031, + "learning_rate": 2.1074779600728607e-05, + "loss": 0.9042, + "step": 11303 + }, + { + "epoch": 0.5637905236907731, + "grad_norm": 0.4420668636376359, + "learning_rate": 2.10707912879736e-05, + "loss": 0.8434, + "step": 11304 + }, + { + "epoch": 0.5638403990024937, + "grad_norm": 0.4471348621793926, + "learning_rate": 2.1066803077749538e-05, + "loss": 0.8537, + "step": 11305 + }, + { + "epoch": 0.5638902743142145, + "grad_norm": 0.4331131876438823, + "learning_rate": 2.106281497016049e-05, + "loss": 0.8683, + "step": 11306 + }, + { + "epoch": 0.5639401496259352, + "grad_norm": 0.4701638785015946, + "learning_rate": 2.1058826965310534e-05, + "loss": 0.8693, + "step": 11307 + }, + { + "epoch": 0.5639900249376558, + "grad_norm": 0.9877820334564938, + "learning_rate": 2.1054839063303716e-05, + "loss": 0.8418, + "step": 11308 + }, + { + "epoch": 0.5640399002493766, + "grad_norm": 0.45387822524065746, + "learning_rate": 2.1050851264244103e-05, + "loss": 0.8867, + "step": 11309 + }, + { + "epoch": 0.5640897755610973, + "grad_norm": 0.46828722416680446, + "learning_rate": 2.104686356823576e-05, + "loss": 0.8516, + "step": 11310 + }, + { + "epoch": 0.5641396508728179, + "grad_norm": 0.43142421033185446, + "learning_rate": 2.1042875975382755e-05, + "loss": 0.8269, + "step": 11311 + }, + { + "epoch": 0.5641895261845387, + "grad_norm": 0.398131343853456, + "learning_rate": 2.103888848578912e-05, + "loss": 0.8497, + "step": 11312 + }, + { + "epoch": 0.5642394014962594, + "grad_norm": 0.4379248572776756, + "learning_rate": 2.1034901099558915e-05, + "loss": 0.9447, + "step": 11313 + }, + { + "epoch": 0.56428927680798, + "grad_norm": 0.46610142447265873, + "learning_rate": 2.1030913816796196e-05, + "loss": 0.9217, + "step": 11314 + }, + { + "epoch": 0.5643391521197008, + "grad_norm": 0.8333078353176401, + "learning_rate": 2.1026926637605008e-05, + "loss": 0.8806, + "step": 11315 + }, + { + "epoch": 0.5643890274314215, + "grad_norm": 0.4830214533622, + "learning_rate": 2.102293956208938e-05, + "loss": 0.8867, + "step": 11316 + }, + { + "epoch": 0.5644389027431421, + "grad_norm": 0.870857799213654, + "learning_rate": 2.101895259035337e-05, + "loss": 0.9058, + "step": 11317 + }, + { + "epoch": 0.5644887780548629, + "grad_norm": 0.4767898773571148, + "learning_rate": 2.1014965722501017e-05, + "loss": 0.8792, + "step": 11318 + }, + { + "epoch": 0.5645386533665835, + "grad_norm": 0.46425413594341003, + "learning_rate": 2.101097895863634e-05, + "loss": 0.8502, + "step": 11319 + }, + { + "epoch": 0.5645885286783042, + "grad_norm": 0.42399195128656625, + "learning_rate": 2.100699229886339e-05, + "loss": 0.9122, + "step": 11320 + }, + { + "epoch": 0.564638403990025, + "grad_norm": 0.39591966579181054, + "learning_rate": 2.1003005743286182e-05, + "loss": 0.8719, + "step": 11321 + }, + { + "epoch": 0.5646882793017456, + "grad_norm": 0.4276308455885825, + "learning_rate": 2.099901929200876e-05, + "loss": 0.8579, + "step": 11322 + }, + { + "epoch": 0.5647381546134663, + "grad_norm": 0.5877084080652957, + "learning_rate": 2.099503294513513e-05, + "loss": 0.8796, + "step": 11323 + }, + { + "epoch": 0.5647880299251871, + "grad_norm": 0.3805304439597132, + "learning_rate": 2.099104670276932e-05, + "loss": 0.8578, + "step": 11324 + }, + { + "epoch": 0.5648379052369077, + "grad_norm": 0.41901549798797616, + "learning_rate": 2.0987060565015365e-05, + "loss": 0.8941, + "step": 11325 + }, + { + "epoch": 0.5648877805486284, + "grad_norm": 0.4106846140978797, + "learning_rate": 2.0983074531977258e-05, + "loss": 0.9107, + "step": 11326 + }, + { + "epoch": 0.5649376558603492, + "grad_norm": 0.4053934046950268, + "learning_rate": 2.0979088603759024e-05, + "loss": 0.8982, + "step": 11327 + }, + { + "epoch": 0.5649875311720698, + "grad_norm": 1.072384207442506, + "learning_rate": 2.0975102780464676e-05, + "loss": 0.8392, + "step": 11328 + }, + { + "epoch": 0.5650374064837905, + "grad_norm": 0.5847280718330145, + "learning_rate": 2.0971117062198225e-05, + "loss": 0.8799, + "step": 11329 + }, + { + "epoch": 0.5650872817955113, + "grad_norm": 0.38584849059716025, + "learning_rate": 2.0967131449063664e-05, + "loss": 0.8728, + "step": 11330 + }, + { + "epoch": 0.5651371571072319, + "grad_norm": 0.7509533822419396, + "learning_rate": 2.0963145941165002e-05, + "loss": 0.851, + "step": 11331 + }, + { + "epoch": 0.5651870324189526, + "grad_norm": 0.49880372052010785, + "learning_rate": 2.095916053860624e-05, + "loss": 0.8858, + "step": 11332 + }, + { + "epoch": 0.5652369077306734, + "grad_norm": 0.4345250355460065, + "learning_rate": 2.0955175241491388e-05, + "loss": 0.9338, + "step": 11333 + }, + { + "epoch": 0.565286783042394, + "grad_norm": 0.3956800058774768, + "learning_rate": 2.0951190049924413e-05, + "loss": 0.8371, + "step": 11334 + }, + { + "epoch": 0.5653366583541147, + "grad_norm": 0.4797415344914447, + "learning_rate": 2.0947204964009325e-05, + "loss": 0.901, + "step": 11335 + }, + { + "epoch": 0.5653865336658355, + "grad_norm": 0.4849039496874616, + "learning_rate": 2.0943219983850122e-05, + "loss": 0.8889, + "step": 11336 + }, + { + "epoch": 0.5654364089775561, + "grad_norm": 0.4549501032547462, + "learning_rate": 2.0939235109550762e-05, + "loss": 0.8662, + "step": 11337 + }, + { + "epoch": 0.5654862842892768, + "grad_norm": 0.42847378856054597, + "learning_rate": 2.0935250341215245e-05, + "loss": 0.8771, + "step": 11338 + }, + { + "epoch": 0.5655361596009975, + "grad_norm": 0.47825072003542796, + "learning_rate": 2.0931265678947555e-05, + "loss": 0.8975, + "step": 11339 + }, + { + "epoch": 0.5655860349127182, + "grad_norm": 0.4805912522126342, + "learning_rate": 2.0927281122851673e-05, + "loss": 0.8837, + "step": 11340 + }, + { + "epoch": 0.5656359102244389, + "grad_norm": 0.4670825045064449, + "learning_rate": 2.0923296673031562e-05, + "loss": 0.8958, + "step": 11341 + }, + { + "epoch": 0.5656857855361596, + "grad_norm": 0.40534473072110166, + "learning_rate": 2.0919312329591197e-05, + "loss": 0.8461, + "step": 11342 + }, + { + "epoch": 0.5657356608478803, + "grad_norm": 0.42237537193110203, + "learning_rate": 2.0915328092634548e-05, + "loss": 0.9062, + "step": 11343 + }, + { + "epoch": 0.565785536159601, + "grad_norm": 0.4593293805748688, + "learning_rate": 2.0911343962265598e-05, + "loss": 0.838, + "step": 11344 + }, + { + "epoch": 0.5658354114713217, + "grad_norm": 0.47102100466542407, + "learning_rate": 2.0907359938588285e-05, + "loss": 0.8767, + "step": 11345 + }, + { + "epoch": 0.5658852867830424, + "grad_norm": 0.8640689248160803, + "learning_rate": 2.0903376021706583e-05, + "loss": 0.8883, + "step": 11346 + }, + { + "epoch": 0.5659351620947631, + "grad_norm": 0.4664270106817012, + "learning_rate": 2.089939221172446e-05, + "loss": 0.8854, + "step": 11347 + }, + { + "epoch": 0.5659850374064838, + "grad_norm": 0.517638764059798, + "learning_rate": 2.0895408508745856e-05, + "loss": 0.8439, + "step": 11348 + }, + { + "epoch": 0.5660349127182045, + "grad_norm": 0.5314433411447099, + "learning_rate": 2.0891424912874726e-05, + "loss": 0.8893, + "step": 11349 + }, + { + "epoch": 0.5660847880299252, + "grad_norm": 0.4348730029717208, + "learning_rate": 2.0887441424215027e-05, + "loss": 0.8482, + "step": 11350 + }, + { + "epoch": 0.5661346633416459, + "grad_norm": 0.5508429447452207, + "learning_rate": 2.0883458042870705e-05, + "loss": 0.9182, + "step": 11351 + }, + { + "epoch": 0.5661845386533666, + "grad_norm": 0.6596101301173118, + "learning_rate": 2.0879474768945706e-05, + "loss": 0.8918, + "step": 11352 + }, + { + "epoch": 0.5662344139650873, + "grad_norm": 0.5412523953106346, + "learning_rate": 2.087549160254396e-05, + "loss": 0.9046, + "step": 11353 + }, + { + "epoch": 0.566284289276808, + "grad_norm": 0.4172078339544558, + "learning_rate": 2.087150854376942e-05, + "loss": 0.8903, + "step": 11354 + }, + { + "epoch": 0.5663341645885287, + "grad_norm": 1.0915526219461187, + "learning_rate": 2.0867525592726018e-05, + "loss": 0.8678, + "step": 11355 + }, + { + "epoch": 0.5663840399002493, + "grad_norm": 0.44543855279910277, + "learning_rate": 2.0863542749517682e-05, + "loss": 0.8749, + "step": 11356 + }, + { + "epoch": 0.5664339152119701, + "grad_norm": 0.46082587440597805, + "learning_rate": 2.085956001424835e-05, + "loss": 0.8703, + "step": 11357 + }, + { + "epoch": 0.5664837905236908, + "grad_norm": 0.45501084534273695, + "learning_rate": 2.0855577387021945e-05, + "loss": 0.9293, + "step": 11358 + }, + { + "epoch": 0.5665336658354114, + "grad_norm": 0.5711026776969081, + "learning_rate": 2.08515948679424e-05, + "loss": 0.8822, + "step": 11359 + }, + { + "epoch": 0.5665835411471322, + "grad_norm": 0.5232007336973143, + "learning_rate": 2.084761245711362e-05, + "loss": 0.9206, + "step": 11360 + }, + { + "epoch": 0.5666334164588529, + "grad_norm": 0.4334107534543555, + "learning_rate": 2.0843630154639536e-05, + "loss": 0.8619, + "step": 11361 + }, + { + "epoch": 0.5666832917705735, + "grad_norm": 0.47567544163374614, + "learning_rate": 2.0839647960624075e-05, + "loss": 0.8969, + "step": 11362 + }, + { + "epoch": 0.5667331670822943, + "grad_norm": 0.43361817417222404, + "learning_rate": 2.083566587517113e-05, + "loss": 0.9109, + "step": 11363 + }, + { + "epoch": 0.566783042394015, + "grad_norm": 0.389211533434492, + "learning_rate": 2.0831683898384614e-05, + "loss": 0.8323, + "step": 11364 + }, + { + "epoch": 0.5668329177057356, + "grad_norm": 0.4426542456521692, + "learning_rate": 2.0827702030368445e-05, + "loss": 0.9009, + "step": 11365 + }, + { + "epoch": 0.5668827930174564, + "grad_norm": 0.42820967222823536, + "learning_rate": 2.0823720271226536e-05, + "loss": 0.877, + "step": 11366 + }, + { + "epoch": 0.5669326683291771, + "grad_norm": 0.4769403107309417, + "learning_rate": 2.0819738621062762e-05, + "loss": 0.882, + "step": 11367 + }, + { + "epoch": 0.5669825436408977, + "grad_norm": 0.40152032218910555, + "learning_rate": 2.0815757079981043e-05, + "loss": 0.8551, + "step": 11368 + }, + { + "epoch": 0.5670324189526185, + "grad_norm": 0.4259109290746741, + "learning_rate": 2.081177564808527e-05, + "loss": 0.8759, + "step": 11369 + }, + { + "epoch": 0.5670822942643392, + "grad_norm": 0.4393360577036867, + "learning_rate": 2.0807794325479343e-05, + "loss": 0.8511, + "step": 11370 + }, + { + "epoch": 0.5671321695760598, + "grad_norm": 0.40877722402611505, + "learning_rate": 2.080381311226714e-05, + "loss": 0.8252, + "step": 11371 + }, + { + "epoch": 0.5671820448877806, + "grad_norm": 0.5741593889236687, + "learning_rate": 2.0799832008552554e-05, + "loss": 0.8718, + "step": 11372 + }, + { + "epoch": 0.5672319201995012, + "grad_norm": 0.4885403783254341, + "learning_rate": 2.0795851014439484e-05, + "loss": 0.8804, + "step": 11373 + }, + { + "epoch": 0.5672817955112219, + "grad_norm": 0.41662408603490114, + "learning_rate": 2.079187013003179e-05, + "loss": 0.8745, + "step": 11374 + }, + { + "epoch": 0.5673316708229427, + "grad_norm": 0.42758609125366626, + "learning_rate": 2.0787889355433362e-05, + "loss": 0.8371, + "step": 11375 + }, + { + "epoch": 0.5673815461346633, + "grad_norm": 0.40848783854398607, + "learning_rate": 2.078390869074808e-05, + "loss": 0.8463, + "step": 11376 + }, + { + "epoch": 0.567431421446384, + "grad_norm": 0.42466188814308276, + "learning_rate": 2.0779928136079815e-05, + "loss": 0.8429, + "step": 11377 + }, + { + "epoch": 0.5674812967581048, + "grad_norm": 0.7660119911033537, + "learning_rate": 2.0775947691532434e-05, + "loss": 0.8567, + "step": 11378 + }, + { + "epoch": 0.5675311720698254, + "grad_norm": 0.5126393234560312, + "learning_rate": 2.07719673572098e-05, + "loss": 0.9054, + "step": 11379 + }, + { + "epoch": 0.5675810473815461, + "grad_norm": 0.3898075712637786, + "learning_rate": 2.076798713321579e-05, + "loss": 0.8469, + "step": 11380 + }, + { + "epoch": 0.5676309226932669, + "grad_norm": 0.4302821234051696, + "learning_rate": 2.0764007019654274e-05, + "loss": 0.869, + "step": 11381 + }, + { + "epoch": 0.5676807980049875, + "grad_norm": 0.5096715890264842, + "learning_rate": 2.0760027016629085e-05, + "loss": 0.8896, + "step": 11382 + }, + { + "epoch": 0.5677306733167082, + "grad_norm": 0.4489380269954257, + "learning_rate": 2.0756047124244095e-05, + "loss": 0.9203, + "step": 11383 + }, + { + "epoch": 0.567780548628429, + "grad_norm": 0.40634247362060044, + "learning_rate": 2.0752067342603167e-05, + "loss": 0.8982, + "step": 11384 + }, + { + "epoch": 0.5678304239401496, + "grad_norm": 0.5599612589198665, + "learning_rate": 2.074808767181013e-05, + "loss": 0.882, + "step": 11385 + }, + { + "epoch": 0.5678802992518703, + "grad_norm": 0.4399319228089882, + "learning_rate": 2.0744108111968844e-05, + "loss": 0.8938, + "step": 11386 + }, + { + "epoch": 0.5679301745635911, + "grad_norm": 0.4301680304008044, + "learning_rate": 2.074012866318315e-05, + "loss": 0.8721, + "step": 11387 + }, + { + "epoch": 0.5679800498753117, + "grad_norm": 0.42647238348375466, + "learning_rate": 2.0736149325556897e-05, + "loss": 0.8183, + "step": 11388 + }, + { + "epoch": 0.5680299251870324, + "grad_norm": 0.3882753679210606, + "learning_rate": 2.073217009919392e-05, + "loss": 0.8545, + "step": 11389 + }, + { + "epoch": 0.5680798004987531, + "grad_norm": 0.36930262838422123, + "learning_rate": 2.072819098419805e-05, + "loss": 0.8588, + "step": 11390 + }, + { + "epoch": 0.5681296758104738, + "grad_norm": 0.4341185446782133, + "learning_rate": 2.072421198067312e-05, + "loss": 0.8763, + "step": 11391 + }, + { + "epoch": 0.5681795511221945, + "grad_norm": 0.6786318973549511, + "learning_rate": 2.0720233088722973e-05, + "loss": 0.8923, + "step": 11392 + }, + { + "epoch": 0.5682294264339152, + "grad_norm": 0.5294569385522534, + "learning_rate": 2.0716254308451423e-05, + "loss": 0.8976, + "step": 11393 + }, + { + "epoch": 0.5682793017456359, + "grad_norm": 0.5020476554546838, + "learning_rate": 2.0712275639962306e-05, + "loss": 0.861, + "step": 11394 + }, + { + "epoch": 0.5683291770573566, + "grad_norm": 0.3803467530456492, + "learning_rate": 2.0708297083359434e-05, + "loss": 0.8533, + "step": 11395 + }, + { + "epoch": 0.5683790523690773, + "grad_norm": 0.42896839650680424, + "learning_rate": 2.0704318638746628e-05, + "loss": 0.9341, + "step": 11396 + }, + { + "epoch": 0.568428927680798, + "grad_norm": 0.4084227085188129, + "learning_rate": 2.0700340306227705e-05, + "loss": 0.8784, + "step": 11397 + }, + { + "epoch": 0.5684788029925187, + "grad_norm": 0.43586527662908375, + "learning_rate": 2.0696362085906478e-05, + "loss": 0.9049, + "step": 11398 + }, + { + "epoch": 0.5685286783042394, + "grad_norm": 0.4146697582258024, + "learning_rate": 2.069238397788676e-05, + "loss": 0.8704, + "step": 11399 + }, + { + "epoch": 0.5685785536159601, + "grad_norm": 0.47426230331393116, + "learning_rate": 2.0688405982272346e-05, + "loss": 0.9269, + "step": 11400 + }, + { + "epoch": 0.5686284289276808, + "grad_norm": 0.6266758503942523, + "learning_rate": 2.0684428099167048e-05, + "loss": 0.8815, + "step": 11401 + }, + { + "epoch": 0.5686783042394015, + "grad_norm": 0.6895523475830525, + "learning_rate": 2.068045032867467e-05, + "loss": 0.8668, + "step": 11402 + }, + { + "epoch": 0.5687281795511222, + "grad_norm": 0.4072074179284012, + "learning_rate": 2.0676472670899017e-05, + "loss": 0.8914, + "step": 11403 + }, + { + "epoch": 0.5687780548628429, + "grad_norm": 0.4401799433906339, + "learning_rate": 2.0672495125943864e-05, + "loss": 0.8269, + "step": 11404 + }, + { + "epoch": 0.5688279301745636, + "grad_norm": 0.4363219094678444, + "learning_rate": 2.0668517693913013e-05, + "loss": 0.8919, + "step": 11405 + }, + { + "epoch": 0.5688778054862843, + "grad_norm": 0.4063551722059454, + "learning_rate": 2.0664540374910258e-05, + "loss": 0.8444, + "step": 11406 + }, + { + "epoch": 0.568927680798005, + "grad_norm": 0.43380623882907327, + "learning_rate": 2.0660563169039386e-05, + "loss": 0.8365, + "step": 11407 + }, + { + "epoch": 0.5689775561097257, + "grad_norm": 0.6024270774437671, + "learning_rate": 2.065658607640417e-05, + "loss": 0.8979, + "step": 11408 + }, + { + "epoch": 0.5690274314214464, + "grad_norm": 0.38946761115957784, + "learning_rate": 2.0652609097108398e-05, + "loss": 0.8334, + "step": 11409 + }, + { + "epoch": 0.569077306733167, + "grad_norm": 0.3884057542061452, + "learning_rate": 2.0648632231255853e-05, + "loss": 0.914, + "step": 11410 + }, + { + "epoch": 0.5691271820448878, + "grad_norm": 0.49413546637320754, + "learning_rate": 2.0644655478950293e-05, + "loss": 0.8523, + "step": 11411 + }, + { + "epoch": 0.5691770573566085, + "grad_norm": 0.41043832047607837, + "learning_rate": 2.06406788402955e-05, + "loss": 0.9109, + "step": 11412 + }, + { + "epoch": 0.5692269326683291, + "grad_norm": 0.44230724637254804, + "learning_rate": 2.063670231539524e-05, + "loss": 0.8604, + "step": 11413 + }, + { + "epoch": 0.5692768079800499, + "grad_norm": 0.4565926005427805, + "learning_rate": 2.063272590435329e-05, + "loss": 0.8289, + "step": 11414 + }, + { + "epoch": 0.5693266832917706, + "grad_norm": 0.49240269082205745, + "learning_rate": 2.0628749607273396e-05, + "loss": 0.8787, + "step": 11415 + }, + { + "epoch": 0.5693765586034912, + "grad_norm": 0.3383915403037103, + "learning_rate": 2.062477342425932e-05, + "loss": 0.8191, + "step": 11416 + }, + { + "epoch": 0.569426433915212, + "grad_norm": 0.43771859401504926, + "learning_rate": 2.0620797355414827e-05, + "loss": 0.8683, + "step": 11417 + }, + { + "epoch": 0.5694763092269327, + "grad_norm": 0.5011007250883034, + "learning_rate": 2.0616821400843678e-05, + "loss": 0.8786, + "step": 11418 + }, + { + "epoch": 0.5695261845386533, + "grad_norm": 0.5243449327668179, + "learning_rate": 2.0612845560649603e-05, + "loss": 0.8644, + "step": 11419 + }, + { + "epoch": 0.5695760598503741, + "grad_norm": 0.43428729631557034, + "learning_rate": 2.0608869834936355e-05, + "loss": 0.8626, + "step": 11420 + }, + { + "epoch": 0.5696259351620948, + "grad_norm": 0.46833492532485616, + "learning_rate": 2.0604894223807697e-05, + "loss": 0.8801, + "step": 11421 + }, + { + "epoch": 0.5696758104738154, + "grad_norm": 0.41599854894341814, + "learning_rate": 2.0600918727367345e-05, + "loss": 0.8504, + "step": 11422 + }, + { + "epoch": 0.5697256857855362, + "grad_norm": 0.42806087641576246, + "learning_rate": 2.0596943345719056e-05, + "loss": 0.8391, + "step": 11423 + }, + { + "epoch": 0.5697755610972569, + "grad_norm": 0.38686285346266824, + "learning_rate": 2.0592968078966554e-05, + "loss": 0.8604, + "step": 11424 + }, + { + "epoch": 0.5698254364089775, + "grad_norm": 0.4788441783329259, + "learning_rate": 2.0588992927213584e-05, + "loss": 0.8415, + "step": 11425 + }, + { + "epoch": 0.5698753117206983, + "grad_norm": 1.0478272293831066, + "learning_rate": 2.0585017890563868e-05, + "loss": 0.8655, + "step": 11426 + }, + { + "epoch": 0.5699251870324189, + "grad_norm": 0.406358367005727, + "learning_rate": 2.0581042969121135e-05, + "loss": 0.8992, + "step": 11427 + }, + { + "epoch": 0.5699750623441396, + "grad_norm": 0.39653975103200423, + "learning_rate": 2.05770681629891e-05, + "loss": 0.8285, + "step": 11428 + }, + { + "epoch": 0.5700249376558604, + "grad_norm": 0.3980606006946418, + "learning_rate": 2.0573093472271502e-05, + "loss": 0.8616, + "step": 11429 + }, + { + "epoch": 0.570074812967581, + "grad_norm": 0.6060086844714312, + "learning_rate": 2.0569118897072047e-05, + "loss": 0.9415, + "step": 11430 + }, + { + "epoch": 0.5701246882793017, + "grad_norm": 0.38919667026408244, + "learning_rate": 2.0565144437494446e-05, + "loss": 0.8967, + "step": 11431 + }, + { + "epoch": 0.5701745635910225, + "grad_norm": 0.4599878855058294, + "learning_rate": 2.0561170093642423e-05, + "loss": 0.8886, + "step": 11432 + }, + { + "epoch": 0.5702244389027431, + "grad_norm": 0.5219714833283909, + "learning_rate": 2.0557195865619676e-05, + "loss": 0.8645, + "step": 11433 + }, + { + "epoch": 0.5702743142144638, + "grad_norm": 0.4122293656674, + "learning_rate": 2.055322175352991e-05, + "loss": 0.8437, + "step": 11434 + }, + { + "epoch": 0.5703241895261846, + "grad_norm": 0.4963791646794287, + "learning_rate": 2.054924775747684e-05, + "loss": 0.8761, + "step": 11435 + }, + { + "epoch": 0.5703740648379052, + "grad_norm": 0.49926650133580525, + "learning_rate": 2.054527387756416e-05, + "loss": 0.8611, + "step": 11436 + }, + { + "epoch": 0.5704239401496259, + "grad_norm": 0.42322386779300963, + "learning_rate": 2.0541300113895553e-05, + "loss": 0.8925, + "step": 11437 + }, + { + "epoch": 0.5704738154613467, + "grad_norm": 0.41382566185747566, + "learning_rate": 2.053732646657473e-05, + "loss": 0.8507, + "step": 11438 + }, + { + "epoch": 0.5705236907730673, + "grad_norm": 0.5882272885508546, + "learning_rate": 2.053335293570537e-05, + "loss": 0.8763, + "step": 11439 + }, + { + "epoch": 0.570573566084788, + "grad_norm": 0.5142198627710274, + "learning_rate": 2.0529379521391184e-05, + "loss": 0.9017, + "step": 11440 + }, + { + "epoch": 0.5706234413965088, + "grad_norm": 0.4271617593596002, + "learning_rate": 2.0525406223735823e-05, + "loss": 0.8643, + "step": 11441 + }, + { + "epoch": 0.5706733167082294, + "grad_norm": 0.4056742338276944, + "learning_rate": 2.0521433042842984e-05, + "loss": 0.8893, + "step": 11442 + }, + { + "epoch": 0.5707231920199501, + "grad_norm": 0.4026870847076341, + "learning_rate": 2.051745997881636e-05, + "loss": 0.9268, + "step": 11443 + }, + { + "epoch": 0.5707730673316708, + "grad_norm": 0.43391506176263844, + "learning_rate": 2.0513487031759596e-05, + "loss": 0.8905, + "step": 11444 + }, + { + "epoch": 0.5708229426433915, + "grad_norm": 0.41066963014550023, + "learning_rate": 2.0509514201776383e-05, + "loss": 0.8749, + "step": 11445 + }, + { + "epoch": 0.5708728179551122, + "grad_norm": 0.39618888616840237, + "learning_rate": 2.0505541488970387e-05, + "loss": 0.865, + "step": 11446 + }, + { + "epoch": 0.5709226932668329, + "grad_norm": 0.49944794103809376, + "learning_rate": 2.0501568893445284e-05, + "loss": 0.8711, + "step": 11447 + }, + { + "epoch": 0.5709725685785536, + "grad_norm": 0.4641700412085577, + "learning_rate": 2.0497596415304717e-05, + "loss": 0.8443, + "step": 11448 + }, + { + "epoch": 0.5710224438902743, + "grad_norm": 0.41076329589809585, + "learning_rate": 2.0493624054652357e-05, + "loss": 0.8843, + "step": 11449 + }, + { + "epoch": 0.571072319201995, + "grad_norm": 0.4457366796546883, + "learning_rate": 2.0489651811591863e-05, + "loss": 0.8852, + "step": 11450 + }, + { + "epoch": 0.5711221945137157, + "grad_norm": 0.427919759174574, + "learning_rate": 2.0485679686226895e-05, + "loss": 0.8542, + "step": 11451 + }, + { + "epoch": 0.5711720698254364, + "grad_norm": 0.9750093139153859, + "learning_rate": 2.0481707678661083e-05, + "loss": 0.8872, + "step": 11452 + }, + { + "epoch": 0.5712219451371571, + "grad_norm": 0.40195925101992686, + "learning_rate": 2.0477735788998086e-05, + "loss": 0.8683, + "step": 11453 + }, + { + "epoch": 0.5712718204488778, + "grad_norm": 0.49641139319126965, + "learning_rate": 2.047376401734156e-05, + "loss": 0.92, + "step": 11454 + }, + { + "epoch": 0.5713216957605985, + "grad_norm": 0.4255979987233819, + "learning_rate": 2.046979236379513e-05, + "loss": 0.8963, + "step": 11455 + }, + { + "epoch": 0.5713715710723192, + "grad_norm": 0.4507504682547455, + "learning_rate": 2.046582082846244e-05, + "loss": 0.8393, + "step": 11456 + }, + { + "epoch": 0.5714214463840399, + "grad_norm": 0.4696916420564839, + "learning_rate": 2.0461849411447127e-05, + "loss": 0.8404, + "step": 11457 + }, + { + "epoch": 0.5714713216957606, + "grad_norm": 0.40647162830249983, + "learning_rate": 2.0457878112852828e-05, + "loss": 0.925, + "step": 11458 + }, + { + "epoch": 0.5715211970074813, + "grad_norm": 0.46027579935288976, + "learning_rate": 2.045390693278316e-05, + "loss": 0.8582, + "step": 11459 + }, + { + "epoch": 0.571571072319202, + "grad_norm": 0.4447703547771531, + "learning_rate": 2.0449935871341757e-05, + "loss": 0.893, + "step": 11460 + }, + { + "epoch": 0.5716209476309226, + "grad_norm": 0.3973570100328328, + "learning_rate": 2.0445964928632237e-05, + "loss": 0.9046, + "step": 11461 + }, + { + "epoch": 0.5716708229426434, + "grad_norm": 0.45865364455515484, + "learning_rate": 2.0441994104758242e-05, + "loss": 0.8652, + "step": 11462 + }, + { + "epoch": 0.5717206982543641, + "grad_norm": 0.4426408009068528, + "learning_rate": 2.0438023399823352e-05, + "loss": 0.8582, + "step": 11463 + }, + { + "epoch": 0.5717705735660847, + "grad_norm": 0.39289527635796323, + "learning_rate": 2.0434052813931205e-05, + "loss": 0.8448, + "step": 11464 + }, + { + "epoch": 0.5718204488778055, + "grad_norm": 0.4944863247899111, + "learning_rate": 2.0430082347185405e-05, + "loss": 0.8807, + "step": 11465 + }, + { + "epoch": 0.5718703241895262, + "grad_norm": 0.5296616004068703, + "learning_rate": 2.0426111999689566e-05, + "loss": 0.9451, + "step": 11466 + }, + { + "epoch": 0.5719201995012468, + "grad_norm": 0.7234216130143002, + "learning_rate": 2.0422141771547286e-05, + "loss": 0.9073, + "step": 11467 + }, + { + "epoch": 0.5719700748129676, + "grad_norm": 0.5001569594752203, + "learning_rate": 2.0418171662862165e-05, + "loss": 0.8753, + "step": 11468 + }, + { + "epoch": 0.5720199501246883, + "grad_norm": 0.366320069099343, + "learning_rate": 2.041420167373781e-05, + "loss": 0.8519, + "step": 11469 + }, + { + "epoch": 0.5720698254364089, + "grad_norm": 0.49558811009638204, + "learning_rate": 2.0410231804277802e-05, + "loss": 0.9052, + "step": 11470 + }, + { + "epoch": 0.5721197007481297, + "grad_norm": 0.3708013404909183, + "learning_rate": 2.0406262054585738e-05, + "loss": 0.8786, + "step": 11471 + }, + { + "epoch": 0.5721695760598504, + "grad_norm": 0.42485689467564336, + "learning_rate": 2.040229242476522e-05, + "loss": 0.8653, + "step": 11472 + }, + { + "epoch": 0.572219451371571, + "grad_norm": 0.3958691673532682, + "learning_rate": 2.0398322914919823e-05, + "loss": 0.8863, + "step": 11473 + }, + { + "epoch": 0.5722693266832918, + "grad_norm": 0.4971254428330732, + "learning_rate": 2.0394353525153126e-05, + "loss": 0.9438, + "step": 11474 + }, + { + "epoch": 0.5723192019950125, + "grad_norm": 0.44666363684362254, + "learning_rate": 2.039038425556871e-05, + "loss": 0.884, + "step": 11475 + }, + { + "epoch": 0.5723690773067331, + "grad_norm": 0.38975377242983233, + "learning_rate": 2.0386415106270156e-05, + "loss": 0.8432, + "step": 11476 + }, + { + "epoch": 0.5724189526184539, + "grad_norm": 0.4547468765796534, + "learning_rate": 2.0382446077361043e-05, + "loss": 0.8448, + "step": 11477 + }, + { + "epoch": 0.5724688279301746, + "grad_norm": 0.5494613312422083, + "learning_rate": 2.0378477168944925e-05, + "loss": 0.8562, + "step": 11478 + }, + { + "epoch": 0.5725187032418952, + "grad_norm": 1.063867783043318, + "learning_rate": 2.0374508381125373e-05, + "loss": 0.9136, + "step": 11479 + }, + { + "epoch": 0.572568578553616, + "grad_norm": 0.4820100217218608, + "learning_rate": 2.0370539714005966e-05, + "loss": 0.9156, + "step": 11480 + }, + { + "epoch": 0.5726184538653366, + "grad_norm": 0.4066096011167199, + "learning_rate": 2.0366571167690246e-05, + "loss": 0.8217, + "step": 11481 + }, + { + "epoch": 0.5726683291770573, + "grad_norm": 0.49962509167522706, + "learning_rate": 2.0362602742281778e-05, + "loss": 0.8874, + "step": 11482 + }, + { + "epoch": 0.5727182044887781, + "grad_norm": 0.5808167944956201, + "learning_rate": 2.0358634437884112e-05, + "loss": 0.9092, + "step": 11483 + }, + { + "epoch": 0.5727680798004987, + "grad_norm": 0.44802734145144535, + "learning_rate": 2.0354666254600818e-05, + "loss": 0.9094, + "step": 11484 + }, + { + "epoch": 0.5728179551122194, + "grad_norm": 0.4159647550554826, + "learning_rate": 2.0350698192535416e-05, + "loss": 0.877, + "step": 11485 + }, + { + "epoch": 0.5728678304239402, + "grad_norm": 0.5864886339342487, + "learning_rate": 2.0346730251791462e-05, + "loss": 0.8933, + "step": 11486 + }, + { + "epoch": 0.5729177057356608, + "grad_norm": 0.4366684791822858, + "learning_rate": 2.0342762432472502e-05, + "loss": 0.8801, + "step": 11487 + }, + { + "epoch": 0.5729675810473815, + "grad_norm": 0.5012045163474683, + "learning_rate": 2.0338794734682083e-05, + "loss": 0.8827, + "step": 11488 + }, + { + "epoch": 0.5730174563591023, + "grad_norm": 0.47815546073578474, + "learning_rate": 2.033482715852372e-05, + "loss": 0.8733, + "step": 11489 + }, + { + "epoch": 0.5730673316708229, + "grad_norm": 0.3917390442534476, + "learning_rate": 2.0330859704100952e-05, + "loss": 0.8509, + "step": 11490 + }, + { + "epoch": 0.5731172069825436, + "grad_norm": 0.6470572744809143, + "learning_rate": 2.0326892371517323e-05, + "loss": 0.883, + "step": 11491 + }, + { + "epoch": 0.5731670822942644, + "grad_norm": 0.46122861259255915, + "learning_rate": 2.0322925160876334e-05, + "loss": 0.8605, + "step": 11492 + }, + { + "epoch": 0.573216957605985, + "grad_norm": 0.4762576758728051, + "learning_rate": 2.0318958072281517e-05, + "loss": 0.8832, + "step": 11493 + }, + { + "epoch": 0.5732668329177057, + "grad_norm": 0.47051566787659627, + "learning_rate": 2.03149911058364e-05, + "loss": 0.8918, + "step": 11494 + }, + { + "epoch": 0.5733167082294265, + "grad_norm": 0.544283751517124, + "learning_rate": 2.03110242616445e-05, + "loss": 0.8926, + "step": 11495 + }, + { + "epoch": 0.5733665835411471, + "grad_norm": 0.4616685871793679, + "learning_rate": 2.0307057539809316e-05, + "loss": 0.9, + "step": 11496 + }, + { + "epoch": 0.5734164588528678, + "grad_norm": 0.5633609460521877, + "learning_rate": 2.030309094043436e-05, + "loss": 0.8938, + "step": 11497 + }, + { + "epoch": 0.5734663341645885, + "grad_norm": 0.459696662329131, + "learning_rate": 2.0299124463623147e-05, + "loss": 0.9133, + "step": 11498 + }, + { + "epoch": 0.5735162094763092, + "grad_norm": 0.7630199697666902, + "learning_rate": 2.0295158109479188e-05, + "loss": 0.8717, + "step": 11499 + }, + { + "epoch": 0.57356608478803, + "grad_norm": 1.0016366717214458, + "learning_rate": 2.0291191878105963e-05, + "loss": 0.8662, + "step": 11500 + }, + { + "epoch": 0.5736159600997506, + "grad_norm": 0.43628798875630165, + "learning_rate": 2.0287225769606976e-05, + "loss": 0.9106, + "step": 11501 + }, + { + "epoch": 0.5736658354114713, + "grad_norm": 0.42995162607807735, + "learning_rate": 2.028325978408573e-05, + "loss": 0.8655, + "step": 11502 + }, + { + "epoch": 0.573715710723192, + "grad_norm": 0.40214423209123673, + "learning_rate": 2.0279293921645703e-05, + "loss": 0.8052, + "step": 11503 + }, + { + "epoch": 0.5737655860349127, + "grad_norm": 0.48248667336026063, + "learning_rate": 2.0275328182390397e-05, + "loss": 0.8564, + "step": 11504 + }, + { + "epoch": 0.5738154613466334, + "grad_norm": 0.4349002444168776, + "learning_rate": 2.027136256642328e-05, + "loss": 0.9173, + "step": 11505 + }, + { + "epoch": 0.5738653366583542, + "grad_norm": 0.5515790551313126, + "learning_rate": 2.0267397073847843e-05, + "loss": 0.8572, + "step": 11506 + }, + { + "epoch": 0.5739152119700748, + "grad_norm": 0.45701973221829323, + "learning_rate": 2.026343170476756e-05, + "loss": 0.9156, + "step": 11507 + }, + { + "epoch": 0.5739650872817955, + "grad_norm": 0.3890237904996582, + "learning_rate": 2.0259466459285904e-05, + "loss": 0.8908, + "step": 11508 + }, + { + "epoch": 0.5740149625935163, + "grad_norm": 0.37992016741908313, + "learning_rate": 2.025550133750636e-05, + "loss": 0.8585, + "step": 11509 + }, + { + "epoch": 0.5740648379052369, + "grad_norm": 0.5035028341063252, + "learning_rate": 2.025153633953238e-05, + "loss": 0.8644, + "step": 11510 + }, + { + "epoch": 0.5741147132169576, + "grad_norm": 0.47876449944153276, + "learning_rate": 2.0247571465467427e-05, + "loss": 0.9076, + "step": 11511 + }, + { + "epoch": 0.5741645885286784, + "grad_norm": 0.400306440973783, + "learning_rate": 2.024360671541497e-05, + "loss": 0.8653, + "step": 11512 + }, + { + "epoch": 0.574214463840399, + "grad_norm": 0.4562003702082097, + "learning_rate": 2.0239642089478484e-05, + "loss": 0.8815, + "step": 11513 + }, + { + "epoch": 0.5742643391521197, + "grad_norm": 0.41661265537430725, + "learning_rate": 2.0235677587761394e-05, + "loss": 0.8511, + "step": 11514 + }, + { + "epoch": 0.5743142144638403, + "grad_norm": 0.44221734778406774, + "learning_rate": 2.0231713210367162e-05, + "loss": 0.8555, + "step": 11515 + }, + { + "epoch": 0.5743640897755611, + "grad_norm": 0.40495429011038947, + "learning_rate": 2.0227748957399244e-05, + "loss": 0.8851, + "step": 11516 + }, + { + "epoch": 0.5744139650872818, + "grad_norm": 0.4712011573950855, + "learning_rate": 2.022378482896109e-05, + "loss": 0.8458, + "step": 11517 + }, + { + "epoch": 0.5744638403990024, + "grad_norm": 0.4238888429880776, + "learning_rate": 2.0219820825156123e-05, + "loss": 0.9251, + "step": 11518 + }, + { + "epoch": 0.5745137157107232, + "grad_norm": 0.3856358446138052, + "learning_rate": 2.021585694608779e-05, + "loss": 0.8849, + "step": 11519 + }, + { + "epoch": 0.5745635910224439, + "grad_norm": 0.4719157145827054, + "learning_rate": 2.021189319185953e-05, + "loss": 0.8867, + "step": 11520 + }, + { + "epoch": 0.5746134663341645, + "grad_norm": 0.4329553049062798, + "learning_rate": 2.0207929562574783e-05, + "loss": 0.8702, + "step": 11521 + }, + { + "epoch": 0.5746633416458853, + "grad_norm": 0.43714544001256117, + "learning_rate": 2.0203966058336964e-05, + "loss": 0.8798, + "step": 11522 + }, + { + "epoch": 0.574713216957606, + "grad_norm": 0.4261896594124851, + "learning_rate": 2.02000026792495e-05, + "loss": 0.8634, + "step": 11523 + }, + { + "epoch": 0.5747630922693266, + "grad_norm": 0.4735035111665945, + "learning_rate": 2.019603942541582e-05, + "loss": 0.8761, + "step": 11524 + }, + { + "epoch": 0.5748129675810474, + "grad_norm": 0.4555189363728464, + "learning_rate": 2.019207629693935e-05, + "loss": 0.897, + "step": 11525 + }, + { + "epoch": 0.5748628428927681, + "grad_norm": 0.5036580217749538, + "learning_rate": 2.018811329392349e-05, + "loss": 0.8949, + "step": 11526 + }, + { + "epoch": 0.5749127182044887, + "grad_norm": 0.9282781542853856, + "learning_rate": 2.0184150416471655e-05, + "loss": 0.8995, + "step": 11527 + }, + { + "epoch": 0.5749625935162095, + "grad_norm": 0.5258092916925836, + "learning_rate": 2.0180187664687274e-05, + "loss": 0.9082, + "step": 11528 + }, + { + "epoch": 0.5750124688279302, + "grad_norm": 0.40896437259445595, + "learning_rate": 2.0176225038673725e-05, + "loss": 0.8511, + "step": 11529 + }, + { + "epoch": 0.5750623441396508, + "grad_norm": 0.4841208586949377, + "learning_rate": 2.017226253853443e-05, + "loss": 0.8598, + "step": 11530 + }, + { + "epoch": 0.5751122194513716, + "grad_norm": 0.6434338753999926, + "learning_rate": 2.016830016437278e-05, + "loss": 0.8727, + "step": 11531 + }, + { + "epoch": 0.5751620947630923, + "grad_norm": 0.4230279252286682, + "learning_rate": 2.0164337916292186e-05, + "loss": 0.8678, + "step": 11532 + }, + { + "epoch": 0.575211970074813, + "grad_norm": 0.483398500328891, + "learning_rate": 2.016037579439602e-05, + "loss": 0.876, + "step": 11533 + }, + { + "epoch": 0.5752618453865337, + "grad_norm": 0.4220516874355344, + "learning_rate": 2.015641379878768e-05, + "loss": 0.9052, + "step": 11534 + }, + { + "epoch": 0.5753117206982543, + "grad_norm": 0.38988967880243547, + "learning_rate": 2.0152451929570555e-05, + "loss": 0.8454, + "step": 11535 + }, + { + "epoch": 0.575361596009975, + "grad_norm": 0.44429708495483317, + "learning_rate": 2.0148490186848036e-05, + "loss": 0.8951, + "step": 11536 + }, + { + "epoch": 0.5754114713216958, + "grad_norm": 0.4162280016171637, + "learning_rate": 2.0144528570723488e-05, + "loss": 0.886, + "step": 11537 + }, + { + "epoch": 0.5754613466334164, + "grad_norm": 0.37754797144146646, + "learning_rate": 2.0140567081300294e-05, + "loss": 0.8522, + "step": 11538 + }, + { + "epoch": 0.5755112219451372, + "grad_norm": 0.41803044948193524, + "learning_rate": 2.0136605718681828e-05, + "loss": 0.8257, + "step": 11539 + }, + { + "epoch": 0.5755610972568579, + "grad_norm": 0.432091619916537, + "learning_rate": 2.013264448297146e-05, + "loss": 0.8661, + "step": 11540 + }, + { + "epoch": 0.5756109725685785, + "grad_norm": 0.40862189040043323, + "learning_rate": 2.0128683374272556e-05, + "loss": 0.9087, + "step": 11541 + }, + { + "epoch": 0.5756608478802993, + "grad_norm": 0.41392168093019693, + "learning_rate": 2.0124722392688473e-05, + "loss": 0.8693, + "step": 11542 + }, + { + "epoch": 0.57571072319202, + "grad_norm": 0.5027156913952956, + "learning_rate": 2.0120761538322586e-05, + "loss": 0.8795, + "step": 11543 + }, + { + "epoch": 0.5757605985037406, + "grad_norm": 0.4627527348875039, + "learning_rate": 2.0116800811278237e-05, + "loss": 0.8823, + "step": 11544 + }, + { + "epoch": 0.5758104738154614, + "grad_norm": 0.40063834572162776, + "learning_rate": 2.011284021165879e-05, + "loss": 0.9012, + "step": 11545 + }, + { + "epoch": 0.5758603491271821, + "grad_norm": 0.3921213375398519, + "learning_rate": 2.0108879739567586e-05, + "loss": 0.8519, + "step": 11546 + }, + { + "epoch": 0.5759102244389027, + "grad_norm": 0.3991980027537865, + "learning_rate": 2.010491939510798e-05, + "loss": 0.8568, + "step": 11547 + }, + { + "epoch": 0.5759600997506235, + "grad_norm": 0.40734299268101354, + "learning_rate": 2.0100959178383306e-05, + "loss": 0.8492, + "step": 11548 + }, + { + "epoch": 0.5760099750623442, + "grad_norm": 0.49100061020077007, + "learning_rate": 2.0096999089496913e-05, + "loss": 0.8765, + "step": 11549 + }, + { + "epoch": 0.5760598503740648, + "grad_norm": 0.39587861399890795, + "learning_rate": 2.0093039128552143e-05, + "loss": 0.8952, + "step": 11550 + }, + { + "epoch": 0.5761097256857856, + "grad_norm": 0.42657576952156506, + "learning_rate": 2.0089079295652306e-05, + "loss": 0.8641, + "step": 11551 + }, + { + "epoch": 0.5761596009975062, + "grad_norm": 0.3836032221638255, + "learning_rate": 2.0085119590900755e-05, + "loss": 0.8711, + "step": 11552 + }, + { + "epoch": 0.5762094763092269, + "grad_norm": 0.42935360781606385, + "learning_rate": 2.0081160014400804e-05, + "loss": 0.876, + "step": 11553 + }, + { + "epoch": 0.5762593516209477, + "grad_norm": 0.44968255359023995, + "learning_rate": 2.0077200566255794e-05, + "loss": 0.9223, + "step": 11554 + }, + { + "epoch": 0.5763092269326683, + "grad_norm": 0.41789288436655003, + "learning_rate": 2.007324124656902e-05, + "loss": 0.9051, + "step": 11555 + }, + { + "epoch": 0.576359102244389, + "grad_norm": 0.4116127539317934, + "learning_rate": 2.006928205544381e-05, + "loss": 0.884, + "step": 11556 + }, + { + "epoch": 0.5764089775561098, + "grad_norm": 0.4839087762988056, + "learning_rate": 2.006532299298348e-05, + "loss": 0.8831, + "step": 11557 + }, + { + "epoch": 0.5764588528678304, + "grad_norm": 0.46393745808248177, + "learning_rate": 2.0061364059291346e-05, + "loss": 0.8685, + "step": 11558 + }, + { + "epoch": 0.5765087281795511, + "grad_norm": 0.45567466810283064, + "learning_rate": 2.00574052544707e-05, + "loss": 0.9137, + "step": 11559 + }, + { + "epoch": 0.5765586034912719, + "grad_norm": 0.41038162670778194, + "learning_rate": 2.0053446578624845e-05, + "loss": 0.8547, + "step": 11560 + }, + { + "epoch": 0.5766084788029925, + "grad_norm": 0.45727713103120854, + "learning_rate": 2.0049488031857105e-05, + "loss": 0.9178, + "step": 11561 + }, + { + "epoch": 0.5766583541147132, + "grad_norm": 0.5328776915789216, + "learning_rate": 2.004552961427075e-05, + "loss": 0.8333, + "step": 11562 + }, + { + "epoch": 0.576708229426434, + "grad_norm": 0.40607379796790477, + "learning_rate": 2.0041571325969075e-05, + "loss": 0.8728, + "step": 11563 + }, + { + "epoch": 0.5767581047381546, + "grad_norm": 0.44622721807455246, + "learning_rate": 2.003761316705538e-05, + "loss": 0.8923, + "step": 11564 + }, + { + "epoch": 0.5768079800498753, + "grad_norm": 0.3896711612816188, + "learning_rate": 2.0033655137632962e-05, + "loss": 0.8727, + "step": 11565 + }, + { + "epoch": 0.5768578553615961, + "grad_norm": 0.3723538973419154, + "learning_rate": 2.002969723780508e-05, + "loss": 0.8433, + "step": 11566 + }, + { + "epoch": 0.5769077306733167, + "grad_norm": 0.3559368339069617, + "learning_rate": 2.002573946767502e-05, + "loss": 0.8403, + "step": 11567 + }, + { + "epoch": 0.5769576059850374, + "grad_norm": 0.4945986228717497, + "learning_rate": 2.0021781827346065e-05, + "loss": 0.9145, + "step": 11568 + }, + { + "epoch": 0.577007481296758, + "grad_norm": 0.3905737859462751, + "learning_rate": 2.0017824316921496e-05, + "loss": 0.8887, + "step": 11569 + }, + { + "epoch": 0.5770573566084788, + "grad_norm": 0.4561825749768995, + "learning_rate": 2.0013866936504562e-05, + "loss": 0.8617, + "step": 11570 + }, + { + "epoch": 0.5771072319201995, + "grad_norm": 0.6211965206620517, + "learning_rate": 2.0009909686198535e-05, + "loss": 0.8604, + "step": 11571 + }, + { + "epoch": 0.5771571072319202, + "grad_norm": 0.43862715061993585, + "learning_rate": 2.0005952566106684e-05, + "loss": 0.8426, + "step": 11572 + }, + { + "epoch": 0.5772069825436409, + "grad_norm": 0.468371381179225, + "learning_rate": 2.0001995576332275e-05, + "loss": 0.8684, + "step": 11573 + }, + { + "epoch": 0.5772568578553616, + "grad_norm": 0.4547189965697638, + "learning_rate": 1.9998038716978544e-05, + "loss": 0.8962, + "step": 11574 + }, + { + "epoch": 0.5773067331670823, + "grad_norm": 0.47979876161878665, + "learning_rate": 1.999408198814876e-05, + "loss": 0.8711, + "step": 11575 + }, + { + "epoch": 0.577356608478803, + "grad_norm": 0.48611678020058763, + "learning_rate": 1.999012538994616e-05, + "loss": 0.8801, + "step": 11576 + }, + { + "epoch": 0.5774064837905237, + "grad_norm": 0.5327674058928559, + "learning_rate": 1.998616892247401e-05, + "loss": 0.8775, + "step": 11577 + }, + { + "epoch": 0.5774563591022444, + "grad_norm": 0.5367906783503877, + "learning_rate": 1.998221258583552e-05, + "loss": 0.8476, + "step": 11578 + }, + { + "epoch": 0.5775062344139651, + "grad_norm": 0.3871015203134116, + "learning_rate": 1.9978256380133954e-05, + "loss": 0.8263, + "step": 11579 + }, + { + "epoch": 0.5775561097256858, + "grad_norm": 0.3895520902821146, + "learning_rate": 1.9974300305472542e-05, + "loss": 0.8617, + "step": 11580 + }, + { + "epoch": 0.5776059850374065, + "grad_norm": 0.46504531625061785, + "learning_rate": 1.9970344361954513e-05, + "loss": 0.8784, + "step": 11581 + }, + { + "epoch": 0.5776558603491272, + "grad_norm": 0.5523833767606773, + "learning_rate": 1.9966388549683098e-05, + "loss": 0.906, + "step": 11582 + }, + { + "epoch": 0.5777057356608479, + "grad_norm": 0.44299751964213346, + "learning_rate": 1.996243286876152e-05, + "loss": 0.9025, + "step": 11583 + }, + { + "epoch": 0.5777556109725686, + "grad_norm": 0.4692030429096632, + "learning_rate": 1.9958477319293002e-05, + "loss": 0.881, + "step": 11584 + }, + { + "epoch": 0.5778054862842893, + "grad_norm": 0.5017360217730301, + "learning_rate": 1.9954521901380764e-05, + "loss": 0.8877, + "step": 11585 + }, + { + "epoch": 0.5778553615960099, + "grad_norm": 0.4193977849237591, + "learning_rate": 1.9950566615128014e-05, + "loss": 0.8963, + "step": 11586 + }, + { + "epoch": 0.5779052369077307, + "grad_norm": 0.4429934520157245, + "learning_rate": 1.994661146063798e-05, + "loss": 0.854, + "step": 11587 + }, + { + "epoch": 0.5779551122194514, + "grad_norm": 0.43513195494429313, + "learning_rate": 1.9942656438013847e-05, + "loss": 0.8465, + "step": 11588 + }, + { + "epoch": 0.578004987531172, + "grad_norm": 0.47568313606137613, + "learning_rate": 1.993870154735883e-05, + "loss": 0.8572, + "step": 11589 + }, + { + "epoch": 0.5780548628428928, + "grad_norm": 0.37125492401407956, + "learning_rate": 1.9934746788776132e-05, + "loss": 0.877, + "step": 11590 + }, + { + "epoch": 0.5781047381546135, + "grad_norm": 0.3878626666549692, + "learning_rate": 1.9930792162368962e-05, + "loss": 0.871, + "step": 11591 + }, + { + "epoch": 0.5781546134663341, + "grad_norm": 0.4699375039591872, + "learning_rate": 1.9926837668240486e-05, + "loss": 0.8685, + "step": 11592 + }, + { + "epoch": 0.5782044887780549, + "grad_norm": 0.45839306639521976, + "learning_rate": 1.9922883306493917e-05, + "loss": 0.8953, + "step": 11593 + }, + { + "epoch": 0.5782543640897756, + "grad_norm": 0.4517752464374677, + "learning_rate": 1.9918929077232433e-05, + "loss": 0.8757, + "step": 11594 + }, + { + "epoch": 0.5783042394014962, + "grad_norm": 0.4487110798821908, + "learning_rate": 1.991497498055923e-05, + "loss": 0.8318, + "step": 11595 + }, + { + "epoch": 0.578354114713217, + "grad_norm": 0.5072932784194922, + "learning_rate": 1.991102101657747e-05, + "loss": 0.8749, + "step": 11596 + }, + { + "epoch": 0.5784039900249377, + "grad_norm": 0.45821318518161075, + "learning_rate": 1.990706718539034e-05, + "loss": 0.8935, + "step": 11597 + }, + { + "epoch": 0.5784538653366583, + "grad_norm": 0.43920104783718195, + "learning_rate": 1.9903113487101023e-05, + "loss": 0.8638, + "step": 11598 + }, + { + "epoch": 0.5785037406483791, + "grad_norm": 0.4398231254446659, + "learning_rate": 1.9899159921812667e-05, + "loss": 0.8481, + "step": 11599 + }, + { + "epoch": 0.5785536159600998, + "grad_norm": 0.4959025501845502, + "learning_rate": 1.9895206489628447e-05, + "loss": 0.8344, + "step": 11600 + }, + { + "epoch": 0.5786034912718204, + "grad_norm": 0.4892852900833759, + "learning_rate": 1.9891253190651533e-05, + "loss": 0.8418, + "step": 11601 + }, + { + "epoch": 0.5786533665835412, + "grad_norm": 0.4156483692036648, + "learning_rate": 1.988730002498509e-05, + "loss": 0.8313, + "step": 11602 + }, + { + "epoch": 0.5787032418952619, + "grad_norm": 0.4284712463760773, + "learning_rate": 1.9883346992732256e-05, + "loss": 0.8446, + "step": 11603 + }, + { + "epoch": 0.5787531172069825, + "grad_norm": 0.36286074882734387, + "learning_rate": 1.987939409399619e-05, + "loss": 0.8418, + "step": 11604 + }, + { + "epoch": 0.5788029925187033, + "grad_norm": 0.4151134287016578, + "learning_rate": 1.9875441328880042e-05, + "loss": 0.8843, + "step": 11605 + }, + { + "epoch": 0.5788528678304239, + "grad_norm": 0.4953714325158618, + "learning_rate": 1.987148869748697e-05, + "loss": 0.8462, + "step": 11606 + }, + { + "epoch": 0.5789027431421446, + "grad_norm": 0.38876550791212344, + "learning_rate": 1.9867536199920102e-05, + "loss": 0.8492, + "step": 11607 + }, + { + "epoch": 0.5789526184538654, + "grad_norm": 0.4351325716115339, + "learning_rate": 1.9863583836282572e-05, + "loss": 0.8308, + "step": 11608 + }, + { + "epoch": 0.579002493765586, + "grad_norm": 0.6075184714698502, + "learning_rate": 1.9859631606677537e-05, + "loss": 0.8671, + "step": 11609 + }, + { + "epoch": 0.5790523690773067, + "grad_norm": 0.4057487530740786, + "learning_rate": 1.9855679511208104e-05, + "loss": 0.8597, + "step": 11610 + }, + { + "epoch": 0.5791022443890275, + "grad_norm": 0.431948088345792, + "learning_rate": 1.9851727549977407e-05, + "loss": 0.8984, + "step": 11611 + }, + { + "epoch": 0.5791521197007481, + "grad_norm": 0.5029785934326116, + "learning_rate": 1.984777572308858e-05, + "loss": 0.8734, + "step": 11612 + }, + { + "epoch": 0.5792019950124688, + "grad_norm": 0.44956628381356584, + "learning_rate": 1.9843824030644744e-05, + "loss": 0.9173, + "step": 11613 + }, + { + "epoch": 0.5792518703241896, + "grad_norm": 0.429624215690479, + "learning_rate": 1.9839872472749013e-05, + "loss": 0.9043, + "step": 11614 + }, + { + "epoch": 0.5793017456359102, + "grad_norm": 0.4572037054077754, + "learning_rate": 1.9835921049504493e-05, + "loss": 0.906, + "step": 11615 + }, + { + "epoch": 0.5793516209476309, + "grad_norm": 0.4288317894437949, + "learning_rate": 1.9831969761014306e-05, + "loss": 0.8734, + "step": 11616 + }, + { + "epoch": 0.5794014962593517, + "grad_norm": 0.546710167810897, + "learning_rate": 1.9828018607381556e-05, + "loss": 0.8459, + "step": 11617 + }, + { + "epoch": 0.5794513715710723, + "grad_norm": 0.37051498509183, + "learning_rate": 1.9824067588709344e-05, + "loss": 0.8587, + "step": 11618 + }, + { + "epoch": 0.579501246882793, + "grad_norm": 0.42032193602678736, + "learning_rate": 1.9820116705100777e-05, + "loss": 0.8416, + "step": 11619 + }, + { + "epoch": 0.5795511221945138, + "grad_norm": 0.5526141013332928, + "learning_rate": 1.9816165956658946e-05, + "loss": 0.8641, + "step": 11620 + }, + { + "epoch": 0.5796009975062344, + "grad_norm": 0.43237140851183503, + "learning_rate": 1.9812215343486938e-05, + "loss": 0.8183, + "step": 11621 + }, + { + "epoch": 0.5796508728179551, + "grad_norm": 0.48775584349676726, + "learning_rate": 1.9808264865687848e-05, + "loss": 0.8743, + "step": 11622 + }, + { + "epoch": 0.5797007481296758, + "grad_norm": 0.4055296626084235, + "learning_rate": 1.9804314523364763e-05, + "loss": 0.8911, + "step": 11623 + }, + { + "epoch": 0.5797506234413965, + "grad_norm": 0.44550353140285215, + "learning_rate": 1.980036431662078e-05, + "loss": 0.8527, + "step": 11624 + }, + { + "epoch": 0.5798004987531172, + "grad_norm": 0.44553163185485356, + "learning_rate": 1.979641424555895e-05, + "loss": 0.8443, + "step": 11625 + }, + { + "epoch": 0.5798503740648379, + "grad_norm": 0.45089074054456824, + "learning_rate": 1.979246431028236e-05, + "loss": 0.8332, + "step": 11626 + }, + { + "epoch": 0.5799002493765586, + "grad_norm": 0.43265865028910416, + "learning_rate": 1.9788514510894082e-05, + "loss": 0.8938, + "step": 11627 + }, + { + "epoch": 0.5799501246882793, + "grad_norm": 0.6900006491719801, + "learning_rate": 1.97845648474972e-05, + "loss": 0.8901, + "step": 11628 + }, + { + "epoch": 0.58, + "grad_norm": 0.4606838378218705, + "learning_rate": 1.978061532019475e-05, + "loss": 0.9388, + "step": 11629 + }, + { + "epoch": 0.5800498753117207, + "grad_norm": 0.4398742211468933, + "learning_rate": 1.977666592908981e-05, + "loss": 0.8788, + "step": 11630 + }, + { + "epoch": 0.5800997506234414, + "grad_norm": 0.5169589437072302, + "learning_rate": 1.977271667428543e-05, + "loss": 0.8236, + "step": 11631 + }, + { + "epoch": 0.5801496259351621, + "grad_norm": 0.3926776517426273, + "learning_rate": 1.9768767555884687e-05, + "loss": 0.8736, + "step": 11632 + }, + { + "epoch": 0.5801995012468828, + "grad_norm": 0.470727970823172, + "learning_rate": 1.97648185739906e-05, + "loss": 0.8586, + "step": 11633 + }, + { + "epoch": 0.5802493765586035, + "grad_norm": 0.5572048340331065, + "learning_rate": 1.9760869728706227e-05, + "loss": 0.8674, + "step": 11634 + }, + { + "epoch": 0.5802992518703242, + "grad_norm": 0.5228780313012651, + "learning_rate": 1.975692102013463e-05, + "loss": 0.8607, + "step": 11635 + }, + { + "epoch": 0.5803491271820449, + "grad_norm": 0.4165501258612348, + "learning_rate": 1.9752972448378814e-05, + "loss": 0.898, + "step": 11636 + }, + { + "epoch": 0.5803990024937656, + "grad_norm": 0.4880474618997614, + "learning_rate": 1.974902401354184e-05, + "loss": 0.8723, + "step": 11637 + }, + { + "epoch": 0.5804488778054863, + "grad_norm": 0.48780840861541747, + "learning_rate": 1.974507571572673e-05, + "loss": 0.8513, + "step": 11638 + }, + { + "epoch": 0.580498753117207, + "grad_norm": 0.5315651721023872, + "learning_rate": 1.974112755503653e-05, + "loss": 0.8785, + "step": 11639 + }, + { + "epoch": 0.5805486284289276, + "grad_norm": 0.5870003263902579, + "learning_rate": 1.9737179531574243e-05, + "loss": 0.8793, + "step": 11640 + }, + { + "epoch": 0.5805985037406484, + "grad_norm": 0.46892197887254466, + "learning_rate": 1.97332316454429e-05, + "loss": 0.856, + "step": 11641 + }, + { + "epoch": 0.5806483790523691, + "grad_norm": 0.5791011006353812, + "learning_rate": 1.9729283896745522e-05, + "loss": 0.8878, + "step": 11642 + }, + { + "epoch": 0.5806982543640897, + "grad_norm": 0.45145110287731477, + "learning_rate": 1.9725336285585128e-05, + "loss": 0.8525, + "step": 11643 + }, + { + "epoch": 0.5807481296758105, + "grad_norm": 0.6690835162335604, + "learning_rate": 1.9721388812064714e-05, + "loss": 0.8402, + "step": 11644 + }, + { + "epoch": 0.5807980049875312, + "grad_norm": 0.4649776046683397, + "learning_rate": 1.9717441476287298e-05, + "loss": 0.8689, + "step": 11645 + }, + { + "epoch": 0.5808478802992518, + "grad_norm": 0.726341856364906, + "learning_rate": 1.9713494278355894e-05, + "loss": 0.9189, + "step": 11646 + }, + { + "epoch": 0.5808977556109726, + "grad_norm": 0.669921944285775, + "learning_rate": 1.970954721837348e-05, + "loss": 0.883, + "step": 11647 + }, + { + "epoch": 0.5809476309226933, + "grad_norm": 0.400727808764646, + "learning_rate": 1.970560029644306e-05, + "loss": 0.8871, + "step": 11648 + }, + { + "epoch": 0.5809975062344139, + "grad_norm": 0.45750794266716716, + "learning_rate": 1.9701653512667633e-05, + "loss": 0.8723, + "step": 11649 + }, + { + "epoch": 0.5810473815461347, + "grad_norm": 0.7432452716443879, + "learning_rate": 1.9697706867150192e-05, + "loss": 0.8253, + "step": 11650 + }, + { + "epoch": 0.5810972568578554, + "grad_norm": 0.48834301934008484, + "learning_rate": 1.9693760359993717e-05, + "loss": 0.8458, + "step": 11651 + }, + { + "epoch": 0.581147132169576, + "grad_norm": 0.37442184071306917, + "learning_rate": 1.9689813991301186e-05, + "loss": 0.8771, + "step": 11652 + }, + { + "epoch": 0.5811970074812968, + "grad_norm": 0.3913249387119587, + "learning_rate": 1.9685867761175584e-05, + "loss": 0.8636, + "step": 11653 + }, + { + "epoch": 0.5812468827930175, + "grad_norm": 0.8790757469234076, + "learning_rate": 1.9681921669719887e-05, + "loss": 0.872, + "step": 11654 + }, + { + "epoch": 0.5812967581047381, + "grad_norm": 0.6418112236921967, + "learning_rate": 1.9677975717037058e-05, + "loss": 0.8785, + "step": 11655 + }, + { + "epoch": 0.5813466334164589, + "grad_norm": 0.5343495444240886, + "learning_rate": 1.9674029903230078e-05, + "loss": 0.8626, + "step": 11656 + }, + { + "epoch": 0.5813965087281796, + "grad_norm": 0.4261105354877173, + "learning_rate": 1.9670084228401905e-05, + "loss": 0.8415, + "step": 11657 + }, + { + "epoch": 0.5814463840399002, + "grad_norm": 0.45560896765398495, + "learning_rate": 1.9666138692655492e-05, + "loss": 0.8529, + "step": 11658 + }, + { + "epoch": 0.581496259351621, + "grad_norm": 0.432420005675588, + "learning_rate": 1.9662193296093804e-05, + "loss": 0.8454, + "step": 11659 + }, + { + "epoch": 0.5815461346633416, + "grad_norm": 0.4375722074455523, + "learning_rate": 1.9658248038819797e-05, + "loss": 0.9065, + "step": 11660 + }, + { + "epoch": 0.5815960099750623, + "grad_norm": 0.418949259950134, + "learning_rate": 1.9654302920936414e-05, + "loss": 0.8746, + "step": 11661 + }, + { + "epoch": 0.5816458852867831, + "grad_norm": 0.43477981094117935, + "learning_rate": 1.9650357942546605e-05, + "loss": 0.8857, + "step": 11662 + }, + { + "epoch": 0.5816957605985037, + "grad_norm": 0.38380270599334665, + "learning_rate": 1.9646413103753304e-05, + "loss": 0.821, + "step": 11663 + }, + { + "epoch": 0.5817456359102244, + "grad_norm": 0.4499783226553058, + "learning_rate": 1.9642468404659463e-05, + "loss": 0.8951, + "step": 11664 + }, + { + "epoch": 0.5817955112219452, + "grad_norm": 0.607166856058815, + "learning_rate": 1.963852384536802e-05, + "loss": 0.8302, + "step": 11665 + }, + { + "epoch": 0.5818453865336658, + "grad_norm": 0.4330183275393477, + "learning_rate": 1.9634579425981888e-05, + "loss": 0.9024, + "step": 11666 + }, + { + "epoch": 0.5818952618453865, + "grad_norm": 0.4357185910949232, + "learning_rate": 1.9630635146604002e-05, + "loss": 0.8862, + "step": 11667 + }, + { + "epoch": 0.5819451371571073, + "grad_norm": 0.49443958214198996, + "learning_rate": 1.96266910073373e-05, + "loss": 0.8985, + "step": 11668 + }, + { + "epoch": 0.5819950124688279, + "grad_norm": 0.4753330283188527, + "learning_rate": 1.9622747008284683e-05, + "loss": 0.9074, + "step": 11669 + }, + { + "epoch": 0.5820448877805486, + "grad_norm": 0.42975320317505156, + "learning_rate": 1.9618803149549075e-05, + "loss": 0.8561, + "step": 11670 + }, + { + "epoch": 0.5820947630922694, + "grad_norm": 0.4703663343929632, + "learning_rate": 1.961485943123339e-05, + "loss": 0.8669, + "step": 11671 + }, + { + "epoch": 0.58214463840399, + "grad_norm": 0.37381263308568774, + "learning_rate": 1.9610915853440548e-05, + "loss": 0.8341, + "step": 11672 + }, + { + "epoch": 0.5821945137157107, + "grad_norm": 0.554603999883048, + "learning_rate": 1.9606972416273438e-05, + "loss": 0.9115, + "step": 11673 + }, + { + "epoch": 0.5822443890274315, + "grad_norm": 0.4715764822688099, + "learning_rate": 1.9603029119834963e-05, + "loss": 0.9083, + "step": 11674 + }, + { + "epoch": 0.5822942643391521, + "grad_norm": 0.5043376247905068, + "learning_rate": 1.9599085964228035e-05, + "loss": 0.8862, + "step": 11675 + }, + { + "epoch": 0.5823441396508728, + "grad_norm": 0.5032313446699126, + "learning_rate": 1.9595142949555548e-05, + "loss": 0.8936, + "step": 11676 + }, + { + "epoch": 0.5823940149625935, + "grad_norm": 0.47043248169159835, + "learning_rate": 1.9591200075920376e-05, + "loss": 0.811, + "step": 11677 + }, + { + "epoch": 0.5824438902743142, + "grad_norm": 0.5149773463482726, + "learning_rate": 1.958725734342542e-05, + "loss": 0.8608, + "step": 11678 + }, + { + "epoch": 0.5824937655860349, + "grad_norm": 0.45437448426538274, + "learning_rate": 1.958331475217357e-05, + "loss": 0.8482, + "step": 11679 + }, + { + "epoch": 0.5825436408977556, + "grad_norm": 0.41311512557460134, + "learning_rate": 1.9579372302267686e-05, + "loss": 0.7922, + "step": 11680 + }, + { + "epoch": 0.5825935162094763, + "grad_norm": 0.4705999515925824, + "learning_rate": 1.9575429993810657e-05, + "loss": 0.9127, + "step": 11681 + }, + { + "epoch": 0.582643391521197, + "grad_norm": 0.4366779653442631, + "learning_rate": 1.9571487826905356e-05, + "loss": 0.9101, + "step": 11682 + }, + { + "epoch": 0.5826932668329177, + "grad_norm": 0.4431825386161064, + "learning_rate": 1.9567545801654656e-05, + "loss": 0.8792, + "step": 11683 + }, + { + "epoch": 0.5827431421446384, + "grad_norm": 0.48495756794350553, + "learning_rate": 1.956360391816141e-05, + "loss": 0.9262, + "step": 11684 + }, + { + "epoch": 0.5827930174563591, + "grad_norm": 0.3955163555455517, + "learning_rate": 1.9559662176528488e-05, + "loss": 0.8778, + "step": 11685 + }, + { + "epoch": 0.5828428927680798, + "grad_norm": 0.4316586186452456, + "learning_rate": 1.9555720576858746e-05, + "loss": 0.8951, + "step": 11686 + }, + { + "epoch": 0.5828927680798005, + "grad_norm": 0.4111539728126736, + "learning_rate": 1.9551779119255043e-05, + "loss": 0.8576, + "step": 11687 + }, + { + "epoch": 0.5829426433915212, + "grad_norm": 0.42798827634229786, + "learning_rate": 1.9547837803820227e-05, + "loss": 0.8517, + "step": 11688 + }, + { + "epoch": 0.5829925187032419, + "grad_norm": 0.4698695721483152, + "learning_rate": 1.9543896630657138e-05, + "loss": 0.879, + "step": 11689 + }, + { + "epoch": 0.5830423940149626, + "grad_norm": 0.4796381595621088, + "learning_rate": 1.9539955599868624e-05, + "loss": 0.8622, + "step": 11690 + }, + { + "epoch": 0.5830922693266833, + "grad_norm": 0.4893768689968254, + "learning_rate": 1.9536014711557528e-05, + "loss": 0.8828, + "step": 11691 + }, + { + "epoch": 0.583142144638404, + "grad_norm": 0.4051430258185815, + "learning_rate": 1.9532073965826687e-05, + "loss": 0.9002, + "step": 11692 + }, + { + "epoch": 0.5831920199501247, + "grad_norm": 0.6712872710583363, + "learning_rate": 1.9528133362778923e-05, + "loss": 0.9093, + "step": 11693 + }, + { + "epoch": 0.5832418952618453, + "grad_norm": 0.39651161090138454, + "learning_rate": 1.9524192902517075e-05, + "loss": 0.8622, + "step": 11694 + }, + { + "epoch": 0.5832917705735661, + "grad_norm": 0.8119406386643456, + "learning_rate": 1.9520252585143955e-05, + "loss": 0.8902, + "step": 11695 + }, + { + "epoch": 0.5833416458852868, + "grad_norm": 0.4194328230867895, + "learning_rate": 1.951631241076239e-05, + "loss": 0.848, + "step": 11696 + }, + { + "epoch": 0.5833915211970074, + "grad_norm": 0.4436052807227893, + "learning_rate": 1.951237237947521e-05, + "loss": 0.8628, + "step": 11697 + }, + { + "epoch": 0.5834413965087282, + "grad_norm": 0.3812611601337896, + "learning_rate": 1.950843249138521e-05, + "loss": 0.879, + "step": 11698 + }, + { + "epoch": 0.5834912718204489, + "grad_norm": 0.4382655497394238, + "learning_rate": 1.9504492746595203e-05, + "loss": 0.8592, + "step": 11699 + }, + { + "epoch": 0.5835411471321695, + "grad_norm": 0.5943257361024712, + "learning_rate": 1.9500553145208e-05, + "loss": 0.8836, + "step": 11700 + }, + { + "epoch": 0.5835910224438903, + "grad_norm": 0.5261312858273657, + "learning_rate": 1.9496613687326404e-05, + "loss": 0.876, + "step": 11701 + }, + { + "epoch": 0.583640897755611, + "grad_norm": 0.4706043085791252, + "learning_rate": 1.9492674373053215e-05, + "loss": 0.8532, + "step": 11702 + }, + { + "epoch": 0.5836907730673316, + "grad_norm": 0.38163962229010656, + "learning_rate": 1.9488735202491216e-05, + "loss": 0.8675, + "step": 11703 + }, + { + "epoch": 0.5837406483790524, + "grad_norm": 0.41183026176094173, + "learning_rate": 1.9484796175743208e-05, + "loss": 0.8136, + "step": 11704 + }, + { + "epoch": 0.5837905236907731, + "grad_norm": 0.4821765826131568, + "learning_rate": 1.9480857292911987e-05, + "loss": 0.9139, + "step": 11705 + }, + { + "epoch": 0.5838403990024937, + "grad_norm": 0.6371351558167301, + "learning_rate": 1.947691855410031e-05, + "loss": 0.8781, + "step": 11706 + }, + { + "epoch": 0.5838902743142145, + "grad_norm": 0.4433037885014219, + "learning_rate": 1.947297995941097e-05, + "loss": 0.894, + "step": 11707 + }, + { + "epoch": 0.5839401496259352, + "grad_norm": 0.4962929709962478, + "learning_rate": 1.9469041508946748e-05, + "loss": 0.8266, + "step": 11708 + }, + { + "epoch": 0.5839900249376558, + "grad_norm": 0.49820807343132695, + "learning_rate": 1.9465103202810424e-05, + "loss": 0.8836, + "step": 11709 + }, + { + "epoch": 0.5840399002493766, + "grad_norm": 0.5200687587840127, + "learning_rate": 1.946116504110474e-05, + "loss": 0.9003, + "step": 11710 + }, + { + "epoch": 0.5840897755610972, + "grad_norm": 0.49255876633587753, + "learning_rate": 1.9457227023932477e-05, + "loss": 0.9148, + "step": 11711 + }, + { + "epoch": 0.5841396508728179, + "grad_norm": 0.43130019137444914, + "learning_rate": 1.9453289151396394e-05, + "loss": 0.8404, + "step": 11712 + }, + { + "epoch": 0.5841895261845387, + "grad_norm": 0.5949946502565366, + "learning_rate": 1.944935142359926e-05, + "loss": 0.9081, + "step": 11713 + }, + { + "epoch": 0.5842394014962593, + "grad_norm": 0.36813374518990577, + "learning_rate": 1.9445413840643806e-05, + "loss": 0.8357, + "step": 11714 + }, + { + "epoch": 0.58428927680798, + "grad_norm": 0.4444953689651318, + "learning_rate": 1.9441476402632793e-05, + "loss": 0.8496, + "step": 11715 + }, + { + "epoch": 0.5843391521197008, + "grad_norm": 0.4410006517521989, + "learning_rate": 1.9437539109668972e-05, + "loss": 0.8579, + "step": 11716 + }, + { + "epoch": 0.5843890274314214, + "grad_norm": 0.5504453839584361, + "learning_rate": 1.9433601961855076e-05, + "loss": 0.9407, + "step": 11717 + }, + { + "epoch": 0.5844389027431421, + "grad_norm": 0.6010415270819025, + "learning_rate": 1.9429664959293835e-05, + "loss": 0.8813, + "step": 11718 + }, + { + "epoch": 0.5844887780548629, + "grad_norm": 0.4786910624148545, + "learning_rate": 1.9425728102088e-05, + "loss": 0.9058, + "step": 11719 + }, + { + "epoch": 0.5845386533665835, + "grad_norm": 0.4627310952685757, + "learning_rate": 1.9421791390340307e-05, + "loss": 0.8821, + "step": 11720 + }, + { + "epoch": 0.5845885286783042, + "grad_norm": 0.4034123948802527, + "learning_rate": 1.941785482415346e-05, + "loss": 0.8452, + "step": 11721 + }, + { + "epoch": 0.584638403990025, + "grad_norm": 0.4282034835839072, + "learning_rate": 1.941391840363019e-05, + "loss": 0.8368, + "step": 11722 + }, + { + "epoch": 0.5846882793017456, + "grad_norm": 0.4172913780498945, + "learning_rate": 1.9409982128873223e-05, + "loss": 0.9132, + "step": 11723 + }, + { + "epoch": 0.5847381546134663, + "grad_norm": 0.5161974889736823, + "learning_rate": 1.9406045999985278e-05, + "loss": 0.8749, + "step": 11724 + }, + { + "epoch": 0.5847880299251871, + "grad_norm": 0.4191648617582921, + "learning_rate": 1.940211001706905e-05, + "loss": 0.8991, + "step": 11725 + }, + { + "epoch": 0.5848379052369077, + "grad_norm": 0.4519104888632225, + "learning_rate": 1.9398174180227255e-05, + "loss": 0.8907, + "step": 11726 + }, + { + "epoch": 0.5848877805486284, + "grad_norm": 0.6883667486787277, + "learning_rate": 1.9394238489562606e-05, + "loss": 0.8428, + "step": 11727 + }, + { + "epoch": 0.5849376558603492, + "grad_norm": 0.4501307277265152, + "learning_rate": 1.9390302945177786e-05, + "loss": 0.8687, + "step": 11728 + }, + { + "epoch": 0.5849875311720698, + "grad_norm": 0.49454196840539744, + "learning_rate": 1.9386367547175506e-05, + "loss": 0.8636, + "step": 11729 + }, + { + "epoch": 0.5850374064837905, + "grad_norm": 0.7576618222155066, + "learning_rate": 1.938243229565845e-05, + "loss": 0.9165, + "step": 11730 + }, + { + "epoch": 0.5850872817955112, + "grad_norm": 0.5508808853002994, + "learning_rate": 1.937849719072931e-05, + "loss": 0.8931, + "step": 11731 + }, + { + "epoch": 0.5851371571072319, + "grad_norm": 0.4533389162834927, + "learning_rate": 1.9374562232490766e-05, + "loss": 0.8104, + "step": 11732 + }, + { + "epoch": 0.5851870324189526, + "grad_norm": 1.0084869849046916, + "learning_rate": 1.9370627421045507e-05, + "loss": 0.9232, + "step": 11733 + }, + { + "epoch": 0.5852369077306733, + "grad_norm": 0.5255434723857614, + "learning_rate": 1.9366692756496208e-05, + "loss": 0.8581, + "step": 11734 + }, + { + "epoch": 0.585286783042394, + "grad_norm": 0.4057323218413892, + "learning_rate": 1.936275823894554e-05, + "loss": 0.8638, + "step": 11735 + }, + { + "epoch": 0.5853366583541147, + "grad_norm": 0.495197616013884, + "learning_rate": 1.9358823868496167e-05, + "loss": 0.8915, + "step": 11736 + }, + { + "epoch": 0.5853865336658354, + "grad_norm": 0.48225165002894865, + "learning_rate": 1.9354889645250763e-05, + "loss": 0.8466, + "step": 11737 + }, + { + "epoch": 0.5854364089775561, + "grad_norm": 0.47961385170580834, + "learning_rate": 1.9350955569311984e-05, + "loss": 0.8882, + "step": 11738 + }, + { + "epoch": 0.5854862842892768, + "grad_norm": 0.4617369868429872, + "learning_rate": 1.9347021640782502e-05, + "loss": 0.9146, + "step": 11739 + }, + { + "epoch": 0.5855361596009975, + "grad_norm": 0.4928846265498879, + "learning_rate": 1.9343087859764953e-05, + "loss": 0.8773, + "step": 11740 + }, + { + "epoch": 0.5855860349127182, + "grad_norm": 0.5040079497996203, + "learning_rate": 1.9339154226361993e-05, + "loss": 0.8958, + "step": 11741 + }, + { + "epoch": 0.585635910224439, + "grad_norm": 0.4518866216782991, + "learning_rate": 1.933522074067628e-05, + "loss": 0.8459, + "step": 11742 + }, + { + "epoch": 0.5856857855361596, + "grad_norm": 0.49066563104458044, + "learning_rate": 1.9331287402810438e-05, + "loss": 0.8712, + "step": 11743 + }, + { + "epoch": 0.5857356608478803, + "grad_norm": 0.7002293269177808, + "learning_rate": 1.9327354212867113e-05, + "loss": 0.8755, + "step": 11744 + }, + { + "epoch": 0.585785536159601, + "grad_norm": 0.5915259321092038, + "learning_rate": 1.932342117094894e-05, + "loss": 0.8702, + "step": 11745 + }, + { + "epoch": 0.5858354114713217, + "grad_norm": 0.6635029601828124, + "learning_rate": 1.9319488277158563e-05, + "loss": 0.8935, + "step": 11746 + }, + { + "epoch": 0.5858852867830424, + "grad_norm": 0.4708973426199514, + "learning_rate": 1.9315555531598588e-05, + "loss": 0.8398, + "step": 11747 + }, + { + "epoch": 0.585935162094763, + "grad_norm": 1.1011181069435836, + "learning_rate": 1.9311622934371643e-05, + "loss": 0.8764, + "step": 11748 + }, + { + "epoch": 0.5859850374064838, + "grad_norm": 0.5040395677829099, + "learning_rate": 1.9307690485580354e-05, + "loss": 0.8703, + "step": 11749 + }, + { + "epoch": 0.5860349127182045, + "grad_norm": 0.5294347199039428, + "learning_rate": 1.9303758185327348e-05, + "loss": 0.8961, + "step": 11750 + }, + { + "epoch": 0.5860847880299251, + "grad_norm": 0.47093679475502714, + "learning_rate": 1.9299826033715207e-05, + "loss": 0.8898, + "step": 11751 + }, + { + "epoch": 0.5861346633416459, + "grad_norm": 0.5069724263864902, + "learning_rate": 1.9295894030846558e-05, + "loss": 0.8759, + "step": 11752 + }, + { + "epoch": 0.5861845386533666, + "grad_norm": 0.42788252686909706, + "learning_rate": 1.929196217682401e-05, + "loss": 0.8904, + "step": 11753 + }, + { + "epoch": 0.5862344139650872, + "grad_norm": 0.5869598775393703, + "learning_rate": 1.9288030471750146e-05, + "loss": 0.9001, + "step": 11754 + }, + { + "epoch": 0.586284289276808, + "grad_norm": 0.39876311080881266, + "learning_rate": 1.928409891572757e-05, + "loss": 0.8997, + "step": 11755 + }, + { + "epoch": 0.5863341645885287, + "grad_norm": 0.5883265992292042, + "learning_rate": 1.928016750885887e-05, + "loss": 0.8738, + "step": 11756 + }, + { + "epoch": 0.5863840399002493, + "grad_norm": 0.4814601690332825, + "learning_rate": 1.9276236251246654e-05, + "loss": 0.8857, + "step": 11757 + }, + { + "epoch": 0.5864339152119701, + "grad_norm": 0.44750334239161693, + "learning_rate": 1.9272305142993482e-05, + "loss": 0.8943, + "step": 11758 + }, + { + "epoch": 0.5864837905236908, + "grad_norm": 1.4639612028987874, + "learning_rate": 1.9268374184201943e-05, + "loss": 0.8409, + "step": 11759 + }, + { + "epoch": 0.5865336658354114, + "grad_norm": 0.6532242144004287, + "learning_rate": 1.9264443374974614e-05, + "loss": 0.8701, + "step": 11760 + }, + { + "epoch": 0.5865835411471322, + "grad_norm": 0.5174668848374769, + "learning_rate": 1.926051271541408e-05, + "loss": 0.8734, + "step": 11761 + }, + { + "epoch": 0.5866334164588529, + "grad_norm": 0.5170169617204013, + "learning_rate": 1.9256582205622884e-05, + "loss": 0.8997, + "step": 11762 + }, + { + "epoch": 0.5866832917705735, + "grad_norm": 0.65058543516432, + "learning_rate": 1.9252651845703606e-05, + "loss": 0.8831, + "step": 11763 + }, + { + "epoch": 0.5867331670822943, + "grad_norm": 0.47207877205627513, + "learning_rate": 1.9248721635758812e-05, + "loss": 0.8574, + "step": 11764 + }, + { + "epoch": 0.5867830423940149, + "grad_norm": 0.3959663209659831, + "learning_rate": 1.924479157589105e-05, + "loss": 0.872, + "step": 11765 + }, + { + "epoch": 0.5868329177057356, + "grad_norm": 1.0130629931046868, + "learning_rate": 1.924086166620288e-05, + "loss": 0.8685, + "step": 11766 + }, + { + "epoch": 0.5868827930174564, + "grad_norm": 0.4762217548901821, + "learning_rate": 1.923693190679684e-05, + "loss": 0.8577, + "step": 11767 + }, + { + "epoch": 0.586932668329177, + "grad_norm": 0.4232325718985381, + "learning_rate": 1.923300229777549e-05, + "loss": 0.8704, + "step": 11768 + }, + { + "epoch": 0.5869825436408977, + "grad_norm": 0.38456353725872067, + "learning_rate": 1.9229072839241357e-05, + "loss": 0.842, + "step": 11769 + }, + { + "epoch": 0.5870324189526185, + "grad_norm": 0.4608883452278126, + "learning_rate": 1.9225143531296985e-05, + "loss": 0.921, + "step": 11770 + }, + { + "epoch": 0.5870822942643391, + "grad_norm": 0.45075233229620487, + "learning_rate": 1.9221214374044914e-05, + "loss": 0.8363, + "step": 11771 + }, + { + "epoch": 0.5871321695760598, + "grad_norm": 0.4239560060309401, + "learning_rate": 1.921728536758767e-05, + "loss": 0.8864, + "step": 11772 + }, + { + "epoch": 0.5871820448877806, + "grad_norm": 0.4301318389901543, + "learning_rate": 1.921335651202777e-05, + "loss": 0.9067, + "step": 11773 + }, + { + "epoch": 0.5872319201995012, + "grad_norm": 0.5334733004769083, + "learning_rate": 1.9209427807467743e-05, + "loss": 0.8695, + "step": 11774 + }, + { + "epoch": 0.587281795511222, + "grad_norm": 0.4057140798456779, + "learning_rate": 1.9205499254010115e-05, + "loss": 0.8379, + "step": 11775 + }, + { + "epoch": 0.5873316708229427, + "grad_norm": 0.5019392839684707, + "learning_rate": 1.920157085175738e-05, + "loss": 0.9047, + "step": 11776 + }, + { + "epoch": 0.5873815461346633, + "grad_norm": 0.42653050557591876, + "learning_rate": 1.919764260081206e-05, + "loss": 0.846, + "step": 11777 + }, + { + "epoch": 0.587431421446384, + "grad_norm": 0.40263455411961396, + "learning_rate": 1.9193714501276657e-05, + "loss": 0.8457, + "step": 11778 + }, + { + "epoch": 0.5874812967581048, + "grad_norm": 0.48790404714525315, + "learning_rate": 1.918978655325369e-05, + "loss": 0.8846, + "step": 11779 + }, + { + "epoch": 0.5875311720698254, + "grad_norm": 0.4888750419811106, + "learning_rate": 1.9185858756845626e-05, + "loss": 0.8688, + "step": 11780 + }, + { + "epoch": 0.5875810473815462, + "grad_norm": 0.5042945893378382, + "learning_rate": 1.918193111215498e-05, + "loss": 0.9102, + "step": 11781 + }, + { + "epoch": 0.5876309226932669, + "grad_norm": 0.43912720351916823, + "learning_rate": 1.917800361928424e-05, + "loss": 0.8385, + "step": 11782 + }, + { + "epoch": 0.5876807980049875, + "grad_norm": 0.45730821170410035, + "learning_rate": 1.9174076278335895e-05, + "loss": 0.8338, + "step": 11783 + }, + { + "epoch": 0.5877306733167083, + "grad_norm": 0.42894798237423337, + "learning_rate": 1.9170149089412414e-05, + "loss": 0.8804, + "step": 11784 + }, + { + "epoch": 0.5877805486284289, + "grad_norm": 0.4744203973018715, + "learning_rate": 1.9166222052616284e-05, + "loss": 0.8812, + "step": 11785 + }, + { + "epoch": 0.5878304239401496, + "grad_norm": 0.5400098508455318, + "learning_rate": 1.9162295168049986e-05, + "loss": 0.8743, + "step": 11786 + }, + { + "epoch": 0.5878802992518704, + "grad_norm": 0.4791509733037332, + "learning_rate": 1.9158368435815977e-05, + "loss": 0.8604, + "step": 11787 + }, + { + "epoch": 0.587930174563591, + "grad_norm": 0.48476789235288037, + "learning_rate": 1.9154441856016727e-05, + "loss": 0.9212, + "step": 11788 + }, + { + "epoch": 0.5879800498753117, + "grad_norm": 0.3938702786007728, + "learning_rate": 1.91505154287547e-05, + "loss": 0.8723, + "step": 11789 + }, + { + "epoch": 0.5880299251870325, + "grad_norm": 0.47378138217394744, + "learning_rate": 1.914658915413237e-05, + "loss": 0.8856, + "step": 11790 + }, + { + "epoch": 0.5880798004987531, + "grad_norm": 0.498807263481317, + "learning_rate": 1.9142663032252163e-05, + "loss": 0.8695, + "step": 11791 + }, + { + "epoch": 0.5881296758104738, + "grad_norm": 0.40234793050047923, + "learning_rate": 1.913873706321654e-05, + "loss": 0.9002, + "step": 11792 + }, + { + "epoch": 0.5881795511221946, + "grad_norm": 0.42221157626210737, + "learning_rate": 1.9134811247127953e-05, + "loss": 0.8721, + "step": 11793 + }, + { + "epoch": 0.5882294264339152, + "grad_norm": 0.5370394731210528, + "learning_rate": 1.9130885584088854e-05, + "loss": 0.8658, + "step": 11794 + }, + { + "epoch": 0.5882793017456359, + "grad_norm": 0.49449449987370425, + "learning_rate": 1.9126960074201654e-05, + "loss": 0.8915, + "step": 11795 + }, + { + "epoch": 0.5883291770573567, + "grad_norm": 0.48917496535972527, + "learning_rate": 1.9123034717568806e-05, + "loss": 0.8869, + "step": 11796 + }, + { + "epoch": 0.5883790523690773, + "grad_norm": 0.4718696306294279, + "learning_rate": 1.9119109514292733e-05, + "loss": 0.8701, + "step": 11797 + }, + { + "epoch": 0.588428927680798, + "grad_norm": 0.49765358743269267, + "learning_rate": 1.911518446447588e-05, + "loss": 0.889, + "step": 11798 + }, + { + "epoch": 0.5884788029925188, + "grad_norm": 0.4404538332189289, + "learning_rate": 1.911125956822064e-05, + "loss": 0.8606, + "step": 11799 + }, + { + "epoch": 0.5885286783042394, + "grad_norm": 0.42154414810587315, + "learning_rate": 1.910733482562945e-05, + "loss": 0.8634, + "step": 11800 + }, + { + "epoch": 0.5885785536159601, + "grad_norm": 0.44498842989325355, + "learning_rate": 1.9103410236804725e-05, + "loss": 0.8799, + "step": 11801 + }, + { + "epoch": 0.5886284289276807, + "grad_norm": 0.3964345521387125, + "learning_rate": 1.9099485801848864e-05, + "loss": 0.8787, + "step": 11802 + }, + { + "epoch": 0.5886783042394015, + "grad_norm": 0.4240398391837257, + "learning_rate": 1.9095561520864287e-05, + "loss": 0.8518, + "step": 11803 + }, + { + "epoch": 0.5887281795511222, + "grad_norm": 0.44271593358382816, + "learning_rate": 1.9091637393953384e-05, + "loss": 0.8452, + "step": 11804 + }, + { + "epoch": 0.5887780548628428, + "grad_norm": 0.5591165936031126, + "learning_rate": 1.908771342121856e-05, + "loss": 0.9272, + "step": 11805 + }, + { + "epoch": 0.5888279301745636, + "grad_norm": 0.5127725766239183, + "learning_rate": 1.908378960276221e-05, + "loss": 0.8343, + "step": 11806 + }, + { + "epoch": 0.5888778054862843, + "grad_norm": 0.7062645552790204, + "learning_rate": 1.9079865938686724e-05, + "loss": 0.9229, + "step": 11807 + }, + { + "epoch": 0.588927680798005, + "grad_norm": 0.5134888714679333, + "learning_rate": 1.9075942429094483e-05, + "loss": 0.9117, + "step": 11808 + }, + { + "epoch": 0.5889775561097257, + "grad_norm": 0.4507048038514263, + "learning_rate": 1.9072019074087876e-05, + "loss": 0.8926, + "step": 11809 + }, + { + "epoch": 0.5890274314214464, + "grad_norm": 0.8642882594991991, + "learning_rate": 1.9068095873769276e-05, + "loss": 0.879, + "step": 11810 + }, + { + "epoch": 0.589077306733167, + "grad_norm": 0.4318989347446367, + "learning_rate": 1.906417282824106e-05, + "loss": 0.85, + "step": 11811 + }, + { + "epoch": 0.5891271820448878, + "grad_norm": 0.4704826638830567, + "learning_rate": 1.9060249937605606e-05, + "loss": 0.8972, + "step": 11812 + }, + { + "epoch": 0.5891770573566085, + "grad_norm": 0.5797038535846764, + "learning_rate": 1.905632720196526e-05, + "loss": 0.8444, + "step": 11813 + }, + { + "epoch": 0.5892269326683292, + "grad_norm": 0.44180248374100395, + "learning_rate": 1.90524046214224e-05, + "loss": 0.8286, + "step": 11814 + }, + { + "epoch": 0.5892768079800499, + "grad_norm": 0.49412591400512496, + "learning_rate": 1.9048482196079372e-05, + "loss": 0.8812, + "step": 11815 + }, + { + "epoch": 0.5893266832917706, + "grad_norm": 0.5850915367654193, + "learning_rate": 1.9044559926038555e-05, + "loss": 0.8702, + "step": 11816 + }, + { + "epoch": 0.5893765586034913, + "grad_norm": 0.44352193941173246, + "learning_rate": 1.904063781140227e-05, + "loss": 0.8487, + "step": 11817 + }, + { + "epoch": 0.589426433915212, + "grad_norm": 0.4412997124621701, + "learning_rate": 1.903671585227287e-05, + "loss": 0.8871, + "step": 11818 + }, + { + "epoch": 0.5894763092269326, + "grad_norm": 0.5567995214502761, + "learning_rate": 1.9032794048752705e-05, + "loss": 0.807, + "step": 11819 + }, + { + "epoch": 0.5895261845386534, + "grad_norm": 0.6376725033906145, + "learning_rate": 1.9028872400944116e-05, + "loss": 0.8938, + "step": 11820 + }, + { + "epoch": 0.5895760598503741, + "grad_norm": 0.5976857882831966, + "learning_rate": 1.902495090894942e-05, + "loss": 0.885, + "step": 11821 + }, + { + "epoch": 0.5896259351620947, + "grad_norm": 0.5467685483444653, + "learning_rate": 1.9021029572870956e-05, + "loss": 0.8786, + "step": 11822 + }, + { + "epoch": 0.5896758104738155, + "grad_norm": 0.5384617958601934, + "learning_rate": 1.9017108392811065e-05, + "loss": 0.8667, + "step": 11823 + }, + { + "epoch": 0.5897256857855362, + "grad_norm": 0.42847713003708465, + "learning_rate": 1.9013187368872038e-05, + "loss": 0.8317, + "step": 11824 + }, + { + "epoch": 0.5897755610972568, + "grad_norm": 0.4191418624618062, + "learning_rate": 1.9009266501156206e-05, + "loss": 0.8773, + "step": 11825 + }, + { + "epoch": 0.5898254364089776, + "grad_norm": 0.40159287890613055, + "learning_rate": 1.9005345789765885e-05, + "loss": 0.8445, + "step": 11826 + }, + { + "epoch": 0.5898753117206983, + "grad_norm": 0.5969952064358001, + "learning_rate": 1.9001425234803394e-05, + "loss": 0.8946, + "step": 11827 + }, + { + "epoch": 0.5899251870324189, + "grad_norm": 0.6602296707385406, + "learning_rate": 1.8997504836371015e-05, + "loss": 0.8975, + "step": 11828 + }, + { + "epoch": 0.5899750623441397, + "grad_norm": 0.43169166601357406, + "learning_rate": 1.899358459457106e-05, + "loss": 0.8716, + "step": 11829 + }, + { + "epoch": 0.5900249376558604, + "grad_norm": 0.5300963316592588, + "learning_rate": 1.898966450950583e-05, + "loss": 0.8886, + "step": 11830 + }, + { + "epoch": 0.590074812967581, + "grad_norm": 0.4738719716086605, + "learning_rate": 1.8985744581277624e-05, + "loss": 0.8522, + "step": 11831 + }, + { + "epoch": 0.5901246882793018, + "grad_norm": 0.42005178140936567, + "learning_rate": 1.898182480998871e-05, + "loss": 0.8882, + "step": 11832 + }, + { + "epoch": 0.5901745635910225, + "grad_norm": 0.502265411477638, + "learning_rate": 1.8977905195741386e-05, + "loss": 0.8198, + "step": 11833 + }, + { + "epoch": 0.5902244389027431, + "grad_norm": 0.36162327520151033, + "learning_rate": 1.8973985738637938e-05, + "loss": 0.827, + "step": 11834 + }, + { + "epoch": 0.5902743142144639, + "grad_norm": 0.5642280344327009, + "learning_rate": 1.8970066438780628e-05, + "loss": 0.9037, + "step": 11835 + }, + { + "epoch": 0.5903241895261845, + "grad_norm": 0.4922923281196572, + "learning_rate": 1.896614729627173e-05, + "loss": 0.9174, + "step": 11836 + }, + { + "epoch": 0.5903740648379052, + "grad_norm": 0.44068820281208804, + "learning_rate": 1.8962228311213525e-05, + "loss": 0.8504, + "step": 11837 + }, + { + "epoch": 0.590423940149626, + "grad_norm": 0.5190716354358093, + "learning_rate": 1.8958309483708274e-05, + "loss": 0.8208, + "step": 11838 + }, + { + "epoch": 0.5904738154613466, + "grad_norm": 0.6180748801981926, + "learning_rate": 1.895439081385823e-05, + "loss": 0.8569, + "step": 11839 + }, + { + "epoch": 0.5905236907730673, + "grad_norm": 0.457635037272889, + "learning_rate": 1.895047230176565e-05, + "loss": 0.8921, + "step": 11840 + }, + { + "epoch": 0.5905735660847881, + "grad_norm": 0.46797907781790227, + "learning_rate": 1.8946553947532787e-05, + "loss": 0.8287, + "step": 11841 + }, + { + "epoch": 0.5906234413965087, + "grad_norm": 0.46320231271895423, + "learning_rate": 1.8942635751261896e-05, + "loss": 0.8609, + "step": 11842 + }, + { + "epoch": 0.5906733167082294, + "grad_norm": 0.41027430707233176, + "learning_rate": 1.8938717713055203e-05, + "loss": 0.8909, + "step": 11843 + }, + { + "epoch": 0.5907231920199502, + "grad_norm": 0.3781516286137447, + "learning_rate": 1.893479983301497e-05, + "loss": 0.8863, + "step": 11844 + }, + { + "epoch": 0.5907730673316708, + "grad_norm": 0.3935675734547587, + "learning_rate": 1.893088211124342e-05, + "loss": 0.8453, + "step": 11845 + }, + { + "epoch": 0.5908229426433915, + "grad_norm": 0.4351007690077349, + "learning_rate": 1.892696454784278e-05, + "loss": 0.8775, + "step": 11846 + }, + { + "epoch": 0.5908728179551123, + "grad_norm": 0.5496391090656555, + "learning_rate": 1.892304714291528e-05, + "loss": 0.8648, + "step": 11847 + }, + { + "epoch": 0.5909226932668329, + "grad_norm": 0.4475157395345815, + "learning_rate": 1.8919129896563144e-05, + "loss": 0.8741, + "step": 11848 + }, + { + "epoch": 0.5909725685785536, + "grad_norm": 0.4391962094106186, + "learning_rate": 1.891521280888861e-05, + "loss": 0.8958, + "step": 11849 + }, + { + "epoch": 0.5910224438902744, + "grad_norm": 0.512656926433978, + "learning_rate": 1.8911295879993858e-05, + "loss": 0.8436, + "step": 11850 + }, + { + "epoch": 0.591072319201995, + "grad_norm": 0.38160541439686746, + "learning_rate": 1.8907379109981113e-05, + "loss": 0.8901, + "step": 11851 + }, + { + "epoch": 0.5911221945137157, + "grad_norm": 0.4302915684885176, + "learning_rate": 1.8903462498952583e-05, + "loss": 0.9425, + "step": 11852 + }, + { + "epoch": 0.5911720698254365, + "grad_norm": 0.45232357441753557, + "learning_rate": 1.8899546047010488e-05, + "loss": 0.894, + "step": 11853 + }, + { + "epoch": 0.5912219451371571, + "grad_norm": 0.5043621940434462, + "learning_rate": 1.8895629754256993e-05, + "loss": 0.9222, + "step": 11854 + }, + { + "epoch": 0.5912718204488778, + "grad_norm": 0.4196067431196522, + "learning_rate": 1.889171362079431e-05, + "loss": 0.8579, + "step": 11855 + }, + { + "epoch": 0.5913216957605985, + "grad_norm": 0.5711758095753358, + "learning_rate": 1.8887797646724625e-05, + "loss": 0.8617, + "step": 11856 + }, + { + "epoch": 0.5913715710723192, + "grad_norm": 0.39197800757777956, + "learning_rate": 1.8883881832150136e-05, + "loss": 0.8635, + "step": 11857 + }, + { + "epoch": 0.5914214463840399, + "grad_norm": 0.45857437236648957, + "learning_rate": 1.8879966177173003e-05, + "loss": 0.875, + "step": 11858 + }, + { + "epoch": 0.5914713216957606, + "grad_norm": 0.3976012406178871, + "learning_rate": 1.8876050681895414e-05, + "loss": 0.8419, + "step": 11859 + }, + { + "epoch": 0.5915211970074813, + "grad_norm": 0.3794720553202413, + "learning_rate": 1.887213534641955e-05, + "loss": 0.829, + "step": 11860 + }, + { + "epoch": 0.591571072319202, + "grad_norm": 0.4401254462567443, + "learning_rate": 1.8868220170847563e-05, + "loss": 0.8878, + "step": 11861 + }, + { + "epoch": 0.5916209476309227, + "grad_norm": 0.46713825735105785, + "learning_rate": 1.886430515528163e-05, + "loss": 0.8548, + "step": 11862 + }, + { + "epoch": 0.5916708229426434, + "grad_norm": 0.479941260047292, + "learning_rate": 1.8860390299823903e-05, + "loss": 0.8972, + "step": 11863 + }, + { + "epoch": 0.5917206982543641, + "grad_norm": 0.5828257872803733, + "learning_rate": 1.8856475604576552e-05, + "loss": 0.8772, + "step": 11864 + }, + { + "epoch": 0.5917705735660848, + "grad_norm": 0.8255810085376787, + "learning_rate": 1.8852561069641717e-05, + "loss": 0.8889, + "step": 11865 + }, + { + "epoch": 0.5918204488778055, + "grad_norm": 0.3906700952004628, + "learning_rate": 1.8848646695121542e-05, + "loss": 0.8618, + "step": 11866 + }, + { + "epoch": 0.5918703241895262, + "grad_norm": 0.5022446883813656, + "learning_rate": 1.8844732481118184e-05, + "loss": 0.8435, + "step": 11867 + }, + { + "epoch": 0.5919201995012469, + "grad_norm": 0.410116952998514, + "learning_rate": 1.8840818427733786e-05, + "loss": 0.904, + "step": 11868 + }, + { + "epoch": 0.5919700748129676, + "grad_norm": 0.5383304834694318, + "learning_rate": 1.8836904535070466e-05, + "loss": 0.8847, + "step": 11869 + }, + { + "epoch": 0.5920199501246883, + "grad_norm": 0.4626362926719269, + "learning_rate": 1.883299080323036e-05, + "loss": 0.8611, + "step": 11870 + }, + { + "epoch": 0.592069825436409, + "grad_norm": 0.4021468459452725, + "learning_rate": 1.882907723231561e-05, + "loss": 0.856, + "step": 11871 + }, + { + "epoch": 0.5921197007481297, + "grad_norm": 0.598594147671409, + "learning_rate": 1.882516382242832e-05, + "loss": 0.8039, + "step": 11872 + }, + { + "epoch": 0.5921695760598503, + "grad_norm": 0.3796256409516914, + "learning_rate": 1.882125057367062e-05, + "loss": 0.8684, + "step": 11873 + }, + { + "epoch": 0.5922194513715711, + "grad_norm": 0.4810458193439567, + "learning_rate": 1.881733748614461e-05, + "loss": 0.8232, + "step": 11874 + }, + { + "epoch": 0.5922693266832918, + "grad_norm": 0.45106464232997356, + "learning_rate": 1.8813424559952425e-05, + "loss": 0.8437, + "step": 11875 + }, + { + "epoch": 0.5923192019950124, + "grad_norm": 0.6967969070548484, + "learning_rate": 1.8809511795196154e-05, + "loss": 0.867, + "step": 11876 + }, + { + "epoch": 0.5923690773067332, + "grad_norm": 0.5175613821416872, + "learning_rate": 1.8805599191977902e-05, + "loss": 0.8931, + "step": 11877 + }, + { + "epoch": 0.5924189526184539, + "grad_norm": 0.4988353269280151, + "learning_rate": 1.8801686750399764e-05, + "loss": 0.8555, + "step": 11878 + }, + { + "epoch": 0.5924688279301745, + "grad_norm": 0.4001316518543904, + "learning_rate": 1.8797774470563838e-05, + "loss": 0.8759, + "step": 11879 + }, + { + "epoch": 0.5925187032418953, + "grad_norm": 0.4093305299736611, + "learning_rate": 1.8793862352572212e-05, + "loss": 0.8479, + "step": 11880 + }, + { + "epoch": 0.592568578553616, + "grad_norm": 0.46347873701603765, + "learning_rate": 1.878995039652697e-05, + "loss": 0.8542, + "step": 11881 + }, + { + "epoch": 0.5926184538653366, + "grad_norm": 0.4144114452147762, + "learning_rate": 1.87860386025302e-05, + "loss": 0.894, + "step": 11882 + }, + { + "epoch": 0.5926683291770574, + "grad_norm": 0.4555347066379328, + "learning_rate": 1.8782126970683957e-05, + "loss": 0.898, + "step": 11883 + }, + { + "epoch": 0.5927182044887781, + "grad_norm": 0.41519686890435625, + "learning_rate": 1.8778215501090336e-05, + "loss": 0.8863, + "step": 11884 + }, + { + "epoch": 0.5927680798004987, + "grad_norm": 0.47080003990603964, + "learning_rate": 1.8774304193851393e-05, + "loss": 0.9008, + "step": 11885 + }, + { + "epoch": 0.5928179551122195, + "grad_norm": 0.39368649574191134, + "learning_rate": 1.8770393049069208e-05, + "loss": 0.9106, + "step": 11886 + }, + { + "epoch": 0.5928678304239402, + "grad_norm": 0.446000933209504, + "learning_rate": 1.8766482066845816e-05, + "loss": 0.9334, + "step": 11887 + }, + { + "epoch": 0.5929177057356608, + "grad_norm": 0.46696400181478, + "learning_rate": 1.8762571247283286e-05, + "loss": 0.8588, + "step": 11888 + }, + { + "epoch": 0.5929675810473816, + "grad_norm": 0.4502728031626511, + "learning_rate": 1.8758660590483667e-05, + "loss": 0.8753, + "step": 11889 + }, + { + "epoch": 0.5930174563591022, + "grad_norm": 0.46530276801208137, + "learning_rate": 1.875475009654902e-05, + "loss": 0.8523, + "step": 11890 + }, + { + "epoch": 0.5930673316708229, + "grad_norm": 0.45040645219966635, + "learning_rate": 1.875083976558136e-05, + "loss": 0.8657, + "step": 11891 + }, + { + "epoch": 0.5931172069825437, + "grad_norm": 0.44305763138391757, + "learning_rate": 1.8746929597682746e-05, + "loss": 0.9137, + "step": 11892 + }, + { + "epoch": 0.5931670822942643, + "grad_norm": 0.8584591732595851, + "learning_rate": 1.8743019592955208e-05, + "loss": 0.8977, + "step": 11893 + }, + { + "epoch": 0.593216957605985, + "grad_norm": 0.44975219816003403, + "learning_rate": 1.8739109751500766e-05, + "loss": 0.9116, + "step": 11894 + }, + { + "epoch": 0.5932668329177058, + "grad_norm": 0.4008917104760724, + "learning_rate": 1.8735200073421456e-05, + "loss": 0.849, + "step": 11895 + }, + { + "epoch": 0.5933167082294264, + "grad_norm": 0.434054858720334, + "learning_rate": 1.873129055881929e-05, + "loss": 0.8367, + "step": 11896 + }, + { + "epoch": 0.5933665835411471, + "grad_norm": 0.5310921079906016, + "learning_rate": 1.872738120779631e-05, + "loss": 0.8812, + "step": 11897 + }, + { + "epoch": 0.5934164588528679, + "grad_norm": 0.41197167380813465, + "learning_rate": 1.8723472020454492e-05, + "loss": 0.899, + "step": 11898 + }, + { + "epoch": 0.5934663341645885, + "grad_norm": 0.673801094985994, + "learning_rate": 1.8719562996895863e-05, + "loss": 0.8739, + "step": 11899 + }, + { + "epoch": 0.5935162094763092, + "grad_norm": 0.43873030732626717, + "learning_rate": 1.8715654137222433e-05, + "loss": 0.8787, + "step": 11900 + }, + { + "epoch": 0.59356608478803, + "grad_norm": 0.521584161516818, + "learning_rate": 1.87117454415362e-05, + "loss": 0.8746, + "step": 11901 + }, + { + "epoch": 0.5936159600997506, + "grad_norm": 0.7449187331483066, + "learning_rate": 1.870783690993915e-05, + "loss": 0.857, + "step": 11902 + }, + { + "epoch": 0.5936658354114713, + "grad_norm": 0.4791967497382782, + "learning_rate": 1.870392854253328e-05, + "loss": 0.9038, + "step": 11903 + }, + { + "epoch": 0.5937157107231921, + "grad_norm": 0.37320220462946657, + "learning_rate": 1.870002033942057e-05, + "loss": 0.8707, + "step": 11904 + }, + { + "epoch": 0.5937655860349127, + "grad_norm": 0.4077732781270096, + "learning_rate": 1.8696112300703027e-05, + "loss": 0.8722, + "step": 11905 + }, + { + "epoch": 0.5938154613466334, + "grad_norm": 0.3824306357032091, + "learning_rate": 1.86922044264826e-05, + "loss": 0.8639, + "step": 11906 + }, + { + "epoch": 0.5938653366583541, + "grad_norm": 0.4643454392853841, + "learning_rate": 1.8688296716861274e-05, + "loss": 0.8914, + "step": 11907 + }, + { + "epoch": 0.5939152119700748, + "grad_norm": 0.47611816680141106, + "learning_rate": 1.8684389171941034e-05, + "loss": 0.8445, + "step": 11908 + }, + { + "epoch": 0.5939650872817955, + "grad_norm": 0.46711345388714537, + "learning_rate": 1.8680481791823817e-05, + "loss": 0.8692, + "step": 11909 + }, + { + "epoch": 0.5940149625935162, + "grad_norm": 0.46486125589737076, + "learning_rate": 1.8676574576611603e-05, + "loss": 0.8762, + "step": 11910 + }, + { + "epoch": 0.5940648379052369, + "grad_norm": 0.5243104223780354, + "learning_rate": 1.8672667526406346e-05, + "loss": 0.8752, + "step": 11911 + }, + { + "epoch": 0.5941147132169576, + "grad_norm": 0.47462007658096167, + "learning_rate": 1.866876064131e-05, + "loss": 0.8722, + "step": 11912 + }, + { + "epoch": 0.5941645885286783, + "grad_norm": 0.396054812492978, + "learning_rate": 1.8664853921424513e-05, + "loss": 0.844, + "step": 11913 + }, + { + "epoch": 0.594214463840399, + "grad_norm": 0.5549650597785345, + "learning_rate": 1.866094736685182e-05, + "loss": 0.8679, + "step": 11914 + }, + { + "epoch": 0.5942643391521197, + "grad_norm": 0.46484805570910653, + "learning_rate": 1.8657040977693875e-05, + "loss": 0.8651, + "step": 11915 + }, + { + "epoch": 0.5943142144638404, + "grad_norm": 0.5009308654717525, + "learning_rate": 1.8653134754052608e-05, + "loss": 0.9163, + "step": 11916 + }, + { + "epoch": 0.5943640897755611, + "grad_norm": 0.5121204392607642, + "learning_rate": 1.8649228696029945e-05, + "loss": 0.8771, + "step": 11917 + }, + { + "epoch": 0.5944139650872818, + "grad_norm": 0.5705074253472537, + "learning_rate": 1.864532280372782e-05, + "loss": 0.9083, + "step": 11918 + }, + { + "epoch": 0.5944638403990025, + "grad_norm": 0.4935307555102585, + "learning_rate": 1.8641417077248155e-05, + "loss": 0.873, + "step": 11919 + }, + { + "epoch": 0.5945137157107232, + "grad_norm": 0.4280783310923816, + "learning_rate": 1.8637511516692858e-05, + "loss": 0.8555, + "step": 11920 + }, + { + "epoch": 0.5945635910224439, + "grad_norm": 0.5809350442030738, + "learning_rate": 1.8633606122163856e-05, + "loss": 0.8793, + "step": 11921 + }, + { + "epoch": 0.5946134663341646, + "grad_norm": 0.40251764762151543, + "learning_rate": 1.8629700893763052e-05, + "loss": 0.8765, + "step": 11922 + }, + { + "epoch": 0.5946633416458853, + "grad_norm": 0.44925442434180474, + "learning_rate": 1.8625795831592353e-05, + "loss": 0.8834, + "step": 11923 + }, + { + "epoch": 0.594713216957606, + "grad_norm": 0.439239341529847, + "learning_rate": 1.8621890935753656e-05, + "loss": 0.883, + "step": 11924 + }, + { + "epoch": 0.5947630922693267, + "grad_norm": 0.4749038804408119, + "learning_rate": 1.861798620634886e-05, + "loss": 0.8992, + "step": 11925 + }, + { + "epoch": 0.5948129675810474, + "grad_norm": 0.6065141618909483, + "learning_rate": 1.8614081643479858e-05, + "loss": 0.8902, + "step": 11926 + }, + { + "epoch": 0.594862842892768, + "grad_norm": 0.5153021308712945, + "learning_rate": 1.8610177247248545e-05, + "loss": 0.8645, + "step": 11927 + }, + { + "epoch": 0.5949127182044888, + "grad_norm": 0.462002693292601, + "learning_rate": 1.8606273017756793e-05, + "loss": 0.851, + "step": 11928 + }, + { + "epoch": 0.5949625935162095, + "grad_norm": 0.5274777378196293, + "learning_rate": 1.8602368955106476e-05, + "loss": 0.9097, + "step": 11929 + }, + { + "epoch": 0.5950124688279301, + "grad_norm": 0.45799326544497826, + "learning_rate": 1.8598465059399496e-05, + "loss": 0.8743, + "step": 11930 + }, + { + "epoch": 0.5950623441396509, + "grad_norm": 0.4370377125764256, + "learning_rate": 1.8594561330737692e-05, + "loss": 0.8556, + "step": 11931 + }, + { + "epoch": 0.5951122194513716, + "grad_norm": 0.5442206736818652, + "learning_rate": 1.8590657769222943e-05, + "loss": 0.9022, + "step": 11932 + }, + { + "epoch": 0.5951620947630922, + "grad_norm": 0.5523614200383514, + "learning_rate": 1.8586754374957112e-05, + "loss": 0.858, + "step": 11933 + }, + { + "epoch": 0.595211970074813, + "grad_norm": 0.5558955810839972, + "learning_rate": 1.8582851148042068e-05, + "loss": 0.869, + "step": 11934 + }, + { + "epoch": 0.5952618453865337, + "grad_norm": 0.5304630456959891, + "learning_rate": 1.8578948088579637e-05, + "loss": 0.8682, + "step": 11935 + }, + { + "epoch": 0.5953117206982543, + "grad_norm": 0.47203177973387045, + "learning_rate": 1.8575045196671683e-05, + "loss": 0.914, + "step": 11936 + }, + { + "epoch": 0.5953615960099751, + "grad_norm": 0.43468261776902434, + "learning_rate": 1.857114247242005e-05, + "loss": 0.9183, + "step": 11937 + }, + { + "epoch": 0.5954114713216958, + "grad_norm": 0.8162017495119256, + "learning_rate": 1.8567239915926586e-05, + "loss": 0.8982, + "step": 11938 + }, + { + "epoch": 0.5954613466334164, + "grad_norm": 0.420919704179924, + "learning_rate": 1.856333752729311e-05, + "loss": 0.9059, + "step": 11939 + }, + { + "epoch": 0.5955112219451372, + "grad_norm": 0.5148739974722099, + "learning_rate": 1.8559435306621458e-05, + "loss": 0.8778, + "step": 11940 + }, + { + "epoch": 0.5955610972568579, + "grad_norm": 0.4523057930698318, + "learning_rate": 1.855553325401347e-05, + "loss": 0.8977, + "step": 11941 + }, + { + "epoch": 0.5956109725685785, + "grad_norm": 0.46390687659030466, + "learning_rate": 1.855163136957095e-05, + "loss": 0.8229, + "step": 11942 + }, + { + "epoch": 0.5956608478802993, + "grad_norm": 0.7627589731308738, + "learning_rate": 1.8547729653395718e-05, + "loss": 0.8651, + "step": 11943 + }, + { + "epoch": 0.5957107231920199, + "grad_norm": 0.6967758461792107, + "learning_rate": 1.8543828105589596e-05, + "loss": 0.8897, + "step": 11944 + }, + { + "epoch": 0.5957605985037406, + "grad_norm": 0.4823652092259627, + "learning_rate": 1.8539926726254404e-05, + "loss": 0.8488, + "step": 11945 + }, + { + "epoch": 0.5958104738154614, + "grad_norm": 0.4182478269539354, + "learning_rate": 1.8536025515491917e-05, + "loss": 0.8457, + "step": 11946 + }, + { + "epoch": 0.595860349127182, + "grad_norm": 0.4466699348994186, + "learning_rate": 1.8532124473403957e-05, + "loss": 0.8525, + "step": 11947 + }, + { + "epoch": 0.5959102244389027, + "grad_norm": 0.48406071010497775, + "learning_rate": 1.852822360009231e-05, + "loss": 0.8295, + "step": 11948 + }, + { + "epoch": 0.5959600997506235, + "grad_norm": 0.5659626587570278, + "learning_rate": 1.852432289565878e-05, + "loss": 0.8907, + "step": 11949 + }, + { + "epoch": 0.5960099750623441, + "grad_norm": 0.47948836810674966, + "learning_rate": 1.8520422360205138e-05, + "loss": 0.8618, + "step": 11950 + }, + { + "epoch": 0.5960598503740648, + "grad_norm": 0.41879321285694904, + "learning_rate": 1.8516521993833175e-05, + "loss": 0.8548, + "step": 11951 + }, + { + "epoch": 0.5961097256857856, + "grad_norm": 0.6697248445780807, + "learning_rate": 1.8512621796644675e-05, + "loss": 0.8711, + "step": 11952 + }, + { + "epoch": 0.5961596009975062, + "grad_norm": 0.5644139943734009, + "learning_rate": 1.85087217687414e-05, + "loss": 0.8019, + "step": 11953 + }, + { + "epoch": 0.5962094763092269, + "grad_norm": 0.4753134690249976, + "learning_rate": 1.850482191022513e-05, + "loss": 0.8436, + "step": 11954 + }, + { + "epoch": 0.5962593516209477, + "grad_norm": 0.4347014413484297, + "learning_rate": 1.850092222119762e-05, + "loss": 0.8436, + "step": 11955 + }, + { + "epoch": 0.5963092269326683, + "grad_norm": 0.4904687630861675, + "learning_rate": 1.849702270176064e-05, + "loss": 0.8228, + "step": 11956 + }, + { + "epoch": 0.596359102244389, + "grad_norm": 0.46295995021715536, + "learning_rate": 1.849312335201594e-05, + "loss": 0.8035, + "step": 11957 + }, + { + "epoch": 0.5964089775561098, + "grad_norm": 0.416589055260111, + "learning_rate": 1.848922417206527e-05, + "loss": 0.8517, + "step": 11958 + }, + { + "epoch": 0.5964588528678304, + "grad_norm": 0.4258480259967772, + "learning_rate": 1.848532516201039e-05, + "loss": 0.8497, + "step": 11959 + }, + { + "epoch": 0.5965087281795511, + "grad_norm": 0.4636536498284565, + "learning_rate": 1.848142632195304e-05, + "loss": 0.8708, + "step": 11960 + }, + { + "epoch": 0.5965586034912718, + "grad_norm": 0.48285978734030605, + "learning_rate": 1.8477527651994935e-05, + "loss": 0.8527, + "step": 11961 + }, + { + "epoch": 0.5966084788029925, + "grad_norm": 0.5665828634883692, + "learning_rate": 1.8473629152237836e-05, + "loss": 0.8691, + "step": 11962 + }, + { + "epoch": 0.5966583541147132, + "grad_norm": 0.43965782038407925, + "learning_rate": 1.846973082278346e-05, + "loss": 0.8361, + "step": 11963 + }, + { + "epoch": 0.5967082294264339, + "grad_norm": 0.5164525143430505, + "learning_rate": 1.8465832663733547e-05, + "loss": 0.8665, + "step": 11964 + }, + { + "epoch": 0.5967581047381546, + "grad_norm": 0.395808593442646, + "learning_rate": 1.8461934675189796e-05, + "loss": 0.912, + "step": 11965 + }, + { + "epoch": 0.5968079800498753, + "grad_norm": 0.43639090971091354, + "learning_rate": 1.845803685725393e-05, + "loss": 0.8848, + "step": 11966 + }, + { + "epoch": 0.596857855361596, + "grad_norm": 0.6161341855548127, + "learning_rate": 1.845413921002768e-05, + "loss": 0.8995, + "step": 11967 + }, + { + "epoch": 0.5969077306733167, + "grad_norm": 0.5098945134513987, + "learning_rate": 1.845024173361273e-05, + "loss": 0.8834, + "step": 11968 + }, + { + "epoch": 0.5969576059850374, + "grad_norm": 0.46341145046620724, + "learning_rate": 1.844634442811078e-05, + "loss": 0.8474, + "step": 11969 + }, + { + "epoch": 0.5970074812967581, + "grad_norm": 0.5137378117336224, + "learning_rate": 1.8442447293623548e-05, + "loss": 0.8668, + "step": 11970 + }, + { + "epoch": 0.5970573566084788, + "grad_norm": 0.46862660973527825, + "learning_rate": 1.8438550330252725e-05, + "loss": 0.8852, + "step": 11971 + }, + { + "epoch": 0.5971072319201995, + "grad_norm": 0.447075438366215, + "learning_rate": 1.8434653538099987e-05, + "loss": 0.8799, + "step": 11972 + }, + { + "epoch": 0.5971571072319202, + "grad_norm": 0.4562970054511454, + "learning_rate": 1.8430756917267025e-05, + "loss": 0.8924, + "step": 11973 + }, + { + "epoch": 0.5972069825436409, + "grad_norm": 0.8856410564115673, + "learning_rate": 1.8426860467855523e-05, + "loss": 0.8927, + "step": 11974 + }, + { + "epoch": 0.5972568578553616, + "grad_norm": 0.48279204516942476, + "learning_rate": 1.8422964189967163e-05, + "loss": 0.9199, + "step": 11975 + }, + { + "epoch": 0.5973067331670823, + "grad_norm": 0.5017186250292232, + "learning_rate": 1.84190680837036e-05, + "loss": 0.8595, + "step": 11976 + }, + { + "epoch": 0.597356608478803, + "grad_norm": 0.419009090312755, + "learning_rate": 1.841517214916651e-05, + "loss": 0.902, + "step": 11977 + }, + { + "epoch": 0.5974064837905237, + "grad_norm": 0.7064782966212806, + "learning_rate": 1.8411276386457565e-05, + "loss": 0.8673, + "step": 11978 + }, + { + "epoch": 0.5974563591022444, + "grad_norm": 0.5476448461482449, + "learning_rate": 1.8407380795678408e-05, + "loss": 0.8568, + "step": 11979 + }, + { + "epoch": 0.5975062344139651, + "grad_norm": 0.42197390321400535, + "learning_rate": 1.8403485376930695e-05, + "loss": 0.8422, + "step": 11980 + }, + { + "epoch": 0.5975561097256857, + "grad_norm": 0.4511354222958695, + "learning_rate": 1.839959013031608e-05, + "loss": 0.8857, + "step": 11981 + }, + { + "epoch": 0.5976059850374065, + "grad_norm": 0.4931211776052735, + "learning_rate": 1.8395695055936213e-05, + "loss": 0.8877, + "step": 11982 + }, + { + "epoch": 0.5976558603491272, + "grad_norm": 0.4750296213982967, + "learning_rate": 1.8391800153892723e-05, + "loss": 0.897, + "step": 11983 + }, + { + "epoch": 0.5977057356608478, + "grad_norm": 0.46734712504856396, + "learning_rate": 1.838790542428725e-05, + "loss": 0.8753, + "step": 11984 + }, + { + "epoch": 0.5977556109725686, + "grad_norm": 0.4594652407688343, + "learning_rate": 1.838401086722142e-05, + "loss": 0.8884, + "step": 11985 + }, + { + "epoch": 0.5978054862842893, + "grad_norm": 0.5290904740703201, + "learning_rate": 1.838011648279688e-05, + "loss": 0.8771, + "step": 11986 + }, + { + "epoch": 0.5978553615960099, + "grad_norm": 0.46745274837876566, + "learning_rate": 1.8376222271115228e-05, + "loss": 0.8936, + "step": 11987 + }, + { + "epoch": 0.5979052369077307, + "grad_norm": 0.4299760881691853, + "learning_rate": 1.837232823227809e-05, + "loss": 0.8675, + "step": 11988 + }, + { + "epoch": 0.5979551122194514, + "grad_norm": 0.6835253851731604, + "learning_rate": 1.8368434366387088e-05, + "loss": 0.8537, + "step": 11989 + }, + { + "epoch": 0.598004987531172, + "grad_norm": 0.4239291255908291, + "learning_rate": 1.8364540673543818e-05, + "loss": 0.8657, + "step": 11990 + }, + { + "epoch": 0.5980548628428928, + "grad_norm": 0.4546949019419719, + "learning_rate": 1.836064715384989e-05, + "loss": 0.9272, + "step": 11991 + }, + { + "epoch": 0.5981047381546135, + "grad_norm": 0.7226165431296129, + "learning_rate": 1.8356753807406904e-05, + "loss": 0.8889, + "step": 11992 + }, + { + "epoch": 0.5981546134663341, + "grad_norm": 0.42124972709504604, + "learning_rate": 1.8352860634316458e-05, + "loss": 0.9137, + "step": 11993 + }, + { + "epoch": 0.5982044887780549, + "grad_norm": 0.584731149021888, + "learning_rate": 1.834896763468013e-05, + "loss": 0.8984, + "step": 11994 + }, + { + "epoch": 0.5982543640897756, + "grad_norm": 0.42793183458124334, + "learning_rate": 1.834507480859952e-05, + "loss": 0.8454, + "step": 11995 + }, + { + "epoch": 0.5983042394014962, + "grad_norm": 0.41704766523384457, + "learning_rate": 1.8341182156176206e-05, + "loss": 0.8493, + "step": 11996 + }, + { + "epoch": 0.598354114713217, + "grad_norm": 0.6048805185247238, + "learning_rate": 1.8337289677511766e-05, + "loss": 0.8801, + "step": 11997 + }, + { + "epoch": 0.5984039900249376, + "grad_norm": 0.42976278053431516, + "learning_rate": 1.8333397372707763e-05, + "loss": 0.8749, + "step": 11998 + }, + { + "epoch": 0.5984538653366583, + "grad_norm": 0.5103748432428469, + "learning_rate": 1.832950524186577e-05, + "loss": 0.8483, + "step": 11999 + }, + { + "epoch": 0.5985037406483791, + "grad_norm": 0.4423930584325864, + "learning_rate": 1.8325613285087368e-05, + "loss": 0.8781, + "step": 12000 + }, + { + "epoch": 0.5985536159600997, + "grad_norm": 0.5448238439708694, + "learning_rate": 1.8321721502474085e-05, + "loss": 0.9172, + "step": 12001 + }, + { + "epoch": 0.5986034912718204, + "grad_norm": 0.48126496507558575, + "learning_rate": 1.8317829894127493e-05, + "loss": 0.8589, + "step": 12002 + }, + { + "epoch": 0.5986533665835412, + "grad_norm": 0.45044338090876207, + "learning_rate": 1.831393846014914e-05, + "loss": 0.9068, + "step": 12003 + }, + { + "epoch": 0.5987032418952618, + "grad_norm": 0.521107907081894, + "learning_rate": 1.8310047200640578e-05, + "loss": 0.9132, + "step": 12004 + }, + { + "epoch": 0.5987531172069825, + "grad_norm": 0.40254509338404765, + "learning_rate": 1.8306156115703332e-05, + "loss": 0.8745, + "step": 12005 + }, + { + "epoch": 0.5988029925187033, + "grad_norm": 0.44854864065510336, + "learning_rate": 1.8302265205438944e-05, + "loss": 0.8593, + "step": 12006 + }, + { + "epoch": 0.5988528678304239, + "grad_norm": 0.4467141187313527, + "learning_rate": 1.8298374469948953e-05, + "loss": 0.8983, + "step": 12007 + }, + { + "epoch": 0.5989027431421446, + "grad_norm": 0.6412611679272393, + "learning_rate": 1.829448390933489e-05, + "loss": 0.8908, + "step": 12008 + }, + { + "epoch": 0.5989526184538654, + "grad_norm": 0.4181551285305629, + "learning_rate": 1.8290593523698257e-05, + "loss": 0.844, + "step": 12009 + }, + { + "epoch": 0.599002493765586, + "grad_norm": 0.5288515408443658, + "learning_rate": 1.828670331314058e-05, + "loss": 0.8363, + "step": 12010 + }, + { + "epoch": 0.5990523690773067, + "grad_norm": 0.45374085094947647, + "learning_rate": 1.8282813277763387e-05, + "loss": 0.8928, + "step": 12011 + }, + { + "epoch": 0.5991022443890275, + "grad_norm": 0.5017603050575917, + "learning_rate": 1.827892341766817e-05, + "loss": 0.8526, + "step": 12012 + }, + { + "epoch": 0.5991521197007481, + "grad_norm": 0.3783144681449949, + "learning_rate": 1.8275033732956438e-05, + "loss": 0.8322, + "step": 12013 + }, + { + "epoch": 0.5992019950124688, + "grad_norm": 0.4496405104904821, + "learning_rate": 1.8271144223729684e-05, + "loss": 0.8644, + "step": 12014 + }, + { + "epoch": 0.5992518703241895, + "grad_norm": 0.42939270720607087, + "learning_rate": 1.8267254890089426e-05, + "loss": 0.8673, + "step": 12015 + }, + { + "epoch": 0.5993017456359102, + "grad_norm": 0.5321552281464477, + "learning_rate": 1.826336573213713e-05, + "loss": 0.8584, + "step": 12016 + }, + { + "epoch": 0.599351620947631, + "grad_norm": 0.4434222042484716, + "learning_rate": 1.8259476749974282e-05, + "loss": 0.8438, + "step": 12017 + }, + { + "epoch": 0.5994014962593516, + "grad_norm": 0.4365219831743195, + "learning_rate": 1.8255587943702378e-05, + "loss": 0.8667, + "step": 12018 + }, + { + "epoch": 0.5994513715710723, + "grad_norm": 0.43921120736740454, + "learning_rate": 1.82516993134229e-05, + "loss": 0.8379, + "step": 12019 + }, + { + "epoch": 0.599501246882793, + "grad_norm": 0.405351099671235, + "learning_rate": 1.8247810859237292e-05, + "loss": 0.8602, + "step": 12020 + }, + { + "epoch": 0.5995511221945137, + "grad_norm": 0.4834609192455755, + "learning_rate": 1.8243922581247043e-05, + "loss": 0.9057, + "step": 12021 + }, + { + "epoch": 0.5996009975062344, + "grad_norm": 0.4362467075498559, + "learning_rate": 1.824003447955361e-05, + "loss": 0.8901, + "step": 12022 + }, + { + "epoch": 0.5996508728179551, + "grad_norm": 0.7764484359049265, + "learning_rate": 1.8236146554258458e-05, + "loss": 0.8358, + "step": 12023 + }, + { + "epoch": 0.5997007481296758, + "grad_norm": 0.4133151677653409, + "learning_rate": 1.8232258805463027e-05, + "loss": 0.8329, + "step": 12024 + }, + { + "epoch": 0.5997506234413965, + "grad_norm": 0.5980990368164303, + "learning_rate": 1.822837123326877e-05, + "loss": 0.8516, + "step": 12025 + }, + { + "epoch": 0.5998004987531173, + "grad_norm": 0.3961833911748461, + "learning_rate": 1.8224483837777145e-05, + "loss": 0.8457, + "step": 12026 + }, + { + "epoch": 0.5998503740648379, + "grad_norm": 0.412411526655035, + "learning_rate": 1.8220596619089576e-05, + "loss": 0.8632, + "step": 12027 + }, + { + "epoch": 0.5999002493765586, + "grad_norm": 0.4513177665577633, + "learning_rate": 1.8216709577307512e-05, + "loss": 0.8711, + "step": 12028 + }, + { + "epoch": 0.5999501246882794, + "grad_norm": 0.40220740016068657, + "learning_rate": 1.8212822712532365e-05, + "loss": 0.9002, + "step": 12029 + }, + { + "epoch": 0.6, + "grad_norm": 0.48802916925634027, + "learning_rate": 1.820893602486558e-05, + "loss": 0.9068, + "step": 12030 + }, + { + "epoch": 0.6000498753117207, + "grad_norm": 0.415511406818023, + "learning_rate": 1.820504951440856e-05, + "loss": 0.8297, + "step": 12031 + }, + { + "epoch": 0.6000997506234413, + "grad_norm": 1.6585558581191124, + "learning_rate": 1.820116318126274e-05, + "loss": 0.8424, + "step": 12032 + }, + { + "epoch": 0.6001496259351621, + "grad_norm": 0.44529159079644764, + "learning_rate": 1.8197277025529525e-05, + "loss": 0.8744, + "step": 12033 + }, + { + "epoch": 0.6001995012468828, + "grad_norm": 0.4712623919265698, + "learning_rate": 1.819339104731032e-05, + "loss": 0.8797, + "step": 12034 + }, + { + "epoch": 0.6002493765586034, + "grad_norm": 0.4573131551419617, + "learning_rate": 1.8189505246706528e-05, + "loss": 0.8472, + "step": 12035 + }, + { + "epoch": 0.6002992518703242, + "grad_norm": 0.3968268815440484, + "learning_rate": 1.8185619623819548e-05, + "loss": 0.845, + "step": 12036 + }, + { + "epoch": 0.6003491271820449, + "grad_norm": 0.39845684594252234, + "learning_rate": 1.818173417875078e-05, + "loss": 0.8394, + "step": 12037 + }, + { + "epoch": 0.6003990024937655, + "grad_norm": 0.45813127984086593, + "learning_rate": 1.8177848911601603e-05, + "loss": 0.8937, + "step": 12038 + }, + { + "epoch": 0.6004488778054863, + "grad_norm": 0.5208906241148991, + "learning_rate": 1.81739638224734e-05, + "loss": 0.8881, + "step": 12039 + }, + { + "epoch": 0.600498753117207, + "grad_norm": 0.4438218390717081, + "learning_rate": 1.8170078911467557e-05, + "loss": 0.8493, + "step": 12040 + }, + { + "epoch": 0.6005486284289276, + "grad_norm": 0.38481660110150684, + "learning_rate": 1.8166194178685465e-05, + "loss": 0.8666, + "step": 12041 + }, + { + "epoch": 0.6005985037406484, + "grad_norm": 0.5035010559555354, + "learning_rate": 1.8162309624228464e-05, + "loss": 0.8834, + "step": 12042 + }, + { + "epoch": 0.6006483790523691, + "grad_norm": 0.4738464004516392, + "learning_rate": 1.815842524819793e-05, + "loss": 0.8747, + "step": 12043 + }, + { + "epoch": 0.6006982543640897, + "grad_norm": 0.47434276514495427, + "learning_rate": 1.815454105069523e-05, + "loss": 0.8868, + "step": 12044 + }, + { + "epoch": 0.6007481296758105, + "grad_norm": 0.5186010392035324, + "learning_rate": 1.8150657031821725e-05, + "loss": 0.8631, + "step": 12045 + }, + { + "epoch": 0.6007980049875312, + "grad_norm": 0.418917129754838, + "learning_rate": 1.814677319167875e-05, + "loss": 0.8933, + "step": 12046 + }, + { + "epoch": 0.6008478802992518, + "grad_norm": 0.4191462830133615, + "learning_rate": 1.8142889530367663e-05, + "loss": 0.8389, + "step": 12047 + }, + { + "epoch": 0.6008977556109726, + "grad_norm": 0.4938227689413998, + "learning_rate": 1.8139006047989814e-05, + "loss": 0.8697, + "step": 12048 + }, + { + "epoch": 0.6009476309226933, + "grad_norm": 0.4390443154400737, + "learning_rate": 1.813512274464652e-05, + "loss": 0.8421, + "step": 12049 + }, + { + "epoch": 0.600997506234414, + "grad_norm": 0.5359401606332173, + "learning_rate": 1.813123962043913e-05, + "loss": 0.8752, + "step": 12050 + }, + { + "epoch": 0.6010473815461347, + "grad_norm": 0.5199409480424968, + "learning_rate": 1.8127356675468966e-05, + "loss": 0.8458, + "step": 12051 + }, + { + "epoch": 0.6010972568578553, + "grad_norm": 0.6599526343091004, + "learning_rate": 1.8123473909837364e-05, + "loss": 0.8763, + "step": 12052 + }, + { + "epoch": 0.601147132169576, + "grad_norm": 0.4153376110599817, + "learning_rate": 1.8119591323645618e-05, + "loss": 0.851, + "step": 12053 + }, + { + "epoch": 0.6011970074812968, + "grad_norm": 0.4285237370779978, + "learning_rate": 1.8115708916995062e-05, + "loss": 0.881, + "step": 12054 + }, + { + "epoch": 0.6012468827930174, + "grad_norm": 0.5094078108435771, + "learning_rate": 1.8111826689987e-05, + "loss": 0.8295, + "step": 12055 + }, + { + "epoch": 0.6012967581047381, + "grad_norm": 0.4260335770209865, + "learning_rate": 1.810794464272275e-05, + "loss": 0.842, + "step": 12056 + }, + { + "epoch": 0.6013466334164589, + "grad_norm": 0.5001270283977844, + "learning_rate": 1.8104062775303586e-05, + "loss": 0.8393, + "step": 12057 + }, + { + "epoch": 0.6013965087281795, + "grad_norm": 0.4891575539412207, + "learning_rate": 1.810018108783082e-05, + "loss": 0.9057, + "step": 12058 + }, + { + "epoch": 0.6014463840399003, + "grad_norm": 0.39031273178415815, + "learning_rate": 1.8096299580405744e-05, + "loss": 0.8527, + "step": 12059 + }, + { + "epoch": 0.601496259351621, + "grad_norm": 0.6438557311886524, + "learning_rate": 1.8092418253129646e-05, + "loss": 0.8661, + "step": 12060 + }, + { + "epoch": 0.6015461346633416, + "grad_norm": 0.40425515951530167, + "learning_rate": 1.808853710610379e-05, + "loss": 0.8561, + "step": 12061 + }, + { + "epoch": 0.6015960099750624, + "grad_norm": 0.4778405904944496, + "learning_rate": 1.8084656139429466e-05, + "loss": 0.9006, + "step": 12062 + }, + { + "epoch": 0.6016458852867831, + "grad_norm": 0.39386280936344464, + "learning_rate": 1.8080775353207955e-05, + "loss": 0.8572, + "step": 12063 + }, + { + "epoch": 0.6016957605985037, + "grad_norm": 0.5066623089751267, + "learning_rate": 1.807689474754051e-05, + "loss": 0.8305, + "step": 12064 + }, + { + "epoch": 0.6017456359102245, + "grad_norm": 0.4157436775361061, + "learning_rate": 1.8073014322528397e-05, + "loss": 0.8083, + "step": 12065 + }, + { + "epoch": 0.6017955112219452, + "grad_norm": 0.46393465588816324, + "learning_rate": 1.8069134078272874e-05, + "loss": 0.8728, + "step": 12066 + }, + { + "epoch": 0.6018453865336658, + "grad_norm": 0.42346522635428424, + "learning_rate": 1.80652540148752e-05, + "loss": 0.8952, + "step": 12067 + }, + { + "epoch": 0.6018952618453866, + "grad_norm": 0.46728897511821116, + "learning_rate": 1.8061374132436612e-05, + "loss": 0.8349, + "step": 12068 + }, + { + "epoch": 0.6019451371571072, + "grad_norm": 1.0970349196093612, + "learning_rate": 1.8057494431058365e-05, + "loss": 0.9075, + "step": 12069 + }, + { + "epoch": 0.6019950124688279, + "grad_norm": 0.47386032359862984, + "learning_rate": 1.8053614910841692e-05, + "loss": 0.8648, + "step": 12070 + }, + { + "epoch": 0.6020448877805487, + "grad_norm": 0.45774704219695245, + "learning_rate": 1.8049735571887833e-05, + "loss": 0.9108, + "step": 12071 + }, + { + "epoch": 0.6020947630922693, + "grad_norm": 0.4756176084323484, + "learning_rate": 1.804585641429801e-05, + "loss": 0.8933, + "step": 12072 + }, + { + "epoch": 0.60214463840399, + "grad_norm": 0.45217019337631287, + "learning_rate": 1.804197743817345e-05, + "loss": 0.8544, + "step": 12073 + }, + { + "epoch": 0.6021945137157108, + "grad_norm": 1.3258553160068915, + "learning_rate": 1.8038098643615387e-05, + "loss": 0.8347, + "step": 12074 + }, + { + "epoch": 0.6022443890274314, + "grad_norm": 0.41796400524807215, + "learning_rate": 1.803422003072501e-05, + "loss": 0.8248, + "step": 12075 + }, + { + "epoch": 0.6022942643391521, + "grad_norm": 0.4802893924168688, + "learning_rate": 1.8030341599603545e-05, + "loss": 0.8684, + "step": 12076 + }, + { + "epoch": 0.6023441396508729, + "grad_norm": 0.43762040337331715, + "learning_rate": 1.80264633503522e-05, + "loss": 0.869, + "step": 12077 + }, + { + "epoch": 0.6023940149625935, + "grad_norm": 0.5120100081442593, + "learning_rate": 1.802258528307218e-05, + "loss": 0.8747, + "step": 12078 + }, + { + "epoch": 0.6024438902743142, + "grad_norm": 0.49559516918981517, + "learning_rate": 1.8018707397864664e-05, + "loss": 0.8669, + "step": 12079 + }, + { + "epoch": 0.602493765586035, + "grad_norm": 0.5166180843353922, + "learning_rate": 1.8014829694830852e-05, + "loss": 0.8314, + "step": 12080 + }, + { + "epoch": 0.6025436408977556, + "grad_norm": 0.49808950767583376, + "learning_rate": 1.8010952174071938e-05, + "loss": 0.8743, + "step": 12081 + }, + { + "epoch": 0.6025935162094763, + "grad_norm": 0.4486858217108725, + "learning_rate": 1.8007074835689107e-05, + "loss": 0.8665, + "step": 12082 + }, + { + "epoch": 0.6026433915211971, + "grad_norm": 0.4469311500257584, + "learning_rate": 1.800319767978352e-05, + "loss": 0.9539, + "step": 12083 + }, + { + "epoch": 0.6026932668329177, + "grad_norm": 0.4547596662512133, + "learning_rate": 1.7999320706456358e-05, + "loss": 0.8868, + "step": 12084 + }, + { + "epoch": 0.6027431421446384, + "grad_norm": 0.5260917924273517, + "learning_rate": 1.7995443915808795e-05, + "loss": 0.8778, + "step": 12085 + }, + { + "epoch": 0.602793017456359, + "grad_norm": 0.5336291215501507, + "learning_rate": 1.7991567307941984e-05, + "loss": 0.8665, + "step": 12086 + }, + { + "epoch": 0.6028428927680798, + "grad_norm": 0.38642141571925814, + "learning_rate": 1.7987690882957085e-05, + "loss": 0.8466, + "step": 12087 + }, + { + "epoch": 0.6028927680798005, + "grad_norm": 0.9796401419971206, + "learning_rate": 1.7983814640955256e-05, + "loss": 0.8726, + "step": 12088 + }, + { + "epoch": 0.6029426433915211, + "grad_norm": 0.6383069637042438, + "learning_rate": 1.7979938582037647e-05, + "loss": 0.8579, + "step": 12089 + }, + { + "epoch": 0.6029925187032419, + "grad_norm": 0.6889332131380548, + "learning_rate": 1.7976062706305397e-05, + "loss": 0.864, + "step": 12090 + }, + { + "epoch": 0.6030423940149626, + "grad_norm": 0.5017448740790657, + "learning_rate": 1.7972187013859643e-05, + "loss": 0.8505, + "step": 12091 + }, + { + "epoch": 0.6030922693266833, + "grad_norm": 0.4547388802899136, + "learning_rate": 1.796831150480152e-05, + "loss": 0.8972, + "step": 12092 + }, + { + "epoch": 0.603142144638404, + "grad_norm": 0.4458581321727418, + "learning_rate": 1.796443617923218e-05, + "loss": 0.8508, + "step": 12093 + }, + { + "epoch": 0.6031920199501247, + "grad_norm": 0.40647109930681546, + "learning_rate": 1.7960561037252714e-05, + "loss": 0.8667, + "step": 12094 + }, + { + "epoch": 0.6032418952618454, + "grad_norm": 0.44809509166882766, + "learning_rate": 1.795668607896426e-05, + "loss": 0.8763, + "step": 12095 + }, + { + "epoch": 0.6032917705735661, + "grad_norm": 0.4813300147339168, + "learning_rate": 1.7952811304467936e-05, + "loss": 0.8715, + "step": 12096 + }, + { + "epoch": 0.6033416458852868, + "grad_norm": 0.4667409711035351, + "learning_rate": 1.7948936713864842e-05, + "loss": 0.8831, + "step": 12097 + }, + { + "epoch": 0.6033915211970075, + "grad_norm": 0.407496482401285, + "learning_rate": 1.7945062307256087e-05, + "loss": 0.8725, + "step": 12098 + }, + { + "epoch": 0.6034413965087282, + "grad_norm": 0.40854954004135363, + "learning_rate": 1.794118808474277e-05, + "loss": 0.8435, + "step": 12099 + }, + { + "epoch": 0.6034912718204489, + "grad_norm": 0.7675522361904554, + "learning_rate": 1.7937314046426e-05, + "loss": 0.8834, + "step": 12100 + }, + { + "epoch": 0.6035411471321696, + "grad_norm": 0.418510380043615, + "learning_rate": 1.793344019240686e-05, + "loss": 0.9206, + "step": 12101 + }, + { + "epoch": 0.6035910224438903, + "grad_norm": 0.4617375151674838, + "learning_rate": 1.792956652278643e-05, + "loss": 0.8706, + "step": 12102 + }, + { + "epoch": 0.603640897755611, + "grad_norm": 0.953726256797339, + "learning_rate": 1.7925693037665798e-05, + "loss": 0.85, + "step": 12103 + }, + { + "epoch": 0.6036907730673317, + "grad_norm": 0.6511999971560681, + "learning_rate": 1.792181973714604e-05, + "loss": 0.8763, + "step": 12104 + }, + { + "epoch": 0.6037406483790524, + "grad_norm": 0.525999920180638, + "learning_rate": 1.7917946621328227e-05, + "loss": 0.8682, + "step": 12105 + }, + { + "epoch": 0.603790523690773, + "grad_norm": 0.3721181527308847, + "learning_rate": 1.7914073690313432e-05, + "loss": 0.8281, + "step": 12106 + }, + { + "epoch": 0.6038403990024938, + "grad_norm": 0.48836260264665815, + "learning_rate": 1.7910200944202714e-05, + "loss": 0.8389, + "step": 12107 + }, + { + "epoch": 0.6038902743142145, + "grad_norm": 0.4594376878691827, + "learning_rate": 1.7906328383097123e-05, + "loss": 0.8796, + "step": 12108 + }, + { + "epoch": 0.6039401496259351, + "grad_norm": 0.5369123756640843, + "learning_rate": 1.790245600709772e-05, + "loss": 0.8625, + "step": 12109 + }, + { + "epoch": 0.6039900249376559, + "grad_norm": 0.47045517335171894, + "learning_rate": 1.7898583816305548e-05, + "loss": 0.8924, + "step": 12110 + }, + { + "epoch": 0.6040399002493766, + "grad_norm": 0.4538017873334172, + "learning_rate": 1.7894711810821662e-05, + "loss": 0.8425, + "step": 12111 + }, + { + "epoch": 0.6040897755610972, + "grad_norm": 0.43370739980629575, + "learning_rate": 1.7890839990747084e-05, + "loss": 0.9049, + "step": 12112 + }, + { + "epoch": 0.604139650872818, + "grad_norm": 0.5719522497860338, + "learning_rate": 1.7886968356182856e-05, + "loss": 0.8638, + "step": 12113 + }, + { + "epoch": 0.6041895261845387, + "grad_norm": 0.7575520714226097, + "learning_rate": 1.7883096907230004e-05, + "loss": 0.8693, + "step": 12114 + }, + { + "epoch": 0.6042394014962593, + "grad_norm": 0.4691347675995166, + "learning_rate": 1.7879225643989563e-05, + "loss": 0.9035, + "step": 12115 + }, + { + "epoch": 0.6042892768079801, + "grad_norm": 0.4650425311302108, + "learning_rate": 1.7875354566562533e-05, + "loss": 0.8633, + "step": 12116 + }, + { + "epoch": 0.6043391521197008, + "grad_norm": 0.5740983883377027, + "learning_rate": 1.7871483675049934e-05, + "loss": 0.8569, + "step": 12117 + }, + { + "epoch": 0.6043890274314214, + "grad_norm": 0.4367488753224557, + "learning_rate": 1.7867612969552795e-05, + "loss": 0.8751, + "step": 12118 + }, + { + "epoch": 0.6044389027431422, + "grad_norm": 0.4204337270423761, + "learning_rate": 1.7863742450172087e-05, + "loss": 0.9182, + "step": 12119 + }, + { + "epoch": 0.6044887780548629, + "grad_norm": 0.46489794447353594, + "learning_rate": 1.7859872117008828e-05, + "loss": 0.8703, + "step": 12120 + }, + { + "epoch": 0.6045386533665835, + "grad_norm": 0.5141255972295056, + "learning_rate": 1.7856001970164012e-05, + "loss": 0.8111, + "step": 12121 + }, + { + "epoch": 0.6045885286783043, + "grad_norm": 0.4355097993772278, + "learning_rate": 1.7852132009738635e-05, + "loss": 0.8488, + "step": 12122 + }, + { + "epoch": 0.6046384039900249, + "grad_norm": 0.4321843011724765, + "learning_rate": 1.7848262235833664e-05, + "loss": 0.8484, + "step": 12123 + }, + { + "epoch": 0.6046882793017456, + "grad_norm": 0.5956781007181559, + "learning_rate": 1.784439264855009e-05, + "loss": 0.8921, + "step": 12124 + }, + { + "epoch": 0.6047381546134664, + "grad_norm": 0.5445240108598081, + "learning_rate": 1.7840523247988888e-05, + "loss": 0.8784, + "step": 12125 + }, + { + "epoch": 0.604788029925187, + "grad_norm": 0.6160617023990552, + "learning_rate": 1.7836654034251037e-05, + "loss": 0.8674, + "step": 12126 + }, + { + "epoch": 0.6048379052369077, + "grad_norm": 0.44205056066604453, + "learning_rate": 1.7832785007437483e-05, + "loss": 0.8748, + "step": 12127 + }, + { + "epoch": 0.6048877805486285, + "grad_norm": 0.6056635778607631, + "learning_rate": 1.7828916167649194e-05, + "loss": 0.8403, + "step": 12128 + }, + { + "epoch": 0.6049376558603491, + "grad_norm": 0.3977140566549635, + "learning_rate": 1.7825047514987133e-05, + "loss": 0.843, + "step": 12129 + }, + { + "epoch": 0.6049875311720698, + "grad_norm": 0.39828700207693235, + "learning_rate": 1.7821179049552255e-05, + "loss": 0.8654, + "step": 12130 + }, + { + "epoch": 0.6050374064837906, + "grad_norm": 0.5839717366117319, + "learning_rate": 1.781731077144549e-05, + "loss": 0.8417, + "step": 12131 + }, + { + "epoch": 0.6050872817955112, + "grad_norm": 0.6920169599173819, + "learning_rate": 1.781344268076778e-05, + "loss": 0.87, + "step": 12132 + }, + { + "epoch": 0.6051371571072319, + "grad_norm": 0.5397512317769931, + "learning_rate": 1.780957477762008e-05, + "loss": 0.892, + "step": 12133 + }, + { + "epoch": 0.6051870324189527, + "grad_norm": 0.4380080142526755, + "learning_rate": 1.7805707062103294e-05, + "loss": 0.8612, + "step": 12134 + }, + { + "epoch": 0.6052369077306733, + "grad_norm": 0.42365385145968093, + "learning_rate": 1.7801839534318364e-05, + "loss": 0.8778, + "step": 12135 + }, + { + "epoch": 0.605286783042394, + "grad_norm": 0.48745006909442146, + "learning_rate": 1.7797972194366212e-05, + "loss": 0.8588, + "step": 12136 + }, + { + "epoch": 0.6053366583541148, + "grad_norm": 0.5328232609760216, + "learning_rate": 1.7794105042347758e-05, + "loss": 0.8659, + "step": 12137 + }, + { + "epoch": 0.6053865336658354, + "grad_norm": 0.439234122596014, + "learning_rate": 1.779023807836391e-05, + "loss": 0.8423, + "step": 12138 + }, + { + "epoch": 0.6054364089775561, + "grad_norm": 0.5510068434174191, + "learning_rate": 1.7786371302515564e-05, + "loss": 0.8186, + "step": 12139 + }, + { + "epoch": 0.6054862842892768, + "grad_norm": 0.5202454430703396, + "learning_rate": 1.778250471490363e-05, + "loss": 0.8601, + "step": 12140 + }, + { + "epoch": 0.6055361596009975, + "grad_norm": 0.5811612457106242, + "learning_rate": 1.7778638315629013e-05, + "loss": 0.8847, + "step": 12141 + }, + { + "epoch": 0.6055860349127182, + "grad_norm": 0.4752841766125355, + "learning_rate": 1.777477210479259e-05, + "loss": 0.8448, + "step": 12142 + }, + { + "epoch": 0.6056359102244389, + "grad_norm": 0.4222679947909463, + "learning_rate": 1.777090608249526e-05, + "loss": 0.8999, + "step": 12143 + }, + { + "epoch": 0.6056857855361596, + "grad_norm": 0.43325528990104833, + "learning_rate": 1.77670402488379e-05, + "loss": 0.8519, + "step": 12144 + }, + { + "epoch": 0.6057356608478803, + "grad_norm": 0.5066222994488139, + "learning_rate": 1.7763174603921385e-05, + "loss": 0.8604, + "step": 12145 + }, + { + "epoch": 0.605785536159601, + "grad_norm": 0.4604451676287901, + "learning_rate": 1.7759309147846588e-05, + "loss": 0.8847, + "step": 12146 + }, + { + "epoch": 0.6058354114713217, + "grad_norm": 0.4436759936985546, + "learning_rate": 1.7755443880714383e-05, + "loss": 0.8722, + "step": 12147 + }, + { + "epoch": 0.6058852867830424, + "grad_norm": 0.4522188861442342, + "learning_rate": 1.7751578802625633e-05, + "loss": 0.8708, + "step": 12148 + }, + { + "epoch": 0.6059351620947631, + "grad_norm": 0.47982862122026354, + "learning_rate": 1.774771391368118e-05, + "loss": 0.9094, + "step": 12149 + }, + { + "epoch": 0.6059850374064838, + "grad_norm": 0.44685971449879586, + "learning_rate": 1.774384921398189e-05, + "loss": 0.856, + "step": 12150 + }, + { + "epoch": 0.6060349127182045, + "grad_norm": 0.46758312606339764, + "learning_rate": 1.7739984703628605e-05, + "loss": 0.9408, + "step": 12151 + }, + { + "epoch": 0.6060847880299252, + "grad_norm": 0.4190187468019763, + "learning_rate": 1.773612038272218e-05, + "loss": 0.8815, + "step": 12152 + }, + { + "epoch": 0.6061346633416459, + "grad_norm": 0.41490276221254074, + "learning_rate": 1.7732256251363433e-05, + "loss": 0.8492, + "step": 12153 + }, + { + "epoch": 0.6061845386533666, + "grad_norm": 0.576580498383357, + "learning_rate": 1.7728392309653205e-05, + "loss": 0.8953, + "step": 12154 + }, + { + "epoch": 0.6062344139650873, + "grad_norm": 0.5396412304439121, + "learning_rate": 1.772452855769234e-05, + "loss": 0.8991, + "step": 12155 + }, + { + "epoch": 0.606284289276808, + "grad_norm": 0.4364264854931207, + "learning_rate": 1.772066499558163e-05, + "loss": 0.8495, + "step": 12156 + }, + { + "epoch": 0.6063341645885286, + "grad_norm": 0.5631279867235458, + "learning_rate": 1.7716801623421913e-05, + "loss": 0.8842, + "step": 12157 + }, + { + "epoch": 0.6063840399002494, + "grad_norm": 0.6333352118136483, + "learning_rate": 1.7712938441313996e-05, + "loss": 0.8801, + "step": 12158 + }, + { + "epoch": 0.6064339152119701, + "grad_norm": 0.5257976663706456, + "learning_rate": 1.7709075449358705e-05, + "loss": 0.8737, + "step": 12159 + }, + { + "epoch": 0.6064837905236907, + "grad_norm": 0.4987475796360328, + "learning_rate": 1.770521264765681e-05, + "loss": 0.8367, + "step": 12160 + }, + { + "epoch": 0.6065336658354115, + "grad_norm": 1.0354414023076104, + "learning_rate": 1.7701350036309127e-05, + "loss": 0.8803, + "step": 12161 + }, + { + "epoch": 0.6065835411471322, + "grad_norm": 0.4125523108958769, + "learning_rate": 1.769748761541645e-05, + "loss": 0.8209, + "step": 12162 + }, + { + "epoch": 0.6066334164588528, + "grad_norm": 0.5730880290359099, + "learning_rate": 1.7693625385079577e-05, + "loss": 0.8388, + "step": 12163 + }, + { + "epoch": 0.6066832917705736, + "grad_norm": 0.44582456703451784, + "learning_rate": 1.7689763345399267e-05, + "loss": 0.8657, + "step": 12164 + }, + { + "epoch": 0.6067331670822943, + "grad_norm": 0.4283663609572766, + "learning_rate": 1.7685901496476313e-05, + "loss": 0.855, + "step": 12165 + }, + { + "epoch": 0.6067830423940149, + "grad_norm": 0.7418616991836482, + "learning_rate": 1.7682039838411495e-05, + "loss": 0.8801, + "step": 12166 + }, + { + "epoch": 0.6068329177057357, + "grad_norm": 0.4608674474725758, + "learning_rate": 1.7678178371305562e-05, + "loss": 0.8465, + "step": 12167 + }, + { + "epoch": 0.6068827930174564, + "grad_norm": 0.5399368441498806, + "learning_rate": 1.7674317095259288e-05, + "loss": 0.8669, + "step": 12168 + }, + { + "epoch": 0.606932668329177, + "grad_norm": 0.47957572247342334, + "learning_rate": 1.7670456010373432e-05, + "loss": 0.8707, + "step": 12169 + }, + { + "epoch": 0.6069825436408978, + "grad_norm": 0.4547495229530763, + "learning_rate": 1.7666595116748754e-05, + "loss": 0.8637, + "step": 12170 + }, + { + "epoch": 0.6070324189526185, + "grad_norm": 0.44510474507130193, + "learning_rate": 1.7662734414485987e-05, + "loss": 0.8816, + "step": 12171 + }, + { + "epoch": 0.6070822942643391, + "grad_norm": 0.43694310049310897, + "learning_rate": 1.765887390368588e-05, + "loss": 0.8378, + "step": 12172 + }, + { + "epoch": 0.6071321695760599, + "grad_norm": 0.43339627030050226, + "learning_rate": 1.7655013584449174e-05, + "loss": 0.8807, + "step": 12173 + }, + { + "epoch": 0.6071820448877806, + "grad_norm": 0.42806364023304705, + "learning_rate": 1.7651153456876605e-05, + "loss": 0.8788, + "step": 12174 + }, + { + "epoch": 0.6072319201995012, + "grad_norm": 0.45638264993663746, + "learning_rate": 1.7647293521068896e-05, + "loss": 0.8775, + "step": 12175 + }, + { + "epoch": 0.607281795511222, + "grad_norm": 0.5225977620677458, + "learning_rate": 1.764343377712677e-05, + "loss": 0.899, + "step": 12176 + }, + { + "epoch": 0.6073316708229426, + "grad_norm": 0.4470949183436369, + "learning_rate": 1.763957422515095e-05, + "loss": 0.8572, + "step": 12177 + }, + { + "epoch": 0.6073815461346633, + "grad_norm": 0.40734170200885333, + "learning_rate": 1.7635714865242146e-05, + "loss": 0.8307, + "step": 12178 + }, + { + "epoch": 0.6074314214463841, + "grad_norm": 0.4662566685379259, + "learning_rate": 1.763185569750106e-05, + "loss": 0.8792, + "step": 12179 + }, + { + "epoch": 0.6074812967581047, + "grad_norm": 0.46523543896281944, + "learning_rate": 1.7627996722028415e-05, + "loss": 0.8714, + "step": 12180 + }, + { + "epoch": 0.6075311720698254, + "grad_norm": 0.6020385217064155, + "learning_rate": 1.762413793892489e-05, + "loss": 0.8735, + "step": 12181 + }, + { + "epoch": 0.6075810473815462, + "grad_norm": 0.4727849977014933, + "learning_rate": 1.7620279348291183e-05, + "loss": 0.855, + "step": 12182 + }, + { + "epoch": 0.6076309226932668, + "grad_norm": 0.42428320974600237, + "learning_rate": 1.761642095022798e-05, + "loss": 0.841, + "step": 12183 + }, + { + "epoch": 0.6076807980049875, + "grad_norm": 0.4749488456697373, + "learning_rate": 1.7612562744835977e-05, + "loss": 0.8649, + "step": 12184 + }, + { + "epoch": 0.6077306733167083, + "grad_norm": 0.39575963905557754, + "learning_rate": 1.7608704732215843e-05, + "loss": 0.8393, + "step": 12185 + }, + { + "epoch": 0.6077805486284289, + "grad_norm": 0.4737256622408286, + "learning_rate": 1.7604846912468242e-05, + "loss": 0.8486, + "step": 12186 + }, + { + "epoch": 0.6078304239401496, + "grad_norm": 0.5100070602931501, + "learning_rate": 1.7600989285693852e-05, + "loss": 0.8381, + "step": 12187 + }, + { + "epoch": 0.6078802992518704, + "grad_norm": 0.43663931271025264, + "learning_rate": 1.7597131851993338e-05, + "loss": 0.8549, + "step": 12188 + }, + { + "epoch": 0.607930174563591, + "grad_norm": 0.5147002360510753, + "learning_rate": 1.7593274611467364e-05, + "loss": 0.8209, + "step": 12189 + }, + { + "epoch": 0.6079800498753117, + "grad_norm": 0.45359909353517913, + "learning_rate": 1.7589417564216562e-05, + "loss": 0.8847, + "step": 12190 + }, + { + "epoch": 0.6080299251870325, + "grad_norm": 0.39560417761368427, + "learning_rate": 1.7585560710341596e-05, + "loss": 0.8044, + "step": 12191 + }, + { + "epoch": 0.6080798004987531, + "grad_norm": 0.46884222431158473, + "learning_rate": 1.758170404994311e-05, + "loss": 0.8702, + "step": 12192 + }, + { + "epoch": 0.6081296758104738, + "grad_norm": 0.4765609148757655, + "learning_rate": 1.757784758312173e-05, + "loss": 0.8472, + "step": 12193 + }, + { + "epoch": 0.6081795511221945, + "grad_norm": 0.46406965131590916, + "learning_rate": 1.7573991309978097e-05, + "loss": 0.8929, + "step": 12194 + }, + { + "epoch": 0.6082294264339152, + "grad_norm": 0.4330649790981311, + "learning_rate": 1.7570135230612837e-05, + "loss": 0.8633, + "step": 12195 + }, + { + "epoch": 0.6082793017456359, + "grad_norm": 0.4798364746357766, + "learning_rate": 1.756627934512658e-05, + "loss": 0.8638, + "step": 12196 + }, + { + "epoch": 0.6083291770573566, + "grad_norm": 0.4258032570913153, + "learning_rate": 1.756242365361993e-05, + "loss": 0.8885, + "step": 12197 + }, + { + "epoch": 0.6083790523690773, + "grad_norm": 0.45870013539044624, + "learning_rate": 1.7558568156193503e-05, + "loss": 0.878, + "step": 12198 + }, + { + "epoch": 0.608428927680798, + "grad_norm": 0.4534910676998981, + "learning_rate": 1.7554712852947913e-05, + "loss": 0.8572, + "step": 12199 + }, + { + "epoch": 0.6084788029925187, + "grad_norm": 0.49287217666677696, + "learning_rate": 1.7550857743983767e-05, + "loss": 0.8638, + "step": 12200 + }, + { + "epoch": 0.6085286783042394, + "grad_norm": 0.38957386077789874, + "learning_rate": 1.7547002829401643e-05, + "loss": 0.8651, + "step": 12201 + }, + { + "epoch": 0.6085785536159601, + "grad_norm": 0.5512878084411903, + "learning_rate": 1.7543148109302147e-05, + "loss": 0.8909, + "step": 12202 + }, + { + "epoch": 0.6086284289276808, + "grad_norm": 0.7215124047042085, + "learning_rate": 1.7539293583785872e-05, + "loss": 0.8371, + "step": 12203 + }, + { + "epoch": 0.6086783042394015, + "grad_norm": 0.7156520323483786, + "learning_rate": 1.7535439252953384e-05, + "loss": 0.8831, + "step": 12204 + }, + { + "epoch": 0.6087281795511222, + "grad_norm": 0.48320062088998333, + "learning_rate": 1.7531585116905267e-05, + "loss": 0.8815, + "step": 12205 + }, + { + "epoch": 0.6087780548628429, + "grad_norm": 0.4034956450903817, + "learning_rate": 1.7527731175742094e-05, + "loss": 0.8574, + "step": 12206 + }, + { + "epoch": 0.6088279301745636, + "grad_norm": 0.5106932458848988, + "learning_rate": 1.752387742956444e-05, + "loss": 0.8363, + "step": 12207 + }, + { + "epoch": 0.6088778054862843, + "grad_norm": 0.5141346723260185, + "learning_rate": 1.7520023878472853e-05, + "loss": 0.9067, + "step": 12208 + }, + { + "epoch": 0.608927680798005, + "grad_norm": 0.5648669366073544, + "learning_rate": 1.751617052256789e-05, + "loss": 0.9002, + "step": 12209 + }, + { + "epoch": 0.6089775561097257, + "grad_norm": 0.4162267165916359, + "learning_rate": 1.751231736195011e-05, + "loss": 0.8773, + "step": 12210 + }, + { + "epoch": 0.6090274314214463, + "grad_norm": 0.4323800660488397, + "learning_rate": 1.750846439672007e-05, + "loss": 0.8226, + "step": 12211 + }, + { + "epoch": 0.6090773067331671, + "grad_norm": 0.4399120638215751, + "learning_rate": 1.7504611626978285e-05, + "loss": 0.8635, + "step": 12212 + }, + { + "epoch": 0.6091271820448878, + "grad_norm": 0.43721018575810666, + "learning_rate": 1.7500759052825306e-05, + "loss": 0.8716, + "step": 12213 + }, + { + "epoch": 0.6091770573566084, + "grad_norm": 0.6518201723393945, + "learning_rate": 1.7496906674361667e-05, + "loss": 0.8812, + "step": 12214 + }, + { + "epoch": 0.6092269326683292, + "grad_norm": 0.4416666119196883, + "learning_rate": 1.7493054491687892e-05, + "loss": 0.8798, + "step": 12215 + }, + { + "epoch": 0.6092768079800499, + "grad_norm": 0.41022019884069033, + "learning_rate": 1.74892025049045e-05, + "loss": 0.843, + "step": 12216 + }, + { + "epoch": 0.6093266832917705, + "grad_norm": 0.4640455518378596, + "learning_rate": 1.7485350714112004e-05, + "loss": 0.9441, + "step": 12217 + }, + { + "epoch": 0.6093765586034913, + "grad_norm": 0.404006414528266, + "learning_rate": 1.7481499119410923e-05, + "loss": 0.8675, + "step": 12218 + }, + { + "epoch": 0.609426433915212, + "grad_norm": 0.38772038649384294, + "learning_rate": 1.7477647720901748e-05, + "loss": 0.8738, + "step": 12219 + }, + { + "epoch": 0.6094763092269326, + "grad_norm": 0.42923801399109646, + "learning_rate": 1.7473796518684994e-05, + "loss": 0.9015, + "step": 12220 + }, + { + "epoch": 0.6095261845386534, + "grad_norm": 0.3952538200822684, + "learning_rate": 1.7469945512861157e-05, + "loss": 0.8311, + "step": 12221 + }, + { + "epoch": 0.6095760598503741, + "grad_norm": 0.6124285158194208, + "learning_rate": 1.7466094703530718e-05, + "loss": 0.8628, + "step": 12222 + }, + { + "epoch": 0.6096259351620947, + "grad_norm": 0.4244825410898677, + "learning_rate": 1.7462244090794164e-05, + "loss": 0.9173, + "step": 12223 + }, + { + "epoch": 0.6096758104738155, + "grad_norm": 0.41935039863027174, + "learning_rate": 1.7458393674751978e-05, + "loss": 0.8267, + "step": 12224 + }, + { + "epoch": 0.6097256857855362, + "grad_norm": 2.7756133587968077, + "learning_rate": 1.745454345550464e-05, + "loss": 0.8635, + "step": 12225 + }, + { + "epoch": 0.6097755610972568, + "grad_norm": 0.4477069599462767, + "learning_rate": 1.7450693433152605e-05, + "loss": 0.859, + "step": 12226 + }, + { + "epoch": 0.6098254364089776, + "grad_norm": 0.5263029459735498, + "learning_rate": 1.744684360779635e-05, + "loss": 0.8882, + "step": 12227 + }, + { + "epoch": 0.6098753117206983, + "grad_norm": 0.5408121173894309, + "learning_rate": 1.7442993979536327e-05, + "loss": 0.8848, + "step": 12228 + }, + { + "epoch": 0.6099251870324189, + "grad_norm": 0.7753131757246069, + "learning_rate": 1.7439144548473e-05, + "loss": 0.8692, + "step": 12229 + }, + { + "epoch": 0.6099750623441397, + "grad_norm": 0.519239018867595, + "learning_rate": 1.743529531470681e-05, + "loss": 0.9126, + "step": 12230 + }, + { + "epoch": 0.6100249376558603, + "grad_norm": 0.4085657875131253, + "learning_rate": 1.7431446278338197e-05, + "loss": 0.8643, + "step": 12231 + }, + { + "epoch": 0.610074812967581, + "grad_norm": 0.5170837759756766, + "learning_rate": 1.7427597439467613e-05, + "loss": 0.8579, + "step": 12232 + }, + { + "epoch": 0.6101246882793018, + "grad_norm": 0.4326229908974037, + "learning_rate": 1.742374879819549e-05, + "loss": 0.8891, + "step": 12233 + }, + { + "epoch": 0.6101745635910224, + "grad_norm": 0.4943577524809467, + "learning_rate": 1.741990035462224e-05, + "loss": 0.8983, + "step": 12234 + }, + { + "epoch": 0.6102244389027431, + "grad_norm": 0.4615524016824987, + "learning_rate": 1.74160521088483e-05, + "loss": 0.8286, + "step": 12235 + }, + { + "epoch": 0.6102743142144639, + "grad_norm": 0.3525363842347118, + "learning_rate": 1.7412204060974087e-05, + "loss": 0.8381, + "step": 12236 + }, + { + "epoch": 0.6103241895261845, + "grad_norm": 0.42603121831540514, + "learning_rate": 1.7408356211100025e-05, + "loss": 0.8662, + "step": 12237 + }, + { + "epoch": 0.6103740648379052, + "grad_norm": 0.4450522055255115, + "learning_rate": 1.74045085593265e-05, + "loss": 0.8532, + "step": 12238 + }, + { + "epoch": 0.610423940149626, + "grad_norm": 0.5349852557375769, + "learning_rate": 1.7400661105753918e-05, + "loss": 0.8911, + "step": 12239 + }, + { + "epoch": 0.6104738154613466, + "grad_norm": 0.5797545220963239, + "learning_rate": 1.73968138504827e-05, + "loss": 0.9283, + "step": 12240 + }, + { + "epoch": 0.6105236907730673, + "grad_norm": 0.4223915085466525, + "learning_rate": 1.7392966793613206e-05, + "loss": 0.8593, + "step": 12241 + }, + { + "epoch": 0.6105735660847881, + "grad_norm": 0.3929314861426028, + "learning_rate": 1.7389119935245846e-05, + "loss": 0.8303, + "step": 12242 + }, + { + "epoch": 0.6106234413965087, + "grad_norm": 0.4625813050095203, + "learning_rate": 1.7385273275480994e-05, + "loss": 0.8402, + "step": 12243 + }, + { + "epoch": 0.6106733167082294, + "grad_norm": 0.42135947294639625, + "learning_rate": 1.738142681441903e-05, + "loss": 0.8729, + "step": 12244 + }, + { + "epoch": 0.6107231920199502, + "grad_norm": 0.5889293052460015, + "learning_rate": 1.7377580552160324e-05, + "loss": 0.8558, + "step": 12245 + }, + { + "epoch": 0.6107730673316708, + "grad_norm": 0.4533209736095452, + "learning_rate": 1.7373734488805237e-05, + "loss": 0.8548, + "step": 12246 + }, + { + "epoch": 0.6108229426433915, + "grad_norm": 0.4662119358785565, + "learning_rate": 1.7369888624454137e-05, + "loss": 0.8559, + "step": 12247 + }, + { + "epoch": 0.6108728179551122, + "grad_norm": 0.5196136379944823, + "learning_rate": 1.736604295920739e-05, + "loss": 0.8492, + "step": 12248 + }, + { + "epoch": 0.6109226932668329, + "grad_norm": 0.41034987712848653, + "learning_rate": 1.7362197493165323e-05, + "loss": 0.8522, + "step": 12249 + }, + { + "epoch": 0.6109725685785536, + "grad_norm": 0.5124866058119651, + "learning_rate": 1.7358352226428294e-05, + "loss": 0.9048, + "step": 12250 + }, + { + "epoch": 0.6110224438902743, + "grad_norm": 0.5000400514208148, + "learning_rate": 1.7354507159096647e-05, + "loss": 0.8607, + "step": 12251 + }, + { + "epoch": 0.611072319201995, + "grad_norm": 0.45853933184379353, + "learning_rate": 1.7350662291270713e-05, + "loss": 0.8434, + "step": 12252 + }, + { + "epoch": 0.6111221945137157, + "grad_norm": 0.47609102718427876, + "learning_rate": 1.7346817623050827e-05, + "loss": 0.8589, + "step": 12253 + }, + { + "epoch": 0.6111720698254364, + "grad_norm": 0.4597098169815779, + "learning_rate": 1.73429731545373e-05, + "loss": 0.8426, + "step": 12254 + }, + { + "epoch": 0.6112219451371571, + "grad_norm": 0.48154634695899323, + "learning_rate": 1.7339128885830476e-05, + "loss": 0.9086, + "step": 12255 + }, + { + "epoch": 0.6112718204488778, + "grad_norm": 0.4269614169916567, + "learning_rate": 1.7335284817030643e-05, + "loss": 0.8866, + "step": 12256 + }, + { + "epoch": 0.6113216957605985, + "grad_norm": 0.5364412371868826, + "learning_rate": 1.7331440948238122e-05, + "loss": 0.8929, + "step": 12257 + }, + { + "epoch": 0.6113715710723192, + "grad_norm": 0.40432603079327784, + "learning_rate": 1.7327597279553228e-05, + "loss": 0.8825, + "step": 12258 + }, + { + "epoch": 0.61142144638404, + "grad_norm": 0.4324198780999436, + "learning_rate": 1.7323753811076243e-05, + "loss": 0.8817, + "step": 12259 + }, + { + "epoch": 0.6114713216957606, + "grad_norm": 0.4332803509667457, + "learning_rate": 1.7319910542907466e-05, + "loss": 0.8596, + "step": 12260 + }, + { + "epoch": 0.6115211970074813, + "grad_norm": 0.43931930595874275, + "learning_rate": 1.7316067475147185e-05, + "loss": 0.8694, + "step": 12261 + }, + { + "epoch": 0.611571072319202, + "grad_norm": 0.5736413283346874, + "learning_rate": 1.7312224607895694e-05, + "loss": 0.8754, + "step": 12262 + }, + { + "epoch": 0.6116209476309227, + "grad_norm": 0.3641001155003846, + "learning_rate": 1.7308381941253253e-05, + "loss": 0.8528, + "step": 12263 + }, + { + "epoch": 0.6116708229426434, + "grad_norm": 0.43666742645433365, + "learning_rate": 1.730453947532014e-05, + "loss": 0.8491, + "step": 12264 + }, + { + "epoch": 0.611720698254364, + "grad_norm": 0.3905955460185714, + "learning_rate": 1.730069721019663e-05, + "loss": 0.8699, + "step": 12265 + }, + { + "epoch": 0.6117705735660848, + "grad_norm": 0.8882457444485549, + "learning_rate": 1.7296855145982988e-05, + "loss": 0.867, + "step": 12266 + }, + { + "epoch": 0.6118204488778055, + "grad_norm": 0.4580518431142163, + "learning_rate": 1.7293013282779453e-05, + "loss": 0.9109, + "step": 12267 + }, + { + "epoch": 0.6118703241895261, + "grad_norm": 0.4283032844279561, + "learning_rate": 1.7289171620686287e-05, + "loss": 0.8677, + "step": 12268 + }, + { + "epoch": 0.6119201995012469, + "grad_norm": 0.43669102106179325, + "learning_rate": 1.7285330159803737e-05, + "loss": 0.9007, + "step": 12269 + }, + { + "epoch": 0.6119700748129676, + "grad_norm": 0.473645762774889, + "learning_rate": 1.7281488900232055e-05, + "loss": 0.8891, + "step": 12270 + }, + { + "epoch": 0.6120199501246882, + "grad_norm": 0.46610524241051454, + "learning_rate": 1.7277647842071452e-05, + "loss": 0.8566, + "step": 12271 + }, + { + "epoch": 0.612069825436409, + "grad_norm": 0.41470260513711066, + "learning_rate": 1.7273806985422176e-05, + "loss": 0.8382, + "step": 12272 + }, + { + "epoch": 0.6121197007481297, + "grad_norm": 0.4863265191144105, + "learning_rate": 1.726996633038446e-05, + "loss": 0.9069, + "step": 12273 + }, + { + "epoch": 0.6121695760598503, + "grad_norm": 0.43088419375826975, + "learning_rate": 1.72661258770585e-05, + "loss": 0.8804, + "step": 12274 + }, + { + "epoch": 0.6122194513715711, + "grad_norm": 0.43847042456965957, + "learning_rate": 1.7262285625544526e-05, + "loss": 0.8554, + "step": 12275 + }, + { + "epoch": 0.6122693266832918, + "grad_norm": 0.868121332733069, + "learning_rate": 1.7258445575942743e-05, + "loss": 0.8877, + "step": 12276 + }, + { + "epoch": 0.6123192019950124, + "grad_norm": 0.46137413927209187, + "learning_rate": 1.725460572835337e-05, + "loss": 0.8595, + "step": 12277 + }, + { + "epoch": 0.6123690773067332, + "grad_norm": 0.4710806480856388, + "learning_rate": 1.7250766082876585e-05, + "loss": 0.868, + "step": 12278 + }, + { + "epoch": 0.6124189526184539, + "grad_norm": 0.6684980975880223, + "learning_rate": 1.724692663961259e-05, + "loss": 0.8239, + "step": 12279 + }, + { + "epoch": 0.6124688279301745, + "grad_norm": 0.47249411544234676, + "learning_rate": 1.724308739866157e-05, + "loss": 0.8985, + "step": 12280 + }, + { + "epoch": 0.6125187032418953, + "grad_norm": 0.4595309669871041, + "learning_rate": 1.7239248360123732e-05, + "loss": 0.8301, + "step": 12281 + }, + { + "epoch": 0.6125685785536159, + "grad_norm": 0.5977307435275707, + "learning_rate": 1.723540952409922e-05, + "loss": 0.851, + "step": 12282 + }, + { + "epoch": 0.6126184538653366, + "grad_norm": 0.437773830099814, + "learning_rate": 1.723157089068822e-05, + "loss": 0.8631, + "step": 12283 + }, + { + "epoch": 0.6126683291770574, + "grad_norm": 0.628388018729784, + "learning_rate": 1.7227732459990907e-05, + "loss": 0.8916, + "step": 12284 + }, + { + "epoch": 0.612718204488778, + "grad_norm": 0.41890393536206655, + "learning_rate": 1.722389423210744e-05, + "loss": 0.8634, + "step": 12285 + }, + { + "epoch": 0.6127680798004987, + "grad_norm": 0.45741360086289573, + "learning_rate": 1.7220056207137964e-05, + "loss": 0.8557, + "step": 12286 + }, + { + "epoch": 0.6128179551122195, + "grad_norm": 0.44861542418724015, + "learning_rate": 1.721621838518264e-05, + "loss": 0.887, + "step": 12287 + }, + { + "epoch": 0.6128678304239401, + "grad_norm": 0.4641425644023192, + "learning_rate": 1.721238076634162e-05, + "loss": 0.9126, + "step": 12288 + }, + { + "epoch": 0.6129177057356608, + "grad_norm": 0.45952393355549154, + "learning_rate": 1.7208543350715033e-05, + "loss": 0.8658, + "step": 12289 + }, + { + "epoch": 0.6129675810473816, + "grad_norm": 0.49492577907652713, + "learning_rate": 1.7204706138403027e-05, + "loss": 0.8733, + "step": 12290 + }, + { + "epoch": 0.6130174563591022, + "grad_norm": 0.49376177563502516, + "learning_rate": 1.720086912950572e-05, + "loss": 0.8151, + "step": 12291 + }, + { + "epoch": 0.613067331670823, + "grad_norm": 0.4198871493938349, + "learning_rate": 1.7197032324123248e-05, + "loss": 0.8857, + "step": 12292 + }, + { + "epoch": 0.6131172069825437, + "grad_norm": 0.437708148715182, + "learning_rate": 1.7193195722355714e-05, + "loss": 0.8958, + "step": 12293 + }, + { + "epoch": 0.6131670822942643, + "grad_norm": 0.6861364040090545, + "learning_rate": 1.718935932430325e-05, + "loss": 0.8788, + "step": 12294 + }, + { + "epoch": 0.613216957605985, + "grad_norm": 0.4408556703880668, + "learning_rate": 1.718552313006596e-05, + "loss": 0.8597, + "step": 12295 + }, + { + "epoch": 0.6132668329177058, + "grad_norm": 0.4311598857296761, + "learning_rate": 1.7181687139743953e-05, + "loss": 0.8482, + "step": 12296 + }, + { + "epoch": 0.6133167082294264, + "grad_norm": 0.3768428450409557, + "learning_rate": 1.7177851353437314e-05, + "loss": 0.823, + "step": 12297 + }, + { + "epoch": 0.6133665835411471, + "grad_norm": 0.4463751892585783, + "learning_rate": 1.7174015771246138e-05, + "loss": 0.8634, + "step": 12298 + }, + { + "epoch": 0.6134164588528679, + "grad_norm": 0.5573744493142289, + "learning_rate": 1.7170180393270532e-05, + "loss": 0.8778, + "step": 12299 + }, + { + "epoch": 0.6134663341645885, + "grad_norm": 1.019613594375705, + "learning_rate": 1.7166345219610556e-05, + "loss": 0.8553, + "step": 12300 + }, + { + "epoch": 0.6135162094763092, + "grad_norm": 0.5032171824319684, + "learning_rate": 1.716251025036629e-05, + "loss": 0.8741, + "step": 12301 + }, + { + "epoch": 0.6135660847880299, + "grad_norm": 0.47215178609239483, + "learning_rate": 1.7158675485637816e-05, + "loss": 0.8399, + "step": 12302 + }, + { + "epoch": 0.6136159600997506, + "grad_norm": 0.7917214834062064, + "learning_rate": 1.7154840925525207e-05, + "loss": 0.8508, + "step": 12303 + }, + { + "epoch": 0.6136658354114713, + "grad_norm": 0.4672577780250623, + "learning_rate": 1.71510065701285e-05, + "loss": 0.8509, + "step": 12304 + }, + { + "epoch": 0.613715710723192, + "grad_norm": 0.379025789079358, + "learning_rate": 1.7147172419547763e-05, + "loss": 0.8525, + "step": 12305 + }, + { + "epoch": 0.6137655860349127, + "grad_norm": 0.4512467548297653, + "learning_rate": 1.714333847388305e-05, + "loss": 0.8574, + "step": 12306 + }, + { + "epoch": 0.6138154613466335, + "grad_norm": 0.5057705810865065, + "learning_rate": 1.713950473323441e-05, + "loss": 0.8915, + "step": 12307 + }, + { + "epoch": 0.6138653366583541, + "grad_norm": 0.45900549247815503, + "learning_rate": 1.713567119770187e-05, + "loss": 0.8563, + "step": 12308 + }, + { + "epoch": 0.6139152119700748, + "grad_norm": 0.5395093924373436, + "learning_rate": 1.713183786738547e-05, + "loss": 0.84, + "step": 12309 + }, + { + "epoch": 0.6139650872817956, + "grad_norm": 0.5694542595608095, + "learning_rate": 1.712800474238525e-05, + "loss": 0.8685, + "step": 12310 + }, + { + "epoch": 0.6140149625935162, + "grad_norm": 0.49205586572860277, + "learning_rate": 1.712417182280121e-05, + "loss": 0.8997, + "step": 12311 + }, + { + "epoch": 0.6140648379052369, + "grad_norm": 0.5050343011429853, + "learning_rate": 1.712033910873338e-05, + "loss": 0.821, + "step": 12312 + }, + { + "epoch": 0.6141147132169577, + "grad_norm": 0.5089183731274973, + "learning_rate": 1.7116506600281778e-05, + "loss": 0.8358, + "step": 12313 + }, + { + "epoch": 0.6141645885286783, + "grad_norm": 0.43200116768721714, + "learning_rate": 1.711267429754642e-05, + "loss": 0.866, + "step": 12314 + }, + { + "epoch": 0.614214463840399, + "grad_norm": 0.5655106097007225, + "learning_rate": 1.7108842200627285e-05, + "loss": 0.8676, + "step": 12315 + }, + { + "epoch": 0.6142643391521198, + "grad_norm": 0.5177175586933904, + "learning_rate": 1.710501030962438e-05, + "loss": 0.8426, + "step": 12316 + }, + { + "epoch": 0.6143142144638404, + "grad_norm": 0.4347352306044382, + "learning_rate": 1.71011786246377e-05, + "loss": 0.8403, + "step": 12317 + }, + { + "epoch": 0.6143640897755611, + "grad_norm": 0.47580830936305396, + "learning_rate": 1.7097347145767233e-05, + "loss": 0.7997, + "step": 12318 + }, + { + "epoch": 0.6144139650872817, + "grad_norm": 0.5615538131037727, + "learning_rate": 1.709351587311295e-05, + "loss": 0.8687, + "step": 12319 + }, + { + "epoch": 0.6144638403990025, + "grad_norm": 0.4537013644997247, + "learning_rate": 1.708968480677483e-05, + "loss": 0.8934, + "step": 12320 + }, + { + "epoch": 0.6145137157107232, + "grad_norm": 0.40493139226674424, + "learning_rate": 1.708585394685285e-05, + "loss": 0.8652, + "step": 12321 + }, + { + "epoch": 0.6145635910224438, + "grad_norm": 0.527459065204563, + "learning_rate": 1.7082023293446968e-05, + "loss": 0.8847, + "step": 12322 + }, + { + "epoch": 0.6146134663341646, + "grad_norm": 0.4952748657625579, + "learning_rate": 1.7078192846657142e-05, + "loss": 0.8991, + "step": 12323 + }, + { + "epoch": 0.6146633416458853, + "grad_norm": 0.709391643985777, + "learning_rate": 1.7074362606583326e-05, + "loss": 0.8557, + "step": 12324 + }, + { + "epoch": 0.614713216957606, + "grad_norm": 0.43433811755508767, + "learning_rate": 1.707053257332548e-05, + "loss": 0.8521, + "step": 12325 + }, + { + "epoch": 0.6147630922693267, + "grad_norm": 0.568047012335012, + "learning_rate": 1.7066702746983536e-05, + "loss": 0.868, + "step": 12326 + }, + { + "epoch": 0.6148129675810474, + "grad_norm": 0.5468927302582801, + "learning_rate": 1.7062873127657428e-05, + "loss": 0.834, + "step": 12327 + }, + { + "epoch": 0.614862842892768, + "grad_norm": 0.41613816715908375, + "learning_rate": 1.7059043715447092e-05, + "loss": 0.8567, + "step": 12328 + }, + { + "epoch": 0.6149127182044888, + "grad_norm": 0.4289340440008426, + "learning_rate": 1.705521451045246e-05, + "loss": 0.861, + "step": 12329 + }, + { + "epoch": 0.6149625935162095, + "grad_norm": 0.5505363828458206, + "learning_rate": 1.7051385512773448e-05, + "loss": 0.8641, + "step": 12330 + }, + { + "epoch": 0.6150124688279301, + "grad_norm": 0.46743285290000647, + "learning_rate": 1.704755672250998e-05, + "loss": 0.8645, + "step": 12331 + }, + { + "epoch": 0.6150623441396509, + "grad_norm": 0.728088578095667, + "learning_rate": 1.704372813976196e-05, + "loss": 0.8233, + "step": 12332 + }, + { + "epoch": 0.6151122194513716, + "grad_norm": 0.4556476091203981, + "learning_rate": 1.7039899764629286e-05, + "loss": 0.8864, + "step": 12333 + }, + { + "epoch": 0.6151620947630922, + "grad_norm": 0.4194905198346803, + "learning_rate": 1.7036071597211864e-05, + "loss": 0.8712, + "step": 12334 + }, + { + "epoch": 0.615211970074813, + "grad_norm": 0.4657547749089291, + "learning_rate": 1.703224363760959e-05, + "loss": 0.9027, + "step": 12335 + }, + { + "epoch": 0.6152618453865336, + "grad_norm": 0.4543981356942797, + "learning_rate": 1.7028415885922367e-05, + "loss": 0.8438, + "step": 12336 + }, + { + "epoch": 0.6153117206982543, + "grad_norm": 0.5422835682309988, + "learning_rate": 1.702458834225005e-05, + "loss": 0.8762, + "step": 12337 + }, + { + "epoch": 0.6153615960099751, + "grad_norm": 0.44870209670160405, + "learning_rate": 1.7020761006692527e-05, + "loss": 0.8839, + "step": 12338 + }, + { + "epoch": 0.6154114713216957, + "grad_norm": 0.4628463204312569, + "learning_rate": 1.701693387934968e-05, + "loss": 0.8586, + "step": 12339 + }, + { + "epoch": 0.6154613466334165, + "grad_norm": 0.47105909200439555, + "learning_rate": 1.701310696032138e-05, + "loss": 0.917, + "step": 12340 + }, + { + "epoch": 0.6155112219451372, + "grad_norm": 0.4765759961286834, + "learning_rate": 1.700928024970747e-05, + "loss": 0.8835, + "step": 12341 + }, + { + "epoch": 0.6155610972568578, + "grad_norm": 0.6469381284023238, + "learning_rate": 1.7005453747607813e-05, + "loss": 0.8793, + "step": 12342 + }, + { + "epoch": 0.6156109725685786, + "grad_norm": 0.47460916665085084, + "learning_rate": 1.7001627454122275e-05, + "loss": 0.8847, + "step": 12343 + }, + { + "epoch": 0.6156608478802993, + "grad_norm": 0.5185662142468888, + "learning_rate": 1.699780136935068e-05, + "loss": 0.8711, + "step": 12344 + }, + { + "epoch": 0.6157107231920199, + "grad_norm": 0.45957792873014625, + "learning_rate": 1.6993975493392872e-05, + "loss": 0.8521, + "step": 12345 + }, + { + "epoch": 0.6157605985037407, + "grad_norm": 0.43646864059356716, + "learning_rate": 1.6990149826348693e-05, + "loss": 0.9025, + "step": 12346 + }, + { + "epoch": 0.6158104738154614, + "grad_norm": 0.4503661673247235, + "learning_rate": 1.6986324368317982e-05, + "loss": 0.8589, + "step": 12347 + }, + { + "epoch": 0.615860349127182, + "grad_norm": 0.4454032690542127, + "learning_rate": 1.698249911940054e-05, + "loss": 0.9034, + "step": 12348 + }, + { + "epoch": 0.6159102244389028, + "grad_norm": 0.39109546531643274, + "learning_rate": 1.697867407969619e-05, + "loss": 0.8602, + "step": 12349 + }, + { + "epoch": 0.6159600997506235, + "grad_norm": 0.46073054389716905, + "learning_rate": 1.6974849249304754e-05, + "loss": 0.8362, + "step": 12350 + }, + { + "epoch": 0.6160099750623441, + "grad_norm": 0.39247025046942585, + "learning_rate": 1.6971024628326045e-05, + "loss": 0.9078, + "step": 12351 + }, + { + "epoch": 0.6160598503740649, + "grad_norm": 0.5317818593192812, + "learning_rate": 1.6967200216859848e-05, + "loss": 0.8753, + "step": 12352 + }, + { + "epoch": 0.6161097256857855, + "grad_norm": 0.6091557459318968, + "learning_rate": 1.696337601500596e-05, + "loss": 0.8635, + "step": 12353 + }, + { + "epoch": 0.6161596009975062, + "grad_norm": 0.4227323216648561, + "learning_rate": 1.6959552022864182e-05, + "loss": 0.9301, + "step": 12354 + }, + { + "epoch": 0.616209476309227, + "grad_norm": 0.41799367420630074, + "learning_rate": 1.6955728240534305e-05, + "loss": 0.8665, + "step": 12355 + }, + { + "epoch": 0.6162593516209476, + "grad_norm": 0.46746348547587685, + "learning_rate": 1.695190466811609e-05, + "loss": 0.874, + "step": 12356 + }, + { + "epoch": 0.6163092269326683, + "grad_norm": 0.37698902730283734, + "learning_rate": 1.6948081305709318e-05, + "loss": 0.8466, + "step": 12357 + }, + { + "epoch": 0.6163591022443891, + "grad_norm": 0.4636051731607111, + "learning_rate": 1.6944258153413772e-05, + "loss": 0.8905, + "step": 12358 + }, + { + "epoch": 0.6164089775561097, + "grad_norm": 0.46222974076666556, + "learning_rate": 1.6940435211329198e-05, + "loss": 0.8485, + "step": 12359 + }, + { + "epoch": 0.6164588528678304, + "grad_norm": 0.44880056159523896, + "learning_rate": 1.693661247955536e-05, + "loss": 0.8919, + "step": 12360 + }, + { + "epoch": 0.6165087281795512, + "grad_norm": 0.401348317898157, + "learning_rate": 1.6932789958192003e-05, + "loss": 0.8144, + "step": 12361 + }, + { + "epoch": 0.6165586034912718, + "grad_norm": 0.5008888376787876, + "learning_rate": 1.6928967647338892e-05, + "loss": 0.9013, + "step": 12362 + }, + { + "epoch": 0.6166084788029925, + "grad_norm": 0.5429074218076663, + "learning_rate": 1.692514554709576e-05, + "loss": 0.8697, + "step": 12363 + }, + { + "epoch": 0.6166583541147133, + "grad_norm": 0.47496056960137567, + "learning_rate": 1.6921323657562332e-05, + "loss": 0.9021, + "step": 12364 + }, + { + "epoch": 0.6167082294264339, + "grad_norm": 0.5287864012310424, + "learning_rate": 1.691750197883835e-05, + "loss": 0.8719, + "step": 12365 + }, + { + "epoch": 0.6167581047381546, + "grad_norm": 0.5895158360514917, + "learning_rate": 1.6913680511023538e-05, + "loss": 0.8389, + "step": 12366 + }, + { + "epoch": 0.6168079800498754, + "grad_norm": 0.5023118263416644, + "learning_rate": 1.6909859254217613e-05, + "loss": 0.8447, + "step": 12367 + }, + { + "epoch": 0.616857855361596, + "grad_norm": 0.5975495931211818, + "learning_rate": 1.69060382085203e-05, + "loss": 0.8746, + "step": 12368 + }, + { + "epoch": 0.6169077306733167, + "grad_norm": 0.5006211138665355, + "learning_rate": 1.6902217374031286e-05, + "loss": 0.8564, + "step": 12369 + }, + { + "epoch": 0.6169576059850375, + "grad_norm": 0.44644407601137015, + "learning_rate": 1.6898396750850286e-05, + "loss": 0.8785, + "step": 12370 + }, + { + "epoch": 0.6170074812967581, + "grad_norm": 0.4264449823234222, + "learning_rate": 1.6894576339077e-05, + "loss": 0.8413, + "step": 12371 + }, + { + "epoch": 0.6170573566084788, + "grad_norm": 0.40574371317340735, + "learning_rate": 1.6890756138811113e-05, + "loss": 0.8917, + "step": 12372 + }, + { + "epoch": 0.6171072319201995, + "grad_norm": 0.4821528423657512, + "learning_rate": 1.6886936150152325e-05, + "loss": 0.8868, + "step": 12373 + }, + { + "epoch": 0.6171571072319202, + "grad_norm": 0.521400243757517, + "learning_rate": 1.68831163732003e-05, + "loss": 0.8583, + "step": 12374 + }, + { + "epoch": 0.6172069825436409, + "grad_norm": 0.4437284481153252, + "learning_rate": 1.687929680805472e-05, + "loss": 0.854, + "step": 12375 + }, + { + "epoch": 0.6172568578553616, + "grad_norm": 0.7341690319114251, + "learning_rate": 1.6875477454815254e-05, + "loss": 0.8885, + "step": 12376 + }, + { + "epoch": 0.6173067331670823, + "grad_norm": 0.5501681223460012, + "learning_rate": 1.687165831358158e-05, + "loss": 0.9082, + "step": 12377 + }, + { + "epoch": 0.617356608478803, + "grad_norm": 0.7326552281733911, + "learning_rate": 1.686783938445333e-05, + "loss": 0.8784, + "step": 12378 + }, + { + "epoch": 0.6174064837905237, + "grad_norm": 0.4168912930749862, + "learning_rate": 1.6864020667530177e-05, + "loss": 0.8342, + "step": 12379 + }, + { + "epoch": 0.6174563591022444, + "grad_norm": 0.5283212601731838, + "learning_rate": 1.686020216291177e-05, + "loss": 0.8527, + "step": 12380 + }, + { + "epoch": 0.6175062344139651, + "grad_norm": 0.5279039346532625, + "learning_rate": 1.6856383870697737e-05, + "loss": 0.901, + "step": 12381 + }, + { + "epoch": 0.6175561097256858, + "grad_norm": 0.9715781876981, + "learning_rate": 1.685256579098772e-05, + "loss": 0.9258, + "step": 12382 + }, + { + "epoch": 0.6176059850374065, + "grad_norm": 0.6999360539664948, + "learning_rate": 1.6848747923881352e-05, + "loss": 0.8346, + "step": 12383 + }, + { + "epoch": 0.6176558603491272, + "grad_norm": 2.7615023699383454, + "learning_rate": 1.6844930269478274e-05, + "loss": 0.8777, + "step": 12384 + }, + { + "epoch": 0.6177057356608479, + "grad_norm": 0.4529587993408001, + "learning_rate": 1.6841112827878076e-05, + "loss": 0.8622, + "step": 12385 + }, + { + "epoch": 0.6177556109725686, + "grad_norm": 0.5263240802519806, + "learning_rate": 1.683729559918039e-05, + "loss": 0.9024, + "step": 12386 + }, + { + "epoch": 0.6178054862842893, + "grad_norm": 0.45418296513878287, + "learning_rate": 1.683347858348482e-05, + "loss": 0.8646, + "step": 12387 + }, + { + "epoch": 0.61785536159601, + "grad_norm": 0.45724717971801776, + "learning_rate": 1.6829661780890983e-05, + "loss": 0.857, + "step": 12388 + }, + { + "epoch": 0.6179052369077307, + "grad_norm": 0.41453246045374287, + "learning_rate": 1.6825845191498456e-05, + "loss": 0.8713, + "step": 12389 + }, + { + "epoch": 0.6179551122194513, + "grad_norm": 0.46048285756738006, + "learning_rate": 1.6822028815406837e-05, + "loss": 0.8634, + "step": 12390 + }, + { + "epoch": 0.6180049875311721, + "grad_norm": 0.4881028659372441, + "learning_rate": 1.681821265271573e-05, + "loss": 0.886, + "step": 12391 + }, + { + "epoch": 0.6180548628428928, + "grad_norm": 0.45191662972075786, + "learning_rate": 1.6814396703524687e-05, + "loss": 0.8922, + "step": 12392 + }, + { + "epoch": 0.6181047381546134, + "grad_norm": 0.43647400369918415, + "learning_rate": 1.6810580967933304e-05, + "loss": 0.8587, + "step": 12393 + }, + { + "epoch": 0.6181546134663342, + "grad_norm": 0.5482654911996673, + "learning_rate": 1.6806765446041144e-05, + "loss": 0.8779, + "step": 12394 + }, + { + "epoch": 0.6182044887780549, + "grad_norm": 0.5276022021003582, + "learning_rate": 1.680295013794778e-05, + "loss": 0.8986, + "step": 12395 + }, + { + "epoch": 0.6182543640897755, + "grad_norm": 0.4169653472035636, + "learning_rate": 1.6799135043752757e-05, + "loss": 0.8607, + "step": 12396 + }, + { + "epoch": 0.6183042394014963, + "grad_norm": 0.4517445821454521, + "learning_rate": 1.6795320163555634e-05, + "loss": 0.8979, + "step": 12397 + }, + { + "epoch": 0.618354114713217, + "grad_norm": 0.7447077988529213, + "learning_rate": 1.6791505497455962e-05, + "loss": 0.8557, + "step": 12398 + }, + { + "epoch": 0.6184039900249376, + "grad_norm": 0.4434618290357609, + "learning_rate": 1.678769104555328e-05, + "loss": 0.8338, + "step": 12399 + }, + { + "epoch": 0.6184538653366584, + "grad_norm": 0.5741937207047391, + "learning_rate": 1.6783876807947128e-05, + "loss": 0.8835, + "step": 12400 + }, + { + "epoch": 0.6185037406483791, + "grad_norm": 0.4662701936708848, + "learning_rate": 1.6780062784737026e-05, + "loss": 0.8578, + "step": 12401 + }, + { + "epoch": 0.6185536159600997, + "grad_norm": 0.44936148704131224, + "learning_rate": 1.6776248976022514e-05, + "loss": 0.8424, + "step": 12402 + }, + { + "epoch": 0.6186034912718205, + "grad_norm": 0.4440072910131144, + "learning_rate": 1.67724353819031e-05, + "loss": 0.8349, + "step": 12403 + }, + { + "epoch": 0.6186533665835412, + "grad_norm": 0.5762072252240379, + "learning_rate": 1.67686220024783e-05, + "loss": 0.8374, + "step": 12404 + }, + { + "epoch": 0.6187032418952618, + "grad_norm": 0.4859015845402111, + "learning_rate": 1.6764808837847635e-05, + "loss": 0.8447, + "step": 12405 + }, + { + "epoch": 0.6187531172069826, + "grad_norm": 0.5624703839414057, + "learning_rate": 1.6760995888110593e-05, + "loss": 0.8589, + "step": 12406 + }, + { + "epoch": 0.6188029925187032, + "grad_norm": 0.4408261563230797, + "learning_rate": 1.6757183153366675e-05, + "loss": 0.8871, + "step": 12407 + }, + { + "epoch": 0.6188528678304239, + "grad_norm": 0.46231657045985064, + "learning_rate": 1.675337063371537e-05, + "loss": 0.8386, + "step": 12408 + }, + { + "epoch": 0.6189027431421447, + "grad_norm": 1.375449682309933, + "learning_rate": 1.6749558329256172e-05, + "loss": 0.8571, + "step": 12409 + }, + { + "epoch": 0.6189526184538653, + "grad_norm": 0.40865395425666007, + "learning_rate": 1.6745746240088562e-05, + "loss": 0.8897, + "step": 12410 + }, + { + "epoch": 0.619002493765586, + "grad_norm": 0.48161044617126886, + "learning_rate": 1.6741934366312e-05, + "loss": 0.9016, + "step": 12411 + }, + { + "epoch": 0.6190523690773068, + "grad_norm": 0.41329655103926116, + "learning_rate": 1.673812270802597e-05, + "loss": 0.8816, + "step": 12412 + }, + { + "epoch": 0.6191022443890274, + "grad_norm": 0.5120358336629727, + "learning_rate": 1.6734311265329927e-05, + "loss": 0.851, + "step": 12413 + }, + { + "epoch": 0.6191521197007481, + "grad_norm": 0.4371408612069219, + "learning_rate": 1.673050003832334e-05, + "loss": 0.8664, + "step": 12414 + }, + { + "epoch": 0.6192019950124689, + "grad_norm": 0.4424873734082974, + "learning_rate": 1.6726689027105647e-05, + "loss": 0.8733, + "step": 12415 + }, + { + "epoch": 0.6192518703241895, + "grad_norm": 0.4522685048299968, + "learning_rate": 1.67228782317763e-05, + "loss": 0.8719, + "step": 12416 + }, + { + "epoch": 0.6193017456359102, + "grad_norm": 0.39188203266697497, + "learning_rate": 1.671906765243475e-05, + "loss": 0.8036, + "step": 12417 + }, + { + "epoch": 0.619351620947631, + "grad_norm": 0.42431239865498566, + "learning_rate": 1.6715257289180414e-05, + "loss": 0.8984, + "step": 12418 + }, + { + "epoch": 0.6194014962593516, + "grad_norm": 0.4202349366085832, + "learning_rate": 1.6711447142112734e-05, + "loss": 0.7788, + "step": 12419 + }, + { + "epoch": 0.6194513715710723, + "grad_norm": 0.442717156487094, + "learning_rate": 1.670763721133113e-05, + "loss": 0.8642, + "step": 12420 + }, + { + "epoch": 0.6195012468827931, + "grad_norm": 0.6426807801355204, + "learning_rate": 1.670382749693503e-05, + "loss": 0.8613, + "step": 12421 + }, + { + "epoch": 0.6195511221945137, + "grad_norm": 0.46734854990181607, + "learning_rate": 1.670001799902383e-05, + "loss": 0.9031, + "step": 12422 + }, + { + "epoch": 0.6196009975062344, + "grad_norm": 0.42079888688894485, + "learning_rate": 1.6696208717696945e-05, + "loss": 0.8347, + "step": 12423 + }, + { + "epoch": 0.6196508728179552, + "grad_norm": 0.36308585219795053, + "learning_rate": 1.6692399653053776e-05, + "loss": 0.8405, + "step": 12424 + }, + { + "epoch": 0.6197007481296758, + "grad_norm": 0.39952986156469317, + "learning_rate": 1.6688590805193732e-05, + "loss": 0.8899, + "step": 12425 + }, + { + "epoch": 0.6197506234413965, + "grad_norm": 0.6116339736376416, + "learning_rate": 1.6684782174216186e-05, + "loss": 0.8357, + "step": 12426 + }, + { + "epoch": 0.6198004987531172, + "grad_norm": 0.5781381250202969, + "learning_rate": 1.6680973760220518e-05, + "loss": 0.9185, + "step": 12427 + }, + { + "epoch": 0.6198503740648379, + "grad_norm": 1.066318958941881, + "learning_rate": 1.6677165563306134e-05, + "loss": 0.8503, + "step": 12428 + }, + { + "epoch": 0.6199002493765586, + "grad_norm": 0.42561912207038055, + "learning_rate": 1.6673357583572373e-05, + "loss": 0.8642, + "step": 12429 + }, + { + "epoch": 0.6199501246882793, + "grad_norm": 0.4471272984258856, + "learning_rate": 1.6669549821118626e-05, + "loss": 0.8821, + "step": 12430 + }, + { + "epoch": 0.62, + "grad_norm": 0.399929263337238, + "learning_rate": 1.6665742276044246e-05, + "loss": 0.8687, + "step": 12431 + }, + { + "epoch": 0.6200498753117207, + "grad_norm": 0.4620846426174392, + "learning_rate": 1.6661934948448602e-05, + "loss": 0.8967, + "step": 12432 + }, + { + "epoch": 0.6200997506234414, + "grad_norm": 0.5852346717729493, + "learning_rate": 1.6658127838431022e-05, + "loss": 0.827, + "step": 12433 + }, + { + "epoch": 0.6201496259351621, + "grad_norm": 0.4182245119924959, + "learning_rate": 1.665432094609086e-05, + "loss": 0.8667, + "step": 12434 + }, + { + "epoch": 0.6201995012468828, + "grad_norm": 0.5096462860055618, + "learning_rate": 1.6650514271527468e-05, + "loss": 0.8731, + "step": 12435 + }, + { + "epoch": 0.6202493765586035, + "grad_norm": 0.3806512385566715, + "learning_rate": 1.6646707814840166e-05, + "loss": 0.8253, + "step": 12436 + }, + { + "epoch": 0.6202992518703242, + "grad_norm": 0.442202712753791, + "learning_rate": 1.6642901576128286e-05, + "loss": 0.8833, + "step": 12437 + }, + { + "epoch": 0.6203491271820449, + "grad_norm": 0.43165636303808236, + "learning_rate": 1.6639095555491147e-05, + "loss": 0.853, + "step": 12438 + }, + { + "epoch": 0.6203990024937656, + "grad_norm": 0.44860906070183476, + "learning_rate": 1.6635289753028072e-05, + "loss": 0.8709, + "step": 12439 + }, + { + "epoch": 0.6204488778054863, + "grad_norm": 0.5494981566767797, + "learning_rate": 1.663148416883836e-05, + "loss": 0.8564, + "step": 12440 + }, + { + "epoch": 0.620498753117207, + "grad_norm": 0.4351059521439504, + "learning_rate": 1.662767880302133e-05, + "loss": 0.8549, + "step": 12441 + }, + { + "epoch": 0.6205486284289277, + "grad_norm": 0.4311043184296715, + "learning_rate": 1.662387365567627e-05, + "loss": 0.8422, + "step": 12442 + }, + { + "epoch": 0.6205985037406484, + "grad_norm": 0.4381039206896617, + "learning_rate": 1.6620068726902482e-05, + "loss": 0.8595, + "step": 12443 + }, + { + "epoch": 0.620648379052369, + "grad_norm": 0.4974447637852321, + "learning_rate": 1.6616264016799242e-05, + "loss": 0.8519, + "step": 12444 + }, + { + "epoch": 0.6206982543640898, + "grad_norm": 0.45582053435244685, + "learning_rate": 1.6612459525465845e-05, + "loss": 0.9091, + "step": 12445 + }, + { + "epoch": 0.6207481296758105, + "grad_norm": 0.4587796302729648, + "learning_rate": 1.660865525300157e-05, + "loss": 0.9272, + "step": 12446 + }, + { + "epoch": 0.6207980049875311, + "grad_norm": 0.45981976305307853, + "learning_rate": 1.660485119950568e-05, + "loss": 0.7767, + "step": 12447 + }, + { + "epoch": 0.6208478802992519, + "grad_norm": 0.3953646536495875, + "learning_rate": 1.660104736507743e-05, + "loss": 0.8582, + "step": 12448 + }, + { + "epoch": 0.6208977556109726, + "grad_norm": 0.43321054847777685, + "learning_rate": 1.6597243749816095e-05, + "loss": 0.8155, + "step": 12449 + }, + { + "epoch": 0.6209476309226932, + "grad_norm": 0.5127432314753712, + "learning_rate": 1.659344035382093e-05, + "loss": 0.8709, + "step": 12450 + }, + { + "epoch": 0.620997506234414, + "grad_norm": 0.45327395743787324, + "learning_rate": 1.658963717719117e-05, + "loss": 0.866, + "step": 12451 + }, + { + "epoch": 0.6210473815461347, + "grad_norm": 0.4407703180610021, + "learning_rate": 1.6585834220026064e-05, + "loss": 0.8584, + "step": 12452 + }, + { + "epoch": 0.6210972568578553, + "grad_norm": 0.4582647391040611, + "learning_rate": 1.6582031482424847e-05, + "loss": 0.848, + "step": 12453 + }, + { + "epoch": 0.6211471321695761, + "grad_norm": 0.41580916442032845, + "learning_rate": 1.657822896448676e-05, + "loss": 0.8781, + "step": 12454 + }, + { + "epoch": 0.6211970074812968, + "grad_norm": 0.4105251062792723, + "learning_rate": 1.657442666631101e-05, + "loss": 0.8261, + "step": 12455 + }, + { + "epoch": 0.6212468827930174, + "grad_norm": 0.5153745608005905, + "learning_rate": 1.6570624587996827e-05, + "loss": 0.8872, + "step": 12456 + }, + { + "epoch": 0.6212967581047382, + "grad_norm": 0.42198116341642966, + "learning_rate": 1.6566822729643422e-05, + "loss": 0.8782, + "step": 12457 + }, + { + "epoch": 0.6213466334164589, + "grad_norm": 0.4494141406783882, + "learning_rate": 1.6563021091350014e-05, + "loss": 0.8449, + "step": 12458 + }, + { + "epoch": 0.6213965087281795, + "grad_norm": 0.464926409705168, + "learning_rate": 1.6559219673215784e-05, + "loss": 0.8705, + "step": 12459 + }, + { + "epoch": 0.6214463840399003, + "grad_norm": 0.48923186720237144, + "learning_rate": 1.655541847533994e-05, + "loss": 0.8831, + "step": 12460 + }, + { + "epoch": 0.6214962593516209, + "grad_norm": 1.0344053311640637, + "learning_rate": 1.655161749782167e-05, + "loss": 0.8835, + "step": 12461 + }, + { + "epoch": 0.6215461346633416, + "grad_norm": 0.3675514219108611, + "learning_rate": 1.654781674076017e-05, + "loss": 0.8361, + "step": 12462 + }, + { + "epoch": 0.6215960099750624, + "grad_norm": 0.4691694556776482, + "learning_rate": 1.6544016204254602e-05, + "loss": 0.8496, + "step": 12463 + }, + { + "epoch": 0.621645885286783, + "grad_norm": 0.8493671534611735, + "learning_rate": 1.6540215888404146e-05, + "loss": 0.843, + "step": 12464 + }, + { + "epoch": 0.6216957605985037, + "grad_norm": 0.4544524171292158, + "learning_rate": 1.653641579330798e-05, + "loss": 0.8721, + "step": 12465 + }, + { + "epoch": 0.6217456359102245, + "grad_norm": 0.3909630418897374, + "learning_rate": 1.6532615919065245e-05, + "loss": 0.8707, + "step": 12466 + }, + { + "epoch": 0.6217955112219451, + "grad_norm": 0.44525090497167025, + "learning_rate": 1.652881626577511e-05, + "loss": 0.8744, + "step": 12467 + }, + { + "epoch": 0.6218453865336658, + "grad_norm": 0.40584877017649285, + "learning_rate": 1.6525016833536723e-05, + "loss": 0.8622, + "step": 12468 + }, + { + "epoch": 0.6218952618453866, + "grad_norm": 0.4037278650895841, + "learning_rate": 1.6521217622449236e-05, + "loss": 0.8978, + "step": 12469 + }, + { + "epoch": 0.6219451371571072, + "grad_norm": 0.4338859422512754, + "learning_rate": 1.651741863261177e-05, + "loss": 0.8372, + "step": 12470 + }, + { + "epoch": 0.6219950124688279, + "grad_norm": 0.45443612971588904, + "learning_rate": 1.6513619864123473e-05, + "loss": 0.8547, + "step": 12471 + }, + { + "epoch": 0.6220448877805487, + "grad_norm": 0.4666676639836363, + "learning_rate": 1.6509821317083467e-05, + "loss": 0.8483, + "step": 12472 + }, + { + "epoch": 0.6220947630922693, + "grad_norm": 0.4345730156619074, + "learning_rate": 1.650602299159088e-05, + "loss": 0.8527, + "step": 12473 + }, + { + "epoch": 0.62214463840399, + "grad_norm": 0.4031446793908547, + "learning_rate": 1.6502224887744815e-05, + "loss": 0.8099, + "step": 12474 + }, + { + "epoch": 0.6221945137157108, + "grad_norm": 0.4477151235294369, + "learning_rate": 1.649842700564439e-05, + "loss": 0.8953, + "step": 12475 + }, + { + "epoch": 0.6222443890274314, + "grad_norm": 0.4232667461438114, + "learning_rate": 1.6494629345388708e-05, + "loss": 0.8498, + "step": 12476 + }, + { + "epoch": 0.6222942643391521, + "grad_norm": 0.4589580408426114, + "learning_rate": 1.6490831907076866e-05, + "loss": 0.8705, + "step": 12477 + }, + { + "epoch": 0.6223441396508728, + "grad_norm": 0.43693390225760037, + "learning_rate": 1.6487034690807955e-05, + "loss": 0.8409, + "step": 12478 + }, + { + "epoch": 0.6223940149625935, + "grad_norm": 0.41586358351472896, + "learning_rate": 1.6483237696681068e-05, + "loss": 0.8563, + "step": 12479 + }, + { + "epoch": 0.6224438902743142, + "grad_norm": 0.4000149333186197, + "learning_rate": 1.647944092479528e-05, + "loss": 0.8702, + "step": 12480 + }, + { + "epoch": 0.6224937655860349, + "grad_norm": 0.7423359231394777, + "learning_rate": 1.6475644375249665e-05, + "loss": 0.8282, + "step": 12481 + }, + { + "epoch": 0.6225436408977556, + "grad_norm": 0.3983749952733609, + "learning_rate": 1.6471848048143294e-05, + "loss": 0.8666, + "step": 12482 + }, + { + "epoch": 0.6225935162094763, + "grad_norm": 0.4713918047552318, + "learning_rate": 1.646805194357524e-05, + "loss": 0.9448, + "step": 12483 + }, + { + "epoch": 0.622643391521197, + "grad_norm": 0.41883430338794464, + "learning_rate": 1.646425606164455e-05, + "loss": 0.8761, + "step": 12484 + }, + { + "epoch": 0.6226932668329177, + "grad_norm": 0.44383204164470663, + "learning_rate": 1.6460460402450273e-05, + "loss": 0.8429, + "step": 12485 + }, + { + "epoch": 0.6227431421446384, + "grad_norm": 0.45441197312170145, + "learning_rate": 1.645666496609146e-05, + "loss": 0.8402, + "step": 12486 + }, + { + "epoch": 0.6227930174563591, + "grad_norm": 0.4630566406977061, + "learning_rate": 1.645286975266716e-05, + "loss": 0.8421, + "step": 12487 + }, + { + "epoch": 0.6228428927680798, + "grad_norm": 0.37330271537582577, + "learning_rate": 1.644907476227639e-05, + "loss": 0.8397, + "step": 12488 + }, + { + "epoch": 0.6228927680798005, + "grad_norm": 10.440171514607279, + "learning_rate": 1.6445279995018188e-05, + "loss": 0.8521, + "step": 12489 + }, + { + "epoch": 0.6229426433915212, + "grad_norm": 0.513167039623589, + "learning_rate": 1.6441485450991577e-05, + "loss": 0.8113, + "step": 12490 + }, + { + "epoch": 0.6229925187032419, + "grad_norm": 0.5367789569671478, + "learning_rate": 1.643769113029558e-05, + "loss": 0.8795, + "step": 12491 + }, + { + "epoch": 0.6230423940149626, + "grad_norm": 0.49992660770657, + "learning_rate": 1.6433897033029193e-05, + "loss": 0.8613, + "step": 12492 + }, + { + "epoch": 0.6230922693266833, + "grad_norm": 0.41182234036689097, + "learning_rate": 1.6430103159291426e-05, + "loss": 0.7933, + "step": 12493 + }, + { + "epoch": 0.623142144638404, + "grad_norm": 0.4774162059672623, + "learning_rate": 1.6426309509181283e-05, + "loss": 0.8825, + "step": 12494 + }, + { + "epoch": 0.6231920199501247, + "grad_norm": 0.533152237075182, + "learning_rate": 1.6422516082797767e-05, + "loss": 0.8489, + "step": 12495 + }, + { + "epoch": 0.6232418952618454, + "grad_norm": 0.5936112726124021, + "learning_rate": 1.6418722880239847e-05, + "loss": 0.8262, + "step": 12496 + }, + { + "epoch": 0.6232917705735661, + "grad_norm": 0.6088108998212577, + "learning_rate": 1.641492990160651e-05, + "loss": 0.8872, + "step": 12497 + }, + { + "epoch": 0.6233416458852867, + "grad_norm": 0.3869998450298911, + "learning_rate": 1.6411137146996748e-05, + "loss": 0.8215, + "step": 12498 + }, + { + "epoch": 0.6233915211970075, + "grad_norm": 0.5615526222243626, + "learning_rate": 1.6407344616509504e-05, + "loss": 0.869, + "step": 12499 + }, + { + "epoch": 0.6234413965087282, + "grad_norm": 0.4687393564962611, + "learning_rate": 1.640355231024376e-05, + "loss": 0.8887, + "step": 12500 + }, + { + "epoch": 0.6234912718204488, + "grad_norm": 0.43018215766270396, + "learning_rate": 1.639976022829847e-05, + "loss": 0.8257, + "step": 12501 + }, + { + "epoch": 0.6235411471321696, + "grad_norm": 0.3986136699272119, + "learning_rate": 1.6395968370772595e-05, + "loss": 0.9053, + "step": 12502 + }, + { + "epoch": 0.6235910224438903, + "grad_norm": 0.4057479481736121, + "learning_rate": 1.639217673776507e-05, + "loss": 0.8732, + "step": 12503 + }, + { + "epoch": 0.6236408977556109, + "grad_norm": 0.5316672282785976, + "learning_rate": 1.638838532937484e-05, + "loss": 0.9151, + "step": 12504 + }, + { + "epoch": 0.6236907730673317, + "grad_norm": 0.4376756386126804, + "learning_rate": 1.6384594145700837e-05, + "loss": 0.8849, + "step": 12505 + }, + { + "epoch": 0.6237406483790524, + "grad_norm": 0.4604815981985776, + "learning_rate": 1.638080318684201e-05, + "loss": 0.8743, + "step": 12506 + }, + { + "epoch": 0.623790523690773, + "grad_norm": 0.4596993658892665, + "learning_rate": 1.637701245289725e-05, + "loss": 0.8393, + "step": 12507 + }, + { + "epoch": 0.6238403990024938, + "grad_norm": 0.49156289492422256, + "learning_rate": 1.6373221943965495e-05, + "loss": 0.8613, + "step": 12508 + }, + { + "epoch": 0.6238902743142145, + "grad_norm": 0.40029229881260164, + "learning_rate": 1.6369431660145655e-05, + "loss": 0.8736, + "step": 12509 + }, + { + "epoch": 0.6239401496259351, + "grad_norm": 0.44837797314647204, + "learning_rate": 1.6365641601536633e-05, + "loss": 0.9167, + "step": 12510 + }, + { + "epoch": 0.6239900249376559, + "grad_norm": 1.4840863024063424, + "learning_rate": 1.6361851768237324e-05, + "loss": 0.912, + "step": 12511 + }, + { + "epoch": 0.6240399002493766, + "grad_norm": 0.41361872343034245, + "learning_rate": 1.6358062160346628e-05, + "loss": 0.8683, + "step": 12512 + }, + { + "epoch": 0.6240897755610972, + "grad_norm": 0.4386439970694206, + "learning_rate": 1.635427277796344e-05, + "loss": 0.8241, + "step": 12513 + }, + { + "epoch": 0.624139650872818, + "grad_norm": 0.5812843416795132, + "learning_rate": 1.6350483621186627e-05, + "loss": 0.8099, + "step": 12514 + }, + { + "epoch": 0.6241895261845386, + "grad_norm": 0.47943681342559674, + "learning_rate": 1.6346694690115077e-05, + "loss": 0.8474, + "step": 12515 + }, + { + "epoch": 0.6242394014962593, + "grad_norm": 0.4767489363106754, + "learning_rate": 1.6342905984847657e-05, + "loss": 0.9099, + "step": 12516 + }, + { + "epoch": 0.6242892768079801, + "grad_norm": 0.4073399691815637, + "learning_rate": 1.633911750548323e-05, + "loss": 0.8702, + "step": 12517 + }, + { + "epoch": 0.6243391521197007, + "grad_norm": 0.3986558193283716, + "learning_rate": 1.6335329252120653e-05, + "loss": 0.8533, + "step": 12518 + }, + { + "epoch": 0.6243890274314214, + "grad_norm": 0.4481799361739271, + "learning_rate": 1.633154122485878e-05, + "loss": 0.856, + "step": 12519 + }, + { + "epoch": 0.6244389027431422, + "grad_norm": 0.5077651838229933, + "learning_rate": 1.6327753423796463e-05, + "loss": 0.8276, + "step": 12520 + }, + { + "epoch": 0.6244887780548628, + "grad_norm": 0.4437004203309032, + "learning_rate": 1.632396584903254e-05, + "loss": 0.8321, + "step": 12521 + }, + { + "epoch": 0.6245386533665835, + "grad_norm": 0.3938134481848903, + "learning_rate": 1.6320178500665844e-05, + "loss": 0.8307, + "step": 12522 + }, + { + "epoch": 0.6245885286783043, + "grad_norm": 0.4032953758367472, + "learning_rate": 1.63163913787952e-05, + "loss": 0.8451, + "step": 12523 + }, + { + "epoch": 0.6246384039900249, + "grad_norm": 0.5171616184293613, + "learning_rate": 1.631260448351945e-05, + "loss": 0.8759, + "step": 12524 + }, + { + "epoch": 0.6246882793017456, + "grad_norm": 0.4463579507019505, + "learning_rate": 1.6308817814937387e-05, + "loss": 0.8674, + "step": 12525 + }, + { + "epoch": 0.6247381546134664, + "grad_norm": 0.3917164134276361, + "learning_rate": 1.6305031373147838e-05, + "loss": 0.8399, + "step": 12526 + }, + { + "epoch": 0.624788029925187, + "grad_norm": 0.4545683682688828, + "learning_rate": 1.63012451582496e-05, + "loss": 0.8821, + "step": 12527 + }, + { + "epoch": 0.6248379052369077, + "grad_norm": 0.4133703306948351, + "learning_rate": 1.6297459170341482e-05, + "loss": 0.8314, + "step": 12528 + }, + { + "epoch": 0.6248877805486285, + "grad_norm": 0.49814232786418694, + "learning_rate": 1.629367340952227e-05, + "loss": 0.9037, + "step": 12529 + }, + { + "epoch": 0.6249376558603491, + "grad_norm": 3.736617258782608, + "learning_rate": 1.628988787589075e-05, + "loss": 0.8911, + "step": 12530 + }, + { + "epoch": 0.6249875311720698, + "grad_norm": 0.5119563110016467, + "learning_rate": 1.6286102569545708e-05, + "loss": 0.8302, + "step": 12531 + }, + { + "epoch": 0.6250374064837905, + "grad_norm": 0.4618934296684922, + "learning_rate": 1.628231749058593e-05, + "loss": 0.8864, + "step": 12532 + }, + { + "epoch": 0.6250872817955112, + "grad_norm": 0.43972887176896513, + "learning_rate": 1.6278532639110168e-05, + "loss": 0.8773, + "step": 12533 + }, + { + "epoch": 0.6251371571072319, + "grad_norm": 0.45682134595823715, + "learning_rate": 1.627474801521719e-05, + "loss": 0.868, + "step": 12534 + }, + { + "epoch": 0.6251870324189526, + "grad_norm": 0.46763482382227595, + "learning_rate": 1.6270963619005768e-05, + "loss": 0.8534, + "step": 12535 + }, + { + "epoch": 0.6252369077306733, + "grad_norm": 0.4341282781772669, + "learning_rate": 1.626717945057463e-05, + "loss": 0.9007, + "step": 12536 + }, + { + "epoch": 0.625286783042394, + "grad_norm": 0.6569592634134885, + "learning_rate": 1.6263395510022543e-05, + "loss": 0.8677, + "step": 12537 + }, + { + "epoch": 0.6253366583541147, + "grad_norm": 0.5355111255324462, + "learning_rate": 1.6259611797448238e-05, + "loss": 0.8609, + "step": 12538 + }, + { + "epoch": 0.6253865336658354, + "grad_norm": 0.46176273111779803, + "learning_rate": 1.6255828312950462e-05, + "loss": 0.8559, + "step": 12539 + }, + { + "epoch": 0.6254364089775561, + "grad_norm": 0.41856597711855437, + "learning_rate": 1.625204505662792e-05, + "loss": 0.8743, + "step": 12540 + }, + { + "epoch": 0.6254862842892768, + "grad_norm": 0.49715471780950543, + "learning_rate": 1.624826202857935e-05, + "loss": 0.9052, + "step": 12541 + }, + { + "epoch": 0.6255361596009975, + "grad_norm": 0.3920162092873034, + "learning_rate": 1.6244479228903464e-05, + "loss": 0.8666, + "step": 12542 + }, + { + "epoch": 0.6255860349127182, + "grad_norm": 0.44346342466697025, + "learning_rate": 1.6240696657698984e-05, + "loss": 0.8897, + "step": 12543 + }, + { + "epoch": 0.6256359102244389, + "grad_norm": 0.4464107683352051, + "learning_rate": 1.6236914315064596e-05, + "loss": 0.8559, + "step": 12544 + }, + { + "epoch": 0.6256857855361596, + "grad_norm": 0.43003465157976556, + "learning_rate": 1.6233132201099004e-05, + "loss": 0.8806, + "step": 12545 + }, + { + "epoch": 0.6257356608478803, + "grad_norm": 0.4124237028610283, + "learning_rate": 1.6229350315900915e-05, + "loss": 0.8678, + "step": 12546 + }, + { + "epoch": 0.625785536159601, + "grad_norm": 0.6510432350006481, + "learning_rate": 1.6225568659569e-05, + "loss": 0.8564, + "step": 12547 + }, + { + "epoch": 0.6258354114713217, + "grad_norm": 0.49616987040656874, + "learning_rate": 1.6221787232201938e-05, + "loss": 0.8928, + "step": 12548 + }, + { + "epoch": 0.6258852867830424, + "grad_norm": 0.4441499545561853, + "learning_rate": 1.621800603389841e-05, + "loss": 0.8607, + "step": 12549 + }, + { + "epoch": 0.6259351620947631, + "grad_norm": 0.466623762576356, + "learning_rate": 1.621422506475709e-05, + "loss": 0.8969, + "step": 12550 + }, + { + "epoch": 0.6259850374064838, + "grad_norm": 0.45651949480790843, + "learning_rate": 1.621044432487663e-05, + "loss": 0.8422, + "step": 12551 + }, + { + "epoch": 0.6260349127182044, + "grad_norm": 0.46666496097144416, + "learning_rate": 1.62066638143557e-05, + "loss": 0.8795, + "step": 12552 + }, + { + "epoch": 0.6260847880299252, + "grad_norm": 0.5051105240628426, + "learning_rate": 1.620288353329293e-05, + "loss": 0.8635, + "step": 12553 + }, + { + "epoch": 0.6261346633416459, + "grad_norm": 0.4413548999269752, + "learning_rate": 1.6199103481786987e-05, + "loss": 0.8963, + "step": 12554 + }, + { + "epoch": 0.6261845386533665, + "grad_norm": 0.5042032223468755, + "learning_rate": 1.6195323659936495e-05, + "loss": 0.8519, + "step": 12555 + }, + { + "epoch": 0.6262344139650873, + "grad_norm": 0.44273053669571694, + "learning_rate": 1.6191544067840095e-05, + "loss": 0.8758, + "step": 12556 + }, + { + "epoch": 0.626284289276808, + "grad_norm": 0.6536455962603634, + "learning_rate": 1.618776470559641e-05, + "loss": 0.8962, + "step": 12557 + }, + { + "epoch": 0.6263341645885286, + "grad_norm": 0.5299550938310745, + "learning_rate": 1.6183985573304057e-05, + "loss": 0.8734, + "step": 12558 + }, + { + "epoch": 0.6263840399002494, + "grad_norm": 0.48009627064628413, + "learning_rate": 1.6180206671061657e-05, + "loss": 0.8928, + "step": 12559 + }, + { + "epoch": 0.6264339152119701, + "grad_norm": 0.4179775088676129, + "learning_rate": 1.6176427998967812e-05, + "loss": 0.8872, + "step": 12560 + }, + { + "epoch": 0.6264837905236907, + "grad_norm": 0.5361371053167434, + "learning_rate": 1.6172649557121144e-05, + "loss": 0.8739, + "step": 12561 + }, + { + "epoch": 0.6265336658354115, + "grad_norm": 0.3988829048967093, + "learning_rate": 1.6168871345620225e-05, + "loss": 0.8116, + "step": 12562 + }, + { + "epoch": 0.6265835411471322, + "grad_norm": 0.45773631897123795, + "learning_rate": 1.6165093364563654e-05, + "loss": 0.9082, + "step": 12563 + }, + { + "epoch": 0.6266334164588528, + "grad_norm": 0.9441269196144263, + "learning_rate": 1.6161315614050015e-05, + "loss": 0.8647, + "step": 12564 + }, + { + "epoch": 0.6266832917705736, + "grad_norm": 0.5047112139913459, + "learning_rate": 1.6157538094177903e-05, + "loss": 0.8826, + "step": 12565 + }, + { + "epoch": 0.6267331670822943, + "grad_norm": 0.5268090844006409, + "learning_rate": 1.615376080504587e-05, + "loss": 0.8377, + "step": 12566 + }, + { + "epoch": 0.6267830423940149, + "grad_norm": 0.5527588362633209, + "learning_rate": 1.6149983746752483e-05, + "loss": 0.8619, + "step": 12567 + }, + { + "epoch": 0.6268329177057357, + "grad_norm": 0.5202099687923052, + "learning_rate": 1.614620691939631e-05, + "loss": 0.8923, + "step": 12568 + }, + { + "epoch": 0.6268827930174563, + "grad_norm": 0.4845876641587057, + "learning_rate": 1.6142430323075922e-05, + "loss": 0.831, + "step": 12569 + }, + { + "epoch": 0.626932668329177, + "grad_norm": 0.4375693238959936, + "learning_rate": 1.6138653957889834e-05, + "loss": 0.8997, + "step": 12570 + }, + { + "epoch": 0.6269825436408978, + "grad_norm": 0.39388980313040095, + "learning_rate": 1.613487782393661e-05, + "loss": 0.8531, + "step": 12571 + }, + { + "epoch": 0.6270324189526184, + "grad_norm": 0.44515543950360975, + "learning_rate": 1.613110192131479e-05, + "loss": 0.8487, + "step": 12572 + }, + { + "epoch": 0.6270822942643391, + "grad_norm": 0.4784237752755849, + "learning_rate": 1.612732625012288e-05, + "loss": 0.8799, + "step": 12573 + }, + { + "epoch": 0.6271321695760599, + "grad_norm": 0.4438239910751686, + "learning_rate": 1.612355081045943e-05, + "loss": 0.8515, + "step": 12574 + }, + { + "epoch": 0.6271820448877805, + "grad_norm": 0.47730428838287364, + "learning_rate": 1.6119775602422943e-05, + "loss": 0.8616, + "step": 12575 + }, + { + "epoch": 0.6272319201995012, + "grad_norm": 0.506337048140018, + "learning_rate": 1.6116000626111948e-05, + "loss": 0.8764, + "step": 12576 + }, + { + "epoch": 0.627281795511222, + "grad_norm": 0.39422906947457614, + "learning_rate": 1.6112225881624932e-05, + "loss": 0.8344, + "step": 12577 + }, + { + "epoch": 0.6273316708229426, + "grad_norm": 0.8688446603181272, + "learning_rate": 1.6108451369060406e-05, + "loss": 0.8415, + "step": 12578 + }, + { + "epoch": 0.6273815461346633, + "grad_norm": 0.4311517225983575, + "learning_rate": 1.610467708851686e-05, + "loss": 0.8983, + "step": 12579 + }, + { + "epoch": 0.6274314214463841, + "grad_norm": 0.4359435371992741, + "learning_rate": 1.6100903040092792e-05, + "loss": 0.9089, + "step": 12580 + }, + { + "epoch": 0.6274812967581047, + "grad_norm": 0.4827871354344566, + "learning_rate": 1.609712922388667e-05, + "loss": 0.8842, + "step": 12581 + }, + { + "epoch": 0.6275311720698254, + "grad_norm": 0.5515732623600961, + "learning_rate": 1.6093355639996978e-05, + "loss": 0.8554, + "step": 12582 + }, + { + "epoch": 0.6275810473815462, + "grad_norm": 0.49172581541409555, + "learning_rate": 1.6089582288522187e-05, + "loss": 0.8497, + "step": 12583 + }, + { + "epoch": 0.6276309226932668, + "grad_norm": 0.5504641369927475, + "learning_rate": 1.6085809169560755e-05, + "loss": 0.8771, + "step": 12584 + }, + { + "epoch": 0.6276807980049876, + "grad_norm": 0.45272625997369653, + "learning_rate": 1.6082036283211142e-05, + "loss": 0.9141, + "step": 12585 + }, + { + "epoch": 0.6277306733167082, + "grad_norm": 0.49747117713849126, + "learning_rate": 1.60782636295718e-05, + "loss": 0.8737, + "step": 12586 + }, + { + "epoch": 0.6277805486284289, + "grad_norm": 0.5273525863846524, + "learning_rate": 1.6074491208741182e-05, + "loss": 0.8615, + "step": 12587 + }, + { + "epoch": 0.6278304239401497, + "grad_norm": 0.45782418221482896, + "learning_rate": 1.607071902081772e-05, + "loss": 0.8285, + "step": 12588 + }, + { + "epoch": 0.6278802992518703, + "grad_norm": 0.4879286101677997, + "learning_rate": 1.6066947065899847e-05, + "loss": 0.8906, + "step": 12589 + }, + { + "epoch": 0.627930174563591, + "grad_norm": 0.5628598776338475, + "learning_rate": 1.606317534408599e-05, + "loss": 0.9096, + "step": 12590 + }, + { + "epoch": 0.6279800498753118, + "grad_norm": 0.6155234504660901, + "learning_rate": 1.6059403855474575e-05, + "loss": 0.8664, + "step": 12591 + }, + { + "epoch": 0.6280299251870324, + "grad_norm": 0.44759401760829287, + "learning_rate": 1.6055632600164012e-05, + "loss": 0.886, + "step": 12592 + }, + { + "epoch": 0.6280798004987531, + "grad_norm": 0.5797661594290133, + "learning_rate": 1.6051861578252718e-05, + "loss": 0.8767, + "step": 12593 + }, + { + "epoch": 0.6281296758104739, + "grad_norm": 0.4313105143670311, + "learning_rate": 1.6048090789839096e-05, + "loss": 0.8613, + "step": 12594 + }, + { + "epoch": 0.6281795511221945, + "grad_norm": 0.6600709089565998, + "learning_rate": 1.6044320235021532e-05, + "loss": 0.8841, + "step": 12595 + }, + { + "epoch": 0.6282294264339152, + "grad_norm": 0.4491380056302877, + "learning_rate": 1.604054991389842e-05, + "loss": 0.846, + "step": 12596 + }, + { + "epoch": 0.628279301745636, + "grad_norm": 0.5490858557997234, + "learning_rate": 1.6036779826568148e-05, + "loss": 0.8657, + "step": 12597 + }, + { + "epoch": 0.6283291770573566, + "grad_norm": 0.39839754973539837, + "learning_rate": 1.603300997312911e-05, + "loss": 0.8556, + "step": 12598 + }, + { + "epoch": 0.6283790523690773, + "grad_norm": 0.5630512859729433, + "learning_rate": 1.602924035367965e-05, + "loss": 0.8266, + "step": 12599 + }, + { + "epoch": 0.628428927680798, + "grad_norm": 0.5537251720708442, + "learning_rate": 1.6025470968318147e-05, + "loss": 0.8287, + "step": 12600 + }, + { + "epoch": 0.6284788029925187, + "grad_norm": 0.4588355332306675, + "learning_rate": 1.6021701817142966e-05, + "loss": 0.8903, + "step": 12601 + }, + { + "epoch": 0.6285286783042394, + "grad_norm": 0.5571934819730414, + "learning_rate": 1.6017932900252463e-05, + "loss": 0.8772, + "step": 12602 + }, + { + "epoch": 0.62857855361596, + "grad_norm": 0.45066257580163854, + "learning_rate": 1.6014164217744976e-05, + "loss": 0.8172, + "step": 12603 + }, + { + "epoch": 0.6286284289276808, + "grad_norm": 0.4723119312659352, + "learning_rate": 1.601039576971885e-05, + "loss": 0.8933, + "step": 12604 + }, + { + "epoch": 0.6286783042394015, + "grad_norm": 0.49897742033025005, + "learning_rate": 1.6006627556272436e-05, + "loss": 0.8974, + "step": 12605 + }, + { + "epoch": 0.6287281795511221, + "grad_norm": 0.43344848136489444, + "learning_rate": 1.6002859577504038e-05, + "loss": 0.8602, + "step": 12606 + }, + { + "epoch": 0.6287780548628429, + "grad_norm": 0.4372154082770401, + "learning_rate": 1.5999091833511993e-05, + "loss": 0.7975, + "step": 12607 + }, + { + "epoch": 0.6288279301745636, + "grad_norm": 0.8800730829815282, + "learning_rate": 1.5995324324394616e-05, + "loss": 0.86, + "step": 12608 + }, + { + "epoch": 0.6288778054862842, + "grad_norm": 0.4891824824951918, + "learning_rate": 1.5991557050250237e-05, + "loss": 0.854, + "step": 12609 + }, + { + "epoch": 0.628927680798005, + "grad_norm": 0.44762229809018267, + "learning_rate": 1.5987790011177127e-05, + "loss": 0.8669, + "step": 12610 + }, + { + "epoch": 0.6289775561097257, + "grad_norm": 0.4596214345153685, + "learning_rate": 1.5984023207273612e-05, + "loss": 0.9138, + "step": 12611 + }, + { + "epoch": 0.6290274314214463, + "grad_norm": 0.46486099369276385, + "learning_rate": 1.598025663863797e-05, + "loss": 0.8807, + "step": 12612 + }, + { + "epoch": 0.6290773067331671, + "grad_norm": 0.4440921206543466, + "learning_rate": 1.5976490305368508e-05, + "loss": 0.8754, + "step": 12613 + }, + { + "epoch": 0.6291271820448878, + "grad_norm": 0.44536476533116004, + "learning_rate": 1.5972724207563482e-05, + "loss": 0.8886, + "step": 12614 + }, + { + "epoch": 0.6291770573566084, + "grad_norm": 0.5163452064920754, + "learning_rate": 1.5968958345321178e-05, + "loss": 0.8704, + "step": 12615 + }, + { + "epoch": 0.6292269326683292, + "grad_norm": 0.5470415805223982, + "learning_rate": 1.5965192718739876e-05, + "loss": 0.8664, + "step": 12616 + }, + { + "epoch": 0.6292768079800499, + "grad_norm": 0.7303323488642651, + "learning_rate": 1.596142732791781e-05, + "loss": 0.8299, + "step": 12617 + }, + { + "epoch": 0.6293266832917705, + "grad_norm": 0.41278361124347507, + "learning_rate": 1.595766217295326e-05, + "loss": 0.8359, + "step": 12618 + }, + { + "epoch": 0.6293765586034913, + "grad_norm": 0.4089943755858503, + "learning_rate": 1.5953897253944464e-05, + "loss": 0.8657, + "step": 12619 + }, + { + "epoch": 0.629426433915212, + "grad_norm": 0.44457809217568195, + "learning_rate": 1.5950132570989683e-05, + "loss": 0.8723, + "step": 12620 + }, + { + "epoch": 0.6294763092269327, + "grad_norm": 0.40645233828457544, + "learning_rate": 1.5946368124187132e-05, + "loss": 0.8398, + "step": 12621 + }, + { + "epoch": 0.6295261845386534, + "grad_norm": 0.5385910486168755, + "learning_rate": 1.594260391363505e-05, + "loss": 0.8668, + "step": 12622 + }, + { + "epoch": 0.629576059850374, + "grad_norm": 0.4291688069894156, + "learning_rate": 1.5938839939431672e-05, + "loss": 0.8364, + "step": 12623 + }, + { + "epoch": 0.6296259351620948, + "grad_norm": 0.582677046897844, + "learning_rate": 1.5935076201675208e-05, + "loss": 0.88, + "step": 12624 + }, + { + "epoch": 0.6296758104738155, + "grad_norm": 0.4187773819801336, + "learning_rate": 1.593131270046388e-05, + "loss": 0.8837, + "step": 12625 + }, + { + "epoch": 0.6297256857855361, + "grad_norm": 0.4638353047304278, + "learning_rate": 1.5927549435895877e-05, + "loss": 0.8172, + "step": 12626 + }, + { + "epoch": 0.6297755610972569, + "grad_norm": 0.4785698400911589, + "learning_rate": 1.592378640806942e-05, + "loss": 0.9189, + "step": 12627 + }, + { + "epoch": 0.6298254364089776, + "grad_norm": 0.45942842527402844, + "learning_rate": 1.5920023617082692e-05, + "loss": 0.8336, + "step": 12628 + }, + { + "epoch": 0.6298753117206982, + "grad_norm": 0.5746457705007845, + "learning_rate": 1.5916261063033883e-05, + "loss": 0.891, + "step": 12629 + }, + { + "epoch": 0.629925187032419, + "grad_norm": 0.4253776389348577, + "learning_rate": 1.5912498746021177e-05, + "loss": 0.9024, + "step": 12630 + }, + { + "epoch": 0.6299750623441397, + "grad_norm": 0.551863883755714, + "learning_rate": 1.5908736666142753e-05, + "loss": 0.8527, + "step": 12631 + }, + { + "epoch": 0.6300249376558603, + "grad_norm": 0.4683399938445151, + "learning_rate": 1.590497482349677e-05, + "loss": 0.8727, + "step": 12632 + }, + { + "epoch": 0.630074812967581, + "grad_norm": 0.49423636586554975, + "learning_rate": 1.5901213218181398e-05, + "loss": 0.8618, + "step": 12633 + }, + { + "epoch": 0.6301246882793018, + "grad_norm": 0.4420614504267785, + "learning_rate": 1.5897451850294797e-05, + "loss": 0.8428, + "step": 12634 + }, + { + "epoch": 0.6301745635910224, + "grad_norm": 0.5595337557863062, + "learning_rate": 1.5893690719935128e-05, + "loss": 0.8465, + "step": 12635 + }, + { + "epoch": 0.6302244389027432, + "grad_norm": 0.683779822182428, + "learning_rate": 1.5889929827200512e-05, + "loss": 0.8556, + "step": 12636 + }, + { + "epoch": 0.6302743142144639, + "grad_norm": 0.44947449432895625, + "learning_rate": 1.58861691721891e-05, + "loss": 0.8767, + "step": 12637 + }, + { + "epoch": 0.6303241895261845, + "grad_norm": 0.6691225389595175, + "learning_rate": 1.5882408754999024e-05, + "loss": 0.9332, + "step": 12638 + }, + { + "epoch": 0.6303740648379053, + "grad_norm": 0.4061226580591135, + "learning_rate": 1.587864857572842e-05, + "loss": 0.8527, + "step": 12639 + }, + { + "epoch": 0.6304239401496259, + "grad_norm": 0.9929486850663887, + "learning_rate": 1.5874888634475392e-05, + "loss": 0.865, + "step": 12640 + }, + { + "epoch": 0.6304738154613466, + "grad_norm": 0.4103760966193316, + "learning_rate": 1.587112893133806e-05, + "loss": 0.8832, + "step": 12641 + }, + { + "epoch": 0.6305236907730674, + "grad_norm": 0.47446895121914623, + "learning_rate": 1.5867369466414544e-05, + "loss": 0.8836, + "step": 12642 + }, + { + "epoch": 0.630573566084788, + "grad_norm": 0.45072164882706117, + "learning_rate": 1.5863610239802924e-05, + "loss": 0.8772, + "step": 12643 + }, + { + "epoch": 0.6306234413965087, + "grad_norm": 0.3951634513818072, + "learning_rate": 1.5859851251601304e-05, + "loss": 0.8289, + "step": 12644 + }, + { + "epoch": 0.6306733167082295, + "grad_norm": 0.41262821907618313, + "learning_rate": 1.5856092501907777e-05, + "loss": 0.8571, + "step": 12645 + }, + { + "epoch": 0.6307231920199501, + "grad_norm": 0.42406520951242066, + "learning_rate": 1.5852333990820433e-05, + "loss": 0.878, + "step": 12646 + }, + { + "epoch": 0.6307730673316708, + "grad_norm": 0.4818802971420231, + "learning_rate": 1.584857571843733e-05, + "loss": 0.8308, + "step": 12647 + }, + { + "epoch": 0.6308229426433916, + "grad_norm": 0.4085241583613815, + "learning_rate": 1.5844817684856547e-05, + "loss": 0.8528, + "step": 12648 + }, + { + "epoch": 0.6308728179551122, + "grad_norm": 0.4551793201709128, + "learning_rate": 1.584105989017615e-05, + "loss": 0.8589, + "step": 12649 + }, + { + "epoch": 0.6309226932668329, + "grad_norm": 0.4280027267435082, + "learning_rate": 1.583730233449421e-05, + "loss": 0.895, + "step": 12650 + }, + { + "epoch": 0.6309725685785537, + "grad_norm": 0.46452681425326475, + "learning_rate": 1.5833545017908746e-05, + "loss": 0.8593, + "step": 12651 + }, + { + "epoch": 0.6310224438902743, + "grad_norm": 0.47114925205483915, + "learning_rate": 1.582978794051783e-05, + "loss": 0.8428, + "step": 12652 + }, + { + "epoch": 0.631072319201995, + "grad_norm": 0.5972921645008106, + "learning_rate": 1.58260311024195e-05, + "loss": 0.8977, + "step": 12653 + }, + { + "epoch": 0.6311221945137158, + "grad_norm": 0.4290661171178466, + "learning_rate": 1.5822274503711776e-05, + "loss": 0.8203, + "step": 12654 + }, + { + "epoch": 0.6311720698254364, + "grad_norm": 0.5617480436540662, + "learning_rate": 1.5818518144492688e-05, + "loss": 0.8756, + "step": 12655 + }, + { + "epoch": 0.6312219451371571, + "grad_norm": 0.4702623328801269, + "learning_rate": 1.581476202486026e-05, + "loss": 0.8462, + "step": 12656 + }, + { + "epoch": 0.6312718204488778, + "grad_norm": 0.4787985351926607, + "learning_rate": 1.5811006144912516e-05, + "loss": 0.8967, + "step": 12657 + }, + { + "epoch": 0.6313216957605985, + "grad_norm": 0.3923688612530066, + "learning_rate": 1.5807250504747445e-05, + "loss": 0.857, + "step": 12658 + }, + { + "epoch": 0.6313715710723192, + "grad_norm": 0.8911987403461826, + "learning_rate": 1.580349510446306e-05, + "loss": 0.8308, + "step": 12659 + }, + { + "epoch": 0.6314214463840399, + "grad_norm": 0.45318016412500584, + "learning_rate": 1.5799739944157354e-05, + "loss": 0.9009, + "step": 12660 + }, + { + "epoch": 0.6314713216957606, + "grad_norm": 0.5393239793101969, + "learning_rate": 1.579598502392832e-05, + "loss": 0.8293, + "step": 12661 + }, + { + "epoch": 0.6315211970074813, + "grad_norm": 0.3836875174748395, + "learning_rate": 1.5792230343873936e-05, + "loss": 0.8689, + "step": 12662 + }, + { + "epoch": 0.631571072319202, + "grad_norm": 0.4769821874910526, + "learning_rate": 1.5788475904092178e-05, + "loss": 0.8719, + "step": 12663 + }, + { + "epoch": 0.6316209476309227, + "grad_norm": 0.5208733759526314, + "learning_rate": 1.5784721704681023e-05, + "loss": 0.857, + "step": 12664 + }, + { + "epoch": 0.6316708229426434, + "grad_norm": 0.968281256070103, + "learning_rate": 1.578096774573843e-05, + "loss": 0.8809, + "step": 12665 + }, + { + "epoch": 0.631720698254364, + "grad_norm": 0.4156414014766906, + "learning_rate": 1.5777214027362353e-05, + "loss": 0.9001, + "step": 12666 + }, + { + "epoch": 0.6317705735660848, + "grad_norm": 0.45621406804013676, + "learning_rate": 1.5773460549650753e-05, + "loss": 0.8141, + "step": 12667 + }, + { + "epoch": 0.6318204488778055, + "grad_norm": 0.40858548519457016, + "learning_rate": 1.576970731270157e-05, + "loss": 0.8139, + "step": 12668 + }, + { + "epoch": 0.6318703241895262, + "grad_norm": 0.45518118427446086, + "learning_rate": 1.5765954316612742e-05, + "loss": 0.9036, + "step": 12669 + }, + { + "epoch": 0.6319201995012469, + "grad_norm": 0.6587968265220534, + "learning_rate": 1.5762201561482205e-05, + "loss": 0.864, + "step": 12670 + }, + { + "epoch": 0.6319700748129676, + "grad_norm": 0.5213427949877464, + "learning_rate": 1.5758449047407884e-05, + "loss": 0.8987, + "step": 12671 + }, + { + "epoch": 0.6320199501246883, + "grad_norm": 0.5070380490128784, + "learning_rate": 1.5754696774487702e-05, + "loss": 0.8634, + "step": 12672 + }, + { + "epoch": 0.632069825436409, + "grad_norm": 0.5190433618811654, + "learning_rate": 1.5750944742819562e-05, + "loss": 0.8285, + "step": 12673 + }, + { + "epoch": 0.6321197007481296, + "grad_norm": 0.8458252761576368, + "learning_rate": 1.5747192952501382e-05, + "loss": 0.8576, + "step": 12674 + }, + { + "epoch": 0.6321695760598504, + "grad_norm": 0.49872687300022145, + "learning_rate": 1.5743441403631072e-05, + "loss": 0.8632, + "step": 12675 + }, + { + "epoch": 0.6322194513715711, + "grad_norm": 0.49551525567811416, + "learning_rate": 1.5739690096306505e-05, + "loss": 0.8545, + "step": 12676 + }, + { + "epoch": 0.6322693266832917, + "grad_norm": 0.4754529538905854, + "learning_rate": 1.573593903062558e-05, + "loss": 0.8332, + "step": 12677 + }, + { + "epoch": 0.6323192019950125, + "grad_norm": 0.4020545098478668, + "learning_rate": 1.573218820668618e-05, + "loss": 0.8423, + "step": 12678 + }, + { + "epoch": 0.6323690773067332, + "grad_norm": 0.5359905463856734, + "learning_rate": 1.572843762458619e-05, + "loss": 0.8491, + "step": 12679 + }, + { + "epoch": 0.6324189526184538, + "grad_norm": 0.4129774987004901, + "learning_rate": 1.5724687284423462e-05, + "loss": 0.8371, + "step": 12680 + }, + { + "epoch": 0.6324688279301746, + "grad_norm": 0.5125441216158557, + "learning_rate": 1.572093718629587e-05, + "loss": 0.8713, + "step": 12681 + }, + { + "epoch": 0.6325187032418953, + "grad_norm": 0.4562007618729531, + "learning_rate": 1.5717187330301267e-05, + "loss": 0.9021, + "step": 12682 + }, + { + "epoch": 0.6325685785536159, + "grad_norm": 0.43586104289058336, + "learning_rate": 1.5713437716537515e-05, + "loss": 0.868, + "step": 12683 + }, + { + "epoch": 0.6326184538653367, + "grad_norm": 0.44931794547502424, + "learning_rate": 1.5709688345102443e-05, + "loss": 0.862, + "step": 12684 + }, + { + "epoch": 0.6326683291770574, + "grad_norm": 0.4847988243483537, + "learning_rate": 1.5705939216093895e-05, + "loss": 0.8251, + "step": 12685 + }, + { + "epoch": 0.632718204488778, + "grad_norm": 0.5014117380018545, + "learning_rate": 1.57021903296097e-05, + "loss": 0.8636, + "step": 12686 + }, + { + "epoch": 0.6327680798004988, + "grad_norm": 0.6129665022525653, + "learning_rate": 1.56984416857477e-05, + "loss": 0.8773, + "step": 12687 + }, + { + "epoch": 0.6328179551122195, + "grad_norm": 0.40657415300084676, + "learning_rate": 1.5694693284605694e-05, + "loss": 0.8672, + "step": 12688 + }, + { + "epoch": 0.6328678304239401, + "grad_norm": 0.42378575718970146, + "learning_rate": 1.5690945126281498e-05, + "loss": 0.8636, + "step": 12689 + }, + { + "epoch": 0.6329177057356609, + "grad_norm": 0.4278946092213753, + "learning_rate": 1.5687197210872935e-05, + "loss": 0.852, + "step": 12690 + }, + { + "epoch": 0.6329675810473816, + "grad_norm": 0.41122577141210664, + "learning_rate": 1.5683449538477785e-05, + "loss": 0.8234, + "step": 12691 + }, + { + "epoch": 0.6330174563591022, + "grad_norm": 0.5947622892870141, + "learning_rate": 1.5679702109193843e-05, + "loss": 0.8413, + "step": 12692 + }, + { + "epoch": 0.633067331670823, + "grad_norm": 0.566092074947707, + "learning_rate": 1.5675954923118906e-05, + "loss": 0.8344, + "step": 12693 + }, + { + "epoch": 0.6331172069825436, + "grad_norm": 0.550597821726383, + "learning_rate": 1.5672207980350768e-05, + "loss": 0.8648, + "step": 12694 + }, + { + "epoch": 0.6331670822942643, + "grad_norm": 0.44118262786090456, + "learning_rate": 1.5668461280987172e-05, + "loss": 0.8425, + "step": 12695 + }, + { + "epoch": 0.6332169576059851, + "grad_norm": 0.5261241045413124, + "learning_rate": 1.5664714825125905e-05, + "loss": 0.869, + "step": 12696 + }, + { + "epoch": 0.6332668329177057, + "grad_norm": 0.40208637818864473, + "learning_rate": 1.5660968612864726e-05, + "loss": 0.8589, + "step": 12697 + }, + { + "epoch": 0.6333167082294264, + "grad_norm": 0.5073922707250111, + "learning_rate": 1.5657222644301395e-05, + "loss": 0.9041, + "step": 12698 + }, + { + "epoch": 0.6333665835411472, + "grad_norm": 0.3902436861276853, + "learning_rate": 1.565347691953366e-05, + "loss": 0.851, + "step": 12699 + }, + { + "epoch": 0.6334164588528678, + "grad_norm": 0.5823223019287116, + "learning_rate": 1.5649731438659256e-05, + "loss": 0.8758, + "step": 12700 + }, + { + "epoch": 0.6334663341645885, + "grad_norm": 1.2537076911554286, + "learning_rate": 1.564598620177593e-05, + "loss": 0.8291, + "step": 12701 + }, + { + "epoch": 0.6335162094763093, + "grad_norm": 0.44638318978712394, + "learning_rate": 1.5642241208981405e-05, + "loss": 0.8713, + "step": 12702 + }, + { + "epoch": 0.6335660847880299, + "grad_norm": 0.41921773805835955, + "learning_rate": 1.5638496460373413e-05, + "loss": 0.8591, + "step": 12703 + }, + { + "epoch": 0.6336159600997506, + "grad_norm": 0.5514863294135772, + "learning_rate": 1.563475195604966e-05, + "loss": 0.8709, + "step": 12704 + }, + { + "epoch": 0.6336658354114714, + "grad_norm": 0.41486668457254516, + "learning_rate": 1.5631007696107867e-05, + "loss": 0.8657, + "step": 12705 + }, + { + "epoch": 0.633715710723192, + "grad_norm": 0.4173316356085459, + "learning_rate": 1.5627263680645733e-05, + "loss": 0.8255, + "step": 12706 + }, + { + "epoch": 0.6337655860349127, + "grad_norm": 0.6147640297483261, + "learning_rate": 1.5623519909760954e-05, + "loss": 0.8433, + "step": 12707 + }, + { + "epoch": 0.6338154613466335, + "grad_norm": 0.44358658698628955, + "learning_rate": 1.5619776383551233e-05, + "loss": 0.8466, + "step": 12708 + }, + { + "epoch": 0.6338653366583541, + "grad_norm": 0.44282683098305975, + "learning_rate": 1.561603310211425e-05, + "loss": 0.8445, + "step": 12709 + }, + { + "epoch": 0.6339152119700748, + "grad_norm": 0.43014002918237004, + "learning_rate": 1.561229006554768e-05, + "loss": 0.8638, + "step": 12710 + }, + { + "epoch": 0.6339650872817955, + "grad_norm": 0.5558594294380932, + "learning_rate": 1.5608547273949198e-05, + "loss": 0.8241, + "step": 12711 + }, + { + "epoch": 0.6340149625935162, + "grad_norm": 0.41746868467801285, + "learning_rate": 1.5604804727416484e-05, + "loss": 0.854, + "step": 12712 + }, + { + "epoch": 0.6340648379052369, + "grad_norm": 0.42587006451344467, + "learning_rate": 1.5601062426047172e-05, + "loss": 0.8878, + "step": 12713 + }, + { + "epoch": 0.6341147132169576, + "grad_norm": 0.4505927983281683, + "learning_rate": 1.559732036993893e-05, + "loss": 0.8907, + "step": 12714 + }, + { + "epoch": 0.6341645885286783, + "grad_norm": 0.5459094196146878, + "learning_rate": 1.559357855918941e-05, + "loss": 0.8576, + "step": 12715 + }, + { + "epoch": 0.634214463840399, + "grad_norm": 0.4384709813693934, + "learning_rate": 1.5589836993896252e-05, + "loss": 0.866, + "step": 12716 + }, + { + "epoch": 0.6342643391521197, + "grad_norm": 0.4015224895261242, + "learning_rate": 1.5586095674157076e-05, + "loss": 0.8273, + "step": 12717 + }, + { + "epoch": 0.6343142144638404, + "grad_norm": 0.5320092949981602, + "learning_rate": 1.5582354600069523e-05, + "loss": 0.8453, + "step": 12718 + }, + { + "epoch": 0.6343640897755611, + "grad_norm": 0.461540355873915, + "learning_rate": 1.5578613771731213e-05, + "loss": 0.8467, + "step": 12719 + }, + { + "epoch": 0.6344139650872818, + "grad_norm": 0.49273195701027417, + "learning_rate": 1.557487318923977e-05, + "loss": 0.8611, + "step": 12720 + }, + { + "epoch": 0.6344638403990025, + "grad_norm": 0.5081362564862022, + "learning_rate": 1.557113285269278e-05, + "loss": 0.8965, + "step": 12721 + }, + { + "epoch": 0.6345137157107232, + "grad_norm": 0.6421870334936786, + "learning_rate": 1.556739276218786e-05, + "loss": 0.8826, + "step": 12722 + }, + { + "epoch": 0.6345635910224439, + "grad_norm": 0.48512117821918604, + "learning_rate": 1.5563652917822615e-05, + "loss": 0.8437, + "step": 12723 + }, + { + "epoch": 0.6346134663341646, + "grad_norm": 0.4707018926285901, + "learning_rate": 1.5559913319694614e-05, + "loss": 0.8305, + "step": 12724 + }, + { + "epoch": 0.6346633416458853, + "grad_norm": 0.43161275063278, + "learning_rate": 1.555617396790145e-05, + "loss": 0.8524, + "step": 12725 + }, + { + "epoch": 0.634713216957606, + "grad_norm": 0.42563309417816214, + "learning_rate": 1.5552434862540698e-05, + "loss": 0.8784, + "step": 12726 + }, + { + "epoch": 0.6347630922693267, + "grad_norm": 0.5312746257428524, + "learning_rate": 1.5548696003709938e-05, + "loss": 0.8657, + "step": 12727 + }, + { + "epoch": 0.6348129675810473, + "grad_norm": 0.48737007598192295, + "learning_rate": 1.554495739150672e-05, + "loss": 0.8952, + "step": 12728 + }, + { + "epoch": 0.6348628428927681, + "grad_norm": 0.4245462312467989, + "learning_rate": 1.554121902602861e-05, + "loss": 0.87, + "step": 12729 + }, + { + "epoch": 0.6349127182044888, + "grad_norm": 0.4485505352224559, + "learning_rate": 1.5537480907373152e-05, + "loss": 0.8892, + "step": 12730 + }, + { + "epoch": 0.6349625935162094, + "grad_norm": 0.4158895920431664, + "learning_rate": 1.5533743035637906e-05, + "loss": 0.8981, + "step": 12731 + }, + { + "epoch": 0.6350124688279302, + "grad_norm": 0.4567862441827651, + "learning_rate": 1.5530005410920388e-05, + "loss": 0.8622, + "step": 12732 + }, + { + "epoch": 0.6350623441396509, + "grad_norm": 0.5091023870723191, + "learning_rate": 1.5526268033318142e-05, + "loss": 0.8879, + "step": 12733 + }, + { + "epoch": 0.6351122194513715, + "grad_norm": 0.486542747879738, + "learning_rate": 1.552253090292869e-05, + "loss": 0.8525, + "step": 12734 + }, + { + "epoch": 0.6351620947630923, + "grad_norm": 0.5153484946043588, + "learning_rate": 1.5518794019849565e-05, + "loss": 0.8673, + "step": 12735 + }, + { + "epoch": 0.635211970074813, + "grad_norm": 0.4782306287964341, + "learning_rate": 1.5515057384178257e-05, + "loss": 0.862, + "step": 12736 + }, + { + "epoch": 0.6352618453865336, + "grad_norm": 0.5916920759304907, + "learning_rate": 1.5511320996012276e-05, + "loss": 0.847, + "step": 12737 + }, + { + "epoch": 0.6353117206982544, + "grad_norm": 0.49645105078342733, + "learning_rate": 1.5507584855449138e-05, + "loss": 0.8711, + "step": 12738 + }, + { + "epoch": 0.6353615960099751, + "grad_norm": 0.47214825595896226, + "learning_rate": 1.5503848962586317e-05, + "loss": 0.8439, + "step": 12739 + }, + { + "epoch": 0.6354114713216957, + "grad_norm": 0.7291651910285729, + "learning_rate": 1.5500113317521315e-05, + "loss": 0.9193, + "step": 12740 + }, + { + "epoch": 0.6354613466334165, + "grad_norm": 0.6861349192790446, + "learning_rate": 1.5496377920351597e-05, + "loss": 0.8888, + "step": 12741 + }, + { + "epoch": 0.6355112219451372, + "grad_norm": 0.48471897410226106, + "learning_rate": 1.5492642771174648e-05, + "loss": 0.9049, + "step": 12742 + }, + { + "epoch": 0.6355610972568578, + "grad_norm": 0.5341718870141575, + "learning_rate": 1.548890787008793e-05, + "loss": 0.889, + "step": 12743 + }, + { + "epoch": 0.6356109725685786, + "grad_norm": 0.5774973661686253, + "learning_rate": 1.5485173217188902e-05, + "loss": 0.8355, + "step": 12744 + }, + { + "epoch": 0.6356608478802993, + "grad_norm": 0.5482764437415469, + "learning_rate": 1.5481438812575025e-05, + "loss": 0.9052, + "step": 12745 + }, + { + "epoch": 0.6357107231920199, + "grad_norm": 0.5152853839375743, + "learning_rate": 1.5477704656343744e-05, + "loss": 0.8511, + "step": 12746 + }, + { + "epoch": 0.6357605985037407, + "grad_norm": 0.44467987465939535, + "learning_rate": 1.547397074859249e-05, + "loss": 0.844, + "step": 12747 + }, + { + "epoch": 0.6358104738154613, + "grad_norm": 0.5857744654132556, + "learning_rate": 1.547023708941871e-05, + "loss": 0.8692, + "step": 12748 + }, + { + "epoch": 0.635860349127182, + "grad_norm": 0.43167937688320446, + "learning_rate": 1.5466503678919835e-05, + "loss": 0.8796, + "step": 12749 + }, + { + "epoch": 0.6359102244389028, + "grad_norm": 0.5856308355043254, + "learning_rate": 1.5462770517193274e-05, + "loss": 0.8716, + "step": 12750 + }, + { + "epoch": 0.6359600997506234, + "grad_norm": 0.4814884177766603, + "learning_rate": 1.5459037604336446e-05, + "loss": 0.8926, + "step": 12751 + }, + { + "epoch": 0.6360099750623441, + "grad_norm": 0.4727105545249358, + "learning_rate": 1.545530494044676e-05, + "loss": 0.8632, + "step": 12752 + }, + { + "epoch": 0.6360598503740649, + "grad_norm": 0.46430613327587295, + "learning_rate": 1.5451572525621633e-05, + "loss": 0.8687, + "step": 12753 + }, + { + "epoch": 0.6361097256857855, + "grad_norm": 0.7399964086669844, + "learning_rate": 1.5447840359958436e-05, + "loss": 0.8373, + "step": 12754 + }, + { + "epoch": 0.6361596009975062, + "grad_norm": 0.5171649608194833, + "learning_rate": 1.5444108443554568e-05, + "loss": 0.8538, + "step": 12755 + }, + { + "epoch": 0.636209476309227, + "grad_norm": 0.4324895845278009, + "learning_rate": 1.5440376776507415e-05, + "loss": 0.8481, + "step": 12756 + }, + { + "epoch": 0.6362593516209476, + "grad_norm": 0.47057634986208186, + "learning_rate": 1.5436645358914362e-05, + "loss": 0.8543, + "step": 12757 + }, + { + "epoch": 0.6363092269326683, + "grad_norm": 0.7094434759239194, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.8703, + "step": 12758 + }, + { + "epoch": 0.6363591022443891, + "grad_norm": 0.537200924103604, + "learning_rate": 1.5429183272479977e-05, + "loss": 0.8636, + "step": 12759 + }, + { + "epoch": 0.6364089775561097, + "grad_norm": 0.6663494587095442, + "learning_rate": 1.5425452603833385e-05, + "loss": 0.9111, + "step": 12760 + }, + { + "epoch": 0.6364588528678304, + "grad_norm": 0.5355010793042972, + "learning_rate": 1.5421722185030314e-05, + "loss": 0.8876, + "step": 12761 + }, + { + "epoch": 0.6365087281795512, + "grad_norm": 0.5056218142915797, + "learning_rate": 1.5417992016168113e-05, + "loss": 0.8634, + "step": 12762 + }, + { + "epoch": 0.6365586034912718, + "grad_norm": 0.4465690696034181, + "learning_rate": 1.541426209734412e-05, + "loss": 0.8783, + "step": 12763 + }, + { + "epoch": 0.6366084788029925, + "grad_norm": 0.5371305992834184, + "learning_rate": 1.5410532428655682e-05, + "loss": 0.8307, + "step": 12764 + }, + { + "epoch": 0.6366583541147132, + "grad_norm": 0.5033401528957037, + "learning_rate": 1.54068030102001e-05, + "loss": 0.8522, + "step": 12765 + }, + { + "epoch": 0.6367082294264339, + "grad_norm": 0.4956874447177887, + "learning_rate": 1.5403073842074695e-05, + "loss": 0.8702, + "step": 12766 + }, + { + "epoch": 0.6367581047381546, + "grad_norm": 0.4347607885732119, + "learning_rate": 1.5399344924376787e-05, + "loss": 0.8871, + "step": 12767 + }, + { + "epoch": 0.6368079800498753, + "grad_norm": 0.5214317943220989, + "learning_rate": 1.539561625720368e-05, + "loss": 0.8618, + "step": 12768 + }, + { + "epoch": 0.636857855361596, + "grad_norm": 0.5232823372707408, + "learning_rate": 1.539188784065267e-05, + "loss": 0.8884, + "step": 12769 + }, + { + "epoch": 0.6369077306733167, + "grad_norm": 0.5433879933684899, + "learning_rate": 1.5388159674821036e-05, + "loss": 0.8615, + "step": 12770 + }, + { + "epoch": 0.6369576059850374, + "grad_norm": 0.3865118808739156, + "learning_rate": 1.5384431759806083e-05, + "loss": 0.8378, + "step": 12771 + }, + { + "epoch": 0.6370074812967581, + "grad_norm": 0.42025504978289646, + "learning_rate": 1.538070409570508e-05, + "loss": 0.8732, + "step": 12772 + }, + { + "epoch": 0.6370573566084788, + "grad_norm": 0.5453389831997538, + "learning_rate": 1.5376976682615293e-05, + "loss": 0.9056, + "step": 12773 + }, + { + "epoch": 0.6371072319201995, + "grad_norm": 0.4299212230221823, + "learning_rate": 1.5373249520633993e-05, + "loss": 0.8189, + "step": 12774 + }, + { + "epoch": 0.6371571072319202, + "grad_norm": 0.5186059757426638, + "learning_rate": 1.5369522609858446e-05, + "loss": 0.8483, + "step": 12775 + }, + { + "epoch": 0.6372069825436409, + "grad_norm": 0.49356606469511655, + "learning_rate": 1.5365795950385886e-05, + "loss": 0.8679, + "step": 12776 + }, + { + "epoch": 0.6372568578553616, + "grad_norm": 1.8145775092952943, + "learning_rate": 1.5362069542313578e-05, + "loss": 0.8728, + "step": 12777 + }, + { + "epoch": 0.6373067331670823, + "grad_norm": 0.516286308056904, + "learning_rate": 1.5358343385738742e-05, + "loss": 0.8705, + "step": 12778 + }, + { + "epoch": 0.637356608478803, + "grad_norm": 0.46627148333410007, + "learning_rate": 1.5354617480758632e-05, + "loss": 0.8538, + "step": 12779 + }, + { + "epoch": 0.6374064837905237, + "grad_norm": 0.5138687142636689, + "learning_rate": 1.5350891827470455e-05, + "loss": 0.8203, + "step": 12780 + }, + { + "epoch": 0.6374563591022444, + "grad_norm": 0.4393863852062563, + "learning_rate": 1.5347166425971434e-05, + "loss": 0.874, + "step": 12781 + }, + { + "epoch": 0.637506234413965, + "grad_norm": 0.5527242367929026, + "learning_rate": 1.53434412763588e-05, + "loss": 0.8447, + "step": 12782 + }, + { + "epoch": 0.6375561097256858, + "grad_norm": 0.6389889771692348, + "learning_rate": 1.533971637872973e-05, + "loss": 0.9114, + "step": 12783 + }, + { + "epoch": 0.6376059850374065, + "grad_norm": 0.4438022964160448, + "learning_rate": 1.533599173318144e-05, + "loss": 0.8779, + "step": 12784 + }, + { + "epoch": 0.6376558603491271, + "grad_norm": 0.6361394817358139, + "learning_rate": 1.5332267339811118e-05, + "loss": 0.8667, + "step": 12785 + }, + { + "epoch": 0.6377057356608479, + "grad_norm": 0.4993578324620504, + "learning_rate": 1.5328543198715967e-05, + "loss": 0.8364, + "step": 12786 + }, + { + "epoch": 0.6377556109725686, + "grad_norm": 0.510309891373089, + "learning_rate": 1.5324819309993142e-05, + "loss": 0.8854, + "step": 12787 + }, + { + "epoch": 0.6378054862842892, + "grad_norm": 0.42728044133791904, + "learning_rate": 1.5321095673739826e-05, + "loss": 0.8802, + "step": 12788 + }, + { + "epoch": 0.63785536159601, + "grad_norm": 0.39014795296564403, + "learning_rate": 1.5317372290053185e-05, + "loss": 0.8961, + "step": 12789 + }, + { + "epoch": 0.6379052369077307, + "grad_norm": 0.45978190280150905, + "learning_rate": 1.5313649159030394e-05, + "loss": 0.8616, + "step": 12790 + }, + { + "epoch": 0.6379551122194513, + "grad_norm": 0.5576285174755843, + "learning_rate": 1.5309926280768584e-05, + "loss": 0.859, + "step": 12791 + }, + { + "epoch": 0.6380049875311721, + "grad_norm": 0.5424758973889665, + "learning_rate": 1.5306203655364906e-05, + "loss": 0.8701, + "step": 12792 + }, + { + "epoch": 0.6380548628428928, + "grad_norm": 0.9320060044454251, + "learning_rate": 1.530248128291651e-05, + "loss": 0.8431, + "step": 12793 + }, + { + "epoch": 0.6381047381546134, + "grad_norm": 0.4165517499459783, + "learning_rate": 1.529875916352053e-05, + "loss": 0.8334, + "step": 12794 + }, + { + "epoch": 0.6381546134663342, + "grad_norm": 0.38794011745164214, + "learning_rate": 1.529503729727408e-05, + "loss": 0.8254, + "step": 12795 + }, + { + "epoch": 0.6382044887780549, + "grad_norm": 0.4274828713900511, + "learning_rate": 1.529131568427429e-05, + "loss": 0.8547, + "step": 12796 + }, + { + "epoch": 0.6382543640897755, + "grad_norm": 0.43020809184231656, + "learning_rate": 1.528759432461828e-05, + "loss": 0.8604, + "step": 12797 + }, + { + "epoch": 0.6383042394014963, + "grad_norm": 0.48383052951735395, + "learning_rate": 1.528387321840314e-05, + "loss": 0.8662, + "step": 12798 + }, + { + "epoch": 0.6383541147132169, + "grad_norm": 0.47092706086551933, + "learning_rate": 1.5280152365725982e-05, + "loss": 0.9002, + "step": 12799 + }, + { + "epoch": 0.6384039900249376, + "grad_norm": 0.4471767991463357, + "learning_rate": 1.5276431766683896e-05, + "loss": 0.8495, + "step": 12800 + }, + { + "epoch": 0.6384538653366584, + "grad_norm": 0.46254574486559763, + "learning_rate": 1.5272711421373982e-05, + "loss": 0.8919, + "step": 12801 + }, + { + "epoch": 0.638503740648379, + "grad_norm": 0.5107083968790703, + "learning_rate": 1.52689913298933e-05, + "loss": 0.8428, + "step": 12802 + }, + { + "epoch": 0.6385536159600997, + "grad_norm": 0.4809802697522295, + "learning_rate": 1.5265271492338932e-05, + "loss": 0.8793, + "step": 12803 + }, + { + "epoch": 0.6386034912718205, + "grad_norm": 0.45048384046882956, + "learning_rate": 1.5261551908807954e-05, + "loss": 0.8765, + "step": 12804 + }, + { + "epoch": 0.6386533665835411, + "grad_norm": 0.3998422833665233, + "learning_rate": 1.5257832579397425e-05, + "loss": 0.8624, + "step": 12805 + }, + { + "epoch": 0.6387032418952618, + "grad_norm": 0.44238962423338785, + "learning_rate": 1.5254113504204387e-05, + "loss": 0.8548, + "step": 12806 + }, + { + "epoch": 0.6387531172069826, + "grad_norm": 0.4583722126519371, + "learning_rate": 1.5250394683325894e-05, + "loss": 0.8847, + "step": 12807 + }, + { + "epoch": 0.6388029925187032, + "grad_norm": 0.5359865125097559, + "learning_rate": 1.5246676116858993e-05, + "loss": 0.9398, + "step": 12808 + }, + { + "epoch": 0.6388528678304239, + "grad_norm": 0.510284310582422, + "learning_rate": 1.5242957804900716e-05, + "loss": 0.8689, + "step": 12809 + }, + { + "epoch": 0.6389027431421447, + "grad_norm": 0.42863800007617575, + "learning_rate": 1.5239239747548084e-05, + "loss": 0.8625, + "step": 12810 + }, + { + "epoch": 0.6389526184538653, + "grad_norm": 0.5752056436606359, + "learning_rate": 1.5235521944898118e-05, + "loss": 0.859, + "step": 12811 + }, + { + "epoch": 0.639002493765586, + "grad_norm": 0.4814191832699846, + "learning_rate": 1.5231804397047847e-05, + "loss": 0.8506, + "step": 12812 + }, + { + "epoch": 0.6390523690773068, + "grad_norm": 0.44459700409660985, + "learning_rate": 1.5228087104094261e-05, + "loss": 0.8931, + "step": 12813 + }, + { + "epoch": 0.6391022443890274, + "grad_norm": 0.48724610224579923, + "learning_rate": 1.5224370066134375e-05, + "loss": 0.8485, + "step": 12814 + }, + { + "epoch": 0.6391521197007481, + "grad_norm": 0.3981449659285212, + "learning_rate": 1.5220653283265168e-05, + "loss": 0.8266, + "step": 12815 + }, + { + "epoch": 0.6392019950124689, + "grad_norm": 0.4498245234947977, + "learning_rate": 1.5216936755583643e-05, + "loss": 0.8427, + "step": 12816 + }, + { + "epoch": 0.6392518703241895, + "grad_norm": 0.40429572715313705, + "learning_rate": 1.5213220483186773e-05, + "loss": 0.8299, + "step": 12817 + }, + { + "epoch": 0.6393017456359102, + "grad_norm": 0.4826083391547516, + "learning_rate": 1.520950446617154e-05, + "loss": 0.8421, + "step": 12818 + }, + { + "epoch": 0.6393516209476309, + "grad_norm": 0.5094463276330709, + "learning_rate": 1.5205788704634904e-05, + "loss": 0.8558, + "step": 12819 + }, + { + "epoch": 0.6394014962593516, + "grad_norm": 0.483566619287361, + "learning_rate": 1.5202073198673825e-05, + "loss": 0.8798, + "step": 12820 + }, + { + "epoch": 0.6394513715710723, + "grad_norm": 0.529424327916637, + "learning_rate": 1.5198357948385261e-05, + "loss": 0.8839, + "step": 12821 + }, + { + "epoch": 0.639501246882793, + "grad_norm": 0.438489863914291, + "learning_rate": 1.5194642953866157e-05, + "loss": 0.8742, + "step": 12822 + }, + { + "epoch": 0.6395511221945137, + "grad_norm": 0.4401593808049672, + "learning_rate": 1.5190928215213468e-05, + "loss": 0.8917, + "step": 12823 + }, + { + "epoch": 0.6396009975062344, + "grad_norm": 0.5621894534755366, + "learning_rate": 1.5187213732524108e-05, + "loss": 0.9118, + "step": 12824 + }, + { + "epoch": 0.6396508728179551, + "grad_norm": 0.4914091960036175, + "learning_rate": 1.5183499505895015e-05, + "loss": 0.8327, + "step": 12825 + }, + { + "epoch": 0.6397007481296758, + "grad_norm": 0.5392994126896068, + "learning_rate": 1.5179785535423108e-05, + "loss": 0.8875, + "step": 12826 + }, + { + "epoch": 0.6397506234413965, + "grad_norm": 0.4774403649433272, + "learning_rate": 1.5176071821205313e-05, + "loss": 0.8734, + "step": 12827 + }, + { + "epoch": 0.6398004987531172, + "grad_norm": 0.5281803259637251, + "learning_rate": 1.5172358363338515e-05, + "loss": 0.8989, + "step": 12828 + }, + { + "epoch": 0.6398503740648379, + "grad_norm": 0.4691679440955275, + "learning_rate": 1.5168645161919632e-05, + "loss": 0.8412, + "step": 12829 + }, + { + "epoch": 0.6399002493765586, + "grad_norm": 0.4644699445411534, + "learning_rate": 1.5164932217045557e-05, + "loss": 0.8496, + "step": 12830 + }, + { + "epoch": 0.6399501246882793, + "grad_norm": 0.4099785900807122, + "learning_rate": 1.516121952881317e-05, + "loss": 0.8569, + "step": 12831 + }, + { + "epoch": 0.64, + "grad_norm": 0.520971119673048, + "learning_rate": 1.5157507097319352e-05, + "loss": 0.8419, + "step": 12832 + }, + { + "epoch": 0.6400498753117208, + "grad_norm": 0.40001254980760353, + "learning_rate": 1.5153794922660983e-05, + "loss": 0.8284, + "step": 12833 + }, + { + "epoch": 0.6400997506234414, + "grad_norm": 0.4370448114931445, + "learning_rate": 1.515008300493494e-05, + "loss": 0.8463, + "step": 12834 + }, + { + "epoch": 0.6401496259351621, + "grad_norm": 0.4532063124906716, + "learning_rate": 1.5146371344238061e-05, + "loss": 0.8522, + "step": 12835 + }, + { + "epoch": 0.6401995012468827, + "grad_norm": 0.6705899916623879, + "learning_rate": 1.5142659940667214e-05, + "loss": 0.9168, + "step": 12836 + }, + { + "epoch": 0.6402493765586035, + "grad_norm": 0.4615120919673571, + "learning_rate": 1.513894879431924e-05, + "loss": 0.8566, + "step": 12837 + }, + { + "epoch": 0.6402992518703242, + "grad_norm": 0.4134386284586413, + "learning_rate": 1.5135237905290999e-05, + "loss": 0.8255, + "step": 12838 + }, + { + "epoch": 0.6403491271820448, + "grad_norm": 0.4399305243728219, + "learning_rate": 1.5131527273679298e-05, + "loss": 0.838, + "step": 12839 + }, + { + "epoch": 0.6403990024937656, + "grad_norm": 0.42491149976081694, + "learning_rate": 1.5127816899580977e-05, + "loss": 0.8356, + "step": 12840 + }, + { + "epoch": 0.6404488778054863, + "grad_norm": 0.5612677260938567, + "learning_rate": 1.5124106783092854e-05, + "loss": 0.8706, + "step": 12841 + }, + { + "epoch": 0.6404987531172069, + "grad_norm": 0.4949782275410916, + "learning_rate": 1.5120396924311753e-05, + "loss": 0.877, + "step": 12842 + }, + { + "epoch": 0.6405486284289277, + "grad_norm": 0.4405659233498163, + "learning_rate": 1.5116687323334467e-05, + "loss": 0.9059, + "step": 12843 + }, + { + "epoch": 0.6405985037406484, + "grad_norm": 0.412439472558735, + "learning_rate": 1.51129779802578e-05, + "loss": 0.8823, + "step": 12844 + }, + { + "epoch": 0.640648379052369, + "grad_norm": 0.4555549639869054, + "learning_rate": 1.5109268895178553e-05, + "loss": 0.8312, + "step": 12845 + }, + { + "epoch": 0.6406982543640898, + "grad_norm": 0.41487555786974795, + "learning_rate": 1.5105560068193508e-05, + "loss": 0.8302, + "step": 12846 + }, + { + "epoch": 0.6407481296758105, + "grad_norm": 0.46278500977144954, + "learning_rate": 1.510185149939944e-05, + "loss": 0.8464, + "step": 12847 + }, + { + "epoch": 0.6407980049875311, + "grad_norm": 0.4911756740177277, + "learning_rate": 1.5098143188893128e-05, + "loss": 0.8425, + "step": 12848 + }, + { + "epoch": 0.6408478802992519, + "grad_norm": 0.41900963753318693, + "learning_rate": 1.509443513677134e-05, + "loss": 0.8537, + "step": 12849 + }, + { + "epoch": 0.6408977556109726, + "grad_norm": 0.4629903852065137, + "learning_rate": 1.5090727343130836e-05, + "loss": 0.8922, + "step": 12850 + }, + { + "epoch": 0.6409476309226932, + "grad_norm": 0.47281347211209146, + "learning_rate": 1.5087019808068361e-05, + "loss": 0.8455, + "step": 12851 + }, + { + "epoch": 0.640997506234414, + "grad_norm": 0.37518555634137263, + "learning_rate": 1.5083312531680671e-05, + "loss": 0.7886, + "step": 12852 + }, + { + "epoch": 0.6410473815461346, + "grad_norm": 0.44920356093703745, + "learning_rate": 1.5079605514064505e-05, + "loss": 0.8497, + "step": 12853 + }, + { + "epoch": 0.6410972568578553, + "grad_norm": 0.43036895687964616, + "learning_rate": 1.5075898755316586e-05, + "loss": 0.8326, + "step": 12854 + }, + { + "epoch": 0.6411471321695761, + "grad_norm": 0.4323120301490342, + "learning_rate": 1.5072192255533658e-05, + "loss": 0.8589, + "step": 12855 + }, + { + "epoch": 0.6411970074812967, + "grad_norm": 0.6688524080080483, + "learning_rate": 1.5068486014812424e-05, + "loss": 0.8528, + "step": 12856 + }, + { + "epoch": 0.6412468827930174, + "grad_norm": 0.5620506340283629, + "learning_rate": 1.5064780033249601e-05, + "loss": 0.9067, + "step": 12857 + }, + { + "epoch": 0.6412967581047382, + "grad_norm": 0.4803845927249493, + "learning_rate": 1.5061074310941898e-05, + "loss": 0.8572, + "step": 12858 + }, + { + "epoch": 0.6413466334164588, + "grad_norm": 0.47725835526426363, + "learning_rate": 1.5057368847986008e-05, + "loss": 0.8745, + "step": 12859 + }, + { + "epoch": 0.6413965087281795, + "grad_norm": 0.44637180398718357, + "learning_rate": 1.5053663644478643e-05, + "loss": 0.8893, + "step": 12860 + }, + { + "epoch": 0.6414463840399003, + "grad_norm": 0.4950339682436913, + "learning_rate": 1.5049958700516461e-05, + "loss": 0.8384, + "step": 12861 + }, + { + "epoch": 0.6414962593516209, + "grad_norm": 0.46894384669831857, + "learning_rate": 1.5046254016196154e-05, + "loss": 0.8385, + "step": 12862 + }, + { + "epoch": 0.6415461346633416, + "grad_norm": 0.42368230026336634, + "learning_rate": 1.5042549591614397e-05, + "loss": 0.8696, + "step": 12863 + }, + { + "epoch": 0.6415960099750624, + "grad_norm": 0.46306154789613835, + "learning_rate": 1.503884542686786e-05, + "loss": 0.817, + "step": 12864 + }, + { + "epoch": 0.641645885286783, + "grad_norm": 0.7382915360648094, + "learning_rate": 1.5035141522053182e-05, + "loss": 0.8703, + "step": 12865 + }, + { + "epoch": 0.6416957605985038, + "grad_norm": 0.5017650160917297, + "learning_rate": 1.5031437877267024e-05, + "loss": 0.9219, + "step": 12866 + }, + { + "epoch": 0.6417456359102245, + "grad_norm": 0.42130366424718335, + "learning_rate": 1.5027734492606049e-05, + "loss": 0.9062, + "step": 12867 + }, + { + "epoch": 0.6417955112219451, + "grad_norm": 0.4422469248207903, + "learning_rate": 1.5024031368166865e-05, + "loss": 0.8602, + "step": 12868 + }, + { + "epoch": 0.6418453865336659, + "grad_norm": 0.4360305354605592, + "learning_rate": 1.5020328504046121e-05, + "loss": 0.8293, + "step": 12869 + }, + { + "epoch": 0.6418952618453866, + "grad_norm": 0.5233477928211155, + "learning_rate": 1.5016625900340437e-05, + "loss": 0.9226, + "step": 12870 + }, + { + "epoch": 0.6419451371571072, + "grad_norm": 0.4640949390350099, + "learning_rate": 1.501292355714644e-05, + "loss": 0.8625, + "step": 12871 + }, + { + "epoch": 0.641995012468828, + "grad_norm": 0.46897262039100784, + "learning_rate": 1.5009221474560725e-05, + "loss": 0.9062, + "step": 12872 + }, + { + "epoch": 0.6420448877805486, + "grad_norm": 0.5203466977920566, + "learning_rate": 1.5005519652679905e-05, + "loss": 0.8595, + "step": 12873 + }, + { + "epoch": 0.6420947630922693, + "grad_norm": 0.4654636311752977, + "learning_rate": 1.5001818091600577e-05, + "loss": 0.8648, + "step": 12874 + }, + { + "epoch": 0.64214463840399, + "grad_norm": 0.42334684851584325, + "learning_rate": 1.4998116791419342e-05, + "loss": 0.8571, + "step": 12875 + }, + { + "epoch": 0.6421945137157107, + "grad_norm": 0.47415352852623016, + "learning_rate": 1.499441575223276e-05, + "loss": 0.8694, + "step": 12876 + }, + { + "epoch": 0.6422443890274314, + "grad_norm": 0.572648561912638, + "learning_rate": 1.4990714974137424e-05, + "loss": 0.8318, + "step": 12877 + }, + { + "epoch": 0.6422942643391522, + "grad_norm": 0.4746001944252108, + "learning_rate": 1.4987014457229909e-05, + "loss": 0.8432, + "step": 12878 + }, + { + "epoch": 0.6423441396508728, + "grad_norm": 0.8865212568112186, + "learning_rate": 1.4983314201606763e-05, + "loss": 0.8543, + "step": 12879 + }, + { + "epoch": 0.6423940149625935, + "grad_norm": 0.5205711149701715, + "learning_rate": 1.4979614207364548e-05, + "loss": 0.8977, + "step": 12880 + }, + { + "epoch": 0.6424438902743143, + "grad_norm": 0.4693667896390059, + "learning_rate": 1.4975914474599818e-05, + "loss": 0.8729, + "step": 12881 + }, + { + "epoch": 0.6424937655860349, + "grad_norm": 0.4909012607456063, + "learning_rate": 1.4972215003409118e-05, + "loss": 0.876, + "step": 12882 + }, + { + "epoch": 0.6425436408977556, + "grad_norm": 0.7824437345830515, + "learning_rate": 1.4968515793888977e-05, + "loss": 0.9405, + "step": 12883 + }, + { + "epoch": 0.6425935162094764, + "grad_norm": 0.4022450596885114, + "learning_rate": 1.496481684613592e-05, + "loss": 0.8772, + "step": 12884 + }, + { + "epoch": 0.642643391521197, + "grad_norm": 0.415785707737994, + "learning_rate": 1.496111816024648e-05, + "loss": 0.8503, + "step": 12885 + }, + { + "epoch": 0.6426932668329177, + "grad_norm": 0.42790930091969004, + "learning_rate": 1.4957419736317175e-05, + "loss": 0.8426, + "step": 12886 + }, + { + "epoch": 0.6427431421446385, + "grad_norm": 0.4585162045617776, + "learning_rate": 1.4953721574444504e-05, + "loss": 0.886, + "step": 12887 + }, + { + "epoch": 0.6427930174563591, + "grad_norm": 0.6710014028292396, + "learning_rate": 1.4950023674724969e-05, + "loss": 0.8669, + "step": 12888 + }, + { + "epoch": 0.6428428927680798, + "grad_norm": 0.5197547874405298, + "learning_rate": 1.4946326037255076e-05, + "loss": 0.8365, + "step": 12889 + }, + { + "epoch": 0.6428927680798004, + "grad_norm": 0.4831261760036979, + "learning_rate": 1.4942628662131298e-05, + "loss": 0.8649, + "step": 12890 + }, + { + "epoch": 0.6429426433915212, + "grad_norm": 0.5003937535402471, + "learning_rate": 1.4938931549450122e-05, + "loss": 0.8724, + "step": 12891 + }, + { + "epoch": 0.6429925187032419, + "grad_norm": 0.4609293275103885, + "learning_rate": 1.4935234699308032e-05, + "loss": 0.9051, + "step": 12892 + }, + { + "epoch": 0.6430423940149625, + "grad_norm": 0.4229837239240302, + "learning_rate": 1.4931538111801488e-05, + "loss": 0.8412, + "step": 12893 + }, + { + "epoch": 0.6430922693266833, + "grad_norm": 0.5075959469310548, + "learning_rate": 1.4927841787026946e-05, + "loss": 0.8385, + "step": 12894 + }, + { + "epoch": 0.643142144638404, + "grad_norm": 0.47457464242173847, + "learning_rate": 1.4924145725080862e-05, + "loss": 0.9026, + "step": 12895 + }, + { + "epoch": 0.6431920199501246, + "grad_norm": 0.5201592364271539, + "learning_rate": 1.492044992605969e-05, + "loss": 0.8985, + "step": 12896 + }, + { + "epoch": 0.6432418952618454, + "grad_norm": 0.3922425079497348, + "learning_rate": 1.4916754390059875e-05, + "loss": 0.8221, + "step": 12897 + }, + { + "epoch": 0.6432917705735661, + "grad_norm": 0.4820331254240938, + "learning_rate": 1.491305911717783e-05, + "loss": 0.837, + "step": 12898 + }, + { + "epoch": 0.6433416458852868, + "grad_norm": 0.4881105755186207, + "learning_rate": 1.4909364107509995e-05, + "loss": 0.8672, + "step": 12899 + }, + { + "epoch": 0.6433915211970075, + "grad_norm": 0.4900012832756805, + "learning_rate": 1.4905669361152787e-05, + "loss": 0.866, + "step": 12900 + }, + { + "epoch": 0.6434413965087282, + "grad_norm": 0.43647123308751795, + "learning_rate": 1.490197487820263e-05, + "loss": 0.8872, + "step": 12901 + }, + { + "epoch": 0.6434912718204489, + "grad_norm": 0.49719732468391736, + "learning_rate": 1.4898280658755908e-05, + "loss": 0.8556, + "step": 12902 + }, + { + "epoch": 0.6435411471321696, + "grad_norm": 0.5907256225369174, + "learning_rate": 1.489458670290903e-05, + "loss": 0.8311, + "step": 12903 + }, + { + "epoch": 0.6435910224438903, + "grad_norm": 0.5183756538896953, + "learning_rate": 1.4890893010758404e-05, + "loss": 0.8582, + "step": 12904 + }, + { + "epoch": 0.643640897755611, + "grad_norm": 0.4207892855982722, + "learning_rate": 1.4887199582400386e-05, + "loss": 0.8782, + "step": 12905 + }, + { + "epoch": 0.6436907730673317, + "grad_norm": 0.4567379741444396, + "learning_rate": 1.488350641793137e-05, + "loss": 0.8871, + "step": 12906 + }, + { + "epoch": 0.6437406483790523, + "grad_norm": 0.39613229749344037, + "learning_rate": 1.4879813517447726e-05, + "loss": 0.8475, + "step": 12907 + }, + { + "epoch": 0.643790523690773, + "grad_norm": 0.5086909175865217, + "learning_rate": 1.4876120881045836e-05, + "loss": 0.8547, + "step": 12908 + }, + { + "epoch": 0.6438403990024938, + "grad_norm": 0.4942854422091369, + "learning_rate": 1.4872428508822022e-05, + "loss": 0.9053, + "step": 12909 + }, + { + "epoch": 0.6438902743142144, + "grad_norm": 0.48395490072920094, + "learning_rate": 1.4868736400872658e-05, + "loss": 0.861, + "step": 12910 + }, + { + "epoch": 0.6439401496259352, + "grad_norm": 0.4820306459714612, + "learning_rate": 1.486504455729408e-05, + "loss": 0.8566, + "step": 12911 + }, + { + "epoch": 0.6439900249376559, + "grad_norm": 0.4701849799684002, + "learning_rate": 1.4861352978182642e-05, + "loss": 0.8853, + "step": 12912 + }, + { + "epoch": 0.6440399002493765, + "grad_norm": 0.42154686458285695, + "learning_rate": 1.4857661663634647e-05, + "loss": 0.8852, + "step": 12913 + }, + { + "epoch": 0.6440897755610973, + "grad_norm": 0.5670533153133103, + "learning_rate": 1.4853970613746438e-05, + "loss": 0.8574, + "step": 12914 + }, + { + "epoch": 0.644139650872818, + "grad_norm": 0.6195497757479221, + "learning_rate": 1.4850279828614328e-05, + "loss": 0.8798, + "step": 12915 + }, + { + "epoch": 0.6441895261845386, + "grad_norm": 0.5427662699498239, + "learning_rate": 1.4846589308334619e-05, + "loss": 0.9006, + "step": 12916 + }, + { + "epoch": 0.6442394014962594, + "grad_norm": 0.4240070623087945, + "learning_rate": 1.4842899053003612e-05, + "loss": 0.8005, + "step": 12917 + }, + { + "epoch": 0.6442892768079801, + "grad_norm": 0.4640243424351447, + "learning_rate": 1.4839209062717612e-05, + "loss": 0.8999, + "step": 12918 + }, + { + "epoch": 0.6443391521197007, + "grad_norm": 0.7737646363902526, + "learning_rate": 1.4835519337572917e-05, + "loss": 0.8803, + "step": 12919 + }, + { + "epoch": 0.6443890274314215, + "grad_norm": 0.38918829526091675, + "learning_rate": 1.4831829877665782e-05, + "loss": 0.8305, + "step": 12920 + }, + { + "epoch": 0.6444389027431422, + "grad_norm": 0.46727772218507285, + "learning_rate": 1.4828140683092496e-05, + "loss": 0.8878, + "step": 12921 + }, + { + "epoch": 0.6444887780548628, + "grad_norm": 0.5550628536616994, + "learning_rate": 1.4824451753949325e-05, + "loss": 0.8685, + "step": 12922 + }, + { + "epoch": 0.6445386533665836, + "grad_norm": 0.5066014799208123, + "learning_rate": 1.482076309033254e-05, + "loss": 0.8797, + "step": 12923 + }, + { + "epoch": 0.6445885286783042, + "grad_norm": 0.4794586505930419, + "learning_rate": 1.4817074692338385e-05, + "loss": 0.8521, + "step": 12924 + }, + { + "epoch": 0.6446384039900249, + "grad_norm": 0.4027431757551185, + "learning_rate": 1.4813386560063102e-05, + "loss": 0.8582, + "step": 12925 + }, + { + "epoch": 0.6446882793017457, + "grad_norm": 0.6269465913808581, + "learning_rate": 1.4809698693602947e-05, + "loss": 0.8412, + "step": 12926 + }, + { + "epoch": 0.6447381546134663, + "grad_norm": 0.4823136502938769, + "learning_rate": 1.4806011093054134e-05, + "loss": 0.8734, + "step": 12927 + }, + { + "epoch": 0.644788029925187, + "grad_norm": 0.4751772541353427, + "learning_rate": 1.4802323758512904e-05, + "loss": 0.8268, + "step": 12928 + }, + { + "epoch": 0.6448379052369078, + "grad_norm": 0.43478820986104555, + "learning_rate": 1.4798636690075473e-05, + "loss": 0.8792, + "step": 12929 + }, + { + "epoch": 0.6448877805486284, + "grad_norm": 0.5336682426834086, + "learning_rate": 1.4794949887838055e-05, + "loss": 0.858, + "step": 12930 + }, + { + "epoch": 0.6449376558603491, + "grad_norm": 0.6211055689305606, + "learning_rate": 1.4791263351896845e-05, + "loss": 0.897, + "step": 12931 + }, + { + "epoch": 0.6449875311720699, + "grad_norm": 0.44792222920513114, + "learning_rate": 1.4787577082348051e-05, + "loss": 0.9263, + "step": 12932 + }, + { + "epoch": 0.6450374064837905, + "grad_norm": 0.4386471481997418, + "learning_rate": 1.4783891079287865e-05, + "loss": 0.8602, + "step": 12933 + }, + { + "epoch": 0.6450872817955112, + "grad_norm": 0.5027348659929641, + "learning_rate": 1.478020534281247e-05, + "loss": 0.8892, + "step": 12934 + }, + { + "epoch": 0.645137157107232, + "grad_norm": 0.49863450521031194, + "learning_rate": 1.4776519873018041e-05, + "loss": 0.8829, + "step": 12935 + }, + { + "epoch": 0.6451870324189526, + "grad_norm": 0.5582733750120586, + "learning_rate": 1.4772834670000745e-05, + "loss": 0.9006, + "step": 12936 + }, + { + "epoch": 0.6452369077306733, + "grad_norm": 0.5825748490246812, + "learning_rate": 1.4769149733856765e-05, + "loss": 0.8614, + "step": 12937 + }, + { + "epoch": 0.6452867830423941, + "grad_norm": 0.45708254753387173, + "learning_rate": 1.4765465064682233e-05, + "loss": 0.9075, + "step": 12938 + }, + { + "epoch": 0.6453366583541147, + "grad_norm": 0.39591045552075155, + "learning_rate": 1.476178066257331e-05, + "loss": 0.8765, + "step": 12939 + }, + { + "epoch": 0.6453865336658354, + "grad_norm": 0.43010407200097106, + "learning_rate": 1.475809652762614e-05, + "loss": 0.8535, + "step": 12940 + }, + { + "epoch": 0.6454364089775562, + "grad_norm": 0.41831396696558293, + "learning_rate": 1.4754412659936868e-05, + "loss": 0.8688, + "step": 12941 + }, + { + "epoch": 0.6454862842892768, + "grad_norm": 0.4326550085329005, + "learning_rate": 1.4750729059601597e-05, + "loss": 0.8655, + "step": 12942 + }, + { + "epoch": 0.6455361596009975, + "grad_norm": 0.4451792317372636, + "learning_rate": 1.4747045726716468e-05, + "loss": 0.8552, + "step": 12943 + }, + { + "epoch": 0.6455860349127182, + "grad_norm": 0.4272640515744474, + "learning_rate": 1.474336266137759e-05, + "loss": 0.9009, + "step": 12944 + }, + { + "epoch": 0.6456359102244389, + "grad_norm": 0.6509075385497107, + "learning_rate": 1.4739679863681086e-05, + "loss": 0.8312, + "step": 12945 + }, + { + "epoch": 0.6456857855361596, + "grad_norm": 0.6674969618782928, + "learning_rate": 1.4735997333723034e-05, + "loss": 0.8377, + "step": 12946 + }, + { + "epoch": 0.6457356608478803, + "grad_norm": 0.6672121532897842, + "learning_rate": 1.4732315071599535e-05, + "loss": 0.8511, + "step": 12947 + }, + { + "epoch": 0.645785536159601, + "grad_norm": 0.4204626734831981, + "learning_rate": 1.4728633077406689e-05, + "loss": 0.8511, + "step": 12948 + }, + { + "epoch": 0.6458354114713217, + "grad_norm": 1.5859423321146209, + "learning_rate": 1.472495135124056e-05, + "loss": 0.8657, + "step": 12949 + }, + { + "epoch": 0.6458852867830424, + "grad_norm": 0.5009745038125334, + "learning_rate": 1.4721269893197223e-05, + "loss": 0.8704, + "step": 12950 + }, + { + "epoch": 0.6459351620947631, + "grad_norm": 0.4713451384046696, + "learning_rate": 1.4717588703372751e-05, + "loss": 0.8258, + "step": 12951 + }, + { + "epoch": 0.6459850374064838, + "grad_norm": 0.4032924307311637, + "learning_rate": 1.4713907781863207e-05, + "loss": 0.8852, + "step": 12952 + }, + { + "epoch": 0.6460349127182045, + "grad_norm": 0.3986429379333625, + "learning_rate": 1.4710227128764628e-05, + "loss": 0.8961, + "step": 12953 + }, + { + "epoch": 0.6460847880299252, + "grad_norm": 0.4496623765820917, + "learning_rate": 1.4706546744173066e-05, + "loss": 0.8813, + "step": 12954 + }, + { + "epoch": 0.6461346633416459, + "grad_norm": 0.43827566745118585, + "learning_rate": 1.4702866628184562e-05, + "loss": 0.9019, + "step": 12955 + }, + { + "epoch": 0.6461845386533666, + "grad_norm": 0.47032169988407296, + "learning_rate": 1.4699186780895153e-05, + "loss": 0.9071, + "step": 12956 + }, + { + "epoch": 0.6462344139650873, + "grad_norm": 0.4612881763271944, + "learning_rate": 1.4695507202400843e-05, + "loss": 0.8887, + "step": 12957 + }, + { + "epoch": 0.646284289276808, + "grad_norm": 0.4194517364717048, + "learning_rate": 1.4691827892797666e-05, + "loss": 0.8461, + "step": 12958 + }, + { + "epoch": 0.6463341645885287, + "grad_norm": 0.40035741905516176, + "learning_rate": 1.4688148852181624e-05, + "loss": 0.8505, + "step": 12959 + }, + { + "epoch": 0.6463840399002494, + "grad_norm": 0.5258601164351386, + "learning_rate": 1.4684470080648727e-05, + "loss": 0.8966, + "step": 12960 + }, + { + "epoch": 0.64643391521197, + "grad_norm": 0.4751216268560695, + "learning_rate": 1.4680791578294967e-05, + "loss": 0.8563, + "step": 12961 + }, + { + "epoch": 0.6464837905236908, + "grad_norm": 0.41045734437446946, + "learning_rate": 1.4677113345216329e-05, + "loss": 0.8629, + "step": 12962 + }, + { + "epoch": 0.6465336658354115, + "grad_norm": 0.524893320128854, + "learning_rate": 1.4673435381508801e-05, + "loss": 0.909, + "step": 12963 + }, + { + "epoch": 0.6465835411471321, + "grad_norm": 0.41938230740744914, + "learning_rate": 1.4669757687268351e-05, + "loss": 0.8899, + "step": 12964 + }, + { + "epoch": 0.6466334164588529, + "grad_norm": 0.4153007257945083, + "learning_rate": 1.466608026259096e-05, + "loss": 0.8009, + "step": 12965 + }, + { + "epoch": 0.6466832917705736, + "grad_norm": 0.4512030221652399, + "learning_rate": 1.466240310757257e-05, + "loss": 0.8826, + "step": 12966 + }, + { + "epoch": 0.6467331670822942, + "grad_norm": 0.41952230175898253, + "learning_rate": 1.4658726222309149e-05, + "loss": 0.854, + "step": 12967 + }, + { + "epoch": 0.646783042394015, + "grad_norm": 0.48003193110704884, + "learning_rate": 1.4655049606896634e-05, + "loss": 0.846, + "step": 12968 + }, + { + "epoch": 0.6468329177057357, + "grad_norm": 0.4692097919828377, + "learning_rate": 1.4651373261430972e-05, + "loss": 0.881, + "step": 12969 + }, + { + "epoch": 0.6468827930174563, + "grad_norm": 0.40483347175392664, + "learning_rate": 1.4647697186008097e-05, + "loss": 0.8587, + "step": 12970 + }, + { + "epoch": 0.6469326683291771, + "grad_norm": 0.5380640154094644, + "learning_rate": 1.4644021380723932e-05, + "loss": 0.8574, + "step": 12971 + }, + { + "epoch": 0.6469825436408978, + "grad_norm": 0.4065217555584541, + "learning_rate": 1.4640345845674389e-05, + "loss": 0.8727, + "step": 12972 + }, + { + "epoch": 0.6470324189526184, + "grad_norm": 0.4521775417689365, + "learning_rate": 1.4636670580955383e-05, + "loss": 0.8404, + "step": 12973 + }, + { + "epoch": 0.6470822942643392, + "grad_norm": 0.49110424935422203, + "learning_rate": 1.4632995586662834e-05, + "loss": 0.7977, + "step": 12974 + }, + { + "epoch": 0.6471321695760599, + "grad_norm": 0.5805553779522226, + "learning_rate": 1.4629320862892611e-05, + "loss": 0.8775, + "step": 12975 + }, + { + "epoch": 0.6471820448877805, + "grad_norm": 0.40651210494880125, + "learning_rate": 1.4625646409740623e-05, + "loss": 0.8341, + "step": 12976 + }, + { + "epoch": 0.6472319201995013, + "grad_norm": 0.46073035428887993, + "learning_rate": 1.4621972227302743e-05, + "loss": 0.9028, + "step": 12977 + }, + { + "epoch": 0.6472817955112219, + "grad_norm": 0.46457578679389266, + "learning_rate": 1.461829831567487e-05, + "loss": 0.8495, + "step": 12978 + }, + { + "epoch": 0.6473316708229426, + "grad_norm": 0.4072633743849255, + "learning_rate": 1.4614624674952842e-05, + "loss": 0.8365, + "step": 12979 + }, + { + "epoch": 0.6473815461346634, + "grad_norm": 0.5188687780820433, + "learning_rate": 1.4610951305232536e-05, + "loss": 0.8575, + "step": 12980 + }, + { + "epoch": 0.647431421446384, + "grad_norm": 0.479232967472107, + "learning_rate": 1.460727820660981e-05, + "loss": 0.8919, + "step": 12981 + }, + { + "epoch": 0.6474812967581047, + "grad_norm": 0.5967242848117895, + "learning_rate": 1.4603605379180513e-05, + "loss": 0.8543, + "step": 12982 + }, + { + "epoch": 0.6475311720698255, + "grad_norm": 0.6638974035941368, + "learning_rate": 1.4599932823040471e-05, + "loss": 0.8863, + "step": 12983 + }, + { + "epoch": 0.6475810473815461, + "grad_norm": 0.46149625724530013, + "learning_rate": 1.4596260538285528e-05, + "loss": 0.826, + "step": 12984 + }, + { + "epoch": 0.6476309226932668, + "grad_norm": 0.5680309140380579, + "learning_rate": 1.459258852501152e-05, + "loss": 0.8524, + "step": 12985 + }, + { + "epoch": 0.6476807980049876, + "grad_norm": 0.5328249833496069, + "learning_rate": 1.4588916783314249e-05, + "loss": 0.8586, + "step": 12986 + }, + { + "epoch": 0.6477306733167082, + "grad_norm": 0.5101675695151369, + "learning_rate": 1.458524531328953e-05, + "loss": 0.8295, + "step": 12987 + }, + { + "epoch": 0.6477805486284289, + "grad_norm": 1.1395894224241285, + "learning_rate": 1.4581574115033181e-05, + "loss": 0.8384, + "step": 12988 + }, + { + "epoch": 0.6478304239401497, + "grad_norm": 0.5196819330730172, + "learning_rate": 1.4577903188640996e-05, + "loss": 0.8614, + "step": 12989 + }, + { + "epoch": 0.6478802992518703, + "grad_norm": 0.44041941817528457, + "learning_rate": 1.457423253420876e-05, + "loss": 0.8909, + "step": 12990 + }, + { + "epoch": 0.647930174563591, + "grad_norm": 0.4175509213451825, + "learning_rate": 1.4570562151832252e-05, + "loss": 0.8407, + "step": 12991 + }, + { + "epoch": 0.6479800498753118, + "grad_norm": 0.48560074649080215, + "learning_rate": 1.4566892041607266e-05, + "loss": 0.8524, + "step": 12992 + }, + { + "epoch": 0.6480299251870324, + "grad_norm": 0.4143474288418145, + "learning_rate": 1.4563222203629561e-05, + "loss": 0.8479, + "step": 12993 + }, + { + "epoch": 0.6480798004987531, + "grad_norm": 0.5236658917106277, + "learning_rate": 1.4559552637994905e-05, + "loss": 0.8228, + "step": 12994 + }, + { + "epoch": 0.6481296758104739, + "grad_norm": 0.44574861252317904, + "learning_rate": 1.455588334479904e-05, + "loss": 0.8925, + "step": 12995 + }, + { + "epoch": 0.6481795511221945, + "grad_norm": 0.4753288605768564, + "learning_rate": 1.4552214324137745e-05, + "loss": 0.8799, + "step": 12996 + }, + { + "epoch": 0.6482294264339152, + "grad_norm": 0.5228929097251587, + "learning_rate": 1.4548545576106721e-05, + "loss": 0.8416, + "step": 12997 + }, + { + "epoch": 0.6482793017456359, + "grad_norm": 0.4192473909830826, + "learning_rate": 1.4544877100801733e-05, + "loss": 0.8807, + "step": 12998 + }, + { + "epoch": 0.6483291770573566, + "grad_norm": 0.5835221044247644, + "learning_rate": 1.4541208898318487e-05, + "loss": 0.8499, + "step": 12999 + }, + { + "epoch": 0.6483790523690773, + "grad_norm": 0.5039806700754752, + "learning_rate": 1.4537540968752732e-05, + "loss": 0.8912, + "step": 13000 + }, + { + "epoch": 0.648428927680798, + "grad_norm": 0.4027150681536749, + "learning_rate": 1.4533873312200147e-05, + "loss": 0.8612, + "step": 13001 + }, + { + "epoch": 0.6484788029925187, + "grad_norm": 2.9516857734875614, + "learning_rate": 1.4530205928756464e-05, + "loss": 0.8299, + "step": 13002 + }, + { + "epoch": 0.6485286783042394, + "grad_norm": 0.4041895091206328, + "learning_rate": 1.452653881851736e-05, + "loss": 0.8042, + "step": 13003 + }, + { + "epoch": 0.6485785536159601, + "grad_norm": 0.5401368397888251, + "learning_rate": 1.4522871981578556e-05, + "loss": 0.8593, + "step": 13004 + }, + { + "epoch": 0.6486284289276808, + "grad_norm": 0.4514398222092514, + "learning_rate": 1.4519205418035703e-05, + "loss": 0.872, + "step": 13005 + }, + { + "epoch": 0.6486783042394015, + "grad_norm": 0.5978016411068121, + "learning_rate": 1.4515539127984502e-05, + "loss": 0.8531, + "step": 13006 + }, + { + "epoch": 0.6487281795511222, + "grad_norm": 0.4382098160011503, + "learning_rate": 1.4511873111520602e-05, + "loss": 0.8678, + "step": 13007 + }, + { + "epoch": 0.6487780548628429, + "grad_norm": 0.49610302204401474, + "learning_rate": 1.4508207368739704e-05, + "loss": 0.8267, + "step": 13008 + }, + { + "epoch": 0.6488279301745636, + "grad_norm": 0.5170803862764185, + "learning_rate": 1.4504541899737418e-05, + "loss": 0.8389, + "step": 13009 + }, + { + "epoch": 0.6488778054862843, + "grad_norm": 0.43191317047943034, + "learning_rate": 1.4500876704609422e-05, + "loss": 0.8608, + "step": 13010 + }, + { + "epoch": 0.648927680798005, + "grad_norm": 0.5295949736723662, + "learning_rate": 1.4497211783451353e-05, + "loss": 0.8595, + "step": 13011 + }, + { + "epoch": 0.6489775561097257, + "grad_norm": 0.4526860383206309, + "learning_rate": 1.4493547136358836e-05, + "loss": 0.8782, + "step": 13012 + }, + { + "epoch": 0.6490274314214464, + "grad_norm": 0.4239940277418008, + "learning_rate": 1.4489882763427501e-05, + "loss": 0.8706, + "step": 13013 + }, + { + "epoch": 0.6490773067331671, + "grad_norm": 0.43158980403682096, + "learning_rate": 1.448621866475298e-05, + "loss": 0.856, + "step": 13014 + }, + { + "epoch": 0.6491271820448877, + "grad_norm": 0.4628201246013871, + "learning_rate": 1.4482554840430876e-05, + "loss": 0.8939, + "step": 13015 + }, + { + "epoch": 0.6491770573566085, + "grad_norm": 0.4304605037359412, + "learning_rate": 1.4478891290556799e-05, + "loss": 0.8795, + "step": 13016 + }, + { + "epoch": 0.6492269326683292, + "grad_norm": 0.5193458519262585, + "learning_rate": 1.4475228015226334e-05, + "loss": 0.8888, + "step": 13017 + }, + { + "epoch": 0.6492768079800498, + "grad_norm": 0.4512181802373473, + "learning_rate": 1.447156501453509e-05, + "loss": 0.8279, + "step": 13018 + }, + { + "epoch": 0.6493266832917706, + "grad_norm": 0.5227674609546121, + "learning_rate": 1.446790228857865e-05, + "loss": 0.8913, + "step": 13019 + }, + { + "epoch": 0.6493765586034913, + "grad_norm": 0.39793013104239483, + "learning_rate": 1.4464239837452584e-05, + "loss": 0.8104, + "step": 13020 + }, + { + "epoch": 0.6494264339152119, + "grad_norm": 0.4662631355113345, + "learning_rate": 1.4460577661252456e-05, + "loss": 0.8327, + "step": 13021 + }, + { + "epoch": 0.6494763092269327, + "grad_norm": 0.4195624440821627, + "learning_rate": 1.4456915760073844e-05, + "loss": 0.887, + "step": 13022 + }, + { + "epoch": 0.6495261845386534, + "grad_norm": 0.42074414644535385, + "learning_rate": 1.4453254134012298e-05, + "loss": 0.8526, + "step": 13023 + }, + { + "epoch": 0.649576059850374, + "grad_norm": 0.5059361116539615, + "learning_rate": 1.4449592783163357e-05, + "loss": 0.8619, + "step": 13024 + }, + { + "epoch": 0.6496259351620948, + "grad_norm": 0.4638080173165549, + "learning_rate": 1.444593170762258e-05, + "loss": 0.884, + "step": 13025 + }, + { + "epoch": 0.6496758104738155, + "grad_norm": 0.4655926420841593, + "learning_rate": 1.4442270907485489e-05, + "loss": 0.8656, + "step": 13026 + }, + { + "epoch": 0.6497256857855361, + "grad_norm": 0.5279435615991432, + "learning_rate": 1.4438610382847616e-05, + "loss": 0.8794, + "step": 13027 + }, + { + "epoch": 0.6497755610972569, + "grad_norm": 0.5714276789778776, + "learning_rate": 1.4434950133804468e-05, + "loss": 0.8658, + "step": 13028 + }, + { + "epoch": 0.6498254364089776, + "grad_norm": 0.43219463565528543, + "learning_rate": 1.4431290160451577e-05, + "loss": 0.8741, + "step": 13029 + }, + { + "epoch": 0.6498753117206982, + "grad_norm": 0.4389552959344542, + "learning_rate": 1.442763046288444e-05, + "loss": 0.8214, + "step": 13030 + }, + { + "epoch": 0.649925187032419, + "grad_norm": 0.45162409214793087, + "learning_rate": 1.4423971041198556e-05, + "loss": 0.8359, + "step": 13031 + }, + { + "epoch": 0.6499750623441396, + "grad_norm": 0.47573312684576047, + "learning_rate": 1.4420311895489402e-05, + "loss": 0.8723, + "step": 13032 + }, + { + "epoch": 0.6500249376558603, + "grad_norm": 0.4927690652740198, + "learning_rate": 1.4416653025852498e-05, + "loss": 0.8088, + "step": 13033 + }, + { + "epoch": 0.6500748129675811, + "grad_norm": 0.41691218649495276, + "learning_rate": 1.4412994432383273e-05, + "loss": 0.8538, + "step": 13034 + }, + { + "epoch": 0.6501246882793017, + "grad_norm": 0.4605508566759111, + "learning_rate": 1.440933611517723e-05, + "loss": 0.889, + "step": 13035 + }, + { + "epoch": 0.6501745635910224, + "grad_norm": 0.5004382272227952, + "learning_rate": 1.4405678074329814e-05, + "loss": 0.8822, + "step": 13036 + }, + { + "epoch": 0.6502244389027432, + "grad_norm": 0.5624738731589167, + "learning_rate": 1.4402020309936506e-05, + "loss": 0.8501, + "step": 13037 + }, + { + "epoch": 0.6502743142144638, + "grad_norm": 0.4759407771232738, + "learning_rate": 1.4398362822092715e-05, + "loss": 0.8894, + "step": 13038 + }, + { + "epoch": 0.6503241895261845, + "grad_norm": 0.9960982202736001, + "learning_rate": 1.439470561089391e-05, + "loss": 0.8704, + "step": 13039 + }, + { + "epoch": 0.6503740648379053, + "grad_norm": 0.4644381672392645, + "learning_rate": 1.4391048676435509e-05, + "loss": 0.8865, + "step": 13040 + }, + { + "epoch": 0.6504239401496259, + "grad_norm": 0.437015499165065, + "learning_rate": 1.4387392018812961e-05, + "loss": 0.9139, + "step": 13041 + }, + { + "epoch": 0.6504738154613466, + "grad_norm": 0.42241071107046835, + "learning_rate": 1.4383735638121654e-05, + "loss": 0.9057, + "step": 13042 + }, + { + "epoch": 0.6505236907730674, + "grad_norm": 0.4280080569032948, + "learning_rate": 1.438007953445702e-05, + "loss": 0.8651, + "step": 13043 + }, + { + "epoch": 0.650573566084788, + "grad_norm": 0.46381318845417313, + "learning_rate": 1.437642370791446e-05, + "loss": 0.8479, + "step": 13044 + }, + { + "epoch": 0.6506234413965087, + "grad_norm": 0.5431080188844913, + "learning_rate": 1.4372768158589367e-05, + "loss": 0.8403, + "step": 13045 + }, + { + "epoch": 0.6506733167082295, + "grad_norm": 0.41179332072446734, + "learning_rate": 1.4369112886577125e-05, + "loss": 0.9046, + "step": 13046 + }, + { + "epoch": 0.6507231920199501, + "grad_norm": 0.42391994568438424, + "learning_rate": 1.436545789197313e-05, + "loss": 0.8506, + "step": 13047 + }, + { + "epoch": 0.6507730673316708, + "grad_norm": 0.5904225817531543, + "learning_rate": 1.4361803174872756e-05, + "loss": 0.8292, + "step": 13048 + }, + { + "epoch": 0.6508229426433915, + "grad_norm": 0.44718391373099187, + "learning_rate": 1.4358148735371366e-05, + "loss": 0.8481, + "step": 13049 + }, + { + "epoch": 0.6508728179551122, + "grad_norm": 0.5610205467612858, + "learning_rate": 1.4354494573564311e-05, + "loss": 0.8909, + "step": 13050 + }, + { + "epoch": 0.6509226932668329, + "grad_norm": 0.44955756803867075, + "learning_rate": 1.4350840689546969e-05, + "loss": 0.8391, + "step": 13051 + }, + { + "epoch": 0.6509725685785536, + "grad_norm": 0.4636229372275046, + "learning_rate": 1.4347187083414671e-05, + "loss": 0.8437, + "step": 13052 + }, + { + "epoch": 0.6510224438902743, + "grad_norm": 0.46769346927091243, + "learning_rate": 1.4343533755262756e-05, + "loss": 0.875, + "step": 13053 + }, + { + "epoch": 0.651072319201995, + "grad_norm": 0.4283999322083375, + "learning_rate": 1.4339880705186553e-05, + "loss": 0.8862, + "step": 13054 + }, + { + "epoch": 0.6511221945137157, + "grad_norm": 0.43802811645674183, + "learning_rate": 1.4336227933281398e-05, + "loss": 0.8356, + "step": 13055 + }, + { + "epoch": 0.6511720698254364, + "grad_norm": 0.4431121964473868, + "learning_rate": 1.4332575439642606e-05, + "loss": 0.8564, + "step": 13056 + }, + { + "epoch": 0.6512219451371571, + "grad_norm": 1.469071131963744, + "learning_rate": 1.4328923224365481e-05, + "loss": 0.8444, + "step": 13057 + }, + { + "epoch": 0.6512718204488778, + "grad_norm": 0.4801682104882087, + "learning_rate": 1.4325271287545323e-05, + "loss": 0.8572, + "step": 13058 + }, + { + "epoch": 0.6513216957605985, + "grad_norm": 0.43773944638855966, + "learning_rate": 1.4321619629277438e-05, + "loss": 0.8771, + "step": 13059 + }, + { + "epoch": 0.6513715710723192, + "grad_norm": 0.5135445073759215, + "learning_rate": 1.4317968249657115e-05, + "loss": 0.8556, + "step": 13060 + }, + { + "epoch": 0.6514214463840399, + "grad_norm": 0.49066372057612895, + "learning_rate": 1.4314317148779615e-05, + "loss": 0.8511, + "step": 13061 + }, + { + "epoch": 0.6514713216957606, + "grad_norm": 0.4517072210232544, + "learning_rate": 1.431066632674024e-05, + "loss": 0.8377, + "step": 13062 + }, + { + "epoch": 0.6515211970074813, + "grad_norm": 0.3998284251813592, + "learning_rate": 1.4307015783634242e-05, + "loss": 0.8491, + "step": 13063 + }, + { + "epoch": 0.651571072319202, + "grad_norm": 0.43372172072842713, + "learning_rate": 1.4303365519556883e-05, + "loss": 0.8679, + "step": 13064 + }, + { + "epoch": 0.6516209476309227, + "grad_norm": 0.4657367135481072, + "learning_rate": 1.4299715534603402e-05, + "loss": 0.8722, + "step": 13065 + }, + { + "epoch": 0.6516708229426434, + "grad_norm": 0.6277391179074434, + "learning_rate": 1.4296065828869065e-05, + "loss": 0.8251, + "step": 13066 + }, + { + "epoch": 0.6517206982543641, + "grad_norm": 0.46009290162434013, + "learning_rate": 1.42924164024491e-05, + "loss": 0.8587, + "step": 13067 + }, + { + "epoch": 0.6517705735660848, + "grad_norm": 0.49633449605326524, + "learning_rate": 1.4288767255438739e-05, + "loss": 0.9066, + "step": 13068 + }, + { + "epoch": 0.6518204488778054, + "grad_norm": 0.4612455650706114, + "learning_rate": 1.4285118387933188e-05, + "loss": 0.883, + "step": 13069 + }, + { + "epoch": 0.6518703241895262, + "grad_norm": 0.4480621323477504, + "learning_rate": 1.4281469800027703e-05, + "loss": 0.8928, + "step": 13070 + }, + { + "epoch": 0.6519201995012469, + "grad_norm": 0.3954894645692805, + "learning_rate": 1.4277821491817444e-05, + "loss": 0.8708, + "step": 13071 + }, + { + "epoch": 0.6519700748129675, + "grad_norm": 0.5218221037679932, + "learning_rate": 1.4274173463397643e-05, + "loss": 0.8615, + "step": 13072 + }, + { + "epoch": 0.6520199501246883, + "grad_norm": 0.4927469245100847, + "learning_rate": 1.4270525714863478e-05, + "loss": 0.8125, + "step": 13073 + }, + { + "epoch": 0.652069825436409, + "grad_norm": 0.47410583148543556, + "learning_rate": 1.4266878246310161e-05, + "loss": 0.869, + "step": 13074 + }, + { + "epoch": 0.6521197007481296, + "grad_norm": 0.4174101947257902, + "learning_rate": 1.4263231057832832e-05, + "loss": 0.89, + "step": 13075 + }, + { + "epoch": 0.6521695760598504, + "grad_norm": 0.7215391691807513, + "learning_rate": 1.4259584149526695e-05, + "loss": 0.8842, + "step": 13076 + }, + { + "epoch": 0.6522194513715711, + "grad_norm": 0.5293515029324446, + "learning_rate": 1.4255937521486893e-05, + "loss": 0.9498, + "step": 13077 + }, + { + "epoch": 0.6522693266832917, + "grad_norm": 0.48050242847037716, + "learning_rate": 1.4252291173808612e-05, + "loss": 0.914, + "step": 13078 + }, + { + "epoch": 0.6523192019950125, + "grad_norm": 0.45090961765522886, + "learning_rate": 1.4248645106586961e-05, + "loss": 0.8663, + "step": 13079 + }, + { + "epoch": 0.6523690773067332, + "grad_norm": 0.47639023376368284, + "learning_rate": 1.4244999319917118e-05, + "loss": 0.8526, + "step": 13080 + }, + { + "epoch": 0.6524189526184538, + "grad_norm": 0.5456242702045867, + "learning_rate": 1.4241353813894203e-05, + "loss": 0.8413, + "step": 13081 + }, + { + "epoch": 0.6524688279301746, + "grad_norm": 0.4735248462669209, + "learning_rate": 1.4237708588613344e-05, + "loss": 0.8715, + "step": 13082 + }, + { + "epoch": 0.6525187032418953, + "grad_norm": 0.9086160860949851, + "learning_rate": 1.4234063644169654e-05, + "loss": 0.8412, + "step": 13083 + }, + { + "epoch": 0.6525685785536159, + "grad_norm": 0.4201948829383322, + "learning_rate": 1.4230418980658264e-05, + "loss": 0.8685, + "step": 13084 + }, + { + "epoch": 0.6526184538653367, + "grad_norm": 0.43213257449824305, + "learning_rate": 1.4226774598174272e-05, + "loss": 0.8243, + "step": 13085 + }, + { + "epoch": 0.6526683291770573, + "grad_norm": 0.46039768711598184, + "learning_rate": 1.4223130496812775e-05, + "loss": 0.8761, + "step": 13086 + }, + { + "epoch": 0.652718204488778, + "grad_norm": 0.5201015278336755, + "learning_rate": 1.4219486676668853e-05, + "loss": 0.8577, + "step": 13087 + }, + { + "epoch": 0.6527680798004988, + "grad_norm": 0.48044680116718924, + "learning_rate": 1.4215843137837615e-05, + "loss": 0.8889, + "step": 13088 + }, + { + "epoch": 0.6528179551122194, + "grad_norm": 0.47203394898728923, + "learning_rate": 1.4212199880414123e-05, + "loss": 0.8989, + "step": 13089 + }, + { + "epoch": 0.6528678304239401, + "grad_norm": 1.5774759173884805, + "learning_rate": 1.4208556904493444e-05, + "loss": 0.8836, + "step": 13090 + }, + { + "epoch": 0.6529177057356609, + "grad_norm": 0.4784021222753582, + "learning_rate": 1.4204914210170636e-05, + "loss": 0.8292, + "step": 13091 + }, + { + "epoch": 0.6529675810473815, + "grad_norm": 0.5350037846779839, + "learning_rate": 1.420127179754077e-05, + "loss": 0.8562, + "step": 13092 + }, + { + "epoch": 0.6530174563591022, + "grad_norm": 0.820017149939455, + "learning_rate": 1.4197629666698881e-05, + "loss": 0.8712, + "step": 13093 + }, + { + "epoch": 0.653067331670823, + "grad_norm": 0.48062031958885065, + "learning_rate": 1.4193987817740015e-05, + "loss": 0.8645, + "step": 13094 + }, + { + "epoch": 0.6531172069825436, + "grad_norm": 0.5331380805213137, + "learning_rate": 1.419034625075919e-05, + "loss": 0.8434, + "step": 13095 + }, + { + "epoch": 0.6531670822942643, + "grad_norm": 0.9934623464939664, + "learning_rate": 1.418670496585145e-05, + "loss": 0.8613, + "step": 13096 + }, + { + "epoch": 0.6532169576059851, + "grad_norm": 0.47657263893848306, + "learning_rate": 1.4183063963111806e-05, + "loss": 0.9154, + "step": 13097 + }, + { + "epoch": 0.6532668329177057, + "grad_norm": 0.7494803532044386, + "learning_rate": 1.4179423242635256e-05, + "loss": 0.8107, + "step": 13098 + }, + { + "epoch": 0.6533167082294264, + "grad_norm": 0.5291119372915325, + "learning_rate": 1.4175782804516825e-05, + "loss": 0.7855, + "step": 13099 + }, + { + "epoch": 0.6533665835411472, + "grad_norm": 0.45670215219015375, + "learning_rate": 1.4172142648851494e-05, + "loss": 0.8825, + "step": 13100 + }, + { + "epoch": 0.6534164588528678, + "grad_norm": 0.5125918490155872, + "learning_rate": 1.4168502775734261e-05, + "loss": 0.864, + "step": 13101 + }, + { + "epoch": 0.6534663341645885, + "grad_norm": 0.4159500061852824, + "learning_rate": 1.4164863185260086e-05, + "loss": 0.8706, + "step": 13102 + }, + { + "epoch": 0.6535162094763092, + "grad_norm": 0.41834887142476385, + "learning_rate": 1.4161223877523977e-05, + "loss": 0.873, + "step": 13103 + }, + { + "epoch": 0.6535660847880299, + "grad_norm": 0.6546632848114978, + "learning_rate": 1.4157584852620859e-05, + "loss": 0.9009, + "step": 13104 + }, + { + "epoch": 0.6536159600997506, + "grad_norm": 0.4917350447479396, + "learning_rate": 1.4153946110645728e-05, + "loss": 0.847, + "step": 13105 + }, + { + "epoch": 0.6536658354114713, + "grad_norm": 0.5604403170568596, + "learning_rate": 1.41503076516935e-05, + "loss": 0.8374, + "step": 13106 + }, + { + "epoch": 0.653715710723192, + "grad_norm": 0.46003804714153157, + "learning_rate": 1.4146669475859164e-05, + "loss": 0.8623, + "step": 13107 + }, + { + "epoch": 0.6537655860349127, + "grad_norm": 0.4377929154413705, + "learning_rate": 1.4143031583237609e-05, + "loss": 0.8468, + "step": 13108 + }, + { + "epoch": 0.6538154613466334, + "grad_norm": 0.5631526420812838, + "learning_rate": 1.4139393973923798e-05, + "loss": 0.8693, + "step": 13109 + }, + { + "epoch": 0.6538653366583541, + "grad_norm": 0.472374618073695, + "learning_rate": 1.413575664801263e-05, + "loss": 0.8483, + "step": 13110 + }, + { + "epoch": 0.6539152119700749, + "grad_norm": 0.5643456459704733, + "learning_rate": 1.4132119605599048e-05, + "loss": 0.8817, + "step": 13111 + }, + { + "epoch": 0.6539650872817955, + "grad_norm": 0.4413885677139859, + "learning_rate": 1.4128482846777924e-05, + "loss": 0.8617, + "step": 13112 + }, + { + "epoch": 0.6540149625935162, + "grad_norm": 0.42811637075917813, + "learning_rate": 1.4124846371644184e-05, + "loss": 0.8877, + "step": 13113 + }, + { + "epoch": 0.654064837905237, + "grad_norm": 0.41836484547493397, + "learning_rate": 1.412121018029271e-05, + "loss": 0.8429, + "step": 13114 + }, + { + "epoch": 0.6541147132169576, + "grad_norm": 0.5003400784953288, + "learning_rate": 1.4117574272818388e-05, + "loss": 0.8849, + "step": 13115 + }, + { + "epoch": 0.6541645885286783, + "grad_norm": 0.4847086675248013, + "learning_rate": 1.4113938649316083e-05, + "loss": 0.8528, + "step": 13116 + }, + { + "epoch": 0.654214463840399, + "grad_norm": 0.5331134101260443, + "learning_rate": 1.4110303309880691e-05, + "loss": 0.8818, + "step": 13117 + }, + { + "epoch": 0.6542643391521197, + "grad_norm": 0.44021552841449213, + "learning_rate": 1.4106668254607058e-05, + "loss": 0.8233, + "step": 13118 + }, + { + "epoch": 0.6543142144638404, + "grad_norm": 0.41299397192500703, + "learning_rate": 1.410303348359004e-05, + "loss": 0.8471, + "step": 13119 + }, + { + "epoch": 0.654364089775561, + "grad_norm": 0.3980169077637071, + "learning_rate": 1.4099398996924479e-05, + "loss": 0.8044, + "step": 13120 + }, + { + "epoch": 0.6544139650872818, + "grad_norm": 0.4079172868935943, + "learning_rate": 1.4095764794705232e-05, + "loss": 0.8515, + "step": 13121 + }, + { + "epoch": 0.6544638403990025, + "grad_norm": 0.6204726162118127, + "learning_rate": 1.4092130877027121e-05, + "loss": 0.8217, + "step": 13122 + }, + { + "epoch": 0.6545137157107231, + "grad_norm": 0.4275653521700071, + "learning_rate": 1.4088497243984977e-05, + "loss": 0.8312, + "step": 13123 + }, + { + "epoch": 0.6545635910224439, + "grad_norm": 0.7154351119891974, + "learning_rate": 1.4084863895673601e-05, + "loss": 0.8471, + "step": 13124 + }, + { + "epoch": 0.6546134663341646, + "grad_norm": 0.4196097904126742, + "learning_rate": 1.4081230832187824e-05, + "loss": 0.882, + "step": 13125 + }, + { + "epoch": 0.6546633416458852, + "grad_norm": 0.49435720264561955, + "learning_rate": 1.4077598053622449e-05, + "loss": 0.9146, + "step": 13126 + }, + { + "epoch": 0.654713216957606, + "grad_norm": 0.47836495225341225, + "learning_rate": 1.4073965560072261e-05, + "loss": 0.8498, + "step": 13127 + }, + { + "epoch": 0.6547630922693267, + "grad_norm": 0.40536239032508314, + "learning_rate": 1.4070333351632042e-05, + "loss": 0.8324, + "step": 13128 + }, + { + "epoch": 0.6548129675810473, + "grad_norm": 0.4213014462908043, + "learning_rate": 1.406670142839659e-05, + "loss": 0.864, + "step": 13129 + }, + { + "epoch": 0.6548628428927681, + "grad_norm": 0.573416867651616, + "learning_rate": 1.4063069790460675e-05, + "loss": 0.887, + "step": 13130 + }, + { + "epoch": 0.6549127182044888, + "grad_norm": 0.7577028005219636, + "learning_rate": 1.4059438437919056e-05, + "loss": 0.8446, + "step": 13131 + }, + { + "epoch": 0.6549625935162094, + "grad_norm": 0.48183421504166446, + "learning_rate": 1.4055807370866485e-05, + "loss": 0.8684, + "step": 13132 + }, + { + "epoch": 0.6550124688279302, + "grad_norm": 2.3459676108605647, + "learning_rate": 1.4052176589397736e-05, + "loss": 0.8648, + "step": 13133 + }, + { + "epoch": 0.6550623441396509, + "grad_norm": 0.4938609536650739, + "learning_rate": 1.4048546093607537e-05, + "loss": 0.8471, + "step": 13134 + }, + { + "epoch": 0.6551122194513715, + "grad_norm": 0.4658960784983374, + "learning_rate": 1.4044915883590626e-05, + "loss": 0.863, + "step": 13135 + }, + { + "epoch": 0.6551620947630923, + "grad_norm": 0.5428143223320291, + "learning_rate": 1.4041285959441725e-05, + "loss": 0.8771, + "step": 13136 + }, + { + "epoch": 0.655211970074813, + "grad_norm": 0.6949795023538582, + "learning_rate": 1.4037656321255569e-05, + "loss": 0.822, + "step": 13137 + }, + { + "epoch": 0.6552618453865336, + "grad_norm": 0.5399723904629388, + "learning_rate": 1.4034026969126866e-05, + "loss": 0.9195, + "step": 13138 + }, + { + "epoch": 0.6553117206982544, + "grad_norm": 0.4926017715381401, + "learning_rate": 1.403039790315031e-05, + "loss": 0.8572, + "step": 13139 + }, + { + "epoch": 0.655361596009975, + "grad_norm": 0.4746961059063425, + "learning_rate": 1.4026769123420635e-05, + "loss": 0.8479, + "step": 13140 + }, + { + "epoch": 0.6554114713216957, + "grad_norm": 0.4103888772146311, + "learning_rate": 1.4023140630032485e-05, + "loss": 0.8466, + "step": 13141 + }, + { + "epoch": 0.6554613466334165, + "grad_norm": 0.39764692705826143, + "learning_rate": 1.401951242308058e-05, + "loss": 0.8563, + "step": 13142 + }, + { + "epoch": 0.6555112219451371, + "grad_norm": 0.5539400706996134, + "learning_rate": 1.4015884502659573e-05, + "loss": 0.8564, + "step": 13143 + }, + { + "epoch": 0.6555610972568578, + "grad_norm": 0.41276108844254017, + "learning_rate": 1.4012256868864165e-05, + "loss": 0.8623, + "step": 13144 + }, + { + "epoch": 0.6556109725685786, + "grad_norm": 0.45438644893146507, + "learning_rate": 1.4008629521788976e-05, + "loss": 0.864, + "step": 13145 + }, + { + "epoch": 0.6556608478802992, + "grad_norm": 0.4501259025078202, + "learning_rate": 1.4005002461528693e-05, + "loss": 0.8601, + "step": 13146 + }, + { + "epoch": 0.65571072319202, + "grad_norm": 0.5236822491053782, + "learning_rate": 1.4001375688177936e-05, + "loss": 0.8473, + "step": 13147 + }, + { + "epoch": 0.6557605985037407, + "grad_norm": 0.4595516849667246, + "learning_rate": 1.399774920183138e-05, + "loss": 0.8634, + "step": 13148 + }, + { + "epoch": 0.6558104738154613, + "grad_norm": 0.472568975693766, + "learning_rate": 1.3994123002583615e-05, + "loss": 0.8629, + "step": 13149 + }, + { + "epoch": 0.655860349127182, + "grad_norm": 0.5089261488910451, + "learning_rate": 1.3990497090529295e-05, + "loss": 0.868, + "step": 13150 + }, + { + "epoch": 0.6559102244389028, + "grad_norm": 0.5673581070556872, + "learning_rate": 1.3986871465763023e-05, + "loss": 0.8882, + "step": 13151 + }, + { + "epoch": 0.6559600997506234, + "grad_norm": 0.3729447658523676, + "learning_rate": 1.3983246128379415e-05, + "loss": 0.8371, + "step": 13152 + }, + { + "epoch": 0.6560099750623442, + "grad_norm": 0.5055677557052602, + "learning_rate": 1.3979621078473059e-05, + "loss": 0.9045, + "step": 13153 + }, + { + "epoch": 0.6560598503740649, + "grad_norm": 0.4489571376154438, + "learning_rate": 1.3975996316138568e-05, + "loss": 0.8178, + "step": 13154 + }, + { + "epoch": 0.6561097256857855, + "grad_norm": 0.8032801417749365, + "learning_rate": 1.3972371841470517e-05, + "loss": 0.889, + "step": 13155 + }, + { + "epoch": 0.6561596009975063, + "grad_norm": 0.4829323744947773, + "learning_rate": 1.3968747654563491e-05, + "loss": 0.7934, + "step": 13156 + }, + { + "epoch": 0.6562094763092269, + "grad_norm": 0.40773091127674455, + "learning_rate": 1.3965123755512044e-05, + "loss": 0.8625, + "step": 13157 + }, + { + "epoch": 0.6562593516209476, + "grad_norm": 0.4303529728983238, + "learning_rate": 1.3961500144410766e-05, + "loss": 0.8508, + "step": 13158 + }, + { + "epoch": 0.6563092269326684, + "grad_norm": 0.4325970966519431, + "learning_rate": 1.3957876821354205e-05, + "loss": 0.8568, + "step": 13159 + }, + { + "epoch": 0.656359102244389, + "grad_norm": 0.40337323809397285, + "learning_rate": 1.3954253786436902e-05, + "loss": 0.84, + "step": 13160 + }, + { + "epoch": 0.6564089775561097, + "grad_norm": 0.44429990976878586, + "learning_rate": 1.3950631039753392e-05, + "loss": 0.8352, + "step": 13161 + }, + { + "epoch": 0.6564588528678305, + "grad_norm": 0.4428305109655998, + "learning_rate": 1.3947008581398235e-05, + "loss": 0.8475, + "step": 13162 + }, + { + "epoch": 0.6565087281795511, + "grad_norm": 0.5370216811647853, + "learning_rate": 1.3943386411465939e-05, + "loss": 0.8404, + "step": 13163 + }, + { + "epoch": 0.6565586034912718, + "grad_norm": 0.7951813550496429, + "learning_rate": 1.3939764530051028e-05, + "loss": 0.8715, + "step": 13164 + }, + { + "epoch": 0.6566084788029926, + "grad_norm": 0.5603705803574253, + "learning_rate": 1.3936142937248e-05, + "loss": 0.8726, + "step": 13165 + }, + { + "epoch": 0.6566583541147132, + "grad_norm": 0.540516033263442, + "learning_rate": 1.393252163315138e-05, + "loss": 0.8785, + "step": 13166 + }, + { + "epoch": 0.6567082294264339, + "grad_norm": 0.5334834615857535, + "learning_rate": 1.3928900617855655e-05, + "loss": 0.8311, + "step": 13167 + }, + { + "epoch": 0.6567581047381547, + "grad_norm": 0.4340427516706733, + "learning_rate": 1.3925279891455317e-05, + "loss": 0.8731, + "step": 13168 + }, + { + "epoch": 0.6568079800498753, + "grad_norm": 0.42593544498957553, + "learning_rate": 1.3921659454044832e-05, + "loss": 0.8652, + "step": 13169 + }, + { + "epoch": 0.656857855361596, + "grad_norm": 0.45046558302311845, + "learning_rate": 1.3918039305718695e-05, + "loss": 0.8955, + "step": 13170 + }, + { + "epoch": 0.6569077306733168, + "grad_norm": 0.510677651638572, + "learning_rate": 1.3914419446571364e-05, + "loss": 0.8851, + "step": 13171 + }, + { + "epoch": 0.6569576059850374, + "grad_norm": 0.4682430069527295, + "learning_rate": 1.3910799876697294e-05, + "loss": 0.8846, + "step": 13172 + }, + { + "epoch": 0.6570074812967581, + "grad_norm": 0.4517111143089027, + "learning_rate": 1.3907180596190928e-05, + "loss": 0.8648, + "step": 13173 + }, + { + "epoch": 0.6570573566084787, + "grad_norm": 0.4277899593389112, + "learning_rate": 1.3903561605146731e-05, + "loss": 0.83, + "step": 13174 + }, + { + "epoch": 0.6571072319201995, + "grad_norm": 0.7345261402795804, + "learning_rate": 1.389994290365913e-05, + "loss": 0.8875, + "step": 13175 + }, + { + "epoch": 0.6571571072319202, + "grad_norm": 0.5237472361014023, + "learning_rate": 1.3896324491822537e-05, + "loss": 0.8738, + "step": 13176 + }, + { + "epoch": 0.6572069825436408, + "grad_norm": 0.49173298600392823, + "learning_rate": 1.3892706369731413e-05, + "loss": 0.8767, + "step": 13177 + }, + { + "epoch": 0.6572568578553616, + "grad_norm": 0.3966373630398028, + "learning_rate": 1.388908853748012e-05, + "loss": 0.8549, + "step": 13178 + }, + { + "epoch": 0.6573067331670823, + "grad_norm": 0.3937783296897279, + "learning_rate": 1.38854709951631e-05, + "loss": 0.8581, + "step": 13179 + }, + { + "epoch": 0.657356608478803, + "grad_norm": 0.5598043963711259, + "learning_rate": 1.3881853742874732e-05, + "loss": 0.9127, + "step": 13180 + }, + { + "epoch": 0.6574064837905237, + "grad_norm": 0.427239875796288, + "learning_rate": 1.3878236780709431e-05, + "loss": 0.8837, + "step": 13181 + }, + { + "epoch": 0.6574563591022444, + "grad_norm": 0.40458435998444653, + "learning_rate": 1.3874620108761548e-05, + "loss": 0.8683, + "step": 13182 + }, + { + "epoch": 0.657506234413965, + "grad_norm": 0.45285754237786247, + "learning_rate": 1.387100372712548e-05, + "loss": 0.9008, + "step": 13183 + }, + { + "epoch": 0.6575561097256858, + "grad_norm": 0.41673542847405787, + "learning_rate": 1.3867387635895584e-05, + "loss": 0.8671, + "step": 13184 + }, + { + "epoch": 0.6576059850374065, + "grad_norm": 0.412834648389242, + "learning_rate": 1.3863771835166243e-05, + "loss": 0.8196, + "step": 13185 + }, + { + "epoch": 0.6576558603491272, + "grad_norm": 0.4906113380390116, + "learning_rate": 1.386015632503177e-05, + "loss": 0.8779, + "step": 13186 + }, + { + "epoch": 0.6577057356608479, + "grad_norm": 0.42268598363275944, + "learning_rate": 1.3856541105586545e-05, + "loss": 0.8921, + "step": 13187 + }, + { + "epoch": 0.6577556109725686, + "grad_norm": 0.4572091875823229, + "learning_rate": 1.3852926176924891e-05, + "loss": 0.8815, + "step": 13188 + }, + { + "epoch": 0.6578054862842893, + "grad_norm": 0.40347265940038307, + "learning_rate": 1.3849311539141141e-05, + "loss": 0.8707, + "step": 13189 + }, + { + "epoch": 0.65785536159601, + "grad_norm": 0.4061273868472494, + "learning_rate": 1.3845697192329604e-05, + "loss": 0.8523, + "step": 13190 + }, + { + "epoch": 0.6579052369077307, + "grad_norm": 0.39671952465939064, + "learning_rate": 1.3842083136584621e-05, + "loss": 0.8543, + "step": 13191 + }, + { + "epoch": 0.6579551122194514, + "grad_norm": 0.5412731385114551, + "learning_rate": 1.3838469372000479e-05, + "loss": 0.8384, + "step": 13192 + }, + { + "epoch": 0.6580049875311721, + "grad_norm": 0.4808567802555633, + "learning_rate": 1.383485589867149e-05, + "loss": 0.894, + "step": 13193 + }, + { + "epoch": 0.6580548628428927, + "grad_norm": 0.4133214668852714, + "learning_rate": 1.3831242716691923e-05, + "loss": 0.8763, + "step": 13194 + }, + { + "epoch": 0.6581047381546135, + "grad_norm": 0.4494962989419537, + "learning_rate": 1.3827629826156091e-05, + "loss": 0.8587, + "step": 13195 + }, + { + "epoch": 0.6581546134663342, + "grad_norm": 0.4763767781844369, + "learning_rate": 1.3824017227158261e-05, + "loss": 0.8441, + "step": 13196 + }, + { + "epoch": 0.6582044887780548, + "grad_norm": 0.41584519279921467, + "learning_rate": 1.3820404919792698e-05, + "loss": 0.898, + "step": 13197 + }, + { + "epoch": 0.6582543640897756, + "grad_norm": 0.4323676414660104, + "learning_rate": 1.3816792904153655e-05, + "loss": 0.8883, + "step": 13198 + }, + { + "epoch": 0.6583042394014963, + "grad_norm": 0.4977354187709251, + "learning_rate": 1.3813181180335417e-05, + "loss": 0.8924, + "step": 13199 + }, + { + "epoch": 0.6583541147132169, + "grad_norm": 0.45279670781244147, + "learning_rate": 1.3809569748432189e-05, + "loss": 0.8508, + "step": 13200 + }, + { + "epoch": 0.6584039900249377, + "grad_norm": 0.7724814980897592, + "learning_rate": 1.3805958608538242e-05, + "loss": 0.9173, + "step": 13201 + }, + { + "epoch": 0.6584538653366584, + "grad_norm": 0.49150493735237966, + "learning_rate": 1.3802347760747783e-05, + "loss": 0.8766, + "step": 13202 + }, + { + "epoch": 0.658503740648379, + "grad_norm": 0.4318787156266116, + "learning_rate": 1.379873720515506e-05, + "loss": 0.8502, + "step": 13203 + }, + { + "epoch": 0.6585536159600998, + "grad_norm": 0.4059058202089043, + "learning_rate": 1.3795126941854275e-05, + "loss": 0.8841, + "step": 13204 + }, + { + "epoch": 0.6586034912718205, + "grad_norm": 0.47282135623112787, + "learning_rate": 1.3791516970939636e-05, + "loss": 0.8982, + "step": 13205 + }, + { + "epoch": 0.6586533665835411, + "grad_norm": 0.47709958335408265, + "learning_rate": 1.3787907292505337e-05, + "loss": 0.84, + "step": 13206 + }, + { + "epoch": 0.6587032418952619, + "grad_norm": 0.42070430886969923, + "learning_rate": 1.3784297906645587e-05, + "loss": 0.869, + "step": 13207 + }, + { + "epoch": 0.6587531172069826, + "grad_norm": 0.47968583365062306, + "learning_rate": 1.3780688813454568e-05, + "loss": 0.8778, + "step": 13208 + }, + { + "epoch": 0.6588029925187032, + "grad_norm": 0.4359922507365746, + "learning_rate": 1.3777080013026453e-05, + "loss": 0.8633, + "step": 13209 + }, + { + "epoch": 0.658852867830424, + "grad_norm": 0.46166504038639966, + "learning_rate": 1.377347150545541e-05, + "loss": 0.892, + "step": 13210 + }, + { + "epoch": 0.6589027431421446, + "grad_norm": 0.4314526213010171, + "learning_rate": 1.3769863290835593e-05, + "loss": 0.8662, + "step": 13211 + }, + { + "epoch": 0.6589526184538653, + "grad_norm": 0.7408602032187842, + "learning_rate": 1.3766255369261183e-05, + "loss": 0.8838, + "step": 13212 + }, + { + "epoch": 0.6590024937655861, + "grad_norm": 0.49680390177450134, + "learning_rate": 1.3762647740826296e-05, + "loss": 0.8596, + "step": 13213 + }, + { + "epoch": 0.6590523690773067, + "grad_norm": 0.8362878639515327, + "learning_rate": 1.3759040405625106e-05, + "loss": 0.8214, + "step": 13214 + }, + { + "epoch": 0.6591022443890274, + "grad_norm": 0.5383305137434875, + "learning_rate": 1.3755433363751713e-05, + "loss": 0.8704, + "step": 13215 + }, + { + "epoch": 0.6591521197007482, + "grad_norm": 0.43903824082892046, + "learning_rate": 1.3751826615300256e-05, + "loss": 0.8643, + "step": 13216 + }, + { + "epoch": 0.6592019950124688, + "grad_norm": 0.46259787020345366, + "learning_rate": 1.3748220160364844e-05, + "loss": 0.7867, + "step": 13217 + }, + { + "epoch": 0.6592518703241895, + "grad_norm": 0.4434455224609465, + "learning_rate": 1.374461399903961e-05, + "loss": 0.8606, + "step": 13218 + }, + { + "epoch": 0.6593017456359103, + "grad_norm": 0.4854186165672818, + "learning_rate": 1.3741008131418617e-05, + "loss": 0.8634, + "step": 13219 + }, + { + "epoch": 0.6593516209476309, + "grad_norm": 0.49792444850115736, + "learning_rate": 1.3737402557595987e-05, + "loss": 0.831, + "step": 13220 + }, + { + "epoch": 0.6594014962593516, + "grad_norm": 0.5078191441306571, + "learning_rate": 1.37337972776658e-05, + "loss": 0.8468, + "step": 13221 + }, + { + "epoch": 0.6594513715710724, + "grad_norm": 0.4074026187721216, + "learning_rate": 1.3730192291722127e-05, + "loss": 0.8511, + "step": 13222 + }, + { + "epoch": 0.659501246882793, + "grad_norm": 0.43720884559846085, + "learning_rate": 1.372658759985903e-05, + "loss": 0.8125, + "step": 13223 + }, + { + "epoch": 0.6595511221945137, + "grad_norm": 0.4458335416995084, + "learning_rate": 1.3722983202170602e-05, + "loss": 0.8811, + "step": 13224 + }, + { + "epoch": 0.6596009975062345, + "grad_norm": 0.5396988858433278, + "learning_rate": 1.3719379098750876e-05, + "loss": 0.8376, + "step": 13225 + }, + { + "epoch": 0.6596508728179551, + "grad_norm": 0.4260135612800757, + "learning_rate": 1.3715775289693904e-05, + "loss": 0.8463, + "step": 13226 + }, + { + "epoch": 0.6597007481296758, + "grad_norm": 0.4566046314136694, + "learning_rate": 1.3712171775093715e-05, + "loss": 0.8417, + "step": 13227 + }, + { + "epoch": 0.6597506234413965, + "grad_norm": 0.4090176293533948, + "learning_rate": 1.3708568555044363e-05, + "loss": 0.8502, + "step": 13228 + }, + { + "epoch": 0.6598004987531172, + "grad_norm": 0.4380482668284463, + "learning_rate": 1.370496562963986e-05, + "loss": 0.8526, + "step": 13229 + }, + { + "epoch": 0.6598503740648379, + "grad_norm": 0.7669185139182436, + "learning_rate": 1.3701362998974227e-05, + "loss": 0.8888, + "step": 13230 + }, + { + "epoch": 0.6599002493765586, + "grad_norm": 0.662196619788259, + "learning_rate": 1.3697760663141458e-05, + "loss": 0.9065, + "step": 13231 + }, + { + "epoch": 0.6599501246882793, + "grad_norm": 0.45870139480229777, + "learning_rate": 1.3694158622235575e-05, + "loss": 0.8051, + "step": 13232 + }, + { + "epoch": 0.66, + "grad_norm": 0.43276779779830543, + "learning_rate": 1.3690556876350564e-05, + "loss": 0.849, + "step": 13233 + }, + { + "epoch": 0.6600498753117207, + "grad_norm": 0.480200046665367, + "learning_rate": 1.3686955425580408e-05, + "loss": 0.912, + "step": 13234 + }, + { + "epoch": 0.6600997506234414, + "grad_norm": 0.4426256529199802, + "learning_rate": 1.368335427001908e-05, + "loss": 0.8967, + "step": 13235 + }, + { + "epoch": 0.6601496259351621, + "grad_norm": 0.5360999650432349, + "learning_rate": 1.3679753409760576e-05, + "loss": 0.8675, + "step": 13236 + }, + { + "epoch": 0.6601995012468828, + "grad_norm": 0.47483333457400156, + "learning_rate": 1.367615284489882e-05, + "loss": 0.8675, + "step": 13237 + }, + { + "epoch": 0.6602493765586035, + "grad_norm": 0.42866662102053593, + "learning_rate": 1.36725525755278e-05, + "loss": 0.8853, + "step": 13238 + }, + { + "epoch": 0.6602992518703242, + "grad_norm": 0.4246207244393657, + "learning_rate": 1.3668952601741441e-05, + "loss": 0.8228, + "step": 13239 + }, + { + "epoch": 0.6603491271820449, + "grad_norm": 1.3262748224385061, + "learning_rate": 1.3665352923633701e-05, + "loss": 0.9046, + "step": 13240 + }, + { + "epoch": 0.6603990024937656, + "grad_norm": 0.513622637487756, + "learning_rate": 1.3661753541298502e-05, + "loss": 0.8668, + "step": 13241 + }, + { + "epoch": 0.6604488778054863, + "grad_norm": 0.4281062179511032, + "learning_rate": 1.3658154454829774e-05, + "loss": 0.8249, + "step": 13242 + }, + { + "epoch": 0.660498753117207, + "grad_norm": 1.9822459709414897, + "learning_rate": 1.3654555664321417e-05, + "loss": 0.8586, + "step": 13243 + }, + { + "epoch": 0.6605486284289277, + "grad_norm": 0.45288185500023337, + "learning_rate": 1.3650957169867363e-05, + "loss": 0.8429, + "step": 13244 + }, + { + "epoch": 0.6605985037406483, + "grad_norm": 0.48736674408221864, + "learning_rate": 1.3647358971561503e-05, + "loss": 0.8602, + "step": 13245 + }, + { + "epoch": 0.6606483790523691, + "grad_norm": 0.49756713029219723, + "learning_rate": 1.364376106949773e-05, + "loss": 0.8689, + "step": 13246 + }, + { + "epoch": 0.6606982543640898, + "grad_norm": 0.4742169857979122, + "learning_rate": 1.3640163463769928e-05, + "loss": 0.8377, + "step": 13247 + }, + { + "epoch": 0.6607481296758104, + "grad_norm": 0.4119743055684217, + "learning_rate": 1.363656615447197e-05, + "loss": 0.8359, + "step": 13248 + }, + { + "epoch": 0.6607980049875312, + "grad_norm": 0.49973808888571486, + "learning_rate": 1.3632969141697738e-05, + "loss": 0.8824, + "step": 13249 + }, + { + "epoch": 0.6608478802992519, + "grad_norm": 0.5002918930968966, + "learning_rate": 1.362937242554109e-05, + "loss": 0.8549, + "step": 13250 + }, + { + "epoch": 0.6608977556109725, + "grad_norm": 0.40980055337115634, + "learning_rate": 1.3625776006095881e-05, + "loss": 0.8709, + "step": 13251 + }, + { + "epoch": 0.6609476309226933, + "grad_norm": 0.476881096813908, + "learning_rate": 1.3622179883455946e-05, + "loss": 0.8828, + "step": 13252 + }, + { + "epoch": 0.660997506234414, + "grad_norm": 0.49262805886778327, + "learning_rate": 1.3618584057715144e-05, + "loss": 0.849, + "step": 13253 + }, + { + "epoch": 0.6610473815461346, + "grad_norm": 0.5799531043109282, + "learning_rate": 1.361498852896729e-05, + "loss": 0.8272, + "step": 13254 + }, + { + "epoch": 0.6610972568578554, + "grad_norm": 0.5139536681111254, + "learning_rate": 1.3611393297306236e-05, + "loss": 0.8878, + "step": 13255 + }, + { + "epoch": 0.6611471321695761, + "grad_norm": 0.48011446558907844, + "learning_rate": 1.3607798362825751e-05, + "loss": 0.8856, + "step": 13256 + }, + { + "epoch": 0.6611970074812967, + "grad_norm": 0.6944024132787378, + "learning_rate": 1.3604203725619686e-05, + "loss": 0.8598, + "step": 13257 + }, + { + "epoch": 0.6612468827930175, + "grad_norm": 0.5064440891527779, + "learning_rate": 1.3600609385781821e-05, + "loss": 0.8913, + "step": 13258 + }, + { + "epoch": 0.6612967581047382, + "grad_norm": 0.48798344317041326, + "learning_rate": 1.359701534340595e-05, + "loss": 0.8735, + "step": 13259 + }, + { + "epoch": 0.6613466334164588, + "grad_norm": 0.42636743452196774, + "learning_rate": 1.3593421598585855e-05, + "loss": 0.8514, + "step": 13260 + }, + { + "epoch": 0.6613965087281796, + "grad_norm": 0.5371442279827614, + "learning_rate": 1.3589828151415327e-05, + "loss": 0.8923, + "step": 13261 + }, + { + "epoch": 0.6614463840399003, + "grad_norm": 0.41014264948715073, + "learning_rate": 1.3586235001988126e-05, + "loss": 0.8414, + "step": 13262 + }, + { + "epoch": 0.6614962593516209, + "grad_norm": 0.38015198918316023, + "learning_rate": 1.3582642150398017e-05, + "loss": 0.8624, + "step": 13263 + }, + { + "epoch": 0.6615461346633417, + "grad_norm": 0.4589466692809824, + "learning_rate": 1.3579049596738739e-05, + "loss": 0.8867, + "step": 13264 + }, + { + "epoch": 0.6615960099750623, + "grad_norm": 0.5251889891559507, + "learning_rate": 1.3575457341104059e-05, + "loss": 0.8803, + "step": 13265 + }, + { + "epoch": 0.661645885286783, + "grad_norm": 0.529989006898976, + "learning_rate": 1.3571865383587712e-05, + "loss": 0.874, + "step": 13266 + }, + { + "epoch": 0.6616957605985038, + "grad_norm": 0.8691673829485915, + "learning_rate": 1.3568273724283417e-05, + "loss": 0.8475, + "step": 13267 + }, + { + "epoch": 0.6617456359102244, + "grad_norm": 0.7060639350972828, + "learning_rate": 1.3564682363284897e-05, + "loss": 0.8582, + "step": 13268 + }, + { + "epoch": 0.6617955112219451, + "grad_norm": 0.42343978452483466, + "learning_rate": 1.3561091300685892e-05, + "loss": 0.857, + "step": 13269 + }, + { + "epoch": 0.6618453865336659, + "grad_norm": 0.40992408522456963, + "learning_rate": 1.355750053658007e-05, + "loss": 0.8896, + "step": 13270 + }, + { + "epoch": 0.6618952618453865, + "grad_norm": 0.42942206597750415, + "learning_rate": 1.355391007106116e-05, + "loss": 0.8264, + "step": 13271 + }, + { + "epoch": 0.6619451371571072, + "grad_norm": 0.45238469056659675, + "learning_rate": 1.3550319904222836e-05, + "loss": 0.8859, + "step": 13272 + }, + { + "epoch": 0.661995012468828, + "grad_norm": 0.47686839365933414, + "learning_rate": 1.3546730036158805e-05, + "loss": 0.8359, + "step": 13273 + }, + { + "epoch": 0.6620448877805486, + "grad_norm": 0.6659397870989802, + "learning_rate": 1.3543140466962711e-05, + "loss": 0.8254, + "step": 13274 + }, + { + "epoch": 0.6620947630922693, + "grad_norm": 0.43166798724132277, + "learning_rate": 1.3539551196728251e-05, + "loss": 0.8355, + "step": 13275 + }, + { + "epoch": 0.6621446384039901, + "grad_norm": 0.4939276959405052, + "learning_rate": 1.3535962225549065e-05, + "loss": 0.8923, + "step": 13276 + }, + { + "epoch": 0.6621945137157107, + "grad_norm": 0.56865813284135, + "learning_rate": 1.3532373553518818e-05, + "loss": 0.8503, + "step": 13277 + }, + { + "epoch": 0.6622443890274314, + "grad_norm": 0.4430156139504731, + "learning_rate": 1.3528785180731155e-05, + "loss": 0.8372, + "step": 13278 + }, + { + "epoch": 0.6622942643391522, + "grad_norm": 0.5204270451669016, + "learning_rate": 1.3525197107279708e-05, + "loss": 0.8381, + "step": 13279 + }, + { + "epoch": 0.6623441396508728, + "grad_norm": 0.4368579667261822, + "learning_rate": 1.3521609333258105e-05, + "loss": 0.8423, + "step": 13280 + }, + { + "epoch": 0.6623940149625935, + "grad_norm": 0.4170560302081715, + "learning_rate": 1.351802185875996e-05, + "loss": 0.8355, + "step": 13281 + }, + { + "epoch": 0.6624438902743142, + "grad_norm": 0.5497236020762942, + "learning_rate": 1.351443468387891e-05, + "loss": 0.8585, + "step": 13282 + }, + { + "epoch": 0.6624937655860349, + "grad_norm": 0.5631781628899682, + "learning_rate": 1.3510847808708543e-05, + "loss": 0.8373, + "step": 13283 + }, + { + "epoch": 0.6625436408977556, + "grad_norm": 0.5990751220925349, + "learning_rate": 1.3507261233342461e-05, + "loss": 0.8767, + "step": 13284 + }, + { + "epoch": 0.6625935162094763, + "grad_norm": 0.38763564035344544, + "learning_rate": 1.3503674957874245e-05, + "loss": 0.8811, + "step": 13285 + }, + { + "epoch": 0.662643391521197, + "grad_norm": 0.514973327351051, + "learning_rate": 1.3500088982397496e-05, + "loss": 0.8196, + "step": 13286 + }, + { + "epoch": 0.6626932668329177, + "grad_norm": 0.4354588147676212, + "learning_rate": 1.3496503307005776e-05, + "loss": 0.878, + "step": 13287 + }, + { + "epoch": 0.6627431421446384, + "grad_norm": 0.42854496978518347, + "learning_rate": 1.3492917931792653e-05, + "loss": 0.8664, + "step": 13288 + }, + { + "epoch": 0.6627930174563591, + "grad_norm": 0.4324907934524493, + "learning_rate": 1.3489332856851681e-05, + "loss": 0.8674, + "step": 13289 + }, + { + "epoch": 0.6628428927680798, + "grad_norm": 0.42180837683450556, + "learning_rate": 1.3485748082276423e-05, + "loss": 0.8913, + "step": 13290 + }, + { + "epoch": 0.6628927680798005, + "grad_norm": 0.4204484875323101, + "learning_rate": 1.3482163608160408e-05, + "loss": 0.8194, + "step": 13291 + }, + { + "epoch": 0.6629426433915212, + "grad_norm": 1.0063051178518778, + "learning_rate": 1.3478579434597199e-05, + "loss": 0.8641, + "step": 13292 + }, + { + "epoch": 0.6629925187032419, + "grad_norm": 0.5632688181367241, + "learning_rate": 1.3474995561680282e-05, + "loss": 0.8672, + "step": 13293 + }, + { + "epoch": 0.6630423940149626, + "grad_norm": 0.5756222888779057, + "learning_rate": 1.3471411989503208e-05, + "loss": 0.8928, + "step": 13294 + }, + { + "epoch": 0.6630922693266833, + "grad_norm": 0.5221720834075645, + "learning_rate": 1.3467828718159478e-05, + "loss": 0.8784, + "step": 13295 + }, + { + "epoch": 0.663142144638404, + "grad_norm": 0.4602538372985059, + "learning_rate": 1.3464245747742599e-05, + "loss": 0.8707, + "step": 13296 + }, + { + "epoch": 0.6631920199501247, + "grad_norm": 0.4265084340248589, + "learning_rate": 1.3460663078346053e-05, + "loss": 0.8209, + "step": 13297 + }, + { + "epoch": 0.6632418952618454, + "grad_norm": 0.4490328456354802, + "learning_rate": 1.3457080710063346e-05, + "loss": 0.8527, + "step": 13298 + }, + { + "epoch": 0.663291770573566, + "grad_norm": 0.5304596611244072, + "learning_rate": 1.3453498642987957e-05, + "loss": 0.8835, + "step": 13299 + }, + { + "epoch": 0.6633416458852868, + "grad_norm": 0.4977791011243154, + "learning_rate": 1.3449916877213353e-05, + "loss": 0.8591, + "step": 13300 + }, + { + "epoch": 0.6633915211970075, + "grad_norm": 0.4507215543647218, + "learning_rate": 1.3446335412832989e-05, + "loss": 0.8262, + "step": 13301 + }, + { + "epoch": 0.6634413965087281, + "grad_norm": 0.5334224324906262, + "learning_rate": 1.3442754249940338e-05, + "loss": 0.8278, + "step": 13302 + }, + { + "epoch": 0.6634912718204489, + "grad_norm": 0.39502017572741993, + "learning_rate": 1.3439173388628844e-05, + "loss": 0.8543, + "step": 13303 + }, + { + "epoch": 0.6635411471321696, + "grad_norm": 1.6812878394980324, + "learning_rate": 1.343559282899195e-05, + "loss": 0.8692, + "step": 13304 + }, + { + "epoch": 0.6635910224438902, + "grad_norm": 0.44818141648472154, + "learning_rate": 1.343201257112307e-05, + "loss": 0.8683, + "step": 13305 + }, + { + "epoch": 0.663640897755611, + "grad_norm": 0.5006034078786976, + "learning_rate": 1.342843261511567e-05, + "loss": 0.8907, + "step": 13306 + }, + { + "epoch": 0.6636907730673317, + "grad_norm": 0.44515332345072156, + "learning_rate": 1.3424852961063119e-05, + "loss": 0.8305, + "step": 13307 + }, + { + "epoch": 0.6637406483790523, + "grad_norm": 0.4735767384057849, + "learning_rate": 1.342127360905886e-05, + "loss": 0.8121, + "step": 13308 + }, + { + "epoch": 0.6637905236907731, + "grad_norm": 0.8959164276962718, + "learning_rate": 1.3417694559196275e-05, + "loss": 0.8998, + "step": 13309 + }, + { + "epoch": 0.6638403990024938, + "grad_norm": 0.4760143297338361, + "learning_rate": 1.3414115811568786e-05, + "loss": 0.8071, + "step": 13310 + }, + { + "epoch": 0.6638902743142144, + "grad_norm": 0.39287116937512295, + "learning_rate": 1.3410537366269743e-05, + "loss": 0.8257, + "step": 13311 + }, + { + "epoch": 0.6639401496259352, + "grad_norm": 0.4693260716931225, + "learning_rate": 1.3406959223392546e-05, + "loss": 0.852, + "step": 13312 + }, + { + "epoch": 0.6639900249376559, + "grad_norm": 0.4153743544783579, + "learning_rate": 1.3403381383030553e-05, + "loss": 0.8682, + "step": 13313 + }, + { + "epoch": 0.6640399002493765, + "grad_norm": 0.42937610451493025, + "learning_rate": 1.3399803845277151e-05, + "loss": 0.8958, + "step": 13314 + }, + { + "epoch": 0.6640897755610973, + "grad_norm": 0.4563882920165198, + "learning_rate": 1.3396226610225658e-05, + "loss": 0.8515, + "step": 13315 + }, + { + "epoch": 0.664139650872818, + "grad_norm": 0.6351811944871444, + "learning_rate": 1.3392649677969444e-05, + "loss": 0.8533, + "step": 13316 + }, + { + "epoch": 0.6641895261845386, + "grad_norm": 0.4719243280447859, + "learning_rate": 1.3389073048601847e-05, + "loss": 0.8491, + "step": 13317 + }, + { + "epoch": 0.6642394014962594, + "grad_norm": 0.4254193159204392, + "learning_rate": 1.338549672221618e-05, + "loss": 0.8274, + "step": 13318 + }, + { + "epoch": 0.66428927680798, + "grad_norm": 0.47757223382257447, + "learning_rate": 1.3381920698905787e-05, + "loss": 0.8946, + "step": 13319 + }, + { + "epoch": 0.6643391521197007, + "grad_norm": 0.46394249580773494, + "learning_rate": 1.3378344978763976e-05, + "loss": 0.853, + "step": 13320 + }, + { + "epoch": 0.6643890274314215, + "grad_norm": 0.5286801586706469, + "learning_rate": 1.3374769561884052e-05, + "loss": 0.8714, + "step": 13321 + }, + { + "epoch": 0.6644389027431421, + "grad_norm": 0.39463058854899385, + "learning_rate": 1.3371194448359303e-05, + "loss": 0.8467, + "step": 13322 + }, + { + "epoch": 0.6644887780548628, + "grad_norm": 0.5759945723770505, + "learning_rate": 1.3367619638283039e-05, + "loss": 0.8494, + "step": 13323 + }, + { + "epoch": 0.6645386533665836, + "grad_norm": 0.5598441574850584, + "learning_rate": 1.3364045131748534e-05, + "loss": 0.8292, + "step": 13324 + }, + { + "epoch": 0.6645885286783042, + "grad_norm": 0.5667960445397551, + "learning_rate": 1.3360470928849067e-05, + "loss": 0.8295, + "step": 13325 + }, + { + "epoch": 0.6646384039900249, + "grad_norm": 0.4778097108580464, + "learning_rate": 1.3356897029677889e-05, + "loss": 0.8257, + "step": 13326 + }, + { + "epoch": 0.6646882793017457, + "grad_norm": 0.5183651485129851, + "learning_rate": 1.3353323434328283e-05, + "loss": 0.8833, + "step": 13327 + }, + { + "epoch": 0.6647381546134663, + "grad_norm": 0.445424942673238, + "learning_rate": 1.3349750142893491e-05, + "loss": 0.8581, + "step": 13328 + }, + { + "epoch": 0.664788029925187, + "grad_norm": 0.5461884299768736, + "learning_rate": 1.3346177155466755e-05, + "loss": 0.8712, + "step": 13329 + }, + { + "epoch": 0.6648379052369078, + "grad_norm": 0.42354541106766785, + "learning_rate": 1.3342604472141295e-05, + "loss": 0.8337, + "step": 13330 + }, + { + "epoch": 0.6648877805486284, + "grad_norm": 0.5861479642925591, + "learning_rate": 1.3339032093010374e-05, + "loss": 0.8215, + "step": 13331 + }, + { + "epoch": 0.6649376558603491, + "grad_norm": 0.4519469563288054, + "learning_rate": 1.3335460018167183e-05, + "loss": 0.835, + "step": 13332 + }, + { + "epoch": 0.6649875311720699, + "grad_norm": 0.45778801833251437, + "learning_rate": 1.3331888247704945e-05, + "loss": 0.8514, + "step": 13333 + }, + { + "epoch": 0.6650374064837905, + "grad_norm": 0.4664251665233323, + "learning_rate": 1.3328316781716854e-05, + "loss": 0.8724, + "step": 13334 + }, + { + "epoch": 0.6650872817955112, + "grad_norm": 0.47931715456927715, + "learning_rate": 1.332474562029612e-05, + "loss": 0.868, + "step": 13335 + }, + { + "epoch": 0.6651371571072319, + "grad_norm": 0.46365387036069133, + "learning_rate": 1.3321174763535927e-05, + "loss": 0.8635, + "step": 13336 + }, + { + "epoch": 0.6651870324189526, + "grad_norm": 0.47290186550259705, + "learning_rate": 1.331760421152945e-05, + "loss": 0.8735, + "step": 13337 + }, + { + "epoch": 0.6652369077306733, + "grad_norm": 0.5295499668127059, + "learning_rate": 1.3314033964369852e-05, + "loss": 0.8965, + "step": 13338 + }, + { + "epoch": 0.665286783042394, + "grad_norm": 0.5185576830731925, + "learning_rate": 1.3310464022150319e-05, + "loss": 0.8214, + "step": 13339 + }, + { + "epoch": 0.6653366583541147, + "grad_norm": 0.5110189749820593, + "learning_rate": 1.330689438496399e-05, + "loss": 0.8194, + "step": 13340 + }, + { + "epoch": 0.6653865336658354, + "grad_norm": 0.5399045161238033, + "learning_rate": 1.3303325052904025e-05, + "loss": 0.8752, + "step": 13341 + }, + { + "epoch": 0.6654364089775561, + "grad_norm": 0.4850293018751299, + "learning_rate": 1.3299756026063548e-05, + "loss": 0.8771, + "step": 13342 + }, + { + "epoch": 0.6654862842892768, + "grad_norm": 0.4712233688383321, + "learning_rate": 1.3296187304535723e-05, + "loss": 0.868, + "step": 13343 + }, + { + "epoch": 0.6655361596009975, + "grad_norm": 0.518449412395283, + "learning_rate": 1.3292618888413627e-05, + "loss": 0.8387, + "step": 13344 + }, + { + "epoch": 0.6655860349127182, + "grad_norm": 0.5686583597741757, + "learning_rate": 1.3289050777790412e-05, + "loss": 0.871, + "step": 13345 + }, + { + "epoch": 0.6656359102244389, + "grad_norm": 0.490557914410546, + "learning_rate": 1.3285482972759172e-05, + "loss": 0.8678, + "step": 13346 + }, + { + "epoch": 0.6656857855361596, + "grad_norm": 0.6087907057493203, + "learning_rate": 1.3281915473413026e-05, + "loss": 0.8545, + "step": 13347 + }, + { + "epoch": 0.6657356608478803, + "grad_norm": 0.5005408411539892, + "learning_rate": 1.3278348279845031e-05, + "loss": 0.8508, + "step": 13348 + }, + { + "epoch": 0.665785536159601, + "grad_norm": 0.48403493137232384, + "learning_rate": 1.3274781392148306e-05, + "loss": 0.8624, + "step": 13349 + }, + { + "epoch": 0.6658354114713217, + "grad_norm": 0.4125600118640071, + "learning_rate": 1.32712148104159e-05, + "loss": 0.8377, + "step": 13350 + }, + { + "epoch": 0.6658852867830424, + "grad_norm": 0.48517178544158773, + "learning_rate": 1.3267648534740911e-05, + "loss": 0.8403, + "step": 13351 + }, + { + "epoch": 0.6659351620947631, + "grad_norm": 0.5498994437034133, + "learning_rate": 1.3264082565216368e-05, + "loss": 0.8366, + "step": 13352 + }, + { + "epoch": 0.6659850374064837, + "grad_norm": 0.45106624757482977, + "learning_rate": 1.3260516901935346e-05, + "loss": 0.8915, + "step": 13353 + }, + { + "epoch": 0.6660349127182045, + "grad_norm": 0.4491013518049623, + "learning_rate": 1.3256951544990881e-05, + "loss": 0.8766, + "step": 13354 + }, + { + "epoch": 0.6660847880299252, + "grad_norm": 0.5860623824164418, + "learning_rate": 1.3253386494476e-05, + "loss": 0.8817, + "step": 13355 + }, + { + "epoch": 0.6661346633416458, + "grad_norm": 0.4957410900689053, + "learning_rate": 1.3249821750483753e-05, + "loss": 0.8289, + "step": 13356 + }, + { + "epoch": 0.6661845386533666, + "grad_norm": 0.5196641952468617, + "learning_rate": 1.324625731310715e-05, + "loss": 0.8362, + "step": 13357 + }, + { + "epoch": 0.6662344139650873, + "grad_norm": 0.4354244872349792, + "learning_rate": 1.3242693182439198e-05, + "loss": 0.876, + "step": 13358 + }, + { + "epoch": 0.6662842892768079, + "grad_norm": 0.5353697886436722, + "learning_rate": 1.32391293585729e-05, + "loss": 0.9179, + "step": 13359 + }, + { + "epoch": 0.6663341645885287, + "grad_norm": 0.4394227213839699, + "learning_rate": 1.3235565841601264e-05, + "loss": 0.8715, + "step": 13360 + }, + { + "epoch": 0.6663840399002494, + "grad_norm": 0.4782083321687699, + "learning_rate": 1.3232002631617275e-05, + "loss": 0.8505, + "step": 13361 + }, + { + "epoch": 0.66643391521197, + "grad_norm": 0.47880942414671873, + "learning_rate": 1.322843972871391e-05, + "loss": 0.8696, + "step": 13362 + }, + { + "epoch": 0.6664837905236908, + "grad_norm": 0.5132834656301626, + "learning_rate": 1.3224877132984132e-05, + "loss": 0.862, + "step": 13363 + }, + { + "epoch": 0.6665336658354115, + "grad_norm": 0.42161861964289643, + "learning_rate": 1.3221314844520922e-05, + "loss": 0.8554, + "step": 13364 + }, + { + "epoch": 0.6665835411471321, + "grad_norm": 0.46163988948808143, + "learning_rate": 1.3217752863417232e-05, + "loss": 0.8596, + "step": 13365 + }, + { + "epoch": 0.6666334164588529, + "grad_norm": 0.7494650913727241, + "learning_rate": 1.3214191189766009e-05, + "loss": 0.8939, + "step": 13366 + }, + { + "epoch": 0.6666832917705736, + "grad_norm": 0.4824591277351491, + "learning_rate": 1.321062982366018e-05, + "loss": 0.8528, + "step": 13367 + }, + { + "epoch": 0.6667331670822942, + "grad_norm": 0.42348519757052616, + "learning_rate": 1.3207068765192704e-05, + "loss": 0.8334, + "step": 13368 + }, + { + "epoch": 0.666783042394015, + "grad_norm": 0.49743550570901346, + "learning_rate": 1.320350801445649e-05, + "loss": 0.8823, + "step": 13369 + }, + { + "epoch": 0.6668329177057356, + "grad_norm": 0.4886130611747439, + "learning_rate": 1.3199947571544452e-05, + "loss": 0.8865, + "step": 13370 + }, + { + "epoch": 0.6668827930174563, + "grad_norm": 0.45893275808985556, + "learning_rate": 1.319638743654949e-05, + "loss": 0.8381, + "step": 13371 + }, + { + "epoch": 0.6669326683291771, + "grad_norm": 0.4366876413260905, + "learning_rate": 1.3192827609564529e-05, + "loss": 0.9122, + "step": 13372 + }, + { + "epoch": 0.6669825436408977, + "grad_norm": 1.0026082096107523, + "learning_rate": 1.3189268090682444e-05, + "loss": 0.884, + "step": 13373 + }, + { + "epoch": 0.6670324189526184, + "grad_norm": 0.4750418117212545, + "learning_rate": 1.3185708879996122e-05, + "loss": 0.8397, + "step": 13374 + }, + { + "epoch": 0.6670822942643392, + "grad_norm": 0.4418228492295998, + "learning_rate": 1.3182149977598435e-05, + "loss": 0.8566, + "step": 13375 + }, + { + "epoch": 0.6671321695760598, + "grad_norm": 0.5317050363478142, + "learning_rate": 1.317859138358227e-05, + "loss": 0.844, + "step": 13376 + }, + { + "epoch": 0.6671820448877805, + "grad_norm": 0.409172771338602, + "learning_rate": 1.3175033098040454e-05, + "loss": 0.8893, + "step": 13377 + }, + { + "epoch": 0.6672319201995013, + "grad_norm": 0.40953999944909253, + "learning_rate": 1.3171475121065868e-05, + "loss": 0.8314, + "step": 13378 + }, + { + "epoch": 0.6672817955112219, + "grad_norm": 0.43523798715347556, + "learning_rate": 1.3167917452751333e-05, + "loss": 0.9009, + "step": 13379 + }, + { + "epoch": 0.6673316708229426, + "grad_norm": 0.5244326354054364, + "learning_rate": 1.3164360093189717e-05, + "loss": 0.8457, + "step": 13380 + }, + { + "epoch": 0.6673815461346634, + "grad_norm": 0.606253162140063, + "learning_rate": 1.3160803042473809e-05, + "loss": 0.8777, + "step": 13381 + }, + { + "epoch": 0.667431421446384, + "grad_norm": 0.4856056128395187, + "learning_rate": 1.3157246300696458e-05, + "loss": 0.8528, + "step": 13382 + }, + { + "epoch": 0.6674812967581047, + "grad_norm": 0.469909038615415, + "learning_rate": 1.3153689867950452e-05, + "loss": 0.8843, + "step": 13383 + }, + { + "epoch": 0.6675311720698255, + "grad_norm": 0.8313394563842988, + "learning_rate": 1.315013374432863e-05, + "loss": 0.8651, + "step": 13384 + }, + { + "epoch": 0.6675810473815461, + "grad_norm": 0.4669338411966508, + "learning_rate": 1.3146577929923743e-05, + "loss": 0.8741, + "step": 13385 + }, + { + "epoch": 0.6676309226932668, + "grad_norm": 0.5205777695370414, + "learning_rate": 1.3143022424828609e-05, + "loss": 0.8463, + "step": 13386 + }, + { + "epoch": 0.6676807980049876, + "grad_norm": 0.3848792402818652, + "learning_rate": 1.3139467229135999e-05, + "loss": 0.82, + "step": 13387 + }, + { + "epoch": 0.6677306733167082, + "grad_norm": 0.4608045141240229, + "learning_rate": 1.3135912342938684e-05, + "loss": 0.8489, + "step": 13388 + }, + { + "epoch": 0.667780548628429, + "grad_norm": 0.4192394914724491, + "learning_rate": 1.3132357766329417e-05, + "loss": 0.8278, + "step": 13389 + }, + { + "epoch": 0.6678304239401496, + "grad_norm": 0.5426602453364908, + "learning_rate": 1.3128803499400971e-05, + "loss": 0.8731, + "step": 13390 + }, + { + "epoch": 0.6678802992518703, + "grad_norm": 0.656809384696478, + "learning_rate": 1.3125249542246088e-05, + "loss": 0.8395, + "step": 13391 + }, + { + "epoch": 0.667930174563591, + "grad_norm": 0.44764563310952066, + "learning_rate": 1.3121695894957488e-05, + "loss": 0.8923, + "step": 13392 + }, + { + "epoch": 0.6679800498753117, + "grad_norm": 0.524372389191475, + "learning_rate": 1.3118142557627933e-05, + "loss": 0.8954, + "step": 13393 + }, + { + "epoch": 0.6680299251870324, + "grad_norm": 0.4022957195139662, + "learning_rate": 1.311458953035013e-05, + "loss": 0.8334, + "step": 13394 + }, + { + "epoch": 0.6680798004987532, + "grad_norm": 0.7175938830157171, + "learning_rate": 1.3111036813216792e-05, + "loss": 0.8531, + "step": 13395 + }, + { + "epoch": 0.6681296758104738, + "grad_norm": 0.45945340381290356, + "learning_rate": 1.310748440632062e-05, + "loss": 0.8569, + "step": 13396 + }, + { + "epoch": 0.6681795511221945, + "grad_norm": 0.5073217364832519, + "learning_rate": 1.3103932309754327e-05, + "loss": 0.8512, + "step": 13397 + }, + { + "epoch": 0.6682294264339153, + "grad_norm": 0.5196452526527098, + "learning_rate": 1.3100380523610597e-05, + "loss": 0.8609, + "step": 13398 + }, + { + "epoch": 0.6682793017456359, + "grad_norm": 0.47209645549532225, + "learning_rate": 1.3096829047982112e-05, + "loss": 0.8509, + "step": 13399 + }, + { + "epoch": 0.6683291770573566, + "grad_norm": 0.42715271312353015, + "learning_rate": 1.3093277882961536e-05, + "loss": 0.8458, + "step": 13400 + }, + { + "epoch": 0.6683790523690774, + "grad_norm": 0.5138570244637869, + "learning_rate": 1.3089727028641557e-05, + "loss": 0.837, + "step": 13401 + }, + { + "epoch": 0.668428927680798, + "grad_norm": 0.6498444212018257, + "learning_rate": 1.3086176485114818e-05, + "loss": 0.8546, + "step": 13402 + }, + { + "epoch": 0.6684788029925187, + "grad_norm": 0.4923566915634323, + "learning_rate": 1.3082626252473972e-05, + "loss": 0.8495, + "step": 13403 + }, + { + "epoch": 0.6685286783042395, + "grad_norm": 0.4614247837091581, + "learning_rate": 1.3079076330811651e-05, + "loss": 0.8917, + "step": 13404 + }, + { + "epoch": 0.6685785536159601, + "grad_norm": 0.6223709375514971, + "learning_rate": 1.307552672022051e-05, + "loss": 0.7884, + "step": 13405 + }, + { + "epoch": 0.6686284289276808, + "grad_norm": 0.7870638014540612, + "learning_rate": 1.3071977420793158e-05, + "loss": 0.845, + "step": 13406 + }, + { + "epoch": 0.6686783042394014, + "grad_norm": 0.39819009658416427, + "learning_rate": 1.306842843262222e-05, + "loss": 0.8501, + "step": 13407 + }, + { + "epoch": 0.6687281795511222, + "grad_norm": 0.5200346549428871, + "learning_rate": 1.3064879755800294e-05, + "loss": 0.85, + "step": 13408 + }, + { + "epoch": 0.6687780548628429, + "grad_norm": 0.5862299490361509, + "learning_rate": 1.3061331390419996e-05, + "loss": 0.8862, + "step": 13409 + }, + { + "epoch": 0.6688279301745635, + "grad_norm": 0.501826019955013, + "learning_rate": 1.3057783336573914e-05, + "loss": 0.8645, + "step": 13410 + }, + { + "epoch": 0.6688778054862843, + "grad_norm": 0.4410339175325943, + "learning_rate": 1.305423559435463e-05, + "loss": 0.8457, + "step": 13411 + }, + { + "epoch": 0.668927680798005, + "grad_norm": 0.5160322942969872, + "learning_rate": 1.3050688163854713e-05, + "loss": 0.8869, + "step": 13412 + }, + { + "epoch": 0.6689775561097256, + "grad_norm": 0.6567880213095304, + "learning_rate": 1.3047141045166761e-05, + "loss": 0.8522, + "step": 13413 + }, + { + "epoch": 0.6690274314214464, + "grad_norm": 0.4038869454661125, + "learning_rate": 1.3043594238383295e-05, + "loss": 0.8352, + "step": 13414 + }, + { + "epoch": 0.6690773067331671, + "grad_norm": 0.5018305444237902, + "learning_rate": 1.3040047743596893e-05, + "loss": 0.8427, + "step": 13415 + }, + { + "epoch": 0.6691271820448877, + "grad_norm": 0.48043080927911785, + "learning_rate": 1.3036501560900084e-05, + "loss": 0.8143, + "step": 13416 + }, + { + "epoch": 0.6691770573566085, + "grad_norm": 0.5571355165361621, + "learning_rate": 1.3032955690385431e-05, + "loss": 0.8407, + "step": 13417 + }, + { + "epoch": 0.6692269326683292, + "grad_norm": 0.3919512908515702, + "learning_rate": 1.3029410132145425e-05, + "loss": 0.8427, + "step": 13418 + }, + { + "epoch": 0.6692768079800498, + "grad_norm": 0.5455518844652292, + "learning_rate": 1.3025864886272616e-05, + "loss": 0.8899, + "step": 13419 + }, + { + "epoch": 0.6693266832917706, + "grad_norm": 0.6361386151931989, + "learning_rate": 1.3022319952859491e-05, + "loss": 0.8976, + "step": 13420 + }, + { + "epoch": 0.6693765586034913, + "grad_norm": 0.46887566106294865, + "learning_rate": 1.301877533199859e-05, + "loss": 0.8764, + "step": 13421 + }, + { + "epoch": 0.669426433915212, + "grad_norm": 0.424532192951986, + "learning_rate": 1.301523102378236e-05, + "loss": 0.8548, + "step": 13422 + }, + { + "epoch": 0.6694763092269327, + "grad_norm": 0.43288028434269293, + "learning_rate": 1.3011687028303327e-05, + "loss": 0.8917, + "step": 13423 + }, + { + "epoch": 0.6695261845386533, + "grad_norm": 0.4641225786840745, + "learning_rate": 1.300814334565395e-05, + "loss": 0.8695, + "step": 13424 + }, + { + "epoch": 0.669576059850374, + "grad_norm": 0.6129223982891386, + "learning_rate": 1.3004599975926713e-05, + "loss": 0.8783, + "step": 13425 + }, + { + "epoch": 0.6696259351620948, + "grad_norm": 0.626813033809142, + "learning_rate": 1.3001056919214055e-05, + "loss": 0.8616, + "step": 13426 + }, + { + "epoch": 0.6696758104738154, + "grad_norm": 0.4864353743321871, + "learning_rate": 1.299751417560846e-05, + "loss": 0.8449, + "step": 13427 + }, + { + "epoch": 0.6697256857855362, + "grad_norm": 0.526760983132457, + "learning_rate": 1.299397174520236e-05, + "loss": 0.8837, + "step": 13428 + }, + { + "epoch": 0.6697755610972569, + "grad_norm": 0.5487330769888998, + "learning_rate": 1.2990429628088194e-05, + "loss": 0.8681, + "step": 13429 + }, + { + "epoch": 0.6698254364089775, + "grad_norm": 0.743077352333597, + "learning_rate": 1.2986887824358382e-05, + "loss": 0.8445, + "step": 13430 + }, + { + "epoch": 0.6698753117206983, + "grad_norm": 0.49392362781554544, + "learning_rate": 1.2983346334105367e-05, + "loss": 0.8977, + "step": 13431 + }, + { + "epoch": 0.669925187032419, + "grad_norm": 0.4084821577890325, + "learning_rate": 1.2979805157421549e-05, + "loss": 0.8356, + "step": 13432 + }, + { + "epoch": 0.6699750623441396, + "grad_norm": 0.44942946968428854, + "learning_rate": 1.297626429439933e-05, + "loss": 0.8255, + "step": 13433 + }, + { + "epoch": 0.6700249376558604, + "grad_norm": 0.5329702563482339, + "learning_rate": 1.2972723745131121e-05, + "loss": 0.8631, + "step": 13434 + }, + { + "epoch": 0.6700748129675811, + "grad_norm": 0.750619200126825, + "learning_rate": 1.29691835097093e-05, + "loss": 0.8568, + "step": 13435 + }, + { + "epoch": 0.6701246882793017, + "grad_norm": 0.5043735577028429, + "learning_rate": 1.2965643588226256e-05, + "loss": 0.8551, + "step": 13436 + }, + { + "epoch": 0.6701745635910225, + "grad_norm": 0.4547438633318989, + "learning_rate": 1.2962103980774343e-05, + "loss": 0.8701, + "step": 13437 + }, + { + "epoch": 0.6702244389027432, + "grad_norm": 0.591429804968777, + "learning_rate": 1.2958564687445951e-05, + "loss": 0.8838, + "step": 13438 + }, + { + "epoch": 0.6702743142144638, + "grad_norm": 0.4719689773457688, + "learning_rate": 1.2955025708333424e-05, + "loss": 0.8516, + "step": 13439 + }, + { + "epoch": 0.6703241895261846, + "grad_norm": 0.4490109854890103, + "learning_rate": 1.2951487043529109e-05, + "loss": 0.8614, + "step": 13440 + }, + { + "epoch": 0.6703740648379052, + "grad_norm": 0.496646841747762, + "learning_rate": 1.294794869312534e-05, + "loss": 0.8735, + "step": 13441 + }, + { + "epoch": 0.6704239401496259, + "grad_norm": 1.933458587358827, + "learning_rate": 1.2944410657214468e-05, + "loss": 0.8519, + "step": 13442 + }, + { + "epoch": 0.6704738154613467, + "grad_norm": 1.1419517380396886, + "learning_rate": 1.2940872935888797e-05, + "loss": 0.8491, + "step": 13443 + }, + { + "epoch": 0.6705236907730673, + "grad_norm": 0.5557715710314058, + "learning_rate": 1.2937335529240658e-05, + "loss": 0.7987, + "step": 13444 + }, + { + "epoch": 0.670573566084788, + "grad_norm": 0.5624942623768079, + "learning_rate": 1.2933798437362334e-05, + "loss": 0.8482, + "step": 13445 + }, + { + "epoch": 0.6706234413965088, + "grad_norm": 0.4416221482191683, + "learning_rate": 1.2930261660346163e-05, + "loss": 0.8712, + "step": 13446 + }, + { + "epoch": 0.6706733167082294, + "grad_norm": 0.4499666947000994, + "learning_rate": 1.292672519828439e-05, + "loss": 0.8177, + "step": 13447 + }, + { + "epoch": 0.6707231920199501, + "grad_norm": 0.4191034751641467, + "learning_rate": 1.292318905126933e-05, + "loss": 0.8794, + "step": 13448 + }, + { + "epoch": 0.6707730673316709, + "grad_norm": 0.5804793470606793, + "learning_rate": 1.291965321939324e-05, + "loss": 0.9354, + "step": 13449 + }, + { + "epoch": 0.6708229426433915, + "grad_norm": 0.5053837061529017, + "learning_rate": 1.291611770274841e-05, + "loss": 0.8371, + "step": 13450 + }, + { + "epoch": 0.6708728179551122, + "grad_norm": 0.5092861871200896, + "learning_rate": 1.2912582501427062e-05, + "loss": 0.8656, + "step": 13451 + }, + { + "epoch": 0.670922693266833, + "grad_norm": 0.48296973441933644, + "learning_rate": 1.2909047615521468e-05, + "loss": 0.8542, + "step": 13452 + }, + { + "epoch": 0.6709725685785536, + "grad_norm": 0.491192984849533, + "learning_rate": 1.2905513045123862e-05, + "loss": 0.8564, + "step": 13453 + }, + { + "epoch": 0.6710224438902743, + "grad_norm": 0.6072148868757635, + "learning_rate": 1.29019787903265e-05, + "loss": 0.8666, + "step": 13454 + }, + { + "epoch": 0.6710723192019951, + "grad_norm": 0.5394552989073994, + "learning_rate": 1.2898444851221565e-05, + "loss": 0.8725, + "step": 13455 + }, + { + "epoch": 0.6711221945137157, + "grad_norm": 0.4989499678841252, + "learning_rate": 1.2894911227901307e-05, + "loss": 0.8408, + "step": 13456 + }, + { + "epoch": 0.6711720698254364, + "grad_norm": 0.4458243847773554, + "learning_rate": 1.2891377920457912e-05, + "loss": 0.8563, + "step": 13457 + }, + { + "epoch": 0.6712219451371572, + "grad_norm": 0.4830079358682828, + "learning_rate": 1.2887844928983611e-05, + "loss": 0.8539, + "step": 13458 + }, + { + "epoch": 0.6712718204488778, + "grad_norm": 0.47384256540123576, + "learning_rate": 1.2884312253570557e-05, + "loss": 0.8504, + "step": 13459 + }, + { + "epoch": 0.6713216957605985, + "grad_norm": 0.4283673795590361, + "learning_rate": 1.2880779894310963e-05, + "loss": 0.8714, + "step": 13460 + }, + { + "epoch": 0.6713715710723192, + "grad_norm": 0.4035864944554718, + "learning_rate": 1.2877247851296992e-05, + "loss": 0.8076, + "step": 13461 + }, + { + "epoch": 0.6714214463840399, + "grad_norm": 0.4396487529031458, + "learning_rate": 1.287371612462081e-05, + "loss": 0.8972, + "step": 13462 + }, + { + "epoch": 0.6714713216957606, + "grad_norm": 0.5508956075857186, + "learning_rate": 1.2870184714374573e-05, + "loss": 0.9204, + "step": 13463 + }, + { + "epoch": 0.6715211970074813, + "grad_norm": 0.4656971780265458, + "learning_rate": 1.286665362065045e-05, + "loss": 0.8384, + "step": 13464 + }, + { + "epoch": 0.671571072319202, + "grad_norm": 0.48758537917409994, + "learning_rate": 1.2863122843540565e-05, + "loss": 0.8114, + "step": 13465 + }, + { + "epoch": 0.6716209476309227, + "grad_norm": 0.4670199876567125, + "learning_rate": 1.2859592383137059e-05, + "loss": 0.8763, + "step": 13466 + }, + { + "epoch": 0.6716708229426434, + "grad_norm": 0.5243308230783319, + "learning_rate": 1.2856062239532045e-05, + "loss": 0.8304, + "step": 13467 + }, + { + "epoch": 0.6717206982543641, + "grad_norm": 0.4444191996862723, + "learning_rate": 1.2852532412817663e-05, + "loss": 0.8508, + "step": 13468 + }, + { + "epoch": 0.6717705735660848, + "grad_norm": 0.517075342500813, + "learning_rate": 1.2849002903086014e-05, + "loss": 0.8655, + "step": 13469 + }, + { + "epoch": 0.6718204488778055, + "grad_norm": 0.47193733025231305, + "learning_rate": 1.2845473710429185e-05, + "loss": 0.8964, + "step": 13470 + }, + { + "epoch": 0.6718703241895262, + "grad_norm": 0.5155041819337803, + "learning_rate": 1.284194483493929e-05, + "loss": 0.8571, + "step": 13471 + }, + { + "epoch": 0.6719201995012469, + "grad_norm": 0.45698980474272294, + "learning_rate": 1.28384162767084e-05, + "loss": 0.8596, + "step": 13472 + }, + { + "epoch": 0.6719700748129676, + "grad_norm": 0.6729565932070878, + "learning_rate": 1.2834888035828596e-05, + "loss": 0.8801, + "step": 13473 + }, + { + "epoch": 0.6720199501246883, + "grad_norm": 0.7110951450925312, + "learning_rate": 1.2831360112391937e-05, + "loss": 0.9189, + "step": 13474 + }, + { + "epoch": 0.672069825436409, + "grad_norm": 0.4248607826581915, + "learning_rate": 1.2827832506490498e-05, + "loss": 0.8439, + "step": 13475 + }, + { + "epoch": 0.6721197007481297, + "grad_norm": 0.4703619459822978, + "learning_rate": 1.282430521821632e-05, + "loss": 0.8961, + "step": 13476 + }, + { + "epoch": 0.6721695760598504, + "grad_norm": 0.5219348988335927, + "learning_rate": 1.2820778247661452e-05, + "loss": 0.8838, + "step": 13477 + }, + { + "epoch": 0.672219451371571, + "grad_norm": 0.4714661142412105, + "learning_rate": 1.2817251594917915e-05, + "loss": 0.8185, + "step": 13478 + }, + { + "epoch": 0.6722693266832918, + "grad_norm": 0.42164360125022343, + "learning_rate": 1.2813725260077752e-05, + "loss": 0.8178, + "step": 13479 + }, + { + "epoch": 0.6723192019950125, + "grad_norm": 0.41643265646158867, + "learning_rate": 1.2810199243232979e-05, + "loss": 0.829, + "step": 13480 + }, + { + "epoch": 0.6723690773067331, + "grad_norm": 0.4787935302661048, + "learning_rate": 1.2806673544475594e-05, + "loss": 0.8521, + "step": 13481 + }, + { + "epoch": 0.6724189526184539, + "grad_norm": 0.3882398930967511, + "learning_rate": 1.2803148163897605e-05, + "loss": 0.8348, + "step": 13482 + }, + { + "epoch": 0.6724688279301746, + "grad_norm": 0.6461755325718312, + "learning_rate": 1.2799623101591019e-05, + "loss": 0.8569, + "step": 13483 + }, + { + "epoch": 0.6725187032418952, + "grad_norm": 0.6732396817567049, + "learning_rate": 1.279609835764779e-05, + "loss": 0.8579, + "step": 13484 + }, + { + "epoch": 0.672568578553616, + "grad_norm": 0.5627235608711632, + "learning_rate": 1.2792573932159924e-05, + "loss": 0.9015, + "step": 13485 + }, + { + "epoch": 0.6726184538653367, + "grad_norm": 0.47072359804370156, + "learning_rate": 1.2789049825219368e-05, + "loss": 0.8874, + "step": 13486 + }, + { + "epoch": 0.6726683291770573, + "grad_norm": 0.44345991524446005, + "learning_rate": 1.278552603691811e-05, + "loss": 0.8123, + "step": 13487 + }, + { + "epoch": 0.6727182044887781, + "grad_norm": 0.4449589906676882, + "learning_rate": 1.2782002567348065e-05, + "loss": 0.8508, + "step": 13488 + }, + { + "epoch": 0.6727680798004988, + "grad_norm": 0.4955767303103423, + "learning_rate": 1.2778479416601202e-05, + "loss": 0.8548, + "step": 13489 + }, + { + "epoch": 0.6728179551122194, + "grad_norm": 0.43638486540864196, + "learning_rate": 1.2774956584769443e-05, + "loss": 0.8705, + "step": 13490 + }, + { + "epoch": 0.6728678304239402, + "grad_norm": 0.4365272963520454, + "learning_rate": 1.2771434071944738e-05, + "loss": 0.8612, + "step": 13491 + }, + { + "epoch": 0.6729177057356609, + "grad_norm": 0.4554012382743406, + "learning_rate": 1.276791187821897e-05, + "loss": 0.8726, + "step": 13492 + }, + { + "epoch": 0.6729675810473815, + "grad_norm": 0.9124320000415108, + "learning_rate": 1.2764390003684077e-05, + "loss": 0.8745, + "step": 13493 + }, + { + "epoch": 0.6730174563591023, + "grad_norm": 0.4857537147943334, + "learning_rate": 1.276086844843195e-05, + "loss": 0.8384, + "step": 13494 + }, + { + "epoch": 0.6730673316708229, + "grad_norm": 0.4133458600673774, + "learning_rate": 1.2757347212554483e-05, + "loss": 0.7809, + "step": 13495 + }, + { + "epoch": 0.6731172069825436, + "grad_norm": 0.5046417580646954, + "learning_rate": 1.2753826296143551e-05, + "loss": 0.8805, + "step": 13496 + }, + { + "epoch": 0.6731670822942644, + "grad_norm": 0.47257930575226714, + "learning_rate": 1.2750305699291054e-05, + "loss": 0.8781, + "step": 13497 + }, + { + "epoch": 0.673216957605985, + "grad_norm": 0.46852362950184445, + "learning_rate": 1.2746785422088844e-05, + "loss": 0.8459, + "step": 13498 + }, + { + "epoch": 0.6732668329177057, + "grad_norm": 0.46473842067619014, + "learning_rate": 1.2743265464628786e-05, + "loss": 0.8625, + "step": 13499 + }, + { + "epoch": 0.6733167082294265, + "grad_norm": 0.4878261751160069, + "learning_rate": 1.273974582700272e-05, + "loss": 0.8339, + "step": 13500 + }, + { + "epoch": 0.6733665835411471, + "grad_norm": 0.5287761767393758, + "learning_rate": 1.2736226509302513e-05, + "loss": 0.8946, + "step": 13501 + }, + { + "epoch": 0.6734164588528678, + "grad_norm": 0.540815723913034, + "learning_rate": 1.2732707511619982e-05, + "loss": 0.8408, + "step": 13502 + }, + { + "epoch": 0.6734663341645886, + "grad_norm": 0.43277655062370946, + "learning_rate": 1.272918883404696e-05, + "loss": 0.8762, + "step": 13503 + }, + { + "epoch": 0.6735162094763092, + "grad_norm": 0.5100999149068371, + "learning_rate": 1.2725670476675256e-05, + "loss": 0.9012, + "step": 13504 + }, + { + "epoch": 0.6735660847880299, + "grad_norm": 0.45899039799089697, + "learning_rate": 1.2722152439596693e-05, + "loss": 0.8791, + "step": 13505 + }, + { + "epoch": 0.6736159600997507, + "grad_norm": 1.231619574773, + "learning_rate": 1.2718634722903073e-05, + "loss": 0.89, + "step": 13506 + }, + { + "epoch": 0.6736658354114713, + "grad_norm": 0.4932026651116146, + "learning_rate": 1.2715117326686169e-05, + "loss": 0.8607, + "step": 13507 + }, + { + "epoch": 0.673715710723192, + "grad_norm": 0.46781469667199255, + "learning_rate": 1.2711600251037793e-05, + "loss": 0.8954, + "step": 13508 + }, + { + "epoch": 0.6737655860349128, + "grad_norm": 0.41308503440960936, + "learning_rate": 1.2708083496049705e-05, + "loss": 0.8808, + "step": 13509 + }, + { + "epoch": 0.6738154613466334, + "grad_norm": 0.42174037034006673, + "learning_rate": 1.270456706181368e-05, + "loss": 0.833, + "step": 13510 + }, + { + "epoch": 0.6738653366583541, + "grad_norm": 0.5168371908759433, + "learning_rate": 1.2701050948421464e-05, + "loss": 0.811, + "step": 13511 + }, + { + "epoch": 0.6739152119700749, + "grad_norm": 0.42665760601588815, + "learning_rate": 1.269753515596483e-05, + "loss": 0.8352, + "step": 13512 + }, + { + "epoch": 0.6739650872817955, + "grad_norm": 0.6149647103802569, + "learning_rate": 1.2694019684535507e-05, + "loss": 0.8515, + "step": 13513 + }, + { + "epoch": 0.6740149625935162, + "grad_norm": 0.4926423785539242, + "learning_rate": 1.269050453422524e-05, + "loss": 0.852, + "step": 13514 + }, + { + "epoch": 0.6740648379052369, + "grad_norm": 0.4944164192658993, + "learning_rate": 1.2686989705125732e-05, + "loss": 0.8993, + "step": 13515 + }, + { + "epoch": 0.6741147132169576, + "grad_norm": 0.47922525764462875, + "learning_rate": 1.2683475197328729e-05, + "loss": 0.8751, + "step": 13516 + }, + { + "epoch": 0.6741645885286783, + "grad_norm": 0.511214176859124, + "learning_rate": 1.2679961010925928e-05, + "loss": 0.8524, + "step": 13517 + }, + { + "epoch": 0.674214463840399, + "grad_norm": 0.48577751008659936, + "learning_rate": 1.267644714600903e-05, + "loss": 0.8276, + "step": 13518 + }, + { + "epoch": 0.6742643391521197, + "grad_norm": 0.4290702653076547, + "learning_rate": 1.2672933602669721e-05, + "loss": 0.8778, + "step": 13519 + }, + { + "epoch": 0.6743142144638404, + "grad_norm": 0.43408848803751565, + "learning_rate": 1.2669420380999711e-05, + "loss": 0.8648, + "step": 13520 + }, + { + "epoch": 0.6743640897755611, + "grad_norm": 0.6173977117312885, + "learning_rate": 1.2665907481090639e-05, + "loss": 0.8382, + "step": 13521 + }, + { + "epoch": 0.6744139650872818, + "grad_norm": 0.4598305340934397, + "learning_rate": 1.26623949030342e-05, + "loss": 0.8722, + "step": 13522 + }, + { + "epoch": 0.6744638403990025, + "grad_norm": 0.48992530822448344, + "learning_rate": 1.2658882646922034e-05, + "loss": 0.8544, + "step": 13523 + }, + { + "epoch": 0.6745137157107232, + "grad_norm": 0.3902935958259065, + "learning_rate": 1.265537071284582e-05, + "loss": 0.8274, + "step": 13524 + }, + { + "epoch": 0.6745635910224439, + "grad_norm": 0.48926502944046085, + "learning_rate": 1.2651859100897168e-05, + "loss": 0.8307, + "step": 13525 + }, + { + "epoch": 0.6746134663341646, + "grad_norm": 0.4631249882687587, + "learning_rate": 1.2648347811167735e-05, + "loss": 0.8508, + "step": 13526 + }, + { + "epoch": 0.6746633416458853, + "grad_norm": 0.4641277438798397, + "learning_rate": 1.2644836843749126e-05, + "loss": 0.838, + "step": 13527 + }, + { + "epoch": 0.674713216957606, + "grad_norm": 0.8590020999243441, + "learning_rate": 1.2641326198732991e-05, + "loss": 0.8693, + "step": 13528 + }, + { + "epoch": 0.6747630922693267, + "grad_norm": 0.5226514201045794, + "learning_rate": 1.2637815876210898e-05, + "loss": 0.8563, + "step": 13529 + }, + { + "epoch": 0.6748129675810474, + "grad_norm": 0.4134337745663227, + "learning_rate": 1.2634305876274477e-05, + "loss": 0.8481, + "step": 13530 + }, + { + "epoch": 0.6748628428927681, + "grad_norm": 0.45655737114886447, + "learning_rate": 1.263079619901531e-05, + "loss": 0.8461, + "step": 13531 + }, + { + "epoch": 0.6749127182044887, + "grad_norm": 0.45659245778523216, + "learning_rate": 1.262728684452498e-05, + "loss": 0.857, + "step": 13532 + }, + { + "epoch": 0.6749625935162095, + "grad_norm": 0.6516993969900937, + "learning_rate": 1.2623777812895054e-05, + "loss": 0.8636, + "step": 13533 + }, + { + "epoch": 0.6750124688279302, + "grad_norm": 0.5681853188097241, + "learning_rate": 1.2620269104217115e-05, + "loss": 0.8387, + "step": 13534 + }, + { + "epoch": 0.6750623441396508, + "grad_norm": 0.443448098054684, + "learning_rate": 1.2616760718582715e-05, + "loss": 0.8196, + "step": 13535 + }, + { + "epoch": 0.6751122194513716, + "grad_norm": 0.46015208141225916, + "learning_rate": 1.2613252656083402e-05, + "loss": 0.8341, + "step": 13536 + }, + { + "epoch": 0.6751620947630923, + "grad_norm": 0.4909251554239767, + "learning_rate": 1.2609744916810706e-05, + "loss": 0.8415, + "step": 13537 + }, + { + "epoch": 0.6752119700748129, + "grad_norm": 0.578065959517311, + "learning_rate": 1.2606237500856183e-05, + "loss": 0.898, + "step": 13538 + }, + { + "epoch": 0.6752618453865337, + "grad_norm": 0.39987839003004194, + "learning_rate": 1.2602730408311342e-05, + "loss": 0.8621, + "step": 13539 + }, + { + "epoch": 0.6753117206982544, + "grad_norm": 0.6707287071877558, + "learning_rate": 1.2599223639267704e-05, + "loss": 0.8439, + "step": 13540 + }, + { + "epoch": 0.675361596009975, + "grad_norm": 0.46727831974843753, + "learning_rate": 1.259571719381677e-05, + "loss": 0.8884, + "step": 13541 + }, + { + "epoch": 0.6754114713216958, + "grad_norm": 0.546046219391934, + "learning_rate": 1.259221107205005e-05, + "loss": 0.8819, + "step": 13542 + }, + { + "epoch": 0.6754613466334165, + "grad_norm": 0.44206082173685624, + "learning_rate": 1.2588705274059032e-05, + "loss": 0.8685, + "step": 13543 + }, + { + "epoch": 0.6755112219451371, + "grad_norm": 0.48465910583643196, + "learning_rate": 1.2585199799935191e-05, + "loss": 0.8835, + "step": 13544 + }, + { + "epoch": 0.6755610972568579, + "grad_norm": 0.46919622502541897, + "learning_rate": 1.2581694649769998e-05, + "loss": 0.837, + "step": 13545 + }, + { + "epoch": 0.6756109725685786, + "grad_norm": 0.4213106047973423, + "learning_rate": 1.2578189823654934e-05, + "loss": 0.8584, + "step": 13546 + }, + { + "epoch": 0.6756608478802992, + "grad_norm": 0.4502926458806255, + "learning_rate": 1.2574685321681449e-05, + "loss": 0.8567, + "step": 13547 + }, + { + "epoch": 0.67571072319202, + "grad_norm": 0.4639046987584143, + "learning_rate": 1.257118114394098e-05, + "loss": 0.8301, + "step": 13548 + }, + { + "epoch": 0.6757605985037406, + "grad_norm": 0.5055574106527746, + "learning_rate": 1.2567677290524985e-05, + "loss": 0.8865, + "step": 13549 + }, + { + "epoch": 0.6758104738154613, + "grad_norm": 0.6955385510199198, + "learning_rate": 1.2564173761524887e-05, + "loss": 0.8535, + "step": 13550 + }, + { + "epoch": 0.6758603491271821, + "grad_norm": 0.4327999399040144, + "learning_rate": 1.2560670557032108e-05, + "loss": 0.8831, + "step": 13551 + }, + { + "epoch": 0.6759102244389027, + "grad_norm": 0.5492816147966546, + "learning_rate": 1.2557167677138057e-05, + "loss": 0.8577, + "step": 13552 + }, + { + "epoch": 0.6759600997506234, + "grad_norm": 0.45543221548165874, + "learning_rate": 1.2553665121934166e-05, + "loss": 0.892, + "step": 13553 + }, + { + "epoch": 0.6760099750623442, + "grad_norm": 0.4664561039532846, + "learning_rate": 1.255016289151179e-05, + "loss": 0.8643, + "step": 13554 + }, + { + "epoch": 0.6760598503740648, + "grad_norm": 0.4380406336141877, + "learning_rate": 1.2546660985962355e-05, + "loss": 0.8412, + "step": 13555 + }, + { + "epoch": 0.6761097256857855, + "grad_norm": 0.4356571561563985, + "learning_rate": 1.2543159405377219e-05, + "loss": 0.87, + "step": 13556 + }, + { + "epoch": 0.6761596009975063, + "grad_norm": 0.5027851541868925, + "learning_rate": 1.253965814984778e-05, + "loss": 0.8497, + "step": 13557 + }, + { + "epoch": 0.6762094763092269, + "grad_norm": 0.43698426348753894, + "learning_rate": 1.2536157219465367e-05, + "loss": 0.8575, + "step": 13558 + }, + { + "epoch": 0.6762593516209476, + "grad_norm": 0.784708051098644, + "learning_rate": 1.2532656614321364e-05, + "loss": 0.8316, + "step": 13559 + }, + { + "epoch": 0.6763092269326684, + "grad_norm": 0.5153363997784407, + "learning_rate": 1.2529156334507094e-05, + "loss": 0.8472, + "step": 13560 + }, + { + "epoch": 0.676359102244389, + "grad_norm": 0.5626085851211367, + "learning_rate": 1.2525656380113932e-05, + "loss": 0.855, + "step": 13561 + }, + { + "epoch": 0.6764089775561097, + "grad_norm": 0.5699793798087984, + "learning_rate": 1.252215675123316e-05, + "loss": 0.846, + "step": 13562 + }, + { + "epoch": 0.6764588528678305, + "grad_norm": 0.4861689077878851, + "learning_rate": 1.2518657447956137e-05, + "loss": 0.835, + "step": 13563 + }, + { + "epoch": 0.6765087281795511, + "grad_norm": 0.772203980442751, + "learning_rate": 1.251515847037415e-05, + "loss": 0.8691, + "step": 13564 + }, + { + "epoch": 0.6765586034912718, + "grad_norm": 0.45231125871428474, + "learning_rate": 1.2511659818578536e-05, + "loss": 0.8607, + "step": 13565 + }, + { + "epoch": 0.6766084788029925, + "grad_norm": 0.6171400729115079, + "learning_rate": 1.2508161492660547e-05, + "loss": 0.8992, + "step": 13566 + }, + { + "epoch": 0.6766583541147132, + "grad_norm": 0.5362883377339672, + "learning_rate": 1.2504663492711508e-05, + "loss": 0.8776, + "step": 13567 + }, + { + "epoch": 0.6767082294264339, + "grad_norm": 0.5569208361310966, + "learning_rate": 1.250116581882268e-05, + "loss": 0.8393, + "step": 13568 + }, + { + "epoch": 0.6767581047381546, + "grad_norm": 0.5988363234635825, + "learning_rate": 1.2497668471085339e-05, + "loss": 0.8289, + "step": 13569 + }, + { + "epoch": 0.6768079800498753, + "grad_norm": 0.5326104396726637, + "learning_rate": 1.249417144959073e-05, + "loss": 0.8286, + "step": 13570 + }, + { + "epoch": 0.676857855361596, + "grad_norm": 0.5045081561718836, + "learning_rate": 1.2490674754430131e-05, + "loss": 0.8418, + "step": 13571 + }, + { + "epoch": 0.6769077306733167, + "grad_norm": 0.48434911506854256, + "learning_rate": 1.2487178385694778e-05, + "loss": 0.8852, + "step": 13572 + }, + { + "epoch": 0.6769576059850374, + "grad_norm": 0.5327316802186867, + "learning_rate": 1.2483682343475905e-05, + "loss": 0.8364, + "step": 13573 + }, + { + "epoch": 0.6770074812967581, + "grad_norm": 0.7096616423727153, + "learning_rate": 1.2480186627864729e-05, + "loss": 0.8476, + "step": 13574 + }, + { + "epoch": 0.6770573566084788, + "grad_norm": 0.463781311317983, + "learning_rate": 1.247669123895249e-05, + "loss": 0.9023, + "step": 13575 + }, + { + "epoch": 0.6771072319201995, + "grad_norm": 0.3970677330236485, + "learning_rate": 1.2473196176830388e-05, + "loss": 0.8436, + "step": 13576 + }, + { + "epoch": 0.6771571072319202, + "grad_norm": 0.4790587475223008, + "learning_rate": 1.2469701441589624e-05, + "loss": 0.8645, + "step": 13577 + }, + { + "epoch": 0.6772069825436409, + "grad_norm": 0.45838468906533664, + "learning_rate": 1.2466207033321384e-05, + "loss": 0.8626, + "step": 13578 + }, + { + "epoch": 0.6772568578553616, + "grad_norm": 0.4884652221593632, + "learning_rate": 1.2462712952116872e-05, + "loss": 0.8709, + "step": 13579 + }, + { + "epoch": 0.6773067331670823, + "grad_norm": 0.4715234193495863, + "learning_rate": 1.2459219198067257e-05, + "loss": 0.8134, + "step": 13580 + }, + { + "epoch": 0.677356608478803, + "grad_norm": 0.5967352948905851, + "learning_rate": 1.2455725771263702e-05, + "loss": 0.8748, + "step": 13581 + }, + { + "epoch": 0.6774064837905237, + "grad_norm": 0.52061009931242, + "learning_rate": 1.245223267179736e-05, + "loss": 0.8811, + "step": 13582 + }, + { + "epoch": 0.6774563591022444, + "grad_norm": 0.4679000674008989, + "learning_rate": 1.2448739899759398e-05, + "loss": 0.8591, + "step": 13583 + }, + { + "epoch": 0.6775062344139651, + "grad_norm": 0.4727105425591982, + "learning_rate": 1.2445247455240955e-05, + "loss": 0.8542, + "step": 13584 + }, + { + "epoch": 0.6775561097256858, + "grad_norm": 0.5539633044559763, + "learning_rate": 1.244175533833315e-05, + "loss": 0.8304, + "step": 13585 + }, + { + "epoch": 0.6776059850374064, + "grad_norm": 0.6058880792979611, + "learning_rate": 1.2438263549127128e-05, + "loss": 0.854, + "step": 13586 + }, + { + "epoch": 0.6776558603491272, + "grad_norm": 0.5072540807234661, + "learning_rate": 1.2434772087713997e-05, + "loss": 0.8474, + "step": 13587 + }, + { + "epoch": 0.6777057356608479, + "grad_norm": 0.6134952789554557, + "learning_rate": 1.2431280954184866e-05, + "loss": 0.8464, + "step": 13588 + }, + { + "epoch": 0.6777556109725685, + "grad_norm": 0.4560575617414968, + "learning_rate": 1.2427790148630822e-05, + "loss": 0.861, + "step": 13589 + }, + { + "epoch": 0.6778054862842893, + "grad_norm": 0.6866293149890207, + "learning_rate": 1.2424299671142986e-05, + "loss": 0.8911, + "step": 13590 + }, + { + "epoch": 0.67785536159601, + "grad_norm": 0.8367841020690208, + "learning_rate": 1.2420809521812404e-05, + "loss": 0.8585, + "step": 13591 + }, + { + "epoch": 0.6779052369077306, + "grad_norm": 0.45336431367634955, + "learning_rate": 1.2417319700730178e-05, + "loss": 0.8653, + "step": 13592 + }, + { + "epoch": 0.6779551122194514, + "grad_norm": 0.5038370680030342, + "learning_rate": 1.2413830207987353e-05, + "loss": 0.8513, + "step": 13593 + }, + { + "epoch": 0.6780049875311721, + "grad_norm": 0.45302415021769143, + "learning_rate": 1.2410341043675017e-05, + "loss": 0.9003, + "step": 13594 + }, + { + "epoch": 0.6780548628428927, + "grad_norm": 0.45586390909537466, + "learning_rate": 1.2406852207884176e-05, + "loss": 0.8739, + "step": 13595 + }, + { + "epoch": 0.6781047381546135, + "grad_norm": 0.524761669881702, + "learning_rate": 1.24033637007059e-05, + "loss": 0.8664, + "step": 13596 + }, + { + "epoch": 0.6781546134663342, + "grad_norm": 0.42738794506399036, + "learning_rate": 1.2399875522231199e-05, + "loss": 0.8525, + "step": 13597 + }, + { + "epoch": 0.6782044887780548, + "grad_norm": 0.4614826846197481, + "learning_rate": 1.239638767255113e-05, + "loss": 0.867, + "step": 13598 + }, + { + "epoch": 0.6782543640897756, + "grad_norm": 0.5056473247302773, + "learning_rate": 1.2392900151756665e-05, + "loss": 0.8569, + "step": 13599 + }, + { + "epoch": 0.6783042394014963, + "grad_norm": 0.531199730014392, + "learning_rate": 1.2389412959938837e-05, + "loss": 0.8445, + "step": 13600 + }, + { + "epoch": 0.6783541147132169, + "grad_norm": 0.5001414317493541, + "learning_rate": 1.2385926097188632e-05, + "loss": 0.8539, + "step": 13601 + }, + { + "epoch": 0.6784039900249377, + "grad_norm": 0.4866992101103563, + "learning_rate": 1.2382439563597043e-05, + "loss": 0.9132, + "step": 13602 + }, + { + "epoch": 0.6784538653366583, + "grad_norm": 0.43057868355243484, + "learning_rate": 1.2378953359255036e-05, + "loss": 0.8734, + "step": 13603 + }, + { + "epoch": 0.678503740648379, + "grad_norm": 0.4917772721065398, + "learning_rate": 1.2375467484253603e-05, + "loss": 0.8703, + "step": 13604 + }, + { + "epoch": 0.6785536159600998, + "grad_norm": 0.4988341942854165, + "learning_rate": 1.2371981938683693e-05, + "loss": 0.8758, + "step": 13605 + }, + { + "epoch": 0.6786034912718204, + "grad_norm": 0.5808149262048138, + "learning_rate": 1.2368496722636267e-05, + "loss": 0.8841, + "step": 13606 + }, + { + "epoch": 0.6786533665835411, + "grad_norm": 0.46033412741834756, + "learning_rate": 1.2365011836202253e-05, + "loss": 0.8349, + "step": 13607 + }, + { + "epoch": 0.6787032418952619, + "grad_norm": 0.46663574687282444, + "learning_rate": 1.2361527279472613e-05, + "loss": 0.8505, + "step": 13608 + }, + { + "epoch": 0.6787531172069825, + "grad_norm": 0.4807599345593126, + "learning_rate": 1.2358043052538259e-05, + "loss": 0.8915, + "step": 13609 + }, + { + "epoch": 0.6788029925187032, + "grad_norm": 0.5322064720294776, + "learning_rate": 1.2354559155490117e-05, + "loss": 0.8595, + "step": 13610 + }, + { + "epoch": 0.678852867830424, + "grad_norm": 0.42787300831026615, + "learning_rate": 1.2351075588419083e-05, + "loss": 0.8686, + "step": 13611 + }, + { + "epoch": 0.6789027431421446, + "grad_norm": 0.5917312964268946, + "learning_rate": 1.2347592351416082e-05, + "loss": 0.8722, + "step": 13612 + }, + { + "epoch": 0.6789526184538653, + "grad_norm": 0.5149552192548565, + "learning_rate": 1.2344109444571994e-05, + "loss": 0.8752, + "step": 13613 + }, + { + "epoch": 0.6790024937655861, + "grad_norm": 0.504697810609033, + "learning_rate": 1.2340626867977707e-05, + "loss": 0.8414, + "step": 13614 + }, + { + "epoch": 0.6790523690773067, + "grad_norm": 0.4342313769687323, + "learning_rate": 1.2337144621724087e-05, + "loss": 0.8467, + "step": 13615 + }, + { + "epoch": 0.6791022443890274, + "grad_norm": 0.43229195540537424, + "learning_rate": 1.2333662705902018e-05, + "loss": 0.8709, + "step": 13616 + }, + { + "epoch": 0.6791521197007482, + "grad_norm": 0.42025028720634766, + "learning_rate": 1.2330181120602354e-05, + "loss": 0.8553, + "step": 13617 + }, + { + "epoch": 0.6792019950124688, + "grad_norm": 0.5115310675393828, + "learning_rate": 1.2326699865915945e-05, + "loss": 0.8079, + "step": 13618 + }, + { + "epoch": 0.6792518703241895, + "grad_norm": 0.8449067291686511, + "learning_rate": 1.2323218941933618e-05, + "loss": 0.8914, + "step": 13619 + }, + { + "epoch": 0.6793017456359102, + "grad_norm": 0.4289332475894278, + "learning_rate": 1.2319738348746232e-05, + "loss": 0.8336, + "step": 13620 + }, + { + "epoch": 0.6793516209476309, + "grad_norm": 0.4501933506422186, + "learning_rate": 1.2316258086444596e-05, + "loss": 0.8445, + "step": 13621 + }, + { + "epoch": 0.6794014962593516, + "grad_norm": 0.46707487846801254, + "learning_rate": 1.2312778155119522e-05, + "loss": 0.8776, + "step": 13622 + }, + { + "epoch": 0.6794513715710723, + "grad_norm": 0.47203127117856575, + "learning_rate": 1.2309298554861831e-05, + "loss": 0.8627, + "step": 13623 + }, + { + "epoch": 0.679501246882793, + "grad_norm": 0.5051829744657648, + "learning_rate": 1.2305819285762318e-05, + "loss": 0.8664, + "step": 13624 + }, + { + "epoch": 0.6795511221945137, + "grad_norm": 0.5508382454822776, + "learning_rate": 1.2302340347911766e-05, + "loss": 0.8856, + "step": 13625 + }, + { + "epoch": 0.6796009975062344, + "grad_norm": 0.512871941254359, + "learning_rate": 1.2298861741400952e-05, + "loss": 0.8631, + "step": 13626 + }, + { + "epoch": 0.6796508728179551, + "grad_norm": 0.5537316316285118, + "learning_rate": 1.2295383466320677e-05, + "loss": 0.8424, + "step": 13627 + }, + { + "epoch": 0.6797007481296758, + "grad_norm": 0.5206164597409978, + "learning_rate": 1.2291905522761663e-05, + "loss": 0.8502, + "step": 13628 + }, + { + "epoch": 0.6797506234413965, + "grad_norm": 0.528016926007481, + "learning_rate": 1.2288427910814699e-05, + "loss": 0.8464, + "step": 13629 + }, + { + "epoch": 0.6798004987531172, + "grad_norm": 0.5704153309454146, + "learning_rate": 1.2284950630570508e-05, + "loss": 0.8531, + "step": 13630 + }, + { + "epoch": 0.679850374064838, + "grad_norm": 0.4891275605499789, + "learning_rate": 1.228147368211986e-05, + "loss": 0.8426, + "step": 13631 + }, + { + "epoch": 0.6799002493765586, + "grad_norm": 0.5830816581916101, + "learning_rate": 1.2277997065553442e-05, + "loss": 0.8499, + "step": 13632 + }, + { + "epoch": 0.6799501246882793, + "grad_norm": 0.5081718000142189, + "learning_rate": 1.2274520780962013e-05, + "loss": 0.8529, + "step": 13633 + }, + { + "epoch": 0.68, + "grad_norm": 0.49316701398310087, + "learning_rate": 1.2271044828436254e-05, + "loss": 0.8526, + "step": 13634 + }, + { + "epoch": 0.6800498753117207, + "grad_norm": 0.5397814333809532, + "learning_rate": 1.2267569208066904e-05, + "loss": 0.8581, + "step": 13635 + }, + { + "epoch": 0.6800997506234414, + "grad_norm": 0.536803086256371, + "learning_rate": 1.226409391994462e-05, + "loss": 0.8108, + "step": 13636 + }, + { + "epoch": 0.6801496259351621, + "grad_norm": 0.46562604420075315, + "learning_rate": 1.2260618964160111e-05, + "loss": 0.8457, + "step": 13637 + }, + { + "epoch": 0.6801995012468828, + "grad_norm": 0.48610178226775, + "learning_rate": 1.2257144340804052e-05, + "loss": 0.7953, + "step": 13638 + }, + { + "epoch": 0.6802493765586035, + "grad_norm": 0.45350516703407767, + "learning_rate": 1.2253670049967106e-05, + "loss": 0.8326, + "step": 13639 + }, + { + "epoch": 0.6802992518703241, + "grad_norm": 0.5158188956590437, + "learning_rate": 1.225019609173993e-05, + "loss": 0.8523, + "step": 13640 + }, + { + "epoch": 0.6803491271820449, + "grad_norm": 0.5457306386106702, + "learning_rate": 1.2246722466213186e-05, + "loss": 0.9072, + "step": 13641 + }, + { + "epoch": 0.6803990024937656, + "grad_norm": 0.5142095033232713, + "learning_rate": 1.2243249173477513e-05, + "loss": 0.8907, + "step": 13642 + }, + { + "epoch": 0.6804488778054862, + "grad_norm": 0.5529259756678323, + "learning_rate": 1.2239776213623544e-05, + "loss": 0.8483, + "step": 13643 + }, + { + "epoch": 0.680498753117207, + "grad_norm": 0.42582498675931413, + "learning_rate": 1.2236303586741896e-05, + "loss": 0.8358, + "step": 13644 + }, + { + "epoch": 0.6805486284289277, + "grad_norm": 0.45180825227048393, + "learning_rate": 1.2232831292923203e-05, + "loss": 0.8264, + "step": 13645 + }, + { + "epoch": 0.6805985037406483, + "grad_norm": 0.543080685541476, + "learning_rate": 1.2229359332258062e-05, + "loss": 0.8463, + "step": 13646 + }, + { + "epoch": 0.6806483790523691, + "grad_norm": 0.4594121279342874, + "learning_rate": 1.2225887704837071e-05, + "loss": 0.8792, + "step": 13647 + }, + { + "epoch": 0.6806982543640898, + "grad_norm": 0.49109183159190406, + "learning_rate": 1.222241641075082e-05, + "loss": 0.8164, + "step": 13648 + }, + { + "epoch": 0.6807481296758104, + "grad_norm": 0.5833592109127588, + "learning_rate": 1.22189454500899e-05, + "loss": 0.8421, + "step": 13649 + }, + { + "epoch": 0.6807980049875312, + "grad_norm": 0.4589650325750072, + "learning_rate": 1.2215474822944878e-05, + "loss": 0.8876, + "step": 13650 + }, + { + "epoch": 0.6808478802992519, + "grad_norm": 0.4597840106873017, + "learning_rate": 1.221200452940632e-05, + "loss": 0.8503, + "step": 13651 + }, + { + "epoch": 0.6808977556109725, + "grad_norm": 0.4854509060859873, + "learning_rate": 1.2208534569564772e-05, + "loss": 0.8473, + "step": 13652 + }, + { + "epoch": 0.6809476309226933, + "grad_norm": 0.49039771974195606, + "learning_rate": 1.2205064943510799e-05, + "loss": 0.8529, + "step": 13653 + }, + { + "epoch": 0.680997506234414, + "grad_norm": 0.5704551423211726, + "learning_rate": 1.220159565133493e-05, + "loss": 0.8776, + "step": 13654 + }, + { + "epoch": 0.6810473815461346, + "grad_norm": 0.5025487175100923, + "learning_rate": 1.2198126693127693e-05, + "loss": 0.8262, + "step": 13655 + }, + { + "epoch": 0.6810972568578554, + "grad_norm": 0.6088049146532205, + "learning_rate": 1.2194658068979602e-05, + "loss": 0.8067, + "step": 13656 + }, + { + "epoch": 0.681147132169576, + "grad_norm": 0.4734737382591813, + "learning_rate": 1.2191189778981188e-05, + "loss": 0.8368, + "step": 13657 + }, + { + "epoch": 0.6811970074812967, + "grad_norm": 0.5419489227134078, + "learning_rate": 1.2187721823222945e-05, + "loss": 0.8674, + "step": 13658 + }, + { + "epoch": 0.6812468827930175, + "grad_norm": 0.54678204631493, + "learning_rate": 1.2184254201795365e-05, + "loss": 0.8642, + "step": 13659 + }, + { + "epoch": 0.6812967581047381, + "grad_norm": 0.4863924163745053, + "learning_rate": 1.2180786914788936e-05, + "loss": 0.8744, + "step": 13660 + }, + { + "epoch": 0.6813466334164588, + "grad_norm": 0.4173149354532918, + "learning_rate": 1.2177319962294126e-05, + "loss": 0.8131, + "step": 13661 + }, + { + "epoch": 0.6813965087281796, + "grad_norm": 0.4716000357387658, + "learning_rate": 1.2173853344401423e-05, + "loss": 0.8454, + "step": 13662 + }, + { + "epoch": 0.6814463840399002, + "grad_norm": 0.4680419606691628, + "learning_rate": 1.2170387061201269e-05, + "loss": 0.8906, + "step": 13663 + }, + { + "epoch": 0.681496259351621, + "grad_norm": 0.4503465358805489, + "learning_rate": 1.216692111278414e-05, + "loss": 0.8459, + "step": 13664 + }, + { + "epoch": 0.6815461346633417, + "grad_norm": 0.4445859890714709, + "learning_rate": 1.2163455499240441e-05, + "loss": 0.869, + "step": 13665 + }, + { + "epoch": 0.6815960099750623, + "grad_norm": 0.5811759320207883, + "learning_rate": 1.215999022066064e-05, + "loss": 0.872, + "step": 13666 + }, + { + "epoch": 0.681645885286783, + "grad_norm": 0.5228362174369744, + "learning_rate": 1.2156525277135133e-05, + "loss": 0.8831, + "step": 13667 + }, + { + "epoch": 0.6816957605985038, + "grad_norm": 0.6328220743552613, + "learning_rate": 1.2153060668754372e-05, + "loss": 0.8588, + "step": 13668 + }, + { + "epoch": 0.6817456359102244, + "grad_norm": 0.41932536470450354, + "learning_rate": 1.2149596395608722e-05, + "loss": 0.8372, + "step": 13669 + }, + { + "epoch": 0.6817955112219451, + "grad_norm": 0.6088593838042072, + "learning_rate": 1.2146132457788617e-05, + "loss": 0.8618, + "step": 13670 + }, + { + "epoch": 0.6818453865336659, + "grad_norm": 0.4253905045432495, + "learning_rate": 1.214266885538442e-05, + "loss": 0.7767, + "step": 13671 + }, + { + "epoch": 0.6818952618453865, + "grad_norm": 0.5480715942920434, + "learning_rate": 1.2139205588486546e-05, + "loss": 0.8706, + "step": 13672 + }, + { + "epoch": 0.6819451371571073, + "grad_norm": 0.5188878412725907, + "learning_rate": 1.2135742657185326e-05, + "loss": 0.8638, + "step": 13673 + }, + { + "epoch": 0.6819950124688279, + "grad_norm": 0.541926769778173, + "learning_rate": 1.2132280061571155e-05, + "loss": 0.8802, + "step": 13674 + }, + { + "epoch": 0.6820448877805486, + "grad_norm": 0.5433478244478811, + "learning_rate": 1.2128817801734374e-05, + "loss": 0.8582, + "step": 13675 + }, + { + "epoch": 0.6820947630922694, + "grad_norm": 0.5038921588913198, + "learning_rate": 1.2125355877765335e-05, + "loss": 0.8157, + "step": 13676 + }, + { + "epoch": 0.68214463840399, + "grad_norm": 0.5105180416852972, + "learning_rate": 1.2121894289754362e-05, + "loss": 0.8941, + "step": 13677 + }, + { + "epoch": 0.6821945137157107, + "grad_norm": 0.5804769990465322, + "learning_rate": 1.2118433037791804e-05, + "loss": 0.7863, + "step": 13678 + }, + { + "epoch": 0.6822443890274315, + "grad_norm": 0.4835018411803645, + "learning_rate": 1.2114972121967969e-05, + "loss": 0.8135, + "step": 13679 + }, + { + "epoch": 0.6822942643391521, + "grad_norm": 0.4752907229145563, + "learning_rate": 1.2111511542373171e-05, + "loss": 0.8346, + "step": 13680 + }, + { + "epoch": 0.6823441396508728, + "grad_norm": 0.4324682157360145, + "learning_rate": 1.2108051299097703e-05, + "loss": 0.8263, + "step": 13681 + }, + { + "epoch": 0.6823940149625936, + "grad_norm": 0.4924732822644529, + "learning_rate": 1.2104591392231875e-05, + "loss": 0.8631, + "step": 13682 + }, + { + "epoch": 0.6824438902743142, + "grad_norm": 0.6253419562370556, + "learning_rate": 1.2101131821865963e-05, + "loss": 0.8634, + "step": 13683 + }, + { + "epoch": 0.6824937655860349, + "grad_norm": 0.5085423981158119, + "learning_rate": 1.2097672588090245e-05, + "loss": 0.8831, + "step": 13684 + }, + { + "epoch": 0.6825436408977557, + "grad_norm": 0.5028418758057224, + "learning_rate": 1.2094213690994975e-05, + "loss": 0.8588, + "step": 13685 + }, + { + "epoch": 0.6825935162094763, + "grad_norm": 0.43709326680072663, + "learning_rate": 1.2090755130670443e-05, + "loss": 0.7889, + "step": 13686 + }, + { + "epoch": 0.682643391521197, + "grad_norm": 0.4547460334586478, + "learning_rate": 1.2087296907206858e-05, + "loss": 0.8944, + "step": 13687 + }, + { + "epoch": 0.6826932668329178, + "grad_norm": 0.5303833059645859, + "learning_rate": 1.2083839020694493e-05, + "loss": 0.8722, + "step": 13688 + }, + { + "epoch": 0.6827431421446384, + "grad_norm": 0.43589723611959286, + "learning_rate": 1.2080381471223556e-05, + "loss": 0.8168, + "step": 13689 + }, + { + "epoch": 0.6827930174563591, + "grad_norm": 0.49901941712304765, + "learning_rate": 1.2076924258884296e-05, + "loss": 0.9108, + "step": 13690 + }, + { + "epoch": 0.6828428927680797, + "grad_norm": 0.7186279559404247, + "learning_rate": 1.207346738376691e-05, + "loss": 0.8384, + "step": 13691 + }, + { + "epoch": 0.6828927680798005, + "grad_norm": 0.5300428020042977, + "learning_rate": 1.207001084596161e-05, + "loss": 0.8808, + "step": 13692 + }, + { + "epoch": 0.6829426433915212, + "grad_norm": 0.4631729489430094, + "learning_rate": 1.2066554645558578e-05, + "loss": 0.8695, + "step": 13693 + }, + { + "epoch": 0.6829925187032418, + "grad_norm": 0.554706575125634, + "learning_rate": 1.2063098782648027e-05, + "loss": 0.8468, + "step": 13694 + }, + { + "epoch": 0.6830423940149626, + "grad_norm": 0.49024563860955234, + "learning_rate": 1.2059643257320122e-05, + "loss": 0.8175, + "step": 13695 + }, + { + "epoch": 0.6830922693266833, + "grad_norm": 0.5465601713089236, + "learning_rate": 1.2056188069665033e-05, + "loss": 0.8974, + "step": 13696 + }, + { + "epoch": 0.683142144638404, + "grad_norm": 0.46036088077855347, + "learning_rate": 1.2052733219772926e-05, + "loss": 0.8233, + "step": 13697 + }, + { + "epoch": 0.6831920199501247, + "grad_norm": 0.43430485848879785, + "learning_rate": 1.204927870773394e-05, + "loss": 0.847, + "step": 13698 + }, + { + "epoch": 0.6832418952618454, + "grad_norm": 0.5057668145043135, + "learning_rate": 1.2045824533638243e-05, + "loss": 0.8837, + "step": 13699 + }, + { + "epoch": 0.683291770573566, + "grad_norm": 0.3782697943726118, + "learning_rate": 1.2042370697575943e-05, + "loss": 0.7874, + "step": 13700 + }, + { + "epoch": 0.6833416458852868, + "grad_norm": 0.5027391827004164, + "learning_rate": 1.2038917199637201e-05, + "loss": 0.8844, + "step": 13701 + }, + { + "epoch": 0.6833915211970075, + "grad_norm": 0.4458032957775979, + "learning_rate": 1.2035464039912098e-05, + "loss": 0.8307, + "step": 13702 + }, + { + "epoch": 0.6834413965087281, + "grad_norm": 0.4568314746367478, + "learning_rate": 1.2032011218490766e-05, + "loss": 0.8421, + "step": 13703 + }, + { + "epoch": 0.6834912718204489, + "grad_norm": 0.4579329375826007, + "learning_rate": 1.2028558735463286e-05, + "loss": 0.7974, + "step": 13704 + }, + { + "epoch": 0.6835411471321696, + "grad_norm": 0.46984227910830706, + "learning_rate": 1.2025106590919782e-05, + "loss": 0.8896, + "step": 13705 + }, + { + "epoch": 0.6835910224438903, + "grad_norm": 0.4873017760776557, + "learning_rate": 1.2021654784950295e-05, + "loss": 0.8638, + "step": 13706 + }, + { + "epoch": 0.683640897755611, + "grad_norm": 0.4527826669316263, + "learning_rate": 1.201820331764493e-05, + "loss": 0.8645, + "step": 13707 + }, + { + "epoch": 0.6836907730673317, + "grad_norm": 0.4721082432979438, + "learning_rate": 1.2014752189093737e-05, + "loss": 0.8836, + "step": 13708 + }, + { + "epoch": 0.6837406483790524, + "grad_norm": 0.4561899420289076, + "learning_rate": 1.2011301399386774e-05, + "loss": 0.8768, + "step": 13709 + }, + { + "epoch": 0.6837905236907731, + "grad_norm": 0.4949645839755476, + "learning_rate": 1.200785094861408e-05, + "loss": 0.8542, + "step": 13710 + }, + { + "epoch": 0.6838403990024937, + "grad_norm": 0.4637489568396228, + "learning_rate": 1.2004400836865709e-05, + "loss": 0.854, + "step": 13711 + }, + { + "epoch": 0.6838902743142145, + "grad_norm": 0.7503905995098571, + "learning_rate": 1.2000951064231682e-05, + "loss": 0.8513, + "step": 13712 + }, + { + "epoch": 0.6839401496259352, + "grad_norm": 0.5060787556207026, + "learning_rate": 1.1997501630802018e-05, + "loss": 0.8201, + "step": 13713 + }, + { + "epoch": 0.6839900249376558, + "grad_norm": 0.483565714934058, + "learning_rate": 1.1994052536666726e-05, + "loss": 0.8621, + "step": 13714 + }, + { + "epoch": 0.6840399002493766, + "grad_norm": 0.45815916649738975, + "learning_rate": 1.1990603781915815e-05, + "loss": 0.8586, + "step": 13715 + }, + { + "epoch": 0.6840897755610973, + "grad_norm": 0.49964890974607423, + "learning_rate": 1.198715536663928e-05, + "loss": 0.8485, + "step": 13716 + }, + { + "epoch": 0.6841396508728179, + "grad_norm": 0.7115450879689772, + "learning_rate": 1.1983707290927102e-05, + "loss": 0.849, + "step": 13717 + }, + { + "epoch": 0.6841895261845387, + "grad_norm": 0.5646573343527732, + "learning_rate": 1.1980259554869246e-05, + "loss": 0.8696, + "step": 13718 + }, + { + "epoch": 0.6842394014962594, + "grad_norm": 0.47421863090198807, + "learning_rate": 1.1976812158555711e-05, + "loss": 0.8643, + "step": 13719 + }, + { + "epoch": 0.68428927680798, + "grad_norm": 0.4673830308992078, + "learning_rate": 1.1973365102076414e-05, + "loss": 0.8064, + "step": 13720 + }, + { + "epoch": 0.6843391521197008, + "grad_norm": 0.4809898856139219, + "learning_rate": 1.1969918385521333e-05, + "loss": 0.8695, + "step": 13721 + }, + { + "epoch": 0.6843890274314215, + "grad_norm": 0.4900784233958404, + "learning_rate": 1.1966472008980392e-05, + "loss": 0.9015, + "step": 13722 + }, + { + "epoch": 0.6844389027431421, + "grad_norm": 0.5769150699362788, + "learning_rate": 1.1963025972543549e-05, + "loss": 0.8436, + "step": 13723 + }, + { + "epoch": 0.6844887780548629, + "grad_norm": 0.6264141722291754, + "learning_rate": 1.195958027630069e-05, + "loss": 0.8369, + "step": 13724 + }, + { + "epoch": 0.6845386533665836, + "grad_norm": 0.5667376773976898, + "learning_rate": 1.1956134920341759e-05, + "loss": 0.8481, + "step": 13725 + }, + { + "epoch": 0.6845885286783042, + "grad_norm": 0.5833827252532051, + "learning_rate": 1.195268990475664e-05, + "loss": 0.8754, + "step": 13726 + }, + { + "epoch": 0.684638403990025, + "grad_norm": 0.41818502116697726, + "learning_rate": 1.1949245229635245e-05, + "loss": 0.8509, + "step": 13727 + }, + { + "epoch": 0.6846882793017456, + "grad_norm": 0.5489418861318157, + "learning_rate": 1.1945800895067457e-05, + "loss": 0.8371, + "step": 13728 + }, + { + "epoch": 0.6847381546134663, + "grad_norm": 0.472179980428483, + "learning_rate": 1.194235690114315e-05, + "loss": 0.8503, + "step": 13729 + }, + { + "epoch": 0.6847880299251871, + "grad_norm": 0.4311464607742779, + "learning_rate": 1.1938913247952191e-05, + "loss": 0.8174, + "step": 13730 + }, + { + "epoch": 0.6848379052369077, + "grad_norm": 0.44714951643726447, + "learning_rate": 1.1935469935584451e-05, + "loss": 0.828, + "step": 13731 + }, + { + "epoch": 0.6848877805486284, + "grad_norm": 0.4737967711654314, + "learning_rate": 1.1932026964129778e-05, + "loss": 0.8885, + "step": 13732 + }, + { + "epoch": 0.6849376558603492, + "grad_norm": 0.8094295596942389, + "learning_rate": 1.1928584333678012e-05, + "loss": 0.8566, + "step": 13733 + }, + { + "epoch": 0.6849875311720698, + "grad_norm": 0.5023241740546317, + "learning_rate": 1.1925142044318988e-05, + "loss": 0.8325, + "step": 13734 + }, + { + "epoch": 0.6850374064837905, + "grad_norm": 0.4852920358246687, + "learning_rate": 1.1921700096142522e-05, + "loss": 0.8593, + "step": 13735 + }, + { + "epoch": 0.6850872817955113, + "grad_norm": 0.42807186693720195, + "learning_rate": 1.1918258489238449e-05, + "loss": 0.8166, + "step": 13736 + }, + { + "epoch": 0.6851371571072319, + "grad_norm": 0.44890413724928463, + "learning_rate": 1.1914817223696553e-05, + "loss": 0.8484, + "step": 13737 + }, + { + "epoch": 0.6851870324189526, + "grad_norm": 0.5179303750624458, + "learning_rate": 1.1911376299606667e-05, + "loss": 0.8244, + "step": 13738 + }, + { + "epoch": 0.6852369077306734, + "grad_norm": 0.49040913752291215, + "learning_rate": 1.190793571705854e-05, + "loss": 0.8241, + "step": 13739 + }, + { + "epoch": 0.685286783042394, + "grad_norm": 0.4140123279553829, + "learning_rate": 1.190449547614198e-05, + "loss": 0.8592, + "step": 13740 + }, + { + "epoch": 0.6853366583541147, + "grad_norm": 0.5569458245133175, + "learning_rate": 1.190105557694674e-05, + "loss": 0.8471, + "step": 13741 + }, + { + "epoch": 0.6853865336658355, + "grad_norm": 0.5655885501000072, + "learning_rate": 1.1897616019562614e-05, + "loss": 0.864, + "step": 13742 + }, + { + "epoch": 0.6854364089775561, + "grad_norm": 0.5426645481246636, + "learning_rate": 1.1894176804079313e-05, + "loss": 0.8446, + "step": 13743 + }, + { + "epoch": 0.6854862842892768, + "grad_norm": 0.685581739742305, + "learning_rate": 1.1890737930586612e-05, + "loss": 0.912, + "step": 13744 + }, + { + "epoch": 0.6855361596009975, + "grad_norm": 0.47170535793977, + "learning_rate": 1.1887299399174237e-05, + "loss": 0.8493, + "step": 13745 + }, + { + "epoch": 0.6855860349127182, + "grad_norm": 0.4941179021561214, + "learning_rate": 1.1883861209931918e-05, + "loss": 0.818, + "step": 13746 + }, + { + "epoch": 0.6856359102244389, + "grad_norm": 0.6098238190932826, + "learning_rate": 1.1880423362949361e-05, + "loss": 0.8288, + "step": 13747 + }, + { + "epoch": 0.6856857855361596, + "grad_norm": 0.4577979325798818, + "learning_rate": 1.1876985858316292e-05, + "loss": 0.8615, + "step": 13748 + }, + { + "epoch": 0.6857356608478803, + "grad_norm": 0.5291020920526288, + "learning_rate": 1.1873548696122405e-05, + "loss": 0.809, + "step": 13749 + }, + { + "epoch": 0.685785536159601, + "grad_norm": 0.474300782009153, + "learning_rate": 1.187011187645739e-05, + "loss": 0.8455, + "step": 13750 + }, + { + "epoch": 0.6858354114713217, + "grad_norm": 0.5425465305093835, + "learning_rate": 1.1866675399410923e-05, + "loss": 0.8679, + "step": 13751 + }, + { + "epoch": 0.6858852867830424, + "grad_norm": 0.47140138211692323, + "learning_rate": 1.186323926507269e-05, + "loss": 0.8888, + "step": 13752 + }, + { + "epoch": 0.6859351620947631, + "grad_norm": 0.4858556417365407, + "learning_rate": 1.1859803473532349e-05, + "loss": 0.836, + "step": 13753 + }, + { + "epoch": 0.6859850374064838, + "grad_norm": 0.5225940102306917, + "learning_rate": 1.1856368024879558e-05, + "loss": 0.8911, + "step": 13754 + }, + { + "epoch": 0.6860349127182045, + "grad_norm": 0.5116091948295057, + "learning_rate": 1.185293291920395e-05, + "loss": 0.888, + "step": 13755 + }, + { + "epoch": 0.6860847880299252, + "grad_norm": 0.5489937959262243, + "learning_rate": 1.1849498156595193e-05, + "loss": 0.8207, + "step": 13756 + }, + { + "epoch": 0.6861346633416459, + "grad_norm": 0.4760451637025792, + "learning_rate": 1.1846063737142876e-05, + "loss": 0.8789, + "step": 13757 + }, + { + "epoch": 0.6861845386533666, + "grad_norm": 0.44653765751879837, + "learning_rate": 1.184262966093665e-05, + "loss": 0.8603, + "step": 13758 + }, + { + "epoch": 0.6862344139650873, + "grad_norm": 0.5775257228938532, + "learning_rate": 1.1839195928066102e-05, + "loss": 0.8634, + "step": 13759 + }, + { + "epoch": 0.686284289276808, + "grad_norm": 0.4620461014356954, + "learning_rate": 1.1835762538620867e-05, + "loss": 0.8045, + "step": 13760 + }, + { + "epoch": 0.6863341645885287, + "grad_norm": 0.5657063827858748, + "learning_rate": 1.18323294926905e-05, + "loss": 0.8268, + "step": 13761 + }, + { + "epoch": 0.6863840399002494, + "grad_norm": 0.4386194218773645, + "learning_rate": 1.182889679036461e-05, + "loss": 0.8773, + "step": 13762 + }, + { + "epoch": 0.6864339152119701, + "grad_norm": 0.4860321836323053, + "learning_rate": 1.1825464431732753e-05, + "loss": 0.8574, + "step": 13763 + }, + { + "epoch": 0.6864837905236908, + "grad_norm": 0.9561875124976349, + "learning_rate": 1.1822032416884516e-05, + "loss": 0.8858, + "step": 13764 + }, + { + "epoch": 0.6865336658354114, + "grad_norm": 0.5264215859306208, + "learning_rate": 1.1818600745909445e-05, + "loss": 0.8066, + "step": 13765 + }, + { + "epoch": 0.6865835411471322, + "grad_norm": 0.5056144830096657, + "learning_rate": 1.181516941889709e-05, + "loss": 0.855, + "step": 13766 + }, + { + "epoch": 0.6866334164588529, + "grad_norm": 0.5898754164188769, + "learning_rate": 1.1811738435936987e-05, + "loss": 0.8673, + "step": 13767 + }, + { + "epoch": 0.6866832917705735, + "grad_norm": 0.5552136073612515, + "learning_rate": 1.1808307797118657e-05, + "loss": 0.8806, + "step": 13768 + }, + { + "epoch": 0.6867331670822943, + "grad_norm": 0.631143460087409, + "learning_rate": 1.1804877502531642e-05, + "loss": 0.8692, + "step": 13769 + }, + { + "epoch": 0.686783042394015, + "grad_norm": 0.4873053450374262, + "learning_rate": 1.180144755226544e-05, + "loss": 0.8753, + "step": 13770 + }, + { + "epoch": 0.6868329177057356, + "grad_norm": 0.5266289716069821, + "learning_rate": 1.1798017946409562e-05, + "loss": 0.8705, + "step": 13771 + }, + { + "epoch": 0.6868827930174564, + "grad_norm": 0.5583779844365484, + "learning_rate": 1.1794588685053484e-05, + "loss": 0.8282, + "step": 13772 + }, + { + "epoch": 0.6869326683291771, + "grad_norm": 0.44884307746824237, + "learning_rate": 1.1791159768286716e-05, + "loss": 0.8303, + "step": 13773 + }, + { + "epoch": 0.6869825436408977, + "grad_norm": 0.5136697994566108, + "learning_rate": 1.178773119619872e-05, + "loss": 0.86, + "step": 13774 + }, + { + "epoch": 0.6870324189526185, + "grad_norm": 0.46509591905676456, + "learning_rate": 1.178430296887897e-05, + "loss": 0.85, + "step": 13775 + }, + { + "epoch": 0.6870822942643392, + "grad_norm": 0.5721583163252303, + "learning_rate": 1.1780875086416906e-05, + "loss": 0.8315, + "step": 13776 + }, + { + "epoch": 0.6871321695760598, + "grad_norm": 0.4212359249390479, + "learning_rate": 1.1777447548902002e-05, + "loss": 0.8763, + "step": 13777 + }, + { + "epoch": 0.6871820448877806, + "grad_norm": 0.5637649925788238, + "learning_rate": 1.1774020356423684e-05, + "loss": 0.8639, + "step": 13778 + }, + { + "epoch": 0.6872319201995013, + "grad_norm": 0.619996216343499, + "learning_rate": 1.1770593509071387e-05, + "loss": 0.8613, + "step": 13779 + }, + { + "epoch": 0.6872817955112219, + "grad_norm": 0.4718763609933109, + "learning_rate": 1.1767167006934524e-05, + "loss": 0.8503, + "step": 13780 + }, + { + "epoch": 0.6873316708229427, + "grad_norm": 0.4323518069348036, + "learning_rate": 1.1763740850102522e-05, + "loss": 0.8156, + "step": 13781 + }, + { + "epoch": 0.6873815461346633, + "grad_norm": 0.5799605104001374, + "learning_rate": 1.176031503866478e-05, + "loss": 0.8467, + "step": 13782 + }, + { + "epoch": 0.687431421446384, + "grad_norm": 0.5715477460556752, + "learning_rate": 1.1756889572710691e-05, + "loss": 0.8579, + "step": 13783 + }, + { + "epoch": 0.6874812967581048, + "grad_norm": 0.42442078712885173, + "learning_rate": 1.1753464452329634e-05, + "loss": 0.8304, + "step": 13784 + }, + { + "epoch": 0.6875311720698254, + "grad_norm": 0.47765758716981316, + "learning_rate": 1.1750039677611002e-05, + "loss": 0.8621, + "step": 13785 + }, + { + "epoch": 0.6875810473815461, + "grad_norm": 0.5784379404710267, + "learning_rate": 1.1746615248644155e-05, + "loss": 0.8277, + "step": 13786 + }, + { + "epoch": 0.6876309226932669, + "grad_norm": 0.5035423314768381, + "learning_rate": 1.1743191165518452e-05, + "loss": 0.8187, + "step": 13787 + }, + { + "epoch": 0.6876807980049875, + "grad_norm": 0.45500459703568724, + "learning_rate": 1.1739767428323234e-05, + "loss": 0.8443, + "step": 13788 + }, + { + "epoch": 0.6877306733167082, + "grad_norm": 0.5206731419826066, + "learning_rate": 1.1736344037147858e-05, + "loss": 0.8711, + "step": 13789 + }, + { + "epoch": 0.687780548628429, + "grad_norm": 0.4986645558024658, + "learning_rate": 1.173292099208165e-05, + "loss": 0.9106, + "step": 13790 + }, + { + "epoch": 0.6878304239401496, + "grad_norm": 0.4579766007951244, + "learning_rate": 1.172949829321393e-05, + "loss": 0.8631, + "step": 13791 + }, + { + "epoch": 0.6878802992518703, + "grad_norm": 0.4975828708379675, + "learning_rate": 1.1726075940634004e-05, + "loss": 0.8699, + "step": 13792 + }, + { + "epoch": 0.6879301745635911, + "grad_norm": 0.462149431780825, + "learning_rate": 1.1722653934431205e-05, + "loss": 0.8333, + "step": 13793 + }, + { + "epoch": 0.6879800498753117, + "grad_norm": 0.4472071721349421, + "learning_rate": 1.171923227469479e-05, + "loss": 0.8358, + "step": 13794 + }, + { + "epoch": 0.6880299251870324, + "grad_norm": 0.504745538326909, + "learning_rate": 1.1715810961514073e-05, + "loss": 0.7985, + "step": 13795 + }, + { + "epoch": 0.6880798004987532, + "grad_norm": 0.42752682526632946, + "learning_rate": 1.1712389994978319e-05, + "loss": 0.8311, + "step": 13796 + }, + { + "epoch": 0.6881296758104738, + "grad_norm": 0.4549076414498247, + "learning_rate": 1.1708969375176814e-05, + "loss": 0.8894, + "step": 13797 + }, + { + "epoch": 0.6881795511221945, + "grad_norm": 0.4899715895037131, + "learning_rate": 1.170554910219879e-05, + "loss": 0.8802, + "step": 13798 + }, + { + "epoch": 0.6882294264339152, + "grad_norm": 0.47163245113092384, + "learning_rate": 1.1702129176133522e-05, + "loss": 0.8933, + "step": 13799 + }, + { + "epoch": 0.6882793017456359, + "grad_norm": 0.5716504877644024, + "learning_rate": 1.169870959707023e-05, + "loss": 0.8443, + "step": 13800 + }, + { + "epoch": 0.6883291770573566, + "grad_norm": 0.5272659534824437, + "learning_rate": 1.1695290365098177e-05, + "loss": 0.816, + "step": 13801 + }, + { + "epoch": 0.6883790523690773, + "grad_norm": 0.6855401099480625, + "learning_rate": 1.1691871480306551e-05, + "loss": 0.8573, + "step": 13802 + }, + { + "epoch": 0.688428927680798, + "grad_norm": 0.4516127765966777, + "learning_rate": 1.1688452942784591e-05, + "loss": 0.8851, + "step": 13803 + }, + { + "epoch": 0.6884788029925187, + "grad_norm": 0.6193456335097622, + "learning_rate": 1.1685034752621493e-05, + "loss": 0.8598, + "step": 13804 + }, + { + "epoch": 0.6885286783042394, + "grad_norm": 0.6548216584208577, + "learning_rate": 1.1681616909906448e-05, + "loss": 0.8739, + "step": 13805 + }, + { + "epoch": 0.6885785536159601, + "grad_norm": 0.507591989906153, + "learning_rate": 1.1678199414728654e-05, + "loss": 0.8558, + "step": 13806 + }, + { + "epoch": 0.6886284289276808, + "grad_norm": 0.4556354638953063, + "learning_rate": 1.167478226717729e-05, + "loss": 0.8651, + "step": 13807 + }, + { + "epoch": 0.6886783042394015, + "grad_norm": 0.43019438972949386, + "learning_rate": 1.1671365467341513e-05, + "loss": 0.8632, + "step": 13808 + }, + { + "epoch": 0.6887281795511222, + "grad_norm": 0.5222255012539258, + "learning_rate": 1.1667949015310484e-05, + "loss": 0.8614, + "step": 13809 + }, + { + "epoch": 0.6887780548628429, + "grad_norm": 0.4448587761694898, + "learning_rate": 1.1664532911173364e-05, + "loss": 0.836, + "step": 13810 + }, + { + "epoch": 0.6888279301745636, + "grad_norm": 0.4969061020324486, + "learning_rate": 1.1661117155019293e-05, + "loss": 0.8682, + "step": 13811 + }, + { + "epoch": 0.6888778054862843, + "grad_norm": 0.4857547449523477, + "learning_rate": 1.16577017469374e-05, + "loss": 0.9156, + "step": 13812 + }, + { + "epoch": 0.688927680798005, + "grad_norm": 0.445214174345945, + "learning_rate": 1.16542866870168e-05, + "loss": 0.8346, + "step": 13813 + }, + { + "epoch": 0.6889775561097257, + "grad_norm": 0.8010070940422933, + "learning_rate": 1.1650871975346622e-05, + "loss": 0.8924, + "step": 13814 + }, + { + "epoch": 0.6890274314214464, + "grad_norm": 0.4578427112659266, + "learning_rate": 1.1647457612015969e-05, + "loss": 0.835, + "step": 13815 + }, + { + "epoch": 0.689077306733167, + "grad_norm": 0.4225040535200864, + "learning_rate": 1.1644043597113933e-05, + "loss": 0.8487, + "step": 13816 + }, + { + "epoch": 0.6891271820448878, + "grad_norm": 0.42758097452437743, + "learning_rate": 1.164062993072959e-05, + "loss": 0.8444, + "step": 13817 + }, + { + "epoch": 0.6891770573566085, + "grad_norm": 0.4102620827718859, + "learning_rate": 1.1637216612952043e-05, + "loss": 0.7976, + "step": 13818 + }, + { + "epoch": 0.6892269326683291, + "grad_norm": 0.5180104372482597, + "learning_rate": 1.1633803643870345e-05, + "loss": 0.8592, + "step": 13819 + }, + { + "epoch": 0.6892768079800499, + "grad_norm": 0.5663427724048674, + "learning_rate": 1.163039102357356e-05, + "loss": 0.8913, + "step": 13820 + }, + { + "epoch": 0.6893266832917706, + "grad_norm": 0.5200873155050475, + "learning_rate": 1.162697875215073e-05, + "loss": 0.8325, + "step": 13821 + }, + { + "epoch": 0.6893765586034912, + "grad_norm": 0.45939437278201406, + "learning_rate": 1.1623566829690914e-05, + "loss": 0.8608, + "step": 13822 + }, + { + "epoch": 0.689426433915212, + "grad_norm": 0.634412681977751, + "learning_rate": 1.1620155256283133e-05, + "loss": 0.8568, + "step": 13823 + }, + { + "epoch": 0.6894763092269327, + "grad_norm": 0.8359734111634584, + "learning_rate": 1.1616744032016413e-05, + "loss": 0.8489, + "step": 13824 + }, + { + "epoch": 0.6895261845386533, + "grad_norm": 0.5506327010892478, + "learning_rate": 1.1613333156979759e-05, + "loss": 0.8593, + "step": 13825 + }, + { + "epoch": 0.6895760598503741, + "grad_norm": 0.48318687057169074, + "learning_rate": 1.1609922631262204e-05, + "loss": 0.8155, + "step": 13826 + }, + { + "epoch": 0.6896259351620948, + "grad_norm": 0.504226021638002, + "learning_rate": 1.1606512454952706e-05, + "loss": 0.837, + "step": 13827 + }, + { + "epoch": 0.6896758104738154, + "grad_norm": 0.411786206168736, + "learning_rate": 1.160310262814028e-05, + "loss": 0.8379, + "step": 13828 + }, + { + "epoch": 0.6897256857855362, + "grad_norm": 0.5605578186795405, + "learning_rate": 1.1599693150913888e-05, + "loss": 0.8528, + "step": 13829 + }, + { + "epoch": 0.6897755610972569, + "grad_norm": 0.5614148486965674, + "learning_rate": 1.1596284023362525e-05, + "loss": 0.8451, + "step": 13830 + }, + { + "epoch": 0.6898254364089775, + "grad_norm": 0.4667185846953839, + "learning_rate": 1.1592875245575111e-05, + "loss": 0.8519, + "step": 13831 + }, + { + "epoch": 0.6898753117206983, + "grad_norm": 0.4965626732470615, + "learning_rate": 1.1589466817640626e-05, + "loss": 0.8571, + "step": 13832 + }, + { + "epoch": 0.689925187032419, + "grad_norm": 0.5422154425108621, + "learning_rate": 1.1586058739647995e-05, + "loss": 0.8635, + "step": 13833 + }, + { + "epoch": 0.6899750623441396, + "grad_norm": 0.5886970524448788, + "learning_rate": 1.1582651011686178e-05, + "loss": 0.8473, + "step": 13834 + }, + { + "epoch": 0.6900249376558604, + "grad_norm": 0.474890716223396, + "learning_rate": 1.1579243633844055e-05, + "loss": 0.8836, + "step": 13835 + }, + { + "epoch": 0.690074812967581, + "grad_norm": 0.538174517596462, + "learning_rate": 1.1575836606210578e-05, + "loss": 0.869, + "step": 13836 + }, + { + "epoch": 0.6901246882793017, + "grad_norm": 0.44097256323940537, + "learning_rate": 1.1572429928874623e-05, + "loss": 0.8664, + "step": 13837 + }, + { + "epoch": 0.6901745635910225, + "grad_norm": 0.45552004736056717, + "learning_rate": 1.156902360192512e-05, + "loss": 0.8556, + "step": 13838 + }, + { + "epoch": 0.6902244389027431, + "grad_norm": 0.41829519845801966, + "learning_rate": 1.1565617625450913e-05, + "loss": 0.8145, + "step": 13839 + }, + { + "epoch": 0.6902743142144638, + "grad_norm": 0.6076500946225752, + "learning_rate": 1.1562211999540917e-05, + "loss": 0.8504, + "step": 13840 + }, + { + "epoch": 0.6903241895261846, + "grad_norm": 0.5201596189803874, + "learning_rate": 1.1558806724283982e-05, + "loss": 0.8911, + "step": 13841 + }, + { + "epoch": 0.6903740648379052, + "grad_norm": 0.5571806818375039, + "learning_rate": 1.1555401799768962e-05, + "loss": 0.8385, + "step": 13842 + }, + { + "epoch": 0.6904239401496259, + "grad_norm": 0.44680194542380824, + "learning_rate": 1.155199722608472e-05, + "loss": 0.8332, + "step": 13843 + }, + { + "epoch": 0.6904738154613467, + "grad_norm": 0.5347072829177777, + "learning_rate": 1.1548593003320095e-05, + "loss": 0.8481, + "step": 13844 + }, + { + "epoch": 0.6905236907730673, + "grad_norm": 0.526384057942058, + "learning_rate": 1.1545189131563916e-05, + "loss": 0.8678, + "step": 13845 + }, + { + "epoch": 0.690573566084788, + "grad_norm": 0.4525263507094316, + "learning_rate": 1.1541785610904996e-05, + "loss": 0.8148, + "step": 13846 + }, + { + "epoch": 0.6906234413965088, + "grad_norm": 0.4668680009864365, + "learning_rate": 1.1538382441432166e-05, + "loss": 0.8531, + "step": 13847 + }, + { + "epoch": 0.6906733167082294, + "grad_norm": 0.6193985967313091, + "learning_rate": 1.1534979623234222e-05, + "loss": 0.8415, + "step": 13848 + }, + { + "epoch": 0.6907231920199501, + "grad_norm": 0.45740242144231824, + "learning_rate": 1.1531577156399958e-05, + "loss": 0.8506, + "step": 13849 + }, + { + "epoch": 0.6907730673316709, + "grad_norm": 0.4703714555411601, + "learning_rate": 1.1528175041018152e-05, + "loss": 0.8573, + "step": 13850 + }, + { + "epoch": 0.6908229426433915, + "grad_norm": 1.0108209372232075, + "learning_rate": 1.1524773277177598e-05, + "loss": 0.8925, + "step": 13851 + }, + { + "epoch": 0.6908728179551122, + "grad_norm": 0.4745651791481091, + "learning_rate": 1.1521371864967054e-05, + "loss": 0.8423, + "step": 13852 + }, + { + "epoch": 0.6909226932668329, + "grad_norm": 0.5101034858625735, + "learning_rate": 1.1517970804475278e-05, + "loss": 0.8487, + "step": 13853 + }, + { + "epoch": 0.6909725685785536, + "grad_norm": 0.4566333448568193, + "learning_rate": 1.1514570095791011e-05, + "loss": 0.8587, + "step": 13854 + }, + { + "epoch": 0.6910224438902743, + "grad_norm": 0.48144715929634735, + "learning_rate": 1.1511169739003013e-05, + "loss": 0.8401, + "step": 13855 + }, + { + "epoch": 0.691072319201995, + "grad_norm": 0.5107318313267915, + "learning_rate": 1.1507769734200002e-05, + "loss": 0.8775, + "step": 13856 + }, + { + "epoch": 0.6911221945137157, + "grad_norm": 0.5048621338714441, + "learning_rate": 1.1504370081470703e-05, + "loss": 0.8706, + "step": 13857 + }, + { + "epoch": 0.6911720698254364, + "grad_norm": 0.5044584024542674, + "learning_rate": 1.150097078090382e-05, + "loss": 0.8779, + "step": 13858 + }, + { + "epoch": 0.6912219451371571, + "grad_norm": 0.5369025195395484, + "learning_rate": 1.1497571832588066e-05, + "loss": 0.8445, + "step": 13859 + }, + { + "epoch": 0.6912718204488778, + "grad_norm": 1.2635701323472819, + "learning_rate": 1.1494173236612139e-05, + "loss": 0.8784, + "step": 13860 + }, + { + "epoch": 0.6913216957605985, + "grad_norm": 0.47266078325310046, + "learning_rate": 1.1490774993064713e-05, + "loss": 0.8819, + "step": 13861 + }, + { + "epoch": 0.6913715710723192, + "grad_norm": 0.507800334041659, + "learning_rate": 1.1487377102034458e-05, + "loss": 0.88, + "step": 13862 + }, + { + "epoch": 0.6914214463840399, + "grad_norm": 1.7354780006342725, + "learning_rate": 1.148397956361007e-05, + "loss": 0.8572, + "step": 13863 + }, + { + "epoch": 0.6914713216957606, + "grad_norm": 0.6133361193888269, + "learning_rate": 1.1480582377880167e-05, + "loss": 0.8775, + "step": 13864 + }, + { + "epoch": 0.6915211970074813, + "grad_norm": 0.5106959088523755, + "learning_rate": 1.1477185544933425e-05, + "loss": 0.8631, + "step": 13865 + }, + { + "epoch": 0.691571072319202, + "grad_norm": 0.4668540151493274, + "learning_rate": 1.1473789064858466e-05, + "loss": 0.8551, + "step": 13866 + }, + { + "epoch": 0.6916209476309227, + "grad_norm": 0.5293572292585002, + "learning_rate": 1.1470392937743948e-05, + "loss": 0.8643, + "step": 13867 + }, + { + "epoch": 0.6916708229426434, + "grad_norm": 0.6241428670686976, + "learning_rate": 1.146699716367845e-05, + "loss": 0.8508, + "step": 13868 + }, + { + "epoch": 0.6917206982543641, + "grad_norm": 0.5720695825647183, + "learning_rate": 1.1463601742750616e-05, + "loss": 0.8366, + "step": 13869 + }, + { + "epoch": 0.6917705735660847, + "grad_norm": 0.5117649217917102, + "learning_rate": 1.1460206675049023e-05, + "loss": 0.8719, + "step": 13870 + }, + { + "epoch": 0.6918204488778055, + "grad_norm": 0.629663024228596, + "learning_rate": 1.14568119606623e-05, + "loss": 0.881, + "step": 13871 + }, + { + "epoch": 0.6918703241895262, + "grad_norm": 0.6763397752721098, + "learning_rate": 1.1453417599678987e-05, + "loss": 0.8398, + "step": 13872 + }, + { + "epoch": 0.6919201995012468, + "grad_norm": 0.5396835665766362, + "learning_rate": 1.145002359218769e-05, + "loss": 0.868, + "step": 13873 + }, + { + "epoch": 0.6919700748129676, + "grad_norm": 0.4983331748269218, + "learning_rate": 1.1446629938276962e-05, + "loss": 0.8574, + "step": 13874 + }, + { + "epoch": 0.6920199501246883, + "grad_norm": 0.6026881292519911, + "learning_rate": 1.144323663803536e-05, + "loss": 0.8804, + "step": 13875 + }, + { + "epoch": 0.6920698254364089, + "grad_norm": 0.4701680927537736, + "learning_rate": 1.1439843691551421e-05, + "loss": 0.906, + "step": 13876 + }, + { + "epoch": 0.6921197007481297, + "grad_norm": 0.7299098937598633, + "learning_rate": 1.1436451098913703e-05, + "loss": 0.817, + "step": 13877 + }, + { + "epoch": 0.6921695760598504, + "grad_norm": 0.579389603624731, + "learning_rate": 1.1433058860210724e-05, + "loss": 0.8519, + "step": 13878 + }, + { + "epoch": 0.692219451371571, + "grad_norm": 0.49423524678759484, + "learning_rate": 1.142966697553099e-05, + "loss": 0.8363, + "step": 13879 + }, + { + "epoch": 0.6922693266832918, + "grad_norm": 0.5217280770579731, + "learning_rate": 1.1426275444963034e-05, + "loss": 0.866, + "step": 13880 + }, + { + "epoch": 0.6923192019950125, + "grad_norm": 0.43895349798666217, + "learning_rate": 1.1422884268595347e-05, + "loss": 0.83, + "step": 13881 + }, + { + "epoch": 0.6923690773067331, + "grad_norm": 0.49491975846442754, + "learning_rate": 1.1419493446516415e-05, + "loss": 0.8747, + "step": 13882 + }, + { + "epoch": 0.6924189526184539, + "grad_norm": 0.47821809705570967, + "learning_rate": 1.1416102978814716e-05, + "loss": 0.8611, + "step": 13883 + }, + { + "epoch": 0.6924688279301746, + "grad_norm": 0.5409751260386116, + "learning_rate": 1.141271286557874e-05, + "loss": 0.8572, + "step": 13884 + }, + { + "epoch": 0.6925187032418952, + "grad_norm": 0.41410716954011295, + "learning_rate": 1.1409323106896941e-05, + "loss": 0.8494, + "step": 13885 + }, + { + "epoch": 0.692568578553616, + "grad_norm": 0.5990252593487733, + "learning_rate": 1.1405933702857774e-05, + "loss": 0.8649, + "step": 13886 + }, + { + "epoch": 0.6926184538653366, + "grad_norm": 0.4530091160827635, + "learning_rate": 1.1402544653549672e-05, + "loss": 0.8587, + "step": 13887 + }, + { + "epoch": 0.6926683291770573, + "grad_norm": 0.6573208642863907, + "learning_rate": 1.1399155959061092e-05, + "loss": 0.8471, + "step": 13888 + }, + { + "epoch": 0.6927182044887781, + "grad_norm": 0.4804405722476809, + "learning_rate": 1.1395767619480451e-05, + "loss": 0.8808, + "step": 13889 + }, + { + "epoch": 0.6927680798004987, + "grad_norm": 0.5408086034502508, + "learning_rate": 1.1392379634896163e-05, + "loss": 0.8666, + "step": 13890 + }, + { + "epoch": 0.6928179551122194, + "grad_norm": 0.4694711986092788, + "learning_rate": 1.1388992005396632e-05, + "loss": 0.8776, + "step": 13891 + }, + { + "epoch": 0.6928678304239402, + "grad_norm": 0.5639496770369836, + "learning_rate": 1.138560473107027e-05, + "loss": 0.8594, + "step": 13892 + }, + { + "epoch": 0.6929177057356608, + "grad_norm": 0.4669778862487659, + "learning_rate": 1.1382217812005464e-05, + "loss": 0.8137, + "step": 13893 + }, + { + "epoch": 0.6929675810473815, + "grad_norm": 0.512892310780117, + "learning_rate": 1.1378831248290587e-05, + "loss": 0.8429, + "step": 13894 + }, + { + "epoch": 0.6930174563591023, + "grad_norm": 0.4563661052788715, + "learning_rate": 1.1375445040014002e-05, + "loss": 0.8284, + "step": 13895 + }, + { + "epoch": 0.6930673316708229, + "grad_norm": 0.6021707459431138, + "learning_rate": 1.1372059187264089e-05, + "loss": 0.8605, + "step": 13896 + }, + { + "epoch": 0.6931172069825436, + "grad_norm": 0.42000710245396233, + "learning_rate": 1.1368673690129197e-05, + "loss": 0.8684, + "step": 13897 + }, + { + "epoch": 0.6931670822942644, + "grad_norm": 0.48077431477242394, + "learning_rate": 1.1365288548697662e-05, + "loss": 0.9093, + "step": 13898 + }, + { + "epoch": 0.693216957605985, + "grad_norm": 0.4507659586195496, + "learning_rate": 1.136190376305781e-05, + "loss": 0.8042, + "step": 13899 + }, + { + "epoch": 0.6932668329177057, + "grad_norm": 0.5258458050613141, + "learning_rate": 1.1358519333297995e-05, + "loss": 0.8848, + "step": 13900 + }, + { + "epoch": 0.6933167082294265, + "grad_norm": 0.4404833554348248, + "learning_rate": 1.1355135259506497e-05, + "loss": 0.8361, + "step": 13901 + }, + { + "epoch": 0.6933665835411471, + "grad_norm": 0.4527278230650271, + "learning_rate": 1.1351751541771644e-05, + "loss": 0.8386, + "step": 13902 + }, + { + "epoch": 0.6934164588528678, + "grad_norm": 0.4655446275951644, + "learning_rate": 1.1348368180181718e-05, + "loss": 0.8335, + "step": 13903 + }, + { + "epoch": 0.6934663341645886, + "grad_norm": 1.0667460127816673, + "learning_rate": 1.1344985174825032e-05, + "loss": 0.8384, + "step": 13904 + }, + { + "epoch": 0.6935162094763092, + "grad_norm": 0.4329253469211723, + "learning_rate": 1.1341602525789829e-05, + "loss": 0.8496, + "step": 13905 + }, + { + "epoch": 0.69356608478803, + "grad_norm": 0.6018261957186328, + "learning_rate": 1.1338220233164403e-05, + "loss": 0.8658, + "step": 13906 + }, + { + "epoch": 0.6936159600997506, + "grad_norm": 0.5552422335861652, + "learning_rate": 1.1334838297037002e-05, + "loss": 0.8794, + "step": 13907 + }, + { + "epoch": 0.6936658354114713, + "grad_norm": 0.5231910975102005, + "learning_rate": 1.1331456717495891e-05, + "loss": 0.8434, + "step": 13908 + }, + { + "epoch": 0.693715710723192, + "grad_norm": 0.45777528724038113, + "learning_rate": 1.1328075494629286e-05, + "loss": 0.8581, + "step": 13909 + }, + { + "epoch": 0.6937655860349127, + "grad_norm": 0.48822394994816903, + "learning_rate": 1.1324694628525442e-05, + "loss": 0.8191, + "step": 13910 + }, + { + "epoch": 0.6938154613466334, + "grad_norm": 0.494071512100656, + "learning_rate": 1.1321314119272569e-05, + "loss": 0.8832, + "step": 13911 + }, + { + "epoch": 0.6938653366583541, + "grad_norm": 0.485181175566916, + "learning_rate": 1.1317933966958883e-05, + "loss": 0.8623, + "step": 13912 + }, + { + "epoch": 0.6939152119700748, + "grad_norm": 0.4100828680496919, + "learning_rate": 1.1314554171672578e-05, + "loss": 0.8454, + "step": 13913 + }, + { + "epoch": 0.6939650872817955, + "grad_norm": 0.5360297658513242, + "learning_rate": 1.1311174733501867e-05, + "loss": 0.8581, + "step": 13914 + }, + { + "epoch": 0.6940149625935162, + "grad_norm": 0.4959577961360693, + "learning_rate": 1.1307795652534923e-05, + "loss": 0.8404, + "step": 13915 + }, + { + "epoch": 0.6940648379052369, + "grad_norm": 0.4771741644402666, + "learning_rate": 1.1304416928859927e-05, + "loss": 0.8647, + "step": 13916 + }, + { + "epoch": 0.6941147132169576, + "grad_norm": 0.4734582180580261, + "learning_rate": 1.1301038562565028e-05, + "loss": 0.8098, + "step": 13917 + }, + { + "epoch": 0.6941645885286784, + "grad_norm": 0.4548906833055611, + "learning_rate": 1.129766055373841e-05, + "loss": 0.8613, + "step": 13918 + }, + { + "epoch": 0.694214463840399, + "grad_norm": 0.4999825038258723, + "learning_rate": 1.1294282902468203e-05, + "loss": 0.8495, + "step": 13919 + }, + { + "epoch": 0.6942643391521197, + "grad_norm": 0.5882010063698067, + "learning_rate": 1.1290905608842545e-05, + "loss": 0.8535, + "step": 13920 + }, + { + "epoch": 0.6943142144638405, + "grad_norm": 1.611328882277994, + "learning_rate": 1.1287528672949577e-05, + "loss": 0.862, + "step": 13921 + }, + { + "epoch": 0.6943640897755611, + "grad_norm": 0.5166398303533796, + "learning_rate": 1.1284152094877412e-05, + "loss": 0.8508, + "step": 13922 + }, + { + "epoch": 0.6944139650872818, + "grad_norm": 0.625800802477529, + "learning_rate": 1.128077587471416e-05, + "loss": 0.8154, + "step": 13923 + }, + { + "epoch": 0.6944638403990024, + "grad_norm": 0.5530095032140256, + "learning_rate": 1.1277400012547912e-05, + "loss": 0.8561, + "step": 13924 + }, + { + "epoch": 0.6945137157107232, + "grad_norm": 0.6426476203793138, + "learning_rate": 1.127402450846678e-05, + "loss": 0.84, + "step": 13925 + }, + { + "epoch": 0.6945635910224439, + "grad_norm": 0.42465169913248335, + "learning_rate": 1.1270649362558832e-05, + "loss": 0.8398, + "step": 13926 + }, + { + "epoch": 0.6946134663341645, + "grad_norm": 0.47737863486014204, + "learning_rate": 1.1267274574912146e-05, + "loss": 0.8705, + "step": 13927 + }, + { + "epoch": 0.6946633416458853, + "grad_norm": 0.47454566747359084, + "learning_rate": 1.1263900145614775e-05, + "loss": 0.8506, + "step": 13928 + }, + { + "epoch": 0.694713216957606, + "grad_norm": 0.4924162526611343, + "learning_rate": 1.1260526074754792e-05, + "loss": 0.8711, + "step": 13929 + }, + { + "epoch": 0.6947630922693266, + "grad_norm": 0.49266981069456095, + "learning_rate": 1.1257152362420231e-05, + "loss": 0.8768, + "step": 13930 + }, + { + "epoch": 0.6948129675810474, + "grad_norm": 0.41277549430839194, + "learning_rate": 1.1253779008699131e-05, + "loss": 0.8932, + "step": 13931 + }, + { + "epoch": 0.6948628428927681, + "grad_norm": 0.5132793432773721, + "learning_rate": 1.1250406013679504e-05, + "loss": 0.9056, + "step": 13932 + }, + { + "epoch": 0.6949127182044887, + "grad_norm": 0.5045240065216539, + "learning_rate": 1.1247033377449399e-05, + "loss": 0.8316, + "step": 13933 + }, + { + "epoch": 0.6949625935162095, + "grad_norm": 0.6479894247053017, + "learning_rate": 1.124366110009678e-05, + "loss": 0.821, + "step": 13934 + }, + { + "epoch": 0.6950124688279302, + "grad_norm": 0.5036355045486106, + "learning_rate": 1.1240289181709682e-05, + "loss": 0.8435, + "step": 13935 + }, + { + "epoch": 0.6950623441396508, + "grad_norm": 0.49462039589828755, + "learning_rate": 1.123691762237607e-05, + "loss": 0.8739, + "step": 13936 + }, + { + "epoch": 0.6951122194513716, + "grad_norm": 0.5154087965822571, + "learning_rate": 1.123354642218395e-05, + "loss": 0.8539, + "step": 13937 + }, + { + "epoch": 0.6951620947630923, + "grad_norm": 0.4502107547047096, + "learning_rate": 1.1230175581221258e-05, + "loss": 0.8078, + "step": 13938 + }, + { + "epoch": 0.695211970074813, + "grad_norm": 0.5418478601597554, + "learning_rate": 1.122680509957598e-05, + "loss": 0.8813, + "step": 13939 + }, + { + "epoch": 0.6952618453865337, + "grad_norm": 0.534983906063723, + "learning_rate": 1.122343497733605e-05, + "loss": 0.8794, + "step": 13940 + }, + { + "epoch": 0.6953117206982543, + "grad_norm": 0.6280085862050371, + "learning_rate": 1.1220065214589436e-05, + "loss": 0.8262, + "step": 13941 + }, + { + "epoch": 0.695361596009975, + "grad_norm": 0.8500570991361946, + "learning_rate": 1.1216695811424036e-05, + "loss": 0.8557, + "step": 13942 + }, + { + "epoch": 0.6954114713216958, + "grad_norm": 0.5853375267254476, + "learning_rate": 1.1213326767927798e-05, + "loss": 0.8653, + "step": 13943 + }, + { + "epoch": 0.6954613466334164, + "grad_norm": 1.0086227888754618, + "learning_rate": 1.1209958084188629e-05, + "loss": 0.8744, + "step": 13944 + }, + { + "epoch": 0.6955112219451371, + "grad_norm": 0.5028130284308053, + "learning_rate": 1.1206589760294434e-05, + "loss": 0.8439, + "step": 13945 + }, + { + "epoch": 0.6955610972568579, + "grad_norm": 0.4165096395695323, + "learning_rate": 1.1203221796333094e-05, + "loss": 0.8105, + "step": 13946 + }, + { + "epoch": 0.6956109725685785, + "grad_norm": 0.45398973700072737, + "learning_rate": 1.1199854192392518e-05, + "loss": 0.8421, + "step": 13947 + }, + { + "epoch": 0.6956608478802992, + "grad_norm": 0.47520913177611473, + "learning_rate": 1.1196486948560566e-05, + "loss": 0.8385, + "step": 13948 + }, + { + "epoch": 0.69571072319202, + "grad_norm": 0.49931358685585003, + "learning_rate": 1.1193120064925114e-05, + "loss": 0.8687, + "step": 13949 + }, + { + "epoch": 0.6957605985037406, + "grad_norm": 0.5710918871025485, + "learning_rate": 1.1189753541574005e-05, + "loss": 0.8325, + "step": 13950 + }, + { + "epoch": 0.6958104738154614, + "grad_norm": 0.39942090723182005, + "learning_rate": 1.1186387378595106e-05, + "loss": 0.8192, + "step": 13951 + }, + { + "epoch": 0.6958603491271821, + "grad_norm": 0.4866048212869401, + "learning_rate": 1.1183021576076244e-05, + "loss": 0.8369, + "step": 13952 + }, + { + "epoch": 0.6959102244389027, + "grad_norm": 0.48442117736876456, + "learning_rate": 1.1179656134105253e-05, + "loss": 0.8627, + "step": 13953 + }, + { + "epoch": 0.6959600997506235, + "grad_norm": 0.5852719046239492, + "learning_rate": 1.1176291052769938e-05, + "loss": 0.8597, + "step": 13954 + }, + { + "epoch": 0.6960099750623442, + "grad_norm": 0.5403681517811546, + "learning_rate": 1.1172926332158135e-05, + "loss": 0.8246, + "step": 13955 + }, + { + "epoch": 0.6960598503740648, + "grad_norm": 0.48578234953193156, + "learning_rate": 1.1169561972357629e-05, + "loss": 0.8591, + "step": 13956 + }, + { + "epoch": 0.6961097256857856, + "grad_norm": 0.5777237128245308, + "learning_rate": 1.1166197973456203e-05, + "loss": 0.8721, + "step": 13957 + }, + { + "epoch": 0.6961596009975063, + "grad_norm": 0.559575973433588, + "learning_rate": 1.1162834335541659e-05, + "loss": 0.854, + "step": 13958 + }, + { + "epoch": 0.6962094763092269, + "grad_norm": 0.5892144288926323, + "learning_rate": 1.115947105870176e-05, + "loss": 0.8485, + "step": 13959 + }, + { + "epoch": 0.6962593516209477, + "grad_norm": 0.45946745265726946, + "learning_rate": 1.1156108143024271e-05, + "loss": 0.8184, + "step": 13960 + }, + { + "epoch": 0.6963092269326683, + "grad_norm": 6.405287227418003, + "learning_rate": 1.1152745588596933e-05, + "loss": 0.8511, + "step": 13961 + }, + { + "epoch": 0.696359102244389, + "grad_norm": 0.531346755797891, + "learning_rate": 1.1149383395507512e-05, + "loss": 0.8558, + "step": 13962 + }, + { + "epoch": 0.6964089775561098, + "grad_norm": 0.610375906010184, + "learning_rate": 1.1146021563843733e-05, + "loss": 0.8486, + "step": 13963 + }, + { + "epoch": 0.6964588528678304, + "grad_norm": 0.461332652335911, + "learning_rate": 1.1142660093693321e-05, + "loss": 0.8784, + "step": 13964 + }, + { + "epoch": 0.6965087281795511, + "grad_norm": 0.5316326252562269, + "learning_rate": 1.1139298985143983e-05, + "loss": 0.8681, + "step": 13965 + }, + { + "epoch": 0.6965586034912719, + "grad_norm": 0.46819558307782805, + "learning_rate": 1.1135938238283445e-05, + "loss": 0.8495, + "step": 13966 + }, + { + "epoch": 0.6966084788029925, + "grad_norm": 0.489551185710051, + "learning_rate": 1.1132577853199394e-05, + "loss": 0.8333, + "step": 13967 + }, + { + "epoch": 0.6966583541147132, + "grad_norm": 0.4812475420501791, + "learning_rate": 1.1129217829979516e-05, + "loss": 0.862, + "step": 13968 + }, + { + "epoch": 0.696708229426434, + "grad_norm": 0.6940858955875455, + "learning_rate": 1.1125858168711482e-05, + "loss": 0.8315, + "step": 13969 + }, + { + "epoch": 0.6967581047381546, + "grad_norm": 0.46202238997046746, + "learning_rate": 1.1122498869482988e-05, + "loss": 0.8124, + "step": 13970 + }, + { + "epoch": 0.6968079800498753, + "grad_norm": 0.4619872500108887, + "learning_rate": 1.1119139932381659e-05, + "loss": 0.7979, + "step": 13971 + }, + { + "epoch": 0.6968578553615961, + "grad_norm": 0.5531220589483338, + "learning_rate": 1.1115781357495167e-05, + "loss": 0.8628, + "step": 13972 + }, + { + "epoch": 0.6969077306733167, + "grad_norm": 0.4253508566689463, + "learning_rate": 1.1112423144911139e-05, + "loss": 0.8668, + "step": 13973 + }, + { + "epoch": 0.6969576059850374, + "grad_norm": 0.5162339886639092, + "learning_rate": 1.1109065294717233e-05, + "loss": 0.8768, + "step": 13974 + }, + { + "epoch": 0.6970074812967582, + "grad_norm": 0.6122014083932108, + "learning_rate": 1.1105707807001032e-05, + "loss": 0.8348, + "step": 13975 + }, + { + "epoch": 0.6970573566084788, + "grad_norm": 0.46461583926360056, + "learning_rate": 1.1102350681850176e-05, + "loss": 0.8567, + "step": 13976 + }, + { + "epoch": 0.6971072319201995, + "grad_norm": 0.41831702887361105, + "learning_rate": 1.1098993919352252e-05, + "loss": 0.8206, + "step": 13977 + }, + { + "epoch": 0.6971571072319201, + "grad_norm": 0.5116766031301168, + "learning_rate": 1.109563751959488e-05, + "loss": 0.888, + "step": 13978 + }, + { + "epoch": 0.6972069825436409, + "grad_norm": 0.4464436465647257, + "learning_rate": 1.1092281482665601e-05, + "loss": 0.8653, + "step": 13979 + }, + { + "epoch": 0.6972568578553616, + "grad_norm": 0.5072106645320973, + "learning_rate": 1.1088925808652026e-05, + "loss": 0.8446, + "step": 13980 + }, + { + "epoch": 0.6973067331670822, + "grad_norm": 0.4493304956725226, + "learning_rate": 1.1085570497641703e-05, + "loss": 0.8202, + "step": 13981 + }, + { + "epoch": 0.697356608478803, + "grad_norm": 0.5301316746602426, + "learning_rate": 1.1082215549722192e-05, + "loss": 0.8891, + "step": 13982 + }, + { + "epoch": 0.6974064837905237, + "grad_norm": 1.1682441610350467, + "learning_rate": 1.1078860964981027e-05, + "loss": 0.8584, + "step": 13983 + }, + { + "epoch": 0.6974563591022443, + "grad_norm": 0.9328352413370002, + "learning_rate": 1.1075506743505767e-05, + "loss": 0.8736, + "step": 13984 + }, + { + "epoch": 0.6975062344139651, + "grad_norm": 0.5012339919963651, + "learning_rate": 1.1072152885383923e-05, + "loss": 0.8377, + "step": 13985 + }, + { + "epoch": 0.6975561097256858, + "grad_norm": 0.4756041387471163, + "learning_rate": 1.1068799390703019e-05, + "loss": 0.8948, + "step": 13986 + }, + { + "epoch": 0.6976059850374065, + "grad_norm": 0.6524638398728254, + "learning_rate": 1.1065446259550551e-05, + "loss": 0.8555, + "step": 13987 + }, + { + "epoch": 0.6976558603491272, + "grad_norm": 0.44923600104769856, + "learning_rate": 1.1062093492014035e-05, + "loss": 0.8535, + "step": 13988 + }, + { + "epoch": 0.6977057356608479, + "grad_norm": 0.43795552348037764, + "learning_rate": 1.1058741088180951e-05, + "loss": 0.867, + "step": 13989 + }, + { + "epoch": 0.6977556109725686, + "grad_norm": 0.42652191183537996, + "learning_rate": 1.105538904813878e-05, + "loss": 0.862, + "step": 13990 + }, + { + "epoch": 0.6978054862842893, + "grad_norm": 0.6469806800340215, + "learning_rate": 1.1052037371974983e-05, + "loss": 0.8486, + "step": 13991 + }, + { + "epoch": 0.69785536159601, + "grad_norm": 0.49518406494734235, + "learning_rate": 1.1048686059777033e-05, + "loss": 0.8565, + "step": 13992 + }, + { + "epoch": 0.6979052369077307, + "grad_norm": 0.4915371879451343, + "learning_rate": 1.104533511163238e-05, + "loss": 0.8664, + "step": 13993 + }, + { + "epoch": 0.6979551122194514, + "grad_norm": 0.6704915097000345, + "learning_rate": 1.1041984527628452e-05, + "loss": 0.8694, + "step": 13994 + }, + { + "epoch": 0.698004987531172, + "grad_norm": 0.5389914567190414, + "learning_rate": 1.1038634307852703e-05, + "loss": 0.839, + "step": 13995 + }, + { + "epoch": 0.6980548628428928, + "grad_norm": 0.5115268702864262, + "learning_rate": 1.1035284452392539e-05, + "loss": 0.8615, + "step": 13996 + }, + { + "epoch": 0.6981047381546135, + "grad_norm": 5.201437213484018, + "learning_rate": 1.103193496133538e-05, + "loss": 0.8984, + "step": 13997 + }, + { + "epoch": 0.6981546134663341, + "grad_norm": 0.48157630003414215, + "learning_rate": 1.1028585834768615e-05, + "loss": 0.8714, + "step": 13998 + }, + { + "epoch": 0.6982044887780549, + "grad_norm": 0.5109567368180589, + "learning_rate": 1.1025237072779663e-05, + "loss": 0.8925, + "step": 13999 + }, + { + "epoch": 0.6982543640897756, + "grad_norm": 0.44121313016441194, + "learning_rate": 1.1021888675455891e-05, + "loss": 0.8524, + "step": 14000 + }, + { + "epoch": 0.6983042394014962, + "grad_norm": 0.6705149624287773, + "learning_rate": 1.101854064288468e-05, + "loss": 0.8634, + "step": 14001 + }, + { + "epoch": 0.698354114713217, + "grad_norm": 0.46179734103298586, + "learning_rate": 1.1015192975153388e-05, + "loss": 0.8944, + "step": 14002 + }, + { + "epoch": 0.6984039900249377, + "grad_norm": 0.6064561760083654, + "learning_rate": 1.101184567234938e-05, + "loss": 0.8584, + "step": 14003 + }, + { + "epoch": 0.6984538653366583, + "grad_norm": 0.46809511869922316, + "learning_rate": 1.1008498734560002e-05, + "loss": 0.8213, + "step": 14004 + }, + { + "epoch": 0.6985037406483791, + "grad_norm": 0.47619317805126826, + "learning_rate": 1.1005152161872587e-05, + "loss": 0.8408, + "step": 14005 + }, + { + "epoch": 0.6985536159600998, + "grad_norm": 0.4948814530118968, + "learning_rate": 1.1001805954374455e-05, + "loss": 0.8519, + "step": 14006 + }, + { + "epoch": 0.6986034912718204, + "grad_norm": 0.406372772837798, + "learning_rate": 1.099846011215295e-05, + "loss": 0.8187, + "step": 14007 + }, + { + "epoch": 0.6986533665835412, + "grad_norm": 0.5329319677404951, + "learning_rate": 1.0995114635295342e-05, + "loss": 0.8542, + "step": 14008 + }, + { + "epoch": 0.6987032418952619, + "grad_norm": 0.5060931296652226, + "learning_rate": 1.0991769523888962e-05, + "loss": 0.8386, + "step": 14009 + }, + { + "epoch": 0.6987531172069825, + "grad_norm": 0.5721810903558487, + "learning_rate": 1.0988424778021075e-05, + "loss": 0.8755, + "step": 14010 + }, + { + "epoch": 0.6988029925187033, + "grad_norm": 0.43728119264071474, + "learning_rate": 1.098508039777899e-05, + "loss": 0.8598, + "step": 14011 + }, + { + "epoch": 0.6988528678304239, + "grad_norm": 0.42020356845778484, + "learning_rate": 1.0981736383249946e-05, + "loss": 0.904, + "step": 14012 + }, + { + "epoch": 0.6989027431421446, + "grad_norm": 0.44220515789071946, + "learning_rate": 1.0978392734521223e-05, + "loss": 0.8375, + "step": 14013 + }, + { + "epoch": 0.6989526184538654, + "grad_norm": 0.4504773224953598, + "learning_rate": 1.0975049451680058e-05, + "loss": 0.8574, + "step": 14014 + }, + { + "epoch": 0.699002493765586, + "grad_norm": 0.6004283453051137, + "learning_rate": 1.097170653481372e-05, + "loss": 0.835, + "step": 14015 + }, + { + "epoch": 0.6990523690773067, + "grad_norm": 0.5114376577561566, + "learning_rate": 1.0968363984009403e-05, + "loss": 0.8755, + "step": 14016 + }, + { + "epoch": 0.6991022443890275, + "grad_norm": 0.4698019835571939, + "learning_rate": 1.0965021799354355e-05, + "loss": 0.8287, + "step": 14017 + }, + { + "epoch": 0.6991521197007481, + "grad_norm": 0.4984496204091435, + "learning_rate": 1.0961679980935784e-05, + "loss": 0.809, + "step": 14018 + }, + { + "epoch": 0.6992019950124688, + "grad_norm": 0.43332842341577915, + "learning_rate": 1.0958338528840893e-05, + "loss": 0.8337, + "step": 14019 + }, + { + "epoch": 0.6992518703241896, + "grad_norm": 0.5193077328668636, + "learning_rate": 1.0954997443156861e-05, + "loss": 0.866, + "step": 14020 + }, + { + "epoch": 0.6993017456359102, + "grad_norm": 0.4306432209589939, + "learning_rate": 1.0951656723970897e-05, + "loss": 0.8128, + "step": 14021 + }, + { + "epoch": 0.6993516209476309, + "grad_norm": 0.5315808808449453, + "learning_rate": 1.094831637137016e-05, + "loss": 0.8541, + "step": 14022 + }, + { + "epoch": 0.6994014962593517, + "grad_norm": 0.521027163784088, + "learning_rate": 1.0944976385441821e-05, + "loss": 0.815, + "step": 14023 + }, + { + "epoch": 0.6994513715710723, + "grad_norm": 1.0716358422672483, + "learning_rate": 1.0941636766273025e-05, + "loss": 0.8347, + "step": 14024 + }, + { + "epoch": 0.699501246882793, + "grad_norm": 0.42271130987103145, + "learning_rate": 1.0938297513950934e-05, + "loss": 0.841, + "step": 14025 + }, + { + "epoch": 0.6995511221945138, + "grad_norm": 0.43209292754318007, + "learning_rate": 1.0934958628562675e-05, + "loss": 0.8312, + "step": 14026 + }, + { + "epoch": 0.6996009975062344, + "grad_norm": 0.40525823397301275, + "learning_rate": 1.0931620110195378e-05, + "loss": 0.8599, + "step": 14027 + }, + { + "epoch": 0.6996508728179551, + "grad_norm": 0.4982014580715873, + "learning_rate": 1.0928281958936148e-05, + "loss": 0.8643, + "step": 14028 + }, + { + "epoch": 0.6997007481296759, + "grad_norm": 0.47137012564802905, + "learning_rate": 1.0924944174872112e-05, + "loss": 0.9043, + "step": 14029 + }, + { + "epoch": 0.6997506234413965, + "grad_norm": 0.45512610035221057, + "learning_rate": 1.0921606758090358e-05, + "loss": 0.8033, + "step": 14030 + }, + { + "epoch": 0.6998004987531172, + "grad_norm": 0.46875535192356366, + "learning_rate": 1.0918269708677964e-05, + "loss": 0.8495, + "step": 14031 + }, + { + "epoch": 0.6998503740648379, + "grad_norm": 0.42262525075315716, + "learning_rate": 1.0914933026722033e-05, + "loss": 0.8461, + "step": 14032 + }, + { + "epoch": 0.6999002493765586, + "grad_norm": 0.5333997811953091, + "learning_rate": 1.0911596712309619e-05, + "loss": 0.881, + "step": 14033 + }, + { + "epoch": 0.6999501246882793, + "grad_norm": 0.45269009160530316, + "learning_rate": 1.0908260765527784e-05, + "loss": 0.8463, + "step": 14034 + }, + { + "epoch": 0.7, + "grad_norm": 0.5125163793566834, + "learning_rate": 1.0904925186463566e-05, + "loss": 0.8881, + "step": 14035 + }, + { + "epoch": 0.7000498753117207, + "grad_norm": 0.4569341999165717, + "learning_rate": 1.0901589975204029e-05, + "loss": 0.8177, + "step": 14036 + }, + { + "epoch": 0.7000997506234414, + "grad_norm": 0.5244069751248547, + "learning_rate": 1.0898255131836191e-05, + "loss": 0.8867, + "step": 14037 + }, + { + "epoch": 0.7001496259351621, + "grad_norm": 0.6867526313124984, + "learning_rate": 1.0894920656447074e-05, + "loss": 0.8719, + "step": 14038 + }, + { + "epoch": 0.7001995012468828, + "grad_norm": 0.5696676185282257, + "learning_rate": 1.0891586549123683e-05, + "loss": 0.8726, + "step": 14039 + }, + { + "epoch": 0.7002493765586035, + "grad_norm": 0.42532028826614854, + "learning_rate": 1.0888252809953042e-05, + "loss": 0.8071, + "step": 14040 + }, + { + "epoch": 0.7002992518703242, + "grad_norm": 0.5164690095670397, + "learning_rate": 1.0884919439022111e-05, + "loss": 0.8175, + "step": 14041 + }, + { + "epoch": 0.7003491271820449, + "grad_norm": 0.4354118200514532, + "learning_rate": 1.0881586436417901e-05, + "loss": 0.8821, + "step": 14042 + }, + { + "epoch": 0.7003990024937656, + "grad_norm": 0.44182197124022393, + "learning_rate": 1.0878253802227364e-05, + "loss": 0.8826, + "step": 14043 + }, + { + "epoch": 0.7004488778054863, + "grad_norm": 0.473637647395506, + "learning_rate": 1.0874921536537492e-05, + "loss": 0.8192, + "step": 14044 + }, + { + "epoch": 0.700498753117207, + "grad_norm": 0.4894951271635649, + "learning_rate": 1.0871589639435203e-05, + "loss": 0.8223, + "step": 14045 + }, + { + "epoch": 0.7005486284289277, + "grad_norm": 0.4484022849231904, + "learning_rate": 1.0868258111007468e-05, + "loss": 0.8572, + "step": 14046 + }, + { + "epoch": 0.7005985037406484, + "grad_norm": 0.4385892224370274, + "learning_rate": 1.0864926951341207e-05, + "loss": 0.8781, + "step": 14047 + }, + { + "epoch": 0.7006483790523691, + "grad_norm": 0.4778970596426292, + "learning_rate": 1.0861596160523366e-05, + "loss": 0.8723, + "step": 14048 + }, + { + "epoch": 0.7006982543640897, + "grad_norm": 0.5223400017868504, + "learning_rate": 1.0858265738640828e-05, + "loss": 0.9014, + "step": 14049 + }, + { + "epoch": 0.7007481296758105, + "grad_norm": 0.4148436941198469, + "learning_rate": 1.0854935685780526e-05, + "loss": 0.8399, + "step": 14050 + }, + { + "epoch": 0.7007980049875312, + "grad_norm": 0.5143728845368561, + "learning_rate": 1.0851606002029346e-05, + "loss": 0.9118, + "step": 14051 + }, + { + "epoch": 0.7008478802992518, + "grad_norm": 0.5088552918689695, + "learning_rate": 1.0848276687474176e-05, + "loss": 0.8411, + "step": 14052 + }, + { + "epoch": 0.7008977556109726, + "grad_norm": 0.6630943622881342, + "learning_rate": 1.0844947742201883e-05, + "loss": 0.8909, + "step": 14053 + }, + { + "epoch": 0.7009476309226933, + "grad_norm": 0.4752804291126297, + "learning_rate": 1.0841619166299352e-05, + "loss": 0.8751, + "step": 14054 + }, + { + "epoch": 0.7009975062344139, + "grad_norm": 0.5842542377285673, + "learning_rate": 1.0838290959853433e-05, + "loss": 0.8551, + "step": 14055 + }, + { + "epoch": 0.7010473815461347, + "grad_norm": 0.5276723910643903, + "learning_rate": 1.0834963122950975e-05, + "loss": 0.8388, + "step": 14056 + }, + { + "epoch": 0.7010972568578554, + "grad_norm": 0.5184150202401382, + "learning_rate": 1.0831635655678805e-05, + "loss": 0.8623, + "step": 14057 + }, + { + "epoch": 0.701147132169576, + "grad_norm": 0.500987142365611, + "learning_rate": 1.0828308558123771e-05, + "loss": 0.8661, + "step": 14058 + }, + { + "epoch": 0.7011970074812968, + "grad_norm": 0.6369970555324062, + "learning_rate": 1.0824981830372682e-05, + "loss": 0.8559, + "step": 14059 + }, + { + "epoch": 0.7012468827930175, + "grad_norm": 0.48803230026560723, + "learning_rate": 1.082165547251235e-05, + "loss": 0.8896, + "step": 14060 + }, + { + "epoch": 0.7012967581047381, + "grad_norm": 0.5561931434264072, + "learning_rate": 1.081832948462956e-05, + "loss": 0.8123, + "step": 14061 + }, + { + "epoch": 0.7013466334164589, + "grad_norm": 0.567826311522307, + "learning_rate": 1.0815003866811126e-05, + "loss": 0.8429, + "step": 14062 + }, + { + "epoch": 0.7013965087281796, + "grad_norm": 0.46407786835777326, + "learning_rate": 1.081167861914382e-05, + "loss": 0.8638, + "step": 14063 + }, + { + "epoch": 0.7014463840399002, + "grad_norm": 0.6473710332649752, + "learning_rate": 1.0808353741714408e-05, + "loss": 0.8451, + "step": 14064 + }, + { + "epoch": 0.701496259351621, + "grad_norm": 0.5729824269730002, + "learning_rate": 1.0805029234609646e-05, + "loss": 0.8787, + "step": 14065 + }, + { + "epoch": 0.7015461346633416, + "grad_norm": 0.5455187433887709, + "learning_rate": 1.0801705097916301e-05, + "loss": 0.8415, + "step": 14066 + }, + { + "epoch": 0.7015960099750623, + "grad_norm": 0.7176428292784152, + "learning_rate": 1.0798381331721109e-05, + "loss": 0.8784, + "step": 14067 + }, + { + "epoch": 0.7016458852867831, + "grad_norm": 0.5027268870615569, + "learning_rate": 1.07950579361108e-05, + "loss": 0.8007, + "step": 14068 + }, + { + "epoch": 0.7016957605985037, + "grad_norm": 0.9247598025229555, + "learning_rate": 1.0791734911172086e-05, + "loss": 0.8566, + "step": 14069 + }, + { + "epoch": 0.7017456359102244, + "grad_norm": 0.5360846742088945, + "learning_rate": 1.0788412256991701e-05, + "loss": 0.8281, + "step": 14070 + }, + { + "epoch": 0.7017955112219452, + "grad_norm": 0.53863313697951, + "learning_rate": 1.0785089973656337e-05, + "loss": 0.905, + "step": 14071 + }, + { + "epoch": 0.7018453865336658, + "grad_norm": 0.43194202459773007, + "learning_rate": 1.0781768061252681e-05, + "loss": 0.8183, + "step": 14072 + }, + { + "epoch": 0.7018952618453865, + "grad_norm": 0.4778621782958714, + "learning_rate": 1.0778446519867433e-05, + "loss": 0.8517, + "step": 14073 + }, + { + "epoch": 0.7019451371571073, + "grad_norm": 0.44298909063331565, + "learning_rate": 1.0775125349587256e-05, + "loss": 0.8182, + "step": 14074 + }, + { + "epoch": 0.7019950124688279, + "grad_norm": 0.5555388538878346, + "learning_rate": 1.077180455049882e-05, + "loss": 0.8509, + "step": 14075 + }, + { + "epoch": 0.7020448877805486, + "grad_norm": 0.444423312884376, + "learning_rate": 1.0768484122688765e-05, + "loss": 0.8102, + "step": 14076 + }, + { + "epoch": 0.7020947630922694, + "grad_norm": 0.6045089202039151, + "learning_rate": 1.0765164066243768e-05, + "loss": 0.8012, + "step": 14077 + }, + { + "epoch": 0.70214463840399, + "grad_norm": 0.5858976396091956, + "learning_rate": 1.0761844381250424e-05, + "loss": 0.9076, + "step": 14078 + }, + { + "epoch": 0.7021945137157107, + "grad_norm": 0.4118565120314158, + "learning_rate": 1.075852506779539e-05, + "loss": 0.8034, + "step": 14079 + }, + { + "epoch": 0.7022443890274315, + "grad_norm": 0.5735536154707815, + "learning_rate": 1.075520612596526e-05, + "loss": 0.8198, + "step": 14080 + }, + { + "epoch": 0.7022942643391521, + "grad_norm": 0.5154973932928785, + "learning_rate": 1.0751887555846668e-05, + "loss": 0.8854, + "step": 14081 + }, + { + "epoch": 0.7023441396508728, + "grad_norm": 0.5565735637896518, + "learning_rate": 1.0748569357526177e-05, + "loss": 0.8976, + "step": 14082 + }, + { + "epoch": 0.7023940149625936, + "grad_norm": 0.5445210579146671, + "learning_rate": 1.07452515310904e-05, + "loss": 0.847, + "step": 14083 + }, + { + "epoch": 0.7024438902743142, + "grad_norm": 0.5213348878289837, + "learning_rate": 1.0741934076625895e-05, + "loss": 0.8278, + "step": 14084 + }, + { + "epoch": 0.7024937655860349, + "grad_norm": 0.6186329420256761, + "learning_rate": 1.0738616994219255e-05, + "loss": 0.8568, + "step": 14085 + }, + { + "epoch": 0.7025436408977556, + "grad_norm": 0.5998871102160707, + "learning_rate": 1.0735300283957008e-05, + "loss": 0.8334, + "step": 14086 + }, + { + "epoch": 0.7025935162094763, + "grad_norm": 0.5455504334042343, + "learning_rate": 1.0731983945925723e-05, + "loss": 0.8489, + "step": 14087 + }, + { + "epoch": 0.702643391521197, + "grad_norm": 0.4072714837416943, + "learning_rate": 1.0728667980211932e-05, + "loss": 0.8785, + "step": 14088 + }, + { + "epoch": 0.7026932668329177, + "grad_norm": 0.5063200105739459, + "learning_rate": 1.0725352386902163e-05, + "loss": 0.8403, + "step": 14089 + }, + { + "epoch": 0.7027431421446384, + "grad_norm": 0.5408179318346519, + "learning_rate": 1.0722037166082923e-05, + "loss": 0.7986, + "step": 14090 + }, + { + "epoch": 0.7027930174563591, + "grad_norm": 0.8739074342272851, + "learning_rate": 1.0718722317840746e-05, + "loss": 0.8717, + "step": 14091 + }, + { + "epoch": 0.7028428927680798, + "grad_norm": 0.5917509203748663, + "learning_rate": 1.0715407842262117e-05, + "loss": 0.8389, + "step": 14092 + }, + { + "epoch": 0.7028927680798005, + "grad_norm": 0.443933768318568, + "learning_rate": 1.071209373943353e-05, + "loss": 0.8446, + "step": 14093 + }, + { + "epoch": 0.7029426433915212, + "grad_norm": 0.5746531071882213, + "learning_rate": 1.070878000944145e-05, + "loss": 0.8421, + "step": 14094 + }, + { + "epoch": 0.7029925187032419, + "grad_norm": 0.6403898761453877, + "learning_rate": 1.070546665237237e-05, + "loss": 0.8569, + "step": 14095 + }, + { + "epoch": 0.7030423940149626, + "grad_norm": 0.5177711942470061, + "learning_rate": 1.070215366831274e-05, + "loss": 0.8382, + "step": 14096 + }, + { + "epoch": 0.7030922693266833, + "grad_norm": 0.45842605228695826, + "learning_rate": 1.0698841057349013e-05, + "loss": 0.8541, + "step": 14097 + }, + { + "epoch": 0.703142144638404, + "grad_norm": 0.8305914196996926, + "learning_rate": 1.0695528819567615e-05, + "loss": 0.8266, + "step": 14098 + }, + { + "epoch": 0.7031920199501247, + "grad_norm": 0.5200973801117437, + "learning_rate": 1.0692216955055004e-05, + "loss": 0.8604, + "step": 14099 + }, + { + "epoch": 0.7032418952618454, + "grad_norm": 0.48393301673143696, + "learning_rate": 1.0688905463897586e-05, + "loss": 0.8497, + "step": 14100 + }, + { + "epoch": 0.7032917705735661, + "grad_norm": 0.4359320243143051, + "learning_rate": 1.0685594346181776e-05, + "loss": 0.8741, + "step": 14101 + }, + { + "epoch": 0.7033416458852868, + "grad_norm": 0.45265569374191233, + "learning_rate": 1.0682283601993964e-05, + "loss": 0.8527, + "step": 14102 + }, + { + "epoch": 0.7033915211970074, + "grad_norm": 0.5196243940714658, + "learning_rate": 1.0678973231420567e-05, + "loss": 0.8135, + "step": 14103 + }, + { + "epoch": 0.7034413965087282, + "grad_norm": 0.436262029326677, + "learning_rate": 1.067566323454795e-05, + "loss": 0.8188, + "step": 14104 + }, + { + "epoch": 0.7034912718204489, + "grad_norm": 0.4819304145295919, + "learning_rate": 1.0672353611462493e-05, + "loss": 0.8412, + "step": 14105 + }, + { + "epoch": 0.7035411471321695, + "grad_norm": 0.38461317228377023, + "learning_rate": 1.0669044362250546e-05, + "loss": 0.8244, + "step": 14106 + }, + { + "epoch": 0.7035910224438903, + "grad_norm": 0.47554945872756943, + "learning_rate": 1.0665735486998482e-05, + "loss": 0.8475, + "step": 14107 + }, + { + "epoch": 0.703640897755611, + "grad_norm": 0.5104295354375612, + "learning_rate": 1.0662426985792634e-05, + "loss": 0.9005, + "step": 14108 + }, + { + "epoch": 0.7036907730673316, + "grad_norm": 0.5141969759036826, + "learning_rate": 1.065911885871933e-05, + "loss": 0.9084, + "step": 14109 + }, + { + "epoch": 0.7037406483790524, + "grad_norm": 0.479970249132691, + "learning_rate": 1.0655811105864918e-05, + "loss": 0.8164, + "step": 14110 + }, + { + "epoch": 0.7037905236907731, + "grad_norm": 0.4777625248126063, + "learning_rate": 1.065250372731568e-05, + "loss": 0.8388, + "step": 14111 + }, + { + "epoch": 0.7038403990024937, + "grad_norm": 0.6562858622988713, + "learning_rate": 1.0649196723157942e-05, + "loss": 0.8652, + "step": 14112 + }, + { + "epoch": 0.7038902743142145, + "grad_norm": 0.45085626304920856, + "learning_rate": 1.0645890093477987e-05, + "loss": 0.8924, + "step": 14113 + }, + { + "epoch": 0.7039401496259352, + "grad_norm": 0.511007865896915, + "learning_rate": 1.0642583838362122e-05, + "loss": 0.8391, + "step": 14114 + }, + { + "epoch": 0.7039900249376558, + "grad_norm": 0.47315071531929304, + "learning_rate": 1.0639277957896587e-05, + "loss": 0.871, + "step": 14115 + }, + { + "epoch": 0.7040399002493766, + "grad_norm": 0.6944876122679439, + "learning_rate": 1.0635972452167678e-05, + "loss": 0.9021, + "step": 14116 + }, + { + "epoch": 0.7040897755610973, + "grad_norm": 0.5411746398178694, + "learning_rate": 1.0632667321261627e-05, + "loss": 0.8198, + "step": 14117 + }, + { + "epoch": 0.7041396508728179, + "grad_norm": 0.6006517132296066, + "learning_rate": 1.0629362565264715e-05, + "loss": 0.8592, + "step": 14118 + }, + { + "epoch": 0.7041895261845387, + "grad_norm": 1.816735628281393, + "learning_rate": 1.0626058184263136e-05, + "loss": 0.8388, + "step": 14119 + }, + { + "epoch": 0.7042394014962593, + "grad_norm": 0.5553930819132891, + "learning_rate": 1.0622754178343141e-05, + "loss": 0.8988, + "step": 14120 + }, + { + "epoch": 0.70428927680798, + "grad_norm": 0.5775506731565736, + "learning_rate": 1.0619450547590936e-05, + "loss": 0.8848, + "step": 14121 + }, + { + "epoch": 0.7043391521197008, + "grad_norm": 0.46108482606626366, + "learning_rate": 1.0616147292092751e-05, + "loss": 0.9044, + "step": 14122 + }, + { + "epoch": 0.7043890274314214, + "grad_norm": 0.49688310862786517, + "learning_rate": 1.0612844411934746e-05, + "loss": 0.8334, + "step": 14123 + }, + { + "epoch": 0.7044389027431421, + "grad_norm": 0.5143043671493562, + "learning_rate": 1.0609541907203135e-05, + "loss": 0.8126, + "step": 14124 + }, + { + "epoch": 0.7044887780548629, + "grad_norm": 0.5070488203625533, + "learning_rate": 1.0606239777984089e-05, + "loss": 0.8239, + "step": 14125 + }, + { + "epoch": 0.7045386533665835, + "grad_norm": 0.5538143496363483, + "learning_rate": 1.0602938024363777e-05, + "loss": 0.8414, + "step": 14126 + }, + { + "epoch": 0.7045885286783042, + "grad_norm": 0.42629383607030186, + "learning_rate": 1.059963664642834e-05, + "loss": 0.8663, + "step": 14127 + }, + { + "epoch": 0.704638403990025, + "grad_norm": 0.5351731587094596, + "learning_rate": 1.0596335644263952e-05, + "loss": 0.8234, + "step": 14128 + }, + { + "epoch": 0.7046882793017456, + "grad_norm": 0.4794683053104159, + "learning_rate": 1.0593035017956737e-05, + "loss": 0.8636, + "step": 14129 + }, + { + "epoch": 0.7047381546134663, + "grad_norm": 0.47841311987024066, + "learning_rate": 1.058973476759283e-05, + "loss": 0.8233, + "step": 14130 + }, + { + "epoch": 0.7047880299251871, + "grad_norm": 1.017262402506569, + "learning_rate": 1.0586434893258333e-05, + "loss": 0.8615, + "step": 14131 + }, + { + "epoch": 0.7048379052369077, + "grad_norm": 0.4929234688139886, + "learning_rate": 1.0583135395039373e-05, + "loss": 0.8371, + "step": 14132 + }, + { + "epoch": 0.7048877805486284, + "grad_norm": 0.49859411780623425, + "learning_rate": 1.0579836273022045e-05, + "loss": 0.8906, + "step": 14133 + }, + { + "epoch": 0.7049376558603492, + "grad_norm": 0.6822593352579212, + "learning_rate": 1.0576537527292437e-05, + "loss": 0.8748, + "step": 14134 + }, + { + "epoch": 0.7049875311720698, + "grad_norm": 0.44163251112346497, + "learning_rate": 1.0573239157936619e-05, + "loss": 0.8234, + "step": 14135 + }, + { + "epoch": 0.7050374064837905, + "grad_norm": 0.46790588195654587, + "learning_rate": 1.0569941165040676e-05, + "loss": 0.8595, + "step": 14136 + }, + { + "epoch": 0.7050872817955112, + "grad_norm": 0.483179883433592, + "learning_rate": 1.0566643548690658e-05, + "loss": 0.8858, + "step": 14137 + }, + { + "epoch": 0.7051371571072319, + "grad_norm": 0.44238544805503915, + "learning_rate": 1.0563346308972618e-05, + "loss": 0.828, + "step": 14138 + }, + { + "epoch": 0.7051870324189526, + "grad_norm": 0.5005701369106949, + "learning_rate": 1.0560049445972589e-05, + "loss": 0.8475, + "step": 14139 + }, + { + "epoch": 0.7052369077306733, + "grad_norm": 0.4367106796265665, + "learning_rate": 1.0556752959776614e-05, + "loss": 0.8186, + "step": 14140 + }, + { + "epoch": 0.705286783042394, + "grad_norm": 0.4779575765571127, + "learning_rate": 1.0553456850470708e-05, + "loss": 0.8656, + "step": 14141 + }, + { + "epoch": 0.7053366583541147, + "grad_norm": 0.5609798563138748, + "learning_rate": 1.0550161118140879e-05, + "loss": 0.8806, + "step": 14142 + }, + { + "epoch": 0.7053865336658354, + "grad_norm": 0.48259302019491185, + "learning_rate": 1.054686576287312e-05, + "loss": 0.8841, + "step": 14143 + }, + { + "epoch": 0.7054364089775561, + "grad_norm": 0.5661320614202996, + "learning_rate": 1.054357078475344e-05, + "loss": 0.8789, + "step": 14144 + }, + { + "epoch": 0.7054862842892768, + "grad_norm": 0.4670512092477, + "learning_rate": 1.054027618386781e-05, + "loss": 0.7931, + "step": 14145 + }, + { + "epoch": 0.7055361596009975, + "grad_norm": 0.46254973762152124, + "learning_rate": 1.0536981960302192e-05, + "loss": 0.8941, + "step": 14146 + }, + { + "epoch": 0.7055860349127182, + "grad_norm": 0.5584561305935193, + "learning_rate": 1.0533688114142576e-05, + "loss": 0.8466, + "step": 14147 + }, + { + "epoch": 0.705635910224439, + "grad_norm": 0.48486064658902267, + "learning_rate": 1.0530394645474875e-05, + "loss": 0.8341, + "step": 14148 + }, + { + "epoch": 0.7056857855361596, + "grad_norm": 0.4910075000773812, + "learning_rate": 1.052710155438506e-05, + "loss": 0.8473, + "step": 14149 + }, + { + "epoch": 0.7057356608478803, + "grad_norm": 0.5780147770345173, + "learning_rate": 1.0523808840959043e-05, + "loss": 0.7965, + "step": 14150 + }, + { + "epoch": 0.705785536159601, + "grad_norm": 0.6541468753284695, + "learning_rate": 1.0520516505282777e-05, + "loss": 0.8925, + "step": 14151 + }, + { + "epoch": 0.7058354114713217, + "grad_norm": 0.5709117344366618, + "learning_rate": 1.0517224547442131e-05, + "loss": 0.8256, + "step": 14152 + }, + { + "epoch": 0.7058852867830424, + "grad_norm": 0.46565407892581023, + "learning_rate": 1.0513932967523039e-05, + "loss": 0.8389, + "step": 14153 + }, + { + "epoch": 0.7059351620947631, + "grad_norm": 0.5496005607863584, + "learning_rate": 1.0510641765611373e-05, + "loss": 0.849, + "step": 14154 + }, + { + "epoch": 0.7059850374064838, + "grad_norm": 0.4837181365981547, + "learning_rate": 1.0507350941793042e-05, + "loss": 0.9024, + "step": 14155 + }, + { + "epoch": 0.7060349127182045, + "grad_norm": 0.5366916259365764, + "learning_rate": 1.050406049615389e-05, + "loss": 0.8675, + "step": 14156 + }, + { + "epoch": 0.7060847880299251, + "grad_norm": 0.5209191654066403, + "learning_rate": 1.0500770428779797e-05, + "loss": 0.8851, + "step": 14157 + }, + { + "epoch": 0.7061346633416459, + "grad_norm": 0.6694914452931789, + "learning_rate": 1.049748073975661e-05, + "loss": 0.8449, + "step": 14158 + }, + { + "epoch": 0.7061845386533666, + "grad_norm": 0.5005710756676574, + "learning_rate": 1.0494191429170172e-05, + "loss": 0.8698, + "step": 14159 + }, + { + "epoch": 0.7062344139650872, + "grad_norm": 0.4655880507608677, + "learning_rate": 1.0490902497106308e-05, + "loss": 0.8782, + "step": 14160 + }, + { + "epoch": 0.706284289276808, + "grad_norm": 0.7107952822923865, + "learning_rate": 1.0487613943650861e-05, + "loss": 0.8635, + "step": 14161 + }, + { + "epoch": 0.7063341645885287, + "grad_norm": 0.6322537862181677, + "learning_rate": 1.0484325768889634e-05, + "loss": 0.8415, + "step": 14162 + }, + { + "epoch": 0.7063840399002493, + "grad_norm": 0.650128691956717, + "learning_rate": 1.0481037972908428e-05, + "loss": 0.8157, + "step": 14163 + }, + { + "epoch": 0.7064339152119701, + "grad_norm": 0.46046886884925525, + "learning_rate": 1.0477750555793028e-05, + "loss": 0.8414, + "step": 14164 + }, + { + "epoch": 0.7064837905236908, + "grad_norm": 0.6099861453065633, + "learning_rate": 1.0474463517629237e-05, + "loss": 0.8813, + "step": 14165 + }, + { + "epoch": 0.7065336658354114, + "grad_norm": 0.45270942036009865, + "learning_rate": 1.0471176858502824e-05, + "loss": 0.8515, + "step": 14166 + }, + { + "epoch": 0.7065835411471322, + "grad_norm": 0.44334680009780386, + "learning_rate": 1.0467890578499545e-05, + "loss": 0.8311, + "step": 14167 + }, + { + "epoch": 0.7066334164588529, + "grad_norm": 0.46706750311230955, + "learning_rate": 1.0464604677705148e-05, + "loss": 0.837, + "step": 14168 + }, + { + "epoch": 0.7066832917705735, + "grad_norm": 0.49110313705207614, + "learning_rate": 1.0461319156205396e-05, + "loss": 0.878, + "step": 14169 + }, + { + "epoch": 0.7067331670822943, + "grad_norm": 0.5196478154802382, + "learning_rate": 1.0458034014086016e-05, + "loss": 0.8612, + "step": 14170 + }, + { + "epoch": 0.706783042394015, + "grad_norm": 0.4950057327555847, + "learning_rate": 1.045474925143273e-05, + "loss": 0.8322, + "step": 14171 + }, + { + "epoch": 0.7068329177057356, + "grad_norm": 0.50181735767888, + "learning_rate": 1.0451464868331243e-05, + "loss": 0.8201, + "step": 14172 + }, + { + "epoch": 0.7068827930174564, + "grad_norm": 0.7924710765679531, + "learning_rate": 1.0448180864867276e-05, + "loss": 0.8248, + "step": 14173 + }, + { + "epoch": 0.706932668329177, + "grad_norm": 0.4865253923666578, + "learning_rate": 1.044489724112652e-05, + "loss": 0.8546, + "step": 14174 + }, + { + "epoch": 0.7069825436408977, + "grad_norm": 0.4449062953411912, + "learning_rate": 1.0441613997194654e-05, + "loss": 0.8708, + "step": 14175 + }, + { + "epoch": 0.7070324189526185, + "grad_norm": 0.4851738309851839, + "learning_rate": 1.0438331133157348e-05, + "loss": 0.8105, + "step": 14176 + }, + { + "epoch": 0.7070822942643391, + "grad_norm": 0.47013699901451994, + "learning_rate": 1.0435048649100284e-05, + "loss": 0.8302, + "step": 14177 + }, + { + "epoch": 0.7071321695760598, + "grad_norm": 0.4546291002901032, + "learning_rate": 1.0431766545109106e-05, + "loss": 0.8446, + "step": 14178 + }, + { + "epoch": 0.7071820448877806, + "grad_norm": 0.4984927221248406, + "learning_rate": 1.042848482126946e-05, + "loss": 0.8564, + "step": 14179 + }, + { + "epoch": 0.7072319201995012, + "grad_norm": 0.5417167213891645, + "learning_rate": 1.0425203477666973e-05, + "loss": 0.8449, + "step": 14180 + }, + { + "epoch": 0.707281795511222, + "grad_norm": 0.6649876006455833, + "learning_rate": 1.0421922514387286e-05, + "loss": 0.8529, + "step": 14181 + }, + { + "epoch": 0.7073316708229427, + "grad_norm": 0.7216999773695445, + "learning_rate": 1.0418641931516008e-05, + "loss": 0.8554, + "step": 14182 + }, + { + "epoch": 0.7073815461346633, + "grad_norm": 0.7411185878855768, + "learning_rate": 1.041536172913874e-05, + "loss": 0.8615, + "step": 14183 + }, + { + "epoch": 0.707431421446384, + "grad_norm": 0.6881881626954302, + "learning_rate": 1.0412081907341084e-05, + "loss": 0.8496, + "step": 14184 + }, + { + "epoch": 0.7074812967581048, + "grad_norm": 0.4873451458594008, + "learning_rate": 1.0408802466208611e-05, + "loss": 0.8784, + "step": 14185 + }, + { + "epoch": 0.7075311720698254, + "grad_norm": 0.6095804595618006, + "learning_rate": 1.0405523405826917e-05, + "loss": 0.8633, + "step": 14186 + }, + { + "epoch": 0.7075810473815461, + "grad_norm": 0.5005895087546818, + "learning_rate": 1.0402244726281549e-05, + "loss": 0.866, + "step": 14187 + }, + { + "epoch": 0.7076309226932669, + "grad_norm": 2.264319382025474, + "learning_rate": 1.039896642765809e-05, + "loss": 0.8119, + "step": 14188 + }, + { + "epoch": 0.7076807980049875, + "grad_norm": 0.5805681262718168, + "learning_rate": 1.0395688510042051e-05, + "loss": 0.8928, + "step": 14189 + }, + { + "epoch": 0.7077306733167082, + "grad_norm": 0.5595219178004015, + "learning_rate": 1.039241097351899e-05, + "loss": 0.8566, + "step": 14190 + }, + { + "epoch": 0.7077805486284289, + "grad_norm": 0.5335116111000007, + "learning_rate": 1.038913381817442e-05, + "loss": 0.8622, + "step": 14191 + }, + { + "epoch": 0.7078304239401496, + "grad_norm": 0.42900254961201306, + "learning_rate": 1.0385857044093883e-05, + "loss": 0.8579, + "step": 14192 + }, + { + "epoch": 0.7078802992518703, + "grad_norm": 0.43926330904255356, + "learning_rate": 1.0382580651362847e-05, + "loss": 0.8562, + "step": 14193 + }, + { + "epoch": 0.707930174563591, + "grad_norm": 0.48346073876926254, + "learning_rate": 1.0379304640066837e-05, + "loss": 0.8241, + "step": 14194 + }, + { + "epoch": 0.7079800498753117, + "grad_norm": 0.4909858690150147, + "learning_rate": 1.0376029010291328e-05, + "loss": 0.8524, + "step": 14195 + }, + { + "epoch": 0.7080299251870324, + "grad_norm": 0.6108705811782931, + "learning_rate": 1.0372753762121796e-05, + "loss": 0.8785, + "step": 14196 + }, + { + "epoch": 0.7080798004987531, + "grad_norm": 0.539290336161551, + "learning_rate": 1.03694788956437e-05, + "loss": 0.896, + "step": 14197 + }, + { + "epoch": 0.7081296758104738, + "grad_norm": 0.47252964875428183, + "learning_rate": 1.0366204410942515e-05, + "loss": 0.8623, + "step": 14198 + }, + { + "epoch": 0.7081795511221946, + "grad_norm": 0.474573187010574, + "learning_rate": 1.0362930308103675e-05, + "loss": 0.8506, + "step": 14199 + }, + { + "epoch": 0.7082294264339152, + "grad_norm": 0.5593018363188104, + "learning_rate": 1.0359656587212619e-05, + "loss": 0.8608, + "step": 14200 + }, + { + "epoch": 0.7082793017456359, + "grad_norm": 0.5968762321962122, + "learning_rate": 1.0356383248354761e-05, + "loss": 0.8195, + "step": 14201 + }, + { + "epoch": 0.7083291770573567, + "grad_norm": 0.4512170388666784, + "learning_rate": 1.0353110291615539e-05, + "loss": 0.8296, + "step": 14202 + }, + { + "epoch": 0.7083790523690773, + "grad_norm": 0.4701552589328476, + "learning_rate": 1.034983771708035e-05, + "loss": 0.8531, + "step": 14203 + }, + { + "epoch": 0.708428927680798, + "grad_norm": 0.4681728406608708, + "learning_rate": 1.034656552483459e-05, + "loss": 0.8788, + "step": 14204 + }, + { + "epoch": 0.7084788029925188, + "grad_norm": 0.624134010543898, + "learning_rate": 1.0343293714963632e-05, + "loss": 0.8304, + "step": 14205 + }, + { + "epoch": 0.7085286783042394, + "grad_norm": 0.45772165708331625, + "learning_rate": 1.0340022287552884e-05, + "loss": 0.8465, + "step": 14206 + }, + { + "epoch": 0.7085785536159601, + "grad_norm": 0.49932942301067007, + "learning_rate": 1.0336751242687679e-05, + "loss": 0.859, + "step": 14207 + }, + { + "epoch": 0.7086284289276809, + "grad_norm": 0.4363636073535628, + "learning_rate": 1.0333480580453392e-05, + "loss": 0.8458, + "step": 14208 + }, + { + "epoch": 0.7086783042394015, + "grad_norm": 0.45317482189980285, + "learning_rate": 1.0330210300935358e-05, + "loss": 0.8689, + "step": 14209 + }, + { + "epoch": 0.7087281795511222, + "grad_norm": 0.4998911675060485, + "learning_rate": 1.0326940404218941e-05, + "loss": 0.8387, + "step": 14210 + }, + { + "epoch": 0.7087780548628428, + "grad_norm": 0.46233304334136655, + "learning_rate": 1.0323670890389428e-05, + "loss": 0.7936, + "step": 14211 + }, + { + "epoch": 0.7088279301745636, + "grad_norm": 0.7045468183929726, + "learning_rate": 1.0320401759532163e-05, + "loss": 0.9079, + "step": 14212 + }, + { + "epoch": 0.7088778054862843, + "grad_norm": 0.4978827194174608, + "learning_rate": 1.0317133011732435e-05, + "loss": 0.817, + "step": 14213 + }, + { + "epoch": 0.7089276807980049, + "grad_norm": 0.48746517549291646, + "learning_rate": 1.0313864647075561e-05, + "loss": 0.8792, + "step": 14214 + }, + { + "epoch": 0.7089775561097257, + "grad_norm": 0.7204388118143616, + "learning_rate": 1.0310596665646815e-05, + "loss": 0.8665, + "step": 14215 + }, + { + "epoch": 0.7090274314214464, + "grad_norm": 0.5930306485385252, + "learning_rate": 1.0307329067531477e-05, + "loss": 0.8348, + "step": 14216 + }, + { + "epoch": 0.709077306733167, + "grad_norm": 0.5128841784696935, + "learning_rate": 1.0304061852814808e-05, + "loss": 0.8702, + "step": 14217 + }, + { + "epoch": 0.7091271820448878, + "grad_norm": 0.6123331693433368, + "learning_rate": 1.030079502158206e-05, + "loss": 0.8774, + "step": 14218 + }, + { + "epoch": 0.7091770573566085, + "grad_norm": 0.5074668359852847, + "learning_rate": 1.0297528573918497e-05, + "loss": 0.8845, + "step": 14219 + }, + { + "epoch": 0.7092269326683291, + "grad_norm": 0.6053337881063481, + "learning_rate": 1.0294262509909347e-05, + "loss": 0.8117, + "step": 14220 + }, + { + "epoch": 0.7092768079800499, + "grad_norm": 0.547821956625355, + "learning_rate": 1.0290996829639835e-05, + "loss": 0.902, + "step": 14221 + }, + { + "epoch": 0.7093266832917706, + "grad_norm": 0.5193097573478178, + "learning_rate": 1.0287731533195166e-05, + "loss": 0.8427, + "step": 14222 + }, + { + "epoch": 0.7093765586034912, + "grad_norm": 0.45165316048904897, + "learning_rate": 1.0284466620660568e-05, + "loss": 0.8542, + "step": 14223 + }, + { + "epoch": 0.709426433915212, + "grad_norm": 0.5250215198327594, + "learning_rate": 1.0281202092121223e-05, + "loss": 0.8322, + "step": 14224 + }, + { + "epoch": 0.7094763092269327, + "grad_norm": 0.5665834956083999, + "learning_rate": 1.0277937947662337e-05, + "loss": 0.7871, + "step": 14225 + }, + { + "epoch": 0.7095261845386533, + "grad_norm": 0.5591012922240922, + "learning_rate": 1.0274674187369052e-05, + "loss": 0.8646, + "step": 14226 + }, + { + "epoch": 0.7095760598503741, + "grad_norm": 0.5417018428350913, + "learning_rate": 1.0271410811326565e-05, + "loss": 0.866, + "step": 14227 + }, + { + "epoch": 0.7096259351620947, + "grad_norm": 0.6521494330644297, + "learning_rate": 1.0268147819620011e-05, + "loss": 0.8906, + "step": 14228 + }, + { + "epoch": 0.7096758104738154, + "grad_norm": 0.5230854218988171, + "learning_rate": 1.0264885212334566e-05, + "loss": 0.8514, + "step": 14229 + }, + { + "epoch": 0.7097256857855362, + "grad_norm": 0.4779133394113133, + "learning_rate": 1.0261622989555328e-05, + "loss": 0.8629, + "step": 14230 + }, + { + "epoch": 0.7097755610972568, + "grad_norm": 0.5591021491602496, + "learning_rate": 1.0258361151367452e-05, + "loss": 0.8847, + "step": 14231 + }, + { + "epoch": 0.7098254364089776, + "grad_norm": 0.4606561198138004, + "learning_rate": 1.0255099697856044e-05, + "loss": 0.8095, + "step": 14232 + }, + { + "epoch": 0.7098753117206983, + "grad_norm": 0.56561118059941, + "learning_rate": 1.0251838629106208e-05, + "loss": 0.845, + "step": 14233 + }, + { + "epoch": 0.7099251870324189, + "grad_norm": 0.6950549086481332, + "learning_rate": 1.0248577945203037e-05, + "loss": 0.8798, + "step": 14234 + }, + { + "epoch": 0.7099750623441397, + "grad_norm": 0.652054218277331, + "learning_rate": 1.0245317646231629e-05, + "loss": 0.8568, + "step": 14235 + }, + { + "epoch": 0.7100249376558604, + "grad_norm": 0.5600985762529823, + "learning_rate": 1.0242057732277054e-05, + "loss": 0.8499, + "step": 14236 + }, + { + "epoch": 0.710074812967581, + "grad_norm": 0.44312757535684516, + "learning_rate": 1.023879820342438e-05, + "loss": 0.8371, + "step": 14237 + }, + { + "epoch": 0.7101246882793018, + "grad_norm": 0.5862793654598082, + "learning_rate": 1.0235539059758651e-05, + "loss": 0.8612, + "step": 14238 + }, + { + "epoch": 0.7101745635910225, + "grad_norm": 0.5269102728196995, + "learning_rate": 1.023228030136493e-05, + "loss": 0.8375, + "step": 14239 + }, + { + "epoch": 0.7102244389027431, + "grad_norm": 0.4823397665081642, + "learning_rate": 1.0229021928328244e-05, + "loss": 0.844, + "step": 14240 + }, + { + "epoch": 0.7102743142144639, + "grad_norm": 0.572896754857334, + "learning_rate": 1.0225763940733624e-05, + "loss": 0.8946, + "step": 14241 + }, + { + "epoch": 0.7103241895261846, + "grad_norm": 0.5100407413393869, + "learning_rate": 1.0222506338666072e-05, + "loss": 0.8689, + "step": 14242 + }, + { + "epoch": 0.7103740648379052, + "grad_norm": 0.5353086803435608, + "learning_rate": 1.021924912221062e-05, + "loss": 0.816, + "step": 14243 + }, + { + "epoch": 0.710423940149626, + "grad_norm": 0.5523808511046058, + "learning_rate": 1.0215992291452228e-05, + "loss": 0.8557, + "step": 14244 + }, + { + "epoch": 0.7104738154613466, + "grad_norm": 0.4187382815389628, + "learning_rate": 1.0212735846475913e-05, + "loss": 0.8591, + "step": 14245 + }, + { + "epoch": 0.7105236907730673, + "grad_norm": 0.4432194323210606, + "learning_rate": 1.0209479787366627e-05, + "loss": 0.8127, + "step": 14246 + }, + { + "epoch": 0.710573566084788, + "grad_norm": 0.523204584456021, + "learning_rate": 1.0206224114209365e-05, + "loss": 0.8405, + "step": 14247 + }, + { + "epoch": 0.7106234413965087, + "grad_norm": 0.5301957823118872, + "learning_rate": 1.0202968827089046e-05, + "loss": 0.9475, + "step": 14248 + }, + { + "epoch": 0.7106733167082294, + "grad_norm": 0.5121355807617682, + "learning_rate": 1.0199713926090643e-05, + "loss": 0.8631, + "step": 14249 + }, + { + "epoch": 0.7107231920199502, + "grad_norm": 0.607885502827762, + "learning_rate": 1.019645941129907e-05, + "loss": 0.8445, + "step": 14250 + }, + { + "epoch": 0.7107730673316708, + "grad_norm": 0.5051811419765643, + "learning_rate": 1.0193205282799276e-05, + "loss": 0.8898, + "step": 14251 + }, + { + "epoch": 0.7108229426433915, + "grad_norm": 0.5824625496605933, + "learning_rate": 1.018995154067616e-05, + "loss": 0.8875, + "step": 14252 + }, + { + "epoch": 0.7108728179551123, + "grad_norm": 0.42963157822916875, + "learning_rate": 1.0186698185014632e-05, + "loss": 0.8238, + "step": 14253 + }, + { + "epoch": 0.7109226932668329, + "grad_norm": 0.665295558337925, + "learning_rate": 1.0183445215899584e-05, + "loss": 0.8663, + "step": 14254 + }, + { + "epoch": 0.7109725685785536, + "grad_norm": 0.758484527783661, + "learning_rate": 1.0180192633415898e-05, + "loss": 0.8449, + "step": 14255 + }, + { + "epoch": 0.7110224438902744, + "grad_norm": 0.5262054947864521, + "learning_rate": 1.0176940437648455e-05, + "loss": 0.89, + "step": 14256 + }, + { + "epoch": 0.711072319201995, + "grad_norm": 0.5459614335773755, + "learning_rate": 1.017368862868212e-05, + "loss": 0.8883, + "step": 14257 + }, + { + "epoch": 0.7111221945137157, + "grad_norm": 0.4572575938339788, + "learning_rate": 1.0170437206601745e-05, + "loss": 0.8479, + "step": 14258 + }, + { + "epoch": 0.7111720698254365, + "grad_norm": 0.5404062288284922, + "learning_rate": 1.0167186171492169e-05, + "loss": 0.8376, + "step": 14259 + }, + { + "epoch": 0.7112219451371571, + "grad_norm": 0.6604758718657342, + "learning_rate": 1.0163935523438237e-05, + "loss": 0.9047, + "step": 14260 + }, + { + "epoch": 0.7112718204488778, + "grad_norm": 0.6011848745394633, + "learning_rate": 1.0160685262524758e-05, + "loss": 0.8911, + "step": 14261 + }, + { + "epoch": 0.7113216957605984, + "grad_norm": 0.5684055197517869, + "learning_rate": 1.0157435388836572e-05, + "loss": 0.8348, + "step": 14262 + }, + { + "epoch": 0.7113715710723192, + "grad_norm": 0.5127818499780942, + "learning_rate": 1.0154185902458454e-05, + "loss": 0.8544, + "step": 14263 + }, + { + "epoch": 0.7114214463840399, + "grad_norm": 0.43956634055841737, + "learning_rate": 1.0150936803475216e-05, + "loss": 0.8639, + "step": 14264 + }, + { + "epoch": 0.7114713216957606, + "grad_norm": 0.49659113303928826, + "learning_rate": 1.0147688091971639e-05, + "loss": 0.8281, + "step": 14265 + }, + { + "epoch": 0.7115211970074813, + "grad_norm": 0.5036206893783769, + "learning_rate": 1.0144439768032494e-05, + "loss": 0.8399, + "step": 14266 + }, + { + "epoch": 0.711571072319202, + "grad_norm": 0.5314253569635935, + "learning_rate": 1.0141191831742533e-05, + "loss": 0.9458, + "step": 14267 + }, + { + "epoch": 0.7116209476309227, + "grad_norm": 0.47650807341679746, + "learning_rate": 1.0137944283186532e-05, + "loss": 0.8174, + "step": 14268 + }, + { + "epoch": 0.7116708229426434, + "grad_norm": 0.6323259193376318, + "learning_rate": 1.0134697122449224e-05, + "loss": 0.8638, + "step": 14269 + }, + { + "epoch": 0.7117206982543641, + "grad_norm": 0.44179677444498805, + "learning_rate": 1.0131450349615344e-05, + "loss": 0.783, + "step": 14270 + }, + { + "epoch": 0.7117705735660848, + "grad_norm": 0.4296573096958536, + "learning_rate": 1.0128203964769601e-05, + "loss": 0.8687, + "step": 14271 + }, + { + "epoch": 0.7118204488778055, + "grad_norm": 0.45662724597235227, + "learning_rate": 1.0124957967996726e-05, + "loss": 0.8588, + "step": 14272 + }, + { + "epoch": 0.7118703241895262, + "grad_norm": 0.6534240375957237, + "learning_rate": 1.0121712359381419e-05, + "loss": 0.8812, + "step": 14273 + }, + { + "epoch": 0.7119201995012469, + "grad_norm": 0.5599583475232202, + "learning_rate": 1.0118467139008369e-05, + "loss": 0.834, + "step": 14274 + }, + { + "epoch": 0.7119700748129676, + "grad_norm": 0.7177950256742299, + "learning_rate": 1.011522230696225e-05, + "loss": 0.8441, + "step": 14275 + }, + { + "epoch": 0.7120199501246883, + "grad_norm": 0.4945490582630372, + "learning_rate": 1.0111977863327762e-05, + "loss": 0.8167, + "step": 14276 + }, + { + "epoch": 0.712069825436409, + "grad_norm": 0.4476144608731726, + "learning_rate": 1.010873380818953e-05, + "loss": 0.8926, + "step": 14277 + }, + { + "epoch": 0.7121197007481297, + "grad_norm": 0.5519115799547999, + "learning_rate": 1.0105490141632234e-05, + "loss": 0.8409, + "step": 14278 + }, + { + "epoch": 0.7121695760598504, + "grad_norm": 0.46216784656245674, + "learning_rate": 1.0102246863740496e-05, + "loss": 0.8521, + "step": 14279 + }, + { + "epoch": 0.712219451371571, + "grad_norm": 0.7609458536874042, + "learning_rate": 1.0099003974598978e-05, + "loss": 0.8835, + "step": 14280 + }, + { + "epoch": 0.7122693266832918, + "grad_norm": 0.511790426026666, + "learning_rate": 1.0095761474292264e-05, + "loss": 0.8221, + "step": 14281 + }, + { + "epoch": 0.7123192019950124, + "grad_norm": 0.6129896464408547, + "learning_rate": 1.0092519362904995e-05, + "loss": 0.8448, + "step": 14282 + }, + { + "epoch": 0.7123690773067332, + "grad_norm": 0.462739015504974, + "learning_rate": 1.0089277640521751e-05, + "loss": 0.8086, + "step": 14283 + }, + { + "epoch": 0.7124189526184539, + "grad_norm": 0.46196554611007545, + "learning_rate": 1.0086036307227154e-05, + "loss": 0.8504, + "step": 14284 + }, + { + "epoch": 0.7124688279301745, + "grad_norm": 0.4204820192777631, + "learning_rate": 1.0082795363105742e-05, + "loss": 0.8211, + "step": 14285 + }, + { + "epoch": 0.7125187032418953, + "grad_norm": 0.5862824445051505, + "learning_rate": 1.0079554808242123e-05, + "loss": 0.8498, + "step": 14286 + }, + { + "epoch": 0.712568578553616, + "grad_norm": 0.42882891943625606, + "learning_rate": 1.0076314642720833e-05, + "loss": 0.8311, + "step": 14287 + }, + { + "epoch": 0.7126184538653366, + "grad_norm": 0.585350973674357, + "learning_rate": 1.0073074866626441e-05, + "loss": 0.8676, + "step": 14288 + }, + { + "epoch": 0.7126683291770574, + "grad_norm": 0.5369547812740967, + "learning_rate": 1.0069835480043482e-05, + "loss": 0.8698, + "step": 14289 + }, + { + "epoch": 0.7127182044887781, + "grad_norm": 0.5033072674152055, + "learning_rate": 1.0066596483056485e-05, + "loss": 0.8623, + "step": 14290 + }, + { + "epoch": 0.7127680798004987, + "grad_norm": 2.43315286035673, + "learning_rate": 1.0063357875749965e-05, + "loss": 0.8849, + "step": 14291 + }, + { + "epoch": 0.7128179551122195, + "grad_norm": 0.4590634993644159, + "learning_rate": 1.006011965820843e-05, + "loss": 0.8374, + "step": 14292 + }, + { + "epoch": 0.7128678304239402, + "grad_norm": 0.41552840093830296, + "learning_rate": 1.0056881830516396e-05, + "loss": 0.8356, + "step": 14293 + }, + { + "epoch": 0.7129177057356608, + "grad_norm": 0.5100547522258728, + "learning_rate": 1.0053644392758344e-05, + "loss": 0.8696, + "step": 14294 + }, + { + "epoch": 0.7129675810473816, + "grad_norm": 0.644766983907128, + "learning_rate": 1.0050407345018751e-05, + "loss": 0.7631, + "step": 14295 + }, + { + "epoch": 0.7130174563591023, + "grad_norm": 0.45733219969003414, + "learning_rate": 1.0047170687382079e-05, + "loss": 0.8903, + "step": 14296 + }, + { + "epoch": 0.7130673316708229, + "grad_norm": 0.508742298186231, + "learning_rate": 1.0043934419932804e-05, + "loss": 0.893, + "step": 14297 + }, + { + "epoch": 0.7131172069825437, + "grad_norm": 0.7657588135923094, + "learning_rate": 1.0040698542755366e-05, + "loss": 0.8771, + "step": 14298 + }, + { + "epoch": 0.7131670822942643, + "grad_norm": 0.5008677341505995, + "learning_rate": 1.0037463055934207e-05, + "loss": 0.8688, + "step": 14299 + }, + { + "epoch": 0.713216957605985, + "grad_norm": 0.527286575373235, + "learning_rate": 1.0034227959553744e-05, + "loss": 0.8277, + "step": 14300 + }, + { + "epoch": 0.7132668329177058, + "grad_norm": 0.5165135970616935, + "learning_rate": 1.0030993253698414e-05, + "loss": 0.8244, + "step": 14301 + }, + { + "epoch": 0.7133167082294264, + "grad_norm": 0.706431684872403, + "learning_rate": 1.0027758938452614e-05, + "loss": 0.8692, + "step": 14302 + }, + { + "epoch": 0.7133665835411471, + "grad_norm": 0.6264504059295066, + "learning_rate": 1.0024525013900746e-05, + "loss": 0.8563, + "step": 14303 + }, + { + "epoch": 0.7134164588528679, + "grad_norm": 0.46541486358910644, + "learning_rate": 1.0021291480127184e-05, + "loss": 0.8462, + "step": 14304 + }, + { + "epoch": 0.7134663341645885, + "grad_norm": 0.5475367579819597, + "learning_rate": 1.0018058337216327e-05, + "loss": 0.8441, + "step": 14305 + }, + { + "epoch": 0.7135162094763092, + "grad_norm": 0.47850184911178356, + "learning_rate": 1.0014825585252532e-05, + "loss": 0.8201, + "step": 14306 + }, + { + "epoch": 0.71356608478803, + "grad_norm": 0.5089480229615657, + "learning_rate": 1.0011593224320159e-05, + "loss": 0.8515, + "step": 14307 + }, + { + "epoch": 0.7136159600997506, + "grad_norm": 0.45322708469992706, + "learning_rate": 1.0008361254503543e-05, + "loss": 0.846, + "step": 14308 + }, + { + "epoch": 0.7136658354114713, + "grad_norm": 0.5042921797778998, + "learning_rate": 1.000512967588704e-05, + "loss": 0.8432, + "step": 14309 + }, + { + "epoch": 0.7137157107231921, + "grad_norm": 0.5799175898305508, + "learning_rate": 1.0001898488554968e-05, + "loss": 0.8517, + "step": 14310 + }, + { + "epoch": 0.7137655860349127, + "grad_norm": 0.5244356948235807, + "learning_rate": 9.99866769259164e-06, + "loss": 0.8349, + "step": 14311 + }, + { + "epoch": 0.7138154613466334, + "grad_norm": 0.4298842348800475, + "learning_rate": 9.995437288081357e-06, + "loss": 0.8227, + "step": 14312 + }, + { + "epoch": 0.7138653366583542, + "grad_norm": 1.2571746481840478, + "learning_rate": 9.992207275108442e-06, + "loss": 0.8145, + "step": 14313 + }, + { + "epoch": 0.7139152119700748, + "grad_norm": 0.5020516878897548, + "learning_rate": 9.988977653757143e-06, + "loss": 0.8275, + "step": 14314 + }, + { + "epoch": 0.7139650872817955, + "grad_norm": 0.703672561409204, + "learning_rate": 9.98574842411176e-06, + "loss": 0.8396, + "step": 14315 + }, + { + "epoch": 0.7140149625935162, + "grad_norm": 0.47120271775359557, + "learning_rate": 9.982519586256548e-06, + "loss": 0.8465, + "step": 14316 + }, + { + "epoch": 0.7140648379052369, + "grad_norm": 0.4254071738714627, + "learning_rate": 9.979291140275781e-06, + "loss": 0.8446, + "step": 14317 + }, + { + "epoch": 0.7141147132169576, + "grad_norm": 0.5697937572177282, + "learning_rate": 9.976063086253673e-06, + "loss": 0.8773, + "step": 14318 + }, + { + "epoch": 0.7141645885286783, + "grad_norm": 0.5270315360598565, + "learning_rate": 9.972835424274485e-06, + "loss": 0.8156, + "step": 14319 + }, + { + "epoch": 0.714214463840399, + "grad_norm": 0.5260131972434045, + "learning_rate": 9.969608154422421e-06, + "loss": 0.8555, + "step": 14320 + }, + { + "epoch": 0.7142643391521197, + "grad_norm": 0.5245404572856488, + "learning_rate": 9.966381276781722e-06, + "loss": 0.8602, + "step": 14321 + }, + { + "epoch": 0.7143142144638404, + "grad_norm": 0.47905504725339126, + "learning_rate": 9.96315479143656e-06, + "loss": 0.9042, + "step": 14322 + }, + { + "epoch": 0.7143640897755611, + "grad_norm": 0.5213642002681866, + "learning_rate": 9.959928698471152e-06, + "loss": 0.8585, + "step": 14323 + }, + { + "epoch": 0.7144139650872818, + "grad_norm": 0.5630760062668541, + "learning_rate": 9.956702997969674e-06, + "loss": 0.8427, + "step": 14324 + }, + { + "epoch": 0.7144638403990025, + "grad_norm": 0.5291520482166184, + "learning_rate": 9.953477690016298e-06, + "loss": 0.8251, + "step": 14325 + }, + { + "epoch": 0.7145137157107232, + "grad_norm": 0.4817026466929618, + "learning_rate": 9.950252774695179e-06, + "loss": 0.8573, + "step": 14326 + }, + { + "epoch": 0.7145635910224439, + "grad_norm": 0.46278130447373345, + "learning_rate": 9.947028252090488e-06, + "loss": 0.8065, + "step": 14327 + }, + { + "epoch": 0.7146134663341646, + "grad_norm": 0.52947934594003, + "learning_rate": 9.943804122286357e-06, + "loss": 0.8677, + "step": 14328 + }, + { + "epoch": 0.7146633416458853, + "grad_norm": 0.5238470404737654, + "learning_rate": 9.94058038536691e-06, + "loss": 0.8246, + "step": 14329 + }, + { + "epoch": 0.714713216957606, + "grad_norm": 0.48243382439939614, + "learning_rate": 9.937357041416289e-06, + "loss": 0.8162, + "step": 14330 + }, + { + "epoch": 0.7147630922693267, + "grad_norm": 0.568767620190067, + "learning_rate": 9.934134090518593e-06, + "loss": 0.8506, + "step": 14331 + }, + { + "epoch": 0.7148129675810474, + "grad_norm": 0.4757664379407403, + "learning_rate": 9.930911532757928e-06, + "loss": 0.8923, + "step": 14332 + }, + { + "epoch": 0.714862842892768, + "grad_norm": 0.5220520919622517, + "learning_rate": 9.927689368218371e-06, + "loss": 0.8709, + "step": 14333 + }, + { + "epoch": 0.7149127182044888, + "grad_norm": 0.7899370159161564, + "learning_rate": 9.924467596984021e-06, + "loss": 0.8667, + "step": 14334 + }, + { + "epoch": 0.7149625935162095, + "grad_norm": 0.5482347957404449, + "learning_rate": 9.921246219138946e-06, + "loss": 0.9099, + "step": 14335 + }, + { + "epoch": 0.7150124688279301, + "grad_norm": 0.5081506063582799, + "learning_rate": 9.918025234767203e-06, + "loss": 0.8589, + "step": 14336 + }, + { + "epoch": 0.7150623441396509, + "grad_norm": 0.5916407659809422, + "learning_rate": 9.91480464395283e-06, + "loss": 0.8758, + "step": 14337 + }, + { + "epoch": 0.7151122194513716, + "grad_norm": 0.7782932137046703, + "learning_rate": 9.91158444677989e-06, + "loss": 0.891, + "step": 14338 + }, + { + "epoch": 0.7151620947630922, + "grad_norm": 0.730792758297241, + "learning_rate": 9.908364643332399e-06, + "loss": 0.8786, + "step": 14339 + }, + { + "epoch": 0.715211970074813, + "grad_norm": 1.10696137039319, + "learning_rate": 9.90514523369438e-06, + "loss": 0.8749, + "step": 14340 + }, + { + "epoch": 0.7152618453865337, + "grad_norm": 0.47249905310752977, + "learning_rate": 9.901926217949831e-06, + "loss": 0.8496, + "step": 14341 + }, + { + "epoch": 0.7153117206982543, + "grad_norm": 0.49938548271891336, + "learning_rate": 9.89870759618277e-06, + "loss": 0.879, + "step": 14342 + }, + { + "epoch": 0.7153615960099751, + "grad_norm": 0.5005354172966228, + "learning_rate": 9.895489368477176e-06, + "loss": 0.8627, + "step": 14343 + }, + { + "epoch": 0.7154114713216958, + "grad_norm": 0.5363080227751479, + "learning_rate": 9.892271534917025e-06, + "loss": 0.8565, + "step": 14344 + }, + { + "epoch": 0.7154613466334164, + "grad_norm": 0.4948284963213134, + "learning_rate": 9.889054095586278e-06, + "loss": 0.8298, + "step": 14345 + }, + { + "epoch": 0.7155112219451372, + "grad_norm": 0.5651095634673868, + "learning_rate": 9.885837050568913e-06, + "loss": 0.8802, + "step": 14346 + }, + { + "epoch": 0.7155610972568579, + "grad_norm": 0.521054460400013, + "learning_rate": 9.882620399948863e-06, + "loss": 0.8768, + "step": 14347 + }, + { + "epoch": 0.7156109725685785, + "grad_norm": 0.7873991066748958, + "learning_rate": 9.879404143810072e-06, + "loss": 0.8289, + "step": 14348 + }, + { + "epoch": 0.7156608478802993, + "grad_norm": 1.180998179587869, + "learning_rate": 9.876188282236451e-06, + "loss": 0.8184, + "step": 14349 + }, + { + "epoch": 0.71571072319202, + "grad_norm": 0.4550744385302687, + "learning_rate": 9.872972815311948e-06, + "loss": 0.8583, + "step": 14350 + }, + { + "epoch": 0.7157605985037406, + "grad_norm": 0.5470269916410866, + "learning_rate": 9.869757743120428e-06, + "loss": 0.853, + "step": 14351 + }, + { + "epoch": 0.7158104738154614, + "grad_norm": 0.462713047506861, + "learning_rate": 9.86654306574582e-06, + "loss": 0.8955, + "step": 14352 + }, + { + "epoch": 0.715860349127182, + "grad_norm": 0.537722689666678, + "learning_rate": 9.863328783271989e-06, + "loss": 0.8394, + "step": 14353 + }, + { + "epoch": 0.7159102244389027, + "grad_norm": 0.6047214105143826, + "learning_rate": 9.860114895782838e-06, + "loss": 0.8406, + "step": 14354 + }, + { + "epoch": 0.7159600997506235, + "grad_norm": 0.4396904944988852, + "learning_rate": 9.856901403362192e-06, + "loss": 0.8759, + "step": 14355 + }, + { + "epoch": 0.7160099750623441, + "grad_norm": 0.49291169246941985, + "learning_rate": 9.853688306093937e-06, + "loss": 0.849, + "step": 14356 + }, + { + "epoch": 0.7160598503740648, + "grad_norm": 0.5057750716589904, + "learning_rate": 9.850475604061899e-06, + "loss": 0.8337, + "step": 14357 + }, + { + "epoch": 0.7161097256857856, + "grad_norm": 0.5144301265766709, + "learning_rate": 9.847263297349937e-06, + "loss": 0.8705, + "step": 14358 + }, + { + "epoch": 0.7161596009975062, + "grad_norm": 0.9106168950060068, + "learning_rate": 9.84405138604184e-06, + "loss": 0.8389, + "step": 14359 + }, + { + "epoch": 0.7162094763092269, + "grad_norm": 0.5731808220923776, + "learning_rate": 9.840839870221449e-06, + "loss": 0.8854, + "step": 14360 + }, + { + "epoch": 0.7162593516209477, + "grad_norm": 0.48023822150245465, + "learning_rate": 9.837628749972555e-06, + "loss": 0.8924, + "step": 14361 + }, + { + "epoch": 0.7163092269326683, + "grad_norm": 0.6285598457544479, + "learning_rate": 9.834418025378955e-06, + "loss": 0.8461, + "step": 14362 + }, + { + "epoch": 0.716359102244389, + "grad_norm": 0.575649296476377, + "learning_rate": 9.83120769652442e-06, + "loss": 0.8504, + "step": 14363 + }, + { + "epoch": 0.7164089775561098, + "grad_norm": 0.588585443807455, + "learning_rate": 9.82799776349274e-06, + "loss": 0.8392, + "step": 14364 + }, + { + "epoch": 0.7164588528678304, + "grad_norm": 0.6580038840752651, + "learning_rate": 9.82478822636767e-06, + "loss": 0.8324, + "step": 14365 + }, + { + "epoch": 0.7165087281795511, + "grad_norm": 0.6126340747327905, + "learning_rate": 9.821579085232948e-06, + "loss": 0.8625, + "step": 14366 + }, + { + "epoch": 0.7165586034912719, + "grad_norm": 0.43243550020553684, + "learning_rate": 9.818370340172338e-06, + "loss": 0.8302, + "step": 14367 + }, + { + "epoch": 0.7166084788029925, + "grad_norm": 0.6783413969129949, + "learning_rate": 9.815161991269558e-06, + "loss": 0.8258, + "step": 14368 + }, + { + "epoch": 0.7166583541147132, + "grad_norm": 0.5121639209477372, + "learning_rate": 9.811954038608332e-06, + "loss": 0.8334, + "step": 14369 + }, + { + "epoch": 0.7167082294264339, + "grad_norm": 0.5999525624559849, + "learning_rate": 9.808746482272358e-06, + "loss": 0.8685, + "step": 14370 + }, + { + "epoch": 0.7167581047381546, + "grad_norm": 0.5481950651705041, + "learning_rate": 9.805539322345354e-06, + "loss": 0.8623, + "step": 14371 + }, + { + "epoch": 0.7168079800498753, + "grad_norm": 0.5469466876656908, + "learning_rate": 9.802332558911e-06, + "loss": 0.8387, + "step": 14372 + }, + { + "epoch": 0.716857855361596, + "grad_norm": 0.4914525137999687, + "learning_rate": 9.79912619205298e-06, + "loss": 0.8434, + "step": 14373 + }, + { + "epoch": 0.7169077306733167, + "grad_norm": 0.48623859799033003, + "learning_rate": 9.795920221854948e-06, + "loss": 0.8352, + "step": 14374 + }, + { + "epoch": 0.7169576059850374, + "grad_norm": 0.5380524113451342, + "learning_rate": 9.792714648400584e-06, + "loss": 0.8592, + "step": 14375 + }, + { + "epoch": 0.7170074812967581, + "grad_norm": 0.5069175548824635, + "learning_rate": 9.789509471773523e-06, + "loss": 0.8443, + "step": 14376 + }, + { + "epoch": 0.7170573566084788, + "grad_norm": 0.47280038231624927, + "learning_rate": 9.786304692057405e-06, + "loss": 0.807, + "step": 14377 + }, + { + "epoch": 0.7171072319201995, + "grad_norm": 0.5374483599009361, + "learning_rate": 9.783100309335849e-06, + "loss": 0.8202, + "step": 14378 + }, + { + "epoch": 0.7171571072319202, + "grad_norm": 0.4721032748173218, + "learning_rate": 9.779896323692489e-06, + "loss": 0.8572, + "step": 14379 + }, + { + "epoch": 0.7172069825436409, + "grad_norm": 0.4626005383362087, + "learning_rate": 9.776692735210924e-06, + "loss": 0.8789, + "step": 14380 + }, + { + "epoch": 0.7172568578553616, + "grad_norm": 0.46058842141575956, + "learning_rate": 9.773489543974749e-06, + "loss": 0.8452, + "step": 14381 + }, + { + "epoch": 0.7173067331670823, + "grad_norm": 0.45271852396846, + "learning_rate": 9.77028675006754e-06, + "loss": 0.8638, + "step": 14382 + }, + { + "epoch": 0.717356608478803, + "grad_norm": 0.566212767429909, + "learning_rate": 9.7670843535729e-06, + "loss": 0.8719, + "step": 14383 + }, + { + "epoch": 0.7174064837905237, + "grad_norm": 0.5303826258448978, + "learning_rate": 9.763882354574359e-06, + "loss": 0.8717, + "step": 14384 + }, + { + "epoch": 0.7174563591022444, + "grad_norm": 0.4692663916891948, + "learning_rate": 9.760680753155496e-06, + "loss": 0.8538, + "step": 14385 + }, + { + "epoch": 0.7175062344139651, + "grad_norm": 0.45907178386606695, + "learning_rate": 9.757479549399842e-06, + "loss": 0.8544, + "step": 14386 + }, + { + "epoch": 0.7175561097256857, + "grad_norm": 0.4560074739565883, + "learning_rate": 9.754278743390954e-06, + "loss": 0.8337, + "step": 14387 + }, + { + "epoch": 0.7176059850374065, + "grad_norm": 0.4853458428860765, + "learning_rate": 9.751078335212324e-06, + "loss": 0.834, + "step": 14388 + }, + { + "epoch": 0.7176558603491272, + "grad_norm": 0.40697397988023276, + "learning_rate": 9.747878324947487e-06, + "loss": 0.8518, + "step": 14389 + }, + { + "epoch": 0.7177057356608478, + "grad_norm": 0.5183698124736855, + "learning_rate": 9.74467871267993e-06, + "loss": 0.8759, + "step": 14390 + }, + { + "epoch": 0.7177556109725686, + "grad_norm": 0.478091427737896, + "learning_rate": 9.741479498493175e-06, + "loss": 0.9038, + "step": 14391 + }, + { + "epoch": 0.7178054862842893, + "grad_norm": 0.8442933169329919, + "learning_rate": 9.738280682470663e-06, + "loss": 0.8414, + "step": 14392 + }, + { + "epoch": 0.7178553615960099, + "grad_norm": 0.48458742502317215, + "learning_rate": 9.735082264695897e-06, + "loss": 0.8842, + "step": 14393 + }, + { + "epoch": 0.7179052369077307, + "grad_norm": 0.7541257872006955, + "learning_rate": 9.731884245252319e-06, + "loss": 0.8666, + "step": 14394 + }, + { + "epoch": 0.7179551122194514, + "grad_norm": 0.5068628346298463, + "learning_rate": 9.728686624223404e-06, + "loss": 0.8527, + "step": 14395 + }, + { + "epoch": 0.718004987531172, + "grad_norm": 0.46745261248335584, + "learning_rate": 9.725489401692558e-06, + "loss": 0.8614, + "step": 14396 + }, + { + "epoch": 0.7180548628428928, + "grad_norm": 0.477196051208741, + "learning_rate": 9.722292577743241e-06, + "loss": 0.8728, + "step": 14397 + }, + { + "epoch": 0.7181047381546135, + "grad_norm": 0.4526208614579007, + "learning_rate": 9.719096152458865e-06, + "loss": 0.8318, + "step": 14398 + }, + { + "epoch": 0.7181546134663341, + "grad_norm": 1.2460386084536401, + "learning_rate": 9.715900125922833e-06, + "loss": 0.8837, + "step": 14399 + }, + { + "epoch": 0.7182044887780549, + "grad_norm": 0.8360243383406172, + "learning_rate": 9.712704498218539e-06, + "loss": 0.8442, + "step": 14400 + }, + { + "epoch": 0.7182543640897756, + "grad_norm": 0.4295966147751568, + "learning_rate": 9.709509269429387e-06, + "loss": 0.8121, + "step": 14401 + }, + { + "epoch": 0.7183042394014962, + "grad_norm": 1.0156833906024303, + "learning_rate": 9.706314439638747e-06, + "loss": 0.8436, + "step": 14402 + }, + { + "epoch": 0.718354114713217, + "grad_norm": 0.5132535462038073, + "learning_rate": 9.70312000892998e-06, + "loss": 0.8648, + "step": 14403 + }, + { + "epoch": 0.7184039900249377, + "grad_norm": 0.5068230464023988, + "learning_rate": 9.69992597738646e-06, + "loss": 0.8471, + "step": 14404 + }, + { + "epoch": 0.7184538653366583, + "grad_norm": 2.2562690174190707, + "learning_rate": 9.696732345091522e-06, + "loss": 0.8665, + "step": 14405 + }, + { + "epoch": 0.7185037406483791, + "grad_norm": 0.5106466483745877, + "learning_rate": 9.693539112128505e-06, + "loss": 0.8778, + "step": 14406 + }, + { + "epoch": 0.7185536159600997, + "grad_norm": 0.5158608166525335, + "learning_rate": 9.690346278580726e-06, + "loss": 0.8549, + "step": 14407 + }, + { + "epoch": 0.7186034912718204, + "grad_norm": 0.547563244339913, + "learning_rate": 9.687153844531519e-06, + "loss": 0.848, + "step": 14408 + }, + { + "epoch": 0.7186533665835412, + "grad_norm": 0.9548978756868328, + "learning_rate": 9.683961810064176e-06, + "loss": 0.8692, + "step": 14409 + }, + { + "epoch": 0.7187032418952618, + "grad_norm": 0.49739165694056764, + "learning_rate": 9.680770175262e-06, + "loss": 0.8457, + "step": 14410 + }, + { + "epoch": 0.7187531172069825, + "grad_norm": 0.5401462792974232, + "learning_rate": 9.677578940208257e-06, + "loss": 0.8314, + "step": 14411 + }, + { + "epoch": 0.7188029925187033, + "grad_norm": 0.4886515380806084, + "learning_rate": 9.674388104986246e-06, + "loss": 0.8833, + "step": 14412 + }, + { + "epoch": 0.7188528678304239, + "grad_norm": 0.48758098638981145, + "learning_rate": 9.671197669679217e-06, + "loss": 0.8458, + "step": 14413 + }, + { + "epoch": 0.7189027431421446, + "grad_norm": 0.5021700211530026, + "learning_rate": 9.668007634370424e-06, + "loss": 0.8808, + "step": 14414 + }, + { + "epoch": 0.7189526184538654, + "grad_norm": 0.5729768144446221, + "learning_rate": 9.664817999143103e-06, + "loss": 0.869, + "step": 14415 + }, + { + "epoch": 0.719002493765586, + "grad_norm": 0.5189566601153212, + "learning_rate": 9.6616287640805e-06, + "loss": 0.8555, + "step": 14416 + }, + { + "epoch": 0.7190523690773067, + "grad_norm": 0.4834028846098038, + "learning_rate": 9.65843992926583e-06, + "loss": 0.8639, + "step": 14417 + }, + { + "epoch": 0.7191022443890275, + "grad_norm": 0.6077508000471085, + "learning_rate": 9.655251494782305e-06, + "loss": 0.8458, + "step": 14418 + }, + { + "epoch": 0.7191521197007481, + "grad_norm": 0.5226421609622661, + "learning_rate": 9.652063460713118e-06, + "loss": 0.8546, + "step": 14419 + }, + { + "epoch": 0.7192019950124688, + "grad_norm": 0.4138419189380088, + "learning_rate": 9.648875827141481e-06, + "loss": 0.8447, + "step": 14420 + }, + { + "epoch": 0.7192518703241896, + "grad_norm": 0.5558444709869479, + "learning_rate": 9.645688594150543e-06, + "loss": 0.8585, + "step": 14421 + }, + { + "epoch": 0.7193017456359102, + "grad_norm": 0.5909083477120728, + "learning_rate": 9.642501761823497e-06, + "loss": 0.8309, + "step": 14422 + }, + { + "epoch": 0.7193516209476309, + "grad_norm": 0.7156904720075555, + "learning_rate": 9.639315330243486e-06, + "loss": 0.8549, + "step": 14423 + }, + { + "epoch": 0.7194014962593516, + "grad_norm": 0.7230549535705363, + "learning_rate": 9.636129299493684e-06, + "loss": 0.8766, + "step": 14424 + }, + { + "epoch": 0.7194513715710723, + "grad_norm": 0.5683265750343038, + "learning_rate": 9.632943669657196e-06, + "loss": 0.8828, + "step": 14425 + }, + { + "epoch": 0.719501246882793, + "grad_norm": 0.48249996051961264, + "learning_rate": 9.629758440817175e-06, + "loss": 0.8523, + "step": 14426 + }, + { + "epoch": 0.7195511221945137, + "grad_norm": 0.5369684234165083, + "learning_rate": 9.626573613056716e-06, + "loss": 0.8769, + "step": 14427 + }, + { + "epoch": 0.7196009975062344, + "grad_norm": 0.6345082336964084, + "learning_rate": 9.623389186458958e-06, + "loss": 0.8845, + "step": 14428 + }, + { + "epoch": 0.7196508728179551, + "grad_norm": 0.6615235749763129, + "learning_rate": 9.62020516110696e-06, + "loss": 0.8599, + "step": 14429 + }, + { + "epoch": 0.7197007481296758, + "grad_norm": 1.7841533689120033, + "learning_rate": 9.617021537083834e-06, + "loss": 0.8361, + "step": 14430 + }, + { + "epoch": 0.7197506234413965, + "grad_norm": 0.5873518387068466, + "learning_rate": 9.613838314472649e-06, + "loss": 0.8082, + "step": 14431 + }, + { + "epoch": 0.7198004987531172, + "grad_norm": 0.428638544859629, + "learning_rate": 9.610655493356462e-06, + "loss": 0.8528, + "step": 14432 + }, + { + "epoch": 0.7198503740648379, + "grad_norm": 0.44887406770922833, + "learning_rate": 9.607473073818327e-06, + "loss": 0.845, + "step": 14433 + }, + { + "epoch": 0.7199002493765586, + "grad_norm": 0.6196243873543678, + "learning_rate": 9.604291055941303e-06, + "loss": 0.8699, + "step": 14434 + }, + { + "epoch": 0.7199501246882793, + "grad_norm": 0.5926948497622947, + "learning_rate": 9.601109439808412e-06, + "loss": 0.8657, + "step": 14435 + }, + { + "epoch": 0.72, + "grad_norm": 0.513009825237263, + "learning_rate": 9.597928225502678e-06, + "loss": 0.8977, + "step": 14436 + }, + { + "epoch": 0.7200498753117207, + "grad_norm": 1.1943494034479158, + "learning_rate": 9.594747413107105e-06, + "loss": 0.875, + "step": 14437 + }, + { + "epoch": 0.7200997506234414, + "grad_norm": 0.46226539149718576, + "learning_rate": 9.591567002704716e-06, + "loss": 0.8022, + "step": 14438 + }, + { + "epoch": 0.7201496259351621, + "grad_norm": 0.5488662966242193, + "learning_rate": 9.588386994378487e-06, + "loss": 0.8721, + "step": 14439 + }, + { + "epoch": 0.7201995012468828, + "grad_norm": 0.49892176011138767, + "learning_rate": 9.585207388211404e-06, + "loss": 0.8461, + "step": 14440 + }, + { + "epoch": 0.7202493765586034, + "grad_norm": 0.5400249477708605, + "learning_rate": 9.582028184286423e-06, + "loss": 0.8174, + "step": 14441 + }, + { + "epoch": 0.7202992518703242, + "grad_norm": 0.4620877501339228, + "learning_rate": 9.578849382686527e-06, + "loss": 0.8813, + "step": 14442 + }, + { + "epoch": 0.7203491271820449, + "grad_norm": 0.5191794941276047, + "learning_rate": 9.575670983494656e-06, + "loss": 0.8516, + "step": 14443 + }, + { + "epoch": 0.7203990024937655, + "grad_norm": 0.4735774410283947, + "learning_rate": 9.572492986793738e-06, + "loss": 0.8452, + "step": 14444 + }, + { + "epoch": 0.7204488778054863, + "grad_norm": 0.5448636881074977, + "learning_rate": 9.56931539266672e-06, + "loss": 0.836, + "step": 14445 + }, + { + "epoch": 0.720498753117207, + "grad_norm": 0.559262426504218, + "learning_rate": 9.566138201196512e-06, + "loss": 0.8648, + "step": 14446 + }, + { + "epoch": 0.7205486284289276, + "grad_norm": 0.471963952936929, + "learning_rate": 9.562961412466017e-06, + "loss": 0.8678, + "step": 14447 + }, + { + "epoch": 0.7205985037406484, + "grad_norm": 0.5355752480047541, + "learning_rate": 9.55978502655813e-06, + "loss": 0.8693, + "step": 14448 + }, + { + "epoch": 0.7206483790523691, + "grad_norm": 0.46851942462452395, + "learning_rate": 9.556609043555751e-06, + "loss": 0.9062, + "step": 14449 + }, + { + "epoch": 0.7206982543640897, + "grad_norm": 0.5098654436193122, + "learning_rate": 9.553433463541748e-06, + "loss": 0.8404, + "step": 14450 + }, + { + "epoch": 0.7207481296758105, + "grad_norm": 0.49932023908798906, + "learning_rate": 9.550258286598982e-06, + "loss": 0.8674, + "step": 14451 + }, + { + "epoch": 0.7207980049875312, + "grad_norm": 0.5070662040337226, + "learning_rate": 9.547083512810306e-06, + "loss": 0.8371, + "step": 14452 + }, + { + "epoch": 0.7208478802992518, + "grad_norm": 0.4803574561960602, + "learning_rate": 9.543909142258579e-06, + "loss": 0.8378, + "step": 14453 + }, + { + "epoch": 0.7208977556109726, + "grad_norm": 0.5002217161660488, + "learning_rate": 9.540735175026622e-06, + "loss": 0.8241, + "step": 14454 + }, + { + "epoch": 0.7209476309226933, + "grad_norm": 0.4584708170025907, + "learning_rate": 9.537561611197265e-06, + "loss": 0.8878, + "step": 14455 + }, + { + "epoch": 0.7209975062344139, + "grad_norm": 0.4802559253330176, + "learning_rate": 9.53438845085331e-06, + "loss": 0.8319, + "step": 14456 + }, + { + "epoch": 0.7210473815461347, + "grad_norm": 0.5168305556784466, + "learning_rate": 9.53121569407758e-06, + "loss": 0.8583, + "step": 14457 + }, + { + "epoch": 0.7210972568578553, + "grad_norm": 0.6515814890009085, + "learning_rate": 9.528043340952842e-06, + "loss": 0.884, + "step": 14458 + }, + { + "epoch": 0.721147132169576, + "grad_norm": 0.47237191426517244, + "learning_rate": 9.524871391561893e-06, + "loss": 0.8749, + "step": 14459 + }, + { + "epoch": 0.7211970074812968, + "grad_norm": 0.6977393160411514, + "learning_rate": 9.521699845987491e-06, + "loss": 0.85, + "step": 14460 + }, + { + "epoch": 0.7212468827930174, + "grad_norm": 0.4922711155562241, + "learning_rate": 9.518528704312424e-06, + "loss": 0.8764, + "step": 14461 + }, + { + "epoch": 0.7212967581047381, + "grad_norm": 0.5329628021773565, + "learning_rate": 9.515357966619401e-06, + "loss": 0.8463, + "step": 14462 + }, + { + "epoch": 0.7213466334164589, + "grad_norm": 0.5300477725725115, + "learning_rate": 9.512187632991192e-06, + "loss": 0.7755, + "step": 14463 + }, + { + "epoch": 0.7213965087281795, + "grad_norm": 0.7478372030131077, + "learning_rate": 9.509017703510509e-06, + "loss": 0.8892, + "step": 14464 + }, + { + "epoch": 0.7214463840399002, + "grad_norm": 0.4726674283763663, + "learning_rate": 9.50584817826009e-06, + "loss": 0.8179, + "step": 14465 + }, + { + "epoch": 0.721496259351621, + "grad_norm": 0.5403523782747464, + "learning_rate": 9.502679057322613e-06, + "loss": 0.8299, + "step": 14466 + }, + { + "epoch": 0.7215461346633416, + "grad_norm": 1.0387419951475318, + "learning_rate": 9.499510340780801e-06, + "loss": 0.8553, + "step": 14467 + }, + { + "epoch": 0.7215960099750623, + "grad_norm": 0.8214099802473472, + "learning_rate": 9.496342028717325e-06, + "loss": 0.8005, + "step": 14468 + }, + { + "epoch": 0.7216458852867831, + "grad_norm": 0.5197624450641904, + "learning_rate": 9.493174121214868e-06, + "loss": 0.9142, + "step": 14469 + }, + { + "epoch": 0.7216957605985037, + "grad_norm": 0.5827478423415146, + "learning_rate": 9.490006618356085e-06, + "loss": 0.8522, + "step": 14470 + }, + { + "epoch": 0.7217456359102244, + "grad_norm": 0.46891653710335596, + "learning_rate": 9.486839520223645e-06, + "loss": 0.854, + "step": 14471 + }, + { + "epoch": 0.7217955112219452, + "grad_norm": 1.773988592727509, + "learning_rate": 9.483672826900184e-06, + "loss": 0.8401, + "step": 14472 + }, + { + "epoch": 0.7218453865336658, + "grad_norm": 0.4025099091958943, + "learning_rate": 9.480506538468336e-06, + "loss": 0.825, + "step": 14473 + }, + { + "epoch": 0.7218952618453865, + "grad_norm": 0.6101245364699298, + "learning_rate": 9.477340655010716e-06, + "loss": 0.8392, + "step": 14474 + }, + { + "epoch": 0.7219451371571073, + "grad_norm": 0.48766776578179477, + "learning_rate": 9.474175176609956e-06, + "loss": 0.8012, + "step": 14475 + }, + { + "epoch": 0.7219950124688279, + "grad_norm": 0.4585234939579101, + "learning_rate": 9.471010103348643e-06, + "loss": 0.8785, + "step": 14476 + }, + { + "epoch": 0.7220448877805486, + "grad_norm": 0.5275710172478001, + "learning_rate": 9.467845435309375e-06, + "loss": 0.8791, + "step": 14477 + }, + { + "epoch": 0.7220947630922693, + "grad_norm": 0.5109929155397375, + "learning_rate": 9.464681172574715e-06, + "loss": 0.8897, + "step": 14478 + }, + { + "epoch": 0.72214463840399, + "grad_norm": 0.5355586333272493, + "learning_rate": 9.46151731522726e-06, + "loss": 0.8154, + "step": 14479 + }, + { + "epoch": 0.7221945137157108, + "grad_norm": 0.4880720140463448, + "learning_rate": 9.458353863349557e-06, + "loss": 0.8731, + "step": 14480 + }, + { + "epoch": 0.7222443890274314, + "grad_norm": 0.6227363744458965, + "learning_rate": 9.455190817024142e-06, + "loss": 0.8493, + "step": 14481 + }, + { + "epoch": 0.7222942643391521, + "grad_norm": 0.4910088804070353, + "learning_rate": 9.452028176333578e-06, + "loss": 0.8264, + "step": 14482 + }, + { + "epoch": 0.7223441396508729, + "grad_norm": 0.538361454704316, + "learning_rate": 9.448865941360379e-06, + "loss": 0.8717, + "step": 14483 + }, + { + "epoch": 0.7223940149625935, + "grad_norm": 0.43350670096094757, + "learning_rate": 9.445704112187065e-06, + "loss": 0.8399, + "step": 14484 + }, + { + "epoch": 0.7224438902743142, + "grad_norm": 0.44613598951585365, + "learning_rate": 9.442542688896131e-06, + "loss": 0.8609, + "step": 14485 + }, + { + "epoch": 0.722493765586035, + "grad_norm": 0.4937635023427303, + "learning_rate": 9.43938167157009e-06, + "loss": 0.8573, + "step": 14486 + }, + { + "epoch": 0.7225436408977556, + "grad_norm": 0.5392964333314649, + "learning_rate": 9.436221060291423e-06, + "loss": 0.8479, + "step": 14487 + }, + { + "epoch": 0.7225935162094763, + "grad_norm": 0.5025485151717053, + "learning_rate": 9.4330608551426e-06, + "loss": 0.8542, + "step": 14488 + }, + { + "epoch": 0.722643391521197, + "grad_norm": 0.5774672070209848, + "learning_rate": 9.429901056206078e-06, + "loss": 0.8447, + "step": 14489 + }, + { + "epoch": 0.7226932668329177, + "grad_norm": 0.485577271644065, + "learning_rate": 9.42674166356434e-06, + "loss": 0.8876, + "step": 14490 + }, + { + "epoch": 0.7227431421446384, + "grad_norm": 0.4443521050804174, + "learning_rate": 9.423582677299789e-06, + "loss": 0.8337, + "step": 14491 + }, + { + "epoch": 0.7227930174563592, + "grad_norm": 0.4631036807858547, + "learning_rate": 9.420424097494884e-06, + "loss": 0.8858, + "step": 14492 + }, + { + "epoch": 0.7228428927680798, + "grad_norm": 0.4240073109406579, + "learning_rate": 9.417265924232036e-06, + "loss": 0.8707, + "step": 14493 + }, + { + "epoch": 0.7228927680798005, + "grad_norm": 0.4669314118819564, + "learning_rate": 9.414108157593674e-06, + "loss": 0.8525, + "step": 14494 + }, + { + "epoch": 0.7229426433915211, + "grad_norm": 0.5198281389738745, + "learning_rate": 9.410950797662167e-06, + "loss": 0.7953, + "step": 14495 + }, + { + "epoch": 0.7229925187032419, + "grad_norm": 0.49022269966665094, + "learning_rate": 9.40779384451993e-06, + "loss": 0.7884, + "step": 14496 + }, + { + "epoch": 0.7230423940149626, + "grad_norm": 0.43448901645537985, + "learning_rate": 9.404637298249328e-06, + "loss": 0.8054, + "step": 14497 + }, + { + "epoch": 0.7230922693266832, + "grad_norm": 0.5054538839020133, + "learning_rate": 9.401481158932752e-06, + "loss": 0.8739, + "step": 14498 + }, + { + "epoch": 0.723142144638404, + "grad_norm": 0.5157086986343957, + "learning_rate": 9.398325426652526e-06, + "loss": 0.8424, + "step": 14499 + }, + { + "epoch": 0.7231920199501247, + "grad_norm": 0.7298035519007957, + "learning_rate": 9.395170101491024e-06, + "loss": 0.8114, + "step": 14500 + }, + { + "epoch": 0.7232418952618453, + "grad_norm": 0.49282250616372064, + "learning_rate": 9.392015183530568e-06, + "loss": 0.8272, + "step": 14501 + }, + { + "epoch": 0.7232917705735661, + "grad_norm": 0.44606503843325285, + "learning_rate": 9.388860672853505e-06, + "loss": 0.8699, + "step": 14502 + }, + { + "epoch": 0.7233416458852868, + "grad_norm": 0.4528767586844733, + "learning_rate": 9.385706569542122e-06, + "loss": 0.801, + "step": 14503 + }, + { + "epoch": 0.7233915211970074, + "grad_norm": 0.49212028826143794, + "learning_rate": 9.382552873678746e-06, + "loss": 0.8437, + "step": 14504 + }, + { + "epoch": 0.7234413965087282, + "grad_norm": 0.497595422540764, + "learning_rate": 9.379399585345663e-06, + "loss": 0.8848, + "step": 14505 + }, + { + "epoch": 0.7234912718204489, + "grad_norm": 0.5038980193891852, + "learning_rate": 9.376246704625154e-06, + "loss": 0.8329, + "step": 14506 + }, + { + "epoch": 0.7235411471321695, + "grad_norm": 0.5146909662698692, + "learning_rate": 9.373094231599491e-06, + "loss": 0.8698, + "step": 14507 + }, + { + "epoch": 0.7235910224438903, + "grad_norm": 1.7384413526212454, + "learning_rate": 9.369942166350948e-06, + "loss": 0.8379, + "step": 14508 + }, + { + "epoch": 0.723640897755611, + "grad_norm": 0.4876738110350695, + "learning_rate": 9.366790508961767e-06, + "loss": 0.8019, + "step": 14509 + }, + { + "epoch": 0.7236907730673316, + "grad_norm": 0.4709820649926233, + "learning_rate": 9.363639259514193e-06, + "loss": 0.8488, + "step": 14510 + }, + { + "epoch": 0.7237406483790524, + "grad_norm": 0.4414696489984186, + "learning_rate": 9.360488418090443e-06, + "loss": 0.8719, + "step": 14511 + }, + { + "epoch": 0.723790523690773, + "grad_norm": 0.4765908386843797, + "learning_rate": 9.35733798477276e-06, + "loss": 0.8656, + "step": 14512 + }, + { + "epoch": 0.7238403990024938, + "grad_norm": 0.5016109829227009, + "learning_rate": 9.35418795964334e-06, + "loss": 0.899, + "step": 14513 + }, + { + "epoch": 0.7238902743142145, + "grad_norm": 0.44457943652400445, + "learning_rate": 9.351038342784386e-06, + "loss": 0.8746, + "step": 14514 + }, + { + "epoch": 0.7239401496259351, + "grad_norm": 0.4737370045098044, + "learning_rate": 9.34788913427807e-06, + "loss": 0.843, + "step": 14515 + }, + { + "epoch": 0.7239900249376559, + "grad_norm": 0.5339092725641609, + "learning_rate": 9.344740334206595e-06, + "loss": 0.8619, + "step": 14516 + }, + { + "epoch": 0.7240399002493766, + "grad_norm": 0.4520813755250942, + "learning_rate": 9.341591942652112e-06, + "loss": 0.8424, + "step": 14517 + }, + { + "epoch": 0.7240897755610972, + "grad_norm": 0.4511882896092578, + "learning_rate": 9.338443959696772e-06, + "loss": 0.826, + "step": 14518 + }, + { + "epoch": 0.724139650872818, + "grad_norm": 0.6101329507178033, + "learning_rate": 9.335296385422735e-06, + "loss": 0.9053, + "step": 14519 + }, + { + "epoch": 0.7241895261845387, + "grad_norm": 0.7110867733712027, + "learning_rate": 9.332149219912129e-06, + "loss": 0.8395, + "step": 14520 + }, + { + "epoch": 0.7242394014962593, + "grad_norm": 0.5707035816248519, + "learning_rate": 9.329002463247075e-06, + "loss": 0.8838, + "step": 14521 + }, + { + "epoch": 0.72428927680798, + "grad_norm": 0.47231501324492886, + "learning_rate": 9.325856115509683e-06, + "loss": 0.8682, + "step": 14522 + }, + { + "epoch": 0.7243391521197008, + "grad_norm": 0.50488107648489, + "learning_rate": 9.322710176782068e-06, + "loss": 0.8444, + "step": 14523 + }, + { + "epoch": 0.7243890274314214, + "grad_norm": 0.5618125944855977, + "learning_rate": 9.319564647146314e-06, + "loss": 0.8699, + "step": 14524 + }, + { + "epoch": 0.7244389027431422, + "grad_norm": 0.5143514879427578, + "learning_rate": 9.316419526684504e-06, + "loss": 0.8755, + "step": 14525 + }, + { + "epoch": 0.7244887780548629, + "grad_norm": 0.6693943995749592, + "learning_rate": 9.313274815478698e-06, + "loss": 0.8596, + "step": 14526 + }, + { + "epoch": 0.7245386533665835, + "grad_norm": 0.4747715900397753, + "learning_rate": 9.310130513610982e-06, + "loss": 0.8308, + "step": 14527 + }, + { + "epoch": 0.7245885286783043, + "grad_norm": 0.47134706206292054, + "learning_rate": 9.306986621163371e-06, + "loss": 0.9041, + "step": 14528 + }, + { + "epoch": 0.724638403990025, + "grad_norm": 0.4685208658341283, + "learning_rate": 9.303843138217933e-06, + "loss": 0.838, + "step": 14529 + }, + { + "epoch": 0.7246882793017456, + "grad_norm": 0.5031484872439521, + "learning_rate": 9.300700064856671e-06, + "loss": 0.8433, + "step": 14530 + }, + { + "epoch": 0.7247381546134664, + "grad_norm": 0.45369553126009304, + "learning_rate": 9.297557401161632e-06, + "loss": 0.8337, + "step": 14531 + }, + { + "epoch": 0.724788029925187, + "grad_norm": 0.8836261137821831, + "learning_rate": 9.294415147214792e-06, + "loss": 0.855, + "step": 14532 + }, + { + "epoch": 0.7248379052369077, + "grad_norm": 0.461889215008138, + "learning_rate": 9.291273303098163e-06, + "loss": 0.8407, + "step": 14533 + }, + { + "epoch": 0.7248877805486285, + "grad_norm": 0.505383224043904, + "learning_rate": 9.28813186889372e-06, + "loss": 0.8471, + "step": 14534 + }, + { + "epoch": 0.7249376558603491, + "grad_norm": 0.4721906478045368, + "learning_rate": 9.284990844683464e-06, + "loss": 0.8854, + "step": 14535 + }, + { + "epoch": 0.7249875311720698, + "grad_norm": 0.483909667941293, + "learning_rate": 9.281850230549319e-06, + "loss": 0.8555, + "step": 14536 + }, + { + "epoch": 0.7250374064837906, + "grad_norm": 0.5307961723935465, + "learning_rate": 9.278710026573267e-06, + "loss": 0.894, + "step": 14537 + }, + { + "epoch": 0.7250872817955112, + "grad_norm": 0.4358989109735512, + "learning_rate": 9.27557023283724e-06, + "loss": 0.8239, + "step": 14538 + }, + { + "epoch": 0.7251371571072319, + "grad_norm": 0.456655646920633, + "learning_rate": 9.272430849423174e-06, + "loss": 0.846, + "step": 14539 + }, + { + "epoch": 0.7251870324189527, + "grad_norm": 0.4300132598161514, + "learning_rate": 9.269291876412974e-06, + "loss": 0.9035, + "step": 14540 + }, + { + "epoch": 0.7252369077306733, + "grad_norm": 0.45989604664214656, + "learning_rate": 9.266153313888573e-06, + "loss": 0.9043, + "step": 14541 + }, + { + "epoch": 0.725286783042394, + "grad_norm": 0.7033971552552319, + "learning_rate": 9.26301516193186e-06, + "loss": 0.8947, + "step": 14542 + }, + { + "epoch": 0.7253366583541148, + "grad_norm": 0.4606071647493451, + "learning_rate": 9.259877420624721e-06, + "loss": 0.8357, + "step": 14543 + }, + { + "epoch": 0.7253865336658354, + "grad_norm": 0.4829422889834377, + "learning_rate": 9.25674009004903e-06, + "loss": 0.8567, + "step": 14544 + }, + { + "epoch": 0.7254364089775561, + "grad_norm": 0.4351154731762691, + "learning_rate": 9.253603170286667e-06, + "loss": 0.8644, + "step": 14545 + }, + { + "epoch": 0.7254862842892769, + "grad_norm": 0.44293272573369014, + "learning_rate": 9.250466661419484e-06, + "loss": 0.8516, + "step": 14546 + }, + { + "epoch": 0.7255361596009975, + "grad_norm": 0.6123720010143642, + "learning_rate": 9.247330563529325e-06, + "loss": 0.8305, + "step": 14547 + }, + { + "epoch": 0.7255860349127182, + "grad_norm": 0.4725076260306546, + "learning_rate": 9.244194876698015e-06, + "loss": 0.8612, + "step": 14548 + }, + { + "epoch": 0.7256359102244389, + "grad_norm": 0.4328094567263846, + "learning_rate": 9.241059601007396e-06, + "loss": 0.8664, + "step": 14549 + }, + { + "epoch": 0.7256857855361596, + "grad_norm": 0.4900513636628902, + "learning_rate": 9.237924736539277e-06, + "loss": 0.8881, + "step": 14550 + }, + { + "epoch": 0.7257356608478803, + "grad_norm": 0.4736984163009862, + "learning_rate": 9.234790283375455e-06, + "loss": 0.8471, + "step": 14551 + }, + { + "epoch": 0.725785536159601, + "grad_norm": 0.507182135392389, + "learning_rate": 9.23165624159772e-06, + "loss": 0.8401, + "step": 14552 + }, + { + "epoch": 0.7258354114713217, + "grad_norm": 0.5219067468259287, + "learning_rate": 9.228522611287862e-06, + "loss": 0.8511, + "step": 14553 + }, + { + "epoch": 0.7258852867830424, + "grad_norm": 0.5579712160644991, + "learning_rate": 9.225389392527651e-06, + "loss": 0.8463, + "step": 14554 + }, + { + "epoch": 0.725935162094763, + "grad_norm": 0.5428877587635379, + "learning_rate": 9.222256585398845e-06, + "loss": 0.8459, + "step": 14555 + }, + { + "epoch": 0.7259850374064838, + "grad_norm": 0.46785040285721513, + "learning_rate": 9.21912418998318e-06, + "loss": 0.879, + "step": 14556 + }, + { + "epoch": 0.7260349127182045, + "grad_norm": 0.5521091932909165, + "learning_rate": 9.21599220636242e-06, + "loss": 0.88, + "step": 14557 + }, + { + "epoch": 0.7260847880299252, + "grad_norm": 0.5485781274268139, + "learning_rate": 9.212860634618275e-06, + "loss": 0.8047, + "step": 14558 + }, + { + "epoch": 0.7261346633416459, + "grad_norm": 0.5157820944390815, + "learning_rate": 9.209729474832459e-06, + "loss": 0.8438, + "step": 14559 + }, + { + "epoch": 0.7261845386533666, + "grad_norm": 0.5084676070735886, + "learning_rate": 9.206598727086696e-06, + "loss": 0.84, + "step": 14560 + }, + { + "epoch": 0.7262344139650873, + "grad_norm": 0.49555717393854726, + "learning_rate": 9.203468391462668e-06, + "loss": 0.8549, + "step": 14561 + }, + { + "epoch": 0.726284289276808, + "grad_norm": 0.5226419829479164, + "learning_rate": 9.200338468042066e-06, + "loss": 0.8631, + "step": 14562 + }, + { + "epoch": 0.7263341645885287, + "grad_norm": 0.47660428209616273, + "learning_rate": 9.197208956906554e-06, + "loss": 0.8212, + "step": 14563 + }, + { + "epoch": 0.7263840399002494, + "grad_norm": 0.5391211802853704, + "learning_rate": 9.194079858137813e-06, + "loss": 0.8493, + "step": 14564 + }, + { + "epoch": 0.7264339152119701, + "grad_norm": 0.5014467073040811, + "learning_rate": 9.190951171817469e-06, + "loss": 0.8417, + "step": 14565 + }, + { + "epoch": 0.7264837905236907, + "grad_norm": 0.4944580082688928, + "learning_rate": 9.18782289802719e-06, + "loss": 0.8412, + "step": 14566 + }, + { + "epoch": 0.7265336658354115, + "grad_norm": 0.4453367851400574, + "learning_rate": 9.184695036848584e-06, + "loss": 0.8327, + "step": 14567 + }, + { + "epoch": 0.7265835411471322, + "grad_norm": 0.4322992810593655, + "learning_rate": 9.181567588363302e-06, + "loss": 0.8299, + "step": 14568 + }, + { + "epoch": 0.7266334164588528, + "grad_norm": 0.5014750853810944, + "learning_rate": 9.178440552652917e-06, + "loss": 0.843, + "step": 14569 + }, + { + "epoch": 0.7266832917705736, + "grad_norm": 0.49693934776994314, + "learning_rate": 9.175313929799053e-06, + "loss": 0.8743, + "step": 14570 + }, + { + "epoch": 0.7267331670822943, + "grad_norm": 0.4755905511644335, + "learning_rate": 9.172187719883283e-06, + "loss": 0.8549, + "step": 14571 + }, + { + "epoch": 0.7267830423940149, + "grad_norm": 0.6674964925905851, + "learning_rate": 9.169061922987206e-06, + "loss": 0.8099, + "step": 14572 + }, + { + "epoch": 0.7268329177057357, + "grad_norm": 0.5377554060619939, + "learning_rate": 9.165936539192358e-06, + "loss": 0.9125, + "step": 14573 + }, + { + "epoch": 0.7268827930174564, + "grad_norm": 0.5126746470857984, + "learning_rate": 9.162811568580319e-06, + "loss": 0.8717, + "step": 14574 + }, + { + "epoch": 0.726932668329177, + "grad_norm": 0.4011507304430446, + "learning_rate": 9.15968701123262e-06, + "loss": 0.8567, + "step": 14575 + }, + { + "epoch": 0.7269825436408978, + "grad_norm": 0.4810580465702477, + "learning_rate": 9.1565628672308e-06, + "loss": 0.8808, + "step": 14576 + }, + { + "epoch": 0.7270324189526185, + "grad_norm": 0.5219095731078762, + "learning_rate": 9.153439136656375e-06, + "loss": 0.8202, + "step": 14577 + }, + { + "epoch": 0.7270822942643391, + "grad_norm": 0.5321092910227192, + "learning_rate": 9.150315819590871e-06, + "loss": 0.8366, + "step": 14578 + }, + { + "epoch": 0.7271321695760599, + "grad_norm": 0.6306857818742845, + "learning_rate": 9.147192916115785e-06, + "loss": 0.883, + "step": 14579 + }, + { + "epoch": 0.7271820448877806, + "grad_norm": 0.6049919922884694, + "learning_rate": 9.144070426312601e-06, + "loss": 0.8639, + "step": 14580 + }, + { + "epoch": 0.7272319201995012, + "grad_norm": 0.4769664654077874, + "learning_rate": 9.140948350262798e-06, + "loss": 0.8697, + "step": 14581 + }, + { + "epoch": 0.727281795511222, + "grad_norm": 0.5000075657531361, + "learning_rate": 9.137826688047857e-06, + "loss": 0.8615, + "step": 14582 + }, + { + "epoch": 0.7273316708229426, + "grad_norm": 0.5305233715351142, + "learning_rate": 9.134705439749231e-06, + "loss": 0.8401, + "step": 14583 + }, + { + "epoch": 0.7273815461346633, + "grad_norm": 0.43757959986396106, + "learning_rate": 9.131584605448362e-06, + "loss": 0.8516, + "step": 14584 + }, + { + "epoch": 0.7274314214463841, + "grad_norm": 0.4798005428961741, + "learning_rate": 9.128464185226687e-06, + "loss": 0.9017, + "step": 14585 + }, + { + "epoch": 0.7274812967581047, + "grad_norm": 0.47280440097431764, + "learning_rate": 9.125344179165643e-06, + "loss": 0.8402, + "step": 14586 + }, + { + "epoch": 0.7275311720698254, + "grad_norm": 0.43418518336174033, + "learning_rate": 9.122224587346636e-06, + "loss": 0.8357, + "step": 14587 + }, + { + "epoch": 0.7275810473815462, + "grad_norm": 0.4964853180619874, + "learning_rate": 9.119105409851075e-06, + "loss": 0.8492, + "step": 14588 + }, + { + "epoch": 0.7276309226932668, + "grad_norm": 0.5814548710692206, + "learning_rate": 9.115986646760344e-06, + "loss": 0.8314, + "step": 14589 + }, + { + "epoch": 0.7276807980049875, + "grad_norm": 0.5852266226920224, + "learning_rate": 9.112868298155838e-06, + "loss": 0.8582, + "step": 14590 + }, + { + "epoch": 0.7277306733167083, + "grad_norm": 0.5455154745855928, + "learning_rate": 9.109750364118924e-06, + "loss": 0.8334, + "step": 14591 + }, + { + "epoch": 0.7277805486284289, + "grad_norm": 0.6570205412157376, + "learning_rate": 9.106632844730963e-06, + "loss": 0.8422, + "step": 14592 + }, + { + "epoch": 0.7278304239401496, + "grad_norm": 0.5531540026166263, + "learning_rate": 9.103515740073299e-06, + "loss": 0.8919, + "step": 14593 + }, + { + "epoch": 0.7278802992518704, + "grad_norm": 0.5611495601545288, + "learning_rate": 9.100399050227283e-06, + "loss": 0.84, + "step": 14594 + }, + { + "epoch": 0.727930174563591, + "grad_norm": 0.6845735396919684, + "learning_rate": 9.097282775274238e-06, + "loss": 0.8518, + "step": 14595 + }, + { + "epoch": 0.7279800498753117, + "grad_norm": 0.6011427831102613, + "learning_rate": 9.094166915295474e-06, + "loss": 0.8398, + "step": 14596 + }, + { + "epoch": 0.7280299251870325, + "grad_norm": 0.5424338899879867, + "learning_rate": 9.091051470372321e-06, + "loss": 0.8988, + "step": 14597 + }, + { + "epoch": 0.7280798004987531, + "grad_norm": 0.7545446436529489, + "learning_rate": 9.087936440586048e-06, + "loss": 0.8206, + "step": 14598 + }, + { + "epoch": 0.7281296758104738, + "grad_norm": 0.45379949227796057, + "learning_rate": 9.084821826017955e-06, + "loss": 0.8283, + "step": 14599 + }, + { + "epoch": 0.7281795511221946, + "grad_norm": 0.4126165392969694, + "learning_rate": 9.081707626749311e-06, + "loss": 0.7887, + "step": 14600 + }, + { + "epoch": 0.7282294264339152, + "grad_norm": 0.5613810282435108, + "learning_rate": 9.078593842861397e-06, + "loss": 0.8249, + "step": 14601 + }, + { + "epoch": 0.7282793017456359, + "grad_norm": 0.4872899512364943, + "learning_rate": 9.075480474435432e-06, + "loss": 0.8535, + "step": 14602 + }, + { + "epoch": 0.7283291770573566, + "grad_norm": 0.5450524842432468, + "learning_rate": 9.072367521552688e-06, + "loss": 0.8441, + "step": 14603 + }, + { + "epoch": 0.7283790523690773, + "grad_norm": 0.5191329887264314, + "learning_rate": 9.069254984294376e-06, + "loss": 0.8831, + "step": 14604 + }, + { + "epoch": 0.728428927680798, + "grad_norm": 0.4842330419352969, + "learning_rate": 9.066142862741742e-06, + "loss": 0.8458, + "step": 14605 + }, + { + "epoch": 0.7284788029925187, + "grad_norm": 0.47501466921900887, + "learning_rate": 9.063031156975962e-06, + "loss": 0.8109, + "step": 14606 + }, + { + "epoch": 0.7285286783042394, + "grad_norm": 0.5702549804242868, + "learning_rate": 9.05991986707826e-06, + "loss": 0.8554, + "step": 14607 + }, + { + "epoch": 0.7285785536159601, + "grad_norm": 0.551403459212656, + "learning_rate": 9.056808993129815e-06, + "loss": 0.8689, + "step": 14608 + }, + { + "epoch": 0.7286284289276808, + "grad_norm": 0.5040028794606187, + "learning_rate": 9.053698535211805e-06, + "loss": 0.8208, + "step": 14609 + }, + { + "epoch": 0.7286783042394015, + "grad_norm": 0.5289072441672744, + "learning_rate": 9.050588493405387e-06, + "loss": 0.8318, + "step": 14610 + }, + { + "epoch": 0.7287281795511222, + "grad_norm": 0.5036573444322958, + "learning_rate": 9.047478867791732e-06, + "loss": 0.862, + "step": 14611 + }, + { + "epoch": 0.7287780548628429, + "grad_norm": 0.4713014565054789, + "learning_rate": 9.044369658451976e-06, + "loss": 0.85, + "step": 14612 + }, + { + "epoch": 0.7288279301745636, + "grad_norm": 0.4783413117261038, + "learning_rate": 9.041260865467256e-06, + "loss": 0.8644, + "step": 14613 + }, + { + "epoch": 0.7288778054862843, + "grad_norm": 0.5733703328843188, + "learning_rate": 9.038152488918677e-06, + "loss": 0.8183, + "step": 14614 + }, + { + "epoch": 0.728927680798005, + "grad_norm": 0.9700139829082677, + "learning_rate": 9.035044528887378e-06, + "loss": 0.8486, + "step": 14615 + }, + { + "epoch": 0.7289775561097257, + "grad_norm": 0.44534352742778166, + "learning_rate": 9.031936985454447e-06, + "loss": 0.8656, + "step": 14616 + }, + { + "epoch": 0.7290274314214464, + "grad_norm": 0.49673845056135774, + "learning_rate": 9.028829858700974e-06, + "loss": 0.847, + "step": 14617 + }, + { + "epoch": 0.7290773067331671, + "grad_norm": 0.4816294754573914, + "learning_rate": 9.025723148708027e-06, + "loss": 0.8296, + "step": 14618 + }, + { + "epoch": 0.7291271820448878, + "grad_norm": 0.5074247263945596, + "learning_rate": 9.022616855556695e-06, + "loss": 0.8405, + "step": 14619 + }, + { + "epoch": 0.7291770573566084, + "grad_norm": 0.5702050022971914, + "learning_rate": 9.019510979328028e-06, + "loss": 0.8311, + "step": 14620 + }, + { + "epoch": 0.7292269326683292, + "grad_norm": 0.5049307234819268, + "learning_rate": 9.016405520103069e-06, + "loss": 0.8396, + "step": 14621 + }, + { + "epoch": 0.7292768079800499, + "grad_norm": 0.5230781438516814, + "learning_rate": 9.013300477962844e-06, + "loss": 0.8717, + "step": 14622 + }, + { + "epoch": 0.7293266832917705, + "grad_norm": 0.45847303593630373, + "learning_rate": 9.010195852988399e-06, + "loss": 0.8442, + "step": 14623 + }, + { + "epoch": 0.7293765586034913, + "grad_norm": 0.4421425202276032, + "learning_rate": 9.007091645260735e-06, + "loss": 0.8251, + "step": 14624 + }, + { + "epoch": 0.729426433915212, + "grad_norm": 0.43284714981292133, + "learning_rate": 9.00398785486086e-06, + "loss": 0.8317, + "step": 14625 + }, + { + "epoch": 0.7294763092269326, + "grad_norm": 0.5181446307469115, + "learning_rate": 9.00088448186975e-06, + "loss": 0.8389, + "step": 14626 + }, + { + "epoch": 0.7295261845386534, + "grad_norm": 0.954200138513985, + "learning_rate": 8.997781526368412e-06, + "loss": 0.8478, + "step": 14627 + }, + { + "epoch": 0.7295760598503741, + "grad_norm": 0.5501989884717737, + "learning_rate": 8.994678988437802e-06, + "loss": 0.8665, + "step": 14628 + }, + { + "epoch": 0.7296259351620947, + "grad_norm": 0.5638122415625659, + "learning_rate": 8.991576868158882e-06, + "loss": 0.8102, + "step": 14629 + }, + { + "epoch": 0.7296758104738155, + "grad_norm": 0.45512257284537594, + "learning_rate": 8.988475165612589e-06, + "loss": 0.8432, + "step": 14630 + }, + { + "epoch": 0.7297256857855362, + "grad_norm": 0.5192346023220437, + "learning_rate": 8.98537388087988e-06, + "loss": 0.8545, + "step": 14631 + }, + { + "epoch": 0.7297755610972568, + "grad_norm": 0.543751139297719, + "learning_rate": 8.98227301404167e-06, + "loss": 0.8517, + "step": 14632 + }, + { + "epoch": 0.7298254364089776, + "grad_norm": 0.5868236932542685, + "learning_rate": 8.979172565178872e-06, + "loss": 0.8601, + "step": 14633 + }, + { + "epoch": 0.7298753117206983, + "grad_norm": 0.5292075659780426, + "learning_rate": 8.976072534372412e-06, + "loss": 0.8938, + "step": 14634 + }, + { + "epoch": 0.7299251870324189, + "grad_norm": 0.5197159911790123, + "learning_rate": 8.972972921703149e-06, + "loss": 0.8303, + "step": 14635 + }, + { + "epoch": 0.7299750623441397, + "grad_norm": 0.5055793210104705, + "learning_rate": 8.969873727251995e-06, + "loss": 0.8865, + "step": 14636 + }, + { + "epoch": 0.7300249376558603, + "grad_norm": 0.5074644050318332, + "learning_rate": 8.966774951099804e-06, + "loss": 0.8643, + "step": 14637 + }, + { + "epoch": 0.730074812967581, + "grad_norm": 0.5289864176051763, + "learning_rate": 8.963676593327462e-06, + "loss": 0.8437, + "step": 14638 + }, + { + "epoch": 0.7301246882793018, + "grad_norm": 0.45789307659991685, + "learning_rate": 8.960578654015784e-06, + "loss": 0.8636, + "step": 14639 + }, + { + "epoch": 0.7301745635910224, + "grad_norm": 0.519144587456591, + "learning_rate": 8.957481133245638e-06, + "loss": 0.8449, + "step": 14640 + }, + { + "epoch": 0.7302244389027431, + "grad_norm": 0.497524588682733, + "learning_rate": 8.954384031097832e-06, + "loss": 0.8745, + "step": 14641 + }, + { + "epoch": 0.7302743142144639, + "grad_norm": 0.5198596164308986, + "learning_rate": 8.951287347653212e-06, + "loss": 0.8582, + "step": 14642 + }, + { + "epoch": 0.7303241895261845, + "grad_norm": 0.4376441635229654, + "learning_rate": 8.948191082992546e-06, + "loss": 0.8344, + "step": 14643 + }, + { + "epoch": 0.7303740648379052, + "grad_norm": 1.0026219022302498, + "learning_rate": 8.945095237196662e-06, + "loss": 0.8729, + "step": 14644 + }, + { + "epoch": 0.730423940149626, + "grad_norm": 0.4891564838262878, + "learning_rate": 8.94199981034633e-06, + "loss": 0.8012, + "step": 14645 + }, + { + "epoch": 0.7304738154613466, + "grad_norm": 0.47978629522955146, + "learning_rate": 8.938904802522329e-06, + "loss": 0.9064, + "step": 14646 + }, + { + "epoch": 0.7305236907730673, + "grad_norm": 0.4640985582488508, + "learning_rate": 8.93581021380541e-06, + "loss": 0.8552, + "step": 14647 + }, + { + "epoch": 0.7305735660847881, + "grad_norm": 0.5171008190893058, + "learning_rate": 8.93271604427634e-06, + "loss": 0.8696, + "step": 14648 + }, + { + "epoch": 0.7306234413965087, + "grad_norm": 0.48023874199443084, + "learning_rate": 8.929622294015858e-06, + "loss": 0.8028, + "step": 14649 + }, + { + "epoch": 0.7306733167082294, + "grad_norm": 0.4688686433434643, + "learning_rate": 8.926528963104686e-06, + "loss": 0.803, + "step": 14650 + }, + { + "epoch": 0.7307231920199502, + "grad_norm": 0.4736473285167715, + "learning_rate": 8.923436051623541e-06, + "loss": 0.8643, + "step": 14651 + }, + { + "epoch": 0.7307730673316708, + "grad_norm": 0.4933464632539107, + "learning_rate": 8.920343559653146e-06, + "loss": 0.8998, + "step": 14652 + }, + { + "epoch": 0.7308229426433915, + "grad_norm": 0.5370585215243141, + "learning_rate": 8.91725148727419e-06, + "loss": 0.8529, + "step": 14653 + }, + { + "epoch": 0.7308728179551122, + "grad_norm": 0.5788295514527578, + "learning_rate": 8.914159834567356e-06, + "loss": 0.8647, + "step": 14654 + }, + { + "epoch": 0.7309226932668329, + "grad_norm": 0.5206946998399024, + "learning_rate": 8.911068601613315e-06, + "loss": 0.8783, + "step": 14655 + }, + { + "epoch": 0.7309725685785536, + "grad_norm": 0.4394190604251084, + "learning_rate": 8.907977788492756e-06, + "loss": 0.8282, + "step": 14656 + }, + { + "epoch": 0.7310224438902743, + "grad_norm": 0.5179922429811308, + "learning_rate": 8.904887395286296e-06, + "loss": 0.8216, + "step": 14657 + }, + { + "epoch": 0.731072319201995, + "grad_norm": 0.4745208425511133, + "learning_rate": 8.901797422074607e-06, + "loss": 0.8595, + "step": 14658 + }, + { + "epoch": 0.7311221945137157, + "grad_norm": 0.6731982559132099, + "learning_rate": 8.8987078689383e-06, + "loss": 0.8438, + "step": 14659 + }, + { + "epoch": 0.7311720698254364, + "grad_norm": 0.4951673483719981, + "learning_rate": 8.895618735958014e-06, + "loss": 0.8108, + "step": 14660 + }, + { + "epoch": 0.7312219451371571, + "grad_norm": 0.5705710582782447, + "learning_rate": 8.89253002321435e-06, + "loss": 0.8567, + "step": 14661 + }, + { + "epoch": 0.7312718204488778, + "grad_norm": 0.5445343171047189, + "learning_rate": 8.889441730787906e-06, + "loss": 0.8228, + "step": 14662 + }, + { + "epoch": 0.7313216957605985, + "grad_norm": 0.46398670617416815, + "learning_rate": 8.886353858759261e-06, + "loss": 0.8035, + "step": 14663 + }, + { + "epoch": 0.7313715710723192, + "grad_norm": 0.4668322496700826, + "learning_rate": 8.88326640720901e-06, + "loss": 0.8417, + "step": 14664 + }, + { + "epoch": 0.7314214463840399, + "grad_norm": 0.5408902958426257, + "learning_rate": 8.880179376217706e-06, + "loss": 0.8405, + "step": 14665 + }, + { + "epoch": 0.7314713216957606, + "grad_norm": 0.4721394364767406, + "learning_rate": 8.87709276586591e-06, + "loss": 0.8304, + "step": 14666 + }, + { + "epoch": 0.7315211970074813, + "grad_norm": 0.7931165437516454, + "learning_rate": 8.874006576234154e-06, + "loss": 0.8543, + "step": 14667 + }, + { + "epoch": 0.731571072319202, + "grad_norm": 0.494454422905601, + "learning_rate": 8.870920807402988e-06, + "loss": 0.8447, + "step": 14668 + }, + { + "epoch": 0.7316209476309227, + "grad_norm": 0.6962238797191889, + "learning_rate": 8.867835459452925e-06, + "loss": 0.8679, + "step": 14669 + }, + { + "epoch": 0.7316708229426434, + "grad_norm": 0.4225772971463601, + "learning_rate": 8.864750532464476e-06, + "loss": 0.8402, + "step": 14670 + }, + { + "epoch": 0.7317206982543641, + "grad_norm": 0.47366234008164054, + "learning_rate": 8.861666026518139e-06, + "loss": 0.8526, + "step": 14671 + }, + { + "epoch": 0.7317705735660848, + "grad_norm": 0.6373528073853181, + "learning_rate": 8.858581941694397e-06, + "loss": 0.8486, + "step": 14672 + }, + { + "epoch": 0.7318204488778055, + "grad_norm": 0.4877418763087533, + "learning_rate": 8.855498278073744e-06, + "loss": 0.8332, + "step": 14673 + }, + { + "epoch": 0.7318703241895261, + "grad_norm": 0.6345495058929911, + "learning_rate": 8.852415035736628e-06, + "loss": 0.8648, + "step": 14674 + }, + { + "epoch": 0.7319201995012469, + "grad_norm": 0.6829320118096419, + "learning_rate": 8.849332214763534e-06, + "loss": 0.8799, + "step": 14675 + }, + { + "epoch": 0.7319700748129676, + "grad_norm": 0.5568728895786421, + "learning_rate": 8.846249815234869e-06, + "loss": 0.8815, + "step": 14676 + }, + { + "epoch": 0.7320199501246882, + "grad_norm": 0.7212192293215218, + "learning_rate": 8.843167837231096e-06, + "loss": 0.8512, + "step": 14677 + }, + { + "epoch": 0.732069825436409, + "grad_norm": 0.43758584694242203, + "learning_rate": 8.840086280832619e-06, + "loss": 0.8778, + "step": 14678 + }, + { + "epoch": 0.7321197007481297, + "grad_norm": 0.5013757015732985, + "learning_rate": 8.837005146119872e-06, + "loss": 0.8579, + "step": 14679 + }, + { + "epoch": 0.7321695760598503, + "grad_norm": 0.4731662399211957, + "learning_rate": 8.833924433173226e-06, + "loss": 0.8309, + "step": 14680 + }, + { + "epoch": 0.7322194513715711, + "grad_norm": 0.5129486610072133, + "learning_rate": 8.8308441420731e-06, + "loss": 0.8512, + "step": 14681 + }, + { + "epoch": 0.7322693266832918, + "grad_norm": 0.5498857673976085, + "learning_rate": 8.827764272899855e-06, + "loss": 0.89, + "step": 14682 + }, + { + "epoch": 0.7323192019950124, + "grad_norm": 0.47996375947028863, + "learning_rate": 8.824684825733865e-06, + "loss": 0.8444, + "step": 14683 + }, + { + "epoch": 0.7323690773067332, + "grad_norm": 0.4674276020030193, + "learning_rate": 8.821605800655477e-06, + "loss": 0.8849, + "step": 14684 + }, + { + "epoch": 0.7324189526184539, + "grad_norm": 0.5399908484896461, + "learning_rate": 8.818527197745055e-06, + "loss": 0.8125, + "step": 14685 + }, + { + "epoch": 0.7324688279301745, + "grad_norm": 0.487764418280853, + "learning_rate": 8.815449017082923e-06, + "loss": 0.8319, + "step": 14686 + }, + { + "epoch": 0.7325187032418953, + "grad_norm": 0.4716465438919424, + "learning_rate": 8.812371258749402e-06, + "loss": 0.8915, + "step": 14687 + }, + { + "epoch": 0.732568578553616, + "grad_norm": 2.025474506293504, + "learning_rate": 8.809293922824805e-06, + "loss": 0.8843, + "step": 14688 + }, + { + "epoch": 0.7326184538653366, + "grad_norm": 0.4987649625322424, + "learning_rate": 8.806217009389442e-06, + "loss": 0.7976, + "step": 14689 + }, + { + "epoch": 0.7326683291770574, + "grad_norm": 0.5701929883315298, + "learning_rate": 8.8031405185236e-06, + "loss": 0.8329, + "step": 14690 + }, + { + "epoch": 0.732718204488778, + "grad_norm": 0.5476037009452884, + "learning_rate": 8.80006445030756e-06, + "loss": 0.8483, + "step": 14691 + }, + { + "epoch": 0.7327680798004987, + "grad_norm": 0.7724813722936056, + "learning_rate": 8.796988804821576e-06, + "loss": 0.8574, + "step": 14692 + }, + { + "epoch": 0.7328179551122195, + "grad_norm": 0.47020510542564936, + "learning_rate": 8.793913582145935e-06, + "loss": 0.8118, + "step": 14693 + }, + { + "epoch": 0.7328678304239401, + "grad_norm": 0.5782223207296611, + "learning_rate": 8.790838782360853e-06, + "loss": 0.8535, + "step": 14694 + }, + { + "epoch": 0.7329177057356608, + "grad_norm": 0.485816896115666, + "learning_rate": 8.787764405546584e-06, + "loss": 0.8724, + "step": 14695 + }, + { + "epoch": 0.7329675810473816, + "grad_norm": 0.4218582420997117, + "learning_rate": 8.784690451783337e-06, + "loss": 0.8378, + "step": 14696 + }, + { + "epoch": 0.7330174563591022, + "grad_norm": 1.1591388910331302, + "learning_rate": 8.781616921151348e-06, + "loss": 0.8376, + "step": 14697 + }, + { + "epoch": 0.7330673316708229, + "grad_norm": 0.47361034586122236, + "learning_rate": 8.778543813730802e-06, + "loss": 0.8596, + "step": 14698 + }, + { + "epoch": 0.7331172069825437, + "grad_norm": 0.4861755625696806, + "learning_rate": 8.7754711296019e-06, + "loss": 0.8557, + "step": 14699 + }, + { + "epoch": 0.7331670822942643, + "grad_norm": 0.5407804489916755, + "learning_rate": 8.772398868844808e-06, + "loss": 0.8534, + "step": 14700 + }, + { + "epoch": 0.733216957605985, + "grad_norm": 2.4201253963917617, + "learning_rate": 8.769327031539712e-06, + "loss": 0.8368, + "step": 14701 + }, + { + "epoch": 0.7332668329177058, + "grad_norm": 0.48680388398131036, + "learning_rate": 8.766255617766766e-06, + "loss": 0.8587, + "step": 14702 + }, + { + "epoch": 0.7333167082294264, + "grad_norm": 0.6192721784879774, + "learning_rate": 8.76318462760611e-06, + "loss": 0.8721, + "step": 14703 + }, + { + "epoch": 0.7333665835411471, + "grad_norm": 0.465859146012712, + "learning_rate": 8.760114061137889e-06, + "loss": 0.8752, + "step": 14704 + }, + { + "epoch": 0.7334164588528679, + "grad_norm": 0.5136154409314805, + "learning_rate": 8.757043918442214e-06, + "loss": 0.8532, + "step": 14705 + }, + { + "epoch": 0.7334663341645885, + "grad_norm": 0.5239468178936606, + "learning_rate": 8.753974199599216e-06, + "loss": 0.8191, + "step": 14706 + }, + { + "epoch": 0.7335162094763092, + "grad_norm": 0.513898449748446, + "learning_rate": 8.750904904688991e-06, + "loss": 0.8716, + "step": 14707 + }, + { + "epoch": 0.7335660847880299, + "grad_norm": 0.5068972502535681, + "learning_rate": 8.747836033791629e-06, + "loss": 0.8701, + "step": 14708 + }, + { + "epoch": 0.7336159600997506, + "grad_norm": 0.6602692560465234, + "learning_rate": 8.744767586987206e-06, + "loss": 0.8971, + "step": 14709 + }, + { + "epoch": 0.7336658354114713, + "grad_norm": 0.4919430589969698, + "learning_rate": 8.741699564355805e-06, + "loss": 0.863, + "step": 14710 + }, + { + "epoch": 0.733715710723192, + "grad_norm": 0.46402829862498635, + "learning_rate": 8.73863196597747e-06, + "loss": 0.8601, + "step": 14711 + }, + { + "epoch": 0.7337655860349127, + "grad_norm": 0.5192322826377236, + "learning_rate": 8.735564791932271e-06, + "loss": 0.8686, + "step": 14712 + }, + { + "epoch": 0.7338154613466334, + "grad_norm": 0.5318759292887238, + "learning_rate": 8.732498042300217e-06, + "loss": 0.8412, + "step": 14713 + }, + { + "epoch": 0.7338653366583541, + "grad_norm": 0.5775812166352512, + "learning_rate": 8.729431717161351e-06, + "loss": 0.8846, + "step": 14714 + }, + { + "epoch": 0.7339152119700748, + "grad_norm": 0.5761521774476186, + "learning_rate": 8.726365816595683e-06, + "loss": 0.8439, + "step": 14715 + }, + { + "epoch": 0.7339650872817955, + "grad_norm": 0.5539582397389188, + "learning_rate": 8.72330034068322e-06, + "loss": 0.8515, + "step": 14716 + }, + { + "epoch": 0.7340149625935162, + "grad_norm": 0.5190746809849752, + "learning_rate": 8.720235289503936e-06, + "loss": 0.8651, + "step": 14717 + }, + { + "epoch": 0.7340648379052369, + "grad_norm": 0.5628390264949525, + "learning_rate": 8.717170663137835e-06, + "loss": 0.8481, + "step": 14718 + }, + { + "epoch": 0.7341147132169576, + "grad_norm": 0.47185689873141035, + "learning_rate": 8.71410646166488e-06, + "loss": 0.8089, + "step": 14719 + }, + { + "epoch": 0.7341645885286783, + "grad_norm": 0.52940434343529, + "learning_rate": 8.71104268516503e-06, + "loss": 0.8786, + "step": 14720 + }, + { + "epoch": 0.734214463840399, + "grad_norm": 0.541558331679772, + "learning_rate": 8.707979333718223e-06, + "loss": 0.8879, + "step": 14721 + }, + { + "epoch": 0.7342643391521197, + "grad_norm": 0.6972338586926393, + "learning_rate": 8.704916407404408e-06, + "loss": 0.806, + "step": 14722 + }, + { + "epoch": 0.7343142144638404, + "grad_norm": 1.1197126121264365, + "learning_rate": 8.701853906303512e-06, + "loss": 0.8248, + "step": 14723 + }, + { + "epoch": 0.7343640897755611, + "grad_norm": 0.5403092766089086, + "learning_rate": 8.69879183049544e-06, + "loss": 0.87, + "step": 14724 + }, + { + "epoch": 0.7344139650872819, + "grad_norm": 0.5133425587280693, + "learning_rate": 8.695730180060093e-06, + "loss": 0.8476, + "step": 14725 + }, + { + "epoch": 0.7344638403990025, + "grad_norm": 0.4612220290851877, + "learning_rate": 8.692668955077376e-06, + "loss": 0.8331, + "step": 14726 + }, + { + "epoch": 0.7345137157107232, + "grad_norm": 0.554859089630553, + "learning_rate": 8.689608155627168e-06, + "loss": 0.8288, + "step": 14727 + }, + { + "epoch": 0.7345635910224438, + "grad_norm": 0.5770511391910343, + "learning_rate": 8.686547781789334e-06, + "loss": 0.834, + "step": 14728 + }, + { + "epoch": 0.7346134663341646, + "grad_norm": 0.48306130784415785, + "learning_rate": 8.683487833643728e-06, + "loss": 0.8364, + "step": 14729 + }, + { + "epoch": 0.7346633416458853, + "grad_norm": 0.5322809330199963, + "learning_rate": 8.68042831127022e-06, + "loss": 0.9117, + "step": 14730 + }, + { + "epoch": 0.7347132169576059, + "grad_norm": 0.4454618136877076, + "learning_rate": 8.677369214748615e-06, + "loss": 0.8642, + "step": 14731 + }, + { + "epoch": 0.7347630922693267, + "grad_norm": 0.5374481600164945, + "learning_rate": 8.674310544158761e-06, + "loss": 0.8568, + "step": 14732 + }, + { + "epoch": 0.7348129675810474, + "grad_norm": 0.6650836907018568, + "learning_rate": 8.671252299580463e-06, + "loss": 0.8627, + "step": 14733 + }, + { + "epoch": 0.734862842892768, + "grad_norm": 0.5133675106521068, + "learning_rate": 8.668194481093541e-06, + "loss": 0.8437, + "step": 14734 + }, + { + "epoch": 0.7349127182044888, + "grad_norm": 0.4989469940163097, + "learning_rate": 8.665137088777761e-06, + "loss": 0.7769, + "step": 14735 + }, + { + "epoch": 0.7349625935162095, + "grad_norm": 0.44928216314182395, + "learning_rate": 8.662080122712926e-06, + "loss": 0.8418, + "step": 14736 + }, + { + "epoch": 0.7350124688279301, + "grad_norm": 0.43290463201285123, + "learning_rate": 8.65902358297879e-06, + "loss": 0.8229, + "step": 14737 + }, + { + "epoch": 0.7350623441396509, + "grad_norm": 0.9544720263563715, + "learning_rate": 8.65596746965513e-06, + "loss": 0.8364, + "step": 14738 + }, + { + "epoch": 0.7351122194513716, + "grad_norm": 0.46277419730589203, + "learning_rate": 8.652911782821682e-06, + "loss": 0.8022, + "step": 14739 + }, + { + "epoch": 0.7351620947630922, + "grad_norm": 0.5809930721474375, + "learning_rate": 8.649856522558186e-06, + "loss": 0.8959, + "step": 14740 + }, + { + "epoch": 0.735211970074813, + "grad_norm": 0.49403365159383145, + "learning_rate": 8.646801688944367e-06, + "loss": 0.8578, + "step": 14741 + }, + { + "epoch": 0.7352618453865337, + "grad_norm": 0.4609689082670156, + "learning_rate": 8.64374728205993e-06, + "loss": 0.8502, + "step": 14742 + }, + { + "epoch": 0.7353117206982543, + "grad_norm": 0.491499897813105, + "learning_rate": 8.640693301984596e-06, + "loss": 0.7957, + "step": 14743 + }, + { + "epoch": 0.7353615960099751, + "grad_norm": 2.2422528695380257, + "learning_rate": 8.637639748798049e-06, + "loss": 0.8185, + "step": 14744 + }, + { + "epoch": 0.7354114713216957, + "grad_norm": 0.4989754807985203, + "learning_rate": 8.634586622579969e-06, + "loss": 0.8572, + "step": 14745 + }, + { + "epoch": 0.7354613466334164, + "grad_norm": 0.6707609056032546, + "learning_rate": 8.631533923410017e-06, + "loss": 0.8058, + "step": 14746 + }, + { + "epoch": 0.7355112219451372, + "grad_norm": 0.5155926758710684, + "learning_rate": 8.628481651367876e-06, + "loss": 0.8912, + "step": 14747 + }, + { + "epoch": 0.7355610972568578, + "grad_norm": 0.5575723949136927, + "learning_rate": 8.625429806533166e-06, + "loss": 0.8541, + "step": 14748 + }, + { + "epoch": 0.7356109725685785, + "grad_norm": 0.5104802150129962, + "learning_rate": 8.622378388985555e-06, + "loss": 0.8461, + "step": 14749 + }, + { + "epoch": 0.7356608478802993, + "grad_norm": 0.5047914205560109, + "learning_rate": 8.619327398804632e-06, + "loss": 0.8392, + "step": 14750 + }, + { + "epoch": 0.7357107231920199, + "grad_norm": 0.5386208076038669, + "learning_rate": 8.616276836070042e-06, + "loss": 0.8446, + "step": 14751 + }, + { + "epoch": 0.7357605985037406, + "grad_norm": 0.5535241388135207, + "learning_rate": 8.613226700861373e-06, + "loss": 0.8943, + "step": 14752 + }, + { + "epoch": 0.7358104738154614, + "grad_norm": 0.4424487693039196, + "learning_rate": 8.61017699325822e-06, + "loss": 0.8378, + "step": 14753 + }, + { + "epoch": 0.735860349127182, + "grad_norm": 0.6426350056611785, + "learning_rate": 8.607127713340155e-06, + "loss": 0.9094, + "step": 14754 + }, + { + "epoch": 0.7359102244389027, + "grad_norm": 1.1510037700766078, + "learning_rate": 8.604078861186762e-06, + "loss": 0.8446, + "step": 14755 + }, + { + "epoch": 0.7359600997506235, + "grad_norm": 0.614869163652308, + "learning_rate": 8.601030436877593e-06, + "loss": 0.9217, + "step": 14756 + }, + { + "epoch": 0.7360099750623441, + "grad_norm": 0.4673856969410746, + "learning_rate": 8.5979824404922e-06, + "loss": 0.8664, + "step": 14757 + }, + { + "epoch": 0.7360598503740649, + "grad_norm": 0.59473322228771, + "learning_rate": 8.594934872110102e-06, + "loss": 0.8242, + "step": 14758 + }, + { + "epoch": 0.7361097256857856, + "grad_norm": 0.5488224048182831, + "learning_rate": 8.591887731810846e-06, + "loss": 0.8798, + "step": 14759 + }, + { + "epoch": 0.7361596009975062, + "grad_norm": 0.5067645395747501, + "learning_rate": 8.588841019673938e-06, + "loss": 0.8535, + "step": 14760 + }, + { + "epoch": 0.736209476309227, + "grad_norm": 0.5335645126152254, + "learning_rate": 8.58579473577888e-06, + "loss": 0.8118, + "step": 14761 + }, + { + "epoch": 0.7362593516209476, + "grad_norm": 0.47086067013756683, + "learning_rate": 8.582748880205149e-06, + "loss": 0.8167, + "step": 14762 + }, + { + "epoch": 0.7363092269326683, + "grad_norm": 0.6231638364934787, + "learning_rate": 8.579703453032258e-06, + "loss": 0.8284, + "step": 14763 + }, + { + "epoch": 0.736359102244389, + "grad_norm": 0.6215294752921553, + "learning_rate": 8.576658454339642e-06, + "loss": 0.8208, + "step": 14764 + }, + { + "epoch": 0.7364089775561097, + "grad_norm": 0.563705292545973, + "learning_rate": 8.57361388420678e-06, + "loss": 0.8263, + "step": 14765 + }, + { + "epoch": 0.7364588528678304, + "grad_norm": 0.47914506587019473, + "learning_rate": 8.570569742713102e-06, + "loss": 0.8465, + "step": 14766 + }, + { + "epoch": 0.7365087281795512, + "grad_norm": 0.48990706811797846, + "learning_rate": 8.567526029938073e-06, + "loss": 0.8248, + "step": 14767 + }, + { + "epoch": 0.7365586034912718, + "grad_norm": 0.6428302128798284, + "learning_rate": 8.564482745961081e-06, + "loss": 0.8633, + "step": 14768 + }, + { + "epoch": 0.7366084788029925, + "grad_norm": 0.5757342541320937, + "learning_rate": 8.561439890861563e-06, + "loss": 0.8524, + "step": 14769 + }, + { + "epoch": 0.7366583541147133, + "grad_norm": 2.0569203166086547, + "learning_rate": 8.558397464718907e-06, + "loss": 0.8181, + "step": 14770 + }, + { + "epoch": 0.7367082294264339, + "grad_norm": 0.48345995794404906, + "learning_rate": 8.555355467612527e-06, + "loss": 0.8639, + "step": 14771 + }, + { + "epoch": 0.7367581047381546, + "grad_norm": 0.5835993344824723, + "learning_rate": 8.552313899621772e-06, + "loss": 0.8537, + "step": 14772 + }, + { + "epoch": 0.7368079800498754, + "grad_norm": 0.6882067225811758, + "learning_rate": 8.54927276082603e-06, + "loss": 0.8707, + "step": 14773 + }, + { + "epoch": 0.736857855361596, + "grad_norm": 0.536245050479992, + "learning_rate": 8.546232051304654e-06, + "loss": 0.8844, + "step": 14774 + }, + { + "epoch": 0.7369077306733167, + "grad_norm": 0.5124648356048636, + "learning_rate": 8.54319177113698e-06, + "loss": 0.85, + "step": 14775 + }, + { + "epoch": 0.7369576059850375, + "grad_norm": 0.4057903574419437, + "learning_rate": 8.540151920402361e-06, + "loss": 0.8219, + "step": 14776 + }, + { + "epoch": 0.7370074812967581, + "grad_norm": 0.5108238074691251, + "learning_rate": 8.53711249918011e-06, + "loss": 0.8586, + "step": 14777 + }, + { + "epoch": 0.7370573566084788, + "grad_norm": 0.48007480483539805, + "learning_rate": 8.534073507549541e-06, + "loss": 0.8363, + "step": 14778 + }, + { + "epoch": 0.7371072319201994, + "grad_norm": 0.5457501563123508, + "learning_rate": 8.531034945589947e-06, + "loss": 0.8574, + "step": 14779 + }, + { + "epoch": 0.7371571072319202, + "grad_norm": 0.47813279883383725, + "learning_rate": 8.527996813380634e-06, + "loss": 0.8413, + "step": 14780 + }, + { + "epoch": 0.7372069825436409, + "grad_norm": 0.4850788822225003, + "learning_rate": 8.524959111000871e-06, + "loss": 0.8938, + "step": 14781 + }, + { + "epoch": 0.7372568578553615, + "grad_norm": 0.4489223396425889, + "learning_rate": 8.521921838529929e-06, + "loss": 0.8536, + "step": 14782 + }, + { + "epoch": 0.7373067331670823, + "grad_norm": 1.4158793029425458, + "learning_rate": 8.518884996047053e-06, + "loss": 0.8181, + "step": 14783 + }, + { + "epoch": 0.737356608478803, + "grad_norm": 0.5836464892022236, + "learning_rate": 8.515848583631503e-06, + "loss": 0.8592, + "step": 14784 + }, + { + "epoch": 0.7374064837905236, + "grad_norm": 0.6513386068150291, + "learning_rate": 8.512812601362508e-06, + "loss": 0.8807, + "step": 14785 + }, + { + "epoch": 0.7374563591022444, + "grad_norm": 0.4555560681915875, + "learning_rate": 8.50977704931929e-06, + "loss": 0.839, + "step": 14786 + }, + { + "epoch": 0.7375062344139651, + "grad_norm": 0.5809039312848924, + "learning_rate": 8.506741927581052e-06, + "loss": 0.8655, + "step": 14787 + }, + { + "epoch": 0.7375561097256857, + "grad_norm": 0.5112691312759529, + "learning_rate": 8.50370723622701e-06, + "loss": 0.815, + "step": 14788 + }, + { + "epoch": 0.7376059850374065, + "grad_norm": 0.4975369286882725, + "learning_rate": 8.500672975336343e-06, + "loss": 0.8668, + "step": 14789 + }, + { + "epoch": 0.7376558603491272, + "grad_norm": 0.4665585745970494, + "learning_rate": 8.497639144988232e-06, + "loss": 0.8765, + "step": 14790 + }, + { + "epoch": 0.7377057356608478, + "grad_norm": 0.4592028506022483, + "learning_rate": 8.494605745261836e-06, + "loss": 0.7958, + "step": 14791 + }, + { + "epoch": 0.7377556109725686, + "grad_norm": 0.5251013923646959, + "learning_rate": 8.49157277623632e-06, + "loss": 0.8662, + "step": 14792 + }, + { + "epoch": 0.7378054862842893, + "grad_norm": 0.43517107338743055, + "learning_rate": 8.488540237990828e-06, + "loss": 0.8395, + "step": 14793 + }, + { + "epoch": 0.73785536159601, + "grad_norm": 0.6118631457953501, + "learning_rate": 8.485508130604487e-06, + "loss": 0.8263, + "step": 14794 + }, + { + "epoch": 0.7379052369077307, + "grad_norm": 0.5304178966817452, + "learning_rate": 8.482476454156413e-06, + "loss": 0.8944, + "step": 14795 + }, + { + "epoch": 0.7379551122194514, + "grad_norm": 0.634481471215183, + "learning_rate": 8.47944520872573e-06, + "loss": 0.8622, + "step": 14796 + }, + { + "epoch": 0.738004987531172, + "grad_norm": 0.6241100535370809, + "learning_rate": 8.476414394391533e-06, + "loss": 0.8716, + "step": 14797 + }, + { + "epoch": 0.7380548628428928, + "grad_norm": 0.48601013816240846, + "learning_rate": 8.473384011232908e-06, + "loss": 0.8359, + "step": 14798 + }, + { + "epoch": 0.7381047381546134, + "grad_norm": 0.5256924394566822, + "learning_rate": 8.470354059328919e-06, + "loss": 0.8627, + "step": 14799 + }, + { + "epoch": 0.7381546134663342, + "grad_norm": 0.4685886615875978, + "learning_rate": 8.467324538758662e-06, + "loss": 0.843, + "step": 14800 + }, + { + "epoch": 0.7382044887780549, + "grad_norm": 0.4724455883981153, + "learning_rate": 8.464295449601156e-06, + "loss": 0.8615, + "step": 14801 + }, + { + "epoch": 0.7382543640897755, + "grad_norm": 0.5562042222810648, + "learning_rate": 8.461266791935468e-06, + "loss": 0.8194, + "step": 14802 + }, + { + "epoch": 0.7383042394014963, + "grad_norm": 0.5905868832994178, + "learning_rate": 8.458238565840611e-06, + "loss": 0.8497, + "step": 14803 + }, + { + "epoch": 0.738354114713217, + "grad_norm": 0.5025325648544235, + "learning_rate": 8.455210771395637e-06, + "loss": 0.8145, + "step": 14804 + }, + { + "epoch": 0.7384039900249376, + "grad_norm": 0.5695203104324082, + "learning_rate": 8.452183408679513e-06, + "loss": 0.8489, + "step": 14805 + }, + { + "epoch": 0.7384538653366584, + "grad_norm": 0.4607061958557011, + "learning_rate": 8.449156477771263e-06, + "loss": 0.8662, + "step": 14806 + }, + { + "epoch": 0.7385037406483791, + "grad_norm": 0.5134506514090488, + "learning_rate": 8.446129978749862e-06, + "loss": 0.873, + "step": 14807 + }, + { + "epoch": 0.7385536159600997, + "grad_norm": 0.5665239115471793, + "learning_rate": 8.443103911694308e-06, + "loss": 0.8542, + "step": 14808 + }, + { + "epoch": 0.7386034912718205, + "grad_norm": 0.5488776340207314, + "learning_rate": 8.44007827668353e-06, + "loss": 0.8441, + "step": 14809 + }, + { + "epoch": 0.7386533665835412, + "grad_norm": 0.6107774754118127, + "learning_rate": 8.437053073796508e-06, + "loss": 0.8504, + "step": 14810 + }, + { + "epoch": 0.7387032418952618, + "grad_norm": 0.5279240806921194, + "learning_rate": 8.43402830311217e-06, + "loss": 0.8648, + "step": 14811 + }, + { + "epoch": 0.7387531172069826, + "grad_norm": 0.4661425924497676, + "learning_rate": 8.431003964709443e-06, + "loss": 0.8538, + "step": 14812 + }, + { + "epoch": 0.7388029925187033, + "grad_norm": 0.7720793025181063, + "learning_rate": 8.427980058667262e-06, + "loss": 0.8393, + "step": 14813 + }, + { + "epoch": 0.7388528678304239, + "grad_norm": 0.47777996532161665, + "learning_rate": 8.424956585064525e-06, + "loss": 0.8956, + "step": 14814 + }, + { + "epoch": 0.7389027431421447, + "grad_norm": 0.45722039704705836, + "learning_rate": 8.421933543980126e-06, + "loss": 0.8345, + "step": 14815 + }, + { + "epoch": 0.7389526184538653, + "grad_norm": 0.6145360517953904, + "learning_rate": 8.418910935492944e-06, + "loss": 0.8951, + "step": 14816 + }, + { + "epoch": 0.739002493765586, + "grad_norm": 0.5704752557959626, + "learning_rate": 8.415888759681867e-06, + "loss": 0.8639, + "step": 14817 + }, + { + "epoch": 0.7390523690773068, + "grad_norm": 0.4683031634775369, + "learning_rate": 8.412867016625753e-06, + "loss": 0.8526, + "step": 14818 + }, + { + "epoch": 0.7391022443890274, + "grad_norm": 0.5610740005122733, + "learning_rate": 8.409845706403448e-06, + "loss": 0.8936, + "step": 14819 + }, + { + "epoch": 0.7391521197007481, + "grad_norm": 1.2628125078086867, + "learning_rate": 8.40682482909379e-06, + "loss": 0.8419, + "step": 14820 + }, + { + "epoch": 0.7392019950124689, + "grad_norm": 0.5065158180468665, + "learning_rate": 8.403804384775618e-06, + "loss": 0.8793, + "step": 14821 + }, + { + "epoch": 0.7392518703241895, + "grad_norm": 0.5441526674368153, + "learning_rate": 8.400784373527746e-06, + "loss": 0.8335, + "step": 14822 + }, + { + "epoch": 0.7393017456359102, + "grad_norm": 0.5923772556196264, + "learning_rate": 8.397764795428972e-06, + "loss": 0.7902, + "step": 14823 + }, + { + "epoch": 0.739351620947631, + "grad_norm": 0.5755294135263179, + "learning_rate": 8.394745650558091e-06, + "loss": 0.8451, + "step": 14824 + }, + { + "epoch": 0.7394014962593516, + "grad_norm": 0.5978201186317037, + "learning_rate": 8.391726938993899e-06, + "loss": 0.8533, + "step": 14825 + }, + { + "epoch": 0.7394513715710723, + "grad_norm": 0.6429182411950906, + "learning_rate": 8.388708660815156e-06, + "loss": 0.8255, + "step": 14826 + }, + { + "epoch": 0.7395012468827931, + "grad_norm": 0.4532162091869494, + "learning_rate": 8.38569081610063e-06, + "loss": 0.8303, + "step": 14827 + }, + { + "epoch": 0.7395511221945137, + "grad_norm": 0.5092808567106014, + "learning_rate": 8.382673404929057e-06, + "loss": 0.8428, + "step": 14828 + }, + { + "epoch": 0.7396009975062344, + "grad_norm": 0.5246479356589911, + "learning_rate": 8.379656427379193e-06, + "loss": 0.86, + "step": 14829 + }, + { + "epoch": 0.7396508728179552, + "grad_norm": 0.6281463554943294, + "learning_rate": 8.376639883529754e-06, + "loss": 0.8493, + "step": 14830 + }, + { + "epoch": 0.7397007481296758, + "grad_norm": 0.5505481256223944, + "learning_rate": 8.37362377345946e-06, + "loss": 0.8197, + "step": 14831 + }, + { + "epoch": 0.7397506234413965, + "grad_norm": 0.47789295081651323, + "learning_rate": 8.370608097247005e-06, + "loss": 0.8171, + "step": 14832 + }, + { + "epoch": 0.7398004987531172, + "grad_norm": 0.5241075537104833, + "learning_rate": 8.367592854971095e-06, + "loss": 0.812, + "step": 14833 + }, + { + "epoch": 0.7398503740648379, + "grad_norm": 0.4255618698257449, + "learning_rate": 8.364578046710409e-06, + "loss": 0.8231, + "step": 14834 + }, + { + "epoch": 0.7399002493765586, + "grad_norm": 0.5439951325678322, + "learning_rate": 8.361563672543613e-06, + "loss": 0.846, + "step": 14835 + }, + { + "epoch": 0.7399501246882793, + "grad_norm": 0.46579678575624833, + "learning_rate": 8.358549732549357e-06, + "loss": 0.8372, + "step": 14836 + }, + { + "epoch": 0.74, + "grad_norm": 0.7018518448568598, + "learning_rate": 8.355536226806316e-06, + "loss": 0.8338, + "step": 14837 + }, + { + "epoch": 0.7400498753117207, + "grad_norm": 0.8192764086821039, + "learning_rate": 8.352523155393088e-06, + "loss": 0.8448, + "step": 14838 + }, + { + "epoch": 0.7400997506234414, + "grad_norm": 0.5087340482074838, + "learning_rate": 8.349510518388332e-06, + "loss": 0.8593, + "step": 14839 + }, + { + "epoch": 0.7401496259351621, + "grad_norm": 0.4493147334774325, + "learning_rate": 8.346498315870634e-06, + "loss": 0.8388, + "step": 14840 + }, + { + "epoch": 0.7401995012468828, + "grad_norm": 0.5699361036087751, + "learning_rate": 8.343486547918628e-06, + "loss": 0.8798, + "step": 14841 + }, + { + "epoch": 0.7402493765586035, + "grad_norm": 1.1529935073176938, + "learning_rate": 8.340475214610868e-06, + "loss": 0.8604, + "step": 14842 + }, + { + "epoch": 0.7402992518703242, + "grad_norm": 0.8306819101348184, + "learning_rate": 8.33746431602596e-06, + "loss": 0.8285, + "step": 14843 + }, + { + "epoch": 0.7403491271820449, + "grad_norm": 0.5003764791534546, + "learning_rate": 8.334453852242458e-06, + "loss": 0.8799, + "step": 14844 + }, + { + "epoch": 0.7403990024937656, + "grad_norm": 0.49223677097619783, + "learning_rate": 8.33144382333894e-06, + "loss": 0.8226, + "step": 14845 + }, + { + "epoch": 0.7404488778054863, + "grad_norm": 0.538434073046561, + "learning_rate": 8.328434229393916e-06, + "loss": 0.8303, + "step": 14846 + }, + { + "epoch": 0.740498753117207, + "grad_norm": 0.6038796189226215, + "learning_rate": 8.325425070485948e-06, + "loss": 0.8577, + "step": 14847 + }, + { + "epoch": 0.7405486284289277, + "grad_norm": 0.5022558207054605, + "learning_rate": 8.32241634669355e-06, + "loss": 0.8714, + "step": 14848 + }, + { + "epoch": 0.7405985037406484, + "grad_norm": 0.5081504643830145, + "learning_rate": 8.319408058095238e-06, + "loss": 0.8037, + "step": 14849 + }, + { + "epoch": 0.7406483790523691, + "grad_norm": 0.4783043747724887, + "learning_rate": 8.316400204769496e-06, + "loss": 0.8235, + "step": 14850 + }, + { + "epoch": 0.7406982543640898, + "grad_norm": 0.9079732835864468, + "learning_rate": 8.313392786794833e-06, + "loss": 0.8693, + "step": 14851 + }, + { + "epoch": 0.7407481296758105, + "grad_norm": 0.4591374308763797, + "learning_rate": 8.310385804249718e-06, + "loss": 0.8723, + "step": 14852 + }, + { + "epoch": 0.7407980049875311, + "grad_norm": 0.48723165890840564, + "learning_rate": 8.307379257212605e-06, + "loss": 0.8898, + "step": 14853 + }, + { + "epoch": 0.7408478802992519, + "grad_norm": 0.9100012232937025, + "learning_rate": 8.30437314576197e-06, + "loss": 0.8969, + "step": 14854 + }, + { + "epoch": 0.7408977556109726, + "grad_norm": 0.5517666228308242, + "learning_rate": 8.301367469976246e-06, + "loss": 0.8607, + "step": 14855 + }, + { + "epoch": 0.7409476309226932, + "grad_norm": 0.45498429455477407, + "learning_rate": 8.298362229933864e-06, + "loss": 0.8886, + "step": 14856 + }, + { + "epoch": 0.740997506234414, + "grad_norm": 0.5078773127566307, + "learning_rate": 8.295357425713238e-06, + "loss": 0.8871, + "step": 14857 + }, + { + "epoch": 0.7410473815461347, + "grad_norm": 0.8011796870900052, + "learning_rate": 8.292353057392791e-06, + "loss": 0.8276, + "step": 14858 + }, + { + "epoch": 0.7410972568578553, + "grad_norm": 0.5534076639959596, + "learning_rate": 8.289349125050912e-06, + "loss": 0.8477, + "step": 14859 + }, + { + "epoch": 0.7411471321695761, + "grad_norm": 0.5164659372034194, + "learning_rate": 8.286345628765993e-06, + "loss": 0.9003, + "step": 14860 + }, + { + "epoch": 0.7411970074812968, + "grad_norm": 0.5377024508811175, + "learning_rate": 8.283342568616393e-06, + "loss": 0.8543, + "step": 14861 + }, + { + "epoch": 0.7412468827930174, + "grad_norm": 0.4945068293277045, + "learning_rate": 8.280339944680495e-06, + "loss": 0.8698, + "step": 14862 + }, + { + "epoch": 0.7412967581047382, + "grad_norm": 0.5356985259236735, + "learning_rate": 8.277337757036646e-06, + "loss": 0.8368, + "step": 14863 + }, + { + "epoch": 0.7413466334164589, + "grad_norm": 0.5051222866286017, + "learning_rate": 8.274336005763183e-06, + "loss": 0.8437, + "step": 14864 + }, + { + "epoch": 0.7413965087281795, + "grad_norm": 0.5039721673895375, + "learning_rate": 8.271334690938429e-06, + "loss": 0.8478, + "step": 14865 + }, + { + "epoch": 0.7414463840399003, + "grad_norm": 0.4797452034321572, + "learning_rate": 8.268333812640716e-06, + "loss": 0.8205, + "step": 14866 + }, + { + "epoch": 0.741496259351621, + "grad_norm": 0.5559962663362051, + "learning_rate": 8.265333370948347e-06, + "loss": 0.8847, + "step": 14867 + }, + { + "epoch": 0.7415461346633416, + "grad_norm": 0.505923916959698, + "learning_rate": 8.262333365939611e-06, + "loss": 0.8597, + "step": 14868 + }, + { + "epoch": 0.7415960099750624, + "grad_norm": 0.4333271599000631, + "learning_rate": 8.25933379769279e-06, + "loss": 0.8461, + "step": 14869 + }, + { + "epoch": 0.741645885286783, + "grad_norm": 0.5341251311260172, + "learning_rate": 8.256334666286175e-06, + "loss": 0.8171, + "step": 14870 + }, + { + "epoch": 0.7416957605985037, + "grad_norm": 0.5213635314165576, + "learning_rate": 8.253335971797999e-06, + "loss": 0.8465, + "step": 14871 + }, + { + "epoch": 0.7417456359102245, + "grad_norm": 0.4791310865942384, + "learning_rate": 8.250337714306531e-06, + "loss": 0.885, + "step": 14872 + }, + { + "epoch": 0.7417955112219451, + "grad_norm": 0.5659128688953022, + "learning_rate": 8.24733989389e-06, + "loss": 0.8493, + "step": 14873 + }, + { + "epoch": 0.7418453865336658, + "grad_norm": 1.3920443867388759, + "learning_rate": 8.244342510626651e-06, + "loss": 0.8504, + "step": 14874 + }, + { + "epoch": 0.7418952618453866, + "grad_norm": 0.5006336126875438, + "learning_rate": 8.241345564594668e-06, + "loss": 0.861, + "step": 14875 + }, + { + "epoch": 0.7419451371571072, + "grad_norm": 0.43761262078853136, + "learning_rate": 8.238349055872283e-06, + "loss": 0.845, + "step": 14876 + }, + { + "epoch": 0.7419950124688279, + "grad_norm": 0.5258734086660184, + "learning_rate": 8.235352984537667e-06, + "loss": 0.853, + "step": 14877 + }, + { + "epoch": 0.7420448877805487, + "grad_norm": 0.5087322524565787, + "learning_rate": 8.232357350669031e-06, + "loss": 0.8356, + "step": 14878 + }, + { + "epoch": 0.7420947630922693, + "grad_norm": 0.5973917235345807, + "learning_rate": 8.229362154344508e-06, + "loss": 0.8354, + "step": 14879 + }, + { + "epoch": 0.74214463840399, + "grad_norm": 0.5335911805001697, + "learning_rate": 8.226367395642282e-06, + "loss": 0.8181, + "step": 14880 + }, + { + "epoch": 0.7421945137157108, + "grad_norm": 0.439253413575559, + "learning_rate": 8.223373074640492e-06, + "loss": 0.8383, + "step": 14881 + }, + { + "epoch": 0.7422443890274314, + "grad_norm": 0.5143523025477378, + "learning_rate": 8.220379191417274e-06, + "loss": 0.8763, + "step": 14882 + }, + { + "epoch": 0.7422942643391521, + "grad_norm": 0.47772071778760206, + "learning_rate": 8.217385746050742e-06, + "loss": 0.8181, + "step": 14883 + }, + { + "epoch": 0.7423441396508729, + "grad_norm": 0.4322684681713745, + "learning_rate": 8.214392738619029e-06, + "loss": 0.8282, + "step": 14884 + }, + { + "epoch": 0.7423940149625935, + "grad_norm": 0.5284584239261473, + "learning_rate": 8.211400169200222e-06, + "loss": 0.8985, + "step": 14885 + }, + { + "epoch": 0.7424438902743142, + "grad_norm": 0.5319224410694673, + "learning_rate": 8.208408037872416e-06, + "loss": 0.7931, + "step": 14886 + }, + { + "epoch": 0.7424937655860349, + "grad_norm": 0.556236713841918, + "learning_rate": 8.20541634471368e-06, + "loss": 0.8285, + "step": 14887 + }, + { + "epoch": 0.7425436408977556, + "grad_norm": 0.5466165352147054, + "learning_rate": 8.202425089802096e-06, + "loss": 0.8795, + "step": 14888 + }, + { + "epoch": 0.7425935162094763, + "grad_norm": 1.020972802485231, + "learning_rate": 8.199434273215708e-06, + "loss": 0.8302, + "step": 14889 + }, + { + "epoch": 0.742643391521197, + "grad_norm": 0.6340085889154745, + "learning_rate": 8.19644389503256e-06, + "loss": 0.8499, + "step": 14890 + }, + { + "epoch": 0.7426932668329177, + "grad_norm": 0.5499024924323239, + "learning_rate": 8.193453955330694e-06, + "loss": 0.8727, + "step": 14891 + }, + { + "epoch": 0.7427431421446384, + "grad_norm": 0.7448545286969188, + "learning_rate": 8.190464454188127e-06, + "loss": 0.8729, + "step": 14892 + }, + { + "epoch": 0.7427930174563591, + "grad_norm": 0.5775468620945354, + "learning_rate": 8.187475391682867e-06, + "loss": 0.8739, + "step": 14893 + }, + { + "epoch": 0.7428428927680798, + "grad_norm": 0.48863758222810777, + "learning_rate": 8.184486767892902e-06, + "loss": 0.8306, + "step": 14894 + }, + { + "epoch": 0.7428927680798005, + "grad_norm": 0.49384161377861363, + "learning_rate": 8.181498582896238e-06, + "loss": 0.8569, + "step": 14895 + }, + { + "epoch": 0.7429426433915212, + "grad_norm": 0.45082809819077846, + "learning_rate": 8.178510836770842e-06, + "loss": 0.8671, + "step": 14896 + }, + { + "epoch": 0.7429925187032419, + "grad_norm": 0.515927020723437, + "learning_rate": 8.175523529594678e-06, + "loss": 0.8802, + "step": 14897 + }, + { + "epoch": 0.7430423940149626, + "grad_norm": 0.5413669797669057, + "learning_rate": 8.17253666144569e-06, + "loss": 0.9086, + "step": 14898 + }, + { + "epoch": 0.7430922693266833, + "grad_norm": 0.4650911864371036, + "learning_rate": 8.169550232401835e-06, + "loss": 0.8555, + "step": 14899 + }, + { + "epoch": 0.743142144638404, + "grad_norm": 0.5194128140858078, + "learning_rate": 8.166564242541033e-06, + "loss": 0.8315, + "step": 14900 + }, + { + "epoch": 0.7431920199501247, + "grad_norm": 0.4975590669023555, + "learning_rate": 8.163578691941203e-06, + "loss": 0.8437, + "step": 14901 + }, + { + "epoch": 0.7432418952618454, + "grad_norm": 0.4798738074877217, + "learning_rate": 8.160593580680246e-06, + "loss": 0.9, + "step": 14902 + }, + { + "epoch": 0.7432917705735661, + "grad_norm": 0.4445248182973295, + "learning_rate": 8.15760890883607e-06, + "loss": 0.8651, + "step": 14903 + }, + { + "epoch": 0.7433416458852867, + "grad_norm": 0.5249816682618759, + "learning_rate": 8.154624676486552e-06, + "loss": 0.8551, + "step": 14904 + }, + { + "epoch": 0.7433915211970075, + "grad_norm": 0.5058873304270359, + "learning_rate": 8.151640883709563e-06, + "loss": 0.8587, + "step": 14905 + }, + { + "epoch": 0.7434413965087282, + "grad_norm": 0.4600728136005099, + "learning_rate": 8.148657530582959e-06, + "loss": 0.8452, + "step": 14906 + }, + { + "epoch": 0.7434912718204488, + "grad_norm": 0.47649137395132174, + "learning_rate": 8.145674617184609e-06, + "loss": 0.8394, + "step": 14907 + }, + { + "epoch": 0.7435411471321696, + "grad_norm": 0.43424305865267065, + "learning_rate": 8.142692143592322e-06, + "loss": 0.8125, + "step": 14908 + }, + { + "epoch": 0.7435910224438903, + "grad_norm": 0.8079919389627654, + "learning_rate": 8.139710109883946e-06, + "loss": 0.8481, + "step": 14909 + }, + { + "epoch": 0.7436408977556109, + "grad_norm": 0.592219269856004, + "learning_rate": 8.136728516137281e-06, + "loss": 0.8879, + "step": 14910 + }, + { + "epoch": 0.7436907730673317, + "grad_norm": 0.46495045872511126, + "learning_rate": 8.133747362430158e-06, + "loss": 0.8212, + "step": 14911 + }, + { + "epoch": 0.7437406483790524, + "grad_norm": 0.5321887236856752, + "learning_rate": 8.13076664884033e-06, + "loss": 0.8616, + "step": 14912 + }, + { + "epoch": 0.743790523690773, + "grad_norm": 0.5969368357269499, + "learning_rate": 8.127786375445606e-06, + "loss": 0.8885, + "step": 14913 + }, + { + "epoch": 0.7438403990024938, + "grad_norm": 0.9314546413064205, + "learning_rate": 8.124806542323734e-06, + "loss": 0.8077, + "step": 14914 + }, + { + "epoch": 0.7438902743142145, + "grad_norm": 0.5636782927267974, + "learning_rate": 8.121827149552504e-06, + "loss": 0.8928, + "step": 14915 + }, + { + "epoch": 0.7439401496259351, + "grad_norm": 0.8677565877364019, + "learning_rate": 8.118848197209622e-06, + "loss": 0.8479, + "step": 14916 + }, + { + "epoch": 0.7439900249376559, + "grad_norm": 0.4419037196885708, + "learning_rate": 8.115869685372852e-06, + "loss": 0.833, + "step": 14917 + }, + { + "epoch": 0.7440399002493766, + "grad_norm": 0.4465065261988252, + "learning_rate": 8.112891614119902e-06, + "loss": 0.8821, + "step": 14918 + }, + { + "epoch": 0.7440897755610972, + "grad_norm": 0.5894838526881119, + "learning_rate": 8.109913983528489e-06, + "loss": 0.86, + "step": 14919 + }, + { + "epoch": 0.744139650872818, + "grad_norm": 0.5058995049166989, + "learning_rate": 8.106936793676304e-06, + "loss": 0.8261, + "step": 14920 + }, + { + "epoch": 0.7441895261845387, + "grad_norm": 0.7312435795572872, + "learning_rate": 8.103960044641052e-06, + "loss": 0.9014, + "step": 14921 + }, + { + "epoch": 0.7442394014962593, + "grad_norm": 0.6000997499314211, + "learning_rate": 8.1009837365004e-06, + "loss": 0.8513, + "step": 14922 + }, + { + "epoch": 0.7442892768079801, + "grad_norm": 0.44788354134263564, + "learning_rate": 8.098007869332013e-06, + "loss": 0.8168, + "step": 14923 + }, + { + "epoch": 0.7443391521197007, + "grad_norm": 0.5776951929999647, + "learning_rate": 8.095032443213541e-06, + "loss": 0.8041, + "step": 14924 + }, + { + "epoch": 0.7443890274314214, + "grad_norm": 0.643115267087444, + "learning_rate": 8.09205745822264e-06, + "loss": 0.8679, + "step": 14925 + }, + { + "epoch": 0.7444389027431422, + "grad_norm": 0.5330516682973954, + "learning_rate": 8.08908291443693e-06, + "loss": 0.8296, + "step": 14926 + }, + { + "epoch": 0.7444887780548628, + "grad_norm": 0.49738497218537725, + "learning_rate": 8.086108811934027e-06, + "loss": 0.8655, + "step": 14927 + }, + { + "epoch": 0.7445386533665835, + "grad_norm": 0.4707266539911559, + "learning_rate": 8.083135150791551e-06, + "loss": 0.8425, + "step": 14928 + }, + { + "epoch": 0.7445885286783043, + "grad_norm": 0.486176330023698, + "learning_rate": 8.080161931087094e-06, + "loss": 0.8163, + "step": 14929 + }, + { + "epoch": 0.7446384039900249, + "grad_norm": 0.5212751724849095, + "learning_rate": 8.077189152898237e-06, + "loss": 0.8915, + "step": 14930 + }, + { + "epoch": 0.7446882793017456, + "grad_norm": 0.48212016123237705, + "learning_rate": 8.074216816302551e-06, + "loss": 0.7935, + "step": 14931 + }, + { + "epoch": 0.7447381546134664, + "grad_norm": 0.5465402824352088, + "learning_rate": 8.071244921377607e-06, + "loss": 0.8394, + "step": 14932 + }, + { + "epoch": 0.744788029925187, + "grad_norm": 0.6165852854903474, + "learning_rate": 8.068273468200951e-06, + "loss": 0.8504, + "step": 14933 + }, + { + "epoch": 0.7448379052369077, + "grad_norm": 0.44330328796702967, + "learning_rate": 8.065302456850125e-06, + "loss": 0.8275, + "step": 14934 + }, + { + "epoch": 0.7448877805486285, + "grad_norm": 0.5674267024759744, + "learning_rate": 8.062331887402638e-06, + "loss": 0.8029, + "step": 14935 + }, + { + "epoch": 0.7449376558603491, + "grad_norm": 0.500571683159182, + "learning_rate": 8.059361759936032e-06, + "loss": 0.8789, + "step": 14936 + }, + { + "epoch": 0.7449875311720698, + "grad_norm": 0.5820804377260207, + "learning_rate": 8.0563920745278e-06, + "loss": 0.8547, + "step": 14937 + }, + { + "epoch": 0.7450374064837906, + "grad_norm": 0.43910030010416684, + "learning_rate": 8.05342283125543e-06, + "loss": 0.8426, + "step": 14938 + }, + { + "epoch": 0.7450872817955112, + "grad_norm": 0.4375656630220029, + "learning_rate": 8.050454030196402e-06, + "loss": 0.8741, + "step": 14939 + }, + { + "epoch": 0.7451371571072319, + "grad_norm": 0.5125596667365269, + "learning_rate": 8.047485671428207e-06, + "loss": 0.8897, + "step": 14940 + }, + { + "epoch": 0.7451870324189526, + "grad_norm": 0.47357990366436814, + "learning_rate": 8.044517755028266e-06, + "loss": 0.8684, + "step": 14941 + }, + { + "epoch": 0.7452369077306733, + "grad_norm": 0.4710949584909931, + "learning_rate": 8.04155028107406e-06, + "loss": 0.845, + "step": 14942 + }, + { + "epoch": 0.745286783042394, + "grad_norm": 0.7765837267055898, + "learning_rate": 8.038583249642998e-06, + "loss": 0.8319, + "step": 14943 + }, + { + "epoch": 0.7453366583541147, + "grad_norm": 0.47850989786048415, + "learning_rate": 8.03561666081253e-06, + "loss": 0.8245, + "step": 14944 + }, + { + "epoch": 0.7453865336658354, + "grad_norm": 0.47919556755059056, + "learning_rate": 8.03265051466004e-06, + "loss": 0.8569, + "step": 14945 + }, + { + "epoch": 0.7454364089775561, + "grad_norm": 0.4508494389439134, + "learning_rate": 8.029684811262949e-06, + "loss": 0.8524, + "step": 14946 + }, + { + "epoch": 0.7454862842892768, + "grad_norm": 0.47079968580765236, + "learning_rate": 8.026719550698627e-06, + "loss": 0.8397, + "step": 14947 + }, + { + "epoch": 0.7455361596009975, + "grad_norm": 0.46295682621408646, + "learning_rate": 8.023754733044481e-06, + "loss": 0.8179, + "step": 14948 + }, + { + "epoch": 0.7455860349127182, + "grad_norm": 0.5801072466072914, + "learning_rate": 8.020790358377841e-06, + "loss": 0.8463, + "step": 14949 + }, + { + "epoch": 0.7456359102244389, + "grad_norm": 0.49059179242445905, + "learning_rate": 8.017826426776088e-06, + "loss": 0.8453, + "step": 14950 + }, + { + "epoch": 0.7456857855361596, + "grad_norm": 0.5007228225379212, + "learning_rate": 8.014862938316542e-06, + "loss": 0.8607, + "step": 14951 + }, + { + "epoch": 0.7457356608478803, + "grad_norm": 0.5333836898054423, + "learning_rate": 8.011899893076563e-06, + "loss": 0.8629, + "step": 14952 + }, + { + "epoch": 0.745785536159601, + "grad_norm": 0.45217850612510807, + "learning_rate": 8.008937291133439e-06, + "loss": 0.8354, + "step": 14953 + }, + { + "epoch": 0.7458354114713217, + "grad_norm": 0.6054963249334329, + "learning_rate": 8.0059751325645e-06, + "loss": 0.8252, + "step": 14954 + }, + { + "epoch": 0.7458852867830424, + "grad_norm": 0.5617870384944635, + "learning_rate": 8.003013417447034e-06, + "loss": 0.8603, + "step": 14955 + }, + { + "epoch": 0.7459351620947631, + "grad_norm": 0.5701981739315234, + "learning_rate": 8.000052145858325e-06, + "loss": 0.899, + "step": 14956 + }, + { + "epoch": 0.7459850374064838, + "grad_norm": 1.2625506605158707, + "learning_rate": 7.997091317875638e-06, + "loss": 0.8275, + "step": 14957 + }, + { + "epoch": 0.7460349127182044, + "grad_norm": 0.4891461985125616, + "learning_rate": 7.994130933576256e-06, + "loss": 0.837, + "step": 14958 + }, + { + "epoch": 0.7460847880299252, + "grad_norm": 0.49631407374030284, + "learning_rate": 7.991170993037411e-06, + "loss": 0.8421, + "step": 14959 + }, + { + "epoch": 0.7461346633416459, + "grad_norm": 0.46310280214710464, + "learning_rate": 7.988211496336351e-06, + "loss": 0.863, + "step": 14960 + }, + { + "epoch": 0.7461845386533665, + "grad_norm": 0.4939644260318118, + "learning_rate": 7.98525244355029e-06, + "loss": 0.845, + "step": 14961 + }, + { + "epoch": 0.7462344139650873, + "grad_norm": 0.492075451419987, + "learning_rate": 7.982293834756457e-06, + "loss": 0.8822, + "step": 14962 + }, + { + "epoch": 0.746284289276808, + "grad_norm": 0.4881936834606584, + "learning_rate": 7.979335670032054e-06, + "loss": 0.8346, + "step": 14963 + }, + { + "epoch": 0.7463341645885286, + "grad_norm": 0.6301787588270764, + "learning_rate": 7.976377949454266e-06, + "loss": 0.8501, + "step": 14964 + }, + { + "epoch": 0.7463840399002494, + "grad_norm": 0.5521388019328751, + "learning_rate": 7.973420673100271e-06, + "loss": 0.8666, + "step": 14965 + }, + { + "epoch": 0.7464339152119701, + "grad_norm": 0.5395416548350558, + "learning_rate": 7.970463841047248e-06, + "loss": 0.8889, + "step": 14966 + }, + { + "epoch": 0.7464837905236907, + "grad_norm": 0.49537993378793477, + "learning_rate": 7.967507453372352e-06, + "loss": 0.8711, + "step": 14967 + }, + { + "epoch": 0.7465336658354115, + "grad_norm": 0.5385008638111801, + "learning_rate": 7.96455151015272e-06, + "loss": 0.8373, + "step": 14968 + }, + { + "epoch": 0.7465835411471322, + "grad_norm": 0.47758721441822755, + "learning_rate": 7.9615960114655e-06, + "loss": 0.8538, + "step": 14969 + }, + { + "epoch": 0.7466334164588528, + "grad_norm": 0.557241734808943, + "learning_rate": 7.958640957387803e-06, + "loss": 0.8262, + "step": 14970 + }, + { + "epoch": 0.7466832917705736, + "grad_norm": 0.47739276399226793, + "learning_rate": 7.955686347996746e-06, + "loss": 0.8395, + "step": 14971 + }, + { + "epoch": 0.7467331670822943, + "grad_norm": 1.134313403619754, + "learning_rate": 7.952732183369418e-06, + "loss": 0.8713, + "step": 14972 + }, + { + "epoch": 0.7467830423940149, + "grad_norm": 1.5090476639058343, + "learning_rate": 7.949778463582922e-06, + "loss": 0.8421, + "step": 14973 + }, + { + "epoch": 0.7468329177057357, + "grad_norm": 0.5998520988533174, + "learning_rate": 7.946825188714328e-06, + "loss": 0.8977, + "step": 14974 + }, + { + "epoch": 0.7468827930174564, + "grad_norm": 0.5111058603414068, + "learning_rate": 7.943872358840696e-06, + "loss": 0.8355, + "step": 14975 + }, + { + "epoch": 0.746932668329177, + "grad_norm": 0.45069131153451836, + "learning_rate": 7.940919974039074e-06, + "loss": 0.8357, + "step": 14976 + }, + { + "epoch": 0.7469825436408978, + "grad_norm": 0.5122223707037912, + "learning_rate": 7.937968034386526e-06, + "loss": 0.8763, + "step": 14977 + }, + { + "epoch": 0.7470324189526184, + "grad_norm": 0.47012216582567584, + "learning_rate": 7.93501653996005e-06, + "loss": 0.7972, + "step": 14978 + }, + { + "epoch": 0.7470822942643391, + "grad_norm": 0.4639138048811736, + "learning_rate": 7.93206549083669e-06, + "loss": 0.8245, + "step": 14979 + }, + { + "epoch": 0.7471321695760599, + "grad_norm": 0.43330477749019225, + "learning_rate": 7.929114887093433e-06, + "loss": 0.8089, + "step": 14980 + }, + { + "epoch": 0.7471820448877805, + "grad_norm": 0.4886922964687363, + "learning_rate": 7.9261647288073e-06, + "loss": 0.8651, + "step": 14981 + }, + { + "epoch": 0.7472319201995012, + "grad_norm": 0.6957849634080654, + "learning_rate": 7.92321501605524e-06, + "loss": 0.8389, + "step": 14982 + }, + { + "epoch": 0.747281795511222, + "grad_norm": 0.5718995326945561, + "learning_rate": 7.920265748914252e-06, + "loss": 0.831, + "step": 14983 + }, + { + "epoch": 0.7473316708229426, + "grad_norm": 0.5029852426525476, + "learning_rate": 7.917316927461273e-06, + "loss": 0.8458, + "step": 14984 + }, + { + "epoch": 0.7473815461346633, + "grad_norm": 0.4847255539092938, + "learning_rate": 7.914368551773285e-06, + "loss": 0.8885, + "step": 14985 + }, + { + "epoch": 0.7474314214463841, + "grad_norm": 0.5736801127157416, + "learning_rate": 7.911420621927185e-06, + "loss": 0.8578, + "step": 14986 + }, + { + "epoch": 0.7474812967581047, + "grad_norm": 0.5829620358655535, + "learning_rate": 7.908473137999922e-06, + "loss": 0.8727, + "step": 14987 + }, + { + "epoch": 0.7475311720698254, + "grad_norm": 0.5425286087824832, + "learning_rate": 7.905526100068405e-06, + "loss": 0.852, + "step": 14988 + }, + { + "epoch": 0.7475810473815462, + "grad_norm": 0.5208826767827309, + "learning_rate": 7.902579508209535e-06, + "loss": 0.8285, + "step": 14989 + }, + { + "epoch": 0.7476309226932668, + "grad_norm": 0.5354549700941432, + "learning_rate": 7.899633362500191e-06, + "loss": 0.8577, + "step": 14990 + }, + { + "epoch": 0.7476807980049875, + "grad_norm": 0.7691280199805431, + "learning_rate": 7.896687663017268e-06, + "loss": 0.7997, + "step": 14991 + }, + { + "epoch": 0.7477306733167083, + "grad_norm": 0.4417021925422965, + "learning_rate": 7.893742409837629e-06, + "loss": 0.8278, + "step": 14992 + }, + { + "epoch": 0.7477805486284289, + "grad_norm": 0.4684775788168059, + "learning_rate": 7.890797603038125e-06, + "loss": 0.836, + "step": 14993 + }, + { + "epoch": 0.7478304239401496, + "grad_norm": 0.6759044182457659, + "learning_rate": 7.887853242695593e-06, + "loss": 0.8373, + "step": 14994 + }, + { + "epoch": 0.7478802992518703, + "grad_norm": 0.45527666535145234, + "learning_rate": 7.88490932888688e-06, + "loss": 0.848, + "step": 14995 + }, + { + "epoch": 0.747930174563591, + "grad_norm": 0.47591744403550074, + "learning_rate": 7.881965861688797e-06, + "loss": 0.8254, + "step": 14996 + }, + { + "epoch": 0.7479800498753117, + "grad_norm": 0.49216177620707663, + "learning_rate": 7.879022841178154e-06, + "loss": 0.8551, + "step": 14997 + }, + { + "epoch": 0.7480299251870324, + "grad_norm": 0.5134763238739602, + "learning_rate": 7.87608026743174e-06, + "loss": 0.8678, + "step": 14998 + }, + { + "epoch": 0.7480798004987531, + "grad_norm": 0.5989258274546421, + "learning_rate": 7.873138140526357e-06, + "loss": 0.8414, + "step": 14999 + }, + { + "epoch": 0.7481296758104738, + "grad_norm": 0.5553970300559559, + "learning_rate": 7.87019646053877e-06, + "loss": 0.8538, + "step": 15000 + }, + { + "epoch": 0.7481795511221945, + "grad_norm": 0.605261677950807, + "learning_rate": 7.867255227545739e-06, + "loss": 0.8761, + "step": 15001 + }, + { + "epoch": 0.7482294264339152, + "grad_norm": 0.5676253791752723, + "learning_rate": 7.864314441624004e-06, + "loss": 0.8359, + "step": 15002 + }, + { + "epoch": 0.748279301745636, + "grad_norm": 0.48410329264406804, + "learning_rate": 7.861374102850328e-06, + "loss": 0.8558, + "step": 15003 + }, + { + "epoch": 0.7483291770573566, + "grad_norm": 0.6073015436253227, + "learning_rate": 7.85843421130142e-06, + "loss": 0.8864, + "step": 15004 + }, + { + "epoch": 0.7483790523690773, + "grad_norm": 0.6471456678803607, + "learning_rate": 7.855494767053994e-06, + "loss": 0.7984, + "step": 15005 + }, + { + "epoch": 0.748428927680798, + "grad_norm": 0.49566143004245694, + "learning_rate": 7.852555770184767e-06, + "loss": 0.7912, + "step": 15006 + }, + { + "epoch": 0.7484788029925187, + "grad_norm": 0.5281399347472017, + "learning_rate": 7.849617220770422e-06, + "loss": 0.8943, + "step": 15007 + }, + { + "epoch": 0.7485286783042394, + "grad_norm": 0.4768349687810959, + "learning_rate": 7.846679118887642e-06, + "loss": 0.8481, + "step": 15008 + }, + { + "epoch": 0.7485785536159602, + "grad_norm": 0.5236237405159179, + "learning_rate": 7.843741464613085e-06, + "loss": 0.8313, + "step": 15009 + }, + { + "epoch": 0.7486284289276808, + "grad_norm": 0.45716361261195193, + "learning_rate": 7.840804258023424e-06, + "loss": 0.8165, + "step": 15010 + }, + { + "epoch": 0.7486783042394015, + "grad_norm": 0.9100199469501399, + "learning_rate": 7.837867499195294e-06, + "loss": 0.8744, + "step": 15011 + }, + { + "epoch": 0.7487281795511221, + "grad_norm": 0.5028600910895177, + "learning_rate": 7.834931188205335e-06, + "loss": 0.872, + "step": 15012 + }, + { + "epoch": 0.7487780548628429, + "grad_norm": 0.5247283037348213, + "learning_rate": 7.831995325130153e-06, + "loss": 0.865, + "step": 15013 + }, + { + "epoch": 0.7488279301745636, + "grad_norm": 0.6728245325760843, + "learning_rate": 7.829059910046388e-06, + "loss": 0.8566, + "step": 15014 + }, + { + "epoch": 0.7488778054862842, + "grad_norm": 0.8128562002934929, + "learning_rate": 7.826124943030605e-06, + "loss": 0.814, + "step": 15015 + }, + { + "epoch": 0.748927680798005, + "grad_norm": 0.532403694602955, + "learning_rate": 7.82319042415941e-06, + "loss": 0.8463, + "step": 15016 + }, + { + "epoch": 0.7489775561097257, + "grad_norm": 0.519359299597184, + "learning_rate": 7.820256353509367e-06, + "loss": 0.8189, + "step": 15017 + }, + { + "epoch": 0.7490274314214463, + "grad_norm": 0.5229924016070234, + "learning_rate": 7.817322731157065e-06, + "loss": 0.8706, + "step": 15018 + }, + { + "epoch": 0.7490773067331671, + "grad_norm": 0.718757107826971, + "learning_rate": 7.814389557179017e-06, + "loss": 0.8726, + "step": 15019 + }, + { + "epoch": 0.7491271820448878, + "grad_norm": 0.4909246747916079, + "learning_rate": 7.811456831651789e-06, + "loss": 0.8895, + "step": 15020 + }, + { + "epoch": 0.7491770573566084, + "grad_norm": 0.5103105022690985, + "learning_rate": 7.808524554651898e-06, + "loss": 0.8838, + "step": 15021 + }, + { + "epoch": 0.7492269326683292, + "grad_norm": 0.6360852575506009, + "learning_rate": 7.805592726255878e-06, + "loss": 0.8377, + "step": 15022 + }, + { + "epoch": 0.7492768079800499, + "grad_norm": 0.5092960683454485, + "learning_rate": 7.802661346540205e-06, + "loss": 0.8297, + "step": 15023 + }, + { + "epoch": 0.7493266832917705, + "grad_norm": 0.5256738668311105, + "learning_rate": 7.799730415581397e-06, + "loss": 0.8324, + "step": 15024 + }, + { + "epoch": 0.7493765586034913, + "grad_norm": 0.5990347827576803, + "learning_rate": 7.796799933455926e-06, + "loss": 0.9151, + "step": 15025 + }, + { + "epoch": 0.749426433915212, + "grad_norm": 0.4419618852900836, + "learning_rate": 7.79386990024026e-06, + "loss": 0.8289, + "step": 15026 + }, + { + "epoch": 0.7494763092269326, + "grad_norm": 1.1153580823663203, + "learning_rate": 7.79094031601085e-06, + "loss": 0.8692, + "step": 15027 + }, + { + "epoch": 0.7495261845386534, + "grad_norm": 0.44054831076403445, + "learning_rate": 7.788011180844162e-06, + "loss": 0.8484, + "step": 15028 + }, + { + "epoch": 0.749576059850374, + "grad_norm": 0.46676778424105847, + "learning_rate": 7.785082494816617e-06, + "loss": 0.8668, + "step": 15029 + }, + { + "epoch": 0.7496259351620947, + "grad_norm": 0.556754316477214, + "learning_rate": 7.782154258004639e-06, + "loss": 0.825, + "step": 15030 + }, + { + "epoch": 0.7496758104738155, + "grad_norm": 0.622748358510428, + "learning_rate": 7.779226470484633e-06, + "loss": 0.8744, + "step": 15031 + }, + { + "epoch": 0.7497256857855361, + "grad_norm": 0.5536069519605662, + "learning_rate": 7.776299132333015e-06, + "loss": 0.8773, + "step": 15032 + }, + { + "epoch": 0.7497755610972568, + "grad_norm": 0.47286209421563025, + "learning_rate": 7.773372243626158e-06, + "loss": 0.8713, + "step": 15033 + }, + { + "epoch": 0.7498254364089776, + "grad_norm": 0.49525212848912636, + "learning_rate": 7.770445804440449e-06, + "loss": 0.8474, + "step": 15034 + }, + { + "epoch": 0.7498753117206982, + "grad_norm": 0.6161875147670944, + "learning_rate": 7.767519814852234e-06, + "loss": 0.8182, + "step": 15035 + }, + { + "epoch": 0.749925187032419, + "grad_norm": 0.5652913943384815, + "learning_rate": 7.764594274937884e-06, + "loss": 0.8424, + "step": 15036 + }, + { + "epoch": 0.7499750623441397, + "grad_norm": 0.4794239706331211, + "learning_rate": 7.761669184773735e-06, + "loss": 0.832, + "step": 15037 + }, + { + "epoch": 0.7500249376558603, + "grad_norm": 1.111313220011012, + "learning_rate": 7.758744544436113e-06, + "loss": 0.8293, + "step": 15038 + }, + { + "epoch": 0.750074812967581, + "grad_norm": 0.5531995311293476, + "learning_rate": 7.755820354001328e-06, + "loss": 0.8182, + "step": 15039 + }, + { + "epoch": 0.7501246882793018, + "grad_norm": 0.5175495198196572, + "learning_rate": 7.752896613545702e-06, + "loss": 0.8386, + "step": 15040 + }, + { + "epoch": 0.7501745635910224, + "grad_norm": 0.41479230416432644, + "learning_rate": 7.74997332314552e-06, + "loss": 0.8858, + "step": 15041 + }, + { + "epoch": 0.7502244389027432, + "grad_norm": 0.4645035262010321, + "learning_rate": 7.747050482877055e-06, + "loss": 0.8425, + "step": 15042 + }, + { + "epoch": 0.7502743142144639, + "grad_norm": 0.5477577311374208, + "learning_rate": 7.744128092816597e-06, + "loss": 0.8522, + "step": 15043 + }, + { + "epoch": 0.7503241895261845, + "grad_norm": 0.7357290245937549, + "learning_rate": 7.74120615304039e-06, + "loss": 0.8956, + "step": 15044 + }, + { + "epoch": 0.7503740648379053, + "grad_norm": 0.5303848414275855, + "learning_rate": 7.738284663624688e-06, + "loss": 0.8679, + "step": 15045 + }, + { + "epoch": 0.750423940149626, + "grad_norm": 0.573279720259494, + "learning_rate": 7.735363624645712e-06, + "loss": 0.8726, + "step": 15046 + }, + { + "epoch": 0.7504738154613466, + "grad_norm": 0.6138041338930933, + "learning_rate": 7.73244303617971e-06, + "loss": 0.8598, + "step": 15047 + }, + { + "epoch": 0.7505236907730674, + "grad_norm": 0.5245087421168408, + "learning_rate": 7.729522898302865e-06, + "loss": 0.8159, + "step": 15048 + }, + { + "epoch": 0.750573566084788, + "grad_norm": 0.4702121689380187, + "learning_rate": 7.7266032110914e-06, + "loss": 0.829, + "step": 15049 + }, + { + "epoch": 0.7506234413965087, + "grad_norm": 0.46291207788766325, + "learning_rate": 7.723683974621485e-06, + "loss": 0.8214, + "step": 15050 + }, + { + "epoch": 0.7506733167082295, + "grad_norm": 0.5039521838796635, + "learning_rate": 7.72076518896932e-06, + "loss": 0.8615, + "step": 15051 + }, + { + "epoch": 0.7507231920199501, + "grad_norm": 0.4986304874251431, + "learning_rate": 7.717846854211038e-06, + "loss": 0.843, + "step": 15052 + }, + { + "epoch": 0.7507730673316708, + "grad_norm": 0.6078150267088258, + "learning_rate": 7.714928970422816e-06, + "loss": 0.8779, + "step": 15053 + }, + { + "epoch": 0.7508229426433916, + "grad_norm": 0.45534826028392894, + "learning_rate": 7.71201153768078e-06, + "loss": 0.8377, + "step": 15054 + }, + { + "epoch": 0.7508728179551122, + "grad_norm": 3.1849980200206085, + "learning_rate": 7.709094556061081e-06, + "loss": 0.8764, + "step": 15055 + }, + { + "epoch": 0.7509226932668329, + "grad_norm": 0.4931100171324604, + "learning_rate": 7.706178025639805e-06, + "loss": 0.8561, + "step": 15056 + }, + { + "epoch": 0.7509725685785537, + "grad_norm": 0.4430927383532009, + "learning_rate": 7.703261946493082e-06, + "loss": 0.803, + "step": 15057 + }, + { + "epoch": 0.7510224438902743, + "grad_norm": 0.67580158003729, + "learning_rate": 7.700346318696989e-06, + "loss": 0.8328, + "step": 15058 + }, + { + "epoch": 0.751072319201995, + "grad_norm": 0.499738803509251, + "learning_rate": 7.697431142327632e-06, + "loss": 0.8487, + "step": 15059 + }, + { + "epoch": 0.7511221945137158, + "grad_norm": 0.43924848685045215, + "learning_rate": 7.694516417461051e-06, + "loss": 0.8018, + "step": 15060 + }, + { + "epoch": 0.7511720698254364, + "grad_norm": 0.5126180318010609, + "learning_rate": 7.691602144173327e-06, + "loss": 0.8599, + "step": 15061 + }, + { + "epoch": 0.7512219451371571, + "grad_norm": 0.5044350469137123, + "learning_rate": 7.688688322540497e-06, + "loss": 0.9013, + "step": 15062 + }, + { + "epoch": 0.7512718204488779, + "grad_norm": 0.5212408493342702, + "learning_rate": 7.6857749526386e-06, + "loss": 0.8151, + "step": 15063 + }, + { + "epoch": 0.7513216957605985, + "grad_norm": 0.533674796764531, + "learning_rate": 7.682862034543647e-06, + "loss": 0.8727, + "step": 15064 + }, + { + "epoch": 0.7513715710723192, + "grad_norm": 0.5298313435391727, + "learning_rate": 7.679949568331669e-06, + "loss": 0.8522, + "step": 15065 + }, + { + "epoch": 0.7514214463840398, + "grad_norm": 0.5046709820909066, + "learning_rate": 7.677037554078653e-06, + "loss": 0.8553, + "step": 15066 + }, + { + "epoch": 0.7514713216957606, + "grad_norm": 0.5920788333009829, + "learning_rate": 7.674125991860592e-06, + "loss": 0.8582, + "step": 15067 + }, + { + "epoch": 0.7515211970074813, + "grad_norm": 0.49222552000567865, + "learning_rate": 7.67121488175345e-06, + "loss": 0.8007, + "step": 15068 + }, + { + "epoch": 0.751571072319202, + "grad_norm": 0.46449280623419537, + "learning_rate": 7.668304223833208e-06, + "loss": 0.7924, + "step": 15069 + }, + { + "epoch": 0.7516209476309227, + "grad_norm": 0.7018807248945119, + "learning_rate": 7.665394018175812e-06, + "loss": 0.8471, + "step": 15070 + }, + { + "epoch": 0.7516708229426434, + "grad_norm": 0.5830835880362774, + "learning_rate": 7.662484264857197e-06, + "loss": 0.8949, + "step": 15071 + }, + { + "epoch": 0.751720698254364, + "grad_norm": 0.6756254036453878, + "learning_rate": 7.659574963953292e-06, + "loss": 0.8643, + "step": 15072 + }, + { + "epoch": 0.7517705735660848, + "grad_norm": 0.7439167572012599, + "learning_rate": 7.656666115540027e-06, + "loss": 0.8838, + "step": 15073 + }, + { + "epoch": 0.7518204488778055, + "grad_norm": 0.5791470204091553, + "learning_rate": 7.653757719693294e-06, + "loss": 0.8739, + "step": 15074 + }, + { + "epoch": 0.7518703241895262, + "grad_norm": 0.4795923171671558, + "learning_rate": 7.650849776488991e-06, + "loss": 0.8438, + "step": 15075 + }, + { + "epoch": 0.7519201995012469, + "grad_norm": 0.5034756696644073, + "learning_rate": 7.647942286002994e-06, + "loss": 0.826, + "step": 15076 + }, + { + "epoch": 0.7519700748129676, + "grad_norm": 0.5133596927601581, + "learning_rate": 7.645035248311181e-06, + "loss": 0.853, + "step": 15077 + }, + { + "epoch": 0.7520199501246883, + "grad_norm": 0.5926686395821891, + "learning_rate": 7.642128663489406e-06, + "loss": 0.8756, + "step": 15078 + }, + { + "epoch": 0.752069825436409, + "grad_norm": 0.4806670146806653, + "learning_rate": 7.639222531613519e-06, + "loss": 0.8277, + "step": 15079 + }, + { + "epoch": 0.7521197007481297, + "grad_norm": 0.4443340892933782, + "learning_rate": 7.63631685275934e-06, + "loss": 0.8613, + "step": 15080 + }, + { + "epoch": 0.7521695760598504, + "grad_norm": 0.9496532035373005, + "learning_rate": 7.63341162700271e-06, + "loss": 0.8534, + "step": 15081 + }, + { + "epoch": 0.7522194513715711, + "grad_norm": 0.5121671894557137, + "learning_rate": 7.630506854419431e-06, + "loss": 0.8897, + "step": 15082 + }, + { + "epoch": 0.7522693266832917, + "grad_norm": 0.5422583698265809, + "learning_rate": 7.627602535085293e-06, + "loss": 0.8391, + "step": 15083 + }, + { + "epoch": 0.7523192019950125, + "grad_norm": 0.5231266931837852, + "learning_rate": 7.62469866907611e-06, + "loss": 0.8277, + "step": 15084 + }, + { + "epoch": 0.7523690773067332, + "grad_norm": 0.6630100014989513, + "learning_rate": 7.621795256467623e-06, + "loss": 0.8376, + "step": 15085 + }, + { + "epoch": 0.7524189526184538, + "grad_norm": 0.5196243913597616, + "learning_rate": 7.618892297335619e-06, + "loss": 0.8608, + "step": 15086 + }, + { + "epoch": 0.7524688279301746, + "grad_norm": 0.6435155781415492, + "learning_rate": 7.615989791755834e-06, + "loss": 0.8709, + "step": 15087 + }, + { + "epoch": 0.7525187032418953, + "grad_norm": 0.48899426368717674, + "learning_rate": 7.613087739804029e-06, + "loss": 0.8629, + "step": 15088 + }, + { + "epoch": 0.7525685785536159, + "grad_norm": 0.4634826117028279, + "learning_rate": 7.610186141555905e-06, + "loss": 0.8472, + "step": 15089 + }, + { + "epoch": 0.7526184538653367, + "grad_norm": 0.4873530259423364, + "learning_rate": 7.607284997087197e-06, + "loss": 0.8504, + "step": 15090 + }, + { + "epoch": 0.7526683291770574, + "grad_norm": 0.6603137752751597, + "learning_rate": 7.604384306473597e-06, + "loss": 0.8383, + "step": 15091 + }, + { + "epoch": 0.752718204488778, + "grad_norm": 0.5864379105027582, + "learning_rate": 7.601484069790818e-06, + "loss": 0.8542, + "step": 15092 + }, + { + "epoch": 0.7527680798004988, + "grad_norm": 0.5018181781385874, + "learning_rate": 7.59858428711451e-06, + "loss": 0.8343, + "step": 15093 + }, + { + "epoch": 0.7528179551122195, + "grad_norm": 0.4924686602336305, + "learning_rate": 7.5956849585203635e-06, + "loss": 0.7773, + "step": 15094 + }, + { + "epoch": 0.7528678304239401, + "grad_norm": 0.5622324516032204, + "learning_rate": 7.59278608408403e-06, + "loss": 0.8265, + "step": 15095 + }, + { + "epoch": 0.7529177057356609, + "grad_norm": 0.4727538429948618, + "learning_rate": 7.589887663881154e-06, + "loss": 0.8446, + "step": 15096 + }, + { + "epoch": 0.7529675810473816, + "grad_norm": 0.5708629711006319, + "learning_rate": 7.58698969798736e-06, + "loss": 0.8395, + "step": 15097 + }, + { + "epoch": 0.7530174563591022, + "grad_norm": 0.6675254127787676, + "learning_rate": 7.584092186478284e-06, + "loss": 0.8603, + "step": 15098 + }, + { + "epoch": 0.753067331670823, + "grad_norm": 0.5446230769254464, + "learning_rate": 7.581195129429528e-06, + "loss": 0.8955, + "step": 15099 + }, + { + "epoch": 0.7531172069825436, + "grad_norm": 0.47181592718577725, + "learning_rate": 7.578298526916689e-06, + "loss": 0.885, + "step": 15100 + }, + { + "epoch": 0.7531670822942643, + "grad_norm": 0.47952862731477347, + "learning_rate": 7.575402379015345e-06, + "loss": 0.8519, + "step": 15101 + }, + { + "epoch": 0.7532169576059851, + "grad_norm": 0.6185286162983558, + "learning_rate": 7.572506685801087e-06, + "loss": 0.8331, + "step": 15102 + }, + { + "epoch": 0.7532668329177057, + "grad_norm": 0.5815812856203914, + "learning_rate": 7.569611447349467e-06, + "loss": 0.8782, + "step": 15103 + }, + { + "epoch": 0.7533167082294264, + "grad_norm": 0.639650419263871, + "learning_rate": 7.566716663736034e-06, + "loss": 0.8094, + "step": 15104 + }, + { + "epoch": 0.7533665835411472, + "grad_norm": 0.4670130245214432, + "learning_rate": 7.563822335036322e-06, + "loss": 0.8592, + "step": 15105 + }, + { + "epoch": 0.7534164588528678, + "grad_norm": 0.6286092712942318, + "learning_rate": 7.560928461325875e-06, + "loss": 0.8425, + "step": 15106 + }, + { + "epoch": 0.7534663341645885, + "grad_norm": 0.547777541307658, + "learning_rate": 7.558035042680184e-06, + "loss": 0.8686, + "step": 15107 + }, + { + "epoch": 0.7535162094763093, + "grad_norm": 0.6076076284747577, + "learning_rate": 7.555142079174768e-06, + "loss": 0.8495, + "step": 15108 + }, + { + "epoch": 0.7535660847880299, + "grad_norm": 0.45162039009300314, + "learning_rate": 7.552249570885103e-06, + "loss": 0.8305, + "step": 15109 + }, + { + "epoch": 0.7536159600997506, + "grad_norm": 0.4850746167530856, + "learning_rate": 7.549357517886688e-06, + "loss": 0.8376, + "step": 15110 + }, + { + "epoch": 0.7536658354114714, + "grad_norm": 0.5803221070475869, + "learning_rate": 7.546465920254974e-06, + "loss": 0.8242, + "step": 15111 + }, + { + "epoch": 0.753715710723192, + "grad_norm": 0.46773795281275277, + "learning_rate": 7.543574778065424e-06, + "loss": 0.8573, + "step": 15112 + }, + { + "epoch": 0.7537655860349127, + "grad_norm": 0.5030868667898624, + "learning_rate": 7.5406840913934685e-06, + "loss": 0.8175, + "step": 15113 + }, + { + "epoch": 0.7538154613466335, + "grad_norm": 0.4672597068572866, + "learning_rate": 7.537793860314557e-06, + "loss": 0.8417, + "step": 15114 + }, + { + "epoch": 0.7538653366583541, + "grad_norm": 0.4632614057032304, + "learning_rate": 7.534904084904099e-06, + "loss": 0.8725, + "step": 15115 + }, + { + "epoch": 0.7539152119700748, + "grad_norm": 0.4932383874052852, + "learning_rate": 7.532014765237502e-06, + "loss": 0.84, + "step": 15116 + }, + { + "epoch": 0.7539650872817956, + "grad_norm": 0.4954275509077004, + "learning_rate": 7.529125901390155e-06, + "loss": 0.8365, + "step": 15117 + }, + { + "epoch": 0.7540149625935162, + "grad_norm": 0.9944368093148223, + "learning_rate": 7.526237493437457e-06, + "loss": 0.8533, + "step": 15118 + }, + { + "epoch": 0.7540648379052369, + "grad_norm": 0.6124603739521723, + "learning_rate": 7.523349541454772e-06, + "loss": 0.8696, + "step": 15119 + }, + { + "epoch": 0.7541147132169576, + "grad_norm": 0.4673467891849447, + "learning_rate": 7.520462045517449e-06, + "loss": 0.8688, + "step": 15120 + }, + { + "epoch": 0.7541645885286783, + "grad_norm": 0.49138487985379903, + "learning_rate": 7.5175750057008635e-06, + "loss": 0.8575, + "step": 15121 + }, + { + "epoch": 0.754214463840399, + "grad_norm": 0.5457910701192455, + "learning_rate": 7.514688422080318e-06, + "loss": 0.8637, + "step": 15122 + }, + { + "epoch": 0.7542643391521197, + "grad_norm": 0.5179170543268627, + "learning_rate": 7.51180229473116e-06, + "loss": 0.8343, + "step": 15123 + }, + { + "epoch": 0.7543142144638404, + "grad_norm": 0.5070970023245633, + "learning_rate": 7.508916623728687e-06, + "loss": 0.8033, + "step": 15124 + }, + { + "epoch": 0.7543640897755611, + "grad_norm": 0.5336387460732699, + "learning_rate": 7.50603140914822e-06, + "loss": 0.8504, + "step": 15125 + }, + { + "epoch": 0.7544139650872818, + "grad_norm": 0.4771230582850356, + "learning_rate": 7.503146651065021e-06, + "loss": 0.8179, + "step": 15126 + }, + { + "epoch": 0.7544638403990025, + "grad_norm": 0.6347179203849698, + "learning_rate": 7.500262349554388e-06, + "loss": 0.8396, + "step": 15127 + }, + { + "epoch": 0.7545137157107232, + "grad_norm": 0.5024373898094202, + "learning_rate": 7.497378504691568e-06, + "loss": 0.8495, + "step": 15128 + }, + { + "epoch": 0.7545635910224439, + "grad_norm": 0.5533814271659712, + "learning_rate": 7.494495116551839e-06, + "loss": 0.8315, + "step": 15129 + }, + { + "epoch": 0.7546134663341646, + "grad_norm": 0.5939159835140987, + "learning_rate": 7.4916121852104115e-06, + "loss": 0.8476, + "step": 15130 + }, + { + "epoch": 0.7546633416458853, + "grad_norm": 0.5353582297078544, + "learning_rate": 7.488729710742534e-06, + "loss": 0.8582, + "step": 15131 + }, + { + "epoch": 0.754713216957606, + "grad_norm": 0.4549907387374167, + "learning_rate": 7.4858476932234165e-06, + "loss": 0.8473, + "step": 15132 + }, + { + "epoch": 0.7547630922693267, + "grad_norm": 0.5194800259412068, + "learning_rate": 7.482966132728267e-06, + "loss": 0.8132, + "step": 15133 + }, + { + "epoch": 0.7548129675810474, + "grad_norm": 0.5790466900845707, + "learning_rate": 7.480085029332268e-06, + "loss": 0.8997, + "step": 15134 + }, + { + "epoch": 0.7548628428927681, + "grad_norm": 0.5575500331813215, + "learning_rate": 7.47720438311062e-06, + "loss": 0.8876, + "step": 15135 + }, + { + "epoch": 0.7549127182044888, + "grad_norm": 0.542712827724561, + "learning_rate": 7.474324194138476e-06, + "loss": 0.8741, + "step": 15136 + }, + { + "epoch": 0.7549625935162094, + "grad_norm": 0.5248230824189791, + "learning_rate": 7.471444462491004e-06, + "loss": 0.8559, + "step": 15137 + }, + { + "epoch": 0.7550124688279302, + "grad_norm": 0.6296574460633223, + "learning_rate": 7.468565188243332e-06, + "loss": 0.8472, + "step": 15138 + }, + { + "epoch": 0.7550623441396509, + "grad_norm": 0.5538390853279315, + "learning_rate": 7.465686371470615e-06, + "loss": 0.8212, + "step": 15139 + }, + { + "epoch": 0.7551122194513715, + "grad_norm": 0.5913278266917598, + "learning_rate": 7.462808012247965e-06, + "loss": 0.8774, + "step": 15140 + }, + { + "epoch": 0.7551620947630923, + "grad_norm": 0.5488492530839512, + "learning_rate": 7.459930110650493e-06, + "loss": 0.8913, + "step": 15141 + }, + { + "epoch": 0.755211970074813, + "grad_norm": 0.4728119097927411, + "learning_rate": 7.4570526667532835e-06, + "loss": 0.8008, + "step": 15142 + }, + { + "epoch": 0.7552618453865336, + "grad_norm": 0.4979786699393494, + "learning_rate": 7.454175680631451e-06, + "loss": 0.8499, + "step": 15143 + }, + { + "epoch": 0.7553117206982544, + "grad_norm": 0.6357219543363822, + "learning_rate": 7.451299152360036e-06, + "loss": 0.858, + "step": 15144 + }, + { + "epoch": 0.7553615960099751, + "grad_norm": 0.47554322576826946, + "learning_rate": 7.4484230820141245e-06, + "loss": 0.8576, + "step": 15145 + }, + { + "epoch": 0.7554114713216957, + "grad_norm": 0.4918044724699581, + "learning_rate": 7.445547469668748e-06, + "loss": 0.857, + "step": 15146 + }, + { + "epoch": 0.7554613466334165, + "grad_norm": 0.7531979335508892, + "learning_rate": 7.442672315398963e-06, + "loss": 0.8509, + "step": 15147 + }, + { + "epoch": 0.7555112219451372, + "grad_norm": 0.6381986736197766, + "learning_rate": 7.439797619279784e-06, + "loss": 0.897, + "step": 15148 + }, + { + "epoch": 0.7555610972568578, + "grad_norm": 0.4591118505220026, + "learning_rate": 7.436923381386229e-06, + "loss": 0.8403, + "step": 15149 + }, + { + "epoch": 0.7556109725685786, + "grad_norm": 0.4405250268517999, + "learning_rate": 7.434049601793291e-06, + "loss": 0.8165, + "step": 15150 + }, + { + "epoch": 0.7556608478802993, + "grad_norm": 0.4870148989030126, + "learning_rate": 7.431176280575974e-06, + "loss": 0.8789, + "step": 15151 + }, + { + "epoch": 0.7557107231920199, + "grad_norm": 0.5469134685054107, + "learning_rate": 7.4283034178092516e-06, + "loss": 0.7895, + "step": 15152 + }, + { + "epoch": 0.7557605985037407, + "grad_norm": 0.4287085229462275, + "learning_rate": 7.4254310135680845e-06, + "loss": 0.7978, + "step": 15153 + }, + { + "epoch": 0.7558104738154613, + "grad_norm": 0.5317089344361188, + "learning_rate": 7.422559067927431e-06, + "loss": 0.827, + "step": 15154 + }, + { + "epoch": 0.755860349127182, + "grad_norm": 0.4669918319774022, + "learning_rate": 7.419687580962223e-06, + "loss": 0.7943, + "step": 15155 + }, + { + "epoch": 0.7559102244389028, + "grad_norm": 0.6296327353127366, + "learning_rate": 7.416816552747407e-06, + "loss": 0.8877, + "step": 15156 + }, + { + "epoch": 0.7559600997506234, + "grad_norm": 0.5534445057135571, + "learning_rate": 7.413945983357889e-06, + "loss": 0.8099, + "step": 15157 + }, + { + "epoch": 0.7560099750623441, + "grad_norm": 0.5522529417781047, + "learning_rate": 7.411075872868595e-06, + "loss": 0.8429, + "step": 15158 + }, + { + "epoch": 0.7560598503740649, + "grad_norm": 0.5596741501613202, + "learning_rate": 7.408206221354386e-06, + "loss": 0.8361, + "step": 15159 + }, + { + "epoch": 0.7561097256857855, + "grad_norm": 0.44703516582088, + "learning_rate": 7.405337028890172e-06, + "loss": 0.8435, + "step": 15160 + }, + { + "epoch": 0.7561596009975062, + "grad_norm": 0.5163513670382924, + "learning_rate": 7.4024682955508075e-06, + "loss": 0.8387, + "step": 15161 + }, + { + "epoch": 0.756209476309227, + "grad_norm": 0.7330947051210072, + "learning_rate": 7.399600021411171e-06, + "loss": 0.862, + "step": 15162 + }, + { + "epoch": 0.7562593516209476, + "grad_norm": 0.5111765888242297, + "learning_rate": 7.396732206546081e-06, + "loss": 0.8235, + "step": 15163 + }, + { + "epoch": 0.7563092269326683, + "grad_norm": 0.5468746851621911, + "learning_rate": 7.393864851030391e-06, + "loss": 0.8609, + "step": 15164 + }, + { + "epoch": 0.7563591022443891, + "grad_norm": 0.5262481578160942, + "learning_rate": 7.390997954938911e-06, + "loss": 0.8523, + "step": 15165 + }, + { + "epoch": 0.7564089775561097, + "grad_norm": 0.5329424377675491, + "learning_rate": 7.388131518346478e-06, + "loss": 0.8288, + "step": 15166 + }, + { + "epoch": 0.7564588528678304, + "grad_norm": 0.5163823047047468, + "learning_rate": 7.385265541327851e-06, + "loss": 0.8215, + "step": 15167 + }, + { + "epoch": 0.7565087281795512, + "grad_norm": 0.518005459914965, + "learning_rate": 7.382400023957845e-06, + "loss": 0.8446, + "step": 15168 + }, + { + "epoch": 0.7565586034912718, + "grad_norm": 0.7006128745267967, + "learning_rate": 7.379534966311227e-06, + "loss": 0.8321, + "step": 15169 + }, + { + "epoch": 0.7566084788029925, + "grad_norm": 0.5799149803795609, + "learning_rate": 7.376670368462757e-06, + "loss": 0.8771, + "step": 15170 + }, + { + "epoch": 0.7566583541147133, + "grad_norm": 0.5545864681289551, + "learning_rate": 7.3738062304871816e-06, + "loss": 0.84, + "step": 15171 + }, + { + "epoch": 0.7567082294264339, + "grad_norm": 0.5122636170085394, + "learning_rate": 7.370942552459248e-06, + "loss": 0.8644, + "step": 15172 + }, + { + "epoch": 0.7567581047381546, + "grad_norm": 0.5282108045097305, + "learning_rate": 7.3680793344536815e-06, + "loss": 0.8707, + "step": 15173 + }, + { + "epoch": 0.7568079800498753, + "grad_norm": 0.8033167596018969, + "learning_rate": 7.365216576545192e-06, + "loss": 0.825, + "step": 15174 + }, + { + "epoch": 0.756857855361596, + "grad_norm": 0.5847804008141694, + "learning_rate": 7.362354278808478e-06, + "loss": 0.8382, + "step": 15175 + }, + { + "epoch": 0.7569077306733167, + "grad_norm": 0.5651121055938538, + "learning_rate": 7.359492441318242e-06, + "loss": 0.8775, + "step": 15176 + }, + { + "epoch": 0.7569576059850374, + "grad_norm": 0.59837248744248, + "learning_rate": 7.356631064149156e-06, + "loss": 0.8266, + "step": 15177 + }, + { + "epoch": 0.7570074812967581, + "grad_norm": 0.5011107062575141, + "learning_rate": 7.353770147375885e-06, + "loss": 0.8859, + "step": 15178 + }, + { + "epoch": 0.7570573566084788, + "grad_norm": 0.5372170542057899, + "learning_rate": 7.350909691073079e-06, + "loss": 0.8296, + "step": 15179 + }, + { + "epoch": 0.7571072319201995, + "grad_norm": 0.5472059943901286, + "learning_rate": 7.348049695315401e-06, + "loss": 0.8619, + "step": 15180 + }, + { + "epoch": 0.7571571072319202, + "grad_norm": 0.8336912658221973, + "learning_rate": 7.345190160177448e-06, + "loss": 0.9135, + "step": 15181 + }, + { + "epoch": 0.7572069825436409, + "grad_norm": 0.5694296804707745, + "learning_rate": 7.342331085733867e-06, + "loss": 0.8516, + "step": 15182 + }, + { + "epoch": 0.7572568578553616, + "grad_norm": 0.6816876160664846, + "learning_rate": 7.339472472059245e-06, + "loss": 0.7901, + "step": 15183 + }, + { + "epoch": 0.7573067331670823, + "grad_norm": 0.4820435429191817, + "learning_rate": 7.336614319228194e-06, + "loss": 0.8012, + "step": 15184 + }, + { + "epoch": 0.757356608478803, + "grad_norm": 0.5229095782404097, + "learning_rate": 7.3337566273152854e-06, + "loss": 0.8841, + "step": 15185 + }, + { + "epoch": 0.7574064837905237, + "grad_norm": 0.45662547231273865, + "learning_rate": 7.330899396395094e-06, + "loss": 0.8471, + "step": 15186 + }, + { + "epoch": 0.7574563591022444, + "grad_norm": 0.5044272286678484, + "learning_rate": 7.328042626542164e-06, + "loss": 0.8144, + "step": 15187 + }, + { + "epoch": 0.7575062344139651, + "grad_norm": 2.1868949394440347, + "learning_rate": 7.325186317831062e-06, + "loss": 0.8326, + "step": 15188 + }, + { + "epoch": 0.7575561097256858, + "grad_norm": 0.6489102849515672, + "learning_rate": 7.3223304703363135e-06, + "loss": 0.8184, + "step": 15189 + }, + { + "epoch": 0.7576059850374065, + "grad_norm": 0.5278436512401492, + "learning_rate": 7.31947508413244e-06, + "loss": 0.8703, + "step": 15190 + }, + { + "epoch": 0.7576558603491271, + "grad_norm": 0.479200322305584, + "learning_rate": 7.31662015929395e-06, + "loss": 0.8471, + "step": 15191 + }, + { + "epoch": 0.7577057356608479, + "grad_norm": 0.5762835381855499, + "learning_rate": 7.313765695895336e-06, + "loss": 0.8579, + "step": 15192 + }, + { + "epoch": 0.7577556109725686, + "grad_norm": 0.49855532217677945, + "learning_rate": 7.3109116940110995e-06, + "loss": 0.8508, + "step": 15193 + }, + { + "epoch": 0.7578054862842892, + "grad_norm": 0.5955110473430385, + "learning_rate": 7.308058153715705e-06, + "loss": 0.8197, + "step": 15194 + }, + { + "epoch": 0.75785536159601, + "grad_norm": 0.5284224403767243, + "learning_rate": 7.3052050750836155e-06, + "loss": 0.8724, + "step": 15195 + }, + { + "epoch": 0.7579052369077307, + "grad_norm": 0.5371340058126807, + "learning_rate": 7.302352458189271e-06, + "loss": 0.8878, + "step": 15196 + }, + { + "epoch": 0.7579551122194513, + "grad_norm": 0.5102272738770031, + "learning_rate": 7.299500303107126e-06, + "loss": 0.8046, + "step": 15197 + }, + { + "epoch": 0.7580049875311721, + "grad_norm": 0.6260341515773632, + "learning_rate": 7.296648609911594e-06, + "loss": 0.8492, + "step": 15198 + }, + { + "epoch": 0.7580548628428928, + "grad_norm": 0.6800062186013147, + "learning_rate": 7.293797378677109e-06, + "loss": 0.838, + "step": 15199 + }, + { + "epoch": 0.7581047381546134, + "grad_norm": 0.5351415556414276, + "learning_rate": 7.290946609478039e-06, + "loss": 0.8407, + "step": 15200 + }, + { + "epoch": 0.7581546134663342, + "grad_norm": 0.4627743797673174, + "learning_rate": 7.2880963023888e-06, + "loss": 0.8186, + "step": 15201 + }, + { + "epoch": 0.7582044887780549, + "grad_norm": 0.5634267269833934, + "learning_rate": 7.285246457483763e-06, + "loss": 0.8468, + "step": 15202 + }, + { + "epoch": 0.7582543640897755, + "grad_norm": 0.5907917493047685, + "learning_rate": 7.282397074837288e-06, + "loss": 0.8786, + "step": 15203 + }, + { + "epoch": 0.7583042394014963, + "grad_norm": 0.5779354680866251, + "learning_rate": 7.279548154523727e-06, + "loss": 0.83, + "step": 15204 + }, + { + "epoch": 0.758354114713217, + "grad_norm": 0.5633588808622265, + "learning_rate": 7.2766996966174335e-06, + "loss": 0.8609, + "step": 15205 + }, + { + "epoch": 0.7584039900249376, + "grad_norm": 0.49034261697837733, + "learning_rate": 7.27385170119273e-06, + "loss": 0.8574, + "step": 15206 + }, + { + "epoch": 0.7584538653366584, + "grad_norm": 0.5039203188377573, + "learning_rate": 7.2710041683239336e-06, + "loss": 0.8513, + "step": 15207 + }, + { + "epoch": 0.758503740648379, + "grad_norm": 0.4902428309674143, + "learning_rate": 7.26815709808534e-06, + "loss": 0.8097, + "step": 15208 + }, + { + "epoch": 0.7585536159600997, + "grad_norm": 0.5234313159763635, + "learning_rate": 7.265310490551261e-06, + "loss": 0.8539, + "step": 15209 + }, + { + "epoch": 0.7586034912718205, + "grad_norm": 0.529996013816173, + "learning_rate": 7.262464345795966e-06, + "loss": 0.8477, + "step": 15210 + }, + { + "epoch": 0.7586533665835411, + "grad_norm": 0.4343057831215975, + "learning_rate": 7.259618663893724e-06, + "loss": 0.8912, + "step": 15211 + }, + { + "epoch": 0.7587032418952618, + "grad_norm": 0.5214700114610444, + "learning_rate": 7.256773444918788e-06, + "loss": 0.8444, + "step": 15212 + }, + { + "epoch": 0.7587531172069826, + "grad_norm": 0.49765393392754004, + "learning_rate": 7.253928688945424e-06, + "loss": 0.8507, + "step": 15213 + }, + { + "epoch": 0.7588029925187032, + "grad_norm": 0.5939767040117672, + "learning_rate": 7.251084396047833e-06, + "loss": 0.8343, + "step": 15214 + }, + { + "epoch": 0.7588528678304239, + "grad_norm": 0.5713290786448784, + "learning_rate": 7.248240566300257e-06, + "loss": 0.8553, + "step": 15215 + }, + { + "epoch": 0.7589027431421447, + "grad_norm": 0.5631777273891135, + "learning_rate": 7.245397199776893e-06, + "loss": 0.8456, + "step": 15216 + }, + { + "epoch": 0.7589526184538653, + "grad_norm": 0.5129494065306105, + "learning_rate": 7.2425542965519565e-06, + "loss": 0.8616, + "step": 15217 + }, + { + "epoch": 0.759002493765586, + "grad_norm": 0.5173257742440763, + "learning_rate": 7.2397118566996045e-06, + "loss": 0.8803, + "step": 15218 + }, + { + "epoch": 0.7590523690773068, + "grad_norm": 0.4303934999614384, + "learning_rate": 7.236869880294031e-06, + "loss": 0.8204, + "step": 15219 + }, + { + "epoch": 0.7591022443890274, + "grad_norm": 0.45093310594813246, + "learning_rate": 7.234028367409382e-06, + "loss": 0.8017, + "step": 15220 + }, + { + "epoch": 0.7591521197007481, + "grad_norm": 0.49806029383949363, + "learning_rate": 7.231187318119825e-06, + "loss": 0.8123, + "step": 15221 + }, + { + "epoch": 0.7592019950124689, + "grad_norm": 0.4610064874182372, + "learning_rate": 7.2283467324994695e-06, + "loss": 0.8179, + "step": 15222 + }, + { + "epoch": 0.7592518703241895, + "grad_norm": 0.5607337854625979, + "learning_rate": 7.225506610622456e-06, + "loss": 0.8477, + "step": 15223 + }, + { + "epoch": 0.7593017456359102, + "grad_norm": 1.5597048619209597, + "learning_rate": 7.222666952562887e-06, + "loss": 0.8836, + "step": 15224 + }, + { + "epoch": 0.7593516209476309, + "grad_norm": 0.5093021336071208, + "learning_rate": 7.219827758394878e-06, + "loss": 0.8365, + "step": 15225 + }, + { + "epoch": 0.7594014962593516, + "grad_norm": 0.47533821546400795, + "learning_rate": 7.216989028192503e-06, + "loss": 0.8472, + "step": 15226 + }, + { + "epoch": 0.7594513715710723, + "grad_norm": 0.5416081020308794, + "learning_rate": 7.214150762029842e-06, + "loss": 0.8357, + "step": 15227 + }, + { + "epoch": 0.759501246882793, + "grad_norm": 0.47360748657882, + "learning_rate": 7.2113129599809565e-06, + "loss": 0.8766, + "step": 15228 + }, + { + "epoch": 0.7595511221945137, + "grad_norm": 0.580258510264097, + "learning_rate": 7.208475622119892e-06, + "loss": 0.8586, + "step": 15229 + }, + { + "epoch": 0.7596009975062344, + "grad_norm": 0.5802828658507974, + "learning_rate": 7.205638748520699e-06, + "loss": 0.8342, + "step": 15230 + }, + { + "epoch": 0.7596508728179551, + "grad_norm": 0.4847891569633611, + "learning_rate": 7.2028023392574e-06, + "loss": 0.8018, + "step": 15231 + }, + { + "epoch": 0.7597007481296758, + "grad_norm": 0.4938183483152357, + "learning_rate": 7.199966394404009e-06, + "loss": 0.8485, + "step": 15232 + }, + { + "epoch": 0.7597506234413965, + "grad_norm": 0.5748773301475546, + "learning_rate": 7.197130914034522e-06, + "loss": 0.9085, + "step": 15233 + }, + { + "epoch": 0.7598004987531172, + "grad_norm": 0.46341014829471816, + "learning_rate": 7.194295898222944e-06, + "loss": 0.8658, + "step": 15234 + }, + { + "epoch": 0.7598503740648379, + "grad_norm": 0.8176270596804319, + "learning_rate": 7.191461347043238e-06, + "loss": 0.8432, + "step": 15235 + }, + { + "epoch": 0.7599002493765586, + "grad_norm": 0.5952719216487786, + "learning_rate": 7.188627260569392e-06, + "loss": 0.8475, + "step": 15236 + }, + { + "epoch": 0.7599501246882793, + "grad_norm": 0.4806285601535295, + "learning_rate": 7.185793638875332e-06, + "loss": 0.8279, + "step": 15237 + }, + { + "epoch": 0.76, + "grad_norm": 0.4942303573745577, + "learning_rate": 7.18296048203502e-06, + "loss": 0.8416, + "step": 15238 + }, + { + "epoch": 0.7600498753117207, + "grad_norm": 0.44722579243817384, + "learning_rate": 7.180127790122385e-06, + "loss": 0.8231, + "step": 15239 + }, + { + "epoch": 0.7600997506234414, + "grad_norm": 0.5575222028763688, + "learning_rate": 7.177295563211336e-06, + "loss": 0.8182, + "step": 15240 + }, + { + "epoch": 0.7601496259351621, + "grad_norm": 0.4935453198036846, + "learning_rate": 7.174463801375778e-06, + "loss": 0.9134, + "step": 15241 + }, + { + "epoch": 0.7601995012468828, + "grad_norm": 3.2481488429302647, + "learning_rate": 7.171632504689613e-06, + "loss": 0.8496, + "step": 15242 + }, + { + "epoch": 0.7602493765586035, + "grad_norm": 0.6146573936050966, + "learning_rate": 7.168801673226721e-06, + "loss": 0.8715, + "step": 15243 + }, + { + "epoch": 0.7602992518703242, + "grad_norm": 0.4965221979796655, + "learning_rate": 7.16597130706097e-06, + "loss": 0.8532, + "step": 15244 + }, + { + "epoch": 0.7603491271820448, + "grad_norm": 0.5185754969232422, + "learning_rate": 7.163141406266208e-06, + "loss": 0.8572, + "step": 15245 + }, + { + "epoch": 0.7603990024937656, + "grad_norm": 0.4491436837989004, + "learning_rate": 7.160311970916297e-06, + "loss": 0.8371, + "step": 15246 + }, + { + "epoch": 0.7604488778054863, + "grad_norm": 0.7954030994769036, + "learning_rate": 7.157483001085061e-06, + "loss": 0.895, + "step": 15247 + }, + { + "epoch": 0.7604987531172069, + "grad_norm": 1.4420792457041107, + "learning_rate": 7.154654496846319e-06, + "loss": 0.8537, + "step": 15248 + }, + { + "epoch": 0.7605486284289277, + "grad_norm": 0.5471516607705046, + "learning_rate": 7.1518264582738745e-06, + "loss": 0.8849, + "step": 15249 + }, + { + "epoch": 0.7605985037406484, + "grad_norm": 0.6109603853715809, + "learning_rate": 7.148998885441546e-06, + "loss": 0.8495, + "step": 15250 + }, + { + "epoch": 0.760648379052369, + "grad_norm": 0.6663000433239528, + "learning_rate": 7.146171778423089e-06, + "loss": 0.8899, + "step": 15251 + }, + { + "epoch": 0.7606982543640898, + "grad_norm": 0.6005081848501528, + "learning_rate": 7.143345137292295e-06, + "loss": 0.8368, + "step": 15252 + }, + { + "epoch": 0.7607481296758105, + "grad_norm": 0.5109497236011413, + "learning_rate": 7.140518962122911e-06, + "loss": 0.8528, + "step": 15253 + }, + { + "epoch": 0.7607980049875311, + "grad_norm": 0.6134948031891587, + "learning_rate": 7.137693252988706e-06, + "loss": 0.853, + "step": 15254 + }, + { + "epoch": 0.7608478802992519, + "grad_norm": 0.6534768773504797, + "learning_rate": 7.1348680099633886e-06, + "loss": 0.9049, + "step": 15255 + }, + { + "epoch": 0.7608977556109726, + "grad_norm": 0.4838382254658813, + "learning_rate": 7.1320432331207e-06, + "loss": 0.8188, + "step": 15256 + }, + { + "epoch": 0.7609476309226932, + "grad_norm": 0.5018132800216061, + "learning_rate": 7.129218922534342e-06, + "loss": 0.8703, + "step": 15257 + }, + { + "epoch": 0.760997506234414, + "grad_norm": 0.5514542705343359, + "learning_rate": 7.126395078278031e-06, + "loss": 0.8924, + "step": 15258 + }, + { + "epoch": 0.7610473815461347, + "grad_norm": 0.4538862889198884, + "learning_rate": 7.1235717004254275e-06, + "loss": 0.8585, + "step": 15259 + }, + { + "epoch": 0.7610972568578553, + "grad_norm": 1.023019768494039, + "learning_rate": 7.120748789050224e-06, + "loss": 0.8436, + "step": 15260 + }, + { + "epoch": 0.7611471321695761, + "grad_norm": 0.5006223265915964, + "learning_rate": 7.117926344226083e-06, + "loss": 0.8334, + "step": 15261 + }, + { + "epoch": 0.7611970074812967, + "grad_norm": 0.6233587305589877, + "learning_rate": 7.115104366026642e-06, + "loss": 0.8449, + "step": 15262 + }, + { + "epoch": 0.7612468827930174, + "grad_norm": 0.5225326550183688, + "learning_rate": 7.112282854525556e-06, + "loss": 0.831, + "step": 15263 + }, + { + "epoch": 0.7612967581047382, + "grad_norm": 0.5155179518107235, + "learning_rate": 7.109461809796444e-06, + "loss": 0.8537, + "step": 15264 + }, + { + "epoch": 0.7613466334164588, + "grad_norm": 0.8621661920551552, + "learning_rate": 7.1066412319129194e-06, + "loss": 0.8787, + "step": 15265 + }, + { + "epoch": 0.7613965087281795, + "grad_norm": 0.8918133245723059, + "learning_rate": 7.103821120948575e-06, + "loss": 0.8451, + "step": 15266 + }, + { + "epoch": 0.7614463840399003, + "grad_norm": 0.6675975431250604, + "learning_rate": 7.1010014769770204e-06, + "loss": 0.8169, + "step": 15267 + }, + { + "epoch": 0.7614962593516209, + "grad_norm": 1.3746796737298352, + "learning_rate": 7.098182300071818e-06, + "loss": 0.8134, + "step": 15268 + }, + { + "epoch": 0.7615461346633416, + "grad_norm": 0.5285195005654155, + "learning_rate": 7.095363590306536e-06, + "loss": 0.8526, + "step": 15269 + }, + { + "epoch": 0.7615960099750624, + "grad_norm": 0.6555488332984938, + "learning_rate": 7.092545347754723e-06, + "loss": 0.8492, + "step": 15270 + }, + { + "epoch": 0.761645885286783, + "grad_norm": 0.47131786389767466, + "learning_rate": 7.089727572489932e-06, + "loss": 0.8901, + "step": 15271 + }, + { + "epoch": 0.7616957605985037, + "grad_norm": 1.3063169680891369, + "learning_rate": 7.086910264585677e-06, + "loss": 0.8827, + "step": 15272 + }, + { + "epoch": 0.7617456359102245, + "grad_norm": 0.6108279263618211, + "learning_rate": 7.084093424115496e-06, + "loss": 0.8594, + "step": 15273 + }, + { + "epoch": 0.7617955112219451, + "grad_norm": 0.48917653424508434, + "learning_rate": 7.081277051152865e-06, + "loss": 0.792, + "step": 15274 + }, + { + "epoch": 0.7618453865336658, + "grad_norm": 0.48232082510045177, + "learning_rate": 7.0784611457712956e-06, + "loss": 0.8009, + "step": 15275 + }, + { + "epoch": 0.7618952618453866, + "grad_norm": 0.47270907068774143, + "learning_rate": 7.075645708044262e-06, + "loss": 0.8382, + "step": 15276 + }, + { + "epoch": 0.7619451371571072, + "grad_norm": 0.503765043941609, + "learning_rate": 7.072830738045233e-06, + "loss": 0.8034, + "step": 15277 + }, + { + "epoch": 0.761995012468828, + "grad_norm": 0.526665539031848, + "learning_rate": 7.070016235847657e-06, + "loss": 0.8872, + "step": 15278 + }, + { + "epoch": 0.7620448877805486, + "grad_norm": 0.6841419450322875, + "learning_rate": 7.067202201524989e-06, + "loss": 0.8528, + "step": 15279 + }, + { + "epoch": 0.7620947630922693, + "grad_norm": 0.5205876204763071, + "learning_rate": 7.064388635150651e-06, + "loss": 0.7619, + "step": 15280 + }, + { + "epoch": 0.76214463840399, + "grad_norm": 0.40293835857461024, + "learning_rate": 7.061575536798068e-06, + "loss": 0.7952, + "step": 15281 + }, + { + "epoch": 0.7621945137157107, + "grad_norm": 0.5014328515020132, + "learning_rate": 7.058762906540633e-06, + "loss": 0.8251, + "step": 15282 + }, + { + "epoch": 0.7622443890274314, + "grad_norm": 0.4886807556712801, + "learning_rate": 7.0559507444517615e-06, + "loss": 0.8769, + "step": 15283 + }, + { + "epoch": 0.7622942643391522, + "grad_norm": 0.4809495552010852, + "learning_rate": 7.053139050604823e-06, + "loss": 0.8103, + "step": 15284 + }, + { + "epoch": 0.7623441396508728, + "grad_norm": 0.5361105898417456, + "learning_rate": 7.050327825073188e-06, + "loss": 0.867, + "step": 15285 + }, + { + "epoch": 0.7623940149625935, + "grad_norm": 0.4959381460975948, + "learning_rate": 7.047517067930209e-06, + "loss": 0.8003, + "step": 15286 + }, + { + "epoch": 0.7624438902743143, + "grad_norm": 0.47488071448691427, + "learning_rate": 7.044706779249252e-06, + "loss": 0.8076, + "step": 15287 + }, + { + "epoch": 0.7624937655860349, + "grad_norm": 0.6884669078090907, + "learning_rate": 7.041896959103622e-06, + "loss": 0.8261, + "step": 15288 + }, + { + "epoch": 0.7625436408977556, + "grad_norm": 0.4639279317376237, + "learning_rate": 7.0390876075666605e-06, + "loss": 0.8358, + "step": 15289 + }, + { + "epoch": 0.7625935162094764, + "grad_norm": 0.4162560267790399, + "learning_rate": 7.036278724711662e-06, + "loss": 0.8261, + "step": 15290 + }, + { + "epoch": 0.762643391521197, + "grad_norm": 0.6168036975189468, + "learning_rate": 7.033470310611945e-06, + "loss": 0.7733, + "step": 15291 + }, + { + "epoch": 0.7626932668329177, + "grad_norm": 0.6369992052027583, + "learning_rate": 7.030662365340765e-06, + "loss": 0.8562, + "step": 15292 + }, + { + "epoch": 0.7627431421446385, + "grad_norm": 0.48505325960284507, + "learning_rate": 7.027854888971416e-06, + "loss": 0.8583, + "step": 15293 + }, + { + "epoch": 0.7627930174563591, + "grad_norm": 1.0536167077937242, + "learning_rate": 7.025047881577143e-06, + "loss": 0.8193, + "step": 15294 + }, + { + "epoch": 0.7628428927680798, + "grad_norm": 0.541338972392143, + "learning_rate": 7.022241343231217e-06, + "loss": 0.8482, + "step": 15295 + }, + { + "epoch": 0.7628927680798006, + "grad_norm": 0.49774654713637045, + "learning_rate": 7.019435274006841e-06, + "loss": 0.8599, + "step": 15296 + }, + { + "epoch": 0.7629426433915212, + "grad_norm": 0.5536004548108314, + "learning_rate": 7.016629673977263e-06, + "loss": 0.8586, + "step": 15297 + }, + { + "epoch": 0.7629925187032419, + "grad_norm": 0.6202181279194217, + "learning_rate": 7.013824543215683e-06, + "loss": 0.8507, + "step": 15298 + }, + { + "epoch": 0.7630423940149625, + "grad_norm": 0.6476772064737084, + "learning_rate": 7.011019881795297e-06, + "loss": 0.8218, + "step": 15299 + }, + { + "epoch": 0.7630922693266833, + "grad_norm": 0.6114473263394886, + "learning_rate": 7.008215689789302e-06, + "loss": 0.8454, + "step": 15300 + }, + { + "epoch": 0.763142144638404, + "grad_norm": 1.514060013984838, + "learning_rate": 7.0054119672708665e-06, + "loss": 0.8288, + "step": 15301 + }, + { + "epoch": 0.7631920199501246, + "grad_norm": 0.4469856922160569, + "learning_rate": 7.0026087143131554e-06, + "loss": 0.8297, + "step": 15302 + }, + { + "epoch": 0.7632418952618454, + "grad_norm": 0.5290773610097207, + "learning_rate": 6.999805930989306e-06, + "loss": 0.8626, + "step": 15303 + }, + { + "epoch": 0.7632917705735661, + "grad_norm": 0.5478380508220687, + "learning_rate": 6.997003617372472e-06, + "loss": 0.8333, + "step": 15304 + }, + { + "epoch": 0.7633416458852867, + "grad_norm": 0.6648761521904689, + "learning_rate": 6.9942017735357715e-06, + "loss": 0.8595, + "step": 15305 + }, + { + "epoch": 0.7633915211970075, + "grad_norm": 0.9776672817841305, + "learning_rate": 6.991400399552317e-06, + "loss": 0.8584, + "step": 15306 + }, + { + "epoch": 0.7634413965087282, + "grad_norm": 0.5345252626571922, + "learning_rate": 6.988599495495204e-06, + "loss": 0.841, + "step": 15307 + }, + { + "epoch": 0.7634912718204488, + "grad_norm": 0.7211076179370438, + "learning_rate": 6.9857990614375315e-06, + "loss": 0.8133, + "step": 15308 + }, + { + "epoch": 0.7635411471321696, + "grad_norm": 0.5935339597224163, + "learning_rate": 6.982999097452373e-06, + "loss": 0.8242, + "step": 15309 + }, + { + "epoch": 0.7635910224438903, + "grad_norm": 0.5359998037853141, + "learning_rate": 6.9801996036127886e-06, + "loss": 0.8504, + "step": 15310 + }, + { + "epoch": 0.763640897755611, + "grad_norm": 0.5398641971922675, + "learning_rate": 6.977400579991822e-06, + "loss": 0.8416, + "step": 15311 + }, + { + "epoch": 0.7636907730673317, + "grad_norm": 0.7825238879952104, + "learning_rate": 6.97460202666253e-06, + "loss": 0.8243, + "step": 15312 + }, + { + "epoch": 0.7637406483790524, + "grad_norm": 0.5828197116420443, + "learning_rate": 6.97180394369793e-06, + "loss": 0.8354, + "step": 15313 + }, + { + "epoch": 0.763790523690773, + "grad_norm": 0.5694304852937014, + "learning_rate": 6.96900633117104e-06, + "loss": 0.8396, + "step": 15314 + }, + { + "epoch": 0.7638403990024938, + "grad_norm": 0.4333433791944535, + "learning_rate": 6.966209189154852e-06, + "loss": 0.8403, + "step": 15315 + }, + { + "epoch": 0.7638902743142144, + "grad_norm": 0.5156213870922709, + "learning_rate": 6.9634125177223705e-06, + "loss": 0.852, + "step": 15316 + }, + { + "epoch": 0.7639401496259351, + "grad_norm": 0.7474164771858358, + "learning_rate": 6.960616316946566e-06, + "loss": 0.8455, + "step": 15317 + }, + { + "epoch": 0.7639900249376559, + "grad_norm": 0.8224590430306027, + "learning_rate": 6.957820586900407e-06, + "loss": 0.8323, + "step": 15318 + }, + { + "epoch": 0.7640399002493765, + "grad_norm": 0.4867364034945876, + "learning_rate": 6.955025327656839e-06, + "loss": 0.858, + "step": 15319 + }, + { + "epoch": 0.7640897755610973, + "grad_norm": 0.4709427649277797, + "learning_rate": 6.952230539288823e-06, + "loss": 0.8984, + "step": 15320 + }, + { + "epoch": 0.764139650872818, + "grad_norm": 0.7344342163518107, + "learning_rate": 6.94943622186926e-06, + "loss": 0.8355, + "step": 15321 + }, + { + "epoch": 0.7641895261845386, + "grad_norm": 0.5269037483233403, + "learning_rate": 6.946642375471086e-06, + "loss": 0.8649, + "step": 15322 + }, + { + "epoch": 0.7642394014962594, + "grad_norm": 0.5475264474465242, + "learning_rate": 6.9438490001671935e-06, + "loss": 0.8705, + "step": 15323 + }, + { + "epoch": 0.7642892768079801, + "grad_norm": 0.5307273810801172, + "learning_rate": 6.941056096030496e-06, + "loss": 0.9001, + "step": 15324 + }, + { + "epoch": 0.7643391521197007, + "grad_norm": 0.6229269936564058, + "learning_rate": 6.93826366313384e-06, + "loss": 0.8789, + "step": 15325 + }, + { + "epoch": 0.7643890274314215, + "grad_norm": 0.5751134450836852, + "learning_rate": 6.93547170155012e-06, + "loss": 0.8484, + "step": 15326 + }, + { + "epoch": 0.7644389027431422, + "grad_norm": 0.5736783480735915, + "learning_rate": 6.932680211352172e-06, + "loss": 0.8139, + "step": 15327 + }, + { + "epoch": 0.7644887780548628, + "grad_norm": 0.5258677523712111, + "learning_rate": 6.929889192612865e-06, + "loss": 0.8587, + "step": 15328 + }, + { + "epoch": 0.7645386533665836, + "grad_norm": 0.5183474938689937, + "learning_rate": 6.927098645404997e-06, + "loss": 0.8237, + "step": 15329 + }, + { + "epoch": 0.7645885286783043, + "grad_norm": 0.6665798139217118, + "learning_rate": 6.9243085698014075e-06, + "loss": 0.8489, + "step": 15330 + }, + { + "epoch": 0.7646384039900249, + "grad_norm": 0.5610548952820775, + "learning_rate": 6.92151896587489e-06, + "loss": 0.8695, + "step": 15331 + }, + { + "epoch": 0.7646882793017457, + "grad_norm": 0.8184300060536812, + "learning_rate": 6.918729833698259e-06, + "loss": 0.8385, + "step": 15332 + }, + { + "epoch": 0.7647381546134663, + "grad_norm": 0.5497110289300088, + "learning_rate": 6.9159411733442675e-06, + "loss": 0.8508, + "step": 15333 + }, + { + "epoch": 0.764788029925187, + "grad_norm": 0.5335714882429338, + "learning_rate": 6.9131529848857015e-06, + "loss": 0.8447, + "step": 15334 + }, + { + "epoch": 0.7648379052369078, + "grad_norm": 0.4671956058367607, + "learning_rate": 6.910365268395316e-06, + "loss": 0.841, + "step": 15335 + }, + { + "epoch": 0.7648877805486284, + "grad_norm": 0.5810569581142964, + "learning_rate": 6.907578023945854e-06, + "loss": 0.8137, + "step": 15336 + }, + { + "epoch": 0.7649376558603491, + "grad_norm": 0.49080316259078904, + "learning_rate": 6.9047912516100374e-06, + "loss": 0.8693, + "step": 15337 + }, + { + "epoch": 0.7649875311720699, + "grad_norm": 0.9201183024515015, + "learning_rate": 6.902004951460603e-06, + "loss": 0.8842, + "step": 15338 + }, + { + "epoch": 0.7650374064837905, + "grad_norm": 0.5128265605400089, + "learning_rate": 6.89921912357025e-06, + "loss": 0.8454, + "step": 15339 + }, + { + "epoch": 0.7650872817955112, + "grad_norm": 0.6331123519369036, + "learning_rate": 6.896433768011662e-06, + "loss": 0.9006, + "step": 15340 + }, + { + "epoch": 0.765137157107232, + "grad_norm": 0.41660609749960403, + "learning_rate": 6.893648884857546e-06, + "loss": 0.8608, + "step": 15341 + }, + { + "epoch": 0.7651870324189526, + "grad_norm": 0.4551253699506048, + "learning_rate": 6.890864474180555e-06, + "loss": 0.8267, + "step": 15342 + }, + { + "epoch": 0.7652369077306733, + "grad_norm": 0.5478733166848142, + "learning_rate": 6.888080536053351e-06, + "loss": 0.8531, + "step": 15343 + }, + { + "epoch": 0.7652867830423941, + "grad_norm": 0.4257288160000078, + "learning_rate": 6.885297070548574e-06, + "loss": 0.8746, + "step": 15344 + }, + { + "epoch": 0.7653366583541147, + "grad_norm": 0.4771993503567481, + "learning_rate": 6.882514077738869e-06, + "loss": 0.8575, + "step": 15345 + }, + { + "epoch": 0.7653865336658354, + "grad_norm": 0.543314766993007, + "learning_rate": 6.879731557696853e-06, + "loss": 0.881, + "step": 15346 + }, + { + "epoch": 0.7654364089775562, + "grad_norm": 0.5697521932152753, + "learning_rate": 6.8769495104951284e-06, + "loss": 0.8226, + "step": 15347 + }, + { + "epoch": 0.7654862842892768, + "grad_norm": 0.9112283317263173, + "learning_rate": 6.874167936206291e-06, + "loss": 0.8595, + "step": 15348 + }, + { + "epoch": 0.7655361596009975, + "grad_norm": 0.8180198655867373, + "learning_rate": 6.871386834902937e-06, + "loss": 0.8126, + "step": 15349 + }, + { + "epoch": 0.7655860349127181, + "grad_norm": 0.5360939727779672, + "learning_rate": 6.868606206657629e-06, + "loss": 0.8298, + "step": 15350 + }, + { + "epoch": 0.7656359102244389, + "grad_norm": 0.4630123753763712, + "learning_rate": 6.865826051542929e-06, + "loss": 0.8012, + "step": 15351 + }, + { + "epoch": 0.7656857855361596, + "grad_norm": 0.5593339912667117, + "learning_rate": 6.8630463696313734e-06, + "loss": 0.8367, + "step": 15352 + }, + { + "epoch": 0.7657356608478803, + "grad_norm": 0.4601026600902663, + "learning_rate": 6.860267160995512e-06, + "loss": 0.8301, + "step": 15353 + }, + { + "epoch": 0.765785536159601, + "grad_norm": 0.4962588941858299, + "learning_rate": 6.857488425707864e-06, + "loss": 0.8571, + "step": 15354 + }, + { + "epoch": 0.7658354114713217, + "grad_norm": 0.5706773641924421, + "learning_rate": 6.854710163840933e-06, + "loss": 0.8596, + "step": 15355 + }, + { + "epoch": 0.7658852867830424, + "grad_norm": 0.48913798420881016, + "learning_rate": 6.851932375467213e-06, + "loss": 0.8525, + "step": 15356 + }, + { + "epoch": 0.7659351620947631, + "grad_norm": 0.6550626603482699, + "learning_rate": 6.849155060659212e-06, + "loss": 0.8547, + "step": 15357 + }, + { + "epoch": 0.7659850374064838, + "grad_norm": 0.6150037098533168, + "learning_rate": 6.846378219489369e-06, + "loss": 0.8625, + "step": 15358 + }, + { + "epoch": 0.7660349127182045, + "grad_norm": 0.5268152060208104, + "learning_rate": 6.843601852030171e-06, + "loss": 0.8755, + "step": 15359 + }, + { + "epoch": 0.7660847880299252, + "grad_norm": 0.5506693729990138, + "learning_rate": 6.84082595835405e-06, + "loss": 0.8668, + "step": 15360 + }, + { + "epoch": 0.7661346633416459, + "grad_norm": 0.5453772819881275, + "learning_rate": 6.838050538533464e-06, + "loss": 0.8633, + "step": 15361 + }, + { + "epoch": 0.7661845386533666, + "grad_norm": 0.5536514111419284, + "learning_rate": 6.8352755926408024e-06, + "loss": 0.8771, + "step": 15362 + }, + { + "epoch": 0.7662344139650873, + "grad_norm": 0.5779234022981644, + "learning_rate": 6.832501120748505e-06, + "loss": 0.8556, + "step": 15363 + }, + { + "epoch": 0.766284289276808, + "grad_norm": 0.46693415841806446, + "learning_rate": 6.829727122928953e-06, + "loss": 0.8216, + "step": 15364 + }, + { + "epoch": 0.7663341645885287, + "grad_norm": 0.5866193613402237, + "learning_rate": 6.8269535992545545e-06, + "loss": 0.8603, + "step": 15365 + }, + { + "epoch": 0.7663840399002494, + "grad_norm": 0.5295466582147291, + "learning_rate": 6.824180549797654e-06, + "loss": 0.857, + "step": 15366 + }, + { + "epoch": 0.7664339152119701, + "grad_norm": 0.5843195968056683, + "learning_rate": 6.821407974630634e-06, + "loss": 0.835, + "step": 15367 + }, + { + "epoch": 0.7664837905236908, + "grad_norm": 0.4825539605072279, + "learning_rate": 6.818635873825838e-06, + "loss": 0.8655, + "step": 15368 + }, + { + "epoch": 0.7665336658354115, + "grad_norm": 0.6421085218828034, + "learning_rate": 6.8158642474556e-06, + "loss": 0.8945, + "step": 15369 + }, + { + "epoch": 0.7665835411471321, + "grad_norm": 0.47606203314327533, + "learning_rate": 6.8130930955922404e-06, + "loss": 0.8798, + "step": 15370 + }, + { + "epoch": 0.7666334164588529, + "grad_norm": 0.554424874445929, + "learning_rate": 6.810322418308085e-06, + "loss": 0.8365, + "step": 15371 + }, + { + "epoch": 0.7666832917705736, + "grad_norm": 0.6655494904743839, + "learning_rate": 6.807552215675425e-06, + "loss": 0.8254, + "step": 15372 + }, + { + "epoch": 0.7667331670822942, + "grad_norm": 0.5407092469808226, + "learning_rate": 6.804782487766548e-06, + "loss": 0.8494, + "step": 15373 + }, + { + "epoch": 0.766783042394015, + "grad_norm": 0.5361148926313367, + "learning_rate": 6.80201323465372e-06, + "loss": 0.8492, + "step": 15374 + }, + { + "epoch": 0.7668329177057357, + "grad_norm": 0.622326065729396, + "learning_rate": 6.799244456409223e-06, + "loss": 0.8286, + "step": 15375 + }, + { + "epoch": 0.7668827930174563, + "grad_norm": 0.5611922382970553, + "learning_rate": 6.796476153105294e-06, + "loss": 0.8804, + "step": 15376 + }, + { + "epoch": 0.7669326683291771, + "grad_norm": 0.46988217392969583, + "learning_rate": 6.7937083248141655e-06, + "loss": 0.8641, + "step": 15377 + }, + { + "epoch": 0.7669825436408978, + "grad_norm": 0.4840307109828791, + "learning_rate": 6.790940971608078e-06, + "loss": 0.8076, + "step": 15378 + }, + { + "epoch": 0.7670324189526184, + "grad_norm": 0.4832589087055041, + "learning_rate": 6.788174093559238e-06, + "loss": 0.8425, + "step": 15379 + }, + { + "epoch": 0.7670822942643392, + "grad_norm": 0.4359988902203014, + "learning_rate": 6.785407690739843e-06, + "loss": 0.8325, + "step": 15380 + }, + { + "epoch": 0.7671321695760599, + "grad_norm": 0.6380580007472317, + "learning_rate": 6.782641763222075e-06, + "loss": 0.8423, + "step": 15381 + }, + { + "epoch": 0.7671820448877805, + "grad_norm": 0.5151388392622691, + "learning_rate": 6.779876311078123e-06, + "loss": 0.8987, + "step": 15382 + }, + { + "epoch": 0.7672319201995013, + "grad_norm": 0.41257999634929, + "learning_rate": 6.777111334380148e-06, + "loss": 0.7783, + "step": 15383 + }, + { + "epoch": 0.767281795511222, + "grad_norm": 1.5759985426889127, + "learning_rate": 6.774346833200296e-06, + "loss": 0.8935, + "step": 15384 + }, + { + "epoch": 0.7673316708229426, + "grad_norm": 0.4742229611478089, + "learning_rate": 6.771582807610696e-06, + "loss": 0.7983, + "step": 15385 + }, + { + "epoch": 0.7673815461346634, + "grad_norm": 0.4669459016952488, + "learning_rate": 6.768819257683496e-06, + "loss": 0.8422, + "step": 15386 + }, + { + "epoch": 0.767431421446384, + "grad_norm": 0.660276492385996, + "learning_rate": 6.766056183490799e-06, + "loss": 0.8465, + "step": 15387 + }, + { + "epoch": 0.7674812967581047, + "grad_norm": 0.5134648786396381, + "learning_rate": 6.763293585104702e-06, + "loss": 0.8454, + "step": 15388 + }, + { + "epoch": 0.7675311720698255, + "grad_norm": 0.465425642622404, + "learning_rate": 6.760531462597292e-06, + "loss": 0.8576, + "step": 15389 + }, + { + "epoch": 0.7675810473815461, + "grad_norm": 0.5893107773156138, + "learning_rate": 6.757769816040657e-06, + "loss": 0.8708, + "step": 15390 + }, + { + "epoch": 0.7676309226932668, + "grad_norm": 0.46113057703397764, + "learning_rate": 6.755008645506855e-06, + "loss": 0.8418, + "step": 15391 + }, + { + "epoch": 0.7676807980049876, + "grad_norm": 0.8028003001490348, + "learning_rate": 6.752247951067936e-06, + "loss": 0.8602, + "step": 15392 + }, + { + "epoch": 0.7677306733167082, + "grad_norm": 0.4489749103705095, + "learning_rate": 6.7494877327959335e-06, + "loss": 0.8241, + "step": 15393 + }, + { + "epoch": 0.7677805486284289, + "grad_norm": 0.46152869172572103, + "learning_rate": 6.7467279907628966e-06, + "loss": 0.8624, + "step": 15394 + }, + { + "epoch": 0.7678304239401497, + "grad_norm": 0.6738712311182932, + "learning_rate": 6.743968725040806e-06, + "loss": 0.846, + "step": 15395 + }, + { + "epoch": 0.7678802992518703, + "grad_norm": 0.5267561407537174, + "learning_rate": 6.741209935701692e-06, + "loss": 0.8396, + "step": 15396 + }, + { + "epoch": 0.767930174563591, + "grad_norm": 2.7624554945348145, + "learning_rate": 6.738451622817524e-06, + "loss": 0.8194, + "step": 15397 + }, + { + "epoch": 0.7679800498753118, + "grad_norm": 0.4881187631312017, + "learning_rate": 6.735693786460301e-06, + "loss": 0.8593, + "step": 15398 + }, + { + "epoch": 0.7680299251870324, + "grad_norm": 0.4925271867194313, + "learning_rate": 6.7329364267019635e-06, + "loss": 0.8555, + "step": 15399 + }, + { + "epoch": 0.7680798004987531, + "grad_norm": 0.5123854209817438, + "learning_rate": 6.730179543614479e-06, + "loss": 0.8357, + "step": 15400 + }, + { + "epoch": 0.7681296758104739, + "grad_norm": 0.6113574185859617, + "learning_rate": 6.727423137269778e-06, + "loss": 0.8639, + "step": 15401 + }, + { + "epoch": 0.7681795511221945, + "grad_norm": 0.574671879167773, + "learning_rate": 6.724667207739807e-06, + "loss": 0.8049, + "step": 15402 + }, + { + "epoch": 0.7682294264339152, + "grad_norm": 0.5591203675192895, + "learning_rate": 6.7219117550964495e-06, + "loss": 0.8112, + "step": 15403 + }, + { + "epoch": 0.7682793017456359, + "grad_norm": 0.5015263822084381, + "learning_rate": 6.719156779411634e-06, + "loss": 0.8096, + "step": 15404 + }, + { + "epoch": 0.7683291770573566, + "grad_norm": 0.5173214111783598, + "learning_rate": 6.716402280757239e-06, + "loss": 0.8561, + "step": 15405 + }, + { + "epoch": 0.7683790523690773, + "grad_norm": 0.8070432080630576, + "learning_rate": 6.713648259205144e-06, + "loss": 0.841, + "step": 15406 + }, + { + "epoch": 0.768428927680798, + "grad_norm": 0.6328767224256964, + "learning_rate": 6.710894714827207e-06, + "loss": 0.8339, + "step": 15407 + }, + { + "epoch": 0.7684788029925187, + "grad_norm": 0.7532727725563386, + "learning_rate": 6.708141647695296e-06, + "loss": 0.8422, + "step": 15408 + }, + { + "epoch": 0.7685286783042394, + "grad_norm": 0.524145446065081, + "learning_rate": 6.705389057881239e-06, + "loss": 0.9397, + "step": 15409 + }, + { + "epoch": 0.7685785536159601, + "grad_norm": 0.7287720341258558, + "learning_rate": 6.70263694545687e-06, + "loss": 0.8146, + "step": 15410 + }, + { + "epoch": 0.7686284289276808, + "grad_norm": 0.5186414785447421, + "learning_rate": 6.699885310493992e-06, + "loss": 0.8218, + "step": 15411 + }, + { + "epoch": 0.7686783042394015, + "grad_norm": 0.8431684439584843, + "learning_rate": 6.697134153064427e-06, + "loss": 0.8183, + "step": 15412 + }, + { + "epoch": 0.7687281795511222, + "grad_norm": 0.778019121943247, + "learning_rate": 6.694383473239951e-06, + "loss": 0.7855, + "step": 15413 + }, + { + "epoch": 0.7687780548628429, + "grad_norm": 0.6002301131610883, + "learning_rate": 6.69163327109234e-06, + "loss": 0.8214, + "step": 15414 + }, + { + "epoch": 0.7688279301745636, + "grad_norm": 0.516790794426326, + "learning_rate": 6.6888835466933725e-06, + "loss": 0.8209, + "step": 15415 + }, + { + "epoch": 0.7688778054862843, + "grad_norm": 0.5681697092529754, + "learning_rate": 6.686134300114796e-06, + "loss": 0.8285, + "step": 15416 + }, + { + "epoch": 0.768927680798005, + "grad_norm": 0.7835582878629267, + "learning_rate": 6.683385531428346e-06, + "loss": 0.8453, + "step": 15417 + }, + { + "epoch": 0.7689775561097257, + "grad_norm": 0.46910287125326705, + "learning_rate": 6.680637240705745e-06, + "loss": 0.8714, + "step": 15418 + }, + { + "epoch": 0.7690274314214464, + "grad_norm": 0.4676887842570228, + "learning_rate": 6.677889428018727e-06, + "loss": 0.8168, + "step": 15419 + }, + { + "epoch": 0.7690773067331671, + "grad_norm": 0.5133129602345036, + "learning_rate": 6.675142093438982e-06, + "loss": 0.8309, + "step": 15420 + }, + { + "epoch": 0.7691271820448878, + "grad_norm": 0.5793483929067543, + "learning_rate": 6.672395237038204e-06, + "loss": 0.7959, + "step": 15421 + }, + { + "epoch": 0.7691770573566085, + "grad_norm": 0.5883526152037403, + "learning_rate": 6.669648858888064e-06, + "loss": 0.8541, + "step": 15422 + }, + { + "epoch": 0.7692269326683292, + "grad_norm": 0.4824585117648637, + "learning_rate": 6.666902959060237e-06, + "loss": 0.867, + "step": 15423 + }, + { + "epoch": 0.7692768079800498, + "grad_norm": 0.4989761695759528, + "learning_rate": 6.664157537626375e-06, + "loss": 0.8717, + "step": 15424 + }, + { + "epoch": 0.7693266832917706, + "grad_norm": 0.5128137803132818, + "learning_rate": 6.6614125946581175e-06, + "loss": 0.8245, + "step": 15425 + }, + { + "epoch": 0.7693765586034913, + "grad_norm": 0.5358715588447134, + "learning_rate": 6.658668130227081e-06, + "loss": 0.789, + "step": 15426 + }, + { + "epoch": 0.7694264339152119, + "grad_norm": 0.5171740063544296, + "learning_rate": 6.655924144404907e-06, + "loss": 0.8782, + "step": 15427 + }, + { + "epoch": 0.7694763092269327, + "grad_norm": 0.7129131747610932, + "learning_rate": 6.653180637263168e-06, + "loss": 0.8986, + "step": 15428 + }, + { + "epoch": 0.7695261845386534, + "grad_norm": 0.521739160325565, + "learning_rate": 6.650437608873475e-06, + "loss": 0.8193, + "step": 15429 + }, + { + "epoch": 0.769576059850374, + "grad_norm": 0.4678859043323383, + "learning_rate": 6.647695059307393e-06, + "loss": 0.8519, + "step": 15430 + }, + { + "epoch": 0.7696259351620948, + "grad_norm": 0.5730065146428599, + "learning_rate": 6.644952988636513e-06, + "loss": 0.8604, + "step": 15431 + }, + { + "epoch": 0.7696758104738155, + "grad_norm": 0.7143033489763378, + "learning_rate": 6.642211396932352e-06, + "loss": 0.8226, + "step": 15432 + }, + { + "epoch": 0.7697256857855361, + "grad_norm": 0.518634735677739, + "learning_rate": 6.639470284266477e-06, + "loss": 0.8216, + "step": 15433 + }, + { + "epoch": 0.7697755610972569, + "grad_norm": 0.572867278435272, + "learning_rate": 6.636729650710402e-06, + "loss": 0.8879, + "step": 15434 + }, + { + "epoch": 0.7698254364089776, + "grad_norm": 0.9281765904932293, + "learning_rate": 6.6339894963356634e-06, + "loss": 0.8108, + "step": 15435 + }, + { + "epoch": 0.7698753117206982, + "grad_norm": 0.44175864766972417, + "learning_rate": 6.631249821213734e-06, + "loss": 0.8935, + "step": 15436 + }, + { + "epoch": 0.769925187032419, + "grad_norm": 0.5446385127957656, + "learning_rate": 6.628510625416126e-06, + "loss": 0.8698, + "step": 15437 + }, + { + "epoch": 0.7699750623441397, + "grad_norm": 0.5509028064590094, + "learning_rate": 6.625771909014303e-06, + "loss": 0.8361, + "step": 15438 + }, + { + "epoch": 0.7700249376558603, + "grad_norm": 0.4807185086876855, + "learning_rate": 6.623033672079757e-06, + "loss": 0.8138, + "step": 15439 + }, + { + "epoch": 0.7700748129675811, + "grad_norm": 0.5025892473635647, + "learning_rate": 6.620295914683905e-06, + "loss": 0.8256, + "step": 15440 + }, + { + "epoch": 0.7701246882793017, + "grad_norm": 0.5362691433445301, + "learning_rate": 6.6175586368982135e-06, + "loss": 0.8436, + "step": 15441 + }, + { + "epoch": 0.7701745635910224, + "grad_norm": 0.954783721486218, + "learning_rate": 6.614821838794102e-06, + "loss": 0.8454, + "step": 15442 + }, + { + "epoch": 0.7702244389027432, + "grad_norm": 0.6167036240044417, + "learning_rate": 6.612085520442987e-06, + "loss": 0.8991, + "step": 15443 + }, + { + "epoch": 0.7702743142144638, + "grad_norm": 0.701783470251166, + "learning_rate": 6.609349681916266e-06, + "loss": 0.8582, + "step": 15444 + }, + { + "epoch": 0.7703241895261845, + "grad_norm": 0.6080162794182925, + "learning_rate": 6.60661432328534e-06, + "loss": 0.8467, + "step": 15445 + }, + { + "epoch": 0.7703740648379053, + "grad_norm": 0.618204882928886, + "learning_rate": 6.60387944462158e-06, + "loss": 0.8739, + "step": 15446 + }, + { + "epoch": 0.7704239401496259, + "grad_norm": 0.536264580904916, + "learning_rate": 6.601145045996354e-06, + "loss": 0.8584, + "step": 15447 + }, + { + "epoch": 0.7704738154613466, + "grad_norm": 0.5567313728273984, + "learning_rate": 6.598411127481008e-06, + "loss": 0.8529, + "step": 15448 + }, + { + "epoch": 0.7705236907730674, + "grad_norm": 0.46187343866413716, + "learning_rate": 6.5956776891468925e-06, + "loss": 0.8468, + "step": 15449 + }, + { + "epoch": 0.770573566084788, + "grad_norm": 0.5414080560020081, + "learning_rate": 6.592944731065334e-06, + "loss": 0.8762, + "step": 15450 + }, + { + "epoch": 0.7706234413965087, + "grad_norm": 0.5662327279411977, + "learning_rate": 6.590212253307643e-06, + "loss": 0.8323, + "step": 15451 + }, + { + "epoch": 0.7706733167082295, + "grad_norm": 0.5025158407609729, + "learning_rate": 6.587480255945117e-06, + "loss": 0.8576, + "step": 15452 + }, + { + "epoch": 0.7707231920199501, + "grad_norm": 0.6349093915823051, + "learning_rate": 6.58474873904906e-06, + "loss": 0.8384, + "step": 15453 + }, + { + "epoch": 0.7707730673316708, + "grad_norm": 0.969839672523072, + "learning_rate": 6.582017702690743e-06, + "loss": 0.8869, + "step": 15454 + }, + { + "epoch": 0.7708229426433916, + "grad_norm": 0.47956868952959897, + "learning_rate": 6.579287146941427e-06, + "loss": 0.86, + "step": 15455 + }, + { + "epoch": 0.7708728179551122, + "grad_norm": 0.5848229276538137, + "learning_rate": 6.576557071872372e-06, + "loss": 0.7971, + "step": 15456 + }, + { + "epoch": 0.7709226932668329, + "grad_norm": 0.5296962048324357, + "learning_rate": 6.573827477554817e-06, + "loss": 0.8408, + "step": 15457 + }, + { + "epoch": 0.7709725685785536, + "grad_norm": 0.4790927708986124, + "learning_rate": 6.571098364059989e-06, + "loss": 0.8097, + "step": 15458 + }, + { + "epoch": 0.7710224438902743, + "grad_norm": 0.5241738707996769, + "learning_rate": 6.568369731459093e-06, + "loss": 0.8579, + "step": 15459 + }, + { + "epoch": 0.771072319201995, + "grad_norm": 0.5555011762111874, + "learning_rate": 6.565641579823345e-06, + "loss": 0.8829, + "step": 15460 + }, + { + "epoch": 0.7711221945137157, + "grad_norm": 0.47742974309286446, + "learning_rate": 6.562913909223931e-06, + "loss": 0.8242, + "step": 15461 + }, + { + "epoch": 0.7711720698254364, + "grad_norm": 0.5333031492037522, + "learning_rate": 6.560186719732028e-06, + "loss": 0.8799, + "step": 15462 + }, + { + "epoch": 0.7712219451371571, + "grad_norm": 0.5519145170200536, + "learning_rate": 6.557460011418792e-06, + "loss": 0.8407, + "step": 15463 + }, + { + "epoch": 0.7712718204488778, + "grad_norm": 0.4984631053652349, + "learning_rate": 6.554733784355399e-06, + "loss": 0.856, + "step": 15464 + }, + { + "epoch": 0.7713216957605985, + "grad_norm": 0.7878087352605871, + "learning_rate": 6.552008038612959e-06, + "loss": 0.8589, + "step": 15465 + }, + { + "epoch": 0.7713715710723192, + "grad_norm": 0.599409224210062, + "learning_rate": 6.549282774262619e-06, + "loss": 0.8616, + "step": 15466 + }, + { + "epoch": 0.7714214463840399, + "grad_norm": 0.4751704430191011, + "learning_rate": 6.5465579913754794e-06, + "loss": 0.8405, + "step": 15467 + }, + { + "epoch": 0.7714713216957606, + "grad_norm": 1.3934474357209246, + "learning_rate": 6.543833690022666e-06, + "loss": 0.8002, + "step": 15468 + }, + { + "epoch": 0.7715211970074813, + "grad_norm": 0.5435599505729328, + "learning_rate": 6.541109870275236e-06, + "loss": 0.8807, + "step": 15469 + }, + { + "epoch": 0.771571072319202, + "grad_norm": 0.5049479058525281, + "learning_rate": 6.538386532204294e-06, + "loss": 0.828, + "step": 15470 + }, + { + "epoch": 0.7716209476309227, + "grad_norm": 0.4941949252485109, + "learning_rate": 6.535663675880882e-06, + "loss": 0.8283, + "step": 15471 + }, + { + "epoch": 0.7716708229426434, + "grad_norm": 0.5377987182586992, + "learning_rate": 6.5329413013760805e-06, + "loss": 0.8256, + "step": 15472 + }, + { + "epoch": 0.7717206982543641, + "grad_norm": 0.6433752697802012, + "learning_rate": 6.530219408760893e-06, + "loss": 0.8708, + "step": 15473 + }, + { + "epoch": 0.7717705735660848, + "grad_norm": 0.6633392521388038, + "learning_rate": 6.527497998106374e-06, + "loss": 0.8107, + "step": 15474 + }, + { + "epoch": 0.7718204488778054, + "grad_norm": 0.5857272747033128, + "learning_rate": 6.524777069483526e-06, + "loss": 0.8079, + "step": 15475 + }, + { + "epoch": 0.7718703241895262, + "grad_norm": 0.6075443678386836, + "learning_rate": 6.522056622963352e-06, + "loss": 0.8064, + "step": 15476 + }, + { + "epoch": 0.7719201995012469, + "grad_norm": 0.504641716352545, + "learning_rate": 6.519336658616834e-06, + "loss": 0.8712, + "step": 15477 + }, + { + "epoch": 0.7719700748129675, + "grad_norm": 0.5485512901505257, + "learning_rate": 6.516617176514964e-06, + "loss": 0.8554, + "step": 15478 + }, + { + "epoch": 0.7720199501246883, + "grad_norm": 0.6113896120896191, + "learning_rate": 6.5138981767286946e-06, + "loss": 0.869, + "step": 15479 + }, + { + "epoch": 0.772069825436409, + "grad_norm": 0.534611976111679, + "learning_rate": 6.511179659328978e-06, + "loss": 0.8524, + "step": 15480 + }, + { + "epoch": 0.7721197007481296, + "grad_norm": 0.5001017390389882, + "learning_rate": 6.5084616243867495e-06, + "loss": 0.8522, + "step": 15481 + }, + { + "epoch": 0.7721695760598504, + "grad_norm": 0.4807231347713975, + "learning_rate": 6.505744071972944e-06, + "loss": 0.8358, + "step": 15482 + }, + { + "epoch": 0.7722194513715711, + "grad_norm": 0.7641411782125818, + "learning_rate": 6.503027002158471e-06, + "loss": 0.8469, + "step": 15483 + }, + { + "epoch": 0.7722693266832917, + "grad_norm": 0.5221448089792803, + "learning_rate": 6.50031041501423e-06, + "loss": 0.8289, + "step": 15484 + }, + { + "epoch": 0.7723192019950125, + "grad_norm": 0.5392582968030707, + "learning_rate": 6.497594310611102e-06, + "loss": 0.8777, + "step": 15485 + }, + { + "epoch": 0.7723690773067332, + "grad_norm": 0.47516569721463936, + "learning_rate": 6.4948786890199785e-06, + "loss": 0.8286, + "step": 15486 + }, + { + "epoch": 0.7724189526184538, + "grad_norm": 0.4765942955318122, + "learning_rate": 6.492163550311711e-06, + "loss": 0.8476, + "step": 15487 + }, + { + "epoch": 0.7724688279301746, + "grad_norm": 0.46758036742646564, + "learning_rate": 6.489448894557154e-06, + "loss": 0.8354, + "step": 15488 + }, + { + "epoch": 0.7725187032418953, + "grad_norm": 0.5217639811922528, + "learning_rate": 6.486734721827137e-06, + "loss": 0.8243, + "step": 15489 + }, + { + "epoch": 0.7725685785536159, + "grad_norm": 0.5180898051501892, + "learning_rate": 6.4840210321925e-06, + "loss": 0.8477, + "step": 15490 + }, + { + "epoch": 0.7726184538653367, + "grad_norm": 0.7861936423332161, + "learning_rate": 6.4813078257240455e-06, + "loss": 0.7678, + "step": 15491 + }, + { + "epoch": 0.7726683291770574, + "grad_norm": 0.5130469288032777, + "learning_rate": 6.478595102492571e-06, + "loss": 0.8565, + "step": 15492 + }, + { + "epoch": 0.772718204488778, + "grad_norm": 0.5100754681015081, + "learning_rate": 6.4758828625688716e-06, + "loss": 0.8569, + "step": 15493 + }, + { + "epoch": 0.7727680798004988, + "grad_norm": 0.5500594595468729, + "learning_rate": 6.4731711060237204e-06, + "loss": 0.8454, + "step": 15494 + }, + { + "epoch": 0.7728179551122194, + "grad_norm": 0.5004733241108669, + "learning_rate": 6.470459832927881e-06, + "loss": 0.8168, + "step": 15495 + }, + { + "epoch": 0.7728678304239401, + "grad_norm": 0.585605539178705, + "learning_rate": 6.467749043352086e-06, + "loss": 0.8391, + "step": 15496 + }, + { + "epoch": 0.7729177057356609, + "grad_norm": 0.4359959139725793, + "learning_rate": 6.465038737367097e-06, + "loss": 0.8392, + "step": 15497 + }, + { + "epoch": 0.7729675810473815, + "grad_norm": 0.46707856398910513, + "learning_rate": 6.462328915043628e-06, + "loss": 0.8273, + "step": 15498 + }, + { + "epoch": 0.7730174563591022, + "grad_norm": 0.5640543738538987, + "learning_rate": 6.459619576452386e-06, + "loss": 0.8337, + "step": 15499 + }, + { + "epoch": 0.773067331670823, + "grad_norm": 0.9109353708078783, + "learning_rate": 6.45691072166407e-06, + "loss": 0.8209, + "step": 15500 + }, + { + "epoch": 0.7731172069825436, + "grad_norm": 0.7805231361930315, + "learning_rate": 6.4542023507493804e-06, + "loss": 0.8427, + "step": 15501 + }, + { + "epoch": 0.7731670822942643, + "grad_norm": 0.6004417225749317, + "learning_rate": 6.451494463778968e-06, + "loss": 0.8528, + "step": 15502 + }, + { + "epoch": 0.7732169576059851, + "grad_norm": 0.5832611208549628, + "learning_rate": 6.4487870608235134e-06, + "loss": 0.8125, + "step": 15503 + }, + { + "epoch": 0.7732668329177057, + "grad_norm": 0.5742432523035323, + "learning_rate": 6.4460801419536505e-06, + "loss": 0.8456, + "step": 15504 + }, + { + "epoch": 0.7733167082294264, + "grad_norm": 0.49783230518874044, + "learning_rate": 6.4433737072400386e-06, + "loss": 0.8318, + "step": 15505 + }, + { + "epoch": 0.7733665835411472, + "grad_norm": 0.44330827730905487, + "learning_rate": 6.4406677567532685e-06, + "loss": 0.8167, + "step": 15506 + }, + { + "epoch": 0.7734164588528678, + "grad_norm": 0.5740060855947384, + "learning_rate": 6.437962290563976e-06, + "loss": 0.8928, + "step": 15507 + }, + { + "epoch": 0.7734663341645885, + "grad_norm": 0.5940650400484605, + "learning_rate": 6.435257308742743e-06, + "loss": 0.8481, + "step": 15508 + }, + { + "epoch": 0.7735162094763093, + "grad_norm": 0.7815634772504042, + "learning_rate": 6.432552811360176e-06, + "loss": 0.8721, + "step": 15509 + }, + { + "epoch": 0.7735660847880299, + "grad_norm": 0.46536012066874943, + "learning_rate": 6.429848798486818e-06, + "loss": 0.89, + "step": 15510 + }, + { + "epoch": 0.7736159600997506, + "grad_norm": 0.5635330631019092, + "learning_rate": 6.427145270193255e-06, + "loss": 0.8863, + "step": 15511 + }, + { + "epoch": 0.7736658354114713, + "grad_norm": 0.5161109714874689, + "learning_rate": 6.42444222655002e-06, + "loss": 0.8577, + "step": 15512 + }, + { + "epoch": 0.773715710723192, + "grad_norm": 0.4949198612050334, + "learning_rate": 6.421739667627655e-06, + "loss": 0.8063, + "step": 15513 + }, + { + "epoch": 0.7737655860349127, + "grad_norm": 0.6431253699958971, + "learning_rate": 6.4190375934966715e-06, + "loss": 0.8418, + "step": 15514 + }, + { + "epoch": 0.7738154613466334, + "grad_norm": 0.6405266494280013, + "learning_rate": 6.4163360042275914e-06, + "loss": 0.8427, + "step": 15515 + }, + { + "epoch": 0.7738653366583541, + "grad_norm": 0.4988477842761167, + "learning_rate": 6.413634899890908e-06, + "loss": 0.809, + "step": 15516 + }, + { + "epoch": 0.7739152119700748, + "grad_norm": 0.5511952529574554, + "learning_rate": 6.410934280557102e-06, + "loss": 0.868, + "step": 15517 + }, + { + "epoch": 0.7739650872817955, + "grad_norm": 0.4723504676684223, + "learning_rate": 6.40823414629664e-06, + "loss": 0.8416, + "step": 15518 + }, + { + "epoch": 0.7740149625935162, + "grad_norm": 0.5872236772105932, + "learning_rate": 6.405534497179996e-06, + "loss": 0.8767, + "step": 15519 + }, + { + "epoch": 0.774064837905237, + "grad_norm": 0.5222360612855246, + "learning_rate": 6.402835333277607e-06, + "loss": 0.8483, + "step": 15520 + }, + { + "epoch": 0.7741147132169576, + "grad_norm": 0.49427941925497876, + "learning_rate": 6.400136654659905e-06, + "loss": 0.8504, + "step": 15521 + }, + { + "epoch": 0.7741645885286783, + "grad_norm": 0.660357984790476, + "learning_rate": 6.39743846139731e-06, + "loss": 0.874, + "step": 15522 + }, + { + "epoch": 0.774214463840399, + "grad_norm": 0.4862132661230328, + "learning_rate": 6.394740753560235e-06, + "loss": 0.8839, + "step": 15523 + }, + { + "epoch": 0.7742643391521197, + "grad_norm": 0.44289321721211317, + "learning_rate": 6.392043531219077e-06, + "loss": 0.8728, + "step": 15524 + }, + { + "epoch": 0.7743142144638404, + "grad_norm": 0.5631079388753419, + "learning_rate": 6.389346794444212e-06, + "loss": 0.8976, + "step": 15525 + }, + { + "epoch": 0.7743640897755611, + "grad_norm": 0.5038996227230671, + "learning_rate": 6.3866505433060085e-06, + "loss": 0.8444, + "step": 15526 + }, + { + "epoch": 0.7744139650872818, + "grad_norm": 0.5227717053141452, + "learning_rate": 6.383954777874837e-06, + "loss": 0.8076, + "step": 15527 + }, + { + "epoch": 0.7744638403990025, + "grad_norm": 0.7133704710952273, + "learning_rate": 6.381259498221032e-06, + "loss": 0.8744, + "step": 15528 + }, + { + "epoch": 0.7745137157107231, + "grad_norm": 0.5288639179642786, + "learning_rate": 6.378564704414922e-06, + "loss": 0.9026, + "step": 15529 + }, + { + "epoch": 0.7745635910224439, + "grad_norm": 0.6821464971307859, + "learning_rate": 6.375870396526837e-06, + "loss": 0.8181, + "step": 15530 + }, + { + "epoch": 0.7746134663341646, + "grad_norm": 0.7823461914468648, + "learning_rate": 6.373176574627079e-06, + "loss": 0.8295, + "step": 15531 + }, + { + "epoch": 0.7746633416458852, + "grad_norm": 0.8631567716970302, + "learning_rate": 6.3704832387859456e-06, + "loss": 0.8316, + "step": 15532 + }, + { + "epoch": 0.774713216957606, + "grad_norm": 0.5364397435370121, + "learning_rate": 6.3677903890737054e-06, + "loss": 0.8241, + "step": 15533 + }, + { + "epoch": 0.7747630922693267, + "grad_norm": 0.6976372138996109, + "learning_rate": 6.3650980255606505e-06, + "loss": 0.9083, + "step": 15534 + }, + { + "epoch": 0.7748129675810473, + "grad_norm": 0.543468074344226, + "learning_rate": 6.362406148317007e-06, + "loss": 0.8457, + "step": 15535 + }, + { + "epoch": 0.7748628428927681, + "grad_norm": 0.5342755084184766, + "learning_rate": 6.359714757413043e-06, + "loss": 0.866, + "step": 15536 + }, + { + "epoch": 0.7749127182044888, + "grad_norm": 0.6179886071932038, + "learning_rate": 6.357023852918972e-06, + "loss": 0.8551, + "step": 15537 + }, + { + "epoch": 0.7749625935162094, + "grad_norm": 0.5836682380341127, + "learning_rate": 6.3543334349050355e-06, + "loss": 0.8476, + "step": 15538 + }, + { + "epoch": 0.7750124688279302, + "grad_norm": 0.5188873098093786, + "learning_rate": 6.3516435034414094e-06, + "loss": 0.8554, + "step": 15539 + }, + { + "epoch": 0.7750623441396509, + "grad_norm": 0.48671310896199876, + "learning_rate": 6.348954058598306e-06, + "loss": 0.8521, + "step": 15540 + }, + { + "epoch": 0.7751122194513715, + "grad_norm": 0.5356754509826882, + "learning_rate": 6.3462651004458914e-06, + "loss": 0.8325, + "step": 15541 + }, + { + "epoch": 0.7751620947630923, + "grad_norm": 0.5435270727305296, + "learning_rate": 6.343576629054357e-06, + "loss": 0.8046, + "step": 15542 + }, + { + "epoch": 0.775211970074813, + "grad_norm": 0.552588469936173, + "learning_rate": 6.340888644493826e-06, + "loss": 0.8238, + "step": 15543 + }, + { + "epoch": 0.7752618453865336, + "grad_norm": 0.7402556827856196, + "learning_rate": 6.338201146834461e-06, + "loss": 0.8353, + "step": 15544 + }, + { + "epoch": 0.7753117206982544, + "grad_norm": 0.6320312880404555, + "learning_rate": 6.335514136146387e-06, + "loss": 0.8807, + "step": 15545 + }, + { + "epoch": 0.775361596009975, + "grad_norm": 0.4564832895079566, + "learning_rate": 6.332827612499717e-06, + "loss": 0.8263, + "step": 15546 + }, + { + "epoch": 0.7754114713216957, + "grad_norm": 0.6205705631939344, + "learning_rate": 6.330141575964549e-06, + "loss": 0.8907, + "step": 15547 + }, + { + "epoch": 0.7754613466334165, + "grad_norm": 0.49166815008076264, + "learning_rate": 6.32745602661099e-06, + "loss": 0.8531, + "step": 15548 + }, + { + "epoch": 0.7755112219451371, + "grad_norm": 0.6613282960433083, + "learning_rate": 6.324770964509108e-06, + "loss": 0.8541, + "step": 15549 + }, + { + "epoch": 0.7755610972568578, + "grad_norm": 0.6270999661266702, + "learning_rate": 6.322086389728971e-06, + "loss": 0.8322, + "step": 15550 + }, + { + "epoch": 0.7756109725685786, + "grad_norm": 0.5868406495356289, + "learning_rate": 6.319402302340621e-06, + "loss": 0.8467, + "step": 15551 + }, + { + "epoch": 0.7756608478802992, + "grad_norm": 0.4969273348204655, + "learning_rate": 6.316718702414115e-06, + "loss": 0.8562, + "step": 15552 + }, + { + "epoch": 0.77571072319202, + "grad_norm": 0.5345596207918225, + "learning_rate": 6.314035590019474e-06, + "loss": 0.8322, + "step": 15553 + }, + { + "epoch": 0.7757605985037407, + "grad_norm": 0.5184261354794978, + "learning_rate": 6.311352965226711e-06, + "loss": 0.8442, + "step": 15554 + }, + { + "epoch": 0.7758104738154613, + "grad_norm": 0.5426502444039656, + "learning_rate": 6.308670828105822e-06, + "loss": 0.8353, + "step": 15555 + }, + { + "epoch": 0.775860349127182, + "grad_norm": 0.5426275790463109, + "learning_rate": 6.305989178726809e-06, + "loss": 0.8666, + "step": 15556 + }, + { + "epoch": 0.7759102244389028, + "grad_norm": 0.512927372830293, + "learning_rate": 6.303308017159643e-06, + "loss": 0.8712, + "step": 15557 + }, + { + "epoch": 0.7759600997506234, + "grad_norm": 0.5230327425739835, + "learning_rate": 6.300627343474286e-06, + "loss": 0.8566, + "step": 15558 + }, + { + "epoch": 0.7760099750623441, + "grad_norm": 0.4903680700214619, + "learning_rate": 6.297947157740683e-06, + "loss": 0.8541, + "step": 15559 + }, + { + "epoch": 0.7760598503740649, + "grad_norm": 0.503132449639013, + "learning_rate": 6.295267460028786e-06, + "loss": 0.7957, + "step": 15560 + }, + { + "epoch": 0.7761097256857855, + "grad_norm": 0.5787040025477892, + "learning_rate": 6.292588250408513e-06, + "loss": 0.8704, + "step": 15561 + }, + { + "epoch": 0.7761596009975062, + "grad_norm": 0.7415434123453877, + "learning_rate": 6.2899095289497776e-06, + "loss": 0.8383, + "step": 15562 + }, + { + "epoch": 0.776209476309227, + "grad_norm": 0.5175394630799294, + "learning_rate": 6.28723129572247e-06, + "loss": 0.8154, + "step": 15563 + }, + { + "epoch": 0.7762593516209476, + "grad_norm": 0.5066406429066711, + "learning_rate": 6.284553550796498e-06, + "loss": 0.8393, + "step": 15564 + }, + { + "epoch": 0.7763092269326684, + "grad_norm": 0.5738589398725258, + "learning_rate": 6.2818762942417234e-06, + "loss": 0.8587, + "step": 15565 + }, + { + "epoch": 0.776359102244389, + "grad_norm": 0.5890712362835399, + "learning_rate": 6.27919952612801e-06, + "loss": 0.8636, + "step": 15566 + }, + { + "epoch": 0.7764089775561097, + "grad_norm": 0.55212820836385, + "learning_rate": 6.276523246525198e-06, + "loss": 0.8587, + "step": 15567 + }, + { + "epoch": 0.7764588528678305, + "grad_norm": 0.5598525549419685, + "learning_rate": 6.2738474555031425e-06, + "loss": 0.8046, + "step": 15568 + }, + { + "epoch": 0.7765087281795511, + "grad_norm": 0.5382507451837845, + "learning_rate": 6.271172153131655e-06, + "loss": 0.8433, + "step": 15569 + }, + { + "epoch": 0.7765586034912718, + "grad_norm": 0.4905440022361841, + "learning_rate": 6.268497339480542e-06, + "loss": 0.8453, + "step": 15570 + }, + { + "epoch": 0.7766084788029926, + "grad_norm": 0.6603492924131141, + "learning_rate": 6.265823014619621e-06, + "loss": 0.8604, + "step": 15571 + }, + { + "epoch": 0.7766583541147132, + "grad_norm": 0.5335143983394145, + "learning_rate": 6.263149178618652e-06, + "loss": 0.8699, + "step": 15572 + }, + { + "epoch": 0.7767082294264339, + "grad_norm": 1.4843549805389142, + "learning_rate": 6.260475831547424e-06, + "loss": 0.8174, + "step": 15573 + }, + { + "epoch": 0.7767581047381547, + "grad_norm": 0.5808832470299246, + "learning_rate": 6.2578029734756845e-06, + "loss": 0.8677, + "step": 15574 + }, + { + "epoch": 0.7768079800498753, + "grad_norm": 0.6058319801721154, + "learning_rate": 6.255130604473205e-06, + "loss": 0.865, + "step": 15575 + }, + { + "epoch": 0.776857855361596, + "grad_norm": 0.521537742196319, + "learning_rate": 6.252458724609691e-06, + "loss": 0.8253, + "step": 15576 + }, + { + "epoch": 0.7769077306733168, + "grad_norm": 0.47931691390538, + "learning_rate": 6.249787333954882e-06, + "loss": 0.8609, + "step": 15577 + }, + { + "epoch": 0.7769576059850374, + "grad_norm": 0.536481819970682, + "learning_rate": 6.24711643257847e-06, + "loss": 0.8518, + "step": 15578 + }, + { + "epoch": 0.7770074812967581, + "grad_norm": 0.6519731161370667, + "learning_rate": 6.244446020550182e-06, + "loss": 0.8471, + "step": 15579 + }, + { + "epoch": 0.7770573566084789, + "grad_norm": 0.6230376110055142, + "learning_rate": 6.241776097939664e-06, + "loss": 0.8559, + "step": 15580 + }, + { + "epoch": 0.7771072319201995, + "grad_norm": 0.5806279380546847, + "learning_rate": 6.239106664816611e-06, + "loss": 0.8483, + "step": 15581 + }, + { + "epoch": 0.7771571072319202, + "grad_norm": 0.5369350761454529, + "learning_rate": 6.2364377212506735e-06, + "loss": 0.8482, + "step": 15582 + }, + { + "epoch": 0.7772069825436408, + "grad_norm": 0.5302493688575298, + "learning_rate": 6.2337692673114955e-06, + "loss": 0.8728, + "step": 15583 + }, + { + "epoch": 0.7772568578553616, + "grad_norm": 0.5161713585837605, + "learning_rate": 6.231101303068703e-06, + "loss": 0.8139, + "step": 15584 + }, + { + "epoch": 0.7773067331670823, + "grad_norm": 0.4963231079795906, + "learning_rate": 6.2284338285919285e-06, + "loss": 0.8253, + "step": 15585 + }, + { + "epoch": 0.777356608478803, + "grad_norm": 0.5201042450764033, + "learning_rate": 6.225766843950772e-06, + "loss": 0.8422, + "step": 15586 + }, + { + "epoch": 0.7774064837905237, + "grad_norm": 0.4789701351410661, + "learning_rate": 6.2231003492148305e-06, + "loss": 0.8346, + "step": 15587 + }, + { + "epoch": 0.7774563591022444, + "grad_norm": 0.4472507394471027, + "learning_rate": 6.220434344453674e-06, + "loss": 0.82, + "step": 15588 + }, + { + "epoch": 0.777506234413965, + "grad_norm": 0.4527011657107388, + "learning_rate": 6.217768829736883e-06, + "loss": 0.8651, + "step": 15589 + }, + { + "epoch": 0.7775561097256858, + "grad_norm": 0.5743551156708178, + "learning_rate": 6.215103805134012e-06, + "loss": 0.8633, + "step": 15590 + }, + { + "epoch": 0.7776059850374065, + "grad_norm": 1.0812655068216943, + "learning_rate": 6.212439270714598e-06, + "loss": 0.8386, + "step": 15591 + }, + { + "epoch": 0.7776558603491271, + "grad_norm": 0.6087514980819813, + "learning_rate": 6.209775226548165e-06, + "loss": 0.8868, + "step": 15592 + }, + { + "epoch": 0.7777057356608479, + "grad_norm": 0.6629048354909951, + "learning_rate": 6.207111672704246e-06, + "loss": 0.8311, + "step": 15593 + }, + { + "epoch": 0.7777556109725686, + "grad_norm": 0.7205964049942476, + "learning_rate": 6.2044486092523395e-06, + "loss": 0.8307, + "step": 15594 + }, + { + "epoch": 0.7778054862842892, + "grad_norm": 0.5501799403782281, + "learning_rate": 6.201786036261933e-06, + "loss": 0.8703, + "step": 15595 + }, + { + "epoch": 0.77785536159601, + "grad_norm": 0.6387742543978588, + "learning_rate": 6.199123953802499e-06, + "loss": 0.8418, + "step": 15596 + }, + { + "epoch": 0.7779052369077307, + "grad_norm": 0.4817078320580092, + "learning_rate": 6.196462361943519e-06, + "loss": 0.8197, + "step": 15597 + }, + { + "epoch": 0.7779551122194514, + "grad_norm": 0.5380466141079503, + "learning_rate": 6.193801260754439e-06, + "loss": 0.8641, + "step": 15598 + }, + { + "epoch": 0.7780049875311721, + "grad_norm": 1.7857797735657377, + "learning_rate": 6.191140650304697e-06, + "loss": 0.8528, + "step": 15599 + }, + { + "epoch": 0.7780548628428927, + "grad_norm": 0.5297666908766764, + "learning_rate": 6.188480530663718e-06, + "loss": 0.8386, + "step": 15600 + }, + { + "epoch": 0.7781047381546135, + "grad_norm": 0.6758748062999841, + "learning_rate": 6.185820901900924e-06, + "loss": 0.8495, + "step": 15601 + }, + { + "epoch": 0.7781546134663342, + "grad_norm": 0.5121966432863869, + "learning_rate": 6.183161764085716e-06, + "loss": 0.8245, + "step": 15602 + }, + { + "epoch": 0.7782044887780548, + "grad_norm": 0.5555089831870926, + "learning_rate": 6.1805031172874785e-06, + "loss": 0.88, + "step": 15603 + }, + { + "epoch": 0.7782543640897756, + "grad_norm": 0.5224926629510477, + "learning_rate": 6.1778449615755855e-06, + "loss": 0.8432, + "step": 15604 + }, + { + "epoch": 0.7783042394014963, + "grad_norm": 0.6707301943347759, + "learning_rate": 6.175187297019408e-06, + "loss": 0.8273, + "step": 15605 + }, + { + "epoch": 0.7783541147132169, + "grad_norm": 0.5130511515621019, + "learning_rate": 6.172530123688295e-06, + "loss": 0.8668, + "step": 15606 + }, + { + "epoch": 0.7784039900249377, + "grad_norm": 0.5564701326505258, + "learning_rate": 6.169873441651575e-06, + "loss": 0.8647, + "step": 15607 + }, + { + "epoch": 0.7784538653366584, + "grad_norm": 0.6709659416249375, + "learning_rate": 6.167217250978596e-06, + "loss": 0.8174, + "step": 15608 + }, + { + "epoch": 0.778503740648379, + "grad_norm": 0.527967731167703, + "learning_rate": 6.1645615517386375e-06, + "loss": 0.8081, + "step": 15609 + }, + { + "epoch": 0.7785536159600998, + "grad_norm": 0.5504444627613135, + "learning_rate": 6.161906344001025e-06, + "loss": 0.8858, + "step": 15610 + }, + { + "epoch": 0.7786034912718205, + "grad_norm": 0.4880551975901294, + "learning_rate": 6.1592516278350275e-06, + "loss": 0.8597, + "step": 15611 + }, + { + "epoch": 0.7786533665835411, + "grad_norm": 0.6004469533633124, + "learning_rate": 6.156597403309941e-06, + "loss": 0.8674, + "step": 15612 + }, + { + "epoch": 0.7787032418952619, + "grad_norm": 0.49758328854911943, + "learning_rate": 6.153943670494997e-06, + "loss": 0.8631, + "step": 15613 + }, + { + "epoch": 0.7787531172069826, + "grad_norm": 0.5579428128633788, + "learning_rate": 6.151290429459466e-06, + "loss": 0.853, + "step": 15614 + }, + { + "epoch": 0.7788029925187032, + "grad_norm": 0.6034475035578158, + "learning_rate": 6.14863768027257e-06, + "loss": 0.8296, + "step": 15615 + }, + { + "epoch": 0.778852867830424, + "grad_norm": 0.576135939740563, + "learning_rate": 6.14598542300355e-06, + "loss": 0.828, + "step": 15616 + }, + { + "epoch": 0.7789027431421447, + "grad_norm": 0.7664318844990453, + "learning_rate": 6.14333365772159e-06, + "loss": 0.867, + "step": 15617 + }, + { + "epoch": 0.7789526184538653, + "grad_norm": 0.5602103191411445, + "learning_rate": 6.140682384495902e-06, + "loss": 0.827, + "step": 15618 + }, + { + "epoch": 0.7790024937655861, + "grad_norm": 0.48122853137854044, + "learning_rate": 6.138031603395672e-06, + "loss": 0.8179, + "step": 15619 + }, + { + "epoch": 0.7790523690773067, + "grad_norm": 0.5603049161438471, + "learning_rate": 6.135381314490063e-06, + "loss": 0.8371, + "step": 15620 + }, + { + "epoch": 0.7791022443890274, + "grad_norm": 0.5822979341368486, + "learning_rate": 6.132731517848228e-06, + "loss": 0.8257, + "step": 15621 + }, + { + "epoch": 0.7791521197007482, + "grad_norm": 0.685222734564003, + "learning_rate": 6.1300822135393285e-06, + "loss": 0.8736, + "step": 15622 + }, + { + "epoch": 0.7792019950124688, + "grad_norm": 0.6217355364177395, + "learning_rate": 6.1274334016324875e-06, + "loss": 0.8305, + "step": 15623 + }, + { + "epoch": 0.7792518703241895, + "grad_norm": 0.534361263595742, + "learning_rate": 6.124785082196827e-06, + "loss": 0.8129, + "step": 15624 + }, + { + "epoch": 0.7793017456359103, + "grad_norm": 0.5147177600805425, + "learning_rate": 6.122137255301444e-06, + "loss": 0.8538, + "step": 15625 + }, + { + "epoch": 0.7793516209476309, + "grad_norm": 0.5751094119333515, + "learning_rate": 6.1194899210154505e-06, + "loss": 0.8458, + "step": 15626 + }, + { + "epoch": 0.7794014962593516, + "grad_norm": 0.5198326120163352, + "learning_rate": 6.116843079407914e-06, + "loss": 0.849, + "step": 15627 + }, + { + "epoch": 0.7794513715710724, + "grad_norm": 0.6297185813621997, + "learning_rate": 6.114196730547908e-06, + "loss": 0.8388, + "step": 15628 + }, + { + "epoch": 0.779501246882793, + "grad_norm": 0.5044192065325901, + "learning_rate": 6.111550874504479e-06, + "loss": 0.8431, + "step": 15629 + }, + { + "epoch": 0.7795511221945137, + "grad_norm": 0.6501303722476376, + "learning_rate": 6.108905511346691e-06, + "loss": 0.827, + "step": 15630 + }, + { + "epoch": 0.7796009975062345, + "grad_norm": 0.5154829470383017, + "learning_rate": 6.106260641143546e-06, + "loss": 0.8034, + "step": 15631 + }, + { + "epoch": 0.7796508728179551, + "grad_norm": 0.522694898551367, + "learning_rate": 6.103616263964079e-06, + "loss": 0.868, + "step": 15632 + }, + { + "epoch": 0.7797007481296758, + "grad_norm": 0.6663649943086204, + "learning_rate": 6.100972379877282e-06, + "loss": 0.8474, + "step": 15633 + }, + { + "epoch": 0.7797506234413966, + "grad_norm": 0.5159415864073863, + "learning_rate": 6.098328988952159e-06, + "loss": 0.825, + "step": 15634 + }, + { + "epoch": 0.7798004987531172, + "grad_norm": 0.7047785258108463, + "learning_rate": 6.095686091257686e-06, + "loss": 0.8633, + "step": 15635 + }, + { + "epoch": 0.7798503740648379, + "grad_norm": 0.47981544115956043, + "learning_rate": 6.09304368686282e-06, + "loss": 0.8322, + "step": 15636 + }, + { + "epoch": 0.7799002493765586, + "grad_norm": 0.5381548216836398, + "learning_rate": 6.09040177583651e-06, + "loss": 0.8111, + "step": 15637 + }, + { + "epoch": 0.7799501246882793, + "grad_norm": 0.6488732895643385, + "learning_rate": 6.08776035824771e-06, + "loss": 0.844, + "step": 15638 + }, + { + "epoch": 0.78, + "grad_norm": 0.5447010489310171, + "learning_rate": 6.085119434165343e-06, + "loss": 0.8811, + "step": 15639 + }, + { + "epoch": 0.7800498753117207, + "grad_norm": 0.5544784972367484, + "learning_rate": 6.082479003658315e-06, + "loss": 0.8905, + "step": 15640 + }, + { + "epoch": 0.7800997506234414, + "grad_norm": 0.5507467936779697, + "learning_rate": 6.079839066795534e-06, + "loss": 0.8564, + "step": 15641 + }, + { + "epoch": 0.7801496259351621, + "grad_norm": 0.7719021778821694, + "learning_rate": 6.077199623645874e-06, + "loss": 0.8205, + "step": 15642 + }, + { + "epoch": 0.7801995012468828, + "grad_norm": 0.5469110811923173, + "learning_rate": 6.074560674278232e-06, + "loss": 0.8155, + "step": 15643 + }, + { + "epoch": 0.7802493765586035, + "grad_norm": 0.6349492517286148, + "learning_rate": 6.07192221876145e-06, + "loss": 0.8255, + "step": 15644 + }, + { + "epoch": 0.7802992518703242, + "grad_norm": 0.661661078274787, + "learning_rate": 6.069284257164404e-06, + "loss": 0.8805, + "step": 15645 + }, + { + "epoch": 0.7803491271820449, + "grad_norm": 0.5165689606332672, + "learning_rate": 6.066646789555894e-06, + "loss": 0.8306, + "step": 15646 + }, + { + "epoch": 0.7803990024937656, + "grad_norm": 0.5745062007162783, + "learning_rate": 6.064009816004773e-06, + "loss": 0.8241, + "step": 15647 + }, + { + "epoch": 0.7804488778054863, + "grad_norm": 0.5714443669568289, + "learning_rate": 6.061373336579834e-06, + "loss": 0.8462, + "step": 15648 + }, + { + "epoch": 0.780498753117207, + "grad_norm": 0.5746305017836428, + "learning_rate": 6.058737351349894e-06, + "loss": 0.8599, + "step": 15649 + }, + { + "epoch": 0.7805486284289277, + "grad_norm": 0.6107708183416247, + "learning_rate": 6.056101860383714e-06, + "loss": 0.8106, + "step": 15650 + }, + { + "epoch": 0.7805985037406484, + "grad_norm": 0.5024096103061475, + "learning_rate": 6.0534668637500844e-06, + "loss": 0.8506, + "step": 15651 + }, + { + "epoch": 0.7806483790523691, + "grad_norm": 0.6251012729844447, + "learning_rate": 6.050832361517755e-06, + "loss": 0.8391, + "step": 15652 + }, + { + "epoch": 0.7806982543640898, + "grad_norm": 1.5060200278015354, + "learning_rate": 6.048198353755477e-06, + "loss": 0.8449, + "step": 15653 + }, + { + "epoch": 0.7807481296758104, + "grad_norm": 0.5332565020544968, + "learning_rate": 6.045564840531975e-06, + "loss": 0.8351, + "step": 15654 + }, + { + "epoch": 0.7807980049875312, + "grad_norm": 0.5442764605709671, + "learning_rate": 6.042931821915982e-06, + "loss": 0.8382, + "step": 15655 + }, + { + "epoch": 0.7808478802992519, + "grad_norm": 0.5396732608553014, + "learning_rate": 6.040299297976199e-06, + "loss": 0.8726, + "step": 15656 + }, + { + "epoch": 0.7808977556109725, + "grad_norm": 0.5784743198929673, + "learning_rate": 6.037667268781319e-06, + "loss": 0.8393, + "step": 15657 + }, + { + "epoch": 0.7809476309226933, + "grad_norm": 0.5946869735405202, + "learning_rate": 6.035035734400019e-06, + "loss": 0.8714, + "step": 15658 + }, + { + "epoch": 0.780997506234414, + "grad_norm": 0.5289142581690411, + "learning_rate": 6.032404694900981e-06, + "loss": 0.8199, + "step": 15659 + }, + { + "epoch": 0.7810473815461346, + "grad_norm": 0.6223609766452206, + "learning_rate": 6.029774150352854e-06, + "loss": 0.8574, + "step": 15660 + }, + { + "epoch": 0.7810972568578554, + "grad_norm": 0.584945992503127, + "learning_rate": 6.0271441008242805e-06, + "loss": 0.8521, + "step": 15661 + }, + { + "epoch": 0.7811471321695761, + "grad_norm": 0.522700607455913, + "learning_rate": 6.024514546383886e-06, + "loss": 0.8759, + "step": 15662 + }, + { + "epoch": 0.7811970074812967, + "grad_norm": 0.6336659607169802, + "learning_rate": 6.021885487100296e-06, + "loss": 0.8729, + "step": 15663 + }, + { + "epoch": 0.7812468827930175, + "grad_norm": 0.5085502759426493, + "learning_rate": 6.01925692304211e-06, + "loss": 0.8664, + "step": 15664 + }, + { + "epoch": 0.7812967581047382, + "grad_norm": 0.5647480359534663, + "learning_rate": 6.0166288542779225e-06, + "loss": 0.8196, + "step": 15665 + }, + { + "epoch": 0.7813466334164588, + "grad_norm": 0.5757625486337546, + "learning_rate": 6.0140012808762995e-06, + "loss": 0.8376, + "step": 15666 + }, + { + "epoch": 0.7813965087281796, + "grad_norm": 0.5550713886175832, + "learning_rate": 6.011374202905834e-06, + "loss": 0.8678, + "step": 15667 + }, + { + "epoch": 0.7814463840399003, + "grad_norm": 1.3215785883869922, + "learning_rate": 6.008747620435043e-06, + "loss": 0.8793, + "step": 15668 + }, + { + "epoch": 0.7814962593516209, + "grad_norm": 0.5662811293136601, + "learning_rate": 6.006121533532491e-06, + "loss": 0.8478, + "step": 15669 + }, + { + "epoch": 0.7815461346633417, + "grad_norm": 0.5395995830383069, + "learning_rate": 6.00349594226669e-06, + "loss": 0.8155, + "step": 15670 + }, + { + "epoch": 0.7815960099750623, + "grad_norm": 0.6218134306935762, + "learning_rate": 6.000870846706169e-06, + "loss": 0.8272, + "step": 15671 + }, + { + "epoch": 0.781645885286783, + "grad_norm": 0.5314980243400855, + "learning_rate": 5.99824624691942e-06, + "loss": 0.8428, + "step": 15672 + }, + { + "epoch": 0.7816957605985038, + "grad_norm": 0.5837877746992742, + "learning_rate": 5.99562214297493e-06, + "loss": 0.8545, + "step": 15673 + }, + { + "epoch": 0.7817456359102244, + "grad_norm": 0.5229295269131811, + "learning_rate": 5.992998534941169e-06, + "loss": 0.8349, + "step": 15674 + }, + { + "epoch": 0.7817955112219451, + "grad_norm": 0.49279228346981774, + "learning_rate": 5.99037542288661e-06, + "loss": 0.7938, + "step": 15675 + }, + { + "epoch": 0.7818453865336659, + "grad_norm": 0.7695324171544604, + "learning_rate": 5.9877528068796985e-06, + "loss": 0.9151, + "step": 15676 + }, + { + "epoch": 0.7818952618453865, + "grad_norm": 0.9107377559872039, + "learning_rate": 5.985130686988869e-06, + "loss": 0.8589, + "step": 15677 + }, + { + "epoch": 0.7819451371571072, + "grad_norm": 0.5489534622814123, + "learning_rate": 5.982509063282543e-06, + "loss": 0.8586, + "step": 15678 + }, + { + "epoch": 0.781995012468828, + "grad_norm": 0.5553348932426637, + "learning_rate": 5.979887935829126e-06, + "loss": 0.8408, + "step": 15679 + }, + { + "epoch": 0.7820448877805486, + "grad_norm": 0.6336297790174887, + "learning_rate": 5.977267304697029e-06, + "loss": 0.9296, + "step": 15680 + }, + { + "epoch": 0.7820947630922693, + "grad_norm": 0.5041086323706102, + "learning_rate": 5.9746471699546254e-06, + "loss": 0.8386, + "step": 15681 + }, + { + "epoch": 0.7821446384039901, + "grad_norm": 0.4950557848734984, + "learning_rate": 5.972027531670291e-06, + "loss": 0.8181, + "step": 15682 + }, + { + "epoch": 0.7821945137157107, + "grad_norm": 0.4817644811525246, + "learning_rate": 5.969408389912376e-06, + "loss": 0.8515, + "step": 15683 + }, + { + "epoch": 0.7822443890274314, + "grad_norm": 0.5615307645864676, + "learning_rate": 5.966789744749237e-06, + "loss": 0.8372, + "step": 15684 + }, + { + "epoch": 0.7822942643391522, + "grad_norm": 0.5680174894698398, + "learning_rate": 5.9641715962491966e-06, + "loss": 0.8442, + "step": 15685 + }, + { + "epoch": 0.7823441396508728, + "grad_norm": 0.7028234936269997, + "learning_rate": 5.961553944480591e-06, + "loss": 0.823, + "step": 15686 + }, + { + "epoch": 0.7823940149625935, + "grad_norm": 0.47202799048219996, + "learning_rate": 5.958936789511701e-06, + "loss": 0.8173, + "step": 15687 + }, + { + "epoch": 0.7824438902743143, + "grad_norm": 0.5081387046817933, + "learning_rate": 5.956320131410842e-06, + "loss": 0.8279, + "step": 15688 + }, + { + "epoch": 0.7824937655860349, + "grad_norm": 0.53196403678172, + "learning_rate": 5.953703970246285e-06, + "loss": 0.8502, + "step": 15689 + }, + { + "epoch": 0.7825436408977556, + "grad_norm": 0.5080697864456746, + "learning_rate": 5.951088306086303e-06, + "loss": 0.829, + "step": 15690 + }, + { + "epoch": 0.7825935162094763, + "grad_norm": 0.5252801180121528, + "learning_rate": 5.9484731389991334e-06, + "loss": 0.8127, + "step": 15691 + }, + { + "epoch": 0.782643391521197, + "grad_norm": 0.5877062501643585, + "learning_rate": 5.945858469053042e-06, + "loss": 0.8157, + "step": 15692 + }, + { + "epoch": 0.7826932668329177, + "grad_norm": 0.5448944514430524, + "learning_rate": 5.943244296316245e-06, + "loss": 0.8119, + "step": 15693 + }, + { + "epoch": 0.7827431421446384, + "grad_norm": 0.47775212009062173, + "learning_rate": 5.94063062085696e-06, + "loss": 0.7869, + "step": 15694 + }, + { + "epoch": 0.7827930174563591, + "grad_norm": 0.6164806076872267, + "learning_rate": 5.9380174427433825e-06, + "loss": 0.8078, + "step": 15695 + }, + { + "epoch": 0.7828428927680798, + "grad_norm": 0.5192033396960369, + "learning_rate": 5.935404762043715e-06, + "loss": 0.8214, + "step": 15696 + }, + { + "epoch": 0.7828927680798005, + "grad_norm": 0.6411317041803785, + "learning_rate": 5.932792578826127e-06, + "loss": 0.8532, + "step": 15697 + }, + { + "epoch": 0.7829426433915212, + "grad_norm": 0.5190026661682883, + "learning_rate": 5.930180893158785e-06, + "loss": 0.8212, + "step": 15698 + }, + { + "epoch": 0.7829925187032419, + "grad_norm": 0.5721540852355791, + "learning_rate": 5.927569705109828e-06, + "loss": 0.8573, + "step": 15699 + }, + { + "epoch": 0.7830423940149626, + "grad_norm": 0.5173456200867205, + "learning_rate": 5.9249590147474225e-06, + "loss": 0.8339, + "step": 15700 + }, + { + "epoch": 0.7830922693266833, + "grad_norm": 0.5302852904514709, + "learning_rate": 5.922348822139656e-06, + "loss": 0.8696, + "step": 15701 + }, + { + "epoch": 0.783142144638404, + "grad_norm": 0.5468641508836648, + "learning_rate": 5.9197391273546675e-06, + "loss": 0.7996, + "step": 15702 + }, + { + "epoch": 0.7831920199501247, + "grad_norm": 5.197446539811714, + "learning_rate": 5.917129930460541e-06, + "loss": 0.8519, + "step": 15703 + }, + { + "epoch": 0.7832418952618454, + "grad_norm": 0.5702627136230457, + "learning_rate": 5.914521231525383e-06, + "loss": 0.8383, + "step": 15704 + }, + { + "epoch": 0.7832917705735661, + "grad_norm": 0.46478717777074396, + "learning_rate": 5.911913030617239e-06, + "loss": 0.8286, + "step": 15705 + }, + { + "epoch": 0.7833416458852868, + "grad_norm": 0.5066107147106015, + "learning_rate": 5.909305327804187e-06, + "loss": 0.8273, + "step": 15706 + }, + { + "epoch": 0.7833915211970075, + "grad_norm": 0.5744685801143651, + "learning_rate": 5.906698123154261e-06, + "loss": 0.8533, + "step": 15707 + }, + { + "epoch": 0.7834413965087281, + "grad_norm": 0.49916729492404693, + "learning_rate": 5.90409141673551e-06, + "loss": 0.852, + "step": 15708 + }, + { + "epoch": 0.7834912718204489, + "grad_norm": 0.4673360168496603, + "learning_rate": 5.901485208615948e-06, + "loss": 0.8412, + "step": 15709 + }, + { + "epoch": 0.7835411471321696, + "grad_norm": 0.4950881880278138, + "learning_rate": 5.898879498863583e-06, + "loss": 0.837, + "step": 15710 + }, + { + "epoch": 0.7835910224438902, + "grad_norm": 0.9201413667146875, + "learning_rate": 5.896274287546405e-06, + "loss": 0.8339, + "step": 15711 + }, + { + "epoch": 0.783640897755611, + "grad_norm": 0.655543685336991, + "learning_rate": 5.893669574732396e-06, + "loss": 0.8554, + "step": 15712 + }, + { + "epoch": 0.7836907730673317, + "grad_norm": 0.5929921194442992, + "learning_rate": 5.891065360489537e-06, + "loss": 0.8397, + "step": 15713 + }, + { + "epoch": 0.7837406483790523, + "grad_norm": 0.49189981057594706, + "learning_rate": 5.88846164488577e-06, + "loss": 0.824, + "step": 15714 + }, + { + "epoch": 0.7837905236907731, + "grad_norm": 0.5114937978201127, + "learning_rate": 5.88585842798905e-06, + "loss": 0.8189, + "step": 15715 + }, + { + "epoch": 0.7838403990024938, + "grad_norm": 0.626576766281506, + "learning_rate": 5.883255709867288e-06, + "loss": 0.8384, + "step": 15716 + }, + { + "epoch": 0.7838902743142144, + "grad_norm": 0.4822388493556186, + "learning_rate": 5.88065349058842e-06, + "loss": 0.8212, + "step": 15717 + }, + { + "epoch": 0.7839401496259352, + "grad_norm": 0.5626875504054837, + "learning_rate": 5.8780517702203445e-06, + "loss": 0.8181, + "step": 15718 + }, + { + "epoch": 0.7839900249376559, + "grad_norm": 0.4769979020453519, + "learning_rate": 5.875450548830949e-06, + "loss": 0.8571, + "step": 15719 + }, + { + "epoch": 0.7840399002493765, + "grad_norm": 0.5312296074522977, + "learning_rate": 5.872849826488105e-06, + "loss": 0.8161, + "step": 15720 + }, + { + "epoch": 0.7840897755610973, + "grad_norm": 0.5915944666438867, + "learning_rate": 5.8702496032596924e-06, + "loss": 0.8716, + "step": 15721 + }, + { + "epoch": 0.784139650872818, + "grad_norm": 0.7772154132735135, + "learning_rate": 5.867649879213547e-06, + "loss": 0.8764, + "step": 15722 + }, + { + "epoch": 0.7841895261845386, + "grad_norm": 0.504145345492797, + "learning_rate": 5.865050654417531e-06, + "loss": 0.8443, + "step": 15723 + }, + { + "epoch": 0.7842394014962594, + "grad_norm": 0.65665860860108, + "learning_rate": 5.862451928939441e-06, + "loss": 0.8123, + "step": 15724 + }, + { + "epoch": 0.78428927680798, + "grad_norm": 1.268371303954693, + "learning_rate": 5.859853702847107e-06, + "loss": 0.8218, + "step": 15725 + }, + { + "epoch": 0.7843391521197007, + "grad_norm": 0.5893864996521346, + "learning_rate": 5.857255976208329e-06, + "loss": 0.8222, + "step": 15726 + }, + { + "epoch": 0.7843890274314215, + "grad_norm": 0.6450532712224609, + "learning_rate": 5.854658749090885e-06, + "loss": 0.8577, + "step": 15727 + }, + { + "epoch": 0.7844389027431421, + "grad_norm": 0.7578603038674996, + "learning_rate": 5.852062021562549e-06, + "loss": 0.8432, + "step": 15728 + }, + { + "epoch": 0.7844887780548628, + "grad_norm": 0.4743036799287484, + "learning_rate": 5.849465793691089e-06, + "loss": 0.8466, + "step": 15729 + }, + { + "epoch": 0.7845386533665836, + "grad_norm": 0.652083241417099, + "learning_rate": 5.846870065544249e-06, + "loss": 0.824, + "step": 15730 + }, + { + "epoch": 0.7845885286783042, + "grad_norm": 0.7331551792454051, + "learning_rate": 5.844274837189764e-06, + "loss": 0.8969, + "step": 15731 + }, + { + "epoch": 0.7846384039900249, + "grad_norm": 0.5947726730807321, + "learning_rate": 5.841680108695347e-06, + "loss": 0.8411, + "step": 15732 + }, + { + "epoch": 0.7846882793017457, + "grad_norm": 0.5427536842771443, + "learning_rate": 5.8390858801287195e-06, + "loss": 0.8252, + "step": 15733 + }, + { + "epoch": 0.7847381546134663, + "grad_norm": 0.7793255490652888, + "learning_rate": 5.836492151557571e-06, + "loss": 0.8785, + "step": 15734 + }, + { + "epoch": 0.784788029925187, + "grad_norm": 0.6825225010645475, + "learning_rate": 5.833898923049586e-06, + "loss": 0.8606, + "step": 15735 + }, + { + "epoch": 0.7848379052369078, + "grad_norm": 0.6836310961576281, + "learning_rate": 5.83130619467242e-06, + "loss": 0.7736, + "step": 15736 + }, + { + "epoch": 0.7848877805486284, + "grad_norm": 0.5370533107072494, + "learning_rate": 5.8287139664937565e-06, + "loss": 0.8433, + "step": 15737 + }, + { + "epoch": 0.7849376558603491, + "grad_norm": 0.48128970305285357, + "learning_rate": 5.826122238581206e-06, + "loss": 0.8475, + "step": 15738 + }, + { + "epoch": 0.7849875311720699, + "grad_norm": 0.5118033326493456, + "learning_rate": 5.823531011002423e-06, + "loss": 0.8688, + "step": 15739 + }, + { + "epoch": 0.7850374064837905, + "grad_norm": 0.45999792215848095, + "learning_rate": 5.820940283825008e-06, + "loss": 0.8164, + "step": 15740 + }, + { + "epoch": 0.7850872817955112, + "grad_norm": 0.656656315376635, + "learning_rate": 5.8183500571165895e-06, + "loss": 0.865, + "step": 15741 + }, + { + "epoch": 0.785137157107232, + "grad_norm": 0.5280236733900486, + "learning_rate": 5.815760330944725e-06, + "loss": 0.8361, + "step": 15742 + }, + { + "epoch": 0.7851870324189526, + "grad_norm": 0.7646375072849513, + "learning_rate": 5.813171105377016e-06, + "loss": 0.8218, + "step": 15743 + }, + { + "epoch": 0.7852369077306733, + "grad_norm": 0.4750478079474715, + "learning_rate": 5.810582380481014e-06, + "loss": 0.8559, + "step": 15744 + }, + { + "epoch": 0.785286783042394, + "grad_norm": 0.5623309787907369, + "learning_rate": 5.807994156324292e-06, + "loss": 0.8811, + "step": 15745 + }, + { + "epoch": 0.7853366583541147, + "grad_norm": 0.48501789133477907, + "learning_rate": 5.805406432974359e-06, + "loss": 0.8409, + "step": 15746 + }, + { + "epoch": 0.7853865336658354, + "grad_norm": 0.5639604534146916, + "learning_rate": 5.802819210498758e-06, + "loss": 0.8506, + "step": 15747 + }, + { + "epoch": 0.7854364089775561, + "grad_norm": 0.6536492308288279, + "learning_rate": 5.800232488965002e-06, + "loss": 0.8047, + "step": 15748 + }, + { + "epoch": 0.7854862842892768, + "grad_norm": 0.5855191009217191, + "learning_rate": 5.797646268440579e-06, + "loss": 0.878, + "step": 15749 + }, + { + "epoch": 0.7855361596009975, + "grad_norm": 0.5140231643906352, + "learning_rate": 5.79506054899299e-06, + "loss": 0.8313, + "step": 15750 + }, + { + "epoch": 0.7855860349127182, + "grad_norm": 0.5295632970261047, + "learning_rate": 5.7924753306896975e-06, + "loss": 0.86, + "step": 15751 + }, + { + "epoch": 0.7856359102244389, + "grad_norm": 0.6229245247848968, + "learning_rate": 5.78989061359817e-06, + "loss": 0.8438, + "step": 15752 + }, + { + "epoch": 0.7856857855361596, + "grad_norm": 0.6064456596240985, + "learning_rate": 5.787306397785838e-06, + "loss": 0.8806, + "step": 15753 + }, + { + "epoch": 0.7857356608478803, + "grad_norm": 0.5831371146437571, + "learning_rate": 5.7847226833201565e-06, + "loss": 0.8323, + "step": 15754 + }, + { + "epoch": 0.785785536159601, + "grad_norm": 0.6533271603633171, + "learning_rate": 5.7821394702685386e-06, + "loss": 0.8665, + "step": 15755 + }, + { + "epoch": 0.7858354114713217, + "grad_norm": 0.6459059235661683, + "learning_rate": 5.779556758698388e-06, + "loss": 0.8024, + "step": 15756 + }, + { + "epoch": 0.7858852867830424, + "grad_norm": 1.6051973874390386, + "learning_rate": 5.776974548677094e-06, + "loss": 0.7743, + "step": 15757 + }, + { + "epoch": 0.7859351620947631, + "grad_norm": 0.7356978120742023, + "learning_rate": 5.7743928402720556e-06, + "loss": 0.86, + "step": 15758 + }, + { + "epoch": 0.7859850374064838, + "grad_norm": 0.6662350011272062, + "learning_rate": 5.77181163355063e-06, + "loss": 0.8085, + "step": 15759 + }, + { + "epoch": 0.7860349127182045, + "grad_norm": 0.531671831742602, + "learning_rate": 5.769230928580177e-06, + "loss": 0.8314, + "step": 15760 + }, + { + "epoch": 0.7860847880299252, + "grad_norm": 0.5612981504818435, + "learning_rate": 5.766650725428027e-06, + "loss": 0.8373, + "step": 15761 + }, + { + "epoch": 0.7861346633416458, + "grad_norm": 0.5209610989531853, + "learning_rate": 5.7640710241615245e-06, + "loss": 0.8754, + "step": 15762 + }, + { + "epoch": 0.7861845386533666, + "grad_norm": 0.5577256608292018, + "learning_rate": 5.761491824847981e-06, + "loss": 0.8206, + "step": 15763 + }, + { + "epoch": 0.7862344139650873, + "grad_norm": 0.6172440140360977, + "learning_rate": 5.758913127554699e-06, + "loss": 0.8122, + "step": 15764 + }, + { + "epoch": 0.7862842892768079, + "grad_norm": 0.7981790683752612, + "learning_rate": 5.756334932348961e-06, + "loss": 0.8531, + "step": 15765 + }, + { + "epoch": 0.7863341645885287, + "grad_norm": 0.5717588102488641, + "learning_rate": 5.753757239298058e-06, + "loss": 0.8299, + "step": 15766 + }, + { + "epoch": 0.7863840399002494, + "grad_norm": 0.47570020121919865, + "learning_rate": 5.751180048469243e-06, + "loss": 0.8667, + "step": 15767 + }, + { + "epoch": 0.78643391521197, + "grad_norm": 4.190353910783225, + "learning_rate": 5.748603359929775e-06, + "loss": 0.902, + "step": 15768 + }, + { + "epoch": 0.7864837905236908, + "grad_norm": 0.7554376994856067, + "learning_rate": 5.746027173746876e-06, + "loss": 0.871, + "step": 15769 + }, + { + "epoch": 0.7865336658354115, + "grad_norm": 0.5405599729971248, + "learning_rate": 5.743451489987789e-06, + "loss": 0.8433, + "step": 15770 + }, + { + "epoch": 0.7865835411471321, + "grad_norm": 0.6121626561802026, + "learning_rate": 5.740876308719717e-06, + "loss": 0.7818, + "step": 15771 + }, + { + "epoch": 0.7866334164588529, + "grad_norm": 0.6428113264818263, + "learning_rate": 5.738301630009857e-06, + "loss": 0.8117, + "step": 15772 + }, + { + "epoch": 0.7866832917705736, + "grad_norm": 0.5046049973176947, + "learning_rate": 5.735727453925391e-06, + "loss": 0.8457, + "step": 15773 + }, + { + "epoch": 0.7867331670822942, + "grad_norm": 0.5793541627757833, + "learning_rate": 5.73315378053351e-06, + "loss": 0.8402, + "step": 15774 + }, + { + "epoch": 0.786783042394015, + "grad_norm": 0.5192389695449948, + "learning_rate": 5.73058060990134e-06, + "loss": 0.8228, + "step": 15775 + }, + { + "epoch": 0.7868329177057357, + "grad_norm": 0.6083115655646393, + "learning_rate": 5.7280079420960555e-06, + "loss": 0.8149, + "step": 15776 + }, + { + "epoch": 0.7868827930174563, + "grad_norm": 0.5277096542421703, + "learning_rate": 5.725435777184773e-06, + "loss": 0.8402, + "step": 15777 + }, + { + "epoch": 0.7869326683291771, + "grad_norm": 0.549525482238262, + "learning_rate": 5.72286411523463e-06, + "loss": 0.8777, + "step": 15778 + }, + { + "epoch": 0.7869825436408977, + "grad_norm": 0.5534125696773443, + "learning_rate": 5.720292956312706e-06, + "loss": 0.8261, + "step": 15779 + }, + { + "epoch": 0.7870324189526184, + "grad_norm": 0.547692990849452, + "learning_rate": 5.717722300486117e-06, + "loss": 0.8618, + "step": 15780 + }, + { + "epoch": 0.7870822942643392, + "grad_norm": 0.5274983243887619, + "learning_rate": 5.715152147821925e-06, + "loss": 0.8633, + "step": 15781 + }, + { + "epoch": 0.7871321695760598, + "grad_norm": 0.555803909941937, + "learning_rate": 5.712582498387225e-06, + "loss": 0.8369, + "step": 15782 + }, + { + "epoch": 0.7871820448877805, + "grad_norm": 0.563368938722617, + "learning_rate": 5.710013352249038e-06, + "loss": 0.8671, + "step": 15783 + }, + { + "epoch": 0.7872319201995013, + "grad_norm": 0.45834385668025696, + "learning_rate": 5.707444709474424e-06, + "loss": 0.8472, + "step": 15784 + }, + { + "epoch": 0.7872817955112219, + "grad_norm": 0.5816189585362755, + "learning_rate": 5.704876570130407e-06, + "loss": 0.8217, + "step": 15785 + }, + { + "epoch": 0.7873316708229426, + "grad_norm": 1.0883550676753513, + "learning_rate": 5.702308934283995e-06, + "loss": 0.8372, + "step": 15786 + }, + { + "epoch": 0.7873815461346634, + "grad_norm": 0.6412100282208086, + "learning_rate": 5.699741802002201e-06, + "loss": 0.8665, + "step": 15787 + }, + { + "epoch": 0.787431421446384, + "grad_norm": 0.4288369765380466, + "learning_rate": 5.6971751733520065e-06, + "loss": 0.8088, + "step": 15788 + }, + { + "epoch": 0.7874812967581047, + "grad_norm": 0.7918113904602102, + "learning_rate": 5.69460904840039e-06, + "loss": 0.8698, + "step": 15789 + }, + { + "epoch": 0.7875311720698255, + "grad_norm": 0.492120399225558, + "learning_rate": 5.692043427214299e-06, + "loss": 0.8135, + "step": 15790 + }, + { + "epoch": 0.7875810473815461, + "grad_norm": 0.9309720337144337, + "learning_rate": 5.689478309860704e-06, + "loss": 0.7924, + "step": 15791 + }, + { + "epoch": 0.7876309226932668, + "grad_norm": 0.5055544221395724, + "learning_rate": 5.686913696406529e-06, + "loss": 0.8412, + "step": 15792 + }, + { + "epoch": 0.7876807980049876, + "grad_norm": 0.5709925004467274, + "learning_rate": 5.684349586918697e-06, + "loss": 0.8488, + "step": 15793 + }, + { + "epoch": 0.7877306733167082, + "grad_norm": 0.6679648238723103, + "learning_rate": 5.681785981464113e-06, + "loss": 0.8855, + "step": 15794 + }, + { + "epoch": 0.787780548628429, + "grad_norm": 0.5099019657829655, + "learning_rate": 5.679222880109683e-06, + "loss": 0.8317, + "step": 15795 + }, + { + "epoch": 0.7878304239401496, + "grad_norm": 0.6647041919841336, + "learning_rate": 5.676660282922286e-06, + "loss": 0.8308, + "step": 15796 + }, + { + "epoch": 0.7878802992518703, + "grad_norm": 0.6763399331093092, + "learning_rate": 5.67409818996879e-06, + "loss": 0.8285, + "step": 15797 + }, + { + "epoch": 0.787930174563591, + "grad_norm": 0.5131985014674874, + "learning_rate": 5.6715366013160466e-06, + "loss": 0.8141, + "step": 15798 + }, + { + "epoch": 0.7879800498753117, + "grad_norm": 0.6616668505059575, + "learning_rate": 5.668975517030911e-06, + "loss": 0.8559, + "step": 15799 + }, + { + "epoch": 0.7880299251870324, + "grad_norm": 0.5096999578217442, + "learning_rate": 5.666414937180209e-06, + "loss": 0.8149, + "step": 15800 + }, + { + "epoch": 0.7880798004987531, + "grad_norm": 0.5553892286369949, + "learning_rate": 5.663854861830759e-06, + "loss": 0.9134, + "step": 15801 + }, + { + "epoch": 0.7881296758104738, + "grad_norm": 0.5300093511316675, + "learning_rate": 5.661295291049351e-06, + "loss": 0.8508, + "step": 15802 + }, + { + "epoch": 0.7881795511221945, + "grad_norm": 0.9066858217821662, + "learning_rate": 5.658736224902797e-06, + "loss": 0.8469, + "step": 15803 + }, + { + "epoch": 0.7882294264339152, + "grad_norm": 0.4692067536460649, + "learning_rate": 5.656177663457865e-06, + "loss": 0.8466, + "step": 15804 + }, + { + "epoch": 0.7882793017456359, + "grad_norm": 0.56346978914758, + "learning_rate": 5.6536196067813215e-06, + "loss": 0.8003, + "step": 15805 + }, + { + "epoch": 0.7883291770573566, + "grad_norm": 0.889090620867329, + "learning_rate": 5.6510620549399065e-06, + "loss": 0.9129, + "step": 15806 + }, + { + "epoch": 0.7883790523690773, + "grad_norm": 0.5941818975809957, + "learning_rate": 5.648505008000382e-06, + "loss": 0.8746, + "step": 15807 + }, + { + "epoch": 0.788428927680798, + "grad_norm": 0.5979659452673244, + "learning_rate": 5.6459484660294436e-06, + "loss": 0.8263, + "step": 15808 + }, + { + "epoch": 0.7884788029925187, + "grad_norm": 0.5625915956472568, + "learning_rate": 5.643392429093827e-06, + "loss": 0.8132, + "step": 15809 + }, + { + "epoch": 0.7885286783042394, + "grad_norm": 0.5708937683729575, + "learning_rate": 5.640836897260215e-06, + "loss": 0.8429, + "step": 15810 + }, + { + "epoch": 0.7885785536159601, + "grad_norm": 0.5645567048116867, + "learning_rate": 5.638281870595313e-06, + "loss": 0.8397, + "step": 15811 + }, + { + "epoch": 0.7886284289276808, + "grad_norm": 0.504898455665163, + "learning_rate": 5.635727349165768e-06, + "loss": 0.8054, + "step": 15812 + }, + { + "epoch": 0.7886783042394016, + "grad_norm": 0.7820154010787526, + "learning_rate": 5.633173333038258e-06, + "loss": 0.8243, + "step": 15813 + }, + { + "epoch": 0.7887281795511222, + "grad_norm": 0.5293807558030886, + "learning_rate": 5.630619822279415e-06, + "loss": 0.8806, + "step": 15814 + }, + { + "epoch": 0.7887780548628429, + "grad_norm": 0.5482624055384404, + "learning_rate": 5.628066816955893e-06, + "loss": 0.8601, + "step": 15815 + }, + { + "epoch": 0.7888279301745635, + "grad_norm": 0.5011336432260716, + "learning_rate": 5.625514317134284e-06, + "loss": 0.7974, + "step": 15816 + }, + { + "epoch": 0.7888778054862843, + "grad_norm": 0.5308933493380844, + "learning_rate": 5.622962322881217e-06, + "loss": 0.8385, + "step": 15817 + }, + { + "epoch": 0.788927680798005, + "grad_norm": 0.6594177493442434, + "learning_rate": 5.620410834263274e-06, + "loss": 0.8616, + "step": 15818 + }, + { + "epoch": 0.7889775561097256, + "grad_norm": 0.8777071785229631, + "learning_rate": 5.617859851347037e-06, + "loss": 0.855, + "step": 15819 + }, + { + "epoch": 0.7890274314214464, + "grad_norm": 0.5014581766877204, + "learning_rate": 5.615309374199065e-06, + "loss": 0.8695, + "step": 15820 + }, + { + "epoch": 0.7890773067331671, + "grad_norm": 0.8214152519777668, + "learning_rate": 5.6127594028859285e-06, + "loss": 0.8413, + "step": 15821 + }, + { + "epoch": 0.7891271820448877, + "grad_norm": 0.5155140271387296, + "learning_rate": 5.610209937474159e-06, + "loss": 0.8494, + "step": 15822 + }, + { + "epoch": 0.7891770573566085, + "grad_norm": 0.5838267354805293, + "learning_rate": 5.607660978030274e-06, + "loss": 0.9144, + "step": 15823 + }, + { + "epoch": 0.7892269326683292, + "grad_norm": 0.556908063860512, + "learning_rate": 5.6051125246208066e-06, + "loss": 0.881, + "step": 15824 + }, + { + "epoch": 0.7892768079800498, + "grad_norm": 0.5436302671508209, + "learning_rate": 5.602564577312247e-06, + "loss": 0.84, + "step": 15825 + }, + { + "epoch": 0.7893266832917706, + "grad_norm": 0.6401845254118528, + "learning_rate": 5.600017136171082e-06, + "loss": 0.8568, + "step": 15826 + }, + { + "epoch": 0.7893765586034913, + "grad_norm": 0.5249421799484039, + "learning_rate": 5.597470201263782e-06, + "loss": 0.8563, + "step": 15827 + }, + { + "epoch": 0.789426433915212, + "grad_norm": 0.7261186688128893, + "learning_rate": 5.594923772656821e-06, + "loss": 0.8172, + "step": 15828 + }, + { + "epoch": 0.7894763092269327, + "grad_norm": 0.5226371277171368, + "learning_rate": 5.5923778504166385e-06, + "loss": 0.85, + "step": 15829 + }, + { + "epoch": 0.7895261845386534, + "grad_norm": 0.6487789659803289, + "learning_rate": 5.5898324346096706e-06, + "loss": 0.8561, + "step": 15830 + }, + { + "epoch": 0.789576059850374, + "grad_norm": 0.5402343375378476, + "learning_rate": 5.587287525302332e-06, + "loss": 0.8447, + "step": 15831 + }, + { + "epoch": 0.7896259351620948, + "grad_norm": 0.5919353357186063, + "learning_rate": 5.584743122561043e-06, + "loss": 0.8171, + "step": 15832 + }, + { + "epoch": 0.7896758104738154, + "grad_norm": 0.8036720576712516, + "learning_rate": 5.582199226452195e-06, + "loss": 0.841, + "step": 15833 + }, + { + "epoch": 0.7897256857855361, + "grad_norm": 0.6674703515083183, + "learning_rate": 5.579655837042166e-06, + "loss": 0.8467, + "step": 15834 + }, + { + "epoch": 0.7897755610972569, + "grad_norm": 0.4676907205660094, + "learning_rate": 5.577112954397321e-06, + "loss": 0.7916, + "step": 15835 + }, + { + "epoch": 0.7898254364089775, + "grad_norm": 0.5095683576052281, + "learning_rate": 5.574570578584026e-06, + "loss": 0.8009, + "step": 15836 + }, + { + "epoch": 0.7898753117206982, + "grad_norm": 0.5617441241060542, + "learning_rate": 5.57202870966862e-06, + "loss": 0.8136, + "step": 15837 + }, + { + "epoch": 0.789925187032419, + "grad_norm": 0.5328110321516626, + "learning_rate": 5.569487347717431e-06, + "loss": 0.7979, + "step": 15838 + }, + { + "epoch": 0.7899750623441396, + "grad_norm": 0.608673066722779, + "learning_rate": 5.5669464927967655e-06, + "loss": 0.866, + "step": 15839 + }, + { + "epoch": 0.7900249376558603, + "grad_norm": 0.648027338678567, + "learning_rate": 5.564406144972939e-06, + "loss": 0.8766, + "step": 15840 + }, + { + "epoch": 0.7900748129675811, + "grad_norm": 0.7999100079489684, + "learning_rate": 5.561866304312238e-06, + "loss": 0.8616, + "step": 15841 + }, + { + "epoch": 0.7901246882793017, + "grad_norm": 0.5327587183130478, + "learning_rate": 5.559326970880935e-06, + "loss": 0.8472, + "step": 15842 + }, + { + "epoch": 0.7901745635910224, + "grad_norm": 0.5150503891387128, + "learning_rate": 5.55678814474529e-06, + "loss": 0.8567, + "step": 15843 + }, + { + "epoch": 0.7902244389027432, + "grad_norm": 0.6521203876771616, + "learning_rate": 5.5542498259715696e-06, + "loss": 0.8772, + "step": 15844 + }, + { + "epoch": 0.7902743142144638, + "grad_norm": 0.8470583158582681, + "learning_rate": 5.551712014625981e-06, + "loss": 0.8613, + "step": 15845 + }, + { + "epoch": 0.7903241895261846, + "grad_norm": 0.4958836655246498, + "learning_rate": 5.5491747107747735e-06, + "loss": 0.8845, + "step": 15846 + }, + { + "epoch": 0.7903740648379053, + "grad_norm": 0.4848497807133886, + "learning_rate": 5.546637914484137e-06, + "loss": 0.8482, + "step": 15847 + }, + { + "epoch": 0.7904239401496259, + "grad_norm": 0.8314206163277534, + "learning_rate": 5.544101625820294e-06, + "loss": 0.8365, + "step": 15848 + }, + { + "epoch": 0.7904738154613467, + "grad_norm": 0.46572442380410517, + "learning_rate": 5.541565844849398e-06, + "loss": 0.8171, + "step": 15849 + }, + { + "epoch": 0.7905236907730673, + "grad_norm": 0.5922620281319358, + "learning_rate": 5.539030571637638e-06, + "loss": 0.9145, + "step": 15850 + }, + { + "epoch": 0.790573566084788, + "grad_norm": 0.5050119646903256, + "learning_rate": 5.53649580625116e-06, + "loss": 0.8622, + "step": 15851 + }, + { + "epoch": 0.7906234413965088, + "grad_norm": 0.520361717613114, + "learning_rate": 5.533961548756128e-06, + "loss": 0.853, + "step": 15852 + }, + { + "epoch": 0.7906733167082294, + "grad_norm": 0.5189061134870022, + "learning_rate": 5.531427799218641e-06, + "loss": 0.8453, + "step": 15853 + }, + { + "epoch": 0.7907231920199501, + "grad_norm": 0.9552099443412571, + "learning_rate": 5.528894557704842e-06, + "loss": 0.8671, + "step": 15854 + }, + { + "epoch": 0.7907730673316709, + "grad_norm": 0.5338358003669526, + "learning_rate": 5.526361824280823e-06, + "loss": 0.8456, + "step": 15855 + }, + { + "epoch": 0.7908229426433915, + "grad_norm": 0.5981535491545584, + "learning_rate": 5.523829599012678e-06, + "loss": 0.7932, + "step": 15856 + }, + { + "epoch": 0.7908728179551122, + "grad_norm": 3.522307243047739, + "learning_rate": 5.5212978819664775e-06, + "loss": 0.8274, + "step": 15857 + }, + { + "epoch": 0.790922693266833, + "grad_norm": 0.5105510338250988, + "learning_rate": 5.518766673208297e-06, + "loss": 0.8621, + "step": 15858 + }, + { + "epoch": 0.7909725685785536, + "grad_norm": 0.5756272786609681, + "learning_rate": 5.516235972804182e-06, + "loss": 0.7984, + "step": 15859 + }, + { + "epoch": 0.7910224438902743, + "grad_norm": 1.0350188010550816, + "learning_rate": 5.513705780820169e-06, + "loss": 0.9146, + "step": 15860 + }, + { + "epoch": 0.7910723192019951, + "grad_norm": 0.5074705381325851, + "learning_rate": 5.511176097322276e-06, + "loss": 0.8241, + "step": 15861 + }, + { + "epoch": 0.7911221945137157, + "grad_norm": 0.581407937951004, + "learning_rate": 5.508646922376529e-06, + "loss": 0.866, + "step": 15862 + }, + { + "epoch": 0.7911720698254364, + "grad_norm": 0.6676239544144396, + "learning_rate": 5.506118256048914e-06, + "loss": 0.8296, + "step": 15863 + }, + { + "epoch": 0.7912219451371572, + "grad_norm": 0.5054817180503777, + "learning_rate": 5.503590098405414e-06, + "loss": 0.8174, + "step": 15864 + }, + { + "epoch": 0.7912718204488778, + "grad_norm": 0.5734237131596486, + "learning_rate": 5.501062449512012e-06, + "loss": 0.831, + "step": 15865 + }, + { + "epoch": 0.7913216957605985, + "grad_norm": 0.537712890905552, + "learning_rate": 5.49853530943466e-06, + "loss": 0.8805, + "step": 15866 + }, + { + "epoch": 0.7913715710723191, + "grad_norm": 0.9732620846666933, + "learning_rate": 5.496008678239298e-06, + "loss": 0.851, + "step": 15867 + }, + { + "epoch": 0.7914214463840399, + "grad_norm": 0.5326222578634944, + "learning_rate": 5.493482555991858e-06, + "loss": 0.807, + "step": 15868 + }, + { + "epoch": 0.7914713216957606, + "grad_norm": 0.6700252636624218, + "learning_rate": 5.490956942758263e-06, + "loss": 0.8227, + "step": 15869 + }, + { + "epoch": 0.7915211970074812, + "grad_norm": 0.5488225251734422, + "learning_rate": 5.4884318386044185e-06, + "loss": 0.8396, + "step": 15870 + }, + { + "epoch": 0.791571072319202, + "grad_norm": 0.531153061201467, + "learning_rate": 5.485907243596214e-06, + "loss": 0.827, + "step": 15871 + }, + { + "epoch": 0.7916209476309227, + "grad_norm": 0.591420101295698, + "learning_rate": 5.483383157799516e-06, + "loss": 0.855, + "step": 15872 + }, + { + "epoch": 0.7916708229426433, + "grad_norm": 0.5091004691150945, + "learning_rate": 5.48085958128021e-06, + "loss": 0.8415, + "step": 15873 + }, + { + "epoch": 0.7917206982543641, + "grad_norm": 0.8608248228704575, + "learning_rate": 5.478336514104135e-06, + "loss": 0.8245, + "step": 15874 + }, + { + "epoch": 0.7917705735660848, + "grad_norm": 0.5033872958073374, + "learning_rate": 5.475813956337134e-06, + "loss": 0.8411, + "step": 15875 + }, + { + "epoch": 0.7918204488778054, + "grad_norm": 0.7007032521755007, + "learning_rate": 5.473291908045022e-06, + "loss": 0.8193, + "step": 15876 + }, + { + "epoch": 0.7918703241895262, + "grad_norm": 0.5157178276895283, + "learning_rate": 5.470770369293635e-06, + "loss": 0.819, + "step": 15877 + }, + { + "epoch": 0.7919201995012469, + "grad_norm": 0.5619460137830757, + "learning_rate": 5.4682493401487376e-06, + "loss": 0.877, + "step": 15878 + }, + { + "epoch": 0.7919700748129676, + "grad_norm": 0.6310125027618143, + "learning_rate": 5.4657288206761415e-06, + "loss": 0.8326, + "step": 15879 + }, + { + "epoch": 0.7920199501246883, + "grad_norm": 0.9225051988026298, + "learning_rate": 5.4632088109416e-06, + "loss": 0.8357, + "step": 15880 + }, + { + "epoch": 0.792069825436409, + "grad_norm": 0.5626717966067529, + "learning_rate": 5.460689311010897e-06, + "loss": 0.8456, + "step": 15881 + }, + { + "epoch": 0.7921197007481297, + "grad_norm": 0.6176070139466516, + "learning_rate": 5.458170320949749e-06, + "loss": 0.8193, + "step": 15882 + }, + { + "epoch": 0.7921695760598504, + "grad_norm": 0.588677534381317, + "learning_rate": 5.4556518408239055e-06, + "loss": 0.8834, + "step": 15883 + }, + { + "epoch": 0.7922194513715711, + "grad_norm": 0.6318309447347622, + "learning_rate": 5.453133870699073e-06, + "loss": 0.8074, + "step": 15884 + }, + { + "epoch": 0.7922693266832918, + "grad_norm": 0.5090217079661209, + "learning_rate": 5.4506164106409795e-06, + "loss": 0.8118, + "step": 15885 + }, + { + "epoch": 0.7923192019950125, + "grad_norm": 0.5209158360018327, + "learning_rate": 5.448099460715289e-06, + "loss": 0.8086, + "step": 15886 + }, + { + "epoch": 0.7923690773067331, + "grad_norm": 0.46607075307239826, + "learning_rate": 5.445583020987696e-06, + "loss": 0.8386, + "step": 15887 + }, + { + "epoch": 0.7924189526184539, + "grad_norm": 0.8599873292698231, + "learning_rate": 5.44306709152386e-06, + "loss": 0.807, + "step": 15888 + }, + { + "epoch": 0.7924688279301746, + "grad_norm": 0.5634011006639981, + "learning_rate": 5.440551672389446e-06, + "loss": 0.8396, + "step": 15889 + }, + { + "epoch": 0.7925187032418952, + "grad_norm": 0.6613097410067, + "learning_rate": 5.43803676365007e-06, + "loss": 0.8589, + "step": 15890 + }, + { + "epoch": 0.792568578553616, + "grad_norm": 0.5865728451452814, + "learning_rate": 5.435522365371376e-06, + "loss": 0.8458, + "step": 15891 + }, + { + "epoch": 0.7926184538653367, + "grad_norm": 0.8296602852232119, + "learning_rate": 5.433008477618972e-06, + "loss": 0.8228, + "step": 15892 + }, + { + "epoch": 0.7926683291770573, + "grad_norm": 0.587090968923724, + "learning_rate": 5.430495100458454e-06, + "loss": 0.8737, + "step": 15893 + }, + { + "epoch": 0.792718204488778, + "grad_norm": 0.6607178504147299, + "learning_rate": 5.4279822339554e-06, + "loss": 0.8124, + "step": 15894 + }, + { + "epoch": 0.7927680798004988, + "grad_norm": 0.49508840921284436, + "learning_rate": 5.425469878175399e-06, + "loss": 0.8716, + "step": 15895 + }, + { + "epoch": 0.7928179551122194, + "grad_norm": 0.5765944096804186, + "learning_rate": 5.422958033184e-06, + "loss": 0.8672, + "step": 15896 + }, + { + "epoch": 0.7928678304239402, + "grad_norm": 0.5247568044225319, + "learning_rate": 5.420446699046749e-06, + "loss": 0.8345, + "step": 15897 + }, + { + "epoch": 0.7929177057356609, + "grad_norm": 0.5604262811309914, + "learning_rate": 5.4179358758291725e-06, + "loss": 0.8525, + "step": 15898 + }, + { + "epoch": 0.7929675810473815, + "grad_norm": 0.6079926754519809, + "learning_rate": 5.415425563596801e-06, + "loss": 0.8055, + "step": 15899 + }, + { + "epoch": 0.7930174563591023, + "grad_norm": 0.5544004310387424, + "learning_rate": 5.412915762415138e-06, + "loss": 0.7867, + "step": 15900 + }, + { + "epoch": 0.793067331670823, + "grad_norm": 0.6226909761192724, + "learning_rate": 5.410406472349663e-06, + "loss": 0.8918, + "step": 15901 + }, + { + "epoch": 0.7931172069825436, + "grad_norm": 0.4909616024264729, + "learning_rate": 5.407897693465872e-06, + "loss": 0.8496, + "step": 15902 + }, + { + "epoch": 0.7931670822942644, + "grad_norm": 0.5595328879364352, + "learning_rate": 5.405389425829219e-06, + "loss": 0.81, + "step": 15903 + }, + { + "epoch": 0.793216957605985, + "grad_norm": 0.5732736593479875, + "learning_rate": 5.4028816695051635e-06, + "loss": 0.8361, + "step": 15904 + }, + { + "epoch": 0.7932668329177057, + "grad_norm": 0.5329407769651793, + "learning_rate": 5.400374424559132e-06, + "loss": 0.8242, + "step": 15905 + }, + { + "epoch": 0.7933167082294265, + "grad_norm": 0.47746641399196654, + "learning_rate": 5.3978676910565626e-06, + "loss": 0.8027, + "step": 15906 + }, + { + "epoch": 0.7933665835411471, + "grad_norm": 0.5779012207451203, + "learning_rate": 5.3953614690628676e-06, + "loss": 0.9006, + "step": 15907 + }, + { + "epoch": 0.7934164588528678, + "grad_norm": 0.7271757251715258, + "learning_rate": 5.3928557586434396e-06, + "loss": 0.8067, + "step": 15908 + }, + { + "epoch": 0.7934663341645886, + "grad_norm": 0.47005879748556634, + "learning_rate": 5.390350559863658e-06, + "loss": 0.8089, + "step": 15909 + }, + { + "epoch": 0.7935162094763092, + "grad_norm": 0.5540887557696512, + "learning_rate": 5.387845872788908e-06, + "loss": 0.8015, + "step": 15910 + }, + { + "epoch": 0.7935660847880299, + "grad_norm": 1.8764159433318006, + "learning_rate": 5.385341697484544e-06, + "loss": 0.8262, + "step": 15911 + }, + { + "epoch": 0.7936159600997507, + "grad_norm": 0.6403267740737203, + "learning_rate": 5.382838034015911e-06, + "loss": 0.8379, + "step": 15912 + }, + { + "epoch": 0.7936658354114713, + "grad_norm": 0.6087904659738826, + "learning_rate": 5.380334882448332e-06, + "loss": 0.7929, + "step": 15913 + }, + { + "epoch": 0.793715710723192, + "grad_norm": 0.5703320657103398, + "learning_rate": 5.3778322428471476e-06, + "loss": 0.8233, + "step": 15914 + }, + { + "epoch": 0.7937655860349128, + "grad_norm": 0.6335842205569562, + "learning_rate": 5.3753301152776355e-06, + "loss": 0.8288, + "step": 15915 + }, + { + "epoch": 0.7938154613466334, + "grad_norm": 0.5481739705533081, + "learning_rate": 5.372828499805108e-06, + "loss": 0.8434, + "step": 15916 + }, + { + "epoch": 0.7938653366583541, + "grad_norm": 0.5666647206047195, + "learning_rate": 5.370327396494829e-06, + "loss": 0.8421, + "step": 15917 + }, + { + "epoch": 0.7939152119700749, + "grad_norm": 0.523778563091739, + "learning_rate": 5.367826805412085e-06, + "loss": 0.8754, + "step": 15918 + }, + { + "epoch": 0.7939650872817955, + "grad_norm": 0.5164564242353245, + "learning_rate": 5.365326726622102e-06, + "loss": 0.8871, + "step": 15919 + }, + { + "epoch": 0.7940149625935162, + "grad_norm": 0.5906841668053642, + "learning_rate": 5.362827160190137e-06, + "loss": 0.8624, + "step": 15920 + }, + { + "epoch": 0.7940648379052369, + "grad_norm": 0.8188925940692918, + "learning_rate": 5.360328106181403e-06, + "loss": 0.8848, + "step": 15921 + }, + { + "epoch": 0.7941147132169576, + "grad_norm": 0.5863724468828425, + "learning_rate": 5.357829564661132e-06, + "loss": 0.8794, + "step": 15922 + }, + { + "epoch": 0.7941645885286783, + "grad_norm": 0.5935285760710645, + "learning_rate": 5.355331535694491e-06, + "loss": 0.8292, + "step": 15923 + }, + { + "epoch": 0.794214463840399, + "grad_norm": 0.6203123579691473, + "learning_rate": 5.352834019346689e-06, + "loss": 0.8394, + "step": 15924 + }, + { + "epoch": 0.7942643391521197, + "grad_norm": 0.49284445639255103, + "learning_rate": 5.350337015682894e-06, + "loss": 0.8169, + "step": 15925 + }, + { + "epoch": 0.7943142144638404, + "grad_norm": 0.7475939263905261, + "learning_rate": 5.347840524768258e-06, + "loss": 0.8324, + "step": 15926 + }, + { + "epoch": 0.794364089775561, + "grad_norm": 0.5832333537306381, + "learning_rate": 5.345344546667922e-06, + "loss": 0.8587, + "step": 15927 + }, + { + "epoch": 0.7944139650872818, + "grad_norm": 0.8641757864960972, + "learning_rate": 5.34284908144703e-06, + "loss": 0.9142, + "step": 15928 + }, + { + "epoch": 0.7944638403990025, + "grad_norm": 0.5764505487060337, + "learning_rate": 5.340354129170694e-06, + "loss": 0.829, + "step": 15929 + }, + { + "epoch": 0.7945137157107232, + "grad_norm": 0.5868482353874178, + "learning_rate": 5.337859689904018e-06, + "loss": 0.8627, + "step": 15930 + }, + { + "epoch": 0.7945635910224439, + "grad_norm": 0.5464170377138698, + "learning_rate": 5.3353657637120856e-06, + "loss": 0.8661, + "step": 15931 + }, + { + "epoch": 0.7946134663341646, + "grad_norm": 0.6412012989540363, + "learning_rate": 5.332872350659993e-06, + "loss": 0.859, + "step": 15932 + }, + { + "epoch": 0.7946633416458853, + "grad_norm": 0.5991076433187541, + "learning_rate": 5.330379450812792e-06, + "loss": 0.8496, + "step": 15933 + }, + { + "epoch": 0.794713216957606, + "grad_norm": 0.5502340857378576, + "learning_rate": 5.327887064235537e-06, + "loss": 0.834, + "step": 15934 + }, + { + "epoch": 0.7947630922693267, + "grad_norm": 0.5936078714279651, + "learning_rate": 5.32539519099326e-06, + "loss": 0.8565, + "step": 15935 + }, + { + "epoch": 0.7948129675810474, + "grad_norm": 0.6406904738826442, + "learning_rate": 5.322903831150994e-06, + "loss": 0.8533, + "step": 15936 + }, + { + "epoch": 0.7948628428927681, + "grad_norm": 0.6817816535396353, + "learning_rate": 5.320412984773748e-06, + "loss": 0.8591, + "step": 15937 + }, + { + "epoch": 0.7949127182044888, + "grad_norm": 0.8986227920121597, + "learning_rate": 5.3179226519265105e-06, + "loss": 0.8563, + "step": 15938 + }, + { + "epoch": 0.7949625935162095, + "grad_norm": 0.5581859110425957, + "learning_rate": 5.31543283267428e-06, + "loss": 0.822, + "step": 15939 + }, + { + "epoch": 0.7950124688279302, + "grad_norm": 5.7675870159716265, + "learning_rate": 5.31294352708202e-06, + "loss": 0.844, + "step": 15940 + }, + { + "epoch": 0.7950623441396508, + "grad_norm": 0.5994479611043722, + "learning_rate": 5.31045473521469e-06, + "loss": 0.8727, + "step": 15941 + }, + { + "epoch": 0.7951122194513716, + "grad_norm": 0.5630047367820267, + "learning_rate": 5.30796645713722e-06, + "loss": 0.8431, + "step": 15942 + }, + { + "epoch": 0.7951620947630923, + "grad_norm": 0.6358686738640684, + "learning_rate": 5.305478692914565e-06, + "loss": 0.8644, + "step": 15943 + }, + { + "epoch": 0.7952119700748129, + "grad_norm": 0.5644434939679621, + "learning_rate": 5.302991442611624e-06, + "loss": 0.8818, + "step": 15944 + }, + { + "epoch": 0.7952618453865337, + "grad_norm": 0.5626938819424391, + "learning_rate": 5.3005047062933074e-06, + "loss": 0.8324, + "step": 15945 + }, + { + "epoch": 0.7953117206982544, + "grad_norm": 0.601299601119539, + "learning_rate": 5.2980184840245e-06, + "loss": 0.8516, + "step": 15946 + }, + { + "epoch": 0.795361596009975, + "grad_norm": 0.6262048054452837, + "learning_rate": 5.295532775870088e-06, + "loss": 0.8306, + "step": 15947 + }, + { + "epoch": 0.7954114713216958, + "grad_norm": 0.519650735736843, + "learning_rate": 5.293047581894931e-06, + "loss": 0.8507, + "step": 15948 + }, + { + "epoch": 0.7954613466334165, + "grad_norm": 0.496109893668122, + "learning_rate": 5.290562902163876e-06, + "loss": 0.836, + "step": 15949 + }, + { + "epoch": 0.7955112219451371, + "grad_norm": 1.1802824485645065, + "learning_rate": 5.288078736741753e-06, + "loss": 0.8825, + "step": 15950 + }, + { + "epoch": 0.7955610972568579, + "grad_norm": 0.6009419499113007, + "learning_rate": 5.28559508569341e-06, + "loss": 0.7944, + "step": 15951 + }, + { + "epoch": 0.7956109725685786, + "grad_norm": 0.669866408497758, + "learning_rate": 5.2831119490836245e-06, + "loss": 0.8567, + "step": 15952 + }, + { + "epoch": 0.7956608478802992, + "grad_norm": 0.5063912445821273, + "learning_rate": 5.280629326977216e-06, + "loss": 0.8141, + "step": 15953 + }, + { + "epoch": 0.79571072319202, + "grad_norm": 0.6213282497488194, + "learning_rate": 5.2781472194389534e-06, + "loss": 0.8328, + "step": 15954 + }, + { + "epoch": 0.7957605985037407, + "grad_norm": 0.5404546966267738, + "learning_rate": 5.275665626533627e-06, + "loss": 0.8719, + "step": 15955 + }, + { + "epoch": 0.7958104738154613, + "grad_norm": 0.6596390422169024, + "learning_rate": 5.273184548325965e-06, + "loss": 0.8087, + "step": 15956 + }, + { + "epoch": 0.7958603491271821, + "grad_norm": 0.6363333599886198, + "learning_rate": 5.270703984880729e-06, + "loss": 0.8664, + "step": 15957 + }, + { + "epoch": 0.7959102244389027, + "grad_norm": 0.7397321947376804, + "learning_rate": 5.268223936262636e-06, + "loss": 0.8473, + "step": 15958 + }, + { + "epoch": 0.7959600997506234, + "grad_norm": 0.5857524393900775, + "learning_rate": 5.265744402536424e-06, + "loss": 0.844, + "step": 15959 + }, + { + "epoch": 0.7960099750623442, + "grad_norm": 0.49615703794417926, + "learning_rate": 5.263265383766766e-06, + "loss": 0.8225, + "step": 15960 + }, + { + "epoch": 0.7960598503740648, + "grad_norm": 0.5174120445522535, + "learning_rate": 5.26078688001837e-06, + "loss": 0.8429, + "step": 15961 + }, + { + "epoch": 0.7961097256857855, + "grad_norm": 0.5741280192137607, + "learning_rate": 5.258308891355909e-06, + "loss": 0.8458, + "step": 15962 + }, + { + "epoch": 0.7961596009975063, + "grad_norm": 0.5855087691638298, + "learning_rate": 5.25583141784404e-06, + "loss": 0.8127, + "step": 15963 + }, + { + "epoch": 0.7962094763092269, + "grad_norm": 0.5468796278467951, + "learning_rate": 5.253354459547407e-06, + "loss": 0.8931, + "step": 15964 + }, + { + "epoch": 0.7962593516209476, + "grad_norm": 0.5133198206896049, + "learning_rate": 5.250878016530661e-06, + "loss": 0.8386, + "step": 15965 + }, + { + "epoch": 0.7963092269326684, + "grad_norm": 0.8016733265597579, + "learning_rate": 5.2484020888584115e-06, + "loss": 0.8313, + "step": 15966 + }, + { + "epoch": 0.796359102244389, + "grad_norm": 0.5214505895979337, + "learning_rate": 5.2459266765952726e-06, + "loss": 0.8484, + "step": 15967 + }, + { + "epoch": 0.7964089775561097, + "grad_norm": 0.5564149638604513, + "learning_rate": 5.243451779805828e-06, + "loss": 0.8315, + "step": 15968 + }, + { + "epoch": 0.7964588528678305, + "grad_norm": 0.5974178771005682, + "learning_rate": 5.240977398554673e-06, + "loss": 0.8735, + "step": 15969 + }, + { + "epoch": 0.7965087281795511, + "grad_norm": 0.6051489226054515, + "learning_rate": 5.238503532906372e-06, + "loss": 0.8709, + "step": 15970 + }, + { + "epoch": 0.7965586034912718, + "grad_norm": 0.5800553107746566, + "learning_rate": 5.236030182925475e-06, + "loss": 0.8512, + "step": 15971 + }, + { + "epoch": 0.7966084788029926, + "grad_norm": 0.49508602309441657, + "learning_rate": 5.233557348676521e-06, + "loss": 0.8467, + "step": 15972 + }, + { + "epoch": 0.7966583541147132, + "grad_norm": 0.5382520139377721, + "learning_rate": 5.231085030224045e-06, + "loss": 0.8387, + "step": 15973 + }, + { + "epoch": 0.7967082294264339, + "grad_norm": 0.5545722186440623, + "learning_rate": 5.228613227632562e-06, + "loss": 0.8364, + "step": 15974 + }, + { + "epoch": 0.7967581047381546, + "grad_norm": 0.6119455151229823, + "learning_rate": 5.2261419409665645e-06, + "loss": 0.8507, + "step": 15975 + }, + { + "epoch": 0.7968079800498753, + "grad_norm": 0.7510047905201686, + "learning_rate": 5.22367117029054e-06, + "loss": 0.8444, + "step": 15976 + }, + { + "epoch": 0.796857855361596, + "grad_norm": 0.6681029336485657, + "learning_rate": 5.22120091566897e-06, + "loss": 0.8418, + "step": 15977 + }, + { + "epoch": 0.7969077306733167, + "grad_norm": 0.5609921606004039, + "learning_rate": 5.218731177166311e-06, + "loss": 0.8148, + "step": 15978 + }, + { + "epoch": 0.7969576059850374, + "grad_norm": 0.4801743732598837, + "learning_rate": 5.216261954847002e-06, + "loss": 0.8805, + "step": 15979 + }, + { + "epoch": 0.7970074812967581, + "grad_norm": 0.6481444750322195, + "learning_rate": 5.213793248775489e-06, + "loss": 0.7967, + "step": 15980 + }, + { + "epoch": 0.7970573566084788, + "grad_norm": 0.6510521965422155, + "learning_rate": 5.211325059016189e-06, + "loss": 0.8372, + "step": 15981 + }, + { + "epoch": 0.7971072319201995, + "grad_norm": 0.6313286207855392, + "learning_rate": 5.208857385633503e-06, + "loss": 0.8196, + "step": 15982 + }, + { + "epoch": 0.7971571072319202, + "grad_norm": 0.652327074567291, + "learning_rate": 5.206390228691821e-06, + "loss": 0.8452, + "step": 15983 + }, + { + "epoch": 0.7972069825436409, + "grad_norm": 0.5778759195471093, + "learning_rate": 5.203923588255538e-06, + "loss": 0.8181, + "step": 15984 + }, + { + "epoch": 0.7972568578553616, + "grad_norm": 0.6390139125743425, + "learning_rate": 5.201457464388998e-06, + "loss": 0.8572, + "step": 15985 + }, + { + "epoch": 0.7973067331670823, + "grad_norm": 0.5363501559678542, + "learning_rate": 5.198991857156571e-06, + "loss": 0.8383, + "step": 15986 + }, + { + "epoch": 0.797356608478803, + "grad_norm": 0.5261069641977062, + "learning_rate": 5.196526766622584e-06, + "loss": 0.8594, + "step": 15987 + }, + { + "epoch": 0.7974064837905237, + "grad_norm": 0.6209414338028402, + "learning_rate": 5.194062192851382e-06, + "loss": 0.856, + "step": 15988 + }, + { + "epoch": 0.7974563591022444, + "grad_norm": 0.5972977094851492, + "learning_rate": 5.1915981359072495e-06, + "loss": 0.8352, + "step": 15989 + }, + { + "epoch": 0.7975062344139651, + "grad_norm": 0.5604152413015461, + "learning_rate": 5.189134595854503e-06, + "loss": 0.8394, + "step": 15990 + }, + { + "epoch": 0.7975561097256858, + "grad_norm": 0.5558613995170085, + "learning_rate": 5.186671572757418e-06, + "loss": 0.8771, + "step": 15991 + }, + { + "epoch": 0.7976059850374064, + "grad_norm": 0.8075331171233561, + "learning_rate": 5.184209066680284e-06, + "loss": 0.8383, + "step": 15992 + }, + { + "epoch": 0.7976558603491272, + "grad_norm": 0.5741848431073583, + "learning_rate": 5.181747077687332e-06, + "loss": 0.8295, + "step": 15993 + }, + { + "epoch": 0.7977057356608479, + "grad_norm": 0.5480978475230696, + "learning_rate": 5.179285605842829e-06, + "loss": 0.834, + "step": 15994 + }, + { + "epoch": 0.7977556109725685, + "grad_norm": 0.554217304700327, + "learning_rate": 5.17682465121099e-06, + "loss": 0.8279, + "step": 15995 + }, + { + "epoch": 0.7978054862842893, + "grad_norm": 0.47945827338358754, + "learning_rate": 5.1743642138560535e-06, + "loss": 0.8472, + "step": 15996 + }, + { + "epoch": 0.79785536159601, + "grad_norm": 0.48435220334414836, + "learning_rate": 5.171904293842197e-06, + "loss": 0.822, + "step": 15997 + }, + { + "epoch": 0.7979052369077306, + "grad_norm": 0.5858192038879616, + "learning_rate": 5.16944489123363e-06, + "loss": 0.8681, + "step": 15998 + }, + { + "epoch": 0.7979551122194514, + "grad_norm": 0.6923481903295305, + "learning_rate": 5.166986006094524e-06, + "loss": 0.8532, + "step": 15999 + }, + { + "epoch": 0.7980049875311721, + "grad_norm": 0.6556541293598841, + "learning_rate": 5.164527638489042e-06, + "loss": 0.8035, + "step": 16000 + }, + { + "epoch": 0.7980548628428927, + "grad_norm": 0.5262179592440632, + "learning_rate": 5.1620697884813275e-06, + "loss": 0.8635, + "step": 16001 + }, + { + "epoch": 0.7981047381546135, + "grad_norm": 0.5343493680264378, + "learning_rate": 5.1596124561355295e-06, + "loss": 0.8949, + "step": 16002 + }, + { + "epoch": 0.7981546134663342, + "grad_norm": 0.8324117891481724, + "learning_rate": 5.157155641515765e-06, + "loss": 0.8451, + "step": 16003 + }, + { + "epoch": 0.7982044887780548, + "grad_norm": 0.5418440829593294, + "learning_rate": 5.1546993446861425e-06, + "loss": 0.8543, + "step": 16004 + }, + { + "epoch": 0.7982543640897756, + "grad_norm": 0.7374723364239697, + "learning_rate": 5.152243565710755e-06, + "loss": 0.8546, + "step": 16005 + }, + { + "epoch": 0.7983042394014963, + "grad_norm": 0.9155199204586493, + "learning_rate": 5.149788304653694e-06, + "loss": 0.8651, + "step": 16006 + }, + { + "epoch": 0.7983541147132169, + "grad_norm": 0.5641949939332661, + "learning_rate": 5.147333561579023e-06, + "loss": 0.8735, + "step": 16007 + }, + { + "epoch": 0.7984039900249377, + "grad_norm": 0.6096046299617724, + "learning_rate": 5.144879336550801e-06, + "loss": 0.8491, + "step": 16008 + }, + { + "epoch": 0.7984538653366584, + "grad_norm": 0.6492724729051582, + "learning_rate": 5.142425629633058e-06, + "loss": 0.8666, + "step": 16009 + }, + { + "epoch": 0.798503740648379, + "grad_norm": 0.676323110233747, + "learning_rate": 5.139972440889837e-06, + "loss": 0.8476, + "step": 16010 + }, + { + "epoch": 0.7985536159600998, + "grad_norm": 0.5215459435519245, + "learning_rate": 5.137519770385147e-06, + "loss": 0.8693, + "step": 16011 + }, + { + "epoch": 0.7986034912718204, + "grad_norm": 0.5793428791555045, + "learning_rate": 5.13506761818299e-06, + "loss": 0.8134, + "step": 16012 + }, + { + "epoch": 0.7986533665835411, + "grad_norm": 0.6234114564807591, + "learning_rate": 5.132615984347347e-06, + "loss": 0.8414, + "step": 16013 + }, + { + "epoch": 0.7987032418952619, + "grad_norm": 0.4481389040344173, + "learning_rate": 5.130164868942203e-06, + "loss": 0.805, + "step": 16014 + }, + { + "epoch": 0.7987531172069825, + "grad_norm": 0.5187220924517437, + "learning_rate": 5.1277142720315166e-06, + "loss": 0.8194, + "step": 16015 + }, + { + "epoch": 0.7988029925187032, + "grad_norm": 0.7267809530933363, + "learning_rate": 5.125264193679225e-06, + "loss": 0.8577, + "step": 16016 + }, + { + "epoch": 0.798852867830424, + "grad_norm": 0.5606414175216701, + "learning_rate": 5.122814633949274e-06, + "loss": 0.8538, + "step": 16017 + }, + { + "epoch": 0.7989027431421446, + "grad_norm": 0.5956224367716454, + "learning_rate": 5.120365592905582e-06, + "loss": 0.8474, + "step": 16018 + }, + { + "epoch": 0.7989526184538653, + "grad_norm": 0.7087077691394738, + "learning_rate": 5.1179170706120525e-06, + "loss": 0.8413, + "step": 16019 + }, + { + "epoch": 0.7990024937655861, + "grad_norm": 0.5055527302987493, + "learning_rate": 5.115469067132573e-06, + "loss": 0.8532, + "step": 16020 + }, + { + "epoch": 0.7990523690773067, + "grad_norm": 1.0109910489313614, + "learning_rate": 5.113021582531041e-06, + "loss": 0.8341, + "step": 16021 + }, + { + "epoch": 0.7991022443890274, + "grad_norm": 0.6141128736699472, + "learning_rate": 5.110574616871297e-06, + "loss": 0.861, + "step": 16022 + }, + { + "epoch": 0.7991521197007482, + "grad_norm": 0.6750081155449043, + "learning_rate": 5.108128170217216e-06, + "loss": 0.8697, + "step": 16023 + }, + { + "epoch": 0.7992019950124688, + "grad_norm": 0.6178020753176029, + "learning_rate": 5.105682242632617e-06, + "loss": 0.8593, + "step": 16024 + }, + { + "epoch": 0.7992518703241895, + "grad_norm": 0.45960310164902635, + "learning_rate": 5.103236834181355e-06, + "loss": 0.8205, + "step": 16025 + }, + { + "epoch": 0.7993017456359103, + "grad_norm": 1.090142461278679, + "learning_rate": 5.100791944927205e-06, + "loss": 0.8679, + "step": 16026 + }, + { + "epoch": 0.7993516209476309, + "grad_norm": 1.1678028552195425, + "learning_rate": 5.098347574933993e-06, + "loss": 0.8738, + "step": 16027 + }, + { + "epoch": 0.7994014962593516, + "grad_norm": 0.5840094665446446, + "learning_rate": 5.095903724265488e-06, + "loss": 0.8493, + "step": 16028 + }, + { + "epoch": 0.7994513715710723, + "grad_norm": 0.5670387039597107, + "learning_rate": 5.09346039298548e-06, + "loss": 0.7762, + "step": 16029 + }, + { + "epoch": 0.799501246882793, + "grad_norm": 0.507052533394056, + "learning_rate": 5.091017581157697e-06, + "loss": 0.8625, + "step": 16030 + }, + { + "epoch": 0.7995511221945137, + "grad_norm": 0.6730024774467406, + "learning_rate": 5.088575288845912e-06, + "loss": 0.8458, + "step": 16031 + }, + { + "epoch": 0.7996009975062344, + "grad_norm": 0.7264948942045467, + "learning_rate": 5.0861335161138416e-06, + "loss": 0.8367, + "step": 16032 + }, + { + "epoch": 0.7996508728179551, + "grad_norm": 0.5712510934237018, + "learning_rate": 5.083692263025206e-06, + "loss": 0.8794, + "step": 16033 + }, + { + "epoch": 0.7997007481296758, + "grad_norm": 0.546951374979806, + "learning_rate": 5.081251529643699e-06, + "loss": 0.8662, + "step": 16034 + }, + { + "epoch": 0.7997506234413965, + "grad_norm": 2.343160879043108, + "learning_rate": 5.0788113160330245e-06, + "loss": 0.8676, + "step": 16035 + }, + { + "epoch": 0.7998004987531172, + "grad_norm": 0.6303644837138616, + "learning_rate": 5.076371622256856e-06, + "loss": 0.8323, + "step": 16036 + }, + { + "epoch": 0.7998503740648379, + "grad_norm": 0.5196931510332143, + "learning_rate": 5.073932448378851e-06, + "loss": 0.846, + "step": 16037 + }, + { + "epoch": 0.7999002493765586, + "grad_norm": 0.5555808919220749, + "learning_rate": 5.071493794462656e-06, + "loss": 0.8574, + "step": 16038 + }, + { + "epoch": 0.7999501246882793, + "grad_norm": 0.5473508413618636, + "learning_rate": 5.06905566057192e-06, + "loss": 0.8687, + "step": 16039 + }, + { + "epoch": 0.8, + "grad_norm": 0.607655171828121, + "learning_rate": 5.066618046770252e-06, + "loss": 0.8389, + "step": 16040 + }, + { + "epoch": 0.8000498753117207, + "grad_norm": 0.48500422332621196, + "learning_rate": 5.06418095312127e-06, + "loss": 0.8227, + "step": 16041 + }, + { + "epoch": 0.8000997506234414, + "grad_norm": 0.5490715739709193, + "learning_rate": 5.061744379688554e-06, + "loss": 0.84, + "step": 16042 + }, + { + "epoch": 0.8001496259351621, + "grad_norm": 0.5832136587785528, + "learning_rate": 5.0593083265357025e-06, + "loss": 0.8532, + "step": 16043 + }, + { + "epoch": 0.8001995012468828, + "grad_norm": 0.5546009349438602, + "learning_rate": 5.056872793726275e-06, + "loss": 0.8885, + "step": 16044 + }, + { + "epoch": 0.8002493765586035, + "grad_norm": 0.5506391177516777, + "learning_rate": 5.054437781323828e-06, + "loss": 0.8351, + "step": 16045 + }, + { + "epoch": 0.8002992518703241, + "grad_norm": 0.7038877106141948, + "learning_rate": 5.052003289391893e-06, + "loss": 0.8363, + "step": 16046 + }, + { + "epoch": 0.8003491271820449, + "grad_norm": 0.8597505434602446, + "learning_rate": 5.049569317994013e-06, + "loss": 0.8656, + "step": 16047 + }, + { + "epoch": 0.8003990024937656, + "grad_norm": 0.5173487981558847, + "learning_rate": 5.047135867193692e-06, + "loss": 0.8727, + "step": 16048 + }, + { + "epoch": 0.8004488778054862, + "grad_norm": 0.8486407315300866, + "learning_rate": 5.04470293705443e-06, + "loss": 0.845, + "step": 16049 + }, + { + "epoch": 0.800498753117207, + "grad_norm": 0.6177067092932936, + "learning_rate": 5.042270527639708e-06, + "loss": 0.8701, + "step": 16050 + }, + { + "epoch": 0.8005486284289277, + "grad_norm": 0.5063842842596697, + "learning_rate": 5.039838639013012e-06, + "loss": 0.8306, + "step": 16051 + }, + { + "epoch": 0.8005985037406483, + "grad_norm": 1.2607554727406516, + "learning_rate": 5.037407271237793e-06, + "loss": 0.8235, + "step": 16052 + }, + { + "epoch": 0.8006483790523691, + "grad_norm": 0.6086847265007561, + "learning_rate": 5.034976424377491e-06, + "loss": 0.8526, + "step": 16053 + }, + { + "epoch": 0.8006982543640898, + "grad_norm": 0.5386331418336119, + "learning_rate": 5.03254609849555e-06, + "loss": 0.884, + "step": 16054 + }, + { + "epoch": 0.8007481296758104, + "grad_norm": 0.5765899416000634, + "learning_rate": 5.030116293655382e-06, + "loss": 0.8374, + "step": 16055 + }, + { + "epoch": 0.8007980049875312, + "grad_norm": 0.5148224694924664, + "learning_rate": 5.027687009920393e-06, + "loss": 0.8484, + "step": 16056 + }, + { + "epoch": 0.8008478802992519, + "grad_norm": 0.6235960953628784, + "learning_rate": 5.025258247353967e-06, + "loss": 0.8274, + "step": 16057 + }, + { + "epoch": 0.8008977556109725, + "grad_norm": 0.5021131041503581, + "learning_rate": 5.0228300060195e-06, + "loss": 0.8578, + "step": 16058 + }, + { + "epoch": 0.8009476309226933, + "grad_norm": 0.519515087787441, + "learning_rate": 5.0204022859803305e-06, + "loss": 0.8575, + "step": 16059 + }, + { + "epoch": 0.800997506234414, + "grad_norm": 0.6673383454169368, + "learning_rate": 5.017975087299825e-06, + "loss": 0.8576, + "step": 16060 + }, + { + "epoch": 0.8010473815461346, + "grad_norm": 0.6315873369689974, + "learning_rate": 5.0155484100413155e-06, + "loss": 0.8927, + "step": 16061 + }, + { + "epoch": 0.8010972568578554, + "grad_norm": 0.4563181795028277, + "learning_rate": 5.013122254268135e-06, + "loss": 0.8231, + "step": 16062 + }, + { + "epoch": 0.8011471321695761, + "grad_norm": 0.5325219957317838, + "learning_rate": 5.010696620043573e-06, + "loss": 0.8094, + "step": 16063 + }, + { + "epoch": 0.8011970074812967, + "grad_norm": 0.4914318102194506, + "learning_rate": 5.008271507430945e-06, + "loss": 0.8319, + "step": 16064 + }, + { + "epoch": 0.8012468827930175, + "grad_norm": 0.5218441157328189, + "learning_rate": 5.005846916493514e-06, + "loss": 0.7893, + "step": 16065 + }, + { + "epoch": 0.8012967581047381, + "grad_norm": 0.8253104337230559, + "learning_rate": 5.003422847294578e-06, + "loss": 0.8523, + "step": 16066 + }, + { + "epoch": 0.8013466334164588, + "grad_norm": 0.6291831847044917, + "learning_rate": 5.000999299897358e-06, + "loss": 0.8496, + "step": 16067 + }, + { + "epoch": 0.8013965087281796, + "grad_norm": 0.5150501111765092, + "learning_rate": 4.998576274365116e-06, + "loss": 0.8292, + "step": 16068 + }, + { + "epoch": 0.8014463840399002, + "grad_norm": 0.7082593278369996, + "learning_rate": 4.996153770761075e-06, + "loss": 0.821, + "step": 16069 + }, + { + "epoch": 0.8014962593516209, + "grad_norm": 0.8635340017236007, + "learning_rate": 4.993731789148451e-06, + "loss": 0.8361, + "step": 16070 + }, + { + "epoch": 0.8015461346633417, + "grad_norm": 0.5751653460671634, + "learning_rate": 4.9913103295904316e-06, + "loss": 0.8851, + "step": 16071 + }, + { + "epoch": 0.8015960099750623, + "grad_norm": 0.5621913533964649, + "learning_rate": 4.988889392150226e-06, + "loss": 0.8864, + "step": 16072 + }, + { + "epoch": 0.801645885286783, + "grad_norm": 0.5652459295633956, + "learning_rate": 4.986468976890993e-06, + "loss": 0.8508, + "step": 16073 + }, + { + "epoch": 0.8016957605985038, + "grad_norm": 0.47929059401267365, + "learning_rate": 4.984049083875897e-06, + "loss": 0.7988, + "step": 16074 + }, + { + "epoch": 0.8017456359102244, + "grad_norm": 0.5589958050530106, + "learning_rate": 4.981629713168074e-06, + "loss": 0.7785, + "step": 16075 + }, + { + "epoch": 0.8017955112219451, + "grad_norm": 0.5641067291254556, + "learning_rate": 4.979210864830672e-06, + "loss": 0.8376, + "step": 16076 + }, + { + "epoch": 0.8018453865336659, + "grad_norm": 0.5130438116022482, + "learning_rate": 4.9767925389268e-06, + "loss": 0.8678, + "step": 16077 + }, + { + "epoch": 0.8018952618453865, + "grad_norm": 0.6493239252823394, + "learning_rate": 4.974374735519569e-06, + "loss": 0.8639, + "step": 16078 + }, + { + "epoch": 0.8019451371571072, + "grad_norm": 0.5197820785261626, + "learning_rate": 4.971957454672058e-06, + "loss": 0.7902, + "step": 16079 + }, + { + "epoch": 0.801995012468828, + "grad_norm": 0.8379943894394982, + "learning_rate": 4.969540696447361e-06, + "loss": 0.8621, + "step": 16080 + }, + { + "epoch": 0.8020448877805486, + "grad_norm": 0.5094610292608042, + "learning_rate": 4.967124460908534e-06, + "loss": 0.8879, + "step": 16081 + }, + { + "epoch": 0.8020947630922693, + "grad_norm": 0.879545306483551, + "learning_rate": 4.9647087481186296e-06, + "loss": 0.816, + "step": 16082 + }, + { + "epoch": 0.80214463840399, + "grad_norm": 0.7373477320498714, + "learning_rate": 4.962293558140679e-06, + "loss": 0.8827, + "step": 16083 + }, + { + "epoch": 0.8021945137157107, + "grad_norm": 0.5070633471245412, + "learning_rate": 4.9598788910377145e-06, + "loss": 0.8148, + "step": 16084 + }, + { + "epoch": 0.8022443890274314, + "grad_norm": 0.6862936834603726, + "learning_rate": 4.957464746872742e-06, + "loss": 0.8125, + "step": 16085 + }, + { + "epoch": 0.8022942643391521, + "grad_norm": 0.6724086580429973, + "learning_rate": 4.95505112570876e-06, + "loss": 0.8175, + "step": 16086 + }, + { + "epoch": 0.8023441396508728, + "grad_norm": 0.5298799778443636, + "learning_rate": 4.952638027608736e-06, + "loss": 0.8212, + "step": 16087 + }, + { + "epoch": 0.8023940149625935, + "grad_norm": 0.5198431609859316, + "learning_rate": 4.950225452635663e-06, + "loss": 0.8546, + "step": 16088 + }, + { + "epoch": 0.8024438902743142, + "grad_norm": 0.5291539893554129, + "learning_rate": 4.94781340085248e-06, + "loss": 0.8579, + "step": 16089 + }, + { + "epoch": 0.8024937655860349, + "grad_norm": 0.5720356331304167, + "learning_rate": 4.945401872322131e-06, + "loss": 0.8445, + "step": 16090 + }, + { + "epoch": 0.8025436408977557, + "grad_norm": 0.4851992112658712, + "learning_rate": 4.942990867107547e-06, + "loss": 0.8335, + "step": 16091 + }, + { + "epoch": 0.8025935162094763, + "grad_norm": 1.199384541028505, + "learning_rate": 4.940580385271634e-06, + "loss": 0.8833, + "step": 16092 + }, + { + "epoch": 0.802643391521197, + "grad_norm": 0.42929117528211985, + "learning_rate": 4.938170426877303e-06, + "loss": 0.8428, + "step": 16093 + }, + { + "epoch": 0.8026932668329178, + "grad_norm": 0.6664248943785437, + "learning_rate": 4.935760991987431e-06, + "loss": 0.8335, + "step": 16094 + }, + { + "epoch": 0.8027431421446384, + "grad_norm": 0.49961617886882936, + "learning_rate": 4.9333520806649106e-06, + "loss": 0.872, + "step": 16095 + }, + { + "epoch": 0.8027930174563591, + "grad_norm": 0.5465533324632227, + "learning_rate": 4.930943692972573e-06, + "loss": 0.8571, + "step": 16096 + }, + { + "epoch": 0.8028428927680799, + "grad_norm": 0.5497762205893678, + "learning_rate": 4.928535828973283e-06, + "loss": 0.8078, + "step": 16097 + }, + { + "epoch": 0.8028927680798005, + "grad_norm": 0.5549809712571753, + "learning_rate": 4.926128488729861e-06, + "loss": 0.8889, + "step": 16098 + }, + { + "epoch": 0.8029426433915212, + "grad_norm": 0.5835748645009896, + "learning_rate": 4.9237216723051485e-06, + "loss": 0.8175, + "step": 16099 + }, + { + "epoch": 0.8029925187032418, + "grad_norm": 0.6171592236883362, + "learning_rate": 4.921315379761918e-06, + "loss": 0.8446, + "step": 16100 + }, + { + "epoch": 0.8030423940149626, + "grad_norm": 0.5533617238882966, + "learning_rate": 4.918909611162983e-06, + "loss": 0.8409, + "step": 16101 + }, + { + "epoch": 0.8030922693266833, + "grad_norm": 1.2058198488731426, + "learning_rate": 4.916504366571112e-06, + "loss": 0.8837, + "step": 16102 + }, + { + "epoch": 0.8031421446384039, + "grad_norm": 0.5273686670659059, + "learning_rate": 4.91409964604908e-06, + "loss": 0.8782, + "step": 16103 + }, + { + "epoch": 0.8031920199501247, + "grad_norm": 0.5498254136704123, + "learning_rate": 4.911695449659617e-06, + "loss": 0.8221, + "step": 16104 + }, + { + "epoch": 0.8032418952618454, + "grad_norm": 0.5086154279170241, + "learning_rate": 4.9092917774654775e-06, + "loss": 0.8761, + "step": 16105 + }, + { + "epoch": 0.803291770573566, + "grad_norm": 0.5279068893484044, + "learning_rate": 4.906888629529377e-06, + "loss": 0.8213, + "step": 16106 + }, + { + "epoch": 0.8033416458852868, + "grad_norm": 0.483162190522271, + "learning_rate": 4.9044860059140275e-06, + "loss": 0.8143, + "step": 16107 + }, + { + "epoch": 0.8033915211970075, + "grad_norm": 0.5694033392459942, + "learning_rate": 4.902083906682111e-06, + "loss": 0.8548, + "step": 16108 + }, + { + "epoch": 0.8034413965087281, + "grad_norm": 0.6203923886478677, + "learning_rate": 4.899682331896332e-06, + "loss": 0.8677, + "step": 16109 + }, + { + "epoch": 0.8034912718204489, + "grad_norm": 0.5995863725185319, + "learning_rate": 4.897281281619345e-06, + "loss": 0.8295, + "step": 16110 + }, + { + "epoch": 0.8035411471321696, + "grad_norm": 0.5222949040208367, + "learning_rate": 4.8948807559138055e-06, + "loss": 0.8235, + "step": 16111 + }, + { + "epoch": 0.8035910224438902, + "grad_norm": 0.5954947476035815, + "learning_rate": 4.892480754842346e-06, + "loss": 0.8877, + "step": 16112 + }, + { + "epoch": 0.803640897755611, + "grad_norm": 0.5619956688159721, + "learning_rate": 4.890081278467612e-06, + "loss": 0.8653, + "step": 16113 + }, + { + "epoch": 0.8036907730673317, + "grad_norm": 0.5028637533413141, + "learning_rate": 4.887682326852208e-06, + "loss": 0.8077, + "step": 16114 + }, + { + "epoch": 0.8037406483790523, + "grad_norm": 0.5082791937714715, + "learning_rate": 4.885283900058729e-06, + "loss": 0.8382, + "step": 16115 + }, + { + "epoch": 0.8037905236907731, + "grad_norm": 0.7129668572458479, + "learning_rate": 4.882885998149761e-06, + "loss": 0.8552, + "step": 16116 + }, + { + "epoch": 0.8038403990024937, + "grad_norm": 0.5635505445395054, + "learning_rate": 4.88048862118789e-06, + "loss": 0.8291, + "step": 16117 + }, + { + "epoch": 0.8038902743142144, + "grad_norm": 0.651517420611867, + "learning_rate": 4.878091769235654e-06, + "loss": 0.8722, + "step": 16118 + }, + { + "epoch": 0.8039401496259352, + "grad_norm": 0.6392960162445559, + "learning_rate": 4.875695442355613e-06, + "loss": 0.8931, + "step": 16119 + }, + { + "epoch": 0.8039900249376558, + "grad_norm": 0.5585681216733088, + "learning_rate": 4.873299640610285e-06, + "loss": 0.8427, + "step": 16120 + }, + { + "epoch": 0.8040399002493765, + "grad_norm": 0.5505445571057866, + "learning_rate": 4.870904364062204e-06, + "loss": 0.8361, + "step": 16121 + }, + { + "epoch": 0.8040897755610973, + "grad_norm": 0.6270833509839772, + "learning_rate": 4.8685096127738635e-06, + "loss": 0.8657, + "step": 16122 + }, + { + "epoch": 0.8041396508728179, + "grad_norm": 0.5725978285434679, + "learning_rate": 4.866115386807757e-06, + "loss": 0.8376, + "step": 16123 + }, + { + "epoch": 0.8041895261845386, + "grad_norm": 0.5159550268246521, + "learning_rate": 4.86372168622635e-06, + "loss": 0.8279, + "step": 16124 + }, + { + "epoch": 0.8042394014962594, + "grad_norm": 0.6000835481971575, + "learning_rate": 4.861328511092123e-06, + "loss": 0.8382, + "step": 16125 + }, + { + "epoch": 0.80428927680798, + "grad_norm": 0.5076428295851128, + "learning_rate": 4.858935861467512e-06, + "loss": 0.8443, + "step": 16126 + }, + { + "epoch": 0.8043391521197008, + "grad_norm": 0.7717356448928989, + "learning_rate": 4.856543737414956e-06, + "loss": 0.7911, + "step": 16127 + }, + { + "epoch": 0.8043890274314215, + "grad_norm": 0.55996053758128, + "learning_rate": 4.854152138996879e-06, + "loss": 0.8346, + "step": 16128 + }, + { + "epoch": 0.8044389027431421, + "grad_norm": 0.5320141260949801, + "learning_rate": 4.851761066275679e-06, + "loss": 0.8092, + "step": 16129 + }, + { + "epoch": 0.8044887780548629, + "grad_norm": 0.580323200120604, + "learning_rate": 4.849370519313761e-06, + "loss": 0.796, + "step": 16130 + }, + { + "epoch": 0.8045386533665836, + "grad_norm": 0.5600062522625537, + "learning_rate": 4.8469804981734975e-06, + "loss": 0.8708, + "step": 16131 + }, + { + "epoch": 0.8045885286783042, + "grad_norm": 0.5564417984819768, + "learning_rate": 4.84459100291727e-06, + "loss": 0.8159, + "step": 16132 + }, + { + "epoch": 0.804638403990025, + "grad_norm": 0.6004521498111234, + "learning_rate": 4.842202033607407e-06, + "loss": 0.8599, + "step": 16133 + }, + { + "epoch": 0.8046882793017457, + "grad_norm": 0.6818992657716226, + "learning_rate": 4.839813590306266e-06, + "loss": 0.8578, + "step": 16134 + }, + { + "epoch": 0.8047381546134663, + "grad_norm": 0.5578556058034422, + "learning_rate": 4.83742567307616e-06, + "loss": 0.835, + "step": 16135 + }, + { + "epoch": 0.804788029925187, + "grad_norm": 0.5650509613180854, + "learning_rate": 4.835038281979423e-06, + "loss": 0.8396, + "step": 16136 + }, + { + "epoch": 0.8048379052369077, + "grad_norm": 0.662485225716704, + "learning_rate": 4.832651417078324e-06, + "loss": 0.8771, + "step": 16137 + }, + { + "epoch": 0.8048877805486284, + "grad_norm": 0.5453842644150273, + "learning_rate": 4.830265078435165e-06, + "loss": 0.8982, + "step": 16138 + }, + { + "epoch": 0.8049376558603492, + "grad_norm": 0.4974968331235742, + "learning_rate": 4.8278792661122115e-06, + "loss": 0.8557, + "step": 16139 + }, + { + "epoch": 0.8049875311720698, + "grad_norm": 0.4858316794425729, + "learning_rate": 4.8254939801717245e-06, + "loss": 0.8428, + "step": 16140 + }, + { + "epoch": 0.8050374064837905, + "grad_norm": 0.644142589732645, + "learning_rate": 4.823109220675934e-06, + "loss": 0.8254, + "step": 16141 + }, + { + "epoch": 0.8050872817955113, + "grad_norm": 0.5664780461944586, + "learning_rate": 4.820724987687084e-06, + "loss": 0.8493, + "step": 16142 + }, + { + "epoch": 0.8051371571072319, + "grad_norm": 0.6193041040204842, + "learning_rate": 4.818341281267386e-06, + "loss": 0.7936, + "step": 16143 + }, + { + "epoch": 0.8051870324189526, + "grad_norm": 0.5488509695428777, + "learning_rate": 4.81595810147904e-06, + "loss": 0.8388, + "step": 16144 + }, + { + "epoch": 0.8052369077306734, + "grad_norm": 0.5476122309687735, + "learning_rate": 4.8135754483842275e-06, + "loss": 0.8513, + "step": 16145 + }, + { + "epoch": 0.805286783042394, + "grad_norm": 0.579074304408384, + "learning_rate": 4.811193322045135e-06, + "loss": 0.8016, + "step": 16146 + }, + { + "epoch": 0.8053366583541147, + "grad_norm": 0.8883373881383061, + "learning_rate": 4.808811722523921e-06, + "loss": 0.81, + "step": 16147 + }, + { + "epoch": 0.8053865336658355, + "grad_norm": 0.6844271594632986, + "learning_rate": 4.8064306498827235e-06, + "loss": 0.8817, + "step": 16148 + }, + { + "epoch": 0.8054364089775561, + "grad_norm": 0.5624298030608914, + "learning_rate": 4.804050104183677e-06, + "loss": 0.7885, + "step": 16149 + }, + { + "epoch": 0.8054862842892768, + "grad_norm": 0.5820691712348928, + "learning_rate": 4.801670085488918e-06, + "loss": 0.8351, + "step": 16150 + }, + { + "epoch": 0.8055361596009976, + "grad_norm": 0.9512767137511318, + "learning_rate": 4.799290593860525e-06, + "loss": 0.8267, + "step": 16151 + }, + { + "epoch": 0.8055860349127182, + "grad_norm": 0.5296028758559287, + "learning_rate": 4.796911629360609e-06, + "loss": 0.8387, + "step": 16152 + }, + { + "epoch": 0.8056359102244389, + "grad_norm": 0.544685861408051, + "learning_rate": 4.794533192051237e-06, + "loss": 0.8613, + "step": 16153 + }, + { + "epoch": 0.8056857855361595, + "grad_norm": 0.5435822056150722, + "learning_rate": 4.79215528199449e-06, + "loss": 0.8816, + "step": 16154 + }, + { + "epoch": 0.8057356608478803, + "grad_norm": 0.5481961208927921, + "learning_rate": 4.789777899252395e-06, + "loss": 0.8327, + "step": 16155 + }, + { + "epoch": 0.805785536159601, + "grad_norm": 0.5822421252356017, + "learning_rate": 4.787401043887008e-06, + "loss": 0.8429, + "step": 16156 + }, + { + "epoch": 0.8058354114713216, + "grad_norm": 0.558141547939005, + "learning_rate": 4.785024715960337e-06, + "loss": 0.8787, + "step": 16157 + }, + { + "epoch": 0.8058852867830424, + "grad_norm": 0.5748267247579364, + "learning_rate": 4.782648915534407e-06, + "loss": 0.8257, + "step": 16158 + }, + { + "epoch": 0.8059351620947631, + "grad_norm": 0.6843677514218326, + "learning_rate": 4.780273642671204e-06, + "loss": 0.9137, + "step": 16159 + }, + { + "epoch": 0.8059850374064838, + "grad_norm": 0.5567829971722901, + "learning_rate": 4.777898897432711e-06, + "loss": 0.8386, + "step": 16160 + }, + { + "epoch": 0.8060349127182045, + "grad_norm": 0.5500328550690172, + "learning_rate": 4.775524679880891e-06, + "loss": 0.8234, + "step": 16161 + }, + { + "epoch": 0.8060847880299252, + "grad_norm": 0.521867916669559, + "learning_rate": 4.77315099007771e-06, + "loss": 0.8307, + "step": 16162 + }, + { + "epoch": 0.8061346633416459, + "grad_norm": 0.8329251117022081, + "learning_rate": 4.770777828085099e-06, + "loss": 0.8571, + "step": 16163 + }, + { + "epoch": 0.8061845386533666, + "grad_norm": 0.6195990373687746, + "learning_rate": 4.7684051939649895e-06, + "loss": 0.8389, + "step": 16164 + }, + { + "epoch": 0.8062344139650873, + "grad_norm": 0.6086343992508285, + "learning_rate": 4.76603308777929e-06, + "loss": 0.8287, + "step": 16165 + }, + { + "epoch": 0.806284289276808, + "grad_norm": 0.5757943627875207, + "learning_rate": 4.763661509589898e-06, + "loss": 0.9014, + "step": 16166 + }, + { + "epoch": 0.8063341645885287, + "grad_norm": 0.5949547578401634, + "learning_rate": 4.761290459458706e-06, + "loss": 0.8417, + "step": 16167 + }, + { + "epoch": 0.8063840399002494, + "grad_norm": 0.6379877839600724, + "learning_rate": 4.758919937447576e-06, + "loss": 0.855, + "step": 16168 + }, + { + "epoch": 0.80643391521197, + "grad_norm": 0.5125440826502815, + "learning_rate": 4.756549943618385e-06, + "loss": 0.8318, + "step": 16169 + }, + { + "epoch": 0.8064837905236908, + "grad_norm": 0.5285854215761037, + "learning_rate": 4.7541804780329494e-06, + "loss": 0.855, + "step": 16170 + }, + { + "epoch": 0.8065336658354114, + "grad_norm": 0.5125969475515656, + "learning_rate": 4.751811540753121e-06, + "loss": 0.8668, + "step": 16171 + }, + { + "epoch": 0.8065835411471322, + "grad_norm": 1.3582966854902163, + "learning_rate": 4.749443131840703e-06, + "loss": 0.8399, + "step": 16172 + }, + { + "epoch": 0.8066334164588529, + "grad_norm": 0.48740098827848, + "learning_rate": 4.747075251357513e-06, + "loss": 0.8588, + "step": 16173 + }, + { + "epoch": 0.8066832917705735, + "grad_norm": 0.6971439223230979, + "learning_rate": 4.744707899365322e-06, + "loss": 0.866, + "step": 16174 + }, + { + "epoch": 0.8067331670822943, + "grad_norm": 0.557510795932775, + "learning_rate": 4.742341075925916e-06, + "loss": 0.8795, + "step": 16175 + }, + { + "epoch": 0.806783042394015, + "grad_norm": 0.5640135217998875, + "learning_rate": 4.7399747811010555e-06, + "loss": 0.8361, + "step": 16176 + }, + { + "epoch": 0.8068329177057356, + "grad_norm": 0.4920817345327933, + "learning_rate": 4.737609014952485e-06, + "loss": 0.8533, + "step": 16177 + }, + { + "epoch": 0.8068827930174564, + "grad_norm": 0.4955318959822389, + "learning_rate": 4.7352437775419316e-06, + "loss": 0.8117, + "step": 16178 + }, + { + "epoch": 0.8069326683291771, + "grad_norm": 0.6303098495547145, + "learning_rate": 4.732879068931131e-06, + "loss": 0.8397, + "step": 16179 + }, + { + "epoch": 0.8069825436408977, + "grad_norm": 0.48738407444957255, + "learning_rate": 4.730514889181781e-06, + "loss": 0.849, + "step": 16180 + }, + { + "epoch": 0.8070324189526185, + "grad_norm": 0.5786087449129386, + "learning_rate": 4.728151238355571e-06, + "loss": 0.8761, + "step": 16181 + }, + { + "epoch": 0.8070822942643392, + "grad_norm": 0.5336936857621126, + "learning_rate": 4.725788116514176e-06, + "loss": 0.901, + "step": 16182 + }, + { + "epoch": 0.8071321695760598, + "grad_norm": 0.5667872246778614, + "learning_rate": 4.723425523719274e-06, + "loss": 0.8172, + "step": 16183 + }, + { + "epoch": 0.8071820448877806, + "grad_norm": 1.231620651531035, + "learning_rate": 4.7210634600325096e-06, + "loss": 0.8478, + "step": 16184 + }, + { + "epoch": 0.8072319201995013, + "grad_norm": 0.5621940654901286, + "learning_rate": 4.718701925515517e-06, + "loss": 0.8342, + "step": 16185 + }, + { + "epoch": 0.8072817955112219, + "grad_norm": 0.5616106402230941, + "learning_rate": 4.716340920229914e-06, + "loss": 0.8337, + "step": 16186 + }, + { + "epoch": 0.8073316708229427, + "grad_norm": 0.7644153755199717, + "learning_rate": 4.713980444237328e-06, + "loss": 0.8804, + "step": 16187 + }, + { + "epoch": 0.8073815461346634, + "grad_norm": 0.5510625933057287, + "learning_rate": 4.711620497599334e-06, + "loss": 0.8453, + "step": 16188 + }, + { + "epoch": 0.807431421446384, + "grad_norm": 0.5856715217738724, + "learning_rate": 4.7092610803775304e-06, + "loss": 0.8108, + "step": 16189 + }, + { + "epoch": 0.8074812967581048, + "grad_norm": 0.5059234475876843, + "learning_rate": 4.70690219263347e-06, + "loss": 0.8344, + "step": 16190 + }, + { + "epoch": 0.8075311720698254, + "grad_norm": 0.5381458687296726, + "learning_rate": 4.7045438344287266e-06, + "loss": 0.8125, + "step": 16191 + }, + { + "epoch": 0.8075810473815461, + "grad_norm": 0.6692861619300504, + "learning_rate": 4.702186005824816e-06, + "loss": 0.8647, + "step": 16192 + }, + { + "epoch": 0.8076309226932669, + "grad_norm": 0.6185886641090261, + "learning_rate": 4.699828706883286e-06, + "loss": 0.8233, + "step": 16193 + }, + { + "epoch": 0.8076807980049875, + "grad_norm": 0.5482158658071157, + "learning_rate": 4.697471937665632e-06, + "loss": 0.8305, + "step": 16194 + }, + { + "epoch": 0.8077306733167082, + "grad_norm": 0.7519012878896734, + "learning_rate": 4.695115698233368e-06, + "loss": 0.8116, + "step": 16195 + }, + { + "epoch": 0.807780548628429, + "grad_norm": 0.47658232107027404, + "learning_rate": 4.692759988647974e-06, + "loss": 0.8416, + "step": 16196 + }, + { + "epoch": 0.8078304239401496, + "grad_norm": 0.5273096619842265, + "learning_rate": 4.690404808970919e-06, + "loss": 0.8388, + "step": 16197 + }, + { + "epoch": 0.8078802992518703, + "grad_norm": 0.6150374133728154, + "learning_rate": 4.688050159263663e-06, + "loss": 0.8586, + "step": 16198 + }, + { + "epoch": 0.8079301745635911, + "grad_norm": 1.0119293722110647, + "learning_rate": 4.685696039587639e-06, + "loss": 0.8157, + "step": 16199 + }, + { + "epoch": 0.8079800498753117, + "grad_norm": 0.5424626128230224, + "learning_rate": 4.683342450004294e-06, + "loss": 0.8849, + "step": 16200 + }, + { + "epoch": 0.8080299251870324, + "grad_norm": 0.5787517942446414, + "learning_rate": 4.680989390575033e-06, + "loss": 0.8365, + "step": 16201 + }, + { + "epoch": 0.8080798004987532, + "grad_norm": 0.5322598020878975, + "learning_rate": 4.678636861361261e-06, + "loss": 0.8568, + "step": 16202 + }, + { + "epoch": 0.8081296758104738, + "grad_norm": 0.5616249320110264, + "learning_rate": 4.6762848624243575e-06, + "loss": 0.8502, + "step": 16203 + }, + { + "epoch": 0.8081795511221945, + "grad_norm": 0.8530234911258081, + "learning_rate": 4.673933393825714e-06, + "loss": 0.8044, + "step": 16204 + }, + { + "epoch": 0.8082294264339153, + "grad_norm": 0.5804014563796428, + "learning_rate": 4.671582455626681e-06, + "loss": 0.89, + "step": 16205 + }, + { + "epoch": 0.8082793017456359, + "grad_norm": 0.5402012992119332, + "learning_rate": 4.669232047888605e-06, + "loss": 0.8418, + "step": 16206 + }, + { + "epoch": 0.8083291770573566, + "grad_norm": 0.5641755036288576, + "learning_rate": 4.666882170672812e-06, + "loss": 0.8778, + "step": 16207 + }, + { + "epoch": 0.8083790523690773, + "grad_norm": 0.638583081374625, + "learning_rate": 4.664532824040638e-06, + "loss": 0.8458, + "step": 16208 + }, + { + "epoch": 0.808428927680798, + "grad_norm": 0.6620524263100953, + "learning_rate": 4.6621840080533805e-06, + "loss": 0.8739, + "step": 16209 + }, + { + "epoch": 0.8084788029925187, + "grad_norm": 0.5325355809469615, + "learning_rate": 4.659835722772327e-06, + "loss": 0.822, + "step": 16210 + }, + { + "epoch": 0.8085286783042394, + "grad_norm": 0.6000159893391213, + "learning_rate": 4.65748796825875e-06, + "loss": 0.8829, + "step": 16211 + }, + { + "epoch": 0.8085785536159601, + "grad_norm": 0.5528360406475827, + "learning_rate": 4.655140744573927e-06, + "loss": 0.8028, + "step": 16212 + }, + { + "epoch": 0.8086284289276808, + "grad_norm": 0.7642027332700281, + "learning_rate": 4.652794051779102e-06, + "loss": 0.8191, + "step": 16213 + }, + { + "epoch": 0.8086783042394015, + "grad_norm": 0.645875545925154, + "learning_rate": 4.650447889935508e-06, + "loss": 0.8671, + "step": 16214 + }, + { + "epoch": 0.8087281795511222, + "grad_norm": 0.5111337937355159, + "learning_rate": 4.648102259104364e-06, + "loss": 0.8556, + "step": 16215 + }, + { + "epoch": 0.8087780548628429, + "grad_norm": 0.5635091298676687, + "learning_rate": 4.645757159346892e-06, + "loss": 0.8309, + "step": 16216 + }, + { + "epoch": 0.8088279301745636, + "grad_norm": 0.5015253375890102, + "learning_rate": 4.643412590724275e-06, + "loss": 0.8566, + "step": 16217 + }, + { + "epoch": 0.8088778054862843, + "grad_norm": 0.5860457957302032, + "learning_rate": 4.6410685532976974e-06, + "loss": 0.8328, + "step": 16218 + }, + { + "epoch": 0.808927680798005, + "grad_norm": 0.5010348326145797, + "learning_rate": 4.638725047128317e-06, + "loss": 0.8072, + "step": 16219 + }, + { + "epoch": 0.8089775561097257, + "grad_norm": 0.4842182651433852, + "learning_rate": 4.636382072277301e-06, + "loss": 0.8047, + "step": 16220 + }, + { + "epoch": 0.8090274314214464, + "grad_norm": 0.5095766240254441, + "learning_rate": 4.634039628805781e-06, + "loss": 0.8538, + "step": 16221 + }, + { + "epoch": 0.8090773067331671, + "grad_norm": 0.4849843646001887, + "learning_rate": 4.63169771677488e-06, + "loss": 0.8251, + "step": 16222 + }, + { + "epoch": 0.8091271820448878, + "grad_norm": 0.8168923594494925, + "learning_rate": 4.629356336245707e-06, + "loss": 0.8473, + "step": 16223 + }, + { + "epoch": 0.8091770573566085, + "grad_norm": 0.547423336390002, + "learning_rate": 4.627015487279377e-06, + "loss": 0.827, + "step": 16224 + }, + { + "epoch": 0.8092269326683291, + "grad_norm": 0.5733687830658704, + "learning_rate": 4.624675169936948e-06, + "loss": 0.822, + "step": 16225 + }, + { + "epoch": 0.8092768079800499, + "grad_norm": 0.7676334031210258, + "learning_rate": 4.622335384279505e-06, + "loss": 0.8716, + "step": 16226 + }, + { + "epoch": 0.8093266832917706, + "grad_norm": 0.515409200187446, + "learning_rate": 4.6199961303680985e-06, + "loss": 0.7939, + "step": 16227 + }, + { + "epoch": 0.8093765586034912, + "grad_norm": 1.1494298251172912, + "learning_rate": 4.617657408263782e-06, + "loss": 0.8506, + "step": 16228 + }, + { + "epoch": 0.809426433915212, + "grad_norm": 0.5002014207594039, + "learning_rate": 4.615319218027561e-06, + "loss": 0.8247, + "step": 16229 + }, + { + "epoch": 0.8094763092269327, + "grad_norm": 0.6590048156407416, + "learning_rate": 4.612981559720467e-06, + "loss": 0.8047, + "step": 16230 + }, + { + "epoch": 0.8095261845386533, + "grad_norm": 0.6630750868617957, + "learning_rate": 4.610644433403491e-06, + "loss": 0.9349, + "step": 16231 + }, + { + "epoch": 0.8095760598503741, + "grad_norm": 0.6281234114698637, + "learning_rate": 4.608307839137635e-06, + "loss": 0.8138, + "step": 16232 + }, + { + "epoch": 0.8096259351620948, + "grad_norm": 0.49495318800974086, + "learning_rate": 4.605971776983847e-06, + "loss": 0.8147, + "step": 16233 + }, + { + "epoch": 0.8096758104738154, + "grad_norm": 0.5504660377780721, + "learning_rate": 4.6036362470031045e-06, + "loss": 0.8033, + "step": 16234 + }, + { + "epoch": 0.8097256857855362, + "grad_norm": 0.5462998869399487, + "learning_rate": 4.601301249256349e-06, + "loss": 0.8721, + "step": 16235 + }, + { + "epoch": 0.8097755610972569, + "grad_norm": 0.5833172610418048, + "learning_rate": 4.598966783804498e-06, + "loss": 0.8522, + "step": 16236 + }, + { + "epoch": 0.8098254364089775, + "grad_norm": 0.6533838171679988, + "learning_rate": 4.596632850708485e-06, + "loss": 0.8624, + "step": 16237 + }, + { + "epoch": 0.8098753117206983, + "grad_norm": 0.4892972568923036, + "learning_rate": 4.594299450029207e-06, + "loss": 0.8429, + "step": 16238 + }, + { + "epoch": 0.809925187032419, + "grad_norm": 0.49971008189974736, + "learning_rate": 4.591966581827553e-06, + "loss": 0.8573, + "step": 16239 + }, + { + "epoch": 0.8099750623441396, + "grad_norm": 0.5211834976475878, + "learning_rate": 4.589634246164393e-06, + "loss": 0.8341, + "step": 16240 + }, + { + "epoch": 0.8100249376558604, + "grad_norm": 0.4962561652331585, + "learning_rate": 4.587302443100597e-06, + "loss": 0.8289, + "step": 16241 + }, + { + "epoch": 0.810074812967581, + "grad_norm": 0.7892597290184657, + "learning_rate": 4.584971172697008e-06, + "loss": 0.899, + "step": 16242 + }, + { + "epoch": 0.8101246882793017, + "grad_norm": 0.6996198043485524, + "learning_rate": 4.58264043501446e-06, + "loss": 0.8191, + "step": 16243 + }, + { + "epoch": 0.8101745635910225, + "grad_norm": 0.6080439250651479, + "learning_rate": 4.580310230113766e-06, + "loss": 0.8202, + "step": 16244 + }, + { + "epoch": 0.8102244389027431, + "grad_norm": 0.6906955787742513, + "learning_rate": 4.577980558055744e-06, + "loss": 0.8025, + "step": 16245 + }, + { + "epoch": 0.8102743142144638, + "grad_norm": 0.45401560442494043, + "learning_rate": 4.575651418901181e-06, + "loss": 0.826, + "step": 16246 + }, + { + "epoch": 0.8103241895261846, + "grad_norm": 0.538760948487481, + "learning_rate": 4.573322812710854e-06, + "loss": 0.8058, + "step": 16247 + }, + { + "epoch": 0.8103740648379052, + "grad_norm": 0.5789803929652411, + "learning_rate": 4.570994739545517e-06, + "loss": 0.8442, + "step": 16248 + }, + { + "epoch": 0.8104239401496259, + "grad_norm": 0.541981974466971, + "learning_rate": 4.568667199465937e-06, + "loss": 0.8246, + "step": 16249 + }, + { + "epoch": 0.8104738154613467, + "grad_norm": 0.5660346123075738, + "learning_rate": 4.566340192532842e-06, + "loss": 0.8282, + "step": 16250 + }, + { + "epoch": 0.8105236907730673, + "grad_norm": 0.5304703199087588, + "learning_rate": 4.564013718806956e-06, + "loss": 0.8501, + "step": 16251 + }, + { + "epoch": 0.810573566084788, + "grad_norm": 0.8344244349656195, + "learning_rate": 4.56168777834898e-06, + "loss": 0.8416, + "step": 16252 + }, + { + "epoch": 0.8106234413965088, + "grad_norm": 0.613503198049823, + "learning_rate": 4.5593623712196194e-06, + "loss": 0.8994, + "step": 16253 + }, + { + "epoch": 0.8106733167082294, + "grad_norm": 0.961314733038366, + "learning_rate": 4.5570374974795525e-06, + "loss": 0.8489, + "step": 16254 + }, + { + "epoch": 0.8107231920199501, + "grad_norm": 0.5437837710425394, + "learning_rate": 4.554713157189439e-06, + "loss": 0.8064, + "step": 16255 + }, + { + "epoch": 0.8107730673316709, + "grad_norm": 0.5668168603320328, + "learning_rate": 4.552389350409933e-06, + "loss": 0.8234, + "step": 16256 + }, + { + "epoch": 0.8108229426433915, + "grad_norm": 0.5064292199704052, + "learning_rate": 4.5500660772016865e-06, + "loss": 0.8544, + "step": 16257 + }, + { + "epoch": 0.8108728179551122, + "grad_norm": 1.7409742757366882, + "learning_rate": 4.547743337625301e-06, + "loss": 0.8696, + "step": 16258 + }, + { + "epoch": 0.810922693266833, + "grad_norm": 0.4795522134734952, + "learning_rate": 4.5454211317414065e-06, + "loss": 0.8656, + "step": 16259 + }, + { + "epoch": 0.8109725685785536, + "grad_norm": 0.50903243399594, + "learning_rate": 4.543099459610587e-06, + "loss": 0.8204, + "step": 16260 + }, + { + "epoch": 0.8110224438902743, + "grad_norm": 0.5696425689298532, + "learning_rate": 4.540778321293443e-06, + "loss": 0.8295, + "step": 16261 + }, + { + "epoch": 0.811072319201995, + "grad_norm": 0.4789670504565071, + "learning_rate": 4.5384577168505195e-06, + "loss": 0.8073, + "step": 16262 + }, + { + "epoch": 0.8111221945137157, + "grad_norm": 0.6858794580850646, + "learning_rate": 4.536137646342392e-06, + "loss": 0.8426, + "step": 16263 + }, + { + "epoch": 0.8111720698254364, + "grad_norm": 0.5434795107838399, + "learning_rate": 4.5338181098295875e-06, + "loss": 0.8235, + "step": 16264 + }, + { + "epoch": 0.8112219451371571, + "grad_norm": 0.5276631612291289, + "learning_rate": 4.5314991073726555e-06, + "loss": 0.8491, + "step": 16265 + }, + { + "epoch": 0.8112718204488778, + "grad_norm": 0.5447026119118515, + "learning_rate": 4.529180639032077e-06, + "loss": 0.8485, + "step": 16266 + }, + { + "epoch": 0.8113216957605985, + "grad_norm": 0.584392324906145, + "learning_rate": 4.526862704868376e-06, + "loss": 0.891, + "step": 16267 + }, + { + "epoch": 0.8113715710723192, + "grad_norm": 0.5564020226220808, + "learning_rate": 4.524545304942024e-06, + "loss": 0.8045, + "step": 16268 + }, + { + "epoch": 0.8114214463840399, + "grad_norm": 0.5163823946416574, + "learning_rate": 4.5222284393135115e-06, + "loss": 0.8168, + "step": 16269 + }, + { + "epoch": 0.8114713216957606, + "grad_norm": 0.5523304144518381, + "learning_rate": 4.51991210804327e-06, + "loss": 0.837, + "step": 16270 + }, + { + "epoch": 0.8115211970074813, + "grad_norm": 0.9439515826983308, + "learning_rate": 4.517596311191766e-06, + "loss": 0.7813, + "step": 16271 + }, + { + "epoch": 0.811571072319202, + "grad_norm": 0.7059935774625393, + "learning_rate": 4.515281048819414e-06, + "loss": 0.8436, + "step": 16272 + }, + { + "epoch": 0.8116209476309227, + "grad_norm": 0.8852226786215268, + "learning_rate": 4.512966320986634e-06, + "loss": 0.8144, + "step": 16273 + }, + { + "epoch": 0.8116708229426434, + "grad_norm": 0.5725144589178661, + "learning_rate": 4.510652127753831e-06, + "loss": 0.8417, + "step": 16274 + }, + { + "epoch": 0.8117206982543641, + "grad_norm": 0.5755292617531644, + "learning_rate": 4.508338469181395e-06, + "loss": 0.8601, + "step": 16275 + }, + { + "epoch": 0.8117705735660848, + "grad_norm": 0.5594215989147482, + "learning_rate": 4.506025345329693e-06, + "loss": 0.8681, + "step": 16276 + }, + { + "epoch": 0.8118204488778055, + "grad_norm": 0.4818489472370804, + "learning_rate": 4.503712756259082e-06, + "loss": 0.854, + "step": 16277 + }, + { + "epoch": 0.8118703241895262, + "grad_norm": 0.6136081473759991, + "learning_rate": 4.501400702029921e-06, + "loss": 0.8666, + "step": 16278 + }, + { + "epoch": 0.8119201995012468, + "grad_norm": 0.5337934172507364, + "learning_rate": 4.4990891827025314e-06, + "loss": 0.8381, + "step": 16279 + }, + { + "epoch": 0.8119700748129676, + "grad_norm": 0.6012391748578093, + "learning_rate": 4.496778198337237e-06, + "loss": 0.8641, + "step": 16280 + }, + { + "epoch": 0.8120199501246883, + "grad_norm": 0.4963454554985542, + "learning_rate": 4.494467748994333e-06, + "loss": 0.875, + "step": 16281 + }, + { + "epoch": 0.8120698254364089, + "grad_norm": 0.6555445318782993, + "learning_rate": 4.492157834734123e-06, + "loss": 0.8289, + "step": 16282 + }, + { + "epoch": 0.8121197007481297, + "grad_norm": 0.7990258467767857, + "learning_rate": 4.489848455616874e-06, + "loss": 0.843, + "step": 16283 + }, + { + "epoch": 0.8121695760598504, + "grad_norm": 0.9809373658132237, + "learning_rate": 4.487539611702854e-06, + "loss": 0.8477, + "step": 16284 + }, + { + "epoch": 0.812219451371571, + "grad_norm": 0.6245225536864243, + "learning_rate": 4.485231303052298e-06, + "loss": 0.8751, + "step": 16285 + }, + { + "epoch": 0.8122693266832918, + "grad_norm": 0.4758749667087552, + "learning_rate": 4.482923529725458e-06, + "loss": 0.8472, + "step": 16286 + }, + { + "epoch": 0.8123192019950125, + "grad_norm": 0.6167242929450925, + "learning_rate": 4.4806162917825465e-06, + "loss": 0.8187, + "step": 16287 + }, + { + "epoch": 0.8123690773067331, + "grad_norm": 0.6765626891624238, + "learning_rate": 4.478309589283769e-06, + "loss": 0.8345, + "step": 16288 + }, + { + "epoch": 0.8124189526184539, + "grad_norm": 0.6296446573912021, + "learning_rate": 4.47600342228931e-06, + "loss": 0.8662, + "step": 16289 + }, + { + "epoch": 0.8124688279301746, + "grad_norm": 0.5464893870375982, + "learning_rate": 4.473697790859366e-06, + "loss": 0.8599, + "step": 16290 + }, + { + "epoch": 0.8125187032418952, + "grad_norm": 0.5278882743385779, + "learning_rate": 4.471392695054091e-06, + "loss": 0.8114, + "step": 16291 + }, + { + "epoch": 0.812568578553616, + "grad_norm": 0.8789157934023764, + "learning_rate": 4.469088134933636e-06, + "loss": 0.7983, + "step": 16292 + }, + { + "epoch": 0.8126184538653367, + "grad_norm": 0.639651199522453, + "learning_rate": 4.466784110558131e-06, + "loss": 0.9012, + "step": 16293 + }, + { + "epoch": 0.8126683291770573, + "grad_norm": 0.5154858214420363, + "learning_rate": 4.4644806219877184e-06, + "loss": 0.762, + "step": 16294 + }, + { + "epoch": 0.8127182044887781, + "grad_norm": 0.5339068608702591, + "learning_rate": 4.46217766928248e-06, + "loss": 0.806, + "step": 16295 + }, + { + "epoch": 0.8127680798004987, + "grad_norm": 0.8348352259227645, + "learning_rate": 4.459875252502529e-06, + "loss": 0.8764, + "step": 16296 + }, + { + "epoch": 0.8128179551122194, + "grad_norm": 0.45542534473938706, + "learning_rate": 4.457573371707938e-06, + "loss": 0.8491, + "step": 16297 + }, + { + "epoch": 0.8128678304239402, + "grad_norm": 0.6450949996636458, + "learning_rate": 4.455272026958784e-06, + "loss": 0.8423, + "step": 16298 + }, + { + "epoch": 0.8129177057356608, + "grad_norm": 0.5310676453411817, + "learning_rate": 4.452971218315102e-06, + "loss": 0.8585, + "step": 16299 + }, + { + "epoch": 0.8129675810473815, + "grad_norm": 0.5603828872537583, + "learning_rate": 4.4506709458369486e-06, + "loss": 0.8629, + "step": 16300 + }, + { + "epoch": 0.8130174563591023, + "grad_norm": 0.5070257965640036, + "learning_rate": 4.448371209584331e-06, + "loss": 0.7958, + "step": 16301 + }, + { + "epoch": 0.8130673316708229, + "grad_norm": 0.43742413849089273, + "learning_rate": 4.446072009617283e-06, + "loss": 0.8091, + "step": 16302 + }, + { + "epoch": 0.8131172069825436, + "grad_norm": 0.5075422630678835, + "learning_rate": 4.443773345995774e-06, + "loss": 0.8522, + "step": 16303 + }, + { + "epoch": 0.8131670822942644, + "grad_norm": 0.49214402283404224, + "learning_rate": 4.441475218779806e-06, + "loss": 0.8555, + "step": 16304 + }, + { + "epoch": 0.813216957605985, + "grad_norm": 0.615186776877097, + "learning_rate": 4.4391776280293424e-06, + "loss": 0.8681, + "step": 16305 + }, + { + "epoch": 0.8132668329177057, + "grad_norm": 0.5208125957144802, + "learning_rate": 4.436880573804336e-06, + "loss": 0.8501, + "step": 16306 + }, + { + "epoch": 0.8133167082294265, + "grad_norm": 0.6867227784386534, + "learning_rate": 4.4345840561647204e-06, + "loss": 0.842, + "step": 16307 + }, + { + "epoch": 0.8133665835411471, + "grad_norm": 0.6754680957801914, + "learning_rate": 4.432288075170437e-06, + "loss": 0.8783, + "step": 16308 + }, + { + "epoch": 0.8134164588528678, + "grad_norm": 0.6413930617620178, + "learning_rate": 4.429992630881394e-06, + "loss": 0.875, + "step": 16309 + }, + { + "epoch": 0.8134663341645886, + "grad_norm": 0.6348477404834022, + "learning_rate": 4.4276977233574795e-06, + "loss": 0.8553, + "step": 16310 + }, + { + "epoch": 0.8135162094763092, + "grad_norm": 0.44283486462587507, + "learning_rate": 4.425403352658591e-06, + "loss": 0.8144, + "step": 16311 + }, + { + "epoch": 0.8135660847880299, + "grad_norm": 0.7681491714680823, + "learning_rate": 4.423109518844595e-06, + "loss": 0.8146, + "step": 16312 + }, + { + "epoch": 0.8136159600997506, + "grad_norm": 0.550452745678343, + "learning_rate": 4.420816221975346e-06, + "loss": 0.8054, + "step": 16313 + }, + { + "epoch": 0.8136658354114713, + "grad_norm": 0.7003528411699859, + "learning_rate": 4.4185234621106825e-06, + "loss": 0.9237, + "step": 16314 + }, + { + "epoch": 0.813715710723192, + "grad_norm": 0.6658494209940322, + "learning_rate": 4.416231239310445e-06, + "loss": 0.8467, + "step": 16315 + }, + { + "epoch": 0.8137655860349127, + "grad_norm": 0.7315862333025762, + "learning_rate": 4.413939553634441e-06, + "loss": 0.8134, + "step": 16316 + }, + { + "epoch": 0.8138154613466334, + "grad_norm": 0.6232520030517591, + "learning_rate": 4.411648405142471e-06, + "loss": 0.8283, + "step": 16317 + }, + { + "epoch": 0.8138653366583541, + "grad_norm": 0.5616323196682995, + "learning_rate": 4.4093577938943145e-06, + "loss": 0.8529, + "step": 16318 + }, + { + "epoch": 0.8139152119700748, + "grad_norm": 0.5335929627906445, + "learning_rate": 4.407067719949759e-06, + "loss": 0.7845, + "step": 16319 + }, + { + "epoch": 0.8139650872817955, + "grad_norm": 0.5569764200883901, + "learning_rate": 4.404778183368555e-06, + "loss": 0.8172, + "step": 16320 + }, + { + "epoch": 0.8140149625935162, + "grad_norm": 1.443208179104963, + "learning_rate": 4.402489184210446e-06, + "loss": 0.835, + "step": 16321 + }, + { + "epoch": 0.8140648379052369, + "grad_norm": 0.5842886779736323, + "learning_rate": 4.400200722535155e-06, + "loss": 0.8716, + "step": 16322 + }, + { + "epoch": 0.8141147132169576, + "grad_norm": 0.566015426626274, + "learning_rate": 4.397912798402418e-06, + "loss": 0.8802, + "step": 16323 + }, + { + "epoch": 0.8141645885286783, + "grad_norm": 0.5486479777082756, + "learning_rate": 4.395625411871923e-06, + "loss": 0.8728, + "step": 16324 + }, + { + "epoch": 0.814214463840399, + "grad_norm": 0.504940265644396, + "learning_rate": 4.393338563003363e-06, + "loss": 0.8462, + "step": 16325 + }, + { + "epoch": 0.8142643391521197, + "grad_norm": 0.49427185848054056, + "learning_rate": 4.3910522518564036e-06, + "loss": 0.8006, + "step": 16326 + }, + { + "epoch": 0.8143142144638404, + "grad_norm": 0.650314321203778, + "learning_rate": 4.388766478490719e-06, + "loss": 0.8478, + "step": 16327 + }, + { + "epoch": 0.8143640897755611, + "grad_norm": 0.5420802541474133, + "learning_rate": 4.386481242965951e-06, + "loss": 0.8422, + "step": 16328 + }, + { + "epoch": 0.8144139650872818, + "grad_norm": 0.532685172966557, + "learning_rate": 4.384196545341726e-06, + "loss": 0.8589, + "step": 16329 + }, + { + "epoch": 0.8144638403990025, + "grad_norm": 0.4813474249901086, + "learning_rate": 4.381912385677661e-06, + "loss": 0.8191, + "step": 16330 + }, + { + "epoch": 0.8145137157107232, + "grad_norm": 0.6613562929512148, + "learning_rate": 4.3796287640333775e-06, + "loss": 0.8109, + "step": 16331 + }, + { + "epoch": 0.8145635910224439, + "grad_norm": 0.6048725142415182, + "learning_rate": 4.377345680468442e-06, + "loss": 0.8518, + "step": 16332 + }, + { + "epoch": 0.8146134663341645, + "grad_norm": 0.5446156141960119, + "learning_rate": 4.375063135042445e-06, + "loss": 0.8288, + "step": 16333 + }, + { + "epoch": 0.8146633416458853, + "grad_norm": 0.5879906674196927, + "learning_rate": 4.37278112781494e-06, + "loss": 0.8346, + "step": 16334 + }, + { + "epoch": 0.814713216957606, + "grad_norm": 0.523746625617005, + "learning_rate": 4.370499658845492e-06, + "loss": 0.8401, + "step": 16335 + }, + { + "epoch": 0.8147630922693266, + "grad_norm": 0.5668045593219616, + "learning_rate": 4.36821872819361e-06, + "loss": 0.8903, + "step": 16336 + }, + { + "epoch": 0.8148129675810474, + "grad_norm": 0.5381805350440858, + "learning_rate": 4.365938335918834e-06, + "loss": 0.8149, + "step": 16337 + }, + { + "epoch": 0.8148628428927681, + "grad_norm": 0.5905295385619862, + "learning_rate": 4.363658482080654e-06, + "loss": 0.8361, + "step": 16338 + }, + { + "epoch": 0.8149127182044887, + "grad_norm": 0.5438661729553901, + "learning_rate": 4.361379166738586e-06, + "loss": 0.8381, + "step": 16339 + }, + { + "epoch": 0.8149625935162095, + "grad_norm": 0.7719285828620743, + "learning_rate": 4.3591003899520785e-06, + "loss": 0.8493, + "step": 16340 + }, + { + "epoch": 0.8150124688279302, + "grad_norm": 2.1402554937851197, + "learning_rate": 4.356822151780615e-06, + "loss": 0.8189, + "step": 16341 + }, + { + "epoch": 0.8150623441396508, + "grad_norm": 0.6904251064459997, + "learning_rate": 4.354544452283638e-06, + "loss": 0.8858, + "step": 16342 + }, + { + "epoch": 0.8151122194513716, + "grad_norm": 0.6239871765997467, + "learning_rate": 4.352267291520585e-06, + "loss": 0.8367, + "step": 16343 + }, + { + "epoch": 0.8151620947630923, + "grad_norm": 0.548100593447209, + "learning_rate": 4.3499906695508685e-06, + "loss": 0.8324, + "step": 16344 + }, + { + "epoch": 0.8152119700748129, + "grad_norm": 0.5022194361809134, + "learning_rate": 4.34771458643391e-06, + "loss": 0.8251, + "step": 16345 + }, + { + "epoch": 0.8152618453865337, + "grad_norm": 0.5433483750272509, + "learning_rate": 4.345439042229096e-06, + "loss": 0.8464, + "step": 16346 + }, + { + "epoch": 0.8153117206982544, + "grad_norm": 0.5920786159937297, + "learning_rate": 4.343164036995809e-06, + "loss": 0.8815, + "step": 16347 + }, + { + "epoch": 0.815361596009975, + "grad_norm": 1.1638286156573499, + "learning_rate": 4.3408895707934025e-06, + "loss": 0.8301, + "step": 16348 + }, + { + "epoch": 0.8154114713216958, + "grad_norm": 0.6010143601897584, + "learning_rate": 4.338615643681246e-06, + "loss": 0.8499, + "step": 16349 + }, + { + "epoch": 0.8154613466334164, + "grad_norm": 0.563703503158448, + "learning_rate": 4.336342255718664e-06, + "loss": 0.8465, + "step": 16350 + }, + { + "epoch": 0.8155112219451371, + "grad_norm": 0.5168289232381285, + "learning_rate": 4.3340694069649805e-06, + "loss": 0.8451, + "step": 16351 + }, + { + "epoch": 0.8155610972568579, + "grad_norm": 0.5970398550645193, + "learning_rate": 4.331797097479512e-06, + "loss": 0.8625, + "step": 16352 + }, + { + "epoch": 0.8156109725685785, + "grad_norm": 0.6704226873357706, + "learning_rate": 4.329525327321549e-06, + "loss": 0.8586, + "step": 16353 + }, + { + "epoch": 0.8156608478802992, + "grad_norm": 0.6609482613973673, + "learning_rate": 4.327254096550368e-06, + "loss": 0.864, + "step": 16354 + }, + { + "epoch": 0.81571072319202, + "grad_norm": 0.6665754394588467, + "learning_rate": 4.324983405225236e-06, + "loss": 0.8414, + "step": 16355 + }, + { + "epoch": 0.8157605985037406, + "grad_norm": 0.6188674043779505, + "learning_rate": 4.322713253405417e-06, + "loss": 0.8141, + "step": 16356 + }, + { + "epoch": 0.8158104738154613, + "grad_norm": 0.7349388191852299, + "learning_rate": 4.320443641150138e-06, + "loss": 0.8376, + "step": 16357 + }, + { + "epoch": 0.8158603491271821, + "grad_norm": 0.5863937577142511, + "learning_rate": 4.3181745685186295e-06, + "loss": 0.8369, + "step": 16358 + }, + { + "epoch": 0.8159102244389027, + "grad_norm": 0.8824901982156171, + "learning_rate": 4.315906035570094e-06, + "loss": 0.8353, + "step": 16359 + }, + { + "epoch": 0.8159600997506234, + "grad_norm": 0.5936886518331649, + "learning_rate": 4.313638042363738e-06, + "loss": 0.831, + "step": 16360 + }, + { + "epoch": 0.8160099750623442, + "grad_norm": 0.5352637854701177, + "learning_rate": 4.311370588958741e-06, + "loss": 0.8692, + "step": 16361 + }, + { + "epoch": 0.8160598503740648, + "grad_norm": 0.519667868221297, + "learning_rate": 4.309103675414267e-06, + "loss": 0.8584, + "step": 16362 + }, + { + "epoch": 0.8161097256857855, + "grad_norm": 0.6446146914384137, + "learning_rate": 4.306837301789468e-06, + "loss": 0.858, + "step": 16363 + }, + { + "epoch": 0.8161596009975063, + "grad_norm": 0.6257592216891465, + "learning_rate": 4.304571468143501e-06, + "loss": 0.8618, + "step": 16364 + }, + { + "epoch": 0.8162094763092269, + "grad_norm": 0.6035922462095853, + "learning_rate": 4.302306174535467e-06, + "loss": 0.8438, + "step": 16365 + }, + { + "epoch": 0.8162593516209476, + "grad_norm": 0.6011008328187529, + "learning_rate": 4.300041421024498e-06, + "loss": 0.8451, + "step": 16366 + }, + { + "epoch": 0.8163092269326683, + "grad_norm": 0.9225903632436017, + "learning_rate": 4.297777207669676e-06, + "loss": 0.8725, + "step": 16367 + }, + { + "epoch": 0.816359102244389, + "grad_norm": 0.6464304666831471, + "learning_rate": 4.2955135345301045e-06, + "loss": 0.8811, + "step": 16368 + }, + { + "epoch": 0.8164089775561097, + "grad_norm": 0.5726081160121581, + "learning_rate": 4.293250401664828e-06, + "loss": 0.846, + "step": 16369 + }, + { + "epoch": 0.8164588528678304, + "grad_norm": 0.5303377156847356, + "learning_rate": 4.290987809132923e-06, + "loss": 0.8457, + "step": 16370 + }, + { + "epoch": 0.8165087281795511, + "grad_norm": 0.4998224154354667, + "learning_rate": 4.288725756993417e-06, + "loss": 0.868, + "step": 16371 + }, + { + "epoch": 0.8165586034912719, + "grad_norm": 0.6508083373642966, + "learning_rate": 4.286464245305355e-06, + "loss": 0.8151, + "step": 16372 + }, + { + "epoch": 0.8166084788029925, + "grad_norm": 0.5327859991535864, + "learning_rate": 4.284203274127727e-06, + "loss": 0.8403, + "step": 16373 + }, + { + "epoch": 0.8166583541147132, + "grad_norm": 0.5323530838585215, + "learning_rate": 4.281942843519549e-06, + "loss": 0.8189, + "step": 16374 + }, + { + "epoch": 0.816708229426434, + "grad_norm": 0.5387609506888829, + "learning_rate": 4.2796829535398016e-06, + "loss": 0.8524, + "step": 16375 + }, + { + "epoch": 0.8167581047381546, + "grad_norm": 0.5403151707665895, + "learning_rate": 4.277423604247452e-06, + "loss": 0.8139, + "step": 16376 + }, + { + "epoch": 0.8168079800498753, + "grad_norm": 0.5463830654715718, + "learning_rate": 4.275164795701453e-06, + "loss": 0.8545, + "step": 16377 + }, + { + "epoch": 0.816857855361596, + "grad_norm": 0.6951564593259733, + "learning_rate": 4.27290652796076e-06, + "loss": 0.8791, + "step": 16378 + }, + { + "epoch": 0.8169077306733167, + "grad_norm": 0.6306522909909384, + "learning_rate": 4.270648801084296e-06, + "loss": 0.8638, + "step": 16379 + }, + { + "epoch": 0.8169576059850374, + "grad_norm": 0.6893057678802826, + "learning_rate": 4.268391615130974e-06, + "loss": 0.8637, + "step": 16380 + }, + { + "epoch": 0.8170074812967582, + "grad_norm": 0.5935982621545531, + "learning_rate": 4.266134970159688e-06, + "loss": 0.8514, + "step": 16381 + }, + { + "epoch": 0.8170573566084788, + "grad_norm": 0.5114529256385459, + "learning_rate": 4.263878866229337e-06, + "loss": 0.8379, + "step": 16382 + }, + { + "epoch": 0.8171072319201995, + "grad_norm": 0.5842818406488316, + "learning_rate": 4.261623303398788e-06, + "loss": 0.7907, + "step": 16383 + }, + { + "epoch": 0.8171571072319203, + "grad_norm": 0.5639060911158291, + "learning_rate": 4.259368281726894e-06, + "loss": 0.8361, + "step": 16384 + }, + { + "epoch": 0.8172069825436409, + "grad_norm": 0.5595241668826646, + "learning_rate": 4.2571138012725e-06, + "loss": 0.8428, + "step": 16385 + }, + { + "epoch": 0.8172568578553616, + "grad_norm": 0.43536662080911454, + "learning_rate": 4.254859862094443e-06, + "loss": 0.8135, + "step": 16386 + }, + { + "epoch": 0.8173067331670822, + "grad_norm": 0.700044200965344, + "learning_rate": 4.252606464251533e-06, + "loss": 0.8448, + "step": 16387 + }, + { + "epoch": 0.817356608478803, + "grad_norm": 0.6192002279417405, + "learning_rate": 4.250353607802565e-06, + "loss": 0.8378, + "step": 16388 + }, + { + "epoch": 0.8174064837905237, + "grad_norm": 0.5996315768272945, + "learning_rate": 4.2481012928063415e-06, + "loss": 0.826, + "step": 16389 + }, + { + "epoch": 0.8174563591022443, + "grad_norm": 0.5363834916657734, + "learning_rate": 4.245849519321627e-06, + "loss": 0.8484, + "step": 16390 + }, + { + "epoch": 0.8175062344139651, + "grad_norm": 0.5683852563594601, + "learning_rate": 4.24359828740718e-06, + "loss": 0.8484, + "step": 16391 + }, + { + "epoch": 0.8175561097256858, + "grad_norm": 0.7032040434257163, + "learning_rate": 4.241347597121742e-06, + "loss": 0.856, + "step": 16392 + }, + { + "epoch": 0.8176059850374064, + "grad_norm": 0.5144045591461599, + "learning_rate": 4.2390974485240515e-06, + "loss": 0.9, + "step": 16393 + }, + { + "epoch": 0.8176558603491272, + "grad_norm": 0.5075148659714848, + "learning_rate": 4.236847841672822e-06, + "loss": 0.8323, + "step": 16394 + }, + { + "epoch": 0.8177057356608479, + "grad_norm": 0.5877999287221575, + "learning_rate": 4.234598776626758e-06, + "loss": 0.8609, + "step": 16395 + }, + { + "epoch": 0.8177556109725685, + "grad_norm": 0.48195030186031845, + "learning_rate": 4.232350253444539e-06, + "loss": 0.807, + "step": 16396 + }, + { + "epoch": 0.8178054862842893, + "grad_norm": 0.49399289676979496, + "learning_rate": 4.230102272184849e-06, + "loss": 0.8235, + "step": 16397 + }, + { + "epoch": 0.81785536159601, + "grad_norm": 0.6190325083832573, + "learning_rate": 4.2278548329063475e-06, + "loss": 0.9175, + "step": 16398 + }, + { + "epoch": 0.8179052369077306, + "grad_norm": 0.5407849761709558, + "learning_rate": 4.2256079356676775e-06, + "loss": 0.8233, + "step": 16399 + }, + { + "epoch": 0.8179551122194514, + "grad_norm": 0.5655652022014417, + "learning_rate": 4.223361580527465e-06, + "loss": 0.8052, + "step": 16400 + }, + { + "epoch": 0.8180049875311721, + "grad_norm": 0.539577425292608, + "learning_rate": 4.221115767544345e-06, + "loss": 0.8359, + "step": 16401 + }, + { + "epoch": 0.8180548628428927, + "grad_norm": 0.5259994847984755, + "learning_rate": 4.218870496776897e-06, + "loss": 0.8645, + "step": 16402 + }, + { + "epoch": 0.8181047381546135, + "grad_norm": 0.5465539524629162, + "learning_rate": 4.2166257682837305e-06, + "loss": 0.8451, + "step": 16403 + }, + { + "epoch": 0.8181546134663341, + "grad_norm": 0.46194266632581277, + "learning_rate": 4.214381582123409e-06, + "loss": 0.8599, + "step": 16404 + }, + { + "epoch": 0.8182044887780549, + "grad_norm": 0.855938206685275, + "learning_rate": 4.2121379383545045e-06, + "loss": 0.8293, + "step": 16405 + }, + { + "epoch": 0.8182543640897756, + "grad_norm": 0.6499658037780547, + "learning_rate": 4.20989483703555e-06, + "loss": 0.8997, + "step": 16406 + }, + { + "epoch": 0.8183042394014962, + "grad_norm": 0.601115360201019, + "learning_rate": 4.207652278225088e-06, + "loss": 0.8846, + "step": 16407 + }, + { + "epoch": 0.818354114713217, + "grad_norm": 0.6032584959329576, + "learning_rate": 4.205410261981629e-06, + "loss": 0.86, + "step": 16408 + }, + { + "epoch": 0.8184039900249377, + "grad_norm": 0.6414479116147529, + "learning_rate": 4.203168788363695e-06, + "loss": 0.8883, + "step": 16409 + }, + { + "epoch": 0.8184538653366583, + "grad_norm": 0.8937747414327858, + "learning_rate": 4.200927857429754e-06, + "loss": 0.8726, + "step": 16410 + }, + { + "epoch": 0.818503740648379, + "grad_norm": 0.4650557583285901, + "learning_rate": 4.198687469238297e-06, + "loss": 0.8028, + "step": 16411 + }, + { + "epoch": 0.8185536159600998, + "grad_norm": 0.6148266603059105, + "learning_rate": 4.19644762384778e-06, + "loss": 0.8552, + "step": 16412 + }, + { + "epoch": 0.8186034912718204, + "grad_norm": 0.6632310810163806, + "learning_rate": 4.19420832131665e-06, + "loss": 0.8496, + "step": 16413 + }, + { + "epoch": 0.8186533665835412, + "grad_norm": 0.5183704135654408, + "learning_rate": 4.1919695617033404e-06, + "loss": 0.8957, + "step": 16414 + }, + { + "epoch": 0.8187032418952619, + "grad_norm": 0.5357068500390854, + "learning_rate": 4.189731345066275e-06, + "loss": 0.8064, + "step": 16415 + }, + { + "epoch": 0.8187531172069825, + "grad_norm": 0.5174713458486877, + "learning_rate": 4.18749367146386e-06, + "loss": 0.8671, + "step": 16416 + }, + { + "epoch": 0.8188029925187033, + "grad_norm": 0.7321381566065307, + "learning_rate": 4.185256540954482e-06, + "loss": 0.8819, + "step": 16417 + }, + { + "epoch": 0.818852867830424, + "grad_norm": 0.6149176098345971, + "learning_rate": 4.183019953596512e-06, + "loss": 0.8685, + "step": 16418 + }, + { + "epoch": 0.8189027431421446, + "grad_norm": 0.5288055295987988, + "learning_rate": 4.180783909448327e-06, + "loss": 0.8383, + "step": 16419 + }, + { + "epoch": 0.8189526184538654, + "grad_norm": 0.5503125583372993, + "learning_rate": 4.178548408568267e-06, + "loss": 0.8428, + "step": 16420 + }, + { + "epoch": 0.819002493765586, + "grad_norm": 0.9037883462796158, + "learning_rate": 4.176313451014669e-06, + "loss": 0.8256, + "step": 16421 + }, + { + "epoch": 0.8190523690773067, + "grad_norm": 0.5455012988636129, + "learning_rate": 4.174079036845846e-06, + "loss": 0.8331, + "step": 16422 + }, + { + "epoch": 0.8191022443890275, + "grad_norm": 0.585130457025225, + "learning_rate": 4.171845166120117e-06, + "loss": 0.8105, + "step": 16423 + }, + { + "epoch": 0.8191521197007481, + "grad_norm": 0.7079036707980082, + "learning_rate": 4.169611838895765e-06, + "loss": 0.8517, + "step": 16424 + }, + { + "epoch": 0.8192019950124688, + "grad_norm": 0.5709719318309527, + "learning_rate": 4.1673790552310656e-06, + "loss": 0.8344, + "step": 16425 + }, + { + "epoch": 0.8192518703241896, + "grad_norm": 0.5699459799306028, + "learning_rate": 4.16514681518429e-06, + "loss": 0.8326, + "step": 16426 + }, + { + "epoch": 0.8193017456359102, + "grad_norm": 0.4715933985795828, + "learning_rate": 4.162915118813685e-06, + "loss": 0.796, + "step": 16427 + }, + { + "epoch": 0.8193516209476309, + "grad_norm": 0.5276853699847852, + "learning_rate": 4.160683966177487e-06, + "loss": 0.8063, + "step": 16428 + }, + { + "epoch": 0.8194014962593517, + "grad_norm": 0.5926722346793937, + "learning_rate": 4.158453357333905e-06, + "loss": 0.8679, + "step": 16429 + }, + { + "epoch": 0.8194513715710723, + "grad_norm": 0.7849196812274297, + "learning_rate": 4.156223292341161e-06, + "loss": 0.8703, + "step": 16430 + }, + { + "epoch": 0.819501246882793, + "grad_norm": 0.6888895814220157, + "learning_rate": 4.1539937712574446e-06, + "loss": 0.8798, + "step": 16431 + }, + { + "epoch": 0.8195511221945138, + "grad_norm": 0.6114158450511281, + "learning_rate": 4.151764794140927e-06, + "loss": 0.8691, + "step": 16432 + }, + { + "epoch": 0.8196009975062344, + "grad_norm": 0.44326587543585894, + "learning_rate": 4.149536361049774e-06, + "loss": 0.7979, + "step": 16433 + }, + { + "epoch": 0.8196508728179551, + "grad_norm": 0.46839025286577585, + "learning_rate": 4.147308472042144e-06, + "loss": 0.8317, + "step": 16434 + }, + { + "epoch": 0.8197007481296759, + "grad_norm": 0.7586555432138324, + "learning_rate": 4.145081127176168e-06, + "loss": 0.8642, + "step": 16435 + }, + { + "epoch": 0.8197506234413965, + "grad_norm": 1.4372380786735013, + "learning_rate": 4.142854326509962e-06, + "loss": 0.8195, + "step": 16436 + }, + { + "epoch": 0.8198004987531172, + "grad_norm": 0.5204444672342894, + "learning_rate": 4.140628070101635e-06, + "loss": 0.8678, + "step": 16437 + }, + { + "epoch": 0.8198503740648379, + "grad_norm": 0.651001288151453, + "learning_rate": 4.1384023580092965e-06, + "loss": 0.8629, + "step": 16438 + }, + { + "epoch": 0.8199002493765586, + "grad_norm": 0.5325633547730052, + "learning_rate": 4.136177190290999e-06, + "loss": 0.8486, + "step": 16439 + }, + { + "epoch": 0.8199501246882793, + "grad_norm": 0.5410517741472113, + "learning_rate": 4.133952567004826e-06, + "loss": 0.8828, + "step": 16440 + }, + { + "epoch": 0.82, + "grad_norm": 0.5141075167677563, + "learning_rate": 4.131728488208814e-06, + "loss": 0.8508, + "step": 16441 + }, + { + "epoch": 0.8200498753117207, + "grad_norm": 0.5695798021404165, + "learning_rate": 4.129504953961022e-06, + "loss": 0.84, + "step": 16442 + }, + { + "epoch": 0.8200997506234414, + "grad_norm": 0.5904312575643207, + "learning_rate": 4.127281964319446e-06, + "loss": 0.8274, + "step": 16443 + }, + { + "epoch": 0.820149625935162, + "grad_norm": 0.5293612069526489, + "learning_rate": 4.1250595193421114e-06, + "loss": 0.8236, + "step": 16444 + }, + { + "epoch": 0.8201995012468828, + "grad_norm": 0.7057230887511327, + "learning_rate": 4.1228376190869995e-06, + "loss": 0.8692, + "step": 16445 + }, + { + "epoch": 0.8202493765586035, + "grad_norm": 0.651463863546255, + "learning_rate": 4.120616263612109e-06, + "loss": 0.8244, + "step": 16446 + }, + { + "epoch": 0.8202992518703242, + "grad_norm": 0.4985470545217327, + "learning_rate": 4.118395452975382e-06, + "loss": 0.8697, + "step": 16447 + }, + { + "epoch": 0.8203491271820449, + "grad_norm": 0.5192879427533921, + "learning_rate": 4.116175187234786e-06, + "loss": 0.8444, + "step": 16448 + }, + { + "epoch": 0.8203990024937656, + "grad_norm": 1.33425170329381, + "learning_rate": 4.11395546644825e-06, + "loss": 0.8155, + "step": 16449 + }, + { + "epoch": 0.8204488778054863, + "grad_norm": 0.49615510884235425, + "learning_rate": 4.111736290673701e-06, + "loss": 0.7995, + "step": 16450 + }, + { + "epoch": 0.820498753117207, + "grad_norm": 0.4845267609501319, + "learning_rate": 4.109517659969036e-06, + "loss": 0.8356, + "step": 16451 + }, + { + "epoch": 0.8205486284289277, + "grad_norm": 0.5817966505975605, + "learning_rate": 4.107299574392168e-06, + "loss": 0.8793, + "step": 16452 + }, + { + "epoch": 0.8205985037406484, + "grad_norm": 0.7908582341975889, + "learning_rate": 4.105082034000965e-06, + "loss": 0.8566, + "step": 16453 + }, + { + "epoch": 0.8206483790523691, + "grad_norm": 0.7520619353044592, + "learning_rate": 4.102865038853296e-06, + "loss": 0.8753, + "step": 16454 + }, + { + "epoch": 0.8206982543640898, + "grad_norm": 0.5216151648482501, + "learning_rate": 4.100648589007006e-06, + "loss": 0.8255, + "step": 16455 + }, + { + "epoch": 0.8207481296758105, + "grad_norm": 0.5339168573059069, + "learning_rate": 4.098432684519943e-06, + "loss": 0.8494, + "step": 16456 + }, + { + "epoch": 0.8207980049875312, + "grad_norm": 0.6778655736270187, + "learning_rate": 4.096217325449925e-06, + "loss": 0.8325, + "step": 16457 + }, + { + "epoch": 0.8208478802992518, + "grad_norm": 0.5132089641138924, + "learning_rate": 4.094002511854761e-06, + "loss": 0.861, + "step": 16458 + }, + { + "epoch": 0.8208977556109726, + "grad_norm": 0.4811821199254176, + "learning_rate": 4.0917882437922386e-06, + "loss": 0.8575, + "step": 16459 + }, + { + "epoch": 0.8209476309226933, + "grad_norm": 0.5514685808640186, + "learning_rate": 4.08957452132015e-06, + "loss": 0.8002, + "step": 16460 + }, + { + "epoch": 0.8209975062344139, + "grad_norm": 0.5486524935310059, + "learning_rate": 4.0873613444962575e-06, + "loss": 0.8343, + "step": 16461 + }, + { + "epoch": 0.8210473815461347, + "grad_norm": 0.517685950739049, + "learning_rate": 4.085148713378304e-06, + "loss": 0.8294, + "step": 16462 + }, + { + "epoch": 0.8210972568578554, + "grad_norm": 0.5858959138003026, + "learning_rate": 4.0829366280240394e-06, + "loss": 0.8986, + "step": 16463 + }, + { + "epoch": 0.821147132169576, + "grad_norm": 0.8244564723739747, + "learning_rate": 4.080725088491183e-06, + "loss": 0.8428, + "step": 16464 + }, + { + "epoch": 0.8211970074812968, + "grad_norm": 0.5848512302605798, + "learning_rate": 4.078514094837446e-06, + "loss": 0.823, + "step": 16465 + }, + { + "epoch": 0.8212468827930175, + "grad_norm": 0.6443661305458265, + "learning_rate": 4.07630364712051e-06, + "loss": 0.8483, + "step": 16466 + }, + { + "epoch": 0.8212967581047381, + "grad_norm": 0.530727209619832, + "learning_rate": 4.074093745398078e-06, + "loss": 0.8497, + "step": 16467 + }, + { + "epoch": 0.8213466334164589, + "grad_norm": 0.6720876084154163, + "learning_rate": 4.071884389727801e-06, + "loss": 0.8445, + "step": 16468 + }, + { + "epoch": 0.8213965087281796, + "grad_norm": 0.5072688029993522, + "learning_rate": 4.069675580167334e-06, + "loss": 0.8627, + "step": 16469 + }, + { + "epoch": 0.8214463840399002, + "grad_norm": 0.5550063784542844, + "learning_rate": 4.067467316774309e-06, + "loss": 0.8262, + "step": 16470 + }, + { + "epoch": 0.821496259351621, + "grad_norm": 0.7192436896043718, + "learning_rate": 4.065259599606369e-06, + "loss": 0.8405, + "step": 16471 + }, + { + "epoch": 0.8215461346633417, + "grad_norm": 0.6654236806907313, + "learning_rate": 4.063052428721101e-06, + "loss": 0.8297, + "step": 16472 + }, + { + "epoch": 0.8215960099750623, + "grad_norm": 0.5437058641632166, + "learning_rate": 4.060845804176114e-06, + "loss": 0.8394, + "step": 16473 + }, + { + "epoch": 0.8216458852867831, + "grad_norm": 0.5085149375498405, + "learning_rate": 4.058639726028976e-06, + "loss": 0.8287, + "step": 16474 + }, + { + "epoch": 0.8216957605985037, + "grad_norm": 0.5493977124843104, + "learning_rate": 4.0564341943372775e-06, + "loss": 0.8318, + "step": 16475 + }, + { + "epoch": 0.8217456359102244, + "grad_norm": 0.5079775472027844, + "learning_rate": 4.054229209158545e-06, + "loss": 0.8972, + "step": 16476 + }, + { + "epoch": 0.8217955112219452, + "grad_norm": 0.6371203049329461, + "learning_rate": 4.052024770550328e-06, + "loss": 0.8455, + "step": 16477 + }, + { + "epoch": 0.8218453865336658, + "grad_norm": 0.5224011583990527, + "learning_rate": 4.049820878570148e-06, + "loss": 0.7948, + "step": 16478 + }, + { + "epoch": 0.8218952618453865, + "grad_norm": 0.6507622930465837, + "learning_rate": 4.0476175332755245e-06, + "loss": 0.84, + "step": 16479 + }, + { + "epoch": 0.8219451371571073, + "grad_norm": 0.5057573779472122, + "learning_rate": 4.045414734723934e-06, + "loss": 0.8369, + "step": 16480 + }, + { + "epoch": 0.8219950124688279, + "grad_norm": 0.5894717330755675, + "learning_rate": 4.043212482972877e-06, + "loss": 0.8565, + "step": 16481 + }, + { + "epoch": 0.8220448877805486, + "grad_norm": 0.8249237979421384, + "learning_rate": 4.0410107780798065e-06, + "loss": 0.8749, + "step": 16482 + }, + { + "epoch": 0.8220947630922694, + "grad_norm": 0.5402601131170384, + "learning_rate": 4.038809620102183e-06, + "loss": 0.8356, + "step": 16483 + }, + { + "epoch": 0.82214463840399, + "grad_norm": 0.5987289683474164, + "learning_rate": 4.036609009097434e-06, + "loss": 0.863, + "step": 16484 + }, + { + "epoch": 0.8221945137157107, + "grad_norm": 0.6005247002358239, + "learning_rate": 4.034408945122997e-06, + "loss": 0.8349, + "step": 16485 + }, + { + "epoch": 0.8222443890274315, + "grad_norm": 0.4848925071775631, + "learning_rate": 4.032209428236275e-06, + "loss": 0.7835, + "step": 16486 + }, + { + "epoch": 0.8222942643391521, + "grad_norm": 0.5034682707443764, + "learning_rate": 4.0300104584946655e-06, + "loss": 0.8847, + "step": 16487 + }, + { + "epoch": 0.8223441396508728, + "grad_norm": 0.6190812235832136, + "learning_rate": 4.027812035955542e-06, + "loss": 0.8019, + "step": 16488 + }, + { + "epoch": 0.8223940149625936, + "grad_norm": 0.7050030778668863, + "learning_rate": 4.0256141606762836e-06, + "loss": 0.8519, + "step": 16489 + }, + { + "epoch": 0.8224438902743142, + "grad_norm": 0.7170865094392422, + "learning_rate": 4.0234168327142355e-06, + "loss": 0.8595, + "step": 16490 + }, + { + "epoch": 0.8224937655860349, + "grad_norm": 0.590600134348759, + "learning_rate": 4.0212200521267365e-06, + "loss": 0.8273, + "step": 16491 + }, + { + "epoch": 0.8225436408977556, + "grad_norm": 0.5031617384319338, + "learning_rate": 4.019023818971107e-06, + "loss": 0.8256, + "step": 16492 + }, + { + "epoch": 0.8225935162094763, + "grad_norm": 0.5319820900263452, + "learning_rate": 4.016828133304667e-06, + "loss": 0.7662, + "step": 16493 + }, + { + "epoch": 0.822643391521197, + "grad_norm": 0.5136265780967859, + "learning_rate": 4.014632995184706e-06, + "loss": 0.8539, + "step": 16494 + }, + { + "epoch": 0.8226932668329177, + "grad_norm": 0.7770475496354629, + "learning_rate": 4.012438404668506e-06, + "loss": 0.8571, + "step": 16495 + }, + { + "epoch": 0.8227431421446384, + "grad_norm": 0.5634676601308742, + "learning_rate": 4.010244361813326e-06, + "loss": 0.8514, + "step": 16496 + }, + { + "epoch": 0.8227930174563591, + "grad_norm": 0.6950420033223246, + "learning_rate": 4.008050866676433e-06, + "loss": 0.8189, + "step": 16497 + }, + { + "epoch": 0.8228428927680798, + "grad_norm": 0.5202663638436975, + "learning_rate": 4.0058579193150535e-06, + "loss": 0.8421, + "step": 16498 + }, + { + "epoch": 0.8228927680798005, + "grad_norm": 0.557279861788713, + "learning_rate": 4.00366551978642e-06, + "loss": 0.8478, + "step": 16499 + }, + { + "epoch": 0.8229426433915212, + "grad_norm": 0.6765405310168144, + "learning_rate": 4.001473668147731e-06, + "loss": 0.8437, + "step": 16500 + }, + { + "epoch": 0.8229925187032419, + "grad_norm": 0.5451806038354328, + "learning_rate": 3.999282364456194e-06, + "loss": 0.8221, + "step": 16501 + }, + { + "epoch": 0.8230423940149626, + "grad_norm": 0.5580101211438422, + "learning_rate": 3.997091608768988e-06, + "loss": 0.8348, + "step": 16502 + }, + { + "epoch": 0.8230922693266833, + "grad_norm": 0.5382520105667621, + "learning_rate": 3.994901401143267e-06, + "loss": 0.8557, + "step": 16503 + }, + { + "epoch": 0.823142144638404, + "grad_norm": 0.5763416022432223, + "learning_rate": 3.992711741636201e-06, + "loss": 0.8571, + "step": 16504 + }, + { + "epoch": 0.8231920199501247, + "grad_norm": 0.5504646330691443, + "learning_rate": 3.990522630304921e-06, + "loss": 0.8905, + "step": 16505 + }, + { + "epoch": 0.8232418952618454, + "grad_norm": 0.5095697430371197, + "learning_rate": 3.98833406720655e-06, + "loss": 0.8401, + "step": 16506 + }, + { + "epoch": 0.8232917705735661, + "grad_norm": 0.5621579776518031, + "learning_rate": 3.986146052398191e-06, + "loss": 0.8482, + "step": 16507 + }, + { + "epoch": 0.8233416458852868, + "grad_norm": 0.6092375740481988, + "learning_rate": 3.9839585859369585e-06, + "loss": 0.8727, + "step": 16508 + }, + { + "epoch": 0.8233915211970075, + "grad_norm": 0.9306760833803941, + "learning_rate": 3.981771667879911e-06, + "loss": 0.9177, + "step": 16509 + }, + { + "epoch": 0.8234413965087282, + "grad_norm": 0.6476610162359477, + "learning_rate": 3.979585298284128e-06, + "loss": 0.8488, + "step": 16510 + }, + { + "epoch": 0.8234912718204489, + "grad_norm": 0.7151207025714755, + "learning_rate": 3.977399477206656e-06, + "loss": 0.8443, + "step": 16511 + }, + { + "epoch": 0.8235411471321695, + "grad_norm": 0.5450557828669317, + "learning_rate": 3.975214204704547e-06, + "loss": 0.8206, + "step": 16512 + }, + { + "epoch": 0.8235910224438903, + "grad_norm": 0.6783769022570925, + "learning_rate": 3.9730294808348e-06, + "loss": 0.8622, + "step": 16513 + }, + { + "epoch": 0.823640897755611, + "grad_norm": 0.5967811398375884, + "learning_rate": 3.970845305654447e-06, + "loss": 0.8256, + "step": 16514 + }, + { + "epoch": 0.8236907730673316, + "grad_norm": 0.7030506158457969, + "learning_rate": 3.968661679220468e-06, + "loss": 0.8382, + "step": 16515 + }, + { + "epoch": 0.8237406483790524, + "grad_norm": 0.5638523037700862, + "learning_rate": 3.9664786015898626e-06, + "loss": 0.8472, + "step": 16516 + }, + { + "epoch": 0.8237905236907731, + "grad_norm": 0.7585748797749813, + "learning_rate": 3.964296072819571e-06, + "loss": 0.8054, + "step": 16517 + }, + { + "epoch": 0.8238403990024937, + "grad_norm": 0.6641019919679849, + "learning_rate": 3.962114092966565e-06, + "loss": 0.8537, + "step": 16518 + }, + { + "epoch": 0.8238902743142145, + "grad_norm": 0.6078346205540329, + "learning_rate": 3.959932662087779e-06, + "loss": 0.8449, + "step": 16519 + }, + { + "epoch": 0.8239401496259352, + "grad_norm": 0.6619136331329981, + "learning_rate": 3.957751780240129e-06, + "loss": 0.8637, + "step": 16520 + }, + { + "epoch": 0.8239900249376558, + "grad_norm": 0.5724603611666826, + "learning_rate": 3.955571447480527e-06, + "loss": 0.8258, + "step": 16521 + }, + { + "epoch": 0.8240399002493766, + "grad_norm": 0.5814260186521762, + "learning_rate": 3.953391663865874e-06, + "loss": 0.8862, + "step": 16522 + }, + { + "epoch": 0.8240897755610973, + "grad_norm": 1.0130783977384288, + "learning_rate": 3.95121242945305e-06, + "loss": 0.8898, + "step": 16523 + }, + { + "epoch": 0.8241396508728179, + "grad_norm": 0.5878518032252376, + "learning_rate": 3.949033744298913e-06, + "loss": 0.8671, + "step": 16524 + }, + { + "epoch": 0.8241895261845387, + "grad_norm": 0.5586505467248742, + "learning_rate": 3.946855608460315e-06, + "loss": 0.8786, + "step": 16525 + }, + { + "epoch": 0.8242394014962594, + "grad_norm": 0.5663376331433272, + "learning_rate": 3.944678021994103e-06, + "loss": 0.8443, + "step": 16526 + }, + { + "epoch": 0.82428927680798, + "grad_norm": 0.7890442368083839, + "learning_rate": 3.942500984957096e-06, + "loss": 0.8218, + "step": 16527 + }, + { + "epoch": 0.8243391521197008, + "grad_norm": 0.5649364358356773, + "learning_rate": 3.9403244974061e-06, + "loss": 0.857, + "step": 16528 + }, + { + "epoch": 0.8243890274314214, + "grad_norm": 0.5664460264956455, + "learning_rate": 3.938148559397908e-06, + "loss": 0.8534, + "step": 16529 + }, + { + "epoch": 0.8244389027431421, + "grad_norm": 0.47758484680176916, + "learning_rate": 3.935973170989307e-06, + "loss": 0.8574, + "step": 16530 + }, + { + "epoch": 0.8244887780548629, + "grad_norm": 0.5112020150146159, + "learning_rate": 3.93379833223706e-06, + "loss": 0.8668, + "step": 16531 + }, + { + "epoch": 0.8245386533665835, + "grad_norm": 0.5592605506731635, + "learning_rate": 3.9316240431979176e-06, + "loss": 0.7845, + "step": 16532 + }, + { + "epoch": 0.8245885286783042, + "grad_norm": 0.5155060067811039, + "learning_rate": 3.929450303928612e-06, + "loss": 0.8375, + "step": 16533 + }, + { + "epoch": 0.824638403990025, + "grad_norm": 0.6467271717402415, + "learning_rate": 3.927277114485872e-06, + "loss": 0.7712, + "step": 16534 + }, + { + "epoch": 0.8246882793017456, + "grad_norm": 0.5274743065054718, + "learning_rate": 3.925104474926408e-06, + "loss": 0.8154, + "step": 16535 + }, + { + "epoch": 0.8247381546134663, + "grad_norm": 0.5049406733878153, + "learning_rate": 3.9229323853069124e-06, + "loss": 0.8241, + "step": 16536 + }, + { + "epoch": 0.8247880299251871, + "grad_norm": 0.5382716920153102, + "learning_rate": 3.9207608456840544e-06, + "loss": 0.8128, + "step": 16537 + }, + { + "epoch": 0.8248379052369077, + "grad_norm": 0.647380852291088, + "learning_rate": 3.918589856114516e-06, + "loss": 0.8771, + "step": 16538 + }, + { + "epoch": 0.8248877805486284, + "grad_norm": 0.7775564191236777, + "learning_rate": 3.91641941665494e-06, + "loss": 0.8473, + "step": 16539 + }, + { + "epoch": 0.8249376558603492, + "grad_norm": 0.6170846896293362, + "learning_rate": 3.91424952736196e-06, + "loss": 0.8464, + "step": 16540 + }, + { + "epoch": 0.8249875311720698, + "grad_norm": 0.7165230969403574, + "learning_rate": 3.912080188292211e-06, + "loss": 0.8562, + "step": 16541 + }, + { + "epoch": 0.8250374064837905, + "grad_norm": 0.5095510585198089, + "learning_rate": 3.90991139950228e-06, + "loss": 0.8342, + "step": 16542 + }, + { + "epoch": 0.8250872817955113, + "grad_norm": 0.5393466604730338, + "learning_rate": 3.90774316104878e-06, + "loss": 0.8486, + "step": 16543 + }, + { + "epoch": 0.8251371571072319, + "grad_norm": 0.6067379788265724, + "learning_rate": 3.9055754729882735e-06, + "loss": 0.8646, + "step": 16544 + }, + { + "epoch": 0.8251870324189526, + "grad_norm": 0.5836778098807991, + "learning_rate": 3.903408335377348e-06, + "loss": 0.8592, + "step": 16545 + }, + { + "epoch": 0.8252369077306733, + "grad_norm": 0.8170826956103103, + "learning_rate": 3.90124174827253e-06, + "loss": 0.8754, + "step": 16546 + }, + { + "epoch": 0.825286783042394, + "grad_norm": 0.5931339428045417, + "learning_rate": 3.899075711730369e-06, + "loss": 0.8571, + "step": 16547 + }, + { + "epoch": 0.8253366583541147, + "grad_norm": 0.5494201376001465, + "learning_rate": 3.896910225807379e-06, + "loss": 0.7922, + "step": 16548 + }, + { + "epoch": 0.8253865336658354, + "grad_norm": 0.49648668233853155, + "learning_rate": 3.894745290560084e-06, + "loss": 0.8433, + "step": 16549 + }, + { + "epoch": 0.8254364089775561, + "grad_norm": 0.5374769823562854, + "learning_rate": 3.892580906044954e-06, + "loss": 0.8371, + "step": 16550 + }, + { + "epoch": 0.8254862842892768, + "grad_norm": 0.5351509700732002, + "learning_rate": 3.890417072318486e-06, + "loss": 0.8618, + "step": 16551 + }, + { + "epoch": 0.8255361596009975, + "grad_norm": 0.5171103400813883, + "learning_rate": 3.88825378943713e-06, + "loss": 0.8468, + "step": 16552 + }, + { + "epoch": 0.8255860349127182, + "grad_norm": 1.5018288585290422, + "learning_rate": 3.886091057457353e-06, + "loss": 0.8291, + "step": 16553 + }, + { + "epoch": 0.8256359102244389, + "grad_norm": 0.6015707178051061, + "learning_rate": 3.883928876435569e-06, + "loss": 0.8526, + "step": 16554 + }, + { + "epoch": 0.8256857855361596, + "grad_norm": 0.5559881206478551, + "learning_rate": 3.88176724642822e-06, + "loss": 0.8411, + "step": 16555 + }, + { + "epoch": 0.8257356608478803, + "grad_norm": 0.6214789889985701, + "learning_rate": 3.8796061674917e-06, + "loss": 0.8542, + "step": 16556 + }, + { + "epoch": 0.825785536159601, + "grad_norm": 0.5469637707393941, + "learning_rate": 3.877445639682406e-06, + "loss": 0.8521, + "step": 16557 + }, + { + "epoch": 0.8258354114713217, + "grad_norm": 0.5889966625442298, + "learning_rate": 3.875285663056707e-06, + "loss": 0.854, + "step": 16558 + }, + { + "epoch": 0.8258852867830424, + "grad_norm": 0.670025606552913, + "learning_rate": 3.873126237670985e-06, + "loss": 0.8162, + "step": 16559 + }, + { + "epoch": 0.8259351620947631, + "grad_norm": 0.5404652294164525, + "learning_rate": 3.870967363581574e-06, + "loss": 0.8789, + "step": 16560 + }, + { + "epoch": 0.8259850374064838, + "grad_norm": 0.5741585477825314, + "learning_rate": 3.8688090408448145e-06, + "loss": 0.8403, + "step": 16561 + }, + { + "epoch": 0.8260349127182045, + "grad_norm": 0.5481730017903276, + "learning_rate": 3.866651269517021e-06, + "loss": 0.8721, + "step": 16562 + }, + { + "epoch": 0.8260847880299251, + "grad_norm": 0.6091343082448564, + "learning_rate": 3.86449404965451e-06, + "loss": 0.8959, + "step": 16563 + }, + { + "epoch": 0.8261346633416459, + "grad_norm": 0.6593614140389763, + "learning_rate": 3.8623373813135685e-06, + "loss": 0.8153, + "step": 16564 + }, + { + "epoch": 0.8261845386533666, + "grad_norm": 0.6263287958027194, + "learning_rate": 3.8601812645504746e-06, + "loss": 0.8439, + "step": 16565 + }, + { + "epoch": 0.8262344139650872, + "grad_norm": 0.5674849867608024, + "learning_rate": 3.8580256994214805e-06, + "loss": 0.8055, + "step": 16566 + }, + { + "epoch": 0.826284289276808, + "grad_norm": 0.6485599298401193, + "learning_rate": 3.855870685982851e-06, + "loss": 0.8746, + "step": 16567 + }, + { + "epoch": 0.8263341645885287, + "grad_norm": 0.6251689042525973, + "learning_rate": 3.853716224290812e-06, + "loss": 0.8596, + "step": 16568 + }, + { + "epoch": 0.8263840399002493, + "grad_norm": 0.5577030824349164, + "learning_rate": 3.8515623144015885e-06, + "loss": 0.859, + "step": 16569 + }, + { + "epoch": 0.8264339152119701, + "grad_norm": 0.5978523016321544, + "learning_rate": 3.849408956371373e-06, + "loss": 0.8521, + "step": 16570 + }, + { + "epoch": 0.8264837905236908, + "grad_norm": 0.529518830132404, + "learning_rate": 3.847256150256367e-06, + "loss": 0.856, + "step": 16571 + }, + { + "epoch": 0.8265336658354114, + "grad_norm": 0.630535403466217, + "learning_rate": 3.845103896112751e-06, + "loss": 0.8482, + "step": 16572 + }, + { + "epoch": 0.8265835411471322, + "grad_norm": 0.6013116943303602, + "learning_rate": 3.8429521939966766e-06, + "loss": 0.841, + "step": 16573 + }, + { + "epoch": 0.8266334164588529, + "grad_norm": 0.6131456752822967, + "learning_rate": 3.840801043964293e-06, + "loss": 0.7982, + "step": 16574 + }, + { + "epoch": 0.8266832917705735, + "grad_norm": 0.7433549501080968, + "learning_rate": 3.838650446071742e-06, + "loss": 0.8859, + "step": 16575 + }, + { + "epoch": 0.8267331670822943, + "grad_norm": 0.7102206042955385, + "learning_rate": 3.836500400375137e-06, + "loss": 0.8586, + "step": 16576 + }, + { + "epoch": 0.826783042394015, + "grad_norm": 0.5238596217799217, + "learning_rate": 3.8343509069305755e-06, + "loss": 0.8425, + "step": 16577 + }, + { + "epoch": 0.8268329177057356, + "grad_norm": 0.6246735691183588, + "learning_rate": 3.8322019657941685e-06, + "loss": 0.8516, + "step": 16578 + }, + { + "epoch": 0.8268827930174564, + "grad_norm": 0.608839881063033, + "learning_rate": 3.830053577021964e-06, + "loss": 0.8256, + "step": 16579 + }, + { + "epoch": 0.8269326683291771, + "grad_norm": 0.6467929753200002, + "learning_rate": 3.827905740670043e-06, + "loss": 0.8253, + "step": 16580 + }, + { + "epoch": 0.8269825436408977, + "grad_norm": 0.6124106667775845, + "learning_rate": 3.825758456794442e-06, + "loss": 0.8834, + "step": 16581 + }, + { + "epoch": 0.8270324189526185, + "grad_norm": 0.5943642756143805, + "learning_rate": 3.8236117254512096e-06, + "loss": 0.838, + "step": 16582 + }, + { + "epoch": 0.8270822942643391, + "grad_norm": 0.5782242937418898, + "learning_rate": 3.821465546696337e-06, + "loss": 0.8387, + "step": 16583 + }, + { + "epoch": 0.8271321695760598, + "grad_norm": 0.5732983231742205, + "learning_rate": 3.819319920585851e-06, + "loss": 0.8603, + "step": 16584 + }, + { + "epoch": 0.8271820448877806, + "grad_norm": 0.906914474497314, + "learning_rate": 3.817174847175725e-06, + "loss": 0.7915, + "step": 16585 + }, + { + "epoch": 0.8272319201995012, + "grad_norm": 0.5388939172546529, + "learning_rate": 3.815030326521957e-06, + "loss": 0.8556, + "step": 16586 + }, + { + "epoch": 0.8272817955112219, + "grad_norm": 0.6298991997397274, + "learning_rate": 3.812886358680476e-06, + "loss": 0.8804, + "step": 16587 + }, + { + "epoch": 0.8273316708229427, + "grad_norm": 0.5841067538541777, + "learning_rate": 3.810742943707249e-06, + "loss": 0.8251, + "step": 16588 + }, + { + "epoch": 0.8273815461346633, + "grad_norm": 0.5494733658809892, + "learning_rate": 3.8086000816582035e-06, + "loss": 0.8228, + "step": 16589 + }, + { + "epoch": 0.827431421446384, + "grad_norm": 0.8538953345321986, + "learning_rate": 3.806457772589253e-06, + "loss": 0.8434, + "step": 16590 + }, + { + "epoch": 0.8274812967581048, + "grad_norm": 0.5297005121763527, + "learning_rate": 3.8043160165562976e-06, + "loss": 0.8434, + "step": 16591 + }, + { + "epoch": 0.8275311720698254, + "grad_norm": 0.8622842476076856, + "learning_rate": 3.8021748136152362e-06, + "loss": 0.791, + "step": 16592 + }, + { + "epoch": 0.8275810473815461, + "grad_norm": 0.6172165075061523, + "learning_rate": 3.8000341638219332e-06, + "loss": 0.8282, + "step": 16593 + }, + { + "epoch": 0.8276309226932669, + "grad_norm": 1.1237561039348802, + "learning_rate": 3.797894067232255e-06, + "loss": 0.8361, + "step": 16594 + }, + { + "epoch": 0.8276807980049875, + "grad_norm": 0.9309389040630277, + "learning_rate": 3.7957545239020342e-06, + "loss": 0.8311, + "step": 16595 + }, + { + "epoch": 0.8277306733167082, + "grad_norm": 0.5895714083940742, + "learning_rate": 3.7936155338871153e-06, + "loss": 0.8324, + "step": 16596 + }, + { + "epoch": 0.827780548628429, + "grad_norm": 0.5713727383906484, + "learning_rate": 3.7914770972433067e-06, + "loss": 0.8404, + "step": 16597 + }, + { + "epoch": 0.8278304239401496, + "grad_norm": 0.622245328662972, + "learning_rate": 3.7893392140264135e-06, + "loss": 0.8454, + "step": 16598 + }, + { + "epoch": 0.8278802992518703, + "grad_norm": 0.5855306708386177, + "learning_rate": 3.7872018842922158e-06, + "loss": 0.8087, + "step": 16599 + }, + { + "epoch": 0.827930174563591, + "grad_norm": 0.5692845152730595, + "learning_rate": 3.7850651080964945e-06, + "loss": 0.8674, + "step": 16600 + }, + { + "epoch": 0.8279800498753117, + "grad_norm": 0.5914443954751406, + "learning_rate": 3.782928885495007e-06, + "loss": 0.8357, + "step": 16601 + }, + { + "epoch": 0.8280299251870324, + "grad_norm": 0.5939484088913742, + "learning_rate": 3.7807932165434934e-06, + "loss": 0.8393, + "step": 16602 + }, + { + "epoch": 0.8280798004987531, + "grad_norm": 0.6017502628714462, + "learning_rate": 3.7786581012976803e-06, + "loss": 0.8622, + "step": 16603 + }, + { + "epoch": 0.8281296758104738, + "grad_norm": 0.6592050488526123, + "learning_rate": 3.77652353981329e-06, + "loss": 0.9113, + "step": 16604 + }, + { + "epoch": 0.8281795511221945, + "grad_norm": 0.5778700570990655, + "learning_rate": 3.774389532146019e-06, + "loss": 0.8437, + "step": 16605 + }, + { + "epoch": 0.8282294264339152, + "grad_norm": 0.4621727631297759, + "learning_rate": 3.7722560783515576e-06, + "loss": 0.7982, + "step": 16606 + }, + { + "epoch": 0.8282793017456359, + "grad_norm": 0.5030665765256221, + "learning_rate": 3.7701231784855656e-06, + "loss": 0.7988, + "step": 16607 + }, + { + "epoch": 0.8283291770573566, + "grad_norm": 0.5113873280147545, + "learning_rate": 3.767990832603713e-06, + "loss": 0.8677, + "step": 16608 + }, + { + "epoch": 0.8283790523690773, + "grad_norm": 0.5647705813341659, + "learning_rate": 3.765859040761638e-06, + "loss": 0.808, + "step": 16609 + }, + { + "epoch": 0.828428927680798, + "grad_norm": 0.5677915539850038, + "learning_rate": 3.7637278030149686e-06, + "loss": 0.8158, + "step": 16610 + }, + { + "epoch": 0.8284788029925187, + "grad_norm": 0.5555726626977939, + "learning_rate": 3.7615971194193134e-06, + "loss": 0.8219, + "step": 16611 + }, + { + "epoch": 0.8285286783042394, + "grad_norm": 0.5775596533296008, + "learning_rate": 3.7594669900302802e-06, + "loss": 0.8596, + "step": 16612 + }, + { + "epoch": 0.8285785536159601, + "grad_norm": 1.185563673604096, + "learning_rate": 3.75733741490345e-06, + "loss": 0.8502, + "step": 16613 + }, + { + "epoch": 0.8286284289276808, + "grad_norm": 0.7115966948927193, + "learning_rate": 3.7552083940943923e-06, + "loss": 0.8449, + "step": 16614 + }, + { + "epoch": 0.8286783042394015, + "grad_norm": 1.1107644413751718, + "learning_rate": 3.753079927658665e-06, + "loss": 0.8672, + "step": 16615 + }, + { + "epoch": 0.8287281795511222, + "grad_norm": 0.5372320671077767, + "learning_rate": 3.7509520156518014e-06, + "loss": 0.8661, + "step": 16616 + }, + { + "epoch": 0.8287780548628428, + "grad_norm": 0.5096121705792551, + "learning_rate": 3.7488246581293435e-06, + "loss": 0.8111, + "step": 16617 + }, + { + "epoch": 0.8288279301745636, + "grad_norm": 0.5837538289608358, + "learning_rate": 3.7466978551467857e-06, + "loss": 0.8584, + "step": 16618 + }, + { + "epoch": 0.8288778054862843, + "grad_norm": 0.6090276078908773, + "learning_rate": 3.7445716067596503e-06, + "loss": 0.8452, + "step": 16619 + }, + { + "epoch": 0.8289276807980049, + "grad_norm": 0.6405981090378606, + "learning_rate": 3.742445913023393e-06, + "loss": 0.865, + "step": 16620 + }, + { + "epoch": 0.8289775561097257, + "grad_norm": 0.5352291054179232, + "learning_rate": 3.7403207739935026e-06, + "loss": 0.8595, + "step": 16621 + }, + { + "epoch": 0.8290274314214464, + "grad_norm": 0.676285419708583, + "learning_rate": 3.7381961897254207e-06, + "loss": 0.8386, + "step": 16622 + }, + { + "epoch": 0.829077306733167, + "grad_norm": 0.7600181651767302, + "learning_rate": 3.7360721602746063e-06, + "loss": 0.8294, + "step": 16623 + }, + { + "epoch": 0.8291271820448878, + "grad_norm": 0.5653946257176815, + "learning_rate": 3.733948685696459e-06, + "loss": 0.7931, + "step": 16624 + }, + { + "epoch": 0.8291770573566085, + "grad_norm": 0.5757272820829291, + "learning_rate": 3.7318257660464094e-06, + "loss": 0.8777, + "step": 16625 + }, + { + "epoch": 0.8292269326683291, + "grad_norm": 0.5915639015205439, + "learning_rate": 3.72970340137985e-06, + "loss": 0.842, + "step": 16626 + }, + { + "epoch": 0.8292768079800499, + "grad_norm": 0.724432418040857, + "learning_rate": 3.727581591752158e-06, + "loss": 0.8667, + "step": 16627 + }, + { + "epoch": 0.8293266832917706, + "grad_norm": 0.5981727123800983, + "learning_rate": 3.7254603372187003e-06, + "loss": 0.8615, + "step": 16628 + }, + { + "epoch": 0.8293765586034912, + "grad_norm": 0.643197590428348, + "learning_rate": 3.7233396378348383e-06, + "loss": 0.8613, + "step": 16629 + }, + { + "epoch": 0.829426433915212, + "grad_norm": 0.5382215179604921, + "learning_rate": 3.7212194936559088e-06, + "loss": 0.845, + "step": 16630 + }, + { + "epoch": 0.8294763092269327, + "grad_norm": 0.9440675845762903, + "learning_rate": 3.7190999047372306e-06, + "loss": 0.8342, + "step": 16631 + }, + { + "epoch": 0.8295261845386533, + "grad_norm": 0.72098167821046, + "learning_rate": 3.7169808711341123e-06, + "loss": 0.8199, + "step": 16632 + }, + { + "epoch": 0.8295760598503741, + "grad_norm": 0.757773979394558, + "learning_rate": 3.7148623929018577e-06, + "loss": 0.8414, + "step": 16633 + }, + { + "epoch": 0.8296259351620948, + "grad_norm": 0.6130425934924555, + "learning_rate": 3.712744470095744e-06, + "loss": 0.8466, + "step": 16634 + }, + { + "epoch": 0.8296758104738154, + "grad_norm": 0.583499665929667, + "learning_rate": 3.710627102771036e-06, + "loss": 0.85, + "step": 16635 + }, + { + "epoch": 0.8297256857855362, + "grad_norm": 0.5141075745149332, + "learning_rate": 3.7085102909829805e-06, + "loss": 0.8772, + "step": 16636 + }, + { + "epoch": 0.8297755610972568, + "grad_norm": 0.5208210217205195, + "learning_rate": 3.706394034786831e-06, + "loss": 0.8728, + "step": 16637 + }, + { + "epoch": 0.8298254364089775, + "grad_norm": 0.5226730016921748, + "learning_rate": 3.7042783342377903e-06, + "loss": 0.8413, + "step": 16638 + }, + { + "epoch": 0.8298753117206983, + "grad_norm": 0.4985532132956815, + "learning_rate": 3.702163189391078e-06, + "loss": 0.8456, + "step": 16639 + }, + { + "epoch": 0.8299251870324189, + "grad_norm": 0.5664074982357585, + "learning_rate": 3.7000486003018807e-06, + "loss": 0.8317, + "step": 16640 + }, + { + "epoch": 0.8299750623441396, + "grad_norm": 0.6300165099875941, + "learning_rate": 3.697934567025396e-06, + "loss": 0.8329, + "step": 16641 + }, + { + "epoch": 0.8300249376558604, + "grad_norm": 0.5272855636919596, + "learning_rate": 3.6958210896167632e-06, + "loss": 0.8239, + "step": 16642 + }, + { + "epoch": 0.830074812967581, + "grad_norm": 0.6168162889795739, + "learning_rate": 3.6937081681311494e-06, + "loss": 0.8485, + "step": 16643 + }, + { + "epoch": 0.8301246882793017, + "grad_norm": 0.761608476924795, + "learning_rate": 3.691595802623679e-06, + "loss": 0.8545, + "step": 16644 + }, + { + "epoch": 0.8301745635910225, + "grad_norm": 0.5500082237329791, + "learning_rate": 3.6894839931494868e-06, + "loss": 0.8389, + "step": 16645 + }, + { + "epoch": 0.8302244389027431, + "grad_norm": 0.5742458867028475, + "learning_rate": 3.6873727397636726e-06, + "loss": 0.8901, + "step": 16646 + }, + { + "epoch": 0.8302743142144638, + "grad_norm": 0.625289042337802, + "learning_rate": 3.6852620425213258e-06, + "loss": 0.8454, + "step": 16647 + }, + { + "epoch": 0.8303241895261846, + "grad_norm": 0.591567786394247, + "learning_rate": 3.68315190147753e-06, + "loss": 0.8577, + "step": 16648 + }, + { + "epoch": 0.8303740648379052, + "grad_norm": 0.6499679810734313, + "learning_rate": 3.6810423166873386e-06, + "loss": 0.8737, + "step": 16649 + }, + { + "epoch": 0.830423940149626, + "grad_norm": 0.6244372699021751, + "learning_rate": 3.6789332882058105e-06, + "loss": 0.8224, + "step": 16650 + }, + { + "epoch": 0.8304738154613467, + "grad_norm": 0.5260951551945465, + "learning_rate": 3.6768248160879787e-06, + "loss": 0.8177, + "step": 16651 + }, + { + "epoch": 0.8305236907730673, + "grad_norm": 0.6100421870793845, + "learning_rate": 3.674716900388861e-06, + "loss": 0.8154, + "step": 16652 + }, + { + "epoch": 0.830573566084788, + "grad_norm": 0.6885348118656809, + "learning_rate": 3.672609541163452e-06, + "loss": 0.8274, + "step": 16653 + }, + { + "epoch": 0.8306234413965087, + "grad_norm": 0.6565362267830833, + "learning_rate": 3.6705027384667607e-06, + "loss": 0.8168, + "step": 16654 + }, + { + "epoch": 0.8306733167082294, + "grad_norm": 0.7230134125600368, + "learning_rate": 3.6683964923537484e-06, + "loss": 0.8269, + "step": 16655 + }, + { + "epoch": 0.8307231920199502, + "grad_norm": 0.5895191760829136, + "learning_rate": 3.666290802879396e-06, + "loss": 0.8472, + "step": 16656 + }, + { + "epoch": 0.8307730673316708, + "grad_norm": 0.9012797104544457, + "learning_rate": 3.664185670098624e-06, + "loss": 0.8552, + "step": 16657 + }, + { + "epoch": 0.8308229426433915, + "grad_norm": 0.5672426691110911, + "learning_rate": 3.6620810940663857e-06, + "loss": 0.834, + "step": 16658 + }, + { + "epoch": 0.8308728179551123, + "grad_norm": 0.5711261429555329, + "learning_rate": 3.659977074837584e-06, + "loss": 0.8338, + "step": 16659 + }, + { + "epoch": 0.8309226932668329, + "grad_norm": 0.635795862361717, + "learning_rate": 3.6578736124671414e-06, + "loss": 0.901, + "step": 16660 + }, + { + "epoch": 0.8309725685785536, + "grad_norm": 0.6583482872191299, + "learning_rate": 3.6557707070099255e-06, + "loss": 0.8287, + "step": 16661 + }, + { + "epoch": 0.8310224438902744, + "grad_norm": 0.7963252135699541, + "learning_rate": 3.6536683585208258e-06, + "loss": 0.8803, + "step": 16662 + }, + { + "epoch": 0.831072319201995, + "grad_norm": 0.6223338705327968, + "learning_rate": 3.6515665670546957e-06, + "loss": 0.8448, + "step": 16663 + }, + { + "epoch": 0.8311221945137157, + "grad_norm": 0.8082428470040177, + "learning_rate": 3.6494653326663824e-06, + "loss": 0.8611, + "step": 16664 + }, + { + "epoch": 0.8311720698254365, + "grad_norm": 0.6679874084866382, + "learning_rate": 3.647364655410712e-06, + "loss": 0.8484, + "step": 16665 + }, + { + "epoch": 0.8312219451371571, + "grad_norm": 0.512158339226281, + "learning_rate": 3.6452645353425074e-06, + "loss": 0.8417, + "step": 16666 + }, + { + "epoch": 0.8312718204488778, + "grad_norm": 0.6030440760954519, + "learning_rate": 3.643164972516569e-06, + "loss": 0.8594, + "step": 16667 + }, + { + "epoch": 0.8313216957605986, + "grad_norm": 0.6678446278562153, + "learning_rate": 3.6410659669876834e-06, + "loss": 0.8778, + "step": 16668 + }, + { + "epoch": 0.8313715710723192, + "grad_norm": 0.5739467869099025, + "learning_rate": 3.6389675188106155e-06, + "loss": 0.8571, + "step": 16669 + }, + { + "epoch": 0.8314214463840399, + "grad_norm": 0.634142102882198, + "learning_rate": 3.6368696280401377e-06, + "loss": 0.8762, + "step": 16670 + }, + { + "epoch": 0.8314713216957605, + "grad_norm": 0.45837862688001074, + "learning_rate": 3.6347722947309843e-06, + "loss": 0.7793, + "step": 16671 + }, + { + "epoch": 0.8315211970074813, + "grad_norm": 0.5713083995758913, + "learning_rate": 3.6326755189378864e-06, + "loss": 0.844, + "step": 16672 + }, + { + "epoch": 0.831571072319202, + "grad_norm": 0.6388865059492765, + "learning_rate": 3.630579300715553e-06, + "loss": 0.8383, + "step": 16673 + }, + { + "epoch": 0.8316209476309226, + "grad_norm": 0.567513398394818, + "learning_rate": 3.628483640118699e-06, + "loss": 0.8133, + "step": 16674 + }, + { + "epoch": 0.8316708229426434, + "grad_norm": 0.70746631622735, + "learning_rate": 3.6263885372019885e-06, + "loss": 0.8607, + "step": 16675 + }, + { + "epoch": 0.8317206982543641, + "grad_norm": 0.5741755284212335, + "learning_rate": 3.6242939920201115e-06, + "loss": 0.868, + "step": 16676 + }, + { + "epoch": 0.8317705735660847, + "grad_norm": 3.290092949225353, + "learning_rate": 3.62220000462771e-06, + "loss": 0.8341, + "step": 16677 + }, + { + "epoch": 0.8318204488778055, + "grad_norm": 0.5643109649963057, + "learning_rate": 3.6201065750794404e-06, + "loss": 0.8042, + "step": 16678 + }, + { + "epoch": 0.8318703241895262, + "grad_norm": 0.6479954581575481, + "learning_rate": 3.618013703429912e-06, + "loss": 0.8256, + "step": 16679 + }, + { + "epoch": 0.8319201995012468, + "grad_norm": 0.6186931818304717, + "learning_rate": 3.6159213897337537e-06, + "loss": 0.8791, + "step": 16680 + }, + { + "epoch": 0.8319700748129676, + "grad_norm": 0.5517723920885743, + "learning_rate": 3.6138296340455485e-06, + "loss": 0.8927, + "step": 16681 + }, + { + "epoch": 0.8320199501246883, + "grad_norm": 0.5057504446422166, + "learning_rate": 3.611738436419895e-06, + "loss": 0.8458, + "step": 16682 + }, + { + "epoch": 0.832069825436409, + "grad_norm": 0.6029707450906562, + "learning_rate": 3.6096477969113557e-06, + "loss": 0.8767, + "step": 16683 + }, + { + "epoch": 0.8321197007481297, + "grad_norm": 0.8016312732334844, + "learning_rate": 3.6075577155744806e-06, + "loss": 0.8487, + "step": 16684 + }, + { + "epoch": 0.8321695760598504, + "grad_norm": 0.7812033476551451, + "learning_rate": 3.605468192463815e-06, + "loss": 0.8527, + "step": 16685 + }, + { + "epoch": 0.832219451371571, + "grad_norm": 0.5504122683529656, + "learning_rate": 3.603379227633877e-06, + "loss": 0.8307, + "step": 16686 + }, + { + "epoch": 0.8322693266832918, + "grad_norm": 0.6796486171710867, + "learning_rate": 3.601290821139189e-06, + "loss": 0.8156, + "step": 16687 + }, + { + "epoch": 0.8323192019950124, + "grad_norm": 0.5644381830252325, + "learning_rate": 3.599202973034238e-06, + "loss": 0.8543, + "step": 16688 + }, + { + "epoch": 0.8323690773067332, + "grad_norm": 0.5301758890664438, + "learning_rate": 3.597115683373506e-06, + "loss": 0.8039, + "step": 16689 + }, + { + "epoch": 0.8324189526184539, + "grad_norm": 0.578183229129008, + "learning_rate": 3.59502895221146e-06, + "loss": 0.8192, + "step": 16690 + }, + { + "epoch": 0.8324688279301745, + "grad_norm": 0.5317778913109654, + "learning_rate": 3.5929427796025566e-06, + "loss": 0.8046, + "step": 16691 + }, + { + "epoch": 0.8325187032418953, + "grad_norm": 0.5373992139478653, + "learning_rate": 3.5908571656012246e-06, + "loss": 0.792, + "step": 16692 + }, + { + "epoch": 0.832568578553616, + "grad_norm": 0.6556286078162679, + "learning_rate": 3.5887721102619065e-06, + "loss": 0.8764, + "step": 16693 + }, + { + "epoch": 0.8326184538653366, + "grad_norm": 0.577019390910111, + "learning_rate": 3.5866876136389866e-06, + "loss": 0.8457, + "step": 16694 + }, + { + "epoch": 0.8326683291770574, + "grad_norm": 0.5487788527368445, + "learning_rate": 3.584603675786877e-06, + "loss": 0.8247, + "step": 16695 + }, + { + "epoch": 0.8327182044887781, + "grad_norm": 0.5856656273529158, + "learning_rate": 3.5825202967599473e-06, + "loss": 0.8474, + "step": 16696 + }, + { + "epoch": 0.8327680798004987, + "grad_norm": 0.578019390360652, + "learning_rate": 3.5804374766125667e-06, + "loss": 0.8313, + "step": 16697 + }, + { + "epoch": 0.8328179551122195, + "grad_norm": 0.6098773017585282, + "learning_rate": 3.5783552153990764e-06, + "loss": 0.8451, + "step": 16698 + }, + { + "epoch": 0.8328678304239402, + "grad_norm": 0.5768660435912991, + "learning_rate": 3.576273513173828e-06, + "loss": 0.821, + "step": 16699 + }, + { + "epoch": 0.8329177057356608, + "grad_norm": 0.5363630283054918, + "learning_rate": 3.5741923699911333e-06, + "loss": 0.8621, + "step": 16700 + }, + { + "epoch": 0.8329675810473816, + "grad_norm": 0.5139232710613659, + "learning_rate": 3.5721117859052994e-06, + "loss": 0.7641, + "step": 16701 + }, + { + "epoch": 0.8330174563591023, + "grad_norm": 1.7247081327994163, + "learning_rate": 3.570031760970613e-06, + "loss": 0.8593, + "step": 16702 + }, + { + "epoch": 0.8330673316708229, + "grad_norm": 0.5140270203930606, + "learning_rate": 3.567952295241364e-06, + "loss": 0.8552, + "step": 16703 + }, + { + "epoch": 0.8331172069825437, + "grad_norm": 0.6009658568587206, + "learning_rate": 3.5658733887718067e-06, + "loss": 0.8472, + "step": 16704 + }, + { + "epoch": 0.8331670822942644, + "grad_norm": 0.5804211309217602, + "learning_rate": 3.5637950416161913e-06, + "loss": 0.8374, + "step": 16705 + }, + { + "epoch": 0.833216957605985, + "grad_norm": 0.6414986567704274, + "learning_rate": 3.5617172538287446e-06, + "loss": 0.8535, + "step": 16706 + }, + { + "epoch": 0.8332668329177058, + "grad_norm": 0.632093601153171, + "learning_rate": 3.5596400254637035e-06, + "loss": 0.8597, + "step": 16707 + }, + { + "epoch": 0.8333167082294264, + "grad_norm": 0.6639913474021576, + "learning_rate": 3.557563356575247e-06, + "loss": 0.8552, + "step": 16708 + }, + { + "epoch": 0.8333665835411471, + "grad_norm": 0.7517168585369948, + "learning_rate": 3.555487247217587e-06, + "loss": 0.8803, + "step": 16709 + }, + { + "epoch": 0.8334164588528679, + "grad_norm": 0.5347193798113836, + "learning_rate": 3.5534116974448834e-06, + "loss": 0.9041, + "step": 16710 + }, + { + "epoch": 0.8334663341645885, + "grad_norm": 0.5403469812962846, + "learning_rate": 3.5513367073113147e-06, + "loss": 0.8627, + "step": 16711 + }, + { + "epoch": 0.8335162094763092, + "grad_norm": 0.4846293873728074, + "learning_rate": 3.5492622768710014e-06, + "loss": 0.8621, + "step": 16712 + }, + { + "epoch": 0.83356608478803, + "grad_norm": 0.6193080356395645, + "learning_rate": 3.547188406178098e-06, + "loss": 0.8536, + "step": 16713 + }, + { + "epoch": 0.8336159600997506, + "grad_norm": 0.5770363173774943, + "learning_rate": 3.545115095286705e-06, + "loss": 0.8157, + "step": 16714 + }, + { + "epoch": 0.8336658354114713, + "grad_norm": 0.6031669056449422, + "learning_rate": 3.5430423442509438e-06, + "loss": 0.8387, + "step": 16715 + }, + { + "epoch": 0.8337157107231921, + "grad_norm": 0.5944305581887821, + "learning_rate": 3.5409701531248786e-06, + "loss": 0.8385, + "step": 16716 + }, + { + "epoch": 0.8337655860349127, + "grad_norm": 0.5308013623609585, + "learning_rate": 3.5388985219625968e-06, + "loss": 0.8234, + "step": 16717 + }, + { + "epoch": 0.8338154613466334, + "grad_norm": 0.6050360340573127, + "learning_rate": 3.5368274508181504e-06, + "loss": 0.8387, + "step": 16718 + }, + { + "epoch": 0.8338653366583542, + "grad_norm": 0.7198093871386947, + "learning_rate": 3.53475693974559e-06, + "loss": 0.8145, + "step": 16719 + }, + { + "epoch": 0.8339152119700748, + "grad_norm": 0.7006151236223935, + "learning_rate": 3.5326869887989417e-06, + "loss": 0.8945, + "step": 16720 + }, + { + "epoch": 0.8339650872817955, + "grad_norm": 0.6053953434975244, + "learning_rate": 3.530617598032218e-06, + "loss": 0.8454, + "step": 16721 + }, + { + "epoch": 0.8340149625935163, + "grad_norm": 0.5971601852405082, + "learning_rate": 3.5285487674994205e-06, + "loss": 0.8693, + "step": 16722 + }, + { + "epoch": 0.8340648379052369, + "grad_norm": 0.6241092977882603, + "learning_rate": 3.526480497254528e-06, + "loss": 0.8564, + "step": 16723 + }, + { + "epoch": 0.8341147132169576, + "grad_norm": 0.6634825664180224, + "learning_rate": 3.524412787351519e-06, + "loss": 0.8543, + "step": 16724 + }, + { + "epoch": 0.8341645885286783, + "grad_norm": 0.5530916285230907, + "learning_rate": 3.5223456378443517e-06, + "loss": 0.795, + "step": 16725 + }, + { + "epoch": 0.834214463840399, + "grad_norm": 0.6782362070653076, + "learning_rate": 3.520279048786962e-06, + "loss": 0.8314, + "step": 16726 + }, + { + "epoch": 0.8342643391521197, + "grad_norm": 0.6330605223533904, + "learning_rate": 3.5182130202332685e-06, + "loss": 0.832, + "step": 16727 + }, + { + "epoch": 0.8343142144638404, + "grad_norm": 0.5590643125421251, + "learning_rate": 3.5161475522372008e-06, + "loss": 0.8713, + "step": 16728 + }, + { + "epoch": 0.8343640897755611, + "grad_norm": 0.5595485856900118, + "learning_rate": 3.514082644852648e-06, + "loss": 0.8385, + "step": 16729 + }, + { + "epoch": 0.8344139650872818, + "grad_norm": 0.6336055816475947, + "learning_rate": 3.5120182981334904e-06, + "loss": 0.8282, + "step": 16730 + }, + { + "epoch": 0.8344638403990025, + "grad_norm": 0.7049137082632624, + "learning_rate": 3.5099545121335953e-06, + "loss": 0.8295, + "step": 16731 + }, + { + "epoch": 0.8345137157107232, + "grad_norm": 0.4959304275549031, + "learning_rate": 3.5078912869068253e-06, + "loss": 0.8251, + "step": 16732 + }, + { + "epoch": 0.8345635910224439, + "grad_norm": 0.720409304536981, + "learning_rate": 3.5058286225070126e-06, + "loss": 0.875, + "step": 16733 + }, + { + "epoch": 0.8346134663341646, + "grad_norm": 0.5915439378836936, + "learning_rate": 3.50376651898798e-06, + "loss": 0.8072, + "step": 16734 + }, + { + "epoch": 0.8346633416458853, + "grad_norm": 0.49221921521238154, + "learning_rate": 3.5017049764035385e-06, + "loss": 0.7858, + "step": 16735 + }, + { + "epoch": 0.834713216957606, + "grad_norm": 0.502862598275315, + "learning_rate": 3.499643994807486e-06, + "loss": 0.8673, + "step": 16736 + }, + { + "epoch": 0.8347630922693267, + "grad_norm": 0.7995576599895361, + "learning_rate": 3.4975835742536043e-06, + "loss": 0.8783, + "step": 16737 + }, + { + "epoch": 0.8348129675810474, + "grad_norm": 0.5596935125145864, + "learning_rate": 3.4955237147956537e-06, + "loss": 0.8102, + "step": 16738 + }, + { + "epoch": 0.8348628428927681, + "grad_norm": 0.6827521733405754, + "learning_rate": 3.4934644164873825e-06, + "loss": 0.8513, + "step": 16739 + }, + { + "epoch": 0.8349127182044888, + "grad_norm": 0.5885241201710811, + "learning_rate": 3.4914056793825397e-06, + "loss": 0.8634, + "step": 16740 + }, + { + "epoch": 0.8349625935162095, + "grad_norm": 0.7019177321612187, + "learning_rate": 3.489347503534837e-06, + "loss": 0.8596, + "step": 16741 + }, + { + "epoch": 0.8350124688279301, + "grad_norm": 0.557928786521925, + "learning_rate": 3.4872898889979884e-06, + "loss": 0.8057, + "step": 16742 + }, + { + "epoch": 0.8350623441396509, + "grad_norm": 0.7623475904159756, + "learning_rate": 3.485232835825672e-06, + "loss": 0.8767, + "step": 16743 + }, + { + "epoch": 0.8351122194513716, + "grad_norm": 0.5333367903200846, + "learning_rate": 3.4831763440715896e-06, + "loss": 0.8496, + "step": 16744 + }, + { + "epoch": 0.8351620947630922, + "grad_norm": 0.5420488357241012, + "learning_rate": 3.4811204137893795e-06, + "loss": 0.8769, + "step": 16745 + }, + { + "epoch": 0.835211970074813, + "grad_norm": 0.5440151341654715, + "learning_rate": 3.479065045032706e-06, + "loss": 0.8369, + "step": 16746 + }, + { + "epoch": 0.8352618453865337, + "grad_norm": 0.6831781998224935, + "learning_rate": 3.4770102378551938e-06, + "loss": 0.8545, + "step": 16747 + }, + { + "epoch": 0.8353117206982543, + "grad_norm": 0.6335677576620481, + "learning_rate": 3.47495599231048e-06, + "loss": 0.82, + "step": 16748 + }, + { + "epoch": 0.8353615960099751, + "grad_norm": 0.7000227546208455, + "learning_rate": 3.4729023084521417e-06, + "loss": 0.85, + "step": 16749 + }, + { + "epoch": 0.8354114713216958, + "grad_norm": 0.5680758762220439, + "learning_rate": 3.470849186333791e-06, + "loss": 0.8497, + "step": 16750 + }, + { + "epoch": 0.8354613466334164, + "grad_norm": 0.5805972702848341, + "learning_rate": 3.4687966260089912e-06, + "loss": 0.8562, + "step": 16751 + }, + { + "epoch": 0.8355112219451372, + "grad_norm": 0.5127426121246561, + "learning_rate": 3.4667446275313154e-06, + "loss": 0.8506, + "step": 16752 + }, + { + "epoch": 0.8355610972568579, + "grad_norm": 0.5484858697634969, + "learning_rate": 3.4646931909542933e-06, + "loss": 0.7925, + "step": 16753 + }, + { + "epoch": 0.8356109725685785, + "grad_norm": 0.5677504427165008, + "learning_rate": 3.4626423163314684e-06, + "loss": 0.8743, + "step": 16754 + }, + { + "epoch": 0.8356608478802993, + "grad_norm": 0.5559510376935712, + "learning_rate": 3.460592003716354e-06, + "loss": 0.8372, + "step": 16755 + }, + { + "epoch": 0.83571072319202, + "grad_norm": 0.5024304363547797, + "learning_rate": 3.45854225316245e-06, + "loss": 0.8619, + "step": 16756 + }, + { + "epoch": 0.8357605985037406, + "grad_norm": 0.5796034100169262, + "learning_rate": 3.45649306472324e-06, + "loss": 0.8139, + "step": 16757 + }, + { + "epoch": 0.8358104738154614, + "grad_norm": 0.4902865433187161, + "learning_rate": 3.454444438452209e-06, + "loss": 0.8039, + "step": 16758 + }, + { + "epoch": 0.835860349127182, + "grad_norm": 0.6110568181092663, + "learning_rate": 3.452396374402808e-06, + "loss": 0.8691, + "step": 16759 + }, + { + "epoch": 0.8359102244389027, + "grad_norm": 0.6270178316843532, + "learning_rate": 3.4503488726284727e-06, + "loss": 0.843, + "step": 16760 + }, + { + "epoch": 0.8359600997506235, + "grad_norm": 0.5570764598081076, + "learning_rate": 3.448301933182646e-06, + "loss": 0.8546, + "step": 16761 + }, + { + "epoch": 0.8360099750623441, + "grad_norm": 0.6918387891971383, + "learning_rate": 3.4462555561187355e-06, + "loss": 0.8181, + "step": 16762 + }, + { + "epoch": 0.8360598503740648, + "grad_norm": 0.6229481825573816, + "learning_rate": 3.44420974149014e-06, + "loss": 0.8593, + "step": 16763 + }, + { + "epoch": 0.8361097256857856, + "grad_norm": 0.580489824703818, + "learning_rate": 3.442164489350236e-06, + "loss": 0.9015, + "step": 16764 + }, + { + "epoch": 0.8361596009975062, + "grad_norm": 0.5453320177997255, + "learning_rate": 3.4401197997524086e-06, + "loss": 0.8308, + "step": 16765 + }, + { + "epoch": 0.8362094763092269, + "grad_norm": 0.5406741052682833, + "learning_rate": 3.4380756727500074e-06, + "loss": 0.8498, + "step": 16766 + }, + { + "epoch": 0.8362593516209477, + "grad_norm": 0.5804468322452839, + "learning_rate": 3.4360321083963727e-06, + "loss": 0.8405, + "step": 16767 + }, + { + "epoch": 0.8363092269326683, + "grad_norm": 0.5589480716965981, + "learning_rate": 3.43398910674482e-06, + "loss": 0.7979, + "step": 16768 + }, + { + "epoch": 0.836359102244389, + "grad_norm": 0.5063821334539883, + "learning_rate": 3.431946667848679e-06, + "loss": 0.794, + "step": 16769 + }, + { + "epoch": 0.8364089775561098, + "grad_norm": 0.6301816198542114, + "learning_rate": 3.429904791761235e-06, + "loss": 0.8436, + "step": 16770 + }, + { + "epoch": 0.8364588528678304, + "grad_norm": 0.6455987372671449, + "learning_rate": 3.4278634785357726e-06, + "loss": 0.8133, + "step": 16771 + }, + { + "epoch": 0.8365087281795511, + "grad_norm": 0.558557140708821, + "learning_rate": 3.4258227282255528e-06, + "loss": 0.9056, + "step": 16772 + }, + { + "epoch": 0.8365586034912719, + "grad_norm": 0.5123040859414456, + "learning_rate": 3.423782540883838e-06, + "loss": 0.8149, + "step": 16773 + }, + { + "epoch": 0.8366084788029925, + "grad_norm": 0.6434825064186818, + "learning_rate": 3.4217429165638635e-06, + "loss": 0.863, + "step": 16774 + }, + { + "epoch": 0.8366583541147132, + "grad_norm": 0.5600777366194648, + "learning_rate": 3.4197038553188484e-06, + "loss": 0.8318, + "step": 16775 + }, + { + "epoch": 0.836708229426434, + "grad_norm": 0.5469877859388965, + "learning_rate": 3.417665357201999e-06, + "loss": 0.8026, + "step": 16776 + }, + { + "epoch": 0.8367581047381546, + "grad_norm": 0.7476033651333563, + "learning_rate": 3.415627422266518e-06, + "loss": 0.8633, + "step": 16777 + }, + { + "epoch": 0.8368079800498753, + "grad_norm": 0.5555688237474554, + "learning_rate": 3.4135900505655795e-06, + "loss": 0.8825, + "step": 16778 + }, + { + "epoch": 0.836857855361596, + "grad_norm": 0.7592217139004587, + "learning_rate": 3.4115532421523487e-06, + "loss": 0.8612, + "step": 16779 + }, + { + "epoch": 0.8369077306733167, + "grad_norm": 0.6779190866775433, + "learning_rate": 3.4095169970799674e-06, + "loss": 0.8375, + "step": 16780 + }, + { + "epoch": 0.8369576059850374, + "grad_norm": 0.5167352199649194, + "learning_rate": 3.4074813154015894e-06, + "loss": 0.8281, + "step": 16781 + }, + { + "epoch": 0.8370074812967581, + "grad_norm": 0.5888176113789764, + "learning_rate": 3.405446197170312e-06, + "loss": 0.8461, + "step": 16782 + }, + { + "epoch": 0.8370573566084788, + "grad_norm": 0.587207574706505, + "learning_rate": 3.403411642439258e-06, + "loss": 0.7967, + "step": 16783 + }, + { + "epoch": 0.8371072319201995, + "grad_norm": 0.5556159146115094, + "learning_rate": 3.401377651261506e-06, + "loss": 0.8946, + "step": 16784 + }, + { + "epoch": 0.8371571072319202, + "grad_norm": 0.6182800725238928, + "learning_rate": 3.3993442236901457e-06, + "loss": 0.8296, + "step": 16785 + }, + { + "epoch": 0.8372069825436409, + "grad_norm": 0.7231152811885214, + "learning_rate": 3.397311359778224e-06, + "loss": 0.8695, + "step": 16786 + }, + { + "epoch": 0.8372568578553616, + "grad_norm": 0.8481383353414212, + "learning_rate": 3.3952790595787987e-06, + "loss": 0.8406, + "step": 16787 + }, + { + "epoch": 0.8373067331670823, + "grad_norm": 0.6544633168333271, + "learning_rate": 3.3932473231448907e-06, + "loss": 0.8363, + "step": 16788 + }, + { + "epoch": 0.837356608478803, + "grad_norm": 0.9669457134677859, + "learning_rate": 3.391216150529533e-06, + "loss": 0.8157, + "step": 16789 + }, + { + "epoch": 0.8374064837905237, + "grad_norm": 0.7283031574141117, + "learning_rate": 3.389185541785711e-06, + "loss": 0.8155, + "step": 16790 + }, + { + "epoch": 0.8374563591022444, + "grad_norm": 0.6548118090316268, + "learning_rate": 3.3871554969664263e-06, + "loss": 0.8819, + "step": 16791 + }, + { + "epoch": 0.8375062344139651, + "grad_norm": 0.573601640657173, + "learning_rate": 3.3851260161246455e-06, + "loss": 0.8283, + "step": 16792 + }, + { + "epoch": 0.8375561097256858, + "grad_norm": 0.5778465628097617, + "learning_rate": 3.383097099313326e-06, + "loss": 0.8092, + "step": 16793 + }, + { + "epoch": 0.8376059850374065, + "grad_norm": 0.5442849085225017, + "learning_rate": 3.3810687465854086e-06, + "loss": 0.8197, + "step": 16794 + }, + { + "epoch": 0.8376558603491272, + "grad_norm": 0.7081788624977822, + "learning_rate": 3.3790409579938343e-06, + "loss": 0.865, + "step": 16795 + }, + { + "epoch": 0.8377057356608478, + "grad_norm": 0.559369357140152, + "learning_rate": 3.3770137335915083e-06, + "loss": 0.852, + "step": 16796 + }, + { + "epoch": 0.8377556109725686, + "grad_norm": 0.6562211651015034, + "learning_rate": 3.3749870734313264e-06, + "loss": 0.8576, + "step": 16797 + }, + { + "epoch": 0.8378054862842893, + "grad_norm": 0.579087035134401, + "learning_rate": 3.3729609775661834e-06, + "loss": 0.8371, + "step": 16798 + }, + { + "epoch": 0.8378553615960099, + "grad_norm": 0.7876458261494509, + "learning_rate": 3.3709354460489477e-06, + "loss": 0.842, + "step": 16799 + }, + { + "epoch": 0.8379052369077307, + "grad_norm": 0.6787351018502533, + "learning_rate": 3.3689104789324687e-06, + "loss": 0.8457, + "step": 16800 + }, + { + "epoch": 0.8379551122194514, + "grad_norm": 0.5545597949920756, + "learning_rate": 3.3668860762695845e-06, + "loss": 0.8166, + "step": 16801 + }, + { + "epoch": 0.838004987531172, + "grad_norm": 0.598029119192146, + "learning_rate": 3.364862238113134e-06, + "loss": 0.7951, + "step": 16802 + }, + { + "epoch": 0.8380548628428928, + "grad_norm": 0.6538297217925063, + "learning_rate": 3.3628389645159186e-06, + "loss": 0.8183, + "step": 16803 + }, + { + "epoch": 0.8381047381546135, + "grad_norm": 0.5369526685286387, + "learning_rate": 3.360816255530738e-06, + "loss": 0.8154, + "step": 16804 + }, + { + "epoch": 0.8381546134663341, + "grad_norm": 0.6148911381376332, + "learning_rate": 3.3587941112103673e-06, + "loss": 0.846, + "step": 16805 + }, + { + "epoch": 0.8382044887780549, + "grad_norm": 0.6872880645186925, + "learning_rate": 3.3567725316075854e-06, + "loss": 0.8637, + "step": 16806 + }, + { + "epoch": 0.8382543640897756, + "grad_norm": 0.6715188801048256, + "learning_rate": 3.354751516775137e-06, + "loss": 0.8349, + "step": 16807 + }, + { + "epoch": 0.8383042394014962, + "grad_norm": 0.6364062497675812, + "learning_rate": 3.35273106676576e-06, + "loss": 0.814, + "step": 16808 + }, + { + "epoch": 0.838354114713217, + "grad_norm": 0.5767945910847587, + "learning_rate": 3.350711181632174e-06, + "loss": 0.8252, + "step": 16809 + }, + { + "epoch": 0.8384039900249377, + "grad_norm": 0.6451131418839309, + "learning_rate": 3.3486918614270946e-06, + "loss": 0.8478, + "step": 16810 + }, + { + "epoch": 0.8384538653366583, + "grad_norm": 0.6321544107025232, + "learning_rate": 3.34667310620321e-06, + "loss": 0.8452, + "step": 16811 + }, + { + "epoch": 0.8385037406483791, + "grad_norm": 0.8742923778926848, + "learning_rate": 3.3446549160132013e-06, + "loss": 0.8471, + "step": 16812 + }, + { + "epoch": 0.8385536159600997, + "grad_norm": 0.5464195465113473, + "learning_rate": 3.342637290909728e-06, + "loss": 0.814, + "step": 16813 + }, + { + "epoch": 0.8386034912718204, + "grad_norm": 0.5900047453195035, + "learning_rate": 3.3406202309454486e-06, + "loss": 0.8096, + "step": 16814 + }, + { + "epoch": 0.8386533665835412, + "grad_norm": 0.5484222180080788, + "learning_rate": 3.338603736172982e-06, + "loss": 0.8544, + "step": 16815 + }, + { + "epoch": 0.8387032418952618, + "grad_norm": 0.49289242542456946, + "learning_rate": 3.336587806644964e-06, + "loss": 0.7887, + "step": 16816 + }, + { + "epoch": 0.8387531172069825, + "grad_norm": 0.6175893093058612, + "learning_rate": 3.334572442413983e-06, + "loss": 0.8195, + "step": 16817 + }, + { + "epoch": 0.8388029925187033, + "grad_norm": 0.5557609667951143, + "learning_rate": 3.3325576435326493e-06, + "loss": 0.8218, + "step": 16818 + }, + { + "epoch": 0.8388528678304239, + "grad_norm": 1.286692778405434, + "learning_rate": 3.3305434100535188e-06, + "loss": 0.8091, + "step": 16819 + }, + { + "epoch": 0.8389027431421446, + "grad_norm": 0.6542617442722244, + "learning_rate": 3.3285297420291626e-06, + "loss": 0.8773, + "step": 16820 + }, + { + "epoch": 0.8389526184538654, + "grad_norm": 0.5769335740676661, + "learning_rate": 3.32651663951212e-06, + "loss": 0.836, + "step": 16821 + }, + { + "epoch": 0.839002493765586, + "grad_norm": 0.5419568064087609, + "learning_rate": 3.3245041025549344e-06, + "loss": 0.872, + "step": 16822 + }, + { + "epoch": 0.8390523690773067, + "grad_norm": 0.5368164839043309, + "learning_rate": 3.322492131210106e-06, + "loss": 0.8731, + "step": 16823 + }, + { + "epoch": 0.8391022443890275, + "grad_norm": 0.5255485481807821, + "learning_rate": 3.320480725530148e-06, + "loss": 0.8318, + "step": 16824 + }, + { + "epoch": 0.8391521197007481, + "grad_norm": 1.4333788366922156, + "learning_rate": 3.318469885567538e-06, + "loss": 0.8064, + "step": 16825 + }, + { + "epoch": 0.8392019950124688, + "grad_norm": 0.7378568439368185, + "learning_rate": 3.3164596113747613e-06, + "loss": 0.8747, + "step": 16826 + }, + { + "epoch": 0.8392518703241896, + "grad_norm": 0.6105594000710503, + "learning_rate": 3.3144499030042575e-06, + "loss": 0.827, + "step": 16827 + }, + { + "epoch": 0.8393017456359102, + "grad_norm": 0.5385522418506435, + "learning_rate": 3.312440760508484e-06, + "loss": 0.7873, + "step": 16828 + }, + { + "epoch": 0.8393516209476309, + "grad_norm": 0.7394025966832696, + "learning_rate": 3.3104321839398627e-06, + "loss": 0.8439, + "step": 16829 + }, + { + "epoch": 0.8394014962593517, + "grad_norm": 0.5013199411493047, + "learning_rate": 3.3084241733508075e-06, + "loss": 0.7935, + "step": 16830 + }, + { + "epoch": 0.8394513715710723, + "grad_norm": 0.7211042885908409, + "learning_rate": 3.3064167287937096e-06, + "loss": 0.8502, + "step": 16831 + }, + { + "epoch": 0.839501246882793, + "grad_norm": 0.5577096220844474, + "learning_rate": 3.3044098503209637e-06, + "loss": 0.8262, + "step": 16832 + }, + { + "epoch": 0.8395511221945137, + "grad_norm": 0.7852658089640163, + "learning_rate": 3.302403537984933e-06, + "loss": 0.8915, + "step": 16833 + }, + { + "epoch": 0.8396009975062344, + "grad_norm": 0.7063101066215582, + "learning_rate": 3.300397791837964e-06, + "loss": 0.8222, + "step": 16834 + }, + { + "epoch": 0.8396508728179551, + "grad_norm": 1.2522441379400076, + "learning_rate": 3.2983926119324105e-06, + "loss": 0.8711, + "step": 16835 + }, + { + "epoch": 0.8397007481296758, + "grad_norm": 0.5285459899329713, + "learning_rate": 3.2963879983205907e-06, + "loss": 0.8424, + "step": 16836 + }, + { + "epoch": 0.8397506234413965, + "grad_norm": 1.3440793991146882, + "learning_rate": 3.2943839510548107e-06, + "loss": 0.8399, + "step": 16837 + }, + { + "epoch": 0.8398004987531172, + "grad_norm": 0.5898750741664899, + "learning_rate": 3.2923804701873646e-06, + "loss": 0.8346, + "step": 16838 + }, + { + "epoch": 0.8398503740648379, + "grad_norm": 0.674415530149681, + "learning_rate": 3.290377555770538e-06, + "loss": 0.848, + "step": 16839 + }, + { + "epoch": 0.8399002493765586, + "grad_norm": 0.6407708306723795, + "learning_rate": 3.2883752078565922e-06, + "loss": 0.8548, + "step": 16840 + }, + { + "epoch": 0.8399501246882793, + "grad_norm": 0.45260696747246265, + "learning_rate": 3.2863734264977796e-06, + "loss": 0.8117, + "step": 16841 + }, + { + "epoch": 0.84, + "grad_norm": 0.5769181344155204, + "learning_rate": 3.284372211746331e-06, + "loss": 0.838, + "step": 16842 + }, + { + "epoch": 0.8400498753117207, + "grad_norm": 0.7433665701524435, + "learning_rate": 3.282371563654474e-06, + "loss": 0.8403, + "step": 16843 + }, + { + "epoch": 0.8400997506234414, + "grad_norm": 0.5899044744991313, + "learning_rate": 3.2803714822744138e-06, + "loss": 0.8409, + "step": 16844 + }, + { + "epoch": 0.8401496259351621, + "grad_norm": 0.7500477113450188, + "learning_rate": 3.2783719676583363e-06, + "loss": 0.8377, + "step": 16845 + }, + { + "epoch": 0.8401995012468828, + "grad_norm": 0.5440935585006114, + "learning_rate": 3.276373019858417e-06, + "loss": 0.8628, + "step": 16846 + }, + { + "epoch": 0.8402493765586035, + "grad_norm": 0.5099554819065023, + "learning_rate": 3.2743746389268277e-06, + "loss": 0.7826, + "step": 16847 + }, + { + "epoch": 0.8402992518703242, + "grad_norm": 1.0193305204216598, + "learning_rate": 3.27237682491571e-06, + "loss": 0.8334, + "step": 16848 + }, + { + "epoch": 0.8403491271820449, + "grad_norm": 0.5242763849094993, + "learning_rate": 3.2703795778771915e-06, + "loss": 0.8477, + "step": 16849 + }, + { + "epoch": 0.8403990024937655, + "grad_norm": 0.5883342140570381, + "learning_rate": 3.2683828978633926e-06, + "loss": 0.8125, + "step": 16850 + }, + { + "epoch": 0.8404488778054863, + "grad_norm": 0.5727141529654383, + "learning_rate": 3.266386784926423e-06, + "loss": 0.8236, + "step": 16851 + }, + { + "epoch": 0.840498753117207, + "grad_norm": 1.0281320987666753, + "learning_rate": 3.264391239118353e-06, + "loss": 0.8404, + "step": 16852 + }, + { + "epoch": 0.8405486284289276, + "grad_norm": 0.5061209835473214, + "learning_rate": 3.262396260491274e-06, + "loss": 0.8121, + "step": 16853 + }, + { + "epoch": 0.8405985037406484, + "grad_norm": 0.5161098310863534, + "learning_rate": 3.2604018490972275e-06, + "loss": 0.8367, + "step": 16854 + }, + { + "epoch": 0.8406483790523691, + "grad_norm": 0.4818562580853242, + "learning_rate": 3.258408004988278e-06, + "loss": 0.8443, + "step": 16855 + }, + { + "epoch": 0.8406982543640897, + "grad_norm": 0.5031474740923468, + "learning_rate": 3.256414728216431e-06, + "loss": 0.834, + "step": 16856 + }, + { + "epoch": 0.8407481296758105, + "grad_norm": 0.6361695004976881, + "learning_rate": 3.254422018833714e-06, + "loss": 0.8399, + "step": 16857 + }, + { + "epoch": 0.8407980049875312, + "grad_norm": 0.7554442080792371, + "learning_rate": 3.2524298768921183e-06, + "loss": 0.8554, + "step": 16858 + }, + { + "epoch": 0.8408478802992518, + "grad_norm": 0.797931343521627, + "learning_rate": 3.2504383024436425e-06, + "loss": 0.855, + "step": 16859 + }, + { + "epoch": 0.8408977556109726, + "grad_norm": 1.0166601001301594, + "learning_rate": 3.2484472955402357e-06, + "loss": 0.847, + "step": 16860 + }, + { + "epoch": 0.8409476309226933, + "grad_norm": 0.5815417991387277, + "learning_rate": 3.246456856233865e-06, + "loss": 0.9216, + "step": 16861 + }, + { + "epoch": 0.8409975062344139, + "grad_norm": 0.6266657258237, + "learning_rate": 3.244466984576469e-06, + "loss": 0.7961, + "step": 16862 + }, + { + "epoch": 0.8410473815461347, + "grad_norm": 0.5331745112707289, + "learning_rate": 3.2424776806199707e-06, + "loss": 0.8551, + "step": 16863 + }, + { + "epoch": 0.8410972568578554, + "grad_norm": 0.7778013230785898, + "learning_rate": 3.240488944416273e-06, + "loss": 0.8517, + "step": 16864 + }, + { + "epoch": 0.841147132169576, + "grad_norm": 1.303171725454963, + "learning_rate": 3.2385007760172836e-06, + "loss": 0.831, + "step": 16865 + }, + { + "epoch": 0.8411970074812968, + "grad_norm": 0.5456165241691207, + "learning_rate": 3.2365131754748783e-06, + "loss": 0.8455, + "step": 16866 + }, + { + "epoch": 0.8412468827930174, + "grad_norm": 0.6351249077364873, + "learning_rate": 3.234526142840921e-06, + "loss": 0.8598, + "step": 16867 + }, + { + "epoch": 0.8412967581047381, + "grad_norm": 0.4860650430852076, + "learning_rate": 3.232539678167257e-06, + "loss": 0.8217, + "step": 16868 + }, + { + "epoch": 0.8413466334164589, + "grad_norm": 0.7042883415105365, + "learning_rate": 3.230553781505735e-06, + "loss": 0.8706, + "step": 16869 + }, + { + "epoch": 0.8413965087281795, + "grad_norm": 0.6579736964934193, + "learning_rate": 3.2285684529081707e-06, + "loss": 0.8304, + "step": 16870 + }, + { + "epoch": 0.8414463840399002, + "grad_norm": 0.590942039089441, + "learning_rate": 3.2265836924263658e-06, + "loss": 0.8483, + "step": 16871 + }, + { + "epoch": 0.841496259351621, + "grad_norm": 0.6934808736345363, + "learning_rate": 3.2245995001121106e-06, + "loss": 0.8496, + "step": 16872 + }, + { + "epoch": 0.8415461346633416, + "grad_norm": 0.5251990208753464, + "learning_rate": 3.2226158760171933e-06, + "loss": 0.8433, + "step": 16873 + }, + { + "epoch": 0.8415960099750623, + "grad_norm": 0.814290709382865, + "learning_rate": 3.2206328201933642e-06, + "loss": 0.8543, + "step": 16874 + }, + { + "epoch": 0.8416458852867831, + "grad_norm": 0.5753525741190854, + "learning_rate": 3.2186503326923715e-06, + "loss": 0.8448, + "step": 16875 + }, + { + "epoch": 0.8416957605985037, + "grad_norm": 0.9252278531914642, + "learning_rate": 3.216668413565957e-06, + "loss": 0.8407, + "step": 16876 + }, + { + "epoch": 0.8417456359102244, + "grad_norm": 0.6353732906482197, + "learning_rate": 3.2146870628658313e-06, + "loss": 0.7878, + "step": 16877 + }, + { + "epoch": 0.8417955112219452, + "grad_norm": 0.5901088863812554, + "learning_rate": 3.212706280643696e-06, + "loss": 0.8732, + "step": 16878 + }, + { + "epoch": 0.8418453865336658, + "grad_norm": 0.5328828589429527, + "learning_rate": 3.2107260669512336e-06, + "loss": 0.8056, + "step": 16879 + }, + { + "epoch": 0.8418952618453865, + "grad_norm": 0.6571214357484623, + "learning_rate": 3.208746421840128e-06, + "loss": 0.8213, + "step": 16880 + }, + { + "epoch": 0.8419451371571073, + "grad_norm": 0.5946688853792038, + "learning_rate": 3.206767345362033e-06, + "loss": 0.8428, + "step": 16881 + }, + { + "epoch": 0.8419950124688279, + "grad_norm": 0.62782636246328, + "learning_rate": 3.2047888375685898e-06, + "loss": 0.8524, + "step": 16882 + }, + { + "epoch": 0.8420448877805486, + "grad_norm": 0.6421958201090316, + "learning_rate": 3.202810898511424e-06, + "loss": 0.8073, + "step": 16883 + }, + { + "epoch": 0.8420947630922693, + "grad_norm": 0.6262611692593966, + "learning_rate": 3.2008335282421557e-06, + "loss": 0.835, + "step": 16884 + }, + { + "epoch": 0.84214463840399, + "grad_norm": 1.2607775856501193, + "learning_rate": 3.1988567268123824e-06, + "loss": 0.8367, + "step": 16885 + }, + { + "epoch": 0.8421945137157107, + "grad_norm": 0.6704936858208217, + "learning_rate": 3.1968804942736853e-06, + "loss": 0.8147, + "step": 16886 + }, + { + "epoch": 0.8422443890274314, + "grad_norm": 0.6627351702352338, + "learning_rate": 3.194904830677628e-06, + "loss": 0.8676, + "step": 16887 + }, + { + "epoch": 0.8422942643391521, + "grad_norm": 0.5619491823015391, + "learning_rate": 3.192929736075781e-06, + "loss": 0.8277, + "step": 16888 + }, + { + "epoch": 0.8423441396508728, + "grad_norm": 0.7839014210187204, + "learning_rate": 3.190955210519664e-06, + "loss": 0.8348, + "step": 16889 + }, + { + "epoch": 0.8423940149625935, + "grad_norm": 0.5000203075534756, + "learning_rate": 3.1889812540608134e-06, + "loss": 0.8261, + "step": 16890 + }, + { + "epoch": 0.8424438902743142, + "grad_norm": 0.9513232261830271, + "learning_rate": 3.1870078667507326e-06, + "loss": 0.8503, + "step": 16891 + }, + { + "epoch": 0.842493765586035, + "grad_norm": 0.5659076142865713, + "learning_rate": 3.1850350486409275e-06, + "loss": 0.8527, + "step": 16892 + }, + { + "epoch": 0.8425436408977556, + "grad_norm": 0.7906532509489499, + "learning_rate": 3.18306279978286e-06, + "loss": 0.8338, + "step": 16893 + }, + { + "epoch": 0.8425935162094763, + "grad_norm": 1.1063134718991219, + "learning_rate": 3.181091120228011e-06, + "loss": 0.8879, + "step": 16894 + }, + { + "epoch": 0.842643391521197, + "grad_norm": 0.7101095625576486, + "learning_rate": 3.1791200100278197e-06, + "loss": 0.8511, + "step": 16895 + }, + { + "epoch": 0.8426932668329177, + "grad_norm": 0.5500960015077926, + "learning_rate": 3.177149469233734e-06, + "loss": 0.8397, + "step": 16896 + }, + { + "epoch": 0.8427431421446384, + "grad_norm": 0.8177332546687729, + "learning_rate": 3.175179497897157e-06, + "loss": 0.8808, + "step": 16897 + }, + { + "epoch": 0.8427930174563592, + "grad_norm": 0.49578649950185805, + "learning_rate": 3.1732100960695094e-06, + "loss": 0.855, + "step": 16898 + }, + { + "epoch": 0.8428428927680798, + "grad_norm": 0.6380515276878062, + "learning_rate": 3.171241263802174e-06, + "loss": 0.8624, + "step": 16899 + }, + { + "epoch": 0.8428927680798005, + "grad_norm": 0.5806417127407708, + "learning_rate": 3.1692730011465304e-06, + "loss": 0.8558, + "step": 16900 + }, + { + "epoch": 0.8429426433915213, + "grad_norm": 0.9206999832279611, + "learning_rate": 3.1673053081539338e-06, + "loss": 0.8442, + "step": 16901 + }, + { + "epoch": 0.8429925187032419, + "grad_norm": 0.5999493727794092, + "learning_rate": 3.165338184875735e-06, + "loss": 0.8574, + "step": 16902 + }, + { + "epoch": 0.8430423940149626, + "grad_norm": 0.6224854894560145, + "learning_rate": 3.1633716313632685e-06, + "loss": 0.8651, + "step": 16903 + }, + { + "epoch": 0.8430922693266832, + "grad_norm": 0.7679606485192906, + "learning_rate": 3.161405647667845e-06, + "loss": 0.8637, + "step": 16904 + }, + { + "epoch": 0.843142144638404, + "grad_norm": 0.5490213051514212, + "learning_rate": 3.159440233840763e-06, + "loss": 0.7917, + "step": 16905 + }, + { + "epoch": 0.8431920199501247, + "grad_norm": 0.6037659851356727, + "learning_rate": 3.1574753899333175e-06, + "loss": 0.8192, + "step": 16906 + }, + { + "epoch": 0.8432418952618453, + "grad_norm": 0.6088279310480993, + "learning_rate": 3.155511115996776e-06, + "loss": 0.8574, + "step": 16907 + }, + { + "epoch": 0.8432917705735661, + "grad_norm": 0.6535806840428744, + "learning_rate": 3.1535474120823972e-06, + "loss": 0.8513, + "step": 16908 + }, + { + "epoch": 0.8433416458852868, + "grad_norm": 0.5182761049077055, + "learning_rate": 3.151584278241415e-06, + "loss": 0.8066, + "step": 16909 + }, + { + "epoch": 0.8433915211970074, + "grad_norm": 0.6539417089869777, + "learning_rate": 3.149621714525067e-06, + "loss": 0.8515, + "step": 16910 + }, + { + "epoch": 0.8434413965087282, + "grad_norm": 0.6344953528120523, + "learning_rate": 3.147659720984564e-06, + "loss": 0.8792, + "step": 16911 + }, + { + "epoch": 0.8434912718204489, + "grad_norm": 0.6053492270857748, + "learning_rate": 3.145698297671093e-06, + "loss": 0.8491, + "step": 16912 + }, + { + "epoch": 0.8435411471321695, + "grad_norm": 0.6295080854219341, + "learning_rate": 3.1437374446358523e-06, + "loss": 0.8877, + "step": 16913 + }, + { + "epoch": 0.8435910224438903, + "grad_norm": 0.5508365496550346, + "learning_rate": 3.1417771619299983e-06, + "loss": 0.8742, + "step": 16914 + }, + { + "epoch": 0.843640897755611, + "grad_norm": 0.5954465893561285, + "learning_rate": 3.13981744960469e-06, + "loss": 0.8532, + "step": 16915 + }, + { + "epoch": 0.8436907730673316, + "grad_norm": 0.7515060142339995, + "learning_rate": 3.137858307711056e-06, + "loss": 0.8665, + "step": 16916 + }, + { + "epoch": 0.8437406483790524, + "grad_norm": 0.560353698339325, + "learning_rate": 3.135899736300227e-06, + "loss": 0.8239, + "step": 16917 + }, + { + "epoch": 0.8437905236907731, + "grad_norm": 0.623905674220251, + "learning_rate": 3.1339417354233124e-06, + "loss": 0.8771, + "step": 16918 + }, + { + "epoch": 0.8438403990024937, + "grad_norm": 0.5782356146057368, + "learning_rate": 3.1319843051314023e-06, + "loss": 0.7973, + "step": 16919 + }, + { + "epoch": 0.8438902743142145, + "grad_norm": 1.0037195576772513, + "learning_rate": 3.1300274454755696e-06, + "loss": 0.8165, + "step": 16920 + }, + { + "epoch": 0.8439401496259351, + "grad_norm": 0.5754144924048427, + "learning_rate": 3.1280711565068926e-06, + "loss": 0.8234, + "step": 16921 + }, + { + "epoch": 0.8439900249376558, + "grad_norm": 0.7626160527555998, + "learning_rate": 3.1261154382764003e-06, + "loss": 0.8599, + "step": 16922 + }, + { + "epoch": 0.8440399002493766, + "grad_norm": 0.7530965894806384, + "learning_rate": 3.1241602908351404e-06, + "loss": 0.8475, + "step": 16923 + }, + { + "epoch": 0.8440897755610972, + "grad_norm": 0.5207589110016687, + "learning_rate": 3.1222057142341247e-06, + "loss": 0.8241, + "step": 16924 + }, + { + "epoch": 0.844139650872818, + "grad_norm": 0.6776765321495449, + "learning_rate": 3.1202517085243686e-06, + "loss": 0.8836, + "step": 16925 + }, + { + "epoch": 0.8441895261845387, + "grad_norm": 0.5997519864017722, + "learning_rate": 3.1182982737568394e-06, + "loss": 0.8665, + "step": 16926 + }, + { + "epoch": 0.8442394014962593, + "grad_norm": 0.5441622704999678, + "learning_rate": 3.1163454099825324e-06, + "loss": 0.8473, + "step": 16927 + }, + { + "epoch": 0.84428927680798, + "grad_norm": 0.7624550423886534, + "learning_rate": 3.1143931172523926e-06, + "loss": 0.8816, + "step": 16928 + }, + { + "epoch": 0.8443391521197008, + "grad_norm": 0.5604242270544255, + "learning_rate": 3.112441395617377e-06, + "loss": 0.8374, + "step": 16929 + }, + { + "epoch": 0.8443890274314214, + "grad_norm": 0.6504710820266006, + "learning_rate": 3.1104902451284002e-06, + "loss": 0.884, + "step": 16930 + }, + { + "epoch": 0.8444389027431422, + "grad_norm": 0.7826255873639066, + "learning_rate": 3.108539665836388e-06, + "loss": 0.8828, + "step": 16931 + }, + { + "epoch": 0.8444887780548629, + "grad_norm": 0.8993495181939493, + "learning_rate": 3.1065896577922304e-06, + "loss": 0.8163, + "step": 16932 + }, + { + "epoch": 0.8445386533665835, + "grad_norm": 0.5572658136586411, + "learning_rate": 3.104640221046828e-06, + "loss": 0.8464, + "step": 16933 + }, + { + "epoch": 0.8445885286783043, + "grad_norm": 0.577487498754577, + "learning_rate": 3.10269135565103e-06, + "loss": 0.8522, + "step": 16934 + }, + { + "epoch": 0.844638403990025, + "grad_norm": 0.5388102594779094, + "learning_rate": 3.100743061655706e-06, + "loss": 0.8181, + "step": 16935 + }, + { + "epoch": 0.8446882793017456, + "grad_norm": 0.6367938218264697, + "learning_rate": 3.098795339111693e-06, + "loss": 0.8545, + "step": 16936 + }, + { + "epoch": 0.8447381546134664, + "grad_norm": 0.8082722514177836, + "learning_rate": 3.096848188069812e-06, + "loss": 0.8166, + "step": 16937 + }, + { + "epoch": 0.844788029925187, + "grad_norm": 0.5523628248108493, + "learning_rate": 3.094901608580869e-06, + "loss": 0.8413, + "step": 16938 + }, + { + "epoch": 0.8448379052369077, + "grad_norm": 0.7863887077547687, + "learning_rate": 3.0929556006956713e-06, + "loss": 0.7817, + "step": 16939 + }, + { + "epoch": 0.8448877805486285, + "grad_norm": 0.8130317328222233, + "learning_rate": 3.091010164464994e-06, + "loss": 0.8236, + "step": 16940 + }, + { + "epoch": 0.8449376558603491, + "grad_norm": 0.5801274191575541, + "learning_rate": 3.0890652999396003e-06, + "loss": 0.8013, + "step": 16941 + }, + { + "epoch": 0.8449875311720698, + "grad_norm": 0.5572325345445119, + "learning_rate": 3.087121007170235e-06, + "loss": 0.8407, + "step": 16942 + }, + { + "epoch": 0.8450374064837906, + "grad_norm": 0.8632798417389035, + "learning_rate": 3.085177286207647e-06, + "loss": 0.8537, + "step": 16943 + }, + { + "epoch": 0.8450872817955112, + "grad_norm": 0.6409371511138735, + "learning_rate": 3.0832341371025473e-06, + "loss": 0.8578, + "step": 16944 + }, + { + "epoch": 0.8451371571072319, + "grad_norm": 0.8127126058722223, + "learning_rate": 3.081291559905647e-06, + "loss": 0.8962, + "step": 16945 + }, + { + "epoch": 0.8451870324189527, + "grad_norm": 0.6267232051759188, + "learning_rate": 3.079349554667629e-06, + "loss": 0.8664, + "step": 16946 + }, + { + "epoch": 0.8452369077306733, + "grad_norm": 0.7690073115445638, + "learning_rate": 3.0774081214391765e-06, + "loss": 0.8595, + "step": 16947 + }, + { + "epoch": 0.845286783042394, + "grad_norm": 0.5971019896149226, + "learning_rate": 3.0754672602709504e-06, + "loss": 0.8428, + "step": 16948 + }, + { + "epoch": 0.8453366583541148, + "grad_norm": 0.6103852501323995, + "learning_rate": 3.073526971213586e-06, + "loss": 0.8417, + "step": 16949 + }, + { + "epoch": 0.8453865336658354, + "grad_norm": 0.5657705909839862, + "learning_rate": 3.0715872543177315e-06, + "loss": 0.8634, + "step": 16950 + }, + { + "epoch": 0.8454364089775561, + "grad_norm": 0.7757602640267659, + "learning_rate": 3.0696481096339908e-06, + "loss": 0.8757, + "step": 16951 + }, + { + "epoch": 0.8454862842892769, + "grad_norm": 0.5450509141911346, + "learning_rate": 3.067709537212968e-06, + "loss": 0.8553, + "step": 16952 + }, + { + "epoch": 0.8455361596009975, + "grad_norm": 0.8699038923977958, + "learning_rate": 3.065771537105244e-06, + "loss": 0.8571, + "step": 16953 + }, + { + "epoch": 0.8455860349127182, + "grad_norm": 0.5368365623907403, + "learning_rate": 3.063834109361402e-06, + "loss": 0.827, + "step": 16954 + }, + { + "epoch": 0.845635910224439, + "grad_norm": 1.2468858019679627, + "learning_rate": 3.0618972540319897e-06, + "loss": 0.8435, + "step": 16955 + }, + { + "epoch": 0.8456857855361596, + "grad_norm": 0.6021403843017045, + "learning_rate": 3.0599609711675523e-06, + "loss": 0.8074, + "step": 16956 + }, + { + "epoch": 0.8457356608478803, + "grad_norm": 0.5294484286474713, + "learning_rate": 3.058025260818609e-06, + "loss": 0.8607, + "step": 16957 + }, + { + "epoch": 0.845785536159601, + "grad_norm": 0.5303748228613588, + "learning_rate": 3.0560901230356848e-06, + "loss": 0.8204, + "step": 16958 + }, + { + "epoch": 0.8458354114713217, + "grad_norm": 0.6342133694201643, + "learning_rate": 3.0541555578692595e-06, + "loss": 0.8353, + "step": 16959 + }, + { + "epoch": 0.8458852867830424, + "grad_norm": 0.55617671080695, + "learning_rate": 3.052221565369828e-06, + "loss": 0.8297, + "step": 16960 + }, + { + "epoch": 0.845935162094763, + "grad_norm": 0.5976143160963505, + "learning_rate": 3.050288145587846e-06, + "loss": 0.8456, + "step": 16961 + }, + { + "epoch": 0.8459850374064838, + "grad_norm": 0.5903596108066129, + "learning_rate": 3.0483552985737824e-06, + "loss": 0.8354, + "step": 16962 + }, + { + "epoch": 0.8460349127182045, + "grad_norm": 0.689542288218155, + "learning_rate": 3.0464230243780535e-06, + "loss": 0.8314, + "step": 16963 + }, + { + "epoch": 0.8460847880299251, + "grad_norm": 0.5143742973958894, + "learning_rate": 3.0444913230510937e-06, + "loss": 0.8505, + "step": 16964 + }, + { + "epoch": 0.8461346633416459, + "grad_norm": 0.5432305995952154, + "learning_rate": 3.0425601946433043e-06, + "loss": 0.8225, + "step": 16965 + }, + { + "epoch": 0.8461845386533666, + "grad_norm": 0.7857172998987012, + "learning_rate": 3.0406296392050893e-06, + "loss": 0.8407, + "step": 16966 + }, + { + "epoch": 0.8462344139650873, + "grad_norm": 0.5688401022969689, + "learning_rate": 3.0386996567868058e-06, + "loss": 0.8381, + "step": 16967 + }, + { + "epoch": 0.846284289276808, + "grad_norm": 0.5782417045233849, + "learning_rate": 3.0367702474388305e-06, + "loss": 0.8734, + "step": 16968 + }, + { + "epoch": 0.8463341645885287, + "grad_norm": 0.5360327261363287, + "learning_rate": 3.034841411211506e-06, + "loss": 0.8425, + "step": 16969 + }, + { + "epoch": 0.8463840399002494, + "grad_norm": 0.5224991780261444, + "learning_rate": 3.032913148155164e-06, + "loss": 0.8459, + "step": 16970 + }, + { + "epoch": 0.8464339152119701, + "grad_norm": 0.6185685437129501, + "learning_rate": 3.030985458320118e-06, + "loss": 0.8513, + "step": 16971 + }, + { + "epoch": 0.8464837905236908, + "grad_norm": 0.639194820993868, + "learning_rate": 3.02905834175668e-06, + "loss": 0.8022, + "step": 16972 + }, + { + "epoch": 0.8465336658354115, + "grad_norm": 0.5741286781191848, + "learning_rate": 3.0271317985151317e-06, + "loss": 0.8098, + "step": 16973 + }, + { + "epoch": 0.8465835411471322, + "grad_norm": 0.551921090074963, + "learning_rate": 3.025205828645747e-06, + "loss": 0.8416, + "step": 16974 + }, + { + "epoch": 0.8466334164588528, + "grad_norm": 1.400572588784279, + "learning_rate": 3.0232804321987746e-06, + "loss": 0.8548, + "step": 16975 + }, + { + "epoch": 0.8466832917705736, + "grad_norm": 0.8407264598405813, + "learning_rate": 3.0213556092244686e-06, + "loss": 0.8118, + "step": 16976 + }, + { + "epoch": 0.8467331670822943, + "grad_norm": 0.8692119364481208, + "learning_rate": 3.0194313597730507e-06, + "loss": 0.819, + "step": 16977 + }, + { + "epoch": 0.8467830423940149, + "grad_norm": 0.6440807784461112, + "learning_rate": 3.0175076838947352e-06, + "loss": 0.8781, + "step": 16978 + }, + { + "epoch": 0.8468329177057357, + "grad_norm": 1.9873037843686872, + "learning_rate": 3.0155845816397134e-06, + "loss": 0.8408, + "step": 16979 + }, + { + "epoch": 0.8468827930174564, + "grad_norm": 0.969673877666492, + "learning_rate": 3.0136620530581805e-06, + "loss": 0.8844, + "step": 16980 + }, + { + "epoch": 0.846932668329177, + "grad_norm": 0.7146390706909953, + "learning_rate": 3.011740098200294e-06, + "loss": 0.8292, + "step": 16981 + }, + { + "epoch": 0.8469825436408978, + "grad_norm": 0.706788742719061, + "learning_rate": 3.0098187171162083e-06, + "loss": 0.8288, + "step": 16982 + }, + { + "epoch": 0.8470324189526185, + "grad_norm": 0.5755315715334219, + "learning_rate": 3.0078979098560585e-06, + "loss": 0.7909, + "step": 16983 + }, + { + "epoch": 0.8470822942643391, + "grad_norm": 0.5379146279041537, + "learning_rate": 3.0059776764699733e-06, + "loss": 0.8193, + "step": 16984 + }, + { + "epoch": 0.8471321695760599, + "grad_norm": 0.674388945661846, + "learning_rate": 3.00405801700806e-06, + "loss": 0.8684, + "step": 16985 + }, + { + "epoch": 0.8471820448877806, + "grad_norm": 0.5515492861711679, + "learning_rate": 3.0021389315204068e-06, + "loss": 0.8267, + "step": 16986 + }, + { + "epoch": 0.8472319201995012, + "grad_norm": 0.7075781096733931, + "learning_rate": 3.00022042005709e-06, + "loss": 0.8404, + "step": 16987 + }, + { + "epoch": 0.847281795511222, + "grad_norm": 0.8293676718073509, + "learning_rate": 2.9983024826681778e-06, + "loss": 0.8374, + "step": 16988 + }, + { + "epoch": 0.8473316708229427, + "grad_norm": 0.5275266999685461, + "learning_rate": 2.996385119403719e-06, + "loss": 0.8785, + "step": 16989 + }, + { + "epoch": 0.8473815461346633, + "grad_norm": 0.7050413506858485, + "learning_rate": 2.994468330313735e-06, + "loss": 0.8428, + "step": 16990 + }, + { + "epoch": 0.8474314214463841, + "grad_norm": 0.49207847276639427, + "learning_rate": 2.992552115448258e-06, + "loss": 0.8135, + "step": 16991 + }, + { + "epoch": 0.8474812967581047, + "grad_norm": 0.4969791127768399, + "learning_rate": 2.9906364748572835e-06, + "loss": 0.839, + "step": 16992 + }, + { + "epoch": 0.8475311720698254, + "grad_norm": 0.7365853038971084, + "learning_rate": 2.9887214085908e-06, + "loss": 0.881, + "step": 16993 + }, + { + "epoch": 0.8475810473815462, + "grad_norm": 0.6141386239966062, + "learning_rate": 2.9868069166987755e-06, + "loss": 0.8165, + "step": 16994 + }, + { + "epoch": 0.8476309226932668, + "grad_norm": 0.5805159390890231, + "learning_rate": 2.984892999231184e-06, + "loss": 0.8162, + "step": 16995 + }, + { + "epoch": 0.8476807980049875, + "grad_norm": 0.5800311888135535, + "learning_rate": 2.9829796562379468e-06, + "loss": 0.8642, + "step": 16996 + }, + { + "epoch": 0.8477306733167083, + "grad_norm": 0.7237826681856017, + "learning_rate": 2.9810668877690072e-06, + "loss": 0.8453, + "step": 16997 + }, + { + "epoch": 0.8477805486284289, + "grad_norm": 0.5499759387630985, + "learning_rate": 2.9791546938742643e-06, + "loss": 0.824, + "step": 16998 + }, + { + "epoch": 0.8478304239401496, + "grad_norm": 0.652306838218801, + "learning_rate": 2.977243074603639e-06, + "loss": 0.8166, + "step": 16999 + }, + { + "epoch": 0.8478802992518704, + "grad_norm": 0.6570600713583783, + "learning_rate": 2.975332030006986e-06, + "loss": 0.85, + "step": 17000 + }, + { + "epoch": 0.847930174563591, + "grad_norm": 0.7206805200935362, + "learning_rate": 2.973421560134193e-06, + "loss": 0.8641, + "step": 17001 + }, + { + "epoch": 0.8479800498753117, + "grad_norm": 0.6825392331236683, + "learning_rate": 2.9715116650351037e-06, + "loss": 0.8301, + "step": 17002 + }, + { + "epoch": 0.8480299251870325, + "grad_norm": 0.6073281281989729, + "learning_rate": 2.9696023447595674e-06, + "loss": 0.8295, + "step": 17003 + }, + { + "epoch": 0.8480798004987531, + "grad_norm": 0.8833282694738016, + "learning_rate": 2.967693599357388e-06, + "loss": 0.84, + "step": 17004 + }, + { + "epoch": 0.8481296758104738, + "grad_norm": 1.5226578406488673, + "learning_rate": 2.9657854288783873e-06, + "loss": 0.8013, + "step": 17005 + }, + { + "epoch": 0.8481795511221946, + "grad_norm": 0.7861810458666986, + "learning_rate": 2.9638778333723553e-06, + "loss": 0.8616, + "step": 17006 + }, + { + "epoch": 0.8482294264339152, + "grad_norm": 0.5714313831486124, + "learning_rate": 2.9619708128890726e-06, + "loss": 0.8132, + "step": 17007 + }, + { + "epoch": 0.8482793017456359, + "grad_norm": 0.5331637105894079, + "learning_rate": 2.9600643674782906e-06, + "loss": 0.8379, + "step": 17008 + }, + { + "epoch": 0.8483291770573566, + "grad_norm": 1.0638089843507357, + "learning_rate": 2.9581584971897697e-06, + "loss": 0.7843, + "step": 17009 + }, + { + "epoch": 0.8483790523690773, + "grad_norm": 0.699553294286783, + "learning_rate": 2.9562532020732394e-06, + "loss": 0.8315, + "step": 17010 + }, + { + "epoch": 0.848428927680798, + "grad_norm": 0.659707115342631, + "learning_rate": 2.954348482178415e-06, + "loss": 0.8392, + "step": 17011 + }, + { + "epoch": 0.8484788029925187, + "grad_norm": 0.4965020027881073, + "learning_rate": 2.9524443375549964e-06, + "loss": 0.8293, + "step": 17012 + }, + { + "epoch": 0.8485286783042394, + "grad_norm": 1.062738499176449, + "learning_rate": 2.9505407682526797e-06, + "loss": 0.8554, + "step": 17013 + }, + { + "epoch": 0.8485785536159601, + "grad_norm": 0.5758792422950824, + "learning_rate": 2.9486377743211365e-06, + "loss": 0.8299, + "step": 17014 + }, + { + "epoch": 0.8486284289276808, + "grad_norm": 0.6869916640740555, + "learning_rate": 2.946735355810018e-06, + "loss": 0.8916, + "step": 17015 + }, + { + "epoch": 0.8486783042394015, + "grad_norm": 0.5669564993783797, + "learning_rate": 2.9448335127689685e-06, + "loss": 0.8426, + "step": 17016 + }, + { + "epoch": 0.8487281795511222, + "grad_norm": 0.5378248027484468, + "learning_rate": 2.942932245247623e-06, + "loss": 0.8606, + "step": 17017 + }, + { + "epoch": 0.8487780548628429, + "grad_norm": 0.6923062050821835, + "learning_rate": 2.941031553295587e-06, + "loss": 0.8471, + "step": 17018 + }, + { + "epoch": 0.8488279301745636, + "grad_norm": 0.6442350955929093, + "learning_rate": 2.939131436962461e-06, + "loss": 0.8726, + "step": 17019 + }, + { + "epoch": 0.8488778054862843, + "grad_norm": 0.5191161010987201, + "learning_rate": 2.9372318962978237e-06, + "loss": 0.7833, + "step": 17020 + }, + { + "epoch": 0.848927680798005, + "grad_norm": 0.5836458575368574, + "learning_rate": 2.935332931351248e-06, + "loss": 0.8355, + "step": 17021 + }, + { + "epoch": 0.8489775561097257, + "grad_norm": 0.6205154785498186, + "learning_rate": 2.9334345421722864e-06, + "loss": 0.862, + "step": 17022 + }, + { + "epoch": 0.8490274314214464, + "grad_norm": 0.5993191616586135, + "learning_rate": 2.9315367288104746e-06, + "loss": 0.9141, + "step": 17023 + }, + { + "epoch": 0.8490773067331671, + "grad_norm": 0.5241149971703711, + "learning_rate": 2.929639491315331e-06, + "loss": 0.842, + "step": 17024 + }, + { + "epoch": 0.8491271820448878, + "grad_norm": 0.5813345160908786, + "learning_rate": 2.9277428297363712e-06, + "loss": 0.8499, + "step": 17025 + }, + { + "epoch": 0.8491770573566085, + "grad_norm": 0.5798953011153052, + "learning_rate": 2.9258467441230842e-06, + "loss": 0.8, + "step": 17026 + }, + { + "epoch": 0.8492269326683292, + "grad_norm": 0.5264819970106738, + "learning_rate": 2.923951234524944e-06, + "loss": 0.852, + "step": 17027 + }, + { + "epoch": 0.8492768079800499, + "grad_norm": 0.6692862483082413, + "learning_rate": 2.9220563009914244e-06, + "loss": 0.828, + "step": 17028 + }, + { + "epoch": 0.8493266832917705, + "grad_norm": 0.5653644468458165, + "learning_rate": 2.9201619435719557e-06, + "loss": 0.8082, + "step": 17029 + }, + { + "epoch": 0.8493765586034913, + "grad_norm": 1.0509567662501733, + "learning_rate": 2.918268162315982e-06, + "loss": 0.832, + "step": 17030 + }, + { + "epoch": 0.849426433915212, + "grad_norm": 0.5325421058103623, + "learning_rate": 2.916374957272916e-06, + "loss": 0.7945, + "step": 17031 + }, + { + "epoch": 0.8494763092269326, + "grad_norm": 0.8300370785427502, + "learning_rate": 2.914482328492171e-06, + "loss": 0.8147, + "step": 17032 + }, + { + "epoch": 0.8495261845386534, + "grad_norm": 0.5544578599738069, + "learning_rate": 2.9125902760231165e-06, + "loss": 0.8757, + "step": 17033 + }, + { + "epoch": 0.8495760598503741, + "grad_norm": 0.5727425062451926, + "learning_rate": 2.9106987999151377e-06, + "loss": 0.8503, + "step": 17034 + }, + { + "epoch": 0.8496259351620947, + "grad_norm": 0.5656581443714276, + "learning_rate": 2.908807900217583e-06, + "loss": 0.8743, + "step": 17035 + }, + { + "epoch": 0.8496758104738155, + "grad_norm": 0.538795519666846, + "learning_rate": 2.906917576979809e-06, + "loss": 0.8165, + "step": 17036 + }, + { + "epoch": 0.8497256857855362, + "grad_norm": 0.5253209674387329, + "learning_rate": 2.9050278302511254e-06, + "loss": 0.8405, + "step": 17037 + }, + { + "epoch": 0.8497755610972568, + "grad_norm": 0.5434643116069634, + "learning_rate": 2.9031386600808563e-06, + "loss": 0.8785, + "step": 17038 + }, + { + "epoch": 0.8498254364089776, + "grad_norm": 0.6007556620204056, + "learning_rate": 2.9012500665182935e-06, + "loss": 0.8197, + "step": 17039 + }, + { + "epoch": 0.8498753117206983, + "grad_norm": 0.6563408870565619, + "learning_rate": 2.8993620496127195e-06, + "loss": 0.8754, + "step": 17040 + }, + { + "epoch": 0.8499251870324189, + "grad_norm": 0.6304766380835704, + "learning_rate": 2.8974746094133977e-06, + "loss": 0.8187, + "step": 17041 + }, + { + "epoch": 0.8499750623441397, + "grad_norm": 0.5643212808812159, + "learning_rate": 2.895587745969591e-06, + "loss": 0.7516, + "step": 17042 + }, + { + "epoch": 0.8500249376558604, + "grad_norm": 0.7210478917912985, + "learning_rate": 2.8937014593305277e-06, + "loss": 0.8239, + "step": 17043 + }, + { + "epoch": 0.850074812967581, + "grad_norm": 0.7815042342413249, + "learning_rate": 2.891815749545432e-06, + "loss": 0.8207, + "step": 17044 + }, + { + "epoch": 0.8501246882793018, + "grad_norm": 1.5496947712966898, + "learning_rate": 2.8899306166635027e-06, + "loss": 0.9017, + "step": 17045 + }, + { + "epoch": 0.8501745635910224, + "grad_norm": 0.539518724956246, + "learning_rate": 2.8880460607339455e-06, + "loss": 0.8144, + "step": 17046 + }, + { + "epoch": 0.8502244389027431, + "grad_norm": 0.7300496710566142, + "learning_rate": 2.886162081805932e-06, + "loss": 0.8356, + "step": 17047 + }, + { + "epoch": 0.8502743142144639, + "grad_norm": 0.6328829634140175, + "learning_rate": 2.8842786799286203e-06, + "loss": 0.8795, + "step": 17048 + }, + { + "epoch": 0.8503241895261845, + "grad_norm": 0.5039283630313227, + "learning_rate": 2.8823958551511514e-06, + "loss": 0.7869, + "step": 17049 + }, + { + "epoch": 0.8503740648379052, + "grad_norm": 0.49925353397305766, + "learning_rate": 2.8805136075226724e-06, + "loss": 0.8578, + "step": 17050 + }, + { + "epoch": 0.850423940149626, + "grad_norm": 0.6009013156914758, + "learning_rate": 2.878631937092291e-06, + "loss": 0.8647, + "step": 17051 + }, + { + "epoch": 0.8504738154613466, + "grad_norm": 0.6882371387439375, + "learning_rate": 2.8767508439091097e-06, + "loss": 0.8202, + "step": 17052 + }, + { + "epoch": 0.8505236907730673, + "grad_norm": 0.5814409054786824, + "learning_rate": 2.8748703280222085e-06, + "loss": 0.8172, + "step": 17053 + }, + { + "epoch": 0.8505735660847881, + "grad_norm": 0.6354597641643354, + "learning_rate": 2.8729903894806675e-06, + "loss": 0.8843, + "step": 17054 + }, + { + "epoch": 0.8506234413965087, + "grad_norm": 0.5782868897219334, + "learning_rate": 2.8711110283335397e-06, + "loss": 0.8198, + "step": 17055 + }, + { + "epoch": 0.8506733167082294, + "grad_norm": 0.5665607579127445, + "learning_rate": 2.8692322446298686e-06, + "loss": 0.8534, + "step": 17056 + }, + { + "epoch": 0.8507231920199502, + "grad_norm": 0.4930913820778607, + "learning_rate": 2.867354038418671e-06, + "loss": 0.8446, + "step": 17057 + }, + { + "epoch": 0.8507730673316708, + "grad_norm": 0.5851109027323704, + "learning_rate": 2.8654764097489683e-06, + "loss": 0.8668, + "step": 17058 + }, + { + "epoch": 0.8508229426433915, + "grad_norm": 0.5132286516050353, + "learning_rate": 2.8635993586697553e-06, + "loss": 0.8353, + "step": 17059 + }, + { + "epoch": 0.8508728179551123, + "grad_norm": 0.6105015505526002, + "learning_rate": 2.861722885230009e-06, + "loss": 0.839, + "step": 17060 + }, + { + "epoch": 0.8509226932668329, + "grad_norm": 0.6278464250147332, + "learning_rate": 2.85984698947869e-06, + "loss": 0.8747, + "step": 17061 + }, + { + "epoch": 0.8509725685785536, + "grad_norm": 0.5893785712886661, + "learning_rate": 2.8579716714647596e-06, + "loss": 0.8255, + "step": 17062 + }, + { + "epoch": 0.8510224438902743, + "grad_norm": 0.6017824607687626, + "learning_rate": 2.856096931237151e-06, + "loss": 0.8272, + "step": 17063 + }, + { + "epoch": 0.851072319201995, + "grad_norm": 0.7150097529813715, + "learning_rate": 2.8542227688447774e-06, + "loss": 0.8645, + "step": 17064 + }, + { + "epoch": 0.8511221945137157, + "grad_norm": 0.6344654186836994, + "learning_rate": 2.8523491843365587e-06, + "loss": 0.843, + "step": 17065 + }, + { + "epoch": 0.8511720698254364, + "grad_norm": 0.606586620301864, + "learning_rate": 2.850476177761366e-06, + "loss": 0.864, + "step": 17066 + }, + { + "epoch": 0.8512219451371571, + "grad_norm": 0.7816873349590651, + "learning_rate": 2.8486037491680912e-06, + "loss": 0.8188, + "step": 17067 + }, + { + "epoch": 0.8512718204488778, + "grad_norm": 0.5607650254503326, + "learning_rate": 2.846731898605581e-06, + "loss": 0.7798, + "step": 17068 + }, + { + "epoch": 0.8513216957605985, + "grad_norm": 0.6044130411151932, + "learning_rate": 2.844860626122697e-06, + "loss": 0.8898, + "step": 17069 + }, + { + "epoch": 0.8513715710723192, + "grad_norm": 0.6437173747041626, + "learning_rate": 2.842989931768253e-06, + "loss": 0.8621, + "step": 17070 + }, + { + "epoch": 0.8514214463840399, + "grad_norm": 0.6663054901769594, + "learning_rate": 2.841119815591073e-06, + "loss": 0.8388, + "step": 17071 + }, + { + "epoch": 0.8514713216957606, + "grad_norm": 0.7951639180293453, + "learning_rate": 2.8392502776399487e-06, + "loss": 0.8295, + "step": 17072 + }, + { + "epoch": 0.8515211970074813, + "grad_norm": 0.5211934070430415, + "learning_rate": 2.8373813179636804e-06, + "loss": 0.869, + "step": 17073 + }, + { + "epoch": 0.851571072319202, + "grad_norm": 0.6713790549775808, + "learning_rate": 2.8355129366110205e-06, + "loss": 0.8174, + "step": 17074 + }, + { + "epoch": 0.8516209476309227, + "grad_norm": 0.6367170886867257, + "learning_rate": 2.833645133630733e-06, + "loss": 0.8602, + "step": 17075 + }, + { + "epoch": 0.8516708229426434, + "grad_norm": 0.6722009336560145, + "learning_rate": 2.831777909071559e-06, + "loss": 0.8825, + "step": 17076 + }, + { + "epoch": 0.8517206982543641, + "grad_norm": 0.7766821731758663, + "learning_rate": 2.8299112629822154e-06, + "loss": 0.8529, + "step": 17077 + }, + { + "epoch": 0.8517705735660848, + "grad_norm": 0.5482485703198363, + "learning_rate": 2.8280451954114132e-06, + "loss": 0.8647, + "step": 17078 + }, + { + "epoch": 0.8518204488778055, + "grad_norm": 0.5724135895278663, + "learning_rate": 2.8261797064078527e-06, + "loss": 0.8641, + "step": 17079 + }, + { + "epoch": 0.8518703241895261, + "grad_norm": 0.6762334903578187, + "learning_rate": 2.824314796020208e-06, + "loss": 0.852, + "step": 17080 + }, + { + "epoch": 0.8519201995012469, + "grad_norm": 0.6902526626645495, + "learning_rate": 2.8224504642971467e-06, + "loss": 0.8244, + "step": 17081 + }, + { + "epoch": 0.8519700748129676, + "grad_norm": 0.6276505931592128, + "learning_rate": 2.8205867112873067e-06, + "loss": 0.8348, + "step": 17082 + }, + { + "epoch": 0.8520199501246882, + "grad_norm": 0.6239225554337495, + "learning_rate": 2.8187235370393387e-06, + "loss": 0.8482, + "step": 17083 + }, + { + "epoch": 0.852069825436409, + "grad_norm": 0.94226347954362, + "learning_rate": 2.816860941601851e-06, + "loss": 0.8325, + "step": 17084 + }, + { + "epoch": 0.8521197007481297, + "grad_norm": 0.5195157649255419, + "learning_rate": 2.814998925023449e-06, + "loss": 0.8271, + "step": 17085 + }, + { + "epoch": 0.8521695760598503, + "grad_norm": 0.7578326330078814, + "learning_rate": 2.8131374873527184e-06, + "loss": 0.8876, + "step": 17086 + }, + { + "epoch": 0.8522194513715711, + "grad_norm": 0.7145147359766989, + "learning_rate": 2.811276628638243e-06, + "loss": 0.8479, + "step": 17087 + }, + { + "epoch": 0.8522693266832918, + "grad_norm": 0.5140167463797465, + "learning_rate": 2.8094163489285647e-06, + "loss": 0.7742, + "step": 17088 + }, + { + "epoch": 0.8523192019950124, + "grad_norm": 0.9046765653603025, + "learning_rate": 2.8075566482722414e-06, + "loss": 0.8165, + "step": 17089 + }, + { + "epoch": 0.8523690773067332, + "grad_norm": 0.6427908983616225, + "learning_rate": 2.8056975267177903e-06, + "loss": 0.8282, + "step": 17090 + }, + { + "epoch": 0.8524189526184539, + "grad_norm": 1.215198960826157, + "learning_rate": 2.8038389843137335e-06, + "loss": 0.845, + "step": 17091 + }, + { + "epoch": 0.8524688279301745, + "grad_norm": 0.6221983987423457, + "learning_rate": 2.801981021108563e-06, + "loss": 0.8741, + "step": 17092 + }, + { + "epoch": 0.8525187032418953, + "grad_norm": 0.5728377206095493, + "learning_rate": 2.8001236371507674e-06, + "loss": 0.847, + "step": 17093 + }, + { + "epoch": 0.852568578553616, + "grad_norm": 0.6878444010040741, + "learning_rate": 2.7982668324888023e-06, + "loss": 0.8253, + "step": 17094 + }, + { + "epoch": 0.8526184538653366, + "grad_norm": 0.4802502835153118, + "learning_rate": 2.796410607171132e-06, + "loss": 0.9052, + "step": 17095 + }, + { + "epoch": 0.8526683291770574, + "grad_norm": 0.5998226101057107, + "learning_rate": 2.794554961246193e-06, + "loss": 0.8693, + "step": 17096 + }, + { + "epoch": 0.8527182044887781, + "grad_norm": 0.7623868637558522, + "learning_rate": 2.792699894762402e-06, + "loss": 0.8294, + "step": 17097 + }, + { + "epoch": 0.8527680798004987, + "grad_norm": 0.5978976795527984, + "learning_rate": 2.790845407768164e-06, + "loss": 0.8535, + "step": 17098 + }, + { + "epoch": 0.8528179551122195, + "grad_norm": 0.7018524654660793, + "learning_rate": 2.78899150031188e-06, + "loss": 0.8417, + "step": 17099 + }, + { + "epoch": 0.8528678304239401, + "grad_norm": 0.5419712007320668, + "learning_rate": 2.787138172441925e-06, + "loss": 0.8551, + "step": 17100 + }, + { + "epoch": 0.8529177057356608, + "grad_norm": 0.5690710986881935, + "learning_rate": 2.7852854242066543e-06, + "loss": 0.8353, + "step": 17101 + }, + { + "epoch": 0.8529675810473816, + "grad_norm": 0.5766165233446086, + "learning_rate": 2.783433255654422e-06, + "loss": 0.8678, + "step": 17102 + }, + { + "epoch": 0.8530174563591022, + "grad_norm": 0.5332409625746822, + "learning_rate": 2.7815816668335494e-06, + "loss": 0.8513, + "step": 17103 + }, + { + "epoch": 0.8530673316708229, + "grad_norm": 0.49097595816714595, + "learning_rate": 2.779730657792365e-06, + "loss": 0.8222, + "step": 17104 + }, + { + "epoch": 0.8531172069825437, + "grad_norm": 0.5436356361008574, + "learning_rate": 2.7778802285791604e-06, + "loss": 0.7867, + "step": 17105 + }, + { + "epoch": 0.8531670822942643, + "grad_norm": 0.6800860498857448, + "learning_rate": 2.7760303792422336e-06, + "loss": 0.8322, + "step": 17106 + }, + { + "epoch": 0.853216957605985, + "grad_norm": 0.7133324764825143, + "learning_rate": 2.7741811098298403e-06, + "loss": 0.8315, + "step": 17107 + }, + { + "epoch": 0.8532668329177058, + "grad_norm": 0.5257115887025916, + "learning_rate": 2.772332420390247e-06, + "loss": 0.8234, + "step": 17108 + }, + { + "epoch": 0.8533167082294264, + "grad_norm": 1.1042254135752105, + "learning_rate": 2.7704843109716883e-06, + "loss": 0.8362, + "step": 17109 + }, + { + "epoch": 0.8533665835411471, + "grad_norm": 0.5623810417514302, + "learning_rate": 2.7686367816224023e-06, + "loss": 0.9169, + "step": 17110 + }, + { + "epoch": 0.8534164588528679, + "grad_norm": 0.7502108425409644, + "learning_rate": 2.766789832390579e-06, + "loss": 0.8345, + "step": 17111 + }, + { + "epoch": 0.8534663341645885, + "grad_norm": 0.6303529536288962, + "learning_rate": 2.764943463324429e-06, + "loss": 0.8185, + "step": 17112 + }, + { + "epoch": 0.8535162094763092, + "grad_norm": 0.6982915148507832, + "learning_rate": 2.7630976744721316e-06, + "loss": 0.8573, + "step": 17113 + }, + { + "epoch": 0.85356608478803, + "grad_norm": 0.729153524263674, + "learning_rate": 2.7612524658818447e-06, + "loss": 0.798, + "step": 17114 + }, + { + "epoch": 0.8536159600997506, + "grad_norm": 0.5137715812208473, + "learning_rate": 2.7594078376017184e-06, + "loss": 0.8413, + "step": 17115 + }, + { + "epoch": 0.8536658354114713, + "grad_norm": 0.5689467283962168, + "learning_rate": 2.7575637896798955e-06, + "loss": 0.8471, + "step": 17116 + }, + { + "epoch": 0.853715710723192, + "grad_norm": 0.6192909963957667, + "learning_rate": 2.755720322164493e-06, + "loss": 0.8817, + "step": 17117 + }, + { + "epoch": 0.8537655860349127, + "grad_norm": 0.6002131712935267, + "learning_rate": 2.7538774351036133e-06, + "loss": 0.8526, + "step": 17118 + }, + { + "epoch": 0.8538154613466334, + "grad_norm": 0.5527555452363732, + "learning_rate": 2.7520351285453385e-06, + "loss": 0.8169, + "step": 17119 + }, + { + "epoch": 0.8538653366583541, + "grad_norm": 0.5472910199068449, + "learning_rate": 2.750193402537757e-06, + "loss": 0.8055, + "step": 17120 + }, + { + "epoch": 0.8539152119700748, + "grad_norm": 0.5299807484174727, + "learning_rate": 2.7483522571289194e-06, + "loss": 0.8849, + "step": 17121 + }, + { + "epoch": 0.8539650872817955, + "grad_norm": 0.6360129202910919, + "learning_rate": 2.7465116923668736e-06, + "loss": 0.8609, + "step": 17122 + }, + { + "epoch": 0.8540149625935162, + "grad_norm": 0.6031423214751188, + "learning_rate": 2.7446717082996396e-06, + "loss": 0.8589, + "step": 17123 + }, + { + "epoch": 0.8540648379052369, + "grad_norm": 0.5205121131065472, + "learning_rate": 2.742832304975246e-06, + "loss": 0.8288, + "step": 17124 + }, + { + "epoch": 0.8541147132169576, + "grad_norm": 0.6836241054963891, + "learning_rate": 2.7409934824416726e-06, + "loss": 0.8336, + "step": 17125 + }, + { + "epoch": 0.8541645885286783, + "grad_norm": 0.6886435405907109, + "learning_rate": 2.7391552407469185e-06, + "loss": 0.8766, + "step": 17126 + }, + { + "epoch": 0.854214463840399, + "grad_norm": 0.7302088803555495, + "learning_rate": 2.7373175799389415e-06, + "loss": 0.8254, + "step": 17127 + }, + { + "epoch": 0.8542643391521197, + "grad_norm": 0.6183334488820819, + "learning_rate": 2.7354805000657013e-06, + "loss": 0.8589, + "step": 17128 + }, + { + "epoch": 0.8543142144638404, + "grad_norm": 0.598699191490411, + "learning_rate": 2.733644001175137e-06, + "loss": 0.8698, + "step": 17129 + }, + { + "epoch": 0.8543640897755611, + "grad_norm": 0.58309620082514, + "learning_rate": 2.7318080833151628e-06, + "loss": 0.8103, + "step": 17130 + }, + { + "epoch": 0.8544139650872818, + "grad_norm": 0.6633348423363258, + "learning_rate": 2.7299727465336877e-06, + "loss": 0.8404, + "step": 17131 + }, + { + "epoch": 0.8544638403990025, + "grad_norm": 0.5045531736845505, + "learning_rate": 2.7281379908786126e-06, + "loss": 0.8178, + "step": 17132 + }, + { + "epoch": 0.8545137157107232, + "grad_norm": 0.8314388759666643, + "learning_rate": 2.72630381639781e-06, + "loss": 0.8378, + "step": 17133 + }, + { + "epoch": 0.8545635910224438, + "grad_norm": 0.6132899611619169, + "learning_rate": 2.7244702231391387e-06, + "loss": 0.8464, + "step": 17134 + }, + { + "epoch": 0.8546134663341646, + "grad_norm": 0.7455379921691633, + "learning_rate": 2.72263721115045e-06, + "loss": 0.8556, + "step": 17135 + }, + { + "epoch": 0.8546633416458853, + "grad_norm": 0.6025367038500942, + "learning_rate": 2.720804780479566e-06, + "loss": 0.8712, + "step": 17136 + }, + { + "epoch": 0.8547132169576059, + "grad_norm": 0.5797753960326157, + "learning_rate": 2.7189729311743155e-06, + "loss": 0.8344, + "step": 17137 + }, + { + "epoch": 0.8547630922693267, + "grad_norm": 0.77186484995328, + "learning_rate": 2.7171416632824963e-06, + "loss": 0.8476, + "step": 17138 + }, + { + "epoch": 0.8548129675810474, + "grad_norm": 0.5599744562278967, + "learning_rate": 2.7153109768518925e-06, + "loss": 0.8449, + "step": 17139 + }, + { + "epoch": 0.854862842892768, + "grad_norm": 0.6810293170075189, + "learning_rate": 2.7134808719302684e-06, + "loss": 0.8016, + "step": 17140 + }, + { + "epoch": 0.8549127182044888, + "grad_norm": 0.6106621307835658, + "learning_rate": 2.7116513485653944e-06, + "loss": 0.8956, + "step": 17141 + }, + { + "epoch": 0.8549625935162095, + "grad_norm": 0.5599357685759332, + "learning_rate": 2.709822406804996e-06, + "loss": 0.8682, + "step": 17142 + }, + { + "epoch": 0.8550124688279301, + "grad_norm": 1.0426471801218014, + "learning_rate": 2.7079940466968184e-06, + "loss": 0.8261, + "step": 17143 + }, + { + "epoch": 0.8550623441396509, + "grad_norm": 0.5296208678635058, + "learning_rate": 2.7061662682885486e-06, + "loss": 0.8257, + "step": 17144 + }, + { + "epoch": 0.8551122194513716, + "grad_norm": 0.6288225273919255, + "learning_rate": 2.7043390716278984e-06, + "loss": 0.8501, + "step": 17145 + }, + { + "epoch": 0.8551620947630922, + "grad_norm": 0.6159455388411554, + "learning_rate": 2.7025124567625404e-06, + "loss": 0.8761, + "step": 17146 + }, + { + "epoch": 0.855211970074813, + "grad_norm": 0.733736282333785, + "learning_rate": 2.7006864237401423e-06, + "loss": 0.8299, + "step": 17147 + }, + { + "epoch": 0.8552618453865337, + "grad_norm": 0.6261558969100167, + "learning_rate": 2.698860972608347e-06, + "loss": 0.869, + "step": 17148 + }, + { + "epoch": 0.8553117206982543, + "grad_norm": 1.6655673962871775, + "learning_rate": 2.6970361034147962e-06, + "loss": 0.8675, + "step": 17149 + }, + { + "epoch": 0.8553615960099751, + "grad_norm": 0.6095365539682528, + "learning_rate": 2.6952118162071107e-06, + "loss": 0.8474, + "step": 17150 + }, + { + "epoch": 0.8554114713216958, + "grad_norm": 0.500233720451393, + "learning_rate": 2.693388111032888e-06, + "loss": 0.8355, + "step": 17151 + }, + { + "epoch": 0.8554613466334164, + "grad_norm": 0.535390053617781, + "learning_rate": 2.6915649879397152e-06, + "loss": 0.8093, + "step": 17152 + }, + { + "epoch": 0.8555112219451372, + "grad_norm": 0.5044114689227419, + "learning_rate": 2.689742446975174e-06, + "loss": 0.8211, + "step": 17153 + }, + { + "epoch": 0.8555610972568578, + "grad_norm": 0.7871794737403824, + "learning_rate": 2.6879204881868207e-06, + "loss": 0.8235, + "step": 17154 + }, + { + "epoch": 0.8556109725685785, + "grad_norm": 0.6468169674278146, + "learning_rate": 2.686099111622195e-06, + "loss": 0.8083, + "step": 17155 + }, + { + "epoch": 0.8556608478802993, + "grad_norm": 0.6751176532199838, + "learning_rate": 2.6842783173288227e-06, + "loss": 0.8286, + "step": 17156 + }, + { + "epoch": 0.8557107231920199, + "grad_norm": 0.6320311674731708, + "learning_rate": 2.6824581053542268e-06, + "loss": 0.8245, + "step": 17157 + }, + { + "epoch": 0.8557605985037406, + "grad_norm": 1.0430863834482629, + "learning_rate": 2.680638475745895e-06, + "loss": 0.8545, + "step": 17158 + }, + { + "epoch": 0.8558104738154614, + "grad_norm": 0.6136404460269798, + "learning_rate": 2.678819428551316e-06, + "loss": 0.894, + "step": 17159 + }, + { + "epoch": 0.855860349127182, + "grad_norm": 0.7153043762600307, + "learning_rate": 2.6770009638179494e-06, + "loss": 0.9007, + "step": 17160 + }, + { + "epoch": 0.8559102244389027, + "grad_norm": 0.8105481328431942, + "learning_rate": 2.6751830815932604e-06, + "loss": 0.9058, + "step": 17161 + }, + { + "epoch": 0.8559600997506235, + "grad_norm": 0.9200272907271592, + "learning_rate": 2.6733657819246694e-06, + "loss": 0.8769, + "step": 17162 + }, + { + "epoch": 0.8560099750623441, + "grad_norm": 0.7188064157313969, + "learning_rate": 2.6715490648596104e-06, + "loss": 0.8616, + "step": 17163 + }, + { + "epoch": 0.8560598503740648, + "grad_norm": 0.5873605825222951, + "learning_rate": 2.6697329304454816e-06, + "loss": 0.7838, + "step": 17164 + }, + { + "epoch": 0.8561097256857856, + "grad_norm": 0.8106979259203417, + "learning_rate": 2.6679173787296867e-06, + "loss": 0.8591, + "step": 17165 + }, + { + "epoch": 0.8561596009975062, + "grad_norm": 0.6584076614783287, + "learning_rate": 2.6661024097595856e-06, + "loss": 0.8816, + "step": 17166 + }, + { + "epoch": 0.856209476309227, + "grad_norm": 0.4679925592527433, + "learning_rate": 2.664288023582548e-06, + "loss": 0.8296, + "step": 17167 + }, + { + "epoch": 0.8562593516209477, + "grad_norm": 0.7758823521110784, + "learning_rate": 2.662474220245917e-06, + "loss": 0.8525, + "step": 17168 + }, + { + "epoch": 0.8563092269326683, + "grad_norm": 0.5634826630093129, + "learning_rate": 2.6606609997970265e-06, + "loss": 0.8128, + "step": 17169 + }, + { + "epoch": 0.856359102244389, + "grad_norm": 0.6446871575552764, + "learning_rate": 2.658848362283192e-06, + "loss": 0.8068, + "step": 17170 + }, + { + "epoch": 0.8564089775561097, + "grad_norm": 0.5481122425179314, + "learning_rate": 2.657036307751709e-06, + "loss": 0.8715, + "step": 17171 + }, + { + "epoch": 0.8564588528678304, + "grad_norm": 0.7240702026737434, + "learning_rate": 2.6552248362498644e-06, + "loss": 0.8181, + "step": 17172 + }, + { + "epoch": 0.8565087281795511, + "grad_norm": 0.5183096612901912, + "learning_rate": 2.653413947824923e-06, + "loss": 0.8492, + "step": 17173 + }, + { + "epoch": 0.8565586034912718, + "grad_norm": 0.6822575140285471, + "learning_rate": 2.6516036425241474e-06, + "loss": 0.8621, + "step": 17174 + }, + { + "epoch": 0.8566084788029925, + "grad_norm": 0.5459751176666408, + "learning_rate": 2.6497939203947745e-06, + "loss": 0.8402, + "step": 17175 + }, + { + "epoch": 0.8566583541147132, + "grad_norm": 0.6823553349863404, + "learning_rate": 2.647984781484025e-06, + "loss": 0.8915, + "step": 17176 + }, + { + "epoch": 0.8567082294264339, + "grad_norm": 0.6777794993387591, + "learning_rate": 2.646176225839106e-06, + "loss": 0.8394, + "step": 17177 + }, + { + "epoch": 0.8567581047381546, + "grad_norm": 0.5870309907192532, + "learning_rate": 2.6443682535072177e-06, + "loss": 0.8676, + "step": 17178 + }, + { + "epoch": 0.8568079800498754, + "grad_norm": 0.6357535703544469, + "learning_rate": 2.6425608645355317e-06, + "loss": 0.8775, + "step": 17179 + }, + { + "epoch": 0.856857855361596, + "grad_norm": 0.8784773197784994, + "learning_rate": 2.640754058971223e-06, + "loss": 0.8034, + "step": 17180 + }, + { + "epoch": 0.8569077306733167, + "grad_norm": 0.6626937417228472, + "learning_rate": 2.638947836861419e-06, + "loss": 0.829, + "step": 17181 + }, + { + "epoch": 0.8569576059850375, + "grad_norm": 0.7373768837485161, + "learning_rate": 2.63714219825327e-06, + "loss": 0.8643, + "step": 17182 + }, + { + "epoch": 0.8570074812967581, + "grad_norm": 0.7598806472128438, + "learning_rate": 2.635337143193886e-06, + "loss": 0.8796, + "step": 17183 + }, + { + "epoch": 0.8570573566084788, + "grad_norm": 0.4757276779023762, + "learning_rate": 2.633532671730371e-06, + "loss": 0.7894, + "step": 17184 + }, + { + "epoch": 0.8571072319201996, + "grad_norm": 0.5440546271090165, + "learning_rate": 2.6317287839098066e-06, + "loss": 0.8235, + "step": 17185 + }, + { + "epoch": 0.8571571072319202, + "grad_norm": 0.5974458239459677, + "learning_rate": 2.6299254797792743e-06, + "loss": 0.8358, + "step": 17186 + }, + { + "epoch": 0.8572069825436409, + "grad_norm": 0.676563880216147, + "learning_rate": 2.628122759385823e-06, + "loss": 0.8962, + "step": 17187 + }, + { + "epoch": 0.8572568578553615, + "grad_norm": 0.711455410831998, + "learning_rate": 2.6263206227764986e-06, + "loss": 0.88, + "step": 17188 + }, + { + "epoch": 0.8573067331670823, + "grad_norm": 1.475738008379388, + "learning_rate": 2.6245190699983156e-06, + "loss": 0.8658, + "step": 17189 + }, + { + "epoch": 0.857356608478803, + "grad_norm": 0.5808726978481991, + "learning_rate": 2.622718101098301e-06, + "loss": 0.8867, + "step": 17190 + }, + { + "epoch": 0.8574064837905236, + "grad_norm": 0.7254094786311152, + "learning_rate": 2.6209177161234445e-06, + "loss": 0.8368, + "step": 17191 + }, + { + "epoch": 0.8574563591022444, + "grad_norm": 0.6072591464768307, + "learning_rate": 2.6191179151207224e-06, + "loss": 0.8615, + "step": 17192 + }, + { + "epoch": 0.8575062344139651, + "grad_norm": 0.5926670516234652, + "learning_rate": 2.617318698137097e-06, + "loss": 0.8177, + "step": 17193 + }, + { + "epoch": 0.8575561097256857, + "grad_norm": 0.5294243523815005, + "learning_rate": 2.6155200652195315e-06, + "loss": 0.8489, + "step": 17194 + }, + { + "epoch": 0.8576059850374065, + "grad_norm": 0.6673868033251514, + "learning_rate": 2.6137220164149435e-06, + "loss": 0.837, + "step": 17195 + }, + { + "epoch": 0.8576558603491272, + "grad_norm": 0.7410455600499402, + "learning_rate": 2.6119245517702674e-06, + "loss": 0.8587, + "step": 17196 + }, + { + "epoch": 0.8577057356608478, + "grad_norm": 0.5174870088245721, + "learning_rate": 2.6101276713323934e-06, + "loss": 0.8122, + "step": 17197 + }, + { + "epoch": 0.8577556109725686, + "grad_norm": 0.5644173209497936, + "learning_rate": 2.608331375148229e-06, + "loss": 0.8095, + "step": 17198 + }, + { + "epoch": 0.8578054862842893, + "grad_norm": 0.8101771966025819, + "learning_rate": 2.6065356632646227e-06, + "loss": 0.8399, + "step": 17199 + }, + { + "epoch": 0.85785536159601, + "grad_norm": 1.1323587081457247, + "learning_rate": 2.604740535728456e-06, + "loss": 0.8348, + "step": 17200 + }, + { + "epoch": 0.8579052369077307, + "grad_norm": 0.5247464299398434, + "learning_rate": 2.602945992586553e-06, + "loss": 0.7987, + "step": 17201 + }, + { + "epoch": 0.8579551122194514, + "grad_norm": 0.5864961382366664, + "learning_rate": 2.601152033885762e-06, + "loss": 0.8354, + "step": 17202 + }, + { + "epoch": 0.858004987531172, + "grad_norm": 0.6131067253124849, + "learning_rate": 2.5993586596728763e-06, + "loss": 0.857, + "step": 17203 + }, + { + "epoch": 0.8580548628428928, + "grad_norm": 0.6048369563939282, + "learning_rate": 2.597565869994703e-06, + "loss": 0.828, + "step": 17204 + }, + { + "epoch": 0.8581047381546134, + "grad_norm": 0.7431533541667573, + "learning_rate": 2.5957736648980243e-06, + "loss": 0.8087, + "step": 17205 + }, + { + "epoch": 0.8581546134663341, + "grad_norm": 0.5613056747069682, + "learning_rate": 2.593982044429602e-06, + "loss": 0.8699, + "step": 17206 + }, + { + "epoch": 0.8582044887780549, + "grad_norm": 0.6136853287294357, + "learning_rate": 2.592191008636191e-06, + "loss": 0.8366, + "step": 17207 + }, + { + "epoch": 0.8582543640897755, + "grad_norm": 0.6110540430365573, + "learning_rate": 2.590400557564532e-06, + "loss": 0.8388, + "step": 17208 + }, + { + "epoch": 0.8583042394014962, + "grad_norm": 0.5518909605802911, + "learning_rate": 2.5886106912613368e-06, + "loss": 0.83, + "step": 17209 + }, + { + "epoch": 0.858354114713217, + "grad_norm": 0.6582475601743388, + "learning_rate": 2.5868214097733127e-06, + "loss": 0.8738, + "step": 17210 + }, + { + "epoch": 0.8584039900249376, + "grad_norm": 0.5559136952508984, + "learning_rate": 2.585032713147159e-06, + "loss": 0.8642, + "step": 17211 + }, + { + "epoch": 0.8584538653366584, + "grad_norm": 0.5696501429997782, + "learning_rate": 2.5832446014295435e-06, + "loss": 0.8619, + "step": 17212 + }, + { + "epoch": 0.8585037406483791, + "grad_norm": 0.7349238726321174, + "learning_rate": 2.581457074667129e-06, + "loss": 0.8581, + "step": 17213 + }, + { + "epoch": 0.8585536159600997, + "grad_norm": 0.8139748944401743, + "learning_rate": 2.5796701329065532e-06, + "loss": 0.7899, + "step": 17214 + }, + { + "epoch": 0.8586034912718205, + "grad_norm": 0.5988397688215186, + "learning_rate": 2.5778837761944537e-06, + "loss": 0.8496, + "step": 17215 + }, + { + "epoch": 0.8586533665835412, + "grad_norm": 0.5647460366038464, + "learning_rate": 2.576098004577446e-06, + "loss": 0.8574, + "step": 17216 + }, + { + "epoch": 0.8587032418952618, + "grad_norm": 1.3924784940040007, + "learning_rate": 2.574312818102123e-06, + "loss": 0.8525, + "step": 17217 + }, + { + "epoch": 0.8587531172069826, + "grad_norm": 0.5456919990310576, + "learning_rate": 2.572528216815068e-06, + "loss": 0.839, + "step": 17218 + }, + { + "epoch": 0.8588029925187033, + "grad_norm": 0.6719572856826355, + "learning_rate": 2.5707442007628536e-06, + "loss": 0.8924, + "step": 17219 + }, + { + "epoch": 0.8588528678304239, + "grad_norm": 0.6807441040289116, + "learning_rate": 2.568960769992035e-06, + "loss": 0.8011, + "step": 17220 + }, + { + "epoch": 0.8589027431421447, + "grad_norm": 1.1200453094859568, + "learning_rate": 2.567177924549144e-06, + "loss": 0.8038, + "step": 17221 + }, + { + "epoch": 0.8589526184538654, + "grad_norm": 0.4739810605136334, + "learning_rate": 2.5653956644806992e-06, + "loss": 0.8284, + "step": 17222 + }, + { + "epoch": 0.859002493765586, + "grad_norm": 0.6315296840793213, + "learning_rate": 2.563613989833222e-06, + "loss": 0.8229, + "step": 17223 + }, + { + "epoch": 0.8590523690773068, + "grad_norm": 0.5650955605838742, + "learning_rate": 2.561832900653194e-06, + "loss": 0.8643, + "step": 17224 + }, + { + "epoch": 0.8591022443890274, + "grad_norm": 0.578513073963653, + "learning_rate": 2.560052396987095e-06, + "loss": 0.7933, + "step": 17225 + }, + { + "epoch": 0.8591521197007481, + "grad_norm": 0.6375994792765047, + "learning_rate": 2.5582724788813823e-06, + "loss": 0.8717, + "step": 17226 + }, + { + "epoch": 0.8592019950124689, + "grad_norm": 0.6228193817191984, + "learning_rate": 2.5564931463825103e-06, + "loss": 0.8487, + "step": 17227 + }, + { + "epoch": 0.8592518703241895, + "grad_norm": 0.6994813523664418, + "learning_rate": 2.554714399536906e-06, + "loss": 0.8482, + "step": 17228 + }, + { + "epoch": 0.8593017456359102, + "grad_norm": 0.6117950506841817, + "learning_rate": 2.552936238390982e-06, + "loss": 0.8481, + "step": 17229 + }, + { + "epoch": 0.859351620947631, + "grad_norm": 0.49789783044368785, + "learning_rate": 2.551158662991138e-06, + "loss": 0.7911, + "step": 17230 + }, + { + "epoch": 0.8594014962593516, + "grad_norm": 0.8742252554249385, + "learning_rate": 2.549381673383769e-06, + "loss": 0.8588, + "step": 17231 + }, + { + "epoch": 0.8594513715710723, + "grad_norm": 0.6804624129758908, + "learning_rate": 2.5476052696152304e-06, + "loss": 0.8593, + "step": 17232 + }, + { + "epoch": 0.8595012468827931, + "grad_norm": 0.6634934566043619, + "learning_rate": 2.5458294517318878e-06, + "loss": 0.8387, + "step": 17233 + }, + { + "epoch": 0.8595511221945137, + "grad_norm": 0.6092479016833449, + "learning_rate": 2.5440542197800733e-06, + "loss": 0.868, + "step": 17234 + }, + { + "epoch": 0.8596009975062344, + "grad_norm": 0.52538571347129, + "learning_rate": 2.5422795738061222e-06, + "loss": 0.8292, + "step": 17235 + }, + { + "epoch": 0.8596508728179552, + "grad_norm": 0.6248929906458929, + "learning_rate": 2.540505513856323e-06, + "loss": 0.8755, + "step": 17236 + }, + { + "epoch": 0.8597007481296758, + "grad_norm": 0.7075550700303989, + "learning_rate": 2.5387320399769875e-06, + "loss": 0.8137, + "step": 17237 + }, + { + "epoch": 0.8597506234413965, + "grad_norm": 2.4238370284563597, + "learning_rate": 2.5369591522143824e-06, + "loss": 0.8127, + "step": 17238 + }, + { + "epoch": 0.8598004987531173, + "grad_norm": 0.5456622881280401, + "learning_rate": 2.5351868506147874e-06, + "loss": 0.829, + "step": 17239 + }, + { + "epoch": 0.8598503740648379, + "grad_norm": 0.6980658958869506, + "learning_rate": 2.5334151352244236e-06, + "loss": 0.8311, + "step": 17240 + }, + { + "epoch": 0.8599002493765586, + "grad_norm": 0.6677357779268104, + "learning_rate": 2.531644006089545e-06, + "loss": 0.8341, + "step": 17241 + }, + { + "epoch": 0.8599501246882792, + "grad_norm": 0.5784113133860583, + "learning_rate": 2.52987346325636e-06, + "loss": 0.8681, + "step": 17242 + }, + { + "epoch": 0.86, + "grad_norm": 0.46765801497435805, + "learning_rate": 2.5281035067710653e-06, + "loss": 0.8184, + "step": 17243 + }, + { + "epoch": 0.8600498753117207, + "grad_norm": 0.8647299864078448, + "learning_rate": 2.5263341366798558e-06, + "loss": 0.8783, + "step": 17244 + }, + { + "epoch": 0.8600997506234414, + "grad_norm": 0.7531119808250789, + "learning_rate": 2.5245653530289015e-06, + "loss": 0.8272, + "step": 17245 + }, + { + "epoch": 0.8601496259351621, + "grad_norm": 0.6032571136712859, + "learning_rate": 2.5227971558643537e-06, + "loss": 0.8261, + "step": 17246 + }, + { + "epoch": 0.8601995012468828, + "grad_norm": 0.78066546998689, + "learning_rate": 2.521029545232351e-06, + "loss": 0.808, + "step": 17247 + }, + { + "epoch": 0.8602493765586035, + "grad_norm": 0.7969818346443048, + "learning_rate": 2.5192625211790277e-06, + "loss": 0.8502, + "step": 17248 + }, + { + "epoch": 0.8602992518703242, + "grad_norm": 0.6626754237901947, + "learning_rate": 2.5174960837504868e-06, + "loss": 0.8537, + "step": 17249 + }, + { + "epoch": 0.8603491271820449, + "grad_norm": 2.7947302136089123, + "learning_rate": 2.5157302329928216e-06, + "loss": 0.8715, + "step": 17250 + }, + { + "epoch": 0.8603990024937656, + "grad_norm": 1.0105216112443496, + "learning_rate": 2.513964968952112e-06, + "loss": 0.8377, + "step": 17251 + }, + { + "epoch": 0.8604488778054863, + "grad_norm": 0.5615244515585522, + "learning_rate": 2.512200291674427e-06, + "loss": 0.8732, + "step": 17252 + }, + { + "epoch": 0.860498753117207, + "grad_norm": 0.503546962631101, + "learning_rate": 2.5104362012058096e-06, + "loss": 0.82, + "step": 17253 + }, + { + "epoch": 0.8605486284289277, + "grad_norm": 0.7909087831988407, + "learning_rate": 2.508672697592296e-06, + "loss": 0.8512, + "step": 17254 + }, + { + "epoch": 0.8605985037406484, + "grad_norm": 0.5549850421475867, + "learning_rate": 2.5069097808798987e-06, + "loss": 0.8424, + "step": 17255 + }, + { + "epoch": 0.8606483790523691, + "grad_norm": 0.6598587097604328, + "learning_rate": 2.5051474511146254e-06, + "loss": 0.8471, + "step": 17256 + }, + { + "epoch": 0.8606982543640898, + "grad_norm": 0.7620272062327834, + "learning_rate": 2.503385708342465e-06, + "loss": 0.8235, + "step": 17257 + }, + { + "epoch": 0.8607481296758105, + "grad_norm": 0.634586512179048, + "learning_rate": 2.5016245526093856e-06, + "loss": 0.8342, + "step": 17258 + }, + { + "epoch": 0.8607980049875311, + "grad_norm": 0.6377581107958749, + "learning_rate": 2.4998639839613395e-06, + "loss": 0.8831, + "step": 17259 + }, + { + "epoch": 0.8608478802992519, + "grad_norm": 0.5720574296254863, + "learning_rate": 2.498104002444279e-06, + "loss": 0.834, + "step": 17260 + }, + { + "epoch": 0.8608977556109726, + "grad_norm": 0.5366070973036304, + "learning_rate": 2.496344608104123e-06, + "loss": 0.8121, + "step": 17261 + }, + { + "epoch": 0.8609476309226932, + "grad_norm": 4.056463867282181, + "learning_rate": 2.494585800986782e-06, + "loss": 0.8313, + "step": 17262 + }, + { + "epoch": 0.860997506234414, + "grad_norm": 0.7592947025518908, + "learning_rate": 2.492827581138149e-06, + "loss": 0.8575, + "step": 17263 + }, + { + "epoch": 0.8610473815461347, + "grad_norm": 0.6480687625013948, + "learning_rate": 2.4910699486041107e-06, + "loss": 0.8514, + "step": 17264 + }, + { + "epoch": 0.8610972568578553, + "grad_norm": 0.6266443444895698, + "learning_rate": 2.4893129034305294e-06, + "loss": 0.8857, + "step": 17265 + }, + { + "epoch": 0.8611471321695761, + "grad_norm": 0.6361779444026888, + "learning_rate": 2.4875564456632556e-06, + "loss": 0.8522, + "step": 17266 + }, + { + "epoch": 0.8611970074812968, + "grad_norm": 0.6062859053676696, + "learning_rate": 2.485800575348113e-06, + "loss": 0.8485, + "step": 17267 + }, + { + "epoch": 0.8612468827930174, + "grad_norm": 0.8864615115385592, + "learning_rate": 2.48404529253094e-06, + "loss": 0.8474, + "step": 17268 + }, + { + "epoch": 0.8612967581047382, + "grad_norm": 0.6069925235101665, + "learning_rate": 2.4822905972575167e-06, + "loss": 0.8326, + "step": 17269 + }, + { + "epoch": 0.8613466334164589, + "grad_norm": 0.523738382890967, + "learning_rate": 2.480536489573651e-06, + "loss": 0.8525, + "step": 17270 + }, + { + "epoch": 0.8613965087281795, + "grad_norm": 0.6399589773805409, + "learning_rate": 2.478782969525098e-06, + "loss": 0.8323, + "step": 17271 + }, + { + "epoch": 0.8614463840399003, + "grad_norm": 0.4942157689567675, + "learning_rate": 2.4770300371576377e-06, + "loss": 0.8011, + "step": 17272 + }, + { + "epoch": 0.861496259351621, + "grad_norm": 0.5211027648678388, + "learning_rate": 2.4752776925169864e-06, + "loss": 0.8541, + "step": 17273 + }, + { + "epoch": 0.8615461346633416, + "grad_norm": 0.6890805113194404, + "learning_rate": 2.4735259356488877e-06, + "loss": 0.8276, + "step": 17274 + }, + { + "epoch": 0.8615960099750624, + "grad_norm": 0.5847206984747373, + "learning_rate": 2.471774766599044e-06, + "loss": 0.8178, + "step": 17275 + }, + { + "epoch": 0.8616458852867831, + "grad_norm": 1.5208773953618833, + "learning_rate": 2.4700241854131666e-06, + "loss": 0.8387, + "step": 17276 + }, + { + "epoch": 0.8616957605985037, + "grad_norm": 0.5001533003215141, + "learning_rate": 2.468274192136913e-06, + "loss": 0.8596, + "step": 17277 + }, + { + "epoch": 0.8617456359102245, + "grad_norm": 0.5323743215446951, + "learning_rate": 2.4665247868159684e-06, + "loss": 0.8728, + "step": 17278 + }, + { + "epoch": 0.8617955112219451, + "grad_norm": 0.61713516775751, + "learning_rate": 2.464775969495972e-06, + "loss": 0.8001, + "step": 17279 + }, + { + "epoch": 0.8618453865336658, + "grad_norm": 0.7541033036751285, + "learning_rate": 2.463027740222562e-06, + "loss": 0.8791, + "step": 17280 + }, + { + "epoch": 0.8618952618453866, + "grad_norm": 0.7510301172953402, + "learning_rate": 2.461280099041352e-06, + "loss": 0.8109, + "step": 17281 + }, + { + "epoch": 0.8619451371571072, + "grad_norm": 0.6946983820884294, + "learning_rate": 2.4595330459979554e-06, + "loss": 0.8101, + "step": 17282 + }, + { + "epoch": 0.8619950124688279, + "grad_norm": 0.5277482471531452, + "learning_rate": 2.457786581137958e-06, + "loss": 0.7655, + "step": 17283 + }, + { + "epoch": 0.8620448877805487, + "grad_norm": 0.7552656940890426, + "learning_rate": 2.4560407045069235e-06, + "loss": 0.8924, + "step": 17284 + }, + { + "epoch": 0.8620947630922693, + "grad_norm": 0.5392772216050148, + "learning_rate": 2.454295416150423e-06, + "loss": 0.8535, + "step": 17285 + }, + { + "epoch": 0.86214463840399, + "grad_norm": 0.6369135944962497, + "learning_rate": 2.4525507161139937e-06, + "loss": 0.8573, + "step": 17286 + }, + { + "epoch": 0.8621945137157108, + "grad_norm": 0.6680409129041737, + "learning_rate": 2.450806604443165e-06, + "loss": 0.8347, + "step": 17287 + }, + { + "epoch": 0.8622443890274314, + "grad_norm": 0.6435380878922126, + "learning_rate": 2.449063081183439e-06, + "loss": 0.8264, + "step": 17288 + }, + { + "epoch": 0.8622942643391521, + "grad_norm": 0.7295137816739107, + "learning_rate": 2.447320146380325e-06, + "loss": 0.8751, + "step": 17289 + }, + { + "epoch": 0.8623441396508729, + "grad_norm": 0.6344524147260718, + "learning_rate": 2.445577800079296e-06, + "loss": 0.7996, + "step": 17290 + }, + { + "epoch": 0.8623940149625935, + "grad_norm": 0.5902472815528008, + "learning_rate": 2.4438360423258225e-06, + "loss": 0.8437, + "step": 17291 + }, + { + "epoch": 0.8624438902743142, + "grad_norm": 1.0656373602088587, + "learning_rate": 2.4420948731653487e-06, + "loss": 0.8616, + "step": 17292 + }, + { + "epoch": 0.862493765586035, + "grad_norm": 0.512698242688984, + "learning_rate": 2.4403542926433183e-06, + "loss": 0.813, + "step": 17293 + }, + { + "epoch": 0.8625436408977556, + "grad_norm": 0.5712702288635098, + "learning_rate": 2.4386143008051447e-06, + "loss": 0.8496, + "step": 17294 + }, + { + "epoch": 0.8625935162094763, + "grad_norm": 0.5447824925537892, + "learning_rate": 2.436874897696234e-06, + "loss": 0.8296, + "step": 17295 + }, + { + "epoch": 0.862643391521197, + "grad_norm": 0.5836082432226918, + "learning_rate": 2.435136083361972e-06, + "loss": 0.8385, + "step": 17296 + }, + { + "epoch": 0.8626932668329177, + "grad_norm": 0.5248787422536998, + "learning_rate": 2.4333978578477386e-06, + "loss": 0.8005, + "step": 17297 + }, + { + "epoch": 0.8627431421446384, + "grad_norm": 0.5464220941978511, + "learning_rate": 2.4316602211988892e-06, + "loss": 0.8168, + "step": 17298 + }, + { + "epoch": 0.8627930174563591, + "grad_norm": 0.7225521441121083, + "learning_rate": 2.429923173460763e-06, + "loss": 0.8486, + "step": 17299 + }, + { + "epoch": 0.8628428927680798, + "grad_norm": 0.6299210308918664, + "learning_rate": 2.42818671467869e-06, + "loss": 0.847, + "step": 17300 + }, + { + "epoch": 0.8628927680798005, + "grad_norm": 0.7526979022045501, + "learning_rate": 2.42645084489799e-06, + "loss": 0.8482, + "step": 17301 + }, + { + "epoch": 0.8629426433915212, + "grad_norm": 0.5073404293506237, + "learning_rate": 2.4247155641639428e-06, + "loss": 0.8137, + "step": 17302 + }, + { + "epoch": 0.8629925187032419, + "grad_norm": 0.8507449557517905, + "learning_rate": 2.422980872521846e-06, + "loss": 0.8113, + "step": 17303 + }, + { + "epoch": 0.8630423940149626, + "grad_norm": 0.6238783746339273, + "learning_rate": 2.421246770016955e-06, + "loss": 0.7923, + "step": 17304 + }, + { + "epoch": 0.8630922693266833, + "grad_norm": 0.5864363965223783, + "learning_rate": 2.4195132566945307e-06, + "loss": 0.8322, + "step": 17305 + }, + { + "epoch": 0.863142144638404, + "grad_norm": 0.6343049381130004, + "learning_rate": 2.417780332599795e-06, + "loss": 0.8463, + "step": 17306 + }, + { + "epoch": 0.8631920199501247, + "grad_norm": 1.0920904555867297, + "learning_rate": 2.4160479977779813e-06, + "loss": 0.7892, + "step": 17307 + }, + { + "epoch": 0.8632418952618454, + "grad_norm": 0.8479746888437778, + "learning_rate": 2.414316252274282e-06, + "loss": 0.8888, + "step": 17308 + }, + { + "epoch": 0.8632917705735661, + "grad_norm": 0.8180180523178939, + "learning_rate": 2.4125850961338987e-06, + "loss": 0.8979, + "step": 17309 + }, + { + "epoch": 0.8633416458852868, + "grad_norm": 0.5691371240084552, + "learning_rate": 2.410854529401993e-06, + "loss": 0.8373, + "step": 17310 + }, + { + "epoch": 0.8633915211970075, + "grad_norm": 0.5844991684163048, + "learning_rate": 2.4091245521237317e-06, + "loss": 0.8541, + "step": 17311 + }, + { + "epoch": 0.8634413965087282, + "grad_norm": 0.48736013375952153, + "learning_rate": 2.407395164344253e-06, + "loss": 0.8242, + "step": 17312 + }, + { + "epoch": 0.8634912718204488, + "grad_norm": 0.9229058624571743, + "learning_rate": 2.4056663661086886e-06, + "loss": 0.8592, + "step": 17313 + }, + { + "epoch": 0.8635411471321696, + "grad_norm": 0.6231481364733942, + "learning_rate": 2.4039381574621435e-06, + "loss": 0.8589, + "step": 17314 + }, + { + "epoch": 0.8635910224438903, + "grad_norm": 0.5022807410274993, + "learning_rate": 2.4022105384497257e-06, + "loss": 0.8152, + "step": 17315 + }, + { + "epoch": 0.8636408977556109, + "grad_norm": 0.5828324923699058, + "learning_rate": 2.400483509116508e-06, + "loss": 0.8301, + "step": 17316 + }, + { + "epoch": 0.8636907730673317, + "grad_norm": 0.7389642935892708, + "learning_rate": 2.3987570695075593e-06, + "loss": 0.8013, + "step": 17317 + }, + { + "epoch": 0.8637406483790524, + "grad_norm": 0.856342834142706, + "learning_rate": 2.3970312196679252e-06, + "loss": 0.8747, + "step": 17318 + }, + { + "epoch": 0.863790523690773, + "grad_norm": 0.5524795604603927, + "learning_rate": 2.395305959642652e-06, + "loss": 0.8361, + "step": 17319 + }, + { + "epoch": 0.8638403990024938, + "grad_norm": 0.556014653367156, + "learning_rate": 2.3935812894767513e-06, + "loss": 0.8206, + "step": 17320 + }, + { + "epoch": 0.8638902743142145, + "grad_norm": 1.0197937110561255, + "learning_rate": 2.391857209215226e-06, + "loss": 0.8094, + "step": 17321 + }, + { + "epoch": 0.8639401496259351, + "grad_norm": 0.6129216547491669, + "learning_rate": 2.390133718903073e-06, + "loss": 0.8138, + "step": 17322 + }, + { + "epoch": 0.8639900249376559, + "grad_norm": 0.5514392473090249, + "learning_rate": 2.388410818585263e-06, + "loss": 0.8443, + "step": 17323 + }, + { + "epoch": 0.8640399002493766, + "grad_norm": 0.5820924521270858, + "learning_rate": 2.38668850830675e-06, + "loss": 0.838, + "step": 17324 + }, + { + "epoch": 0.8640897755610972, + "grad_norm": 0.6639085716003247, + "learning_rate": 2.3849667881124778e-06, + "loss": 0.8607, + "step": 17325 + }, + { + "epoch": 0.864139650872818, + "grad_norm": 0.5955091068968781, + "learning_rate": 2.383245658047381e-06, + "loss": 0.8933, + "step": 17326 + }, + { + "epoch": 0.8641895261845387, + "grad_norm": 0.5798813747849251, + "learning_rate": 2.3815251181563663e-06, + "loss": 0.8261, + "step": 17327 + }, + { + "epoch": 0.8642394014962593, + "grad_norm": 1.5459804520589877, + "learning_rate": 2.379805168484331e-06, + "loss": 0.8643, + "step": 17328 + }, + { + "epoch": 0.8642892768079801, + "grad_norm": 0.5664150635147784, + "learning_rate": 2.3780858090761498e-06, + "loss": 0.8475, + "step": 17329 + }, + { + "epoch": 0.8643391521197007, + "grad_norm": 0.907542350642933, + "learning_rate": 2.376367039976701e-06, + "loss": 0.8221, + "step": 17330 + }, + { + "epoch": 0.8643890274314214, + "grad_norm": 1.2243820853380984, + "learning_rate": 2.37464886123083e-06, + "loss": 0.8524, + "step": 17331 + }, + { + "epoch": 0.8644389027431422, + "grad_norm": 0.65487727116496, + "learning_rate": 2.372931272883372e-06, + "loss": 0.8207, + "step": 17332 + }, + { + "epoch": 0.8644887780548628, + "grad_norm": 0.624299753758379, + "learning_rate": 2.371214274979139e-06, + "loss": 0.8176, + "step": 17333 + }, + { + "epoch": 0.8645386533665835, + "grad_norm": 0.6235288075097094, + "learning_rate": 2.3694978675629478e-06, + "loss": 0.8328, + "step": 17334 + }, + { + "epoch": 0.8645885286783043, + "grad_norm": 0.5909432382940399, + "learning_rate": 2.3677820506795816e-06, + "loss": 0.8226, + "step": 17335 + }, + { + "epoch": 0.8646384039900249, + "grad_norm": 1.3201636058763526, + "learning_rate": 2.3660668243738164e-06, + "loss": 0.8285, + "step": 17336 + }, + { + "epoch": 0.8646882793017456, + "grad_norm": 0.5346254596665567, + "learning_rate": 2.3643521886903996e-06, + "loss": 0.8511, + "step": 17337 + }, + { + "epoch": 0.8647381546134664, + "grad_norm": 0.6907087807568931, + "learning_rate": 2.3626381436740926e-06, + "loss": 0.8692, + "step": 17338 + }, + { + "epoch": 0.864788029925187, + "grad_norm": 0.5323366100582744, + "learning_rate": 2.3609246893696035e-06, + "loss": 0.8268, + "step": 17339 + }, + { + "epoch": 0.8648379052369077, + "grad_norm": 0.7510060321124306, + "learning_rate": 2.3592118258216556e-06, + "loss": 0.8327, + "step": 17340 + }, + { + "epoch": 0.8648877805486285, + "grad_norm": 0.5866489255304045, + "learning_rate": 2.3574995530749377e-06, + "loss": 0.8564, + "step": 17341 + }, + { + "epoch": 0.8649376558603491, + "grad_norm": 0.8830064827604064, + "learning_rate": 2.3557878711741475e-06, + "loss": 0.8122, + "step": 17342 + }, + { + "epoch": 0.8649875311720698, + "grad_norm": 0.5167097122144324, + "learning_rate": 2.3540767801639272e-06, + "loss": 0.8214, + "step": 17343 + }, + { + "epoch": 0.8650374064837906, + "grad_norm": 0.5522041146002371, + "learning_rate": 2.352366280088944e-06, + "loss": 0.8616, + "step": 17344 + }, + { + "epoch": 0.8650872817955112, + "grad_norm": 0.644470000479148, + "learning_rate": 2.35065637099382e-06, + "loss": 0.8122, + "step": 17345 + }, + { + "epoch": 0.8651371571072319, + "grad_norm": 0.6765119778711007, + "learning_rate": 2.348947052923195e-06, + "loss": 0.8018, + "step": 17346 + }, + { + "epoch": 0.8651870324189527, + "grad_norm": 0.49236845150845016, + "learning_rate": 2.347238325921647e-06, + "loss": 0.8396, + "step": 17347 + }, + { + "epoch": 0.8652369077306733, + "grad_norm": 0.6084776552337554, + "learning_rate": 2.3455301900337816e-06, + "loss": 0.8053, + "step": 17348 + }, + { + "epoch": 0.865286783042394, + "grad_norm": 0.6481719349018337, + "learning_rate": 2.3438226453041668e-06, + "loss": 0.8318, + "step": 17349 + }, + { + "epoch": 0.8653366583541147, + "grad_norm": 0.6487809882924747, + "learning_rate": 2.3421156917773605e-06, + "loss": 0.7971, + "step": 17350 + }, + { + "epoch": 0.8653865336658354, + "grad_norm": 0.6549138543766075, + "learning_rate": 2.340409329497903e-06, + "loss": 0.8577, + "step": 17351 + }, + { + "epoch": 0.8654364089775561, + "grad_norm": 0.6616848774151576, + "learning_rate": 2.3387035585103244e-06, + "loss": 0.8145, + "step": 17352 + }, + { + "epoch": 0.8654862842892768, + "grad_norm": 0.58717527131486, + "learning_rate": 2.336998378859137e-06, + "loss": 0.8806, + "step": 17353 + }, + { + "epoch": 0.8655361596009975, + "grad_norm": 0.7310099342158555, + "learning_rate": 2.3352937905888328e-06, + "loss": 0.8351, + "step": 17354 + }, + { + "epoch": 0.8655860349127182, + "grad_norm": 0.6427872613527459, + "learning_rate": 2.33358979374389e-06, + "loss": 0.8437, + "step": 17355 + }, + { + "epoch": 0.8656359102244389, + "grad_norm": 0.5944318139649031, + "learning_rate": 2.3318863883687785e-06, + "loss": 0.8054, + "step": 17356 + }, + { + "epoch": 0.8656857855361596, + "grad_norm": 1.3300066447011902, + "learning_rate": 2.330183574507952e-06, + "loss": 0.8681, + "step": 17357 + }, + { + "epoch": 0.8657356608478803, + "grad_norm": 0.83717554413513, + "learning_rate": 2.32848135220583e-06, + "loss": 0.8298, + "step": 17358 + }, + { + "epoch": 0.865785536159601, + "grad_norm": 0.5670160652213475, + "learning_rate": 2.32677972150685e-06, + "loss": 0.8408, + "step": 17359 + }, + { + "epoch": 0.8658354114713217, + "grad_norm": 0.7790452129523154, + "learning_rate": 2.3250786824554034e-06, + "loss": 0.8651, + "step": 17360 + }, + { + "epoch": 0.8658852867830424, + "grad_norm": 0.7779809677937477, + "learning_rate": 2.32337823509588e-06, + "loss": 0.87, + "step": 17361 + }, + { + "epoch": 0.8659351620947631, + "grad_norm": 0.5331469856664336, + "learning_rate": 2.32167837947265e-06, + "loss": 0.8162, + "step": 17362 + }, + { + "epoch": 0.8659850374064838, + "grad_norm": 1.191955450519078, + "learning_rate": 2.3199791156300747e-06, + "loss": 0.8482, + "step": 17363 + }, + { + "epoch": 0.8660349127182045, + "grad_norm": 0.5081562925842883, + "learning_rate": 2.3182804436124974e-06, + "loss": 0.8417, + "step": 17364 + }, + { + "epoch": 0.8660847880299252, + "grad_norm": 0.612320975995408, + "learning_rate": 2.3165823634642398e-06, + "loss": 0.8412, + "step": 17365 + }, + { + "epoch": 0.8661346633416459, + "grad_norm": 0.5535858703446221, + "learning_rate": 2.3148848752296085e-06, + "loss": 0.8353, + "step": 17366 + }, + { + "epoch": 0.8661845386533665, + "grad_norm": 0.6059998397820208, + "learning_rate": 2.3131879789529103e-06, + "loss": 0.8668, + "step": 17367 + }, + { + "epoch": 0.8662344139650873, + "grad_norm": 0.5781378788943288, + "learning_rate": 2.311491674678418e-06, + "loss": 0.8655, + "step": 17368 + }, + { + "epoch": 0.866284289276808, + "grad_norm": 0.5274771558592231, + "learning_rate": 2.3097959624503952e-06, + "loss": 0.8616, + "step": 17369 + }, + { + "epoch": 0.8663341645885286, + "grad_norm": 0.6800363199802397, + "learning_rate": 2.308100842313088e-06, + "loss": 0.8205, + "step": 17370 + }, + { + "epoch": 0.8663840399002494, + "grad_norm": 0.729720391802823, + "learning_rate": 2.306406314310741e-06, + "loss": 0.7699, + "step": 17371 + }, + { + "epoch": 0.8664339152119701, + "grad_norm": 0.6521206092085899, + "learning_rate": 2.3047123784875585e-06, + "loss": 0.848, + "step": 17372 + }, + { + "epoch": 0.8664837905236907, + "grad_norm": 0.6552708171479245, + "learning_rate": 2.3030190348877515e-06, + "loss": 0.8406, + "step": 17373 + }, + { + "epoch": 0.8665336658354115, + "grad_norm": 0.6904946260575741, + "learning_rate": 2.301326283555502e-06, + "loss": 0.8432, + "step": 17374 + }, + { + "epoch": 0.8665835411471322, + "grad_norm": 0.5130778377361677, + "learning_rate": 2.299634124534994e-06, + "loss": 0.8358, + "step": 17375 + }, + { + "epoch": 0.8666334164588528, + "grad_norm": 0.6583212500350291, + "learning_rate": 2.2979425578703638e-06, + "loss": 0.83, + "step": 17376 + }, + { + "epoch": 0.8666832917705736, + "grad_norm": 0.584616913407119, + "learning_rate": 2.2962515836057685e-06, + "loss": 0.8115, + "step": 17377 + }, + { + "epoch": 0.8667331670822943, + "grad_norm": 0.5695510448351657, + "learning_rate": 2.2945612017853226e-06, + "loss": 0.8712, + "step": 17378 + }, + { + "epoch": 0.8667830423940149, + "grad_norm": 0.6144703902849612, + "learning_rate": 2.292871412453149e-06, + "loss": 0.8442, + "step": 17379 + }, + { + "epoch": 0.8668329177057357, + "grad_norm": 0.7063313978114489, + "learning_rate": 2.291182215653323e-06, + "loss": 0.839, + "step": 17380 + }, + { + "epoch": 0.8668827930174564, + "grad_norm": 0.5060510228940466, + "learning_rate": 2.289493611429941e-06, + "loss": 0.834, + "step": 17381 + }, + { + "epoch": 0.866932668329177, + "grad_norm": 0.7613696181779557, + "learning_rate": 2.287805599827053e-06, + "loss": 0.8443, + "step": 17382 + }, + { + "epoch": 0.8669825436408978, + "grad_norm": 0.5097691306623977, + "learning_rate": 2.2861181808887217e-06, + "loss": 0.8612, + "step": 17383 + }, + { + "epoch": 0.8670324189526184, + "grad_norm": 0.768049032587572, + "learning_rate": 2.2844313546589642e-06, + "loss": 0.852, + "step": 17384 + }, + { + "epoch": 0.8670822942643391, + "grad_norm": 0.6397004730770931, + "learning_rate": 2.282745121181809e-06, + "loss": 0.8233, + "step": 17385 + }, + { + "epoch": 0.8671321695760599, + "grad_norm": 0.5084215093460843, + "learning_rate": 2.281059480501252e-06, + "loss": 0.7707, + "step": 17386 + }, + { + "epoch": 0.8671820448877805, + "grad_norm": 0.5958817164496907, + "learning_rate": 2.2793744326612796e-06, + "loss": 0.8468, + "step": 17387 + }, + { + "epoch": 0.8672319201995012, + "grad_norm": 1.87914782216904, + "learning_rate": 2.2776899777058573e-06, + "loss": 0.8512, + "step": 17388 + }, + { + "epoch": 0.867281795511222, + "grad_norm": 0.634665680404544, + "learning_rate": 2.2760061156789525e-06, + "loss": 0.8309, + "step": 17389 + }, + { + "epoch": 0.8673316708229426, + "grad_norm": 0.5702190299555058, + "learning_rate": 2.2743228466244963e-06, + "loss": 0.8269, + "step": 17390 + }, + { + "epoch": 0.8673815461346633, + "grad_norm": 0.6348255516641336, + "learning_rate": 2.272640170586418e-06, + "loss": 0.8269, + "step": 17391 + }, + { + "epoch": 0.8674314214463841, + "grad_norm": 0.8429018823862564, + "learning_rate": 2.2709580876086157e-06, + "loss": 0.8816, + "step": 17392 + }, + { + "epoch": 0.8674812967581047, + "grad_norm": 0.5804389771421938, + "learning_rate": 2.2692765977349956e-06, + "loss": 0.8397, + "step": 17393 + }, + { + "epoch": 0.8675311720698254, + "grad_norm": 0.6064339928662121, + "learning_rate": 2.2675957010094284e-06, + "loss": 0.8196, + "step": 17394 + }, + { + "epoch": 0.8675810473815462, + "grad_norm": 0.5844512192873754, + "learning_rate": 2.2659153974757793e-06, + "loss": 0.8564, + "step": 17395 + }, + { + "epoch": 0.8676309226932668, + "grad_norm": 0.5646798652599071, + "learning_rate": 2.2642356871778873e-06, + "loss": 0.8133, + "step": 17396 + }, + { + "epoch": 0.8676807980049875, + "grad_norm": 0.6148574102738782, + "learning_rate": 2.262556570159596e-06, + "loss": 0.8214, + "step": 17397 + }, + { + "epoch": 0.8677306733167083, + "grad_norm": 0.5593485122199146, + "learning_rate": 2.260878046464715e-06, + "loss": 0.8496, + "step": 17398 + }, + { + "epoch": 0.8677805486284289, + "grad_norm": 0.5825632611977657, + "learning_rate": 2.2592001161370392e-06, + "loss": 0.8562, + "step": 17399 + }, + { + "epoch": 0.8678304239401496, + "grad_norm": 0.5428039176853071, + "learning_rate": 2.2575227792203642e-06, + "loss": 0.8382, + "step": 17400 + }, + { + "epoch": 0.8678802992518704, + "grad_norm": 0.9464447024275873, + "learning_rate": 2.2558460357584548e-06, + "loss": 0.817, + "step": 17401 + }, + { + "epoch": 0.867930174563591, + "grad_norm": 1.1383179144785314, + "learning_rate": 2.2541698857950627e-06, + "loss": 0.8308, + "step": 17402 + }, + { + "epoch": 0.8679800498753117, + "grad_norm": 0.5417399297795743, + "learning_rate": 2.252494329373922e-06, + "loss": 0.8421, + "step": 17403 + }, + { + "epoch": 0.8680299251870324, + "grad_norm": 0.6093357985720658, + "learning_rate": 2.2508193665387674e-06, + "loss": 0.8862, + "step": 17404 + }, + { + "epoch": 0.8680798004987531, + "grad_norm": 0.7027321384959597, + "learning_rate": 2.2491449973332997e-06, + "loss": 0.8645, + "step": 17405 + }, + { + "epoch": 0.8681296758104738, + "grad_norm": 0.8061461456238197, + "learning_rate": 2.2474712218012116e-06, + "loss": 0.7949, + "step": 17406 + }, + { + "epoch": 0.8681795511221945, + "grad_norm": 0.6671473511774219, + "learning_rate": 2.245798039986177e-06, + "loss": 0.8347, + "step": 17407 + }, + { + "epoch": 0.8682294264339152, + "grad_norm": 0.6672733605194158, + "learning_rate": 2.2441254519318666e-06, + "loss": 0.9066, + "step": 17408 + }, + { + "epoch": 0.868279301745636, + "grad_norm": 0.7290863275062484, + "learning_rate": 2.242453457681909e-06, + "loss": 0.8519, + "step": 17409 + }, + { + "epoch": 0.8683291770573566, + "grad_norm": 0.6238773485608472, + "learning_rate": 2.240782057279947e-06, + "loss": 0.8268, + "step": 17410 + }, + { + "epoch": 0.8683790523690773, + "grad_norm": 0.5465163107204042, + "learning_rate": 2.2391112507695877e-06, + "loss": 0.8371, + "step": 17411 + }, + { + "epoch": 0.868428927680798, + "grad_norm": 0.6390700427029229, + "learning_rate": 2.237441038194443e-06, + "loss": 0.8459, + "step": 17412 + }, + { + "epoch": 0.8684788029925187, + "grad_norm": 0.6123112767548833, + "learning_rate": 2.235771419598079e-06, + "loss": 0.8599, + "step": 17413 + }, + { + "epoch": 0.8685286783042394, + "grad_norm": 0.6594402569381136, + "learning_rate": 2.2341023950240738e-06, + "loss": 0.8586, + "step": 17414 + }, + { + "epoch": 0.8685785536159601, + "grad_norm": 0.7740385786144256, + "learning_rate": 2.232433964515976e-06, + "loss": 0.8236, + "step": 17415 + }, + { + "epoch": 0.8686284289276808, + "grad_norm": 0.5390952075775477, + "learning_rate": 2.2307661281173313e-06, + "loss": 0.8329, + "step": 17416 + }, + { + "epoch": 0.8686783042394015, + "grad_norm": 0.6473873392438988, + "learning_rate": 2.2290988858716474e-06, + "loss": 0.8479, + "step": 17417 + }, + { + "epoch": 0.8687281795511222, + "grad_norm": 0.583102655775185, + "learning_rate": 2.227432237822441e-06, + "loss": 0.8345, + "step": 17418 + }, + { + "epoch": 0.8687780548628429, + "grad_norm": 0.6136223277500397, + "learning_rate": 2.2257661840132007e-06, + "loss": 0.8109, + "step": 17419 + }, + { + "epoch": 0.8688279301745636, + "grad_norm": 0.6164897331296046, + "learning_rate": 2.2241007244873964e-06, + "loss": 0.8487, + "step": 17420 + }, + { + "epoch": 0.8688778054862842, + "grad_norm": 0.5504551430993707, + "learning_rate": 2.2224358592884853e-06, + "loss": 0.8508, + "step": 17421 + }, + { + "epoch": 0.868927680798005, + "grad_norm": 0.5720988351592388, + "learning_rate": 2.2207715884599243e-06, + "loss": 0.8753, + "step": 17422 + }, + { + "epoch": 0.8689775561097257, + "grad_norm": 0.5297913236773378, + "learning_rate": 2.219107912045132e-06, + "loss": 0.8153, + "step": 17423 + }, + { + "epoch": 0.8690274314214463, + "grad_norm": 0.6234074645697025, + "learning_rate": 2.217444830087523e-06, + "loss": 0.8514, + "step": 17424 + }, + { + "epoch": 0.8690773067331671, + "grad_norm": 0.5317753083296543, + "learning_rate": 2.215782342630493e-06, + "loss": 0.8475, + "step": 17425 + }, + { + "epoch": 0.8691271820448878, + "grad_norm": 0.720012656900341, + "learning_rate": 2.2141204497174272e-06, + "loss": 0.8271, + "step": 17426 + }, + { + "epoch": 0.8691770573566084, + "grad_norm": 0.6012565315047811, + "learning_rate": 2.2124591513916936e-06, + "loss": 0.8777, + "step": 17427 + }, + { + "epoch": 0.8692269326683292, + "grad_norm": 0.9955820726936534, + "learning_rate": 2.210798447696638e-06, + "loss": 0.8135, + "step": 17428 + }, + { + "epoch": 0.8692768079800499, + "grad_norm": 0.6090387311143451, + "learning_rate": 2.2091383386755927e-06, + "loss": 0.8309, + "step": 17429 + }, + { + "epoch": 0.8693266832917705, + "grad_norm": 3.2868373495768886, + "learning_rate": 2.2074788243718893e-06, + "loss": 0.8716, + "step": 17430 + }, + { + "epoch": 0.8693765586034913, + "grad_norm": 0.5712543515926443, + "learning_rate": 2.2058199048288238e-06, + "loss": 0.8713, + "step": 17431 + }, + { + "epoch": 0.869426433915212, + "grad_norm": 0.5818151329760697, + "learning_rate": 2.2041615800896864e-06, + "loss": 0.8423, + "step": 17432 + }, + { + "epoch": 0.8694763092269326, + "grad_norm": 0.5511639297529379, + "learning_rate": 2.2025038501977486e-06, + "loss": 0.8509, + "step": 17433 + }, + { + "epoch": 0.8695261845386534, + "grad_norm": 0.552291103201478, + "learning_rate": 2.2008467151962727e-06, + "loss": 0.8428, + "step": 17434 + }, + { + "epoch": 0.8695760598503741, + "grad_norm": 0.5530650412519884, + "learning_rate": 2.199190175128496e-06, + "loss": 0.8066, + "step": 17435 + }, + { + "epoch": 0.8696259351620947, + "grad_norm": 0.6258991717029232, + "learning_rate": 2.1975342300376455e-06, + "loss": 0.8767, + "step": 17436 + }, + { + "epoch": 0.8696758104738155, + "grad_norm": 0.6092098897503762, + "learning_rate": 2.1958788799669395e-06, + "loss": 0.8458, + "step": 17437 + }, + { + "epoch": 0.8697256857855361, + "grad_norm": 0.5634510680183205, + "learning_rate": 2.194224124959565e-06, + "loss": 0.8334, + "step": 17438 + }, + { + "epoch": 0.8697755610972568, + "grad_norm": 0.7349481172973129, + "learning_rate": 2.1925699650587076e-06, + "loss": 0.8852, + "step": 17439 + }, + { + "epoch": 0.8698254364089776, + "grad_norm": 0.5699736780725136, + "learning_rate": 2.190916400307527e-06, + "loss": 0.8794, + "step": 17440 + }, + { + "epoch": 0.8698753117206982, + "grad_norm": 0.5777608586721263, + "learning_rate": 2.189263430749178e-06, + "loss": 0.8661, + "step": 17441 + }, + { + "epoch": 0.869925187032419, + "grad_norm": 3.073187785251672, + "learning_rate": 2.187611056426789e-06, + "loss": 0.8414, + "step": 17442 + }, + { + "epoch": 0.8699750623441397, + "grad_norm": 0.6518986328214517, + "learning_rate": 2.1859592773834813e-06, + "loss": 0.8382, + "step": 17443 + }, + { + "epoch": 0.8700249376558603, + "grad_norm": 0.7482242834606304, + "learning_rate": 2.184308093662352e-06, + "loss": 0.8608, + "step": 17444 + }, + { + "epoch": 0.870074812967581, + "grad_norm": 0.5270258408403828, + "learning_rate": 2.182657505306501e-06, + "loss": 0.8651, + "step": 17445 + }, + { + "epoch": 0.8701246882793018, + "grad_norm": 0.6599297245920495, + "learning_rate": 2.181007512358982e-06, + "loss": 0.8122, + "step": 17446 + }, + { + "epoch": 0.8701745635910224, + "grad_norm": 0.4754885820775469, + "learning_rate": 2.1793581148628616e-06, + "loss": 0.8429, + "step": 17447 + }, + { + "epoch": 0.8702244389027431, + "grad_norm": 0.5388857996347509, + "learning_rate": 2.1777093128611757e-06, + "loss": 0.8566, + "step": 17448 + }, + { + "epoch": 0.8702743142144639, + "grad_norm": 0.5697390462502618, + "learning_rate": 2.176061106396962e-06, + "loss": 0.8427, + "step": 17449 + }, + { + "epoch": 0.8703241895261845, + "grad_norm": 0.5507959974479761, + "learning_rate": 2.174413495513208e-06, + "loss": 0.8387, + "step": 17450 + }, + { + "epoch": 0.8703740648379052, + "grad_norm": 0.6174772019322569, + "learning_rate": 2.1727664802529216e-06, + "loss": 0.826, + "step": 17451 + }, + { + "epoch": 0.870423940149626, + "grad_norm": 0.5412787997982416, + "learning_rate": 2.1711200606590765e-06, + "loss": 0.8625, + "step": 17452 + }, + { + "epoch": 0.8704738154613466, + "grad_norm": 0.6305348880582133, + "learning_rate": 2.1694742367746433e-06, + "loss": 0.867, + "step": 17453 + }, + { + "epoch": 0.8705236907730673, + "grad_norm": 0.7112945578394426, + "learning_rate": 2.1678290086425524e-06, + "loss": 0.8357, + "step": 17454 + }, + { + "epoch": 0.870573566084788, + "grad_norm": 0.6191679608666574, + "learning_rate": 2.166184376305752e-06, + "loss": 0.8494, + "step": 17455 + }, + { + "epoch": 0.8706234413965087, + "grad_norm": 0.6058408750228594, + "learning_rate": 2.16454033980715e-06, + "loss": 0.8541, + "step": 17456 + }, + { + "epoch": 0.8706733167082295, + "grad_norm": 0.6438174702099684, + "learning_rate": 2.162896899189648e-06, + "loss": 0.8411, + "step": 17457 + }, + { + "epoch": 0.8707231920199501, + "grad_norm": 0.6393692770042023, + "learning_rate": 2.1612540544961258e-06, + "loss": 0.8362, + "step": 17458 + }, + { + "epoch": 0.8707730673316708, + "grad_norm": 0.6319580363197235, + "learning_rate": 2.1596118057694655e-06, + "loss": 0.8486, + "step": 17459 + }, + { + "epoch": 0.8708229426433916, + "grad_norm": 0.5332787434654092, + "learning_rate": 2.1579701530525105e-06, + "loss": 0.8012, + "step": 17460 + }, + { + "epoch": 0.8708728179551122, + "grad_norm": 0.5419645989467888, + "learning_rate": 2.1563290963881017e-06, + "loss": 0.821, + "step": 17461 + }, + { + "epoch": 0.8709226932668329, + "grad_norm": 0.732692096763767, + "learning_rate": 2.154688635819055e-06, + "loss": 0.8047, + "step": 17462 + }, + { + "epoch": 0.8709725685785537, + "grad_norm": 0.6851110979941538, + "learning_rate": 2.1530487713881913e-06, + "loss": 0.8386, + "step": 17463 + }, + { + "epoch": 0.8710224438902743, + "grad_norm": 0.6401151751881479, + "learning_rate": 2.1514095031382935e-06, + "loss": 0.8717, + "step": 17464 + }, + { + "epoch": 0.871072319201995, + "grad_norm": 0.9141198494140428, + "learning_rate": 2.149770831112141e-06, + "loss": 0.8401, + "step": 17465 + }, + { + "epoch": 0.8711221945137158, + "grad_norm": 0.7797240764109494, + "learning_rate": 2.148132755352486e-06, + "loss": 0.8665, + "step": 17466 + }, + { + "epoch": 0.8711720698254364, + "grad_norm": 0.6005887297161571, + "learning_rate": 2.1464952759020855e-06, + "loss": 0.8292, + "step": 17467 + }, + { + "epoch": 0.8712219451371571, + "grad_norm": 0.6787965783828669, + "learning_rate": 2.1448583928036613e-06, + "loss": 0.8761, + "step": 17468 + }, + { + "epoch": 0.8712718204488779, + "grad_norm": 0.5652331351794284, + "learning_rate": 2.143222106099932e-06, + "loss": 0.8446, + "step": 17469 + }, + { + "epoch": 0.8713216957605985, + "grad_norm": 0.5306798481035178, + "learning_rate": 2.1415864158335854e-06, + "loss": 0.8711, + "step": 17470 + }, + { + "epoch": 0.8713715710723192, + "grad_norm": 0.5775428661580932, + "learning_rate": 2.139951322047315e-06, + "loss": 0.8402, + "step": 17471 + }, + { + "epoch": 0.87142144638404, + "grad_norm": 0.7514641927420393, + "learning_rate": 2.138316824783784e-06, + "loss": 0.8887, + "step": 17472 + }, + { + "epoch": 0.8714713216957606, + "grad_norm": 0.6713376866360118, + "learning_rate": 2.1366829240856423e-06, + "loss": 0.835, + "step": 17473 + }, + { + "epoch": 0.8715211970074813, + "grad_norm": 0.9100956964343777, + "learning_rate": 2.13504961999553e-06, + "loss": 0.868, + "step": 17474 + }, + { + "epoch": 0.871571072319202, + "grad_norm": 0.5663081222962385, + "learning_rate": 2.1334169125560633e-06, + "loss": 0.8064, + "step": 17475 + }, + { + "epoch": 0.8716209476309227, + "grad_norm": 0.5896921958415451, + "learning_rate": 2.1317848018098498e-06, + "loss": 0.8253, + "step": 17476 + }, + { + "epoch": 0.8716708229426434, + "grad_norm": 0.7110090458072333, + "learning_rate": 2.1301532877994746e-06, + "loss": 0.8254, + "step": 17477 + }, + { + "epoch": 0.871720698254364, + "grad_norm": 0.6665061726447399, + "learning_rate": 2.1285223705675206e-06, + "loss": 0.819, + "step": 17478 + }, + { + "epoch": 0.8717705735660848, + "grad_norm": 0.8640287239409599, + "learning_rate": 2.1268920501565314e-06, + "loss": 0.8303, + "step": 17479 + }, + { + "epoch": 0.8718204488778055, + "grad_norm": 0.5547629088763076, + "learning_rate": 2.125262326609062e-06, + "loss": 0.838, + "step": 17480 + }, + { + "epoch": 0.8718703241895261, + "grad_norm": 0.8280486800323063, + "learning_rate": 2.123633199967631e-06, + "loss": 0.8324, + "step": 17481 + }, + { + "epoch": 0.8719201995012469, + "grad_norm": 0.6640960973074781, + "learning_rate": 2.122004670274763e-06, + "loss": 0.8693, + "step": 17482 + }, + { + "epoch": 0.8719700748129676, + "grad_norm": 0.817115924475317, + "learning_rate": 2.120376737572932e-06, + "loss": 0.8358, + "step": 17483 + }, + { + "epoch": 0.8720199501246882, + "grad_norm": 0.6076397531610944, + "learning_rate": 2.1187494019046376e-06, + "loss": 0.8492, + "step": 17484 + }, + { + "epoch": 0.872069825436409, + "grad_norm": 0.8154925048626979, + "learning_rate": 2.117122663312332e-06, + "loss": 0.8387, + "step": 17485 + }, + { + "epoch": 0.8721197007481297, + "grad_norm": 0.5580805988942583, + "learning_rate": 2.1154965218384808e-06, + "loss": 0.8393, + "step": 17486 + }, + { + "epoch": 0.8721695760598503, + "grad_norm": 0.5483700227733799, + "learning_rate": 2.113870977525495e-06, + "loss": 0.8517, + "step": 17487 + }, + { + "epoch": 0.8722194513715711, + "grad_norm": 0.6444393074694842, + "learning_rate": 2.112246030415807e-06, + "loss": 0.8339, + "step": 17488 + }, + { + "epoch": 0.8722693266832918, + "grad_norm": 0.5733228761091633, + "learning_rate": 2.1106216805518136e-06, + "loss": 0.847, + "step": 17489 + }, + { + "epoch": 0.8723192019950124, + "grad_norm": 0.6225948436634965, + "learning_rate": 2.108997927975909e-06, + "loss": 0.857, + "step": 17490 + }, + { + "epoch": 0.8723690773067332, + "grad_norm": 0.5677314817258047, + "learning_rate": 2.1073747727304527e-06, + "loss": 0.8162, + "step": 17491 + }, + { + "epoch": 0.8724189526184538, + "grad_norm": 0.5508191000415196, + "learning_rate": 2.1057522148578092e-06, + "loss": 0.8553, + "step": 17492 + }, + { + "epoch": 0.8724688279301746, + "grad_norm": 0.5768031014660895, + "learning_rate": 2.104130254400316e-06, + "loss": 0.8417, + "step": 17493 + }, + { + "epoch": 0.8725187032418953, + "grad_norm": 0.5728902506769699, + "learning_rate": 2.1025088914002984e-06, + "loss": 0.8635, + "step": 17494 + }, + { + "epoch": 0.8725685785536159, + "grad_norm": 0.5047627615743975, + "learning_rate": 2.1008881259000577e-06, + "loss": 0.84, + "step": 17495 + }, + { + "epoch": 0.8726184538653367, + "grad_norm": 0.5638822886756708, + "learning_rate": 2.0992679579418973e-06, + "loss": 0.863, + "step": 17496 + }, + { + "epoch": 0.8726683291770574, + "grad_norm": 0.6891760141916359, + "learning_rate": 2.0976483875680935e-06, + "loss": 0.8764, + "step": 17497 + }, + { + "epoch": 0.872718204488778, + "grad_norm": 1.4495215840008613, + "learning_rate": 2.096029414820902e-06, + "loss": 0.8967, + "step": 17498 + }, + { + "epoch": 0.8727680798004988, + "grad_norm": 0.7549219444738605, + "learning_rate": 2.0944110397425694e-06, + "loss": 0.8516, + "step": 17499 + }, + { + "epoch": 0.8728179551122195, + "grad_norm": 0.5884105599746178, + "learning_rate": 2.092793262375334e-06, + "loss": 0.8596, + "step": 17500 + }, + { + "epoch": 0.8728678304239401, + "grad_norm": 0.9721847361284897, + "learning_rate": 2.0911760827614037e-06, + "loss": 0.8486, + "step": 17501 + }, + { + "epoch": 0.8729177057356609, + "grad_norm": 0.7693431311760055, + "learning_rate": 2.0895595009429835e-06, + "loss": 0.8246, + "step": 17502 + }, + { + "epoch": 0.8729675810473816, + "grad_norm": 0.6057457775266923, + "learning_rate": 2.0879435169622513e-06, + "loss": 0.8684, + "step": 17503 + }, + { + "epoch": 0.8730174563591022, + "grad_norm": 0.622499087985682, + "learning_rate": 2.0863281308613813e-06, + "loss": 0.8264, + "step": 17504 + }, + { + "epoch": 0.873067331670823, + "grad_norm": 0.5270790901824678, + "learning_rate": 2.084713342682523e-06, + "loss": 0.7927, + "step": 17505 + }, + { + "epoch": 0.8731172069825437, + "grad_norm": 0.6770360528295472, + "learning_rate": 2.0830991524678183e-06, + "loss": 0.8205, + "step": 17506 + }, + { + "epoch": 0.8731670822942643, + "grad_norm": 0.9285435336454548, + "learning_rate": 2.0814855602593776e-06, + "loss": 0.8356, + "step": 17507 + }, + { + "epoch": 0.8732169576059851, + "grad_norm": 0.5737585066792698, + "learning_rate": 2.0798725660993197e-06, + "loss": 0.8422, + "step": 17508 + }, + { + "epoch": 0.8732668329177057, + "grad_norm": 0.631150002172637, + "learning_rate": 2.0782601700297277e-06, + "loss": 0.8157, + "step": 17509 + }, + { + "epoch": 0.8733167082294264, + "grad_norm": 0.7223338753629073, + "learning_rate": 2.0766483720926822e-06, + "loss": 0.8585, + "step": 17510 + }, + { + "epoch": 0.8733665835411472, + "grad_norm": 0.8302431561173368, + "learning_rate": 2.07503717233023e-06, + "loss": 0.8244, + "step": 17511 + }, + { + "epoch": 0.8734164588528678, + "grad_norm": 0.9078370615942248, + "learning_rate": 2.0734265707844287e-06, + "loss": 0.8541, + "step": 17512 + }, + { + "epoch": 0.8734663341645885, + "grad_norm": 0.6868648716742154, + "learning_rate": 2.0718165674973005e-06, + "loss": 0.8829, + "step": 17513 + }, + { + "epoch": 0.8735162094763093, + "grad_norm": 0.6701262784686172, + "learning_rate": 2.0702071625108537e-06, + "loss": 0.8299, + "step": 17514 + }, + { + "epoch": 0.8735660847880299, + "grad_norm": 0.6165415764280969, + "learning_rate": 2.0685983558670962e-06, + "loss": 0.8507, + "step": 17515 + }, + { + "epoch": 0.8736159600997506, + "grad_norm": 0.5551286259849213, + "learning_rate": 2.0669901476079945e-06, + "loss": 0.8277, + "step": 17516 + }, + { + "epoch": 0.8736658354114714, + "grad_norm": 0.5273624519366396, + "learning_rate": 2.065382537775526e-06, + "loss": 0.8554, + "step": 17517 + }, + { + "epoch": 0.873715710723192, + "grad_norm": 0.8706885259412179, + "learning_rate": 2.063775526411629e-06, + "loss": 0.8277, + "step": 17518 + }, + { + "epoch": 0.8737655860349127, + "grad_norm": 0.5552107347401272, + "learning_rate": 2.062169113558257e-06, + "loss": 0.837, + "step": 17519 + }, + { + "epoch": 0.8738154613466335, + "grad_norm": 0.5529569772222318, + "learning_rate": 2.060563299257304e-06, + "loss": 0.8389, + "step": 17520 + }, + { + "epoch": 0.8738653366583541, + "grad_norm": 0.8104329242824942, + "learning_rate": 2.058958083550694e-06, + "loss": 0.8581, + "step": 17521 + }, + { + "epoch": 0.8739152119700748, + "grad_norm": 0.6110370228556644, + "learning_rate": 2.057353466480297e-06, + "loss": 0.8366, + "step": 17522 + }, + { + "epoch": 0.8739650872817956, + "grad_norm": 0.6784019959048441, + "learning_rate": 2.055749448088004e-06, + "loss": 0.8464, + "step": 17523 + }, + { + "epoch": 0.8740149625935162, + "grad_norm": 0.5226190996269081, + "learning_rate": 2.0541460284156544e-06, + "loss": 0.8076, + "step": 17524 + }, + { + "epoch": 0.8740648379052369, + "grad_norm": 0.6330555753794874, + "learning_rate": 2.052543207505095e-06, + "loss": 0.789, + "step": 17525 + }, + { + "epoch": 0.8741147132169576, + "grad_norm": 0.7088267137635104, + "learning_rate": 2.050940985398153e-06, + "loss": 0.8297, + "step": 17526 + }, + { + "epoch": 0.8741645885286783, + "grad_norm": 0.5618937453137007, + "learning_rate": 2.0493393621366373e-06, + "loss": 0.8137, + "step": 17527 + }, + { + "epoch": 0.874214463840399, + "grad_norm": 0.7296807210673005, + "learning_rate": 2.047738337762331e-06, + "loss": 0.8105, + "step": 17528 + }, + { + "epoch": 0.8742643391521197, + "grad_norm": 0.6371077069371114, + "learning_rate": 2.0461379123170284e-06, + "loss": 0.8093, + "step": 17529 + }, + { + "epoch": 0.8743142144638404, + "grad_norm": 0.7793913683869723, + "learning_rate": 2.044538085842482e-06, + "loss": 0.8596, + "step": 17530 + }, + { + "epoch": 0.8743640897755611, + "grad_norm": 0.593831522264606, + "learning_rate": 2.0429388583804416e-06, + "loss": 0.8651, + "step": 17531 + }, + { + "epoch": 0.8744139650872818, + "grad_norm": 0.5643315254881648, + "learning_rate": 2.0413402299726327e-06, + "loss": 0.841, + "step": 17532 + }, + { + "epoch": 0.8744638403990025, + "grad_norm": 0.6658382173821533, + "learning_rate": 2.0397422006607826e-06, + "loss": 0.8179, + "step": 17533 + }, + { + "epoch": 0.8745137157107232, + "grad_norm": 0.6160959883901177, + "learning_rate": 2.03814477048658e-06, + "loss": 0.8587, + "step": 17534 + }, + { + "epoch": 0.8745635910224439, + "grad_norm": 0.6339968972508525, + "learning_rate": 2.0365479394917147e-06, + "loss": 0.852, + "step": 17535 + }, + { + "epoch": 0.8746134663341646, + "grad_norm": 1.0589297013782786, + "learning_rate": 2.0349517077178497e-06, + "loss": 0.8688, + "step": 17536 + }, + { + "epoch": 0.8746633416458853, + "grad_norm": 0.7668417264113547, + "learning_rate": 2.033356075206652e-06, + "loss": 0.8547, + "step": 17537 + }, + { + "epoch": 0.874713216957606, + "grad_norm": 0.5997739664745759, + "learning_rate": 2.0317610419997384e-06, + "loss": 0.881, + "step": 17538 + }, + { + "epoch": 0.8747630922693267, + "grad_norm": 0.5807823641054344, + "learning_rate": 2.030166608138748e-06, + "loss": 0.8065, + "step": 17539 + }, + { + "epoch": 0.8748129675810474, + "grad_norm": 0.7037152727599797, + "learning_rate": 2.0285727736652742e-06, + "loss": 0.8453, + "step": 17540 + }, + { + "epoch": 0.8748628428927681, + "grad_norm": 0.6581857739847263, + "learning_rate": 2.0269795386209157e-06, + "loss": 0.854, + "step": 17541 + }, + { + "epoch": 0.8749127182044888, + "grad_norm": 0.5854455680725247, + "learning_rate": 2.025386903047247e-06, + "loss": 0.8116, + "step": 17542 + }, + { + "epoch": 0.8749625935162095, + "grad_norm": 0.586547087325336, + "learning_rate": 2.023794866985823e-06, + "loss": 0.8287, + "step": 17543 + }, + { + "epoch": 0.8750124688279302, + "grad_norm": 0.8427895641096007, + "learning_rate": 2.0222034304781868e-06, + "loss": 0.8519, + "step": 17544 + }, + { + "epoch": 0.8750623441396509, + "grad_norm": 0.6026242424484467, + "learning_rate": 2.020612593565874e-06, + "loss": 0.862, + "step": 17545 + }, + { + "epoch": 0.8751122194513715, + "grad_norm": 0.5865286973016759, + "learning_rate": 2.019022356290387e-06, + "loss": 0.8299, + "step": 17546 + }, + { + "epoch": 0.8751620947630923, + "grad_norm": 0.522523046471202, + "learning_rate": 2.0174327186932294e-06, + "loss": 0.816, + "step": 17547 + }, + { + "epoch": 0.875211970074813, + "grad_norm": 0.7176900280689635, + "learning_rate": 2.0158436808158764e-06, + "loss": 0.8507, + "step": 17548 + }, + { + "epoch": 0.8752618453865336, + "grad_norm": 1.7163612608595642, + "learning_rate": 2.0142552426998e-06, + "loss": 0.7714, + "step": 17549 + }, + { + "epoch": 0.8753117206982544, + "grad_norm": 0.5769158675304539, + "learning_rate": 2.0126674043864424e-06, + "loss": 0.8409, + "step": 17550 + }, + { + "epoch": 0.8753615960099751, + "grad_norm": 0.6777414544155684, + "learning_rate": 2.0110801659172397e-06, + "loss": 0.871, + "step": 17551 + }, + { + "epoch": 0.8754114713216957, + "grad_norm": 0.5172126191545973, + "learning_rate": 2.0094935273336195e-06, + "loss": 0.8154, + "step": 17552 + }, + { + "epoch": 0.8754613466334165, + "grad_norm": 0.7158229781511032, + "learning_rate": 2.007907488676969e-06, + "loss": 0.8544, + "step": 17553 + }, + { + "epoch": 0.8755112219451372, + "grad_norm": 0.9672451409132401, + "learning_rate": 2.0063220499886874e-06, + "loss": 0.8382, + "step": 17554 + }, + { + "epoch": 0.8755610972568578, + "grad_norm": 0.584430551061109, + "learning_rate": 2.0047372113101344e-06, + "loss": 0.8548, + "step": 17555 + }, + { + "epoch": 0.8756109725685786, + "grad_norm": 0.686200693495326, + "learning_rate": 2.0031529726826842e-06, + "loss": 0.8468, + "step": 17556 + }, + { + "epoch": 0.8756608478802993, + "grad_norm": 0.6466511048584054, + "learning_rate": 2.0015693341476543e-06, + "loss": 0.7947, + "step": 17557 + }, + { + "epoch": 0.8757107231920199, + "grad_norm": 0.5920964041290803, + "learning_rate": 1.9999862957463837e-06, + "loss": 0.8598, + "step": 17558 + }, + { + "epoch": 0.8757605985037407, + "grad_norm": 0.7569359906106302, + "learning_rate": 1.9984038575201758e-06, + "loss": 0.7862, + "step": 17559 + }, + { + "epoch": 0.8758104738154614, + "grad_norm": 0.5988606536375952, + "learning_rate": 1.9968220195103306e-06, + "loss": 0.8355, + "step": 17560 + }, + { + "epoch": 0.875860349127182, + "grad_norm": 0.6512393963819582, + "learning_rate": 1.995240781758112e-06, + "loss": 0.8166, + "step": 17561 + }, + { + "epoch": 0.8759102244389028, + "grad_norm": 0.669293367534208, + "learning_rate": 1.993660144304796e-06, + "loss": 0.8772, + "step": 17562 + }, + { + "epoch": 0.8759600997506234, + "grad_norm": 0.6073874823967552, + "learning_rate": 1.992080107191621e-06, + "loss": 0.8631, + "step": 17563 + }, + { + "epoch": 0.8760099750623441, + "grad_norm": 0.5699294456871742, + "learning_rate": 1.990500670459816e-06, + "loss": 0.8381, + "step": 17564 + }, + { + "epoch": 0.8760598503740649, + "grad_norm": 0.6062521024510201, + "learning_rate": 1.9889218341505977e-06, + "loss": 0.8886, + "step": 17565 + }, + { + "epoch": 0.8761097256857855, + "grad_norm": 0.807520621252123, + "learning_rate": 1.987343598305169e-06, + "loss": 0.8352, + "step": 17566 + }, + { + "epoch": 0.8761596009975062, + "grad_norm": 0.8096034357979113, + "learning_rate": 1.9857659629647084e-06, + "loss": 0.8276, + "step": 17567 + }, + { + "epoch": 0.876209476309227, + "grad_norm": 0.6044391171402151, + "learning_rate": 1.9841889281703884e-06, + "loss": 0.8656, + "step": 17568 + }, + { + "epoch": 0.8762593516209476, + "grad_norm": 0.50651918937532, + "learning_rate": 1.982612493963351e-06, + "loss": 0.8214, + "step": 17569 + }, + { + "epoch": 0.8763092269326683, + "grad_norm": 0.7637209602485209, + "learning_rate": 1.981036660384744e-06, + "loss": 0.8308, + "step": 17570 + }, + { + "epoch": 0.8763591022443891, + "grad_norm": 0.6110790329135208, + "learning_rate": 1.9794614274756816e-06, + "loss": 0.8283, + "step": 17571 + }, + { + "epoch": 0.8764089775561097, + "grad_norm": 0.6334579675045195, + "learning_rate": 1.9778867952772726e-06, + "loss": 0.8283, + "step": 17572 + }, + { + "epoch": 0.8764588528678304, + "grad_norm": 0.5872254590851114, + "learning_rate": 1.976312763830601e-06, + "loss": 0.8515, + "step": 17573 + }, + { + "epoch": 0.8765087281795512, + "grad_norm": 0.5271025740784594, + "learning_rate": 1.9747393331767506e-06, + "loss": 0.8419, + "step": 17574 + }, + { + "epoch": 0.8765586034912718, + "grad_norm": 0.762338868444666, + "learning_rate": 1.9731665033567658e-06, + "loss": 0.867, + "step": 17575 + }, + { + "epoch": 0.8766084788029925, + "grad_norm": 0.7759250824430299, + "learning_rate": 1.971594274411698e-06, + "loss": 0.9067, + "step": 17576 + }, + { + "epoch": 0.8766583541147133, + "grad_norm": 0.5338106152033057, + "learning_rate": 1.9700226463825688e-06, + "loss": 0.8294, + "step": 17577 + }, + { + "epoch": 0.8767082294264339, + "grad_norm": 0.5969230769692834, + "learning_rate": 1.9684516193103963e-06, + "loss": 0.8768, + "step": 17578 + }, + { + "epoch": 0.8767581047381546, + "grad_norm": 0.5881992133189544, + "learning_rate": 1.9668811932361698e-06, + "loss": 0.9106, + "step": 17579 + }, + { + "epoch": 0.8768079800498753, + "grad_norm": 0.714130729528923, + "learning_rate": 1.96531136820087e-06, + "loss": 0.8367, + "step": 17580 + }, + { + "epoch": 0.876857855361596, + "grad_norm": 0.5925973729188923, + "learning_rate": 1.963742144245456e-06, + "loss": 0.8596, + "step": 17581 + }, + { + "epoch": 0.8769077306733167, + "grad_norm": 0.7898095418681977, + "learning_rate": 1.9621735214108872e-06, + "loss": 0.7915, + "step": 17582 + }, + { + "epoch": 0.8769576059850374, + "grad_norm": 0.7280916714322436, + "learning_rate": 1.9606054997380914e-06, + "loss": 0.8422, + "step": 17583 + }, + { + "epoch": 0.8770074812967581, + "grad_norm": 0.733408371632783, + "learning_rate": 1.95903807926798e-06, + "loss": 0.8482, + "step": 17584 + }, + { + "epoch": 0.8770573566084788, + "grad_norm": 0.551281925140793, + "learning_rate": 1.9574712600414604e-06, + "loss": 0.8061, + "step": 17585 + }, + { + "epoch": 0.8771072319201995, + "grad_norm": 0.8975716494446985, + "learning_rate": 1.9559050420994125e-06, + "loss": 0.847, + "step": 17586 + }, + { + "epoch": 0.8771571072319202, + "grad_norm": 1.3231174799699839, + "learning_rate": 1.9543394254827125e-06, + "loss": 0.839, + "step": 17587 + }, + { + "epoch": 0.8772069825436409, + "grad_norm": 0.5218041067740783, + "learning_rate": 1.952774410232208e-06, + "loss": 0.7868, + "step": 17588 + }, + { + "epoch": 0.8772568578553616, + "grad_norm": 0.6167172719809159, + "learning_rate": 1.9512099963887478e-06, + "loss": 0.825, + "step": 17589 + }, + { + "epoch": 0.8773067331670823, + "grad_norm": 0.6059759794486828, + "learning_rate": 1.9496461839931396e-06, + "loss": 0.8333, + "step": 17590 + }, + { + "epoch": 0.877356608478803, + "grad_norm": 0.5865460628468441, + "learning_rate": 1.9480829730862017e-06, + "loss": 0.8382, + "step": 17591 + }, + { + "epoch": 0.8774064837905237, + "grad_norm": 0.6697597556330414, + "learning_rate": 1.9465203637087177e-06, + "loss": 0.8282, + "step": 17592 + }, + { + "epoch": 0.8774563591022444, + "grad_norm": 0.5093511028530955, + "learning_rate": 1.944958355901477e-06, + "loss": 0.7687, + "step": 17593 + }, + { + "epoch": 0.8775062344139651, + "grad_norm": 0.6796990130876147, + "learning_rate": 1.9433969497052203e-06, + "loss": 0.8316, + "step": 17594 + }, + { + "epoch": 0.8775561097256858, + "grad_norm": 0.628111560381721, + "learning_rate": 1.9418361451607054e-06, + "loss": 0.8406, + "step": 17595 + }, + { + "epoch": 0.8776059850374065, + "grad_norm": 0.5884069146781405, + "learning_rate": 1.940275942308653e-06, + "loss": 0.8519, + "step": 17596 + }, + { + "epoch": 0.8776558603491272, + "grad_norm": 0.5864155970315117, + "learning_rate": 1.9387163411897892e-06, + "loss": 0.8202, + "step": 17597 + }, + { + "epoch": 0.8777057356608479, + "grad_norm": 0.8220192540293326, + "learning_rate": 1.937157341844792e-06, + "loss": 0.8094, + "step": 17598 + }, + { + "epoch": 0.8777556109725686, + "grad_norm": 1.2274199603435432, + "learning_rate": 1.9355989443143595e-06, + "loss": 0.8333, + "step": 17599 + }, + { + "epoch": 0.8778054862842892, + "grad_norm": 0.7540070084268938, + "learning_rate": 1.934041148639146e-06, + "loss": 0.8396, + "step": 17600 + }, + { + "epoch": 0.87785536159601, + "grad_norm": 0.5937866614780598, + "learning_rate": 1.9324839548598093e-06, + "loss": 0.838, + "step": 17601 + }, + { + "epoch": 0.8779052369077307, + "grad_norm": 0.5917337594884389, + "learning_rate": 1.930927363016974e-06, + "loss": 0.8266, + "step": 17602 + }, + { + "epoch": 0.8779551122194513, + "grad_norm": 0.6113241371493818, + "learning_rate": 1.9293713731512673e-06, + "loss": 0.8992, + "step": 17603 + }, + { + "epoch": 0.8780049875311721, + "grad_norm": 0.6250784661937439, + "learning_rate": 1.927815985303294e-06, + "loss": 0.8038, + "step": 17604 + }, + { + "epoch": 0.8780548628428928, + "grad_norm": 0.583326712730131, + "learning_rate": 1.926261199513632e-06, + "loss": 0.8526, + "step": 17605 + }, + { + "epoch": 0.8781047381546134, + "grad_norm": 0.6402742352777854, + "learning_rate": 1.9247070158228575e-06, + "loss": 0.8454, + "step": 17606 + }, + { + "epoch": 0.8781546134663342, + "grad_norm": 0.6440278867615961, + "learning_rate": 1.923153434271527e-06, + "loss": 0.9006, + "step": 17607 + }, + { + "epoch": 0.8782044887780549, + "grad_norm": 0.6101472106290106, + "learning_rate": 1.9216004549001827e-06, + "loss": 0.8609, + "step": 17608 + }, + { + "epoch": 0.8782543640897755, + "grad_norm": 0.5588308180176197, + "learning_rate": 1.920048077749345e-06, + "loss": 0.8388, + "step": 17609 + }, + { + "epoch": 0.8783042394014963, + "grad_norm": 0.8535142824897467, + "learning_rate": 1.918496302859518e-06, + "loss": 0.8592, + "step": 17610 + }, + { + "epoch": 0.878354114713217, + "grad_norm": 0.6860724976046908, + "learning_rate": 1.916945130271211e-06, + "loss": 0.8325, + "step": 17611 + }, + { + "epoch": 0.8784039900249376, + "grad_norm": 0.7588468079184127, + "learning_rate": 1.91539456002488e-06, + "loss": 0.8346, + "step": 17612 + }, + { + "epoch": 0.8784538653366584, + "grad_norm": 0.7118208790816011, + "learning_rate": 1.9138445921609987e-06, + "loss": 0.85, + "step": 17613 + }, + { + "epoch": 0.8785037406483791, + "grad_norm": 1.272651366740449, + "learning_rate": 1.9122952267200095e-06, + "loss": 0.8439, + "step": 17614 + }, + { + "epoch": 0.8785536159600997, + "grad_norm": 0.8397516966879839, + "learning_rate": 1.9107464637423467e-06, + "loss": 0.8621, + "step": 17615 + }, + { + "epoch": 0.8786034912718205, + "grad_norm": 0.6259153258008651, + "learning_rate": 1.9091983032684224e-06, + "loss": 0.8811, + "step": 17616 + }, + { + "epoch": 0.8786533665835411, + "grad_norm": 0.878936766382465, + "learning_rate": 1.9076507453386348e-06, + "loss": 0.8503, + "step": 17617 + }, + { + "epoch": 0.8787032418952618, + "grad_norm": 0.6756276805107243, + "learning_rate": 1.9061037899933604e-06, + "loss": 0.8889, + "step": 17618 + }, + { + "epoch": 0.8787531172069826, + "grad_norm": 0.700174639076113, + "learning_rate": 1.9045574372729774e-06, + "loss": 0.8168, + "step": 17619 + }, + { + "epoch": 0.8788029925187032, + "grad_norm": 0.6519571024816373, + "learning_rate": 1.9030116872178316e-06, + "loss": 0.8183, + "step": 17620 + }, + { + "epoch": 0.8788528678304239, + "grad_norm": 0.641234116208846, + "learning_rate": 1.9014665398682602e-06, + "loss": 0.8717, + "step": 17621 + }, + { + "epoch": 0.8789027431421447, + "grad_norm": 0.5933062615816104, + "learning_rate": 1.8999219952645808e-06, + "loss": 0.8506, + "step": 17622 + }, + { + "epoch": 0.8789526184538653, + "grad_norm": 0.6579001742885764, + "learning_rate": 1.8983780534470947e-06, + "loss": 0.8444, + "step": 17623 + }, + { + "epoch": 0.879002493765586, + "grad_norm": 0.7127957344898528, + "learning_rate": 1.896834714456097e-06, + "loss": 0.811, + "step": 17624 + }, + { + "epoch": 0.8790523690773068, + "grad_norm": 0.6278930954043512, + "learning_rate": 1.895291978331859e-06, + "loss": 0.8629, + "step": 17625 + }, + { + "epoch": 0.8791022443890274, + "grad_norm": 0.7586523717069961, + "learning_rate": 1.8937498451146369e-06, + "loss": 0.8255, + "step": 17626 + }, + { + "epoch": 0.8791521197007481, + "grad_norm": 0.8086178110434392, + "learning_rate": 1.892208314844668e-06, + "loss": 0.8482, + "step": 17627 + }, + { + "epoch": 0.8792019950124689, + "grad_norm": 0.6219979630868921, + "learning_rate": 1.890667387562184e-06, + "loss": 0.8517, + "step": 17628 + }, + { + "epoch": 0.8792518703241895, + "grad_norm": 0.6805622933397996, + "learning_rate": 1.8891270633073887e-06, + "loss": 0.909, + "step": 17629 + }, + { + "epoch": 0.8793017456359102, + "grad_norm": 0.8238963468047644, + "learning_rate": 1.8875873421204866e-06, + "loss": 0.8248, + "step": 17630 + }, + { + "epoch": 0.879351620947631, + "grad_norm": 0.537839982649852, + "learning_rate": 1.886048224041642e-06, + "loss": 0.8156, + "step": 17631 + }, + { + "epoch": 0.8794014962593516, + "grad_norm": 0.5692789703361567, + "learning_rate": 1.8845097091110292e-06, + "loss": 0.8887, + "step": 17632 + }, + { + "epoch": 0.8794513715710723, + "grad_norm": 0.9348779669156596, + "learning_rate": 1.8829717973687904e-06, + "loss": 0.8303, + "step": 17633 + }, + { + "epoch": 0.879501246882793, + "grad_norm": 0.659406927294778, + "learning_rate": 1.881434488855055e-06, + "loss": 0.843, + "step": 17634 + }, + { + "epoch": 0.8795511221945137, + "grad_norm": 0.6382198418400032, + "learning_rate": 1.879897783609938e-06, + "loss": 0.8811, + "step": 17635 + }, + { + "epoch": 0.8796009975062344, + "grad_norm": 0.5431143743397441, + "learning_rate": 1.8783616816735433e-06, + "loss": 0.8047, + "step": 17636 + }, + { + "epoch": 0.8796508728179551, + "grad_norm": 0.6263425892628941, + "learning_rate": 1.8768261830859556e-06, + "loss": 0.8301, + "step": 17637 + }, + { + "epoch": 0.8797007481296758, + "grad_norm": 0.8205693271406082, + "learning_rate": 1.8752912878872375e-06, + "loss": 0.8196, + "step": 17638 + }, + { + "epoch": 0.8797506234413965, + "grad_norm": 0.5721305758500512, + "learning_rate": 1.87375699611744e-06, + "loss": 0.8218, + "step": 17639 + }, + { + "epoch": 0.8798004987531172, + "grad_norm": 0.6186019611962514, + "learning_rate": 1.872223307816609e-06, + "loss": 0.8532, + "step": 17640 + }, + { + "epoch": 0.8798503740648379, + "grad_norm": 0.6982104147839537, + "learning_rate": 1.8706902230247598e-06, + "loss": 0.859, + "step": 17641 + }, + { + "epoch": 0.8799002493765586, + "grad_norm": 0.5847844369943689, + "learning_rate": 1.8691577417818963e-06, + "loss": 0.8323, + "step": 17642 + }, + { + "epoch": 0.8799501246882793, + "grad_norm": 1.9819927408181484, + "learning_rate": 1.8676258641280086e-06, + "loss": 0.856, + "step": 17643 + }, + { + "epoch": 0.88, + "grad_norm": 0.5586896995564953, + "learning_rate": 1.8660945901030764e-06, + "loss": 0.838, + "step": 17644 + }, + { + "epoch": 0.8800498753117207, + "grad_norm": 0.5748723401701945, + "learning_rate": 1.864563919747045e-06, + "loss": 0.8176, + "step": 17645 + }, + { + "epoch": 0.8800997506234414, + "grad_norm": 0.5501346951466001, + "learning_rate": 1.8630338530998687e-06, + "loss": 0.842, + "step": 17646 + }, + { + "epoch": 0.8801496259351621, + "grad_norm": 0.5531326573933447, + "learning_rate": 1.8615043902014657e-06, + "loss": 0.8247, + "step": 17647 + }, + { + "epoch": 0.8801995012468828, + "grad_norm": 0.6326354264902008, + "learning_rate": 1.8599755310917593e-06, + "loss": 0.7925, + "step": 17648 + }, + { + "epoch": 0.8802493765586035, + "grad_norm": 0.5932057565826699, + "learning_rate": 1.8584472758106236e-06, + "loss": 0.8405, + "step": 17649 + }, + { + "epoch": 0.8802992518703242, + "grad_norm": 0.551986113637914, + "learning_rate": 1.856919624397957e-06, + "loss": 0.8476, + "step": 17650 + }, + { + "epoch": 0.8803491271820448, + "grad_norm": 0.6157576365395327, + "learning_rate": 1.8553925768936081e-06, + "loss": 0.8195, + "step": 17651 + }, + { + "epoch": 0.8803990024937656, + "grad_norm": 0.6002247406109908, + "learning_rate": 1.8538661333374425e-06, + "loss": 0.8373, + "step": 17652 + }, + { + "epoch": 0.8804488778054863, + "grad_norm": 0.6343404436812802, + "learning_rate": 1.8523402937692723e-06, + "loss": 0.8246, + "step": 17653 + }, + { + "epoch": 0.8804987531172069, + "grad_norm": 0.6272972474757482, + "learning_rate": 1.8508150582289275e-06, + "loss": 0.8483, + "step": 17654 + }, + { + "epoch": 0.8805486284289277, + "grad_norm": 0.703842882860953, + "learning_rate": 1.8492904267561978e-06, + "loss": 0.8443, + "step": 17655 + }, + { + "epoch": 0.8805985037406484, + "grad_norm": 0.5920136675020568, + "learning_rate": 1.8477663993908768e-06, + "loss": 0.8624, + "step": 17656 + }, + { + "epoch": 0.880648379052369, + "grad_norm": 0.7604356748060457, + "learning_rate": 1.8462429761727323e-06, + "loss": 0.8351, + "step": 17657 + }, + { + "epoch": 0.8806982543640898, + "grad_norm": 0.5455064013835481, + "learning_rate": 1.8447201571415162e-06, + "loss": 0.8407, + "step": 17658 + }, + { + "epoch": 0.8807481296758105, + "grad_norm": 0.5941248397370228, + "learning_rate": 1.8431979423369604e-06, + "loss": 0.8734, + "step": 17659 + }, + { + "epoch": 0.8807980049875311, + "grad_norm": 0.7861877685737887, + "learning_rate": 1.841676331798789e-06, + "loss": 0.8591, + "step": 17660 + }, + { + "epoch": 0.8808478802992519, + "grad_norm": 0.5214881336472937, + "learning_rate": 1.8401553255667142e-06, + "loss": 0.8835, + "step": 17661 + }, + { + "epoch": 0.8808977556109726, + "grad_norm": 1.1904215607630737, + "learning_rate": 1.8386349236804212e-06, + "loss": 0.8245, + "step": 17662 + }, + { + "epoch": 0.8809476309226932, + "grad_norm": 0.6337190668293524, + "learning_rate": 1.8371151261795839e-06, + "loss": 0.8705, + "step": 17663 + }, + { + "epoch": 0.880997506234414, + "grad_norm": 0.4886982518410815, + "learning_rate": 1.8355959331038564e-06, + "loss": 0.816, + "step": 17664 + }, + { + "epoch": 0.8810473815461347, + "grad_norm": 0.5603219436127584, + "learning_rate": 1.8340773444928905e-06, + "loss": 0.8027, + "step": 17665 + }, + { + "epoch": 0.8810972568578553, + "grad_norm": 0.5301365579456633, + "learning_rate": 1.8325593603863017e-06, + "loss": 0.8069, + "step": 17666 + }, + { + "epoch": 0.8811471321695761, + "grad_norm": 0.667181727173324, + "learning_rate": 1.8310419808237194e-06, + "loss": 0.8224, + "step": 17667 + }, + { + "epoch": 0.8811970074812968, + "grad_norm": 0.7678363078318268, + "learning_rate": 1.8295252058447177e-06, + "loss": 0.8237, + "step": 17668 + }, + { + "epoch": 0.8812468827930174, + "grad_norm": 0.5546453339494385, + "learning_rate": 1.8280090354888923e-06, + "loss": 0.8129, + "step": 17669 + }, + { + "epoch": 0.8812967581047382, + "grad_norm": 0.6302471958958027, + "learning_rate": 1.826493469795798e-06, + "loss": 0.8505, + "step": 17670 + }, + { + "epoch": 0.8813466334164588, + "grad_norm": 1.4016291281738935, + "learning_rate": 1.8249785088049893e-06, + "loss": 0.836, + "step": 17671 + }, + { + "epoch": 0.8813965087281795, + "grad_norm": 0.645979818533687, + "learning_rate": 1.823464152555987e-06, + "loss": 0.8547, + "step": 17672 + }, + { + "epoch": 0.8814463840399003, + "grad_norm": 0.7348079875649683, + "learning_rate": 1.821950401088321e-06, + "loss": 0.8185, + "step": 17673 + }, + { + "epoch": 0.8814962593516209, + "grad_norm": 0.46692734457711516, + "learning_rate": 1.8204372544414844e-06, + "loss": 0.8289, + "step": 17674 + }, + { + "epoch": 0.8815461346633416, + "grad_norm": 0.5480274383965413, + "learning_rate": 1.8189247126549652e-06, + "loss": 0.8297, + "step": 17675 + }, + { + "epoch": 0.8815960099750624, + "grad_norm": 0.6618066856503441, + "learning_rate": 1.8174127757682263e-06, + "loss": 0.8331, + "step": 17676 + }, + { + "epoch": 0.881645885286783, + "grad_norm": 0.6119832254158939, + "learning_rate": 1.8159014438207306e-06, + "loss": 0.8357, + "step": 17677 + }, + { + "epoch": 0.8816957605985037, + "grad_norm": 0.7578004553405694, + "learning_rate": 1.8143907168519103e-06, + "loss": 0.7793, + "step": 17678 + }, + { + "epoch": 0.8817456359102245, + "grad_norm": 0.5366055157107295, + "learning_rate": 1.8128805949011896e-06, + "loss": 0.8281, + "step": 17679 + }, + { + "epoch": 0.8817955112219451, + "grad_norm": 0.6038384492167297, + "learning_rate": 1.8113710780079674e-06, + "loss": 0.8129, + "step": 17680 + }, + { + "epoch": 0.8818453865336658, + "grad_norm": 0.579600084687574, + "learning_rate": 1.8098621662116456e-06, + "loss": 0.8632, + "step": 17681 + }, + { + "epoch": 0.8818952618453866, + "grad_norm": 0.689236847547262, + "learning_rate": 1.8083538595515842e-06, + "loss": 0.8528, + "step": 17682 + }, + { + "epoch": 0.8819451371571072, + "grad_norm": 0.6396694787566953, + "learning_rate": 1.8068461580671548e-06, + "loss": 0.8336, + "step": 17683 + }, + { + "epoch": 0.8819950124688279, + "grad_norm": 0.6431897705942554, + "learning_rate": 1.8053390617976922e-06, + "loss": 0.8713, + "step": 17684 + }, + { + "epoch": 0.8820448877805487, + "grad_norm": 0.5580879369788735, + "learning_rate": 1.803832570782532e-06, + "loss": 0.8515, + "step": 17685 + }, + { + "epoch": 0.8820947630922693, + "grad_norm": 0.7868818581459185, + "learning_rate": 1.8023266850609733e-06, + "loss": 0.8771, + "step": 17686 + }, + { + "epoch": 0.88214463840399, + "grad_norm": 0.6243012729560757, + "learning_rate": 1.8008214046723204e-06, + "loss": 0.8816, + "step": 17687 + }, + { + "epoch": 0.8821945137157107, + "grad_norm": 0.9452207065664573, + "learning_rate": 1.7993167296558478e-06, + "loss": 0.8451, + "step": 17688 + }, + { + "epoch": 0.8822443890274314, + "grad_norm": 0.5248342846796117, + "learning_rate": 1.7978126600508293e-06, + "loss": 0.8033, + "step": 17689 + }, + { + "epoch": 0.8822942643391521, + "grad_norm": 0.5200827381792476, + "learning_rate": 1.796309195896495e-06, + "loss": 0.7766, + "step": 17690 + }, + { + "epoch": 0.8823441396508728, + "grad_norm": 0.7340003796923675, + "learning_rate": 1.794806337232094e-06, + "loss": 0.878, + "step": 17691 + }, + { + "epoch": 0.8823940149625935, + "grad_norm": 0.9281964241545019, + "learning_rate": 1.7933040840968362e-06, + "loss": 0.8564, + "step": 17692 + }, + { + "epoch": 0.8824438902743142, + "grad_norm": 0.5815366265998557, + "learning_rate": 1.7918024365299185e-06, + "loss": 0.8371, + "step": 17693 + }, + { + "epoch": 0.8824937655860349, + "grad_norm": 0.7851697645945134, + "learning_rate": 1.7903013945705343e-06, + "loss": 0.868, + "step": 17694 + }, + { + "epoch": 0.8825436408977556, + "grad_norm": 0.601916365793418, + "learning_rate": 1.7888009582578468e-06, + "loss": 0.8201, + "step": 17695 + }, + { + "epoch": 0.8825935162094763, + "grad_norm": 0.5469953000100531, + "learning_rate": 1.787301127631011e-06, + "loss": 0.8039, + "step": 17696 + }, + { + "epoch": 0.882643391521197, + "grad_norm": 0.5967406950052297, + "learning_rate": 1.785801902729159e-06, + "loss": 0.7766, + "step": 17697 + }, + { + "epoch": 0.8826932668329177, + "grad_norm": 0.7191206453193778, + "learning_rate": 1.7843032835914208e-06, + "loss": 0.9003, + "step": 17698 + }, + { + "epoch": 0.8827431421446384, + "grad_norm": 0.6716645563505629, + "learning_rate": 1.7828052702568987e-06, + "loss": 0.8348, + "step": 17699 + }, + { + "epoch": 0.8827930174563591, + "grad_norm": 0.542333900259369, + "learning_rate": 1.7813078627646835e-06, + "loss": 0.843, + "step": 17700 + }, + { + "epoch": 0.8828428927680798, + "grad_norm": 0.6285555128088965, + "learning_rate": 1.779811061153841e-06, + "loss": 0.81, + "step": 17701 + }, + { + "epoch": 0.8828927680798005, + "grad_norm": 0.6002738631390303, + "learning_rate": 1.7783148654634429e-06, + "loss": 0.8377, + "step": 17702 + }, + { + "epoch": 0.8829426433915212, + "grad_norm": 0.6099380762529483, + "learning_rate": 1.776819275732522e-06, + "loss": 0.8575, + "step": 17703 + }, + { + "epoch": 0.8829925187032419, + "grad_norm": 0.5987290037495513, + "learning_rate": 1.775324292000119e-06, + "loss": 0.8914, + "step": 17704 + }, + { + "epoch": 0.8830423940149625, + "grad_norm": 0.5474596574321245, + "learning_rate": 1.7738299143052223e-06, + "loss": 0.8741, + "step": 17705 + }, + { + "epoch": 0.8830922693266833, + "grad_norm": 0.8008164442784401, + "learning_rate": 1.772336142686845e-06, + "loss": 0.8254, + "step": 17706 + }, + { + "epoch": 0.883142144638404, + "grad_norm": 0.4863710338979646, + "learning_rate": 1.7708429771839618e-06, + "loss": 0.8065, + "step": 17707 + }, + { + "epoch": 0.8831920199501246, + "grad_norm": 0.5372021142536528, + "learning_rate": 1.7693504178355357e-06, + "loss": 0.836, + "step": 17708 + }, + { + "epoch": 0.8832418952618454, + "grad_norm": 0.5699465171292221, + "learning_rate": 1.767858464680508e-06, + "loss": 0.8486, + "step": 17709 + }, + { + "epoch": 0.8832917705735661, + "grad_norm": 0.5498263389175242, + "learning_rate": 1.7663671177578194e-06, + "loss": 0.8599, + "step": 17710 + }, + { + "epoch": 0.8833416458852867, + "grad_norm": 0.5219781483460458, + "learning_rate": 1.7648763771063837e-06, + "loss": 0.8481, + "step": 17711 + }, + { + "epoch": 0.8833915211970075, + "grad_norm": 1.1679591891514953, + "learning_rate": 1.7633862427651026e-06, + "loss": 0.8738, + "step": 17712 + }, + { + "epoch": 0.8834413965087282, + "grad_norm": 1.5580895296749968, + "learning_rate": 1.761896714772851e-06, + "loss": 0.8491, + "step": 17713 + }, + { + "epoch": 0.8834912718204488, + "grad_norm": 0.6217595576697126, + "learning_rate": 1.7604077931685114e-06, + "loss": 0.8377, + "step": 17714 + }, + { + "epoch": 0.8835411471321696, + "grad_norm": 0.7261198663758527, + "learning_rate": 1.758919477990928e-06, + "loss": 0.876, + "step": 17715 + }, + { + "epoch": 0.8835910224438903, + "grad_norm": 0.6688942984218005, + "learning_rate": 1.7574317692789388e-06, + "loss": 0.8117, + "step": 17716 + }, + { + "epoch": 0.8836408977556109, + "grad_norm": 0.6513558543622312, + "learning_rate": 1.755944667071363e-06, + "loss": 0.9024, + "step": 17717 + }, + { + "epoch": 0.8836907730673317, + "grad_norm": 0.5249552621234832, + "learning_rate": 1.7544581714070168e-06, + "loss": 0.8792, + "step": 17718 + }, + { + "epoch": 0.8837406483790524, + "grad_norm": 0.6247190509386432, + "learning_rate": 1.752972282324672e-06, + "loss": 0.8209, + "step": 17719 + }, + { + "epoch": 0.883790523690773, + "grad_norm": 0.6793320821163632, + "learning_rate": 1.751486999863114e-06, + "loss": 0.8497, + "step": 17720 + }, + { + "epoch": 0.8838403990024938, + "grad_norm": 0.7243732493673347, + "learning_rate": 1.750002324061098e-06, + "loss": 0.8761, + "step": 17721 + }, + { + "epoch": 0.8838902743142145, + "grad_norm": 0.5834947429411314, + "learning_rate": 1.7485182549573709e-06, + "loss": 0.8249, + "step": 17722 + }, + { + "epoch": 0.8839401496259351, + "grad_norm": 0.6093287295853308, + "learning_rate": 1.747034792590646e-06, + "loss": 0.8266, + "step": 17723 + }, + { + "epoch": 0.8839900249376559, + "grad_norm": 0.8105617736666337, + "learning_rate": 1.7455519369996482e-06, + "loss": 0.7981, + "step": 17724 + }, + { + "epoch": 0.8840399002493765, + "grad_norm": 0.5553392100172335, + "learning_rate": 1.74406968822306e-06, + "loss": 0.8254, + "step": 17725 + }, + { + "epoch": 0.8840897755610972, + "grad_norm": 0.6142276476848417, + "learning_rate": 1.742588046299573e-06, + "loss": 0.828, + "step": 17726 + }, + { + "epoch": 0.884139650872818, + "grad_norm": 0.5795380559036607, + "learning_rate": 1.741107011267834e-06, + "loss": 0.8069, + "step": 17727 + }, + { + "epoch": 0.8841895261845386, + "grad_norm": 0.596842805970315, + "learning_rate": 1.7396265831665009e-06, + "loss": 0.8473, + "step": 17728 + }, + { + "epoch": 0.8842394014962593, + "grad_norm": 0.6150943221372545, + "learning_rate": 1.738146762034204e-06, + "loss": 0.8064, + "step": 17729 + }, + { + "epoch": 0.8842892768079801, + "grad_norm": 0.5558022490823236, + "learning_rate": 1.7366675479095511e-06, + "loss": 0.8603, + "step": 17730 + }, + { + "epoch": 0.8843391521197007, + "grad_norm": 0.6085795933202509, + "learning_rate": 1.7351889408311534e-06, + "loss": 0.8269, + "step": 17731 + }, + { + "epoch": 0.8843890274314214, + "grad_norm": 0.6124299842112181, + "learning_rate": 1.7337109408375878e-06, + "loss": 0.8472, + "step": 17732 + }, + { + "epoch": 0.8844389027431422, + "grad_norm": 0.6028142167314309, + "learning_rate": 1.7322335479674213e-06, + "loss": 0.8184, + "step": 17733 + }, + { + "epoch": 0.8844887780548628, + "grad_norm": 0.5929891838049806, + "learning_rate": 1.730756762259203e-06, + "loss": 0.8335, + "step": 17734 + }, + { + "epoch": 0.8845386533665835, + "grad_norm": 2.912600209696544, + "learning_rate": 1.7292805837514803e-06, + "loss": 0.8544, + "step": 17735 + }, + { + "epoch": 0.8845885286783043, + "grad_norm": 0.5730608893120551, + "learning_rate": 1.7278050124827639e-06, + "loss": 0.8289, + "step": 17736 + }, + { + "epoch": 0.8846384039900249, + "grad_norm": 0.5734919907712492, + "learning_rate": 1.726330048491559e-06, + "loss": 0.7819, + "step": 17737 + }, + { + "epoch": 0.8846882793017457, + "grad_norm": 0.52720749634784, + "learning_rate": 1.7248556918163544e-06, + "loss": 0.7965, + "step": 17738 + }, + { + "epoch": 0.8847381546134664, + "grad_norm": 0.535704337866319, + "learning_rate": 1.7233819424956248e-06, + "loss": 0.8402, + "step": 17739 + }, + { + "epoch": 0.884788029925187, + "grad_norm": 0.616148008298147, + "learning_rate": 1.7219088005678286e-06, + "loss": 0.8095, + "step": 17740 + }, + { + "epoch": 0.8848379052369078, + "grad_norm": 0.6592409442355315, + "learning_rate": 1.7204362660714042e-06, + "loss": 0.8751, + "step": 17741 + }, + { + "epoch": 0.8848877805486284, + "grad_norm": 0.5666812532743806, + "learning_rate": 1.7189643390447707e-06, + "loss": 0.853, + "step": 17742 + }, + { + "epoch": 0.8849376558603491, + "grad_norm": 0.5501710725569129, + "learning_rate": 1.7174930195263478e-06, + "loss": 0.8605, + "step": 17743 + }, + { + "epoch": 0.8849875311720699, + "grad_norm": 0.7127661598425102, + "learning_rate": 1.7160223075545267e-06, + "loss": 0.803, + "step": 17744 + }, + { + "epoch": 0.8850374064837905, + "grad_norm": 0.6804751585700911, + "learning_rate": 1.7145522031676825e-06, + "loss": 0.8814, + "step": 17745 + }, + { + "epoch": 0.8850872817955112, + "grad_norm": 0.5934752400727661, + "learning_rate": 1.7130827064041733e-06, + "loss": 0.8568, + "step": 17746 + }, + { + "epoch": 0.885137157107232, + "grad_norm": 0.7138960163307826, + "learning_rate": 1.7116138173023517e-06, + "loss": 0.8485, + "step": 17747 + }, + { + "epoch": 0.8851870324189526, + "grad_norm": 0.6654416012077082, + "learning_rate": 1.7101455359005454e-06, + "loss": 0.8193, + "step": 17748 + }, + { + "epoch": 0.8852369077306733, + "grad_norm": 0.6425756602095296, + "learning_rate": 1.7086778622370708e-06, + "loss": 0.8157, + "step": 17749 + }, + { + "epoch": 0.885286783042394, + "grad_norm": 0.6211948948430035, + "learning_rate": 1.7072107963502171e-06, + "loss": 0.8358, + "step": 17750 + }, + { + "epoch": 0.8853366583541147, + "grad_norm": 0.5192842758672316, + "learning_rate": 1.7057443382782812e-06, + "loss": 0.7914, + "step": 17751 + }, + { + "epoch": 0.8853865336658354, + "grad_norm": 0.5975931313147452, + "learning_rate": 1.704278488059513e-06, + "loss": 0.799, + "step": 17752 + }, + { + "epoch": 0.8854364089775562, + "grad_norm": 0.6165180473806807, + "learning_rate": 1.7028132457321767e-06, + "loss": 0.9127, + "step": 17753 + }, + { + "epoch": 0.8854862842892768, + "grad_norm": 0.5710240867263915, + "learning_rate": 1.7013486113344996e-06, + "loss": 0.835, + "step": 17754 + }, + { + "epoch": 0.8855361596009975, + "grad_norm": 0.5948359331421178, + "learning_rate": 1.6998845849047095e-06, + "loss": 0.8902, + "step": 17755 + }, + { + "epoch": 0.8855860349127183, + "grad_norm": 0.5408152620612413, + "learning_rate": 1.6984211664809957e-06, + "loss": 0.8615, + "step": 17756 + }, + { + "epoch": 0.8856359102244389, + "grad_norm": 0.5612964428531957, + "learning_rate": 1.696958356101558e-06, + "loss": 0.8231, + "step": 17757 + }, + { + "epoch": 0.8856857855361596, + "grad_norm": 0.6107461668108732, + "learning_rate": 1.69549615380456e-06, + "loss": 0.8823, + "step": 17758 + }, + { + "epoch": 0.8857356608478802, + "grad_norm": 0.7611018440212047, + "learning_rate": 1.6940345596281659e-06, + "loss": 0.8684, + "step": 17759 + }, + { + "epoch": 0.885785536159601, + "grad_norm": 0.6696075143396167, + "learning_rate": 1.6925735736105037e-06, + "loss": 0.8548, + "step": 17760 + }, + { + "epoch": 0.8858354114713217, + "grad_norm": 0.9663019245545359, + "learning_rate": 1.6911131957897035e-06, + "loss": 0.8808, + "step": 17761 + }, + { + "epoch": 0.8858852867830423, + "grad_norm": 0.5569666022431532, + "learning_rate": 1.6896534262038717e-06, + "loss": 0.8287, + "step": 17762 + }, + { + "epoch": 0.8859351620947631, + "grad_norm": 0.594618138489389, + "learning_rate": 1.6881942648911076e-06, + "loss": 0.8577, + "step": 17763 + }, + { + "epoch": 0.8859850374064838, + "grad_norm": 0.7161777083412763, + "learning_rate": 1.6867357118894756e-06, + "loss": 0.8659, + "step": 17764 + }, + { + "epoch": 0.8860349127182044, + "grad_norm": 0.5264614362528959, + "learning_rate": 1.6852777672370423e-06, + "loss": 0.8521, + "step": 17765 + }, + { + "epoch": 0.8860847880299252, + "grad_norm": 0.5839942609096092, + "learning_rate": 1.6838204309718552e-06, + "loss": 0.8131, + "step": 17766 + }, + { + "epoch": 0.8861346633416459, + "grad_norm": 0.525958795271028, + "learning_rate": 1.6823637031319367e-06, + "loss": 0.8273, + "step": 17767 + }, + { + "epoch": 0.8861845386533665, + "grad_norm": 0.638166566456383, + "learning_rate": 1.680907583755298e-06, + "loss": 0.8459, + "step": 17768 + }, + { + "epoch": 0.8862344139650873, + "grad_norm": 0.571457398814032, + "learning_rate": 1.6794520728799446e-06, + "loss": 0.8408, + "step": 17769 + }, + { + "epoch": 0.886284289276808, + "grad_norm": 0.5908743893761084, + "learning_rate": 1.6779971705438491e-06, + "loss": 0.8798, + "step": 17770 + }, + { + "epoch": 0.8863341645885287, + "grad_norm": 0.639938845150535, + "learning_rate": 1.6765428767849783e-06, + "loss": 0.8308, + "step": 17771 + }, + { + "epoch": 0.8863840399002494, + "grad_norm": 0.5829548849542543, + "learning_rate": 1.6750891916412881e-06, + "loss": 0.8704, + "step": 17772 + }, + { + "epoch": 0.8864339152119701, + "grad_norm": 0.7369638609508055, + "learning_rate": 1.6736361151507062e-06, + "loss": 0.7998, + "step": 17773 + }, + { + "epoch": 0.8864837905236908, + "grad_norm": 0.572964930493074, + "learning_rate": 1.67218364735115e-06, + "loss": 0.8247, + "step": 17774 + }, + { + "epoch": 0.8865336658354115, + "grad_norm": 0.60422760056962, + "learning_rate": 1.6707317882805162e-06, + "loss": 0.8688, + "step": 17775 + }, + { + "epoch": 0.8865835411471321, + "grad_norm": 0.6102696216054826, + "learning_rate": 1.6692805379767002e-06, + "loss": 0.8775, + "step": 17776 + }, + { + "epoch": 0.8866334164588529, + "grad_norm": 0.6628601725702085, + "learning_rate": 1.6678298964775656e-06, + "loss": 0.7961, + "step": 17777 + }, + { + "epoch": 0.8866832917705736, + "grad_norm": 0.8307906568730681, + "learning_rate": 1.6663798638209688e-06, + "loss": 0.8485, + "step": 17778 + }, + { + "epoch": 0.8867331670822942, + "grad_norm": 0.5885758922888332, + "learning_rate": 1.6649304400447458e-06, + "loss": 0.8953, + "step": 17779 + }, + { + "epoch": 0.886783042394015, + "grad_norm": 0.8034660506287072, + "learning_rate": 1.663481625186719e-06, + "loss": 0.7913, + "step": 17780 + }, + { + "epoch": 0.8868329177057357, + "grad_norm": 0.5562478979538933, + "learning_rate": 1.6620334192846976e-06, + "loss": 0.7874, + "step": 17781 + }, + { + "epoch": 0.8868827930174563, + "grad_norm": 0.5367716828736413, + "learning_rate": 1.6605858223764704e-06, + "loss": 0.8318, + "step": 17782 + }, + { + "epoch": 0.886932668329177, + "grad_norm": 0.5319253240556245, + "learning_rate": 1.6591388344998043e-06, + "loss": 0.8122, + "step": 17783 + }, + { + "epoch": 0.8869825436408978, + "grad_norm": 1.1140726216048513, + "learning_rate": 1.6576924556924694e-06, + "loss": 0.8254, + "step": 17784 + }, + { + "epoch": 0.8870324189526184, + "grad_norm": 0.7391629774299399, + "learning_rate": 1.656246685992202e-06, + "loss": 0.8215, + "step": 17785 + }, + { + "epoch": 0.8870822942643392, + "grad_norm": 0.6259007534217285, + "learning_rate": 1.6548015254367306e-06, + "loss": 0.8192, + "step": 17786 + }, + { + "epoch": 0.8871321695760599, + "grad_norm": 0.7086906967684943, + "learning_rate": 1.6533569740637633e-06, + "loss": 0.8132, + "step": 17787 + }, + { + "epoch": 0.8871820448877805, + "grad_norm": 0.6371207844115387, + "learning_rate": 1.6519130319110038e-06, + "loss": 0.8649, + "step": 17788 + }, + { + "epoch": 0.8872319201995013, + "grad_norm": 0.5806283051224546, + "learning_rate": 1.650469699016116e-06, + "loss": 0.8146, + "step": 17789 + }, + { + "epoch": 0.887281795511222, + "grad_norm": 2.9410532781332006, + "learning_rate": 1.649026975416776e-06, + "loss": 0.8411, + "step": 17790 + }, + { + "epoch": 0.8873316708229426, + "grad_norm": 0.5720366384508423, + "learning_rate": 1.6475848611506223e-06, + "loss": 0.8028, + "step": 17791 + }, + { + "epoch": 0.8873815461346634, + "grad_norm": 0.7963795281792432, + "learning_rate": 1.6461433562553003e-06, + "loss": 0.8034, + "step": 17792 + }, + { + "epoch": 0.8874314214463841, + "grad_norm": 0.5584569708349313, + "learning_rate": 1.6447024607684047e-06, + "loss": 0.855, + "step": 17793 + }, + { + "epoch": 0.8874812967581047, + "grad_norm": 1.2528653361172744, + "learning_rate": 1.6432621747275528e-06, + "loss": 0.8794, + "step": 17794 + }, + { + "epoch": 0.8875311720698255, + "grad_norm": 0.6049682175540059, + "learning_rate": 1.6418224981703146e-06, + "loss": 0.8509, + "step": 17795 + }, + { + "epoch": 0.8875810473815461, + "grad_norm": 0.576513377926834, + "learning_rate": 1.6403834311342737e-06, + "loss": 0.8212, + "step": 17796 + }, + { + "epoch": 0.8876309226932668, + "grad_norm": 0.7375024064566303, + "learning_rate": 1.6389449736569667e-06, + "loss": 0.8556, + "step": 17797 + }, + { + "epoch": 0.8876807980049876, + "grad_norm": 0.7004291597267659, + "learning_rate": 1.637507125775936e-06, + "loss": 0.8069, + "step": 17798 + }, + { + "epoch": 0.8877306733167082, + "grad_norm": 0.5373443572475016, + "learning_rate": 1.6360698875287044e-06, + "loss": 0.8078, + "step": 17799 + }, + { + "epoch": 0.8877805486284289, + "grad_norm": 0.5683585598432608, + "learning_rate": 1.6346332589527697e-06, + "loss": 0.8408, + "step": 17800 + }, + { + "epoch": 0.8878304239401497, + "grad_norm": 0.5822846764947079, + "learning_rate": 1.633197240085621e-06, + "loss": 0.866, + "step": 17801 + }, + { + "epoch": 0.8878802992518703, + "grad_norm": 2.3978150448791986, + "learning_rate": 1.631761830964737e-06, + "loss": 0.8309, + "step": 17802 + }, + { + "epoch": 0.887930174563591, + "grad_norm": 0.8048894393924041, + "learning_rate": 1.6303270316275682e-06, + "loss": 0.8474, + "step": 17803 + }, + { + "epoch": 0.8879800498753118, + "grad_norm": 0.6247108015340915, + "learning_rate": 1.6288928421115596e-06, + "loss": 0.8349, + "step": 17804 + }, + { + "epoch": 0.8880299251870324, + "grad_norm": 0.6072003188644902, + "learning_rate": 1.6274592624541258e-06, + "loss": 0.8291, + "step": 17805 + }, + { + "epoch": 0.8880798004987531, + "grad_norm": 0.5720965135746066, + "learning_rate": 1.6260262926926867e-06, + "loss": 0.8607, + "step": 17806 + }, + { + "epoch": 0.8881296758104739, + "grad_norm": 0.5343650808011401, + "learning_rate": 1.624593932864632e-06, + "loss": 0.8173, + "step": 17807 + }, + { + "epoch": 0.8881795511221945, + "grad_norm": 0.5931208761817114, + "learning_rate": 1.6231621830073318e-06, + "loss": 0.8727, + "step": 17808 + }, + { + "epoch": 0.8882294264339152, + "grad_norm": 0.9157631341152194, + "learning_rate": 1.6217310431581562e-06, + "loss": 0.8342, + "step": 17809 + }, + { + "epoch": 0.888279301745636, + "grad_norm": 0.6043481478264706, + "learning_rate": 1.6203005133544475e-06, + "loss": 0.8516, + "step": 17810 + }, + { + "epoch": 0.8883291770573566, + "grad_norm": 0.5891056221080979, + "learning_rate": 1.618870593633534e-06, + "loss": 0.863, + "step": 17811 + }, + { + "epoch": 0.8883790523690773, + "grad_norm": 0.639439829274291, + "learning_rate": 1.6174412840327224e-06, + "loss": 0.8858, + "step": 17812 + }, + { + "epoch": 0.888428927680798, + "grad_norm": 0.5131106075328713, + "learning_rate": 1.6160125845893215e-06, + "loss": 0.8651, + "step": 17813 + }, + { + "epoch": 0.8884788029925187, + "grad_norm": 0.6497618837184447, + "learning_rate": 1.614584495340607e-06, + "loss": 0.8736, + "step": 17814 + }, + { + "epoch": 0.8885286783042394, + "grad_norm": 0.5104530376439976, + "learning_rate": 1.6131570163238436e-06, + "loss": 0.8282, + "step": 17815 + }, + { + "epoch": 0.88857855361596, + "grad_norm": 0.620968361438328, + "learning_rate": 1.6117301475762765e-06, + "loss": 0.8456, + "step": 17816 + }, + { + "epoch": 0.8886284289276808, + "grad_norm": 0.5202880449936498, + "learning_rate": 1.610303889135148e-06, + "loss": 0.8526, + "step": 17817 + }, + { + "epoch": 0.8886783042394015, + "grad_norm": 0.9701298517334371, + "learning_rate": 1.608878241037673e-06, + "loss": 0.8356, + "step": 17818 + }, + { + "epoch": 0.8887281795511222, + "grad_norm": 0.5536332231074703, + "learning_rate": 1.6074532033210492e-06, + "loss": 0.8388, + "step": 17819 + }, + { + "epoch": 0.8887780548628429, + "grad_norm": 0.6320902440600088, + "learning_rate": 1.606028776022464e-06, + "loss": 0.8315, + "step": 17820 + }, + { + "epoch": 0.8888279301745636, + "grad_norm": 0.5922163053011927, + "learning_rate": 1.6046049591790902e-06, + "loss": 0.8559, + "step": 17821 + }, + { + "epoch": 0.8888778054862843, + "grad_norm": 1.0101714777318758, + "learning_rate": 1.6031817528280813e-06, + "loss": 0.8098, + "step": 17822 + }, + { + "epoch": 0.888927680798005, + "grad_norm": 0.5620687438157694, + "learning_rate": 1.6017591570065715e-06, + "loss": 0.8485, + "step": 17823 + }, + { + "epoch": 0.8889775561097257, + "grad_norm": 0.69263176659214, + "learning_rate": 1.6003371717516814e-06, + "loss": 0.8616, + "step": 17824 + }, + { + "epoch": 0.8890274314214464, + "grad_norm": 0.6238472938772841, + "learning_rate": 1.598915797100528e-06, + "loss": 0.8333, + "step": 17825 + }, + { + "epoch": 0.8890773067331671, + "grad_norm": 0.6620566443445917, + "learning_rate": 1.5974950330901879e-06, + "loss": 0.8298, + "step": 17826 + }, + { + "epoch": 0.8891271820448878, + "grad_norm": 0.6556878786110151, + "learning_rate": 1.5960748797577445e-06, + "loss": 0.849, + "step": 17827 + }, + { + "epoch": 0.8891770573566085, + "grad_norm": 0.7032692236378526, + "learning_rate": 1.5946553371402494e-06, + "loss": 0.8299, + "step": 17828 + }, + { + "epoch": 0.8892269326683292, + "grad_norm": 0.5898476239420434, + "learning_rate": 1.5932364052747534e-06, + "loss": 0.8574, + "step": 17829 + }, + { + "epoch": 0.8892768079800498, + "grad_norm": 0.5229727026683468, + "learning_rate": 1.5918180841982738e-06, + "loss": 0.8409, + "step": 17830 + }, + { + "epoch": 0.8893266832917706, + "grad_norm": 0.5592352903021918, + "learning_rate": 1.5904003739478257e-06, + "loss": 0.7789, + "step": 17831 + }, + { + "epoch": 0.8893765586034913, + "grad_norm": 0.6253281780080148, + "learning_rate": 1.5889832745604017e-06, + "loss": 0.8304, + "step": 17832 + }, + { + "epoch": 0.8894264339152119, + "grad_norm": 0.5752119830479715, + "learning_rate": 1.5875667860729887e-06, + "loss": 0.8037, + "step": 17833 + }, + { + "epoch": 0.8894763092269327, + "grad_norm": 0.5709180624645929, + "learning_rate": 1.5861509085225352e-06, + "loss": 0.8677, + "step": 17834 + }, + { + "epoch": 0.8895261845386534, + "grad_norm": 0.6130132787815721, + "learning_rate": 1.5847356419459974e-06, + "loss": 0.8458, + "step": 17835 + }, + { + "epoch": 0.889576059850374, + "grad_norm": 0.8662699943119327, + "learning_rate": 1.5833209863803045e-06, + "loss": 0.8577, + "step": 17836 + }, + { + "epoch": 0.8896259351620948, + "grad_norm": 0.5513123101875399, + "learning_rate": 1.5819069418623712e-06, + "loss": 0.8696, + "step": 17837 + }, + { + "epoch": 0.8896758104738155, + "grad_norm": 0.5915674933426145, + "learning_rate": 1.5804935084290928e-06, + "loss": 0.8387, + "step": 17838 + }, + { + "epoch": 0.8897256857855361, + "grad_norm": 0.8826590118107756, + "learning_rate": 1.579080686117357e-06, + "loss": 0.8562, + "step": 17839 + }, + { + "epoch": 0.8897755610972569, + "grad_norm": 0.8353312069963987, + "learning_rate": 1.577668474964028e-06, + "loss": 0.8467, + "step": 17840 + }, + { + "epoch": 0.8898254364089776, + "grad_norm": 0.5112971813468652, + "learning_rate": 1.5762568750059604e-06, + "loss": 0.8565, + "step": 17841 + }, + { + "epoch": 0.8898753117206982, + "grad_norm": 0.7021287906119794, + "learning_rate": 1.5748458862799797e-06, + "loss": 0.8411, + "step": 17842 + }, + { + "epoch": 0.889925187032419, + "grad_norm": 0.5843904307965626, + "learning_rate": 1.573435508822918e-06, + "loss": 0.8346, + "step": 17843 + }, + { + "epoch": 0.8899750623441397, + "grad_norm": 0.5379880466467438, + "learning_rate": 1.5720257426715706e-06, + "loss": 0.8682, + "step": 17844 + }, + { + "epoch": 0.8900249376558603, + "grad_norm": 0.5067766001373412, + "learning_rate": 1.5706165878627248e-06, + "loss": 0.7905, + "step": 17845 + }, + { + "epoch": 0.8900748129675811, + "grad_norm": 0.648288277501519, + "learning_rate": 1.5692080444331569e-06, + "loss": 0.8425, + "step": 17846 + }, + { + "epoch": 0.8901246882793017, + "grad_norm": 0.546793001206153, + "learning_rate": 1.5678001124196178e-06, + "loss": 0.8642, + "step": 17847 + }, + { + "epoch": 0.8901745635910224, + "grad_norm": 0.6112521342273476, + "learning_rate": 1.5663927918588477e-06, + "loss": 0.8651, + "step": 17848 + }, + { + "epoch": 0.8902244389027432, + "grad_norm": 0.6054121640245257, + "learning_rate": 1.5649860827875674e-06, + "loss": 0.8587, + "step": 17849 + }, + { + "epoch": 0.8902743142144638, + "grad_norm": 0.5699282436296037, + "learning_rate": 1.5635799852424914e-06, + "loss": 0.8573, + "step": 17850 + }, + { + "epoch": 0.8903241895261845, + "grad_norm": 0.6722257537139865, + "learning_rate": 1.5621744992603049e-06, + "loss": 0.8263, + "step": 17851 + }, + { + "epoch": 0.8903740648379053, + "grad_norm": 0.5915421328703576, + "learning_rate": 1.5607696248776865e-06, + "loss": 0.8476, + "step": 17852 + }, + { + "epoch": 0.8904239401496259, + "grad_norm": 0.7764591616965417, + "learning_rate": 1.5593653621312903e-06, + "loss": 0.8812, + "step": 17853 + }, + { + "epoch": 0.8904738154613466, + "grad_norm": 2.007466421778422, + "learning_rate": 1.5579617110577705e-06, + "loss": 0.8519, + "step": 17854 + }, + { + "epoch": 0.8905236907730674, + "grad_norm": 0.7035762641467647, + "learning_rate": 1.5565586716937447e-06, + "loss": 0.859, + "step": 17855 + }, + { + "epoch": 0.890573566084788, + "grad_norm": 0.7671408921579216, + "learning_rate": 1.555156244075831e-06, + "loss": 0.8499, + "step": 17856 + }, + { + "epoch": 0.8906234413965087, + "grad_norm": 0.6222016075187676, + "learning_rate": 1.5537544282406197e-06, + "loss": 0.7996, + "step": 17857 + }, + { + "epoch": 0.8906733167082295, + "grad_norm": 0.5972476539753145, + "learning_rate": 1.552353224224698e-06, + "loss": 0.851, + "step": 17858 + }, + { + "epoch": 0.8907231920199501, + "grad_norm": 0.6090424968952002, + "learning_rate": 1.5509526320646201e-06, + "loss": 0.8991, + "step": 17859 + }, + { + "epoch": 0.8907730673316708, + "grad_norm": 0.6247706118220592, + "learning_rate": 1.5495526517969428e-06, + "loss": 0.8141, + "step": 17860 + }, + { + "epoch": 0.8908229426433916, + "grad_norm": 0.712070129682775, + "learning_rate": 1.5481532834581897e-06, + "loss": 0.8393, + "step": 17861 + }, + { + "epoch": 0.8908728179551122, + "grad_norm": 0.615391273747879, + "learning_rate": 1.5467545270848872e-06, + "loss": 0.8452, + "step": 17862 + }, + { + "epoch": 0.8909226932668329, + "grad_norm": 0.6875570880624322, + "learning_rate": 1.5453563827135226e-06, + "loss": 0.8636, + "step": 17863 + }, + { + "epoch": 0.8909725685785537, + "grad_norm": 0.6372422517335063, + "learning_rate": 1.543958850380589e-06, + "loss": 0.8575, + "step": 17864 + }, + { + "epoch": 0.8910224438902743, + "grad_norm": 0.6428151415869259, + "learning_rate": 1.5425619301225492e-06, + "loss": 0.8132, + "step": 17865 + }, + { + "epoch": 0.891072319201995, + "grad_norm": 2.142535169123987, + "learning_rate": 1.5411656219758625e-06, + "loss": 0.7923, + "step": 17866 + }, + { + "epoch": 0.8911221945137157, + "grad_norm": 0.5613661308838157, + "learning_rate": 1.5397699259769554e-06, + "loss": 0.8484, + "step": 17867 + }, + { + "epoch": 0.8911720698254364, + "grad_norm": 0.8311878191593871, + "learning_rate": 1.5383748421622574e-06, + "loss": 0.8619, + "step": 17868 + }, + { + "epoch": 0.8912219451371571, + "grad_norm": 0.9737416450622809, + "learning_rate": 1.5369803705681613e-06, + "loss": 0.8928, + "step": 17869 + }, + { + "epoch": 0.8912718204488778, + "grad_norm": 0.7436263737211125, + "learning_rate": 1.5355865112310713e-06, + "loss": 0.8068, + "step": 17870 + }, + { + "epoch": 0.8913216957605985, + "grad_norm": 0.8203136055323471, + "learning_rate": 1.5341932641873391e-06, + "loss": 0.8337, + "step": 17871 + }, + { + "epoch": 0.8913715710723192, + "grad_norm": 0.5853758344063101, + "learning_rate": 1.5328006294733383e-06, + "loss": 0.8507, + "step": 17872 + }, + { + "epoch": 0.8914214463840399, + "grad_norm": 1.8677139369599303, + "learning_rate": 1.5314086071254035e-06, + "loss": 0.8339, + "step": 17873 + }, + { + "epoch": 0.8914713216957606, + "grad_norm": 0.6933379193307558, + "learning_rate": 1.530017197179856e-06, + "loss": 0.8438, + "step": 17874 + }, + { + "epoch": 0.8915211970074813, + "grad_norm": 0.6648624279323189, + "learning_rate": 1.5286263996730026e-06, + "loss": 0.8548, + "step": 17875 + }, + { + "epoch": 0.891571072319202, + "grad_norm": 0.5537331995966117, + "learning_rate": 1.5272362146411424e-06, + "loss": 0.8266, + "step": 17876 + }, + { + "epoch": 0.8916209476309227, + "grad_norm": 0.6628229967707228, + "learning_rate": 1.5258466421205485e-06, + "loss": 0.8521, + "step": 17877 + }, + { + "epoch": 0.8916708229426434, + "grad_norm": 0.5678159620430568, + "learning_rate": 1.5244576821474816e-06, + "loss": 0.8594, + "step": 17878 + }, + { + "epoch": 0.8917206982543641, + "grad_norm": 0.6484760388846639, + "learning_rate": 1.5230693347581815e-06, + "loss": 0.8355, + "step": 17879 + }, + { + "epoch": 0.8917705735660848, + "grad_norm": 0.49710789129484256, + "learning_rate": 1.5216815999888805e-06, + "loss": 0.8419, + "step": 17880 + }, + { + "epoch": 0.8918204488778055, + "grad_norm": 0.7178077294200362, + "learning_rate": 1.5202944778757944e-06, + "loss": 0.8272, + "step": 17881 + }, + { + "epoch": 0.8918703241895262, + "grad_norm": 0.5803486872759872, + "learning_rate": 1.518907968455116e-06, + "loss": 0.8744, + "step": 17882 + }, + { + "epoch": 0.8919201995012469, + "grad_norm": 0.6078301219892697, + "learning_rate": 1.5175220717630196e-06, + "loss": 0.8716, + "step": 17883 + }, + { + "epoch": 0.8919700748129675, + "grad_norm": 0.7007222895583873, + "learning_rate": 1.5161367878356786e-06, + "loss": 0.8659, + "step": 17884 + }, + { + "epoch": 0.8920199501246883, + "grad_norm": 0.8544349527831828, + "learning_rate": 1.5147521167092394e-06, + "loss": 0.7994, + "step": 17885 + }, + { + "epoch": 0.892069825436409, + "grad_norm": 2.266651787584734, + "learning_rate": 1.5133680584198312e-06, + "loss": 0.7975, + "step": 17886 + }, + { + "epoch": 0.8921197007481296, + "grad_norm": 0.5832977079613716, + "learning_rate": 1.5119846130035724e-06, + "loss": 0.8409, + "step": 17887 + }, + { + "epoch": 0.8921695760598504, + "grad_norm": 0.6827830566045465, + "learning_rate": 1.510601780496565e-06, + "loss": 0.8471, + "step": 17888 + }, + { + "epoch": 0.8922194513715711, + "grad_norm": 1.078342103131751, + "learning_rate": 1.509219560934891e-06, + "loss": 0.8871, + "step": 17889 + }, + { + "epoch": 0.8922693266832917, + "grad_norm": 0.6578223422211809, + "learning_rate": 1.5078379543546184e-06, + "loss": 0.8576, + "step": 17890 + }, + { + "epoch": 0.8923192019950125, + "grad_norm": 0.616614274805141, + "learning_rate": 1.5064569607918022e-06, + "loss": 0.8884, + "step": 17891 + }, + { + "epoch": 0.8923690773067332, + "grad_norm": 0.6466578192634435, + "learning_rate": 1.5050765802824774e-06, + "loss": 0.8257, + "step": 17892 + }, + { + "epoch": 0.8924189526184538, + "grad_norm": 0.5506485491511679, + "learning_rate": 1.5036968128626622e-06, + "loss": 0.849, + "step": 17893 + }, + { + "epoch": 0.8924688279301746, + "grad_norm": 0.8556226259436283, + "learning_rate": 1.5023176585683613e-06, + "loss": 0.8027, + "step": 17894 + }, + { + "epoch": 0.8925187032418953, + "grad_norm": 2.0088032904796767, + "learning_rate": 1.5009391174355736e-06, + "loss": 0.859, + "step": 17895 + }, + { + "epoch": 0.8925685785536159, + "grad_norm": 0.5728924762283233, + "learning_rate": 1.4995611895002538e-06, + "loss": 0.8674, + "step": 17896 + }, + { + "epoch": 0.8926184538653367, + "grad_norm": 0.7793291920890351, + "learning_rate": 1.4981838747983729e-06, + "loss": 0.8225, + "step": 17897 + }, + { + "epoch": 0.8926683291770574, + "grad_norm": 0.6757168739450984, + "learning_rate": 1.4968071733658607e-06, + "loss": 0.8362, + "step": 17898 + }, + { + "epoch": 0.892718204488778, + "grad_norm": 0.7935306600354266, + "learning_rate": 1.495431085238652e-06, + "loss": 0.8375, + "step": 17899 + }, + { + "epoch": 0.8927680798004988, + "grad_norm": 0.6074938037639849, + "learning_rate": 1.4940556104526465e-06, + "loss": 0.8617, + "step": 17900 + }, + { + "epoch": 0.8928179551122194, + "grad_norm": 0.6539864960694682, + "learning_rate": 1.49268074904374e-06, + "loss": 0.8383, + "step": 17901 + }, + { + "epoch": 0.8928678304239401, + "grad_norm": 1.4164901591308932, + "learning_rate": 1.4913065010478067e-06, + "loss": 0.8135, + "step": 17902 + }, + { + "epoch": 0.8929177057356609, + "grad_norm": 0.6473527718364472, + "learning_rate": 1.489932866500718e-06, + "loss": 0.8399, + "step": 17903 + }, + { + "epoch": 0.8929675810473815, + "grad_norm": 0.6913485277117576, + "learning_rate": 1.4885598454383003e-06, + "loss": 0.8429, + "step": 17904 + }, + { + "epoch": 0.8930174563591022, + "grad_norm": 0.816017398443915, + "learning_rate": 1.487187437896395e-06, + "loss": 0.8161, + "step": 17905 + }, + { + "epoch": 0.893067331670823, + "grad_norm": 0.5878106995829792, + "learning_rate": 1.4858156439108096e-06, + "loss": 0.8384, + "step": 17906 + }, + { + "epoch": 0.8931172069825436, + "grad_norm": 0.6175089410231892, + "learning_rate": 1.4844444635173427e-06, + "loss": 0.8459, + "step": 17907 + }, + { + "epoch": 0.8931670822942643, + "grad_norm": 0.6436806405673642, + "learning_rate": 1.4830738967517716e-06, + "loss": 0.8402, + "step": 17908 + }, + { + "epoch": 0.8932169576059851, + "grad_norm": 0.5701464231909114, + "learning_rate": 1.481703943649862e-06, + "loss": 0.8218, + "step": 17909 + }, + { + "epoch": 0.8932668329177057, + "grad_norm": 0.8264577909427762, + "learning_rate": 1.480334604247366e-06, + "loss": 0.8792, + "step": 17910 + }, + { + "epoch": 0.8933167082294264, + "grad_norm": 0.6875873795800926, + "learning_rate": 1.4789658785800103e-06, + "loss": 0.858, + "step": 17911 + }, + { + "epoch": 0.8933665835411472, + "grad_norm": 0.9772281506573934, + "learning_rate": 1.4775977666835111e-06, + "loss": 0.8358, + "step": 17912 + }, + { + "epoch": 0.8934164588528678, + "grad_norm": 0.6459810264521083, + "learning_rate": 1.476230268593573e-06, + "loss": 0.8563, + "step": 17913 + }, + { + "epoch": 0.8934663341645885, + "grad_norm": 1.2112284526812858, + "learning_rate": 1.4748633843458787e-06, + "loss": 0.8602, + "step": 17914 + }, + { + "epoch": 0.8935162094763093, + "grad_norm": 0.5879960811672323, + "learning_rate": 1.4734971139760966e-06, + "loss": 0.8439, + "step": 17915 + }, + { + "epoch": 0.8935660847880299, + "grad_norm": 0.5976381258232513, + "learning_rate": 1.4721314575198736e-06, + "loss": 0.8665, + "step": 17916 + }, + { + "epoch": 0.8936159600997506, + "grad_norm": 0.6743852374009504, + "learning_rate": 1.4707664150128558e-06, + "loss": 0.8082, + "step": 17917 + }, + { + "epoch": 0.8936658354114714, + "grad_norm": 0.5015901627997216, + "learning_rate": 1.4694019864906566e-06, + "loss": 0.8645, + "step": 17918 + }, + { + "epoch": 0.893715710723192, + "grad_norm": 0.7238882779954439, + "learning_rate": 1.4680381719888807e-06, + "loss": 0.8553, + "step": 17919 + }, + { + "epoch": 0.8937655860349127, + "grad_norm": 0.6166246433478172, + "learning_rate": 1.4666749715431167e-06, + "loss": 0.8257, + "step": 17920 + }, + { + "epoch": 0.8938154613466334, + "grad_norm": 0.6368053018829236, + "learning_rate": 1.4653123851889382e-06, + "loss": 0.863, + "step": 17921 + }, + { + "epoch": 0.8938653366583541, + "grad_norm": 0.6703843511894653, + "learning_rate": 1.4639504129619008e-06, + "loss": 0.8274, + "step": 17922 + }, + { + "epoch": 0.8939152119700748, + "grad_norm": 0.5606371586851656, + "learning_rate": 1.4625890548975425e-06, + "loss": 0.8615, + "step": 17923 + }, + { + "epoch": 0.8939650872817955, + "grad_norm": 0.5722790149489839, + "learning_rate": 1.4612283110313902e-06, + "loss": 0.8543, + "step": 17924 + }, + { + "epoch": 0.8940149625935162, + "grad_norm": 0.6168338252965854, + "learning_rate": 1.459868181398949e-06, + "loss": 0.8351, + "step": 17925 + }, + { + "epoch": 0.8940648379052369, + "grad_norm": 0.5463573164930803, + "learning_rate": 1.4585086660357155e-06, + "loss": 0.7791, + "step": 17926 + }, + { + "epoch": 0.8941147132169576, + "grad_norm": 0.6691108745893501, + "learning_rate": 1.4571497649771587e-06, + "loss": 0.8164, + "step": 17927 + }, + { + "epoch": 0.8941645885286783, + "grad_norm": 1.4765172368666177, + "learning_rate": 1.4557914782587445e-06, + "loss": 0.8367, + "step": 17928 + }, + { + "epoch": 0.894214463840399, + "grad_norm": 0.651977628677011, + "learning_rate": 1.454433805915914e-06, + "loss": 0.8294, + "step": 17929 + }, + { + "epoch": 0.8942643391521197, + "grad_norm": 0.5828933870697969, + "learning_rate": 1.4530767479840972e-06, + "loss": 0.8577, + "step": 17930 + }, + { + "epoch": 0.8943142144638404, + "grad_norm": 0.5461300441533783, + "learning_rate": 1.451720304498702e-06, + "loss": 0.8446, + "step": 17931 + }, + { + "epoch": 0.8943640897755611, + "grad_norm": 0.8763292487092925, + "learning_rate": 1.4503644754951334e-06, + "loss": 0.8076, + "step": 17932 + }, + { + "epoch": 0.8944139650872818, + "grad_norm": 0.6792947833297388, + "learning_rate": 1.4490092610087548e-06, + "loss": 0.8891, + "step": 17933 + }, + { + "epoch": 0.8944638403990025, + "grad_norm": 0.6360827571893344, + "learning_rate": 1.4476546610749458e-06, + "loss": 0.8149, + "step": 17934 + }, + { + "epoch": 0.8945137157107232, + "grad_norm": 0.569029781646033, + "learning_rate": 1.4463006757290426e-06, + "loss": 0.8253, + "step": 17935 + }, + { + "epoch": 0.8945635910224439, + "grad_norm": 0.7678217947400042, + "learning_rate": 1.4449473050063888e-06, + "loss": 0.8204, + "step": 17936 + }, + { + "epoch": 0.8946134663341646, + "grad_norm": 0.7139560257174602, + "learning_rate": 1.443594548942287e-06, + "loss": 0.8249, + "step": 17937 + }, + { + "epoch": 0.8946633416458852, + "grad_norm": 0.5800764171123332, + "learning_rate": 1.4422424075720476e-06, + "loss": 0.8815, + "step": 17938 + }, + { + "epoch": 0.894713216957606, + "grad_norm": 0.551628377775081, + "learning_rate": 1.4408908809309424e-06, + "loss": 0.8307, + "step": 17939 + }, + { + "epoch": 0.8947630922693267, + "grad_norm": 0.6310348001902716, + "learning_rate": 1.439539969054257e-06, + "loss": 0.8961, + "step": 17940 + }, + { + "epoch": 0.8948129675810473, + "grad_norm": 0.585222796418568, + "learning_rate": 1.438189671977222e-06, + "loss": 0.8227, + "step": 17941 + }, + { + "epoch": 0.8948628428927681, + "grad_norm": 0.5521470302650928, + "learning_rate": 1.4368399897350866e-06, + "loss": 0.8335, + "step": 17942 + }, + { + "epoch": 0.8949127182044888, + "grad_norm": 0.6462751751834244, + "learning_rate": 1.4354909223630669e-06, + "loss": 0.8588, + "step": 17943 + }, + { + "epoch": 0.8949625935162094, + "grad_norm": 0.5102884570427346, + "learning_rate": 1.4341424698963656e-06, + "loss": 0.8548, + "step": 17944 + }, + { + "epoch": 0.8950124688279302, + "grad_norm": 0.75537949207535, + "learning_rate": 1.4327946323701658e-06, + "loss": 0.8674, + "step": 17945 + }, + { + "epoch": 0.8950623441396509, + "grad_norm": 0.6365734325713968, + "learning_rate": 1.4314474098196474e-06, + "loss": 0.8575, + "step": 17946 + }, + { + "epoch": 0.8951122194513715, + "grad_norm": 0.5569269320893804, + "learning_rate": 1.43010080227996e-06, + "loss": 0.8523, + "step": 17947 + }, + { + "epoch": 0.8951620947630923, + "grad_norm": 0.6009871676911428, + "learning_rate": 1.4287548097862453e-06, + "loss": 0.8587, + "step": 17948 + }, + { + "epoch": 0.895211970074813, + "grad_norm": 0.5588459222835245, + "learning_rate": 1.4274094323736192e-06, + "loss": 0.8373, + "step": 17949 + }, + { + "epoch": 0.8952618453865336, + "grad_norm": 0.6839046061667263, + "learning_rate": 1.4260646700771985e-06, + "loss": 0.8702, + "step": 17950 + }, + { + "epoch": 0.8953117206982544, + "grad_norm": 3.3373181103869194, + "learning_rate": 1.4247205229320687e-06, + "loss": 0.8762, + "step": 17951 + }, + { + "epoch": 0.8953615960099751, + "grad_norm": 0.6917640606456927, + "learning_rate": 1.4233769909733075e-06, + "loss": 0.854, + "step": 17952 + }, + { + "epoch": 0.8954114713216957, + "grad_norm": 0.5925355427488709, + "learning_rate": 1.4220340742359673e-06, + "loss": 0.8432, + "step": 17953 + }, + { + "epoch": 0.8954613466334165, + "grad_norm": 0.5965167967232418, + "learning_rate": 1.4206917727551007e-06, + "loss": 0.825, + "step": 17954 + }, + { + "epoch": 0.8955112219451371, + "grad_norm": 0.5936729085877099, + "learning_rate": 1.4193500865657294e-06, + "loss": 0.8259, + "step": 17955 + }, + { + "epoch": 0.8955610972568578, + "grad_norm": 0.7487703733021769, + "learning_rate": 1.4180090157028648e-06, + "loss": 0.8489, + "step": 17956 + }, + { + "epoch": 0.8956109725685786, + "grad_norm": 0.6258916670065082, + "learning_rate": 1.4166685602014952e-06, + "loss": 0.8045, + "step": 17957 + }, + { + "epoch": 0.8956608478802992, + "grad_norm": 0.4853136574114545, + "learning_rate": 1.4153287200966092e-06, + "loss": 0.8114, + "step": 17958 + }, + { + "epoch": 0.8957107231920199, + "grad_norm": 0.9290218355030867, + "learning_rate": 1.4139894954231652e-06, + "loss": 0.8651, + "step": 17959 + }, + { + "epoch": 0.8957605985037407, + "grad_norm": 0.7523132856934955, + "learning_rate": 1.4126508862161075e-06, + "loss": 0.8163, + "step": 17960 + }, + { + "epoch": 0.8958104738154613, + "grad_norm": 0.6010281905242257, + "learning_rate": 1.411312892510372e-06, + "loss": 0.8454, + "step": 17961 + }, + { + "epoch": 0.895860349127182, + "grad_norm": 0.7590392084102008, + "learning_rate": 1.4099755143408693e-06, + "loss": 0.7799, + "step": 17962 + }, + { + "epoch": 0.8959102244389028, + "grad_norm": 0.5580999153489511, + "learning_rate": 1.408638751742497e-06, + "loss": 0.8371, + "step": 17963 + }, + { + "epoch": 0.8959600997506234, + "grad_norm": 0.5967529860799354, + "learning_rate": 1.4073026047501353e-06, + "loss": 0.8371, + "step": 17964 + }, + { + "epoch": 0.8960099750623441, + "grad_norm": 0.6557506534731335, + "learning_rate": 1.4059670733986618e-06, + "loss": 0.8625, + "step": 17965 + }, + { + "epoch": 0.8960598503740649, + "grad_norm": 0.9342836365300555, + "learning_rate": 1.40463215772291e-06, + "loss": 0.8672, + "step": 17966 + }, + { + "epoch": 0.8961097256857855, + "grad_norm": 0.6023512585582323, + "learning_rate": 1.4032978577577266e-06, + "loss": 0.8171, + "step": 17967 + }, + { + "epoch": 0.8961596009975062, + "grad_norm": 0.9040444087572252, + "learning_rate": 1.4019641735379202e-06, + "loss": 0.9113, + "step": 17968 + }, + { + "epoch": 0.896209476309227, + "grad_norm": 0.5796585530770766, + "learning_rate": 1.4006311050983046e-06, + "loss": 0.8348, + "step": 17969 + }, + { + "epoch": 0.8962593516209476, + "grad_norm": 0.5869467006588708, + "learning_rate": 1.3992986524736518e-06, + "loss": 0.8735, + "step": 17970 + }, + { + "epoch": 0.8963092269326683, + "grad_norm": 0.5530443955396573, + "learning_rate": 1.3979668156987425e-06, + "loss": 0.8755, + "step": 17971 + }, + { + "epoch": 0.896359102244389, + "grad_norm": 0.6133991954418577, + "learning_rate": 1.396635594808321e-06, + "loss": 0.8001, + "step": 17972 + }, + { + "epoch": 0.8964089775561097, + "grad_norm": 0.5812163406641816, + "learning_rate": 1.39530498983714e-06, + "loss": 0.8279, + "step": 17973 + }, + { + "epoch": 0.8964588528678304, + "grad_norm": 0.5643980897155257, + "learning_rate": 1.3939750008199025e-06, + "loss": 0.8523, + "step": 17974 + }, + { + "epoch": 0.8965087281795511, + "grad_norm": 0.5696319488404812, + "learning_rate": 1.392645627791328e-06, + "loss": 0.8624, + "step": 17975 + }, + { + "epoch": 0.8965586034912718, + "grad_norm": 0.5593977707701521, + "learning_rate": 1.3913168707860969e-06, + "loss": 0.8357, + "step": 17976 + }, + { + "epoch": 0.8966084788029925, + "grad_norm": 0.9849115321406731, + "learning_rate": 1.3899887298388897e-06, + "loss": 0.7946, + "step": 17977 + }, + { + "epoch": 0.8966583541147132, + "grad_norm": 1.4133291409427113, + "learning_rate": 1.388661204984354e-06, + "loss": 0.8094, + "step": 17978 + }, + { + "epoch": 0.8967082294264339, + "grad_norm": 0.5921018660803615, + "learning_rate": 1.3873342962571422e-06, + "loss": 0.8551, + "step": 17979 + }, + { + "epoch": 0.8967581047381546, + "grad_norm": 0.5143767533561499, + "learning_rate": 1.3860080036918744e-06, + "loss": 0.8297, + "step": 17980 + }, + { + "epoch": 0.8968079800498753, + "grad_norm": 0.6533524070105174, + "learning_rate": 1.3846823273231585e-06, + "loss": 0.8618, + "step": 17981 + }, + { + "epoch": 0.896857855361596, + "grad_norm": 3.3785528080533873, + "learning_rate": 1.3833572671855865e-06, + "loss": 0.8186, + "step": 17982 + }, + { + "epoch": 0.8969077306733167, + "grad_norm": 0.8065730129021063, + "learning_rate": 1.3820328233137391e-06, + "loss": 0.8002, + "step": 17983 + }, + { + "epoch": 0.8969576059850374, + "grad_norm": 0.5467034534255882, + "learning_rate": 1.3807089957421776e-06, + "loss": 0.8186, + "step": 17984 + }, + { + "epoch": 0.8970074812967581, + "grad_norm": 0.5125847695607909, + "learning_rate": 1.3793857845054408e-06, + "loss": 0.8329, + "step": 17985 + }, + { + "epoch": 0.8970573566084789, + "grad_norm": 0.6003832089781681, + "learning_rate": 1.3780631896380597e-06, + "loss": 0.8312, + "step": 17986 + }, + { + "epoch": 0.8971072319201995, + "grad_norm": 0.6439097795816116, + "learning_rate": 1.3767412111745508e-06, + "loss": 0.8601, + "step": 17987 + }, + { + "epoch": 0.8971571072319202, + "grad_norm": 0.6717616324251326, + "learning_rate": 1.3754198491494063e-06, + "loss": 0.8509, + "step": 17988 + }, + { + "epoch": 0.897206982543641, + "grad_norm": 0.7061169665021717, + "learning_rate": 1.3740991035971097e-06, + "loss": 0.8245, + "step": 17989 + }, + { + "epoch": 0.8972568578553616, + "grad_norm": 0.9732598060278267, + "learning_rate": 1.3727789745521191e-06, + "loss": 0.8913, + "step": 17990 + }, + { + "epoch": 0.8973067331670823, + "grad_norm": 0.5401281769232616, + "learning_rate": 1.3714594620488908e-06, + "loss": 0.8822, + "step": 17991 + }, + { + "epoch": 0.8973566084788029, + "grad_norm": 0.6111869576351623, + "learning_rate": 1.3701405661218525e-06, + "loss": 0.8388, + "step": 17992 + }, + { + "epoch": 0.8974064837905237, + "grad_norm": 0.6670904397816247, + "learning_rate": 1.3688222868054213e-06, + "loss": 0.838, + "step": 17993 + }, + { + "epoch": 0.8974563591022444, + "grad_norm": 1.135994666512315, + "learning_rate": 1.3675046241339918e-06, + "loss": 0.8871, + "step": 17994 + }, + { + "epoch": 0.897506234413965, + "grad_norm": 0.5417371749633345, + "learning_rate": 1.3661875781419587e-06, + "loss": 0.8836, + "step": 17995 + }, + { + "epoch": 0.8975561097256858, + "grad_norm": 0.6280582941339345, + "learning_rate": 1.3648711488636805e-06, + "loss": 0.8727, + "step": 17996 + }, + { + "epoch": 0.8976059850374065, + "grad_norm": 0.6930679794132695, + "learning_rate": 1.3635553363335162e-06, + "loss": 0.862, + "step": 17997 + }, + { + "epoch": 0.8976558603491271, + "grad_norm": 0.7437011424928226, + "learning_rate": 1.362240140585791e-06, + "loss": 0.8551, + "step": 17998 + }, + { + "epoch": 0.8977057356608479, + "grad_norm": 0.6514710160732086, + "learning_rate": 1.3609255616548327e-06, + "loss": 0.8147, + "step": 17999 + }, + { + "epoch": 0.8977556109725686, + "grad_norm": 0.607533723105821, + "learning_rate": 1.3596115995749447e-06, + "loss": 0.8301, + "step": 18000 + }, + { + "epoch": 0.8978054862842892, + "grad_norm": 0.5315358637025283, + "learning_rate": 1.358298254380408e-06, + "loss": 0.8468, + "step": 18001 + }, + { + "epoch": 0.89785536159601, + "grad_norm": 0.9210350558139146, + "learning_rate": 1.3569855261055031e-06, + "loss": 0.8162, + "step": 18002 + }, + { + "epoch": 0.8979052369077307, + "grad_norm": 0.7542685974403616, + "learning_rate": 1.3556734147844725e-06, + "loss": 0.8108, + "step": 18003 + }, + { + "epoch": 0.8979551122194513, + "grad_norm": 0.6764553438020082, + "learning_rate": 1.354361920451569e-06, + "loss": 0.8497, + "step": 18004 + }, + { + "epoch": 0.8980049875311721, + "grad_norm": 0.5191370522088452, + "learning_rate": 1.3530510431410015e-06, + "loss": 0.8607, + "step": 18005 + }, + { + "epoch": 0.8980548628428928, + "grad_norm": 0.6427845456539296, + "learning_rate": 1.3517407828869926e-06, + "loss": 0.8491, + "step": 18006 + }, + { + "epoch": 0.8981047381546134, + "grad_norm": 0.6436814293763153, + "learning_rate": 1.3504311397237179e-06, + "loss": 0.8695, + "step": 18007 + }, + { + "epoch": 0.8981546134663342, + "grad_norm": 0.6874164980751272, + "learning_rate": 1.3491221136853638e-06, + "loss": 0.8686, + "step": 18008 + }, + { + "epoch": 0.8982044887780548, + "grad_norm": 0.8065892849153694, + "learning_rate": 1.347813704806075e-06, + "loss": 0.8473, + "step": 18009 + }, + { + "epoch": 0.8982543640897755, + "grad_norm": 0.5637937517440824, + "learning_rate": 1.3465059131200137e-06, + "loss": 0.8991, + "step": 18010 + }, + { + "epoch": 0.8983042394014963, + "grad_norm": 0.6740090040884429, + "learning_rate": 1.3451987386612851e-06, + "loss": 0.8733, + "step": 18011 + }, + { + "epoch": 0.8983541147132169, + "grad_norm": 0.5833906534120833, + "learning_rate": 1.3438921814640153e-06, + "loss": 0.824, + "step": 18012 + }, + { + "epoch": 0.8984039900249376, + "grad_norm": 0.6428125699912054, + "learning_rate": 1.342586241562288e-06, + "loss": 0.8222, + "step": 18013 + }, + { + "epoch": 0.8984538653366584, + "grad_norm": 0.7084715693813521, + "learning_rate": 1.341280918990187e-06, + "loss": 0.8456, + "step": 18014 + }, + { + "epoch": 0.898503740648379, + "grad_norm": 0.5442887594972159, + "learning_rate": 1.3399762137817683e-06, + "loss": 0.8414, + "step": 18015 + }, + { + "epoch": 0.8985536159600997, + "grad_norm": 0.5545174006499881, + "learning_rate": 1.3386721259710856e-06, + "loss": 0.8492, + "step": 18016 + }, + { + "epoch": 0.8986034912718205, + "grad_norm": 0.9090247769013133, + "learning_rate": 1.337368655592164e-06, + "loss": 0.8198, + "step": 18017 + }, + { + "epoch": 0.8986533665835411, + "grad_norm": 0.6259203280813155, + "learning_rate": 1.3360658026790179e-06, + "loss": 0.8295, + "step": 18018 + }, + { + "epoch": 0.8987032418952619, + "grad_norm": 0.5536946524878023, + "learning_rate": 1.33476356726564e-06, + "loss": 0.7987, + "step": 18019 + }, + { + "epoch": 0.8987531172069826, + "grad_norm": 0.7009228240200429, + "learning_rate": 1.3334619493860195e-06, + "loss": 0.8155, + "step": 18020 + }, + { + "epoch": 0.8988029925187032, + "grad_norm": 1.0783170261558659, + "learning_rate": 1.3321609490741182e-06, + "loss": 0.8811, + "step": 18021 + }, + { + "epoch": 0.898852867830424, + "grad_norm": 0.5844711555514182, + "learning_rate": 1.330860566363884e-06, + "loss": 0.8392, + "step": 18022 + }, + { + "epoch": 0.8989027431421447, + "grad_norm": 0.6208486281268829, + "learning_rate": 1.3295608012892452e-06, + "loss": 0.8171, + "step": 18023 + }, + { + "epoch": 0.8989526184538653, + "grad_norm": 0.5579931173965225, + "learning_rate": 1.3282616538841302e-06, + "loss": 0.8472, + "step": 18024 + }, + { + "epoch": 0.899002493765586, + "grad_norm": 1.0515752859732619, + "learning_rate": 1.3269631241824314e-06, + "loss": 0.8401, + "step": 18025 + }, + { + "epoch": 0.8990523690773067, + "grad_norm": 0.6194804899807537, + "learning_rate": 1.3256652122180357e-06, + "loss": 0.8497, + "step": 18026 + }, + { + "epoch": 0.8991022443890274, + "grad_norm": 0.5362693913023528, + "learning_rate": 1.3243679180248075e-06, + "loss": 0.8286, + "step": 18027 + }, + { + "epoch": 0.8991521197007482, + "grad_norm": 0.6339828651130108, + "learning_rate": 1.3230712416366059e-06, + "loss": 0.8461, + "step": 18028 + }, + { + "epoch": 0.8992019950124688, + "grad_norm": 0.6295521206279437, + "learning_rate": 1.3217751830872648e-06, + "loss": 0.8307, + "step": 18029 + }, + { + "epoch": 0.8992518703241895, + "grad_norm": 0.5579212343926232, + "learning_rate": 1.3204797424106019e-06, + "loss": 0.8331, + "step": 18030 + }, + { + "epoch": 0.8993017456359103, + "grad_norm": 0.564738805084796, + "learning_rate": 1.3191849196404205e-06, + "loss": 0.8659, + "step": 18031 + }, + { + "epoch": 0.8993516209476309, + "grad_norm": 0.617108483747804, + "learning_rate": 1.317890714810513e-06, + "loss": 0.8722, + "step": 18032 + }, + { + "epoch": 0.8994014962593516, + "grad_norm": 0.5982353680027438, + "learning_rate": 1.3165971279546469e-06, + "loss": 0.8409, + "step": 18033 + }, + { + "epoch": 0.8994513715710724, + "grad_norm": 0.5766147425861643, + "learning_rate": 1.3153041591065812e-06, + "loss": 0.8115, + "step": 18034 + }, + { + "epoch": 0.899501246882793, + "grad_norm": 0.5333601885444059, + "learning_rate": 1.3140118083000503e-06, + "loss": 0.8511, + "step": 18035 + }, + { + "epoch": 0.8995511221945137, + "grad_norm": 0.6450227576482783, + "learning_rate": 1.3127200755687852e-06, + "loss": 0.8535, + "step": 18036 + }, + { + "epoch": 0.8996009975062345, + "grad_norm": 0.6171046308755314, + "learning_rate": 1.311428960946487e-06, + "loss": 0.8413, + "step": 18037 + }, + { + "epoch": 0.8996508728179551, + "grad_norm": 0.610593033061622, + "learning_rate": 1.3101384644668451e-06, + "loss": 0.8058, + "step": 18038 + }, + { + "epoch": 0.8997007481296758, + "grad_norm": 0.9790425980960724, + "learning_rate": 1.308848586163547e-06, + "loss": 0.8451, + "step": 18039 + }, + { + "epoch": 0.8997506234413966, + "grad_norm": 0.6831365284686236, + "learning_rate": 1.3075593260702345e-06, + "loss": 0.8408, + "step": 18040 + }, + { + "epoch": 0.8998004987531172, + "grad_norm": 0.5837096257942875, + "learning_rate": 1.3062706842205592e-06, + "loss": 0.8329, + "step": 18041 + }, + { + "epoch": 0.8998503740648379, + "grad_norm": 0.5358539245948026, + "learning_rate": 1.3049826606481435e-06, + "loss": 0.8275, + "step": 18042 + }, + { + "epoch": 0.8999002493765587, + "grad_norm": 0.6020071209446674, + "learning_rate": 1.303695255386611e-06, + "loss": 0.8455, + "step": 18043 + }, + { + "epoch": 0.8999501246882793, + "grad_norm": 0.7539313786419322, + "learning_rate": 1.3024084684695347e-06, + "loss": 0.8433, + "step": 18044 + }, + { + "epoch": 0.9, + "grad_norm": 0.6229870508560668, + "learning_rate": 1.30112229993051e-06, + "loss": 0.8239, + "step": 18045 + }, + { + "epoch": 0.9000498753117206, + "grad_norm": 1.7526075818063784, + "learning_rate": 1.2998367498030879e-06, + "loss": 0.8303, + "step": 18046 + }, + { + "epoch": 0.9000997506234414, + "grad_norm": 0.7598407719180278, + "learning_rate": 1.2985518181208245e-06, + "loss": 0.8473, + "step": 18047 + }, + { + "epoch": 0.9001496259351621, + "grad_norm": 0.613485362416641, + "learning_rate": 1.2972675049172407e-06, + "loss": 0.792, + "step": 18048 + }, + { + "epoch": 0.9001995012468827, + "grad_norm": 0.5565005019379128, + "learning_rate": 1.2959838102258536e-06, + "loss": 0.7995, + "step": 18049 + }, + { + "epoch": 0.9002493765586035, + "grad_norm": 0.5862065067660694, + "learning_rate": 1.294700734080162e-06, + "loss": 0.8403, + "step": 18050 + }, + { + "epoch": 0.9002992518703242, + "grad_norm": 0.7348622101641493, + "learning_rate": 1.2934182765136471e-06, + "loss": 0.8861, + "step": 18051 + }, + { + "epoch": 0.9003491271820449, + "grad_norm": 1.2244360469732805, + "learning_rate": 1.2921364375597656e-06, + "loss": 0.8406, + "step": 18052 + }, + { + "epoch": 0.9003990024937656, + "grad_norm": 0.582741840803264, + "learning_rate": 1.2908552172519795e-06, + "loss": 0.8255, + "step": 18053 + }, + { + "epoch": 0.9004488778054863, + "grad_norm": 0.6846701561344181, + "learning_rate": 1.2895746156237149e-06, + "loss": 0.8459, + "step": 18054 + }, + { + "epoch": 0.900498753117207, + "grad_norm": 0.8104855766623461, + "learning_rate": 1.2882946327083896e-06, + "loss": 0.8493, + "step": 18055 + }, + { + "epoch": 0.9005486284289277, + "grad_norm": 0.5746990639630148, + "learning_rate": 1.2870152685393989e-06, + "loss": 0.8217, + "step": 18056 + }, + { + "epoch": 0.9005985037406484, + "grad_norm": 0.553152525505355, + "learning_rate": 1.2857365231501383e-06, + "loss": 0.8203, + "step": 18057 + }, + { + "epoch": 0.900648379052369, + "grad_norm": 0.6843644720768631, + "learning_rate": 1.2844583965739675e-06, + "loss": 0.8298, + "step": 18058 + }, + { + "epoch": 0.9006982543640898, + "grad_norm": 0.5068863911247846, + "learning_rate": 1.2831808888442432e-06, + "loss": 0.7872, + "step": 18059 + }, + { + "epoch": 0.9007481296758105, + "grad_norm": 0.5676928956971838, + "learning_rate": 1.2819039999942938e-06, + "loss": 0.8074, + "step": 18060 + }, + { + "epoch": 0.9007980049875312, + "grad_norm": 0.529707658998281, + "learning_rate": 1.2806277300574543e-06, + "loss": 0.849, + "step": 18061 + }, + { + "epoch": 0.9008478802992519, + "grad_norm": 0.504998057634856, + "learning_rate": 1.2793520790670116e-06, + "loss": 0.8136, + "step": 18062 + }, + { + "epoch": 0.9008977556109725, + "grad_norm": 0.6692063643112708, + "learning_rate": 1.2780770470562614e-06, + "loss": 0.837, + "step": 18063 + }, + { + "epoch": 0.9009476309226933, + "grad_norm": 0.6844584704755412, + "learning_rate": 1.2768026340584716e-06, + "loss": 0.8696, + "step": 18064 + }, + { + "epoch": 0.900997506234414, + "grad_norm": 0.5215129718032175, + "learning_rate": 1.2755288401069044e-06, + "loss": 0.8414, + "step": 18065 + }, + { + "epoch": 0.9010473815461346, + "grad_norm": 0.723368466440035, + "learning_rate": 1.2742556652347942e-06, + "loss": 0.7952, + "step": 18066 + }, + { + "epoch": 0.9010972568578554, + "grad_norm": 0.6551837450217327, + "learning_rate": 1.2729831094753618e-06, + "loss": 0.8168, + "step": 18067 + }, + { + "epoch": 0.9011471321695761, + "grad_norm": 0.5871350241193144, + "learning_rate": 1.271711172861817e-06, + "loss": 0.8476, + "step": 18068 + }, + { + "epoch": 0.9011970074812967, + "grad_norm": 0.7663892318354883, + "learning_rate": 1.2704398554273495e-06, + "loss": 0.869, + "step": 18069 + }, + { + "epoch": 0.9012468827930175, + "grad_norm": 0.6461470762145688, + "learning_rate": 1.2691691572051356e-06, + "loss": 0.8318, + "step": 18070 + }, + { + "epoch": 0.9012967581047382, + "grad_norm": 0.6324135440764793, + "learning_rate": 1.2678990782283324e-06, + "loss": 0.8729, + "step": 18071 + }, + { + "epoch": 0.9013466334164588, + "grad_norm": 0.6835451216955917, + "learning_rate": 1.2666296185300825e-06, + "loss": 0.8494, + "step": 18072 + }, + { + "epoch": 0.9013965087281796, + "grad_norm": 0.6010551211479085, + "learning_rate": 1.265360778143504e-06, + "loss": 0.8727, + "step": 18073 + }, + { + "epoch": 0.9014463840399003, + "grad_norm": 0.5560428676745482, + "learning_rate": 1.2640925571017203e-06, + "loss": 0.835, + "step": 18074 + }, + { + "epoch": 0.9014962593516209, + "grad_norm": 0.617968919620909, + "learning_rate": 1.2628249554378135e-06, + "loss": 0.8069, + "step": 18075 + }, + { + "epoch": 0.9015461346633417, + "grad_norm": 0.5448257927810511, + "learning_rate": 1.2615579731848737e-06, + "loss": 0.8668, + "step": 18076 + }, + { + "epoch": 0.9015960099750624, + "grad_norm": 0.8047862669646224, + "learning_rate": 1.2602916103759493e-06, + "loss": 0.8574, + "step": 18077 + }, + { + "epoch": 0.901645885286783, + "grad_norm": 0.6120659384872227, + "learning_rate": 1.259025867044092e-06, + "loss": 0.8152, + "step": 18078 + }, + { + "epoch": 0.9016957605985038, + "grad_norm": 0.7825175038086672, + "learning_rate": 1.2577607432223276e-06, + "loss": 0.863, + "step": 18079 + }, + { + "epoch": 0.9017456359102244, + "grad_norm": 0.6721892618453508, + "learning_rate": 1.2564962389436774e-06, + "loss": 0.8276, + "step": 18080 + }, + { + "epoch": 0.9017955112219451, + "grad_norm": 0.5699906528143637, + "learning_rate": 1.255232354241123e-06, + "loss": 0.7851, + "step": 18081 + }, + { + "epoch": 0.9018453865336659, + "grad_norm": 0.5906598353034559, + "learning_rate": 1.2539690891476607e-06, + "loss": 0.8072, + "step": 18082 + }, + { + "epoch": 0.9018952618453865, + "grad_norm": 0.6680411053746864, + "learning_rate": 1.2527064436962444e-06, + "loss": 0.8791, + "step": 18083 + }, + { + "epoch": 0.9019451371571072, + "grad_norm": 0.6259102577161597, + "learning_rate": 1.251444417919828e-06, + "loss": 0.8605, + "step": 18084 + }, + { + "epoch": 0.901995012468828, + "grad_norm": 0.6698208535544817, + "learning_rate": 1.2501830118513359e-06, + "loss": 0.8306, + "step": 18085 + }, + { + "epoch": 0.9020448877805486, + "grad_norm": 0.6636391045500866, + "learning_rate": 1.2489222255236939e-06, + "loss": 0.8471, + "step": 18086 + }, + { + "epoch": 0.9020947630922693, + "grad_norm": 0.7085980051591747, + "learning_rate": 1.247662058969795e-06, + "loss": 0.8621, + "step": 18087 + }, + { + "epoch": 0.9021446384039901, + "grad_norm": 0.6308293136200364, + "learning_rate": 1.2464025122225247e-06, + "loss": 0.8077, + "step": 18088 + }, + { + "epoch": 0.9021945137157107, + "grad_norm": 0.5901216764179332, + "learning_rate": 1.245143585314748e-06, + "loss": 0.8437, + "step": 18089 + }, + { + "epoch": 0.9022443890274314, + "grad_norm": 0.5427391283417192, + "learning_rate": 1.2438852782793193e-06, + "loss": 0.8235, + "step": 18090 + }, + { + "epoch": 0.9022942643391522, + "grad_norm": 0.5474900174498144, + "learning_rate": 1.2426275911490759e-06, + "loss": 0.7925, + "step": 18091 + }, + { + "epoch": 0.9023441396508728, + "grad_norm": 0.6407483009653152, + "learning_rate": 1.2413705239568306e-06, + "loss": 0.8778, + "step": 18092 + }, + { + "epoch": 0.9023940149625935, + "grad_norm": 0.6289580469299629, + "learning_rate": 1.2401140767353853e-06, + "loss": 0.8413, + "step": 18093 + }, + { + "epoch": 0.9024438902743143, + "grad_norm": 0.609414885712998, + "learning_rate": 1.2388582495175326e-06, + "loss": 0.8836, + "step": 18094 + }, + { + "epoch": 0.9024937655860349, + "grad_norm": 0.5813473046720758, + "learning_rate": 1.2376030423360413e-06, + "loss": 0.8458, + "step": 18095 + }, + { + "epoch": 0.9025436408977556, + "grad_norm": 0.7050159879770873, + "learning_rate": 1.2363484552236654e-06, + "loss": 0.8707, + "step": 18096 + }, + { + "epoch": 0.9025935162094763, + "grad_norm": 0.638867048112812, + "learning_rate": 1.2350944882131344e-06, + "loss": 0.896, + "step": 18097 + }, + { + "epoch": 0.902643391521197, + "grad_norm": 0.5461546094363974, + "learning_rate": 1.2338411413371858e-06, + "loss": 0.8564, + "step": 18098 + }, + { + "epoch": 0.9026932668329177, + "grad_norm": 0.6177350880569837, + "learning_rate": 1.2325884146285077e-06, + "loss": 0.8754, + "step": 18099 + }, + { + "epoch": 0.9027431421446384, + "grad_norm": 0.9544850678776602, + "learning_rate": 1.2313363081198043e-06, + "loss": 0.8668, + "step": 18100 + }, + { + "epoch": 0.9027930174563591, + "grad_norm": 0.5713977184876758, + "learning_rate": 1.2300848218437354e-06, + "loss": 0.8594, + "step": 18101 + }, + { + "epoch": 0.9028428927680798, + "grad_norm": 0.6760959985963809, + "learning_rate": 1.2288339558329726e-06, + "loss": 0.7901, + "step": 18102 + }, + { + "epoch": 0.9028927680798005, + "grad_norm": 0.5444652168910475, + "learning_rate": 1.2275837101201447e-06, + "loss": 0.8006, + "step": 18103 + }, + { + "epoch": 0.9029426433915212, + "grad_norm": 0.8122929282904706, + "learning_rate": 1.2263340847378845e-06, + "loss": 0.837, + "step": 18104 + }, + { + "epoch": 0.9029925187032419, + "grad_norm": 0.6793640138741599, + "learning_rate": 1.2250850797187908e-06, + "loss": 0.8255, + "step": 18105 + }, + { + "epoch": 0.9030423940149626, + "grad_norm": 0.8755845792522349, + "learning_rate": 1.2238366950954677e-06, + "loss": 0.8327, + "step": 18106 + }, + { + "epoch": 0.9030922693266833, + "grad_norm": 0.6697496328321396, + "learning_rate": 1.222588930900484e-06, + "loss": 0.8144, + "step": 18107 + }, + { + "epoch": 0.903142144638404, + "grad_norm": 0.8957499144020733, + "learning_rate": 1.2213417871663995e-06, + "loss": 0.7869, + "step": 18108 + }, + { + "epoch": 0.9031920199501247, + "grad_norm": 0.6154108118053639, + "learning_rate": 1.2200952639257607e-06, + "loss": 0.8418, + "step": 18109 + }, + { + "epoch": 0.9032418952618454, + "grad_norm": 0.6447280169462867, + "learning_rate": 1.2188493612110885e-06, + "loss": 0.7993, + "step": 18110 + }, + { + "epoch": 0.9032917705735661, + "grad_norm": 0.7017000318846244, + "learning_rate": 1.2176040790549043e-06, + "loss": 0.8186, + "step": 18111 + }, + { + "epoch": 0.9033416458852868, + "grad_norm": 0.605193126334932, + "learning_rate": 1.2163594174896958e-06, + "loss": 0.8354, + "step": 18112 + }, + { + "epoch": 0.9033915211970075, + "grad_norm": 0.5761492767776126, + "learning_rate": 1.2151153765479455e-06, + "loss": 0.8646, + "step": 18113 + }, + { + "epoch": 0.9034413965087282, + "grad_norm": 0.5415821932299966, + "learning_rate": 1.213871956262111e-06, + "loss": 0.8083, + "step": 18114 + }, + { + "epoch": 0.9034912718204489, + "grad_norm": 0.7882400735773627, + "learning_rate": 1.2126291566646464e-06, + "loss": 0.8185, + "step": 18115 + }, + { + "epoch": 0.9035411471321696, + "grad_norm": 0.7918876056387245, + "learning_rate": 1.211386977787976e-06, + "loss": 0.8528, + "step": 18116 + }, + { + "epoch": 0.9035910224438902, + "grad_norm": 0.6806734174588157, + "learning_rate": 1.2101454196645212e-06, + "loss": 0.8005, + "step": 18117 + }, + { + "epoch": 0.903640897755611, + "grad_norm": 0.5891857734127519, + "learning_rate": 1.208904482326667e-06, + "loss": 0.8561, + "step": 18118 + }, + { + "epoch": 0.9036907730673317, + "grad_norm": 0.8389297980385693, + "learning_rate": 1.2076641658068072e-06, + "loss": 0.8156, + "step": 18119 + }, + { + "epoch": 0.9037406483790523, + "grad_norm": 1.0566089005308807, + "learning_rate": 1.2064244701373045e-06, + "loss": 0.8129, + "step": 18120 + }, + { + "epoch": 0.9037905236907731, + "grad_norm": 0.5827626101409532, + "learning_rate": 1.2051853953505054e-06, + "loss": 0.8209, + "step": 18121 + }, + { + "epoch": 0.9038403990024938, + "grad_norm": 0.780199252970769, + "learning_rate": 1.2039469414787396e-06, + "loss": 0.8814, + "step": 18122 + }, + { + "epoch": 0.9038902743142144, + "grad_norm": 0.5877272044778903, + "learning_rate": 1.202709108554334e-06, + "loss": 0.8342, + "step": 18123 + }, + { + "epoch": 0.9039401496259352, + "grad_norm": 0.5911774058944146, + "learning_rate": 1.2014718966095823e-06, + "loss": 0.8409, + "step": 18124 + }, + { + "epoch": 0.9039900249376559, + "grad_norm": 0.6817564206070863, + "learning_rate": 1.2002353056767724e-06, + "loss": 0.8304, + "step": 18125 + }, + { + "epoch": 0.9040399002493765, + "grad_norm": 1.094560625476864, + "learning_rate": 1.1989993357881674e-06, + "loss": 0.8495, + "step": 18126 + }, + { + "epoch": 0.9040897755610973, + "grad_norm": 0.6453040313004177, + "learning_rate": 1.1977639869760248e-06, + "loss": 0.856, + "step": 18127 + }, + { + "epoch": 0.904139650872818, + "grad_norm": 0.5355164795187937, + "learning_rate": 1.1965292592725774e-06, + "loss": 0.812, + "step": 18128 + }, + { + "epoch": 0.9041895261845386, + "grad_norm": 0.5377782101291456, + "learning_rate": 1.1952951527100493e-06, + "loss": 0.857, + "step": 18129 + }, + { + "epoch": 0.9042394014962594, + "grad_norm": 0.5716007067905159, + "learning_rate": 1.194061667320634e-06, + "loss": 0.8376, + "step": 18130 + }, + { + "epoch": 0.9042892768079801, + "grad_norm": 0.5416322311432846, + "learning_rate": 1.1928288031365337e-06, + "loss": 0.8426, + "step": 18131 + }, + { + "epoch": 0.9043391521197007, + "grad_norm": 0.728461127210804, + "learning_rate": 1.1915965601899033e-06, + "loss": 0.8661, + "step": 18132 + }, + { + "epoch": 0.9043890274314215, + "grad_norm": 0.6799248214191673, + "learning_rate": 1.1903649385129112e-06, + "loss": 0.8338, + "step": 18133 + }, + { + "epoch": 0.9044389027431421, + "grad_norm": 0.6686691494049254, + "learning_rate": 1.1891339381376848e-06, + "loss": 0.8543, + "step": 18134 + }, + { + "epoch": 0.9044887780548628, + "grad_norm": 0.6147720970265922, + "learning_rate": 1.1879035590963595e-06, + "loss": 0.8338, + "step": 18135 + }, + { + "epoch": 0.9045386533665836, + "grad_norm": 0.5783975226509647, + "learning_rate": 1.186673801421026e-06, + "loss": 0.8601, + "step": 18136 + }, + { + "epoch": 0.9045885286783042, + "grad_norm": 1.1626660131175526, + "learning_rate": 1.1854446651437866e-06, + "loss": 0.8475, + "step": 18137 + }, + { + "epoch": 0.9046384039900249, + "grad_norm": 0.7809446740752772, + "learning_rate": 1.1842161502967076e-06, + "loss": 0.8264, + "step": 18138 + }, + { + "epoch": 0.9046882793017457, + "grad_norm": 0.5744272703589786, + "learning_rate": 1.1829882569118516e-06, + "loss": 0.8783, + "step": 18139 + }, + { + "epoch": 0.9047381546134663, + "grad_norm": 0.7217451691248513, + "learning_rate": 1.1817609850212574e-06, + "loss": 0.8627, + "step": 18140 + }, + { + "epoch": 0.904788029925187, + "grad_norm": 0.6742200586041803, + "learning_rate": 1.1805343346569547e-06, + "loss": 0.8559, + "step": 18141 + }, + { + "epoch": 0.9048379052369078, + "grad_norm": 0.6403152703292674, + "learning_rate": 1.1793083058509429e-06, + "loss": 0.8628, + "step": 18142 + }, + { + "epoch": 0.9048877805486284, + "grad_norm": 0.5661090239554367, + "learning_rate": 1.1780828986352216e-06, + "loss": 0.8445, + "step": 18143 + }, + { + "epoch": 0.9049376558603491, + "grad_norm": 0.8141937820430991, + "learning_rate": 1.176858113041765e-06, + "loss": 0.86, + "step": 18144 + }, + { + "epoch": 0.9049875311720699, + "grad_norm": 0.7103079594152867, + "learning_rate": 1.1756339491025363e-06, + "loss": 0.8409, + "step": 18145 + }, + { + "epoch": 0.9050374064837905, + "grad_norm": 0.7623602003144437, + "learning_rate": 1.174410406849477e-06, + "loss": 0.8629, + "step": 18146 + }, + { + "epoch": 0.9050872817955112, + "grad_norm": 0.5616911095011095, + "learning_rate": 1.1731874863145143e-06, + "loss": 0.8689, + "step": 18147 + }, + { + "epoch": 0.905137157107232, + "grad_norm": 0.856552832002009, + "learning_rate": 1.1719651875295612e-06, + "loss": 0.8698, + "step": 18148 + }, + { + "epoch": 0.9051870324189526, + "grad_norm": 0.7349946177822376, + "learning_rate": 1.1707435105265118e-06, + "loss": 0.8675, + "step": 18149 + }, + { + "epoch": 0.9052369077306733, + "grad_norm": 0.634933810247329, + "learning_rate": 1.1695224553372464e-06, + "loss": 0.7984, + "step": 18150 + }, + { + "epoch": 0.905286783042394, + "grad_norm": 0.5736240537617227, + "learning_rate": 1.1683020219936253e-06, + "loss": 0.8233, + "step": 18151 + }, + { + "epoch": 0.9053366583541147, + "grad_norm": 0.598299420753234, + "learning_rate": 1.1670822105275008e-06, + "loss": 0.8296, + "step": 18152 + }, + { + "epoch": 0.9053865336658354, + "grad_norm": 0.6836180063110074, + "learning_rate": 1.165863020970695e-06, + "loss": 0.8562, + "step": 18153 + }, + { + "epoch": 0.9054364089775561, + "grad_norm": 0.6976020098797329, + "learning_rate": 1.164644453355035e-06, + "loss": 0.8923, + "step": 18154 + }, + { + "epoch": 0.9054862842892768, + "grad_norm": 0.5820243138512716, + "learning_rate": 1.1634265077123035e-06, + "loss": 0.8335, + "step": 18155 + }, + { + "epoch": 0.9055361596009975, + "grad_norm": 0.6836468977060052, + "learning_rate": 1.1622091840742921e-06, + "loss": 0.8308, + "step": 18156 + }, + { + "epoch": 0.9055860349127182, + "grad_norm": 0.5922913367669222, + "learning_rate": 1.160992482472764e-06, + "loss": 0.8093, + "step": 18157 + }, + { + "epoch": 0.9056359102244389, + "grad_norm": 0.6738573107451424, + "learning_rate": 1.1597764029394693e-06, + "loss": 0.866, + "step": 18158 + }, + { + "epoch": 0.9056857855361596, + "grad_norm": 0.7434319681232238, + "learning_rate": 1.1585609455061347e-06, + "loss": 0.8591, + "step": 18159 + }, + { + "epoch": 0.9057356608478803, + "grad_norm": 0.7685721788283294, + "learning_rate": 1.1573461102044853e-06, + "loss": 0.8568, + "step": 18160 + }, + { + "epoch": 0.905785536159601, + "grad_norm": 0.5426754679910873, + "learning_rate": 1.1561318970662204e-06, + "loss": 0.852, + "step": 18161 + }, + { + "epoch": 0.9058354114713217, + "grad_norm": 0.7362458157270027, + "learning_rate": 1.1549183061230206e-06, + "loss": 0.8326, + "step": 18162 + }, + { + "epoch": 0.9058852867830424, + "grad_norm": 0.523117041540665, + "learning_rate": 1.153705337406555e-06, + "loss": 0.8451, + "step": 18163 + }, + { + "epoch": 0.9059351620947631, + "grad_norm": 0.5678434186274464, + "learning_rate": 1.1524929909484782e-06, + "loss": 0.8082, + "step": 18164 + }, + { + "epoch": 0.9059850374064838, + "grad_norm": 0.557422118740629, + "learning_rate": 1.151281266780424e-06, + "loss": 0.8359, + "step": 18165 + }, + { + "epoch": 0.9060349127182045, + "grad_norm": 0.5151916249051612, + "learning_rate": 1.1500701649340111e-06, + "loss": 0.8057, + "step": 18166 + }, + { + "epoch": 0.9060847880299252, + "grad_norm": 0.9185725497179399, + "learning_rate": 1.1488596854408424e-06, + "loss": 0.8069, + "step": 18167 + }, + { + "epoch": 0.9061346633416459, + "grad_norm": 0.7759834770905382, + "learning_rate": 1.1476498283325116e-06, + "loss": 0.7935, + "step": 18168 + }, + { + "epoch": 0.9061845386533666, + "grad_norm": 0.5169568144051698, + "learning_rate": 1.146440593640577e-06, + "loss": 0.8034, + "step": 18169 + }, + { + "epoch": 0.9062344139650873, + "grad_norm": 0.5468531439561358, + "learning_rate": 1.1452319813966024e-06, + "loss": 0.8329, + "step": 18170 + }, + { + "epoch": 0.9062842892768079, + "grad_norm": 0.6207347195173484, + "learning_rate": 1.1440239916321204e-06, + "loss": 0.8398, + "step": 18171 + }, + { + "epoch": 0.9063341645885287, + "grad_norm": 0.5863500800446653, + "learning_rate": 1.1428166243786619e-06, + "loss": 0.806, + "step": 18172 + }, + { + "epoch": 0.9063840399002494, + "grad_norm": 1.1613967662216258, + "learning_rate": 1.1416098796677183e-06, + "loss": 0.7964, + "step": 18173 + }, + { + "epoch": 0.90643391521197, + "grad_norm": 0.522555898962179, + "learning_rate": 1.1404037575307918e-06, + "loss": 0.816, + "step": 18174 + }, + { + "epoch": 0.9064837905236908, + "grad_norm": 0.7480388361390412, + "learning_rate": 1.1391982579993493e-06, + "loss": 0.8638, + "step": 18175 + }, + { + "epoch": 0.9065336658354115, + "grad_norm": 0.7309724020990369, + "learning_rate": 1.1379933811048544e-06, + "loss": 0.8091, + "step": 18176 + }, + { + "epoch": 0.9065835411471321, + "grad_norm": 0.6341734782083723, + "learning_rate": 1.1367891268787372e-06, + "loss": 0.8402, + "step": 18177 + }, + { + "epoch": 0.9066334164588529, + "grad_norm": 0.8321614255478642, + "learning_rate": 1.1355854953524313e-06, + "loss": 0.8598, + "step": 18178 + }, + { + "epoch": 0.9066832917705736, + "grad_norm": 1.0423845919646257, + "learning_rate": 1.134382486557342e-06, + "loss": 0.8483, + "step": 18179 + }, + { + "epoch": 0.9067331670822942, + "grad_norm": 0.4924073032514731, + "learning_rate": 1.1331801005248583e-06, + "loss": 0.8501, + "step": 18180 + }, + { + "epoch": 0.906783042394015, + "grad_norm": 0.5967227655137128, + "learning_rate": 1.1319783372863602e-06, + "loss": 0.774, + "step": 18181 + }, + { + "epoch": 0.9068329177057357, + "grad_norm": 0.6291399635413518, + "learning_rate": 1.1307771968732062e-06, + "loss": 0.8295, + "step": 18182 + }, + { + "epoch": 0.9068827930174563, + "grad_norm": 0.8274829640505934, + "learning_rate": 1.1295766793167378e-06, + "loss": 0.814, + "step": 18183 + }, + { + "epoch": 0.9069326683291771, + "grad_norm": 0.7250997856702921, + "learning_rate": 1.1283767846482802e-06, + "loss": 0.8088, + "step": 18184 + }, + { + "epoch": 0.9069825436408978, + "grad_norm": 0.6223694137517567, + "learning_rate": 1.1271775128991495e-06, + "loss": 0.8642, + "step": 18185 + }, + { + "epoch": 0.9070324189526184, + "grad_norm": 1.1845031400031485, + "learning_rate": 1.125978864100638e-06, + "loss": 0.8379, + "step": 18186 + }, + { + "epoch": 0.9070822942643392, + "grad_norm": 0.4862346934995368, + "learning_rate": 1.1247808382840258e-06, + "loss": 0.7969, + "step": 18187 + }, + { + "epoch": 0.9071321695760598, + "grad_norm": 0.6899156691133413, + "learning_rate": 1.1235834354805657e-06, + "loss": 0.8661, + "step": 18188 + }, + { + "epoch": 0.9071820448877805, + "grad_norm": 0.5819079753643067, + "learning_rate": 1.1223866557215162e-06, + "loss": 0.7994, + "step": 18189 + }, + { + "epoch": 0.9072319201995013, + "grad_norm": 0.502306113406716, + "learning_rate": 1.1211904990380967e-06, + "loss": 0.8272, + "step": 18190 + }, + { + "epoch": 0.9072817955112219, + "grad_norm": 0.6117380408134465, + "learning_rate": 1.1199949654615293e-06, + "loss": 0.8704, + "step": 18191 + }, + { + "epoch": 0.9073316708229426, + "grad_norm": 0.5465364431146195, + "learning_rate": 1.1188000550230004e-06, + "loss": 0.8036, + "step": 18192 + }, + { + "epoch": 0.9073815461346634, + "grad_norm": 0.5616976732763127, + "learning_rate": 1.1176057677536988e-06, + "loss": 0.8695, + "step": 18193 + }, + { + "epoch": 0.907431421446384, + "grad_norm": 0.9248372794463097, + "learning_rate": 1.1164121036847857e-06, + "loss": 0.8135, + "step": 18194 + }, + { + "epoch": 0.9074812967581047, + "grad_norm": 0.6118993733614537, + "learning_rate": 1.1152190628474112e-06, + "loss": 0.7871, + "step": 18195 + }, + { + "epoch": 0.9075311720698255, + "grad_norm": 0.7079075300425577, + "learning_rate": 1.1140266452727034e-06, + "loss": 0.8438, + "step": 18196 + }, + { + "epoch": 0.9075810473815461, + "grad_norm": 0.5669286890465418, + "learning_rate": 1.112834850991784e-06, + "loss": 0.8768, + "step": 18197 + }, + { + "epoch": 0.9076309226932668, + "grad_norm": 0.6061080237822518, + "learning_rate": 1.1116436800357456e-06, + "loss": 0.8451, + "step": 18198 + }, + { + "epoch": 0.9076807980049876, + "grad_norm": 0.5658037650803203, + "learning_rate": 1.1104531324356765e-06, + "loss": 0.8583, + "step": 18199 + }, + { + "epoch": 0.9077306733167082, + "grad_norm": 0.6129478135728361, + "learning_rate": 1.1092632082226384e-06, + "loss": 0.8298, + "step": 18200 + }, + { + "epoch": 0.9077805486284289, + "grad_norm": 0.728827591522384, + "learning_rate": 1.108073907427687e-06, + "loss": 0.8545, + "step": 18201 + }, + { + "epoch": 0.9078304239401497, + "grad_norm": 0.6234829042054778, + "learning_rate": 1.1068852300818555e-06, + "loss": 0.8222, + "step": 18202 + }, + { + "epoch": 0.9078802992518703, + "grad_norm": 0.8227336385073359, + "learning_rate": 1.1056971762161583e-06, + "loss": 0.8494, + "step": 18203 + }, + { + "epoch": 0.907930174563591, + "grad_norm": 0.49427560731364656, + "learning_rate": 1.1045097458615984e-06, + "loss": 0.8107, + "step": 18204 + }, + { + "epoch": 0.9079800498753117, + "grad_norm": 0.562040565577555, + "learning_rate": 1.10332293904917e-06, + "loss": 0.8796, + "step": 18205 + }, + { + "epoch": 0.9080299251870324, + "grad_norm": 0.6870369852247823, + "learning_rate": 1.1021367558098266e-06, + "loss": 0.8657, + "step": 18206 + }, + { + "epoch": 0.9080798004987531, + "grad_norm": 0.5564281551541076, + "learning_rate": 1.1009511961745323e-06, + "loss": 0.8328, + "step": 18207 + }, + { + "epoch": 0.9081296758104738, + "grad_norm": 0.5477779859314592, + "learning_rate": 1.0997662601742176e-06, + "loss": 0.8395, + "step": 18208 + }, + { + "epoch": 0.9081795511221945, + "grad_norm": 1.1253414968607809, + "learning_rate": 1.0985819478398135e-06, + "loss": 0.8628, + "step": 18209 + }, + { + "epoch": 0.9082294264339152, + "grad_norm": 0.5435571771225093, + "learning_rate": 1.0973982592022092e-06, + "loss": 0.8187, + "step": 18210 + }, + { + "epoch": 0.9082793017456359, + "grad_norm": 0.6972013625543121, + "learning_rate": 1.0962151942923022e-06, + "loss": 0.8551, + "step": 18211 + }, + { + "epoch": 0.9083291770573566, + "grad_norm": 0.6586702729731084, + "learning_rate": 1.0950327531409594e-06, + "loss": 0.8401, + "step": 18212 + }, + { + "epoch": 0.9083790523690773, + "grad_norm": 0.5477719689457666, + "learning_rate": 1.093850935779045e-06, + "loss": 0.8515, + "step": 18213 + }, + { + "epoch": 0.908428927680798, + "grad_norm": 0.6369603964404756, + "learning_rate": 1.0926697422373872e-06, + "loss": 0.8749, + "step": 18214 + }, + { + "epoch": 0.9084788029925187, + "grad_norm": 0.5784722556812721, + "learning_rate": 1.0914891725468141e-06, + "loss": 0.8138, + "step": 18215 + }, + { + "epoch": 0.9085286783042394, + "grad_norm": 0.6733213864499056, + "learning_rate": 1.0903092267381288e-06, + "loss": 0.9101, + "step": 18216 + }, + { + "epoch": 0.9085785536159601, + "grad_norm": 0.5788124840264482, + "learning_rate": 1.0891299048421233e-06, + "loss": 0.8323, + "step": 18217 + }, + { + "epoch": 0.9086284289276808, + "grad_norm": 0.6132957807517274, + "learning_rate": 1.0879512068895758e-06, + "loss": 0.8198, + "step": 18218 + }, + { + "epoch": 0.9086783042394015, + "grad_norm": 0.7653318987412197, + "learning_rate": 1.0867731329112368e-06, + "loss": 0.8253, + "step": 18219 + }, + { + "epoch": 0.9087281795511222, + "grad_norm": 1.199888195740224, + "learning_rate": 1.0855956829378538e-06, + "loss": 0.8238, + "step": 18220 + }, + { + "epoch": 0.9087780548628429, + "grad_norm": 1.6067476165957515, + "learning_rate": 1.084418857000144e-06, + "loss": 0.8638, + "step": 18221 + }, + { + "epoch": 0.9088279301745635, + "grad_norm": 0.6182770703521968, + "learning_rate": 1.0832426551288243e-06, + "loss": 0.8414, + "step": 18222 + }, + { + "epoch": 0.9088778054862843, + "grad_norm": 0.6294977494935156, + "learning_rate": 1.0820670773545844e-06, + "loss": 0.8857, + "step": 18223 + }, + { + "epoch": 0.908927680798005, + "grad_norm": 0.5957964405841498, + "learning_rate": 1.0808921237080994e-06, + "loss": 0.8237, + "step": 18224 + }, + { + "epoch": 0.9089775561097256, + "grad_norm": 0.7240176990380365, + "learning_rate": 1.0797177942200255e-06, + "loss": 0.9381, + "step": 18225 + }, + { + "epoch": 0.9090274314214464, + "grad_norm": 0.5830650851774482, + "learning_rate": 1.0785440889210157e-06, + "loss": 0.8135, + "step": 18226 + }, + { + "epoch": 0.9090773067331671, + "grad_norm": 0.7146931410489957, + "learning_rate": 1.0773710078416905e-06, + "loss": 0.8432, + "step": 18227 + }, + { + "epoch": 0.9091271820448877, + "grad_norm": 0.5557786637359201, + "learning_rate": 1.076198551012661e-06, + "loss": 0.8537, + "step": 18228 + }, + { + "epoch": 0.9091770573566085, + "grad_norm": 0.6432975086918012, + "learning_rate": 1.0750267184645224e-06, + "loss": 0.8332, + "step": 18229 + }, + { + "epoch": 0.9092269326683292, + "grad_norm": 0.63340858369433, + "learning_rate": 1.0738555102278586e-06, + "loss": 0.848, + "step": 18230 + }, + { + "epoch": 0.9092768079800498, + "grad_norm": 0.5497914872939587, + "learning_rate": 1.0726849263332256e-06, + "loss": 0.8256, + "step": 18231 + }, + { + "epoch": 0.9093266832917706, + "grad_norm": 0.7176724964953929, + "learning_rate": 1.0715149668111712e-06, + "loss": 0.8655, + "step": 18232 + }, + { + "epoch": 0.9093765586034913, + "grad_norm": 0.7471888741341699, + "learning_rate": 1.0703456316922211e-06, + "loss": 0.8308, + "step": 18233 + }, + { + "epoch": 0.9094264339152119, + "grad_norm": 0.5986418397785941, + "learning_rate": 1.0691769210068952e-06, + "loss": 0.8672, + "step": 18234 + }, + { + "epoch": 0.9094763092269327, + "grad_norm": 0.5690256792598366, + "learning_rate": 1.0680088347856888e-06, + "loss": 0.8484, + "step": 18235 + }, + { + "epoch": 0.9095261845386534, + "grad_norm": 0.5931192801933849, + "learning_rate": 1.0668413730590799e-06, + "loss": 0.8518, + "step": 18236 + }, + { + "epoch": 0.909576059850374, + "grad_norm": 0.6514778705453724, + "learning_rate": 1.0656745358575303e-06, + "loss": 0.8308, + "step": 18237 + }, + { + "epoch": 0.9096259351620948, + "grad_norm": 0.5884007303097775, + "learning_rate": 1.0645083232115022e-06, + "loss": 0.9072, + "step": 18238 + }, + { + "epoch": 0.9096758104738155, + "grad_norm": 0.5521969912679797, + "learning_rate": 1.0633427351514069e-06, + "loss": 0.8225, + "step": 18239 + }, + { + "epoch": 0.9097256857855361, + "grad_norm": 0.544133655593012, + "learning_rate": 1.0621777717076758e-06, + "loss": 0.8042, + "step": 18240 + }, + { + "epoch": 0.9097755610972569, + "grad_norm": 0.6677694917682789, + "learning_rate": 1.0610134329106986e-06, + "loss": 0.8405, + "step": 18241 + }, + { + "epoch": 0.9098254364089775, + "grad_norm": 0.516912428228673, + "learning_rate": 1.0598497187908702e-06, + "loss": 0.8526, + "step": 18242 + }, + { + "epoch": 0.9098753117206982, + "grad_norm": 1.4519097924478592, + "learning_rate": 1.0586866293785386e-06, + "loss": 0.8341, + "step": 18243 + }, + { + "epoch": 0.909925187032419, + "grad_norm": 1.3518584680481491, + "learning_rate": 1.0575241647040712e-06, + "loss": 0.833, + "step": 18244 + }, + { + "epoch": 0.9099750623441396, + "grad_norm": 0.6464634036428133, + "learning_rate": 1.056362324797791e-06, + "loss": 0.8507, + "step": 18245 + }, + { + "epoch": 0.9100249376558603, + "grad_norm": 0.6246240270891367, + "learning_rate": 1.0552011096900294e-06, + "loss": 0.8299, + "step": 18246 + }, + { + "epoch": 0.9100748129675811, + "grad_norm": 0.6146201803101589, + "learning_rate": 1.0540405194110703e-06, + "loss": 0.8841, + "step": 18247 + }, + { + "epoch": 0.9101246882793017, + "grad_norm": 0.7341461314652549, + "learning_rate": 1.052880553991209e-06, + "loss": 0.8675, + "step": 18248 + }, + { + "epoch": 0.9101745635910224, + "grad_norm": 0.6019038240349769, + "learning_rate": 1.0517212134607129e-06, + "loss": 0.8828, + "step": 18249 + }, + { + "epoch": 0.9102244389027432, + "grad_norm": 0.5800753145048604, + "learning_rate": 1.050562497849833e-06, + "loss": 0.8226, + "step": 18250 + }, + { + "epoch": 0.9102743142144638, + "grad_norm": 0.6376710997162294, + "learning_rate": 1.049404407188806e-06, + "loss": 0.8293, + "step": 18251 + }, + { + "epoch": 0.9103241895261845, + "grad_norm": 0.5552282898796644, + "learning_rate": 1.0482469415078523e-06, + "loss": 0.8337, + "step": 18252 + }, + { + "epoch": 0.9103740648379053, + "grad_norm": 0.5427550911685219, + "learning_rate": 1.0470901008371781e-06, + "loss": 0.8135, + "step": 18253 + }, + { + "epoch": 0.9104239401496259, + "grad_norm": 0.748078278456559, + "learning_rate": 1.0459338852069621e-06, + "loss": 0.8415, + "step": 18254 + }, + { + "epoch": 0.9104738154613466, + "grad_norm": 0.6004478718710602, + "learning_rate": 1.044778294647386e-06, + "loss": 0.8406, + "step": 18255 + }, + { + "epoch": 0.9105236907730674, + "grad_norm": 0.5431694607308771, + "learning_rate": 1.0436233291885977e-06, + "loss": 0.849, + "step": 18256 + }, + { + "epoch": 0.910573566084788, + "grad_norm": 0.6429162316135226, + "learning_rate": 1.0424689888607398e-06, + "loss": 0.8168, + "step": 18257 + }, + { + "epoch": 0.9106234413965087, + "grad_norm": 0.6001582556949167, + "learning_rate": 1.041315273693924e-06, + "loss": 0.8931, + "step": 18258 + }, + { + "epoch": 0.9106733167082294, + "grad_norm": 0.6484509079461981, + "learning_rate": 1.0401621837182712e-06, + "loss": 0.8565, + "step": 18259 + }, + { + "epoch": 0.9107231920199501, + "grad_norm": 0.5747960643219259, + "learning_rate": 1.0390097189638598e-06, + "loss": 0.879, + "step": 18260 + }, + { + "epoch": 0.9107730673316708, + "grad_norm": 0.8804869704820432, + "learning_rate": 1.0378578794607686e-06, + "loss": 0.84, + "step": 18261 + }, + { + "epoch": 0.9108229426433915, + "grad_norm": 0.8758828220898982, + "learning_rate": 1.0367066652390483e-06, + "loss": 0.8877, + "step": 18262 + }, + { + "epoch": 0.9108728179551122, + "grad_norm": 0.8117874495430369, + "learning_rate": 1.0355560763287447e-06, + "loss": 0.8378, + "step": 18263 + }, + { + "epoch": 0.910922693266833, + "grad_norm": 1.0357676548175367, + "learning_rate": 1.0344061127598836e-06, + "loss": 0.8197, + "step": 18264 + }, + { + "epoch": 0.9109725685785536, + "grad_norm": 0.6107016809347227, + "learning_rate": 1.0332567745624661e-06, + "loss": 0.8808, + "step": 18265 + }, + { + "epoch": 0.9110224438902743, + "grad_norm": 0.626415709763682, + "learning_rate": 1.0321080617664847e-06, + "loss": 0.8551, + "step": 18266 + }, + { + "epoch": 0.911072319201995, + "grad_norm": 0.5739873696169586, + "learning_rate": 1.030959974401921e-06, + "loss": 0.8275, + "step": 18267 + }, + { + "epoch": 0.9111221945137157, + "grad_norm": 0.6346333965051758, + "learning_rate": 1.029812512498729e-06, + "loss": 0.8098, + "step": 18268 + }, + { + "epoch": 0.9111720698254364, + "grad_norm": 0.632706313948071, + "learning_rate": 1.0286656760868514e-06, + "loss": 0.8718, + "step": 18269 + }, + { + "epoch": 0.9112219451371572, + "grad_norm": 0.5679986078796152, + "learning_rate": 1.027519465196211e-06, + "loss": 0.8173, + "step": 18270 + }, + { + "epoch": 0.9112718204488778, + "grad_norm": 0.6657766628570774, + "learning_rate": 1.0263738798567262e-06, + "loss": 0.8516, + "step": 18271 + }, + { + "epoch": 0.9113216957605985, + "grad_norm": 0.5541643744090463, + "learning_rate": 1.025228920098284e-06, + "loss": 0.8571, + "step": 18272 + }, + { + "epoch": 0.9113715710723193, + "grad_norm": 0.7085738741926758, + "learning_rate": 1.0240845859507658e-06, + "loss": 0.7913, + "step": 18273 + }, + { + "epoch": 0.9114214463840399, + "grad_norm": 0.5416849470306199, + "learning_rate": 1.0229408774440258e-06, + "loss": 0.8881, + "step": 18274 + }, + { + "epoch": 0.9114713216957606, + "grad_norm": 0.5986248027357218, + "learning_rate": 1.0217977946079206e-06, + "loss": 0.8834, + "step": 18275 + }, + { + "epoch": 0.9115211970074812, + "grad_norm": 0.5615140926622106, + "learning_rate": 1.020655337472262e-06, + "loss": 0.823, + "step": 18276 + }, + { + "epoch": 0.911571072319202, + "grad_norm": 0.6522187492904805, + "learning_rate": 1.0195135060668743e-06, + "loss": 0.8409, + "step": 18277 + }, + { + "epoch": 0.9116209476309227, + "grad_norm": 0.4700238449303022, + "learning_rate": 1.0183723004215467e-06, + "loss": 0.8252, + "step": 18278 + }, + { + "epoch": 0.9116708229426433, + "grad_norm": 0.6581994796722299, + "learning_rate": 1.0172317205660669e-06, + "loss": 0.8663, + "step": 18279 + }, + { + "epoch": 0.9117206982543641, + "grad_norm": 0.6799852999112466, + "learning_rate": 1.0160917665301862e-06, + "loss": 0.8061, + "step": 18280 + }, + { + "epoch": 0.9117705735660848, + "grad_norm": 0.7907349131154141, + "learning_rate": 1.014952438343661e-06, + "loss": 0.8545, + "step": 18281 + }, + { + "epoch": 0.9118204488778054, + "grad_norm": 0.5704286431080843, + "learning_rate": 1.0138137360362121e-06, + "loss": 0.8601, + "step": 18282 + }, + { + "epoch": 0.9118703241895262, + "grad_norm": 0.6245695350598796, + "learning_rate": 1.0126756596375686e-06, + "loss": 0.8757, + "step": 18283 + }, + { + "epoch": 0.9119201995012469, + "grad_norm": 0.582114270776132, + "learning_rate": 1.0115382091774095e-06, + "loss": 0.836, + "step": 18284 + }, + { + "epoch": 0.9119700748129675, + "grad_norm": 0.5812506493653529, + "learning_rate": 1.0104013846854305e-06, + "loss": 0.8708, + "step": 18285 + }, + { + "epoch": 0.9120199501246883, + "grad_norm": 0.591057485880992, + "learning_rate": 1.0092651861912884e-06, + "loss": 0.8657, + "step": 18286 + }, + { + "epoch": 0.912069825436409, + "grad_norm": 0.49041622902798926, + "learning_rate": 1.0081296137246372e-06, + "loss": 0.8469, + "step": 18287 + }, + { + "epoch": 0.9121197007481296, + "grad_norm": 0.5680148507109405, + "learning_rate": 1.0069946673151004e-06, + "loss": 0.8263, + "step": 18288 + }, + { + "epoch": 0.9121695760598504, + "grad_norm": 0.6762456226559739, + "learning_rate": 1.0058603469923045e-06, + "loss": 0.8324, + "step": 18289 + }, + { + "epoch": 0.9122194513715711, + "grad_norm": 0.5594785856463741, + "learning_rate": 1.0047266527858452e-06, + "loss": 0.8555, + "step": 18290 + }, + { + "epoch": 0.9122693266832917, + "grad_norm": 0.5872431528043636, + "learning_rate": 1.0035935847253014e-06, + "loss": 0.7955, + "step": 18291 + }, + { + "epoch": 0.9123192019950125, + "grad_norm": 0.674006161261779, + "learning_rate": 1.0024611428402442e-06, + "loss": 0.8791, + "step": 18292 + }, + { + "epoch": 0.9123690773067331, + "grad_norm": 0.5363519422187345, + "learning_rate": 1.0013293271602248e-06, + "loss": 0.8536, + "step": 18293 + }, + { + "epoch": 0.9124189526184538, + "grad_norm": 0.5848906366161807, + "learning_rate": 1.000198137714778e-06, + "loss": 0.8099, + "step": 18294 + }, + { + "epoch": 0.9124688279301746, + "grad_norm": 0.8928696438758297, + "learning_rate": 9.99067574533416e-07, + "loss": 0.8705, + "step": 18295 + }, + { + "epoch": 0.9125187032418952, + "grad_norm": 0.5758021376532162, + "learning_rate": 9.979376376456463e-07, + "loss": 0.8686, + "step": 18296 + }, + { + "epoch": 0.912568578553616, + "grad_norm": 0.5333420927463043, + "learning_rate": 9.96808327080953e-07, + "loss": 0.8153, + "step": 18297 + }, + { + "epoch": 0.9126184538653367, + "grad_norm": 0.6631369632382387, + "learning_rate": 9.956796428688048e-07, + "loss": 0.8022, + "step": 18298 + }, + { + "epoch": 0.9126683291770573, + "grad_norm": 0.7904074951489513, + "learning_rate": 9.9455158503865e-07, + "loss": 0.865, + "step": 18299 + }, + { + "epoch": 0.912718204488778, + "grad_norm": 0.5914142247181051, + "learning_rate": 9.93424153619929e-07, + "loss": 0.8445, + "step": 18300 + }, + { + "epoch": 0.9127680798004988, + "grad_norm": 0.6665998373495265, + "learning_rate": 9.922973486420628e-07, + "loss": 0.8426, + "step": 18301 + }, + { + "epoch": 0.9128179551122194, + "grad_norm": 0.6065518380162267, + "learning_rate": 9.911711701344529e-07, + "loss": 0.8662, + "step": 18302 + }, + { + "epoch": 0.9128678304239402, + "grad_norm": 0.6919592081156104, + "learning_rate": 9.90045618126484e-07, + "loss": 0.8101, + "step": 18303 + }, + { + "epoch": 0.9129177057356609, + "grad_norm": 0.559400912907786, + "learning_rate": 9.889206926475296e-07, + "loss": 0.8257, + "step": 18304 + }, + { + "epoch": 0.9129675810473815, + "grad_norm": 0.5220275447122376, + "learning_rate": 9.877963937269447e-07, + "loss": 0.814, + "step": 18305 + }, + { + "epoch": 0.9130174563591023, + "grad_norm": 0.728626608187692, + "learning_rate": 9.86672721394069e-07, + "loss": 0.7895, + "step": 18306 + }, + { + "epoch": 0.913067331670823, + "grad_norm": 0.5942675027597627, + "learning_rate": 9.855496756782157e-07, + "loss": 0.8351, + "step": 18307 + }, + { + "epoch": 0.9131172069825436, + "grad_norm": 0.691832749273144, + "learning_rate": 9.844272566087027e-07, + "loss": 0.8414, + "step": 18308 + }, + { + "epoch": 0.9131670822942644, + "grad_norm": 0.5871953620804442, + "learning_rate": 9.833054642148066e-07, + "loss": 0.7878, + "step": 18309 + }, + { + "epoch": 0.9132169576059851, + "grad_norm": 0.6370218043835852, + "learning_rate": 9.821842985258096e-07, + "loss": 0.8232, + "step": 18310 + }, + { + "epoch": 0.9132668329177057, + "grad_norm": 0.7210760909074753, + "learning_rate": 9.810637595709605e-07, + "loss": 0.8488, + "step": 18311 + }, + { + "epoch": 0.9133167082294265, + "grad_norm": 0.6440075574323803, + "learning_rate": 9.799438473795108e-07, + "loss": 0.8199, + "step": 18312 + }, + { + "epoch": 0.9133665835411471, + "grad_norm": 0.562997405861363, + "learning_rate": 9.78824561980668e-07, + "loss": 0.7913, + "step": 18313 + }, + { + "epoch": 0.9134164588528678, + "grad_norm": 0.6424964290320352, + "learning_rate": 9.777059034036528e-07, + "loss": 0.806, + "step": 18314 + }, + { + "epoch": 0.9134663341645886, + "grad_norm": 0.5932582532979496, + "learning_rate": 9.765878716776478e-07, + "loss": 0.8273, + "step": 18315 + }, + { + "epoch": 0.9135162094763092, + "grad_norm": 0.6362742093702419, + "learning_rate": 9.754704668318376e-07, + "loss": 0.8648, + "step": 18316 + }, + { + "epoch": 0.9135660847880299, + "grad_norm": 0.6065744604723843, + "learning_rate": 9.743536888953658e-07, + "loss": 0.8477, + "step": 18317 + }, + { + "epoch": 0.9136159600997507, + "grad_norm": 0.615035903033833, + "learning_rate": 9.73237537897384e-07, + "loss": 0.8296, + "step": 18318 + }, + { + "epoch": 0.9136658354114713, + "grad_norm": 0.60735734275979, + "learning_rate": 9.721220138670162e-07, + "loss": 0.8084, + "step": 18319 + }, + { + "epoch": 0.913715710723192, + "grad_norm": 0.559404750163751, + "learning_rate": 9.71007116833375e-07, + "loss": 0.8294, + "step": 18320 + }, + { + "epoch": 0.9137655860349128, + "grad_norm": 0.5979534877088125, + "learning_rate": 9.69892846825543e-07, + "loss": 0.8295, + "step": 18321 + }, + { + "epoch": 0.9138154613466334, + "grad_norm": 0.5510924313634542, + "learning_rate": 9.68779203872605e-07, + "loss": 0.8207, + "step": 18322 + }, + { + "epoch": 0.9138653366583541, + "grad_norm": 0.6365690777922954, + "learning_rate": 9.676661880036185e-07, + "loss": 0.8329, + "step": 18323 + }, + { + "epoch": 0.9139152119700749, + "grad_norm": 0.714123319973043, + "learning_rate": 9.665537992476299e-07, + "loss": 0.7979, + "step": 18324 + }, + { + "epoch": 0.9139650872817955, + "grad_norm": 0.6607413158417055, + "learning_rate": 9.654420376336599e-07, + "loss": 0.8238, + "step": 18325 + }, + { + "epoch": 0.9140149625935162, + "grad_norm": 0.5793892337792701, + "learning_rate": 9.643309031907243e-07, + "loss": 0.8295, + "step": 18326 + }, + { + "epoch": 0.914064837905237, + "grad_norm": 0.6895677758461569, + "learning_rate": 9.632203959478198e-07, + "loss": 0.8183, + "step": 18327 + }, + { + "epoch": 0.9141147132169576, + "grad_norm": 0.5286440449170453, + "learning_rate": 9.621105159339199e-07, + "loss": 0.8319, + "step": 18328 + }, + { + "epoch": 0.9141645885286783, + "grad_norm": 0.6297732123212704, + "learning_rate": 9.610012631779851e-07, + "loss": 0.7909, + "step": 18329 + }, + { + "epoch": 0.914214463840399, + "grad_norm": 0.9004583238555811, + "learning_rate": 9.598926377089674e-07, + "loss": 0.902, + "step": 18330 + }, + { + "epoch": 0.9142643391521197, + "grad_norm": 0.7557492116089544, + "learning_rate": 9.58784639555793e-07, + "loss": 0.8416, + "step": 18331 + }, + { + "epoch": 0.9143142144638404, + "grad_norm": 0.6445035080626996, + "learning_rate": 9.5767726874737e-07, + "loss": 0.8571, + "step": 18332 + }, + { + "epoch": 0.914364089775561, + "grad_norm": 0.5372091964948832, + "learning_rate": 9.565705253126028e-07, + "loss": 0.8253, + "step": 18333 + }, + { + "epoch": 0.9144139650872818, + "grad_norm": 0.48514792120008415, + "learning_rate": 9.554644092803655e-07, + "loss": 0.7985, + "step": 18334 + }, + { + "epoch": 0.9144638403990025, + "grad_norm": 0.68175996746806, + "learning_rate": 9.54358920679524e-07, + "loss": 0.8303, + "step": 18335 + }, + { + "epoch": 0.9145137157107232, + "grad_norm": 0.6079718873408444, + "learning_rate": 9.532540595389244e-07, + "loss": 0.8615, + "step": 18336 + }, + { + "epoch": 0.9145635910224439, + "grad_norm": 1.1431099048704563, + "learning_rate": 9.521498258873995e-07, + "loss": 0.8525, + "step": 18337 + }, + { + "epoch": 0.9146134663341646, + "grad_norm": 0.6080054344219884, + "learning_rate": 9.510462197537623e-07, + "loss": 0.826, + "step": 18338 + }, + { + "epoch": 0.9146633416458853, + "grad_norm": 0.6999799432386504, + "learning_rate": 9.499432411668091e-07, + "loss": 0.8087, + "step": 18339 + }, + { + "epoch": 0.914713216957606, + "grad_norm": 0.6142064747817109, + "learning_rate": 9.488408901553226e-07, + "loss": 0.824, + "step": 18340 + }, + { + "epoch": 0.9147630922693267, + "grad_norm": 0.5326863593117087, + "learning_rate": 9.477391667480739e-07, + "loss": 0.8387, + "step": 18341 + }, + { + "epoch": 0.9148129675810474, + "grad_norm": 1.2191166288787345, + "learning_rate": 9.466380709738043e-07, + "loss": 0.8421, + "step": 18342 + }, + { + "epoch": 0.9148628428927681, + "grad_norm": 0.5984483505832341, + "learning_rate": 9.455376028612517e-07, + "loss": 0.8682, + "step": 18343 + }, + { + "epoch": 0.9149127182044888, + "grad_norm": 0.6416315678786754, + "learning_rate": 9.444377624391265e-07, + "loss": 0.8447, + "step": 18344 + }, + { + "epoch": 0.9149625935162095, + "grad_norm": 0.6034743600960387, + "learning_rate": 9.433385497361363e-07, + "loss": 0.859, + "step": 18345 + }, + { + "epoch": 0.9150124688279302, + "grad_norm": 0.5453361654138911, + "learning_rate": 9.422399647809555e-07, + "loss": 0.7811, + "step": 18346 + }, + { + "epoch": 0.9150623441396508, + "grad_norm": 0.7934198896122903, + "learning_rate": 9.411420076022609e-07, + "loss": 0.8372, + "step": 18347 + }, + { + "epoch": 0.9151122194513716, + "grad_norm": 0.6031161942938983, + "learning_rate": 9.400446782286909e-07, + "loss": 0.8635, + "step": 18348 + }, + { + "epoch": 0.9151620947630923, + "grad_norm": 0.5241932622420183, + "learning_rate": 9.389479766888976e-07, + "loss": 0.8531, + "step": 18349 + }, + { + "epoch": 0.9152119700748129, + "grad_norm": 0.5954502939461124, + "learning_rate": 9.378519030114802e-07, + "loss": 0.8335, + "step": 18350 + }, + { + "epoch": 0.9152618453865337, + "grad_norm": 0.6012147380735458, + "learning_rate": 9.36756457225052e-07, + "loss": 0.8039, + "step": 18351 + }, + { + "epoch": 0.9153117206982544, + "grad_norm": 0.6515431442301507, + "learning_rate": 9.356616393581902e-07, + "loss": 0.8467, + "step": 18352 + }, + { + "epoch": 0.915361596009975, + "grad_norm": 0.6047544192717673, + "learning_rate": 9.345674494394774e-07, + "loss": 0.8527, + "step": 18353 + }, + { + "epoch": 0.9154114713216958, + "grad_norm": 0.8069718214301924, + "learning_rate": 9.334738874974491e-07, + "loss": 0.8704, + "step": 18354 + }, + { + "epoch": 0.9154613466334165, + "grad_norm": 0.6407132082471785, + "learning_rate": 9.323809535606521e-07, + "loss": 0.833, + "step": 18355 + }, + { + "epoch": 0.9155112219451371, + "grad_norm": 0.5586361066533713, + "learning_rate": 9.312886476576022e-07, + "loss": 0.8359, + "step": 18356 + }, + { + "epoch": 0.9155610972568579, + "grad_norm": 0.5513859527100325, + "learning_rate": 9.301969698168017e-07, + "loss": 0.8154, + "step": 18357 + }, + { + "epoch": 0.9156109725685786, + "grad_norm": 0.6348339912159431, + "learning_rate": 9.29105920066739e-07, + "loss": 0.8694, + "step": 18358 + }, + { + "epoch": 0.9156608478802992, + "grad_norm": 0.6678543575501941, + "learning_rate": 9.280154984358857e-07, + "loss": 0.8641, + "step": 18359 + }, + { + "epoch": 0.91571072319202, + "grad_norm": 1.0034822367276894, + "learning_rate": 9.26925704952697e-07, + "loss": 0.8048, + "step": 18360 + }, + { + "epoch": 0.9157605985037407, + "grad_norm": 0.5653026593747533, + "learning_rate": 9.258365396456082e-07, + "loss": 0.8415, + "step": 18361 + }, + { + "epoch": 0.9158104738154613, + "grad_norm": 0.7090406113893827, + "learning_rate": 9.247480025430355e-07, + "loss": 0.8265, + "step": 18362 + }, + { + "epoch": 0.9158603491271821, + "grad_norm": 0.5929564682565539, + "learning_rate": 9.236600936733925e-07, + "loss": 0.8433, + "step": 18363 + }, + { + "epoch": 0.9159102244389028, + "grad_norm": 0.6695538932682584, + "learning_rate": 9.225728130650646e-07, + "loss": 0.8547, + "step": 18364 + }, + { + "epoch": 0.9159600997506234, + "grad_norm": 0.6398753192264013, + "learning_rate": 9.214861607464237e-07, + "loss": 0.8373, + "step": 18365 + }, + { + "epoch": 0.9160099750623442, + "grad_norm": 0.5867684098844284, + "learning_rate": 9.20400136745822e-07, + "loss": 0.8488, + "step": 18366 + }, + { + "epoch": 0.9160598503740648, + "grad_norm": 0.608183121037254, + "learning_rate": 9.193147410916064e-07, + "loss": 0.881, + "step": 18367 + }, + { + "epoch": 0.9161097256857855, + "grad_norm": 0.6693414565818299, + "learning_rate": 9.182299738120931e-07, + "loss": 0.8107, + "step": 18368 + }, + { + "epoch": 0.9161596009975063, + "grad_norm": 0.5102447589837005, + "learning_rate": 9.171458349355899e-07, + "loss": 0.816, + "step": 18369 + }, + { + "epoch": 0.9162094763092269, + "grad_norm": 0.7063882647209672, + "learning_rate": 9.16062324490391e-07, + "loss": 0.8717, + "step": 18370 + }, + { + "epoch": 0.9162593516209476, + "grad_norm": 0.7370000062591076, + "learning_rate": 9.149794425047653e-07, + "loss": 0.8483, + "step": 18371 + }, + { + "epoch": 0.9163092269326684, + "grad_norm": 0.902446898549567, + "learning_rate": 9.138971890069736e-07, + "loss": 0.8185, + "step": 18372 + }, + { + "epoch": 0.916359102244389, + "grad_norm": 0.6150411591676714, + "learning_rate": 9.128155640252517e-07, + "loss": 0.8839, + "step": 18373 + }, + { + "epoch": 0.9164089775561097, + "grad_norm": 0.9266102413921932, + "learning_rate": 9.117345675878325e-07, + "loss": 0.8158, + "step": 18374 + }, + { + "epoch": 0.9164588528678305, + "grad_norm": 0.6144274330565396, + "learning_rate": 9.106541997229185e-07, + "loss": 0.7947, + "step": 18375 + }, + { + "epoch": 0.9165087281795511, + "grad_norm": 0.7794752098061911, + "learning_rate": 9.095744604587009e-07, + "loss": 0.8416, + "step": 18376 + }, + { + "epoch": 0.9165586034912718, + "grad_norm": 0.5962169249325514, + "learning_rate": 9.084953498233517e-07, + "loss": 0.8426, + "step": 18377 + }, + { + "epoch": 0.9166084788029926, + "grad_norm": 0.654664647147065, + "learning_rate": 9.074168678450401e-07, + "loss": 0.8649, + "step": 18378 + }, + { + "epoch": 0.9166583541147132, + "grad_norm": 0.7159543647120364, + "learning_rate": 9.063390145519019e-07, + "loss": 0.814, + "step": 18379 + }, + { + "epoch": 0.9167082294264339, + "grad_norm": 0.5654070557697931, + "learning_rate": 9.052617899720645e-07, + "loss": 0.7828, + "step": 18380 + }, + { + "epoch": 0.9167581047381547, + "grad_norm": 0.5423191251035493, + "learning_rate": 9.04185194133636e-07, + "loss": 0.8641, + "step": 18381 + }, + { + "epoch": 0.9168079800498753, + "grad_norm": 0.7224786820108867, + "learning_rate": 9.031092270647134e-07, + "loss": 0.8906, + "step": 18382 + }, + { + "epoch": 0.916857855361596, + "grad_norm": 0.7694670408773936, + "learning_rate": 9.020338887933688e-07, + "loss": 0.812, + "step": 18383 + }, + { + "epoch": 0.9169077306733167, + "grad_norm": 0.5942664198828482, + "learning_rate": 9.009591793476657e-07, + "loss": 0.8766, + "step": 18384 + }, + { + "epoch": 0.9169576059850374, + "grad_norm": 0.6359277508841326, + "learning_rate": 8.998850987556456e-07, + "loss": 0.8167, + "step": 18385 + }, + { + "epoch": 0.9170074812967581, + "grad_norm": 0.6803348260138778, + "learning_rate": 8.988116470453445e-07, + "loss": 0.8388, + "step": 18386 + }, + { + "epoch": 0.9170573566084788, + "grad_norm": 0.5760867833540506, + "learning_rate": 8.977388242447593e-07, + "loss": 0.866, + "step": 18387 + }, + { + "epoch": 0.9171072319201995, + "grad_norm": 1.0121340048772651, + "learning_rate": 8.966666303818955e-07, + "loss": 0.8788, + "step": 18388 + }, + { + "epoch": 0.9171571072319202, + "grad_norm": 0.4455851314720966, + "learning_rate": 8.955950654847278e-07, + "loss": 0.8002, + "step": 18389 + }, + { + "epoch": 0.9172069825436409, + "grad_norm": 0.6150084859928971, + "learning_rate": 8.945241295812257e-07, + "loss": 0.7891, + "step": 18390 + }, + { + "epoch": 0.9172568578553616, + "grad_norm": 0.6300050796379092, + "learning_rate": 8.934538226993194e-07, + "loss": 0.8503, + "step": 18391 + }, + { + "epoch": 0.9173067331670823, + "grad_norm": 0.549316910492368, + "learning_rate": 8.923841448669506e-07, + "loss": 0.8203, + "step": 18392 + }, + { + "epoch": 0.917356608478803, + "grad_norm": 0.5598349166633791, + "learning_rate": 8.913150961120303e-07, + "loss": 0.8258, + "step": 18393 + }, + { + "epoch": 0.9174064837905237, + "grad_norm": 0.575112216705834, + "learning_rate": 8.902466764624529e-07, + "loss": 0.8403, + "step": 18394 + }, + { + "epoch": 0.9174563591022444, + "grad_norm": 0.5919286972028188, + "learning_rate": 8.891788859460931e-07, + "loss": 0.8751, + "step": 18395 + }, + { + "epoch": 0.9175062344139651, + "grad_norm": 0.7717183457422153, + "learning_rate": 8.881117245908233e-07, + "loss": 0.8529, + "step": 18396 + }, + { + "epoch": 0.9175561097256858, + "grad_norm": 1.4639688017642092, + "learning_rate": 8.870451924244877e-07, + "loss": 0.8107, + "step": 18397 + }, + { + "epoch": 0.9176059850374065, + "grad_norm": 0.6456170768017407, + "learning_rate": 8.859792894749169e-07, + "loss": 0.8105, + "step": 18398 + }, + { + "epoch": 0.9176558603491272, + "grad_norm": 0.7244827350525173, + "learning_rate": 8.849140157699193e-07, + "loss": 0.8318, + "step": 18399 + }, + { + "epoch": 0.9177057356608479, + "grad_norm": 0.5883463820424584, + "learning_rate": 8.838493713373031e-07, + "loss": 0.7958, + "step": 18400 + }, + { + "epoch": 0.9177556109725685, + "grad_norm": 0.5922228020828301, + "learning_rate": 8.827853562048432e-07, + "loss": 0.8269, + "step": 18401 + }, + { + "epoch": 0.9178054862842893, + "grad_norm": 0.7580721228741751, + "learning_rate": 8.817219704003065e-07, + "loss": 0.8541, + "step": 18402 + }, + { + "epoch": 0.91785536159601, + "grad_norm": 0.7591926958214731, + "learning_rate": 8.806592139514374e-07, + "loss": 0.8377, + "step": 18403 + }, + { + "epoch": 0.9179052369077306, + "grad_norm": 0.7309204225786934, + "learning_rate": 8.795970868859776e-07, + "loss": 0.8517, + "step": 18404 + }, + { + "epoch": 0.9179551122194514, + "grad_norm": 0.6240288911414301, + "learning_rate": 8.785355892316327e-07, + "loss": 0.8214, + "step": 18405 + }, + { + "epoch": 0.9180049875311721, + "grad_norm": 0.7717182124970268, + "learning_rate": 8.774747210161083e-07, + "loss": 0.8135, + "step": 18406 + }, + { + "epoch": 0.9180548628428927, + "grad_norm": 0.6577535992679514, + "learning_rate": 8.764144822670822e-07, + "loss": 0.8294, + "step": 18407 + }, + { + "epoch": 0.9181047381546135, + "grad_norm": 0.7054824479616049, + "learning_rate": 8.753548730122268e-07, + "loss": 0.8404, + "step": 18408 + }, + { + "epoch": 0.9181546134663342, + "grad_norm": 0.6268801526148959, + "learning_rate": 8.742958932791894e-07, + "loss": 0.8107, + "step": 18409 + }, + { + "epoch": 0.9182044887780548, + "grad_norm": 0.571710972366578, + "learning_rate": 8.732375430956008e-07, + "loss": 0.8326, + "step": 18410 + }, + { + "epoch": 0.9182543640897756, + "grad_norm": 0.6438006317262763, + "learning_rate": 8.721798224890831e-07, + "loss": 0.9171, + "step": 18411 + }, + { + "epoch": 0.9183042394014963, + "grad_norm": 0.5779027050522559, + "learning_rate": 8.711227314872366e-07, + "loss": 0.8356, + "step": 18412 + }, + { + "epoch": 0.9183541147132169, + "grad_norm": 0.5375722514960194, + "learning_rate": 8.700662701176421e-07, + "loss": 0.834, + "step": 18413 + }, + { + "epoch": 0.9184039900249377, + "grad_norm": 0.579925407910025, + "learning_rate": 8.690104384078662e-07, + "loss": 0.8905, + "step": 18414 + }, + { + "epoch": 0.9184538653366584, + "grad_norm": 0.5673507970873216, + "learning_rate": 8.679552363854676e-07, + "loss": 0.8412, + "step": 18415 + }, + { + "epoch": 0.918503740648379, + "grad_norm": 0.6459110625270291, + "learning_rate": 8.669006640779742e-07, + "loss": 0.8136, + "step": 18416 + }, + { + "epoch": 0.9185536159600998, + "grad_norm": 0.6272771730343989, + "learning_rate": 8.658467215129084e-07, + "loss": 0.8393, + "step": 18417 + }, + { + "epoch": 0.9186034912718204, + "grad_norm": 0.5650248300804236, + "learning_rate": 8.647934087177679e-07, + "loss": 0.8112, + "step": 18418 + }, + { + "epoch": 0.9186533665835411, + "grad_norm": 0.5672623000589587, + "learning_rate": 8.637407257200497e-07, + "loss": 0.8663, + "step": 18419 + }, + { + "epoch": 0.9187032418952619, + "grad_norm": 0.7471525769141293, + "learning_rate": 8.626886725472072e-07, + "loss": 0.8545, + "step": 18420 + }, + { + "epoch": 0.9187531172069825, + "grad_norm": 0.6372176276093767, + "learning_rate": 8.616372492267072e-07, + "loss": 0.8881, + "step": 18421 + }, + { + "epoch": 0.9188029925187032, + "grad_norm": 0.5858766988419453, + "learning_rate": 8.60586455785975e-07, + "loss": 0.8561, + "step": 18422 + }, + { + "epoch": 0.918852867830424, + "grad_norm": 0.5930884203582395, + "learning_rate": 8.595362922524414e-07, + "loss": 0.8302, + "step": 18423 + }, + { + "epoch": 0.9189027431421446, + "grad_norm": 0.6738854878725518, + "learning_rate": 8.584867586535011e-07, + "loss": 0.8191, + "step": 18424 + }, + { + "epoch": 0.9189526184538653, + "grad_norm": 0.7726735525245895, + "learning_rate": 8.574378550165435e-07, + "loss": 0.8631, + "step": 18425 + }, + { + "epoch": 0.9190024937655861, + "grad_norm": 0.5945765372273836, + "learning_rate": 8.563895813689409e-07, + "loss": 0.8302, + "step": 18426 + }, + { + "epoch": 0.9190523690773067, + "grad_norm": 0.6583529214731548, + "learning_rate": 8.553419377380523e-07, + "loss": 0.825, + "step": 18427 + }, + { + "epoch": 0.9191022443890274, + "grad_norm": 1.5505266520821215, + "learning_rate": 8.542949241512027e-07, + "loss": 0.8566, + "step": 18428 + }, + { + "epoch": 0.9191521197007482, + "grad_norm": 1.8158577340707396, + "learning_rate": 8.532485406357233e-07, + "loss": 0.8713, + "step": 18429 + }, + { + "epoch": 0.9192019950124688, + "grad_norm": 0.6170150748903356, + "learning_rate": 8.522027872189198e-07, + "loss": 0.7853, + "step": 18430 + }, + { + "epoch": 0.9192518703241895, + "grad_norm": 0.5474249904324627, + "learning_rate": 8.511576639280733e-07, + "loss": 0.8546, + "step": 18431 + }, + { + "epoch": 0.9193017456359103, + "grad_norm": 0.6513625928840714, + "learning_rate": 8.501131707904591e-07, + "loss": 0.8581, + "step": 18432 + }, + { + "epoch": 0.9193516209476309, + "grad_norm": 0.6360306111044307, + "learning_rate": 8.49069307833339e-07, + "loss": 0.8553, + "step": 18433 + }, + { + "epoch": 0.9194014962593516, + "grad_norm": 0.6947970066316511, + "learning_rate": 8.480260750839436e-07, + "loss": 0.824, + "step": 18434 + }, + { + "epoch": 0.9194513715710724, + "grad_norm": 1.0499773873357299, + "learning_rate": 8.469834725694987e-07, + "loss": 0.8357, + "step": 18435 + }, + { + "epoch": 0.919501246882793, + "grad_norm": 0.7339746059649598, + "learning_rate": 8.459415003172099e-07, + "loss": 0.8399, + "step": 18436 + }, + { + "epoch": 0.9195511221945137, + "grad_norm": 0.6285045279874937, + "learning_rate": 8.449001583542671e-07, + "loss": 0.8586, + "step": 18437 + }, + { + "epoch": 0.9196009975062344, + "grad_norm": 0.5183153688148359, + "learning_rate": 8.438594467078481e-07, + "loss": 0.8244, + "step": 18438 + }, + { + "epoch": 0.9196508728179551, + "grad_norm": 1.0575006659920185, + "learning_rate": 8.428193654051036e-07, + "loss": 0.8126, + "step": 18439 + }, + { + "epoch": 0.9197007481296758, + "grad_norm": 0.6779112030689334, + "learning_rate": 8.417799144731731e-07, + "loss": 0.7741, + "step": 18440 + }, + { + "epoch": 0.9197506234413965, + "grad_norm": 1.0464101113063538, + "learning_rate": 8.407410939391902e-07, + "loss": 0.8364, + "step": 18441 + }, + { + "epoch": 0.9198004987531172, + "grad_norm": 0.5675994203970153, + "learning_rate": 8.397029038302528e-07, + "loss": 0.8159, + "step": 18442 + }, + { + "epoch": 0.9198503740648379, + "grad_norm": 0.6151791422718548, + "learning_rate": 8.38665344173456e-07, + "loss": 0.8722, + "step": 18443 + }, + { + "epoch": 0.9199002493765586, + "grad_norm": 0.6928010628518514, + "learning_rate": 8.376284149958724e-07, + "loss": 0.8638, + "step": 18444 + }, + { + "epoch": 0.9199501246882793, + "grad_norm": 0.7265898084571523, + "learning_rate": 8.365921163245638e-07, + "loss": 0.8349, + "step": 18445 + }, + { + "epoch": 0.92, + "grad_norm": 0.5961408580858496, + "learning_rate": 8.355564481865697e-07, + "loss": 0.8373, + "step": 18446 + }, + { + "epoch": 0.9200498753117207, + "grad_norm": 0.7351206791732291, + "learning_rate": 8.345214106089128e-07, + "loss": 0.864, + "step": 18447 + }, + { + "epoch": 0.9200997506234414, + "grad_norm": 0.565322855637023, + "learning_rate": 8.334870036186049e-07, + "loss": 0.8179, + "step": 18448 + }, + { + "epoch": 0.9201496259351621, + "grad_norm": 0.6050022642140784, + "learning_rate": 8.324532272426411e-07, + "loss": 0.8324, + "step": 18449 + }, + { + "epoch": 0.9201995012468828, + "grad_norm": 0.813447879161135, + "learning_rate": 8.314200815079942e-07, + "loss": 0.7895, + "step": 18450 + }, + { + "epoch": 0.9202493765586035, + "grad_norm": 0.6048948341272317, + "learning_rate": 8.303875664416206e-07, + "loss": 0.8228, + "step": 18451 + }, + { + "epoch": 0.9202992518703242, + "grad_norm": 1.7342836937381276, + "learning_rate": 8.293556820704706e-07, + "loss": 0.8172, + "step": 18452 + }, + { + "epoch": 0.9203491271820449, + "grad_norm": 0.8861460029684144, + "learning_rate": 8.283244284214647e-07, + "loss": 0.8325, + "step": 18453 + }, + { + "epoch": 0.9203990024937656, + "grad_norm": 0.504819909286498, + "learning_rate": 8.272938055215146e-07, + "loss": 0.8396, + "step": 18454 + }, + { + "epoch": 0.9204488778054862, + "grad_norm": 0.6388364775885385, + "learning_rate": 8.262638133975126e-07, + "loss": 0.8705, + "step": 18455 + }, + { + "epoch": 0.920498753117207, + "grad_norm": 0.6533626622972767, + "learning_rate": 8.252344520763428e-07, + "loss": 0.8841, + "step": 18456 + }, + { + "epoch": 0.9205486284289277, + "grad_norm": 0.5861700657601255, + "learning_rate": 8.242057215848559e-07, + "loss": 0.8355, + "step": 18457 + }, + { + "epoch": 0.9205985037406483, + "grad_norm": 0.7298140532604315, + "learning_rate": 8.231776219499027e-07, + "loss": 0.8071, + "step": 18458 + }, + { + "epoch": 0.9206483790523691, + "grad_norm": 0.7224516822447182, + "learning_rate": 8.221501531983089e-07, + "loss": 0.8326, + "step": 18459 + }, + { + "epoch": 0.9206982543640898, + "grad_norm": 0.9120771411777023, + "learning_rate": 8.211233153568892e-07, + "loss": 0.8577, + "step": 18460 + }, + { + "epoch": 0.9207481296758104, + "grad_norm": 0.5451680159103476, + "learning_rate": 8.200971084524306e-07, + "loss": 0.7829, + "step": 18461 + }, + { + "epoch": 0.9207980049875312, + "grad_norm": 0.5964497915627834, + "learning_rate": 8.190715325117199e-07, + "loss": 0.8645, + "step": 18462 + }, + { + "epoch": 0.9208478802992519, + "grad_norm": 0.7883633013343595, + "learning_rate": 8.180465875615162e-07, + "loss": 0.7861, + "step": 18463 + }, + { + "epoch": 0.9208977556109725, + "grad_norm": 0.667576637384817, + "learning_rate": 8.170222736285621e-07, + "loss": 0.8123, + "step": 18464 + }, + { + "epoch": 0.9209476309226933, + "grad_norm": 0.6264292577365643, + "learning_rate": 8.159985907395889e-07, + "loss": 0.8143, + "step": 18465 + }, + { + "epoch": 0.920997506234414, + "grad_norm": 0.7862297477096877, + "learning_rate": 8.149755389213115e-07, + "loss": 0.8546, + "step": 18466 + }, + { + "epoch": 0.9210473815461346, + "grad_norm": 0.6932900079709458, + "learning_rate": 8.139531182004223e-07, + "loss": 0.834, + "step": 18467 + }, + { + "epoch": 0.9210972568578554, + "grad_norm": 0.5910535234418818, + "learning_rate": 8.129313286036028e-07, + "loss": 0.8483, + "step": 18468 + }, + { + "epoch": 0.9211471321695761, + "grad_norm": 0.6690128628226957, + "learning_rate": 8.119101701575122e-07, + "loss": 0.8923, + "step": 18469 + }, + { + "epoch": 0.9211970074812967, + "grad_norm": 7.762081764955139, + "learning_rate": 8.108896428888041e-07, + "loss": 0.8299, + "step": 18470 + }, + { + "epoch": 0.9212468827930175, + "grad_norm": 0.788568439439528, + "learning_rate": 8.098697468241045e-07, + "loss": 0.8459, + "step": 18471 + }, + { + "epoch": 0.9212967581047381, + "grad_norm": 0.6233261988061538, + "learning_rate": 8.088504819900284e-07, + "loss": 0.862, + "step": 18472 + }, + { + "epoch": 0.9213466334164588, + "grad_norm": 0.7298327985243319, + "learning_rate": 8.078318484131681e-07, + "loss": 0.8357, + "step": 18473 + }, + { + "epoch": 0.9213965087281796, + "grad_norm": 0.5658177038844705, + "learning_rate": 8.068138461201136e-07, + "loss": 0.8985, + "step": 18474 + }, + { + "epoch": 0.9214463840399002, + "grad_norm": 0.6655525276512457, + "learning_rate": 8.057964751374214e-07, + "loss": 0.8876, + "step": 18475 + }, + { + "epoch": 0.9214962593516209, + "grad_norm": 0.6317981526456073, + "learning_rate": 8.047797354916453e-07, + "loss": 0.8437, + "step": 18476 + }, + { + "epoch": 0.9215461346633417, + "grad_norm": 0.6541299646348449, + "learning_rate": 8.037636272093085e-07, + "loss": 0.8623, + "step": 18477 + }, + { + "epoch": 0.9215960099750623, + "grad_norm": 0.6445713803264141, + "learning_rate": 8.027481503169371e-07, + "loss": 0.8239, + "step": 18478 + }, + { + "epoch": 0.921645885286783, + "grad_norm": 0.589275183330085, + "learning_rate": 8.017333048410208e-07, + "loss": 0.8192, + "step": 18479 + }, + { + "epoch": 0.9216957605985038, + "grad_norm": 0.61469246501587, + "learning_rate": 8.007190908080441e-07, + "loss": 0.8661, + "step": 18480 + }, + { + "epoch": 0.9217456359102244, + "grad_norm": 0.6296467017460828, + "learning_rate": 7.997055082444721e-07, + "loss": 0.7901, + "step": 18481 + }, + { + "epoch": 0.9217955112219451, + "grad_norm": 0.5902839543503013, + "learning_rate": 7.986925571767529e-07, + "loss": 0.8021, + "step": 18482 + }, + { + "epoch": 0.9218453865336659, + "grad_norm": 0.672627740024143, + "learning_rate": 7.976802376313236e-07, + "loss": 0.8569, + "step": 18483 + }, + { + "epoch": 0.9218952618453865, + "grad_norm": 0.5784380631698225, + "learning_rate": 7.966685496345938e-07, + "loss": 0.8306, + "step": 18484 + }, + { + "epoch": 0.9219451371571072, + "grad_norm": 0.6851871121772137, + "learning_rate": 7.956574932129673e-07, + "loss": 0.8458, + "step": 18485 + }, + { + "epoch": 0.921995012468828, + "grad_norm": 0.6010765011337487, + "learning_rate": 7.946470683928286e-07, + "loss": 0.8562, + "step": 18486 + }, + { + "epoch": 0.9220448877805486, + "grad_norm": 1.520288720474827, + "learning_rate": 7.936372752005399e-07, + "loss": 0.8354, + "step": 18487 + }, + { + "epoch": 0.9220947630922693, + "grad_norm": 0.7789224596193828, + "learning_rate": 7.926281136624525e-07, + "loss": 0.8611, + "step": 18488 + }, + { + "epoch": 0.9221446384039901, + "grad_norm": 0.8214437668225927, + "learning_rate": 7.916195838049034e-07, + "loss": 0.8056, + "step": 18489 + }, + { + "epoch": 0.9221945137157107, + "grad_norm": 1.1638014858439027, + "learning_rate": 7.906116856542023e-07, + "loss": 0.8338, + "step": 18490 + }, + { + "epoch": 0.9222443890274314, + "grad_norm": 0.810275426826052, + "learning_rate": 7.896044192366586e-07, + "loss": 0.8159, + "step": 18491 + }, + { + "epoch": 0.9222942643391521, + "grad_norm": 0.6162525739886406, + "learning_rate": 7.885977845785486e-07, + "loss": 0.8395, + "step": 18492 + }, + { + "epoch": 0.9223441396508728, + "grad_norm": 0.7870739735702532, + "learning_rate": 7.875917817061484e-07, + "loss": 0.862, + "step": 18493 + }, + { + "epoch": 0.9223940149625935, + "grad_norm": 1.9925915592903924, + "learning_rate": 7.86586410645701e-07, + "loss": 0.7967, + "step": 18494 + }, + { + "epoch": 0.9224438902743142, + "grad_norm": 0.7821807180620292, + "learning_rate": 7.855816714234465e-07, + "loss": 0.8676, + "step": 18495 + }, + { + "epoch": 0.9224937655860349, + "grad_norm": 0.6496466779717149, + "learning_rate": 7.845775640656001e-07, + "loss": 0.8686, + "step": 18496 + }, + { + "epoch": 0.9225436408977556, + "grad_norm": 0.5648665165104247, + "learning_rate": 7.835740885983683e-07, + "loss": 0.8702, + "step": 18497 + }, + { + "epoch": 0.9225935162094763, + "grad_norm": 0.6234975463578192, + "learning_rate": 7.825712450479278e-07, + "loss": 0.8035, + "step": 18498 + }, + { + "epoch": 0.922643391521197, + "grad_norm": 0.546148698784178, + "learning_rate": 7.815690334404547e-07, + "loss": 0.8154, + "step": 18499 + }, + { + "epoch": 0.9226932668329177, + "grad_norm": 2.515247648893928, + "learning_rate": 7.805674538021002e-07, + "loss": 0.841, + "step": 18500 + }, + { + "epoch": 0.9227431421446384, + "grad_norm": 0.7998207001197812, + "learning_rate": 7.795665061589991e-07, + "loss": 0.8596, + "step": 18501 + }, + { + "epoch": 0.9227930174563591, + "grad_norm": 0.48766945395414757, + "learning_rate": 7.785661905372666e-07, + "loss": 0.8042, + "step": 18502 + }, + { + "epoch": 0.9228428927680798, + "grad_norm": 0.5654683245536748, + "learning_rate": 7.775665069630122e-07, + "loss": 0.8295, + "step": 18503 + }, + { + "epoch": 0.9228927680798005, + "grad_norm": 0.6411267838001927, + "learning_rate": 7.765674554623181e-07, + "loss": 0.8694, + "step": 18504 + }, + { + "epoch": 0.9229426433915212, + "grad_norm": 0.7286435651850749, + "learning_rate": 7.755690360612549e-07, + "loss": 0.8407, + "step": 18505 + }, + { + "epoch": 0.922992518703242, + "grad_norm": 0.5014549535081372, + "learning_rate": 7.745712487858769e-07, + "loss": 0.8255, + "step": 18506 + }, + { + "epoch": 0.9230423940149626, + "grad_norm": 0.6164943254992036, + "learning_rate": 7.735740936622188e-07, + "loss": 0.8871, + "step": 18507 + }, + { + "epoch": 0.9230922693266833, + "grad_norm": 1.3868535492494631, + "learning_rate": 7.725775707163068e-07, + "loss": 0.8425, + "step": 18508 + }, + { + "epoch": 0.9231421446384039, + "grad_norm": 0.6620920062267764, + "learning_rate": 7.715816799741371e-07, + "loss": 0.853, + "step": 18509 + }, + { + "epoch": 0.9231920199501247, + "grad_norm": 0.6771408044009273, + "learning_rate": 7.705864214616998e-07, + "loss": 0.853, + "step": 18510 + }, + { + "epoch": 0.9232418952618454, + "grad_norm": 0.6882781881236677, + "learning_rate": 7.695917952049659e-07, + "loss": 0.8319, + "step": 18511 + }, + { + "epoch": 0.923291770573566, + "grad_norm": 0.6236122732420376, + "learning_rate": 7.685978012298923e-07, + "loss": 0.8604, + "step": 18512 + }, + { + "epoch": 0.9233416458852868, + "grad_norm": 0.5947546589369435, + "learning_rate": 7.676044395624138e-07, + "loss": 0.8098, + "step": 18513 + }, + { + "epoch": 0.9233915211970075, + "grad_norm": 0.702752701129737, + "learning_rate": 7.666117102284487e-07, + "loss": 0.8667, + "step": 18514 + }, + { + "epoch": 0.9234413965087281, + "grad_norm": 0.5523944623448129, + "learning_rate": 7.656196132539095e-07, + "loss": 0.8117, + "step": 18515 + }, + { + "epoch": 0.9234912718204489, + "grad_norm": 0.5608184430165077, + "learning_rate": 7.646281486646811e-07, + "loss": 0.849, + "step": 18516 + }, + { + "epoch": 0.9235411471321696, + "grad_norm": 0.724322714792611, + "learning_rate": 7.636373164866345e-07, + "loss": 0.7894, + "step": 18517 + }, + { + "epoch": 0.9235910224438902, + "grad_norm": 0.5548553619247057, + "learning_rate": 7.626471167456212e-07, + "loss": 0.8203, + "step": 18518 + }, + { + "epoch": 0.923640897755611, + "grad_norm": 0.5979770590756803, + "learning_rate": 7.616575494674899e-07, + "loss": 0.8652, + "step": 18519 + }, + { + "epoch": 0.9236907730673317, + "grad_norm": 0.5892385916226149, + "learning_rate": 7.606686146780534e-07, + "loss": 0.8366, + "step": 18520 + }, + { + "epoch": 0.9237406483790523, + "grad_norm": 0.9875124820168231, + "learning_rate": 7.596803124031243e-07, + "loss": 0.8627, + "step": 18521 + }, + { + "epoch": 0.9237905236907731, + "grad_norm": 0.5053586523949425, + "learning_rate": 7.586926426684876e-07, + "loss": 0.8002, + "step": 18522 + }, + { + "epoch": 0.9238403990024938, + "grad_norm": 0.7299791912561164, + "learning_rate": 7.577056054999144e-07, + "loss": 0.879, + "step": 18523 + }, + { + "epoch": 0.9238902743142144, + "grad_norm": 0.599429999884106, + "learning_rate": 7.567192009231672e-07, + "loss": 0.8742, + "step": 18524 + }, + { + "epoch": 0.9239401496259352, + "grad_norm": 0.5964357251159432, + "learning_rate": 7.557334289639811e-07, + "loss": 0.8222, + "step": 18525 + }, + { + "epoch": 0.9239900249376558, + "grad_norm": 0.6395827367393006, + "learning_rate": 7.547482896480857e-07, + "loss": 0.8301, + "step": 18526 + }, + { + "epoch": 0.9240399002493765, + "grad_norm": 1.3042644441447584, + "learning_rate": 7.537637830011767e-07, + "loss": 0.8357, + "step": 18527 + }, + { + "epoch": 0.9240897755610973, + "grad_norm": 0.5996959406957421, + "learning_rate": 7.527799090489535e-07, + "loss": 0.8088, + "step": 18528 + }, + { + "epoch": 0.9241396508728179, + "grad_norm": 0.5536676057478227, + "learning_rate": 7.51796667817084e-07, + "loss": 0.8765, + "step": 18529 + }, + { + "epoch": 0.9241895261845386, + "grad_norm": 0.570403835045107, + "learning_rate": 7.508140593312341e-07, + "loss": 0.82, + "step": 18530 + }, + { + "epoch": 0.9242394014962594, + "grad_norm": 0.5589442375737363, + "learning_rate": 7.49832083617033e-07, + "loss": 0.8208, + "step": 18531 + }, + { + "epoch": 0.92428927680798, + "grad_norm": 1.709448079949297, + "learning_rate": 7.488507407001133e-07, + "loss": 0.8251, + "step": 18532 + }, + { + "epoch": 0.9243391521197007, + "grad_norm": 0.7434561279189025, + "learning_rate": 7.478700306060765e-07, + "loss": 0.8612, + "step": 18533 + }, + { + "epoch": 0.9243890274314215, + "grad_norm": 0.594626499582898, + "learning_rate": 7.468899533605245e-07, + "loss": 0.8241, + "step": 18534 + }, + { + "epoch": 0.9244389027431421, + "grad_norm": 0.7425830393968679, + "learning_rate": 7.4591050898902e-07, + "loss": 0.8367, + "step": 18535 + }, + { + "epoch": 0.9244887780548628, + "grad_norm": 0.6649208701641951, + "learning_rate": 7.449316975171261e-07, + "loss": 0.8257, + "step": 18536 + }, + { + "epoch": 0.9245386533665836, + "grad_norm": 0.5952287412944354, + "learning_rate": 7.439535189703861e-07, + "loss": 0.7915, + "step": 18537 + }, + { + "epoch": 0.9245885286783042, + "grad_norm": 0.6265752085980377, + "learning_rate": 7.429759733743241e-07, + "loss": 0.9118, + "step": 18538 + }, + { + "epoch": 0.924638403990025, + "grad_norm": 0.6413907635542969, + "learning_rate": 7.419990607544447e-07, + "loss": 0.847, + "step": 18539 + }, + { + "epoch": 0.9246882793017457, + "grad_norm": 0.6379510072711629, + "learning_rate": 7.410227811362469e-07, + "loss": 0.8507, + "step": 18540 + }, + { + "epoch": 0.9247381546134663, + "grad_norm": 0.7185966263466934, + "learning_rate": 7.400471345452021e-07, + "loss": 0.8472, + "step": 18541 + }, + { + "epoch": 0.924788029925187, + "grad_norm": 0.636139199052876, + "learning_rate": 7.390721210067703e-07, + "loss": 0.8565, + "step": 18542 + }, + { + "epoch": 0.9248379052369077, + "grad_norm": 0.6908199461109691, + "learning_rate": 7.380977405463895e-07, + "loss": 0.8381, + "step": 18543 + }, + { + "epoch": 0.9248877805486284, + "grad_norm": 0.5454841707868024, + "learning_rate": 7.371239931894953e-07, + "loss": 0.8377, + "step": 18544 + }, + { + "epoch": 0.9249376558603492, + "grad_norm": 0.4890396106684067, + "learning_rate": 7.361508789614891e-07, + "loss": 0.7804, + "step": 18545 + }, + { + "epoch": 0.9249875311720698, + "grad_norm": 0.5611229794835478, + "learning_rate": 7.351783978877675e-07, + "loss": 0.8323, + "step": 18546 + }, + { + "epoch": 0.9250374064837905, + "grad_norm": 0.6148422393066005, + "learning_rate": 7.342065499937046e-07, + "loss": 0.8221, + "step": 18547 + }, + { + "epoch": 0.9250872817955113, + "grad_norm": 0.5736656806526876, + "learning_rate": 7.332353353046662e-07, + "loss": 0.8228, + "step": 18548 + }, + { + "epoch": 0.9251371571072319, + "grad_norm": 0.6026097772584494, + "learning_rate": 7.322647538459849e-07, + "loss": 0.9001, + "step": 18549 + }, + { + "epoch": 0.9251870324189526, + "grad_norm": 0.6314397187255113, + "learning_rate": 7.312948056429986e-07, + "loss": 0.8578, + "step": 18550 + }, + { + "epoch": 0.9252369077306734, + "grad_norm": 0.5891401932736618, + "learning_rate": 7.303254907210094e-07, + "loss": 0.8368, + "step": 18551 + }, + { + "epoch": 0.925286783042394, + "grad_norm": 0.5936826934267844, + "learning_rate": 7.293568091053193e-07, + "loss": 0.851, + "step": 18552 + }, + { + "epoch": 0.9253366583541147, + "grad_norm": 0.6337076204154171, + "learning_rate": 7.28388760821197e-07, + "loss": 0.8633, + "step": 18553 + }, + { + "epoch": 0.9253865336658355, + "grad_norm": 0.5757396611892247, + "learning_rate": 7.274213458939111e-07, + "loss": 0.827, + "step": 18554 + }, + { + "epoch": 0.9254364089775561, + "grad_norm": 0.5770632100486353, + "learning_rate": 7.264545643486997e-07, + "loss": 0.8506, + "step": 18555 + }, + { + "epoch": 0.9254862842892768, + "grad_norm": 0.5573515433442426, + "learning_rate": 7.254884162107928e-07, + "loss": 0.8404, + "step": 18556 + }, + { + "epoch": 0.9255361596009976, + "grad_norm": 0.5754457709969495, + "learning_rate": 7.245229015054034e-07, + "loss": 0.8147, + "step": 18557 + }, + { + "epoch": 0.9255860349127182, + "grad_norm": 0.6195827929819738, + "learning_rate": 7.235580202577253e-07, + "loss": 0.8072, + "step": 18558 + }, + { + "epoch": 0.9256359102244389, + "grad_norm": 0.7213015254154244, + "learning_rate": 7.225937724929355e-07, + "loss": 0.8784, + "step": 18559 + }, + { + "epoch": 0.9256857855361597, + "grad_norm": 0.6633969933891966, + "learning_rate": 7.216301582361945e-07, + "loss": 0.8872, + "step": 18560 + }, + { + "epoch": 0.9257356608478803, + "grad_norm": 0.5445408362870187, + "learning_rate": 7.206671775126489e-07, + "loss": 0.8658, + "step": 18561 + }, + { + "epoch": 0.925785536159601, + "grad_norm": 0.6429182369599277, + "learning_rate": 7.197048303474257e-07, + "loss": 0.8649, + "step": 18562 + }, + { + "epoch": 0.9258354114713216, + "grad_norm": 0.6406845745808369, + "learning_rate": 7.187431167656439e-07, + "loss": 0.838, + "step": 18563 + }, + { + "epoch": 0.9258852867830424, + "grad_norm": 0.603422139381896, + "learning_rate": 7.177820367923887e-07, + "loss": 0.8602, + "step": 18564 + }, + { + "epoch": 0.9259351620947631, + "grad_norm": 0.6078230064182396, + "learning_rate": 7.168215904527487e-07, + "loss": 0.8509, + "step": 18565 + }, + { + "epoch": 0.9259850374064837, + "grad_norm": 0.6165601556543904, + "learning_rate": 7.158617777717758e-07, + "loss": 0.8081, + "step": 18566 + }, + { + "epoch": 0.9260349127182045, + "grad_norm": 0.6540638293471657, + "learning_rate": 7.149025987745278e-07, + "loss": 0.8787, + "step": 18567 + }, + { + "epoch": 0.9260847880299252, + "grad_norm": 0.5345492318256432, + "learning_rate": 7.139440534860237e-07, + "loss": 0.8178, + "step": 18568 + }, + { + "epoch": 0.9261346633416458, + "grad_norm": 0.6253412224121867, + "learning_rate": 7.129861419312822e-07, + "loss": 0.8158, + "step": 18569 + }, + { + "epoch": 0.9261845386533666, + "grad_norm": 0.6731089758278613, + "learning_rate": 7.120288641352973e-07, + "loss": 0.8929, + "step": 18570 + }, + { + "epoch": 0.9262344139650873, + "grad_norm": 1.184002598937128, + "learning_rate": 7.110722201230491e-07, + "loss": 0.8697, + "step": 18571 + }, + { + "epoch": 0.926284289276808, + "grad_norm": 0.6310912030502894, + "learning_rate": 7.10116209919498e-07, + "loss": 0.8589, + "step": 18572 + }, + { + "epoch": 0.9263341645885287, + "grad_norm": 0.6154842661903726, + "learning_rate": 7.091608335495992e-07, + "loss": 0.8548, + "step": 18573 + }, + { + "epoch": 0.9263840399002494, + "grad_norm": 0.5955106791935038, + "learning_rate": 7.082060910382743e-07, + "loss": 0.8513, + "step": 18574 + }, + { + "epoch": 0.92643391521197, + "grad_norm": 0.6495772842804737, + "learning_rate": 7.072519824104423e-07, + "loss": 0.844, + "step": 18575 + }, + { + "epoch": 0.9264837905236908, + "grad_norm": 0.6205520706410979, + "learning_rate": 7.062985076909944e-07, + "loss": 0.8882, + "step": 18576 + }, + { + "epoch": 0.9265336658354115, + "grad_norm": 0.8189551345849824, + "learning_rate": 7.053456669048164e-07, + "loss": 0.8478, + "step": 18577 + }, + { + "epoch": 0.9265835411471322, + "grad_norm": 0.6081456781655242, + "learning_rate": 7.043934600767688e-07, + "loss": 0.8484, + "step": 18578 + }, + { + "epoch": 0.9266334164588529, + "grad_norm": 0.5629361492974364, + "learning_rate": 7.034418872317039e-07, + "loss": 0.8332, + "step": 18579 + }, + { + "epoch": 0.9266832917705735, + "grad_norm": 0.5613078420284987, + "learning_rate": 7.024909483944436e-07, + "loss": 0.8189, + "step": 18580 + }, + { + "epoch": 0.9267331670822943, + "grad_norm": 0.6432226732414641, + "learning_rate": 7.015406435898153e-07, + "loss": 0.8137, + "step": 18581 + }, + { + "epoch": 0.926783042394015, + "grad_norm": 0.6676408688306817, + "learning_rate": 7.005909728426046e-07, + "loss": 0.8023, + "step": 18582 + }, + { + "epoch": 0.9268329177057356, + "grad_norm": 0.6755287815780744, + "learning_rate": 6.996419361775974e-07, + "loss": 0.8179, + "step": 18583 + }, + { + "epoch": 0.9268827930174564, + "grad_norm": 0.6268123267697101, + "learning_rate": 6.986935336195572e-07, + "loss": 0.846, + "step": 18584 + }, + { + "epoch": 0.9269326683291771, + "grad_norm": 0.5992215039011111, + "learning_rate": 6.97745765193239e-07, + "loss": 0.7919, + "step": 18585 + }, + { + "epoch": 0.9269825436408977, + "grad_norm": 0.7381090786235367, + "learning_rate": 6.967986309233649e-07, + "loss": 0.8414, + "step": 18586 + }, + { + "epoch": 0.9270324189526185, + "grad_norm": 0.6504606036237285, + "learning_rate": 6.958521308346538e-07, + "loss": 0.8668, + "step": 18587 + }, + { + "epoch": 0.9270822942643392, + "grad_norm": 0.6857717006596061, + "learning_rate": 6.949062649518028e-07, + "loss": 0.8365, + "step": 18588 + }, + { + "epoch": 0.9271321695760598, + "grad_norm": 0.5512158097748255, + "learning_rate": 6.939610332994978e-07, + "loss": 0.8181, + "step": 18589 + }, + { + "epoch": 0.9271820448877806, + "grad_norm": 0.6190143541988691, + "learning_rate": 6.930164359024022e-07, + "loss": 0.8461, + "step": 18590 + }, + { + "epoch": 0.9272319201995013, + "grad_norm": 0.6909567431661954, + "learning_rate": 6.92072472785163e-07, + "loss": 0.8773, + "step": 18591 + }, + { + "epoch": 0.9272817955112219, + "grad_norm": 0.5344402039949608, + "learning_rate": 6.911291439724133e-07, + "loss": 0.8515, + "step": 18592 + }, + { + "epoch": 0.9273316708229427, + "grad_norm": 0.8587439506807136, + "learning_rate": 6.901864494887694e-07, + "loss": 0.8025, + "step": 18593 + }, + { + "epoch": 0.9273815461346634, + "grad_norm": 0.6767402632751003, + "learning_rate": 6.892443893588313e-07, + "loss": 0.8464, + "step": 18594 + }, + { + "epoch": 0.927431421446384, + "grad_norm": 0.6470772344164083, + "learning_rate": 6.883029636071819e-07, + "loss": 0.843, + "step": 18595 + }, + { + "epoch": 0.9274812967581048, + "grad_norm": 0.5910571324547299, + "learning_rate": 6.87362172258385e-07, + "loss": 0.8391, + "step": 18596 + }, + { + "epoch": 0.9275311720698254, + "grad_norm": 0.6269417508762715, + "learning_rate": 6.864220153369905e-07, + "loss": 0.8237, + "step": 18597 + }, + { + "epoch": 0.9275810473815461, + "grad_norm": 0.7572414639619308, + "learning_rate": 6.854824928675313e-07, + "loss": 0.8316, + "step": 18598 + }, + { + "epoch": 0.9276309226932669, + "grad_norm": 0.47937160056846384, + "learning_rate": 6.845436048745241e-07, + "loss": 0.8337, + "step": 18599 + }, + { + "epoch": 0.9276807980049875, + "grad_norm": 0.5652951992069963, + "learning_rate": 6.836053513824769e-07, + "loss": 0.819, + "step": 18600 + }, + { + "epoch": 0.9277306733167082, + "grad_norm": 0.6410715831273417, + "learning_rate": 6.826677324158564e-07, + "loss": 0.851, + "step": 18601 + }, + { + "epoch": 0.927780548628429, + "grad_norm": 0.6064614269661294, + "learning_rate": 6.81730747999143e-07, + "loss": 0.8397, + "step": 18602 + }, + { + "epoch": 0.9278304239401496, + "grad_norm": 0.6208042644113767, + "learning_rate": 6.80794398156781e-07, + "loss": 0.8192, + "step": 18603 + }, + { + "epoch": 0.9278802992518703, + "grad_norm": 0.5498083522553007, + "learning_rate": 6.798586829132064e-07, + "loss": 0.8096, + "step": 18604 + }, + { + "epoch": 0.9279301745635911, + "grad_norm": 0.5689469795412277, + "learning_rate": 6.78923602292833e-07, + "loss": 0.8334, + "step": 18605 + }, + { + "epoch": 0.9279800498753117, + "grad_norm": 0.8187541513074168, + "learning_rate": 6.779891563200663e-07, + "loss": 0.8789, + "step": 18606 + }, + { + "epoch": 0.9280299251870324, + "grad_norm": 0.6628280244345537, + "learning_rate": 6.770553450192868e-07, + "loss": 0.8661, + "step": 18607 + }, + { + "epoch": 0.9280798004987532, + "grad_norm": 0.6634998063674086, + "learning_rate": 6.761221684148611e-07, + "loss": 0.8895, + "step": 18608 + }, + { + "epoch": 0.9281296758104738, + "grad_norm": 0.6745360405512077, + "learning_rate": 6.751896265311419e-07, + "loss": 0.8313, + "step": 18609 + }, + { + "epoch": 0.9281795511221945, + "grad_norm": 0.6349045704551061, + "learning_rate": 6.742577193924626e-07, + "loss": 0.7949, + "step": 18610 + }, + { + "epoch": 0.9282294264339153, + "grad_norm": 0.6472148548204751, + "learning_rate": 6.733264470231426e-07, + "loss": 0.8322, + "step": 18611 + }, + { + "epoch": 0.9282793017456359, + "grad_norm": 0.5583274822878515, + "learning_rate": 6.72395809447482e-07, + "loss": 0.8613, + "step": 18612 + }, + { + "epoch": 0.9283291770573566, + "grad_norm": 0.5477051953014171, + "learning_rate": 6.714658066897639e-07, + "loss": 0.8257, + "step": 18613 + }, + { + "epoch": 0.9283790523690774, + "grad_norm": 0.6634125405526128, + "learning_rate": 6.70536438774258e-07, + "loss": 0.8712, + "step": 18614 + }, + { + "epoch": 0.928428927680798, + "grad_norm": 0.6467785363933366, + "learning_rate": 6.69607705725217e-07, + "loss": 0.8247, + "step": 18615 + }, + { + "epoch": 0.9284788029925187, + "grad_norm": 0.5350351060874583, + "learning_rate": 6.686796075668717e-07, + "loss": 0.8266, + "step": 18616 + }, + { + "epoch": 0.9285286783042394, + "grad_norm": 0.7481924466442311, + "learning_rate": 6.677521443234414e-07, + "loss": 0.805, + "step": 18617 + }, + { + "epoch": 0.9285785536159601, + "grad_norm": 0.7003021430497044, + "learning_rate": 6.668253160191345e-07, + "loss": 0.8753, + "step": 18618 + }, + { + "epoch": 0.9286284289276808, + "grad_norm": 0.6726377034106492, + "learning_rate": 6.658991226781264e-07, + "loss": 0.8235, + "step": 18619 + }, + { + "epoch": 0.9286783042394015, + "grad_norm": 0.6437516518434926, + "learning_rate": 6.649735643245919e-07, + "loss": 0.7639, + "step": 18620 + }, + { + "epoch": 0.9287281795511222, + "grad_norm": 0.7211045733544625, + "learning_rate": 6.640486409826785e-07, + "loss": 0.8461, + "step": 18621 + }, + { + "epoch": 0.9287780548628429, + "grad_norm": 0.716057102078674, + "learning_rate": 6.631243526765307e-07, + "loss": 0.8275, + "step": 18622 + }, + { + "epoch": 0.9288279301745636, + "grad_norm": 0.6437095014281963, + "learning_rate": 6.622006994302543e-07, + "loss": 0.8134, + "step": 18623 + }, + { + "epoch": 0.9288778054862843, + "grad_norm": 0.6324828419002325, + "learning_rate": 6.612776812679633e-07, + "loss": 0.8423, + "step": 18624 + }, + { + "epoch": 0.928927680798005, + "grad_norm": 0.5973766140944075, + "learning_rate": 6.603552982137329e-07, + "loss": 0.8405, + "step": 18625 + }, + { + "epoch": 0.9289775561097257, + "grad_norm": 0.7096107895326821, + "learning_rate": 6.594335502916437e-07, + "loss": 0.8433, + "step": 18626 + }, + { + "epoch": 0.9290274314214464, + "grad_norm": 0.5241371076138414, + "learning_rate": 6.585124375257407e-07, + "loss": 0.8407, + "step": 18627 + }, + { + "epoch": 0.9290773067331671, + "grad_norm": 0.637483661888409, + "learning_rate": 6.575919599400626e-07, + "loss": 0.8556, + "step": 18628 + }, + { + "epoch": 0.9291271820448878, + "grad_norm": 0.7919972407239279, + "learning_rate": 6.566721175586293e-07, + "loss": 0.7975, + "step": 18629 + }, + { + "epoch": 0.9291770573566085, + "grad_norm": 0.7205295996056691, + "learning_rate": 6.557529104054382e-07, + "loss": 0.8716, + "step": 18630 + }, + { + "epoch": 0.9292269326683292, + "grad_norm": 0.6653962598094391, + "learning_rate": 6.548343385044814e-07, + "loss": 0.8736, + "step": 18631 + }, + { + "epoch": 0.9292768079800499, + "grad_norm": 0.7477555178077266, + "learning_rate": 6.539164018797284e-07, + "loss": 0.77, + "step": 18632 + }, + { + "epoch": 0.9293266832917706, + "grad_norm": 0.6775011185025257, + "learning_rate": 6.529991005551322e-07, + "loss": 0.8341, + "step": 18633 + }, + { + "epoch": 0.9293765586034912, + "grad_norm": 0.6631815251309855, + "learning_rate": 6.520824345546239e-07, + "loss": 0.8492, + "step": 18634 + }, + { + "epoch": 0.929426433915212, + "grad_norm": 0.6389374456941695, + "learning_rate": 6.511664039021315e-07, + "loss": 0.8663, + "step": 18635 + }, + { + "epoch": 0.9294763092269327, + "grad_norm": 0.7095762132812028, + "learning_rate": 6.502510086215525e-07, + "loss": 0.8758, + "step": 18636 + }, + { + "epoch": 0.9295261845386533, + "grad_norm": 0.5458290206106934, + "learning_rate": 6.493362487367788e-07, + "loss": 0.8281, + "step": 18637 + }, + { + "epoch": 0.9295760598503741, + "grad_norm": 0.5613403415964199, + "learning_rate": 6.484221242716721e-07, + "loss": 0.8618, + "step": 18638 + }, + { + "epoch": 0.9296259351620948, + "grad_norm": 0.6028964188878396, + "learning_rate": 6.475086352500965e-07, + "loss": 0.8498, + "step": 18639 + }, + { + "epoch": 0.9296758104738154, + "grad_norm": 0.6016748073347264, + "learning_rate": 6.465957816958829e-07, + "loss": 0.8301, + "step": 18640 + }, + { + "epoch": 0.9297256857855362, + "grad_norm": 0.5326580892487236, + "learning_rate": 6.45683563632854e-07, + "loss": 0.8445, + "step": 18641 + }, + { + "epoch": 0.9297755610972569, + "grad_norm": 0.5868125303447663, + "learning_rate": 6.447719810848129e-07, + "loss": 0.8552, + "step": 18642 + }, + { + "epoch": 0.9298254364089775, + "grad_norm": 0.5932531909779316, + "learning_rate": 6.438610340755463e-07, + "loss": 0.8311, + "step": 18643 + }, + { + "epoch": 0.9298753117206983, + "grad_norm": 0.6401324728125913, + "learning_rate": 6.429507226288267e-07, + "loss": 0.834, + "step": 18644 + }, + { + "epoch": 0.929925187032419, + "grad_norm": 0.5946621904284238, + "learning_rate": 6.420410467684074e-07, + "loss": 0.8779, + "step": 18645 + }, + { + "epoch": 0.9299750623441396, + "grad_norm": 0.5552328050300459, + "learning_rate": 6.411320065180221e-07, + "loss": 0.837, + "step": 18646 + }, + { + "epoch": 0.9300249376558604, + "grad_norm": 0.7035073379501188, + "learning_rate": 6.402236019013991e-07, + "loss": 0.7917, + "step": 18647 + }, + { + "epoch": 0.9300748129675811, + "grad_norm": 0.8059646495957028, + "learning_rate": 6.393158329422388e-07, + "loss": 0.8065, + "step": 18648 + }, + { + "epoch": 0.9301246882793017, + "grad_norm": 0.5372597346401408, + "learning_rate": 6.38408699664228e-07, + "loss": 0.779, + "step": 18649 + }, + { + "epoch": 0.9301745635910225, + "grad_norm": 0.5447064919922996, + "learning_rate": 6.375022020910393e-07, + "loss": 0.8297, + "step": 18650 + }, + { + "epoch": 0.9302244389027431, + "grad_norm": 0.7114987366500393, + "learning_rate": 6.365963402463288e-07, + "loss": 0.8129, + "step": 18651 + }, + { + "epoch": 0.9302743142144638, + "grad_norm": 1.296333574020186, + "learning_rate": 6.356911141537331e-07, + "loss": 0.8078, + "step": 18652 + }, + { + "epoch": 0.9303241895261846, + "grad_norm": 0.6055467286589479, + "learning_rate": 6.34786523836875e-07, + "loss": 0.8983, + "step": 18653 + }, + { + "epoch": 0.9303740648379052, + "grad_norm": 0.6291451259142524, + "learning_rate": 6.338825693193523e-07, + "loss": 0.8471, + "step": 18654 + }, + { + "epoch": 0.9304239401496259, + "grad_norm": 0.5439919778388499, + "learning_rate": 6.329792506247684e-07, + "loss": 0.7767, + "step": 18655 + }, + { + "epoch": 0.9304738154613467, + "grad_norm": 0.5283026308454247, + "learning_rate": 6.320765677766766e-07, + "loss": 0.7997, + "step": 18656 + }, + { + "epoch": 0.9305236907730673, + "grad_norm": 0.6580604876673097, + "learning_rate": 6.311745207986469e-07, + "loss": 0.8089, + "step": 18657 + }, + { + "epoch": 0.930573566084788, + "grad_norm": 0.6011624159856391, + "learning_rate": 6.302731097142078e-07, + "loss": 0.862, + "step": 18658 + }, + { + "epoch": 0.9306234413965088, + "grad_norm": 1.1617768431737059, + "learning_rate": 6.293723345468905e-07, + "loss": 0.8107, + "step": 18659 + }, + { + "epoch": 0.9306733167082294, + "grad_norm": 0.6769764107819709, + "learning_rate": 6.284721953201928e-07, + "loss": 0.8579, + "step": 18660 + }, + { + "epoch": 0.9307231920199501, + "grad_norm": 0.6133518574030045, + "learning_rate": 6.275726920576041e-07, + "loss": 0.8414, + "step": 18661 + }, + { + "epoch": 0.9307730673316709, + "grad_norm": 0.7396104973809459, + "learning_rate": 6.266738247825977e-07, + "loss": 0.8231, + "step": 18662 + }, + { + "epoch": 0.9308229426433915, + "grad_norm": 0.7490507343530383, + "learning_rate": 6.257755935186349e-07, + "loss": 0.8876, + "step": 18663 + }, + { + "epoch": 0.9308728179551122, + "grad_norm": 0.620048615263579, + "learning_rate": 6.248779982891445e-07, + "loss": 0.8628, + "step": 18664 + }, + { + "epoch": 0.930922693266833, + "grad_norm": 0.7104558625141273, + "learning_rate": 6.239810391175577e-07, + "loss": 0.8367, + "step": 18665 + }, + { + "epoch": 0.9309725685785536, + "grad_norm": 0.48063963694647277, + "learning_rate": 6.230847160272751e-07, + "loss": 0.8044, + "step": 18666 + }, + { + "epoch": 0.9310224438902743, + "grad_norm": 0.5628545779318523, + "learning_rate": 6.221890290416837e-07, + "loss": 0.8306, + "step": 18667 + }, + { + "epoch": 0.931072319201995, + "grad_norm": 0.7469901945126782, + "learning_rate": 6.212939781841648e-07, + "loss": 0.8224, + "step": 18668 + }, + { + "epoch": 0.9311221945137157, + "grad_norm": 0.6825086805688083, + "learning_rate": 6.203995634780663e-07, + "loss": 0.8432, + "step": 18669 + }, + { + "epoch": 0.9311720698254364, + "grad_norm": 0.598151885599603, + "learning_rate": 6.195057849467306e-07, + "loss": 0.8612, + "step": 18670 + }, + { + "epoch": 0.9312219451371571, + "grad_norm": 0.6125381646957491, + "learning_rate": 6.186126426134809e-07, + "loss": 0.8437, + "step": 18671 + }, + { + "epoch": 0.9312718204488778, + "grad_norm": 0.5884273856003575, + "learning_rate": 6.177201365016206e-07, + "loss": 0.825, + "step": 18672 + }, + { + "epoch": 0.9313216957605985, + "grad_norm": 0.5899079261700749, + "learning_rate": 6.168282666344449e-07, + "loss": 0.8579, + "step": 18673 + }, + { + "epoch": 0.9313715710723192, + "grad_norm": 0.7265824401049735, + "learning_rate": 6.159370330352215e-07, + "loss": 0.8798, + "step": 18674 + }, + { + "epoch": 0.9314214463840399, + "grad_norm": 0.5752166723537341, + "learning_rate": 6.15046435727204e-07, + "loss": 0.8365, + "step": 18675 + }, + { + "epoch": 0.9314713216957606, + "grad_norm": 0.9138712566035387, + "learning_rate": 6.141564747336431e-07, + "loss": 0.8646, + "step": 18676 + }, + { + "epoch": 0.9315211970074813, + "grad_norm": 0.5912407111489054, + "learning_rate": 6.132671500777509e-07, + "loss": 0.8495, + "step": 18677 + }, + { + "epoch": 0.931571072319202, + "grad_norm": 0.6052509103682735, + "learning_rate": 6.123784617827422e-07, + "loss": 0.8055, + "step": 18678 + }, + { + "epoch": 0.9316209476309227, + "grad_norm": 1.185729261091633, + "learning_rate": 6.114904098717983e-07, + "loss": 0.7966, + "step": 18679 + }, + { + "epoch": 0.9316708229426434, + "grad_norm": 0.5167331393192547, + "learning_rate": 6.106029943681008e-07, + "loss": 0.83, + "step": 18680 + }, + { + "epoch": 0.9317206982543641, + "grad_norm": 0.7980774113921428, + "learning_rate": 6.097162152948005e-07, + "loss": 0.8628, + "step": 18681 + }, + { + "epoch": 0.9317705735660848, + "grad_norm": 0.5394472621897383, + "learning_rate": 6.088300726750429e-07, + "loss": 0.8032, + "step": 18682 + }, + { + "epoch": 0.9318204488778055, + "grad_norm": 0.5893576356774805, + "learning_rate": 6.079445665319455e-07, + "loss": 0.8545, + "step": 18683 + }, + { + "epoch": 0.9318703241895262, + "grad_norm": 0.6305506336805973, + "learning_rate": 6.070596968886206e-07, + "loss": 0.8153, + "step": 18684 + }, + { + "epoch": 0.9319201995012469, + "grad_norm": 0.607495264878446, + "learning_rate": 6.06175463768155e-07, + "loss": 0.847, + "step": 18685 + }, + { + "epoch": 0.9319700748129676, + "grad_norm": 0.6179892041433256, + "learning_rate": 6.05291867193622e-07, + "loss": 0.808, + "step": 18686 + }, + { + "epoch": 0.9320199501246883, + "grad_norm": 0.9399617856221218, + "learning_rate": 6.044089071880809e-07, + "loss": 0.8501, + "step": 18687 + }, + { + "epoch": 0.9320698254364089, + "grad_norm": 0.6715118968650097, + "learning_rate": 6.035265837745746e-07, + "loss": 0.8543, + "step": 18688 + }, + { + "epoch": 0.9321197007481297, + "grad_norm": 0.6475485051600188, + "learning_rate": 6.026448969761178e-07, + "loss": 0.8319, + "step": 18689 + }, + { + "epoch": 0.9321695760598504, + "grad_norm": 0.5691117128612909, + "learning_rate": 6.017638468157282e-07, + "loss": 0.8077, + "step": 18690 + }, + { + "epoch": 0.932219451371571, + "grad_norm": 0.5665578609915481, + "learning_rate": 6.008834333163876e-07, + "loss": 0.8357, + "step": 18691 + }, + { + "epoch": 0.9322693266832918, + "grad_norm": 0.8870746679092686, + "learning_rate": 6.000036565010802e-07, + "loss": 0.8541, + "step": 18692 + }, + { + "epoch": 0.9323192019950125, + "grad_norm": 0.6465879896373095, + "learning_rate": 5.991245163927517e-07, + "loss": 0.8458, + "step": 18693 + }, + { + "epoch": 0.9323690773067331, + "grad_norm": 0.6299946369001538, + "learning_rate": 5.98246013014353e-07, + "loss": 0.8078, + "step": 18694 + }, + { + "epoch": 0.9324189526184539, + "grad_norm": 0.4951862262818805, + "learning_rate": 5.973681463887992e-07, + "loss": 0.7911, + "step": 18695 + }, + { + "epoch": 0.9324688279301746, + "grad_norm": 0.6239550855875703, + "learning_rate": 5.964909165390081e-07, + "loss": 0.8347, + "step": 18696 + }, + { + "epoch": 0.9325187032418952, + "grad_norm": 0.5731312560149272, + "learning_rate": 5.956143234878614e-07, + "loss": 0.8173, + "step": 18697 + }, + { + "epoch": 0.932568578553616, + "grad_norm": 0.5722393480896258, + "learning_rate": 5.94738367258238e-07, + "loss": 0.8459, + "step": 18698 + }, + { + "epoch": 0.9326184538653367, + "grad_norm": 0.6228783421686869, + "learning_rate": 5.938630478729917e-07, + "loss": 0.861, + "step": 18699 + }, + { + "epoch": 0.9326683291770573, + "grad_norm": 0.5911927567270381, + "learning_rate": 5.929883653549712e-07, + "loss": 0.8517, + "step": 18700 + }, + { + "epoch": 0.9327182044887781, + "grad_norm": 0.6360384251835058, + "learning_rate": 5.921143197269913e-07, + "loss": 0.8424, + "step": 18701 + }, + { + "epoch": 0.9327680798004988, + "grad_norm": 0.5981221746579315, + "learning_rate": 5.912409110118672e-07, + "loss": 0.868, + "step": 18702 + }, + { + "epoch": 0.9328179551122194, + "grad_norm": 0.6501277453144971, + "learning_rate": 5.90368139232389e-07, + "loss": 0.8383, + "step": 18703 + }, + { + "epoch": 0.9328678304239402, + "grad_norm": 0.7375057175027178, + "learning_rate": 5.894960044113245e-07, + "loss": 0.8986, + "step": 18704 + }, + { + "epoch": 0.9329177057356608, + "grad_norm": 0.6789994816148811, + "learning_rate": 5.886245065714419e-07, + "loss": 0.8052, + "step": 18705 + }, + { + "epoch": 0.9329675810473815, + "grad_norm": 0.6967969084759142, + "learning_rate": 5.877536457354782e-07, + "loss": 0.878, + "step": 18706 + }, + { + "epoch": 0.9330174563591023, + "grad_norm": 0.5752422875439147, + "learning_rate": 5.868834219261543e-07, + "loss": 0.825, + "step": 18707 + }, + { + "epoch": 0.9330673316708229, + "grad_norm": 0.5645420553081542, + "learning_rate": 5.860138351661825e-07, + "loss": 0.8363, + "step": 18708 + }, + { + "epoch": 0.9331172069825436, + "grad_norm": 0.6011370910761404, + "learning_rate": 5.85144885478256e-07, + "loss": 0.8227, + "step": 18709 + }, + { + "epoch": 0.9331670822942644, + "grad_norm": 0.615187645293553, + "learning_rate": 5.842765728850452e-07, + "loss": 0.8161, + "step": 18710 + }, + { + "epoch": 0.933216957605985, + "grad_norm": 0.5310606541189705, + "learning_rate": 5.8340889740921e-07, + "loss": 0.8549, + "step": 18711 + }, + { + "epoch": 0.9332668329177057, + "grad_norm": 0.9969658288854637, + "learning_rate": 5.825418590733906e-07, + "loss": 0.8326, + "step": 18712 + }, + { + "epoch": 0.9333167082294265, + "grad_norm": 0.7646811828416351, + "learning_rate": 5.816754579002159e-07, + "loss": 0.8336, + "step": 18713 + }, + { + "epoch": 0.9333665835411471, + "grad_norm": 0.5792800181406826, + "learning_rate": 5.808096939122932e-07, + "loss": 0.8491, + "step": 18714 + }, + { + "epoch": 0.9334164588528678, + "grad_norm": 0.660570608934966, + "learning_rate": 5.799445671322096e-07, + "loss": 0.8171, + "step": 18715 + }, + { + "epoch": 0.9334663341645886, + "grad_norm": 0.635206038161907, + "learning_rate": 5.790800775825444e-07, + "loss": 0.8433, + "step": 18716 + }, + { + "epoch": 0.9335162094763092, + "grad_norm": 1.2202471020968053, + "learning_rate": 5.782162252858575e-07, + "loss": 0.8669, + "step": 18717 + }, + { + "epoch": 0.9335660847880299, + "grad_norm": 0.711850509878231, + "learning_rate": 5.773530102646862e-07, + "loss": 0.8604, + "step": 18718 + }, + { + "epoch": 0.9336159600997507, + "grad_norm": 0.6098160597696316, + "learning_rate": 5.764904325415599e-07, + "loss": 0.8356, + "step": 18719 + }, + { + "epoch": 0.9336658354114713, + "grad_norm": 0.7088823627361963, + "learning_rate": 5.756284921389826e-07, + "loss": 0.8606, + "step": 18720 + }, + { + "epoch": 0.933715710723192, + "grad_norm": 0.5400684651200371, + "learning_rate": 5.747671890794504e-07, + "loss": 0.7935, + "step": 18721 + }, + { + "epoch": 0.9337655860349127, + "grad_norm": 0.7041795561536895, + "learning_rate": 5.739065233854368e-07, + "loss": 0.8358, + "step": 18722 + }, + { + "epoch": 0.9338154613466334, + "grad_norm": 1.1933416041645433, + "learning_rate": 5.730464950794018e-07, + "loss": 0.8533, + "step": 18723 + }, + { + "epoch": 0.9338653366583541, + "grad_norm": 0.6264392799039339, + "learning_rate": 5.721871041837829e-07, + "loss": 0.8891, + "step": 18724 + }, + { + "epoch": 0.9339152119700748, + "grad_norm": 0.741010895930074, + "learning_rate": 5.713283507210148e-07, + "loss": 0.8029, + "step": 18725 + }, + { + "epoch": 0.9339650872817955, + "grad_norm": 0.5232750432662522, + "learning_rate": 5.704702347134939e-07, + "loss": 0.8631, + "step": 18726 + }, + { + "epoch": 0.9340149625935162, + "grad_norm": 0.6527434573830132, + "learning_rate": 5.69612756183624e-07, + "loss": 0.8579, + "step": 18727 + }, + { + "epoch": 0.9340648379052369, + "grad_norm": 0.7003530341259142, + "learning_rate": 5.687559151537736e-07, + "loss": 0.8255, + "step": 18728 + }, + { + "epoch": 0.9341147132169576, + "grad_norm": 0.7243818818323008, + "learning_rate": 5.678997116463053e-07, + "loss": 0.8262, + "step": 18729 + }, + { + "epoch": 0.9341645885286783, + "grad_norm": 0.601723757026201, + "learning_rate": 5.670441456835568e-07, + "loss": 0.8369, + "step": 18730 + }, + { + "epoch": 0.934214463840399, + "grad_norm": 0.6438216061657981, + "learning_rate": 5.661892172878603e-07, + "loss": 0.825, + "step": 18731 + }, + { + "epoch": 0.9342643391521197, + "grad_norm": 0.5980970250255978, + "learning_rate": 5.653349264815172e-07, + "loss": 0.8285, + "step": 18732 + }, + { + "epoch": 0.9343142144638404, + "grad_norm": 0.569466143660606, + "learning_rate": 5.644812732868265e-07, + "loss": 0.8473, + "step": 18733 + }, + { + "epoch": 0.9343640897755611, + "grad_norm": 0.5283967635333429, + "learning_rate": 5.636282577260593e-07, + "loss": 0.8749, + "step": 18734 + }, + { + "epoch": 0.9344139650872818, + "grad_norm": 0.565473913034841, + "learning_rate": 5.627758798214755e-07, + "loss": 0.8257, + "step": 18735 + }, + { + "epoch": 0.9344638403990025, + "grad_norm": 1.2874050230647751, + "learning_rate": 5.619241395953212e-07, + "loss": 0.8607, + "step": 18736 + }, + { + "epoch": 0.9345137157107232, + "grad_norm": 0.5791759446249336, + "learning_rate": 5.610730370698203e-07, + "loss": 0.8185, + "step": 18737 + }, + { + "epoch": 0.9345635910224439, + "grad_norm": 0.8034096498293533, + "learning_rate": 5.602225722671773e-07, + "loss": 0.8166, + "step": 18738 + }, + { + "epoch": 0.9346134663341645, + "grad_norm": 0.5710432875389234, + "learning_rate": 5.593727452095909e-07, + "loss": 0.8269, + "step": 18739 + }, + { + "epoch": 0.9346633416458853, + "grad_norm": 0.7481000727794791, + "learning_rate": 5.585235559192326e-07, + "loss": 0.8465, + "step": 18740 + }, + { + "epoch": 0.934713216957606, + "grad_norm": 0.5532056991119113, + "learning_rate": 5.576750044182622e-07, + "loss": 0.8626, + "step": 18741 + }, + { + "epoch": 0.9347630922693266, + "grad_norm": 0.6274914152434042, + "learning_rate": 5.568270907288287e-07, + "loss": 0.8669, + "step": 18742 + }, + { + "epoch": 0.9348129675810474, + "grad_norm": 0.8410537927862952, + "learning_rate": 5.559798148730505e-07, + "loss": 0.8226, + "step": 18743 + }, + { + "epoch": 0.9348628428927681, + "grad_norm": 1.0981429089961108, + "learning_rate": 5.551331768730405e-07, + "loss": 0.8424, + "step": 18744 + }, + { + "epoch": 0.9349127182044887, + "grad_norm": 0.7888821906211158, + "learning_rate": 5.542871767508867e-07, + "loss": 0.8562, + "step": 18745 + }, + { + "epoch": 0.9349625935162095, + "grad_norm": 0.7923963108880234, + "learning_rate": 5.534418145286713e-07, + "loss": 0.8451, + "step": 18746 + }, + { + "epoch": 0.9350124688279302, + "grad_norm": 0.616716591246292, + "learning_rate": 5.525970902284489e-07, + "loss": 0.8229, + "step": 18747 + }, + { + "epoch": 0.9350623441396508, + "grad_norm": 0.6378330784828489, + "learning_rate": 5.517530038722657e-07, + "loss": 0.8646, + "step": 18748 + }, + { + "epoch": 0.9351122194513716, + "grad_norm": 0.5127711381191732, + "learning_rate": 5.509095554821458e-07, + "loss": 0.7855, + "step": 18749 + }, + { + "epoch": 0.9351620947630923, + "grad_norm": 0.6052785666373518, + "learning_rate": 5.500667450800995e-07, + "loss": 0.8258, + "step": 18750 + }, + { + "epoch": 0.9352119700748129, + "grad_norm": 1.3169043224498829, + "learning_rate": 5.492245726881201e-07, + "loss": 0.8965, + "step": 18751 + }, + { + "epoch": 0.9352618453865337, + "grad_norm": 0.5568997944257881, + "learning_rate": 5.483830383281819e-07, + "loss": 0.7724, + "step": 18752 + }, + { + "epoch": 0.9353117206982544, + "grad_norm": 0.695504582660579, + "learning_rate": 5.475421420222421e-07, + "loss": 0.8327, + "step": 18753 + }, + { + "epoch": 0.935361596009975, + "grad_norm": 0.5747294410033216, + "learning_rate": 5.4670188379225e-07, + "loss": 0.7958, + "step": 18754 + }, + { + "epoch": 0.9354114713216958, + "grad_norm": 0.7821229218750338, + "learning_rate": 5.458622636601296e-07, + "loss": 0.862, + "step": 18755 + }, + { + "epoch": 0.9354613466334165, + "grad_norm": 0.620441043021715, + "learning_rate": 5.450232816477885e-07, + "loss": 0.8381, + "step": 18756 + }, + { + "epoch": 0.9355112219451371, + "grad_norm": 0.658217419254335, + "learning_rate": 5.441849377771175e-07, + "loss": 0.8475, + "step": 18757 + }, + { + "epoch": 0.9355610972568579, + "grad_norm": 0.7560483519586993, + "learning_rate": 5.433472320699961e-07, + "loss": 0.8175, + "step": 18758 + }, + { + "epoch": 0.9356109725685785, + "grad_norm": 0.7987186142018852, + "learning_rate": 5.42510164548285e-07, + "loss": 0.8356, + "step": 18759 + }, + { + "epoch": 0.9356608478802992, + "grad_norm": 0.5767025110336763, + "learning_rate": 5.416737352338275e-07, + "loss": 0.8453, + "step": 18760 + }, + { + "epoch": 0.93571072319202, + "grad_norm": 0.6274305470216304, + "learning_rate": 5.408379441484424e-07, + "loss": 0.8419, + "step": 18761 + }, + { + "epoch": 0.9357605985037406, + "grad_norm": 0.9170259261522936, + "learning_rate": 5.40002791313951e-07, + "loss": 0.8148, + "step": 18762 + }, + { + "epoch": 0.9358104738154613, + "grad_norm": 0.5402295137706474, + "learning_rate": 5.391682767521334e-07, + "loss": 0.8378, + "step": 18763 + }, + { + "epoch": 0.9358603491271821, + "grad_norm": 0.682074151242227, + "learning_rate": 5.383344004847773e-07, + "loss": 0.8085, + "step": 18764 + }, + { + "epoch": 0.9359102244389027, + "grad_norm": 0.7594614994211959, + "learning_rate": 5.375011625336324e-07, + "loss": 0.8352, + "step": 18765 + }, + { + "epoch": 0.9359600997506234, + "grad_norm": 0.5968535869413361, + "learning_rate": 5.366685629204532e-07, + "loss": 0.8359, + "step": 18766 + }, + { + "epoch": 0.9360099750623442, + "grad_norm": 0.8306522062672086, + "learning_rate": 5.35836601666953e-07, + "loss": 0.8208, + "step": 18767 + }, + { + "epoch": 0.9360598503740648, + "grad_norm": 0.5662356129241882, + "learning_rate": 5.350052787948506e-07, + "loss": 0.8071, + "step": 18768 + }, + { + "epoch": 0.9361097256857855, + "grad_norm": 0.6549443425062471, + "learning_rate": 5.341745943258369e-07, + "loss": 0.8205, + "step": 18769 + }, + { + "epoch": 0.9361596009975063, + "grad_norm": 0.6039015320247733, + "learning_rate": 5.333445482815891e-07, + "loss": 0.8217, + "step": 18770 + }, + { + "epoch": 0.9362094763092269, + "grad_norm": 0.739816724385211, + "learning_rate": 5.32515140683762e-07, + "loss": 0.8517, + "step": 18771 + }, + { + "epoch": 0.9362593516209476, + "grad_norm": 0.6886085245379262, + "learning_rate": 5.316863715540049e-07, + "loss": 0.8541, + "step": 18772 + }, + { + "epoch": 0.9363092269326684, + "grad_norm": 0.6521090698213797, + "learning_rate": 5.308582409139395e-07, + "loss": 0.8448, + "step": 18773 + }, + { + "epoch": 0.936359102244389, + "grad_norm": 0.8274735279894029, + "learning_rate": 5.300307487851792e-07, + "loss": 0.8158, + "step": 18774 + }, + { + "epoch": 0.9364089775561097, + "grad_norm": 0.5329961260986658, + "learning_rate": 5.292038951893119e-07, + "loss": 0.8451, + "step": 18775 + }, + { + "epoch": 0.9364588528678304, + "grad_norm": 0.6436053592721765, + "learning_rate": 5.283776801479207e-07, + "loss": 0.8405, + "step": 18776 + }, + { + "epoch": 0.9365087281795511, + "grad_norm": 1.2010179093913862, + "learning_rate": 5.275521036825631e-07, + "loss": 0.8685, + "step": 18777 + }, + { + "epoch": 0.9365586034912718, + "grad_norm": 0.5698876826554665, + "learning_rate": 5.267271658147777e-07, + "loss": 0.8466, + "step": 18778 + }, + { + "epoch": 0.9366084788029925, + "grad_norm": 0.744636781396397, + "learning_rate": 5.259028665660942e-07, + "loss": 0.8071, + "step": 18779 + }, + { + "epoch": 0.9366583541147132, + "grad_norm": 0.8280090260592811, + "learning_rate": 5.250792059580234e-07, + "loss": 0.8391, + "step": 18780 + }, + { + "epoch": 0.936708229426434, + "grad_norm": 0.5994155179474976, + "learning_rate": 5.242561840120563e-07, + "loss": 0.8478, + "step": 18781 + }, + { + "epoch": 0.9367581047381546, + "grad_norm": 0.5880857128032854, + "learning_rate": 5.234338007496703e-07, + "loss": 0.8062, + "step": 18782 + }, + { + "epoch": 0.9368079800498753, + "grad_norm": 0.6048520693522429, + "learning_rate": 5.226120561923259e-07, + "loss": 0.8469, + "step": 18783 + }, + { + "epoch": 0.936857855361596, + "grad_norm": 1.156114798714707, + "learning_rate": 5.217909503614643e-07, + "loss": 0.8205, + "step": 18784 + }, + { + "epoch": 0.9369077306733167, + "grad_norm": 0.6307900007902091, + "learning_rate": 5.209704832785128e-07, + "loss": 0.8213, + "step": 18785 + }, + { + "epoch": 0.9369576059850374, + "grad_norm": 0.5720729972477061, + "learning_rate": 5.201506549648793e-07, + "loss": 0.7941, + "step": 18786 + }, + { + "epoch": 0.9370074812967581, + "grad_norm": 0.5195383089097998, + "learning_rate": 5.193314654419634e-07, + "loss": 0.8125, + "step": 18787 + }, + { + "epoch": 0.9370573566084788, + "grad_norm": 0.6296382973153718, + "learning_rate": 5.185129147311341e-07, + "loss": 0.8608, + "step": 18788 + }, + { + "epoch": 0.9371072319201995, + "grad_norm": 0.7584876973183254, + "learning_rate": 5.176950028537547e-07, + "loss": 0.8735, + "step": 18789 + }, + { + "epoch": 0.9371571072319203, + "grad_norm": 1.2131298087340323, + "learning_rate": 5.168777298311639e-07, + "loss": 0.8854, + "step": 18790 + }, + { + "epoch": 0.9372069825436409, + "grad_norm": 0.6319391417389821, + "learning_rate": 5.160610956846946e-07, + "loss": 0.797, + "step": 18791 + }, + { + "epoch": 0.9372568578553616, + "grad_norm": 0.7329259477023902, + "learning_rate": 5.152451004356545e-07, + "loss": 0.8659, + "step": 18792 + }, + { + "epoch": 0.9373067331670822, + "grad_norm": 0.5685696761340143, + "learning_rate": 5.144297441053353e-07, + "loss": 0.7906, + "step": 18793 + }, + { + "epoch": 0.937356608478803, + "grad_norm": 3.5495965507074616, + "learning_rate": 5.136150267150113e-07, + "loss": 0.8213, + "step": 18794 + }, + { + "epoch": 0.9374064837905237, + "grad_norm": 0.6260226270808893, + "learning_rate": 5.128009482859464e-07, + "loss": 0.8002, + "step": 18795 + }, + { + "epoch": 0.9374563591022443, + "grad_norm": 0.7417394133802182, + "learning_rate": 5.119875088393816e-07, + "loss": 0.8779, + "step": 18796 + }, + { + "epoch": 0.9375062344139651, + "grad_norm": 0.6138931995038178, + "learning_rate": 5.111747083965418e-07, + "loss": 0.8647, + "step": 18797 + }, + { + "epoch": 0.9375561097256858, + "grad_norm": 0.7548349768615967, + "learning_rate": 5.103625469786377e-07, + "loss": 0.8217, + "step": 18798 + }, + { + "epoch": 0.9376059850374064, + "grad_norm": 0.7433284451965493, + "learning_rate": 5.095510246068664e-07, + "loss": 0.8192, + "step": 18799 + }, + { + "epoch": 0.9376558603491272, + "grad_norm": 0.6286420726099995, + "learning_rate": 5.087401413023968e-07, + "loss": 0.8362, + "step": 18800 + }, + { + "epoch": 0.9377057356608479, + "grad_norm": 0.5948472157091627, + "learning_rate": 5.079298970863955e-07, + "loss": 0.8099, + "step": 18801 + }, + { + "epoch": 0.9377556109725685, + "grad_norm": 0.5700532056360672, + "learning_rate": 5.071202919799984e-07, + "loss": 0.8289, + "step": 18802 + }, + { + "epoch": 0.9378054862842893, + "grad_norm": 0.6290476243023456, + "learning_rate": 5.063113260043412e-07, + "loss": 0.8747, + "step": 18803 + }, + { + "epoch": 0.93785536159601, + "grad_norm": 0.8395628774890628, + "learning_rate": 5.055029991805238e-07, + "loss": 0.8245, + "step": 18804 + }, + { + "epoch": 0.9379052369077306, + "grad_norm": 0.6289643089348995, + "learning_rate": 5.04695311529646e-07, + "loss": 0.8463, + "step": 18805 + }, + { + "epoch": 0.9379551122194514, + "grad_norm": 0.5541168398426358, + "learning_rate": 5.038882630727798e-07, + "loss": 0.8102, + "step": 18806 + }, + { + "epoch": 0.9380049875311721, + "grad_norm": 0.5514249383350099, + "learning_rate": 5.030818538309861e-07, + "loss": 0.8325, + "step": 18807 + }, + { + "epoch": 0.9380548628428927, + "grad_norm": 0.5756709278878274, + "learning_rate": 5.022760838253066e-07, + "loss": 0.7871, + "step": 18808 + }, + { + "epoch": 0.9381047381546135, + "grad_norm": 0.8069323699188322, + "learning_rate": 5.014709530767714e-07, + "loss": 0.8607, + "step": 18809 + }, + { + "epoch": 0.9381546134663342, + "grad_norm": 0.5962642766387943, + "learning_rate": 5.006664616063889e-07, + "loss": 0.8187, + "step": 18810 + }, + { + "epoch": 0.9382044887780548, + "grad_norm": 1.531470373036372, + "learning_rate": 4.998626094351477e-07, + "loss": 0.8082, + "step": 18811 + }, + { + "epoch": 0.9382543640897756, + "grad_norm": 0.6821208689613081, + "learning_rate": 4.990593965840257e-07, + "loss": 0.8501, + "step": 18812 + }, + { + "epoch": 0.9383042394014962, + "grad_norm": 0.7601921849810565, + "learning_rate": 4.982568230739865e-07, + "loss": 0.8259, + "step": 18813 + }, + { + "epoch": 0.938354114713217, + "grad_norm": 0.6112224054454405, + "learning_rate": 4.974548889259717e-07, + "loss": 0.859, + "step": 18814 + }, + { + "epoch": 0.9384039900249377, + "grad_norm": 0.5906618608920166, + "learning_rate": 4.966535941609035e-07, + "loss": 0.811, + "step": 18815 + }, + { + "epoch": 0.9384538653366583, + "grad_norm": 0.5670972388839643, + "learning_rate": 4.958529387996902e-07, + "loss": 0.8167, + "step": 18816 + }, + { + "epoch": 0.938503740648379, + "grad_norm": 0.54319599419591, + "learning_rate": 4.950529228632316e-07, + "loss": 0.8834, + "step": 18817 + }, + { + "epoch": 0.9385536159600998, + "grad_norm": 0.5759449590025307, + "learning_rate": 4.942535463724002e-07, + "loss": 0.8299, + "step": 18818 + }, + { + "epoch": 0.9386034912718204, + "grad_norm": 0.5859494012280355, + "learning_rate": 4.934548093480512e-07, + "loss": 0.8044, + "step": 18819 + }, + { + "epoch": 0.9386533665835411, + "grad_norm": 0.6206437860140245, + "learning_rate": 4.926567118110348e-07, + "loss": 0.8085, + "step": 18820 + }, + { + "epoch": 0.9387032418952619, + "grad_norm": 0.6244618698395226, + "learning_rate": 4.918592537821731e-07, + "loss": 0.8357, + "step": 18821 + }, + { + "epoch": 0.9387531172069825, + "grad_norm": 0.5799388402233706, + "learning_rate": 4.910624352822774e-07, + "loss": 0.7992, + "step": 18822 + }, + { + "epoch": 0.9388029925187032, + "grad_norm": 0.5427927136514934, + "learning_rate": 4.902662563321337e-07, + "loss": 0.8221, + "step": 18823 + }, + { + "epoch": 0.938852867830424, + "grad_norm": 0.48864235053908245, + "learning_rate": 4.894707169525282e-07, + "loss": 0.8011, + "step": 18824 + }, + { + "epoch": 0.9389027431421446, + "grad_norm": 1.1714082274435322, + "learning_rate": 4.886758171642108e-07, + "loss": 0.8406, + "step": 18825 + }, + { + "epoch": 0.9389526184538654, + "grad_norm": 0.7855477146220905, + "learning_rate": 4.878815569879319e-07, + "loss": 0.8115, + "step": 18826 + }, + { + "epoch": 0.9390024937655861, + "grad_norm": 0.6288342230359948, + "learning_rate": 4.870879364444109e-07, + "loss": 0.8264, + "step": 18827 + }, + { + "epoch": 0.9390523690773067, + "grad_norm": 0.8004390791035787, + "learning_rate": 4.862949555543588e-07, + "loss": 0.8568, + "step": 18828 + }, + { + "epoch": 0.9391022443890275, + "grad_norm": 0.5639008767388018, + "learning_rate": 4.855026143384733e-07, + "loss": 0.8074, + "step": 18829 + }, + { + "epoch": 0.9391521197007481, + "grad_norm": 0.5139229620472309, + "learning_rate": 4.84710912817421e-07, + "loss": 0.7719, + "step": 18830 + }, + { + "epoch": 0.9392019950124688, + "grad_norm": 0.5908302793921952, + "learning_rate": 4.839198510118687e-07, + "loss": 0.8427, + "step": 18831 + }, + { + "epoch": 0.9392518703241896, + "grad_norm": 0.5546430783146198, + "learning_rate": 4.831294289424553e-07, + "loss": 0.8255, + "step": 18832 + }, + { + "epoch": 0.9393017456359102, + "grad_norm": 0.5937905707599016, + "learning_rate": 4.823396466298063e-07, + "loss": 0.8239, + "step": 18833 + }, + { + "epoch": 0.9393516209476309, + "grad_norm": 0.6319484574054388, + "learning_rate": 4.815505040945328e-07, + "loss": 0.82, + "step": 18834 + }, + { + "epoch": 0.9394014962593517, + "grad_norm": 0.6871100634459804, + "learning_rate": 4.807620013572239e-07, + "loss": 0.8213, + "step": 18835 + }, + { + "epoch": 0.9394513715710723, + "grad_norm": 1.3621387279418105, + "learning_rate": 4.799741384384604e-07, + "loss": 0.8859, + "step": 18836 + }, + { + "epoch": 0.939501246882793, + "grad_norm": 0.8011185182113351, + "learning_rate": 4.791869153587952e-07, + "loss": 0.8499, + "step": 18837 + }, + { + "epoch": 0.9395511221945138, + "grad_norm": 0.8161729964843487, + "learning_rate": 4.784003321387758e-07, + "loss": 0.8641, + "step": 18838 + }, + { + "epoch": 0.9396009975062344, + "grad_norm": 0.6535246543273401, + "learning_rate": 4.776143887989221e-07, + "loss": 0.8272, + "step": 18839 + }, + { + "epoch": 0.9396508728179551, + "grad_norm": 0.7816462872334417, + "learning_rate": 4.768290853597507e-07, + "loss": 0.8563, + "step": 18840 + }, + { + "epoch": 0.9397007481296759, + "grad_norm": 0.573866603154084, + "learning_rate": 4.760444218417426e-07, + "loss": 0.7925, + "step": 18841 + }, + { + "epoch": 0.9397506234413965, + "grad_norm": 0.6355335425836659, + "learning_rate": 4.7526039826538415e-07, + "loss": 0.9071, + "step": 18842 + }, + { + "epoch": 0.9398004987531172, + "grad_norm": 0.5908415588303395, + "learning_rate": 4.7447701465113124e-07, + "loss": 0.8483, + "step": 18843 + }, + { + "epoch": 0.939850374064838, + "grad_norm": 0.5932635537288448, + "learning_rate": 4.736942710194231e-07, + "loss": 0.8429, + "step": 18844 + }, + { + "epoch": 0.9399002493765586, + "grad_norm": 0.5770885023580836, + "learning_rate": 4.7291216739068224e-07, + "loss": 0.8293, + "step": 18845 + }, + { + "epoch": 0.9399501246882793, + "grad_norm": 0.5594030195339454, + "learning_rate": 4.7213070378532563e-07, + "loss": 0.8233, + "step": 18846 + }, + { + "epoch": 0.94, + "grad_norm": 0.8146909002234062, + "learning_rate": 4.713498802237426e-07, + "loss": 0.7946, + "step": 18847 + }, + { + "epoch": 0.9400498753117207, + "grad_norm": 0.556092632793061, + "learning_rate": 4.705696967263057e-07, + "loss": 0.8512, + "step": 18848 + }, + { + "epoch": 0.9400997506234414, + "grad_norm": 0.5567756277351318, + "learning_rate": 4.6979015331337085e-07, + "loss": 0.7908, + "step": 18849 + }, + { + "epoch": 0.940149625935162, + "grad_norm": 3.8985652539189237, + "learning_rate": 4.6901125000528567e-07, + "loss": 0.8506, + "step": 18850 + }, + { + "epoch": 0.9401995012468828, + "grad_norm": 0.7340669162353058, + "learning_rate": 4.682329868223756e-07, + "loss": 0.8024, + "step": 18851 + }, + { + "epoch": 0.9402493765586035, + "grad_norm": 0.5573659660125525, + "learning_rate": 4.674553637849466e-07, + "loss": 0.8611, + "step": 18852 + }, + { + "epoch": 0.9402992518703241, + "grad_norm": 0.5776678460554068, + "learning_rate": 4.6667838091328796e-07, + "loss": 0.8217, + "step": 18853 + }, + { + "epoch": 0.9403491271820449, + "grad_norm": 0.6253825432804592, + "learning_rate": 4.6590203822767795e-07, + "loss": 0.8554, + "step": 18854 + }, + { + "epoch": 0.9403990024937656, + "grad_norm": 0.5493578422404028, + "learning_rate": 4.651263357483754e-07, + "loss": 0.8473, + "step": 18855 + }, + { + "epoch": 0.9404488778054862, + "grad_norm": 0.5759216903241309, + "learning_rate": 4.643512734956168e-07, + "loss": 0.858, + "step": 18856 + }, + { + "epoch": 0.940498753117207, + "grad_norm": 0.7919935989698623, + "learning_rate": 4.63576851489636e-07, + "loss": 0.8044, + "step": 18857 + }, + { + "epoch": 0.9405486284289277, + "grad_norm": 0.5799612509667702, + "learning_rate": 4.628030697506336e-07, + "loss": 0.826, + "step": 18858 + }, + { + "epoch": 0.9405985037406484, + "grad_norm": 0.6578137102915493, + "learning_rate": 4.620299282988072e-07, + "loss": 0.8652, + "step": 18859 + }, + { + "epoch": 0.9406483790523691, + "grad_norm": 0.5754670910531829, + "learning_rate": 4.612574271543241e-07, + "loss": 0.8315, + "step": 18860 + }, + { + "epoch": 0.9406982543640898, + "grad_norm": 0.7964762673782979, + "learning_rate": 4.604855663373459e-07, + "loss": 0.7896, + "step": 18861 + }, + { + "epoch": 0.9407481296758105, + "grad_norm": 0.6926286164367034, + "learning_rate": 4.597143458680175e-07, + "loss": 0.8749, + "step": 18862 + }, + { + "epoch": 0.9407980049875312, + "grad_norm": 0.630854323631395, + "learning_rate": 4.5894376576645915e-07, + "loss": 0.8562, + "step": 18863 + }, + { + "epoch": 0.9408478802992518, + "grad_norm": 0.5069286253303819, + "learning_rate": 4.5817382605277956e-07, + "loss": 0.7892, + "step": 18864 + }, + { + "epoch": 0.9408977556109726, + "grad_norm": 0.6686428396300567, + "learning_rate": 4.574045267470711e-07, + "loss": 0.8506, + "step": 18865 + }, + { + "epoch": 0.9409476309226933, + "grad_norm": 0.5322972377195525, + "learning_rate": 4.566358678694066e-07, + "loss": 0.8381, + "step": 18866 + }, + { + "epoch": 0.9409975062344139, + "grad_norm": 0.678330991308597, + "learning_rate": 4.5586784943984774e-07, + "loss": 0.8717, + "step": 18867 + }, + { + "epoch": 0.9410473815461347, + "grad_norm": 0.6116878850065929, + "learning_rate": 4.551004714784285e-07, + "loss": 0.8367, + "step": 18868 + }, + { + "epoch": 0.9410972568578554, + "grad_norm": 0.5706152520537298, + "learning_rate": 4.5433373400518285e-07, + "loss": 0.8323, + "step": 18869 + }, + { + "epoch": 0.941147132169576, + "grad_norm": 0.556225913612068, + "learning_rate": 4.5356763704010877e-07, + "loss": 0.8434, + "step": 18870 + }, + { + "epoch": 0.9411970074812968, + "grad_norm": 0.5972627431268014, + "learning_rate": 4.5280218060320403e-07, + "loss": 0.8496, + "step": 18871 + }, + { + "epoch": 0.9412468827930175, + "grad_norm": 0.6762962402587932, + "learning_rate": 4.520373647144388e-07, + "loss": 0.8241, + "step": 18872 + }, + { + "epoch": 0.9412967581047381, + "grad_norm": 0.6350225363246226, + "learning_rate": 4.5127318939377494e-07, + "loss": 0.8756, + "step": 18873 + }, + { + "epoch": 0.9413466334164589, + "grad_norm": 0.6589402978543389, + "learning_rate": 4.5050965466114923e-07, + "loss": 0.8409, + "step": 18874 + }, + { + "epoch": 0.9413965087281796, + "grad_norm": 0.5265158113224473, + "learning_rate": 4.4974676053648744e-07, + "loss": 0.8285, + "step": 18875 + }, + { + "epoch": 0.9414463840399002, + "grad_norm": 0.6395602744583028, + "learning_rate": 4.489845070396931e-07, + "loss": 0.8527, + "step": 18876 + }, + { + "epoch": 0.941496259351621, + "grad_norm": 0.9853498375736576, + "learning_rate": 4.482228941906669e-07, + "loss": 0.828, + "step": 18877 + }, + { + "epoch": 0.9415461346633417, + "grad_norm": 1.2063535370643725, + "learning_rate": 4.474619220092707e-07, + "loss": 0.7901, + "step": 18878 + }, + { + "epoch": 0.9415960099750623, + "grad_norm": 0.7558111552499407, + "learning_rate": 4.467015905153693e-07, + "loss": 0.8387, + "step": 18879 + }, + { + "epoch": 0.9416458852867831, + "grad_norm": 0.6612455350965579, + "learning_rate": 4.459418997288023e-07, + "loss": 0.8485, + "step": 18880 + }, + { + "epoch": 0.9416957605985038, + "grad_norm": 0.6929660036546356, + "learning_rate": 4.451828496693927e-07, + "loss": 0.8341, + "step": 18881 + }, + { + "epoch": 0.9417456359102244, + "grad_norm": 0.6568222132888275, + "learning_rate": 4.444244403569442e-07, + "loss": 0.8082, + "step": 18882 + }, + { + "epoch": 0.9417955112219452, + "grad_norm": 0.568401825378456, + "learning_rate": 4.4366667181125475e-07, + "loss": 0.8709, + "step": 18883 + }, + { + "epoch": 0.9418453865336658, + "grad_norm": 0.6052901265365187, + "learning_rate": 4.4290954405209197e-07, + "loss": 0.805, + "step": 18884 + }, + { + "epoch": 0.9418952618453865, + "grad_norm": 0.624925861967887, + "learning_rate": 4.4215305709921217e-07, + "loss": 0.8426, + "step": 18885 + }, + { + "epoch": 0.9419451371571073, + "grad_norm": 0.5122684243395949, + "learning_rate": 4.41397210972358e-07, + "loss": 0.855, + "step": 18886 + }, + { + "epoch": 0.9419950124688279, + "grad_norm": 0.5160990144021027, + "learning_rate": 4.4064200569125525e-07, + "loss": 0.8329, + "step": 18887 + }, + { + "epoch": 0.9420448877805486, + "grad_norm": 0.5283953705397618, + "learning_rate": 4.3988744127560487e-07, + "loss": 0.8414, + "step": 18888 + }, + { + "epoch": 0.9420947630922694, + "grad_norm": 0.5902958764365155, + "learning_rate": 4.3913351774510226e-07, + "loss": 0.8106, + "step": 18889 + }, + { + "epoch": 0.94214463840399, + "grad_norm": 0.6019751604131233, + "learning_rate": 4.383802351194177e-07, + "loss": 0.8365, + "step": 18890 + }, + { + "epoch": 0.9421945137157107, + "grad_norm": 0.6227578967909148, + "learning_rate": 4.376275934182078e-07, + "loss": 0.8343, + "step": 18891 + }, + { + "epoch": 0.9422443890274315, + "grad_norm": 0.5776534930981665, + "learning_rate": 4.368755926611151e-07, + "loss": 0.7965, + "step": 18892 + }, + { + "epoch": 0.9422942643391521, + "grad_norm": 1.0321724183987722, + "learning_rate": 4.3612423286776007e-07, + "loss": 0.8178, + "step": 18893 + }, + { + "epoch": 0.9423441396508728, + "grad_norm": 0.5846481367182879, + "learning_rate": 4.3537351405774917e-07, + "loss": 0.8932, + "step": 18894 + }, + { + "epoch": 0.9423940149625936, + "grad_norm": 0.7311443661404536, + "learning_rate": 4.346234362506724e-07, + "loss": 0.8448, + "step": 18895 + }, + { + "epoch": 0.9424438902743142, + "grad_norm": 0.6314294532103883, + "learning_rate": 4.338739994661056e-07, + "loss": 0.8308, + "step": 18896 + }, + { + "epoch": 0.9424937655860349, + "grad_norm": 0.5893094829746339, + "learning_rate": 4.331252037236e-07, + "loss": 0.8478, + "step": 18897 + }, + { + "epoch": 0.9425436408977557, + "grad_norm": 0.602076012734209, + "learning_rate": 4.3237704904269814e-07, + "loss": 0.8674, + "step": 18898 + }, + { + "epoch": 0.9425935162094763, + "grad_norm": 0.6215728654656806, + "learning_rate": 4.3162953544292064e-07, + "loss": 0.8546, + "step": 18899 + }, + { + "epoch": 0.942643391521197, + "grad_norm": 0.7376242667513906, + "learning_rate": 4.3088266294377954e-07, + "loss": 0.8817, + "step": 18900 + }, + { + "epoch": 0.9426932668329177, + "grad_norm": 1.021479103409798, + "learning_rate": 4.301364315647538e-07, + "loss": 0.8452, + "step": 18901 + }, + { + "epoch": 0.9427431421446384, + "grad_norm": 0.6101639223481193, + "learning_rate": 4.293908413253278e-07, + "loss": 0.8486, + "step": 18902 + }, + { + "epoch": 0.9427930174563591, + "grad_norm": 0.7019591228831503, + "learning_rate": 4.286458922449471e-07, + "loss": 0.8379, + "step": 18903 + }, + { + "epoch": 0.9428428927680798, + "grad_norm": 0.7033911108408183, + "learning_rate": 4.279015843430573e-07, + "loss": 0.8543, + "step": 18904 + }, + { + "epoch": 0.9428927680798005, + "grad_norm": 0.5353714233175428, + "learning_rate": 4.2715791763907607e-07, + "loss": 0.8318, + "step": 18905 + }, + { + "epoch": 0.9429426433915212, + "grad_norm": 0.6958050672707344, + "learning_rate": 4.264148921524158e-07, + "loss": 0.8816, + "step": 18906 + }, + { + "epoch": 0.9429925187032419, + "grad_norm": 0.5243343189791961, + "learning_rate": 4.256725079024554e-07, + "loss": 0.8478, + "step": 18907 + }, + { + "epoch": 0.9430423940149626, + "grad_norm": 0.6734198159078907, + "learning_rate": 4.249307649085765e-07, + "loss": 0.8455, + "step": 18908 + }, + { + "epoch": 0.9430922693266833, + "grad_norm": 0.5604099253415746, + "learning_rate": 4.2418966319013045e-07, + "loss": 0.8269, + "step": 18909 + }, + { + "epoch": 0.943142144638404, + "grad_norm": 0.5633315079682526, + "learning_rate": 4.2344920276645715e-07, + "loss": 0.7938, + "step": 18910 + }, + { + "epoch": 0.9431920199501247, + "grad_norm": 0.6562977196256119, + "learning_rate": 4.227093836568774e-07, + "loss": 0.8528, + "step": 18911 + }, + { + "epoch": 0.9432418952618454, + "grad_norm": 0.5171346758428488, + "learning_rate": 4.219702058806951e-07, + "loss": 0.8268, + "step": 18912 + }, + { + "epoch": 0.9432917705735661, + "grad_norm": 1.1182613738098226, + "learning_rate": 4.212316694572033e-07, + "loss": 0.8454, + "step": 18913 + }, + { + "epoch": 0.9433416458852868, + "grad_norm": 0.659636918658602, + "learning_rate": 4.204937744056725e-07, + "loss": 0.8491, + "step": 18914 + }, + { + "epoch": 0.9433915211970075, + "grad_norm": 0.5992032602326118, + "learning_rate": 4.197565207453513e-07, + "loss": 0.8232, + "step": 18915 + }, + { + "epoch": 0.9434413965087282, + "grad_norm": 0.6669739757586024, + "learning_rate": 4.1901990849548546e-07, + "loss": 0.8491, + "step": 18916 + }, + { + "epoch": 0.9434912718204489, + "grad_norm": 0.7188548509291702, + "learning_rate": 4.182839376752956e-07, + "loss": 0.8707, + "step": 18917 + }, + { + "epoch": 0.9435411471321695, + "grad_norm": 0.7426940648371405, + "learning_rate": 4.1754860830398314e-07, + "loss": 0.8687, + "step": 18918 + }, + { + "epoch": 0.9435910224438903, + "grad_norm": 0.6623822090972238, + "learning_rate": 4.168139204007382e-07, + "loss": 0.8311, + "step": 18919 + }, + { + "epoch": 0.943640897755611, + "grad_norm": 0.6135542916292079, + "learning_rate": 4.1607987398473157e-07, + "loss": 0.8253, + "step": 18920 + }, + { + "epoch": 0.9436907730673316, + "grad_norm": 0.6821422199320011, + "learning_rate": 4.153464690751202e-07, + "loss": 0.8245, + "step": 18921 + }, + { + "epoch": 0.9437406483790524, + "grad_norm": 0.669864775952782, + "learning_rate": 4.146137056910387e-07, + "loss": 0.8826, + "step": 18922 + }, + { + "epoch": 0.9437905236907731, + "grad_norm": 0.586048457094671, + "learning_rate": 4.13881583851608e-07, + "loss": 0.8264, + "step": 18923 + }, + { + "epoch": 0.9438403990024937, + "grad_norm": 0.6411800105133959, + "learning_rate": 4.1315010357593774e-07, + "loss": 0.8118, + "step": 18924 + }, + { + "epoch": 0.9438902743142145, + "grad_norm": 0.6455305398244545, + "learning_rate": 4.124192648831099e-07, + "loss": 0.8428, + "step": 18925 + }, + { + "epoch": 0.9439401496259352, + "grad_norm": 1.9205449162523391, + "learning_rate": 4.1168906779219805e-07, + "loss": 0.8344, + "step": 18926 + }, + { + "epoch": 0.9439900249376558, + "grad_norm": 0.6592518004396045, + "learning_rate": 4.1095951232225374e-07, + "loss": 0.834, + "step": 18927 + }, + { + "epoch": 0.9440399002493766, + "grad_norm": 0.5883311439147052, + "learning_rate": 4.1023059849231994e-07, + "loss": 0.8102, + "step": 18928 + }, + { + "epoch": 0.9440897755610973, + "grad_norm": 0.5910975861731291, + "learning_rate": 4.095023263214121e-07, + "loss": 0.8526, + "step": 18929 + }, + { + "epoch": 0.9441396508728179, + "grad_norm": 0.8588876409577377, + "learning_rate": 4.0877469582853434e-07, + "loss": 0.8624, + "step": 18930 + }, + { + "epoch": 0.9441895261845387, + "grad_norm": 0.6258087874134828, + "learning_rate": 4.0804770703267435e-07, + "loss": 0.8393, + "step": 18931 + }, + { + "epoch": 0.9442394014962594, + "grad_norm": 0.6712440913400375, + "learning_rate": 4.0732135995280585e-07, + "loss": 0.8488, + "step": 18932 + }, + { + "epoch": 0.94428927680798, + "grad_norm": 0.6101465959928558, + "learning_rate": 4.065956546078803e-07, + "loss": 0.8086, + "step": 18933 + }, + { + "epoch": 0.9443391521197008, + "grad_norm": 2.1546105723032767, + "learning_rate": 4.0587059101683255e-07, + "loss": 0.8382, + "step": 18934 + }, + { + "epoch": 0.9443890274314215, + "grad_norm": 0.7337693676408491, + "learning_rate": 4.0514616919858637e-07, + "loss": 0.846, + "step": 18935 + }, + { + "epoch": 0.9444389027431421, + "grad_norm": 1.1934913333822512, + "learning_rate": 4.044223891720461e-07, + "loss": 0.8214, + "step": 18936 + }, + { + "epoch": 0.9444887780548629, + "grad_norm": 0.6851475969296394, + "learning_rate": 4.0369925095609387e-07, + "loss": 0.8587, + "step": 18937 + }, + { + "epoch": 0.9445386533665835, + "grad_norm": 0.5463267259218465, + "learning_rate": 4.029767545696006e-07, + "loss": 0.8104, + "step": 18938 + }, + { + "epoch": 0.9445885286783042, + "grad_norm": 0.6577285173529904, + "learning_rate": 4.022549000314235e-07, + "loss": 0.8414, + "step": 18939 + }, + { + "epoch": 0.944638403990025, + "grad_norm": 0.5942042233581845, + "learning_rate": 4.01533687360392e-07, + "loss": 0.8299, + "step": 18940 + }, + { + "epoch": 0.9446882793017456, + "grad_norm": 1.6537011954553085, + "learning_rate": 4.008131165753326e-07, + "loss": 0.8678, + "step": 18941 + }, + { + "epoch": 0.9447381546134663, + "grad_norm": 0.6249605209566614, + "learning_rate": 4.000931876950442e-07, + "loss": 0.8289, + "step": 18942 + }, + { + "epoch": 0.9447880299251871, + "grad_norm": 0.9822433325001134, + "learning_rate": 3.9937390073832015e-07, + "loss": 0.8385, + "step": 18943 + }, + { + "epoch": 0.9448379052369077, + "grad_norm": 0.6072077760595993, + "learning_rate": 3.986552557239176e-07, + "loss": 0.8537, + "step": 18944 + }, + { + "epoch": 0.9448877805486284, + "grad_norm": 0.5393369412404029, + "learning_rate": 3.9793725267059933e-07, + "loss": 0.7841, + "step": 18945 + }, + { + "epoch": 0.9449376558603492, + "grad_norm": 0.9041033760087079, + "learning_rate": 3.972198915970976e-07, + "loss": 0.8611, + "step": 18946 + }, + { + "epoch": 0.9449875311720698, + "grad_norm": 0.5780284282678095, + "learning_rate": 3.9650317252213356e-07, + "loss": 0.8379, + "step": 18947 + }, + { + "epoch": 0.9450374064837905, + "grad_norm": 0.7652995660049137, + "learning_rate": 3.957870954644033e-07, + "loss": 0.838, + "step": 18948 + }, + { + "epoch": 0.9450872817955113, + "grad_norm": 0.6664809720452461, + "learning_rate": 3.9507166044260303e-07, + "loss": 0.8517, + "step": 18949 + }, + { + "epoch": 0.9451371571072319, + "grad_norm": 0.5042330664259803, + "learning_rate": 3.943568674753928e-07, + "loss": 0.8139, + "step": 18950 + }, + { + "epoch": 0.9451870324189526, + "grad_norm": 0.5862577718020904, + "learning_rate": 3.936427165814299e-07, + "loss": 0.828, + "step": 18951 + }, + { + "epoch": 0.9452369077306734, + "grad_norm": 0.6596935907036495, + "learning_rate": 3.9292920777934394e-07, + "loss": 0.8808, + "step": 18952 + }, + { + "epoch": 0.945286783042394, + "grad_norm": 0.677994415297844, + "learning_rate": 3.9221634108776163e-07, + "loss": 0.8329, + "step": 18953 + }, + { + "epoch": 0.9453366583541147, + "grad_norm": 0.6657791612401545, + "learning_rate": 3.9150411652528196e-07, + "loss": 0.8319, + "step": 18954 + }, + { + "epoch": 0.9453865336658354, + "grad_norm": 0.6176793283409676, + "learning_rate": 3.9079253411048735e-07, + "loss": 0.8293, + "step": 18955 + }, + { + "epoch": 0.9454364089775561, + "grad_norm": 0.5324543838677995, + "learning_rate": 3.900815938619462e-07, + "loss": 0.8352, + "step": 18956 + }, + { + "epoch": 0.9454862842892768, + "grad_norm": 0.6836984802826399, + "learning_rate": 3.893712957982132e-07, + "loss": 0.8806, + "step": 18957 + }, + { + "epoch": 0.9455361596009975, + "grad_norm": 0.5869917437759423, + "learning_rate": 3.886616399378207e-07, + "loss": 0.8344, + "step": 18958 + }, + { + "epoch": 0.9455860349127182, + "grad_norm": 0.992096423073704, + "learning_rate": 3.8795262629928996e-07, + "loss": 0.856, + "step": 18959 + }, + { + "epoch": 0.9456359102244389, + "grad_norm": 0.8872573127591058, + "learning_rate": 3.872442549011202e-07, + "loss": 0.8066, + "step": 18960 + }, + { + "epoch": 0.9456857855361596, + "grad_norm": 0.6240922167648869, + "learning_rate": 3.8653652576179646e-07, + "loss": 0.8606, + "step": 18961 + }, + { + "epoch": 0.9457356608478803, + "grad_norm": 0.5407105748600652, + "learning_rate": 3.858294388997846e-07, + "loss": 0.8365, + "step": 18962 + }, + { + "epoch": 0.945785536159601, + "grad_norm": 0.5357397594746662, + "learning_rate": 3.851229943335394e-07, + "loss": 0.8491, + "step": 18963 + }, + { + "epoch": 0.9458354114713217, + "grad_norm": 0.7321698230443148, + "learning_rate": 3.844171920814932e-07, + "loss": 0.8063, + "step": 18964 + }, + { + "epoch": 0.9458852867830424, + "grad_norm": 0.696746932575189, + "learning_rate": 3.837120321620646e-07, + "loss": 0.8656, + "step": 18965 + }, + { + "epoch": 0.9459351620947631, + "grad_norm": 0.7753092473511628, + "learning_rate": 3.830075145936529e-07, + "loss": 0.8374, + "step": 18966 + }, + { + "epoch": 0.9459850374064838, + "grad_norm": 0.5362855338924613, + "learning_rate": 3.823036393946433e-07, + "loss": 0.8089, + "step": 18967 + }, + { + "epoch": 0.9460349127182045, + "grad_norm": 0.5791130053367215, + "learning_rate": 3.816004065834017e-07, + "loss": 0.7922, + "step": 18968 + }, + { + "epoch": 0.9460847880299252, + "grad_norm": 0.5791644317643825, + "learning_rate": 3.8089781617828e-07, + "loss": 0.8559, + "step": 18969 + }, + { + "epoch": 0.9461346633416459, + "grad_norm": 0.5438215490890579, + "learning_rate": 3.8019586819761366e-07, + "loss": 0.8642, + "step": 18970 + }, + { + "epoch": 0.9461845386533666, + "grad_norm": 0.61561960632501, + "learning_rate": 3.7949456265971574e-07, + "loss": 0.8098, + "step": 18971 + }, + { + "epoch": 0.9462344139650872, + "grad_norm": 0.72453461754994, + "learning_rate": 3.7879389958289104e-07, + "loss": 0.8553, + "step": 18972 + }, + { + "epoch": 0.946284289276808, + "grad_norm": 0.571458013598006, + "learning_rate": 3.780938789854166e-07, + "loss": 0.8388, + "step": 18973 + }, + { + "epoch": 0.9463341645885287, + "grad_norm": 0.561203668238069, + "learning_rate": 3.7739450088556674e-07, + "loss": 0.8346, + "step": 18974 + }, + { + "epoch": 0.9463840399002493, + "grad_norm": 0.702925451895641, + "learning_rate": 3.7669576530158515e-07, + "loss": 0.8695, + "step": 18975 + }, + { + "epoch": 0.9464339152119701, + "grad_norm": 0.7984774239980489, + "learning_rate": 3.7599767225171003e-07, + "loss": 0.8616, + "step": 18976 + }, + { + "epoch": 0.9464837905236908, + "grad_norm": 0.5668367876198779, + "learning_rate": 3.753002217541518e-07, + "loss": 0.8363, + "step": 18977 + }, + { + "epoch": 0.9465336658354114, + "grad_norm": 0.8691837125996602, + "learning_rate": 3.7460341382711817e-07, + "loss": 0.8018, + "step": 18978 + }, + { + "epoch": 0.9465835411471322, + "grad_norm": 0.6904727986360579, + "learning_rate": 3.7390724848878345e-07, + "loss": 0.875, + "step": 18979 + }, + { + "epoch": 0.9466334164588529, + "grad_norm": 0.649030293532135, + "learning_rate": 3.73211725757322e-07, + "loss": 0.8424, + "step": 18980 + }, + { + "epoch": 0.9466832917705735, + "grad_norm": 0.6354235643155829, + "learning_rate": 3.725168456508749e-07, + "loss": 0.8143, + "step": 18981 + }, + { + "epoch": 0.9467331670822943, + "grad_norm": 0.5964041805942425, + "learning_rate": 3.7182260818758306e-07, + "loss": 0.8368, + "step": 18982 + }, + { + "epoch": 0.946783042394015, + "grad_norm": 0.5553087763266876, + "learning_rate": 3.711290133855544e-07, + "loss": 0.8726, + "step": 18983 + }, + { + "epoch": 0.9468329177057356, + "grad_norm": 0.7362337099912332, + "learning_rate": 3.7043606126289375e-07, + "loss": 0.8544, + "step": 18984 + }, + { + "epoch": 0.9468827930174564, + "grad_norm": 4.244661859347102, + "learning_rate": 3.6974375183768115e-07, + "loss": 0.8792, + "step": 18985 + }, + { + "epoch": 0.9469326683291771, + "grad_norm": 0.6062050610294204, + "learning_rate": 3.6905208512797993e-07, + "loss": 0.8466, + "step": 18986 + }, + { + "epoch": 0.9469825436408977, + "grad_norm": 0.5682455895452074, + "learning_rate": 3.683610611518451e-07, + "loss": 0.8115, + "step": 18987 + }, + { + "epoch": 0.9470324189526185, + "grad_norm": 0.7572533102114527, + "learning_rate": 3.6767067992730386e-07, + "loss": 0.8474, + "step": 18988 + }, + { + "epoch": 0.9470822942643391, + "grad_norm": 1.1634300911076636, + "learning_rate": 3.669809414723696e-07, + "loss": 0.8531, + "step": 18989 + }, + { + "epoch": 0.9471321695760598, + "grad_norm": 0.8536738947108572, + "learning_rate": 3.662918458050446e-07, + "loss": 0.8446, + "step": 18990 + }, + { + "epoch": 0.9471820448877806, + "grad_norm": 0.5898589218037502, + "learning_rate": 3.656033929433089e-07, + "loss": 0.8151, + "step": 18991 + }, + { + "epoch": 0.9472319201995012, + "grad_norm": 0.526074935858443, + "learning_rate": 3.6491558290512874e-07, + "loss": 0.8503, + "step": 18992 + }, + { + "epoch": 0.9472817955112219, + "grad_norm": 0.7148002219738667, + "learning_rate": 3.642284157084508e-07, + "loss": 0.8413, + "step": 18993 + }, + { + "epoch": 0.9473316708229427, + "grad_norm": 0.5592219122703731, + "learning_rate": 3.6354189137120797e-07, + "loss": 0.8159, + "step": 18994 + }, + { + "epoch": 0.9473815461346633, + "grad_norm": 0.5892245396656673, + "learning_rate": 3.62856009911311e-07, + "loss": 0.8369, + "step": 18995 + }, + { + "epoch": 0.947431421446384, + "grad_norm": 0.5941167405132041, + "learning_rate": 3.621707713466621e-07, + "loss": 0.8673, + "step": 18996 + }, + { + "epoch": 0.9474812967581048, + "grad_norm": 0.894700693981399, + "learning_rate": 3.6148617569514155e-07, + "loss": 0.8776, + "step": 18997 + }, + { + "epoch": 0.9475311720698254, + "grad_norm": 0.6902899205329314, + "learning_rate": 3.6080222297461e-07, + "loss": 0.8204, + "step": 18998 + }, + { + "epoch": 0.9475810473815461, + "grad_norm": 0.5368829151865088, + "learning_rate": 3.601189132029198e-07, + "loss": 0.8263, + "step": 18999 + }, + { + "epoch": 0.9476309226932669, + "grad_norm": 0.6247138584041912, + "learning_rate": 3.594362463978984e-07, + "loss": 0.8095, + "step": 19000 + }, + { + "epoch": 0.9476807980049875, + "grad_norm": 0.6230198733484267, + "learning_rate": 3.5875422257735933e-07, + "loss": 0.8299, + "step": 19001 + }, + { + "epoch": 0.9477306733167082, + "grad_norm": 0.6498786698098786, + "learning_rate": 3.5807284175910226e-07, + "loss": 0.8069, + "step": 19002 + }, + { + "epoch": 0.947780548628429, + "grad_norm": 0.5909673768133781, + "learning_rate": 3.573921039609046e-07, + "loss": 0.8417, + "step": 19003 + }, + { + "epoch": 0.9478304239401496, + "grad_norm": 0.6934665450775367, + "learning_rate": 3.567120092005327e-07, + "loss": 0.8654, + "step": 19004 + }, + { + "epoch": 0.9478802992518703, + "grad_norm": 0.6073767935396052, + "learning_rate": 3.5603255749573075e-07, + "loss": 0.8789, + "step": 19005 + }, + { + "epoch": 0.9479301745635911, + "grad_norm": 0.502738699827064, + "learning_rate": 3.553537488642317e-07, + "loss": 0.817, + "step": 19006 + }, + { + "epoch": 0.9479800498753117, + "grad_norm": 0.6118817613735885, + "learning_rate": 3.546755833237464e-07, + "loss": 0.8107, + "step": 19007 + }, + { + "epoch": 0.9480299251870324, + "grad_norm": 0.6393777589650853, + "learning_rate": 3.539980608919691e-07, + "loss": 0.8415, + "step": 19008 + }, + { + "epoch": 0.9480798004987531, + "grad_norm": 0.6135540127067322, + "learning_rate": 3.5332118158658835e-07, + "loss": 0.8266, + "step": 19009 + }, + { + "epoch": 0.9481296758104738, + "grad_norm": 0.7498830397082499, + "learning_rate": 3.5264494542525395e-07, + "loss": 0.8286, + "step": 19010 + }, + { + "epoch": 0.9481795511221945, + "grad_norm": 0.7041679234874709, + "learning_rate": 3.51969352425624e-07, + "loss": 0.8589, + "step": 19011 + }, + { + "epoch": 0.9482294264339152, + "grad_norm": 0.7382426534493076, + "learning_rate": 3.5129440260532055e-07, + "loss": 0.8114, + "step": 19012 + }, + { + "epoch": 0.9482793017456359, + "grad_norm": 0.6745952627174071, + "learning_rate": 3.5062009598196e-07, + "loss": 0.8681, + "step": 19013 + }, + { + "epoch": 0.9483291770573566, + "grad_norm": 0.6214208675071633, + "learning_rate": 3.4994643257313384e-07, + "loss": 0.8569, + "step": 19014 + }, + { + "epoch": 0.9483790523690773, + "grad_norm": 0.7089708553946242, + "learning_rate": 3.492734123964281e-07, + "loss": 0.7974, + "step": 19015 + }, + { + "epoch": 0.948428927680798, + "grad_norm": 0.6458330380627464, + "learning_rate": 3.4860103546939525e-07, + "loss": 0.8735, + "step": 19016 + }, + { + "epoch": 0.9484788029925187, + "grad_norm": 0.5867860728676901, + "learning_rate": 3.4792930180959083e-07, + "loss": 0.8186, + "step": 19017 + }, + { + "epoch": 0.9485286783042394, + "grad_norm": 0.6843389669596346, + "learning_rate": 3.4725821143453407e-07, + "loss": 0.8626, + "step": 19018 + }, + { + "epoch": 0.9485785536159601, + "grad_norm": 0.6528810961001882, + "learning_rate": 3.4658776436174434e-07, + "loss": 0.8276, + "step": 19019 + }, + { + "epoch": 0.9486284289276808, + "grad_norm": 0.6044940845173775, + "learning_rate": 3.459179606087104e-07, + "loss": 0.8427, + "step": 19020 + }, + { + "epoch": 0.9486783042394015, + "grad_norm": 0.5904080935538967, + "learning_rate": 3.452488001929155e-07, + "loss": 0.8076, + "step": 19021 + }, + { + "epoch": 0.9487281795511222, + "grad_norm": 0.5846985128028848, + "learning_rate": 3.445802831318179e-07, + "loss": 0.8677, + "step": 19022 + }, + { + "epoch": 0.948778054862843, + "grad_norm": 0.6150257850856816, + "learning_rate": 3.4391240944286474e-07, + "loss": 0.8012, + "step": 19023 + }, + { + "epoch": 0.9488279301745636, + "grad_norm": 0.6675708169522802, + "learning_rate": 3.4324517914348377e-07, + "loss": 0.8418, + "step": 19024 + }, + { + "epoch": 0.9488778054862843, + "grad_norm": 0.5094202869261157, + "learning_rate": 3.4257859225108324e-07, + "loss": 0.8084, + "step": 19025 + }, + { + "epoch": 0.9489276807980049, + "grad_norm": 0.8124533735084563, + "learning_rate": 3.419126487830576e-07, + "loss": 0.8294, + "step": 19026 + }, + { + "epoch": 0.9489775561097257, + "grad_norm": 0.6189086480086701, + "learning_rate": 3.412473487567902e-07, + "loss": 0.8726, + "step": 19027 + }, + { + "epoch": 0.9490274314214464, + "grad_norm": 0.6123446707549789, + "learning_rate": 3.405826921896338e-07, + "loss": 0.8261, + "step": 19028 + }, + { + "epoch": 0.949077306733167, + "grad_norm": 0.5577437177448796, + "learning_rate": 3.399186790989384e-07, + "loss": 0.8134, + "step": 19029 + }, + { + "epoch": 0.9491271820448878, + "grad_norm": 0.85724566626619, + "learning_rate": 3.392553095020262e-07, + "loss": 0.8439, + "step": 19030 + }, + { + "epoch": 0.9491770573566085, + "grad_norm": 0.5188379329566805, + "learning_rate": 3.3859258341621125e-07, + "loss": 0.8456, + "step": 19031 + }, + { + "epoch": 0.9492269326683291, + "grad_norm": 0.5718660539036391, + "learning_rate": 3.379305008587852e-07, + "loss": 0.8359, + "step": 19032 + }, + { + "epoch": 0.9492768079800499, + "grad_norm": 0.623737558777188, + "learning_rate": 3.372690618470259e-07, + "loss": 0.8144, + "step": 19033 + }, + { + "epoch": 0.9493266832917706, + "grad_norm": 0.5858991764025093, + "learning_rate": 3.366082663981918e-07, + "loss": 0.851, + "step": 19034 + }, + { + "epoch": 0.9493765586034912, + "grad_norm": 0.5591512403852014, + "learning_rate": 3.3594811452953014e-07, + "loss": 0.891, + "step": 19035 + }, + { + "epoch": 0.949426433915212, + "grad_norm": 0.7351588260217586, + "learning_rate": 3.352886062582605e-07, + "loss": 0.8129, + "step": 19036 + }, + { + "epoch": 0.9494763092269327, + "grad_norm": 0.669760198670713, + "learning_rate": 3.346297416015998e-07, + "loss": 0.8373, + "step": 19037 + }, + { + "epoch": 0.9495261845386533, + "grad_norm": 0.5427135653211276, + "learning_rate": 3.3397152057673407e-07, + "loss": 0.8219, + "step": 19038 + }, + { + "epoch": 0.9495760598503741, + "grad_norm": 0.5167047635743005, + "learning_rate": 3.333139432008442e-07, + "loss": 0.8092, + "step": 19039 + }, + { + "epoch": 0.9496259351620948, + "grad_norm": 0.7696108467684915, + "learning_rate": 3.326570094910886e-07, + "loss": 0.8227, + "step": 19040 + }, + { + "epoch": 0.9496758104738154, + "grad_norm": 0.9543382089917386, + "learning_rate": 3.3200071946460633e-07, + "loss": 0.8238, + "step": 19041 + }, + { + "epoch": 0.9497256857855362, + "grad_norm": 0.5781748423610803, + "learning_rate": 3.313450731385254e-07, + "loss": 0.8028, + "step": 19042 + }, + { + "epoch": 0.9497755610972568, + "grad_norm": 0.6041568106466207, + "learning_rate": 3.3069007052995426e-07, + "loss": 0.8152, + "step": 19043 + }, + { + "epoch": 0.9498254364089775, + "grad_norm": 0.678603867738072, + "learning_rate": 3.300357116559877e-07, + "loss": 0.8786, + "step": 19044 + }, + { + "epoch": 0.9498753117206983, + "grad_norm": 0.6168957298328261, + "learning_rate": 3.2938199653369803e-07, + "loss": 0.8121, + "step": 19045 + }, + { + "epoch": 0.9499251870324189, + "grad_norm": 0.6448098454104918, + "learning_rate": 3.28728925180144e-07, + "loss": 0.8344, + "step": 19046 + }, + { + "epoch": 0.9499750623441396, + "grad_norm": 0.6797509935445121, + "learning_rate": 3.280764976123646e-07, + "loss": 0.8333, + "step": 19047 + }, + { + "epoch": 0.9500249376558604, + "grad_norm": 0.5799463137220979, + "learning_rate": 3.274247138473879e-07, + "loss": 0.8286, + "step": 19048 + }, + { + "epoch": 0.950074812967581, + "grad_norm": 0.8006859663439381, + "learning_rate": 3.2677357390222265e-07, + "loss": 0.8075, + "step": 19049 + }, + { + "epoch": 0.9501246882793017, + "grad_norm": 0.5725951576281589, + "learning_rate": 3.2612307779386067e-07, + "loss": 0.8394, + "step": 19050 + }, + { + "epoch": 0.9501745635910225, + "grad_norm": 0.7200185873645473, + "learning_rate": 3.2547322553926894e-07, + "loss": 0.8849, + "step": 19051 + }, + { + "epoch": 0.9502244389027431, + "grad_norm": 0.6378539067764815, + "learning_rate": 3.2482401715541454e-07, + "loss": 0.8391, + "step": 19052 + }, + { + "epoch": 0.9502743142144638, + "grad_norm": 0.8072023766056978, + "learning_rate": 3.241754526592311e-07, + "loss": 0.8437, + "step": 19053 + }, + { + "epoch": 0.9503241895261846, + "grad_norm": 0.9362644386969281, + "learning_rate": 3.235275320676523e-07, + "loss": 0.8246, + "step": 19054 + }, + { + "epoch": 0.9503740648379052, + "grad_norm": 0.6248950963969391, + "learning_rate": 3.2288025539757293e-07, + "loss": 0.8359, + "step": 19055 + }, + { + "epoch": 0.950423940149626, + "grad_norm": 0.6051216966384735, + "learning_rate": 3.222336226658906e-07, + "loss": 0.8092, + "step": 19056 + }, + { + "epoch": 0.9504738154613467, + "grad_norm": 0.6735237615563493, + "learning_rate": 3.21587633889478e-07, + "loss": 0.8113, + "step": 19057 + }, + { + "epoch": 0.9505236907730673, + "grad_norm": 0.5346987205338055, + "learning_rate": 3.20942289085191e-07, + "loss": 0.8511, + "step": 19058 + }, + { + "epoch": 0.950573566084788, + "grad_norm": 0.6132588530410276, + "learning_rate": 3.202975882698689e-07, + "loss": 0.8443, + "step": 19059 + }, + { + "epoch": 0.9506234413965087, + "grad_norm": 0.5369099052518305, + "learning_rate": 3.1965353146033994e-07, + "loss": 0.8232, + "step": 19060 + }, + { + "epoch": 0.9506733167082294, + "grad_norm": 0.7816960213053731, + "learning_rate": 3.190101186734046e-07, + "loss": 0.8898, + "step": 19061 + }, + { + "epoch": 0.9507231920199501, + "grad_norm": 0.818807086708931, + "learning_rate": 3.1836734992585495e-07, + "loss": 0.8968, + "step": 19062 + }, + { + "epoch": 0.9507730673316708, + "grad_norm": 0.5413972463748444, + "learning_rate": 3.177252252344609e-07, + "loss": 0.8788, + "step": 19063 + }, + { + "epoch": 0.9508229426433915, + "grad_norm": 0.7290189108919866, + "learning_rate": 3.170837446159813e-07, + "loss": 0.819, + "step": 19064 + }, + { + "epoch": 0.9508728179551122, + "grad_norm": 0.5715384169036032, + "learning_rate": 3.164429080871556e-07, + "loss": 0.8787, + "step": 19065 + }, + { + "epoch": 0.9509226932668329, + "grad_norm": 0.5844257445994182, + "learning_rate": 3.158027156647064e-07, + "loss": 0.8149, + "step": 19066 + }, + { + "epoch": 0.9509725685785536, + "grad_norm": 0.5970242050215803, + "learning_rate": 3.1516316736533433e-07, + "loss": 0.833, + "step": 19067 + }, + { + "epoch": 0.9510224438902743, + "grad_norm": 0.7300124258074537, + "learning_rate": 3.1452426320573715e-07, + "loss": 0.8755, + "step": 19068 + }, + { + "epoch": 0.951072319201995, + "grad_norm": 0.6634800359079247, + "learning_rate": 3.138860032025792e-07, + "loss": 0.8456, + "step": 19069 + }, + { + "epoch": 0.9511221945137157, + "grad_norm": 0.5766098807069898, + "learning_rate": 3.1324838737251673e-07, + "loss": 0.821, + "step": 19070 + }, + { + "epoch": 0.9511720698254365, + "grad_norm": 0.78946138731047, + "learning_rate": 3.126114157321891e-07, + "loss": 0.8221, + "step": 19071 + }, + { + "epoch": 0.9512219451371571, + "grad_norm": 0.664382585432895, + "learning_rate": 3.1197508829822197e-07, + "loss": 0.8432, + "step": 19072 + }, + { + "epoch": 0.9512718204488778, + "grad_norm": 0.6414420918824789, + "learning_rate": 3.1133940508721315e-07, + "loss": 0.9013, + "step": 19073 + }, + { + "epoch": 0.9513216957605986, + "grad_norm": 0.5582898387291847, + "learning_rate": 3.1070436611575214e-07, + "loss": 0.8181, + "step": 19074 + }, + { + "epoch": 0.9513715710723192, + "grad_norm": 0.6955326059480383, + "learning_rate": 3.100699714004118e-07, + "loss": 0.8419, + "step": 19075 + }, + { + "epoch": 0.9514214463840399, + "grad_norm": 0.5973203413428946, + "learning_rate": 3.094362209577456e-07, + "loss": 0.8433, + "step": 19076 + }, + { + "epoch": 0.9514713216957607, + "grad_norm": 0.7950458087733662, + "learning_rate": 3.08803114804293e-07, + "loss": 0.869, + "step": 19077 + }, + { + "epoch": 0.9515211970074813, + "grad_norm": 0.6454386556066395, + "learning_rate": 3.0817065295657143e-07, + "loss": 0.8217, + "step": 19078 + }, + { + "epoch": 0.951571072319202, + "grad_norm": 0.5700764420736556, + "learning_rate": 3.0753883543108706e-07, + "loss": 0.8435, + "step": 19079 + }, + { + "epoch": 0.9516209476309226, + "grad_norm": 0.596608813394596, + "learning_rate": 3.06907662244324e-07, + "loss": 0.8957, + "step": 19080 + }, + { + "epoch": 0.9516708229426434, + "grad_norm": 0.5678994805250717, + "learning_rate": 3.062771334127579e-07, + "loss": 0.8511, + "step": 19081 + }, + { + "epoch": 0.9517206982543641, + "grad_norm": 0.6296450513379683, + "learning_rate": 3.056472489528367e-07, + "loss": 0.9115, + "step": 19082 + }, + { + "epoch": 0.9517705735660847, + "grad_norm": 0.7941514026656634, + "learning_rate": 3.050180088809973e-07, + "loss": 0.8603, + "step": 19083 + }, + { + "epoch": 0.9518204488778055, + "grad_norm": 0.7053528251270613, + "learning_rate": 3.0438941321365985e-07, + "loss": 0.8272, + "step": 19084 + }, + { + "epoch": 0.9518703241895262, + "grad_norm": 0.6061798932735261, + "learning_rate": 3.0376146196723075e-07, + "loss": 0.8237, + "step": 19085 + }, + { + "epoch": 0.9519201995012468, + "grad_norm": 0.528383576964443, + "learning_rate": 3.0313415515809407e-07, + "loss": 0.8698, + "step": 19086 + }, + { + "epoch": 0.9519700748129676, + "grad_norm": 0.6105721615272992, + "learning_rate": 3.0250749280262003e-07, + "loss": 0.8756, + "step": 19087 + }, + { + "epoch": 0.9520199501246883, + "grad_norm": 0.6414267056963072, + "learning_rate": 3.018814749171567e-07, + "loss": 0.8771, + "step": 19088 + }, + { + "epoch": 0.952069825436409, + "grad_norm": 1.0055994654469977, + "learning_rate": 3.0125610151804374e-07, + "loss": 0.838, + "step": 19089 + }, + { + "epoch": 0.9521197007481297, + "grad_norm": 0.5703856477643311, + "learning_rate": 3.006313726215987e-07, + "loss": 0.8177, + "step": 19090 + }, + { + "epoch": 0.9521695760598504, + "grad_norm": 0.6057872655861217, + "learning_rate": 3.0000728824412517e-07, + "loss": 0.8179, + "step": 19091 + }, + { + "epoch": 0.952219451371571, + "grad_norm": 0.6098103901671791, + "learning_rate": 2.993838484019046e-07, + "loss": 0.8429, + "step": 19092 + }, + { + "epoch": 0.9522693266832918, + "grad_norm": 0.6336708405391728, + "learning_rate": 2.9876105311121285e-07, + "loss": 0.8814, + "step": 19093 + }, + { + "epoch": 0.9523192019950125, + "grad_norm": 0.7590498459241823, + "learning_rate": 2.9813890238829243e-07, + "loss": 0.8734, + "step": 19094 + }, + { + "epoch": 0.9523690773067331, + "grad_norm": 0.5068574294858395, + "learning_rate": 2.9751739624938324e-07, + "loss": 0.8187, + "step": 19095 + }, + { + "epoch": 0.9524189526184539, + "grad_norm": 0.521263368188051, + "learning_rate": 2.968965347107028e-07, + "loss": 0.8191, + "step": 19096 + }, + { + "epoch": 0.9524688279301745, + "grad_norm": 1.000907221792675, + "learning_rate": 2.96276317788452e-07, + "loss": 0.8381, + "step": 19097 + }, + { + "epoch": 0.9525187032418952, + "grad_norm": 0.5833666529583212, + "learning_rate": 2.956567454988124e-07, + "loss": 0.8226, + "step": 19098 + }, + { + "epoch": 0.952568578553616, + "grad_norm": 0.758977675841511, + "learning_rate": 2.9503781785795713e-07, + "loss": 0.8377, + "step": 19099 + }, + { + "epoch": 0.9526184538653366, + "grad_norm": 2.4949941529055417, + "learning_rate": 2.944195348820317e-07, + "loss": 0.839, + "step": 19100 + }, + { + "epoch": 0.9526683291770573, + "grad_norm": 0.746951416764062, + "learning_rate": 2.9380189658717027e-07, + "loss": 0.8871, + "step": 19101 + }, + { + "epoch": 0.9527182044887781, + "grad_norm": 0.6383734070303465, + "learning_rate": 2.9318490298949344e-07, + "loss": 0.8112, + "step": 19102 + }, + { + "epoch": 0.9527680798004987, + "grad_norm": 0.6901480331911038, + "learning_rate": 2.925685541050993e-07, + "loss": 0.8722, + "step": 19103 + }, + { + "epoch": 0.9528179551122195, + "grad_norm": 0.5880759102425714, + "learning_rate": 2.9195284995006956e-07, + "loss": 0.815, + "step": 19104 + }, + { + "epoch": 0.9528678304239402, + "grad_norm": 0.6407403815140689, + "learning_rate": 2.913377905404746e-07, + "loss": 0.839, + "step": 19105 + }, + { + "epoch": 0.9529177057356608, + "grad_norm": 1.1216297386534642, + "learning_rate": 2.9072337589235997e-07, + "loss": 0.8298, + "step": 19106 + }, + { + "epoch": 0.9529675810473816, + "grad_norm": 0.7755576246807429, + "learning_rate": 2.9010960602176287e-07, + "loss": 0.866, + "step": 19107 + }, + { + "epoch": 0.9530174563591023, + "grad_norm": 1.2222956355218064, + "learning_rate": 2.894964809446926e-07, + "loss": 0.8809, + "step": 19108 + }, + { + "epoch": 0.9530673316708229, + "grad_norm": 1.7410476811565299, + "learning_rate": 2.888840006771559e-07, + "loss": 0.815, + "step": 19109 + }, + { + "epoch": 0.9531172069825437, + "grad_norm": 0.6229960071161809, + "learning_rate": 2.882721652351317e-07, + "loss": 0.8758, + "step": 19110 + }, + { + "epoch": 0.9531670822942644, + "grad_norm": 0.5856139311996768, + "learning_rate": 2.8766097463458486e-07, + "loss": 0.8105, + "step": 19111 + }, + { + "epoch": 0.953216957605985, + "grad_norm": 0.694951430711117, + "learning_rate": 2.8705042889146383e-07, + "loss": 0.7975, + "step": 19112 + }, + { + "epoch": 0.9532668329177058, + "grad_norm": 0.8042008987124923, + "learning_rate": 2.8644052802170583e-07, + "loss": 0.8617, + "step": 19113 + }, + { + "epoch": 0.9533167082294264, + "grad_norm": 1.4055996104503596, + "learning_rate": 2.8583127204121765e-07, + "loss": 0.8535, + "step": 19114 + }, + { + "epoch": 0.9533665835411471, + "grad_norm": 0.6395876523978912, + "learning_rate": 2.852226609659059e-07, + "loss": 0.857, + "step": 19115 + }, + { + "epoch": 0.9534164588528679, + "grad_norm": 0.9516022624707428, + "learning_rate": 2.846146948116468e-07, + "loss": 0.8741, + "step": 19116 + }, + { + "epoch": 0.9534663341645885, + "grad_norm": 0.6801785503884874, + "learning_rate": 2.8400737359430273e-07, + "loss": 0.8129, + "step": 19117 + }, + { + "epoch": 0.9535162094763092, + "grad_norm": 0.5813476636486984, + "learning_rate": 2.8340069732972764e-07, + "loss": 0.7985, + "step": 19118 + }, + { + "epoch": 0.95356608478803, + "grad_norm": 0.5719557176084908, + "learning_rate": 2.827946660337505e-07, + "loss": 0.849, + "step": 19119 + }, + { + "epoch": 0.9536159600997506, + "grad_norm": 0.5258645973737988, + "learning_rate": 2.821892797221809e-07, + "loss": 0.871, + "step": 19120 + }, + { + "epoch": 0.9536658354114713, + "grad_norm": 0.5148841223056785, + "learning_rate": 2.815845384108229e-07, + "loss": 0.7757, + "step": 19121 + }, + { + "epoch": 0.9537157107231921, + "grad_norm": 0.7120960425045557, + "learning_rate": 2.809804421154527e-07, + "loss": 0.876, + "step": 19122 + }, + { + "epoch": 0.9537655860349127, + "grad_norm": 0.5618390166019342, + "learning_rate": 2.8037699085183553e-07, + "loss": 0.847, + "step": 19123 + }, + { + "epoch": 0.9538154613466334, + "grad_norm": 0.6733848344540124, + "learning_rate": 2.7977418463571713e-07, + "loss": 0.8769, + "step": 19124 + }, + { + "epoch": 0.9538653366583542, + "grad_norm": 0.8523500548682891, + "learning_rate": 2.7917202348282933e-07, + "loss": 0.8114, + "step": 19125 + }, + { + "epoch": 0.9539152119700748, + "grad_norm": 0.6084773361219967, + "learning_rate": 2.785705074088818e-07, + "loss": 0.8258, + "step": 19126 + }, + { + "epoch": 0.9539650872817955, + "grad_norm": 0.5522293365220967, + "learning_rate": 2.779696364295758e-07, + "loss": 0.862, + "step": 19127 + }, + { + "epoch": 0.9540149625935163, + "grad_norm": 0.8184264826557103, + "learning_rate": 2.773694105605878e-07, + "loss": 0.8353, + "step": 19128 + }, + { + "epoch": 0.9540648379052369, + "grad_norm": 1.027062294509617, + "learning_rate": 2.767698298175775e-07, + "loss": 0.8245, + "step": 19129 + }, + { + "epoch": 0.9541147132169576, + "grad_norm": 0.5465552064949604, + "learning_rate": 2.7617089421619614e-07, + "loss": 0.8688, + "step": 19130 + }, + { + "epoch": 0.9541645885286784, + "grad_norm": 0.5490684180101016, + "learning_rate": 2.7557260377207015e-07, + "loss": 0.8872, + "step": 19131 + }, + { + "epoch": 0.954214463840399, + "grad_norm": 0.7278960160751023, + "learning_rate": 2.749749585008121e-07, + "loss": 0.8759, + "step": 19132 + }, + { + "epoch": 0.9542643391521197, + "grad_norm": 0.5370174745007054, + "learning_rate": 2.7437795841801505e-07, + "loss": 0.8354, + "step": 19133 + }, + { + "epoch": 0.9543142144638403, + "grad_norm": 0.534366934391775, + "learning_rate": 2.73781603539261e-07, + "loss": 0.8384, + "step": 19134 + }, + { + "epoch": 0.9543640897755611, + "grad_norm": 0.5478126912019545, + "learning_rate": 2.731858938801124e-07, + "loss": 0.8473, + "step": 19135 + }, + { + "epoch": 0.9544139650872818, + "grad_norm": 0.8080800638014971, + "learning_rate": 2.725908294561097e-07, + "loss": 0.8534, + "step": 19136 + }, + { + "epoch": 0.9544638403990024, + "grad_norm": 0.5921652772855293, + "learning_rate": 2.7199641028277936e-07, + "loss": 0.7834, + "step": 19137 + }, + { + "epoch": 0.9545137157107232, + "grad_norm": 0.53603019486493, + "learning_rate": 2.714026363756422e-07, + "loss": 0.7909, + "step": 19138 + }, + { + "epoch": 0.9545635910224439, + "grad_norm": 1.3660365998805883, + "learning_rate": 2.7080950775018043e-07, + "loss": 0.8392, + "step": 19139 + }, + { + "epoch": 0.9546134663341646, + "grad_norm": 1.1602249730860215, + "learning_rate": 2.702170244218788e-07, + "loss": 0.8218, + "step": 19140 + }, + { + "epoch": 0.9546633416458853, + "grad_norm": 0.6369746820082854, + "learning_rate": 2.6962518640619716e-07, + "loss": 0.8542, + "step": 19141 + }, + { + "epoch": 0.954713216957606, + "grad_norm": 0.6019395845186328, + "learning_rate": 2.690339937185815e-07, + "loss": 0.8491, + "step": 19142 + }, + { + "epoch": 0.9547630922693267, + "grad_norm": 1.0274410518904071, + "learning_rate": 2.6844344637445007e-07, + "loss": 0.8803, + "step": 19143 + }, + { + "epoch": 0.9548129675810474, + "grad_norm": 0.5438741626994009, + "learning_rate": 2.6785354438922106e-07, + "loss": 0.8528, + "step": 19144 + }, + { + "epoch": 0.9548628428927681, + "grad_norm": 0.6882653473927139, + "learning_rate": 2.6726428777828496e-07, + "loss": 0.815, + "step": 19145 + }, + { + "epoch": 0.9549127182044888, + "grad_norm": 0.7578216914188148, + "learning_rate": 2.6667567655701833e-07, + "loss": 0.8501, + "step": 19146 + }, + { + "epoch": 0.9549625935162095, + "grad_norm": 0.5963934351029754, + "learning_rate": 2.660877107407811e-07, + "loss": 0.8191, + "step": 19147 + }, + { + "epoch": 0.9550124688279302, + "grad_norm": 0.5957596800846205, + "learning_rate": 2.655003903449138e-07, + "loss": 0.8762, + "step": 19148 + }, + { + "epoch": 0.9550623441396509, + "grad_norm": 0.656394161744319, + "learning_rate": 2.6491371538474585e-07, + "loss": 0.8429, + "step": 19149 + }, + { + "epoch": 0.9551122194513716, + "grad_norm": 0.6026979230438209, + "learning_rate": 2.643276858755844e-07, + "loss": 0.8622, + "step": 19150 + }, + { + "epoch": 0.9551620947630922, + "grad_norm": 0.5821509304034325, + "learning_rate": 2.6374230183272284e-07, + "loss": 0.837, + "step": 19151 + }, + { + "epoch": 0.955211970074813, + "grad_norm": 0.9128399925500601, + "learning_rate": 2.63157563271435e-07, + "loss": 0.861, + "step": 19152 + }, + { + "epoch": 0.9552618453865337, + "grad_norm": 0.7251910538652554, + "learning_rate": 2.625734702069782e-07, + "loss": 0.8672, + "step": 19153 + }, + { + "epoch": 0.9553117206982543, + "grad_norm": 0.566369698538965, + "learning_rate": 2.619900226545957e-07, + "loss": 0.8341, + "step": 19154 + }, + { + "epoch": 0.9553615960099751, + "grad_norm": 0.8923667199792844, + "learning_rate": 2.614072206295115e-07, + "loss": 0.8597, + "step": 19155 + }, + { + "epoch": 0.9554114713216958, + "grad_norm": 1.1826323620556995, + "learning_rate": 2.6082506414693843e-07, + "loss": 0.8242, + "step": 19156 + }, + { + "epoch": 0.9554613466334164, + "grad_norm": 0.5283084593270984, + "learning_rate": 2.602435532220615e-07, + "loss": 0.852, + "step": 19157 + }, + { + "epoch": 0.9555112219451372, + "grad_norm": 0.6325098042637322, + "learning_rate": 2.5966268787005476e-07, + "loss": 0.8308, + "step": 19158 + }, + { + "epoch": 0.9555610972568579, + "grad_norm": 0.7694572420846499, + "learning_rate": 2.590824681060783e-07, + "loss": 0.8473, + "step": 19159 + }, + { + "epoch": 0.9556109725685785, + "grad_norm": 0.5943004128262194, + "learning_rate": 2.585028939452755e-07, + "loss": 0.843, + "step": 19160 + }, + { + "epoch": 0.9556608478802993, + "grad_norm": 0.5598624351453984, + "learning_rate": 2.5792396540276486e-07, + "loss": 0.8372, + "step": 19161 + }, + { + "epoch": 0.95571072319202, + "grad_norm": 0.5899481256885665, + "learning_rate": 2.573456824936538e-07, + "loss": 0.8273, + "step": 19162 + }, + { + "epoch": 0.9557605985037406, + "grad_norm": 0.7537565803183307, + "learning_rate": 2.5676804523303564e-07, + "loss": 0.824, + "step": 19163 + }, + { + "epoch": 0.9558104738154614, + "grad_norm": 0.5441802825284633, + "learning_rate": 2.5619105363598185e-07, + "loss": 0.8596, + "step": 19164 + }, + { + "epoch": 0.9558603491271821, + "grad_norm": 0.6402610363465134, + "learning_rate": 2.5561470771754694e-07, + "loss": 0.8676, + "step": 19165 + }, + { + "epoch": 0.9559102244389027, + "grad_norm": 0.6372671368592403, + "learning_rate": 2.5503900749277167e-07, + "loss": 0.8134, + "step": 19166 + }, + { + "epoch": 0.9559600997506235, + "grad_norm": 0.6059832388886602, + "learning_rate": 2.544639529766829e-07, + "loss": 0.8077, + "step": 19167 + }, + { + "epoch": 0.9560099750623441, + "grad_norm": 0.5958110344270275, + "learning_rate": 2.538895441842798e-07, + "loss": 0.7875, + "step": 19168 + }, + { + "epoch": 0.9560598503740648, + "grad_norm": 0.6893097447381143, + "learning_rate": 2.533157811305559e-07, + "loss": 0.8445, + "step": 19169 + }, + { + "epoch": 0.9561097256857856, + "grad_norm": 1.1433994546265946, + "learning_rate": 2.5274266383047694e-07, + "loss": 0.8041, + "step": 19170 + }, + { + "epoch": 0.9561596009975062, + "grad_norm": 0.6573213175006496, + "learning_rate": 2.521701922990061e-07, + "loss": 0.8247, + "step": 19171 + }, + { + "epoch": 0.9562094763092269, + "grad_norm": 0.7648738864590157, + "learning_rate": 2.515983665510813e-07, + "loss": 0.8696, + "step": 19172 + }, + { + "epoch": 0.9562593516209477, + "grad_norm": 0.5637362341396249, + "learning_rate": 2.510271866016184e-07, + "loss": 0.8406, + "step": 19173 + }, + { + "epoch": 0.9563092269326683, + "grad_norm": 0.6461758887306971, + "learning_rate": 2.5045665246552217e-07, + "loss": 0.7897, + "step": 19174 + }, + { + "epoch": 0.956359102244389, + "grad_norm": 0.6772296937570029, + "learning_rate": 2.498867641576891e-07, + "loss": 0.8231, + "step": 19175 + }, + { + "epoch": 0.9564089775561098, + "grad_norm": 0.5991467005078498, + "learning_rate": 2.493175216929794e-07, + "loss": 0.7919, + "step": 19176 + }, + { + "epoch": 0.9564588528678304, + "grad_norm": 0.7034342133266935, + "learning_rate": 2.487489250862535e-07, + "loss": 0.8585, + "step": 19177 + }, + { + "epoch": 0.9565087281795511, + "grad_norm": 0.702805643271102, + "learning_rate": 2.481809743523467e-07, + "loss": 0.8404, + "step": 19178 + }, + { + "epoch": 0.9565586034912719, + "grad_norm": 0.6511742014069529, + "learning_rate": 2.4761366950608043e-07, + "loss": 0.8481, + "step": 19179 + }, + { + "epoch": 0.9566084788029925, + "grad_norm": 1.1359803383274996, + "learning_rate": 2.4704701056225686e-07, + "loss": 0.8416, + "step": 19180 + }, + { + "epoch": 0.9566583541147132, + "grad_norm": 0.5608829057719645, + "learning_rate": 2.464809975356641e-07, + "loss": 0.8404, + "step": 19181 + }, + { + "epoch": 0.956708229426434, + "grad_norm": 0.6740938633652704, + "learning_rate": 2.459156304410709e-07, + "loss": 0.8443, + "step": 19182 + }, + { + "epoch": 0.9567581047381546, + "grad_norm": 0.7663157268385231, + "learning_rate": 2.453509092932349e-07, + "loss": 0.8544, + "step": 19183 + }, + { + "epoch": 0.9568079800498753, + "grad_norm": 0.5947931681182569, + "learning_rate": 2.447868341068832e-07, + "loss": 0.8146, + "step": 19184 + }, + { + "epoch": 0.956857855361596, + "grad_norm": 0.5425341574143329, + "learning_rate": 2.442234048967429e-07, + "loss": 0.8541, + "step": 19185 + }, + { + "epoch": 0.9569077306733167, + "grad_norm": 0.6507541521601043, + "learning_rate": 2.4366062167751055e-07, + "loss": 0.8668, + "step": 19186 + }, + { + "epoch": 0.9569576059850374, + "grad_norm": 0.5894236846141495, + "learning_rate": 2.430984844638773e-07, + "loss": 0.8372, + "step": 19187 + }, + { + "epoch": 0.9570074812967581, + "grad_norm": 0.7110259077287323, + "learning_rate": 2.425369932705091e-07, + "loss": 0.9111, + "step": 19188 + }, + { + "epoch": 0.9570573566084788, + "grad_norm": 0.5708053745932272, + "learning_rate": 2.4197614811205807e-07, + "loss": 0.8412, + "step": 19189 + }, + { + "epoch": 0.9571072319201995, + "grad_norm": 0.501813534691596, + "learning_rate": 2.4141594900315977e-07, + "loss": 0.8028, + "step": 19190 + }, + { + "epoch": 0.9571571072319202, + "grad_norm": 0.5528491502795386, + "learning_rate": 2.408563959584303e-07, + "loss": 0.8463, + "step": 19191 + }, + { + "epoch": 0.9572069825436409, + "grad_norm": 0.5948374107160052, + "learning_rate": 2.4029748899247194e-07, + "loss": 0.8545, + "step": 19192 + }, + { + "epoch": 0.9572568578553616, + "grad_norm": 1.2675090970456793, + "learning_rate": 2.397392281198729e-07, + "loss": 0.8115, + "step": 19193 + }, + { + "epoch": 0.9573067331670823, + "grad_norm": 0.654709590463888, + "learning_rate": 2.391816133551938e-07, + "loss": 0.8403, + "step": 19194 + }, + { + "epoch": 0.957356608478803, + "grad_norm": 0.8732124402513599, + "learning_rate": 2.3862464471298973e-07, + "loss": 0.8298, + "step": 19195 + }, + { + "epoch": 0.9574064837905237, + "grad_norm": 0.532027193185074, + "learning_rate": 2.3806832220779618e-07, + "loss": 0.8767, + "step": 19196 + }, + { + "epoch": 0.9574563591022444, + "grad_norm": 0.7378348257670071, + "learning_rate": 2.3751264585412658e-07, + "loss": 0.8642, + "step": 19197 + }, + { + "epoch": 0.9575062344139651, + "grad_norm": 0.5844458912861389, + "learning_rate": 2.3695761566648044e-07, + "loss": 0.8206, + "step": 19198 + }, + { + "epoch": 0.9575561097256858, + "grad_norm": 0.6222460098182142, + "learning_rate": 2.364032316593434e-07, + "loss": 0.8409, + "step": 19199 + }, + { + "epoch": 0.9576059850374065, + "grad_norm": 0.5604783405548935, + "learning_rate": 2.3584949384718446e-07, + "loss": 0.8957, + "step": 19200 + }, + { + "epoch": 0.9576558603491272, + "grad_norm": 0.6154400934463228, + "learning_rate": 2.352964022444476e-07, + "loss": 0.8158, + "step": 19201 + }, + { + "epoch": 0.9577057356608479, + "grad_norm": 0.8315289647341192, + "learning_rate": 2.347439568655685e-07, + "loss": 0.8373, + "step": 19202 + }, + { + "epoch": 0.9577556109725686, + "grad_norm": 0.5456683875119738, + "learning_rate": 2.3419215772496062e-07, + "loss": 0.818, + "step": 19203 + }, + { + "epoch": 0.9578054862842893, + "grad_norm": 0.5994709348172863, + "learning_rate": 2.3364100483702635e-07, + "loss": 0.7979, + "step": 19204 + }, + { + "epoch": 0.9578553615960099, + "grad_norm": 0.7214445375040605, + "learning_rate": 2.3309049821614583e-07, + "loss": 0.8483, + "step": 19205 + }, + { + "epoch": 0.9579052369077307, + "grad_norm": 0.6426095424939388, + "learning_rate": 2.3254063787668257e-07, + "loss": 0.8387, + "step": 19206 + }, + { + "epoch": 0.9579551122194514, + "grad_norm": 0.6059671123744163, + "learning_rate": 2.3199142383298898e-07, + "loss": 0.8427, + "step": 19207 + }, + { + "epoch": 0.958004987531172, + "grad_norm": 0.92722795100866, + "learning_rate": 2.3144285609939532e-07, + "loss": 0.8127, + "step": 19208 + }, + { + "epoch": 0.9580548628428928, + "grad_norm": 0.6530117993949103, + "learning_rate": 2.308949346902123e-07, + "loss": 0.802, + "step": 19209 + }, + { + "epoch": 0.9581047381546135, + "grad_norm": 0.6069662902397582, + "learning_rate": 2.3034765961974513e-07, + "loss": 0.8202, + "step": 19210 + }, + { + "epoch": 0.9581546134663341, + "grad_norm": 0.6738521175045606, + "learning_rate": 2.2980103090226578e-07, + "loss": 0.8774, + "step": 19211 + }, + { + "epoch": 0.9582044887780549, + "grad_norm": 0.683938367136476, + "learning_rate": 2.2925504855204617e-07, + "loss": 0.8713, + "step": 19212 + }, + { + "epoch": 0.9582543640897756, + "grad_norm": 0.6150187152590629, + "learning_rate": 2.2870971258332764e-07, + "loss": 0.8402, + "step": 19213 + }, + { + "epoch": 0.9583042394014962, + "grad_norm": 1.437177320977755, + "learning_rate": 2.2816502301034604e-07, + "loss": 0.8297, + "step": 19214 + }, + { + "epoch": 0.958354114713217, + "grad_norm": 0.5390645708870216, + "learning_rate": 2.2762097984730945e-07, + "loss": 0.8133, + "step": 19215 + }, + { + "epoch": 0.9584039900249377, + "grad_norm": 1.0747995936420822, + "learning_rate": 2.2707758310841764e-07, + "loss": 0.8405, + "step": 19216 + }, + { + "epoch": 0.9584538653366583, + "grad_norm": 0.7654731139779759, + "learning_rate": 2.2653483280784815e-07, + "loss": 0.857, + "step": 19217 + }, + { + "epoch": 0.9585037406483791, + "grad_norm": 0.6717094857383602, + "learning_rate": 2.2599272895976464e-07, + "loss": 0.8313, + "step": 19218 + }, + { + "epoch": 0.9585536159600998, + "grad_norm": 0.6188185048627586, + "learning_rate": 2.2545127157831413e-07, + "loss": 0.8358, + "step": 19219 + }, + { + "epoch": 0.9586034912718204, + "grad_norm": 0.7250121777441594, + "learning_rate": 2.24910460677627e-07, + "loss": 0.8331, + "step": 19220 + }, + { + "epoch": 0.9586533665835412, + "grad_norm": 0.7972434730470787, + "learning_rate": 2.243702962718114e-07, + "loss": 0.8652, + "step": 19221 + }, + { + "epoch": 0.9587032418952618, + "grad_norm": 0.6329570938313545, + "learning_rate": 2.2383077837496714e-07, + "loss": 0.8593, + "step": 19222 + }, + { + "epoch": 0.9587531172069825, + "grad_norm": 0.6140902842919331, + "learning_rate": 2.2329190700116908e-07, + "loss": 0.8226, + "step": 19223 + }, + { + "epoch": 0.9588029925187033, + "grad_norm": 1.23742965677022, + "learning_rate": 2.2275368216447823e-07, + "loss": 0.7869, + "step": 19224 + }, + { + "epoch": 0.9588528678304239, + "grad_norm": 0.6378084981936766, + "learning_rate": 2.2221610387894164e-07, + "loss": 0.8565, + "step": 19225 + }, + { + "epoch": 0.9589027431421446, + "grad_norm": 0.6627605148212427, + "learning_rate": 2.216791721585898e-07, + "loss": 0.8383, + "step": 19226 + }, + { + "epoch": 0.9589526184538654, + "grad_norm": 0.6267064974051064, + "learning_rate": 2.2114288701742812e-07, + "loss": 0.8549, + "step": 19227 + }, + { + "epoch": 0.959002493765586, + "grad_norm": 0.7194647822522425, + "learning_rate": 2.206072484694538e-07, + "loss": 0.849, + "step": 19228 + }, + { + "epoch": 0.9590523690773067, + "grad_norm": 0.5132736839155514, + "learning_rate": 2.200722565286445e-07, + "loss": 0.797, + "step": 19229 + }, + { + "epoch": 0.9591022443890275, + "grad_norm": 0.677268334635241, + "learning_rate": 2.1953791120895851e-07, + "loss": 0.8894, + "step": 19230 + }, + { + "epoch": 0.9591521197007481, + "grad_norm": 0.5349504703053317, + "learning_rate": 2.1900421252434023e-07, + "loss": 0.8469, + "step": 19231 + }, + { + "epoch": 0.9592019950124688, + "grad_norm": 0.5485904993482069, + "learning_rate": 2.1847116048871463e-07, + "loss": 0.8387, + "step": 19232 + }, + { + "epoch": 0.9592518703241896, + "grad_norm": 0.8412283551095285, + "learning_rate": 2.179387551159956e-07, + "loss": 0.8276, + "step": 19233 + }, + { + "epoch": 0.9593017456359102, + "grad_norm": 0.6208342892465294, + "learning_rate": 2.174069964200748e-07, + "loss": 0.7958, + "step": 19234 + }, + { + "epoch": 0.9593516209476309, + "grad_norm": 0.5449976375207964, + "learning_rate": 2.168758844148272e-07, + "loss": 0.814, + "step": 19235 + }, + { + "epoch": 0.9594014962593517, + "grad_norm": 0.717714830226066, + "learning_rate": 2.1634541911410845e-07, + "loss": 0.8436, + "step": 19236 + }, + { + "epoch": 0.9594513715710723, + "grad_norm": 0.5806394769368237, + "learning_rate": 2.158156005317685e-07, + "loss": 0.8087, + "step": 19237 + }, + { + "epoch": 0.959501246882793, + "grad_norm": 0.5063234321789173, + "learning_rate": 2.1528642868162695e-07, + "loss": 0.8307, + "step": 19238 + }, + { + "epoch": 0.9595511221945137, + "grad_norm": 0.6152417386516706, + "learning_rate": 2.1475790357749215e-07, + "loss": 0.8139, + "step": 19239 + }, + { + "epoch": 0.9596009975062344, + "grad_norm": 0.5114636104241169, + "learning_rate": 2.142300252331586e-07, + "loss": 0.8234, + "step": 19240 + }, + { + "epoch": 0.9596508728179551, + "grad_norm": 0.5720933375246522, + "learning_rate": 2.1370279366240143e-07, + "loss": 0.8826, + "step": 19241 + }, + { + "epoch": 0.9597007481296758, + "grad_norm": 0.7439218397500578, + "learning_rate": 2.1317620887897627e-07, + "loss": 0.8059, + "step": 19242 + }, + { + "epoch": 0.9597506234413965, + "grad_norm": 0.5834937208122117, + "learning_rate": 2.1265027089662216e-07, + "loss": 0.8844, + "step": 19243 + }, + { + "epoch": 0.9598004987531172, + "grad_norm": 0.7639413674813573, + "learning_rate": 2.1212497972906697e-07, + "loss": 0.8412, + "step": 19244 + }, + { + "epoch": 0.9598503740648379, + "grad_norm": 0.8050969707330514, + "learning_rate": 2.1160033539001922e-07, + "loss": 0.8611, + "step": 19245 + }, + { + "epoch": 0.9599002493765586, + "grad_norm": 0.87017438228777, + "learning_rate": 2.1107633789316518e-07, + "loss": 0.8285, + "step": 19246 + }, + { + "epoch": 0.9599501246882793, + "grad_norm": 0.55950386463631, + "learning_rate": 2.1055298725217997e-07, + "loss": 0.8648, + "step": 19247 + }, + { + "epoch": 0.96, + "grad_norm": 0.6277531766524334, + "learning_rate": 2.100302834807194e-07, + "loss": 0.8676, + "step": 19248 + }, + { + "epoch": 0.9600498753117207, + "grad_norm": 0.6101314890924412, + "learning_rate": 2.0950822659242808e-07, + "loss": 0.8613, + "step": 19249 + }, + { + "epoch": 0.9600997506234414, + "grad_norm": 0.8235913812556718, + "learning_rate": 2.0898681660092012e-07, + "loss": 0.8352, + "step": 19250 + }, + { + "epoch": 0.9601496259351621, + "grad_norm": 0.49732632357496787, + "learning_rate": 2.0846605351980963e-07, + "loss": 0.7925, + "step": 19251 + }, + { + "epoch": 0.9601995012468828, + "grad_norm": 1.1037508757283399, + "learning_rate": 2.0794593736267742e-07, + "loss": 0.7924, + "step": 19252 + }, + { + "epoch": 0.9602493765586035, + "grad_norm": 0.6793281264659431, + "learning_rate": 2.0742646814310706e-07, + "loss": 0.8338, + "step": 19253 + }, + { + "epoch": 0.9602992518703242, + "grad_norm": 0.7475299306504872, + "learning_rate": 2.069076458746405e-07, + "loss": 0.8538, + "step": 19254 + }, + { + "epoch": 0.9603491271820449, + "grad_norm": 0.5061838881878546, + "learning_rate": 2.0638947057082802e-07, + "loss": 0.8275, + "step": 19255 + }, + { + "epoch": 0.9603990024937656, + "grad_norm": 0.7949357393845342, + "learning_rate": 2.058719422451838e-07, + "loss": 0.8958, + "step": 19256 + }, + { + "epoch": 0.9604488778054863, + "grad_norm": 1.4647607855317368, + "learning_rate": 2.0535506091121648e-07, + "loss": 0.8288, + "step": 19257 + }, + { + "epoch": 0.960498753117207, + "grad_norm": 0.6731720107014035, + "learning_rate": 2.0483882658240972e-07, + "loss": 0.8436, + "step": 19258 + }, + { + "epoch": 0.9605486284289276, + "grad_norm": 0.6943112675073104, + "learning_rate": 2.0432323927223884e-07, + "loss": 0.92, + "step": 19259 + }, + { + "epoch": 0.9605985037406484, + "grad_norm": 0.8548161127139756, + "learning_rate": 2.0380829899415698e-07, + "loss": 0.8498, + "step": 19260 + }, + { + "epoch": 0.9606483790523691, + "grad_norm": 0.6060473843869583, + "learning_rate": 2.0329400576159785e-07, + "loss": 0.788, + "step": 19261 + }, + { + "epoch": 0.9606982543640897, + "grad_norm": 0.5939195727857453, + "learning_rate": 2.02780359587984e-07, + "loss": 0.8343, + "step": 19262 + }, + { + "epoch": 0.9607481296758105, + "grad_norm": 0.6498163587077503, + "learning_rate": 2.0226736048672147e-07, + "loss": 0.8799, + "step": 19263 + }, + { + "epoch": 0.9607980049875312, + "grad_norm": 0.6548271961150803, + "learning_rate": 2.017550084711911e-07, + "loss": 0.7915, + "step": 19264 + }, + { + "epoch": 0.9608478802992518, + "grad_norm": 0.6103567755465535, + "learning_rate": 2.012433035547684e-07, + "loss": 0.8003, + "step": 19265 + }, + { + "epoch": 0.9608977556109726, + "grad_norm": 0.6896770181355883, + "learning_rate": 2.0073224575080097e-07, + "loss": 0.87, + "step": 19266 + }, + { + "epoch": 0.9609476309226933, + "grad_norm": 0.5286548792838297, + "learning_rate": 2.0022183507262814e-07, + "loss": 0.8211, + "step": 19267 + }, + { + "epoch": 0.9609975062344139, + "grad_norm": 0.8263414000575471, + "learning_rate": 1.9971207153356707e-07, + "loss": 0.8305, + "step": 19268 + }, + { + "epoch": 0.9610473815461347, + "grad_norm": 0.621508283486172, + "learning_rate": 1.99202955146921e-07, + "loss": 0.8249, + "step": 19269 + }, + { + "epoch": 0.9610972568578554, + "grad_norm": 0.5411307963809779, + "learning_rate": 1.9869448592597372e-07, + "loss": 0.8464, + "step": 19270 + }, + { + "epoch": 0.961147132169576, + "grad_norm": 0.5990156134651332, + "learning_rate": 1.981866638839952e-07, + "loss": 0.8221, + "step": 19271 + }, + { + "epoch": 0.9611970074812968, + "grad_norm": 0.6266638646474074, + "learning_rate": 1.9767948903423316e-07, + "loss": 0.879, + "step": 19272 + }, + { + "epoch": 0.9612468827930175, + "grad_norm": 0.7185353548205029, + "learning_rate": 1.9717296138992702e-07, + "loss": 0.8246, + "step": 19273 + }, + { + "epoch": 0.9612967581047381, + "grad_norm": 0.5642351228497369, + "learning_rate": 1.966670809642912e-07, + "loss": 0.8179, + "step": 19274 + }, + { + "epoch": 0.9613466334164589, + "grad_norm": 0.5278340323203062, + "learning_rate": 1.9616184777052904e-07, + "loss": 0.8229, + "step": 19275 + }, + { + "epoch": 0.9613965087281795, + "grad_norm": 0.6351049316335693, + "learning_rate": 1.9565726182182166e-07, + "loss": 0.8658, + "step": 19276 + }, + { + "epoch": 0.9614463840399002, + "grad_norm": 0.6812173029325356, + "learning_rate": 1.9515332313133628e-07, + "loss": 0.8401, + "step": 19277 + }, + { + "epoch": 0.961496259351621, + "grad_norm": 1.027040973690292, + "learning_rate": 1.946500317122263e-07, + "loss": 0.8414, + "step": 19278 + }, + { + "epoch": 0.9615461346633416, + "grad_norm": 0.6225282777298424, + "learning_rate": 1.941473875776201e-07, + "loss": 0.8585, + "step": 19279 + }, + { + "epoch": 0.9615960099750623, + "grad_norm": 0.5693432682632711, + "learning_rate": 1.936453907406377e-07, + "loss": 0.8268, + "step": 19280 + }, + { + "epoch": 0.9616458852867831, + "grad_norm": 0.8852195297899106, + "learning_rate": 1.9314404121437423e-07, + "loss": 0.8807, + "step": 19281 + }, + { + "epoch": 0.9616957605985037, + "grad_norm": 0.6633983330323847, + "learning_rate": 1.9264333901191646e-07, + "loss": 0.8307, + "step": 19282 + }, + { + "epoch": 0.9617456359102244, + "grad_norm": 0.5128923630447689, + "learning_rate": 1.9214328414632889e-07, + "loss": 0.8411, + "step": 19283 + }, + { + "epoch": 0.9617955112219452, + "grad_norm": 1.081243556227653, + "learning_rate": 1.916438766306594e-07, + "loss": 0.8113, + "step": 19284 + }, + { + "epoch": 0.9618453865336658, + "grad_norm": 0.6456748648814437, + "learning_rate": 1.911451164779393e-07, + "loss": 0.8541, + "step": 19285 + }, + { + "epoch": 0.9618952618453865, + "grad_norm": 0.5527321857732338, + "learning_rate": 1.9064700370118593e-07, + "loss": 0.8475, + "step": 19286 + }, + { + "epoch": 0.9619451371571073, + "grad_norm": 0.5874640895575003, + "learning_rate": 1.9014953831339445e-07, + "loss": 0.8337, + "step": 19287 + }, + { + "epoch": 0.9619950124688279, + "grad_norm": 0.5390438613447265, + "learning_rate": 1.8965272032754889e-07, + "loss": 0.8042, + "step": 19288 + }, + { + "epoch": 0.9620448877805486, + "grad_norm": 0.6232185006997867, + "learning_rate": 1.8915654975661113e-07, + "loss": 0.8597, + "step": 19289 + }, + { + "epoch": 0.9620947630922694, + "grad_norm": 0.7077248554024665, + "learning_rate": 1.8866102661352914e-07, + "loss": 0.8381, + "step": 19290 + }, + { + "epoch": 0.96214463840399, + "grad_norm": 1.077511148180026, + "learning_rate": 1.8816615091123424e-07, + "loss": 0.8301, + "step": 19291 + }, + { + "epoch": 0.9621945137157107, + "grad_norm": 0.6581387710552412, + "learning_rate": 1.8767192266263834e-07, + "loss": 0.8632, + "step": 19292 + }, + { + "epoch": 0.9622443890274314, + "grad_norm": 0.6004591584959084, + "learning_rate": 1.8717834188063943e-07, + "loss": 0.8637, + "step": 19293 + }, + { + "epoch": 0.9622942643391521, + "grad_norm": 0.6592877922194986, + "learning_rate": 1.866854085781161e-07, + "loss": 0.8947, + "step": 19294 + }, + { + "epoch": 0.9623441396508728, + "grad_norm": 1.1632022188671656, + "learning_rate": 1.861931227679331e-07, + "loss": 0.7915, + "step": 19295 + }, + { + "epoch": 0.9623940149625935, + "grad_norm": 1.3494104003767917, + "learning_rate": 1.8570148446293568e-07, + "loss": 0.804, + "step": 19296 + }, + { + "epoch": 0.9624438902743142, + "grad_norm": 0.6437539110272453, + "learning_rate": 1.8521049367595245e-07, + "loss": 0.8588, + "step": 19297 + }, + { + "epoch": 0.9624937655860349, + "grad_norm": 0.7321576551741859, + "learning_rate": 1.847201504197954e-07, + "loss": 0.8525, + "step": 19298 + }, + { + "epoch": 0.9625436408977556, + "grad_norm": 0.588672801250469, + "learning_rate": 1.8423045470725987e-07, + "loss": 0.7818, + "step": 19299 + }, + { + "epoch": 0.9625935162094763, + "grad_norm": 0.7228047409688083, + "learning_rate": 1.837414065511245e-07, + "loss": 0.8372, + "step": 19300 + }, + { + "epoch": 0.962643391521197, + "grad_norm": 0.5893085552419713, + "learning_rate": 1.832530059641513e-07, + "loss": 0.8045, + "step": 19301 + }, + { + "epoch": 0.9626932668329177, + "grad_norm": 0.5917688098806074, + "learning_rate": 1.8276525295908564e-07, + "loss": 0.8247, + "step": 19302 + }, + { + "epoch": 0.9627431421446384, + "grad_norm": 0.6205441138680151, + "learning_rate": 1.8227814754865068e-07, + "loss": 0.8329, + "step": 19303 + }, + { + "epoch": 0.9627930174563591, + "grad_norm": 0.5105053493111897, + "learning_rate": 1.8179168974556126e-07, + "loss": 0.8048, + "step": 19304 + }, + { + "epoch": 0.9628428927680798, + "grad_norm": 0.4682532673871341, + "learning_rate": 1.8130587956251276e-07, + "loss": 0.8126, + "step": 19305 + }, + { + "epoch": 0.9628927680798005, + "grad_norm": 0.6711582152805148, + "learning_rate": 1.808207170121784e-07, + "loss": 0.8566, + "step": 19306 + }, + { + "epoch": 0.9629426433915212, + "grad_norm": 0.6704174349878226, + "learning_rate": 1.8033620210722024e-07, + "loss": 0.849, + "step": 19307 + }, + { + "epoch": 0.9629925187032419, + "grad_norm": 0.750121901871256, + "learning_rate": 1.7985233486028097e-07, + "loss": 0.8448, + "step": 19308 + }, + { + "epoch": 0.9630423940149626, + "grad_norm": 0.6507799887203334, + "learning_rate": 1.7936911528398658e-07, + "loss": 0.8578, + "step": 19309 + }, + { + "epoch": 0.9630922693266832, + "grad_norm": 0.5803117324898786, + "learning_rate": 1.7888654339094646e-07, + "loss": 0.8291, + "step": 19310 + }, + { + "epoch": 0.963142144638404, + "grad_norm": 0.5324194719874074, + "learning_rate": 1.7840461919375607e-07, + "loss": 0.8205, + "step": 19311 + }, + { + "epoch": 0.9631920199501247, + "grad_norm": 0.61697847280405, + "learning_rate": 1.779233427049859e-07, + "loss": 0.7971, + "step": 19312 + }, + { + "epoch": 0.9632418952618453, + "grad_norm": 0.5920653435611786, + "learning_rate": 1.7744271393719812e-07, + "loss": 0.8651, + "step": 19313 + }, + { + "epoch": 0.9632917705735661, + "grad_norm": 0.8416055042952272, + "learning_rate": 1.7696273290293276e-07, + "loss": 0.8152, + "step": 19314 + }, + { + "epoch": 0.9633416458852868, + "grad_norm": 0.700993718478955, + "learning_rate": 1.7648339961471582e-07, + "loss": 0.803, + "step": 19315 + }, + { + "epoch": 0.9633915211970074, + "grad_norm": 0.4881169398798315, + "learning_rate": 1.7600471408505681e-07, + "loss": 0.8278, + "step": 19316 + }, + { + "epoch": 0.9634413965087282, + "grad_norm": 0.5692552708150591, + "learning_rate": 1.7552667632644292e-07, + "loss": 0.854, + "step": 19317 + }, + { + "epoch": 0.9634912718204489, + "grad_norm": 1.0117826755094093, + "learning_rate": 1.7504928635135033e-07, + "loss": 0.8141, + "step": 19318 + }, + { + "epoch": 0.9635411471321695, + "grad_norm": 0.6642574956248709, + "learning_rate": 1.7457254417223846e-07, + "loss": 0.8067, + "step": 19319 + }, + { + "epoch": 0.9635910224438903, + "grad_norm": 0.596416633791334, + "learning_rate": 1.740964498015446e-07, + "loss": 0.8506, + "step": 19320 + }, + { + "epoch": 0.963640897755611, + "grad_norm": 0.5681233882336836, + "learning_rate": 1.7362100325169216e-07, + "loss": 0.837, + "step": 19321 + }, + { + "epoch": 0.9636907730673316, + "grad_norm": 0.6356363030745613, + "learning_rate": 1.7314620453509068e-07, + "loss": 0.8572, + "step": 19322 + }, + { + "epoch": 0.9637406483790524, + "grad_norm": 0.5793027294342725, + "learning_rate": 1.726720536641274e-07, + "loss": 0.8752, + "step": 19323 + }, + { + "epoch": 0.9637905236907731, + "grad_norm": 0.5559962529306868, + "learning_rate": 1.7219855065117585e-07, + "loss": 0.8309, + "step": 19324 + }, + { + "epoch": 0.9638403990024937, + "grad_norm": 0.6046697039785974, + "learning_rate": 1.7172569550858997e-07, + "loss": 0.8334, + "step": 19325 + }, + { + "epoch": 0.9638902743142145, + "grad_norm": 0.5835382629881007, + "learning_rate": 1.712534882487099e-07, + "loss": 0.8567, + "step": 19326 + }, + { + "epoch": 0.9639401496259352, + "grad_norm": 0.5149322149218355, + "learning_rate": 1.7078192888386192e-07, + "loss": 0.8048, + "step": 19327 + }, + { + "epoch": 0.9639900249376558, + "grad_norm": 0.6425260459369635, + "learning_rate": 1.7031101742634448e-07, + "loss": 0.8565, + "step": 19328 + }, + { + "epoch": 0.9640399002493766, + "grad_norm": 0.6661431259709677, + "learning_rate": 1.6984075388845055e-07, + "loss": 0.8424, + "step": 19329 + }, + { + "epoch": 0.9640897755610972, + "grad_norm": 0.5686171872546538, + "learning_rate": 1.693711382824481e-07, + "loss": 0.8301, + "step": 19330 + }, + { + "epoch": 0.9641396508728179, + "grad_norm": 0.5813619920290624, + "learning_rate": 1.6890217062059398e-07, + "loss": 0.8332, + "step": 19331 + }, + { + "epoch": 0.9641895261845387, + "grad_norm": 0.5714598949859631, + "learning_rate": 1.6843385091512288e-07, + "loss": 0.8148, + "step": 19332 + }, + { + "epoch": 0.9642394014962593, + "grad_norm": 0.692590381015824, + "learning_rate": 1.679661791782583e-07, + "loss": 0.8433, + "step": 19333 + }, + { + "epoch": 0.96428927680798, + "grad_norm": 0.5934373357096175, + "learning_rate": 1.674991554222044e-07, + "loss": 0.8453, + "step": 19334 + }, + { + "epoch": 0.9643391521197008, + "grad_norm": 0.6113355407341792, + "learning_rate": 1.6703277965914588e-07, + "loss": 0.8522, + "step": 19335 + }, + { + "epoch": 0.9643890274314214, + "grad_norm": 0.7992008858320614, + "learning_rate": 1.6656705190125077e-07, + "loss": 0.7737, + "step": 19336 + }, + { + "epoch": 0.9644389027431421, + "grad_norm": 0.5924468034746652, + "learning_rate": 1.6610197216067603e-07, + "loss": 0.8627, + "step": 19337 + }, + { + "epoch": 0.9644887780548629, + "grad_norm": 0.7548683603405529, + "learning_rate": 1.6563754044955915e-07, + "loss": 0.829, + "step": 19338 + }, + { + "epoch": 0.9645386533665835, + "grad_norm": 0.7163632246299063, + "learning_rate": 1.6517375678001267e-07, + "loss": 0.8577, + "step": 19339 + }, + { + "epoch": 0.9645885286783042, + "grad_norm": 0.6013149339843109, + "learning_rate": 1.6471062116414082e-07, + "loss": 0.8143, + "step": 19340 + }, + { + "epoch": 0.964638403990025, + "grad_norm": 0.5628636389661285, + "learning_rate": 1.642481336140339e-07, + "loss": 0.8148, + "step": 19341 + }, + { + "epoch": 0.9646882793017456, + "grad_norm": 0.5612793106338496, + "learning_rate": 1.6378629414175728e-07, + "loss": 0.8103, + "step": 19342 + }, + { + "epoch": 0.9647381546134663, + "grad_norm": 0.5415567476281482, + "learning_rate": 1.6332510275935963e-07, + "loss": 0.8522, + "step": 19343 + }, + { + "epoch": 0.9647880299251871, + "grad_norm": 1.3953492078875724, + "learning_rate": 1.6286455947887858e-07, + "loss": 0.867, + "step": 19344 + }, + { + "epoch": 0.9648379052369077, + "grad_norm": 0.5659918691206791, + "learning_rate": 1.6240466431233226e-07, + "loss": 0.8396, + "step": 19345 + }, + { + "epoch": 0.9648877805486284, + "grad_norm": 0.7172441096561886, + "learning_rate": 1.6194541727171942e-07, + "loss": 0.841, + "step": 19346 + }, + { + "epoch": 0.9649376558603491, + "grad_norm": 0.6242075135807232, + "learning_rate": 1.614868183690249e-07, + "loss": 0.8489, + "step": 19347 + }, + { + "epoch": 0.9649875311720698, + "grad_norm": 0.5639882988628303, + "learning_rate": 1.6102886761621695e-07, + "loss": 0.8119, + "step": 19348 + }, + { + "epoch": 0.9650374064837905, + "grad_norm": 0.6293180376218735, + "learning_rate": 1.605715650252415e-07, + "loss": 0.7846, + "step": 19349 + }, + { + "epoch": 0.9650872817955112, + "grad_norm": 0.5560929260260689, + "learning_rate": 1.6011491060803907e-07, + "loss": 0.8195, + "step": 19350 + }, + { + "epoch": 0.9651371571072319, + "grad_norm": 0.5629269598198293, + "learning_rate": 1.5965890437651677e-07, + "loss": 0.8304, + "step": 19351 + }, + { + "epoch": 0.9651870324189527, + "grad_norm": 0.6493753036183961, + "learning_rate": 1.5920354634258173e-07, + "loss": 0.8235, + "step": 19352 + }, + { + "epoch": 0.9652369077306733, + "grad_norm": 0.5772841770636077, + "learning_rate": 1.587488365181078e-07, + "loss": 0.8146, + "step": 19353 + }, + { + "epoch": 0.965286783042394, + "grad_norm": 0.5874027020526346, + "learning_rate": 1.582947749149688e-07, + "loss": 0.8559, + "step": 19354 + }, + { + "epoch": 0.9653366583541148, + "grad_norm": 0.631552523653318, + "learning_rate": 1.5784136154500805e-07, + "loss": 0.8102, + "step": 19355 + }, + { + "epoch": 0.9653865336658354, + "grad_norm": 0.6395896108575772, + "learning_rate": 1.5738859642006333e-07, + "loss": 0.8439, + "step": 19356 + }, + { + "epoch": 0.9654364089775561, + "grad_norm": 0.5704349126082768, + "learning_rate": 1.569364795519418e-07, + "loss": 0.8432, + "step": 19357 + }, + { + "epoch": 0.9654862842892769, + "grad_norm": 0.7275552305493991, + "learning_rate": 1.5648501095244516e-07, + "loss": 0.8451, + "step": 19358 + }, + { + "epoch": 0.9655361596009975, + "grad_norm": 0.5423892950564434, + "learning_rate": 1.560341906333529e-07, + "loss": 0.8549, + "step": 19359 + }, + { + "epoch": 0.9655860349127182, + "grad_norm": 0.8131048147288091, + "learning_rate": 1.5558401860643336e-07, + "loss": 0.7898, + "step": 19360 + }, + { + "epoch": 0.965635910224439, + "grad_norm": 0.6880341894821089, + "learning_rate": 1.5513449488342435e-07, + "loss": 0.8537, + "step": 19361 + }, + { + "epoch": 0.9656857855361596, + "grad_norm": 0.5718836417026311, + "learning_rate": 1.5468561947606376e-07, + "loss": 0.8132, + "step": 19362 + }, + { + "epoch": 0.9657356608478803, + "grad_norm": 0.6593735139145964, + "learning_rate": 1.5423739239606437e-07, + "loss": 0.8692, + "step": 19363 + }, + { + "epoch": 0.9657855361596009, + "grad_norm": 0.652483677374325, + "learning_rate": 1.5378981365511967e-07, + "loss": 0.8682, + "step": 19364 + }, + { + "epoch": 0.9658354114713217, + "grad_norm": 0.5340894264295265, + "learning_rate": 1.533428832649092e-07, + "loss": 0.8024, + "step": 19365 + }, + { + "epoch": 0.9658852867830424, + "grad_norm": 0.6161884050412078, + "learning_rate": 1.5289660123709858e-07, + "loss": 0.84, + "step": 19366 + }, + { + "epoch": 0.965935162094763, + "grad_norm": 0.6276463765222305, + "learning_rate": 1.5245096758332854e-07, + "loss": 0.8587, + "step": 19367 + }, + { + "epoch": 0.9659850374064838, + "grad_norm": 0.6137218410274458, + "learning_rate": 1.5200598231523144e-07, + "loss": 0.911, + "step": 19368 + }, + { + "epoch": 0.9660349127182045, + "grad_norm": 0.7566081951100445, + "learning_rate": 1.515616454444174e-07, + "loss": 0.8896, + "step": 19369 + }, + { + "epoch": 0.9660847880299251, + "grad_norm": 1.5566363732364288, + "learning_rate": 1.5111795698248e-07, + "loss": 0.845, + "step": 19370 + }, + { + "epoch": 0.9661346633416459, + "grad_norm": 0.9126222457585288, + "learning_rate": 1.5067491694100154e-07, + "loss": 0.8016, + "step": 19371 + }, + { + "epoch": 0.9661845386533666, + "grad_norm": 0.6841365378067598, + "learning_rate": 1.5023252533153954e-07, + "loss": 0.8332, + "step": 19372 + }, + { + "epoch": 0.9662344139650872, + "grad_norm": 0.6387348170231841, + "learning_rate": 1.497907821656347e-07, + "loss": 0.8499, + "step": 19373 + }, + { + "epoch": 0.966284289276808, + "grad_norm": 0.7029315515834925, + "learning_rate": 1.4934968745482226e-07, + "loss": 0.8319, + "step": 19374 + }, + { + "epoch": 0.9663341645885287, + "grad_norm": 0.5220817094499103, + "learning_rate": 1.489092412106069e-07, + "loss": 0.8209, + "step": 19375 + }, + { + "epoch": 0.9663840399002493, + "grad_norm": 0.6686061034184828, + "learning_rate": 1.484694434444822e-07, + "loss": 0.8573, + "step": 19376 + }, + { + "epoch": 0.9664339152119701, + "grad_norm": 0.7205944688818384, + "learning_rate": 1.4803029416792512e-07, + "loss": 0.8825, + "step": 19377 + }, + { + "epoch": 0.9664837905236908, + "grad_norm": 0.5920788029831825, + "learning_rate": 1.4759179339239593e-07, + "loss": 0.8774, + "step": 19378 + }, + { + "epoch": 0.9665336658354114, + "grad_norm": 0.696062609908464, + "learning_rate": 1.4715394112933822e-07, + "loss": 0.817, + "step": 19379 + }, + { + "epoch": 0.9665835411471322, + "grad_norm": 0.57846720620285, + "learning_rate": 1.4671673739017067e-07, + "loss": 0.8699, + "step": 19380 + }, + { + "epoch": 0.9666334164588529, + "grad_norm": 0.6285864492308487, + "learning_rate": 1.462801821863119e-07, + "loss": 0.8299, + "step": 19381 + }, + { + "epoch": 0.9666832917705735, + "grad_norm": 1.345094103097856, + "learning_rate": 1.4584427552914726e-07, + "loss": 0.866, + "step": 19382 + }, + { + "epoch": 0.9667331670822943, + "grad_norm": 1.6363385260592482, + "learning_rate": 1.4540901743005097e-07, + "loss": 0.8382, + "step": 19383 + }, + { + "epoch": 0.9667830423940149, + "grad_norm": 0.6674731805143697, + "learning_rate": 1.4497440790038618e-07, + "loss": 0.8791, + "step": 19384 + }, + { + "epoch": 0.9668329177057357, + "grad_norm": 0.6487377289811708, + "learning_rate": 1.4454044695148828e-07, + "loss": 0.8322, + "step": 19385 + }, + { + "epoch": 0.9668827930174564, + "grad_norm": 0.5255442182144727, + "learning_rate": 1.4410713459468427e-07, + "loss": 0.8252, + "step": 19386 + }, + { + "epoch": 0.966932668329177, + "grad_norm": 0.5608691455342827, + "learning_rate": 1.4367447084128184e-07, + "loss": 0.849, + "step": 19387 + }, + { + "epoch": 0.9669825436408978, + "grad_norm": 0.8138933027998138, + "learning_rate": 1.4324245570256633e-07, + "loss": 0.8192, + "step": 19388 + }, + { + "epoch": 0.9670324189526185, + "grad_norm": 0.7180055782440043, + "learning_rate": 1.4281108918981767e-07, + "loss": 0.8633, + "step": 19389 + }, + { + "epoch": 0.9670822942643391, + "grad_norm": 0.5822784161261508, + "learning_rate": 1.4238037131428795e-07, + "loss": 0.8078, + "step": 19390 + }, + { + "epoch": 0.9671321695760599, + "grad_norm": 0.5665587085050402, + "learning_rate": 1.4195030208721817e-07, + "loss": 0.8757, + "step": 19391 + }, + { + "epoch": 0.9671820448877806, + "grad_norm": 0.6170158975745883, + "learning_rate": 1.415208815198299e-07, + "loss": 0.8307, + "step": 19392 + }, + { + "epoch": 0.9672319201995012, + "grad_norm": 0.5478103153025026, + "learning_rate": 1.4109210962332808e-07, + "loss": 0.8225, + "step": 19393 + }, + { + "epoch": 0.967281795511222, + "grad_norm": 0.6042114940313554, + "learning_rate": 1.4066398640890378e-07, + "loss": 0.8302, + "step": 19394 + }, + { + "epoch": 0.9673316708229427, + "grad_norm": 0.5325863683260827, + "learning_rate": 1.402365118877258e-07, + "loss": 0.8214, + "step": 19395 + }, + { + "epoch": 0.9673815461346633, + "grad_norm": 0.6159950565044027, + "learning_rate": 1.398096860709519e-07, + "loss": 0.8441, + "step": 19396 + }, + { + "epoch": 0.967431421446384, + "grad_norm": 0.5155339214802678, + "learning_rate": 1.3938350896971764e-07, + "loss": 0.8281, + "step": 19397 + }, + { + "epoch": 0.9674812967581048, + "grad_norm": 0.564034595454654, + "learning_rate": 1.3895798059514187e-07, + "loss": 0.8683, + "step": 19398 + }, + { + "epoch": 0.9675311720698254, + "grad_norm": 0.6507050665773108, + "learning_rate": 1.3853310095833515e-07, + "loss": 0.8539, + "step": 19399 + }, + { + "epoch": 0.9675810473815462, + "grad_norm": 0.6049746186129145, + "learning_rate": 1.3810887007038033e-07, + "loss": 0.8567, + "step": 19400 + }, + { + "epoch": 0.9676309226932668, + "grad_norm": 0.5265427189019318, + "learning_rate": 1.3768528794234624e-07, + "loss": 0.8085, + "step": 19401 + }, + { + "epoch": 0.9676807980049875, + "grad_norm": 0.8301866594575114, + "learning_rate": 1.3726235458528803e-07, + "loss": 0.8251, + "step": 19402 + }, + { + "epoch": 0.9677306733167083, + "grad_norm": 0.7245836625347891, + "learning_rate": 1.3684007001024123e-07, + "loss": 0.8382, + "step": 19403 + }, + { + "epoch": 0.9677805486284289, + "grad_norm": 0.5714302348992186, + "learning_rate": 1.3641843422822486e-07, + "loss": 0.8127, + "step": 19404 + }, + { + "epoch": 0.9678304239401496, + "grad_norm": 0.6469732387724486, + "learning_rate": 1.35997447250244e-07, + "loss": 0.8292, + "step": 19405 + }, + { + "epoch": 0.9678802992518704, + "grad_norm": 0.9943960172980425, + "learning_rate": 1.3557710908727872e-07, + "loss": 0.8488, + "step": 19406 + }, + { + "epoch": 0.967930174563591, + "grad_norm": 0.6227575014911998, + "learning_rate": 1.351574197503036e-07, + "loss": 0.8505, + "step": 19407 + }, + { + "epoch": 0.9679800498753117, + "grad_norm": 0.6918472735260476, + "learning_rate": 1.3473837925026822e-07, + "loss": 0.8485, + "step": 19408 + }, + { + "epoch": 0.9680299251870325, + "grad_norm": 0.8805894348333807, + "learning_rate": 1.3431998759810548e-07, + "loss": 0.857, + "step": 19409 + }, + { + "epoch": 0.9680798004987531, + "grad_norm": 0.6128523500346749, + "learning_rate": 1.3390224480473447e-07, + "loss": 0.838, + "step": 19410 + }, + { + "epoch": 0.9681296758104738, + "grad_norm": 0.6128484647139326, + "learning_rate": 1.3348515088105474e-07, + "loss": 0.8515, + "step": 19411 + }, + { + "epoch": 0.9681795511221946, + "grad_norm": 0.6351311117129804, + "learning_rate": 1.3306870583795207e-07, + "loss": 0.8384, + "step": 19412 + }, + { + "epoch": 0.9682294264339152, + "grad_norm": 0.6761544167354243, + "learning_rate": 1.3265290968629273e-07, + "loss": 0.8411, + "step": 19413 + }, + { + "epoch": 0.9682793017456359, + "grad_norm": 0.626147815491574, + "learning_rate": 1.3223776243692641e-07, + "loss": 0.8312, + "step": 19414 + }, + { + "epoch": 0.9683291770573567, + "grad_norm": 0.5841631183791455, + "learning_rate": 1.3182326410068612e-07, + "loss": 0.8281, + "step": 19415 + }, + { + "epoch": 0.9683790523690773, + "grad_norm": 0.720711826318555, + "learning_rate": 1.3140941468838818e-07, + "loss": 0.8521, + "step": 19416 + }, + { + "epoch": 0.968428927680798, + "grad_norm": 0.5011797672378933, + "learning_rate": 1.309962142108323e-07, + "loss": 0.855, + "step": 19417 + }, + { + "epoch": 0.9684788029925187, + "grad_norm": 0.9096714055567401, + "learning_rate": 1.3058366267879872e-07, + "loss": 0.837, + "step": 19418 + }, + { + "epoch": 0.9685286783042394, + "grad_norm": 0.6329387061983442, + "learning_rate": 1.3017176010305666e-07, + "loss": 0.8234, + "step": 19419 + }, + { + "epoch": 0.9685785536159601, + "grad_norm": 0.5387054017764703, + "learning_rate": 1.2976050649435302e-07, + "loss": 0.7825, + "step": 19420 + }, + { + "epoch": 0.9686284289276808, + "grad_norm": 0.6192201417628194, + "learning_rate": 1.2934990186341535e-07, + "loss": 0.8134, + "step": 19421 + }, + { + "epoch": 0.9686783042394015, + "grad_norm": 0.6788216701781621, + "learning_rate": 1.2893994622096284e-07, + "loss": 0.8288, + "step": 19422 + }, + { + "epoch": 0.9687281795511222, + "grad_norm": 0.6246592974029589, + "learning_rate": 1.2853063957769253e-07, + "loss": 0.8483, + "step": 19423 + }, + { + "epoch": 0.9687780548628429, + "grad_norm": 0.5669538098964083, + "learning_rate": 1.2812198194428194e-07, + "loss": 0.7951, + "step": 19424 + }, + { + "epoch": 0.9688279301745636, + "grad_norm": 0.7697699012079833, + "learning_rate": 1.2771397333139757e-07, + "loss": 0.8412, + "step": 19425 + }, + { + "epoch": 0.9688778054862843, + "grad_norm": 0.6049979892699171, + "learning_rate": 1.2730661374968643e-07, + "loss": 0.845, + "step": 19426 + }, + { + "epoch": 0.968927680798005, + "grad_norm": 0.6784550533855332, + "learning_rate": 1.2689990320977896e-07, + "loss": 0.8468, + "step": 19427 + }, + { + "epoch": 0.9689775561097257, + "grad_norm": 0.6622694096342752, + "learning_rate": 1.2649384172228606e-07, + "loss": 0.8824, + "step": 19428 + }, + { + "epoch": 0.9690274314214464, + "grad_norm": 0.8058777650261129, + "learning_rate": 1.2608842929780207e-07, + "loss": 0.8706, + "step": 19429 + }, + { + "epoch": 0.969077306733167, + "grad_norm": 0.602662476778716, + "learning_rate": 1.2568366594691018e-07, + "loss": 0.8062, + "step": 19430 + }, + { + "epoch": 0.9691271820448878, + "grad_norm": 0.6499943239440071, + "learning_rate": 1.2527955168017135e-07, + "loss": 0.7865, + "step": 19431 + }, + { + "epoch": 0.9691770573566085, + "grad_norm": 0.5717740721080705, + "learning_rate": 1.2487608650813e-07, + "loss": 0.8485, + "step": 19432 + }, + { + "epoch": 0.9692269326683292, + "grad_norm": 0.9396887243255774, + "learning_rate": 1.2447327044131097e-07, + "loss": 0.8622, + "step": 19433 + }, + { + "epoch": 0.9692768079800499, + "grad_norm": 0.5435858107406476, + "learning_rate": 1.2407110349023366e-07, + "loss": 0.7803, + "step": 19434 + }, + { + "epoch": 0.9693266832917705, + "grad_norm": 0.541107889515931, + "learning_rate": 1.236695856653869e-07, + "loss": 0.8369, + "step": 19435 + }, + { + "epoch": 0.9693765586034913, + "grad_norm": 1.6297474509225356, + "learning_rate": 1.232687169772484e-07, + "loss": 0.8403, + "step": 19436 + }, + { + "epoch": 0.969426433915212, + "grad_norm": 0.6081135570486013, + "learning_rate": 1.2286849743627926e-07, + "loss": 0.8011, + "step": 19437 + }, + { + "epoch": 0.9694763092269326, + "grad_norm": 0.7327349517268344, + "learning_rate": 1.224689270529239e-07, + "loss": 0.8038, + "step": 19438 + }, + { + "epoch": 0.9695261845386534, + "grad_norm": 0.8434866784149098, + "learning_rate": 1.220700058376073e-07, + "loss": 0.8237, + "step": 19439 + }, + { + "epoch": 0.9695760598503741, + "grad_norm": 0.6121409385706728, + "learning_rate": 1.216717338007406e-07, + "loss": 0.8258, + "step": 19440 + }, + { + "epoch": 0.9696259351620947, + "grad_norm": 0.49039683544514473, + "learning_rate": 1.2127411095271545e-07, + "loss": 0.8061, + "step": 19441 + }, + { + "epoch": 0.9696758104738155, + "grad_norm": 0.662217366561434, + "learning_rate": 1.2087713730390692e-07, + "loss": 0.8204, + "step": 19442 + }, + { + "epoch": 0.9697256857855362, + "grad_norm": 0.5884490611963001, + "learning_rate": 1.2048081286467615e-07, + "loss": 0.8828, + "step": 19443 + }, + { + "epoch": 0.9697755610972568, + "grad_norm": 0.6083937231054382, + "learning_rate": 1.2008513764536488e-07, + "loss": 0.8843, + "step": 19444 + }, + { + "epoch": 0.9698254364089776, + "grad_norm": 0.742322133265654, + "learning_rate": 1.196901116562982e-07, + "loss": 0.7997, + "step": 19445 + }, + { + "epoch": 0.9698753117206983, + "grad_norm": 0.6569563777232255, + "learning_rate": 1.1929573490778167e-07, + "loss": 0.8716, + "step": 19446 + }, + { + "epoch": 0.9699251870324189, + "grad_norm": 0.5794165567960723, + "learning_rate": 1.1890200741010438e-07, + "loss": 0.814, + "step": 19447 + }, + { + "epoch": 0.9699750623441397, + "grad_norm": 0.599352028214965, + "learning_rate": 1.1850892917354694e-07, + "loss": 0.8154, + "step": 19448 + }, + { + "epoch": 0.9700249376558604, + "grad_norm": 0.5864083025785372, + "learning_rate": 1.1811650020836506e-07, + "loss": 0.8418, + "step": 19449 + }, + { + "epoch": 0.970074812967581, + "grad_norm": 0.6423852905350169, + "learning_rate": 1.1772472052479499e-07, + "loss": 0.8791, + "step": 19450 + }, + { + "epoch": 0.9701246882793018, + "grad_norm": 0.6203152100700813, + "learning_rate": 1.1733359013306466e-07, + "loss": 0.8601, + "step": 19451 + }, + { + "epoch": 0.9701745635910225, + "grad_norm": 0.6379669296504954, + "learning_rate": 1.1694310904337702e-07, + "loss": 0.8233, + "step": 19452 + }, + { + "epoch": 0.9702244389027431, + "grad_norm": 0.6721663975706096, + "learning_rate": 1.1655327726592113e-07, + "loss": 0.851, + "step": 19453 + }, + { + "epoch": 0.9702743142144639, + "grad_norm": 0.6503961720865158, + "learning_rate": 1.1616409481087498e-07, + "loss": 0.8543, + "step": 19454 + }, + { + "epoch": 0.9703241895261845, + "grad_norm": 1.0908437168085117, + "learning_rate": 1.1577556168838599e-07, + "loss": 0.8361, + "step": 19455 + }, + { + "epoch": 0.9703740648379052, + "grad_norm": 0.6713106262997621, + "learning_rate": 1.1538767790859883e-07, + "loss": 0.8473, + "step": 19456 + }, + { + "epoch": 0.970423940149626, + "grad_norm": 0.5811133911810455, + "learning_rate": 1.150004434816332e-07, + "loss": 0.8146, + "step": 19457 + }, + { + "epoch": 0.9704738154613466, + "grad_norm": 0.5069121564986027, + "learning_rate": 1.1461385841759486e-07, + "loss": 0.8174, + "step": 19458 + }, + { + "epoch": 0.9705236907730673, + "grad_norm": 0.7558556802391126, + "learning_rate": 1.1422792272657024e-07, + "loss": 0.8345, + "step": 19459 + }, + { + "epoch": 0.9705735660847881, + "grad_norm": 0.656310131761286, + "learning_rate": 1.1384263641862903e-07, + "loss": 0.8557, + "step": 19460 + }, + { + "epoch": 0.9706234413965087, + "grad_norm": 0.7205040511264199, + "learning_rate": 1.1345799950382985e-07, + "loss": 0.8626, + "step": 19461 + }, + { + "epoch": 0.9706733167082294, + "grad_norm": 0.6148315895095493, + "learning_rate": 1.1307401199220358e-07, + "loss": 0.8539, + "step": 19462 + }, + { + "epoch": 0.9707231920199502, + "grad_norm": 0.6202321314241568, + "learning_rate": 1.1269067389377553e-07, + "loss": 0.8336, + "step": 19463 + }, + { + "epoch": 0.9707730673316708, + "grad_norm": 0.5900017052406757, + "learning_rate": 1.1230798521854325e-07, + "loss": 0.8458, + "step": 19464 + }, + { + "epoch": 0.9708229426433915, + "grad_norm": 0.5809793549551806, + "learning_rate": 1.1192594597649597e-07, + "loss": 0.8924, + "step": 19465 + }, + { + "epoch": 0.9708728179551123, + "grad_norm": 0.5446570546529446, + "learning_rate": 1.1154455617760351e-07, + "loss": 0.8491, + "step": 19466 + }, + { + "epoch": 0.9709226932668329, + "grad_norm": 0.5812381079240517, + "learning_rate": 1.1116381583181901e-07, + "loss": 0.827, + "step": 19467 + }, + { + "epoch": 0.9709725685785536, + "grad_norm": 0.7245668591323905, + "learning_rate": 1.107837249490734e-07, + "loss": 0.8503, + "step": 19468 + }, + { + "epoch": 0.9710224438902744, + "grad_norm": 0.671737102518107, + "learning_rate": 1.1040428353928933e-07, + "loss": 0.8093, + "step": 19469 + }, + { + "epoch": 0.971072319201995, + "grad_norm": 0.5811941596672351, + "learning_rate": 1.100254916123672e-07, + "loss": 0.8539, + "step": 19470 + }, + { + "epoch": 0.9711221945137157, + "grad_norm": 0.7450067827308575, + "learning_rate": 1.0964734917818798e-07, + "loss": 0.8452, + "step": 19471 + }, + { + "epoch": 0.9711720698254364, + "grad_norm": 0.6373758163243167, + "learning_rate": 1.0926985624662434e-07, + "loss": 0.8157, + "step": 19472 + }, + { + "epoch": 0.9712219451371571, + "grad_norm": 0.604833924242158, + "learning_rate": 1.0889301282752118e-07, + "loss": 0.7903, + "step": 19473 + }, + { + "epoch": 0.9712718204488778, + "grad_norm": 0.861703478450829, + "learning_rate": 1.0851681893071785e-07, + "loss": 0.8533, + "step": 19474 + }, + { + "epoch": 0.9713216957605985, + "grad_norm": 0.6474737237910748, + "learning_rate": 1.0814127456602596e-07, + "loss": 0.8278, + "step": 19475 + }, + { + "epoch": 0.9713715710723192, + "grad_norm": 0.6528984082665961, + "learning_rate": 1.0776637974324877e-07, + "loss": 0.8204, + "step": 19476 + }, + { + "epoch": 0.9714214463840399, + "grad_norm": 0.554266140741836, + "learning_rate": 1.0739213447216734e-07, + "loss": 0.8358, + "step": 19477 + }, + { + "epoch": 0.9714713216957606, + "grad_norm": 0.5730880923019385, + "learning_rate": 1.0701853876254609e-07, + "loss": 0.8626, + "step": 19478 + }, + { + "epoch": 0.9715211970074813, + "grad_norm": 0.7620184502122733, + "learning_rate": 1.0664559262413831e-07, + "loss": 0.8687, + "step": 19479 + }, + { + "epoch": 0.971571072319202, + "grad_norm": 0.994257334646246, + "learning_rate": 1.0627329606666958e-07, + "loss": 0.8296, + "step": 19480 + }, + { + "epoch": 0.9716209476309227, + "grad_norm": 0.6060587406648665, + "learning_rate": 1.0590164909985989e-07, + "loss": 0.8327, + "step": 19481 + }, + { + "epoch": 0.9716708229426434, + "grad_norm": 0.5998336158184149, + "learning_rate": 1.0553065173340704e-07, + "loss": 0.8959, + "step": 19482 + }, + { + "epoch": 0.9717206982543641, + "grad_norm": 0.5637549180739307, + "learning_rate": 1.051603039769894e-07, + "loss": 0.8013, + "step": 19483 + }, + { + "epoch": 0.9717705735660848, + "grad_norm": 0.6344000458327875, + "learning_rate": 1.0479060584027145e-07, + "loss": 0.8596, + "step": 19484 + }, + { + "epoch": 0.9718204488778055, + "grad_norm": 0.6311337891472075, + "learning_rate": 1.0442155733290105e-07, + "loss": 0.8406, + "step": 19485 + }, + { + "epoch": 0.9718703241895262, + "grad_norm": 0.6753959074589749, + "learning_rate": 1.0405315846450936e-07, + "loss": 0.8251, + "step": 19486 + }, + { + "epoch": 0.9719201995012469, + "grad_norm": 0.5971528558255145, + "learning_rate": 1.0368540924470816e-07, + "loss": 0.8712, + "step": 19487 + }, + { + "epoch": 0.9719700748129676, + "grad_norm": 0.5932539122449929, + "learning_rate": 1.0331830968309253e-07, + "loss": 0.8367, + "step": 19488 + }, + { + "epoch": 0.9720199501246882, + "grad_norm": 0.6166602537368431, + "learning_rate": 1.0295185978924371e-07, + "loss": 0.8176, + "step": 19489 + }, + { + "epoch": 0.972069825436409, + "grad_norm": 0.6882992775128766, + "learning_rate": 1.0258605957272627e-07, + "loss": 0.8627, + "step": 19490 + }, + { + "epoch": 0.9721197007481297, + "grad_norm": 0.6823501757589023, + "learning_rate": 1.0222090904308257e-07, + "loss": 0.868, + "step": 19491 + }, + { + "epoch": 0.9721695760598503, + "grad_norm": 0.630261239561223, + "learning_rate": 1.0185640820983832e-07, + "loss": 0.8401, + "step": 19492 + }, + { + "epoch": 0.9722194513715711, + "grad_norm": 0.6737331452473247, + "learning_rate": 1.0149255708251094e-07, + "loss": 0.8635, + "step": 19493 + }, + { + "epoch": 0.9722693266832918, + "grad_norm": 0.6040004725321567, + "learning_rate": 1.0112935567059001e-07, + "loss": 0.8371, + "step": 19494 + }, + { + "epoch": 0.9723192019950124, + "grad_norm": 0.86871853926261, + "learning_rate": 1.0076680398355686e-07, + "loss": 0.8592, + "step": 19495 + }, + { + "epoch": 0.9723690773067332, + "grad_norm": 0.9568019152740368, + "learning_rate": 1.0040490203087061e-07, + "loss": 0.8637, + "step": 19496 + }, + { + "epoch": 0.9724189526184539, + "grad_norm": 0.6516003660033902, + "learning_rate": 1.0004364982197368e-07, + "loss": 0.8281, + "step": 19497 + }, + { + "epoch": 0.9724688279301745, + "grad_norm": 0.6259348545287399, + "learning_rate": 9.968304736629464e-08, + "loss": 0.8809, + "step": 19498 + }, + { + "epoch": 0.9725187032418953, + "grad_norm": 0.5912712235838009, + "learning_rate": 9.932309467324264e-08, + "loss": 0.8422, + "step": 19499 + }, + { + "epoch": 0.972568578553616, + "grad_norm": 1.1774747850093166, + "learning_rate": 9.896379175221016e-08, + "loss": 0.8416, + "step": 19500 + }, + { + "epoch": 0.9726184538653366, + "grad_norm": 0.5708198060643955, + "learning_rate": 9.860513861257304e-08, + "loss": 0.8415, + "step": 19501 + }, + { + "epoch": 0.9726683291770574, + "grad_norm": 1.3117152740364777, + "learning_rate": 9.824713526369046e-08, + "loss": 0.8555, + "step": 19502 + }, + { + "epoch": 0.9727182044887781, + "grad_norm": 0.6127261443681014, + "learning_rate": 9.788978171490492e-08, + "loss": 0.8738, + "step": 19503 + }, + { + "epoch": 0.9727680798004987, + "grad_norm": 0.5460366935039787, + "learning_rate": 9.753307797554234e-08, + "loss": 0.8581, + "step": 19504 + }, + { + "epoch": 0.9728179551122195, + "grad_norm": 0.6102432752278423, + "learning_rate": 9.717702405490636e-08, + "loss": 0.8199, + "step": 19505 + }, + { + "epoch": 0.9728678304239401, + "grad_norm": 0.6257372169738917, + "learning_rate": 9.682161996229233e-08, + "loss": 0.8082, + "step": 19506 + }, + { + "epoch": 0.9729177057356608, + "grad_norm": 0.8832313849343771, + "learning_rate": 9.646686570697061e-08, + "loss": 0.8149, + "step": 19507 + }, + { + "epoch": 0.9729675810473816, + "grad_norm": 0.7484641061050985, + "learning_rate": 9.611276129820323e-08, + "loss": 0.8677, + "step": 19508 + }, + { + "epoch": 0.9730174563591022, + "grad_norm": 0.6030845425111522, + "learning_rate": 9.575930674522727e-08, + "loss": 0.7908, + "step": 19509 + }, + { + "epoch": 0.9730673316708229, + "grad_norm": 0.7823435008665848, + "learning_rate": 9.540650205726587e-08, + "loss": 0.8936, + "step": 19510 + }, + { + "epoch": 0.9731172069825437, + "grad_norm": 0.6525885325433217, + "learning_rate": 9.505434724352835e-08, + "loss": 0.8219, + "step": 19511 + }, + { + "epoch": 0.9731670822942643, + "grad_norm": 1.602613930873058, + "learning_rate": 9.470284231319904e-08, + "loss": 0.8833, + "step": 19512 + }, + { + "epoch": 0.973216957605985, + "grad_norm": 0.5966184681137359, + "learning_rate": 9.435198727545391e-08, + "loss": 0.8523, + "step": 19513 + }, + { + "epoch": 0.9732668329177058, + "grad_norm": 0.6435960360608342, + "learning_rate": 9.400178213944677e-08, + "loss": 0.8293, + "step": 19514 + }, + { + "epoch": 0.9733167082294264, + "grad_norm": 0.7251558443448636, + "learning_rate": 9.365222691431752e-08, + "loss": 0.8282, + "step": 19515 + }, + { + "epoch": 0.9733665835411471, + "grad_norm": 1.1414834442025623, + "learning_rate": 9.330332160918665e-08, + "loss": 0.8498, + "step": 19516 + }, + { + "epoch": 0.9734164588528679, + "grad_norm": 0.6583195012819651, + "learning_rate": 9.295506623315797e-08, + "loss": 0.843, + "step": 19517 + }, + { + "epoch": 0.9734663341645885, + "grad_norm": 0.6602160677700029, + "learning_rate": 9.260746079532146e-08, + "loss": 0.8306, + "step": 19518 + }, + { + "epoch": 0.9735162094763092, + "grad_norm": 1.4605213329023914, + "learning_rate": 9.226050530474206e-08, + "loss": 0.842, + "step": 19519 + }, + { + "epoch": 0.97356608478803, + "grad_norm": 0.6884545157236912, + "learning_rate": 9.191419977048199e-08, + "loss": 0.8488, + "step": 19520 + }, + { + "epoch": 0.9736159600997506, + "grad_norm": 0.6713385934293653, + "learning_rate": 9.156854420157013e-08, + "loss": 0.8266, + "step": 19521 + }, + { + "epoch": 0.9736658354114713, + "grad_norm": 0.6206704623064186, + "learning_rate": 9.122353860703258e-08, + "loss": 0.8388, + "step": 19522 + }, + { + "epoch": 0.9737157107231921, + "grad_norm": 0.7171468571148959, + "learning_rate": 9.087918299586773e-08, + "loss": 0.8871, + "step": 19523 + }, + { + "epoch": 0.9737655860349127, + "grad_norm": 0.5348838800768891, + "learning_rate": 9.053547737706281e-08, + "loss": 0.8524, + "step": 19524 + }, + { + "epoch": 0.9738154613466334, + "grad_norm": 0.6475013330582953, + "learning_rate": 9.019242175958565e-08, + "loss": 0.8594, + "step": 19525 + }, + { + "epoch": 0.9738653366583541, + "grad_norm": 0.7007361418606051, + "learning_rate": 8.985001615239019e-08, + "loss": 0.8538, + "step": 19526 + }, + { + "epoch": 0.9739152119700748, + "grad_norm": 0.5669344174295256, + "learning_rate": 8.950826056441097e-08, + "loss": 0.8436, + "step": 19527 + }, + { + "epoch": 0.9739650872817955, + "grad_norm": 1.6038057633322882, + "learning_rate": 8.916715500456585e-08, + "loss": 0.8481, + "step": 19528 + }, + { + "epoch": 0.9740149625935162, + "grad_norm": 0.6173429459116304, + "learning_rate": 8.882669948175326e-08, + "loss": 0.8446, + "step": 19529 + }, + { + "epoch": 0.9740648379052369, + "grad_norm": 0.7185855120094975, + "learning_rate": 8.848689400485776e-08, + "loss": 0.8203, + "step": 19530 + }, + { + "epoch": 0.9741147132169576, + "grad_norm": 0.722143193836085, + "learning_rate": 8.814773858275004e-08, + "loss": 0.8736, + "step": 19531 + }, + { + "epoch": 0.9741645885286783, + "grad_norm": 0.7010363807675956, + "learning_rate": 8.780923322427859e-08, + "loss": 0.7698, + "step": 19532 + }, + { + "epoch": 0.974214463840399, + "grad_norm": 0.69105556662253, + "learning_rate": 8.74713779382752e-08, + "loss": 0.8222, + "step": 19533 + }, + { + "epoch": 0.9742643391521197, + "grad_norm": 0.6542691193237857, + "learning_rate": 8.713417273355784e-08, + "loss": 0.835, + "step": 19534 + }, + { + "epoch": 0.9743142144638404, + "grad_norm": 0.6145902411600284, + "learning_rate": 8.679761761892224e-08, + "loss": 0.877, + "step": 19535 + }, + { + "epoch": 0.9743640897755611, + "grad_norm": 0.6272240565444672, + "learning_rate": 8.646171260315306e-08, + "loss": 0.8106, + "step": 19536 + }, + { + "epoch": 0.9744139650872818, + "grad_norm": 0.5834457228619134, + "learning_rate": 8.612645769501826e-08, + "loss": 0.8462, + "step": 19537 + }, + { + "epoch": 0.9744638403990025, + "grad_norm": 0.6008368892795228, + "learning_rate": 8.579185290326086e-08, + "loss": 0.8232, + "step": 19538 + }, + { + "epoch": 0.9745137157107232, + "grad_norm": 0.6343126876742132, + "learning_rate": 8.545789823661832e-08, + "loss": 0.8604, + "step": 19539 + }, + { + "epoch": 0.9745635910224439, + "grad_norm": 0.689416925138983, + "learning_rate": 8.512459370379755e-08, + "loss": 0.8426, + "step": 19540 + }, + { + "epoch": 0.9746134663341646, + "grad_norm": 0.6316266767469753, + "learning_rate": 8.479193931350549e-08, + "loss": 0.816, + "step": 19541 + }, + { + "epoch": 0.9746633416458853, + "grad_norm": 0.6283473361555929, + "learning_rate": 8.445993507441296e-08, + "loss": 0.839, + "step": 19542 + }, + { + "epoch": 0.9747132169576059, + "grad_norm": 0.7946146650147723, + "learning_rate": 8.412858099518806e-08, + "loss": 0.8821, + "step": 19543 + }, + { + "epoch": 0.9747630922693267, + "grad_norm": 0.5347326995772113, + "learning_rate": 8.379787708447662e-08, + "loss": 0.8841, + "step": 19544 + }, + { + "epoch": 0.9748129675810474, + "grad_norm": 0.5073823375421023, + "learning_rate": 8.346782335091063e-08, + "loss": 0.7768, + "step": 19545 + }, + { + "epoch": 0.974862842892768, + "grad_norm": 0.6073692551493719, + "learning_rate": 8.313841980309711e-08, + "loss": 0.8473, + "step": 19546 + }, + { + "epoch": 0.9749127182044888, + "grad_norm": 0.5906788796447211, + "learning_rate": 8.280966644964027e-08, + "loss": 0.8247, + "step": 19547 + }, + { + "epoch": 0.9749625935162095, + "grad_norm": 0.7455096986770237, + "learning_rate": 8.248156329911105e-08, + "loss": 0.8665, + "step": 19548 + }, + { + "epoch": 0.9750124688279301, + "grad_norm": 0.5789234853929505, + "learning_rate": 8.215411036007204e-08, + "loss": 0.8091, + "step": 19549 + }, + { + "epoch": 0.9750623441396509, + "grad_norm": 0.6664089884809881, + "learning_rate": 8.182730764107193e-08, + "loss": 0.8307, + "step": 19550 + }, + { + "epoch": 0.9751122194513716, + "grad_norm": 0.8036710702086642, + "learning_rate": 8.15011551506345e-08, + "loss": 0.8293, + "step": 19551 + }, + { + "epoch": 0.9751620947630922, + "grad_norm": 0.5580858746500698, + "learning_rate": 8.117565289727514e-08, + "loss": 0.7941, + "step": 19552 + }, + { + "epoch": 0.975211970074813, + "grad_norm": 0.5766707965486775, + "learning_rate": 8.085080088948427e-08, + "loss": 0.8005, + "step": 19553 + }, + { + "epoch": 0.9752618453865337, + "grad_norm": 0.650239866382384, + "learning_rate": 8.052659913573568e-08, + "loss": 0.8319, + "step": 19554 + }, + { + "epoch": 0.9753117206982543, + "grad_norm": 0.6158110664496905, + "learning_rate": 8.02030476444976e-08, + "loss": 0.8193, + "step": 19555 + }, + { + "epoch": 0.9753615960099751, + "grad_norm": 0.6770962244993192, + "learning_rate": 7.988014642420494e-08, + "loss": 0.8224, + "step": 19556 + }, + { + "epoch": 0.9754114713216958, + "grad_norm": 0.551413592860451, + "learning_rate": 7.955789548328985e-08, + "loss": 0.8467, + "step": 19557 + }, + { + "epoch": 0.9754613466334164, + "grad_norm": 0.7859442481975318, + "learning_rate": 7.92362948301567e-08, + "loss": 0.8352, + "step": 19558 + }, + { + "epoch": 0.9755112219451372, + "grad_norm": 0.7143887625715204, + "learning_rate": 7.891534447320159e-08, + "loss": 0.849, + "step": 19559 + }, + { + "epoch": 0.9755610972568578, + "grad_norm": 0.5881235102473763, + "learning_rate": 7.859504442079558e-08, + "loss": 0.8054, + "step": 19560 + }, + { + "epoch": 0.9756109725685785, + "grad_norm": 0.7778390361511489, + "learning_rate": 7.827539468129863e-08, + "loss": 0.7809, + "step": 19561 + }, + { + "epoch": 0.9756608478802993, + "grad_norm": 0.6498240934379516, + "learning_rate": 7.795639526305131e-08, + "loss": 0.8535, + "step": 19562 + }, + { + "epoch": 0.9757107231920199, + "grad_norm": 0.6187672046113151, + "learning_rate": 7.763804617437753e-08, + "loss": 0.8148, + "step": 19563 + }, + { + "epoch": 0.9757605985037406, + "grad_norm": 0.7690923632805754, + "learning_rate": 7.732034742358729e-08, + "loss": 0.8648, + "step": 19564 + }, + { + "epoch": 0.9758104738154614, + "grad_norm": 0.815126902381136, + "learning_rate": 7.700329901896841e-08, + "loss": 0.8612, + "step": 19565 + }, + { + "epoch": 0.975860349127182, + "grad_norm": 0.6209844158599221, + "learning_rate": 7.668690096879483e-08, + "loss": 0.8476, + "step": 19566 + }, + { + "epoch": 0.9759102244389027, + "grad_norm": 0.5491156118610073, + "learning_rate": 7.637115328131828e-08, + "loss": 0.8169, + "step": 19567 + }, + { + "epoch": 0.9759600997506235, + "grad_norm": 0.724835092736127, + "learning_rate": 7.605605596478771e-08, + "loss": 0.8212, + "step": 19568 + }, + { + "epoch": 0.9760099750623441, + "grad_norm": 0.6594887564243969, + "learning_rate": 7.574160902741601e-08, + "loss": 0.8299, + "step": 19569 + }, + { + "epoch": 0.9760598503740648, + "grad_norm": 0.6756339696457953, + "learning_rate": 7.542781247741326e-08, + "loss": 0.8705, + "step": 19570 + }, + { + "epoch": 0.9761097256857856, + "grad_norm": 0.730155989433142, + "learning_rate": 7.511466632296738e-08, + "loss": 0.8322, + "step": 19571 + }, + { + "epoch": 0.9761596009975062, + "grad_norm": 0.5811546349920986, + "learning_rate": 7.480217057224959e-08, + "loss": 0.837, + "step": 19572 + }, + { + "epoch": 0.9762094763092269, + "grad_norm": 0.5253541080404206, + "learning_rate": 7.449032523341171e-08, + "loss": 0.8331, + "step": 19573 + }, + { + "epoch": 0.9762593516209477, + "grad_norm": 0.9039634666885196, + "learning_rate": 7.417913031459722e-08, + "loss": 0.8601, + "step": 19574 + }, + { + "epoch": 0.9763092269326683, + "grad_norm": 0.756328793014089, + "learning_rate": 7.386858582392187e-08, + "loss": 0.8398, + "step": 19575 + }, + { + "epoch": 0.976359102244389, + "grad_norm": 0.6892685038702188, + "learning_rate": 7.355869176949027e-08, + "loss": 0.8305, + "step": 19576 + }, + { + "epoch": 0.9764089775561098, + "grad_norm": 0.5325787836217512, + "learning_rate": 7.324944815938761e-08, + "loss": 0.8362, + "step": 19577 + }, + { + "epoch": 0.9764588528678304, + "grad_norm": 0.5928347519673295, + "learning_rate": 7.294085500168523e-08, + "loss": 0.8677, + "step": 19578 + }, + { + "epoch": 0.9765087281795511, + "grad_norm": 0.5112912888427327, + "learning_rate": 7.26329123044378e-08, + "loss": 0.8224, + "step": 19579 + }, + { + "epoch": 0.9765586034912718, + "grad_norm": 0.5560232887689208, + "learning_rate": 7.2325620075675e-08, + "loss": 0.8013, + "step": 19580 + }, + { + "epoch": 0.9766084788029925, + "grad_norm": 0.6934050326535841, + "learning_rate": 7.201897832342097e-08, + "loss": 0.8566, + "step": 19581 + }, + { + "epoch": 0.9766583541147132, + "grad_norm": 0.5689340398579364, + "learning_rate": 7.171298705567486e-08, + "loss": 0.8276, + "step": 19582 + }, + { + "epoch": 0.9767082294264339, + "grad_norm": 0.539107638964638, + "learning_rate": 7.140764628042196e-08, + "loss": 0.769, + "step": 19583 + }, + { + "epoch": 0.9767581047381546, + "grad_norm": 0.7071588476221093, + "learning_rate": 7.110295600563089e-08, + "loss": 0.818, + "step": 19584 + }, + { + "epoch": 0.9768079800498753, + "grad_norm": 0.6031111296142818, + "learning_rate": 7.079891623925083e-08, + "loss": 0.8255, + "step": 19585 + }, + { + "epoch": 0.976857855361596, + "grad_norm": 0.6097484882425038, + "learning_rate": 7.049552698921713e-08, + "loss": 0.8723, + "step": 19586 + }, + { + "epoch": 0.9769077306733167, + "grad_norm": 1.269884368025021, + "learning_rate": 7.019278826344289e-08, + "loss": 0.8737, + "step": 19587 + }, + { + "epoch": 0.9769576059850374, + "grad_norm": 0.659490280114976, + "learning_rate": 6.989070006983289e-08, + "loss": 0.8435, + "step": 19588 + }, + { + "epoch": 0.9770074812967581, + "grad_norm": 0.5795149675621947, + "learning_rate": 6.958926241626696e-08, + "loss": 0.742, + "step": 19589 + }, + { + "epoch": 0.9770573566084788, + "grad_norm": 0.8454044026963283, + "learning_rate": 6.9288475310611e-08, + "loss": 0.8049, + "step": 19590 + }, + { + "epoch": 0.9771072319201995, + "grad_norm": 0.7688756446941717, + "learning_rate": 6.898833876071709e-08, + "loss": 0.8554, + "step": 19591 + }, + { + "epoch": 0.9771571072319202, + "grad_norm": 0.508466495638602, + "learning_rate": 6.868885277441506e-08, + "loss": 0.7684, + "step": 19592 + }, + { + "epoch": 0.9772069825436409, + "grad_norm": 0.6290791981042458, + "learning_rate": 6.839001735951534e-08, + "loss": 0.8277, + "step": 19593 + }, + { + "epoch": 0.9772568578553616, + "grad_norm": 0.6374288452108339, + "learning_rate": 6.809183252382555e-08, + "loss": 0.8443, + "step": 19594 + }, + { + "epoch": 0.9773067331670823, + "grad_norm": 0.6225733529423393, + "learning_rate": 6.77942982751173e-08, + "loss": 0.8212, + "step": 19595 + }, + { + "epoch": 0.977356608478803, + "grad_norm": 0.53501419370139, + "learning_rate": 6.74974146211621e-08, + "loss": 0.8237, + "step": 19596 + }, + { + "epoch": 0.9774064837905236, + "grad_norm": 0.7036648224598515, + "learning_rate": 6.720118156970101e-08, + "loss": 0.8454, + "step": 19597 + }, + { + "epoch": 0.9774563591022444, + "grad_norm": 0.7400204306622806, + "learning_rate": 6.690559912846673e-08, + "loss": 0.8491, + "step": 19598 + }, + { + "epoch": 0.9775062344139651, + "grad_norm": 0.6123336063418963, + "learning_rate": 6.661066730517251e-08, + "loss": 0.8343, + "step": 19599 + }, + { + "epoch": 0.9775561097256857, + "grad_norm": 0.5362026042380608, + "learning_rate": 6.631638610751501e-08, + "loss": 0.8101, + "step": 19600 + }, + { + "epoch": 0.9776059850374065, + "grad_norm": 0.6440807606672461, + "learning_rate": 6.602275554317138e-08, + "loss": 0.8389, + "step": 19601 + }, + { + "epoch": 0.9776558603491272, + "grad_norm": 0.6836119537302632, + "learning_rate": 6.572977561980498e-08, + "loss": 0.8294, + "step": 19602 + }, + { + "epoch": 0.9777057356608478, + "grad_norm": 0.5145357441647999, + "learning_rate": 6.543744634506244e-08, + "loss": 0.8448, + "step": 19603 + }, + { + "epoch": 0.9777556109725686, + "grad_norm": 0.7424484610179929, + "learning_rate": 6.514576772656822e-08, + "loss": 0.835, + "step": 19604 + }, + { + "epoch": 0.9778054862842893, + "grad_norm": 0.5717994652801646, + "learning_rate": 6.485473977193846e-08, + "loss": 0.8092, + "step": 19605 + }, + { + "epoch": 0.9778553615960099, + "grad_norm": 0.5037394840140306, + "learning_rate": 6.456436248876153e-08, + "loss": 0.8, + "step": 19606 + }, + { + "epoch": 0.9779052369077307, + "grad_norm": 0.7164538499269206, + "learning_rate": 6.427463588462024e-08, + "loss": 0.8172, + "step": 19607 + }, + { + "epoch": 0.9779551122194514, + "grad_norm": 0.6943703437932197, + "learning_rate": 6.398555996707245e-08, + "loss": 0.8208, + "step": 19608 + }, + { + "epoch": 0.978004987531172, + "grad_norm": 0.599585210021375, + "learning_rate": 6.369713474366212e-08, + "loss": 0.8332, + "step": 19609 + }, + { + "epoch": 0.9780548628428928, + "grad_norm": 0.5520802481299192, + "learning_rate": 6.34093602219138e-08, + "loss": 0.8258, + "step": 19610 + }, + { + "epoch": 0.9781047381546135, + "grad_norm": 0.48335460841714833, + "learning_rate": 6.312223640933812e-08, + "loss": 0.8019, + "step": 19611 + }, + { + "epoch": 0.9781546134663341, + "grad_norm": 1.0724670127353344, + "learning_rate": 6.283576331342633e-08, + "loss": 0.8493, + "step": 19612 + }, + { + "epoch": 0.9782044887780549, + "grad_norm": 0.7543722829039927, + "learning_rate": 6.254994094165855e-08, + "loss": 0.8491, + "step": 19613 + }, + { + "epoch": 0.9782543640897755, + "grad_norm": 0.6684529124039242, + "learning_rate": 6.226476930148717e-08, + "loss": 0.877, + "step": 19614 + }, + { + "epoch": 0.9783042394014962, + "grad_norm": 0.7707153062891813, + "learning_rate": 6.198024840035621e-08, + "loss": 0.8337, + "step": 19615 + }, + { + "epoch": 0.978354114713217, + "grad_norm": 1.005719448087985, + "learning_rate": 6.169637824568752e-08, + "loss": 0.8295, + "step": 19616 + }, + { + "epoch": 0.9784039900249376, + "grad_norm": 0.6138819395986668, + "learning_rate": 6.141315884489463e-08, + "loss": 0.8155, + "step": 19617 + }, + { + "epoch": 0.9784538653366583, + "grad_norm": 0.7080788751666804, + "learning_rate": 6.113059020536327e-08, + "loss": 0.878, + "step": 19618 + }, + { + "epoch": 0.9785037406483791, + "grad_norm": 0.6324768838230337, + "learning_rate": 6.084867233446812e-08, + "loss": 0.8626, + "step": 19619 + }, + { + "epoch": 0.9785536159600997, + "grad_norm": 0.5905411797125167, + "learning_rate": 6.056740523956439e-08, + "loss": 0.8192, + "step": 19620 + }, + { + "epoch": 0.9786034912718204, + "grad_norm": 0.5161515679392128, + "learning_rate": 6.028678892799344e-08, + "loss": 0.8209, + "step": 19621 + }, + { + "epoch": 0.9786533665835412, + "grad_norm": 0.5960603756269162, + "learning_rate": 6.00068234070772e-08, + "loss": 0.8529, + "step": 19622 + }, + { + "epoch": 0.9787032418952618, + "grad_norm": 3.0087670086743907, + "learning_rate": 5.972750868412091e-08, + "loss": 0.8273, + "step": 19623 + }, + { + "epoch": 0.9787531172069825, + "grad_norm": 0.5499695174910876, + "learning_rate": 5.9448844766413213e-08, + "loss": 0.8005, + "step": 19624 + }, + { + "epoch": 0.9788029925187033, + "grad_norm": 0.6311958378594775, + "learning_rate": 5.9170831661226056e-08, + "loss": 0.8223, + "step": 19625 + }, + { + "epoch": 0.9788528678304239, + "grad_norm": 0.5591054475882403, + "learning_rate": 5.8893469375814745e-08, + "loss": 0.7901, + "step": 19626 + }, + { + "epoch": 0.9789027431421446, + "grad_norm": 1.0051719154561647, + "learning_rate": 5.8616757917415165e-08, + "loss": 0.82, + "step": 19627 + }, + { + "epoch": 0.9789526184538654, + "grad_norm": 0.6427945502650385, + "learning_rate": 5.834069729324932e-08, + "loss": 0.877, + "step": 19628 + }, + { + "epoch": 0.979002493765586, + "grad_norm": 0.6090239545945554, + "learning_rate": 5.806528751052254e-08, + "loss": 0.8413, + "step": 19629 + }, + { + "epoch": 0.9790523690773068, + "grad_norm": 0.5800814100195335, + "learning_rate": 5.7790528576415204e-08, + "loss": 0.8232, + "step": 19630 + }, + { + "epoch": 0.9791022443890274, + "grad_norm": 0.591970435004037, + "learning_rate": 5.75164204981049e-08, + "loss": 0.8361, + "step": 19631 + }, + { + "epoch": 0.9791521197007481, + "grad_norm": 0.546734882530223, + "learning_rate": 5.72429632827387e-08, + "loss": 0.8376, + "step": 19632 + }, + { + "epoch": 0.9792019950124689, + "grad_norm": 0.7282253004015709, + "learning_rate": 5.6970156937458105e-08, + "loss": 0.8125, + "step": 19633 + }, + { + "epoch": 0.9792518703241895, + "grad_norm": 0.6077681886875529, + "learning_rate": 5.6698001469374096e-08, + "loss": 0.8134, + "step": 19634 + }, + { + "epoch": 0.9793017456359102, + "grad_norm": 0.7350006611926416, + "learning_rate": 5.642649688559487e-08, + "loss": 0.8569, + "step": 19635 + }, + { + "epoch": 0.979351620947631, + "grad_norm": 0.5758475632461668, + "learning_rate": 5.6155643193203655e-08, + "loss": 0.8358, + "step": 19636 + }, + { + "epoch": 0.9794014962593516, + "grad_norm": 0.612278835052666, + "learning_rate": 5.588544039926424e-08, + "loss": 0.8595, + "step": 19637 + }, + { + "epoch": 0.9794513715710723, + "grad_norm": 0.8159192510203939, + "learning_rate": 5.56158885108321e-08, + "loss": 0.8079, + "step": 19638 + }, + { + "epoch": 0.979501246882793, + "grad_norm": 0.6550848030318512, + "learning_rate": 5.53469875349405e-08, + "loss": 0.7923, + "step": 19639 + }, + { + "epoch": 0.9795511221945137, + "grad_norm": 0.6017509427035296, + "learning_rate": 5.507873747860603e-08, + "loss": 0.8255, + "step": 19640 + }, + { + "epoch": 0.9796009975062344, + "grad_norm": 0.6739394228046696, + "learning_rate": 5.4811138348825894e-08, + "loss": 0.833, + "step": 19641 + }, + { + "epoch": 0.9796508728179552, + "grad_norm": 0.6150917502833005, + "learning_rate": 5.454419015258616e-08, + "loss": 0.8545, + "step": 19642 + }, + { + "epoch": 0.9797007481296758, + "grad_norm": 0.618893807245013, + "learning_rate": 5.4277892896853476e-08, + "loss": 0.8161, + "step": 19643 + }, + { + "epoch": 0.9797506234413965, + "grad_norm": 0.6026765653728385, + "learning_rate": 5.401224658857229e-08, + "loss": 0.8527, + "step": 19644 + }, + { + "epoch": 0.9798004987531173, + "grad_norm": 0.5588012582694318, + "learning_rate": 5.374725123467872e-08, + "loss": 0.8357, + "step": 19645 + }, + { + "epoch": 0.9798503740648379, + "grad_norm": 0.5616958947114732, + "learning_rate": 5.348290684208668e-08, + "loss": 0.8237, + "step": 19646 + }, + { + "epoch": 0.9799002493765586, + "grad_norm": 0.9154373852395734, + "learning_rate": 5.321921341769065e-08, + "loss": 0.8479, + "step": 19647 + }, + { + "epoch": 0.9799501246882794, + "grad_norm": 0.9793365019509752, + "learning_rate": 5.295617096837679e-08, + "loss": 0.8294, + "step": 19648 + }, + { + "epoch": 0.98, + "grad_norm": 0.5582524675095554, + "learning_rate": 5.269377950100629e-08, + "loss": 0.8181, + "step": 19649 + }, + { + "epoch": 0.9800498753117207, + "grad_norm": 0.6485161170767025, + "learning_rate": 5.243203902242644e-08, + "loss": 0.8134, + "step": 19650 + }, + { + "epoch": 0.9800997506234413, + "grad_norm": 0.5804982501059358, + "learning_rate": 5.2170949539467884e-08, + "loss": 0.8051, + "step": 19651 + }, + { + "epoch": 0.9801496259351621, + "grad_norm": 0.6714867602898656, + "learning_rate": 5.191051105894462e-08, + "loss": 0.8571, + "step": 19652 + }, + { + "epoch": 0.9801995012468828, + "grad_norm": 0.5577836790206399, + "learning_rate": 5.165072358764844e-08, + "loss": 0.8191, + "step": 19653 + }, + { + "epoch": 0.9802493765586034, + "grad_norm": 0.6187413969365563, + "learning_rate": 5.1391587132362806e-08, + "loss": 0.8515, + "step": 19654 + }, + { + "epoch": 0.9802992518703242, + "grad_norm": 0.5638063964976049, + "learning_rate": 5.1133101699848976e-08, + "loss": 0.8213, + "step": 19655 + }, + { + "epoch": 0.9803491271820449, + "grad_norm": 0.5495543815580898, + "learning_rate": 5.087526729685155e-08, + "loss": 0.8276, + "step": 19656 + }, + { + "epoch": 0.9803990024937655, + "grad_norm": 0.6196311181629925, + "learning_rate": 5.0618083930095705e-08, + "loss": 0.8177, + "step": 19657 + }, + { + "epoch": 0.9804488778054863, + "grad_norm": 0.6608837879536706, + "learning_rate": 5.036155160629552e-08, + "loss": 0.836, + "step": 19658 + }, + { + "epoch": 0.980498753117207, + "grad_norm": 0.798965684023685, + "learning_rate": 5.010567033214564e-08, + "loss": 0.8122, + "step": 19659 + }, + { + "epoch": 0.9805486284289276, + "grad_norm": 0.5281128791419634, + "learning_rate": 4.985044011432127e-08, + "loss": 0.8106, + "step": 19660 + }, + { + "epoch": 0.9805985037406484, + "grad_norm": 0.5528790433386498, + "learning_rate": 4.959586095948376e-08, + "loss": 0.8236, + "step": 19661 + }, + { + "epoch": 0.9806483790523691, + "grad_norm": 0.6263890860022623, + "learning_rate": 4.93419328742778e-08, + "loss": 0.8532, + "step": 19662 + }, + { + "epoch": 0.9806982543640897, + "grad_norm": 0.6588487682050927, + "learning_rate": 4.908865586532585e-08, + "loss": 0.8488, + "step": 19663 + }, + { + "epoch": 0.9807481296758105, + "grad_norm": 0.6991074417649196, + "learning_rate": 4.88360299392393e-08, + "loss": 0.8597, + "step": 19664 + }, + { + "epoch": 0.9807980049875312, + "grad_norm": 0.5326973132007198, + "learning_rate": 4.85840551026101e-08, + "loss": 0.8412, + "step": 19665 + }, + { + "epoch": 0.9808478802992519, + "grad_norm": 0.7212363435057929, + "learning_rate": 4.833273136201355e-08, + "loss": 0.8564, + "step": 19666 + }, + { + "epoch": 0.9808977556109726, + "grad_norm": 1.0313919105358549, + "learning_rate": 4.8082058724005507e-08, + "loss": 0.8562, + "step": 19667 + }, + { + "epoch": 0.9809476309226932, + "grad_norm": 0.8881676829773042, + "learning_rate": 4.783203719513074e-08, + "loss": 0.835, + "step": 19668 + }, + { + "epoch": 0.980997506234414, + "grad_norm": 0.5745964488715349, + "learning_rate": 4.7582666781909034e-08, + "loss": 0.8066, + "step": 19669 + }, + { + "epoch": 0.9810473815461347, + "grad_norm": 0.6673874587630219, + "learning_rate": 4.7333947490854625e-08, + "loss": 0.8636, + "step": 19670 + }, + { + "epoch": 0.9810972568578553, + "grad_norm": 0.6794971542590224, + "learning_rate": 4.708587932845121e-08, + "loss": 0.836, + "step": 19671 + }, + { + "epoch": 0.981147132169576, + "grad_norm": 0.5752444767999518, + "learning_rate": 4.6838462301174165e-08, + "loss": 0.8205, + "step": 19672 + }, + { + "epoch": 0.9811970074812968, + "grad_norm": 0.8231477201630977, + "learning_rate": 4.659169641548222e-08, + "loss": 0.8539, + "step": 19673 + }, + { + "epoch": 0.9812468827930174, + "grad_norm": 0.5788194933801327, + "learning_rate": 4.634558167780911e-08, + "loss": 0.8154, + "step": 19674 + }, + { + "epoch": 0.9812967581047382, + "grad_norm": 0.6892429227237112, + "learning_rate": 4.6100118094580255e-08, + "loss": 0.8284, + "step": 19675 + }, + { + "epoch": 0.9813466334164589, + "grad_norm": 0.7404501526480792, + "learning_rate": 4.585530567220442e-08, + "loss": 0.8709, + "step": 19676 + }, + { + "epoch": 0.9813965087281795, + "grad_norm": 0.6330928498926643, + "learning_rate": 4.56111444170626e-08, + "loss": 0.8454, + "step": 19677 + }, + { + "epoch": 0.9814463840399003, + "grad_norm": 0.7437736540968692, + "learning_rate": 4.536763433553304e-08, + "loss": 0.8543, + "step": 19678 + }, + { + "epoch": 0.981496259351621, + "grad_norm": 0.65724630110854, + "learning_rate": 4.512477543396343e-08, + "loss": 0.8236, + "step": 19679 + }, + { + "epoch": 0.9815461346633416, + "grad_norm": 0.5734722561787999, + "learning_rate": 4.48825677186987e-08, + "loss": 0.8093, + "step": 19680 + }, + { + "epoch": 0.9815960099750624, + "grad_norm": 0.6543744503200489, + "learning_rate": 4.4641011196050463e-08, + "loss": 0.8343, + "step": 19681 + }, + { + "epoch": 0.9816458852867831, + "grad_norm": 0.6209726671811213, + "learning_rate": 4.4400105872327566e-08, + "loss": 0.8736, + "step": 19682 + }, + { + "epoch": 0.9816957605985037, + "grad_norm": 0.5818887269800388, + "learning_rate": 4.415985175381665e-08, + "loss": 0.8109, + "step": 19683 + }, + { + "epoch": 0.9817456359102245, + "grad_norm": 0.5479273375934526, + "learning_rate": 4.392024884678769e-08, + "loss": 0.8507, + "step": 19684 + }, + { + "epoch": 0.9817955112219451, + "grad_norm": 0.6023871609940556, + "learning_rate": 4.3681297157488476e-08, + "loss": 0.8158, + "step": 19685 + }, + { + "epoch": 0.9818453865336658, + "grad_norm": 0.5159950924223337, + "learning_rate": 4.344299669215568e-08, + "loss": 0.8129, + "step": 19686 + }, + { + "epoch": 0.9818952618453866, + "grad_norm": 0.8183811669743957, + "learning_rate": 4.3205347457009325e-08, + "loss": 0.8568, + "step": 19687 + }, + { + "epoch": 0.9819451371571072, + "grad_norm": 0.561321258305344, + "learning_rate": 4.296834945825001e-08, + "loss": 0.8064, + "step": 19688 + }, + { + "epoch": 0.9819950124688279, + "grad_norm": 0.8211196103807253, + "learning_rate": 4.273200270206446e-08, + "loss": 0.8108, + "step": 19689 + }, + { + "epoch": 0.9820448877805487, + "grad_norm": 2.215502790278858, + "learning_rate": 4.249630719461717e-08, + "loss": 0.7713, + "step": 19690 + }, + { + "epoch": 0.9820947630922693, + "grad_norm": 0.580296759164176, + "learning_rate": 4.226126294205601e-08, + "loss": 0.8208, + "step": 19691 + }, + { + "epoch": 0.98214463840399, + "grad_norm": 0.5433570539028779, + "learning_rate": 4.202686995052052e-08, + "loss": 0.8097, + "step": 19692 + }, + { + "epoch": 0.9821945137157108, + "grad_norm": 0.7002472808234492, + "learning_rate": 4.179312822612524e-08, + "loss": 0.8498, + "step": 19693 + }, + { + "epoch": 0.9822443890274314, + "grad_norm": 0.6885058509625563, + "learning_rate": 4.156003777496531e-08, + "loss": 0.8621, + "step": 19694 + }, + { + "epoch": 0.9822942643391521, + "grad_norm": 0.8401543036243652, + "learning_rate": 4.132759860313029e-08, + "loss": 0.8495, + "step": 19695 + }, + { + "epoch": 0.9823441396508729, + "grad_norm": 0.6053340078130501, + "learning_rate": 4.109581071667923e-08, + "loss": 0.8083, + "step": 19696 + }, + { + "epoch": 0.9823940149625935, + "grad_norm": 0.5412520713854371, + "learning_rate": 4.086467412166284e-08, + "loss": 0.838, + "step": 19697 + }, + { + "epoch": 0.9824438902743142, + "grad_norm": 1.295426387157138, + "learning_rate": 4.0634188824109634e-08, + "loss": 0.8659, + "step": 19698 + }, + { + "epoch": 0.982493765586035, + "grad_norm": 0.624070722615195, + "learning_rate": 4.040435483004257e-08, + "loss": 0.8907, + "step": 19699 + }, + { + "epoch": 0.9825436408977556, + "grad_norm": 0.5129198177357105, + "learning_rate": 4.017517214544852e-08, + "loss": 0.8489, + "step": 19700 + }, + { + "epoch": 0.9825935162094763, + "grad_norm": 0.7051341389082789, + "learning_rate": 3.994664077631438e-08, + "loss": 0.8409, + "step": 19701 + }, + { + "epoch": 0.9826433915211971, + "grad_norm": 0.6303764109259224, + "learning_rate": 3.9718760728602034e-08, + "loss": 0.8276, + "step": 19702 + }, + { + "epoch": 0.9826932668329177, + "grad_norm": 0.5997247404037481, + "learning_rate": 3.94915320082595e-08, + "loss": 0.8536, + "step": 19703 + }, + { + "epoch": 0.9827431421446384, + "grad_norm": 0.6409142047533898, + "learning_rate": 3.92649546212126e-08, + "loss": 0.7964, + "step": 19704 + }, + { + "epoch": 0.982793017456359, + "grad_norm": 0.6731886470318826, + "learning_rate": 3.903902857337882e-08, + "loss": 0.8917, + "step": 19705 + }, + { + "epoch": 0.9828428927680798, + "grad_norm": 0.6576943635892988, + "learning_rate": 3.88137538706479e-08, + "loss": 0.8425, + "step": 19706 + }, + { + "epoch": 0.9828927680798005, + "grad_norm": 0.6042103430097112, + "learning_rate": 3.858913051890123e-08, + "loss": 0.8529, + "step": 19707 + }, + { + "epoch": 0.9829426433915212, + "grad_norm": 0.850536851675313, + "learning_rate": 3.8365158523998024e-08, + "loss": 0.8419, + "step": 19708 + }, + { + "epoch": 0.9829925187032419, + "grad_norm": 0.5483979434002411, + "learning_rate": 3.814183789178638e-08, + "loss": 0.8287, + "step": 19709 + }, + { + "epoch": 0.9830423940149626, + "grad_norm": 0.6151088465172463, + "learning_rate": 3.791916862809219e-08, + "loss": 0.8297, + "step": 19710 + }, + { + "epoch": 0.9830922693266833, + "grad_norm": 0.607588351033352, + "learning_rate": 3.769715073872748e-08, + "loss": 0.8404, + "step": 19711 + }, + { + "epoch": 0.983142144638404, + "grad_norm": 0.5360890910075043, + "learning_rate": 3.747578422948206e-08, + "loss": 0.8094, + "step": 19712 + }, + { + "epoch": 0.9831920199501247, + "grad_norm": 0.5642616129959528, + "learning_rate": 3.7255069106134635e-08, + "loss": 0.8153, + "step": 19713 + }, + { + "epoch": 0.9832418952618454, + "grad_norm": 0.763472891696572, + "learning_rate": 3.7035005374444485e-08, + "loss": 0.8023, + "step": 19714 + }, + { + "epoch": 0.9832917705735661, + "grad_norm": 0.6418327494372361, + "learning_rate": 3.6815593040151476e-08, + "loss": 0.8573, + "step": 19715 + }, + { + "epoch": 0.9833416458852868, + "grad_norm": 0.5820226401179118, + "learning_rate": 3.659683210898712e-08, + "loss": 0.7999, + "step": 19716 + }, + { + "epoch": 0.9833915211970075, + "grad_norm": 0.7235427441307218, + "learning_rate": 3.63787225866552e-08, + "loss": 0.8715, + "step": 19717 + }, + { + "epoch": 0.9834413965087282, + "grad_norm": 0.6157493457100337, + "learning_rate": 3.6161264478845606e-08, + "loss": 0.8494, + "step": 19718 + }, + { + "epoch": 0.9834912718204489, + "grad_norm": 0.6003237282707238, + "learning_rate": 3.594445779123712e-08, + "loss": 0.8368, + "step": 19719 + }, + { + "epoch": 0.9835411471321696, + "grad_norm": 0.6360441384226547, + "learning_rate": 3.5728302529486335e-08, + "loss": 0.8714, + "step": 19720 + }, + { + "epoch": 0.9835910224438903, + "grad_norm": 0.6358438839423329, + "learning_rate": 3.5512798699233186e-08, + "loss": 0.8792, + "step": 19721 + }, + { + "epoch": 0.9836408977556109, + "grad_norm": 0.6055475759305842, + "learning_rate": 3.529794630610095e-08, + "loss": 0.8336, + "step": 19722 + }, + { + "epoch": 0.9836907730673317, + "grad_norm": 0.5963166858283037, + "learning_rate": 3.508374535569625e-08, + "loss": 0.8212, + "step": 19723 + }, + { + "epoch": 0.9837406483790524, + "grad_norm": 0.6193663167115051, + "learning_rate": 3.4870195853606293e-08, + "loss": 0.7799, + "step": 19724 + }, + { + "epoch": 0.983790523690773, + "grad_norm": 0.5751103279260736, + "learning_rate": 3.4657297805407163e-08, + "loss": 0.8282, + "step": 19725 + }, + { + "epoch": 0.9838403990024938, + "grad_norm": 0.6705048401564371, + "learning_rate": 3.444505121665276e-08, + "loss": 0.8428, + "step": 19726 + }, + { + "epoch": 0.9838902743142145, + "grad_norm": 0.7247879594066524, + "learning_rate": 3.423345609288031e-08, + "loss": 0.8604, + "step": 19727 + }, + { + "epoch": 0.9839401496259351, + "grad_norm": 0.5627314192323046, + "learning_rate": 3.402251243961596e-08, + "loss": 0.8063, + "step": 19728 + }, + { + "epoch": 0.9839900249376559, + "grad_norm": 0.8954571137500907, + "learning_rate": 3.381222026235809e-08, + "loss": 0.8359, + "step": 19729 + }, + { + "epoch": 0.9840399002493766, + "grad_norm": 0.6171218055688206, + "learning_rate": 3.360257956659674e-08, + "loss": 0.8248, + "step": 19730 + }, + { + "epoch": 0.9840897755610972, + "grad_norm": 0.6399661898957385, + "learning_rate": 3.339359035780254e-08, + "loss": 0.85, + "step": 19731 + }, + { + "epoch": 0.984139650872818, + "grad_norm": 0.6346993343697926, + "learning_rate": 3.318525264143224e-08, + "loss": 0.8788, + "step": 19732 + }, + { + "epoch": 0.9841895261845387, + "grad_norm": 0.5334665847585132, + "learning_rate": 3.297756642291483e-08, + "loss": 0.8082, + "step": 19733 + }, + { + "epoch": 0.9842394014962593, + "grad_norm": 0.6200125391956152, + "learning_rate": 3.277053170767652e-08, + "loss": 0.8326, + "step": 19734 + }, + { + "epoch": 0.9842892768079801, + "grad_norm": 0.5834244266080422, + "learning_rate": 3.2564148501115776e-08, + "loss": 0.8321, + "step": 19735 + }, + { + "epoch": 0.9843391521197008, + "grad_norm": 0.602309521793914, + "learning_rate": 3.235841680862273e-08, + "loss": 0.8588, + "step": 19736 + }, + { + "epoch": 0.9843890274314214, + "grad_norm": 0.6708542536415026, + "learning_rate": 3.215333663555975e-08, + "loss": 0.797, + "step": 19737 + }, + { + "epoch": 0.9844389027431422, + "grad_norm": 0.8447138781732418, + "learning_rate": 3.1948907987280895e-08, + "loss": 0.8853, + "step": 19738 + }, + { + "epoch": 0.9844887780548628, + "grad_norm": 0.5807933356206082, + "learning_rate": 3.1745130869123566e-08, + "loss": 0.8057, + "step": 19739 + }, + { + "epoch": 0.9845386533665835, + "grad_norm": 0.8797915554321216, + "learning_rate": 3.154200528640017e-08, + "loss": 0.8221, + "step": 19740 + }, + { + "epoch": 0.9845885286783043, + "grad_norm": 0.6073395949618726, + "learning_rate": 3.13395312444148e-08, + "loss": 0.8151, + "step": 19741 + }, + { + "epoch": 0.9846384039900249, + "grad_norm": 0.6830522668073119, + "learning_rate": 3.113770874844935e-08, + "loss": 0.861, + "step": 19742 + }, + { + "epoch": 0.9846882793017456, + "grad_norm": 0.552766841123452, + "learning_rate": 3.0936537803771814e-08, + "loss": 0.8557, + "step": 19743 + }, + { + "epoch": 0.9847381546134664, + "grad_norm": 0.7852592250449405, + "learning_rate": 3.0736018415630786e-08, + "loss": 0.8205, + "step": 19744 + }, + { + "epoch": 0.984788029925187, + "grad_norm": 0.57102528818312, + "learning_rate": 3.053615058925818e-08, + "loss": 0.8682, + "step": 19745 + }, + { + "epoch": 0.9848379052369077, + "grad_norm": 0.582075516309123, + "learning_rate": 3.033693432986928e-08, + "loss": 0.8214, + "step": 19746 + }, + { + "epoch": 0.9848877805486285, + "grad_norm": 0.5618463806746103, + "learning_rate": 3.01383696426627e-08, + "loss": 0.7924, + "step": 19747 + }, + { + "epoch": 0.9849376558603491, + "grad_norm": 0.7280003903994278, + "learning_rate": 2.994045653282318e-08, + "loss": 0.8159, + "step": 19748 + }, + { + "epoch": 0.9849875311720698, + "grad_norm": 0.5866614345442818, + "learning_rate": 2.9743195005510482e-08, + "loss": 0.786, + "step": 19749 + }, + { + "epoch": 0.9850374064837906, + "grad_norm": 0.5930810522924062, + "learning_rate": 2.9546585065873266e-08, + "loss": 0.8236, + "step": 19750 + }, + { + "epoch": 0.9850872817955112, + "grad_norm": 0.7379829951967416, + "learning_rate": 2.935062671904354e-08, + "loss": 0.8547, + "step": 19751 + }, + { + "epoch": 0.9851371571072319, + "grad_norm": 0.7846882878754327, + "learning_rate": 2.9155319970131102e-08, + "loss": 0.8526, + "step": 19752 + }, + { + "epoch": 0.9851870324189527, + "grad_norm": 0.5697126340572123, + "learning_rate": 2.8960664824237427e-08, + "loss": 0.8382, + "step": 19753 + }, + { + "epoch": 0.9852369077306733, + "grad_norm": 0.5917893011587108, + "learning_rate": 2.876666128643901e-08, + "loss": 0.8501, + "step": 19754 + }, + { + "epoch": 0.985286783042394, + "grad_norm": 0.6559734293003057, + "learning_rate": 2.857330936180125e-08, + "loss": 0.8138, + "step": 19755 + }, + { + "epoch": 0.9853366583541147, + "grad_norm": 0.5759174360591545, + "learning_rate": 2.8380609055364547e-08, + "loss": 0.8266, + "step": 19756 + }, + { + "epoch": 0.9853865336658354, + "grad_norm": 0.4790563985039218, + "learning_rate": 2.8188560372161e-08, + "loss": 0.8191, + "step": 19757 + }, + { + "epoch": 0.9854364089775561, + "grad_norm": 0.6049426511568191, + "learning_rate": 2.799716331720048e-08, + "loss": 0.818, + "step": 19758 + }, + { + "epoch": 0.9854862842892768, + "grad_norm": 0.5989020384450168, + "learning_rate": 2.7806417895478998e-08, + "loss": 0.8403, + "step": 19759 + }, + { + "epoch": 0.9855361596009975, + "grad_norm": 0.5766122246714631, + "learning_rate": 2.761632411197035e-08, + "loss": 0.7978, + "step": 19760 + }, + { + "epoch": 0.9855860349127182, + "grad_norm": 0.569608606641355, + "learning_rate": 2.7426881971640007e-08, + "loss": 0.8591, + "step": 19761 + }, + { + "epoch": 0.9856359102244389, + "grad_norm": 0.7191738691084391, + "learning_rate": 2.723809147942846e-08, + "loss": 0.8016, + "step": 19762 + }, + { + "epoch": 0.9856857855361596, + "grad_norm": 0.6120251726077728, + "learning_rate": 2.7049952640262334e-08, + "loss": 0.846, + "step": 19763 + }, + { + "epoch": 0.9857356608478803, + "grad_norm": 0.8576448875094372, + "learning_rate": 2.686246545905158e-08, + "loss": 0.8437, + "step": 19764 + }, + { + "epoch": 0.985785536159601, + "grad_norm": 0.610069080048981, + "learning_rate": 2.6675629940689507e-08, + "loss": 0.8175, + "step": 19765 + }, + { + "epoch": 0.9858354114713217, + "grad_norm": 1.3142299717414672, + "learning_rate": 2.6489446090047222e-08, + "loss": 0.8771, + "step": 19766 + }, + { + "epoch": 0.9858852867830424, + "grad_norm": 4.1304559972777115, + "learning_rate": 2.6303913911990273e-08, + "loss": 0.8133, + "step": 19767 + }, + { + "epoch": 0.9859351620947631, + "grad_norm": 0.4700657629443367, + "learning_rate": 2.6119033411350914e-08, + "loss": 0.8592, + "step": 19768 + }, + { + "epoch": 0.9859850374064838, + "grad_norm": 0.6579835542202451, + "learning_rate": 2.5934804592964158e-08, + "loss": 0.8605, + "step": 19769 + }, + { + "epoch": 0.9860349127182045, + "grad_norm": 0.6256512917767942, + "learning_rate": 2.575122746162617e-08, + "loss": 0.8493, + "step": 19770 + }, + { + "epoch": 0.9860847880299252, + "grad_norm": 0.6008669223161945, + "learning_rate": 2.5568302022135894e-08, + "loss": 0.8368, + "step": 19771 + }, + { + "epoch": 0.9861346633416459, + "grad_norm": 0.9078197114373141, + "learning_rate": 2.5386028279264505e-08, + "loss": 0.8142, + "step": 19772 + }, + { + "epoch": 0.9861845386533666, + "grad_norm": 0.6237492062682131, + "learning_rate": 2.520440623776654e-08, + "loss": 0.8322, + "step": 19773 + }, + { + "epoch": 0.9862344139650873, + "grad_norm": 0.6395370427220789, + "learning_rate": 2.5023435902379878e-08, + "loss": 0.8566, + "step": 19774 + }, + { + "epoch": 0.986284289276808, + "grad_norm": 1.4444347970508318, + "learning_rate": 2.4843117277831284e-08, + "loss": 0.8758, + "step": 19775 + }, + { + "epoch": 0.9863341645885286, + "grad_norm": 0.6130226506465306, + "learning_rate": 2.4663450368825335e-08, + "loss": 0.8184, + "step": 19776 + }, + { + "epoch": 0.9863840399002494, + "grad_norm": 0.6436437877868426, + "learning_rate": 2.4484435180047172e-08, + "loss": 0.8527, + "step": 19777 + }, + { + "epoch": 0.9864339152119701, + "grad_norm": 0.6582461360025063, + "learning_rate": 2.430607171617083e-08, + "loss": 0.8222, + "step": 19778 + }, + { + "epoch": 0.9864837905236907, + "grad_norm": 0.582418653442051, + "learning_rate": 2.4128359981850924e-08, + "loss": 0.8376, + "step": 19779 + }, + { + "epoch": 0.9865336658354115, + "grad_norm": 0.5599423679374909, + "learning_rate": 2.395129998172263e-08, + "loss": 0.8628, + "step": 19780 + }, + { + "epoch": 0.9865835411471322, + "grad_norm": 0.5613513964802426, + "learning_rate": 2.3774891720407257e-08, + "loss": 0.8391, + "step": 19781 + }, + { + "epoch": 0.9866334164588528, + "grad_norm": 0.8172860775039046, + "learning_rate": 2.359913520250945e-08, + "loss": 0.8674, + "step": 19782 + }, + { + "epoch": 0.9866832917705736, + "grad_norm": 0.7612381122407421, + "learning_rate": 2.342403043261443e-08, + "loss": 0.8328, + "step": 19783 + }, + { + "epoch": 0.9867331670822943, + "grad_norm": 0.8491382502469925, + "learning_rate": 2.3249577415287992e-08, + "loss": 0.8214, + "step": 19784 + }, + { + "epoch": 0.9867830423940149, + "grad_norm": 1.1945170249307995, + "learning_rate": 2.307577615509038e-08, + "loss": 0.8312, + "step": 19785 + }, + { + "epoch": 0.9868329177057357, + "grad_norm": 0.5847238654295613, + "learning_rate": 2.2902626656548522e-08, + "loss": 0.8582, + "step": 19786 + }, + { + "epoch": 0.9868827930174564, + "grad_norm": 0.5912133943434991, + "learning_rate": 2.2730128924186578e-08, + "loss": 0.8474, + "step": 19787 + }, + { + "epoch": 0.986932668329177, + "grad_norm": 0.8183772471393159, + "learning_rate": 2.2558282962500955e-08, + "loss": 0.8307, + "step": 19788 + }, + { + "epoch": 0.9869825436408978, + "grad_norm": 0.6018724068910792, + "learning_rate": 2.2387088775979726e-08, + "loss": 0.8507, + "step": 19789 + }, + { + "epoch": 0.9870324189526185, + "grad_norm": 0.6380936808543424, + "learning_rate": 2.2216546369088764e-08, + "loss": 0.8328, + "step": 19790 + }, + { + "epoch": 0.9870822942643391, + "grad_norm": 0.6742716296352285, + "learning_rate": 2.2046655746280064e-08, + "loss": 0.8614, + "step": 19791 + }, + { + "epoch": 0.9871321695760599, + "grad_norm": 0.6334611658652786, + "learning_rate": 2.187741691198342e-08, + "loss": 0.8424, + "step": 19792 + }, + { + "epoch": 0.9871820448877805, + "grad_norm": 0.5522603211314029, + "learning_rate": 2.170882987061751e-08, + "loss": 0.8077, + "step": 19793 + }, + { + "epoch": 0.9872319201995012, + "grad_norm": 0.6887755145088084, + "learning_rate": 2.1540894626581598e-08, + "loss": 0.8499, + "step": 19794 + }, + { + "epoch": 0.987281795511222, + "grad_norm": 0.5483829862298856, + "learning_rate": 2.1373611184255516e-08, + "loss": 0.8347, + "step": 19795 + }, + { + "epoch": 0.9873316708229426, + "grad_norm": 0.636634167200859, + "learning_rate": 2.120697954800799e-08, + "loss": 0.8583, + "step": 19796 + }, + { + "epoch": 0.9873815461346633, + "grad_norm": 0.6234578140929149, + "learning_rate": 2.1040999722182765e-08, + "loss": 0.876, + "step": 19797 + }, + { + "epoch": 0.9874314214463841, + "grad_norm": 0.5631039337100239, + "learning_rate": 2.0875671711115263e-08, + "loss": 0.8172, + "step": 19798 + }, + { + "epoch": 0.9874812967581047, + "grad_norm": 0.7018007774285016, + "learning_rate": 2.0710995519118704e-08, + "loss": 0.8575, + "step": 19799 + }, + { + "epoch": 0.9875311720698254, + "grad_norm": 0.5238268138149514, + "learning_rate": 2.054697115048687e-08, + "loss": 0.8134, + "step": 19800 + }, + { + "epoch": 0.9875810473815462, + "grad_norm": 0.5985299905015806, + "learning_rate": 2.038359860950523e-08, + "loss": 0.841, + "step": 19801 + }, + { + "epoch": 0.9876309226932668, + "grad_norm": 0.5915271818312583, + "learning_rate": 2.022087790043148e-08, + "loss": 0.8681, + "step": 19802 + }, + { + "epoch": 0.9876807980049875, + "grad_norm": 0.6122334253064138, + "learning_rate": 2.0058809027515e-08, + "loss": 0.8137, + "step": 19803 + }, + { + "epoch": 0.9877306733167083, + "grad_norm": 0.5228678233971706, + "learning_rate": 1.989739199498575e-08, + "loss": 0.8104, + "step": 19804 + }, + { + "epoch": 0.9877805486284289, + "grad_norm": 0.6280017801260789, + "learning_rate": 1.9736626807054236e-08, + "loss": 0.8267, + "step": 19805 + }, + { + "epoch": 0.9878304239401496, + "grad_norm": 0.8448668868585854, + "learning_rate": 1.957651346791711e-08, + "loss": 0.8363, + "step": 19806 + }, + { + "epoch": 0.9878802992518704, + "grad_norm": 0.6037793852434163, + "learning_rate": 1.9417051981748813e-08, + "loss": 0.7879, + "step": 19807 + }, + { + "epoch": 0.987930174563591, + "grad_norm": 0.5816422993404825, + "learning_rate": 1.9258242352712676e-08, + "loss": 0.8024, + "step": 19808 + }, + { + "epoch": 0.9879800498753117, + "grad_norm": 0.6221643249311981, + "learning_rate": 1.9100084584952605e-08, + "loss": 0.8264, + "step": 19809 + }, + { + "epoch": 0.9880299251870324, + "grad_norm": 0.5757737954292982, + "learning_rate": 1.8942578682595858e-08, + "loss": 0.8339, + "step": 19810 + }, + { + "epoch": 0.9880798004987531, + "grad_norm": 0.5735950673352476, + "learning_rate": 1.8785724649753033e-08, + "loss": 0.8263, + "step": 19811 + }, + { + "epoch": 0.9881296758104738, + "grad_norm": 0.6773588648024839, + "learning_rate": 1.8629522490518082e-08, + "loss": 0.8122, + "step": 19812 + }, + { + "epoch": 0.9881795511221945, + "grad_norm": 0.5893582746606815, + "learning_rate": 1.8473972208962742e-08, + "loss": 0.8562, + "step": 19813 + }, + { + "epoch": 0.9882294264339152, + "grad_norm": 1.038764632036832, + "learning_rate": 1.8319073809150435e-08, + "loss": 0.7797, + "step": 19814 + }, + { + "epoch": 0.9882793017456359, + "grad_norm": 0.6334666915410608, + "learning_rate": 1.8164827295122368e-08, + "loss": 0.8252, + "step": 19815 + }, + { + "epoch": 0.9883291770573566, + "grad_norm": 1.2579168478273497, + "learning_rate": 1.80112326709031e-08, + "loss": 0.8574, + "step": 19816 + }, + { + "epoch": 0.9883790523690773, + "grad_norm": 0.646278465348065, + "learning_rate": 1.7858289940497762e-08, + "loss": 0.8886, + "step": 19817 + }, + { + "epoch": 0.988428927680798, + "grad_norm": 0.5858560886134279, + "learning_rate": 1.7705999107900383e-08, + "loss": 0.8061, + "step": 19818 + }, + { + "epoch": 0.9884788029925187, + "grad_norm": 0.5588368629573478, + "learning_rate": 1.755436017708556e-08, + "loss": 0.796, + "step": 19819 + }, + { + "epoch": 0.9885286783042394, + "grad_norm": 0.5771215535193432, + "learning_rate": 1.7403373152008463e-08, + "loss": 0.8205, + "step": 19820 + }, + { + "epoch": 0.9885785536159601, + "grad_norm": 0.5732940122723408, + "learning_rate": 1.725303803661038e-08, + "loss": 0.8478, + "step": 19821 + }, + { + "epoch": 0.9886284289276808, + "grad_norm": 0.5169078221614926, + "learning_rate": 1.7103354834810402e-08, + "loss": 0.8319, + "step": 19822 + }, + { + "epoch": 0.9886783042394015, + "grad_norm": 0.584102714050646, + "learning_rate": 1.695432355051929e-08, + "loss": 0.8279, + "step": 19823 + }, + { + "epoch": 0.9887281795511222, + "grad_norm": 0.5155938971101092, + "learning_rate": 1.68059441876256e-08, + "loss": 0.7594, + "step": 19824 + }, + { + "epoch": 0.9887780548628429, + "grad_norm": 0.5694388764700226, + "learning_rate": 1.6658216749998456e-08, + "loss": 0.8354, + "step": 19825 + }, + { + "epoch": 0.9888279301745636, + "grad_norm": 0.5602507439750382, + "learning_rate": 1.6511141241495886e-08, + "loss": 0.8182, + "step": 19826 + }, + { + "epoch": 0.9888778054862843, + "grad_norm": 0.48035053645829867, + "learning_rate": 1.6364717665950935e-08, + "loss": 0.8288, + "step": 19827 + }, + { + "epoch": 0.988927680798005, + "grad_norm": 0.706231706747559, + "learning_rate": 1.621894602718832e-08, + "loss": 0.8075, + "step": 19828 + }, + { + "epoch": 0.9889775561097257, + "grad_norm": 0.6270921089443611, + "learning_rate": 1.607382632901333e-08, + "loss": 0.8078, + "step": 19829 + }, + { + "epoch": 0.9890274314214463, + "grad_norm": 0.6313370065146099, + "learning_rate": 1.5929358575206275e-08, + "loss": 0.849, + "step": 19830 + }, + { + "epoch": 0.9890773067331671, + "grad_norm": 0.8098121775129208, + "learning_rate": 1.5785542769544692e-08, + "loss": 0.8838, + "step": 19831 + }, + { + "epoch": 0.9891271820448878, + "grad_norm": 0.6943974139804777, + "learning_rate": 1.5642378915778356e-08, + "loss": 0.8428, + "step": 19832 + }, + { + "epoch": 0.9891770573566084, + "grad_norm": 0.6746356530609174, + "learning_rate": 1.549986701764039e-08, + "loss": 0.8907, + "step": 19833 + }, + { + "epoch": 0.9892269326683292, + "grad_norm": 0.7272882999899228, + "learning_rate": 1.535800707885282e-08, + "loss": 0.817, + "step": 19834 + }, + { + "epoch": 0.9892768079800499, + "grad_norm": 0.6354724005667431, + "learning_rate": 1.5216799103115464e-08, + "loss": 0.8769, + "step": 19835 + }, + { + "epoch": 0.9893266832917705, + "grad_norm": 0.7137592134971662, + "learning_rate": 1.507624309411426e-08, + "loss": 0.8875, + "step": 19836 + }, + { + "epoch": 0.9893765586034913, + "grad_norm": 0.5988497304001711, + "learning_rate": 1.49363390555185e-08, + "loss": 0.8678, + "step": 19837 + }, + { + "epoch": 0.989426433915212, + "grad_norm": 0.5937981395989372, + "learning_rate": 1.4797086990975262e-08, + "loss": 0.8272, + "step": 19838 + }, + { + "epoch": 0.9894763092269326, + "grad_norm": 0.6539717044569825, + "learning_rate": 1.4658486904120527e-08, + "loss": 0.8395, + "step": 19839 + }, + { + "epoch": 0.9895261845386534, + "grad_norm": 0.6305420480640499, + "learning_rate": 1.4520538798570849e-08, + "loss": 0.8467, + "step": 19840 + }, + { + "epoch": 0.9895760598503741, + "grad_norm": 0.8659227190832846, + "learning_rate": 1.4383242677926123e-08, + "loss": 0.8667, + "step": 19841 + }, + { + "epoch": 0.9896259351620947, + "grad_norm": 0.6033814395972301, + "learning_rate": 1.4246598545766821e-08, + "loss": 0.812, + "step": 19842 + }, + { + "epoch": 0.9896758104738155, + "grad_norm": 0.6760385303834017, + "learning_rate": 1.4110606405662308e-08, + "loss": 0.8727, + "step": 19843 + }, + { + "epoch": 0.9897256857855362, + "grad_norm": 0.5809255098217562, + "learning_rate": 1.3975266261156971e-08, + "loss": 0.8604, + "step": 19844 + }, + { + "epoch": 0.9897755610972568, + "grad_norm": 0.5959997206406354, + "learning_rate": 1.3840578115786874e-08, + "loss": 0.804, + "step": 19845 + }, + { + "epoch": 0.9898254364089776, + "grad_norm": 0.672716907467087, + "learning_rate": 1.3706541973063091e-08, + "loss": 0.8863, + "step": 19846 + }, + { + "epoch": 0.9898753117206982, + "grad_norm": 0.6399706265181376, + "learning_rate": 1.3573157836485606e-08, + "loss": 0.8119, + "step": 19847 + }, + { + "epoch": 0.9899251870324189, + "grad_norm": 0.7331887751860267, + "learning_rate": 1.3440425709534965e-08, + "loss": 0.8422, + "step": 19848 + }, + { + "epoch": 0.9899750623441397, + "grad_norm": 0.7248287144990573, + "learning_rate": 1.3308345595672289e-08, + "loss": 0.8507, + "step": 19849 + }, + { + "epoch": 0.9900249376558603, + "grad_norm": 0.5909466807104267, + "learning_rate": 1.3176917498347596e-08, + "loss": 0.8119, + "step": 19850 + }, + { + "epoch": 0.990074812967581, + "grad_norm": 0.6549453140457739, + "learning_rate": 1.3046141420985925e-08, + "loss": 0.8229, + "step": 19851 + }, + { + "epoch": 0.9901246882793018, + "grad_norm": 0.5771343875525039, + "learning_rate": 1.2916017367003986e-08, + "loss": 0.8007, + "step": 19852 + }, + { + "epoch": 0.9901745635910224, + "grad_norm": 0.6411214524904395, + "learning_rate": 1.2786545339796286e-08, + "loss": 0.8979, + "step": 19853 + }, + { + "epoch": 0.9902244389027431, + "grad_norm": 0.6046651200854555, + "learning_rate": 1.2657725342740679e-08, + "loss": 0.8288, + "step": 19854 + }, + { + "epoch": 0.9902743142144639, + "grad_norm": 0.5606183784439223, + "learning_rate": 1.2529557379198365e-08, + "loss": 0.7754, + "step": 19855 + }, + { + "epoch": 0.9903241895261845, + "grad_norm": 0.5440905095018693, + "learning_rate": 1.2402041452513891e-08, + "loss": 0.8223, + "step": 19856 + }, + { + "epoch": 0.9903740648379052, + "grad_norm": 0.6714720494763615, + "learning_rate": 1.2275177566017926e-08, + "loss": 0.833, + "step": 19857 + }, + { + "epoch": 0.990423940149626, + "grad_norm": 0.7139435347660978, + "learning_rate": 1.2148965723016159e-08, + "loss": 0.8113, + "step": 19858 + }, + { + "epoch": 0.9904738154613466, + "grad_norm": 0.516807287691466, + "learning_rate": 1.2023405926803178e-08, + "loss": 0.8268, + "step": 19859 + }, + { + "epoch": 0.9905236907730673, + "grad_norm": 0.6231541257625552, + "learning_rate": 1.1898498180656914e-08, + "loss": 0.8412, + "step": 19860 + }, + { + "epoch": 0.9905735660847881, + "grad_norm": 0.8974851588969398, + "learning_rate": 1.1774242487835874e-08, + "loss": 0.8614, + "step": 19861 + }, + { + "epoch": 0.9906234413965087, + "grad_norm": 0.5974544673716052, + "learning_rate": 1.1650638851584684e-08, + "loss": 0.8323, + "step": 19862 + }, + { + "epoch": 0.9906733167082294, + "grad_norm": 0.6120950652417031, + "learning_rate": 1.1527687275122988e-08, + "loss": 0.8127, + "step": 19863 + }, + { + "epoch": 0.9907231920199501, + "grad_norm": 0.5708122799621557, + "learning_rate": 1.1405387761664887e-08, + "loss": 0.8103, + "step": 19864 + }, + { + "epoch": 0.9907730673316708, + "grad_norm": 0.5498104475174762, + "learning_rate": 1.1283740314399494e-08, + "loss": 0.8162, + "step": 19865 + }, + { + "epoch": 0.9908229426433915, + "grad_norm": 0.6925881824984979, + "learning_rate": 1.1162744936502045e-08, + "loss": 0.8112, + "step": 19866 + }, + { + "epoch": 0.9908728179551122, + "grad_norm": 0.7860333366870919, + "learning_rate": 1.1042401631131127e-08, + "loss": 0.8586, + "step": 19867 + }, + { + "epoch": 0.9909226932668329, + "grad_norm": 1.0627685640110816, + "learning_rate": 1.092271040142312e-08, + "loss": 0.845, + "step": 19868 + }, + { + "epoch": 0.9909725685785536, + "grad_norm": 0.5749894158381317, + "learning_rate": 1.08036712505033e-08, + "loss": 0.8492, + "step": 19869 + }, + { + "epoch": 0.9910224438902743, + "grad_norm": 0.5807201586006652, + "learning_rate": 1.0685284181477518e-08, + "loss": 0.8163, + "step": 19870 + }, + { + "epoch": 0.991072319201995, + "grad_norm": 0.5274241793122554, + "learning_rate": 1.0567549197434967e-08, + "loss": 0.8322, + "step": 19871 + }, + { + "epoch": 0.9911221945137157, + "grad_norm": 0.8209731213886893, + "learning_rate": 1.0450466301448192e-08, + "loss": 0.828, + "step": 19872 + }, + { + "epoch": 0.9911720698254364, + "grad_norm": 0.8385280268020744, + "learning_rate": 1.0334035496575856e-08, + "loss": 0.8, + "step": 19873 + }, + { + "epoch": 0.9912219451371571, + "grad_norm": 0.6940187026006946, + "learning_rate": 1.0218256785851643e-08, + "loss": 0.8512, + "step": 19874 + }, + { + "epoch": 0.9912718204488778, + "grad_norm": 0.6067719604123557, + "learning_rate": 1.010313017229536e-08, + "loss": 0.8255, + "step": 19875 + }, + { + "epoch": 0.9913216957605985, + "grad_norm": 0.6659445297723674, + "learning_rate": 9.988655658915713e-09, + "loss": 0.8619, + "step": 19876 + }, + { + "epoch": 0.9913715710723192, + "grad_norm": 0.6710327465205017, + "learning_rate": 9.8748332486992e-09, + "loss": 0.8692, + "step": 19877 + }, + { + "epoch": 0.99142144638404, + "grad_norm": 0.7272781831577971, + "learning_rate": 9.761662944612893e-09, + "loss": 0.804, + "step": 19878 + }, + { + "epoch": 0.9914713216957606, + "grad_norm": 0.7033216954996401, + "learning_rate": 9.64914474961276e-09, + "loss": 0.8282, + "step": 19879 + }, + { + "epoch": 0.9915211970074813, + "grad_norm": 0.5181594623743849, + "learning_rate": 9.537278666635342e-09, + "loss": 0.8401, + "step": 19880 + }, + { + "epoch": 0.9915710723192019, + "grad_norm": 0.7315060885658512, + "learning_rate": 9.426064698594971e-09, + "loss": 0.8074, + "step": 19881 + }, + { + "epoch": 0.9916209476309227, + "grad_norm": 0.5994745774132385, + "learning_rate": 9.315502848400436e-09, + "loss": 0.8418, + "step": 19882 + }, + { + "epoch": 0.9916708229426434, + "grad_norm": 0.544394037719406, + "learning_rate": 9.20559311892999e-09, + "loss": 0.7736, + "step": 19883 + }, + { + "epoch": 0.991720698254364, + "grad_norm": 0.8169369093689525, + "learning_rate": 9.096335513059106e-09, + "loss": 0.8332, + "step": 19884 + }, + { + "epoch": 0.9917705735660848, + "grad_norm": 0.7360878926835203, + "learning_rate": 8.987730033632736e-09, + "loss": 0.7995, + "step": 19885 + }, + { + "epoch": 0.9918204488778055, + "grad_norm": 0.8158949494862543, + "learning_rate": 8.879776683487495e-09, + "loss": 0.8659, + "step": 19886 + }, + { + "epoch": 0.9918703241895261, + "grad_norm": 1.2577246054870328, + "learning_rate": 8.772475465440578e-09, + "loss": 0.8522, + "step": 19887 + }, + { + "epoch": 0.9919201995012469, + "grad_norm": 0.6849353698043414, + "learning_rate": 8.665826382289743e-09, + "loss": 0.8276, + "step": 19888 + }, + { + "epoch": 0.9919700748129676, + "grad_norm": 0.5847061792909073, + "learning_rate": 8.559829436818878e-09, + "loss": 0.8335, + "step": 19889 + }, + { + "epoch": 0.9920199501246882, + "grad_norm": 0.5977878958780002, + "learning_rate": 8.454484631797987e-09, + "loss": 0.8196, + "step": 19890 + }, + { + "epoch": 0.992069825436409, + "grad_norm": 0.5246241531874979, + "learning_rate": 8.349791969969322e-09, + "loss": 0.8365, + "step": 19891 + }, + { + "epoch": 0.9921197007481297, + "grad_norm": 0.6149884281278328, + "learning_rate": 8.245751454069584e-09, + "loss": 0.8328, + "step": 19892 + }, + { + "epoch": 0.9921695760598503, + "grad_norm": 0.7177947824836302, + "learning_rate": 8.142363086810489e-09, + "loss": 0.8423, + "step": 19893 + }, + { + "epoch": 0.9922194513715711, + "grad_norm": 0.6452605993606292, + "learning_rate": 8.039626870895433e-09, + "loss": 0.8327, + "step": 19894 + }, + { + "epoch": 0.9922693266832918, + "grad_norm": 0.6310478470190767, + "learning_rate": 7.937542808997278e-09, + "loss": 0.8689, + "step": 19895 + }, + { + "epoch": 0.9923192019950124, + "grad_norm": 0.6186088201322762, + "learning_rate": 7.836110903786109e-09, + "loss": 0.8125, + "step": 19896 + }, + { + "epoch": 0.9923690773067332, + "grad_norm": 0.9353019457056261, + "learning_rate": 7.735331157904258e-09, + "loss": 0.8187, + "step": 19897 + }, + { + "epoch": 0.9924189526184539, + "grad_norm": 0.5268153002262037, + "learning_rate": 7.635203573985728e-09, + "loss": 0.837, + "step": 19898 + }, + { + "epoch": 0.9924688279301745, + "grad_norm": 0.6173803311247363, + "learning_rate": 7.535728154642319e-09, + "loss": 0.8815, + "step": 19899 + }, + { + "epoch": 0.9925187032418953, + "grad_norm": 1.5584779267692745, + "learning_rate": 7.436904902469177e-09, + "loss": 0.8564, + "step": 19900 + }, + { + "epoch": 0.9925685785536159, + "grad_norm": 0.6681188667089212, + "learning_rate": 7.338733820042021e-09, + "loss": 0.8159, + "step": 19901 + }, + { + "epoch": 0.9926184538653366, + "grad_norm": 0.8909087634005046, + "learning_rate": 7.24121490992824e-09, + "loss": 0.7787, + "step": 19902 + }, + { + "epoch": 0.9926683291770574, + "grad_norm": 0.6990978940623099, + "learning_rate": 7.14434817466747e-09, + "loss": 0.8405, + "step": 19903 + }, + { + "epoch": 0.992718204488778, + "grad_norm": 0.5815623104517489, + "learning_rate": 7.0481336167910195e-09, + "loss": 0.8477, + "step": 19904 + }, + { + "epoch": 0.9927680798004987, + "grad_norm": 0.5803677842495022, + "learning_rate": 6.9525712388052164e-09, + "loss": 0.8361, + "step": 19905 + }, + { + "epoch": 0.9928179551122195, + "grad_norm": 0.5913189740124816, + "learning_rate": 6.857661043210839e-09, + "loss": 0.8037, + "step": 19906 + }, + { + "epoch": 0.9928678304239401, + "grad_norm": 0.8812105071434884, + "learning_rate": 6.763403032478133e-09, + "loss": 0.8348, + "step": 19907 + }, + { + "epoch": 0.9929177057356608, + "grad_norm": 0.724072231138895, + "learning_rate": 6.669797209069017e-09, + "loss": 0.8109, + "step": 19908 + }, + { + "epoch": 0.9929675810473816, + "grad_norm": 0.6091267147218931, + "learning_rate": 6.576843575425984e-09, + "loss": 0.8482, + "step": 19909 + }, + { + "epoch": 0.9930174563591022, + "grad_norm": 0.5867512290088546, + "learning_rate": 6.484542133974869e-09, + "loss": 0.8474, + "step": 19910 + }, + { + "epoch": 0.993067331670823, + "grad_norm": 1.4703898938484161, + "learning_rate": 6.392892887122082e-09, + "loss": 0.8122, + "step": 19911 + }, + { + "epoch": 0.9931172069825437, + "grad_norm": 0.6571901728380495, + "learning_rate": 6.301895837262928e-09, + "loss": 0.8176, + "step": 19912 + }, + { + "epoch": 0.9931670822942643, + "grad_norm": 0.6455364298494559, + "learning_rate": 6.211550986770509e-09, + "loss": 0.8449, + "step": 19913 + }, + { + "epoch": 0.993216957605985, + "grad_norm": 0.5873884259552488, + "learning_rate": 6.121858338001274e-09, + "loss": 0.8287, + "step": 19914 + }, + { + "epoch": 0.9932668329177058, + "grad_norm": 0.8587720300403067, + "learning_rate": 6.032817893297793e-09, + "loss": 0.833, + "step": 19915 + }, + { + "epoch": 0.9933167082294264, + "grad_norm": 0.6029121260488893, + "learning_rate": 5.944429654980433e-09, + "loss": 0.8649, + "step": 19916 + }, + { + "epoch": 0.9933665835411472, + "grad_norm": 1.2388570010603184, + "learning_rate": 5.856693625358456e-09, + "loss": 0.8571, + "step": 19917 + }, + { + "epoch": 0.9934164588528678, + "grad_norm": 0.602327430641721, + "learning_rate": 5.769609806718923e-09, + "loss": 0.8324, + "step": 19918 + }, + { + "epoch": 0.9934663341645885, + "grad_norm": 0.6809094560451006, + "learning_rate": 5.683178201335016e-09, + "loss": 0.811, + "step": 19919 + }, + { + "epoch": 0.9935162094763093, + "grad_norm": 0.8218437619833125, + "learning_rate": 5.597398811463261e-09, + "loss": 0.7929, + "step": 19920 + }, + { + "epoch": 0.9935660847880299, + "grad_norm": 0.6125696002328008, + "learning_rate": 5.512271639343536e-09, + "loss": 0.8053, + "step": 19921 + }, + { + "epoch": 0.9936159600997506, + "grad_norm": 0.5144697881913806, + "learning_rate": 5.427796687190734e-09, + "loss": 0.8432, + "step": 19922 + }, + { + "epoch": 0.9936658354114714, + "grad_norm": 0.738673705926294, + "learning_rate": 5.343973957216974e-09, + "loss": 0.8192, + "step": 19923 + }, + { + "epoch": 0.993715710723192, + "grad_norm": 0.8129280477971551, + "learning_rate": 5.260803451603846e-09, + "loss": 0.8915, + "step": 19924 + }, + { + "epoch": 0.9937655860349127, + "grad_norm": 0.6336724729735737, + "learning_rate": 5.178285172527386e-09, + "loss": 0.8161, + "step": 19925 + }, + { + "epoch": 0.9938154613466335, + "grad_norm": 0.5861933530322325, + "learning_rate": 5.0964191221331e-09, + "loss": 0.8799, + "step": 19926 + }, + { + "epoch": 0.9938653366583541, + "grad_norm": 1.1225883853499423, + "learning_rate": 5.015205302563719e-09, + "loss": 0.8958, + "step": 19927 + }, + { + "epoch": 0.9939152119700748, + "grad_norm": 0.6919575672755178, + "learning_rate": 4.934643715936993e-09, + "loss": 0.8105, + "step": 19928 + }, + { + "epoch": 0.9939650872817956, + "grad_norm": 0.7163294471866262, + "learning_rate": 4.854734364351244e-09, + "loss": 0.8138, + "step": 19929 + }, + { + "epoch": 0.9940149625935162, + "grad_norm": 0.6251460102119877, + "learning_rate": 4.775477249896465e-09, + "loss": 0.8624, + "step": 19930 + }, + { + "epoch": 0.9940648379052369, + "grad_norm": 0.5751310039811325, + "learning_rate": 4.696872374640449e-09, + "loss": 0.8479, + "step": 19931 + }, + { + "epoch": 0.9941147132169577, + "grad_norm": 0.7970324536839567, + "learning_rate": 4.618919740631556e-09, + "loss": 0.8496, + "step": 19932 + }, + { + "epoch": 0.9941645885286783, + "grad_norm": 0.5404655315642494, + "learning_rate": 4.5416193499070444e-09, + "loss": 0.8506, + "step": 19933 + }, + { + "epoch": 0.994214463840399, + "grad_norm": 0.5353919770310626, + "learning_rate": 4.464971204481971e-09, + "loss": 0.8156, + "step": 19934 + }, + { + "epoch": 0.9942643391521196, + "grad_norm": 0.6490820629058465, + "learning_rate": 4.3889753063575125e-09, + "loss": 0.8144, + "step": 19935 + }, + { + "epoch": 0.9943142144638404, + "grad_norm": 2.2942235563919713, + "learning_rate": 4.31363165751264e-09, + "loss": 0.8116, + "step": 19936 + }, + { + "epoch": 0.9943640897755611, + "grad_norm": 0.5506164287737016, + "learning_rate": 4.2389402599207765e-09, + "loss": 0.7916, + "step": 19937 + }, + { + "epoch": 0.9944139650872817, + "grad_norm": 0.6076642556866678, + "learning_rate": 4.164901115524811e-09, + "loss": 0.8674, + "step": 19938 + }, + { + "epoch": 0.9944638403990025, + "grad_norm": 0.5258513813929205, + "learning_rate": 4.091514226262083e-09, + "loss": 0.8201, + "step": 19939 + }, + { + "epoch": 0.9945137157107232, + "grad_norm": 4.824976994253603, + "learning_rate": 4.0187795940421766e-09, + "loss": 0.786, + "step": 19940 + }, + { + "epoch": 0.9945635910224438, + "grad_norm": 0.5795600172628619, + "learning_rate": 3.9466972207663486e-09, + "loss": 0.8468, + "step": 19941 + }, + { + "epoch": 0.9946134663341646, + "grad_norm": 0.7290665662612075, + "learning_rate": 3.8752671083136514e-09, + "loss": 0.8639, + "step": 19942 + }, + { + "epoch": 0.9946633416458853, + "grad_norm": 0.6352832328135073, + "learning_rate": 3.80448925854926e-09, + "loss": 0.8366, + "step": 19943 + }, + { + "epoch": 0.994713216957606, + "grad_norm": 0.5631335755121418, + "learning_rate": 3.734363673321695e-09, + "loss": 0.8752, + "step": 19944 + }, + { + "epoch": 0.9947630922693267, + "grad_norm": 0.524823035569132, + "learning_rate": 3.6648903544572733e-09, + "loss": 0.8001, + "step": 19945 + }, + { + "epoch": 0.9948129675810474, + "grad_norm": 0.9026579556577484, + "learning_rate": 3.59606930377121e-09, + "loss": 0.8735, + "step": 19946 + }, + { + "epoch": 0.994862842892768, + "grad_norm": 0.549571349307706, + "learning_rate": 3.5279005230565155e-09, + "loss": 0.8784, + "step": 19947 + }, + { + "epoch": 0.9949127182044888, + "grad_norm": 0.5139494885063923, + "learning_rate": 3.460384014097873e-09, + "loss": 0.7981, + "step": 19948 + }, + { + "epoch": 0.9949625935162095, + "grad_norm": 1.8348947608043404, + "learning_rate": 3.39351977865221e-09, + "loss": 0.8322, + "step": 19949 + }, + { + "epoch": 0.9950124688279302, + "grad_norm": 0.7405986323883851, + "learning_rate": 3.327307818465353e-09, + "loss": 0.8147, + "step": 19950 + }, + { + "epoch": 0.9950623441396509, + "grad_norm": 0.7163616165703632, + "learning_rate": 3.261748135263698e-09, + "loss": 0.8219, + "step": 19951 + }, + { + "epoch": 0.9951122194513715, + "grad_norm": 0.5652099157459992, + "learning_rate": 3.1968407307625402e-09, + "loss": 0.8533, + "step": 19952 + }, + { + "epoch": 0.9951620947630923, + "grad_norm": 0.6466896599321111, + "learning_rate": 3.132585606652194e-09, + "loss": 0.8145, + "step": 19953 + }, + { + "epoch": 0.995211970074813, + "grad_norm": 0.6882320552812768, + "learning_rate": 3.0689827646090965e-09, + "loss": 0.8348, + "step": 19954 + }, + { + "epoch": 0.9952618453865336, + "grad_norm": 0.8432335963102467, + "learning_rate": 3.00603220629303e-09, + "loss": 0.843, + "step": 19955 + }, + { + "epoch": 0.9953117206982544, + "grad_norm": 0.7632508972214437, + "learning_rate": 2.943733933347126e-09, + "loss": 0.7924, + "step": 19956 + }, + { + "epoch": 0.9953615960099751, + "grad_norm": 0.5589896076483711, + "learning_rate": 2.882087947400636e-09, + "loss": 0.8081, + "step": 19957 + }, + { + "epoch": 0.9954114713216957, + "grad_norm": 0.5853400997304122, + "learning_rate": 2.8210942500550562e-09, + "loss": 0.8482, + "step": 19958 + }, + { + "epoch": 0.9954613466334165, + "grad_norm": 0.6065878129593779, + "learning_rate": 2.7607528429063332e-09, + "loss": 0.8323, + "step": 19959 + }, + { + "epoch": 0.9955112219451372, + "grad_norm": 0.5679048405112688, + "learning_rate": 2.7010637275309834e-09, + "loss": 0.8463, + "step": 19960 + }, + { + "epoch": 0.9955610972568578, + "grad_norm": 0.5704634783867732, + "learning_rate": 2.642026905483319e-09, + "loss": 0.8428, + "step": 19961 + }, + { + "epoch": 0.9956109725685786, + "grad_norm": 0.6527722409399737, + "learning_rate": 2.5836423783037746e-09, + "loss": 0.8739, + "step": 19962 + }, + { + "epoch": 0.9956608478802993, + "grad_norm": 2.0450711801455363, + "learning_rate": 2.5259101475161305e-09, + "loss": 0.813, + "step": 19963 + }, + { + "epoch": 0.9957107231920199, + "grad_norm": 0.6337528140941348, + "learning_rate": 2.4688302146275155e-09, + "loss": 0.8097, + "step": 19964 + }, + { + "epoch": 0.9957605985037407, + "grad_norm": 0.5906873418214557, + "learning_rate": 2.4124025811284034e-09, + "loss": 0.7951, + "step": 19965 + }, + { + "epoch": 0.9958104738154614, + "grad_norm": 0.6099441201268891, + "learning_rate": 2.35662724848984e-09, + "loss": 0.8457, + "step": 19966 + }, + { + "epoch": 0.995860349127182, + "grad_norm": 0.6857643074635436, + "learning_rate": 2.3015042181662172e-09, + "loss": 0.855, + "step": 19967 + }, + { + "epoch": 0.9959102244389028, + "grad_norm": 0.6229920164253653, + "learning_rate": 2.2470334915980495e-09, + "loss": 0.827, + "step": 19968 + }, + { + "epoch": 0.9959600997506235, + "grad_norm": 0.5539833202972015, + "learning_rate": 2.1932150702064227e-09, + "loss": 0.8371, + "step": 19969 + }, + { + "epoch": 0.9960099750623441, + "grad_norm": 0.5146734630668928, + "learning_rate": 2.140048955395768e-09, + "loss": 0.8096, + "step": 19970 + }, + { + "epoch": 0.9960598503740649, + "grad_norm": 0.5894905184224442, + "learning_rate": 2.0875351485510895e-09, + "loss": 0.8185, + "step": 19971 + }, + { + "epoch": 0.9961097256857855, + "grad_norm": 0.632342674687323, + "learning_rate": 2.035673651046288e-09, + "loss": 0.8244, + "step": 19972 + }, + { + "epoch": 0.9961596009975062, + "grad_norm": 0.858357091900739, + "learning_rate": 1.98446446423306e-09, + "loss": 0.8635, + "step": 19973 + }, + { + "epoch": 0.996209476309227, + "grad_norm": 0.5266325058395636, + "learning_rate": 1.9339075894464485e-09, + "loss": 0.8612, + "step": 19974 + }, + { + "epoch": 0.9962593516209476, + "grad_norm": 0.6124853319681176, + "learning_rate": 1.8840030280076194e-09, + "loss": 0.8638, + "step": 19975 + }, + { + "epoch": 0.9963092269326683, + "grad_norm": 0.51915962522179, + "learning_rate": 1.8347507812183085e-09, + "loss": 0.871, + "step": 19976 + }, + { + "epoch": 0.9963591022443891, + "grad_norm": 0.8176273893298661, + "learning_rate": 1.7861508503635994e-09, + "loss": 0.8684, + "step": 19977 + }, + { + "epoch": 0.9964089775561097, + "grad_norm": 0.6653392459438994, + "learning_rate": 1.7382032367119217e-09, + "loss": 0.8441, + "step": 19978 + }, + { + "epoch": 0.9964588528678304, + "grad_norm": 0.6366302401768079, + "learning_rate": 1.6909079415122764e-09, + "loss": 0.8355, + "step": 19979 + }, + { + "epoch": 0.9965087281795512, + "grad_norm": 0.5752789879553086, + "learning_rate": 1.644264966002562e-09, + "loss": 0.8466, + "step": 19980 + }, + { + "epoch": 0.9965586034912718, + "grad_norm": 0.6612678660812772, + "learning_rate": 1.5982743113956978e-09, + "loss": 0.8183, + "step": 19981 + }, + { + "epoch": 0.9966084788029925, + "grad_norm": 0.700307410046946, + "learning_rate": 1.552935978896275e-09, + "loss": 0.8576, + "step": 19982 + }, + { + "epoch": 0.9966583541147133, + "grad_norm": 0.5783212454312238, + "learning_rate": 1.5082499696839059e-09, + "loss": 0.8479, + "step": 19983 + }, + { + "epoch": 0.9967082294264339, + "grad_norm": 0.7588331930378505, + "learning_rate": 1.4642162849271002e-09, + "loss": 0.8296, + "step": 19984 + }, + { + "epoch": 0.9967581047381546, + "grad_norm": 0.6250043235822664, + "learning_rate": 1.4208349257721632e-09, + "loss": 0.8371, + "step": 19985 + }, + { + "epoch": 0.9968079800498754, + "grad_norm": 0.5704451518044901, + "learning_rate": 1.3781058933515222e-09, + "loss": 0.822, + "step": 19986 + }, + { + "epoch": 0.996857855361596, + "grad_norm": 0.5367080288885453, + "learning_rate": 1.3360291887837274e-09, + "loss": 0.803, + "step": 19987 + }, + { + "epoch": 0.9969077306733167, + "grad_norm": 0.5274488053281817, + "learning_rate": 1.2946048131651235e-09, + "loss": 0.8115, + "step": 19988 + }, + { + "epoch": 0.9969576059850374, + "grad_norm": 0.5078035622286813, + "learning_rate": 1.2538327675726268e-09, + "loss": 0.7828, + "step": 19989 + }, + { + "epoch": 0.9970074812967581, + "grad_norm": 0.5347566909246372, + "learning_rate": 1.2137130530748275e-09, + "loss": 0.7991, + "step": 19990 + }, + { + "epoch": 0.9970573566084788, + "grad_norm": 0.5903347593532448, + "learning_rate": 1.1742456707181103e-09, + "loss": 0.8064, + "step": 19991 + }, + { + "epoch": 0.9971072319201995, + "grad_norm": 2.4229573741148513, + "learning_rate": 1.1354306215294319e-09, + "loss": 0.8461, + "step": 19992 + }, + { + "epoch": 0.9971571072319202, + "grad_norm": 1.311875207957357, + "learning_rate": 1.0972679065274216e-09, + "loss": 0.8836, + "step": 19993 + }, + { + "epoch": 0.9972069825436409, + "grad_norm": 0.6116940639230108, + "learning_rate": 1.059757526700178e-09, + "loss": 0.8519, + "step": 19994 + }, + { + "epoch": 0.9972568578553616, + "grad_norm": 0.6640163642652023, + "learning_rate": 1.022899483033024e-09, + "loss": 0.8381, + "step": 19995 + }, + { + "epoch": 0.9973067331670823, + "grad_norm": 0.6551300972885609, + "learning_rate": 9.866937764835271e-10, + "loss": 0.8274, + "step": 19996 + }, + { + "epoch": 0.997356608478803, + "grad_norm": 0.9473058144333818, + "learning_rate": 9.511404079981522e-10, + "loss": 0.8837, + "step": 19997 + }, + { + "epoch": 0.9974064837905237, + "grad_norm": 0.5539823379781664, + "learning_rate": 9.162393785067114e-10, + "loss": 0.8191, + "step": 19998 + }, + { + "epoch": 0.9974563591022444, + "grad_norm": 0.591093445142007, + "learning_rate": 8.819906889168117e-10, + "loss": 0.8187, + "step": 19999 + }, + { + "epoch": 0.9975062344139651, + "grad_norm": 0.6853776238341317, + "learning_rate": 8.483943401221827e-10, + "loss": 0.8608, + "step": 20000 + }, + { + "epoch": 0.9975561097256858, + "grad_norm": 0.5387217818286557, + "learning_rate": 8.154503329999008e-10, + "loss": 0.8505, + "step": 20001 + }, + { + "epoch": 0.9976059850374065, + "grad_norm": 0.7452939429353711, + "learning_rate": 7.831586684131642e-10, + "loss": 0.846, + "step": 20002 + }, + { + "epoch": 0.9976558603491272, + "grad_norm": 0.6474779574193683, + "learning_rate": 7.515193472001913e-10, + "loss": 0.8459, + "step": 20003 + }, + { + "epoch": 0.9977057356608479, + "grad_norm": 0.874704077693331, + "learning_rate": 7.205323701880984e-10, + "loss": 0.8384, + "step": 20004 + }, + { + "epoch": 0.9977556109725686, + "grad_norm": 0.5669366251442677, + "learning_rate": 6.901977381845726e-10, + "loss": 0.813, + "step": 20005 + }, + { + "epoch": 0.9978054862842892, + "grad_norm": 0.7579607422429993, + "learning_rate": 6.605154519834234e-10, + "loss": 0.7893, + "step": 20006 + }, + { + "epoch": 0.99785536159601, + "grad_norm": 0.7025833667094024, + "learning_rate": 6.314855123590313e-10, + "loss": 0.8298, + "step": 20007 + }, + { + "epoch": 0.9979052369077307, + "grad_norm": 0.6822825162530769, + "learning_rate": 6.031079200663481e-10, + "loss": 0.8294, + "step": 20008 + }, + { + "epoch": 0.9979551122194513, + "grad_norm": 0.5641932471008541, + "learning_rate": 5.753826758464476e-10, + "loss": 0.8534, + "step": 20009 + }, + { + "epoch": 0.9980049875311721, + "grad_norm": 0.5502785617791992, + "learning_rate": 5.483097804237502e-10, + "loss": 0.8374, + "step": 20010 + }, + { + "epoch": 0.9980548628428928, + "grad_norm": 0.5797350598728326, + "learning_rate": 5.218892345060233e-10, + "loss": 0.8207, + "step": 20011 + }, + { + "epoch": 0.9981047381546134, + "grad_norm": 47.244133506098784, + "learning_rate": 4.961210387788296e-10, + "loss": 0.827, + "step": 20012 + }, + { + "epoch": 0.9981546134663342, + "grad_norm": 0.5301393973768542, + "learning_rate": 4.710051939194049e-10, + "loss": 0.7818, + "step": 20013 + }, + { + "epoch": 0.9982044887780549, + "grad_norm": 0.5619960514354968, + "learning_rate": 4.4654170058000546e-10, + "loss": 0.8407, + "step": 20014 + }, + { + "epoch": 0.9982543640897755, + "grad_norm": 0.7467305332460475, + "learning_rate": 4.2273055940178496e-10, + "loss": 0.8302, + "step": 20015 + }, + { + "epoch": 0.9983042394014963, + "grad_norm": 0.6788323953289958, + "learning_rate": 3.9957177100091726e-10, + "loss": 0.8315, + "step": 20016 + }, + { + "epoch": 0.998354114713217, + "grad_norm": 1.9040550945420904, + "learning_rate": 3.770653359852494e-10, + "loss": 0.8395, + "step": 20017 + }, + { + "epoch": 0.9984039900249376, + "grad_norm": 0.6329656606393746, + "learning_rate": 3.5521125494319964e-10, + "loss": 0.7792, + "step": 20018 + }, + { + "epoch": 0.9984538653366584, + "grad_norm": 0.564087692657042, + "learning_rate": 3.340095284437572e-10, + "loss": 0.8367, + "step": 20019 + }, + { + "epoch": 0.9985037406483791, + "grad_norm": 0.6231573323534695, + "learning_rate": 3.134601570392581e-10, + "loss": 0.8399, + "step": 20020 + }, + { + "epoch": 0.9985536159600997, + "grad_norm": 0.6000604475983881, + "learning_rate": 2.9356314126538497e-10, + "loss": 0.8064, + "step": 20021 + }, + { + "epoch": 0.9986034912718205, + "grad_norm": 0.5751740946233178, + "learning_rate": 2.7431848164394257e-10, + "loss": 0.8402, + "step": 20022 + }, + { + "epoch": 0.9986533665835412, + "grad_norm": 0.5790587064781433, + "learning_rate": 2.557261786745313e-10, + "loss": 0.8621, + "step": 20023 + }, + { + "epoch": 0.9987032418952618, + "grad_norm": 1.0088009629091235, + "learning_rate": 2.377862328456493e-10, + "loss": 0.8586, + "step": 20024 + }, + { + "epoch": 0.9987531172069826, + "grad_norm": 0.628503276670994, + "learning_rate": 2.204986446208146e-10, + "loss": 0.8342, + "step": 20025 + }, + { + "epoch": 0.9988029925187032, + "grad_norm": 0.6226772144041229, + "learning_rate": 2.0386341445521871e-10, + "loss": 0.8001, + "step": 20026 + }, + { + "epoch": 0.9988528678304239, + "grad_norm": 0.6216250783506779, + "learning_rate": 1.8788054278184862e-10, + "loss": 0.7871, + "step": 20027 + }, + { + "epoch": 0.9989027431421447, + "grad_norm": 0.6233164897251364, + "learning_rate": 1.7255003001703795e-10, + "loss": 0.858, + "step": 20028 + }, + { + "epoch": 0.9989526184538653, + "grad_norm": 0.5494432375395558, + "learning_rate": 1.5787187656046697e-10, + "loss": 0.7992, + "step": 20029 + }, + { + "epoch": 0.999002493765586, + "grad_norm": 0.9194271299883761, + "learning_rate": 1.4384608279516265e-10, + "loss": 0.8094, + "step": 20030 + }, + { + "epoch": 0.9990523690773068, + "grad_norm": 0.7430416656402185, + "learning_rate": 1.3047264908749856e-10, + "loss": 0.8632, + "step": 20031 + }, + { + "epoch": 0.9991022443890274, + "grad_norm": 0.5526624641501507, + "learning_rate": 1.1775157578997053e-10, + "loss": 0.7819, + "step": 20032 + }, + { + "epoch": 0.9991521197007481, + "grad_norm": 0.6386139500772287, + "learning_rate": 1.0568286322731879e-10, + "loss": 0.8568, + "step": 20033 + }, + { + "epoch": 0.9992019950124689, + "grad_norm": 0.655738136570397, + "learning_rate": 9.426651171873246e-11, + "loss": 0.8262, + "step": 20034 + }, + { + "epoch": 0.9992518703241895, + "grad_norm": 0.5702246229531862, + "learning_rate": 8.350252156397176e-11, + "loss": 0.8453, + "step": 20035 + }, + { + "epoch": 0.9993017456359102, + "grad_norm": 0.5342061868564102, + "learning_rate": 7.339089304059244e-11, + "loss": 0.8076, + "step": 20036 + }, + { + "epoch": 0.999351620947631, + "grad_norm": 0.6107358973128958, + "learning_rate": 6.393162641504802e-11, + "loss": 0.8564, + "step": 20037 + }, + { + "epoch": 0.9994014962593516, + "grad_norm": 0.6380261956871217, + "learning_rate": 5.512472193158757e-11, + "loss": 0.8247, + "step": 20038 + }, + { + "epoch": 0.9994513715710723, + "grad_norm": 0.7345842819898625, + "learning_rate": 4.697017982058238e-11, + "loss": 0.8217, + "step": 20039 + }, + { + "epoch": 0.9995012468827931, + "grad_norm": 0.5665446650023948, + "learning_rate": 3.946800029575037e-11, + "loss": 0.8633, + "step": 20040 + }, + { + "epoch": 0.9995511221945137, + "grad_norm": 0.6638707317835986, + "learning_rate": 3.261818355138058e-11, + "loss": 0.8457, + "step": 20041 + }, + { + "epoch": 0.9996009975062344, + "grad_norm": 0.5586891143407277, + "learning_rate": 2.6420729767884233e-11, + "loss": 0.8584, + "step": 20042 + }, + { + "epoch": 0.9996508728179551, + "grad_norm": 0.6858342808859521, + "learning_rate": 2.0875639106243682e-11, + "loss": 0.8239, + "step": 20043 + }, + { + "epoch": 0.9997007481296758, + "grad_norm": 0.6394408469436613, + "learning_rate": 1.5982911713563474e-11, + "loss": 0.8587, + "step": 20044 + }, + { + "epoch": 0.9997506234413965, + "grad_norm": 0.5849243737944279, + "learning_rate": 1.1742547711968144e-11, + "loss": 0.8386, + "step": 20045 + }, + { + "epoch": 0.9998004987531172, + "grad_norm": 1.1529590397571412, + "learning_rate": 8.154547218031106e-12, + "loss": 0.8342, + "step": 20046 + }, + { + "epoch": 0.9998503740648379, + "grad_norm": 0.6134065770013761, + "learning_rate": 5.2189103205702025e-12, + "loss": 0.82, + "step": 20047 + }, + { + "epoch": 0.9999002493765586, + "grad_norm": 0.5835340710993654, + "learning_rate": 2.935637100076605e-12, + "loss": 0.8706, + "step": 20048 + }, + { + "epoch": 0.9999501246882793, + "grad_norm": 0.5360506512431845, + "learning_rate": 1.3047276148370202e-12, + "loss": 0.8003, + "step": 20049 + }, + { + "epoch": 1.0, + "grad_norm": 1.0903653383181189, + "learning_rate": 3.2618190648481263e-13, + "loss": 0.8621, + "step": 20050 + }, + { + "epoch": 1.0, + "step": 20050, + "total_flos": 8.880318486177382e+16, + "train_loss": 0.8906813718553196, + "train_runtime": 481462.6685, + "train_samples_per_second": 21.321, + "train_steps_per_second": 0.042 + } + ], + "logging_steps": 1.0, + "max_steps": 20050, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.880318486177382e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}