{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 846, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0035460992907801418, "grad_norm": 8.665782750896012, "learning_rate": 0.0, "loss": 1.734, "step": 1 }, { "epoch": 0.0070921985815602835, "grad_norm": 9.073210859768501, "learning_rate": 1.1764705882352942e-07, "loss": 1.7773, "step": 2 }, { "epoch": 0.010638297872340425, "grad_norm": 9.33152602116301, "learning_rate": 2.3529411764705883e-07, "loss": 1.7825, "step": 3 }, { "epoch": 0.014184397163120567, "grad_norm": 8.82868655592556, "learning_rate": 3.529411764705883e-07, "loss": 1.7792, "step": 4 }, { "epoch": 0.01773049645390071, "grad_norm": 8.698561517122181, "learning_rate": 4.7058823529411767e-07, "loss": 1.799, "step": 5 }, { "epoch": 0.02127659574468085, "grad_norm": 8.749586538988138, "learning_rate": 5.882352941176471e-07, "loss": 1.8009, "step": 6 }, { "epoch": 0.024822695035460994, "grad_norm": 9.207181215808223, "learning_rate": 7.058823529411766e-07, "loss": 1.8257, "step": 7 }, { "epoch": 0.028368794326241134, "grad_norm": 8.989325804732584, "learning_rate": 8.235294117647059e-07, "loss": 1.8266, "step": 8 }, { "epoch": 0.031914893617021274, "grad_norm": 8.601818663152391, "learning_rate": 9.411764705882353e-07, "loss": 1.7777, "step": 9 }, { "epoch": 0.03546099290780142, "grad_norm": 7.770455003053462, "learning_rate": 1.0588235294117648e-06, "loss": 1.7491, "step": 10 }, { "epoch": 0.03900709219858156, "grad_norm": 7.233900438934411, "learning_rate": 1.1764705882352942e-06, "loss": 1.6963, "step": 11 }, { "epoch": 0.0425531914893617, "grad_norm": 7.663746375225752, "learning_rate": 1.2941176470588237e-06, "loss": 1.7596, "step": 12 }, { "epoch": 0.04609929078014184, "grad_norm": 5.843368105374257, "learning_rate": 1.4117647058823531e-06, "loss": 1.6749, "step": 13 }, { "epoch": 0.04964539007092199, "grad_norm": 5.220448996155471, "learning_rate": 1.5294117647058826e-06, "loss": 1.663, "step": 14 }, { "epoch": 0.05319148936170213, "grad_norm": 5.113386171799863, "learning_rate": 1.6470588235294118e-06, "loss": 1.6749, "step": 15 }, { "epoch": 0.05673758865248227, "grad_norm": 4.475044677944814, "learning_rate": 1.7647058823529414e-06, "loss": 1.5849, "step": 16 }, { "epoch": 0.06028368794326241, "grad_norm": 4.6477087987430705, "learning_rate": 1.8823529411764707e-06, "loss": 1.6245, "step": 17 }, { "epoch": 0.06382978723404255, "grad_norm": 2.5805253792201404, "learning_rate": 2.0000000000000003e-06, "loss": 1.5541, "step": 18 }, { "epoch": 0.0673758865248227, "grad_norm": 2.47541421097718, "learning_rate": 2.1176470588235296e-06, "loss": 1.5419, "step": 19 }, { "epoch": 0.07092198581560284, "grad_norm": 2.2089303697698703, "learning_rate": 2.2352941176470592e-06, "loss": 1.514, "step": 20 }, { "epoch": 0.07446808510638298, "grad_norm": 2.0212954417136952, "learning_rate": 2.3529411764705885e-06, "loss": 1.4676, "step": 21 }, { "epoch": 0.07801418439716312, "grad_norm": 2.0159184649143462, "learning_rate": 2.470588235294118e-06, "loss": 1.4892, "step": 22 }, { "epoch": 0.08156028368794327, "grad_norm": 1.879512030897446, "learning_rate": 2.5882352941176473e-06, "loss": 1.4722, "step": 23 }, { "epoch": 0.0851063829787234, "grad_norm": 1.6159053894320274, "learning_rate": 2.7058823529411766e-06, "loss": 1.4178, "step": 24 }, { "epoch": 0.08865248226950355, "grad_norm": 1.7651635868673334, "learning_rate": 2.8235294117647062e-06, "loss": 1.4392, "step": 25 }, { "epoch": 0.09219858156028368, "grad_norm": 1.9214151541000806, "learning_rate": 2.9411764705882355e-06, "loss": 1.4395, "step": 26 }, { "epoch": 0.09574468085106383, "grad_norm": 1.7616852593451773, "learning_rate": 3.058823529411765e-06, "loss": 1.417, "step": 27 }, { "epoch": 0.09929078014184398, "grad_norm": 1.650161181106416, "learning_rate": 3.1764705882352943e-06, "loss": 1.3993, "step": 28 }, { "epoch": 0.10283687943262411, "grad_norm": 1.2846682109346441, "learning_rate": 3.2941176470588236e-06, "loss": 1.4113, "step": 29 }, { "epoch": 0.10638297872340426, "grad_norm": 1.1943683208009703, "learning_rate": 3.4117647058823532e-06, "loss": 1.371, "step": 30 }, { "epoch": 0.1099290780141844, "grad_norm": 1.0571452595841404, "learning_rate": 3.529411764705883e-06, "loss": 1.3643, "step": 31 }, { "epoch": 0.11347517730496454, "grad_norm": 0.8747714134416469, "learning_rate": 3.6470588235294117e-06, "loss": 1.3424, "step": 32 }, { "epoch": 0.11702127659574468, "grad_norm": 0.8535968958025469, "learning_rate": 3.7647058823529414e-06, "loss": 1.2885, "step": 33 }, { "epoch": 0.12056737588652482, "grad_norm": 0.8751710244741459, "learning_rate": 3.882352941176471e-06, "loss": 1.2421, "step": 34 }, { "epoch": 0.12411347517730496, "grad_norm": 0.834483511360012, "learning_rate": 4.000000000000001e-06, "loss": 1.2989, "step": 35 }, { "epoch": 0.1276595744680851, "grad_norm": 0.8108376720466497, "learning_rate": 4.11764705882353e-06, "loss": 1.2921, "step": 36 }, { "epoch": 0.13120567375886524, "grad_norm": 0.7089780448255002, "learning_rate": 4.235294117647059e-06, "loss": 1.2438, "step": 37 }, { "epoch": 0.1347517730496454, "grad_norm": 0.7060661444366226, "learning_rate": 4.352941176470588e-06, "loss": 1.2826, "step": 38 }, { "epoch": 0.13829787234042554, "grad_norm": 0.634423964638156, "learning_rate": 4.4705882352941184e-06, "loss": 1.236, "step": 39 }, { "epoch": 0.14184397163120568, "grad_norm": 0.5837706778737664, "learning_rate": 4.588235294117647e-06, "loss": 1.2282, "step": 40 }, { "epoch": 0.1453900709219858, "grad_norm": 0.6043664210650173, "learning_rate": 4.705882352941177e-06, "loss": 1.2534, "step": 41 }, { "epoch": 0.14893617021276595, "grad_norm": 0.5741574926941739, "learning_rate": 4.823529411764706e-06, "loss": 1.1884, "step": 42 }, { "epoch": 0.1524822695035461, "grad_norm": 0.5003113616633108, "learning_rate": 4.941176470588236e-06, "loss": 1.2248, "step": 43 }, { "epoch": 0.15602836879432624, "grad_norm": 0.4774017898688041, "learning_rate": 5.058823529411765e-06, "loss": 1.2299, "step": 44 }, { "epoch": 0.1595744680851064, "grad_norm": 0.4660433370056393, "learning_rate": 5.176470588235295e-06, "loss": 1.1804, "step": 45 }, { "epoch": 0.16312056737588654, "grad_norm": 0.4684741687135575, "learning_rate": 5.294117647058824e-06, "loss": 1.2018, "step": 46 }, { "epoch": 0.16666666666666666, "grad_norm": 0.4461573065223691, "learning_rate": 5.411764705882353e-06, "loss": 1.2006, "step": 47 }, { "epoch": 0.1702127659574468, "grad_norm": 0.47669147750076235, "learning_rate": 5.529411764705883e-06, "loss": 1.1929, "step": 48 }, { "epoch": 0.17375886524822695, "grad_norm": 0.4681274554559587, "learning_rate": 5.6470588235294125e-06, "loss": 1.1953, "step": 49 }, { "epoch": 0.1773049645390071, "grad_norm": 0.4117584073738419, "learning_rate": 5.764705882352941e-06, "loss": 1.1603, "step": 50 }, { "epoch": 0.18085106382978725, "grad_norm": 0.4422286308465251, "learning_rate": 5.882352941176471e-06, "loss": 1.1315, "step": 51 }, { "epoch": 0.18439716312056736, "grad_norm": 0.37217783475190447, "learning_rate": 6e-06, "loss": 1.1181, "step": 52 }, { "epoch": 0.1879432624113475, "grad_norm": 0.36980227530078597, "learning_rate": 6.11764705882353e-06, "loss": 1.1971, "step": 53 }, { "epoch": 0.19148936170212766, "grad_norm": 0.36030712532957193, "learning_rate": 6.2352941176470595e-06, "loss": 1.1771, "step": 54 }, { "epoch": 0.1950354609929078, "grad_norm": 0.36098217999021565, "learning_rate": 6.352941176470589e-06, "loss": 1.1355, "step": 55 }, { "epoch": 0.19858156028368795, "grad_norm": 0.35528278156761867, "learning_rate": 6.470588235294119e-06, "loss": 1.1384, "step": 56 }, { "epoch": 0.20212765957446807, "grad_norm": 0.3675547579022495, "learning_rate": 6.588235294117647e-06, "loss": 1.1842, "step": 57 }, { "epoch": 0.20567375886524822, "grad_norm": 0.34200481588273246, "learning_rate": 6.705882352941176e-06, "loss": 1.1599, "step": 58 }, { "epoch": 0.20921985815602837, "grad_norm": 0.333714711680506, "learning_rate": 6.8235294117647065e-06, "loss": 1.1252, "step": 59 }, { "epoch": 0.2127659574468085, "grad_norm": 0.3344160490792551, "learning_rate": 6.941176470588236e-06, "loss": 1.1078, "step": 60 }, { "epoch": 0.21631205673758866, "grad_norm": 0.31118927321227396, "learning_rate": 7.058823529411766e-06, "loss": 1.1001, "step": 61 }, { "epoch": 0.2198581560283688, "grad_norm": 0.2968922847070698, "learning_rate": 7.176470588235295e-06, "loss": 1.1382, "step": 62 }, { "epoch": 0.22340425531914893, "grad_norm": 0.3128124794798863, "learning_rate": 7.294117647058823e-06, "loss": 1.1035, "step": 63 }, { "epoch": 0.22695035460992907, "grad_norm": 0.3095498333912435, "learning_rate": 7.4117647058823535e-06, "loss": 1.1357, "step": 64 }, { "epoch": 0.23049645390070922, "grad_norm": 0.31168069824379874, "learning_rate": 7.529411764705883e-06, "loss": 1.1553, "step": 65 }, { "epoch": 0.23404255319148937, "grad_norm": 0.30855942417205323, "learning_rate": 7.647058823529411e-06, "loss": 1.1003, "step": 66 }, { "epoch": 0.2375886524822695, "grad_norm": 0.29357750471897964, "learning_rate": 7.764705882352941e-06, "loss": 1.131, "step": 67 }, { "epoch": 0.24113475177304963, "grad_norm": 0.2829486806613938, "learning_rate": 7.882352941176471e-06, "loss": 1.0633, "step": 68 }, { "epoch": 0.24468085106382978, "grad_norm": 0.28592190510582, "learning_rate": 8.000000000000001e-06, "loss": 1.1008, "step": 69 }, { "epoch": 0.24822695035460993, "grad_norm": 0.2792631296175681, "learning_rate": 8.11764705882353e-06, "loss": 1.0963, "step": 70 }, { "epoch": 0.25177304964539005, "grad_norm": 0.29663931921093056, "learning_rate": 8.23529411764706e-06, "loss": 1.1208, "step": 71 }, { "epoch": 0.2553191489361702, "grad_norm": 0.30454640644976066, "learning_rate": 8.35294117647059e-06, "loss": 1.1288, "step": 72 }, { "epoch": 0.25886524822695034, "grad_norm": 0.27685316392898796, "learning_rate": 8.470588235294118e-06, "loss": 1.0898, "step": 73 }, { "epoch": 0.2624113475177305, "grad_norm": 0.2905573025217783, "learning_rate": 8.588235294117647e-06, "loss": 1.0947, "step": 74 }, { "epoch": 0.26595744680851063, "grad_norm": 0.27984674195208564, "learning_rate": 8.705882352941177e-06, "loss": 1.0741, "step": 75 }, { "epoch": 0.2695035460992908, "grad_norm": 0.2769251115792928, "learning_rate": 8.823529411764707e-06, "loss": 1.0782, "step": 76 }, { "epoch": 0.2730496453900709, "grad_norm": 0.2887658735650916, "learning_rate": 8.941176470588237e-06, "loss": 1.0935, "step": 77 }, { "epoch": 0.2765957446808511, "grad_norm": 0.369351390392209, "learning_rate": 9.058823529411765e-06, "loss": 1.1128, "step": 78 }, { "epoch": 0.2801418439716312, "grad_norm": 0.3059322355078604, "learning_rate": 9.176470588235294e-06, "loss": 1.109, "step": 79 }, { "epoch": 0.28368794326241137, "grad_norm": 0.2937313420253811, "learning_rate": 9.294117647058824e-06, "loss": 1.0688, "step": 80 }, { "epoch": 0.2872340425531915, "grad_norm": 0.2638757720647012, "learning_rate": 9.411764705882354e-06, "loss": 1.0775, "step": 81 }, { "epoch": 0.2907801418439716, "grad_norm": 0.270253555320252, "learning_rate": 9.529411764705882e-06, "loss": 1.0684, "step": 82 }, { "epoch": 0.29432624113475175, "grad_norm": 0.28430202497386, "learning_rate": 9.647058823529412e-06, "loss": 1.089, "step": 83 }, { "epoch": 0.2978723404255319, "grad_norm": 0.30191612275918134, "learning_rate": 9.764705882352942e-06, "loss": 1.1054, "step": 84 }, { "epoch": 0.30141843971631205, "grad_norm": 0.26101169797064544, "learning_rate": 9.882352941176472e-06, "loss": 1.0571, "step": 85 }, { "epoch": 0.3049645390070922, "grad_norm": 0.2935032376259707, "learning_rate": 1e-05, "loss": 1.0395, "step": 86 }, { "epoch": 0.30851063829787234, "grad_norm": 0.28686971413993645, "learning_rate": 9.99995739409215e-06, "loss": 1.0816, "step": 87 }, { "epoch": 0.3120567375886525, "grad_norm": 0.2731583148049416, "learning_rate": 9.999829577094702e-06, "loss": 1.0741, "step": 88 }, { "epoch": 0.31560283687943264, "grad_norm": 0.28004061179263656, "learning_rate": 9.999616551185959e-06, "loss": 1.0721, "step": 89 }, { "epoch": 0.3191489361702128, "grad_norm": 0.309988727435713, "learning_rate": 9.999318319996388e-06, "loss": 1.1171, "step": 90 }, { "epoch": 0.32269503546099293, "grad_norm": 0.266253584171138, "learning_rate": 9.998934888608553e-06, "loss": 1.0616, "step": 91 }, { "epoch": 0.3262411347517731, "grad_norm": 0.27534337601827485, "learning_rate": 9.998466263557032e-06, "loss": 1.0514, "step": 92 }, { "epoch": 0.32978723404255317, "grad_norm": 0.28378450055514537, "learning_rate": 9.9979124528283e-06, "loss": 1.0378, "step": 93 }, { "epoch": 0.3333333333333333, "grad_norm": 0.29217323895123287, "learning_rate": 9.997273465860602e-06, "loss": 1.118, "step": 94 }, { "epoch": 0.33687943262411346, "grad_norm": 0.28141725197518247, "learning_rate": 9.996549313543788e-06, "loss": 1.0884, "step": 95 }, { "epoch": 0.3404255319148936, "grad_norm": 0.26450466413927254, "learning_rate": 9.99574000821912e-06, "loss": 1.0738, "step": 96 }, { "epoch": 0.34397163120567376, "grad_norm": 0.28936259760371336, "learning_rate": 9.99484556367908e-06, "loss": 1.1117, "step": 97 }, { "epoch": 0.3475177304964539, "grad_norm": 0.272251075475874, "learning_rate": 9.993865995167113e-06, "loss": 1.04, "step": 98 }, { "epoch": 0.35106382978723405, "grad_norm": 0.2891346764860495, "learning_rate": 9.992801319377379e-06, "loss": 1.0483, "step": 99 }, { "epoch": 0.3546099290780142, "grad_norm": 0.28808676340577277, "learning_rate": 9.991651554454473e-06, "loss": 1.0628, "step": 100 }, { "epoch": 0.35815602836879434, "grad_norm": 0.28555539631547217, "learning_rate": 9.990416719993105e-06, "loss": 1.0507, "step": 101 }, { "epoch": 0.3617021276595745, "grad_norm": 0.29884075598207954, "learning_rate": 9.989096837037774e-06, "loss": 1.0434, "step": 102 }, { "epoch": 0.36524822695035464, "grad_norm": 0.27023892468602984, "learning_rate": 9.987691928082399e-06, "loss": 1.0614, "step": 103 }, { "epoch": 0.36879432624113473, "grad_norm": 0.3008753357607719, "learning_rate": 9.986202017069957e-06, "loss": 1.0774, "step": 104 }, { "epoch": 0.3723404255319149, "grad_norm": 0.2827762959674367, "learning_rate": 9.984627129392045e-06, "loss": 1.0382, "step": 105 }, { "epoch": 0.375886524822695, "grad_norm": 0.290729608423594, "learning_rate": 9.982967291888474e-06, "loss": 1.0545, "step": 106 }, { "epoch": 0.37943262411347517, "grad_norm": 0.2952030439034152, "learning_rate": 9.9812225328468e-06, "loss": 1.0433, "step": 107 }, { "epoch": 0.3829787234042553, "grad_norm": 0.29324332312236556, "learning_rate": 9.979392882001835e-06, "loss": 0.9999, "step": 108 }, { "epoch": 0.38652482269503546, "grad_norm": 0.28018999022527835, "learning_rate": 9.977478370535156e-06, "loss": 1.0705, "step": 109 }, { "epoch": 0.3900709219858156, "grad_norm": 0.28839264758215516, "learning_rate": 9.975479031074563e-06, "loss": 1.0774, "step": 110 }, { "epoch": 0.39361702127659576, "grad_norm": 0.33471916347884695, "learning_rate": 9.973394897693524e-06, "loss": 1.0861, "step": 111 }, { "epoch": 0.3971631205673759, "grad_norm": 0.2880697439332527, "learning_rate": 9.971226005910597e-06, "loss": 1.037, "step": 112 }, { "epoch": 0.40070921985815605, "grad_norm": 0.291284384558207, "learning_rate": 9.968972392688825e-06, "loss": 1.0579, "step": 113 }, { "epoch": 0.40425531914893614, "grad_norm": 0.2944206405936176, "learning_rate": 9.966634096435101e-06, "loss": 1.0869, "step": 114 }, { "epoch": 0.4078014184397163, "grad_norm": 0.319193677476812, "learning_rate": 9.964211156999519e-06, "loss": 1.0394, "step": 115 }, { "epoch": 0.41134751773049644, "grad_norm": 0.3140182476293659, "learning_rate": 9.961703615674693e-06, "loss": 1.0829, "step": 116 }, { "epoch": 0.4148936170212766, "grad_norm": 0.27933019157374656, "learning_rate": 9.959111515195055e-06, "loss": 1.0543, "step": 117 }, { "epoch": 0.41843971631205673, "grad_norm": 0.30523242594041544, "learning_rate": 9.95643489973612e-06, "loss": 1.0796, "step": 118 }, { "epoch": 0.4219858156028369, "grad_norm": 0.3165198325879428, "learning_rate": 9.95367381491374e-06, "loss": 1.0423, "step": 119 }, { "epoch": 0.425531914893617, "grad_norm": 0.29298000114900263, "learning_rate": 9.950828307783328e-06, "loss": 1.0284, "step": 120 }, { "epoch": 0.42907801418439717, "grad_norm": 0.2800862450026043, "learning_rate": 9.947898426839048e-06, "loss": 1.0316, "step": 121 }, { "epoch": 0.4326241134751773, "grad_norm": 0.30601294439968607, "learning_rate": 9.944884222012995e-06, "loss": 1.0553, "step": 122 }, { "epoch": 0.43617021276595747, "grad_norm": 0.284319356407183, "learning_rate": 9.941785744674344e-06, "loss": 1.0533, "step": 123 }, { "epoch": 0.4397163120567376, "grad_norm": 0.274890086621811, "learning_rate": 9.938603047628468e-06, "loss": 1.0258, "step": 124 }, { "epoch": 0.4432624113475177, "grad_norm": 0.29521471460009063, "learning_rate": 9.935336185116048e-06, "loss": 1.0119, "step": 125 }, { "epoch": 0.44680851063829785, "grad_norm": 0.2863382737870297, "learning_rate": 9.93198521281214e-06, "loss": 1.0198, "step": 126 }, { "epoch": 0.450354609929078, "grad_norm": 0.29822519644430967, "learning_rate": 9.928550187825234e-06, "loss": 1.0767, "step": 127 }, { "epoch": 0.45390070921985815, "grad_norm": 0.2874782254648065, "learning_rate": 9.925031168696268e-06, "loss": 1.051, "step": 128 }, { "epoch": 0.4574468085106383, "grad_norm": 0.2902801730281031, "learning_rate": 9.921428215397649e-06, "loss": 1.0447, "step": 129 }, { "epoch": 0.46099290780141844, "grad_norm": 0.2794589949187171, "learning_rate": 9.917741389332213e-06, "loss": 1.023, "step": 130 }, { "epoch": 0.4645390070921986, "grad_norm": 0.3020614887475608, "learning_rate": 9.913970753332189e-06, "loss": 1.0528, "step": 131 }, { "epoch": 0.46808510638297873, "grad_norm": 0.29527202357948545, "learning_rate": 9.910116371658122e-06, "loss": 1.0489, "step": 132 }, { "epoch": 0.4716312056737589, "grad_norm": 0.2820383135204731, "learning_rate": 9.90617830999779e-06, "loss": 1.0476, "step": 133 }, { "epoch": 0.475177304964539, "grad_norm": 0.29001003955759574, "learning_rate": 9.902156635465067e-06, "loss": 1.0671, "step": 134 }, { "epoch": 0.4787234042553192, "grad_norm": 0.2752317091332543, "learning_rate": 9.89805141659879e-06, "loss": 1.0551, "step": 135 }, { "epoch": 0.48226950354609927, "grad_norm": 0.297643382917343, "learning_rate": 9.89386272336159e-06, "loss": 1.0352, "step": 136 }, { "epoch": 0.4858156028368794, "grad_norm": 0.4438509644509643, "learning_rate": 9.889590627138698e-06, "loss": 1.0369, "step": 137 }, { "epoch": 0.48936170212765956, "grad_norm": 0.3067044389133881, "learning_rate": 9.885235200736731e-06, "loss": 1.062, "step": 138 }, { "epoch": 0.4929078014184397, "grad_norm": 0.28814890904125423, "learning_rate": 9.880796518382447e-06, "loss": 1.0093, "step": 139 }, { "epoch": 0.49645390070921985, "grad_norm": 0.28885705655209154, "learning_rate": 9.87627465572148e-06, "loss": 1.0391, "step": 140 }, { "epoch": 0.5, "grad_norm": 0.3074105509288272, "learning_rate": 9.871669689817058e-06, "loss": 1.0376, "step": 141 }, { "epoch": 0.5035460992907801, "grad_norm": 0.28789023280610604, "learning_rate": 9.866981699148683e-06, "loss": 1.0259, "step": 142 }, { "epoch": 0.5070921985815603, "grad_norm": 0.31775036052035616, "learning_rate": 9.86221076361079e-06, "loss": 1.0301, "step": 143 }, { "epoch": 0.5106382978723404, "grad_norm": 0.289475415999448, "learning_rate": 9.8573569645114e-06, "loss": 1.0789, "step": 144 }, { "epoch": 0.5141843971631206, "grad_norm": 0.3195181970119566, "learning_rate": 9.852420384570717e-06, "loss": 1.0366, "step": 145 }, { "epoch": 0.5177304964539007, "grad_norm": 0.2951525321799835, "learning_rate": 9.84740110791973e-06, "loss": 1.0588, "step": 146 }, { "epoch": 0.5212765957446809, "grad_norm": 0.29275723465906067, "learning_rate": 9.842299220098775e-06, "loss": 1.0413, "step": 147 }, { "epoch": 0.524822695035461, "grad_norm": 0.3023376713763015, "learning_rate": 9.837114808056073e-06, "loss": 1.058, "step": 148 }, { "epoch": 0.5283687943262412, "grad_norm": 0.2895103250537338, "learning_rate": 9.831847960146264e-06, "loss": 1.0265, "step": 149 }, { "epoch": 0.5319148936170213, "grad_norm": 0.28682317045855765, "learning_rate": 9.826498766128876e-06, "loss": 0.9984, "step": 150 }, { "epoch": 0.5354609929078015, "grad_norm": 0.287367587107079, "learning_rate": 9.82106731716682e-06, "loss": 1.0155, "step": 151 }, { "epoch": 0.5390070921985816, "grad_norm": 0.2921469690437761, "learning_rate": 9.815553705824816e-06, "loss": 1.0365, "step": 152 }, { "epoch": 0.5425531914893617, "grad_norm": 0.27794402233529, "learning_rate": 9.809958026067838e-06, "loss": 1.0411, "step": 153 }, { "epoch": 0.5460992907801419, "grad_norm": 0.29157738711850495, "learning_rate": 9.804280373259489e-06, "loss": 1.0296, "step": 154 }, { "epoch": 0.549645390070922, "grad_norm": 0.2924252310516793, "learning_rate": 9.79852084416039e-06, "loss": 1.0522, "step": 155 }, { "epoch": 0.5531914893617021, "grad_norm": 0.2992217201631393, "learning_rate": 9.792679536926526e-06, "loss": 1.0766, "step": 156 }, { "epoch": 0.5567375886524822, "grad_norm": 0.2825624029287434, "learning_rate": 9.78675655110758e-06, "loss": 1.0187, "step": 157 }, { "epoch": 0.5602836879432624, "grad_norm": 0.30077330925291607, "learning_rate": 9.780751987645223e-06, "loss": 1.0562, "step": 158 }, { "epoch": 0.5638297872340425, "grad_norm": 0.3014933898805732, "learning_rate": 9.77466594887141e-06, "loss": 1.0094, "step": 159 }, { "epoch": 0.5673758865248227, "grad_norm": 0.2799259155269312, "learning_rate": 9.768498538506618e-06, "loss": 1.0364, "step": 160 }, { "epoch": 0.5709219858156028, "grad_norm": 0.29853935316314356, "learning_rate": 9.7622498616581e-06, "loss": 0.9901, "step": 161 }, { "epoch": 0.574468085106383, "grad_norm": 0.2952176639922698, "learning_rate": 9.755920024818074e-06, "loss": 1.0605, "step": 162 }, { "epoch": 0.5780141843971631, "grad_norm": 0.2985888999988048, "learning_rate": 9.749509135861918e-06, "loss": 1.0085, "step": 163 }, { "epoch": 0.5815602836879432, "grad_norm": 0.3051932576224292, "learning_rate": 9.743017304046328e-06, "loss": 1.0384, "step": 164 }, { "epoch": 0.5851063829787234, "grad_norm": 0.3091337034146976, "learning_rate": 9.736444640007462e-06, "loss": 1.0244, "step": 165 }, { "epoch": 0.5886524822695035, "grad_norm": 0.2810985208396095, "learning_rate": 9.729791255759045e-06, "loss": 1.0389, "step": 166 }, { "epoch": 0.5921985815602837, "grad_norm": 0.3005837429435301, "learning_rate": 9.72305726469047e-06, "loss": 1.0363, "step": 167 }, { "epoch": 0.5957446808510638, "grad_norm": 0.31326792357356764, "learning_rate": 9.716242781564854e-06, "loss": 0.9763, "step": 168 }, { "epoch": 0.599290780141844, "grad_norm": 0.28272380336374825, "learning_rate": 9.709347922517099e-06, "loss": 1.0054, "step": 169 }, { "epoch": 0.6028368794326241, "grad_norm": 0.31084817377146984, "learning_rate": 9.702372805051893e-06, "loss": 1.0162, "step": 170 }, { "epoch": 0.6063829787234043, "grad_norm": 0.2871465973607954, "learning_rate": 9.695317548041721e-06, "loss": 1.043, "step": 171 }, { "epoch": 0.6099290780141844, "grad_norm": 0.33911283490809646, "learning_rate": 9.688182271724834e-06, "loss": 1.0244, "step": 172 }, { "epoch": 0.6134751773049646, "grad_norm": 0.28833587554690415, "learning_rate": 9.680967097703205e-06, "loss": 1.0346, "step": 173 }, { "epoch": 0.6170212765957447, "grad_norm": 0.3407767291286794, "learning_rate": 9.673672148940446e-06, "loss": 1.0429, "step": 174 }, { "epoch": 0.6205673758865248, "grad_norm": 0.31853269296167086, "learning_rate": 9.666297549759727e-06, "loss": 1.0517, "step": 175 }, { "epoch": 0.624113475177305, "grad_norm": 0.3306993278733597, "learning_rate": 9.658843425841642e-06, "loss": 1.056, "step": 176 }, { "epoch": 0.6276595744680851, "grad_norm": 0.28438083059915054, "learning_rate": 9.651309904222079e-06, "loss": 1.034, "step": 177 }, { "epoch": 0.6312056737588653, "grad_norm": 0.3058726744655159, "learning_rate": 9.643697113290051e-06, "loss": 1.0578, "step": 178 }, { "epoch": 0.6347517730496454, "grad_norm": 0.2925401396305092, "learning_rate": 9.636005182785501e-06, "loss": 1.0134, "step": 179 }, { "epoch": 0.6382978723404256, "grad_norm": 0.30927067743462905, "learning_rate": 9.628234243797107e-06, "loss": 1.042, "step": 180 }, { "epoch": 0.6418439716312057, "grad_norm": 0.3016660825361443, "learning_rate": 9.620384428760031e-06, "loss": 1.0349, "step": 181 }, { "epoch": 0.6453900709219859, "grad_norm": 0.29704136197137365, "learning_rate": 9.61245587145367e-06, "loss": 1.0304, "step": 182 }, { "epoch": 0.648936170212766, "grad_norm": 0.32294295760487873, "learning_rate": 9.604448706999379e-06, "loss": 1.0122, "step": 183 }, { "epoch": 0.6524822695035462, "grad_norm": 0.31342044597542557, "learning_rate": 9.596363071858161e-06, "loss": 1.0467, "step": 184 }, { "epoch": 0.6560283687943262, "grad_norm": 0.2960454297367841, "learning_rate": 9.588199103828346e-06, "loss": 1.0333, "step": 185 }, { "epoch": 0.6595744680851063, "grad_norm": 0.3528288213618268, "learning_rate": 9.579956942043243e-06, "loss": 1.0115, "step": 186 }, { "epoch": 0.6631205673758865, "grad_norm": 0.3169608196576877, "learning_rate": 9.571636726968766e-06, "loss": 1.0424, "step": 187 }, { "epoch": 0.6666666666666666, "grad_norm": 0.31320103498344787, "learning_rate": 9.563238600401042e-06, "loss": 1.0185, "step": 188 }, { "epoch": 0.6702127659574468, "grad_norm": 0.3412135778364423, "learning_rate": 9.554762705463994e-06, "loss": 1.0356, "step": 189 }, { "epoch": 0.6737588652482269, "grad_norm": 0.3215567621277966, "learning_rate": 9.546209186606898e-06, "loss": 0.9931, "step": 190 }, { "epoch": 0.6773049645390071, "grad_norm": 0.29932441333806625, "learning_rate": 9.537578189601933e-06, "loss": 1.0048, "step": 191 }, { "epoch": 0.6808510638297872, "grad_norm": 0.3184988628896441, "learning_rate": 9.528869861541683e-06, "loss": 1.0166, "step": 192 }, { "epoch": 0.6843971631205674, "grad_norm": 0.34402918970651564, "learning_rate": 9.520084350836636e-06, "loss": 1.0051, "step": 193 }, { "epoch": 0.6879432624113475, "grad_norm": 0.31908592113927103, "learning_rate": 9.511221807212655e-06, "loss": 0.9976, "step": 194 }, { "epoch": 0.6914893617021277, "grad_norm": 0.32960233236529624, "learning_rate": 9.502282381708428e-06, "loss": 1.0144, "step": 195 }, { "epoch": 0.6950354609929078, "grad_norm": 0.3210235909325311, "learning_rate": 9.493266226672893e-06, "loss": 1.0483, "step": 196 }, { "epoch": 0.6985815602836879, "grad_norm": 0.3024555620120441, "learning_rate": 9.484173495762634e-06, "loss": 0.9595, "step": 197 }, { "epoch": 0.7021276595744681, "grad_norm": 0.31873767816055826, "learning_rate": 9.475004343939276e-06, "loss": 1.0384, "step": 198 }, { "epoch": 0.7056737588652482, "grad_norm": 0.3088705386134263, "learning_rate": 9.465758927466832e-06, "loss": 1.0228, "step": 199 }, { "epoch": 0.7092198581560284, "grad_norm": 0.3195917197194452, "learning_rate": 9.45643740390905e-06, "loss": 0.9969, "step": 200 }, { "epoch": 0.7127659574468085, "grad_norm": 0.3065260876712806, "learning_rate": 9.447039932126717e-06, "loss": 0.9973, "step": 201 }, { "epoch": 0.7163120567375887, "grad_norm": 0.31642758555597067, "learning_rate": 9.43756667227496e-06, "loss": 1.0266, "step": 202 }, { "epoch": 0.7198581560283688, "grad_norm": 0.3212210397838421, "learning_rate": 9.428017785800514e-06, "loss": 1.0015, "step": 203 }, { "epoch": 0.723404255319149, "grad_norm": 0.3170434487214493, "learning_rate": 9.418393435438971e-06, "loss": 1.014, "step": 204 }, { "epoch": 0.7269503546099291, "grad_norm": 0.31080515038850465, "learning_rate": 9.408693785212001e-06, "loss": 1.0416, "step": 205 }, { "epoch": 0.7304964539007093, "grad_norm": 0.36906935215247894, "learning_rate": 9.39891900042457e-06, "loss": 1.0464, "step": 206 }, { "epoch": 0.7340425531914894, "grad_norm": 0.3263395095703568, "learning_rate": 9.389069247662107e-06, "loss": 0.9797, "step": 207 }, { "epoch": 0.7375886524822695, "grad_norm": 0.3349459718371076, "learning_rate": 9.379144694787677e-06, "loss": 1.0228, "step": 208 }, { "epoch": 0.7411347517730497, "grad_norm": 0.3664483950963355, "learning_rate": 9.369145510939115e-06, "loss": 1.0443, "step": 209 }, { "epoch": 0.7446808510638298, "grad_norm": 0.34712867921712787, "learning_rate": 9.359071866526139e-06, "loss": 1.0027, "step": 210 }, { "epoch": 0.74822695035461, "grad_norm": 0.3268013749905146, "learning_rate": 9.348923933227461e-06, "loss": 0.9811, "step": 211 }, { "epoch": 0.75177304964539, "grad_norm": 0.32321948208548656, "learning_rate": 9.33870188398784e-06, "loss": 0.9945, "step": 212 }, { "epoch": 0.7553191489361702, "grad_norm": 0.32515213577790986, "learning_rate": 9.328405893015155e-06, "loss": 0.9804, "step": 213 }, { "epoch": 0.7588652482269503, "grad_norm": 0.3343004880702304, "learning_rate": 9.318036135777421e-06, "loss": 1.0509, "step": 214 }, { "epoch": 0.7624113475177305, "grad_norm": 0.31895828707310475, "learning_rate": 9.307592788999808e-06, "loss": 1.017, "step": 215 }, { "epoch": 0.7659574468085106, "grad_norm": 0.3518848376795708, "learning_rate": 9.297076030661622e-06, "loss": 1.0268, "step": 216 }, { "epoch": 0.7695035460992907, "grad_norm": 0.3281415693788677, "learning_rate": 9.28648603999328e-06, "loss": 1.0235, "step": 217 }, { "epoch": 0.7730496453900709, "grad_norm": 0.31173191678595574, "learning_rate": 9.27582299747325e-06, "loss": 1.0901, "step": 218 }, { "epoch": 0.776595744680851, "grad_norm": 0.30367432405953754, "learning_rate": 9.26508708482497e-06, "loss": 1.0006, "step": 219 }, { "epoch": 0.7801418439716312, "grad_norm": 0.31206481752656323, "learning_rate": 9.254278485013765e-06, "loss": 1.0283, "step": 220 }, { "epoch": 0.7836879432624113, "grad_norm": 0.312789626078834, "learning_rate": 9.243397382243718e-06, "loss": 1.0352, "step": 221 }, { "epoch": 0.7872340425531915, "grad_norm": 0.3013782591211239, "learning_rate": 9.232443961954531e-06, "loss": 1.0527, "step": 222 }, { "epoch": 0.7907801418439716, "grad_norm": 0.28293371686604846, "learning_rate": 9.221418410818374e-06, "loss": 1.047, "step": 223 }, { "epoch": 0.7943262411347518, "grad_norm": 0.3382320179475007, "learning_rate": 9.210320916736693e-06, "loss": 1.0104, "step": 224 }, { "epoch": 0.7978723404255319, "grad_norm": 0.3340615761959316, "learning_rate": 9.19915166883701e-06, "loss": 0.9624, "step": 225 }, { "epoch": 0.8014184397163121, "grad_norm": 0.3382645575443834, "learning_rate": 9.187910857469704e-06, "loss": 1.0322, "step": 226 }, { "epoch": 0.8049645390070922, "grad_norm": 0.31433602086563084, "learning_rate": 9.176598674204766e-06, "loss": 1.0186, "step": 227 }, { "epoch": 0.8085106382978723, "grad_norm": 0.30328279949157005, "learning_rate": 9.16521531182853e-06, "loss": 1.0139, "step": 228 }, { "epoch": 0.8120567375886525, "grad_norm": 0.31372898860653586, "learning_rate": 9.153760964340391e-06, "loss": 1.0212, "step": 229 }, { "epoch": 0.8156028368794326, "grad_norm": 0.32033498415989303, "learning_rate": 9.142235826949497e-06, "loss": 1.0356, "step": 230 }, { "epoch": 0.8191489361702128, "grad_norm": 0.2979725029841173, "learning_rate": 9.130640096071429e-06, "loss": 0.9572, "step": 231 }, { "epoch": 0.8226950354609929, "grad_norm": 0.32822859171781554, "learning_rate": 9.11897396932484e-06, "loss": 1.0134, "step": 232 }, { "epoch": 0.8262411347517731, "grad_norm": 0.314410919568177, "learning_rate": 9.1072376455281e-06, "loss": 1.0037, "step": 233 }, { "epoch": 0.8297872340425532, "grad_norm": 0.3340424247532965, "learning_rate": 9.0954313246959e-06, "loss": 0.9911, "step": 234 }, { "epoch": 0.8333333333333334, "grad_norm": 0.31866331147548976, "learning_rate": 9.083555208035848e-06, "loss": 1.032, "step": 235 }, { "epoch": 0.8368794326241135, "grad_norm": 0.3020515489322515, "learning_rate": 9.071609497945036e-06, "loss": 0.9906, "step": 236 }, { "epoch": 0.8404255319148937, "grad_norm": 0.35021724456096814, "learning_rate": 9.059594398006593e-06, "loss": 1.0346, "step": 237 }, { "epoch": 0.8439716312056738, "grad_norm": 0.2740256152950809, "learning_rate": 9.047510112986218e-06, "loss": 1.0307, "step": 238 }, { "epoch": 0.8475177304964538, "grad_norm": 0.3560077642849469, "learning_rate": 9.03535684882868e-06, "loss": 1.0311, "step": 239 }, { "epoch": 0.851063829787234, "grad_norm": 0.33132222521672106, "learning_rate": 9.023134812654324e-06, "loss": 1.035, "step": 240 }, { "epoch": 0.8546099290780141, "grad_norm": 0.2907151693183102, "learning_rate": 9.010844212755529e-06, "loss": 1.0169, "step": 241 }, { "epoch": 0.8581560283687943, "grad_norm": 0.3258977393566678, "learning_rate": 8.998485258593158e-06, "loss": 0.9811, "step": 242 }, { "epoch": 0.8617021276595744, "grad_norm": 0.40506484006864596, "learning_rate": 8.986058160792998e-06, "loss": 0.9882, "step": 243 }, { "epoch": 0.8652482269503546, "grad_norm": 0.31243079555073994, "learning_rate": 8.973563131142164e-06, "loss": 0.9977, "step": 244 }, { "epoch": 0.8687943262411347, "grad_norm": 0.31577105096377783, "learning_rate": 8.961000382585488e-06, "loss": 1.0093, "step": 245 }, { "epoch": 0.8723404255319149, "grad_norm": 0.3347257600330544, "learning_rate": 8.94837012922189e-06, "loss": 1.0099, "step": 246 }, { "epoch": 0.875886524822695, "grad_norm": 0.3278060216355481, "learning_rate": 8.935672586300737e-06, "loss": 1.0193, "step": 247 }, { "epoch": 0.8794326241134752, "grad_norm": 0.30184498329218623, "learning_rate": 8.922907970218168e-06, "loss": 1.0029, "step": 248 }, { "epoch": 0.8829787234042553, "grad_norm": 0.3398698375214726, "learning_rate": 8.910076498513403e-06, "loss": 1.0093, "step": 249 }, { "epoch": 0.8865248226950354, "grad_norm": 0.30955017602197954, "learning_rate": 8.897178389865042e-06, "loss": 0.9941, "step": 250 }, { "epoch": 0.8900709219858156, "grad_norm": 0.32638158968370107, "learning_rate": 8.884213864087338e-06, "loss": 0.9985, "step": 251 }, { "epoch": 0.8936170212765957, "grad_norm": 0.33247828438256866, "learning_rate": 8.871183142126448e-06, "loss": 0.9756, "step": 252 }, { "epoch": 0.8971631205673759, "grad_norm": 0.31978717931535167, "learning_rate": 8.858086446056663e-06, "loss": 1.0017, "step": 253 }, { "epoch": 0.900709219858156, "grad_norm": 0.3109705452256348, "learning_rate": 8.84492399907664e-06, "loss": 0.9821, "step": 254 }, { "epoch": 0.9042553191489362, "grad_norm": 0.3286233702182248, "learning_rate": 8.831696025505578e-06, "loss": 1.0255, "step": 255 }, { "epoch": 0.9078014184397163, "grad_norm": 0.36774429785023866, "learning_rate": 8.818402750779402e-06, "loss": 1.0219, "step": 256 }, { "epoch": 0.9113475177304965, "grad_norm": 0.31331456588377077, "learning_rate": 8.805044401446934e-06, "loss": 1.0213, "step": 257 }, { "epoch": 0.9148936170212766, "grad_norm": 0.31688893896143006, "learning_rate": 8.791621205166008e-06, "loss": 1.0309, "step": 258 }, { "epoch": 0.9184397163120568, "grad_norm": 0.3171461675322659, "learning_rate": 8.778133390699614e-06, "loss": 1.0634, "step": 259 }, { "epoch": 0.9219858156028369, "grad_norm": 0.3127548936419294, "learning_rate": 8.76458118791198e-06, "loss": 0.9812, "step": 260 }, { "epoch": 0.925531914893617, "grad_norm": 0.30897161073749047, "learning_rate": 8.750964827764672e-06, "loss": 1.0406, "step": 261 }, { "epoch": 0.9290780141843972, "grad_norm": 0.2897907506271805, "learning_rate": 8.737284542312641e-06, "loss": 1.0094, "step": 262 }, { "epoch": 0.9326241134751773, "grad_norm": 0.3665379588453067, "learning_rate": 8.723540564700281e-06, "loss": 1.0044, "step": 263 }, { "epoch": 0.9361702127659575, "grad_norm": 0.2913167263149895, "learning_rate": 8.70973312915745e-06, "loss": 1.0024, "step": 264 }, { "epoch": 0.9397163120567376, "grad_norm": 0.32389726921327855, "learning_rate": 8.695862470995476e-06, "loss": 1.0067, "step": 265 }, { "epoch": 0.9432624113475178, "grad_norm": 0.2887062071948652, "learning_rate": 8.681928826603154e-06, "loss": 1.0193, "step": 266 }, { "epoch": 0.9468085106382979, "grad_norm": 0.3444704323983549, "learning_rate": 8.667932433442712e-06, "loss": 1.0151, "step": 267 }, { "epoch": 0.950354609929078, "grad_norm": 0.32387765447530653, "learning_rate": 8.653873530045762e-06, "loss": 0.9763, "step": 268 }, { "epoch": 0.9539007092198581, "grad_norm": 0.33452698373000367, "learning_rate": 8.639752356009247e-06, "loss": 1.0618, "step": 269 }, { "epoch": 0.9574468085106383, "grad_norm": 0.30214269182575554, "learning_rate": 8.625569151991337e-06, "loss": 1.0177, "step": 270 }, { "epoch": 0.9609929078014184, "grad_norm": 0.30219144726515185, "learning_rate": 8.61132415970735e-06, "loss": 0.9992, "step": 271 }, { "epoch": 0.9645390070921985, "grad_norm": 0.3082396690680757, "learning_rate": 8.597017621925613e-06, "loss": 0.985, "step": 272 }, { "epoch": 0.9680851063829787, "grad_norm": 0.2911224108393295, "learning_rate": 8.582649782463342e-06, "loss": 0.9703, "step": 273 }, { "epoch": 0.9716312056737588, "grad_norm": 0.32560268114434127, "learning_rate": 8.568220886182471e-06, "loss": 1.0256, "step": 274 }, { "epoch": 0.975177304964539, "grad_norm": 0.3417439506575457, "learning_rate": 8.553731178985494e-06, "loss": 1.0151, "step": 275 }, { "epoch": 0.9787234042553191, "grad_norm": 0.3009926028618643, "learning_rate": 8.539180907811259e-06, "loss": 1.0075, "step": 276 }, { "epoch": 0.9822695035460993, "grad_norm": 0.28365338807856594, "learning_rate": 8.524570320630776e-06, "loss": 1.0129, "step": 277 }, { "epoch": 0.9858156028368794, "grad_norm": 0.28724644120252363, "learning_rate": 8.509899666442972e-06, "loss": 1.0105, "step": 278 }, { "epoch": 0.9893617021276596, "grad_norm": 0.31003584935491224, "learning_rate": 8.495169195270468e-06, "loss": 1.013, "step": 279 }, { "epoch": 0.9929078014184397, "grad_norm": 0.3341596345537401, "learning_rate": 8.480379158155299e-06, "loss": 1.0101, "step": 280 }, { "epoch": 0.9964539007092199, "grad_norm": 0.2916203631074705, "learning_rate": 8.46552980715465e-06, "loss": 0.98, "step": 281 }, { "epoch": 1.0, "grad_norm": 0.30602453254734685, "learning_rate": 8.450621395336554e-06, "loss": 1.0149, "step": 282 }, { "epoch": 1.00354609929078, "grad_norm": 0.34143180499153625, "learning_rate": 8.435654176775577e-06, "loss": 1.008, "step": 283 }, { "epoch": 1.0070921985815602, "grad_norm": 0.31755355103402105, "learning_rate": 8.420628406548495e-06, "loss": 0.9468, "step": 284 }, { "epoch": 1.0106382978723405, "grad_norm": 0.29693001139769637, "learning_rate": 8.405544340729938e-06, "loss": 0.9732, "step": 285 }, { "epoch": 1.0141843971631206, "grad_norm": 0.30517392727553205, "learning_rate": 8.39040223638804e-06, "loss": 1.005, "step": 286 }, { "epoch": 1.0177304964539007, "grad_norm": 0.3319155942533091, "learning_rate": 8.375202351580032e-06, "loss": 0.9983, "step": 287 }, { "epoch": 1.0212765957446808, "grad_norm": 0.33898788661133994, "learning_rate": 8.359944945347878e-06, "loss": 0.9845, "step": 288 }, { "epoch": 1.024822695035461, "grad_norm": 0.3294475503786435, "learning_rate": 8.344630277713833e-06, "loss": 0.9783, "step": 289 }, { "epoch": 1.0283687943262412, "grad_norm": 0.3034842323103239, "learning_rate": 8.329258609676025e-06, "loss": 1.0015, "step": 290 }, { "epoch": 1.0319148936170213, "grad_norm": 0.33472880263702304, "learning_rate": 8.313830203204e-06, "loss": 1.0634, "step": 291 }, { "epoch": 1.0354609929078014, "grad_norm": 0.29885368652442307, "learning_rate": 8.298345321234268e-06, "loss": 0.9805, "step": 292 }, { "epoch": 1.0390070921985815, "grad_norm": 0.33194416730791076, "learning_rate": 8.282804227665811e-06, "loss": 0.9417, "step": 293 }, { "epoch": 1.0425531914893618, "grad_norm": 0.31122904875223173, "learning_rate": 8.267207187355584e-06, "loss": 0.9834, "step": 294 }, { "epoch": 1.0460992907801419, "grad_norm": 0.31299257820971016, "learning_rate": 8.251554466114015e-06, "loss": 0.9814, "step": 295 }, { "epoch": 1.049645390070922, "grad_norm": 0.3257141390690963, "learning_rate": 8.235846330700462e-06, "loss": 1.028, "step": 296 }, { "epoch": 1.053191489361702, "grad_norm": 0.3139449259339367, "learning_rate": 8.220083048818677e-06, "loss": 0.9874, "step": 297 }, { "epoch": 1.0567375886524824, "grad_norm": 0.28375925722079853, "learning_rate": 8.20426488911223e-06, "loss": 0.988, "step": 298 }, { "epoch": 1.0602836879432624, "grad_norm": 0.3080449763736085, "learning_rate": 8.188392121159944e-06, "loss": 0.9603, "step": 299 }, { "epoch": 1.0638297872340425, "grad_norm": 0.31810874711561066, "learning_rate": 8.172465015471297e-06, "loss": 1.0021, "step": 300 }, { "epoch": 1.0673758865248226, "grad_norm": 0.3843835432630994, "learning_rate": 8.156483843481803e-06, "loss": 0.9741, "step": 301 }, { "epoch": 1.070921985815603, "grad_norm": 0.2921680297046874, "learning_rate": 8.140448877548402e-06, "loss": 0.9652, "step": 302 }, { "epoch": 1.074468085106383, "grad_norm": 0.32308043249413676, "learning_rate": 8.124360390944806e-06, "loss": 0.9622, "step": 303 }, { "epoch": 1.0780141843971631, "grad_norm": 0.32008763782680183, "learning_rate": 8.108218657856847e-06, "loss": 0.989, "step": 304 }, { "epoch": 1.0815602836879432, "grad_norm": 0.29218848441796924, "learning_rate": 8.0920239533778e-06, "loss": 0.9507, "step": 305 }, { "epoch": 1.0851063829787233, "grad_norm": 0.31803772458478674, "learning_rate": 8.075776553503697e-06, "loss": 0.9458, "step": 306 }, { "epoch": 1.0886524822695036, "grad_norm": 0.29672154291811365, "learning_rate": 8.059476735128633e-06, "loss": 0.9771, "step": 307 }, { "epoch": 1.0921985815602837, "grad_norm": 0.31845357327852275, "learning_rate": 8.04312477604003e-06, "loss": 0.9709, "step": 308 }, { "epoch": 1.0957446808510638, "grad_norm": 0.3038595490947578, "learning_rate": 8.026720954913911e-06, "loss": 1.0014, "step": 309 }, { "epoch": 1.099290780141844, "grad_norm": 0.2983466920350398, "learning_rate": 8.010265551310152e-06, "loss": 1.0073, "step": 310 }, { "epoch": 1.1028368794326242, "grad_norm": 0.29685166161102927, "learning_rate": 7.993758845667721e-06, "loss": 0.9691, "step": 311 }, { "epoch": 1.1063829787234043, "grad_norm": 0.29662562844030943, "learning_rate": 7.977201119299884e-06, "loss": 0.9739, "step": 312 }, { "epoch": 1.1099290780141844, "grad_norm": 0.2889966961782496, "learning_rate": 7.960592654389431e-06, "loss": 0.9764, "step": 313 }, { "epoch": 1.1134751773049645, "grad_norm": 0.3238202468883854, "learning_rate": 7.943933733983851e-06, "loss": 0.994, "step": 314 }, { "epoch": 1.1170212765957448, "grad_norm": 0.32222442320832, "learning_rate": 7.927224641990514e-06, "loss": 0.9732, "step": 315 }, { "epoch": 1.1205673758865249, "grad_norm": 0.298533720417504, "learning_rate": 7.910465663171836e-06, "loss": 0.9429, "step": 316 }, { "epoch": 1.124113475177305, "grad_norm": 0.296905453140236, "learning_rate": 7.893657083140417e-06, "loss": 0.9932, "step": 317 }, { "epoch": 1.127659574468085, "grad_norm": 0.32555930618520895, "learning_rate": 7.876799188354182e-06, "loss": 0.9568, "step": 318 }, { "epoch": 1.1312056737588652, "grad_norm": 0.3173449654238968, "learning_rate": 7.8598922661115e-06, "loss": 0.9859, "step": 319 }, { "epoch": 1.1347517730496455, "grad_norm": 0.28826864562436166, "learning_rate": 7.842936604546274e-06, "loss": 0.9727, "step": 320 }, { "epoch": 1.1382978723404256, "grad_norm": 0.3185721313668236, "learning_rate": 7.825932492623047e-06, "loss": 0.9748, "step": 321 }, { "epoch": 1.1418439716312057, "grad_norm": 0.3050387005026849, "learning_rate": 7.808880220132072e-06, "loss": 0.9771, "step": 322 }, { "epoch": 1.1453900709219857, "grad_norm": 0.29506494819979956, "learning_rate": 7.791780077684365e-06, "loss": 0.9856, "step": 323 }, { "epoch": 1.148936170212766, "grad_norm": 0.30447454904881693, "learning_rate": 7.774632356706768e-06, "loss": 0.9564, "step": 324 }, { "epoch": 1.1524822695035462, "grad_norm": 0.30491652295076516, "learning_rate": 7.757437349436965e-06, "loss": 0.9706, "step": 325 }, { "epoch": 1.1560283687943262, "grad_norm": 0.2979750142174216, "learning_rate": 7.740195348918516e-06, "loss": 1.013, "step": 326 }, { "epoch": 1.1595744680851063, "grad_norm": 0.3208638182053111, "learning_rate": 7.722906648995856e-06, "loss": 0.9648, "step": 327 }, { "epoch": 1.1631205673758864, "grad_norm": 0.31562754162792245, "learning_rate": 7.705571544309284e-06, "loss": 0.9812, "step": 328 }, { "epoch": 1.1666666666666667, "grad_norm": 0.3154831480025962, "learning_rate": 7.688190330289954e-06, "loss": 1.0242, "step": 329 }, { "epoch": 1.1702127659574468, "grad_norm": 0.32770738923745224, "learning_rate": 7.670763303154823e-06, "loss": 0.9792, "step": 330 }, { "epoch": 1.173758865248227, "grad_norm": 0.2897014974624381, "learning_rate": 7.653290759901617e-06, "loss": 0.9626, "step": 331 }, { "epoch": 1.177304964539007, "grad_norm": 0.3240743301503223, "learning_rate": 7.635772998303762e-06, "loss": 1.0117, "step": 332 }, { "epoch": 1.1808510638297873, "grad_norm": 0.3417606557387963, "learning_rate": 7.618210316905316e-06, "loss": 1.0022, "step": 333 }, { "epoch": 1.1843971631205674, "grad_norm": 0.30388050884466505, "learning_rate": 7.60060301501587e-06, "loss": 1.0127, "step": 334 }, { "epoch": 1.1879432624113475, "grad_norm": 0.33397221226632634, "learning_rate": 7.58295139270546e-06, "loss": 0.9678, "step": 335 }, { "epoch": 1.1914893617021276, "grad_norm": 0.3114879011415663, "learning_rate": 7.56525575079944e-06, "loss": 0.9587, "step": 336 }, { "epoch": 1.1950354609929077, "grad_norm": 0.3035188078010411, "learning_rate": 7.547516390873366e-06, "loss": 0.9481, "step": 337 }, { "epoch": 1.198581560283688, "grad_norm": 0.2812108732976255, "learning_rate": 7.529733615247852e-06, "loss": 0.9639, "step": 338 }, { "epoch": 1.202127659574468, "grad_norm": 0.3332051202997969, "learning_rate": 7.511907726983418e-06, "loss": 1.0031, "step": 339 }, { "epoch": 1.2056737588652482, "grad_norm": 0.3233390257068262, "learning_rate": 7.494039029875326e-06, "loss": 1.0058, "step": 340 }, { "epoch": 1.2092198581560283, "grad_norm": 0.3064224647002411, "learning_rate": 7.476127828448401e-06, "loss": 0.9945, "step": 341 }, { "epoch": 1.2127659574468086, "grad_norm": 0.32004319887815785, "learning_rate": 7.458174427951839e-06, "loss": 0.9942, "step": 342 }, { "epoch": 1.2163120567375887, "grad_norm": 0.3098637711971066, "learning_rate": 7.440179134354015e-06, "loss": 0.9843, "step": 343 }, { "epoch": 1.2198581560283688, "grad_norm": 0.3200842602708185, "learning_rate": 7.422142254337255e-06, "loss": 0.9645, "step": 344 }, { "epoch": 1.2234042553191489, "grad_norm": 0.31627809897067866, "learning_rate": 7.40406409529262e-06, "loss": 1.0088, "step": 345 }, { "epoch": 1.226950354609929, "grad_norm": 0.30516493458777627, "learning_rate": 7.385944965314659e-06, "loss": 0.9485, "step": 346 }, { "epoch": 1.2304964539007093, "grad_norm": 0.42006679247749923, "learning_rate": 7.367785173196165e-06, "loss": 0.9741, "step": 347 }, { "epoch": 1.2340425531914894, "grad_norm": 0.2979983368279489, "learning_rate": 7.349585028422911e-06, "loss": 0.9621, "step": 348 }, { "epoch": 1.2375886524822695, "grad_norm": 0.2837787977691954, "learning_rate": 7.331344841168373e-06, "loss": 0.9653, "step": 349 }, { "epoch": 1.2411347517730495, "grad_norm": 0.32587288668181963, "learning_rate": 7.313064922288447e-06, "loss": 0.9788, "step": 350 }, { "epoch": 1.2446808510638299, "grad_norm": 0.30784818916780626, "learning_rate": 7.294745583316146e-06, "loss": 0.9468, "step": 351 }, { "epoch": 1.24822695035461, "grad_norm": 0.3154143746283973, "learning_rate": 7.276387136456301e-06, "loss": 0.9799, "step": 352 }, { "epoch": 1.25177304964539, "grad_norm": 0.3030006377170088, "learning_rate": 7.2579898945802275e-06, "loss": 0.9886, "step": 353 }, { "epoch": 1.2553191489361701, "grad_norm": 0.3143441269422687, "learning_rate": 7.239554171220402e-06, "loss": 0.9968, "step": 354 }, { "epoch": 1.2588652482269502, "grad_norm": 0.2912094833630409, "learning_rate": 7.221080280565119e-06, "loss": 0.9709, "step": 355 }, { "epoch": 1.2624113475177305, "grad_norm": 0.30996872705872774, "learning_rate": 7.2025685374531296e-06, "loss": 0.946, "step": 356 }, { "epoch": 1.2659574468085106, "grad_norm": 0.3055274174785096, "learning_rate": 7.184019257368283e-06, "loss": 0.9934, "step": 357 }, { "epoch": 1.2695035460992907, "grad_norm": 0.3146398842770105, "learning_rate": 7.165432756434147e-06, "loss": 0.9682, "step": 358 }, { "epoch": 1.273049645390071, "grad_norm": 0.31789868733117355, "learning_rate": 7.14680935140862e-06, "loss": 1.0099, "step": 359 }, { "epoch": 1.2765957446808511, "grad_norm": 0.29066796596227135, "learning_rate": 7.128149359678531e-06, "loss": 0.9823, "step": 360 }, { "epoch": 1.2801418439716312, "grad_norm": 0.3001734007673492, "learning_rate": 7.10945309925424e-06, "loss": 0.9538, "step": 361 }, { "epoch": 1.2836879432624113, "grad_norm": 0.3195112428656178, "learning_rate": 7.090720888764201e-06, "loss": 0.9805, "step": 362 }, { "epoch": 1.2872340425531914, "grad_norm": 0.31414358074483567, "learning_rate": 7.0719530474495516e-06, "loss": 0.9471, "step": 363 }, { "epoch": 1.2907801418439715, "grad_norm": 0.30331999751795313, "learning_rate": 7.053149895158655e-06, "loss": 0.9373, "step": 364 }, { "epoch": 1.2943262411347518, "grad_norm": 0.3263724598027234, "learning_rate": 7.034311752341667e-06, "loss": 0.9734, "step": 365 }, { "epoch": 1.297872340425532, "grad_norm": 0.2947513781189209, "learning_rate": 7.015438940045052e-06, "loss": 1.0191, "step": 366 }, { "epoch": 1.301418439716312, "grad_norm": 0.3288298854144095, "learning_rate": 6.996531779906134e-06, "loss": 0.9636, "step": 367 }, { "epoch": 1.3049645390070923, "grad_norm": 0.29605266293183813, "learning_rate": 6.977590594147602e-06, "loss": 0.9695, "step": 368 }, { "epoch": 1.3085106382978724, "grad_norm": 0.2985474792869868, "learning_rate": 6.95861570557202e-06, "loss": 1.0043, "step": 369 }, { "epoch": 1.3120567375886525, "grad_norm": 0.3344709349037918, "learning_rate": 6.939607437556332e-06, "loss": 0.9864, "step": 370 }, { "epoch": 1.3156028368794326, "grad_norm": 0.2994020935139944, "learning_rate": 6.920566114046342e-06, "loss": 0.948, "step": 371 }, { "epoch": 1.3191489361702127, "grad_norm": 0.31339342553939153, "learning_rate": 6.901492059551202e-06, "loss": 0.9684, "step": 372 }, { "epoch": 1.322695035460993, "grad_norm": 0.29663562079700745, "learning_rate": 6.882385599137873e-06, "loss": 0.9907, "step": 373 }, { "epoch": 1.326241134751773, "grad_norm": 0.2968550934200831, "learning_rate": 6.863247058425594e-06, "loss": 0.9718, "step": 374 }, { "epoch": 1.3297872340425532, "grad_norm": 0.3321212522461139, "learning_rate": 6.844076763580325e-06, "loss": 0.955, "step": 375 }, { "epoch": 1.3333333333333333, "grad_norm": 0.31193859789454, "learning_rate": 6.824875041309193e-06, "loss": 0.9741, "step": 376 }, { "epoch": 1.3368794326241136, "grad_norm": 0.2896182290576476, "learning_rate": 6.80564221885492e-06, "loss": 0.9754, "step": 377 }, { "epoch": 1.3404255319148937, "grad_norm": 0.29596129311607944, "learning_rate": 6.786378623990251e-06, "loss": 0.979, "step": 378 }, { "epoch": 1.3439716312056738, "grad_norm": 0.28949936975952406, "learning_rate": 6.767084585012365e-06, "loss": 0.9435, "step": 379 }, { "epoch": 1.3475177304964538, "grad_norm": 0.29808609428623706, "learning_rate": 6.747760430737283e-06, "loss": 0.952, "step": 380 }, { "epoch": 1.351063829787234, "grad_norm": 0.299659499522143, "learning_rate": 6.728406490494257e-06, "loss": 1.0018, "step": 381 }, { "epoch": 1.3546099290780143, "grad_norm": 0.28062106602113573, "learning_rate": 6.709023094120164e-06, "loss": 0.9646, "step": 382 }, { "epoch": 1.3581560283687943, "grad_norm": 0.3190611371431519, "learning_rate": 6.689610571953888e-06, "loss": 0.9906, "step": 383 }, { "epoch": 1.3617021276595744, "grad_norm": 0.3226903983486605, "learning_rate": 6.670169254830677e-06, "loss": 0.9883, "step": 384 }, { "epoch": 1.3652482269503547, "grad_norm": 0.31979586889104406, "learning_rate": 6.650699474076521e-06, "loss": 0.9551, "step": 385 }, { "epoch": 1.3687943262411348, "grad_norm": 0.2987370000405775, "learning_rate": 6.63120156150249e-06, "loss": 0.9905, "step": 386 }, { "epoch": 1.372340425531915, "grad_norm": 0.3158581640769174, "learning_rate": 6.611675849399093e-06, "loss": 0.944, "step": 387 }, { "epoch": 1.375886524822695, "grad_norm": 0.3196575897121104, "learning_rate": 6.592122670530605e-06, "loss": 0.9683, "step": 388 }, { "epoch": 1.3794326241134751, "grad_norm": 0.31718846582809435, "learning_rate": 6.572542358129402e-06, "loss": 0.9663, "step": 389 }, { "epoch": 1.3829787234042552, "grad_norm": 0.29081199827594456, "learning_rate": 6.552935245890279e-06, "loss": 0.9642, "step": 390 }, { "epoch": 1.3865248226950355, "grad_norm": 0.33150300720886544, "learning_rate": 6.533301667964761e-06, "loss": 0.9847, "step": 391 }, { "epoch": 1.3900709219858156, "grad_norm": 0.3080116352973645, "learning_rate": 6.513641958955415e-06, "loss": 0.97, "step": 392 }, { "epoch": 1.3936170212765957, "grad_norm": 0.29915760110577744, "learning_rate": 6.493956453910137e-06, "loss": 0.9889, "step": 393 }, { "epoch": 1.397163120567376, "grad_norm": 0.2937550770844295, "learning_rate": 6.474245488316457e-06, "loss": 0.9501, "step": 394 }, { "epoch": 1.400709219858156, "grad_norm": 0.29771846720804795, "learning_rate": 6.454509398095808e-06, "loss": 0.9682, "step": 395 }, { "epoch": 1.4042553191489362, "grad_norm": 0.29916669393230055, "learning_rate": 6.4347485195978044e-06, "loss": 0.9584, "step": 396 }, { "epoch": 1.4078014184397163, "grad_norm": 0.28671933648011866, "learning_rate": 6.414963189594513e-06, "loss": 0.9882, "step": 397 }, { "epoch": 1.4113475177304964, "grad_norm": 0.3189197205066843, "learning_rate": 6.395153745274716e-06, "loss": 0.9873, "step": 398 }, { "epoch": 1.4148936170212765, "grad_norm": 0.3326690726765476, "learning_rate": 6.375320524238154e-06, "loss": 0.9782, "step": 399 }, { "epoch": 1.4184397163120568, "grad_norm": 0.30128728810316074, "learning_rate": 6.355463864489784e-06, "loss": 0.9575, "step": 400 }, { "epoch": 1.4219858156028369, "grad_norm": 0.3221081525807363, "learning_rate": 6.335584104434012e-06, "loss": 0.9589, "step": 401 }, { "epoch": 1.425531914893617, "grad_norm": 0.30461915445802135, "learning_rate": 6.315681582868927e-06, "loss": 0.952, "step": 402 }, { "epoch": 1.4290780141843973, "grad_norm": 0.29693950403052216, "learning_rate": 6.295756638980529e-06, "loss": 0.973, "step": 403 }, { "epoch": 1.4326241134751774, "grad_norm": 0.2826303564203807, "learning_rate": 6.275809612336947e-06, "loss": 0.9749, "step": 404 }, { "epoch": 1.4361702127659575, "grad_norm": 0.30933574638253075, "learning_rate": 6.255840842882654e-06, "loss": 0.975, "step": 405 }, { "epoch": 1.4397163120567376, "grad_norm": 0.3035994652682524, "learning_rate": 6.235850670932671e-06, "loss": 0.9858, "step": 406 }, { "epoch": 1.4432624113475176, "grad_norm": 0.30860001277258076, "learning_rate": 6.215839437166766e-06, "loss": 0.9753, "step": 407 }, { "epoch": 1.4468085106382977, "grad_norm": 0.3169790014334784, "learning_rate": 6.195807482623653e-06, "loss": 1.0004, "step": 408 }, { "epoch": 1.450354609929078, "grad_norm": 0.29694454989551333, "learning_rate": 6.175755148695174e-06, "loss": 0.9842, "step": 409 }, { "epoch": 1.4539007092198581, "grad_norm": 0.310904878784513, "learning_rate": 6.155682777120486e-06, "loss": 0.9336, "step": 410 }, { "epoch": 1.4574468085106382, "grad_norm": 0.34217894432121504, "learning_rate": 6.135590709980237e-06, "loss": 0.9795, "step": 411 }, { "epoch": 1.4609929078014185, "grad_norm": 0.3020059499176723, "learning_rate": 6.115479289690729e-06, "loss": 0.9844, "step": 412 }, { "epoch": 1.4645390070921986, "grad_norm": 0.317624004692271, "learning_rate": 6.095348858998089e-06, "loss": 0.9951, "step": 413 }, { "epoch": 1.4680851063829787, "grad_norm": 0.2928543809085441, "learning_rate": 6.075199760972429e-06, "loss": 0.97, "step": 414 }, { "epoch": 1.4716312056737588, "grad_norm": 0.3274430352361046, "learning_rate": 6.055032339001995e-06, "loss": 0.9495, "step": 415 }, { "epoch": 1.475177304964539, "grad_norm": 0.3067154203571681, "learning_rate": 6.034846936787314e-06, "loss": 0.9525, "step": 416 }, { "epoch": 1.4787234042553192, "grad_norm": 0.3635305749619241, "learning_rate": 6.014643898335342e-06, "loss": 0.9483, "step": 417 }, { "epoch": 1.4822695035460993, "grad_norm": 0.31982253258089927, "learning_rate": 5.994423567953594e-06, "loss": 0.959, "step": 418 }, { "epoch": 1.4858156028368794, "grad_norm": 0.307334104209482, "learning_rate": 5.974186290244287e-06, "loss": 0.975, "step": 419 }, { "epoch": 1.4893617021276595, "grad_norm": 0.28386747082530117, "learning_rate": 5.953932410098455e-06, "loss": 0.9378, "step": 420 }, { "epoch": 1.4929078014184398, "grad_norm": 0.3303235689028051, "learning_rate": 5.933662272690079e-06, "loss": 0.9595, "step": 421 }, { "epoch": 1.49645390070922, "grad_norm": 0.2992530384972015, "learning_rate": 5.9133762234702005e-06, "loss": 0.9519, "step": 422 }, { "epoch": 1.5, "grad_norm": 0.3006026134285121, "learning_rate": 5.893074608161039e-06, "loss": 0.9872, "step": 423 }, { "epoch": 1.50354609929078, "grad_norm": 0.30120382439027926, "learning_rate": 5.872757772750093e-06, "loss": 0.993, "step": 424 }, { "epoch": 1.5070921985815602, "grad_norm": 0.2769893266761998, "learning_rate": 5.85242606348425e-06, "loss": 0.9473, "step": 425 }, { "epoch": 1.5106382978723403, "grad_norm": 0.3089395827751758, "learning_rate": 5.832079826863884e-06, "loss": 1.0065, "step": 426 }, { "epoch": 1.5141843971631206, "grad_norm": 0.29940081344211505, "learning_rate": 5.8117194096369436e-06, "loss": 0.944, "step": 427 }, { "epoch": 1.5177304964539007, "grad_norm": 0.2995098648995305, "learning_rate": 5.791345158793058e-06, "loss": 0.9733, "step": 428 }, { "epoch": 1.521276595744681, "grad_norm": 0.3115358431531882, "learning_rate": 5.770957421557606e-06, "loss": 0.9607, "step": 429 }, { "epoch": 1.524822695035461, "grad_norm": 0.30438941792218244, "learning_rate": 5.750556545385809e-06, "loss": 0.9483, "step": 430 }, { "epoch": 1.5283687943262412, "grad_norm": 0.2878973588948658, "learning_rate": 5.73014287795681e-06, "loss": 0.9673, "step": 431 }, { "epoch": 1.5319148936170213, "grad_norm": 0.4799995809511032, "learning_rate": 5.709716767167741e-06, "loss": 0.9839, "step": 432 }, { "epoch": 1.5354609929078014, "grad_norm": 0.29444675718497804, "learning_rate": 5.689278561127798e-06, "loss": 0.9638, "step": 433 }, { "epoch": 1.5390070921985815, "grad_norm": 0.3125370922335577, "learning_rate": 5.66882860815231e-06, "loss": 0.9988, "step": 434 }, { "epoch": 1.5425531914893615, "grad_norm": 0.29290640767090315, "learning_rate": 5.648367256756805e-06, "loss": 0.9655, "step": 435 }, { "epoch": 1.5460992907801419, "grad_norm": 0.2962189440583618, "learning_rate": 5.627894855651061e-06, "loss": 0.9771, "step": 436 }, { "epoch": 1.549645390070922, "grad_norm": 0.30214417137638616, "learning_rate": 5.607411753733173e-06, "loss": 0.9864, "step": 437 }, { "epoch": 1.5531914893617023, "grad_norm": 0.30275111188842363, "learning_rate": 5.586918300083601e-06, "loss": 0.9649, "step": 438 }, { "epoch": 1.5567375886524824, "grad_norm": 0.3010084088396629, "learning_rate": 5.566414843959228e-06, "loss": 0.9363, "step": 439 }, { "epoch": 1.5602836879432624, "grad_norm": 0.29858075869310646, "learning_rate": 5.5459017347873945e-06, "loss": 0.9592, "step": 440 }, { "epoch": 1.5638297872340425, "grad_norm": 0.28734689028186544, "learning_rate": 5.525379322159959e-06, "loss": 0.9319, "step": 441 }, { "epoch": 1.5673758865248226, "grad_norm": 0.291813442851596, "learning_rate": 5.504847955827326e-06, "loss": 0.9637, "step": 442 }, { "epoch": 1.5709219858156027, "grad_norm": 0.2766076476707464, "learning_rate": 5.4843079856925e-06, "loss": 0.9534, "step": 443 }, { "epoch": 1.574468085106383, "grad_norm": 0.27840413680816445, "learning_rate": 5.46375976180511e-06, "loss": 0.9483, "step": 444 }, { "epoch": 1.5780141843971631, "grad_norm": 0.2947185958545857, "learning_rate": 5.44320363435545e-06, "loss": 0.9801, "step": 445 }, { "epoch": 1.5815602836879432, "grad_norm": 0.2886872982993816, "learning_rate": 5.422639953668508e-06, "loss": 0.9641, "step": 446 }, { "epoch": 1.5851063829787235, "grad_norm": 0.3089500390477893, "learning_rate": 5.4020690701979975e-06, "loss": 0.9486, "step": 447 }, { "epoch": 1.5886524822695036, "grad_norm": 0.2866723670777294, "learning_rate": 5.381491334520386e-06, "loss": 0.9449, "step": 448 }, { "epoch": 1.5921985815602837, "grad_norm": 0.3073645413927433, "learning_rate": 5.360907097328916e-06, "loss": 0.9586, "step": 449 }, { "epoch": 1.5957446808510638, "grad_norm": 0.2915250177745083, "learning_rate": 5.340316709427633e-06, "loss": 0.994, "step": 450 }, { "epoch": 1.599290780141844, "grad_norm": 0.2833870815858668, "learning_rate": 5.319720521725404e-06, "loss": 0.9908, "step": 451 }, { "epoch": 1.602836879432624, "grad_norm": 0.3007428124917032, "learning_rate": 5.299118885229944e-06, "loss": 0.9713, "step": 452 }, { "epoch": 1.6063829787234043, "grad_norm": 0.28433623098313027, "learning_rate": 5.278512151041817e-06, "loss": 0.9515, "step": 453 }, { "epoch": 1.6099290780141844, "grad_norm": 0.30719385606264527, "learning_rate": 5.257900670348473e-06, "loss": 0.9583, "step": 454 }, { "epoch": 1.6134751773049647, "grad_norm": 0.3062954471881532, "learning_rate": 5.237284794418252e-06, "loss": 0.986, "step": 455 }, { "epoch": 1.6170212765957448, "grad_norm": 0.2934177698314127, "learning_rate": 5.216664874594395e-06, "loss": 0.9473, "step": 456 }, { "epoch": 1.6205673758865249, "grad_norm": 0.3080528183868018, "learning_rate": 5.196041262289068e-06, "loss": 0.9593, "step": 457 }, { "epoch": 1.624113475177305, "grad_norm": 0.2793602089162291, "learning_rate": 5.175414308977356e-06, "loss": 0.9734, "step": 458 }, { "epoch": 1.627659574468085, "grad_norm": 0.29233280736864115, "learning_rate": 5.154784366191291e-06, "loss": 0.9659, "step": 459 }, { "epoch": 1.6312056737588652, "grad_norm": 0.2780019654413922, "learning_rate": 5.134151785513848e-06, "loss": 0.9338, "step": 460 }, { "epoch": 1.6347517730496453, "grad_norm": 0.31143249616181146, "learning_rate": 5.113516918572962e-06, "loss": 0.9628, "step": 461 }, { "epoch": 1.6382978723404256, "grad_norm": 0.28911603697527494, "learning_rate": 5.092880117035527e-06, "loss": 0.9546, "step": 462 }, { "epoch": 1.6418439716312057, "grad_norm": 0.2897381836850394, "learning_rate": 5.072241732601409e-06, "loss": 0.9553, "step": 463 }, { "epoch": 1.645390070921986, "grad_norm": 0.2847413324095445, "learning_rate": 5.05160211699745e-06, "loss": 0.956, "step": 464 }, { "epoch": 1.648936170212766, "grad_norm": 0.31042765983352155, "learning_rate": 5.030961621971473e-06, "loss": 0.9709, "step": 465 }, { "epoch": 1.6524822695035462, "grad_norm": 0.3174804935713863, "learning_rate": 5.010320599286291e-06, "loss": 0.9702, "step": 466 }, { "epoch": 1.6560283687943262, "grad_norm": 0.33371712685845656, "learning_rate": 4.98967940071371e-06, "loss": 0.9786, "step": 467 }, { "epoch": 1.6595744680851063, "grad_norm": 0.299102183315897, "learning_rate": 4.9690383780285275e-06, "loss": 0.9818, "step": 468 }, { "epoch": 1.6631205673758864, "grad_norm": 0.29989741435526823, "learning_rate": 4.948397883002552e-06, "loss": 0.9619, "step": 469 }, { "epoch": 1.6666666666666665, "grad_norm": 0.2878302499960192, "learning_rate": 4.927758267398593e-06, "loss": 0.9679, "step": 470 }, { "epoch": 1.6702127659574468, "grad_norm": 0.2968381639716943, "learning_rate": 4.907119882964475e-06, "loss": 0.9593, "step": 471 }, { "epoch": 1.673758865248227, "grad_norm": 0.2903252373864183, "learning_rate": 4.88648308142704e-06, "loss": 0.9518, "step": 472 }, { "epoch": 1.6773049645390072, "grad_norm": 0.31754523985000604, "learning_rate": 4.865848214486152e-06, "loss": 0.9879, "step": 473 }, { "epoch": 1.6808510638297873, "grad_norm": 0.28170937739513674, "learning_rate": 4.845215633808711e-06, "loss": 0.9861, "step": 474 }, { "epoch": 1.6843971631205674, "grad_norm": 0.305109891784026, "learning_rate": 4.8245856910226465e-06, "loss": 0.9854, "step": 475 }, { "epoch": 1.6879432624113475, "grad_norm": 0.30305507843139246, "learning_rate": 4.803958737710934e-06, "loss": 0.95, "step": 476 }, { "epoch": 1.6914893617021276, "grad_norm": 0.4140533832093159, "learning_rate": 4.783335125405607e-06, "loss": 0.974, "step": 477 }, { "epoch": 1.6950354609929077, "grad_norm": 0.3219922547401882, "learning_rate": 4.76271520558175e-06, "loss": 0.9908, "step": 478 }, { "epoch": 1.6985815602836878, "grad_norm": 0.30690193865601584, "learning_rate": 4.742099329651529e-06, "loss": 0.9174, "step": 479 }, { "epoch": 1.702127659574468, "grad_norm": 0.30185794201523497, "learning_rate": 4.721487848958186e-06, "loss": 0.9511, "step": 480 }, { "epoch": 1.7056737588652482, "grad_norm": 0.2842241981843966, "learning_rate": 4.700881114770058e-06, "loss": 0.9396, "step": 481 }, { "epoch": 1.7092198581560285, "grad_norm": 0.29298942450660576, "learning_rate": 4.6802794782745964e-06, "loss": 0.9319, "step": 482 }, { "epoch": 1.7127659574468086, "grad_norm": 0.3019299474779739, "learning_rate": 4.659683290572368e-06, "loss": 0.9698, "step": 483 }, { "epoch": 1.7163120567375887, "grad_norm": 0.3178805833084689, "learning_rate": 4.639092902671086e-06, "loss": 0.984, "step": 484 }, { "epoch": 1.7198581560283688, "grad_norm": 0.29136410757159953, "learning_rate": 4.618508665479617e-06, "loss": 0.9706, "step": 485 }, { "epoch": 1.7234042553191489, "grad_norm": 0.29445141518274043, "learning_rate": 4.597930929802004e-06, "loss": 0.9806, "step": 486 }, { "epoch": 1.726950354609929, "grad_norm": 0.31012531312214564, "learning_rate": 4.577360046331493e-06, "loss": 0.9785, "step": 487 }, { "epoch": 1.7304964539007093, "grad_norm": 0.28404553910311303, "learning_rate": 4.556796365644551e-06, "loss": 0.9275, "step": 488 }, { "epoch": 1.7340425531914894, "grad_norm": 0.28794630384381376, "learning_rate": 4.536240238194891e-06, "loss": 0.9592, "step": 489 }, { "epoch": 1.7375886524822695, "grad_norm": 0.3125072262094061, "learning_rate": 4.5156920143075025e-06, "loss": 1.0066, "step": 490 }, { "epoch": 1.7411347517730498, "grad_norm": 0.2853483673702439, "learning_rate": 4.495152044172675e-06, "loss": 0.955, "step": 491 }, { "epoch": 1.7446808510638299, "grad_norm": 0.3065651685727851, "learning_rate": 4.474620677840045e-06, "loss": 0.969, "step": 492 }, { "epoch": 1.74822695035461, "grad_norm": 0.3030297248512967, "learning_rate": 4.454098265212606e-06, "loss": 0.9919, "step": 493 }, { "epoch": 1.75177304964539, "grad_norm": 0.26607760748077697, "learning_rate": 4.433585156040773e-06, "loss": 1.0229, "step": 494 }, { "epoch": 1.7553191489361701, "grad_norm": 0.2780988384136799, "learning_rate": 4.4130816999164e-06, "loss": 0.9493, "step": 495 }, { "epoch": 1.7588652482269502, "grad_norm": 0.3296008877389596, "learning_rate": 4.392588246266828e-06, "loss": 0.9832, "step": 496 }, { "epoch": 1.7624113475177305, "grad_norm": 0.3036798104904837, "learning_rate": 4.372105144348941e-06, "loss": 0.9676, "step": 497 }, { "epoch": 1.7659574468085106, "grad_norm": 0.28255169922201634, "learning_rate": 4.351632743243196e-06, "loss": 0.9514, "step": 498 }, { "epoch": 1.7695035460992907, "grad_norm": 0.2898670553449216, "learning_rate": 4.3311713918476905e-06, "loss": 0.9899, "step": 499 }, { "epoch": 1.773049645390071, "grad_norm": 0.2970934893697634, "learning_rate": 4.310721438872204e-06, "loss": 0.9899, "step": 500 }, { "epoch": 1.7765957446808511, "grad_norm": 0.28135988445679605, "learning_rate": 4.29028323283226e-06, "loss": 0.9673, "step": 501 }, { "epoch": 1.7801418439716312, "grad_norm": 0.27280819974954307, "learning_rate": 4.269857122043191e-06, "loss": 0.9533, "step": 502 }, { "epoch": 1.7836879432624113, "grad_norm": 0.2821277460947102, "learning_rate": 4.24944345461419e-06, "loss": 0.9906, "step": 503 }, { "epoch": 1.7872340425531914, "grad_norm": 0.307676093939498, "learning_rate": 4.229042578442396e-06, "loss": 0.9648, "step": 504 }, { "epoch": 1.7907801418439715, "grad_norm": 0.2691018301096354, "learning_rate": 4.208654841206945e-06, "loss": 0.9578, "step": 505 }, { "epoch": 1.7943262411347518, "grad_norm": 0.2807385843450333, "learning_rate": 4.188280590363057e-06, "loss": 0.9314, "step": 506 }, { "epoch": 1.797872340425532, "grad_norm": 0.28470763646920694, "learning_rate": 4.16792017313612e-06, "loss": 0.9716, "step": 507 }, { "epoch": 1.8014184397163122, "grad_norm": 0.2914819804698818, "learning_rate": 4.147573936515751e-06, "loss": 0.9746, "step": 508 }, { "epoch": 1.8049645390070923, "grad_norm": 0.2851091013774576, "learning_rate": 4.127242227249909e-06, "loss": 0.965, "step": 509 }, { "epoch": 1.8085106382978724, "grad_norm": 0.2996185175157145, "learning_rate": 4.106925391838963e-06, "loss": 0.979, "step": 510 }, { "epoch": 1.8120567375886525, "grad_norm": 0.29929940280427364, "learning_rate": 4.0866237765298e-06, "loss": 0.9404, "step": 511 }, { "epoch": 1.8156028368794326, "grad_norm": 0.3340420824403431, "learning_rate": 4.066337727309923e-06, "loss": 0.9874, "step": 512 }, { "epoch": 1.8191489361702127, "grad_norm": 0.2875024520377711, "learning_rate": 4.046067589901546e-06, "loss": 0.9268, "step": 513 }, { "epoch": 1.8226950354609928, "grad_norm": 0.29616333531021444, "learning_rate": 4.0258137097557145e-06, "loss": 0.9983, "step": 514 }, { "epoch": 1.826241134751773, "grad_norm": 0.2810687469878062, "learning_rate": 4.005576432046406e-06, "loss": 1.0002, "step": 515 }, { "epoch": 1.8297872340425532, "grad_norm": 0.29072017254478427, "learning_rate": 3.98535610166466e-06, "loss": 0.9691, "step": 516 }, { "epoch": 1.8333333333333335, "grad_norm": 0.2623580921407139, "learning_rate": 3.9651530632126885e-06, "loss": 0.9163, "step": 517 }, { "epoch": 1.8368794326241136, "grad_norm": 0.29904177768923496, "learning_rate": 3.944967660998007e-06, "loss": 0.949, "step": 518 }, { "epoch": 1.8404255319148937, "grad_norm": 0.28929807689484766, "learning_rate": 3.924800239027572e-06, "loss": 0.9733, "step": 519 }, { "epoch": 1.8439716312056738, "grad_norm": 0.27190815686381575, "learning_rate": 3.9046511410019115e-06, "loss": 0.9396, "step": 520 }, { "epoch": 1.8475177304964538, "grad_norm": 0.2825920621666795, "learning_rate": 3.884520710309273e-06, "loss": 0.9291, "step": 521 }, { "epoch": 1.851063829787234, "grad_norm": 0.30547650387137687, "learning_rate": 3.864409290019765e-06, "loss": 0.9728, "step": 522 }, { "epoch": 1.854609929078014, "grad_norm": 0.31297298034797305, "learning_rate": 3.844317222879513e-06, "loss": 0.9619, "step": 523 }, { "epoch": 1.8581560283687943, "grad_norm": 0.2848899449071369, "learning_rate": 3.824244851304827e-06, "loss": 0.9552, "step": 524 }, { "epoch": 1.8617021276595744, "grad_norm": 0.30073821252912547, "learning_rate": 3.804192517376348e-06, "loss": 0.9554, "step": 525 }, { "epoch": 1.8652482269503547, "grad_norm": 0.2768210486652934, "learning_rate": 3.784160562833235e-06, "loss": 0.9901, "step": 526 }, { "epoch": 1.8687943262411348, "grad_norm": 0.28077414419886976, "learning_rate": 3.764149329067329e-06, "loss": 0.9636, "step": 527 }, { "epoch": 1.872340425531915, "grad_norm": 0.277176038192619, "learning_rate": 3.744159157117345e-06, "loss": 0.9214, "step": 528 }, { "epoch": 1.875886524822695, "grad_norm": 0.29870576848760505, "learning_rate": 3.724190387663054e-06, "loss": 0.9607, "step": 529 }, { "epoch": 1.8794326241134751, "grad_norm": 0.292237260005101, "learning_rate": 3.7042433610194717e-06, "loss": 0.9683, "step": 530 }, { "epoch": 1.8829787234042552, "grad_norm": 0.2786605534373809, "learning_rate": 3.6843184171310748e-06, "loss": 0.9351, "step": 531 }, { "epoch": 1.8865248226950353, "grad_norm": 0.3185035381508269, "learning_rate": 3.66441589556599e-06, "loss": 0.9708, "step": 532 }, { "epoch": 1.8900709219858156, "grad_norm": 0.279203889694889, "learning_rate": 3.644536135510217e-06, "loss": 0.94, "step": 533 }, { "epoch": 1.8936170212765957, "grad_norm": 0.289952634900548, "learning_rate": 3.6246794757618464e-06, "loss": 1.0022, "step": 534 }, { "epoch": 1.897163120567376, "grad_norm": 0.27458435937104975, "learning_rate": 3.6048462547252847e-06, "loss": 0.9735, "step": 535 }, { "epoch": 1.900709219858156, "grad_norm": 0.27290633411671783, "learning_rate": 3.5850368104054877e-06, "loss": 0.968, "step": 536 }, { "epoch": 1.9042553191489362, "grad_norm": 0.2989979790305114, "learning_rate": 3.5652514804021985e-06, "loss": 0.9751, "step": 537 }, { "epoch": 1.9078014184397163, "grad_norm": 0.2977453602913095, "learning_rate": 3.5454906019041936e-06, "loss": 0.9588, "step": 538 }, { "epoch": 1.9113475177304964, "grad_norm": 0.2729304800860835, "learning_rate": 3.5257545116835433e-06, "loss": 0.9787, "step": 539 }, { "epoch": 1.9148936170212765, "grad_norm": 0.29372281528169397, "learning_rate": 3.5060435460898623e-06, "loss": 0.9884, "step": 540 }, { "epoch": 1.9184397163120568, "grad_norm": 0.2786257293371286, "learning_rate": 3.4863580410445865e-06, "loss": 0.9732, "step": 541 }, { "epoch": 1.9219858156028369, "grad_norm": 0.2879894147784387, "learning_rate": 3.46669833203524e-06, "loss": 0.9669, "step": 542 }, { "epoch": 1.925531914893617, "grad_norm": 0.2811802137237428, "learning_rate": 3.447064754109722e-06, "loss": 0.963, "step": 543 }, { "epoch": 1.9290780141843973, "grad_norm": 0.27409703034250343, "learning_rate": 3.427457641870599e-06, "loss": 0.9635, "step": 544 }, { "epoch": 1.9326241134751774, "grad_norm": 0.27424991306379276, "learning_rate": 3.407877329469395e-06, "loss": 0.9702, "step": 545 }, { "epoch": 1.9361702127659575, "grad_norm": 0.28757818556734716, "learning_rate": 3.3883241506009084e-06, "loss": 0.9452, "step": 546 }, { "epoch": 1.9397163120567376, "grad_norm": 0.2797257476342126, "learning_rate": 3.3687984384975127e-06, "loss": 0.9193, "step": 547 }, { "epoch": 1.9432624113475176, "grad_norm": 0.2888590851585118, "learning_rate": 3.3493005259234806e-06, "loss": 0.9532, "step": 548 }, { "epoch": 1.9468085106382977, "grad_norm": 0.27397053268853844, "learning_rate": 3.329830745169324e-06, "loss": 0.9263, "step": 549 }, { "epoch": 1.950354609929078, "grad_norm": 0.3041708724941164, "learning_rate": 3.3103894280461136e-06, "loss": 0.9944, "step": 550 }, { "epoch": 1.9539007092198581, "grad_norm": 0.27683630826913047, "learning_rate": 3.2909769058798367e-06, "loss": 0.9642, "step": 551 }, { "epoch": 1.9574468085106385, "grad_norm": 0.2877278021257939, "learning_rate": 3.2715935095057462e-06, "loss": 0.9549, "step": 552 }, { "epoch": 1.9609929078014185, "grad_norm": 0.2821807734131442, "learning_rate": 3.2522395692627185e-06, "loss": 0.9631, "step": 553 }, { "epoch": 1.9645390070921986, "grad_norm": 0.28853626674447513, "learning_rate": 3.2329154149876364e-06, "loss": 0.9539, "step": 554 }, { "epoch": 1.9680851063829787, "grad_norm": 0.28257875993931314, "learning_rate": 3.2136213760097494e-06, "loss": 0.9642, "step": 555 }, { "epoch": 1.9716312056737588, "grad_norm": 0.29723382583132046, "learning_rate": 3.1943577811450814e-06, "loss": 0.9911, "step": 556 }, { "epoch": 1.975177304964539, "grad_norm": 0.2871600479431813, "learning_rate": 3.1751249586908096e-06, "loss": 0.9602, "step": 557 }, { "epoch": 1.978723404255319, "grad_norm": 0.2770335002399615, "learning_rate": 3.1559232364196758e-06, "loss": 0.9862, "step": 558 }, { "epoch": 1.9822695035460993, "grad_norm": 0.2891540340709798, "learning_rate": 3.1367529415744074e-06, "loss": 0.964, "step": 559 }, { "epoch": 1.9858156028368794, "grad_norm": 0.29884024785255925, "learning_rate": 3.117614400862127e-06, "loss": 0.955, "step": 560 }, { "epoch": 1.9893617021276597, "grad_norm": 0.2872543538398749, "learning_rate": 3.0985079404487996e-06, "loss": 0.9667, "step": 561 }, { "epoch": 1.9929078014184398, "grad_norm": 0.3078540388603912, "learning_rate": 3.0794338859536597e-06, "loss": 0.9518, "step": 562 }, { "epoch": 1.99645390070922, "grad_norm": 0.29042976474625587, "learning_rate": 3.0603925624436694e-06, "loss": 0.919, "step": 563 }, { "epoch": 2.0, "grad_norm": 0.27633705422516736, "learning_rate": 3.0413842944279814e-06, "loss": 0.9662, "step": 564 }, { "epoch": 2.00354609929078, "grad_norm": 0.34211568090401184, "learning_rate": 3.0224094058524e-06, "loss": 0.9166, "step": 565 }, { "epoch": 2.00709219858156, "grad_norm": 0.3305431234600616, "learning_rate": 3.0034682200938674e-06, "loss": 0.9789, "step": 566 }, { "epoch": 2.0106382978723403, "grad_norm": 0.2764439353136178, "learning_rate": 2.98456105995495e-06, "loss": 0.9392, "step": 567 }, { "epoch": 2.0141843971631204, "grad_norm": 0.29239625818522735, "learning_rate": 2.965688247658335e-06, "loss": 0.9467, "step": 568 }, { "epoch": 2.017730496453901, "grad_norm": 0.2919261016206263, "learning_rate": 2.9468501048413455e-06, "loss": 0.9434, "step": 569 }, { "epoch": 2.021276595744681, "grad_norm": 0.28828087244092343, "learning_rate": 2.9280469525504497e-06, "loss": 0.9191, "step": 570 }, { "epoch": 2.024822695035461, "grad_norm": 0.28533521833400965, "learning_rate": 2.9092791112358017e-06, "loss": 0.9449, "step": 571 }, { "epoch": 2.028368794326241, "grad_norm": 0.31171267011983783, "learning_rate": 2.8905469007457642e-06, "loss": 0.9262, "step": 572 }, { "epoch": 2.0319148936170213, "grad_norm": 0.30226821392239767, "learning_rate": 2.87185064032147e-06, "loss": 0.9285, "step": 573 }, { "epoch": 2.0354609929078014, "grad_norm": 0.3055261291230611, "learning_rate": 2.8531906485913827e-06, "loss": 0.9532, "step": 574 }, { "epoch": 2.0390070921985815, "grad_norm": 0.29091256337193977, "learning_rate": 2.8345672435658534e-06, "loss": 0.9265, "step": 575 }, { "epoch": 2.0425531914893615, "grad_norm": 0.29603659523290005, "learning_rate": 2.815980742631718e-06, "loss": 0.9394, "step": 576 }, { "epoch": 2.0460992907801416, "grad_norm": 0.3081350560933878, "learning_rate": 2.7974314625468725e-06, "loss": 0.9791, "step": 577 }, { "epoch": 2.049645390070922, "grad_norm": 0.2953052822851617, "learning_rate": 2.778919719434882e-06, "loss": 0.9147, "step": 578 }, { "epoch": 2.0531914893617023, "grad_norm": 0.29073720558692157, "learning_rate": 2.7604458287795986e-06, "loss": 0.9277, "step": 579 }, { "epoch": 2.0567375886524824, "grad_norm": 0.29108010374316245, "learning_rate": 2.7420101054197733e-06, "loss": 0.9348, "step": 580 }, { "epoch": 2.0602836879432624, "grad_norm": 0.2911095892676222, "learning_rate": 2.7236128635436997e-06, "loss": 0.9109, "step": 581 }, { "epoch": 2.0638297872340425, "grad_norm": 0.2977706310964848, "learning_rate": 2.7052544166838546e-06, "loss": 0.9433, "step": 582 }, { "epoch": 2.0673758865248226, "grad_norm": 0.2881480472063736, "learning_rate": 2.686935077711553e-06, "loss": 0.9672, "step": 583 }, { "epoch": 2.0709219858156027, "grad_norm": 0.28903989577522315, "learning_rate": 2.6686551588316277e-06, "loss": 0.9591, "step": 584 }, { "epoch": 2.074468085106383, "grad_norm": 0.36170594222878666, "learning_rate": 2.6504149715770906e-06, "loss": 0.9019, "step": 585 }, { "epoch": 2.078014184397163, "grad_norm": 0.2894517024838222, "learning_rate": 2.6322148268038374e-06, "loss": 0.9385, "step": 586 }, { "epoch": 2.0815602836879434, "grad_norm": 0.31218917348569936, "learning_rate": 2.6140550346853443e-06, "loss": 0.946, "step": 587 }, { "epoch": 2.0851063829787235, "grad_norm": 0.2861013285625956, "learning_rate": 2.5959359047073816e-06, "loss": 0.9351, "step": 588 }, { "epoch": 2.0886524822695036, "grad_norm": 0.266118093698617, "learning_rate": 2.5778577456627464e-06, "loss": 0.931, "step": 589 }, { "epoch": 2.0921985815602837, "grad_norm": 0.35858405171450963, "learning_rate": 2.5598208656459857e-06, "loss": 0.9236, "step": 590 }, { "epoch": 2.095744680851064, "grad_norm": 0.2907531000936525, "learning_rate": 2.5418255720481614e-06, "loss": 0.9664, "step": 591 }, { "epoch": 2.099290780141844, "grad_norm": 0.3034743580007518, "learning_rate": 2.5238721715516015e-06, "loss": 0.9188, "step": 592 }, { "epoch": 2.102836879432624, "grad_norm": 0.26171020225643493, "learning_rate": 2.5059609701246747e-06, "loss": 0.9322, "step": 593 }, { "epoch": 2.106382978723404, "grad_norm": 0.2975159541989247, "learning_rate": 2.4880922730165834e-06, "loss": 0.9509, "step": 594 }, { "epoch": 2.1099290780141846, "grad_norm": 0.27667835490300113, "learning_rate": 2.4702663847521486e-06, "loss": 0.9303, "step": 595 }, { "epoch": 2.1134751773049647, "grad_norm": 0.28878029559058327, "learning_rate": 2.4524836091266356e-06, "loss": 0.9414, "step": 596 }, { "epoch": 2.117021276595745, "grad_norm": 0.27394981881250624, "learning_rate": 2.434744249200563e-06, "loss": 0.9181, "step": 597 }, { "epoch": 2.120567375886525, "grad_norm": 0.30512835898777346, "learning_rate": 2.417048607294541e-06, "loss": 0.9646, "step": 598 }, { "epoch": 2.124113475177305, "grad_norm": 0.2841936138283053, "learning_rate": 2.3993969849841302e-06, "loss": 0.9495, "step": 599 }, { "epoch": 2.127659574468085, "grad_norm": 0.30533474413733125, "learning_rate": 2.3817896830946836e-06, "loss": 0.9382, "step": 600 }, { "epoch": 2.131205673758865, "grad_norm": 0.2836448626237975, "learning_rate": 2.3642270016962375e-06, "loss": 0.9441, "step": 601 }, { "epoch": 2.1347517730496453, "grad_norm": 0.2666663810743801, "learning_rate": 2.346709240098385e-06, "loss": 0.9542, "step": 602 }, { "epoch": 2.1382978723404253, "grad_norm": 0.2717443703316108, "learning_rate": 2.3292366968451794e-06, "loss": 0.9301, "step": 603 }, { "epoch": 2.141843971631206, "grad_norm": 0.2673424341790724, "learning_rate": 2.311809669710049e-06, "loss": 0.911, "step": 604 }, { "epoch": 2.145390070921986, "grad_norm": 0.2826849646707434, "learning_rate": 2.2944284556907164e-06, "loss": 0.9438, "step": 605 }, { "epoch": 2.148936170212766, "grad_norm": 0.2678779264629623, "learning_rate": 2.277093351004146e-06, "loss": 0.9531, "step": 606 }, { "epoch": 2.152482269503546, "grad_norm": 0.2699342042371205, "learning_rate": 2.259804651081486e-06, "loss": 0.9257, "step": 607 }, { "epoch": 2.1560283687943262, "grad_norm": 0.3061134213706471, "learning_rate": 2.242562650563036e-06, "loss": 0.9371, "step": 608 }, { "epoch": 2.1595744680851063, "grad_norm": 0.2721793891287796, "learning_rate": 2.2253676432932343e-06, "loss": 0.9393, "step": 609 }, { "epoch": 2.1631205673758864, "grad_norm": 0.2758871053995902, "learning_rate": 2.2082199223156353e-06, "loss": 0.95, "step": 610 }, { "epoch": 2.1666666666666665, "grad_norm": 0.27264130067982384, "learning_rate": 2.19111977986793e-06, "loss": 0.9214, "step": 611 }, { "epoch": 2.1702127659574466, "grad_norm": 0.2739629744024947, "learning_rate": 2.174067507376953e-06, "loss": 0.8991, "step": 612 }, { "epoch": 2.173758865248227, "grad_norm": 0.27429293801614335, "learning_rate": 2.1570633954537275e-06, "loss": 0.9574, "step": 613 }, { "epoch": 2.1773049645390072, "grad_norm": 0.2766435265905702, "learning_rate": 2.1401077338885025e-06, "loss": 0.9501, "step": 614 }, { "epoch": 2.1808510638297873, "grad_norm": 0.2863342636566308, "learning_rate": 2.123200811645817e-06, "loss": 0.9127, "step": 615 }, { "epoch": 2.1843971631205674, "grad_norm": 0.2694371336420443, "learning_rate": 2.1063429168595837e-06, "loss": 0.9348, "step": 616 }, { "epoch": 2.1879432624113475, "grad_norm": 0.27521771982705806, "learning_rate": 2.089534336828166e-06, "loss": 0.9259, "step": 617 }, { "epoch": 2.1914893617021276, "grad_norm": 0.2777223172868451, "learning_rate": 2.072775358009487e-06, "loss": 0.9349, "step": 618 }, { "epoch": 2.1950354609929077, "grad_norm": 0.26579333257806625, "learning_rate": 2.056066266016151e-06, "loss": 0.9071, "step": 619 }, { "epoch": 2.198581560283688, "grad_norm": 0.2754832251067736, "learning_rate": 2.0394073456105695e-06, "loss": 0.9809, "step": 620 }, { "epoch": 2.202127659574468, "grad_norm": 0.2722018752414588, "learning_rate": 2.022798880700117e-06, "loss": 0.9238, "step": 621 }, { "epoch": 2.2056737588652484, "grad_norm": 0.27237879185832914, "learning_rate": 2.00624115433228e-06, "loss": 0.974, "step": 622 }, { "epoch": 2.2092198581560285, "grad_norm": 0.27785931041430745, "learning_rate": 1.9897344486898484e-06, "loss": 0.9747, "step": 623 }, { "epoch": 2.2127659574468086, "grad_norm": 0.2756168481636738, "learning_rate": 1.973279045086091e-06, "loss": 0.9499, "step": 624 }, { "epoch": 2.2163120567375887, "grad_norm": 0.26675266910105166, "learning_rate": 1.9568752239599703e-06, "loss": 0.8902, "step": 625 }, { "epoch": 2.219858156028369, "grad_norm": 0.26212899656565325, "learning_rate": 1.940523264871367e-06, "loss": 0.9367, "step": 626 }, { "epoch": 2.223404255319149, "grad_norm": 0.2612008316175052, "learning_rate": 1.924223446496302e-06, "loss": 0.9313, "step": 627 }, { "epoch": 2.226950354609929, "grad_norm": 0.2855793919618653, "learning_rate": 1.9079760466222024e-06, "loss": 0.9493, "step": 628 }, { "epoch": 2.230496453900709, "grad_norm": 0.27511813469355617, "learning_rate": 1.8917813421431553e-06, "loss": 0.9874, "step": 629 }, { "epoch": 2.2340425531914896, "grad_norm": 0.28658768397486173, "learning_rate": 1.8756396090551936e-06, "loss": 0.9419, "step": 630 }, { "epoch": 2.2375886524822697, "grad_norm": 0.2970142820760959, "learning_rate": 1.8595511224515983e-06, "loss": 0.9497, "step": 631 }, { "epoch": 2.2411347517730498, "grad_norm": 0.2642536950812353, "learning_rate": 1.8435161565181986e-06, "loss": 0.958, "step": 632 }, { "epoch": 2.24468085106383, "grad_norm": 0.2820677710345039, "learning_rate": 1.8275349845287067e-06, "loss": 0.9204, "step": 633 }, { "epoch": 2.24822695035461, "grad_norm": 0.27294677589133154, "learning_rate": 1.8116078788400565e-06, "loss": 0.9537, "step": 634 }, { "epoch": 2.25177304964539, "grad_norm": 0.26990056141467217, "learning_rate": 1.795735110887772e-06, "loss": 0.9079, "step": 635 }, { "epoch": 2.25531914893617, "grad_norm": 0.27070672156460635, "learning_rate": 1.7799169511813257e-06, "loss": 0.9421, "step": 636 }, { "epoch": 2.2588652482269502, "grad_norm": 0.2705661929202214, "learning_rate": 1.7641536692995381e-06, "loss": 0.9136, "step": 637 }, { "epoch": 2.2624113475177303, "grad_norm": 0.2640291246713601, "learning_rate": 1.7484455338859873e-06, "loss": 0.9188, "step": 638 }, { "epoch": 2.2659574468085104, "grad_norm": 0.26625883501507475, "learning_rate": 1.7327928126444188e-06, "loss": 0.9209, "step": 639 }, { "epoch": 2.269503546099291, "grad_norm": 0.27398770162785274, "learning_rate": 1.7171957723341915e-06, "loss": 0.9551, "step": 640 }, { "epoch": 2.273049645390071, "grad_norm": 0.27482488308667535, "learning_rate": 1.7016546787657323e-06, "loss": 0.9602, "step": 641 }, { "epoch": 2.276595744680851, "grad_norm": 0.2845010838813931, "learning_rate": 1.686169796795999e-06, "loss": 0.9184, "step": 642 }, { "epoch": 2.280141843971631, "grad_norm": 0.2940585321034576, "learning_rate": 1.670741390323976e-06, "loss": 0.9571, "step": 643 }, { "epoch": 2.2836879432624113, "grad_norm": 0.2734310721998961, "learning_rate": 1.655369722286168e-06, "loss": 0.9418, "step": 644 }, { "epoch": 2.2872340425531914, "grad_norm": 0.26638039243536477, "learning_rate": 1.6400550546521222e-06, "loss": 0.9385, "step": 645 }, { "epoch": 2.2907801418439715, "grad_norm": 0.2726655750146368, "learning_rate": 1.6247976484199685e-06, "loss": 0.9622, "step": 646 }, { "epoch": 2.2943262411347516, "grad_norm": 0.26624856688782533, "learning_rate": 1.6095977636119615e-06, "loss": 0.9284, "step": 647 }, { "epoch": 2.297872340425532, "grad_norm": 0.27049982015424745, "learning_rate": 1.5944556592700611e-06, "loss": 0.9524, "step": 648 }, { "epoch": 2.301418439716312, "grad_norm": 0.26393403318675995, "learning_rate": 1.5793715934515063e-06, "loss": 0.9561, "step": 649 }, { "epoch": 2.3049645390070923, "grad_norm": 0.2712991096501478, "learning_rate": 1.5643458232244253e-06, "loss": 0.8982, "step": 650 }, { "epoch": 2.3085106382978724, "grad_norm": 0.27856650838814195, "learning_rate": 1.549378604663449e-06, "loss": 0.9281, "step": 651 }, { "epoch": 2.3120567375886525, "grad_norm": 0.26700086938766765, "learning_rate": 1.534470192845352e-06, "loss": 0.8984, "step": 652 }, { "epoch": 2.3156028368794326, "grad_norm": 0.26288396690044663, "learning_rate": 1.5196208418447033e-06, "loss": 0.8772, "step": 653 }, { "epoch": 2.3191489361702127, "grad_norm": 0.27221357473526764, "learning_rate": 1.5048308047295356e-06, "loss": 0.8915, "step": 654 }, { "epoch": 2.3226950354609928, "grad_norm": 0.26191239591480037, "learning_rate": 1.4901003335570292e-06, "loss": 0.9536, "step": 655 }, { "epoch": 2.326241134751773, "grad_norm": 0.2792546756089542, "learning_rate": 1.4754296793692264e-06, "loss": 0.9326, "step": 656 }, { "epoch": 2.329787234042553, "grad_norm": 0.26152119749050184, "learning_rate": 1.4608190921887406e-06, "loss": 0.94, "step": 657 }, { "epoch": 2.3333333333333335, "grad_norm": 0.28274958580588, "learning_rate": 1.4462688210145077e-06, "loss": 0.9424, "step": 658 }, { "epoch": 2.3368794326241136, "grad_norm": 0.2695174473374348, "learning_rate": 1.4317791138175303e-06, "loss": 0.922, "step": 659 }, { "epoch": 2.3404255319148937, "grad_norm": 0.2582885263018122, "learning_rate": 1.4173502175366594e-06, "loss": 0.9323, "step": 660 }, { "epoch": 2.3439716312056738, "grad_norm": 0.2724071001312669, "learning_rate": 1.4029823780743879e-06, "loss": 0.9504, "step": 661 }, { "epoch": 2.347517730496454, "grad_norm": 0.278288308139881, "learning_rate": 1.388675840292651e-06, "loss": 0.9445, "step": 662 }, { "epoch": 2.351063829787234, "grad_norm": 0.2736443704775053, "learning_rate": 1.3744308480086633e-06, "loss": 0.968, "step": 663 }, { "epoch": 2.354609929078014, "grad_norm": 0.2871922333842366, "learning_rate": 1.3602476439907548e-06, "loss": 0.915, "step": 664 }, { "epoch": 2.3581560283687946, "grad_norm": 0.28372616759624253, "learning_rate": 1.3461264699542386e-06, "loss": 0.9308, "step": 665 }, { "epoch": 2.3617021276595747, "grad_norm": 0.2610769215992973, "learning_rate": 1.3320675665572914e-06, "loss": 0.96, "step": 666 }, { "epoch": 2.3652482269503547, "grad_norm": 0.26372917083560865, "learning_rate": 1.3180711733968477e-06, "loss": 0.9604, "step": 667 }, { "epoch": 2.368794326241135, "grad_norm": 0.27415856630467206, "learning_rate": 1.3041375290045266e-06, "loss": 0.9328, "step": 668 }, { "epoch": 2.372340425531915, "grad_norm": 0.2680173166359837, "learning_rate": 1.290266870842553e-06, "loss": 0.9297, "step": 669 }, { "epoch": 2.375886524822695, "grad_norm": 0.27020805930267716, "learning_rate": 1.27645943529972e-06, "loss": 0.958, "step": 670 }, { "epoch": 2.379432624113475, "grad_norm": 0.2977607797526371, "learning_rate": 1.26271545768736e-06, "loss": 0.9364, "step": 671 }, { "epoch": 2.382978723404255, "grad_norm": 0.2734742565246787, "learning_rate": 1.2490351722353283e-06, "loss": 0.9206, "step": 672 }, { "epoch": 2.3865248226950353, "grad_norm": 0.2647493764911038, "learning_rate": 1.2354188120880206e-06, "loss": 0.9312, "step": 673 }, { "epoch": 2.3900709219858154, "grad_norm": 0.2660065928052957, "learning_rate": 1.2218666093003884e-06, "loss": 0.9756, "step": 674 }, { "epoch": 2.393617021276596, "grad_norm": 0.2723267155994901, "learning_rate": 1.2083787948339925e-06, "loss": 0.9528, "step": 675 }, { "epoch": 2.397163120567376, "grad_norm": 0.26496002204240177, "learning_rate": 1.1949555985530681e-06, "loss": 0.9335, "step": 676 }, { "epoch": 2.400709219858156, "grad_norm": 0.2759489347208577, "learning_rate": 1.1815972492205974e-06, "loss": 0.9385, "step": 677 }, { "epoch": 2.404255319148936, "grad_norm": 0.2707167804658785, "learning_rate": 1.1683039744944236e-06, "loss": 0.9324, "step": 678 }, { "epoch": 2.4078014184397163, "grad_norm": 0.2642636238838349, "learning_rate": 1.1550760009233607e-06, "loss": 0.8979, "step": 679 }, { "epoch": 2.4113475177304964, "grad_norm": 0.27558125969457525, "learning_rate": 1.1419135539433357e-06, "loss": 0.9501, "step": 680 }, { "epoch": 2.4148936170212765, "grad_norm": 0.2584461830085395, "learning_rate": 1.1288168578735541e-06, "loss": 0.9466, "step": 681 }, { "epoch": 2.4184397163120566, "grad_norm": 0.30931090282053647, "learning_rate": 1.1157861359126638e-06, "loss": 0.9479, "step": 682 }, { "epoch": 2.421985815602837, "grad_norm": 0.27238782208384227, "learning_rate": 1.1028216101349604e-06, "loss": 0.928, "step": 683 }, { "epoch": 2.425531914893617, "grad_norm": 0.25534380499626974, "learning_rate": 1.0899235014866005e-06, "loss": 0.9341, "step": 684 }, { "epoch": 2.4290780141843973, "grad_norm": 0.27763438297242915, "learning_rate": 1.0770920297818339e-06, "loss": 0.941, "step": 685 }, { "epoch": 2.4326241134751774, "grad_norm": 0.2747334100295996, "learning_rate": 1.0643274136992644e-06, "loss": 0.9101, "step": 686 }, { "epoch": 2.4361702127659575, "grad_norm": 0.26964106150792605, "learning_rate": 1.0516298707781109e-06, "loss": 0.9374, "step": 687 }, { "epoch": 2.4397163120567376, "grad_norm": 0.2721275932715395, "learning_rate": 1.0389996174145145e-06, "loss": 0.9339, "step": 688 }, { "epoch": 2.4432624113475176, "grad_norm": 0.27168300668809897, "learning_rate": 1.0264368688578374e-06, "loss": 0.9753, "step": 689 }, { "epoch": 2.4468085106382977, "grad_norm": 0.2561766637952011, "learning_rate": 1.0139418392070021e-06, "loss": 0.9381, "step": 690 }, { "epoch": 2.450354609929078, "grad_norm": 0.2657285317249217, "learning_rate": 1.0015147414068433e-06, "loss": 0.9216, "step": 691 }, { "epoch": 2.453900709219858, "grad_norm": 0.27331163784411394, "learning_rate": 9.891557872444724e-07, "loss": 0.9542, "step": 692 }, { "epoch": 2.4574468085106385, "grad_norm": 0.26271233644006137, "learning_rate": 9.768651873456764e-07, "loss": 0.9666, "step": 693 }, { "epoch": 2.4609929078014185, "grad_norm": 0.26974319247616274, "learning_rate": 9.646431511713207e-07, "loss": 0.911, "step": 694 }, { "epoch": 2.4645390070921986, "grad_norm": 0.2682511134644669, "learning_rate": 9.524898870137827e-07, "loss": 0.9417, "step": 695 }, { "epoch": 2.4680851063829787, "grad_norm": 0.2666591045330975, "learning_rate": 9.404056019934071e-07, "loss": 0.9054, "step": 696 }, { "epoch": 2.471631205673759, "grad_norm": 0.29608448349931366, "learning_rate": 9.283905020549655e-07, "loss": 0.9688, "step": 697 }, { "epoch": 2.475177304964539, "grad_norm": 0.251322321212032, "learning_rate": 9.16444791964154e-07, "loss": 0.9402, "step": 698 }, { "epoch": 2.478723404255319, "grad_norm": 0.2584585398103428, "learning_rate": 9.045686753041016e-07, "loss": 0.9389, "step": 699 }, { "epoch": 2.482269503546099, "grad_norm": 0.2658426292800937, "learning_rate": 8.927623544719011e-07, "loss": 0.9464, "step": 700 }, { "epoch": 2.4858156028368796, "grad_norm": 0.2628043336908807, "learning_rate": 8.810260306751612e-07, "loss": 0.9271, "step": 701 }, { "epoch": 2.4893617021276597, "grad_norm": 0.2688262554416729, "learning_rate": 8.693599039285716e-07, "loss": 0.9463, "step": 702 }, { "epoch": 2.49290780141844, "grad_norm": 0.2598369483856922, "learning_rate": 8.577641730505032e-07, "loss": 0.9569, "step": 703 }, { "epoch": 2.49645390070922, "grad_norm": 0.26583462991103224, "learning_rate": 8.462390356596117e-07, "loss": 0.9463, "step": 704 }, { "epoch": 2.5, "grad_norm": 0.26516473860679296, "learning_rate": 8.347846881714716e-07, "loss": 0.9268, "step": 705 }, { "epoch": 2.50354609929078, "grad_norm": 0.30359764298409464, "learning_rate": 8.234013257952356e-07, "loss": 0.9576, "step": 706 }, { "epoch": 2.50709219858156, "grad_norm": 0.2566244565376392, "learning_rate": 8.120891425302962e-07, "loss": 0.9056, "step": 707 }, { "epoch": 2.5106382978723403, "grad_norm": 0.2817933625242659, "learning_rate": 8.008483311629911e-07, "loss": 0.9285, "step": 708 }, { "epoch": 2.5141843971631204, "grad_norm": 0.2874339252931253, "learning_rate": 7.896790832633073e-07, "loss": 0.9191, "step": 709 }, { "epoch": 2.5177304964539005, "grad_norm": 0.25987497845494867, "learning_rate": 7.785815891816256e-07, "loss": 0.918, "step": 710 }, { "epoch": 2.521276595744681, "grad_norm": 0.2615222962993215, "learning_rate": 7.675560380454694e-07, "loss": 0.9187, "step": 711 }, { "epoch": 2.524822695035461, "grad_norm": 0.27082310468575305, "learning_rate": 7.566026177562846e-07, "loss": 0.9257, "step": 712 }, { "epoch": 2.528368794326241, "grad_norm": 0.2608625308482611, "learning_rate": 7.457215149862373e-07, "loss": 0.9953, "step": 713 }, { "epoch": 2.5319148936170213, "grad_norm": 0.27403820285612135, "learning_rate": 7.349129151750312e-07, "loss": 0.9548, "step": 714 }, { "epoch": 2.5354609929078014, "grad_norm": 0.2731107594749644, "learning_rate": 7.241770025267519e-07, "loss": 0.9305, "step": 715 }, { "epoch": 2.5390070921985815, "grad_norm": 0.2611308704872923, "learning_rate": 7.135139600067203e-07, "loss": 0.9052, "step": 716 }, { "epoch": 2.5425531914893615, "grad_norm": 0.27278470215722556, "learning_rate": 7.029239693383777e-07, "loss": 0.9235, "step": 717 }, { "epoch": 2.546099290780142, "grad_norm": 0.29157759285983176, "learning_rate": 6.924072110001934e-07, "loss": 0.9641, "step": 718 }, { "epoch": 2.549645390070922, "grad_norm": 0.27021197439112066, "learning_rate": 6.819638642225795e-07, "loss": 0.9117, "step": 719 }, { "epoch": 2.5531914893617023, "grad_norm": 0.26559679148254933, "learning_rate": 6.715941069848458e-07, "loss": 0.9129, "step": 720 }, { "epoch": 2.5567375886524824, "grad_norm": 0.2721897785577394, "learning_rate": 6.612981160121612e-07, "loss": 0.9079, "step": 721 }, { "epoch": 2.5602836879432624, "grad_norm": 0.2618912576682587, "learning_rate": 6.510760667725407e-07, "loss": 0.9112, "step": 722 }, { "epoch": 2.5638297872340425, "grad_norm": 0.26656210780565326, "learning_rate": 6.409281334738615e-07, "loss": 0.9374, "step": 723 }, { "epoch": 2.5673758865248226, "grad_norm": 0.2745734131529982, "learning_rate": 6.308544890608865e-07, "loss": 0.9447, "step": 724 }, { "epoch": 2.5709219858156027, "grad_norm": 0.2762364925858152, "learning_rate": 6.208553052123235e-07, "loss": 0.925, "step": 725 }, { "epoch": 2.574468085106383, "grad_norm": 0.30232247237344967, "learning_rate": 6.109307523378938e-07, "loss": 0.9296, "step": 726 }, { "epoch": 2.578014184397163, "grad_norm": 0.28462920964403415, "learning_rate": 6.010809995754307e-07, "loss": 0.9187, "step": 727 }, { "epoch": 2.581560283687943, "grad_norm": 0.25947649129627265, "learning_rate": 5.913062147879995e-07, "loss": 0.9342, "step": 728 }, { "epoch": 2.5851063829787235, "grad_norm": 0.26772972307784576, "learning_rate": 5.816065645610314e-07, "loss": 0.937, "step": 729 }, { "epoch": 2.5886524822695036, "grad_norm": 0.26620748539291117, "learning_rate": 5.719822141994874e-07, "loss": 0.9473, "step": 730 }, { "epoch": 2.5921985815602837, "grad_norm": 0.26491691619118457, "learning_rate": 5.624333277250416e-07, "loss": 0.9275, "step": 731 }, { "epoch": 2.595744680851064, "grad_norm": 0.26530284566928103, "learning_rate": 5.529600678732843e-07, "loss": 0.9587, "step": 732 }, { "epoch": 2.599290780141844, "grad_norm": 0.267798546927437, "learning_rate": 5.435625960909514e-07, "loss": 0.9274, "step": 733 }, { "epoch": 2.602836879432624, "grad_norm": 0.25244168669199174, "learning_rate": 5.342410725331682e-07, "loss": 0.9886, "step": 734 }, { "epoch": 2.6063829787234045, "grad_norm": 0.24678575470883943, "learning_rate": 5.249956560607256e-07, "loss": 0.9575, "step": 735 }, { "epoch": 2.6099290780141846, "grad_norm": 0.26714436807007375, "learning_rate": 5.158265042373672e-07, "loss": 0.9558, "step": 736 }, { "epoch": 2.6134751773049647, "grad_norm": 0.25993914954896113, "learning_rate": 5.067337733271083e-07, "loss": 0.9526, "step": 737 }, { "epoch": 2.617021276595745, "grad_norm": 0.24552937743202094, "learning_rate": 4.977176182915727e-07, "loss": 0.9444, "step": 738 }, { "epoch": 2.620567375886525, "grad_norm": 0.2747588898578525, "learning_rate": 4.887781927873459e-07, "loss": 0.9611, "step": 739 }, { "epoch": 2.624113475177305, "grad_norm": 0.2686365390549021, "learning_rate": 4.799156491633655e-07, "loss": 0.9196, "step": 740 }, { "epoch": 2.627659574468085, "grad_norm": 0.2608084334845081, "learning_rate": 4.7113013845831834e-07, "loss": 0.9471, "step": 741 }, { "epoch": 2.631205673758865, "grad_norm": 0.2609640464369094, "learning_rate": 4.6242181039806656e-07, "loss": 0.9192, "step": 742 }, { "epoch": 2.6347517730496453, "grad_norm": 0.2672996154942586, "learning_rate": 4.537908133931018e-07, "loss": 0.9104, "step": 743 }, { "epoch": 2.6382978723404253, "grad_norm": 0.260698595165063, "learning_rate": 4.452372945360073e-07, "loss": 0.9634, "step": 744 }, { "epoch": 2.6418439716312054, "grad_norm": 0.2638610144586483, "learning_rate": 4.367613995989589e-07, "loss": 0.9195, "step": 745 }, { "epoch": 2.645390070921986, "grad_norm": 0.279987374915907, "learning_rate": 4.2836327303123484e-07, "loss": 0.9687, "step": 746 }, { "epoch": 2.648936170212766, "grad_norm": 0.25712602018521363, "learning_rate": 4.2004305795675714e-07, "loss": 0.9289, "step": 747 }, { "epoch": 2.652482269503546, "grad_norm": 0.2893982275001574, "learning_rate": 4.118008961716552e-07, "loss": 0.9724, "step": 748 }, { "epoch": 2.6560283687943262, "grad_norm": 0.25225295213709026, "learning_rate": 4.0363692814184007e-07, "loss": 0.9188, "step": 749 }, { "epoch": 2.6595744680851063, "grad_norm": 0.26430826252751677, "learning_rate": 3.9555129300062225e-07, "loss": 0.9376, "step": 750 }, { "epoch": 2.6631205673758864, "grad_norm": 0.26133831584842315, "learning_rate": 3.8754412854633104e-07, "loss": 0.9177, "step": 751 }, { "epoch": 2.6666666666666665, "grad_norm": 0.24963986361147492, "learning_rate": 3.796155712399702e-07, "loss": 0.9087, "step": 752 }, { "epoch": 2.670212765957447, "grad_norm": 0.25371218531350415, "learning_rate": 3.717657562028937e-07, "loss": 0.964, "step": 753 }, { "epoch": 2.673758865248227, "grad_norm": 0.2553858509354702, "learning_rate": 3.6399481721449857e-07, "loss": 0.9033, "step": 754 }, { "epoch": 2.6773049645390072, "grad_norm": 0.25651830434752826, "learning_rate": 3.563028867099505e-07, "loss": 0.9434, "step": 755 }, { "epoch": 2.6808510638297873, "grad_norm": 0.320904301162738, "learning_rate": 3.486900957779216e-07, "loss": 0.907, "step": 756 }, { "epoch": 2.6843971631205674, "grad_norm": 0.25280223400634505, "learning_rate": 3.4115657415835835e-07, "loss": 0.9312, "step": 757 }, { "epoch": 2.6879432624113475, "grad_norm": 0.26938907060044553, "learning_rate": 3.3370245024027415e-07, "loss": 0.9095, "step": 758 }, { "epoch": 2.6914893617021276, "grad_norm": 0.2631434787310301, "learning_rate": 3.2632785105955465e-07, "loss": 0.9506, "step": 759 }, { "epoch": 2.6950354609929077, "grad_norm": 0.2662315616466783, "learning_rate": 3.190329022967975e-07, "loss": 0.9392, "step": 760 }, { "epoch": 2.698581560283688, "grad_norm": 0.25700233418438867, "learning_rate": 3.1181772827516666e-07, "loss": 0.9383, "step": 761 }, { "epoch": 2.702127659574468, "grad_norm": 0.2576229597011133, "learning_rate": 3.046824519582808e-07, "loss": 0.9419, "step": 762 }, { "epoch": 2.705673758865248, "grad_norm": 0.24686728966731478, "learning_rate": 2.976271949481085e-07, "loss": 0.9705, "step": 763 }, { "epoch": 2.7092198581560285, "grad_norm": 0.2573835107225912, "learning_rate": 2.9065207748290136e-07, "loss": 0.9609, "step": 764 }, { "epoch": 2.7127659574468086, "grad_norm": 0.25974999711493557, "learning_rate": 2.8375721843514503e-07, "loss": 0.9468, "step": 765 }, { "epoch": 2.7163120567375887, "grad_norm": 0.2793677158973486, "learning_rate": 2.7694273530953163e-07, "loss": 0.98, "step": 766 }, { "epoch": 2.719858156028369, "grad_norm": 0.2663607441533007, "learning_rate": 2.702087442409551e-07, "loss": 0.9105, "step": 767 }, { "epoch": 2.723404255319149, "grad_norm": 0.2571780169524377, "learning_rate": 2.6355535999253887e-07, "loss": 0.9677, "step": 768 }, { "epoch": 2.726950354609929, "grad_norm": 0.26739563794355903, "learning_rate": 2.5698269595367254e-07, "loss": 0.9446, "step": 769 }, { "epoch": 2.7304964539007095, "grad_norm": 0.25577143835777355, "learning_rate": 2.5049086413808376e-07, "loss": 0.9327, "step": 770 }, { "epoch": 2.7340425531914896, "grad_norm": 0.25867564649826646, "learning_rate": 2.440799751819273e-07, "loss": 0.9455, "step": 771 }, { "epoch": 2.7375886524822697, "grad_norm": 0.263552113764812, "learning_rate": 2.3775013834190063e-07, "loss": 0.9618, "step": 772 }, { "epoch": 2.7411347517730498, "grad_norm": 0.27800019549051025, "learning_rate": 2.3150146149338249e-07, "loss": 0.9363, "step": 773 }, { "epoch": 2.74468085106383, "grad_norm": 0.2757194068424258, "learning_rate": 2.253340511285923e-07, "loss": 0.9365, "step": 774 }, { "epoch": 2.74822695035461, "grad_norm": 0.25076011424778144, "learning_rate": 2.1924801235477744e-07, "loss": 0.9192, "step": 775 }, { "epoch": 2.75177304964539, "grad_norm": 0.2557047852207197, "learning_rate": 2.1324344889242122e-07, "loss": 0.93, "step": 776 }, { "epoch": 2.75531914893617, "grad_norm": 0.2609327004251684, "learning_rate": 2.073204630734743e-07, "loss": 0.9582, "step": 777 }, { "epoch": 2.7588652482269502, "grad_norm": 0.2642765945321785, "learning_rate": 2.0147915583961175e-07, "loss": 0.9, "step": 778 }, { "epoch": 2.7624113475177303, "grad_norm": 0.25819174016270086, "learning_rate": 1.9571962674051204e-07, "loss": 0.9363, "step": 779 }, { "epoch": 2.7659574468085104, "grad_norm": 0.2604856481250497, "learning_rate": 1.9004197393216294e-07, "loss": 0.9446, "step": 780 }, { "epoch": 2.7695035460992905, "grad_norm": 0.24811480634600303, "learning_rate": 1.84446294175184e-07, "loss": 0.9235, "step": 781 }, { "epoch": 2.773049645390071, "grad_norm": 0.2597300173613648, "learning_rate": 1.7893268283318276e-07, "loss": 0.9437, "step": 782 }, { "epoch": 2.776595744680851, "grad_norm": 0.2597277953828975, "learning_rate": 1.7350123387112562e-07, "loss": 0.9465, "step": 783 }, { "epoch": 2.780141843971631, "grad_norm": 0.25756536891360987, "learning_rate": 1.6815203985373728e-07, "loss": 0.9309, "step": 784 }, { "epoch": 2.7836879432624113, "grad_norm": 0.2598794151766591, "learning_rate": 1.6288519194392615e-07, "loss": 0.9667, "step": 785 }, { "epoch": 2.7872340425531914, "grad_norm": 0.2662240402731695, "learning_rate": 1.5770077990122644e-07, "loss": 0.9764, "step": 786 }, { "epoch": 2.7907801418439715, "grad_norm": 0.26343349081880274, "learning_rate": 1.5259889208027012e-07, "loss": 0.8732, "step": 787 }, { "epoch": 2.794326241134752, "grad_norm": 0.2598077583062048, "learning_rate": 1.4757961542928356e-07, "loss": 0.9881, "step": 788 }, { "epoch": 2.797872340425532, "grad_norm": 0.26914400835517405, "learning_rate": 1.4264303548859993e-07, "loss": 0.9735, "step": 789 }, { "epoch": 2.801418439716312, "grad_norm": 0.2707362212251115, "learning_rate": 1.377892363892097e-07, "loss": 0.9465, "step": 790 }, { "epoch": 2.8049645390070923, "grad_norm": 0.26181829443327354, "learning_rate": 1.3301830085131863e-07, "loss": 0.9436, "step": 791 }, { "epoch": 2.8085106382978724, "grad_norm": 0.26534018517709246, "learning_rate": 1.2833031018294252e-07, "loss": 0.946, "step": 792 }, { "epoch": 2.8120567375886525, "grad_norm": 0.2554277594983207, "learning_rate": 1.237253442785208e-07, "loss": 0.9451, "step": 793 }, { "epoch": 2.8156028368794326, "grad_norm": 0.2634071799046019, "learning_rate": 1.1920348161755413e-07, "loss": 0.9331, "step": 794 }, { "epoch": 2.8191489361702127, "grad_norm": 0.2646406755816084, "learning_rate": 1.1476479926326944e-07, "loss": 0.9157, "step": 795 }, { "epoch": 2.8226950354609928, "grad_norm": 0.25707150715315435, "learning_rate": 1.104093728613026e-07, "loss": 0.9264, "step": 796 }, { "epoch": 2.826241134751773, "grad_norm": 0.2759766163022058, "learning_rate": 1.0613727663841112e-07, "loss": 0.9765, "step": 797 }, { "epoch": 2.829787234042553, "grad_norm": 0.25456013736088967, "learning_rate": 1.0194858340121184e-07, "loss": 0.9449, "step": 798 }, { "epoch": 2.8333333333333335, "grad_norm": 0.2643560537983254, "learning_rate": 9.784336453493415e-08, "loss": 0.95, "step": 799 }, { "epoch": 2.8368794326241136, "grad_norm": 0.25873374065179594, "learning_rate": 9.382169000221042e-08, "loss": 0.9177, "step": 800 }, { "epoch": 2.8404255319148937, "grad_norm": 0.27751027580275006, "learning_rate": 8.988362834187747e-08, "loss": 0.93, "step": 801 }, { "epoch": 2.8439716312056738, "grad_norm": 0.2708618235616301, "learning_rate": 8.602924666781254e-08, "loss": 0.9741, "step": 802 }, { "epoch": 2.847517730496454, "grad_norm": 0.2638289808159278, "learning_rate": 8.225861066778807e-08, "loss": 0.9306, "step": 803 }, { "epoch": 2.851063829787234, "grad_norm": 0.2517912051724094, "learning_rate": 7.857178460235149e-08, "loss": 0.9239, "step": 804 }, { "epoch": 2.854609929078014, "grad_norm": 0.25779114471538306, "learning_rate": 7.496883130373167e-08, "loss": 0.9505, "step": 805 }, { "epoch": 2.8581560283687946, "grad_norm": 0.2830555379585059, "learning_rate": 7.144981217476754e-08, "loss": 0.9172, "step": 806 }, { "epoch": 2.8617021276595747, "grad_norm": 0.2590069266053024, "learning_rate": 6.801478718785948e-08, "loss": 0.9569, "step": 807 }, { "epoch": 2.8652482269503547, "grad_norm": 0.25695969549652814, "learning_rate": 6.46638148839529e-08, "loss": 0.9528, "step": 808 }, { "epoch": 2.868794326241135, "grad_norm": 0.26175524028590974, "learning_rate": 6.139695237153298e-08, "loss": 0.9455, "step": 809 }, { "epoch": 2.872340425531915, "grad_norm": 0.26425649700364123, "learning_rate": 5.821425532565816e-08, "loss": 0.9466, "step": 810 }, { "epoch": 2.875886524822695, "grad_norm": 0.25262758581265743, "learning_rate": 5.5115777987005956e-08, "loss": 0.9411, "step": 811 }, { "epoch": 2.879432624113475, "grad_norm": 0.273289162457411, "learning_rate": 5.21015731609531e-08, "loss": 0.9517, "step": 812 }, { "epoch": 2.882978723404255, "grad_norm": 0.2665548690236494, "learning_rate": 4.91716922166735e-08, "loss": 0.9544, "step": 813 }, { "epoch": 2.8865248226950353, "grad_norm": 0.26534870433514324, "learning_rate": 4.6326185086260634e-08, "loss": 0.946, "step": 814 }, { "epoch": 2.8900709219858154, "grad_norm": 0.2550850518900624, "learning_rate": 4.35651002638815e-08, "loss": 0.9563, "step": 815 }, { "epoch": 2.8936170212765955, "grad_norm": 0.2751504704032806, "learning_rate": 4.088848480494567e-08, "loss": 0.9738, "step": 816 }, { "epoch": 2.897163120567376, "grad_norm": 0.25741748325767494, "learning_rate": 3.8296384325307024e-08, "loss": 0.9573, "step": 817 }, { "epoch": 2.900709219858156, "grad_norm": 0.28054862469798914, "learning_rate": 3.5788843000481575e-08, "loss": 0.8909, "step": 818 }, { "epoch": 2.904255319148936, "grad_norm": 0.26724319225833276, "learning_rate": 3.3365903564899773e-08, "loss": 0.9378, "step": 819 }, { "epoch": 2.9078014184397163, "grad_norm": 0.26348826685817667, "learning_rate": 3.102760731117593e-08, "loss": 0.8886, "step": 820 }, { "epoch": 2.9113475177304964, "grad_norm": 0.26122848965414114, "learning_rate": 2.8773994089402734e-08, "loss": 0.9048, "step": 821 }, { "epoch": 2.9148936170212765, "grad_norm": 0.2619580896446043, "learning_rate": 2.6605102306476725e-08, "loss": 0.8901, "step": 822 }, { "epoch": 2.918439716312057, "grad_norm": 0.26643130220056477, "learning_rate": 2.452096892543776e-08, "loss": 0.931, "step": 823 }, { "epoch": 2.921985815602837, "grad_norm": 0.2851320956428963, "learning_rate": 2.2521629464844484e-08, "loss": 0.9523, "step": 824 }, { "epoch": 2.925531914893617, "grad_norm": 0.2561510264542094, "learning_rate": 2.0607117998165947e-08, "loss": 0.9246, "step": 825 }, { "epoch": 2.9290780141843973, "grad_norm": 0.3003128839605516, "learning_rate": 1.8777467153202055e-08, "loss": 0.9452, "step": 826 }, { "epoch": 2.9326241134751774, "grad_norm": 0.27748290541252363, "learning_rate": 1.703270811152624e-08, "loss": 0.918, "step": 827 }, { "epoch": 2.9361702127659575, "grad_norm": 0.25657144190605263, "learning_rate": 1.5372870607956448e-08, "loss": 0.9527, "step": 828 }, { "epoch": 2.9397163120567376, "grad_norm": 0.27232610817801395, "learning_rate": 1.3797982930044973e-08, "loss": 0.9188, "step": 829 }, { "epoch": 2.9432624113475176, "grad_norm": 0.24647000171153371, "learning_rate": 1.2308071917601083e-08, "loss": 0.908, "step": 830 }, { "epoch": 2.9468085106382977, "grad_norm": 0.25827601265791794, "learning_rate": 1.0903162962228598e-08, "loss": 0.9049, "step": 831 }, { "epoch": 2.950354609929078, "grad_norm": 0.2671579880930949, "learning_rate": 9.583280006895679e-09, "loss": 0.9732, "step": 832 }, { "epoch": 2.953900709219858, "grad_norm": 0.26132938091167346, "learning_rate": 8.34844554552794e-09, "loss": 0.9909, "step": 833 }, { "epoch": 2.9574468085106385, "grad_norm": 0.24255363296502028, "learning_rate": 7.198680622621523e-09, "loss": 0.9467, "step": 834 }, { "epoch": 2.9609929078014185, "grad_norm": 0.2573154199772112, "learning_rate": 6.134004832888396e-09, "loss": 0.9468, "step": 835 }, { "epoch": 2.9645390070921986, "grad_norm": 0.2508365913431423, "learning_rate": 5.154436320919942e-09, "loss": 0.9503, "step": 836 }, { "epoch": 2.9680851063829787, "grad_norm": 0.24637486817879609, "learning_rate": 4.25999178087888e-09, "loss": 0.941, "step": 837 }, { "epoch": 2.971631205673759, "grad_norm": 0.26079752515927407, "learning_rate": 3.450686456213381e-09, "loss": 0.9182, "step": 838 }, { "epoch": 2.975177304964539, "grad_norm": 0.2607289447336066, "learning_rate": 2.7265341393983844e-09, "loss": 0.9463, "step": 839 }, { "epoch": 2.978723404255319, "grad_norm": 0.2673832395714231, "learning_rate": 2.087547171701343e-09, "loss": 0.9078, "step": 840 }, { "epoch": 2.9822695035460995, "grad_norm": 0.2659989005016676, "learning_rate": 1.5337364429696133e-09, "loss": 0.9019, "step": 841 }, { "epoch": 2.9858156028368796, "grad_norm": 0.25928259552748, "learning_rate": 1.065111391447271e-09, "loss": 0.9419, "step": 842 }, { "epoch": 2.9893617021276597, "grad_norm": 0.25450153912530626, "learning_rate": 6.816800036124616e-10, "loss": 0.9443, "step": 843 }, { "epoch": 2.99290780141844, "grad_norm": 0.3048767532459319, "learning_rate": 3.8344881404195347e-10, "loss": 0.947, "step": 844 }, { "epoch": 2.99645390070922, "grad_norm": 0.2586313278007574, "learning_rate": 1.7042290529956096e-10, "loss": 0.9426, "step": 845 }, { "epoch": 3.0, "grad_norm": 0.25582905752564317, "learning_rate": 4.260590785121199e-11, "loss": 0.9182, "step": 846 } ], "logging_steps": 1, "max_steps": 846, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1640423164477440.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }