{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.988593155893536, "eval_steps": 500, "global_step": 1970, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025348542458808617, "grad_norm": 1.1835554838180542, "learning_rate": 0.0, "loss": 2.7162, "step": 1 }, { "epoch": 0.005069708491761723, "grad_norm": 1.1406067609786987, "learning_rate": 4e-05, "loss": 2.7021, "step": 2 }, { "epoch": 0.0076045627376425855, "grad_norm": 1.1929512023925781, "learning_rate": 8e-05, "loss": 2.5728, "step": 3 }, { "epoch": 0.010139416983523447, "grad_norm": 1.523325800895691, "learning_rate": 0.00012, "loss": 2.5825, "step": 4 }, { "epoch": 0.012674271229404309, "grad_norm": 1.712708592414856, "learning_rate": 0.00016, "loss": 2.1986, "step": 5 }, { "epoch": 0.015209125475285171, "grad_norm": 1.263485312461853, "learning_rate": 0.0002, "loss": 2.1478, "step": 6 }, { "epoch": 0.017743979721166033, "grad_norm": 1.2837083339691162, "learning_rate": 0.00019989821882951655, "loss": 2.2153, "step": 7 }, { "epoch": 0.020278833967046894, "grad_norm": 1.0831111669540405, "learning_rate": 0.0001997964376590331, "loss": 1.9272, "step": 8 }, { "epoch": 0.022813688212927757, "grad_norm": 0.7921498417854309, "learning_rate": 0.00019969465648854963, "loss": 1.4929, "step": 9 }, { "epoch": 0.025348542458808618, "grad_norm": 0.9243067502975464, "learning_rate": 0.00019959287531806617, "loss": 1.4312, "step": 10 }, { "epoch": 0.02788339670468948, "grad_norm": 1.2378944158554077, "learning_rate": 0.0001994910941475827, "loss": 1.1605, "step": 11 }, { "epoch": 0.030418250950570342, "grad_norm": 1.401106834411621, "learning_rate": 0.00019938931297709925, "loss": 1.0236, "step": 12 }, { "epoch": 0.032953105196451206, "grad_norm": 1.0503413677215576, "learning_rate": 0.00019928753180661578, "loss": 0.8441, "step": 13 }, { "epoch": 0.035487959442332066, "grad_norm": 0.928716778755188, "learning_rate": 0.00019918575063613232, "loss": 0.8098, "step": 14 }, { "epoch": 0.03802281368821293, "grad_norm": 0.6546494364738464, "learning_rate": 0.00019908396946564886, "loss": 0.5083, "step": 15 }, { "epoch": 0.04055766793409379, "grad_norm": 0.8399775624275208, "learning_rate": 0.0001989821882951654, "loss": 0.5798, "step": 16 }, { "epoch": 0.043092522179974654, "grad_norm": 0.6111662983894348, "learning_rate": 0.00019888040712468194, "loss": 0.471, "step": 17 }, { "epoch": 0.045627376425855515, "grad_norm": 0.6786199808120728, "learning_rate": 0.00019877862595419848, "loss": 0.5124, "step": 18 }, { "epoch": 0.048162230671736375, "grad_norm": 0.7001961469650269, "learning_rate": 0.00019867684478371502, "loss": 0.5764, "step": 19 }, { "epoch": 0.050697084917617236, "grad_norm": 0.5670634508132935, "learning_rate": 0.00019857506361323156, "loss": 0.5595, "step": 20 }, { "epoch": 0.053231939163498096, "grad_norm": 0.6825580596923828, "learning_rate": 0.0001984732824427481, "loss": 0.6601, "step": 21 }, { "epoch": 0.05576679340937896, "grad_norm": 0.5777536630630493, "learning_rate": 0.00019837150127226464, "loss": 0.6232, "step": 22 }, { "epoch": 0.058301647655259824, "grad_norm": 0.7791958451271057, "learning_rate": 0.00019826972010178118, "loss": 0.4741, "step": 23 }, { "epoch": 0.060836501901140684, "grad_norm": 0.7647196054458618, "learning_rate": 0.00019816793893129772, "loss": 0.574, "step": 24 }, { "epoch": 0.06337135614702155, "grad_norm": 0.6175855398178101, "learning_rate": 0.00019806615776081426, "loss": 0.6792, "step": 25 }, { "epoch": 0.06590621039290241, "grad_norm": 0.7071298360824585, "learning_rate": 0.0001979643765903308, "loss": 0.6333, "step": 26 }, { "epoch": 0.06844106463878327, "grad_norm": 0.7675352692604065, "learning_rate": 0.00019786259541984734, "loss": 0.5004, "step": 27 }, { "epoch": 0.07097591888466413, "grad_norm": 0.6224766969680786, "learning_rate": 0.00019776081424936387, "loss": 0.5649, "step": 28 }, { "epoch": 0.07351077313054499, "grad_norm": 0.6023550629615784, "learning_rate": 0.00019765903307888041, "loss": 0.4004, "step": 29 }, { "epoch": 0.07604562737642585, "grad_norm": 0.6253474354743958, "learning_rate": 0.00019755725190839695, "loss": 0.548, "step": 30 }, { "epoch": 0.07858048162230671, "grad_norm": 0.43560266494750977, "learning_rate": 0.00019745547073791352, "loss": 0.4721, "step": 31 }, { "epoch": 0.08111533586818757, "grad_norm": 0.6321932077407837, "learning_rate": 0.00019735368956743003, "loss": 0.4671, "step": 32 }, { "epoch": 0.08365019011406843, "grad_norm": 0.41977155208587646, "learning_rate": 0.00019725190839694657, "loss": 0.3716, "step": 33 }, { "epoch": 0.08618504435994931, "grad_norm": 0.4449223279953003, "learning_rate": 0.0001971501272264631, "loss": 0.6045, "step": 34 }, { "epoch": 0.08871989860583017, "grad_norm": 0.5593668222427368, "learning_rate": 0.00019704834605597965, "loss": 0.3789, "step": 35 }, { "epoch": 0.09125475285171103, "grad_norm": 0.4293775260448456, "learning_rate": 0.0001969465648854962, "loss": 0.3834, "step": 36 }, { "epoch": 0.09378960709759189, "grad_norm": 0.49535441398620605, "learning_rate": 0.00019684478371501273, "loss": 0.5504, "step": 37 }, { "epoch": 0.09632446134347275, "grad_norm": 0.4620949625968933, "learning_rate": 0.00019674300254452927, "loss": 0.3212, "step": 38 }, { "epoch": 0.09885931558935361, "grad_norm": 0.46665605902671814, "learning_rate": 0.0001966412213740458, "loss": 0.4868, "step": 39 }, { "epoch": 0.10139416983523447, "grad_norm": 0.4120428264141083, "learning_rate": 0.00019653944020356235, "loss": 0.4926, "step": 40 }, { "epoch": 0.10392902408111533, "grad_norm": 0.41570335626602173, "learning_rate": 0.00019643765903307889, "loss": 0.5068, "step": 41 }, { "epoch": 0.10646387832699619, "grad_norm": 0.4141896665096283, "learning_rate": 0.00019633587786259542, "loss": 0.4064, "step": 42 }, { "epoch": 0.10899873257287707, "grad_norm": 0.3192928433418274, "learning_rate": 0.00019623409669211196, "loss": 0.4581, "step": 43 }, { "epoch": 0.11153358681875793, "grad_norm": 0.4188425838947296, "learning_rate": 0.00019613231552162853, "loss": 0.371, "step": 44 }, { "epoch": 0.11406844106463879, "grad_norm": 0.3750368654727936, "learning_rate": 0.00019603053435114504, "loss": 0.3728, "step": 45 }, { "epoch": 0.11660329531051965, "grad_norm": 0.5102046728134155, "learning_rate": 0.00019592875318066158, "loss": 0.357, "step": 46 }, { "epoch": 0.11913814955640051, "grad_norm": 0.4143039882183075, "learning_rate": 0.00019582697201017812, "loss": 0.4373, "step": 47 }, { "epoch": 0.12167300380228137, "grad_norm": 0.42558473348617554, "learning_rate": 0.00019572519083969466, "loss": 0.5877, "step": 48 }, { "epoch": 0.12420785804816223, "grad_norm": 0.35768038034439087, "learning_rate": 0.0001956234096692112, "loss": 0.3326, "step": 49 }, { "epoch": 0.1267427122940431, "grad_norm": 0.32826319336891174, "learning_rate": 0.00019552162849872774, "loss": 0.3521, "step": 50 }, { "epoch": 0.12927756653992395, "grad_norm": 0.3507271409034729, "learning_rate": 0.00019541984732824428, "loss": 0.4157, "step": 51 }, { "epoch": 0.13181242078580482, "grad_norm": 0.5069169402122498, "learning_rate": 0.00019531806615776082, "loss": 0.4453, "step": 52 }, { "epoch": 0.13434727503168567, "grad_norm": 0.4759957492351532, "learning_rate": 0.00019521628498727736, "loss": 0.5131, "step": 53 }, { "epoch": 0.13688212927756654, "grad_norm": 0.4045158326625824, "learning_rate": 0.0001951145038167939, "loss": 0.3927, "step": 54 }, { "epoch": 0.1394169835234474, "grad_norm": 0.49629393219947815, "learning_rate": 0.00019501272264631046, "loss": 0.4708, "step": 55 }, { "epoch": 0.14195183776932827, "grad_norm": 0.3735599219799042, "learning_rate": 0.00019491094147582698, "loss": 0.4076, "step": 56 }, { "epoch": 0.1444866920152091, "grad_norm": 0.4713466763496399, "learning_rate": 0.00019480916030534354, "loss": 0.4187, "step": 57 }, { "epoch": 0.14702154626108999, "grad_norm": 0.6454377770423889, "learning_rate": 0.00019470737913486005, "loss": 0.4032, "step": 58 }, { "epoch": 0.14955640050697086, "grad_norm": 0.39378786087036133, "learning_rate": 0.00019460559796437662, "loss": 0.3508, "step": 59 }, { "epoch": 0.1520912547528517, "grad_norm": 0.3768695592880249, "learning_rate": 0.00019450381679389313, "loss": 0.3129, "step": 60 }, { "epoch": 0.15462610899873258, "grad_norm": 0.4250476062297821, "learning_rate": 0.00019440203562340967, "loss": 0.3426, "step": 61 }, { "epoch": 0.15716096324461343, "grad_norm": 0.3653964698314667, "learning_rate": 0.0001943002544529262, "loss": 0.3339, "step": 62 }, { "epoch": 0.1596958174904943, "grad_norm": 0.4973353445529938, "learning_rate": 0.00019419847328244275, "loss": 0.4759, "step": 63 }, { "epoch": 0.16223067173637515, "grad_norm": 0.41738295555114746, "learning_rate": 0.0001940966921119593, "loss": 0.3809, "step": 64 }, { "epoch": 0.16476552598225602, "grad_norm": 0.42326119542121887, "learning_rate": 0.00019399491094147583, "loss": 0.3399, "step": 65 }, { "epoch": 0.16730038022813687, "grad_norm": 0.4244116246700287, "learning_rate": 0.00019389312977099237, "loss": 0.4085, "step": 66 }, { "epoch": 0.16983523447401774, "grad_norm": 0.40235379338264465, "learning_rate": 0.0001937913486005089, "loss": 0.3016, "step": 67 }, { "epoch": 0.17237008871989862, "grad_norm": 0.3983120322227478, "learning_rate": 0.00019368956743002547, "loss": 0.5101, "step": 68 }, { "epoch": 0.17490494296577946, "grad_norm": 0.4857071042060852, "learning_rate": 0.00019358778625954199, "loss": 0.3131, "step": 69 }, { "epoch": 0.17743979721166034, "grad_norm": 0.5238108038902283, "learning_rate": 0.00019348600508905855, "loss": 0.5841, "step": 70 }, { "epoch": 0.17997465145754118, "grad_norm": 0.5322052240371704, "learning_rate": 0.00019338422391857506, "loss": 0.3895, "step": 71 }, { "epoch": 0.18250950570342206, "grad_norm": 0.4643409252166748, "learning_rate": 0.00019328244274809163, "loss": 0.364, "step": 72 }, { "epoch": 0.1850443599493029, "grad_norm": 0.36517271399497986, "learning_rate": 0.00019318066157760814, "loss": 0.4092, "step": 73 }, { "epoch": 0.18757921419518378, "grad_norm": 0.49409031867980957, "learning_rate": 0.00019307888040712468, "loss": 0.3359, "step": 74 }, { "epoch": 0.19011406844106463, "grad_norm": 0.44665688276290894, "learning_rate": 0.00019297709923664122, "loss": 0.3275, "step": 75 }, { "epoch": 0.1926489226869455, "grad_norm": 0.353208065032959, "learning_rate": 0.00019287531806615776, "loss": 0.3396, "step": 76 }, { "epoch": 0.19518377693282637, "grad_norm": 0.4061962366104126, "learning_rate": 0.0001927735368956743, "loss": 0.4658, "step": 77 }, { "epoch": 0.19771863117870722, "grad_norm": 0.4785591959953308, "learning_rate": 0.00019267175572519084, "loss": 0.4705, "step": 78 }, { "epoch": 0.2002534854245881, "grad_norm": 0.44644224643707275, "learning_rate": 0.00019256997455470738, "loss": 0.3573, "step": 79 }, { "epoch": 0.20278833967046894, "grad_norm": 0.4554955065250397, "learning_rate": 0.00019246819338422392, "loss": 0.3822, "step": 80 }, { "epoch": 0.20532319391634982, "grad_norm": 0.4537349343299866, "learning_rate": 0.00019236641221374049, "loss": 0.5222, "step": 81 }, { "epoch": 0.20785804816223066, "grad_norm": 0.32820987701416016, "learning_rate": 0.000192264631043257, "loss": 0.3185, "step": 82 }, { "epoch": 0.21039290240811154, "grad_norm": 0.39827391505241394, "learning_rate": 0.00019216284987277356, "loss": 0.3693, "step": 83 }, { "epoch": 0.21292775665399238, "grad_norm": 0.4188093841075897, "learning_rate": 0.00019206106870229008, "loss": 0.4168, "step": 84 }, { "epoch": 0.21546261089987326, "grad_norm": 0.4770517349243164, "learning_rate": 0.00019195928753180664, "loss": 0.4113, "step": 85 }, { "epoch": 0.21799746514575413, "grad_norm": 0.346224844455719, "learning_rate": 0.00019185750636132315, "loss": 0.4238, "step": 86 }, { "epoch": 0.22053231939163498, "grad_norm": 0.37398770451545715, "learning_rate": 0.00019175572519083972, "loss": 0.4285, "step": 87 }, { "epoch": 0.22306717363751585, "grad_norm": 0.35467982292175293, "learning_rate": 0.00019165394402035623, "loss": 0.3201, "step": 88 }, { "epoch": 0.2256020278833967, "grad_norm": 0.3411659002304077, "learning_rate": 0.00019155216284987277, "loss": 0.3428, "step": 89 }, { "epoch": 0.22813688212927757, "grad_norm": 0.4002087712287903, "learning_rate": 0.0001914503816793893, "loss": 0.5375, "step": 90 }, { "epoch": 0.23067173637515842, "grad_norm": 0.4339190423488617, "learning_rate": 0.00019134860050890585, "loss": 0.3355, "step": 91 }, { "epoch": 0.2332065906210393, "grad_norm": 0.43449410796165466, "learning_rate": 0.00019124681933842242, "loss": 0.4355, "step": 92 }, { "epoch": 0.23574144486692014, "grad_norm": 0.4565323293209076, "learning_rate": 0.00019114503816793893, "loss": 0.3178, "step": 93 }, { "epoch": 0.23827629911280102, "grad_norm": 0.46309894323349, "learning_rate": 0.0001910432569974555, "loss": 0.3308, "step": 94 }, { "epoch": 0.24081115335868186, "grad_norm": 0.3554096817970276, "learning_rate": 0.000190941475826972, "loss": 0.3358, "step": 95 }, { "epoch": 0.24334600760456274, "grad_norm": 0.39129987359046936, "learning_rate": 0.00019083969465648857, "loss": 0.3988, "step": 96 }, { "epoch": 0.2458808618504436, "grad_norm": 0.4193456470966339, "learning_rate": 0.0001907379134860051, "loss": 0.4064, "step": 97 }, { "epoch": 0.24841571609632446, "grad_norm": 0.39571425318717957, "learning_rate": 0.00019063613231552165, "loss": 0.3213, "step": 98 }, { "epoch": 0.2509505703422053, "grad_norm": 0.48566195368766785, "learning_rate": 0.00019053435114503817, "loss": 0.3505, "step": 99 }, { "epoch": 0.2534854245880862, "grad_norm": 0.43266433477401733, "learning_rate": 0.00019043256997455473, "loss": 0.3579, "step": 100 }, { "epoch": 0.25602027883396705, "grad_norm": 0.31110769510269165, "learning_rate": 0.00019033078880407124, "loss": 0.2832, "step": 101 }, { "epoch": 0.2585551330798479, "grad_norm": 0.40166690945625305, "learning_rate": 0.00019022900763358778, "loss": 0.2964, "step": 102 }, { "epoch": 0.26108998732572875, "grad_norm": 0.554072380065918, "learning_rate": 0.00019012722646310432, "loss": 0.3661, "step": 103 }, { "epoch": 0.26362484157160965, "grad_norm": 0.45009374618530273, "learning_rate": 0.00019002544529262086, "loss": 0.3812, "step": 104 }, { "epoch": 0.2661596958174905, "grad_norm": 0.48349273204803467, "learning_rate": 0.00018992366412213743, "loss": 0.4183, "step": 105 }, { "epoch": 0.26869455006337134, "grad_norm": 0.4157555103302002, "learning_rate": 0.00018982188295165394, "loss": 0.2962, "step": 106 }, { "epoch": 0.27122940430925224, "grad_norm": 0.3300265073776245, "learning_rate": 0.0001897201017811705, "loss": 0.3351, "step": 107 }, { "epoch": 0.2737642585551331, "grad_norm": 0.3690893054008484, "learning_rate": 0.00018961832061068702, "loss": 0.3251, "step": 108 }, { "epoch": 0.27629911280101394, "grad_norm": 0.49013710021972656, "learning_rate": 0.00018951653944020359, "loss": 0.4757, "step": 109 }, { "epoch": 0.2788339670468948, "grad_norm": 0.4416143000125885, "learning_rate": 0.0001894147582697201, "loss": 0.4421, "step": 110 }, { "epoch": 0.2813688212927757, "grad_norm": 0.3613321781158447, "learning_rate": 0.00018931297709923666, "loss": 0.3475, "step": 111 }, { "epoch": 0.28390367553865653, "grad_norm": 0.45548489689826965, "learning_rate": 0.00018921119592875318, "loss": 0.3587, "step": 112 }, { "epoch": 0.2864385297845374, "grad_norm": 0.49439120292663574, "learning_rate": 0.00018910941475826974, "loss": 0.4017, "step": 113 }, { "epoch": 0.2889733840304182, "grad_norm": 0.35214680433273315, "learning_rate": 0.00018900763358778626, "loss": 0.2645, "step": 114 }, { "epoch": 0.2915082382762991, "grad_norm": 0.5512099266052246, "learning_rate": 0.00018890585241730282, "loss": 0.3736, "step": 115 }, { "epoch": 0.29404309252217997, "grad_norm": 0.4146886467933655, "learning_rate": 0.00018880407124681936, "loss": 0.3361, "step": 116 }, { "epoch": 0.2965779467680608, "grad_norm": 0.42954355478286743, "learning_rate": 0.00018870229007633587, "loss": 0.3841, "step": 117 }, { "epoch": 0.2991128010139417, "grad_norm": 0.47189798951148987, "learning_rate": 0.00018860050890585244, "loss": 0.3591, "step": 118 }, { "epoch": 0.30164765525982257, "grad_norm": 0.5082337260246277, "learning_rate": 0.00018849872773536895, "loss": 0.4249, "step": 119 }, { "epoch": 0.3041825095057034, "grad_norm": 0.4005051255226135, "learning_rate": 0.00018839694656488552, "loss": 0.4433, "step": 120 }, { "epoch": 0.30671736375158426, "grad_norm": 0.4730987250804901, "learning_rate": 0.00018829516539440203, "loss": 0.3575, "step": 121 }, { "epoch": 0.30925221799746516, "grad_norm": 0.5227373242378235, "learning_rate": 0.0001881933842239186, "loss": 0.3511, "step": 122 }, { "epoch": 0.311787072243346, "grad_norm": 0.3693684935569763, "learning_rate": 0.0001880916030534351, "loss": 0.3097, "step": 123 }, { "epoch": 0.31432192648922685, "grad_norm": 0.45321500301361084, "learning_rate": 0.00018798982188295168, "loss": 0.4464, "step": 124 }, { "epoch": 0.31685678073510776, "grad_norm": 0.3797638714313507, "learning_rate": 0.0001878880407124682, "loss": 0.328, "step": 125 }, { "epoch": 0.3193916349809886, "grad_norm": 0.3996891975402832, "learning_rate": 0.00018778625954198475, "loss": 0.28, "step": 126 }, { "epoch": 0.32192648922686945, "grad_norm": 0.3931027352809906, "learning_rate": 0.00018768447837150127, "loss": 0.2439, "step": 127 }, { "epoch": 0.3244613434727503, "grad_norm": 0.4259742200374603, "learning_rate": 0.00018758269720101783, "loss": 0.3068, "step": 128 }, { "epoch": 0.3269961977186312, "grad_norm": 0.4267159402370453, "learning_rate": 0.00018748091603053437, "loss": 0.3405, "step": 129 }, { "epoch": 0.32953105196451205, "grad_norm": 0.41900908946990967, "learning_rate": 0.0001873791348600509, "loss": 0.327, "step": 130 }, { "epoch": 0.3320659062103929, "grad_norm": 0.436499685049057, "learning_rate": 0.00018727735368956745, "loss": 0.5089, "step": 131 }, { "epoch": 0.33460076045627374, "grad_norm": 0.43961402773857117, "learning_rate": 0.00018717557251908396, "loss": 0.339, "step": 132 }, { "epoch": 0.33713561470215464, "grad_norm": 0.45645856857299805, "learning_rate": 0.00018707379134860053, "loss": 0.3738, "step": 133 }, { "epoch": 0.3396704689480355, "grad_norm": 0.36948803067207336, "learning_rate": 0.00018697201017811704, "loss": 0.2777, "step": 134 }, { "epoch": 0.34220532319391633, "grad_norm": 0.32040536403656006, "learning_rate": 0.0001868702290076336, "loss": 0.3679, "step": 135 }, { "epoch": 0.34474017743979724, "grad_norm": 0.37474381923675537, "learning_rate": 0.00018676844783715012, "loss": 0.4282, "step": 136 }, { "epoch": 0.3472750316856781, "grad_norm": 0.4243752360343933, "learning_rate": 0.0001866666666666667, "loss": 0.533, "step": 137 }, { "epoch": 0.34980988593155893, "grad_norm": 0.39162227511405945, "learning_rate": 0.0001865648854961832, "loss": 0.2989, "step": 138 }, { "epoch": 0.3523447401774398, "grad_norm": 0.3585897386074066, "learning_rate": 0.00018646310432569977, "loss": 0.3368, "step": 139 }, { "epoch": 0.3548795944233207, "grad_norm": 0.39330482482910156, "learning_rate": 0.00018636132315521628, "loss": 0.4904, "step": 140 }, { "epoch": 0.3574144486692015, "grad_norm": 0.3404198884963989, "learning_rate": 0.00018625954198473284, "loss": 0.2684, "step": 141 }, { "epoch": 0.35994930291508237, "grad_norm": 0.34813976287841797, "learning_rate": 0.00018615776081424938, "loss": 0.2988, "step": 142 }, { "epoch": 0.36248415716096327, "grad_norm": 0.4100090265274048, "learning_rate": 0.00018605597964376592, "loss": 0.3325, "step": 143 }, { "epoch": 0.3650190114068441, "grad_norm": 0.2897261083126068, "learning_rate": 0.00018595419847328246, "loss": 0.2487, "step": 144 }, { "epoch": 0.36755386565272496, "grad_norm": 0.43023669719696045, "learning_rate": 0.00018585241730279897, "loss": 0.4875, "step": 145 }, { "epoch": 0.3700887198986058, "grad_norm": 0.39708128571510315, "learning_rate": 0.00018575063613231554, "loss": 0.3742, "step": 146 }, { "epoch": 0.3726235741444867, "grad_norm": 0.4191845953464508, "learning_rate": 0.00018564885496183205, "loss": 0.3253, "step": 147 }, { "epoch": 0.37515842839036756, "grad_norm": 0.3373403549194336, "learning_rate": 0.00018554707379134862, "loss": 0.2636, "step": 148 }, { "epoch": 0.3776932826362484, "grad_norm": 0.3522009551525116, "learning_rate": 0.00018544529262086513, "loss": 0.2413, "step": 149 }, { "epoch": 0.38022813688212925, "grad_norm": 0.4140997529029846, "learning_rate": 0.0001853435114503817, "loss": 0.3663, "step": 150 }, { "epoch": 0.38276299112801015, "grad_norm": 0.3986112177371979, "learning_rate": 0.0001852417302798982, "loss": 0.276, "step": 151 }, { "epoch": 0.385297845373891, "grad_norm": 0.46847087144851685, "learning_rate": 0.00018513994910941478, "loss": 0.3369, "step": 152 }, { "epoch": 0.38783269961977185, "grad_norm": 0.43623679876327515, "learning_rate": 0.00018503816793893132, "loss": 0.37, "step": 153 }, { "epoch": 0.39036755386565275, "grad_norm": 0.4128822684288025, "learning_rate": 0.00018493638676844785, "loss": 0.3763, "step": 154 }, { "epoch": 0.3929024081115336, "grad_norm": 0.3352810740470886, "learning_rate": 0.0001848346055979644, "loss": 0.2446, "step": 155 }, { "epoch": 0.39543726235741444, "grad_norm": 0.580634355545044, "learning_rate": 0.00018473282442748093, "loss": 0.3691, "step": 156 }, { "epoch": 0.3979721166032953, "grad_norm": 0.452499657869339, "learning_rate": 0.00018463104325699747, "loss": 0.4361, "step": 157 }, { "epoch": 0.4005069708491762, "grad_norm": 0.4160007834434509, "learning_rate": 0.000184529262086514, "loss": 0.4003, "step": 158 }, { "epoch": 0.40304182509505704, "grad_norm": 0.3049513101577759, "learning_rate": 0.00018442748091603055, "loss": 0.2167, "step": 159 }, { "epoch": 0.4055766793409379, "grad_norm": 0.38912078738212585, "learning_rate": 0.00018432569974554706, "loss": 0.2766, "step": 160 }, { "epoch": 0.40811153358681873, "grad_norm": 0.4433249831199646, "learning_rate": 0.00018422391857506363, "loss": 0.3331, "step": 161 }, { "epoch": 0.41064638783269963, "grad_norm": 0.36410561203956604, "learning_rate": 0.00018412213740458014, "loss": 0.2719, "step": 162 }, { "epoch": 0.4131812420785805, "grad_norm": 0.47044846415519714, "learning_rate": 0.0001840203562340967, "loss": 0.3602, "step": 163 }, { "epoch": 0.4157160963244613, "grad_norm": 0.38755008578300476, "learning_rate": 0.00018391857506361322, "loss": 0.2815, "step": 164 }, { "epoch": 0.41825095057034223, "grad_norm": 0.39241930842399597, "learning_rate": 0.0001838167938931298, "loss": 0.3642, "step": 165 }, { "epoch": 0.4207858048162231, "grad_norm": 0.37138187885284424, "learning_rate": 0.00018371501272264633, "loss": 0.267, "step": 166 }, { "epoch": 0.4233206590621039, "grad_norm": 0.4508083462715149, "learning_rate": 0.00018361323155216287, "loss": 0.4093, "step": 167 }, { "epoch": 0.42585551330798477, "grad_norm": 0.4390806257724762, "learning_rate": 0.0001835114503816794, "loss": 0.424, "step": 168 }, { "epoch": 0.42839036755386567, "grad_norm": 0.4640062153339386, "learning_rate": 0.00018340966921119594, "loss": 0.4065, "step": 169 }, { "epoch": 0.4309252217997465, "grad_norm": 0.37822040915489197, "learning_rate": 0.00018330788804071248, "loss": 0.2854, "step": 170 }, { "epoch": 0.43346007604562736, "grad_norm": 0.3658731281757355, "learning_rate": 0.00018320610687022902, "loss": 0.2826, "step": 171 }, { "epoch": 0.43599493029150826, "grad_norm": 0.4271928369998932, "learning_rate": 0.00018310432569974556, "loss": 0.4538, "step": 172 }, { "epoch": 0.4385297845373891, "grad_norm": 0.33550775051116943, "learning_rate": 0.00018300254452926207, "loss": 0.3015, "step": 173 }, { "epoch": 0.44106463878326996, "grad_norm": 0.5374005436897278, "learning_rate": 0.00018290076335877864, "loss": 0.2771, "step": 174 }, { "epoch": 0.4435994930291508, "grad_norm": 0.4630737602710724, "learning_rate": 0.00018279898218829515, "loss": 0.3786, "step": 175 }, { "epoch": 0.4461343472750317, "grad_norm": 0.4163656234741211, "learning_rate": 0.00018269720101781172, "loss": 0.3224, "step": 176 }, { "epoch": 0.44866920152091255, "grad_norm": 0.43972182273864746, "learning_rate": 0.00018259541984732826, "loss": 0.4192, "step": 177 }, { "epoch": 0.4512040557667934, "grad_norm": 0.4114130437374115, "learning_rate": 0.0001824936386768448, "loss": 0.2979, "step": 178 }, { "epoch": 0.45373891001267425, "grad_norm": 0.5002878308296204, "learning_rate": 0.00018239185750636134, "loss": 0.3339, "step": 179 }, { "epoch": 0.45627376425855515, "grad_norm": 0.42383208870887756, "learning_rate": 0.00018229007633587788, "loss": 0.2958, "step": 180 }, { "epoch": 0.458808618504436, "grad_norm": 0.3234981894493103, "learning_rate": 0.00018218829516539442, "loss": 0.2215, "step": 181 }, { "epoch": 0.46134347275031684, "grad_norm": 0.33356910943984985, "learning_rate": 0.00018208651399491096, "loss": 0.3017, "step": 182 }, { "epoch": 0.46387832699619774, "grad_norm": 0.442376047372818, "learning_rate": 0.0001819847328244275, "loss": 0.2751, "step": 183 }, { "epoch": 0.4664131812420786, "grad_norm": 0.4563845992088318, "learning_rate": 0.00018188295165394403, "loss": 0.3001, "step": 184 }, { "epoch": 0.46894803548795944, "grad_norm": 0.3957296907901764, "learning_rate": 0.00018178117048346057, "loss": 0.3864, "step": 185 }, { "epoch": 0.4714828897338403, "grad_norm": 0.32932132482528687, "learning_rate": 0.0001816793893129771, "loss": 0.2528, "step": 186 }, { "epoch": 0.4740177439797212, "grad_norm": 0.3960365951061249, "learning_rate": 0.00018157760814249365, "loss": 0.3975, "step": 187 }, { "epoch": 0.47655259822560203, "grad_norm": 0.38450995087623596, "learning_rate": 0.00018147582697201016, "loss": 0.2552, "step": 188 }, { "epoch": 0.4790874524714829, "grad_norm": 0.4259994626045227, "learning_rate": 0.00018137404580152673, "loss": 0.3, "step": 189 }, { "epoch": 0.4816223067173637, "grad_norm": 0.4965859055519104, "learning_rate": 0.00018127226463104327, "loss": 0.3099, "step": 190 }, { "epoch": 0.4841571609632446, "grad_norm": 0.38229548931121826, "learning_rate": 0.0001811704834605598, "loss": 0.3799, "step": 191 }, { "epoch": 0.4866920152091255, "grad_norm": 0.4622017741203308, "learning_rate": 0.00018106870229007635, "loss": 0.4815, "step": 192 }, { "epoch": 0.4892268694550063, "grad_norm": 0.3207991123199463, "learning_rate": 0.0001809669211195929, "loss": 0.2534, "step": 193 }, { "epoch": 0.4917617237008872, "grad_norm": 0.3322354555130005, "learning_rate": 0.00018086513994910943, "loss": 0.2331, "step": 194 }, { "epoch": 0.49429657794676807, "grad_norm": 0.35752132534980774, "learning_rate": 0.00018076335877862597, "loss": 0.3621, "step": 195 }, { "epoch": 0.4968314321926489, "grad_norm": 0.2801353633403778, "learning_rate": 0.0001806615776081425, "loss": 0.2198, "step": 196 }, { "epoch": 0.49936628643852976, "grad_norm": 0.5065000057220459, "learning_rate": 0.00018055979643765905, "loss": 0.3806, "step": 197 }, { "epoch": 0.5019011406844106, "grad_norm": 0.4308508336544037, "learning_rate": 0.00018045801526717558, "loss": 0.4028, "step": 198 }, { "epoch": 0.5044359949302915, "grad_norm": 0.5432320833206177, "learning_rate": 0.00018035623409669212, "loss": 0.506, "step": 199 }, { "epoch": 0.5069708491761724, "grad_norm": 0.37079155445098877, "learning_rate": 0.00018025445292620866, "loss": 0.2242, "step": 200 }, { "epoch": 0.5095057034220533, "grad_norm": 0.3533012568950653, "learning_rate": 0.00018015267175572518, "loss": 0.3462, "step": 201 }, { "epoch": 0.5120405576679341, "grad_norm": 0.37727662920951843, "learning_rate": 0.00018005089058524174, "loss": 0.2421, "step": 202 }, { "epoch": 0.514575411913815, "grad_norm": 0.42737269401550293, "learning_rate": 0.00017994910941475828, "loss": 0.3338, "step": 203 }, { "epoch": 0.5171102661596958, "grad_norm": 0.41085687279701233, "learning_rate": 0.00017984732824427482, "loss": 0.4233, "step": 204 }, { "epoch": 0.5196451204055766, "grad_norm": 0.4871644675731659, "learning_rate": 0.00017974554707379136, "loss": 0.3504, "step": 205 }, { "epoch": 0.5221799746514575, "grad_norm": 0.308347225189209, "learning_rate": 0.0001796437659033079, "loss": 0.27, "step": 206 }, { "epoch": 0.5247148288973384, "grad_norm": 0.31587716937065125, "learning_rate": 0.00017954198473282444, "loss": 0.3161, "step": 207 }, { "epoch": 0.5272496831432193, "grad_norm": 0.471392959356308, "learning_rate": 0.00017944020356234098, "loss": 0.3758, "step": 208 }, { "epoch": 0.5297845373891001, "grad_norm": 0.33414778113365173, "learning_rate": 0.00017933842239185752, "loss": 0.3095, "step": 209 }, { "epoch": 0.532319391634981, "grad_norm": 0.26553916931152344, "learning_rate": 0.00017923664122137406, "loss": 0.232, "step": 210 }, { "epoch": 0.5348542458808618, "grad_norm": 0.27914223074913025, "learning_rate": 0.0001791348600508906, "loss": 0.2438, "step": 211 }, { "epoch": 0.5373891001267427, "grad_norm": 0.36625003814697266, "learning_rate": 0.00017903307888040713, "loss": 0.2479, "step": 212 }, { "epoch": 0.5399239543726235, "grad_norm": 0.3876325488090515, "learning_rate": 0.00017893129770992367, "loss": 0.3428, "step": 213 }, { "epoch": 0.5424588086185045, "grad_norm": 0.5402606129646301, "learning_rate": 0.0001788295165394402, "loss": 0.394, "step": 214 }, { "epoch": 0.5449936628643853, "grad_norm": 0.4023256301879883, "learning_rate": 0.00017872773536895675, "loss": 0.3348, "step": 215 }, { "epoch": 0.5475285171102662, "grad_norm": 0.4440263509750366, "learning_rate": 0.0001786259541984733, "loss": 0.3001, "step": 216 }, { "epoch": 0.550063371356147, "grad_norm": 0.39178457856178284, "learning_rate": 0.00017852417302798983, "loss": 0.2561, "step": 217 }, { "epoch": 0.5525982256020279, "grad_norm": 0.5261508226394653, "learning_rate": 0.00017842239185750637, "loss": 0.4583, "step": 218 }, { "epoch": 0.5551330798479087, "grad_norm": 0.3981377184391022, "learning_rate": 0.0001783206106870229, "loss": 0.265, "step": 219 }, { "epoch": 0.5576679340937896, "grad_norm": 0.3689790666103363, "learning_rate": 0.00017821882951653945, "loss": 0.3965, "step": 220 }, { "epoch": 0.5602027883396705, "grad_norm": 0.38442498445510864, "learning_rate": 0.000178117048346056, "loss": 0.268, "step": 221 }, { "epoch": 0.5627376425855514, "grad_norm": 0.3051845133304596, "learning_rate": 0.00017801526717557253, "loss": 0.2362, "step": 222 }, { "epoch": 0.5652724968314322, "grad_norm": 0.41551336646080017, "learning_rate": 0.00017791348600508907, "loss": 0.3428, "step": 223 }, { "epoch": 0.5678073510773131, "grad_norm": 0.2885109484195709, "learning_rate": 0.0001778117048346056, "loss": 0.2328, "step": 224 }, { "epoch": 0.5703422053231939, "grad_norm": 0.48813045024871826, "learning_rate": 0.00017770992366412215, "loss": 0.3502, "step": 225 }, { "epoch": 0.5728770595690748, "grad_norm": 0.4413661062717438, "learning_rate": 0.00017760814249363869, "loss": 0.2687, "step": 226 }, { "epoch": 0.5754119138149556, "grad_norm": 0.422799289226532, "learning_rate": 0.00017750636132315522, "loss": 0.4776, "step": 227 }, { "epoch": 0.5779467680608364, "grad_norm": 0.39486098289489746, "learning_rate": 0.00017740458015267176, "loss": 0.3551, "step": 228 }, { "epoch": 0.5804816223067174, "grad_norm": 0.366207480430603, "learning_rate": 0.0001773027989821883, "loss": 0.2639, "step": 229 }, { "epoch": 0.5830164765525983, "grad_norm": 0.334626704454422, "learning_rate": 0.00017720101781170484, "loss": 0.2407, "step": 230 }, { "epoch": 0.5855513307984791, "grad_norm": 0.5580838918685913, "learning_rate": 0.00017709923664122138, "loss": 0.3856, "step": 231 }, { "epoch": 0.5880861850443599, "grad_norm": 0.3495747148990631, "learning_rate": 0.00017699745547073792, "loss": 0.3113, "step": 232 }, { "epoch": 0.5906210392902408, "grad_norm": 0.38515543937683105, "learning_rate": 0.00017689567430025446, "loss": 0.3765, "step": 233 }, { "epoch": 0.5931558935361216, "grad_norm": 0.43240851163864136, "learning_rate": 0.000176793893129771, "loss": 0.3094, "step": 234 }, { "epoch": 0.5956907477820025, "grad_norm": 0.42353445291519165, "learning_rate": 0.00017669211195928754, "loss": 0.2992, "step": 235 }, { "epoch": 0.5982256020278834, "grad_norm": 0.42463192343711853, "learning_rate": 0.00017659033078880408, "loss": 0.2486, "step": 236 }, { "epoch": 0.6007604562737643, "grad_norm": 0.4749039113521576, "learning_rate": 0.00017648854961832062, "loss": 0.3742, "step": 237 }, { "epoch": 0.6032953105196451, "grad_norm": 0.5651363730430603, "learning_rate": 0.00017638676844783716, "loss": 0.3079, "step": 238 }, { "epoch": 0.605830164765526, "grad_norm": 0.34195011854171753, "learning_rate": 0.0001762849872773537, "loss": 0.3236, "step": 239 }, { "epoch": 0.6083650190114068, "grad_norm": 0.5522583723068237, "learning_rate": 0.00017618320610687024, "loss": 0.3026, "step": 240 }, { "epoch": 0.6108998732572877, "grad_norm": 0.41445448994636536, "learning_rate": 0.00017608142493638677, "loss": 0.32, "step": 241 }, { "epoch": 0.6134347275031685, "grad_norm": 0.5023159384727478, "learning_rate": 0.00017597964376590331, "loss": 0.2658, "step": 242 }, { "epoch": 0.6159695817490495, "grad_norm": 0.39539164304733276, "learning_rate": 0.00017587786259541985, "loss": 0.2687, "step": 243 }, { "epoch": 0.6185044359949303, "grad_norm": 0.3105890154838562, "learning_rate": 0.0001757760814249364, "loss": 0.2224, "step": 244 }, { "epoch": 0.6210392902408112, "grad_norm": 0.3665928840637207, "learning_rate": 0.00017567430025445293, "loss": 0.3101, "step": 245 }, { "epoch": 0.623574144486692, "grad_norm": 0.28569111227989197, "learning_rate": 0.00017557251908396947, "loss": 0.2316, "step": 246 }, { "epoch": 0.6261089987325729, "grad_norm": 0.24598725140094757, "learning_rate": 0.000175470737913486, "loss": 0.2314, "step": 247 }, { "epoch": 0.6286438529784537, "grad_norm": 0.4301004111766815, "learning_rate": 0.00017536895674300255, "loss": 0.2606, "step": 248 }, { "epoch": 0.6311787072243346, "grad_norm": 0.36598455905914307, "learning_rate": 0.0001752671755725191, "loss": 0.2243, "step": 249 }, { "epoch": 0.6337135614702155, "grad_norm": 0.31714677810668945, "learning_rate": 0.00017516539440203563, "loss": 0.2561, "step": 250 }, { "epoch": 0.6362484157160964, "grad_norm": 0.5131182670593262, "learning_rate": 0.0001750636132315522, "loss": 0.3216, "step": 251 }, { "epoch": 0.6387832699619772, "grad_norm": 0.4067549407482147, "learning_rate": 0.0001749618320610687, "loss": 0.3032, "step": 252 }, { "epoch": 0.641318124207858, "grad_norm": 0.6457440853118896, "learning_rate": 0.00017486005089058525, "loss": 0.349, "step": 253 }, { "epoch": 0.6438529784537389, "grad_norm": 0.3759848177433014, "learning_rate": 0.00017475826972010179, "loss": 0.2974, "step": 254 }, { "epoch": 0.6463878326996197, "grad_norm": 0.40348076820373535, "learning_rate": 0.00017465648854961833, "loss": 0.2781, "step": 255 }, { "epoch": 0.6489226869455006, "grad_norm": 0.2639053463935852, "learning_rate": 0.00017455470737913486, "loss": 0.2413, "step": 256 }, { "epoch": 0.6514575411913816, "grad_norm": 0.4014027416706085, "learning_rate": 0.0001744529262086514, "loss": 0.2878, "step": 257 }, { "epoch": 0.6539923954372624, "grad_norm": 0.4871384799480438, "learning_rate": 0.00017435114503816794, "loss": 0.2527, "step": 258 }, { "epoch": 0.6565272496831432, "grad_norm": 0.28687578439712524, "learning_rate": 0.00017424936386768448, "loss": 0.2233, "step": 259 }, { "epoch": 0.6590621039290241, "grad_norm": 0.36948761343955994, "learning_rate": 0.00017414758269720102, "loss": 0.3007, "step": 260 }, { "epoch": 0.6615969581749049, "grad_norm": 0.6034134030342102, "learning_rate": 0.00017404580152671756, "loss": 0.3054, "step": 261 }, { "epoch": 0.6641318124207858, "grad_norm": 0.3481515645980835, "learning_rate": 0.0001739440203562341, "loss": 0.2388, "step": 262 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3772611916065216, "learning_rate": 0.00017384223918575064, "loss": 0.317, "step": 263 }, { "epoch": 0.6692015209125475, "grad_norm": 0.4693986177444458, "learning_rate": 0.0001737404580152672, "loss": 0.3441, "step": 264 }, { "epoch": 0.6717363751584284, "grad_norm": 0.38484400510787964, "learning_rate": 0.00017363867684478372, "loss": 0.2637, "step": 265 }, { "epoch": 0.6742712294043093, "grad_norm": 0.3638555407524109, "learning_rate": 0.00017353689567430026, "loss": 0.2695, "step": 266 }, { "epoch": 0.6768060836501901, "grad_norm": 0.36848586797714233, "learning_rate": 0.0001734351145038168, "loss": 0.3149, "step": 267 }, { "epoch": 0.679340937896071, "grad_norm": 0.31740638613700867, "learning_rate": 0.00017333333333333334, "loss": 0.3049, "step": 268 }, { "epoch": 0.6818757921419518, "grad_norm": 0.41415438055992126, "learning_rate": 0.00017323155216284988, "loss": 0.231, "step": 269 }, { "epoch": 0.6844106463878327, "grad_norm": 0.41449829936027527, "learning_rate": 0.00017312977099236641, "loss": 0.3344, "step": 270 }, { "epoch": 0.6869455006337135, "grad_norm": 0.30683189630508423, "learning_rate": 0.00017302798982188295, "loss": 0.283, "step": 271 }, { "epoch": 0.6894803548795945, "grad_norm": 0.29896244406700134, "learning_rate": 0.0001729262086513995, "loss": 0.2363, "step": 272 }, { "epoch": 0.6920152091254753, "grad_norm": 0.44181492924690247, "learning_rate": 0.00017282442748091603, "loss": 0.3439, "step": 273 }, { "epoch": 0.6945500633713562, "grad_norm": 0.43460434675216675, "learning_rate": 0.00017272264631043257, "loss": 0.3004, "step": 274 }, { "epoch": 0.697084917617237, "grad_norm": 0.40781405568122864, "learning_rate": 0.00017262086513994914, "loss": 0.2554, "step": 275 }, { "epoch": 0.6996197718631179, "grad_norm": 0.39359861612319946, "learning_rate": 0.00017251908396946565, "loss": 0.3094, "step": 276 }, { "epoch": 0.7021546261089987, "grad_norm": 0.4507496953010559, "learning_rate": 0.00017241730279898222, "loss": 0.2985, "step": 277 }, { "epoch": 0.7046894803548795, "grad_norm": 0.4513093829154968, "learning_rate": 0.00017231552162849873, "loss": 0.4, "step": 278 }, { "epoch": 0.7072243346007605, "grad_norm": 0.3133571147918701, "learning_rate": 0.0001722137404580153, "loss": 0.2241, "step": 279 }, { "epoch": 0.7097591888466414, "grad_norm": 0.36957162618637085, "learning_rate": 0.0001721119592875318, "loss": 0.2461, "step": 280 }, { "epoch": 0.7122940430925222, "grad_norm": 0.4224545955657959, "learning_rate": 0.00017201017811704835, "loss": 0.3178, "step": 281 }, { "epoch": 0.714828897338403, "grad_norm": 0.4696861207485199, "learning_rate": 0.0001719083969465649, "loss": 0.3911, "step": 282 }, { "epoch": 0.7173637515842839, "grad_norm": 0.44058746099472046, "learning_rate": 0.00017180661577608143, "loss": 0.3169, "step": 283 }, { "epoch": 0.7198986058301647, "grad_norm": 0.32616788148880005, "learning_rate": 0.00017170483460559797, "loss": 0.2441, "step": 284 }, { "epoch": 0.7224334600760456, "grad_norm": 0.3941279649734497, "learning_rate": 0.0001716030534351145, "loss": 0.3433, "step": 285 }, { "epoch": 0.7249683143219265, "grad_norm": 0.3746216297149658, "learning_rate": 0.00017150127226463104, "loss": 0.3993, "step": 286 }, { "epoch": 0.7275031685678074, "grad_norm": 0.3758716881275177, "learning_rate": 0.00017139949109414758, "loss": 0.3139, "step": 287 }, { "epoch": 0.7300380228136882, "grad_norm": 0.35631927847862244, "learning_rate": 0.00017129770992366415, "loss": 0.2316, "step": 288 }, { "epoch": 0.7325728770595691, "grad_norm": 0.48128026723861694, "learning_rate": 0.00017119592875318066, "loss": 0.3306, "step": 289 }, { "epoch": 0.7351077313054499, "grad_norm": 0.3464122414588928, "learning_rate": 0.00017109414758269723, "loss": 0.3148, "step": 290 }, { "epoch": 0.7376425855513308, "grad_norm": 0.3772057294845581, "learning_rate": 0.00017099236641221374, "loss": 0.274, "step": 291 }, { "epoch": 0.7401774397972116, "grad_norm": 0.2896706759929657, "learning_rate": 0.0001708905852417303, "loss": 0.2275, "step": 292 }, { "epoch": 0.7427122940430925, "grad_norm": 0.48482832312583923, "learning_rate": 0.00017078880407124682, "loss": 0.2913, "step": 293 }, { "epoch": 0.7452471482889734, "grad_norm": 0.3086034655570984, "learning_rate": 0.00017068702290076336, "loss": 0.2453, "step": 294 }, { "epoch": 0.7477820025348543, "grad_norm": 0.42840075492858887, "learning_rate": 0.0001705852417302799, "loss": 0.352, "step": 295 }, { "epoch": 0.7503168567807351, "grad_norm": 0.4574609398841858, "learning_rate": 0.00017048346055979644, "loss": 0.3698, "step": 296 }, { "epoch": 0.752851711026616, "grad_norm": 0.4295889735221863, "learning_rate": 0.00017038167938931298, "loss": 0.3341, "step": 297 }, { "epoch": 0.7553865652724968, "grad_norm": 0.46036672592163086, "learning_rate": 0.00017027989821882952, "loss": 0.3175, "step": 298 }, { "epoch": 0.7579214195183777, "grad_norm": 0.45897790789604187, "learning_rate": 0.00017017811704834608, "loss": 0.31, "step": 299 }, { "epoch": 0.7604562737642585, "grad_norm": 0.2966432273387909, "learning_rate": 0.0001700763358778626, "loss": 0.2439, "step": 300 }, { "epoch": 0.7629911280101395, "grad_norm": 0.32714638113975525, "learning_rate": 0.00016997455470737916, "loss": 0.2653, "step": 301 }, { "epoch": 0.7655259822560203, "grad_norm": 0.32264646887779236, "learning_rate": 0.00016987277353689567, "loss": 0.2728, "step": 302 }, { "epoch": 0.7680608365019012, "grad_norm": 0.4073767066001892, "learning_rate": 0.00016977099236641224, "loss": 0.3501, "step": 303 }, { "epoch": 0.770595690747782, "grad_norm": 0.5493949055671692, "learning_rate": 0.00016966921119592875, "loss": 0.3212, "step": 304 }, { "epoch": 0.7731305449936628, "grad_norm": 0.335705429315567, "learning_rate": 0.00016956743002544532, "loss": 0.299, "step": 305 }, { "epoch": 0.7756653992395437, "grad_norm": 0.32758405804634094, "learning_rate": 0.00016946564885496183, "loss": 0.2547, "step": 306 }, { "epoch": 0.7782002534854245, "grad_norm": 0.32411983609199524, "learning_rate": 0.0001693638676844784, "loss": 0.2593, "step": 307 }, { "epoch": 0.7807351077313055, "grad_norm": 0.5713444352149963, "learning_rate": 0.0001692620865139949, "loss": 0.3661, "step": 308 }, { "epoch": 0.7832699619771863, "grad_norm": 0.3287065327167511, "learning_rate": 0.00016916030534351145, "loss": 0.2559, "step": 309 }, { "epoch": 0.7858048162230672, "grad_norm": 0.3499440550804138, "learning_rate": 0.000169058524173028, "loss": 0.3489, "step": 310 }, { "epoch": 0.788339670468948, "grad_norm": 0.259787917137146, "learning_rate": 0.00016895674300254453, "loss": 0.2451, "step": 311 }, { "epoch": 0.7908745247148289, "grad_norm": 0.3902716338634491, "learning_rate": 0.0001688549618320611, "loss": 0.2821, "step": 312 }, { "epoch": 0.7934093789607097, "grad_norm": 0.4061296582221985, "learning_rate": 0.0001687531806615776, "loss": 0.4289, "step": 313 }, { "epoch": 0.7959442332065906, "grad_norm": 0.3062605857849121, "learning_rate": 0.00016865139949109417, "loss": 0.2489, "step": 314 }, { "epoch": 0.7984790874524715, "grad_norm": 0.36886945366859436, "learning_rate": 0.00016854961832061068, "loss": 0.4049, "step": 315 }, { "epoch": 0.8010139416983524, "grad_norm": 0.25828975439071655, "learning_rate": 0.00016844783715012725, "loss": 0.238, "step": 316 }, { "epoch": 0.8035487959442332, "grad_norm": 0.39747142791748047, "learning_rate": 0.00016834605597964376, "loss": 0.3928, "step": 317 }, { "epoch": 0.8060836501901141, "grad_norm": 0.3884779214859009, "learning_rate": 0.00016824427480916033, "loss": 0.2881, "step": 318 }, { "epoch": 0.8086185044359949, "grad_norm": 0.3687349855899811, "learning_rate": 0.00016814249363867684, "loss": 0.3662, "step": 319 }, { "epoch": 0.8111533586818758, "grad_norm": 0.3631541132926941, "learning_rate": 0.0001680407124681934, "loss": 0.2657, "step": 320 }, { "epoch": 0.8136882129277566, "grad_norm": 0.3174535930156708, "learning_rate": 0.00016793893129770992, "loss": 0.2636, "step": 321 }, { "epoch": 0.8162230671736375, "grad_norm": 0.44168904423713684, "learning_rate": 0.00016783715012722646, "loss": 0.2882, "step": 322 }, { "epoch": 0.8187579214195184, "grad_norm": 0.370685875415802, "learning_rate": 0.000167735368956743, "loss": 0.3228, "step": 323 }, { "epoch": 0.8212927756653993, "grad_norm": 0.3001299798488617, "learning_rate": 0.00016763358778625954, "loss": 0.2256, "step": 324 }, { "epoch": 0.8238276299112801, "grad_norm": 0.37992653250694275, "learning_rate": 0.0001675318066157761, "loss": 0.2633, "step": 325 }, { "epoch": 0.826362484157161, "grad_norm": 0.4739125072956085, "learning_rate": 0.00016743002544529262, "loss": 0.3044, "step": 326 }, { "epoch": 0.8288973384030418, "grad_norm": 0.36424344778060913, "learning_rate": 0.00016732824427480918, "loss": 0.3311, "step": 327 }, { "epoch": 0.8314321926489227, "grad_norm": 0.4474777579307556, "learning_rate": 0.0001672264631043257, "loss": 0.4099, "step": 328 }, { "epoch": 0.8339670468948035, "grad_norm": 0.4337301552295685, "learning_rate": 0.00016712468193384226, "loss": 0.3567, "step": 329 }, { "epoch": 0.8365019011406845, "grad_norm": 0.37666353583335876, "learning_rate": 0.00016702290076335877, "loss": 0.3079, "step": 330 }, { "epoch": 0.8390367553865653, "grad_norm": 0.36810433864593506, "learning_rate": 0.00016692111959287534, "loss": 0.414, "step": 331 }, { "epoch": 0.8415716096324461, "grad_norm": 0.3914581537246704, "learning_rate": 0.00016681933842239185, "loss": 0.2807, "step": 332 }, { "epoch": 0.844106463878327, "grad_norm": 0.3891938626766205, "learning_rate": 0.00016671755725190842, "loss": 0.3101, "step": 333 }, { "epoch": 0.8466413181242078, "grad_norm": 0.4397302269935608, "learning_rate": 0.00016661577608142493, "loss": 0.2659, "step": 334 }, { "epoch": 0.8491761723700887, "grad_norm": 0.3152853846549988, "learning_rate": 0.0001665139949109415, "loss": 0.308, "step": 335 }, { "epoch": 0.8517110266159695, "grad_norm": 0.2894272208213806, "learning_rate": 0.00016641221374045804, "loss": 0.2675, "step": 336 }, { "epoch": 0.8542458808618505, "grad_norm": 0.27995947003364563, "learning_rate": 0.00016631043256997455, "loss": 0.2603, "step": 337 }, { "epoch": 0.8567807351077313, "grad_norm": 0.42209070920944214, "learning_rate": 0.00016620865139949112, "loss": 0.3417, "step": 338 }, { "epoch": 0.8593155893536122, "grad_norm": 0.3781871795654297, "learning_rate": 0.00016610687022900763, "loss": 0.3441, "step": 339 }, { "epoch": 0.861850443599493, "grad_norm": 0.3438952565193176, "learning_rate": 0.0001660050890585242, "loss": 0.2249, "step": 340 }, { "epoch": 0.8643852978453739, "grad_norm": 0.32164961099624634, "learning_rate": 0.0001659033078880407, "loss": 0.2472, "step": 341 }, { "epoch": 0.8669201520912547, "grad_norm": 0.3517252504825592, "learning_rate": 0.00016580152671755727, "loss": 0.2434, "step": 342 }, { "epoch": 0.8694550063371356, "grad_norm": 0.29841092228889465, "learning_rate": 0.00016569974554707378, "loss": 0.2536, "step": 343 }, { "epoch": 0.8719898605830165, "grad_norm": 0.3351423144340515, "learning_rate": 0.00016559796437659035, "loss": 0.2501, "step": 344 }, { "epoch": 0.8745247148288974, "grad_norm": 0.3979301154613495, "learning_rate": 0.00016549618320610686, "loss": 0.2358, "step": 345 }, { "epoch": 0.8770595690747782, "grad_norm": 0.3859489858150482, "learning_rate": 0.00016539440203562343, "loss": 0.2675, "step": 346 }, { "epoch": 0.8795944233206591, "grad_norm": 0.3836475908756256, "learning_rate": 0.00016529262086513994, "loss": 0.2179, "step": 347 }, { "epoch": 0.8821292775665399, "grad_norm": 0.3986142575740814, "learning_rate": 0.0001651908396946565, "loss": 0.2599, "step": 348 }, { "epoch": 0.8846641318124208, "grad_norm": 0.4105628430843353, "learning_rate": 0.00016508905852417305, "loss": 0.242, "step": 349 }, { "epoch": 0.8871989860583016, "grad_norm": 0.34334608912467957, "learning_rate": 0.00016498727735368956, "loss": 0.2771, "step": 350 }, { "epoch": 0.8897338403041825, "grad_norm": 0.3412443995475769, "learning_rate": 0.00016488549618320613, "loss": 0.2289, "step": 351 }, { "epoch": 0.8922686945500634, "grad_norm": 0.3596668541431427, "learning_rate": 0.00016478371501272264, "loss": 0.2253, "step": 352 }, { "epoch": 0.8948035487959443, "grad_norm": 0.43112802505493164, "learning_rate": 0.0001646819338422392, "loss": 0.3116, "step": 353 }, { "epoch": 0.8973384030418251, "grad_norm": 0.4306243062019348, "learning_rate": 0.00016458015267175572, "loss": 0.3099, "step": 354 }, { "epoch": 0.899873257287706, "grad_norm": 0.2773829996585846, "learning_rate": 0.00016447837150127228, "loss": 0.2765, "step": 355 }, { "epoch": 0.9024081115335868, "grad_norm": 0.5014198422431946, "learning_rate": 0.0001643765903307888, "loss": 0.302, "step": 356 }, { "epoch": 0.9049429657794676, "grad_norm": 0.4376792013645172, "learning_rate": 0.00016427480916030536, "loss": 0.2967, "step": 357 }, { "epoch": 0.9074778200253485, "grad_norm": 0.34460946917533875, "learning_rate": 0.00016417302798982187, "loss": 0.3678, "step": 358 }, { "epoch": 0.9100126742712294, "grad_norm": 0.23346909880638123, "learning_rate": 0.00016407124681933844, "loss": 0.2409, "step": 359 }, { "epoch": 0.9125475285171103, "grad_norm": 0.35633108019828796, "learning_rate": 0.00016396946564885498, "loss": 0.3555, "step": 360 }, { "epoch": 0.9150823827629911, "grad_norm": 0.26780250668525696, "learning_rate": 0.00016386768447837152, "loss": 0.2543, "step": 361 }, { "epoch": 0.917617237008872, "grad_norm": 0.34583303332328796, "learning_rate": 0.00016376590330788806, "loss": 0.2444, "step": 362 }, { "epoch": 0.9201520912547528, "grad_norm": 0.38331279158592224, "learning_rate": 0.0001636641221374046, "loss": 0.3549, "step": 363 }, { "epoch": 0.9226869455006337, "grad_norm": 0.37290483713150024, "learning_rate": 0.00016356234096692114, "loss": 0.3311, "step": 364 }, { "epoch": 0.9252217997465145, "grad_norm": 0.406568318605423, "learning_rate": 0.00016346055979643765, "loss": 0.2774, "step": 365 }, { "epoch": 0.9277566539923955, "grad_norm": 0.35498303174972534, "learning_rate": 0.00016335877862595422, "loss": 0.2121, "step": 366 }, { "epoch": 0.9302915082382763, "grad_norm": 0.3682021498680115, "learning_rate": 0.00016325699745547073, "loss": 0.2648, "step": 367 }, { "epoch": 0.9328263624841572, "grad_norm": 0.37826359272003174, "learning_rate": 0.0001631552162849873, "loss": 0.2214, "step": 368 }, { "epoch": 0.935361216730038, "grad_norm": 0.4018029570579529, "learning_rate": 0.0001630534351145038, "loss": 0.2291, "step": 369 }, { "epoch": 0.9378960709759189, "grad_norm": 0.4628411531448364, "learning_rate": 0.00016295165394402037, "loss": 0.3486, "step": 370 }, { "epoch": 0.9404309252217997, "grad_norm": 0.5615106821060181, "learning_rate": 0.00016284987277353689, "loss": 0.3281, "step": 371 }, { "epoch": 0.9429657794676806, "grad_norm": 0.40337833762168884, "learning_rate": 0.00016274809160305345, "loss": 0.22, "step": 372 }, { "epoch": 0.9455006337135615, "grad_norm": 0.4247727692127228, "learning_rate": 0.00016264631043257, "loss": 0.2801, "step": 373 }, { "epoch": 0.9480354879594424, "grad_norm": 0.28746598958969116, "learning_rate": 0.00016254452926208653, "loss": 0.2349, "step": 374 }, { "epoch": 0.9505703422053232, "grad_norm": 0.3654968738555908, "learning_rate": 0.00016244274809160307, "loss": 0.2696, "step": 375 }, { "epoch": 0.9531051964512041, "grad_norm": 0.3999825417995453, "learning_rate": 0.0001623409669211196, "loss": 0.4228, "step": 376 }, { "epoch": 0.9556400506970849, "grad_norm": 0.3065613806247711, "learning_rate": 0.00016223918575063615, "loss": 0.2505, "step": 377 }, { "epoch": 0.9581749049429658, "grad_norm": 0.3503481149673462, "learning_rate": 0.0001621374045801527, "loss": 0.2953, "step": 378 }, { "epoch": 0.9607097591888466, "grad_norm": 0.28918176889419556, "learning_rate": 0.00016203562340966923, "loss": 0.2454, "step": 379 }, { "epoch": 0.9632446134347274, "grad_norm": 0.3047085404396057, "learning_rate": 0.00016193384223918574, "loss": 0.2639, "step": 380 }, { "epoch": 0.9657794676806084, "grad_norm": 0.3775922358036041, "learning_rate": 0.0001618320610687023, "loss": 0.3787, "step": 381 }, { "epoch": 0.9683143219264893, "grad_norm": 0.32147660851478577, "learning_rate": 0.00016173027989821882, "loss": 0.2273, "step": 382 }, { "epoch": 0.9708491761723701, "grad_norm": 0.355747252702713, "learning_rate": 0.00016162849872773538, "loss": 0.2805, "step": 383 }, { "epoch": 0.973384030418251, "grad_norm": 0.2670198082923889, "learning_rate": 0.0001615267175572519, "loss": 0.2393, "step": 384 }, { "epoch": 0.9759188846641318, "grad_norm": 0.3395114839076996, "learning_rate": 0.00016142493638676846, "loss": 0.2893, "step": 385 }, { "epoch": 0.9784537389100126, "grad_norm": 0.3189052641391754, "learning_rate": 0.000161323155216285, "loss": 0.2442, "step": 386 }, { "epoch": 0.9809885931558935, "grad_norm": 0.49379605054855347, "learning_rate": 0.00016122137404580154, "loss": 0.3126, "step": 387 }, { "epoch": 0.9835234474017744, "grad_norm": 0.2787371575832367, "learning_rate": 0.00016111959287531808, "loss": 0.2329, "step": 388 }, { "epoch": 0.9860583016476553, "grad_norm": 0.3559485673904419, "learning_rate": 0.00016101781170483462, "loss": 0.335, "step": 389 }, { "epoch": 0.9885931558935361, "grad_norm": 0.43041396141052246, "learning_rate": 0.00016091603053435116, "loss": 0.3069, "step": 390 }, { "epoch": 0.991128010139417, "grad_norm": 0.3231935203075409, "learning_rate": 0.0001608142493638677, "loss": 0.2354, "step": 391 }, { "epoch": 0.9936628643852978, "grad_norm": 0.3676549792289734, "learning_rate": 0.00016071246819338424, "loss": 0.2958, "step": 392 }, { "epoch": 0.9961977186311787, "grad_norm": 0.37902191281318665, "learning_rate": 0.00016061068702290075, "loss": 0.2792, "step": 393 }, { "epoch": 0.9987325728770595, "grad_norm": 0.47126442193984985, "learning_rate": 0.00016050890585241732, "loss": 0.4871, "step": 394 }, { "epoch": 1.0, "grad_norm": 0.4303727447986603, "learning_rate": 0.00016040712468193383, "loss": 0.2121, "step": 395 }, { "epoch": 1.002534854245881, "grad_norm": 0.3156070411205292, "learning_rate": 0.0001603053435114504, "loss": 0.2528, "step": 396 }, { "epoch": 1.0050697084917617, "grad_norm": 0.3030865788459778, "learning_rate": 0.00016020356234096693, "loss": 0.2029, "step": 397 }, { "epoch": 1.0076045627376427, "grad_norm": 0.2900277376174927, "learning_rate": 0.00016010178117048347, "loss": 0.2192, "step": 398 }, { "epoch": 1.0101394169835234, "grad_norm": 0.4288582503795624, "learning_rate": 0.00016, "loss": 0.308, "step": 399 }, { "epoch": 1.0126742712294043, "grad_norm": 0.3376273214817047, "learning_rate": 0.00015989821882951655, "loss": 0.2569, "step": 400 }, { "epoch": 1.015209125475285, "grad_norm": 0.39375385642051697, "learning_rate": 0.0001597964376590331, "loss": 0.2104, "step": 401 }, { "epoch": 1.017743979721166, "grad_norm": 0.2907378077507019, "learning_rate": 0.00015969465648854963, "loss": 0.2057, "step": 402 }, { "epoch": 1.020278833967047, "grad_norm": 0.3524622917175293, "learning_rate": 0.00015959287531806617, "loss": 0.2296, "step": 403 }, { "epoch": 1.0228136882129277, "grad_norm": 0.36487293243408203, "learning_rate": 0.0001594910941475827, "loss": 0.2133, "step": 404 }, { "epoch": 1.0253485424588087, "grad_norm": 0.4489257335662842, "learning_rate": 0.00015938931297709925, "loss": 0.2162, "step": 405 }, { "epoch": 1.0278833967046894, "grad_norm": 0.41142696142196655, "learning_rate": 0.0001592875318066158, "loss": 0.2383, "step": 406 }, { "epoch": 1.0304182509505704, "grad_norm": 0.3364538848400116, "learning_rate": 0.00015918575063613233, "loss": 0.2077, "step": 407 }, { "epoch": 1.0329531051964511, "grad_norm": 0.576775312423706, "learning_rate": 0.00015908396946564884, "loss": 0.2435, "step": 408 }, { "epoch": 1.035487959442332, "grad_norm": 0.6190880537033081, "learning_rate": 0.0001589821882951654, "loss": 0.252, "step": 409 }, { "epoch": 1.038022813688213, "grad_norm": 0.4943700432777405, "learning_rate": 0.00015888040712468195, "loss": 0.3275, "step": 410 }, { "epoch": 1.0405576679340938, "grad_norm": 0.3160712420940399, "learning_rate": 0.00015877862595419848, "loss": 0.217, "step": 411 }, { "epoch": 1.0430925221799747, "grad_norm": 0.34546172618865967, "learning_rate": 0.00015867684478371502, "loss": 0.2509, "step": 412 }, { "epoch": 1.0456273764258555, "grad_norm": 0.3498256802558899, "learning_rate": 0.00015857506361323156, "loss": 0.2376, "step": 413 }, { "epoch": 1.0481622306717364, "grad_norm": 0.29526984691619873, "learning_rate": 0.0001584732824427481, "loss": 0.2305, "step": 414 }, { "epoch": 1.0506970849176172, "grad_norm": 0.30113956332206726, "learning_rate": 0.00015837150127226464, "loss": 0.2205, "step": 415 }, { "epoch": 1.053231939163498, "grad_norm": 0.4007863402366638, "learning_rate": 0.00015826972010178118, "loss": 0.2407, "step": 416 }, { "epoch": 1.055766793409379, "grad_norm": 0.2594064176082611, "learning_rate": 0.00015816793893129772, "loss": 0.1923, "step": 417 }, { "epoch": 1.0583016476552598, "grad_norm": 0.23412476480007172, "learning_rate": 0.00015806615776081426, "loss": 0.2158, "step": 418 }, { "epoch": 1.0608365019011408, "grad_norm": 0.397443562746048, "learning_rate": 0.0001579643765903308, "loss": 0.3666, "step": 419 }, { "epoch": 1.0633713561470215, "grad_norm": 0.3756926655769348, "learning_rate": 0.00015786259541984734, "loss": 0.2081, "step": 420 }, { "epoch": 1.0659062103929025, "grad_norm": 0.5698515772819519, "learning_rate": 0.00015776081424936388, "loss": 0.2265, "step": 421 }, { "epoch": 1.0684410646387832, "grad_norm": 0.3608737289905548, "learning_rate": 0.00015765903307888042, "loss": 0.3821, "step": 422 }, { "epoch": 1.0709759188846641, "grad_norm": 0.4109106957912445, "learning_rate": 0.00015755725190839696, "loss": 0.3484, "step": 423 }, { "epoch": 1.073510773130545, "grad_norm": 0.38270992040634155, "learning_rate": 0.0001574554707379135, "loss": 0.2365, "step": 424 }, { "epoch": 1.0760456273764258, "grad_norm": 0.2857488989830017, "learning_rate": 0.00015735368956743004, "loss": 0.263, "step": 425 }, { "epoch": 1.0785804816223068, "grad_norm": 0.25236523151397705, "learning_rate": 0.00015725190839694657, "loss": 0.2216, "step": 426 }, { "epoch": 1.0811153358681875, "grad_norm": 0.40370991826057434, "learning_rate": 0.00015715012722646311, "loss": 0.3711, "step": 427 }, { "epoch": 1.0836501901140685, "grad_norm": 0.2624306380748749, "learning_rate": 0.00015704834605597965, "loss": 0.2082, "step": 428 }, { "epoch": 1.0861850443599492, "grad_norm": 0.4375905692577362, "learning_rate": 0.0001569465648854962, "loss": 0.3474, "step": 429 }, { "epoch": 1.0887198986058302, "grad_norm": 0.3287188410758972, "learning_rate": 0.00015684478371501273, "loss": 0.3097, "step": 430 }, { "epoch": 1.091254752851711, "grad_norm": 0.2669587731361389, "learning_rate": 0.00015674300254452927, "loss": 0.229, "step": 431 }, { "epoch": 1.0937896070975919, "grad_norm": 0.28192129731178284, "learning_rate": 0.0001566412213740458, "loss": 0.2226, "step": 432 }, { "epoch": 1.0963244613434728, "grad_norm": 0.30673590302467346, "learning_rate": 0.00015653944020356235, "loss": 0.2331, "step": 433 }, { "epoch": 1.0988593155893536, "grad_norm": 0.34343135356903076, "learning_rate": 0.0001564376590330789, "loss": 0.2567, "step": 434 }, { "epoch": 1.1013941698352345, "grad_norm": 0.4853306710720062, "learning_rate": 0.00015633587786259543, "loss": 0.3688, "step": 435 }, { "epoch": 1.1039290240811153, "grad_norm": 0.42215099930763245, "learning_rate": 0.00015623409669211197, "loss": 0.3465, "step": 436 }, { "epoch": 1.1064638783269962, "grad_norm": 0.5882295370101929, "learning_rate": 0.0001561323155216285, "loss": 0.4502, "step": 437 }, { "epoch": 1.1089987325728772, "grad_norm": 0.44578316807746887, "learning_rate": 0.00015603053435114505, "loss": 0.3345, "step": 438 }, { "epoch": 1.111533586818758, "grad_norm": 0.366653174161911, "learning_rate": 0.00015592875318066159, "loss": 0.2111, "step": 439 }, { "epoch": 1.1140684410646389, "grad_norm": 0.4964495003223419, "learning_rate": 0.00015582697201017812, "loss": 0.2731, "step": 440 }, { "epoch": 1.1166032953105196, "grad_norm": 0.3171039819717407, "learning_rate": 0.00015572519083969466, "loss": 0.2148, "step": 441 }, { "epoch": 1.1191381495564006, "grad_norm": 0.3483026921749115, "learning_rate": 0.0001556234096692112, "loss": 0.2481, "step": 442 }, { "epoch": 1.1216730038022813, "grad_norm": 0.37379321455955505, "learning_rate": 0.00015552162849872774, "loss": 0.3292, "step": 443 }, { "epoch": 1.1242078580481623, "grad_norm": 0.32108721137046814, "learning_rate": 0.00015541984732824428, "loss": 0.3363, "step": 444 }, { "epoch": 1.126742712294043, "grad_norm": 0.3879946768283844, "learning_rate": 0.00015531806615776082, "loss": 0.2891, "step": 445 }, { "epoch": 1.129277566539924, "grad_norm": 0.2334345281124115, "learning_rate": 0.00015521628498727736, "loss": 0.2183, "step": 446 }, { "epoch": 1.131812420785805, "grad_norm": 0.274795264005661, "learning_rate": 0.0001551145038167939, "loss": 0.2002, "step": 447 }, { "epoch": 1.1343472750316856, "grad_norm": 0.45602667331695557, "learning_rate": 0.00015501272264631044, "loss": 0.3282, "step": 448 }, { "epoch": 1.1368821292775666, "grad_norm": 0.25433096289634705, "learning_rate": 0.00015491094147582698, "loss": 0.2195, "step": 449 }, { "epoch": 1.1394169835234473, "grad_norm": 0.3606742024421692, "learning_rate": 0.00015480916030534352, "loss": 0.244, "step": 450 }, { "epoch": 1.1419518377693283, "grad_norm": 0.3597625494003296, "learning_rate": 0.00015470737913486006, "loss": 0.2117, "step": 451 }, { "epoch": 1.144486692015209, "grad_norm": 0.32967302203178406, "learning_rate": 0.0001546055979643766, "loss": 0.2662, "step": 452 }, { "epoch": 1.14702154626109, "grad_norm": 0.32538869976997375, "learning_rate": 0.00015450381679389314, "loss": 0.2439, "step": 453 }, { "epoch": 1.149556400506971, "grad_norm": 0.36263129115104675, "learning_rate": 0.00015440203562340968, "loss": 0.2688, "step": 454 }, { "epoch": 1.1520912547528517, "grad_norm": 0.4200229346752167, "learning_rate": 0.00015430025445292621, "loss": 0.3201, "step": 455 }, { "epoch": 1.1546261089987326, "grad_norm": 0.35889115929603577, "learning_rate": 0.00015419847328244275, "loss": 0.2584, "step": 456 }, { "epoch": 1.1571609632446134, "grad_norm": 0.36060044169425964, "learning_rate": 0.0001540966921119593, "loss": 0.2496, "step": 457 }, { "epoch": 1.1596958174904943, "grad_norm": 0.3046696186065674, "learning_rate": 0.00015399491094147583, "loss": 0.2102, "step": 458 }, { "epoch": 1.162230671736375, "grad_norm": 0.4576256275177002, "learning_rate": 0.00015389312977099237, "loss": 0.3594, "step": 459 }, { "epoch": 1.164765525982256, "grad_norm": 0.3436565697193146, "learning_rate": 0.0001537913486005089, "loss": 0.2289, "step": 460 }, { "epoch": 1.167300380228137, "grad_norm": 0.4197808802127838, "learning_rate": 0.00015368956743002545, "loss": 0.2863, "step": 461 }, { "epoch": 1.1698352344740177, "grad_norm": 0.3584151566028595, "learning_rate": 0.000153587786259542, "loss": 0.2797, "step": 462 }, { "epoch": 1.1723700887198987, "grad_norm": 0.29760056734085083, "learning_rate": 0.00015348600508905853, "loss": 0.212, "step": 463 }, { "epoch": 1.1749049429657794, "grad_norm": 0.3856862485408783, "learning_rate": 0.00015338422391857507, "loss": 0.2986, "step": 464 }, { "epoch": 1.1774397972116604, "grad_norm": 0.42522993683815, "learning_rate": 0.0001532824427480916, "loss": 0.2869, "step": 465 }, { "epoch": 1.179974651457541, "grad_norm": 0.33221253752708435, "learning_rate": 0.00015318066157760815, "loss": 0.2236, "step": 466 }, { "epoch": 1.182509505703422, "grad_norm": 0.35414496064186096, "learning_rate": 0.00015307888040712469, "loss": 0.2658, "step": 467 }, { "epoch": 1.1850443599493028, "grad_norm": 0.41883930563926697, "learning_rate": 0.00015297709923664123, "loss": 0.3939, "step": 468 }, { "epoch": 1.1875792141951838, "grad_norm": 0.3070299029350281, "learning_rate": 0.00015287531806615776, "loss": 0.2208, "step": 469 }, { "epoch": 1.1901140684410647, "grad_norm": 0.30749714374542236, "learning_rate": 0.0001527735368956743, "loss": 0.242, "step": 470 }, { "epoch": 1.1926489226869454, "grad_norm": 0.2579677104949951, "learning_rate": 0.00015267175572519084, "loss": 0.2435, "step": 471 }, { "epoch": 1.1951837769328264, "grad_norm": 0.46220460534095764, "learning_rate": 0.00015256997455470738, "loss": 0.2803, "step": 472 }, { "epoch": 1.1977186311787071, "grad_norm": 0.3824957609176636, "learning_rate": 0.00015246819338422392, "loss": 0.3143, "step": 473 }, { "epoch": 1.200253485424588, "grad_norm": 0.3049899637699127, "learning_rate": 0.00015236641221374046, "loss": 0.2231, "step": 474 }, { "epoch": 1.202788339670469, "grad_norm": 0.4378805458545685, "learning_rate": 0.000152264631043257, "loss": 0.2041, "step": 475 }, { "epoch": 1.2053231939163498, "grad_norm": 0.3902495801448822, "learning_rate": 0.00015216284987277354, "loss": 0.3055, "step": 476 }, { "epoch": 1.2078580481622307, "grad_norm": 0.3150664269924164, "learning_rate": 0.00015206106870229008, "loss": 0.2222, "step": 477 }, { "epoch": 1.2103929024081115, "grad_norm": 0.3551795184612274, "learning_rate": 0.00015195928753180662, "loss": 0.2304, "step": 478 }, { "epoch": 1.2129277566539924, "grad_norm": 0.35522422194480896, "learning_rate": 0.00015185750636132316, "loss": 0.2636, "step": 479 }, { "epoch": 1.2154626108998732, "grad_norm": 0.35261449217796326, "learning_rate": 0.0001517557251908397, "loss": 0.2743, "step": 480 }, { "epoch": 1.2179974651457541, "grad_norm": 0.4755167067050934, "learning_rate": 0.00015165394402035624, "loss": 0.321, "step": 481 }, { "epoch": 1.2205323193916349, "grad_norm": 0.36083585023880005, "learning_rate": 0.0001515521628498728, "loss": 0.2549, "step": 482 }, { "epoch": 1.2230671736375158, "grad_norm": 0.3213503956794739, "learning_rate": 0.00015145038167938932, "loss": 0.2685, "step": 483 }, { "epoch": 1.2256020278833968, "grad_norm": 0.29988422989845276, "learning_rate": 0.00015134860050890588, "loss": 0.3253, "step": 484 }, { "epoch": 1.2281368821292775, "grad_norm": 0.3549601435661316, "learning_rate": 0.0001512468193384224, "loss": 0.2574, "step": 485 }, { "epoch": 1.2306717363751585, "grad_norm": 0.33347830176353455, "learning_rate": 0.00015114503816793893, "loss": 0.3408, "step": 486 }, { "epoch": 1.2332065906210392, "grad_norm": 0.2988692820072174, "learning_rate": 0.00015104325699745547, "loss": 0.2583, "step": 487 }, { "epoch": 1.2357414448669202, "grad_norm": 0.2710984945297241, "learning_rate": 0.000150941475826972, "loss": 0.2708, "step": 488 }, { "epoch": 1.2382762991128011, "grad_norm": 0.28278592228889465, "learning_rate": 0.00015083969465648855, "loss": 0.2345, "step": 489 }, { "epoch": 1.2408111533586819, "grad_norm": 0.31838810443878174, "learning_rate": 0.0001507379134860051, "loss": 0.2193, "step": 490 }, { "epoch": 1.2433460076045628, "grad_norm": 0.31196919083595276, "learning_rate": 0.00015063613231552163, "loss": 0.2334, "step": 491 }, { "epoch": 1.2458808618504436, "grad_norm": 0.3953218460083008, "learning_rate": 0.00015053435114503817, "loss": 0.2716, "step": 492 }, { "epoch": 1.2484157160963245, "grad_norm": 0.4814457297325134, "learning_rate": 0.0001504325699745547, "loss": 0.2847, "step": 493 }, { "epoch": 1.2509505703422052, "grad_norm": 0.5870761275291443, "learning_rate": 0.00015033078880407125, "loss": 0.3685, "step": 494 }, { "epoch": 1.2534854245880862, "grad_norm": 0.30315646529197693, "learning_rate": 0.00015022900763358781, "loss": 0.2112, "step": 495 }, { "epoch": 1.256020278833967, "grad_norm": 0.4358583390712738, "learning_rate": 0.00015012722646310433, "loss": 0.279, "step": 496 }, { "epoch": 1.258555133079848, "grad_norm": 0.3699369728565216, "learning_rate": 0.0001500254452926209, "loss": 0.2941, "step": 497 }, { "epoch": 1.2610899873257289, "grad_norm": 0.338522344827652, "learning_rate": 0.0001499236641221374, "loss": 0.273, "step": 498 }, { "epoch": 1.2636248415716096, "grad_norm": 0.29661208391189575, "learning_rate": 0.00014982188295165397, "loss": 0.23, "step": 499 }, { "epoch": 1.2661596958174905, "grad_norm": 0.4247685968875885, "learning_rate": 0.00014972010178117048, "loss": 0.3112, "step": 500 }, { "epoch": 1.2686945500633713, "grad_norm": 0.44488340616226196, "learning_rate": 0.00014961832061068702, "loss": 0.3796, "step": 501 }, { "epoch": 1.2712294043092522, "grad_norm": 0.30672356486320496, "learning_rate": 0.00014951653944020356, "loss": 0.2222, "step": 502 }, { "epoch": 1.2737642585551332, "grad_norm": 0.3291172981262207, "learning_rate": 0.0001494147582697201, "loss": 0.2177, "step": 503 }, { "epoch": 1.276299112801014, "grad_norm": 0.4180152118206024, "learning_rate": 0.00014931297709923664, "loss": 0.3673, "step": 504 }, { "epoch": 1.2788339670468947, "grad_norm": 0.41350388526916504, "learning_rate": 0.00014921119592875318, "loss": 0.2544, "step": 505 }, { "epoch": 1.2813688212927756, "grad_norm": 0.3517690598964691, "learning_rate": 0.00014910941475826972, "loss": 0.2139, "step": 506 }, { "epoch": 1.2839036755386566, "grad_norm": 0.4273949861526489, "learning_rate": 0.00014900763358778626, "loss": 0.255, "step": 507 }, { "epoch": 1.2864385297845373, "grad_norm": 0.3510381877422333, "learning_rate": 0.00014890585241730283, "loss": 0.2503, "step": 508 }, { "epoch": 1.2889733840304183, "grad_norm": 0.4069119393825531, "learning_rate": 0.00014880407124681934, "loss": 0.3267, "step": 509 }, { "epoch": 1.291508238276299, "grad_norm": 0.6244072318077087, "learning_rate": 0.0001487022900763359, "loss": 0.2519, "step": 510 }, { "epoch": 1.29404309252218, "grad_norm": 0.473450630903244, "learning_rate": 0.00014860050890585242, "loss": 0.3093, "step": 511 }, { "epoch": 1.296577946768061, "grad_norm": 0.3139822781085968, "learning_rate": 0.00014849872773536898, "loss": 0.2396, "step": 512 }, { "epoch": 1.2991128010139417, "grad_norm": 0.23700624704360962, "learning_rate": 0.0001483969465648855, "loss": 0.1945, "step": 513 }, { "epoch": 1.3016476552598226, "grad_norm": 0.42849189043045044, "learning_rate": 0.00014829516539440203, "loss": 0.2275, "step": 514 }, { "epoch": 1.3041825095057034, "grad_norm": 0.4083426296710968, "learning_rate": 0.00014819338422391857, "loss": 0.3626, "step": 515 }, { "epoch": 1.3067173637515843, "grad_norm": 0.4541410207748413, "learning_rate": 0.0001480916030534351, "loss": 0.3102, "step": 516 }, { "epoch": 1.3092522179974653, "grad_norm": 0.6483343839645386, "learning_rate": 0.00014798982188295165, "loss": 0.3427, "step": 517 }, { "epoch": 1.311787072243346, "grad_norm": 0.3928525447845459, "learning_rate": 0.0001478880407124682, "loss": 0.3155, "step": 518 }, { "epoch": 1.3143219264892267, "grad_norm": 0.319035142660141, "learning_rate": 0.00014778625954198476, "loss": 0.2555, "step": 519 }, { "epoch": 1.3168567807351077, "grad_norm": 0.2855183780193329, "learning_rate": 0.00014768447837150127, "loss": 0.2115, "step": 520 }, { "epoch": 1.3193916349809887, "grad_norm": 0.3499714136123657, "learning_rate": 0.00014758269720101784, "loss": 0.254, "step": 521 }, { "epoch": 1.3219264892268694, "grad_norm": 0.40895748138427734, "learning_rate": 0.00014748091603053435, "loss": 0.2975, "step": 522 }, { "epoch": 1.3244613434727504, "grad_norm": 0.30614539980888367, "learning_rate": 0.00014737913486005091, "loss": 0.2584, "step": 523 }, { "epoch": 1.326996197718631, "grad_norm": 0.2832574248313904, "learning_rate": 0.00014727735368956743, "loss": 0.2259, "step": 524 }, { "epoch": 1.329531051964512, "grad_norm": 0.3444589674472809, "learning_rate": 0.000147175572519084, "loss": 0.2608, "step": 525 }, { "epoch": 1.332065906210393, "grad_norm": 0.35170844197273254, "learning_rate": 0.0001470737913486005, "loss": 0.3019, "step": 526 }, { "epoch": 1.3346007604562737, "grad_norm": 0.46164563298225403, "learning_rate": 0.00014697201017811707, "loss": 0.2024, "step": 527 }, { "epoch": 1.3371356147021547, "grad_norm": 0.2369971126317978, "learning_rate": 0.00014687022900763358, "loss": 0.1967, "step": 528 }, { "epoch": 1.3396704689480354, "grad_norm": 0.43180060386657715, "learning_rate": 0.00014676844783715012, "loss": 0.2415, "step": 529 }, { "epoch": 1.3422053231939164, "grad_norm": 0.3531292676925659, "learning_rate": 0.00014666666666666666, "loss": 0.2283, "step": 530 }, { "epoch": 1.3447401774397973, "grad_norm": 0.49374547600746155, "learning_rate": 0.0001465648854961832, "loss": 0.3025, "step": 531 }, { "epoch": 1.347275031685678, "grad_norm": 0.4822668731212616, "learning_rate": 0.00014646310432569977, "loss": 0.3498, "step": 532 }, { "epoch": 1.3498098859315588, "grad_norm": 0.4463392496109009, "learning_rate": 0.00014636132315521628, "loss": 0.2186, "step": 533 }, { "epoch": 1.3523447401774398, "grad_norm": 0.40042299032211304, "learning_rate": 0.00014625954198473285, "loss": 0.2316, "step": 534 }, { "epoch": 1.3548795944233207, "grad_norm": 0.41266927123069763, "learning_rate": 0.00014615776081424936, "loss": 0.2324, "step": 535 }, { "epoch": 1.3574144486692015, "grad_norm": 0.46208152174949646, "learning_rate": 0.00014605597964376593, "loss": 0.2261, "step": 536 }, { "epoch": 1.3599493029150824, "grad_norm": 0.38895705342292786, "learning_rate": 0.00014595419847328244, "loss": 0.2732, "step": 537 }, { "epoch": 1.3624841571609632, "grad_norm": 0.4489743113517761, "learning_rate": 0.000145852417302799, "loss": 0.3197, "step": 538 }, { "epoch": 1.3650190114068441, "grad_norm": 0.25082916021347046, "learning_rate": 0.00014575063613231552, "loss": 0.2096, "step": 539 }, { "epoch": 1.367553865652725, "grad_norm": 0.3681942820549011, "learning_rate": 0.00014564885496183208, "loss": 0.2496, "step": 540 }, { "epoch": 1.3700887198986058, "grad_norm": 0.30986878275871277, "learning_rate": 0.0001455470737913486, "loss": 0.2244, "step": 541 }, { "epoch": 1.3726235741444868, "grad_norm": 0.42349961400032043, "learning_rate": 0.00014544529262086513, "loss": 0.2315, "step": 542 }, { "epoch": 1.3751584283903675, "grad_norm": 0.29656872153282166, "learning_rate": 0.00014534351145038167, "loss": 0.2458, "step": 543 }, { "epoch": 1.3776932826362485, "grad_norm": 0.4033924341201782, "learning_rate": 0.0001452417302798982, "loss": 0.3506, "step": 544 }, { "epoch": 1.3802281368821292, "grad_norm": 0.3998583257198334, "learning_rate": 0.00014513994910941478, "loss": 0.3108, "step": 545 }, { "epoch": 1.3827629911280102, "grad_norm": 0.3335135281085968, "learning_rate": 0.0001450381679389313, "loss": 0.2816, "step": 546 }, { "epoch": 1.385297845373891, "grad_norm": 0.39304816722869873, "learning_rate": 0.00014493638676844786, "loss": 0.3968, "step": 547 }, { "epoch": 1.3878326996197718, "grad_norm": 0.34913384914398193, "learning_rate": 0.00014483460559796437, "loss": 0.2653, "step": 548 }, { "epoch": 1.3903675538656528, "grad_norm": 0.3312399387359619, "learning_rate": 0.00014473282442748094, "loss": 0.2629, "step": 549 }, { "epoch": 1.3929024081115335, "grad_norm": 0.31613558530807495, "learning_rate": 0.00014463104325699745, "loss": 0.2033, "step": 550 }, { "epoch": 1.3954372623574145, "grad_norm": 0.2872864603996277, "learning_rate": 0.00014452926208651402, "loss": 0.2097, "step": 551 }, { "epoch": 1.3979721166032952, "grad_norm": 0.24432098865509033, "learning_rate": 0.00014442748091603053, "loss": 0.2172, "step": 552 }, { "epoch": 1.4005069708491762, "grad_norm": 0.31649062037467957, "learning_rate": 0.0001443256997455471, "loss": 0.2255, "step": 553 }, { "epoch": 1.4030418250950571, "grad_norm": 0.2483261376619339, "learning_rate": 0.0001442239185750636, "loss": 0.1856, "step": 554 }, { "epoch": 1.4055766793409379, "grad_norm": 0.437757670879364, "learning_rate": 0.00014412213740458017, "loss": 0.2713, "step": 555 }, { "epoch": 1.4081115335868186, "grad_norm": 0.43551307916641235, "learning_rate": 0.0001440203562340967, "loss": 0.2654, "step": 556 }, { "epoch": 1.4106463878326996, "grad_norm": 0.5781947374343872, "learning_rate": 0.00014391857506361322, "loss": 0.3242, "step": 557 }, { "epoch": 1.4131812420785805, "grad_norm": 0.3809725344181061, "learning_rate": 0.0001438167938931298, "loss": 0.2176, "step": 558 }, { "epoch": 1.4157160963244613, "grad_norm": 0.38208654522895813, "learning_rate": 0.0001437150127226463, "loss": 0.2043, "step": 559 }, { "epoch": 1.4182509505703422, "grad_norm": 0.39930659532546997, "learning_rate": 0.00014361323155216287, "loss": 0.2914, "step": 560 }, { "epoch": 1.420785804816223, "grad_norm": 0.3019846975803375, "learning_rate": 0.00014351145038167938, "loss": 0.2037, "step": 561 }, { "epoch": 1.423320659062104, "grad_norm": 0.4549913704395294, "learning_rate": 0.00014340966921119595, "loss": 0.2308, "step": 562 }, { "epoch": 1.4258555133079849, "grad_norm": 0.38887929916381836, "learning_rate": 0.00014330788804071246, "loss": 0.2339, "step": 563 }, { "epoch": 1.4283903675538656, "grad_norm": 0.3481290340423584, "learning_rate": 0.00014320610687022903, "loss": 0.2206, "step": 564 }, { "epoch": 1.4309252217997466, "grad_norm": 0.46603840589523315, "learning_rate": 0.00014310432569974554, "loss": 0.3006, "step": 565 }, { "epoch": 1.4334600760456273, "grad_norm": 0.3586963713169098, "learning_rate": 0.0001430025445292621, "loss": 0.2646, "step": 566 }, { "epoch": 1.4359949302915083, "grad_norm": 0.3106522560119629, "learning_rate": 0.00014290076335877862, "loss": 0.2725, "step": 567 }, { "epoch": 1.4385297845373892, "grad_norm": 0.48086050152778625, "learning_rate": 0.00014279898218829518, "loss": 0.3007, "step": 568 }, { "epoch": 1.44106463878327, "grad_norm": 0.44636330008506775, "learning_rate": 0.00014269720101781172, "loss": 0.3755, "step": 569 }, { "epoch": 1.4435994930291507, "grad_norm": 0.3114064633846283, "learning_rate": 0.00014259541984732824, "loss": 0.2606, "step": 570 }, { "epoch": 1.4461343472750317, "grad_norm": 0.358394593000412, "learning_rate": 0.0001424936386768448, "loss": 0.27, "step": 571 }, { "epoch": 1.4486692015209126, "grad_norm": 0.3568032681941986, "learning_rate": 0.00014239185750636131, "loss": 0.2767, "step": 572 }, { "epoch": 1.4512040557667933, "grad_norm": 0.4407200515270233, "learning_rate": 0.00014229007633587788, "loss": 0.3786, "step": 573 }, { "epoch": 1.4537389100126743, "grad_norm": 0.4096840023994446, "learning_rate": 0.0001421882951653944, "loss": 0.3199, "step": 574 }, { "epoch": 1.456273764258555, "grad_norm": 0.3343110680580139, "learning_rate": 0.00014208651399491096, "loss": 0.2538, "step": 575 }, { "epoch": 1.458808618504436, "grad_norm": 0.27782517671585083, "learning_rate": 0.00014198473282442747, "loss": 0.2179, "step": 576 }, { "epoch": 1.461343472750317, "grad_norm": 0.2901310920715332, "learning_rate": 0.00014188295165394404, "loss": 0.2552, "step": 577 }, { "epoch": 1.4638783269961977, "grad_norm": 0.3634903132915497, "learning_rate": 0.00014178117048346055, "loss": 0.257, "step": 578 }, { "epoch": 1.4664131812420786, "grad_norm": 0.37307262420654297, "learning_rate": 0.00014167938931297712, "loss": 0.254, "step": 579 }, { "epoch": 1.4689480354879594, "grad_norm": 0.27726346254348755, "learning_rate": 0.00014157760814249366, "loss": 0.1938, "step": 580 }, { "epoch": 1.4714828897338403, "grad_norm": 0.3364371657371521, "learning_rate": 0.0001414758269720102, "loss": 0.2094, "step": 581 }, { "epoch": 1.4740177439797213, "grad_norm": 0.4418800473213196, "learning_rate": 0.00014137404580152673, "loss": 0.3243, "step": 582 }, { "epoch": 1.476552598225602, "grad_norm": 0.42042022943496704, "learning_rate": 0.00014127226463104327, "loss": 0.2333, "step": 583 }, { "epoch": 1.4790874524714828, "grad_norm": 0.36881470680236816, "learning_rate": 0.0001411704834605598, "loss": 0.2513, "step": 584 }, { "epoch": 1.4816223067173637, "grad_norm": 0.4009782671928406, "learning_rate": 0.00014106870229007632, "loss": 0.3085, "step": 585 }, { "epoch": 1.4841571609632447, "grad_norm": 0.43179744482040405, "learning_rate": 0.0001409669211195929, "loss": 0.3189, "step": 586 }, { "epoch": 1.4866920152091254, "grad_norm": 0.3721300959587097, "learning_rate": 0.0001408651399491094, "loss": 0.2318, "step": 587 }, { "epoch": 1.4892268694550064, "grad_norm": 0.3875066339969635, "learning_rate": 0.00014076335877862597, "loss": 0.2753, "step": 588 }, { "epoch": 1.491761723700887, "grad_norm": 0.35223937034606934, "learning_rate": 0.00014066157760814248, "loss": 0.2257, "step": 589 }, { "epoch": 1.494296577946768, "grad_norm": 0.30979710817337036, "learning_rate": 0.00014055979643765905, "loss": 0.2149, "step": 590 }, { "epoch": 1.496831432192649, "grad_norm": 0.23923753201961517, "learning_rate": 0.00014045801526717556, "loss": 0.1911, "step": 591 }, { "epoch": 1.4993662864385298, "grad_norm": 0.40893304347991943, "learning_rate": 0.00014035623409669213, "loss": 0.2756, "step": 592 }, { "epoch": 1.5019011406844105, "grad_norm": 0.2659086585044861, "learning_rate": 0.00014025445292620867, "loss": 0.2154, "step": 593 }, { "epoch": 1.5044359949302915, "grad_norm": 0.30749884247779846, "learning_rate": 0.0001401526717557252, "loss": 0.2184, "step": 594 }, { "epoch": 1.5069708491761724, "grad_norm": 0.3892879784107208, "learning_rate": 0.00014005089058524175, "loss": 0.2849, "step": 595 }, { "epoch": 1.5095057034220534, "grad_norm": 0.5041462779045105, "learning_rate": 0.00013994910941475828, "loss": 0.2551, "step": 596 }, { "epoch": 1.512040557667934, "grad_norm": 0.4143123924732208, "learning_rate": 0.00013984732824427482, "loss": 0.2485, "step": 597 }, { "epoch": 1.5145754119138148, "grad_norm": 0.5315548181533813, "learning_rate": 0.00013974554707379136, "loss": 0.3242, "step": 598 }, { "epoch": 1.5171102661596958, "grad_norm": 0.28680169582366943, "learning_rate": 0.0001396437659033079, "loss": 0.227, "step": 599 }, { "epoch": 1.5196451204055768, "grad_norm": 0.3015950620174408, "learning_rate": 0.00013954198473282441, "loss": 0.2122, "step": 600 }, { "epoch": 1.5221799746514575, "grad_norm": 0.30785971879959106, "learning_rate": 0.00013944020356234098, "loss": 0.2194, "step": 601 }, { "epoch": 1.5247148288973384, "grad_norm": 0.3596206605434418, "learning_rate": 0.0001393384223918575, "loss": 0.2574, "step": 602 }, { "epoch": 1.5272496831432192, "grad_norm": 0.18499840795993805, "learning_rate": 0.00013923664122137406, "loss": 0.1944, "step": 603 }, { "epoch": 1.5297845373891001, "grad_norm": 0.4346081614494324, "learning_rate": 0.00013913486005089057, "loss": 0.3187, "step": 604 }, { "epoch": 1.532319391634981, "grad_norm": 0.46154457330703735, "learning_rate": 0.00013903307888040714, "loss": 0.3149, "step": 605 }, { "epoch": 1.5348542458808618, "grad_norm": 0.3444209098815918, "learning_rate": 0.00013893129770992368, "loss": 0.2801, "step": 606 }, { "epoch": 1.5373891001267426, "grad_norm": 0.550620436668396, "learning_rate": 0.00013882951653944022, "loss": 0.3038, "step": 607 }, { "epoch": 1.5399239543726235, "grad_norm": 0.36603689193725586, "learning_rate": 0.00013872773536895676, "loss": 0.3224, "step": 608 }, { "epoch": 1.5424588086185045, "grad_norm": 0.213638037443161, "learning_rate": 0.0001386259541984733, "loss": 0.2081, "step": 609 }, { "epoch": 1.5449936628643854, "grad_norm": 0.34508904814720154, "learning_rate": 0.00013852417302798983, "loss": 0.2474, "step": 610 }, { "epoch": 1.5475285171102662, "grad_norm": 0.42072099447250366, "learning_rate": 0.00013842239185750637, "loss": 0.3049, "step": 611 }, { "epoch": 1.550063371356147, "grad_norm": 0.3760271966457367, "learning_rate": 0.0001383206106870229, "loss": 0.2499, "step": 612 }, { "epoch": 1.5525982256020279, "grad_norm": 0.24040678143501282, "learning_rate": 0.00013821882951653943, "loss": 0.2134, "step": 613 }, { "epoch": 1.5551330798479088, "grad_norm": 0.458035945892334, "learning_rate": 0.000138117048346056, "loss": 0.3375, "step": 614 }, { "epoch": 1.5576679340937896, "grad_norm": 0.30446937680244446, "learning_rate": 0.0001380152671755725, "loss": 0.2252, "step": 615 }, { "epoch": 1.5602027883396705, "grad_norm": 0.3036455810070038, "learning_rate": 0.00013791348600508907, "loss": 0.2095, "step": 616 }, { "epoch": 1.5627376425855513, "grad_norm": 0.4190979301929474, "learning_rate": 0.0001378117048346056, "loss": 0.2932, "step": 617 }, { "epoch": 1.5652724968314322, "grad_norm": 0.27648523449897766, "learning_rate": 0.00013770992366412215, "loss": 0.2133, "step": 618 }, { "epoch": 1.5678073510773132, "grad_norm": 0.28326693177223206, "learning_rate": 0.0001376081424936387, "loss": 0.2087, "step": 619 }, { "epoch": 1.570342205323194, "grad_norm": 0.3020143508911133, "learning_rate": 0.00013750636132315523, "loss": 0.2321, "step": 620 }, { "epoch": 1.5728770595690746, "grad_norm": 0.3246900141239166, "learning_rate": 0.00013740458015267177, "loss": 0.2121, "step": 621 }, { "epoch": 1.5754119138149556, "grad_norm": 0.3806106448173523, "learning_rate": 0.0001373027989821883, "loss": 0.2856, "step": 622 }, { "epoch": 1.5779467680608366, "grad_norm": 0.3568238317966461, "learning_rate": 0.00013720101781170485, "loss": 0.2579, "step": 623 }, { "epoch": 1.5804816223067175, "grad_norm": 0.45590534806251526, "learning_rate": 0.00013709923664122139, "loss": 0.2059, "step": 624 }, { "epoch": 1.5830164765525983, "grad_norm": 0.41996893286705017, "learning_rate": 0.00013699745547073792, "loss": 0.2154, "step": 625 }, { "epoch": 1.585551330798479, "grad_norm": 0.5142170190811157, "learning_rate": 0.00013689567430025446, "loss": 0.2708, "step": 626 }, { "epoch": 1.58808618504436, "grad_norm": 0.36335933208465576, "learning_rate": 0.000136793893129771, "loss": 0.2501, "step": 627 }, { "epoch": 1.590621039290241, "grad_norm": 0.3186666667461395, "learning_rate": 0.00013669211195928752, "loss": 0.2227, "step": 628 }, { "epoch": 1.5931558935361216, "grad_norm": 0.29709601402282715, "learning_rate": 0.00013659033078880408, "loss": 0.2265, "step": 629 }, { "epoch": 1.5956907477820024, "grad_norm": 0.2891612648963928, "learning_rate": 0.00013648854961832062, "loss": 0.2298, "step": 630 }, { "epoch": 1.5982256020278833, "grad_norm": 0.2191978096961975, "learning_rate": 0.00013638676844783716, "loss": 0.2049, "step": 631 }, { "epoch": 1.6007604562737643, "grad_norm": 0.37781399488449097, "learning_rate": 0.0001362849872773537, "loss": 0.3664, "step": 632 }, { "epoch": 1.6032953105196452, "grad_norm": 0.3082154393196106, "learning_rate": 0.00013618320610687024, "loss": 0.2063, "step": 633 }, { "epoch": 1.605830164765526, "grad_norm": 0.318317711353302, "learning_rate": 0.00013608142493638678, "loss": 0.2085, "step": 634 }, { "epoch": 1.6083650190114067, "grad_norm": 0.45566102862358093, "learning_rate": 0.00013597964376590332, "loss": 0.2876, "step": 635 }, { "epoch": 1.6108998732572877, "grad_norm": 0.3186021149158478, "learning_rate": 0.00013587786259541986, "loss": 0.2704, "step": 636 }, { "epoch": 1.6134347275031686, "grad_norm": 0.28905680775642395, "learning_rate": 0.0001357760814249364, "loss": 0.209, "step": 637 }, { "epoch": 1.6159695817490496, "grad_norm": 0.23341360688209534, "learning_rate": 0.00013567430025445294, "loss": 0.1835, "step": 638 }, { "epoch": 1.6185044359949303, "grad_norm": 0.336247056722641, "learning_rate": 0.00013557251908396947, "loss": 0.2547, "step": 639 }, { "epoch": 1.621039290240811, "grad_norm": 0.3736225366592407, "learning_rate": 0.00013547073791348601, "loss": 0.3053, "step": 640 }, { "epoch": 1.623574144486692, "grad_norm": 0.3983825743198395, "learning_rate": 0.00013536895674300255, "loss": 0.2395, "step": 641 }, { "epoch": 1.626108998732573, "grad_norm": 0.35913559794425964, "learning_rate": 0.0001352671755725191, "loss": 0.2918, "step": 642 }, { "epoch": 1.6286438529784537, "grad_norm": 0.2984326183795929, "learning_rate": 0.00013516539440203563, "loss": 0.2148, "step": 643 }, { "epoch": 1.6311787072243344, "grad_norm": 0.3113880753517151, "learning_rate": 0.00013506361323155217, "loss": 0.2044, "step": 644 }, { "epoch": 1.6337135614702154, "grad_norm": 0.5340004563331604, "learning_rate": 0.0001349618320610687, "loss": 0.3234, "step": 645 }, { "epoch": 1.6362484157160964, "grad_norm": 0.38927194476127625, "learning_rate": 0.00013486005089058525, "loss": 0.2866, "step": 646 }, { "epoch": 1.6387832699619773, "grad_norm": 0.38895881175994873, "learning_rate": 0.0001347582697201018, "loss": 0.2324, "step": 647 }, { "epoch": 1.641318124207858, "grad_norm": 0.41959917545318604, "learning_rate": 0.00013465648854961833, "loss": 0.2666, "step": 648 }, { "epoch": 1.6438529784537388, "grad_norm": 0.4299626648426056, "learning_rate": 0.00013455470737913487, "loss": 0.2905, "step": 649 }, { "epoch": 1.6463878326996197, "grad_norm": 0.4236285090446472, "learning_rate": 0.0001344529262086514, "loss": 0.292, "step": 650 }, { "epoch": 1.6489226869455007, "grad_norm": 0.8049849271774292, "learning_rate": 0.00013435114503816795, "loss": 0.2351, "step": 651 }, { "epoch": 1.6514575411913817, "grad_norm": 0.3420075476169586, "learning_rate": 0.00013424936386768449, "loss": 0.2355, "step": 652 }, { "epoch": 1.6539923954372624, "grad_norm": 0.3632122874259949, "learning_rate": 0.00013414758269720103, "loss": 0.2377, "step": 653 }, { "epoch": 1.6565272496831431, "grad_norm": 0.27961722016334534, "learning_rate": 0.00013404580152671756, "loss": 0.2299, "step": 654 }, { "epoch": 1.659062103929024, "grad_norm": 0.3043057918548584, "learning_rate": 0.0001339440203562341, "loss": 0.2321, "step": 655 }, { "epoch": 1.661596958174905, "grad_norm": 0.3421036899089813, "learning_rate": 0.00013384223918575064, "loss": 0.2492, "step": 656 }, { "epoch": 1.6641318124207858, "grad_norm": 0.39606526494026184, "learning_rate": 0.00013374045801526718, "loss": 0.3401, "step": 657 }, { "epoch": 1.6666666666666665, "grad_norm": 0.35081973671913147, "learning_rate": 0.00013363867684478372, "loss": 0.2175, "step": 658 }, { "epoch": 1.6692015209125475, "grad_norm": 0.420175701379776, "learning_rate": 0.00013353689567430026, "loss": 0.2813, "step": 659 }, { "epoch": 1.6717363751584284, "grad_norm": 0.24181438982486725, "learning_rate": 0.0001334351145038168, "loss": 0.219, "step": 660 }, { "epoch": 1.6742712294043094, "grad_norm": 0.6243584752082825, "learning_rate": 0.00013333333333333334, "loss": 0.3087, "step": 661 }, { "epoch": 1.6768060836501901, "grad_norm": 0.4036748707294464, "learning_rate": 0.00013323155216284988, "loss": 0.251, "step": 662 }, { "epoch": 1.6793409378960709, "grad_norm": 0.39555415511131287, "learning_rate": 0.00013312977099236642, "loss": 0.3279, "step": 663 }, { "epoch": 1.6818757921419518, "grad_norm": 0.4018571674823761, "learning_rate": 0.00013302798982188296, "loss": 0.2337, "step": 664 }, { "epoch": 1.6844106463878328, "grad_norm": 0.36354130506515503, "learning_rate": 0.0001329262086513995, "loss": 0.2503, "step": 665 }, { "epoch": 1.6869455006337135, "grad_norm": 0.32249706983566284, "learning_rate": 0.00013282442748091604, "loss": 0.27, "step": 666 }, { "epoch": 1.6894803548795945, "grad_norm": 0.33560654520988464, "learning_rate": 0.00013272264631043258, "loss": 0.203, "step": 667 }, { "epoch": 1.6920152091254752, "grad_norm": 0.39997267723083496, "learning_rate": 0.00013262086513994911, "loss": 0.2662, "step": 668 }, { "epoch": 1.6945500633713562, "grad_norm": 0.6739961504936218, "learning_rate": 0.00013251908396946565, "loss": 0.2803, "step": 669 }, { "epoch": 1.6970849176172371, "grad_norm": 0.5863606929779053, "learning_rate": 0.0001324173027989822, "loss": 0.351, "step": 670 }, { "epoch": 1.6996197718631179, "grad_norm": 0.4408819079399109, "learning_rate": 0.00013231552162849873, "loss": 0.1814, "step": 671 }, { "epoch": 1.7021546261089986, "grad_norm": 0.3341253697872162, "learning_rate": 0.00013221374045801527, "loss": 0.2156, "step": 672 }, { "epoch": 1.7046894803548795, "grad_norm": 0.3035176992416382, "learning_rate": 0.0001321119592875318, "loss": 0.2308, "step": 673 }, { "epoch": 1.7072243346007605, "grad_norm": 0.4395483136177063, "learning_rate": 0.00013201017811704835, "loss": 0.3418, "step": 674 }, { "epoch": 1.7097591888466415, "grad_norm": 0.22972792387008667, "learning_rate": 0.0001319083969465649, "loss": 0.1873, "step": 675 }, { "epoch": 1.7122940430925222, "grad_norm": 0.47378918528556824, "learning_rate": 0.00013180661577608143, "loss": 0.2514, "step": 676 }, { "epoch": 1.714828897338403, "grad_norm": 0.3947070240974426, "learning_rate": 0.00013170483460559797, "loss": 0.2289, "step": 677 }, { "epoch": 1.717363751584284, "grad_norm": 0.3789718747138977, "learning_rate": 0.0001316030534351145, "loss": 0.2476, "step": 678 }, { "epoch": 1.7198986058301649, "grad_norm": 0.4904823899269104, "learning_rate": 0.00013150127226463105, "loss": 0.2163, "step": 679 }, { "epoch": 1.7224334600760456, "grad_norm": 0.3285132646560669, "learning_rate": 0.0001313994910941476, "loss": 0.2786, "step": 680 }, { "epoch": 1.7249683143219265, "grad_norm": 0.4326847493648529, "learning_rate": 0.00013129770992366413, "loss": 0.2409, "step": 681 }, { "epoch": 1.7275031685678073, "grad_norm": 0.3819947838783264, "learning_rate": 0.00013119592875318067, "loss": 0.2076, "step": 682 }, { "epoch": 1.7300380228136882, "grad_norm": 0.4046533703804016, "learning_rate": 0.0001310941475826972, "loss": 0.2717, "step": 683 }, { "epoch": 1.7325728770595692, "grad_norm": 0.34681758284568787, "learning_rate": 0.00013099236641221374, "loss": 0.2389, "step": 684 }, { "epoch": 1.73510773130545, "grad_norm": 0.35155028104782104, "learning_rate": 0.00013089058524173028, "loss": 0.2407, "step": 685 }, { "epoch": 1.7376425855513307, "grad_norm": 0.3306678533554077, "learning_rate": 0.00013078880407124682, "loss": 0.2767, "step": 686 }, { "epoch": 1.7401774397972116, "grad_norm": 0.27715572714805603, "learning_rate": 0.00013068702290076336, "loss": 0.1955, "step": 687 }, { "epoch": 1.7427122940430926, "grad_norm": 0.3591010272502899, "learning_rate": 0.0001305852417302799, "loss": 0.2269, "step": 688 }, { "epoch": 1.7452471482889735, "grad_norm": 0.39104408025741577, "learning_rate": 0.00013048346055979644, "loss": 0.2392, "step": 689 }, { "epoch": 1.7477820025348543, "grad_norm": 0.44545605778694153, "learning_rate": 0.00013038167938931298, "loss": 0.2823, "step": 690 }, { "epoch": 1.750316856780735, "grad_norm": 0.29502785205841064, "learning_rate": 0.00013027989821882952, "loss": 0.1899, "step": 691 }, { "epoch": 1.752851711026616, "grad_norm": 0.40423381328582764, "learning_rate": 0.00013017811704834606, "loss": 0.2069, "step": 692 }, { "epoch": 1.755386565272497, "grad_norm": 0.38649502396583557, "learning_rate": 0.0001300763358778626, "loss": 0.1938, "step": 693 }, { "epoch": 1.7579214195183777, "grad_norm": 0.40014389157295227, "learning_rate": 0.00012997455470737914, "loss": 0.2825, "step": 694 }, { "epoch": 1.7604562737642584, "grad_norm": 0.4783387780189514, "learning_rate": 0.00012987277353689568, "loss": 0.2629, "step": 695 }, { "epoch": 1.7629911280101394, "grad_norm": 0.4938651919364929, "learning_rate": 0.00012977099236641222, "loss": 0.2976, "step": 696 }, { "epoch": 1.7655259822560203, "grad_norm": 0.32507607340812683, "learning_rate": 0.00012966921119592875, "loss": 0.2097, "step": 697 }, { "epoch": 1.7680608365019013, "grad_norm": 0.31158536672592163, "learning_rate": 0.0001295674300254453, "loss": 0.223, "step": 698 }, { "epoch": 1.770595690747782, "grad_norm": 0.5594013333320618, "learning_rate": 0.00012946564885496183, "loss": 0.3523, "step": 699 }, { "epoch": 1.7731305449936627, "grad_norm": 0.5820282697677612, "learning_rate": 0.00012936386768447837, "loss": 0.3181, "step": 700 }, { "epoch": 1.7756653992395437, "grad_norm": 0.3635233938694, "learning_rate": 0.0001292620865139949, "loss": 0.2387, "step": 701 }, { "epoch": 1.7782002534854247, "grad_norm": 0.3195054531097412, "learning_rate": 0.00012916030534351148, "loss": 0.2046, "step": 702 }, { "epoch": 1.7807351077313056, "grad_norm": 0.3483947217464447, "learning_rate": 0.000129058524173028, "loss": 0.2576, "step": 703 }, { "epoch": 1.7832699619771863, "grad_norm": 0.3419065475463867, "learning_rate": 0.00012895674300254456, "loss": 0.2361, "step": 704 }, { "epoch": 1.785804816223067, "grad_norm": 0.3142557442188263, "learning_rate": 0.00012885496183206107, "loss": 0.2172, "step": 705 }, { "epoch": 1.788339670468948, "grad_norm": 0.3502836227416992, "learning_rate": 0.0001287531806615776, "loss": 0.2621, "step": 706 }, { "epoch": 1.790874524714829, "grad_norm": 0.37896937131881714, "learning_rate": 0.00012865139949109415, "loss": 0.2374, "step": 707 }, { "epoch": 1.7934093789607097, "grad_norm": 0.3880506455898285, "learning_rate": 0.0001285496183206107, "loss": 0.2862, "step": 708 }, { "epoch": 1.7959442332065905, "grad_norm": 0.2648681700229645, "learning_rate": 0.00012844783715012723, "loss": 0.206, "step": 709 }, { "epoch": 1.7984790874524714, "grad_norm": 0.25072911381721497, "learning_rate": 0.00012834605597964377, "loss": 0.2123, "step": 710 }, { "epoch": 1.8010139416983524, "grad_norm": 0.3076663315296173, "learning_rate": 0.0001282442748091603, "loss": 0.2983, "step": 711 }, { "epoch": 1.8035487959442333, "grad_norm": 0.4219549000263214, "learning_rate": 0.00012814249363867684, "loss": 0.2213, "step": 712 }, { "epoch": 1.806083650190114, "grad_norm": 0.2831745445728302, "learning_rate": 0.00012804071246819338, "loss": 0.2062, "step": 713 }, { "epoch": 1.8086185044359948, "grad_norm": 0.4014468491077423, "learning_rate": 0.00012793893129770992, "loss": 0.2945, "step": 714 }, { "epoch": 1.8111533586818758, "grad_norm": 0.2980962097644806, "learning_rate": 0.0001278371501272265, "loss": 0.2179, "step": 715 }, { "epoch": 1.8136882129277567, "grad_norm": 0.2338070124387741, "learning_rate": 0.000127735368956743, "loss": 0.1664, "step": 716 }, { "epoch": 1.8162230671736375, "grad_norm": 0.6155439615249634, "learning_rate": 0.00012763358778625957, "loss": 0.3429, "step": 717 }, { "epoch": 1.8187579214195184, "grad_norm": 0.46969589591026306, "learning_rate": 0.00012753180661577608, "loss": 0.2584, "step": 718 }, { "epoch": 1.8212927756653992, "grad_norm": 0.5578194260597229, "learning_rate": 0.00012743002544529265, "loss": 0.2695, "step": 719 }, { "epoch": 1.8238276299112801, "grad_norm": 0.34903043508529663, "learning_rate": 0.00012732824427480916, "loss": 0.2119, "step": 720 }, { "epoch": 1.826362484157161, "grad_norm": 0.3990432322025299, "learning_rate": 0.0001272264631043257, "loss": 0.2487, "step": 721 }, { "epoch": 1.8288973384030418, "grad_norm": 0.3382611572742462, "learning_rate": 0.00012712468193384224, "loss": 0.2313, "step": 722 }, { "epoch": 1.8314321926489225, "grad_norm": 0.30938395857810974, "learning_rate": 0.00012702290076335878, "loss": 0.2113, "step": 723 }, { "epoch": 1.8339670468948035, "grad_norm": 0.39266690611839294, "learning_rate": 0.00012692111959287532, "loss": 0.2609, "step": 724 }, { "epoch": 1.8365019011406845, "grad_norm": 0.4396655261516571, "learning_rate": 0.00012681933842239186, "loss": 0.2518, "step": 725 }, { "epoch": 1.8390367553865654, "grad_norm": 0.4134500324726105, "learning_rate": 0.0001267175572519084, "loss": 0.3317, "step": 726 }, { "epoch": 1.8415716096324461, "grad_norm": 0.29644638299942017, "learning_rate": 0.00012661577608142493, "loss": 0.1912, "step": 727 }, { "epoch": 1.8441064638783269, "grad_norm": 0.3661201596260071, "learning_rate": 0.0001265139949109415, "loss": 0.2911, "step": 728 }, { "epoch": 1.8466413181242078, "grad_norm": 0.4504169225692749, "learning_rate": 0.000126412213740458, "loss": 0.3409, "step": 729 }, { "epoch": 1.8491761723700888, "grad_norm": 0.28516069054603577, "learning_rate": 0.00012631043256997458, "loss": 0.254, "step": 730 }, { "epoch": 1.8517110266159695, "grad_norm": 0.33754590153694153, "learning_rate": 0.0001262086513994911, "loss": 0.2275, "step": 731 }, { "epoch": 1.8542458808618505, "grad_norm": 0.26562589406967163, "learning_rate": 0.00012610687022900766, "loss": 0.1979, "step": 732 }, { "epoch": 1.8567807351077312, "grad_norm": 0.3081592321395874, "learning_rate": 0.00012600508905852417, "loss": 0.2099, "step": 733 }, { "epoch": 1.8593155893536122, "grad_norm": 0.34866124391555786, "learning_rate": 0.0001259033078880407, "loss": 0.3038, "step": 734 }, { "epoch": 1.8618504435994931, "grad_norm": 0.2867881953716278, "learning_rate": 0.00012580152671755725, "loss": 0.2225, "step": 735 }, { "epoch": 1.8643852978453739, "grad_norm": 0.2374526560306549, "learning_rate": 0.0001256997455470738, "loss": 0.1945, "step": 736 }, { "epoch": 1.8669201520912546, "grad_norm": 0.3072168827056885, "learning_rate": 0.00012559796437659033, "loss": 0.2135, "step": 737 }, { "epoch": 1.8694550063371356, "grad_norm": 0.36897239089012146, "learning_rate": 0.00012549618320610687, "loss": 0.3225, "step": 738 }, { "epoch": 1.8719898605830165, "grad_norm": 0.3114832937717438, "learning_rate": 0.00012539440203562343, "loss": 0.2064, "step": 739 }, { "epoch": 1.8745247148288975, "grad_norm": 0.40082940459251404, "learning_rate": 0.00012529262086513995, "loss": 0.2145, "step": 740 }, { "epoch": 1.8770595690747782, "grad_norm": 0.28362375497817993, "learning_rate": 0.0001251908396946565, "loss": 0.2044, "step": 741 }, { "epoch": 1.879594423320659, "grad_norm": 0.2738857567310333, "learning_rate": 0.00012508905852417302, "loss": 0.1852, "step": 742 }, { "epoch": 1.88212927756654, "grad_norm": 0.37283095717430115, "learning_rate": 0.0001249872773536896, "loss": 0.248, "step": 743 }, { "epoch": 1.8846641318124209, "grad_norm": 0.3065252900123596, "learning_rate": 0.0001248854961832061, "loss": 0.2028, "step": 744 }, { "epoch": 1.8871989860583016, "grad_norm": 0.2891787588596344, "learning_rate": 0.00012478371501272267, "loss": 0.1977, "step": 745 }, { "epoch": 1.8897338403041823, "grad_norm": 0.5002029538154602, "learning_rate": 0.00012468193384223918, "loss": 0.2731, "step": 746 }, { "epoch": 1.8922686945500633, "grad_norm": 0.34734681248664856, "learning_rate": 0.00012458015267175575, "loss": 0.2236, "step": 747 }, { "epoch": 1.8948035487959443, "grad_norm": 0.4372716248035431, "learning_rate": 0.00012447837150127226, "loss": 0.3787, "step": 748 }, { "epoch": 1.8973384030418252, "grad_norm": 0.41203773021698, "learning_rate": 0.0001243765903307888, "loss": 0.2385, "step": 749 }, { "epoch": 1.899873257287706, "grad_norm": 0.28231269121170044, "learning_rate": 0.00012427480916030534, "loss": 0.1966, "step": 750 }, { "epoch": 1.9024081115335867, "grad_norm": 0.3689015209674835, "learning_rate": 0.00012417302798982188, "loss": 0.2266, "step": 751 }, { "epoch": 1.9049429657794676, "grad_norm": 0.35862621665000916, "learning_rate": 0.00012407124681933844, "loss": 0.2226, "step": 752 }, { "epoch": 1.9074778200253486, "grad_norm": 0.27552056312561035, "learning_rate": 0.00012396946564885496, "loss": 0.2049, "step": 753 }, { "epoch": 1.9100126742712296, "grad_norm": 0.3665705919265747, "learning_rate": 0.00012386768447837152, "loss": 0.2262, "step": 754 }, { "epoch": 1.9125475285171103, "grad_norm": 0.37812677025794983, "learning_rate": 0.00012376590330788803, "loss": 0.2561, "step": 755 }, { "epoch": 1.915082382762991, "grad_norm": 0.34638741612434387, "learning_rate": 0.0001236641221374046, "loss": 0.2152, "step": 756 }, { "epoch": 1.917617237008872, "grad_norm": 0.3499183654785156, "learning_rate": 0.00012356234096692111, "loss": 0.2823, "step": 757 }, { "epoch": 1.920152091254753, "grad_norm": 0.3274863362312317, "learning_rate": 0.00012346055979643768, "loss": 0.202, "step": 758 }, { "epoch": 1.9226869455006337, "grad_norm": 0.4568060338497162, "learning_rate": 0.0001233587786259542, "loss": 0.3531, "step": 759 }, { "epoch": 1.9252217997465144, "grad_norm": 0.3351891040802002, "learning_rate": 0.00012325699745547076, "loss": 0.3491, "step": 760 }, { "epoch": 1.9277566539923954, "grad_norm": 0.3045225739479065, "learning_rate": 0.00012315521628498727, "loss": 0.2412, "step": 761 }, { "epoch": 1.9302915082382763, "grad_norm": 0.4453962445259094, "learning_rate": 0.0001230534351145038, "loss": 0.485, "step": 762 }, { "epoch": 1.9328263624841573, "grad_norm": 0.4568649232387543, "learning_rate": 0.00012295165394402038, "loss": 0.4203, "step": 763 }, { "epoch": 1.935361216730038, "grad_norm": 0.33376067876815796, "learning_rate": 0.0001228498727735369, "loss": 0.2287, "step": 764 }, { "epoch": 1.9378960709759188, "grad_norm": 0.2670106887817383, "learning_rate": 0.00012274809160305346, "loss": 0.2265, "step": 765 }, { "epoch": 1.9404309252217997, "grad_norm": 0.25930914282798767, "learning_rate": 0.00012264631043256997, "loss": 0.2661, "step": 766 }, { "epoch": 1.9429657794676807, "grad_norm": 0.22364859282970428, "learning_rate": 0.00012254452926208653, "loss": 0.1938, "step": 767 }, { "epoch": 1.9455006337135616, "grad_norm": 0.4107860028743744, "learning_rate": 0.00012244274809160305, "loss": 0.3227, "step": 768 }, { "epoch": 1.9480354879594424, "grad_norm": 0.24454613029956818, "learning_rate": 0.0001223409669211196, "loss": 0.2813, "step": 769 }, { "epoch": 1.950570342205323, "grad_norm": 0.28310418128967285, "learning_rate": 0.00012223918575063612, "loss": 0.2065, "step": 770 }, { "epoch": 1.953105196451204, "grad_norm": 0.28080177307128906, "learning_rate": 0.0001221374045801527, "loss": 0.1941, "step": 771 }, { "epoch": 1.955640050697085, "grad_norm": 0.365400105714798, "learning_rate": 0.0001220356234096692, "loss": 0.2657, "step": 772 }, { "epoch": 1.9581749049429658, "grad_norm": 0.3115444779396057, "learning_rate": 0.00012193384223918576, "loss": 0.2117, "step": 773 }, { "epoch": 1.9607097591888465, "grad_norm": 0.30900898575782776, "learning_rate": 0.00012183206106870228, "loss": 0.2563, "step": 774 }, { "epoch": 1.9632446134347274, "grad_norm": 0.341789573431015, "learning_rate": 0.00012173027989821883, "loss": 0.2396, "step": 775 }, { "epoch": 1.9657794676806084, "grad_norm": 0.39556756615638733, "learning_rate": 0.00012162849872773539, "loss": 0.2203, "step": 776 }, { "epoch": 1.9683143219264894, "grad_norm": 0.4282820224761963, "learning_rate": 0.00012152671755725191, "loss": 0.2476, "step": 777 }, { "epoch": 1.97084917617237, "grad_norm": 0.3683648109436035, "learning_rate": 0.00012142493638676847, "loss": 0.2414, "step": 778 }, { "epoch": 1.9733840304182508, "grad_norm": 0.19751296937465668, "learning_rate": 0.00012132315521628499, "loss": 0.1622, "step": 779 }, { "epoch": 1.9759188846641318, "grad_norm": 0.4522268772125244, "learning_rate": 0.00012122137404580154, "loss": 0.3372, "step": 780 }, { "epoch": 1.9784537389100127, "grad_norm": 0.3386411666870117, "learning_rate": 0.00012111959287531807, "loss": 0.1966, "step": 781 }, { "epoch": 1.9809885931558935, "grad_norm": 0.3266599178314209, "learning_rate": 0.00012101781170483461, "loss": 0.2507, "step": 782 }, { "epoch": 1.9835234474017744, "grad_norm": 0.395271897315979, "learning_rate": 0.00012091603053435115, "loss": 0.2626, "step": 783 }, { "epoch": 1.9860583016476552, "grad_norm": 0.23269407451152802, "learning_rate": 0.00012081424936386769, "loss": 0.1806, "step": 784 }, { "epoch": 1.9885931558935361, "grad_norm": 0.3929823040962219, "learning_rate": 0.00012071246819338421, "loss": 0.2912, "step": 785 }, { "epoch": 1.991128010139417, "grad_norm": 0.2597116529941559, "learning_rate": 0.00012061068702290077, "loss": 0.1918, "step": 786 }, { "epoch": 1.9936628643852978, "grad_norm": 0.44690757989883423, "learning_rate": 0.00012050890585241729, "loss": 0.2644, "step": 787 }, { "epoch": 1.9961977186311786, "grad_norm": 0.4133460819721222, "learning_rate": 0.00012040712468193385, "loss": 0.2541, "step": 788 }, { "epoch": 1.9987325728770595, "grad_norm": 0.33399301767349243, "learning_rate": 0.0001203053435114504, "loss": 0.2778, "step": 789 }, { "epoch": 2.0, "grad_norm": 0.6268282532691956, "learning_rate": 0.00012020356234096692, "loss": 0.3105, "step": 790 }, { "epoch": 2.002534854245881, "grad_norm": 0.38419365882873535, "learning_rate": 0.00012010178117048348, "loss": 0.2352, "step": 791 }, { "epoch": 2.005069708491762, "grad_norm": 0.30469566583633423, "learning_rate": 0.00012, "loss": 0.2011, "step": 792 }, { "epoch": 2.0076045627376424, "grad_norm": 0.36411482095718384, "learning_rate": 0.00011989821882951656, "loss": 0.2324, "step": 793 }, { "epoch": 2.0101394169835234, "grad_norm": 0.40986311435699463, "learning_rate": 0.00011979643765903308, "loss": 0.2217, "step": 794 }, { "epoch": 2.0126742712294043, "grad_norm": 0.46682968735694885, "learning_rate": 0.00011969465648854963, "loss": 0.2688, "step": 795 }, { "epoch": 2.0152091254752853, "grad_norm": 0.31846344470977783, "learning_rate": 0.00011959287531806616, "loss": 0.1984, "step": 796 }, { "epoch": 2.017743979721166, "grad_norm": 0.48346126079559326, "learning_rate": 0.0001194910941475827, "loss": 0.2404, "step": 797 }, { "epoch": 2.0202788339670468, "grad_norm": 0.5090253949165344, "learning_rate": 0.00011938931297709924, "loss": 0.2363, "step": 798 }, { "epoch": 2.0228136882129277, "grad_norm": 0.4886679947376251, "learning_rate": 0.00011928753180661578, "loss": 0.2656, "step": 799 }, { "epoch": 2.0253485424588087, "grad_norm": 0.5652650594711304, "learning_rate": 0.00011918575063613233, "loss": 0.2444, "step": 800 }, { "epoch": 2.0278833967046896, "grad_norm": 0.7158893346786499, "learning_rate": 0.00011908396946564886, "loss": 0.2362, "step": 801 }, { "epoch": 2.03041825095057, "grad_norm": 0.5168672800064087, "learning_rate": 0.00011898218829516541, "loss": 0.2067, "step": 802 }, { "epoch": 2.032953105196451, "grad_norm": 0.7243991494178772, "learning_rate": 0.00011888040712468194, "loss": 0.2458, "step": 803 }, { "epoch": 2.035487959442332, "grad_norm": 0.4199936091899872, "learning_rate": 0.00011877862595419849, "loss": 0.2009, "step": 804 }, { "epoch": 2.038022813688213, "grad_norm": 0.41791805624961853, "learning_rate": 0.00011867684478371501, "loss": 0.2325, "step": 805 }, { "epoch": 2.040557667934094, "grad_norm": 0.6389465928077698, "learning_rate": 0.00011857506361323157, "loss": 0.2636, "step": 806 }, { "epoch": 2.0430925221799745, "grad_norm": 0.6254114508628845, "learning_rate": 0.00011847328244274809, "loss": 0.2292, "step": 807 }, { "epoch": 2.0456273764258555, "grad_norm": 0.8436942100524902, "learning_rate": 0.00011837150127226465, "loss": 0.2913, "step": 808 }, { "epoch": 2.0481622306717364, "grad_norm": 0.42698097229003906, "learning_rate": 0.00011826972010178117, "loss": 0.2107, "step": 809 }, { "epoch": 2.0506970849176174, "grad_norm": 0.432607501745224, "learning_rate": 0.00011816793893129771, "loss": 0.1851, "step": 810 }, { "epoch": 2.053231939163498, "grad_norm": 0.48241573572158813, "learning_rate": 0.00011806615776081425, "loss": 0.2333, "step": 811 }, { "epoch": 2.055766793409379, "grad_norm": 0.3920150101184845, "learning_rate": 0.00011796437659033079, "loss": 0.2256, "step": 812 }, { "epoch": 2.05830164765526, "grad_norm": 0.3601329028606415, "learning_rate": 0.00011786259541984734, "loss": 0.2428, "step": 813 }, { "epoch": 2.0608365019011408, "grad_norm": 0.428524911403656, "learning_rate": 0.00011776081424936387, "loss": 0.3109, "step": 814 }, { "epoch": 2.0633713561470217, "grad_norm": 0.22846737504005432, "learning_rate": 0.00011765903307888042, "loss": 0.1715, "step": 815 }, { "epoch": 2.0659062103929022, "grad_norm": 0.3656214475631714, "learning_rate": 0.00011755725190839695, "loss": 0.2211, "step": 816 }, { "epoch": 2.068441064638783, "grad_norm": 0.2633965015411377, "learning_rate": 0.0001174554707379135, "loss": 0.1933, "step": 817 }, { "epoch": 2.070975918884664, "grad_norm": 0.4318942129611969, "learning_rate": 0.00011735368956743003, "loss": 0.2829, "step": 818 }, { "epoch": 2.073510773130545, "grad_norm": 0.2643216848373413, "learning_rate": 0.00011725190839694658, "loss": 0.1938, "step": 819 }, { "epoch": 2.076045627376426, "grad_norm": 0.4560074508190155, "learning_rate": 0.0001171501272264631, "loss": 0.3017, "step": 820 }, { "epoch": 2.0785804816223066, "grad_norm": 0.380374550819397, "learning_rate": 0.00011704834605597966, "loss": 0.2141, "step": 821 }, { "epoch": 2.0811153358681875, "grad_norm": 0.321417897939682, "learning_rate": 0.00011694656488549618, "loss": 0.2058, "step": 822 }, { "epoch": 2.0836501901140685, "grad_norm": 0.350496768951416, "learning_rate": 0.00011684478371501274, "loss": 0.1761, "step": 823 }, { "epoch": 2.0861850443599494, "grad_norm": 0.35794898867607117, "learning_rate": 0.00011674300254452927, "loss": 0.2016, "step": 824 }, { "epoch": 2.08871989860583, "grad_norm": 0.37890860438346863, "learning_rate": 0.0001166412213740458, "loss": 0.253, "step": 825 }, { "epoch": 2.091254752851711, "grad_norm": 0.41833457350730896, "learning_rate": 0.00011653944020356235, "loss": 0.2012, "step": 826 }, { "epoch": 2.093789607097592, "grad_norm": 0.49572086334228516, "learning_rate": 0.00011643765903307888, "loss": 0.214, "step": 827 }, { "epoch": 2.096324461343473, "grad_norm": 0.44266751408576965, "learning_rate": 0.00011633587786259543, "loss": 0.2496, "step": 828 }, { "epoch": 2.098859315589354, "grad_norm": 0.7018102407455444, "learning_rate": 0.00011623409669211196, "loss": 0.3996, "step": 829 }, { "epoch": 2.1013941698352343, "grad_norm": 0.42781826853752136, "learning_rate": 0.00011613231552162851, "loss": 0.2325, "step": 830 }, { "epoch": 2.1039290240811153, "grad_norm": 0.35814788937568665, "learning_rate": 0.00011603053435114504, "loss": 0.2003, "step": 831 }, { "epoch": 2.106463878326996, "grad_norm": 0.2381380945444107, "learning_rate": 0.00011592875318066159, "loss": 0.1791, "step": 832 }, { "epoch": 2.108998732572877, "grad_norm": 0.3152197003364563, "learning_rate": 0.00011582697201017811, "loss": 0.1802, "step": 833 }, { "epoch": 2.111533586818758, "grad_norm": 0.3493264615535736, "learning_rate": 0.00011572519083969467, "loss": 0.173, "step": 834 }, { "epoch": 2.1140684410646386, "grad_norm": 0.339036762714386, "learning_rate": 0.0001156234096692112, "loss": 0.1875, "step": 835 }, { "epoch": 2.1166032953105196, "grad_norm": 0.3622972369194031, "learning_rate": 0.00011552162849872775, "loss": 0.1892, "step": 836 }, { "epoch": 2.1191381495564006, "grad_norm": 0.7021862268447876, "learning_rate": 0.00011541984732824429, "loss": 0.272, "step": 837 }, { "epoch": 2.1216730038022815, "grad_norm": 0.4027453064918518, "learning_rate": 0.00011531806615776081, "loss": 0.2296, "step": 838 }, { "epoch": 2.124207858048162, "grad_norm": 0.3509223163127899, "learning_rate": 0.00011521628498727736, "loss": 0.1812, "step": 839 }, { "epoch": 2.126742712294043, "grad_norm": 0.4156752824783325, "learning_rate": 0.00011511450381679389, "loss": 0.2444, "step": 840 }, { "epoch": 2.129277566539924, "grad_norm": 0.3596971035003662, "learning_rate": 0.00011501272264631044, "loss": 0.1944, "step": 841 }, { "epoch": 2.131812420785805, "grad_norm": 0.4088239371776581, "learning_rate": 0.00011491094147582697, "loss": 0.1892, "step": 842 }, { "epoch": 2.134347275031686, "grad_norm": 0.3603368103504181, "learning_rate": 0.00011480916030534352, "loss": 0.1955, "step": 843 }, { "epoch": 2.1368821292775664, "grad_norm": 0.3702489733695984, "learning_rate": 0.00011470737913486005, "loss": 0.2401, "step": 844 }, { "epoch": 2.1394169835234473, "grad_norm": 0.427312433719635, "learning_rate": 0.0001146055979643766, "loss": 0.2097, "step": 845 }, { "epoch": 2.1419518377693283, "grad_norm": 0.34239426255226135, "learning_rate": 0.00011450381679389313, "loss": 0.2055, "step": 846 }, { "epoch": 2.1444866920152093, "grad_norm": 0.522627055644989, "learning_rate": 0.00011440203562340968, "loss": 0.2206, "step": 847 }, { "epoch": 2.14702154626109, "grad_norm": 0.5005999207496643, "learning_rate": 0.0001143002544529262, "loss": 0.2187, "step": 848 }, { "epoch": 2.1495564005069707, "grad_norm": 0.4834093451499939, "learning_rate": 0.00011419847328244276, "loss": 0.2616, "step": 849 }, { "epoch": 2.1520912547528517, "grad_norm": 0.3305776119232178, "learning_rate": 0.0001140966921119593, "loss": 0.2193, "step": 850 }, { "epoch": 2.1546261089987326, "grad_norm": 0.3691657781600952, "learning_rate": 0.00011399491094147584, "loss": 0.2343, "step": 851 }, { "epoch": 2.1571609632446136, "grad_norm": 0.4711242914199829, "learning_rate": 0.00011389312977099238, "loss": 0.2961, "step": 852 }, { "epoch": 2.159695817490494, "grad_norm": 0.4091726839542389, "learning_rate": 0.0001137913486005089, "loss": 0.2735, "step": 853 }, { "epoch": 2.162230671736375, "grad_norm": 0.28634020686149597, "learning_rate": 0.00011368956743002545, "loss": 0.2026, "step": 854 }, { "epoch": 2.164765525982256, "grad_norm": 0.3120497763156891, "learning_rate": 0.00011358778625954198, "loss": 0.1826, "step": 855 }, { "epoch": 2.167300380228137, "grad_norm": 0.3803773522377014, "learning_rate": 0.00011348600508905853, "loss": 0.2206, "step": 856 }, { "epoch": 2.169835234474018, "grad_norm": 0.4069412648677826, "learning_rate": 0.00011338422391857506, "loss": 0.23, "step": 857 }, { "epoch": 2.1723700887198985, "grad_norm": 0.31032097339630127, "learning_rate": 0.00011328244274809161, "loss": 0.1774, "step": 858 }, { "epoch": 2.1749049429657794, "grad_norm": 0.3429819941520691, "learning_rate": 0.00011318066157760814, "loss": 0.207, "step": 859 }, { "epoch": 2.1774397972116604, "grad_norm": 0.32155394554138184, "learning_rate": 0.00011307888040712469, "loss": 0.1817, "step": 860 }, { "epoch": 2.1799746514575413, "grad_norm": 0.3859189450740814, "learning_rate": 0.00011297709923664124, "loss": 0.205, "step": 861 }, { "epoch": 2.182509505703422, "grad_norm": 0.33794042468070984, "learning_rate": 0.00011287531806615777, "loss": 0.2002, "step": 862 }, { "epoch": 2.185044359949303, "grad_norm": 0.38762131333351135, "learning_rate": 0.00011277353689567431, "loss": 0.206, "step": 863 }, { "epoch": 2.1875792141951838, "grad_norm": 0.35734203457832336, "learning_rate": 0.00011267175572519085, "loss": 0.2332, "step": 864 }, { "epoch": 2.1901140684410647, "grad_norm": 0.32456931471824646, "learning_rate": 0.00011256997455470739, "loss": 0.1873, "step": 865 }, { "epoch": 2.1926489226869457, "grad_norm": 0.5198532938957214, "learning_rate": 0.00011246819338422391, "loss": 0.2408, "step": 866 }, { "epoch": 2.195183776932826, "grad_norm": 0.3863469362258911, "learning_rate": 0.00011236641221374046, "loss": 0.1778, "step": 867 }, { "epoch": 2.197718631178707, "grad_norm": 0.39902037382125854, "learning_rate": 0.00011226463104325699, "loss": 0.1982, "step": 868 }, { "epoch": 2.200253485424588, "grad_norm": 0.3974783718585968, "learning_rate": 0.00011216284987277354, "loss": 0.2157, "step": 869 }, { "epoch": 2.202788339670469, "grad_norm": 0.33785662055015564, "learning_rate": 0.00011206106870229007, "loss": 0.2152, "step": 870 }, { "epoch": 2.20532319391635, "grad_norm": 0.4233367145061493, "learning_rate": 0.00011195928753180662, "loss": 0.2992, "step": 871 }, { "epoch": 2.2078580481622305, "grad_norm": 0.37665534019470215, "learning_rate": 0.00011185750636132315, "loss": 0.2273, "step": 872 }, { "epoch": 2.2103929024081115, "grad_norm": 0.3841243088245392, "learning_rate": 0.0001117557251908397, "loss": 0.1991, "step": 873 }, { "epoch": 2.2129277566539924, "grad_norm": 0.3544892966747284, "learning_rate": 0.00011165394402035625, "loss": 0.2098, "step": 874 }, { "epoch": 2.2154626108998734, "grad_norm": 0.43662142753601074, "learning_rate": 0.00011155216284987278, "loss": 0.2411, "step": 875 }, { "epoch": 2.2179974651457544, "grad_norm": 0.3305199146270752, "learning_rate": 0.00011145038167938933, "loss": 0.1803, "step": 876 }, { "epoch": 2.220532319391635, "grad_norm": 0.34674328565597534, "learning_rate": 0.00011134860050890586, "loss": 0.2206, "step": 877 }, { "epoch": 2.223067173637516, "grad_norm": 0.39985305070877075, "learning_rate": 0.0001112468193384224, "loss": 0.2951, "step": 878 }, { "epoch": 2.225602027883397, "grad_norm": 0.36231693625450134, "learning_rate": 0.00011114503816793894, "loss": 0.2601, "step": 879 }, { "epoch": 2.2281368821292777, "grad_norm": 0.4199659526348114, "learning_rate": 0.00011104325699745548, "loss": 0.2719, "step": 880 }, { "epoch": 2.2306717363751583, "grad_norm": 0.3472574055194855, "learning_rate": 0.000110941475826972, "loss": 0.2437, "step": 881 }, { "epoch": 2.233206590621039, "grad_norm": 0.2765200436115265, "learning_rate": 0.00011083969465648855, "loss": 0.1983, "step": 882 }, { "epoch": 2.23574144486692, "grad_norm": 0.4466260075569153, "learning_rate": 0.00011073791348600508, "loss": 0.2323, "step": 883 }, { "epoch": 2.238276299112801, "grad_norm": 0.43661364912986755, "learning_rate": 0.00011063613231552163, "loss": 0.2957, "step": 884 }, { "epoch": 2.240811153358682, "grad_norm": 0.3262166976928711, "learning_rate": 0.00011053435114503819, "loss": 0.195, "step": 885 }, { "epoch": 2.2433460076045626, "grad_norm": 0.5085666179656982, "learning_rate": 0.00011043256997455471, "loss": 0.3349, "step": 886 }, { "epoch": 2.2458808618504436, "grad_norm": 0.46551409363746643, "learning_rate": 0.00011033078880407126, "loss": 0.3318, "step": 887 }, { "epoch": 2.2484157160963245, "grad_norm": 0.425530344247818, "learning_rate": 0.00011022900763358779, "loss": 0.2857, "step": 888 }, { "epoch": 2.2509505703422055, "grad_norm": 0.3377918601036072, "learning_rate": 0.00011012722646310434, "loss": 0.2215, "step": 889 }, { "epoch": 2.253485424588086, "grad_norm": 0.3491476774215698, "learning_rate": 0.00011002544529262087, "loss": 0.2471, "step": 890 }, { "epoch": 2.256020278833967, "grad_norm": 0.3779531419277191, "learning_rate": 0.00010992366412213742, "loss": 0.1984, "step": 891 }, { "epoch": 2.258555133079848, "grad_norm": 0.425077885389328, "learning_rate": 0.00010982188295165395, "loss": 0.2535, "step": 892 }, { "epoch": 2.261089987325729, "grad_norm": 0.40296900272369385, "learning_rate": 0.00010972010178117049, "loss": 0.1955, "step": 893 }, { "epoch": 2.26362484157161, "grad_norm": 0.4394761919975281, "learning_rate": 0.00010961832061068703, "loss": 0.2638, "step": 894 }, { "epoch": 2.2661596958174903, "grad_norm": 0.4743111729621887, "learning_rate": 0.00010951653944020357, "loss": 0.1932, "step": 895 }, { "epoch": 2.2686945500633713, "grad_norm": 0.5121330618858337, "learning_rate": 0.00010941475826972009, "loss": 0.2541, "step": 896 }, { "epoch": 2.2712294043092522, "grad_norm": 0.2810382544994354, "learning_rate": 0.00010931297709923664, "loss": 0.1884, "step": 897 }, { "epoch": 2.273764258555133, "grad_norm": 0.3637334108352661, "learning_rate": 0.0001092111959287532, "loss": 0.2208, "step": 898 }, { "epoch": 2.2762991128010137, "grad_norm": 0.4116186201572418, "learning_rate": 0.00010910941475826972, "loss": 0.1898, "step": 899 }, { "epoch": 2.2788339670468947, "grad_norm": 0.4166296720504761, "learning_rate": 0.00010900763358778628, "loss": 0.2399, "step": 900 }, { "epoch": 2.2813688212927756, "grad_norm": 0.5998784303665161, "learning_rate": 0.0001089058524173028, "loss": 0.2926, "step": 901 }, { "epoch": 2.2839036755386566, "grad_norm": 0.6252371668815613, "learning_rate": 0.00010880407124681935, "loss": 0.2392, "step": 902 }, { "epoch": 2.2864385297845375, "grad_norm": 0.4495537579059601, "learning_rate": 0.00010870229007633588, "loss": 0.2142, "step": 903 }, { "epoch": 2.288973384030418, "grad_norm": 0.5659827589988708, "learning_rate": 0.00010860050890585243, "loss": 0.2993, "step": 904 }, { "epoch": 2.291508238276299, "grad_norm": 0.4290786385536194, "learning_rate": 0.00010849872773536896, "loss": 0.3127, "step": 905 }, { "epoch": 2.29404309252218, "grad_norm": 0.3835826516151428, "learning_rate": 0.0001083969465648855, "loss": 0.1927, "step": 906 }, { "epoch": 2.296577946768061, "grad_norm": 0.4915788769721985, "learning_rate": 0.00010829516539440204, "loss": 0.2553, "step": 907 }, { "epoch": 2.299112801013942, "grad_norm": 0.42122524976730347, "learning_rate": 0.00010819338422391858, "loss": 0.2133, "step": 908 }, { "epoch": 2.3016476552598224, "grad_norm": 0.3904586732387543, "learning_rate": 0.0001080916030534351, "loss": 0.2064, "step": 909 }, { "epoch": 2.3041825095057034, "grad_norm": 0.3680777847766876, "learning_rate": 0.00010798982188295166, "loss": 0.1989, "step": 910 }, { "epoch": 2.3067173637515843, "grad_norm": 0.44054466485977173, "learning_rate": 0.00010788804071246821, "loss": 0.2386, "step": 911 }, { "epoch": 2.3092522179974653, "grad_norm": 0.28730717301368713, "learning_rate": 0.00010778625954198473, "loss": 0.175, "step": 912 }, { "epoch": 2.3117870722433462, "grad_norm": 0.4209315776824951, "learning_rate": 0.00010768447837150129, "loss": 0.2197, "step": 913 }, { "epoch": 2.3143219264892267, "grad_norm": 0.41457393765449524, "learning_rate": 0.00010758269720101781, "loss": 0.202, "step": 914 }, { "epoch": 2.3168567807351077, "grad_norm": 0.40807071328163147, "learning_rate": 0.00010748091603053437, "loss": 0.3087, "step": 915 }, { "epoch": 2.3193916349809887, "grad_norm": 0.42118731141090393, "learning_rate": 0.00010737913486005089, "loss": 0.2269, "step": 916 }, { "epoch": 2.3219264892268696, "grad_norm": 0.3436257541179657, "learning_rate": 0.00010727735368956744, "loss": 0.1987, "step": 917 }, { "epoch": 2.32446134347275, "grad_norm": 0.3721463978290558, "learning_rate": 0.00010717557251908397, "loss": 0.2081, "step": 918 }, { "epoch": 2.326996197718631, "grad_norm": 0.45050719380378723, "learning_rate": 0.00010707379134860052, "loss": 0.2199, "step": 919 }, { "epoch": 2.329531051964512, "grad_norm": 0.42665717005729675, "learning_rate": 0.00010697201017811705, "loss": 0.2176, "step": 920 }, { "epoch": 2.332065906210393, "grad_norm": 0.35217922925949097, "learning_rate": 0.00010687022900763359, "loss": 0.1915, "step": 921 }, { "epoch": 2.334600760456274, "grad_norm": 0.5407602190971375, "learning_rate": 0.00010676844783715014, "loss": 0.2309, "step": 922 }, { "epoch": 2.3371356147021545, "grad_norm": 0.6984291076660156, "learning_rate": 0.00010666666666666667, "loss": 0.2779, "step": 923 }, { "epoch": 2.3396704689480354, "grad_norm": 0.5333911776542664, "learning_rate": 0.00010656488549618322, "loss": 0.2659, "step": 924 }, { "epoch": 2.3422053231939164, "grad_norm": 0.5130952596664429, "learning_rate": 0.00010646310432569974, "loss": 0.315, "step": 925 }, { "epoch": 2.3447401774397973, "grad_norm": 0.3874262869358063, "learning_rate": 0.0001063613231552163, "loss": 0.294, "step": 926 }, { "epoch": 2.347275031685678, "grad_norm": 0.37864431738853455, "learning_rate": 0.00010625954198473282, "loss": 0.1894, "step": 927 }, { "epoch": 2.349809885931559, "grad_norm": 0.406448632478714, "learning_rate": 0.00010615776081424938, "loss": 0.1913, "step": 928 }, { "epoch": 2.3523447401774398, "grad_norm": 0.4278213381767273, "learning_rate": 0.0001060559796437659, "loss": 0.2136, "step": 929 }, { "epoch": 2.3548795944233207, "grad_norm": 0.3853738009929657, "learning_rate": 0.00010595419847328246, "loss": 0.213, "step": 930 }, { "epoch": 2.3574144486692017, "grad_norm": 0.3785664737224579, "learning_rate": 0.00010585241730279898, "loss": 0.22, "step": 931 }, { "epoch": 2.359949302915082, "grad_norm": 0.5863676071166992, "learning_rate": 0.00010575063613231553, "loss": 0.2305, "step": 932 }, { "epoch": 2.362484157160963, "grad_norm": 0.36629414558410645, "learning_rate": 0.00010564885496183206, "loss": 0.2041, "step": 933 }, { "epoch": 2.365019011406844, "grad_norm": 0.44699156284332275, "learning_rate": 0.0001055470737913486, "loss": 0.2763, "step": 934 }, { "epoch": 2.367553865652725, "grad_norm": 0.4775685667991638, "learning_rate": 0.00010544529262086515, "loss": 0.2779, "step": 935 }, { "epoch": 2.3700887198986056, "grad_norm": 0.3192265033721924, "learning_rate": 0.00010534351145038168, "loss": 0.1861, "step": 936 }, { "epoch": 2.3726235741444865, "grad_norm": 0.3589562177658081, "learning_rate": 0.00010524173027989823, "loss": 0.2266, "step": 937 }, { "epoch": 2.3751584283903675, "grad_norm": 0.36193573474884033, "learning_rate": 0.00010513994910941476, "loss": 0.2105, "step": 938 }, { "epoch": 2.3776932826362485, "grad_norm": 0.4141902029514313, "learning_rate": 0.00010503816793893131, "loss": 0.2676, "step": 939 }, { "epoch": 2.3802281368821294, "grad_norm": 0.3118525445461273, "learning_rate": 0.00010493638676844783, "loss": 0.1941, "step": 940 }, { "epoch": 2.3827629911280104, "grad_norm": 0.3232119679450989, "learning_rate": 0.00010483460559796439, "loss": 0.2065, "step": 941 }, { "epoch": 2.385297845373891, "grad_norm": 0.30440258979797363, "learning_rate": 0.00010473282442748091, "loss": 0.1834, "step": 942 }, { "epoch": 2.387832699619772, "grad_norm": 0.5841143131256104, "learning_rate": 0.00010463104325699747, "loss": 0.3785, "step": 943 }, { "epoch": 2.390367553865653, "grad_norm": 0.31851619482040405, "learning_rate": 0.00010452926208651399, "loss": 0.1798, "step": 944 }, { "epoch": 2.3929024081115338, "grad_norm": 0.3820517361164093, "learning_rate": 0.00010442748091603054, "loss": 0.2376, "step": 945 }, { "epoch": 2.3954372623574143, "grad_norm": 0.4379272758960724, "learning_rate": 0.00010432569974554708, "loss": 0.2356, "step": 946 }, { "epoch": 2.3979721166032952, "grad_norm": 0.3120323419570923, "learning_rate": 0.00010422391857506362, "loss": 0.1936, "step": 947 }, { "epoch": 2.400506970849176, "grad_norm": 0.3143107295036316, "learning_rate": 0.00010412213740458016, "loss": 0.184, "step": 948 }, { "epoch": 2.403041825095057, "grad_norm": 0.44618573784828186, "learning_rate": 0.00010402035623409669, "loss": 0.2468, "step": 949 }, { "epoch": 2.405576679340938, "grad_norm": 0.3838117718696594, "learning_rate": 0.00010391857506361324, "loss": 0.2276, "step": 950 }, { "epoch": 2.4081115335868186, "grad_norm": 0.3427219092845917, "learning_rate": 0.00010381679389312977, "loss": 0.2169, "step": 951 }, { "epoch": 2.4106463878326996, "grad_norm": 0.3738270699977875, "learning_rate": 0.00010371501272264632, "loss": 0.2447, "step": 952 }, { "epoch": 2.4131812420785805, "grad_norm": 0.33645015954971313, "learning_rate": 0.00010361323155216285, "loss": 0.1939, "step": 953 }, { "epoch": 2.4157160963244615, "grad_norm": 0.45420047640800476, "learning_rate": 0.0001035114503816794, "loss": 0.242, "step": 954 }, { "epoch": 2.418250950570342, "grad_norm": 0.47141382098197937, "learning_rate": 0.00010340966921119592, "loss": 0.2923, "step": 955 }, { "epoch": 2.420785804816223, "grad_norm": 0.42177528142929077, "learning_rate": 0.00010330788804071248, "loss": 0.2827, "step": 956 }, { "epoch": 2.423320659062104, "grad_norm": 0.409502774477005, "learning_rate": 0.000103206106870229, "loss": 0.2016, "step": 957 }, { "epoch": 2.425855513307985, "grad_norm": 0.47684770822525024, "learning_rate": 0.00010310432569974556, "loss": 0.2093, "step": 958 }, { "epoch": 2.428390367553866, "grad_norm": 0.3357095718383789, "learning_rate": 0.0001030025445292621, "loss": 0.1744, "step": 959 }, { "epoch": 2.4309252217997463, "grad_norm": 0.4120575487613678, "learning_rate": 0.00010290076335877863, "loss": 0.214, "step": 960 }, { "epoch": 2.4334600760456273, "grad_norm": 0.5090222954750061, "learning_rate": 0.00010279898218829517, "loss": 0.2427, "step": 961 }, { "epoch": 2.4359949302915083, "grad_norm": 0.4142550528049469, "learning_rate": 0.0001026972010178117, "loss": 0.2412, "step": 962 }, { "epoch": 2.4385297845373892, "grad_norm": 0.3446972966194153, "learning_rate": 0.00010259541984732825, "loss": 0.1952, "step": 963 }, { "epoch": 2.4410646387832697, "grad_norm": 0.37858110666275024, "learning_rate": 0.00010249363867684478, "loss": 0.1964, "step": 964 }, { "epoch": 2.4435994930291507, "grad_norm": 0.3989041745662689, "learning_rate": 0.00010239185750636133, "loss": 0.2115, "step": 965 }, { "epoch": 2.4461343472750317, "grad_norm": 0.3948146402835846, "learning_rate": 0.00010229007633587786, "loss": 0.2067, "step": 966 }, { "epoch": 2.4486692015209126, "grad_norm": 0.3683820068836212, "learning_rate": 0.00010218829516539441, "loss": 0.1881, "step": 967 }, { "epoch": 2.4512040557667936, "grad_norm": 0.36742380261421204, "learning_rate": 0.00010208651399491094, "loss": 0.2302, "step": 968 }, { "epoch": 2.453738910012674, "grad_norm": 0.32195988297462463, "learning_rate": 0.00010198473282442749, "loss": 0.1994, "step": 969 }, { "epoch": 2.456273764258555, "grad_norm": 0.42296963930130005, "learning_rate": 0.00010188295165394401, "loss": 0.2657, "step": 970 }, { "epoch": 2.458808618504436, "grad_norm": 0.3555774688720703, "learning_rate": 0.00010178117048346057, "loss": 0.1812, "step": 971 }, { "epoch": 2.461343472750317, "grad_norm": 0.6991668343544006, "learning_rate": 0.00010167938931297712, "loss": 0.4318, "step": 972 }, { "epoch": 2.463878326996198, "grad_norm": 0.4290355443954468, "learning_rate": 0.00010157760814249365, "loss": 0.1856, "step": 973 }, { "epoch": 2.4664131812420784, "grad_norm": 0.3479045331478119, "learning_rate": 0.00010147582697201018, "loss": 0.1844, "step": 974 }, { "epoch": 2.4689480354879594, "grad_norm": 0.3862701952457428, "learning_rate": 0.00010137404580152672, "loss": 0.2108, "step": 975 }, { "epoch": 2.4714828897338403, "grad_norm": 0.34411442279815674, "learning_rate": 0.00010127226463104326, "loss": 0.1851, "step": 976 }, { "epoch": 2.4740177439797213, "grad_norm": 0.2434609979391098, "learning_rate": 0.00010117048346055979, "loss": 0.1757, "step": 977 }, { "epoch": 2.4765525982256023, "grad_norm": 0.3341599106788635, "learning_rate": 0.00010106870229007634, "loss": 0.1879, "step": 978 }, { "epoch": 2.4790874524714828, "grad_norm": 0.27678003907203674, "learning_rate": 0.00010096692111959287, "loss": 0.1943, "step": 979 }, { "epoch": 2.4816223067173637, "grad_norm": 0.2388005256652832, "learning_rate": 0.00010086513994910942, "loss": 0.1804, "step": 980 }, { "epoch": 2.4841571609632447, "grad_norm": 0.5265661478042603, "learning_rate": 0.00010076335877862595, "loss": 0.2813, "step": 981 }, { "epoch": 2.4866920152091256, "grad_norm": 0.337007075548172, "learning_rate": 0.0001006615776081425, "loss": 0.1976, "step": 982 }, { "epoch": 2.489226869455006, "grad_norm": 0.42700427770614624, "learning_rate": 0.00010055979643765905, "loss": 0.2031, "step": 983 }, { "epoch": 2.491761723700887, "grad_norm": 0.3900333642959595, "learning_rate": 0.00010045801526717558, "loss": 0.2178, "step": 984 }, { "epoch": 2.494296577946768, "grad_norm": 0.45332932472229004, "learning_rate": 0.00010035623409669213, "loss": 0.2537, "step": 985 }, { "epoch": 2.496831432192649, "grad_norm": 0.30331265926361084, "learning_rate": 0.00010025445292620866, "loss": 0.2074, "step": 986 }, { "epoch": 2.49936628643853, "grad_norm": 0.3379949927330017, "learning_rate": 0.0001001526717557252, "loss": 0.1768, "step": 987 }, { "epoch": 2.5019011406844105, "grad_norm": 0.40859973430633545, "learning_rate": 0.00010005089058524174, "loss": 0.1984, "step": 988 }, { "epoch": 2.5044359949302915, "grad_norm": 0.3993757963180542, "learning_rate": 9.994910941475827e-05, "loss": 0.2162, "step": 989 }, { "epoch": 2.5069708491761724, "grad_norm": 0.5887713432312012, "learning_rate": 9.984732824427481e-05, "loss": 0.2806, "step": 990 }, { "epoch": 2.5095057034220534, "grad_norm": 0.3590678572654724, "learning_rate": 9.974554707379135e-05, "loss": 0.2045, "step": 991 }, { "epoch": 2.512040557667934, "grad_norm": 0.3090289831161499, "learning_rate": 9.964376590330789e-05, "loss": 0.2151, "step": 992 }, { "epoch": 2.514575411913815, "grad_norm": 0.42125657200813293, "learning_rate": 9.954198473282443e-05, "loss": 0.2277, "step": 993 }, { "epoch": 2.517110266159696, "grad_norm": 0.3213401734828949, "learning_rate": 9.944020356234097e-05, "loss": 0.1927, "step": 994 }, { "epoch": 2.5196451204055768, "grad_norm": 0.4558688998222351, "learning_rate": 9.933842239185751e-05, "loss": 0.2418, "step": 995 }, { "epoch": 2.5221799746514577, "grad_norm": 0.5181113481521606, "learning_rate": 9.923664122137405e-05, "loss": 0.2955, "step": 996 }, { "epoch": 2.5247148288973387, "grad_norm": 0.409424751996994, "learning_rate": 9.913486005089059e-05, "loss": 0.226, "step": 997 }, { "epoch": 2.527249683143219, "grad_norm": 0.44536876678466797, "learning_rate": 9.903307888040713e-05, "loss": 0.2412, "step": 998 }, { "epoch": 2.5297845373891, "grad_norm": 0.5028473734855652, "learning_rate": 9.893129770992367e-05, "loss": 0.2658, "step": 999 }, { "epoch": 2.532319391634981, "grad_norm": 0.3157128691673279, "learning_rate": 9.882951653944021e-05, "loss": 0.1939, "step": 1000 }, { "epoch": 2.5348542458808616, "grad_norm": 0.3184659481048584, "learning_rate": 9.872773536895676e-05, "loss": 0.2113, "step": 1001 }, { "epoch": 2.5373891001267426, "grad_norm": 0.5658953785896301, "learning_rate": 9.862595419847329e-05, "loss": 0.2641, "step": 1002 }, { "epoch": 2.5399239543726235, "grad_norm": 0.5306189060211182, "learning_rate": 9.852417302798982e-05, "loss": 0.2495, "step": 1003 }, { "epoch": 2.5424588086185045, "grad_norm": 0.5272448062896729, "learning_rate": 9.842239185750636e-05, "loss": 0.2212, "step": 1004 }, { "epoch": 2.5449936628643854, "grad_norm": 0.3216992914676666, "learning_rate": 9.83206106870229e-05, "loss": 0.2284, "step": 1005 }, { "epoch": 2.5475285171102664, "grad_norm": 0.3573670983314514, "learning_rate": 9.821882951653944e-05, "loss": 0.2568, "step": 1006 }, { "epoch": 2.550063371356147, "grad_norm": 0.4088655710220337, "learning_rate": 9.811704834605598e-05, "loss": 0.2033, "step": 1007 }, { "epoch": 2.552598225602028, "grad_norm": 0.33729737997055054, "learning_rate": 9.801526717557252e-05, "loss": 0.1843, "step": 1008 }, { "epoch": 2.555133079847909, "grad_norm": 0.3298558294773102, "learning_rate": 9.791348600508906e-05, "loss": 0.193, "step": 1009 }, { "epoch": 2.5576679340937893, "grad_norm": 0.33454427123069763, "learning_rate": 9.78117048346056e-05, "loss": 0.1823, "step": 1010 }, { "epoch": 2.5602027883396703, "grad_norm": 0.3466435670852661, "learning_rate": 9.770992366412214e-05, "loss": 0.2204, "step": 1011 }, { "epoch": 2.5627376425855513, "grad_norm": 0.3551004230976105, "learning_rate": 9.760814249363868e-05, "loss": 0.2027, "step": 1012 }, { "epoch": 2.565272496831432, "grad_norm": 0.4317062795162201, "learning_rate": 9.750636132315523e-05, "loss": 0.2099, "step": 1013 }, { "epoch": 2.567807351077313, "grad_norm": 0.5695217847824097, "learning_rate": 9.740458015267177e-05, "loss": 0.2547, "step": 1014 }, { "epoch": 2.570342205323194, "grad_norm": 0.4523742198944092, "learning_rate": 9.730279898218831e-05, "loss": 0.2501, "step": 1015 }, { "epoch": 2.5728770595690746, "grad_norm": 0.3191470503807068, "learning_rate": 9.720101781170484e-05, "loss": 0.1918, "step": 1016 }, { "epoch": 2.5754119138149556, "grad_norm": 0.36234062910079956, "learning_rate": 9.709923664122138e-05, "loss": 0.2081, "step": 1017 }, { "epoch": 2.5779467680608366, "grad_norm": 0.42196425795555115, "learning_rate": 9.699745547073791e-05, "loss": 0.2801, "step": 1018 }, { "epoch": 2.5804816223067175, "grad_norm": 0.3382538855075836, "learning_rate": 9.689567430025445e-05, "loss": 0.221, "step": 1019 }, { "epoch": 2.583016476552598, "grad_norm": 0.5736209750175476, "learning_rate": 9.679389312977099e-05, "loss": 0.2684, "step": 1020 }, { "epoch": 2.585551330798479, "grad_norm": 0.4692763686180115, "learning_rate": 9.669211195928753e-05, "loss": 0.244, "step": 1021 }, { "epoch": 2.58808618504436, "grad_norm": 0.4888627827167511, "learning_rate": 9.659033078880407e-05, "loss": 0.2493, "step": 1022 }, { "epoch": 2.590621039290241, "grad_norm": 0.29745686054229736, "learning_rate": 9.648854961832061e-05, "loss": 0.1757, "step": 1023 }, { "epoch": 2.593155893536122, "grad_norm": 0.476639062166214, "learning_rate": 9.638676844783715e-05, "loss": 0.2031, "step": 1024 }, { "epoch": 2.5956907477820024, "grad_norm": 0.4214845895767212, "learning_rate": 9.628498727735369e-05, "loss": 0.2588, "step": 1025 }, { "epoch": 2.5982256020278833, "grad_norm": 0.3036046326160431, "learning_rate": 9.618320610687024e-05, "loss": 0.2031, "step": 1026 }, { "epoch": 2.6007604562737643, "grad_norm": 0.7941879630088806, "learning_rate": 9.608142493638678e-05, "loss": 0.2096, "step": 1027 }, { "epoch": 2.6032953105196452, "grad_norm": 0.36381933093070984, "learning_rate": 9.597964376590332e-05, "loss": 0.2102, "step": 1028 }, { "epoch": 2.6058301647655258, "grad_norm": 0.3213381767272949, "learning_rate": 9.587786259541986e-05, "loss": 0.1884, "step": 1029 }, { "epoch": 2.6083650190114067, "grad_norm": 0.38559427857398987, "learning_rate": 9.577608142493639e-05, "loss": 0.2229, "step": 1030 }, { "epoch": 2.6108998732572877, "grad_norm": 0.4000662863254547, "learning_rate": 9.567430025445293e-05, "loss": 0.198, "step": 1031 }, { "epoch": 2.6134347275031686, "grad_norm": 0.3635396659374237, "learning_rate": 9.557251908396946e-05, "loss": 0.2267, "step": 1032 }, { "epoch": 2.6159695817490496, "grad_norm": 0.31810763478279114, "learning_rate": 9.5470737913486e-05, "loss": 0.1691, "step": 1033 }, { "epoch": 2.6185044359949305, "grad_norm": 0.29606062173843384, "learning_rate": 9.536895674300254e-05, "loss": 0.1834, "step": 1034 }, { "epoch": 2.621039290240811, "grad_norm": 0.3528769612312317, "learning_rate": 9.526717557251908e-05, "loss": 0.2086, "step": 1035 }, { "epoch": 2.623574144486692, "grad_norm": 0.4795662760734558, "learning_rate": 9.516539440203562e-05, "loss": 0.2429, "step": 1036 }, { "epoch": 2.626108998732573, "grad_norm": 0.4627299904823303, "learning_rate": 9.506361323155216e-05, "loss": 0.1956, "step": 1037 }, { "epoch": 2.6286438529784535, "grad_norm": 0.3330387473106384, "learning_rate": 9.496183206106871e-05, "loss": 0.1891, "step": 1038 }, { "epoch": 2.6311787072243344, "grad_norm": 0.4265390634536743, "learning_rate": 9.486005089058525e-05, "loss": 0.2086, "step": 1039 }, { "epoch": 2.6337135614702154, "grad_norm": 0.37214142084121704, "learning_rate": 9.475826972010179e-05, "loss": 0.2321, "step": 1040 }, { "epoch": 2.6362484157160964, "grad_norm": 0.4183201491832733, "learning_rate": 9.465648854961833e-05, "loss": 0.2029, "step": 1041 }, { "epoch": 2.6387832699619773, "grad_norm": 0.5688794851303101, "learning_rate": 9.455470737913487e-05, "loss": 0.2481, "step": 1042 }, { "epoch": 2.6413181242078583, "grad_norm": 0.38355833292007446, "learning_rate": 9.445292620865141e-05, "loss": 0.1989, "step": 1043 }, { "epoch": 2.643852978453739, "grad_norm": 0.4998534023761749, "learning_rate": 9.435114503816794e-05, "loss": 0.2272, "step": 1044 }, { "epoch": 2.6463878326996197, "grad_norm": 0.2796792685985565, "learning_rate": 9.424936386768448e-05, "loss": 0.1694, "step": 1045 }, { "epoch": 2.6489226869455007, "grad_norm": 0.30551543831825256, "learning_rate": 9.414758269720102e-05, "loss": 0.1782, "step": 1046 }, { "epoch": 2.6514575411913817, "grad_norm": 0.3933429718017578, "learning_rate": 9.404580152671755e-05, "loss": 0.272, "step": 1047 }, { "epoch": 2.653992395437262, "grad_norm": 0.3543720841407776, "learning_rate": 9.39440203562341e-05, "loss": 0.2271, "step": 1048 }, { "epoch": 2.656527249683143, "grad_norm": 0.2716831564903259, "learning_rate": 9.384223918575063e-05, "loss": 0.1898, "step": 1049 }, { "epoch": 2.659062103929024, "grad_norm": 0.3037743866443634, "learning_rate": 9.374045801526719e-05, "loss": 0.1911, "step": 1050 }, { "epoch": 2.661596958174905, "grad_norm": 0.4390093982219696, "learning_rate": 9.363867684478373e-05, "loss": 0.2369, "step": 1051 }, { "epoch": 2.664131812420786, "grad_norm": 0.3383953273296356, "learning_rate": 9.353689567430026e-05, "loss": 0.2519, "step": 1052 }, { "epoch": 2.6666666666666665, "grad_norm": 0.28227975964546204, "learning_rate": 9.34351145038168e-05, "loss": 0.1926, "step": 1053 }, { "epoch": 2.6692015209125475, "grad_norm": 0.33451253175735474, "learning_rate": 9.333333333333334e-05, "loss": 0.1864, "step": 1054 }, { "epoch": 2.6717363751584284, "grad_norm": 0.4116145372390747, "learning_rate": 9.323155216284988e-05, "loss": 0.2462, "step": 1055 }, { "epoch": 2.6742712294043094, "grad_norm": 0.43822887539863586, "learning_rate": 9.312977099236642e-05, "loss": 0.2014, "step": 1056 }, { "epoch": 2.67680608365019, "grad_norm": 0.4394984841346741, "learning_rate": 9.302798982188296e-05, "loss": 0.2378, "step": 1057 }, { "epoch": 2.679340937896071, "grad_norm": 0.4073251783847809, "learning_rate": 9.292620865139949e-05, "loss": 0.2711, "step": 1058 }, { "epoch": 2.681875792141952, "grad_norm": 0.3316657841205597, "learning_rate": 9.282442748091603e-05, "loss": 0.214, "step": 1059 }, { "epoch": 2.6844106463878328, "grad_norm": 0.2994216978549957, "learning_rate": 9.272264631043257e-05, "loss": 0.1838, "step": 1060 }, { "epoch": 2.6869455006337137, "grad_norm": 0.5388765335083008, "learning_rate": 9.26208651399491e-05, "loss": 0.277, "step": 1061 }, { "epoch": 2.6894803548795947, "grad_norm": 0.3714945912361145, "learning_rate": 9.251908396946566e-05, "loss": 0.2428, "step": 1062 }, { "epoch": 2.692015209125475, "grad_norm": 0.32202383875846863, "learning_rate": 9.24173027989822e-05, "loss": 0.2063, "step": 1063 }, { "epoch": 2.694550063371356, "grad_norm": 0.4116881191730499, "learning_rate": 9.231552162849874e-05, "loss": 0.2661, "step": 1064 }, { "epoch": 2.697084917617237, "grad_norm": 0.36626386642456055, "learning_rate": 9.221374045801528e-05, "loss": 0.2897, "step": 1065 }, { "epoch": 2.6996197718631176, "grad_norm": 0.33859655261039734, "learning_rate": 9.211195928753181e-05, "loss": 0.1959, "step": 1066 }, { "epoch": 2.7021546261089986, "grad_norm": 0.38263705372810364, "learning_rate": 9.201017811704835e-05, "loss": 0.2827, "step": 1067 }, { "epoch": 2.7046894803548795, "grad_norm": 0.3557961583137512, "learning_rate": 9.19083969465649e-05, "loss": 0.176, "step": 1068 }, { "epoch": 2.7072243346007605, "grad_norm": 0.35334861278533936, "learning_rate": 9.180661577608143e-05, "loss": 0.2183, "step": 1069 }, { "epoch": 2.7097591888466415, "grad_norm": 0.4672026038169861, "learning_rate": 9.170483460559797e-05, "loss": 0.2715, "step": 1070 }, { "epoch": 2.7122940430925224, "grad_norm": 0.41585099697113037, "learning_rate": 9.160305343511451e-05, "loss": 0.1912, "step": 1071 }, { "epoch": 2.714828897338403, "grad_norm": 0.54674232006073, "learning_rate": 9.150127226463104e-05, "loss": 0.2493, "step": 1072 }, { "epoch": 2.717363751584284, "grad_norm": 0.30595988035202026, "learning_rate": 9.139949109414758e-05, "loss": 0.1843, "step": 1073 }, { "epoch": 2.719898605830165, "grad_norm": 0.3521415889263153, "learning_rate": 9.129770992366413e-05, "loss": 0.2047, "step": 1074 }, { "epoch": 2.7224334600760454, "grad_norm": 0.47393590211868286, "learning_rate": 9.119592875318067e-05, "loss": 0.3398, "step": 1075 }, { "epoch": 2.7249683143219263, "grad_norm": 0.4672793745994568, "learning_rate": 9.109414758269721e-05, "loss": 0.3569, "step": 1076 }, { "epoch": 2.7275031685678073, "grad_norm": 0.41231435537338257, "learning_rate": 9.099236641221375e-05, "loss": 0.2323, "step": 1077 }, { "epoch": 2.7300380228136882, "grad_norm": 0.36700156331062317, "learning_rate": 9.089058524173029e-05, "loss": 0.2023, "step": 1078 }, { "epoch": 2.732572877059569, "grad_norm": 0.32198184728622437, "learning_rate": 9.078880407124683e-05, "loss": 0.1814, "step": 1079 }, { "epoch": 2.73510773130545, "grad_norm": 0.46826303005218506, "learning_rate": 9.068702290076337e-05, "loss": 0.2216, "step": 1080 }, { "epoch": 2.7376425855513307, "grad_norm": 0.3026100695133209, "learning_rate": 9.05852417302799e-05, "loss": 0.1826, "step": 1081 }, { "epoch": 2.7401774397972116, "grad_norm": 0.2897210717201233, "learning_rate": 9.048346055979644e-05, "loss": 0.1853, "step": 1082 }, { "epoch": 2.7427122940430926, "grad_norm": 0.296286940574646, "learning_rate": 9.038167938931298e-05, "loss": 0.1776, "step": 1083 }, { "epoch": 2.7452471482889735, "grad_norm": 0.374600887298584, "learning_rate": 9.027989821882952e-05, "loss": 0.2031, "step": 1084 }, { "epoch": 2.747782002534854, "grad_norm": 0.5333495140075684, "learning_rate": 9.017811704834606e-05, "loss": 0.2798, "step": 1085 }, { "epoch": 2.750316856780735, "grad_norm": 0.43342864513397217, "learning_rate": 9.007633587786259e-05, "loss": 0.2063, "step": 1086 }, { "epoch": 2.752851711026616, "grad_norm": 0.5283639430999756, "learning_rate": 8.997455470737914e-05, "loss": 0.25, "step": 1087 }, { "epoch": 2.755386565272497, "grad_norm": 0.556190013885498, "learning_rate": 8.987277353689568e-05, "loss": 0.2044, "step": 1088 }, { "epoch": 2.757921419518378, "grad_norm": 0.35083258152008057, "learning_rate": 8.977099236641222e-05, "loss": 0.188, "step": 1089 }, { "epoch": 2.7604562737642584, "grad_norm": 0.42917102575302124, "learning_rate": 8.966921119592876e-05, "loss": 0.2511, "step": 1090 }, { "epoch": 2.7629911280101394, "grad_norm": 0.5665780305862427, "learning_rate": 8.95674300254453e-05, "loss": 0.3307, "step": 1091 }, { "epoch": 2.7655259822560203, "grad_norm": 0.40193435549736023, "learning_rate": 8.946564885496184e-05, "loss": 0.2453, "step": 1092 }, { "epoch": 2.7680608365019013, "grad_norm": 0.46344733238220215, "learning_rate": 8.936386768447838e-05, "loss": 0.2096, "step": 1093 }, { "epoch": 2.770595690747782, "grad_norm": 0.4600921869277954, "learning_rate": 8.926208651399492e-05, "loss": 0.2161, "step": 1094 }, { "epoch": 2.7731305449936627, "grad_norm": 0.46053385734558105, "learning_rate": 8.916030534351145e-05, "loss": 0.2369, "step": 1095 }, { "epoch": 2.7756653992395437, "grad_norm": 0.45449280738830566, "learning_rate": 8.9058524173028e-05, "loss": 0.2344, "step": 1096 }, { "epoch": 2.7782002534854247, "grad_norm": 0.39411383867263794, "learning_rate": 8.895674300254453e-05, "loss": 0.2082, "step": 1097 }, { "epoch": 2.7807351077313056, "grad_norm": 0.38967519998550415, "learning_rate": 8.885496183206107e-05, "loss": 0.2264, "step": 1098 }, { "epoch": 2.7832699619771866, "grad_norm": 0.3357069194316864, "learning_rate": 8.875318066157761e-05, "loss": 0.1896, "step": 1099 }, { "epoch": 2.785804816223067, "grad_norm": 0.4941220283508301, "learning_rate": 8.865139949109415e-05, "loss": 0.3003, "step": 1100 }, { "epoch": 2.788339670468948, "grad_norm": 0.3897833526134491, "learning_rate": 8.854961832061069e-05, "loss": 0.1907, "step": 1101 }, { "epoch": 2.790874524714829, "grad_norm": 0.4247800409793854, "learning_rate": 8.844783715012723e-05, "loss": 0.1843, "step": 1102 }, { "epoch": 2.7934093789607095, "grad_norm": 0.46850237250328064, "learning_rate": 8.834605597964377e-05, "loss": 0.2501, "step": 1103 }, { "epoch": 2.7959442332065905, "grad_norm": 0.4753093421459198, "learning_rate": 8.824427480916031e-05, "loss": 0.2277, "step": 1104 }, { "epoch": 2.7984790874524714, "grad_norm": 0.3235141932964325, "learning_rate": 8.814249363867685e-05, "loss": 0.1817, "step": 1105 }, { "epoch": 2.8010139416983524, "grad_norm": 0.48403674364089966, "learning_rate": 8.804071246819339e-05, "loss": 0.2278, "step": 1106 }, { "epoch": 2.8035487959442333, "grad_norm": 0.30417025089263916, "learning_rate": 8.793893129770993e-05, "loss": 0.1867, "step": 1107 }, { "epoch": 2.8060836501901143, "grad_norm": 0.30289140343666077, "learning_rate": 8.783715012722647e-05, "loss": 0.1898, "step": 1108 }, { "epoch": 2.808618504435995, "grad_norm": 0.47156116366386414, "learning_rate": 8.7735368956743e-05, "loss": 0.2381, "step": 1109 }, { "epoch": 2.8111533586818758, "grad_norm": 0.4420924186706543, "learning_rate": 8.763358778625954e-05, "loss": 0.251, "step": 1110 }, { "epoch": 2.8136882129277567, "grad_norm": 0.42235851287841797, "learning_rate": 8.75318066157761e-05, "loss": 0.2007, "step": 1111 }, { "epoch": 2.8162230671736372, "grad_norm": 0.40069061517715454, "learning_rate": 8.743002544529262e-05, "loss": 0.2052, "step": 1112 }, { "epoch": 2.818757921419518, "grad_norm": 0.5213333368301392, "learning_rate": 8.732824427480916e-05, "loss": 0.2236, "step": 1113 }, { "epoch": 2.821292775665399, "grad_norm": 0.3919121026992798, "learning_rate": 8.72264631043257e-05, "loss": 0.2338, "step": 1114 }, { "epoch": 2.82382762991128, "grad_norm": 0.4295049011707306, "learning_rate": 8.712468193384224e-05, "loss": 0.2713, "step": 1115 }, { "epoch": 2.826362484157161, "grad_norm": 0.25834596157073975, "learning_rate": 8.702290076335878e-05, "loss": 0.1701, "step": 1116 }, { "epoch": 2.828897338403042, "grad_norm": 0.36217084527015686, "learning_rate": 8.692111959287532e-05, "loss": 0.1963, "step": 1117 }, { "epoch": 2.8314321926489225, "grad_norm": 0.39089757204055786, "learning_rate": 8.681933842239186e-05, "loss": 0.186, "step": 1118 }, { "epoch": 2.8339670468948035, "grad_norm": 0.45900896191596985, "learning_rate": 8.67175572519084e-05, "loss": 0.22, "step": 1119 }, { "epoch": 2.8365019011406845, "grad_norm": 0.2946614623069763, "learning_rate": 8.661577608142494e-05, "loss": 0.1771, "step": 1120 }, { "epoch": 2.8390367553865654, "grad_norm": 0.4160090982913971, "learning_rate": 8.651399491094148e-05, "loss": 0.2083, "step": 1121 }, { "epoch": 2.841571609632446, "grad_norm": 0.43507587909698486, "learning_rate": 8.641221374045802e-05, "loss": 0.2595, "step": 1122 }, { "epoch": 2.844106463878327, "grad_norm": 0.449813574552536, "learning_rate": 8.631043256997457e-05, "loss": 0.2982, "step": 1123 }, { "epoch": 2.846641318124208, "grad_norm": 0.33715054392814636, "learning_rate": 8.620865139949111e-05, "loss": 0.1851, "step": 1124 }, { "epoch": 2.849176172370089, "grad_norm": 0.4767422676086426, "learning_rate": 8.610687022900765e-05, "loss": 0.2865, "step": 1125 }, { "epoch": 2.8517110266159698, "grad_norm": 0.4232870042324066, "learning_rate": 8.600508905852417e-05, "loss": 0.2355, "step": 1126 }, { "epoch": 2.8542458808618507, "grad_norm": 0.286565363407135, "learning_rate": 8.590330788804071e-05, "loss": 0.188, "step": 1127 }, { "epoch": 2.8567807351077312, "grad_norm": 0.304606169462204, "learning_rate": 8.580152671755725e-05, "loss": 0.2367, "step": 1128 }, { "epoch": 2.859315589353612, "grad_norm": 0.4730917811393738, "learning_rate": 8.569974554707379e-05, "loss": 0.2925, "step": 1129 }, { "epoch": 2.861850443599493, "grad_norm": 0.348651647567749, "learning_rate": 8.559796437659033e-05, "loss": 0.242, "step": 1130 }, { "epoch": 2.8643852978453737, "grad_norm": 0.31156882643699646, "learning_rate": 8.549618320610687e-05, "loss": 0.1865, "step": 1131 }, { "epoch": 2.8669201520912546, "grad_norm": 0.4416813254356384, "learning_rate": 8.539440203562341e-05, "loss": 0.311, "step": 1132 }, { "epoch": 2.8694550063371356, "grad_norm": 0.2997666895389557, "learning_rate": 8.529262086513995e-05, "loss": 0.1956, "step": 1133 }, { "epoch": 2.8719898605830165, "grad_norm": 0.30020904541015625, "learning_rate": 8.519083969465649e-05, "loss": 0.206, "step": 1134 }, { "epoch": 2.8745247148288975, "grad_norm": 0.4457029104232788, "learning_rate": 8.508905852417304e-05, "loss": 0.2422, "step": 1135 }, { "epoch": 2.8770595690747784, "grad_norm": 0.3519587218761444, "learning_rate": 8.498727735368958e-05, "loss": 0.2277, "step": 1136 }, { "epoch": 2.879594423320659, "grad_norm": 0.3482111394405365, "learning_rate": 8.488549618320612e-05, "loss": 0.1981, "step": 1137 }, { "epoch": 2.88212927756654, "grad_norm": 0.31978392601013184, "learning_rate": 8.478371501272266e-05, "loss": 0.1849, "step": 1138 }, { "epoch": 2.884664131812421, "grad_norm": 0.2380414754152298, "learning_rate": 8.46819338422392e-05, "loss": 0.1619, "step": 1139 }, { "epoch": 2.8871989860583014, "grad_norm": 0.25577735900878906, "learning_rate": 8.458015267175572e-05, "loss": 0.1594, "step": 1140 }, { "epoch": 2.8897338403041823, "grad_norm": 0.36093661189079285, "learning_rate": 8.447837150127226e-05, "loss": 0.1937, "step": 1141 }, { "epoch": 2.8922686945500633, "grad_norm": 0.3542689085006714, "learning_rate": 8.43765903307888e-05, "loss": 0.2219, "step": 1142 }, { "epoch": 2.8948035487959443, "grad_norm": 0.3966139853000641, "learning_rate": 8.427480916030534e-05, "loss": 0.2427, "step": 1143 }, { "epoch": 2.897338403041825, "grad_norm": 0.3684738278388977, "learning_rate": 8.417302798982188e-05, "loss": 0.2093, "step": 1144 }, { "epoch": 2.899873257287706, "grad_norm": 0.430477499961853, "learning_rate": 8.407124681933842e-05, "loss": 0.2266, "step": 1145 }, { "epoch": 2.9024081115335867, "grad_norm": 0.32896652817726135, "learning_rate": 8.396946564885496e-05, "loss": 0.2447, "step": 1146 }, { "epoch": 2.9049429657794676, "grad_norm": 0.45568832755088806, "learning_rate": 8.38676844783715e-05, "loss": 0.2251, "step": 1147 }, { "epoch": 2.9074778200253486, "grad_norm": 0.48290732502937317, "learning_rate": 8.376590330788805e-05, "loss": 0.2471, "step": 1148 }, { "epoch": 2.9100126742712296, "grad_norm": 0.40795937180519104, "learning_rate": 8.366412213740459e-05, "loss": 0.2031, "step": 1149 }, { "epoch": 2.91254752851711, "grad_norm": 0.362835168838501, "learning_rate": 8.356234096692113e-05, "loss": 0.1991, "step": 1150 }, { "epoch": 2.915082382762991, "grad_norm": 0.38601744174957275, "learning_rate": 8.346055979643767e-05, "loss": 0.1821, "step": 1151 }, { "epoch": 2.917617237008872, "grad_norm": 0.2641182541847229, "learning_rate": 8.335877862595421e-05, "loss": 0.16, "step": 1152 }, { "epoch": 2.920152091254753, "grad_norm": 0.5600478053092957, "learning_rate": 8.325699745547075e-05, "loss": 0.2476, "step": 1153 }, { "epoch": 2.922686945500634, "grad_norm": 0.3873019516468048, "learning_rate": 8.315521628498727e-05, "loss": 0.2264, "step": 1154 }, { "epoch": 2.9252217997465144, "grad_norm": 0.2946743667125702, "learning_rate": 8.305343511450381e-05, "loss": 0.1776, "step": 1155 }, { "epoch": 2.9277566539923954, "grad_norm": 0.3886416554450989, "learning_rate": 8.295165394402035e-05, "loss": 0.2123, "step": 1156 }, { "epoch": 2.9302915082382763, "grad_norm": 0.39706671237945557, "learning_rate": 8.284987277353689e-05, "loss": 0.2319, "step": 1157 }, { "epoch": 2.9328263624841573, "grad_norm": 0.30693602561950684, "learning_rate": 8.274809160305343e-05, "loss": 0.1939, "step": 1158 }, { "epoch": 2.935361216730038, "grad_norm": 0.37277474999427795, "learning_rate": 8.264631043256997e-05, "loss": 0.2194, "step": 1159 }, { "epoch": 2.9378960709759188, "grad_norm": 0.442508727312088, "learning_rate": 8.254452926208652e-05, "loss": 0.2142, "step": 1160 }, { "epoch": 2.9404309252217997, "grad_norm": 0.275898814201355, "learning_rate": 8.244274809160306e-05, "loss": 0.1791, "step": 1161 }, { "epoch": 2.9429657794676807, "grad_norm": 0.4033918082714081, "learning_rate": 8.23409669211196e-05, "loss": 0.295, "step": 1162 }, { "epoch": 2.9455006337135616, "grad_norm": 0.46713244915008545, "learning_rate": 8.223918575063614e-05, "loss": 0.2662, "step": 1163 }, { "epoch": 2.9480354879594426, "grad_norm": 0.37975406646728516, "learning_rate": 8.213740458015268e-05, "loss": 0.1915, "step": 1164 }, { "epoch": 2.950570342205323, "grad_norm": 0.31382545828819275, "learning_rate": 8.203562340966922e-05, "loss": 0.1793, "step": 1165 }, { "epoch": 2.953105196451204, "grad_norm": 0.42415499687194824, "learning_rate": 8.193384223918576e-05, "loss": 0.2375, "step": 1166 }, { "epoch": 2.955640050697085, "grad_norm": 0.4227803647518158, "learning_rate": 8.18320610687023e-05, "loss": 0.213, "step": 1167 }, { "epoch": 2.9581749049429655, "grad_norm": 0.3395853638648987, "learning_rate": 8.173027989821882e-05, "loss": 0.1942, "step": 1168 }, { "epoch": 2.9607097591888465, "grad_norm": 0.4627746641635895, "learning_rate": 8.162849872773536e-05, "loss": 0.2266, "step": 1169 }, { "epoch": 2.9632446134347274, "grad_norm": 0.36325398087501526, "learning_rate": 8.15267175572519e-05, "loss": 0.2176, "step": 1170 }, { "epoch": 2.9657794676806084, "grad_norm": 0.4188767671585083, "learning_rate": 8.142493638676844e-05, "loss": 0.1992, "step": 1171 }, { "epoch": 2.9683143219264894, "grad_norm": 0.3149709403514862, "learning_rate": 8.1323155216285e-05, "loss": 0.1829, "step": 1172 }, { "epoch": 2.9708491761723703, "grad_norm": 0.26542145013809204, "learning_rate": 8.122137404580153e-05, "loss": 0.1801, "step": 1173 }, { "epoch": 2.973384030418251, "grad_norm": 0.28748998045921326, "learning_rate": 8.111959287531807e-05, "loss": 0.1764, "step": 1174 }, { "epoch": 2.975918884664132, "grad_norm": 0.3103797733783722, "learning_rate": 8.101781170483461e-05, "loss": 0.2047, "step": 1175 }, { "epoch": 2.9784537389100127, "grad_norm": 0.3357256054878235, "learning_rate": 8.091603053435115e-05, "loss": 0.2303, "step": 1176 }, { "epoch": 2.9809885931558933, "grad_norm": 0.4399915933609009, "learning_rate": 8.081424936386769e-05, "loss": 0.2423, "step": 1177 }, { "epoch": 2.983523447401774, "grad_norm": 0.3486070930957794, "learning_rate": 8.071246819338423e-05, "loss": 0.19, "step": 1178 }, { "epoch": 2.986058301647655, "grad_norm": 0.33286648988723755, "learning_rate": 8.061068702290077e-05, "loss": 0.1788, "step": 1179 }, { "epoch": 2.988593155893536, "grad_norm": 0.2841028571128845, "learning_rate": 8.050890585241731e-05, "loss": 0.167, "step": 1180 }, { "epoch": 2.991128010139417, "grad_norm": 0.44933149218559265, "learning_rate": 8.040712468193385e-05, "loss": 0.3098, "step": 1181 }, { "epoch": 2.993662864385298, "grad_norm": 0.2849741280078888, "learning_rate": 8.030534351145038e-05, "loss": 0.1896, "step": 1182 }, { "epoch": 2.9961977186311786, "grad_norm": 0.39720216393470764, "learning_rate": 8.020356234096691e-05, "loss": 0.2426, "step": 1183 }, { "epoch": 2.9987325728770595, "grad_norm": 0.3838231563568115, "learning_rate": 8.010178117048347e-05, "loss": 0.2194, "step": 1184 }, { "epoch": 3.0, "grad_norm": 0.6684709787368774, "learning_rate": 8e-05, "loss": 0.2783, "step": 1185 }, { "epoch": 3.002534854245881, "grad_norm": 0.44380757212638855, "learning_rate": 7.989821882951655e-05, "loss": 0.2938, "step": 1186 }, { "epoch": 3.005069708491762, "grad_norm": 0.4787996709346771, "learning_rate": 7.979643765903309e-05, "loss": 0.2998, "step": 1187 }, { "epoch": 3.0076045627376424, "grad_norm": 0.36355340480804443, "learning_rate": 7.969465648854962e-05, "loss": 0.1555, "step": 1188 }, { "epoch": 3.0101394169835234, "grad_norm": 0.37890535593032837, "learning_rate": 7.959287531806616e-05, "loss": 0.1743, "step": 1189 }, { "epoch": 3.0126742712294043, "grad_norm": 0.4317542612552643, "learning_rate": 7.94910941475827e-05, "loss": 0.1891, "step": 1190 }, { "epoch": 3.0152091254752853, "grad_norm": 0.3477863669395447, "learning_rate": 7.938931297709924e-05, "loss": 0.1576, "step": 1191 }, { "epoch": 3.017743979721166, "grad_norm": 0.414050817489624, "learning_rate": 7.928753180661578e-05, "loss": 0.2014, "step": 1192 }, { "epoch": 3.0202788339670468, "grad_norm": 0.3596842288970947, "learning_rate": 7.918575063613232e-05, "loss": 0.1482, "step": 1193 }, { "epoch": 3.0228136882129277, "grad_norm": 0.49169921875, "learning_rate": 7.908396946564886e-05, "loss": 0.1686, "step": 1194 }, { "epoch": 3.0253485424588087, "grad_norm": 0.44806674122810364, "learning_rate": 7.89821882951654e-05, "loss": 0.2044, "step": 1195 }, { "epoch": 3.0278833967046896, "grad_norm": 0.43101197481155396, "learning_rate": 7.888040712468194e-05, "loss": 0.1911, "step": 1196 }, { "epoch": 3.03041825095057, "grad_norm": 0.5595632195472717, "learning_rate": 7.877862595419848e-05, "loss": 0.1823, "step": 1197 }, { "epoch": 3.032953105196451, "grad_norm": 0.5024780035018921, "learning_rate": 7.867684478371502e-05, "loss": 0.1789, "step": 1198 }, { "epoch": 3.035487959442332, "grad_norm": 0.4227488934993744, "learning_rate": 7.857506361323156e-05, "loss": 0.1539, "step": 1199 }, { "epoch": 3.038022813688213, "grad_norm": 0.43486127257347107, "learning_rate": 7.84732824427481e-05, "loss": 0.1577, "step": 1200 }, { "epoch": 3.040557667934094, "grad_norm": 0.47951167821884155, "learning_rate": 7.837150127226464e-05, "loss": 0.1975, "step": 1201 }, { "epoch": 3.0430925221799745, "grad_norm": 0.4223075211048126, "learning_rate": 7.826972010178117e-05, "loss": 0.1719, "step": 1202 }, { "epoch": 3.0456273764258555, "grad_norm": 0.6699900031089783, "learning_rate": 7.816793893129771e-05, "loss": 0.2139, "step": 1203 }, { "epoch": 3.0481622306717364, "grad_norm": 0.6038373708724976, "learning_rate": 7.806615776081425e-05, "loss": 0.2163, "step": 1204 }, { "epoch": 3.0506970849176174, "grad_norm": 0.530208945274353, "learning_rate": 7.796437659033079e-05, "loss": 0.1482, "step": 1205 }, { "epoch": 3.053231939163498, "grad_norm": 0.6380701661109924, "learning_rate": 7.786259541984733e-05, "loss": 0.2191, "step": 1206 }, { "epoch": 3.055766793409379, "grad_norm": 0.6455860137939453, "learning_rate": 7.776081424936387e-05, "loss": 0.1812, "step": 1207 }, { "epoch": 3.05830164765526, "grad_norm": 0.5198556184768677, "learning_rate": 7.765903307888041e-05, "loss": 0.1602, "step": 1208 }, { "epoch": 3.0608365019011408, "grad_norm": 0.4842750132083893, "learning_rate": 7.755725190839695e-05, "loss": 0.1739, "step": 1209 }, { "epoch": 3.0633713561470217, "grad_norm": 0.6345165371894836, "learning_rate": 7.745547073791349e-05, "loss": 0.1841, "step": 1210 }, { "epoch": 3.0659062103929022, "grad_norm": 0.551673173904419, "learning_rate": 7.735368956743003e-05, "loss": 0.1755, "step": 1211 }, { "epoch": 3.068441064638783, "grad_norm": 0.5332705974578857, "learning_rate": 7.725190839694657e-05, "loss": 0.2175, "step": 1212 }, { "epoch": 3.070975918884664, "grad_norm": 0.6630911231040955, "learning_rate": 7.715012722646311e-05, "loss": 0.2868, "step": 1213 }, { "epoch": 3.073510773130545, "grad_norm": 0.42508792877197266, "learning_rate": 7.704834605597965e-05, "loss": 0.1811, "step": 1214 }, { "epoch": 3.076045627376426, "grad_norm": 0.504231870174408, "learning_rate": 7.694656488549619e-05, "loss": 0.1765, "step": 1215 }, { "epoch": 3.0785804816223066, "grad_norm": 0.39370813965797424, "learning_rate": 7.684478371501273e-05, "loss": 0.1739, "step": 1216 }, { "epoch": 3.0811153358681875, "grad_norm": 0.5411176085472107, "learning_rate": 7.674300254452926e-05, "loss": 0.2015, "step": 1217 }, { "epoch": 3.0836501901140685, "grad_norm": 0.58034348487854, "learning_rate": 7.66412213740458e-05, "loss": 0.2293, "step": 1218 }, { "epoch": 3.0861850443599494, "grad_norm": 0.48355352878570557, "learning_rate": 7.653944020356234e-05, "loss": 0.1858, "step": 1219 }, { "epoch": 3.08871989860583, "grad_norm": 0.3532313406467438, "learning_rate": 7.643765903307888e-05, "loss": 0.1689, "step": 1220 }, { "epoch": 3.091254752851711, "grad_norm": 0.36245197057724, "learning_rate": 7.633587786259542e-05, "loss": 0.1744, "step": 1221 }, { "epoch": 3.093789607097592, "grad_norm": 0.4752829372882843, "learning_rate": 7.623409669211196e-05, "loss": 0.1733, "step": 1222 }, { "epoch": 3.096324461343473, "grad_norm": 0.3701539933681488, "learning_rate": 7.61323155216285e-05, "loss": 0.158, "step": 1223 }, { "epoch": 3.098859315589354, "grad_norm": 0.45548319816589355, "learning_rate": 7.603053435114504e-05, "loss": 0.1822, "step": 1224 }, { "epoch": 3.1013941698352343, "grad_norm": 0.376499205827713, "learning_rate": 7.592875318066158e-05, "loss": 0.1613, "step": 1225 }, { "epoch": 3.1039290240811153, "grad_norm": 0.4430786967277527, "learning_rate": 7.582697201017812e-05, "loss": 0.1691, "step": 1226 }, { "epoch": 3.106463878326996, "grad_norm": 0.44311538338661194, "learning_rate": 7.572519083969466e-05, "loss": 0.1853, "step": 1227 }, { "epoch": 3.108998732572877, "grad_norm": 0.5815149545669556, "learning_rate": 7.56234096692112e-05, "loss": 0.2039, "step": 1228 }, { "epoch": 3.111533586818758, "grad_norm": 0.5101373195648193, "learning_rate": 7.552162849872774e-05, "loss": 0.2022, "step": 1229 }, { "epoch": 3.1140684410646386, "grad_norm": 0.6038093566894531, "learning_rate": 7.541984732824428e-05, "loss": 0.1859, "step": 1230 }, { "epoch": 3.1166032953105196, "grad_norm": 0.5133914351463318, "learning_rate": 7.531806615776081e-05, "loss": 0.1626, "step": 1231 }, { "epoch": 3.1191381495564006, "grad_norm": 0.40495821833610535, "learning_rate": 7.521628498727735e-05, "loss": 0.1739, "step": 1232 }, { "epoch": 3.1216730038022815, "grad_norm": 0.6585063934326172, "learning_rate": 7.511450381679391e-05, "loss": 0.2402, "step": 1233 }, { "epoch": 3.124207858048162, "grad_norm": 0.45598068833351135, "learning_rate": 7.501272264631045e-05, "loss": 0.1632, "step": 1234 }, { "epoch": 3.126742712294043, "grad_norm": 0.42114904522895813, "learning_rate": 7.491094147582699e-05, "loss": 0.1638, "step": 1235 }, { "epoch": 3.129277566539924, "grad_norm": 0.443198561668396, "learning_rate": 7.480916030534351e-05, "loss": 0.2148, "step": 1236 }, { "epoch": 3.131812420785805, "grad_norm": 0.5573143362998962, "learning_rate": 7.470737913486005e-05, "loss": 0.2219, "step": 1237 }, { "epoch": 3.134347275031686, "grad_norm": 0.6023311614990234, "learning_rate": 7.460559796437659e-05, "loss": 0.1987, "step": 1238 }, { "epoch": 3.1368821292775664, "grad_norm": 0.5282934904098511, "learning_rate": 7.450381679389313e-05, "loss": 0.2377, "step": 1239 }, { "epoch": 3.1394169835234473, "grad_norm": 0.49694669246673584, "learning_rate": 7.440203562340967e-05, "loss": 0.1804, "step": 1240 }, { "epoch": 3.1419518377693283, "grad_norm": 0.43045276403427124, "learning_rate": 7.430025445292621e-05, "loss": 0.1635, "step": 1241 }, { "epoch": 3.1444866920152093, "grad_norm": 0.4798453152179718, "learning_rate": 7.419847328244275e-05, "loss": 0.1696, "step": 1242 }, { "epoch": 3.14702154626109, "grad_norm": 0.5173293352127075, "learning_rate": 7.409669211195929e-05, "loss": 0.1802, "step": 1243 }, { "epoch": 3.1495564005069707, "grad_norm": 0.5398945808410645, "learning_rate": 7.399491094147583e-05, "loss": 0.1949, "step": 1244 }, { "epoch": 3.1520912547528517, "grad_norm": 0.5297830700874329, "learning_rate": 7.389312977099238e-05, "loss": 0.1987, "step": 1245 }, { "epoch": 3.1546261089987326, "grad_norm": 0.5320866703987122, "learning_rate": 7.379134860050892e-05, "loss": 0.1715, "step": 1246 }, { "epoch": 3.1571609632446136, "grad_norm": 0.6132882833480835, "learning_rate": 7.368956743002546e-05, "loss": 0.3204, "step": 1247 }, { "epoch": 3.159695817490494, "grad_norm": 0.4120640158653259, "learning_rate": 7.3587786259542e-05, "loss": 0.157, "step": 1248 }, { "epoch": 3.162230671736375, "grad_norm": 0.6765384674072266, "learning_rate": 7.348600508905854e-05, "loss": 0.2186, "step": 1249 }, { "epoch": 3.164765525982256, "grad_norm": 0.6318830847740173, "learning_rate": 7.338422391857506e-05, "loss": 0.2189, "step": 1250 }, { "epoch": 3.167300380228137, "grad_norm": 0.508305013179779, "learning_rate": 7.32824427480916e-05, "loss": 0.1962, "step": 1251 }, { "epoch": 3.169835234474018, "grad_norm": 0.603520393371582, "learning_rate": 7.318066157760814e-05, "loss": 0.2615, "step": 1252 }, { "epoch": 3.1723700887198985, "grad_norm": 0.7639157176017761, "learning_rate": 7.307888040712468e-05, "loss": 0.2982, "step": 1253 }, { "epoch": 3.1749049429657794, "grad_norm": 0.5995659232139587, "learning_rate": 7.297709923664122e-05, "loss": 0.2206, "step": 1254 }, { "epoch": 3.1774397972116604, "grad_norm": 0.6512479186058044, "learning_rate": 7.287531806615776e-05, "loss": 0.2065, "step": 1255 }, { "epoch": 3.1799746514575413, "grad_norm": 0.4128544330596924, "learning_rate": 7.27735368956743e-05, "loss": 0.1589, "step": 1256 }, { "epoch": 3.182509505703422, "grad_norm": 0.5341802835464478, "learning_rate": 7.267175572519084e-05, "loss": 0.1812, "step": 1257 }, { "epoch": 3.185044359949303, "grad_norm": 0.38032597303390503, "learning_rate": 7.256997455470739e-05, "loss": 0.1773, "step": 1258 }, { "epoch": 3.1875792141951838, "grad_norm": 0.5732728838920593, "learning_rate": 7.246819338422393e-05, "loss": 0.2047, "step": 1259 }, { "epoch": 3.1901140684410647, "grad_norm": 0.47396236658096313, "learning_rate": 7.236641221374047e-05, "loss": 0.2095, "step": 1260 }, { "epoch": 3.1926489226869457, "grad_norm": 0.4764629304409027, "learning_rate": 7.226463104325701e-05, "loss": 0.1802, "step": 1261 }, { "epoch": 3.195183776932826, "grad_norm": 0.5802401304244995, "learning_rate": 7.216284987277355e-05, "loss": 0.1821, "step": 1262 }, { "epoch": 3.197718631178707, "grad_norm": 0.47988972067832947, "learning_rate": 7.206106870229009e-05, "loss": 0.163, "step": 1263 }, { "epoch": 3.200253485424588, "grad_norm": 0.48500359058380127, "learning_rate": 7.195928753180661e-05, "loss": 0.1739, "step": 1264 }, { "epoch": 3.202788339670469, "grad_norm": 0.7479031682014465, "learning_rate": 7.185750636132315e-05, "loss": 0.2646, "step": 1265 }, { "epoch": 3.20532319391635, "grad_norm": 0.48695701360702515, "learning_rate": 7.175572519083969e-05, "loss": 0.1822, "step": 1266 }, { "epoch": 3.2078580481622305, "grad_norm": 0.712354838848114, "learning_rate": 7.165394402035623e-05, "loss": 0.1827, "step": 1267 }, { "epoch": 3.2103929024081115, "grad_norm": 0.4304606020450592, "learning_rate": 7.155216284987277e-05, "loss": 0.1759, "step": 1268 }, { "epoch": 3.2129277566539924, "grad_norm": 0.44741392135620117, "learning_rate": 7.145038167938931e-05, "loss": 0.1979, "step": 1269 }, { "epoch": 3.2154626108998734, "grad_norm": 0.3691045045852661, "learning_rate": 7.134860050890586e-05, "loss": 0.1575, "step": 1270 }, { "epoch": 3.2179974651457544, "grad_norm": 0.4908023476600647, "learning_rate": 7.12468193384224e-05, "loss": 0.1854, "step": 1271 }, { "epoch": 3.220532319391635, "grad_norm": 0.3953510820865631, "learning_rate": 7.114503816793894e-05, "loss": 0.1821, "step": 1272 }, { "epoch": 3.223067173637516, "grad_norm": 0.35227248072624207, "learning_rate": 7.104325699745548e-05, "loss": 0.173, "step": 1273 }, { "epoch": 3.225602027883397, "grad_norm": 0.41285187005996704, "learning_rate": 7.094147582697202e-05, "loss": 0.1708, "step": 1274 }, { "epoch": 3.2281368821292777, "grad_norm": 0.5076828002929688, "learning_rate": 7.083969465648856e-05, "loss": 0.2128, "step": 1275 }, { "epoch": 3.2306717363751583, "grad_norm": 0.5385151505470276, "learning_rate": 7.07379134860051e-05, "loss": 0.2181, "step": 1276 }, { "epoch": 3.233206590621039, "grad_norm": 0.4620850086212158, "learning_rate": 7.063613231552164e-05, "loss": 0.212, "step": 1277 }, { "epoch": 3.23574144486692, "grad_norm": 0.6768701672554016, "learning_rate": 7.053435114503816e-05, "loss": 0.2704, "step": 1278 }, { "epoch": 3.238276299112801, "grad_norm": 0.43216967582702637, "learning_rate": 7.04325699745547e-05, "loss": 0.1633, "step": 1279 }, { "epoch": 3.240811153358682, "grad_norm": 0.3756103813648224, "learning_rate": 7.033078880407124e-05, "loss": 0.1767, "step": 1280 }, { "epoch": 3.2433460076045626, "grad_norm": 0.612819254398346, "learning_rate": 7.022900763358778e-05, "loss": 0.2563, "step": 1281 }, { "epoch": 3.2458808618504436, "grad_norm": 0.5477813482284546, "learning_rate": 7.012722646310433e-05, "loss": 0.2053, "step": 1282 }, { "epoch": 3.2484157160963245, "grad_norm": 0.3412390351295471, "learning_rate": 7.002544529262087e-05, "loss": 0.1506, "step": 1283 }, { "epoch": 3.2509505703422055, "grad_norm": 0.34337860345840454, "learning_rate": 6.992366412213741e-05, "loss": 0.1612, "step": 1284 }, { "epoch": 3.253485424588086, "grad_norm": 0.37943509221076965, "learning_rate": 6.982188295165395e-05, "loss": 0.168, "step": 1285 }, { "epoch": 3.256020278833967, "grad_norm": 0.6030418872833252, "learning_rate": 6.972010178117049e-05, "loss": 0.2146, "step": 1286 }, { "epoch": 3.258555133079848, "grad_norm": 0.34367507696151733, "learning_rate": 6.961832061068703e-05, "loss": 0.1726, "step": 1287 }, { "epoch": 3.261089987325729, "grad_norm": 0.3952295780181885, "learning_rate": 6.951653944020357e-05, "loss": 0.1754, "step": 1288 }, { "epoch": 3.26362484157161, "grad_norm": 0.5151681900024414, "learning_rate": 6.941475826972011e-05, "loss": 0.1849, "step": 1289 }, { "epoch": 3.2661596958174903, "grad_norm": 0.496988445520401, "learning_rate": 6.931297709923665e-05, "loss": 0.1938, "step": 1290 }, { "epoch": 3.2686945500633713, "grad_norm": 0.45343711972236633, "learning_rate": 6.921119592875319e-05, "loss": 0.1845, "step": 1291 }, { "epoch": 3.2712294043092522, "grad_norm": 0.5323635935783386, "learning_rate": 6.910941475826971e-05, "loss": 0.177, "step": 1292 }, { "epoch": 3.273764258555133, "grad_norm": 0.39680036902427673, "learning_rate": 6.900763358778625e-05, "loss": 0.1843, "step": 1293 }, { "epoch": 3.2762991128010137, "grad_norm": 0.4767110049724579, "learning_rate": 6.89058524173028e-05, "loss": 0.2103, "step": 1294 }, { "epoch": 3.2788339670468947, "grad_norm": 0.5565052032470703, "learning_rate": 6.880407124681934e-05, "loss": 0.2185, "step": 1295 }, { "epoch": 3.2813688212927756, "grad_norm": 0.5472534894943237, "learning_rate": 6.870229007633588e-05, "loss": 0.2237, "step": 1296 }, { "epoch": 3.2839036755386566, "grad_norm": 0.632560133934021, "learning_rate": 6.860050890585242e-05, "loss": 0.2213, "step": 1297 }, { "epoch": 3.2864385297845375, "grad_norm": 0.5626386404037476, "learning_rate": 6.849872773536896e-05, "loss": 0.2324, "step": 1298 }, { "epoch": 3.288973384030418, "grad_norm": 0.5527671575546265, "learning_rate": 6.83969465648855e-05, "loss": 0.227, "step": 1299 }, { "epoch": 3.291508238276299, "grad_norm": 0.6093178391456604, "learning_rate": 6.829516539440204e-05, "loss": 0.2368, "step": 1300 }, { "epoch": 3.29404309252218, "grad_norm": 0.3845243453979492, "learning_rate": 6.819338422391858e-05, "loss": 0.1804, "step": 1301 }, { "epoch": 3.296577946768061, "grad_norm": 0.6384890079498291, "learning_rate": 6.809160305343512e-05, "loss": 0.2598, "step": 1302 }, { "epoch": 3.299112801013942, "grad_norm": 0.5135822892189026, "learning_rate": 6.798982188295166e-05, "loss": 0.2142, "step": 1303 }, { "epoch": 3.3016476552598224, "grad_norm": 0.4996071457862854, "learning_rate": 6.78880407124682e-05, "loss": 0.2107, "step": 1304 }, { "epoch": 3.3041825095057034, "grad_norm": 0.31445005536079407, "learning_rate": 6.778625954198474e-05, "loss": 0.1764, "step": 1305 }, { "epoch": 3.3067173637515843, "grad_norm": 0.544301450252533, "learning_rate": 6.768447837150128e-05, "loss": 0.2856, "step": 1306 }, { "epoch": 3.3092522179974653, "grad_norm": 0.5029551982879639, "learning_rate": 6.758269720101782e-05, "loss": 0.2374, "step": 1307 }, { "epoch": 3.3117870722433462, "grad_norm": 0.3769523799419403, "learning_rate": 6.748091603053436e-05, "loss": 0.1853, "step": 1308 }, { "epoch": 3.3143219264892267, "grad_norm": 0.3540287911891937, "learning_rate": 6.73791348600509e-05, "loss": 0.193, "step": 1309 }, { "epoch": 3.3168567807351077, "grad_norm": 0.42674198746681213, "learning_rate": 6.727735368956743e-05, "loss": 0.1953, "step": 1310 }, { "epoch": 3.3193916349809887, "grad_norm": 0.5152068138122559, "learning_rate": 6.717557251908397e-05, "loss": 0.1871, "step": 1311 }, { "epoch": 3.3219264892268696, "grad_norm": 0.48964372277259827, "learning_rate": 6.707379134860051e-05, "loss": 0.2142, "step": 1312 }, { "epoch": 3.32446134347275, "grad_norm": 0.5390191674232483, "learning_rate": 6.697201017811705e-05, "loss": 0.1764, "step": 1313 }, { "epoch": 3.326996197718631, "grad_norm": 0.3849482238292694, "learning_rate": 6.687022900763359e-05, "loss": 0.1681, "step": 1314 }, { "epoch": 3.329531051964512, "grad_norm": 0.36165010929107666, "learning_rate": 6.676844783715013e-05, "loss": 0.148, "step": 1315 }, { "epoch": 3.332065906210393, "grad_norm": 0.47739362716674805, "learning_rate": 6.666666666666667e-05, "loss": 0.1748, "step": 1316 }, { "epoch": 3.334600760456274, "grad_norm": 0.41228094696998596, "learning_rate": 6.656488549618321e-05, "loss": 0.2006, "step": 1317 }, { "epoch": 3.3371356147021545, "grad_norm": 0.43494951725006104, "learning_rate": 6.646310432569975e-05, "loss": 0.1821, "step": 1318 }, { "epoch": 3.3396704689480354, "grad_norm": 0.5502039194107056, "learning_rate": 6.636132315521629e-05, "loss": 0.208, "step": 1319 }, { "epoch": 3.3422053231939164, "grad_norm": 0.5151738524436951, "learning_rate": 6.625954198473283e-05, "loss": 0.2304, "step": 1320 }, { "epoch": 3.3447401774397973, "grad_norm": 0.3866114914417267, "learning_rate": 6.615776081424937e-05, "loss": 0.1738, "step": 1321 }, { "epoch": 3.347275031685678, "grad_norm": 0.5542702674865723, "learning_rate": 6.60559796437659e-05, "loss": 0.1885, "step": 1322 }, { "epoch": 3.349809885931559, "grad_norm": 0.5107680559158325, "learning_rate": 6.595419847328245e-05, "loss": 0.1856, "step": 1323 }, { "epoch": 3.3523447401774398, "grad_norm": 0.8266568183898926, "learning_rate": 6.585241730279898e-05, "loss": 0.2826, "step": 1324 }, { "epoch": 3.3548795944233207, "grad_norm": 0.45209088921546936, "learning_rate": 6.575063613231552e-05, "loss": 0.1519, "step": 1325 }, { "epoch": 3.3574144486692017, "grad_norm": 0.4708397388458252, "learning_rate": 6.564885496183206e-05, "loss": 0.1834, "step": 1326 }, { "epoch": 3.359949302915082, "grad_norm": 0.39958736300468445, "learning_rate": 6.55470737913486e-05, "loss": 0.1444, "step": 1327 }, { "epoch": 3.362484157160963, "grad_norm": 0.5764468312263489, "learning_rate": 6.544529262086514e-05, "loss": 0.2024, "step": 1328 }, { "epoch": 3.365019011406844, "grad_norm": 0.4573269188404083, "learning_rate": 6.534351145038168e-05, "loss": 0.1857, "step": 1329 }, { "epoch": 3.367553865652725, "grad_norm": 0.598423957824707, "learning_rate": 6.524173027989822e-05, "loss": 0.2206, "step": 1330 }, { "epoch": 3.3700887198986056, "grad_norm": 0.5643012523651123, "learning_rate": 6.513994910941476e-05, "loss": 0.157, "step": 1331 }, { "epoch": 3.3726235741444865, "grad_norm": 0.6568096876144409, "learning_rate": 6.50381679389313e-05, "loss": 0.2588, "step": 1332 }, { "epoch": 3.3751584283903675, "grad_norm": 0.6552339792251587, "learning_rate": 6.493638676844784e-05, "loss": 0.2032, "step": 1333 }, { "epoch": 3.3776932826362485, "grad_norm": 0.5274556279182434, "learning_rate": 6.483460559796438e-05, "loss": 0.1877, "step": 1334 }, { "epoch": 3.3802281368821294, "grad_norm": 0.43894869089126587, "learning_rate": 6.473282442748092e-05, "loss": 0.155, "step": 1335 }, { "epoch": 3.3827629911280104, "grad_norm": 0.6116171479225159, "learning_rate": 6.463104325699746e-05, "loss": 0.2978, "step": 1336 }, { "epoch": 3.385297845373891, "grad_norm": 0.4588301479816437, "learning_rate": 6.4529262086514e-05, "loss": 0.1765, "step": 1337 }, { "epoch": 3.387832699619772, "grad_norm": 0.4299813508987427, "learning_rate": 6.442748091603053e-05, "loss": 0.1725, "step": 1338 }, { "epoch": 3.390367553865653, "grad_norm": 0.4996776580810547, "learning_rate": 6.432569974554707e-05, "loss": 0.1815, "step": 1339 }, { "epoch": 3.3929024081115338, "grad_norm": 0.42195963859558105, "learning_rate": 6.422391857506361e-05, "loss": 0.1544, "step": 1340 }, { "epoch": 3.3954372623574143, "grad_norm": 0.3918668031692505, "learning_rate": 6.412213740458015e-05, "loss": 0.1677, "step": 1341 }, { "epoch": 3.3979721166032952, "grad_norm": 0.5436106324195862, "learning_rate": 6.402035623409669e-05, "loss": 0.2624, "step": 1342 }, { "epoch": 3.400506970849176, "grad_norm": 0.5056617856025696, "learning_rate": 6.391857506361324e-05, "loss": 0.1735, "step": 1343 }, { "epoch": 3.403041825095057, "grad_norm": 0.497035950422287, "learning_rate": 6.381679389312978e-05, "loss": 0.192, "step": 1344 }, { "epoch": 3.405576679340938, "grad_norm": 0.4464019238948822, "learning_rate": 6.371501272264632e-05, "loss": 0.165, "step": 1345 }, { "epoch": 3.4081115335868186, "grad_norm": 0.3940610885620117, "learning_rate": 6.361323155216285e-05, "loss": 0.1698, "step": 1346 }, { "epoch": 3.4106463878326996, "grad_norm": 0.34197869896888733, "learning_rate": 6.351145038167939e-05, "loss": 0.1676, "step": 1347 }, { "epoch": 3.4131812420785805, "grad_norm": 0.5477511286735535, "learning_rate": 6.340966921119593e-05, "loss": 0.2913, "step": 1348 }, { "epoch": 3.4157160963244615, "grad_norm": 0.47384947538375854, "learning_rate": 6.330788804071247e-05, "loss": 0.1807, "step": 1349 }, { "epoch": 3.418250950570342, "grad_norm": 0.4805784821510315, "learning_rate": 6.3206106870229e-05, "loss": 0.1844, "step": 1350 }, { "epoch": 3.420785804816223, "grad_norm": 0.4914521276950836, "learning_rate": 6.310432569974555e-05, "loss": 0.21, "step": 1351 }, { "epoch": 3.423320659062104, "grad_norm": 0.42754796147346497, "learning_rate": 6.300254452926209e-05, "loss": 0.2003, "step": 1352 }, { "epoch": 3.425855513307985, "grad_norm": 0.5367889404296875, "learning_rate": 6.290076335877862e-05, "loss": 0.2126, "step": 1353 }, { "epoch": 3.428390367553866, "grad_norm": 0.5015621781349182, "learning_rate": 6.279898218829516e-05, "loss": 0.176, "step": 1354 }, { "epoch": 3.4309252217997463, "grad_norm": 0.4498123228549957, "learning_rate": 6.269720101781172e-05, "loss": 0.1963, "step": 1355 }, { "epoch": 3.4334600760456273, "grad_norm": 0.4548507034778595, "learning_rate": 6.259541984732826e-05, "loss": 0.185, "step": 1356 }, { "epoch": 3.4359949302915083, "grad_norm": 0.5188789963722229, "learning_rate": 6.24936386768448e-05, "loss": 0.2152, "step": 1357 }, { "epoch": 3.4385297845373892, "grad_norm": 0.5717540979385376, "learning_rate": 6.239185750636133e-05, "loss": 0.2541, "step": 1358 }, { "epoch": 3.4410646387832697, "grad_norm": 0.43195176124572754, "learning_rate": 6.229007633587787e-05, "loss": 0.1841, "step": 1359 }, { "epoch": 3.4435994930291507, "grad_norm": 0.8148223161697388, "learning_rate": 6.21882951653944e-05, "loss": 0.1903, "step": 1360 }, { "epoch": 3.4461343472750317, "grad_norm": 0.39928868412971497, "learning_rate": 6.208651399491094e-05, "loss": 0.1551, "step": 1361 }, { "epoch": 3.4486692015209126, "grad_norm": 0.8072621822357178, "learning_rate": 6.198473282442748e-05, "loss": 0.1973, "step": 1362 }, { "epoch": 3.4512040557667936, "grad_norm": 0.6420927047729492, "learning_rate": 6.188295165394402e-05, "loss": 0.2304, "step": 1363 }, { "epoch": 3.453738910012674, "grad_norm": 0.4896611273288727, "learning_rate": 6.178117048346056e-05, "loss": 0.1968, "step": 1364 }, { "epoch": 3.456273764258555, "grad_norm": 0.5518379211425781, "learning_rate": 6.16793893129771e-05, "loss": 0.2136, "step": 1365 }, { "epoch": 3.458808618504436, "grad_norm": 0.35489922761917114, "learning_rate": 6.157760814249364e-05, "loss": 0.1735, "step": 1366 }, { "epoch": 3.461343472750317, "grad_norm": 0.3575512766838074, "learning_rate": 6.147582697201019e-05, "loss": 0.1704, "step": 1367 }, { "epoch": 3.463878326996198, "grad_norm": 0.46745261549949646, "learning_rate": 6.137404580152673e-05, "loss": 0.1702, "step": 1368 }, { "epoch": 3.4664131812420784, "grad_norm": 0.39378833770751953, "learning_rate": 6.127226463104327e-05, "loss": 0.1512, "step": 1369 }, { "epoch": 3.4689480354879594, "grad_norm": 0.5645838975906372, "learning_rate": 6.11704834605598e-05, "loss": 0.2053, "step": 1370 }, { "epoch": 3.4714828897338403, "grad_norm": 0.3613208830356598, "learning_rate": 6.106870229007635e-05, "loss": 0.1749, "step": 1371 }, { "epoch": 3.4740177439797213, "grad_norm": 0.573124349117279, "learning_rate": 6.096692111959288e-05, "loss": 0.2229, "step": 1372 }, { "epoch": 3.4765525982256023, "grad_norm": 0.43110212683677673, "learning_rate": 6.086513994910942e-05, "loss": 0.2082, "step": 1373 }, { "epoch": 3.4790874524714828, "grad_norm": 0.6268284320831299, "learning_rate": 6.076335877862596e-05, "loss": 0.2826, "step": 1374 }, { "epoch": 3.4816223067173637, "grad_norm": 0.5699491500854492, "learning_rate": 6.0661577608142496e-05, "loss": 0.2373, "step": 1375 }, { "epoch": 3.4841571609632447, "grad_norm": 0.451548308134079, "learning_rate": 6.0559796437659035e-05, "loss": 0.1782, "step": 1376 }, { "epoch": 3.4866920152091256, "grad_norm": 0.44955211877822876, "learning_rate": 6.0458015267175575e-05, "loss": 0.1896, "step": 1377 }, { "epoch": 3.489226869455006, "grad_norm": 0.44076019525527954, "learning_rate": 6.035623409669211e-05, "loss": 0.1854, "step": 1378 }, { "epoch": 3.491761723700887, "grad_norm": 0.8012815117835999, "learning_rate": 6.0254452926208646e-05, "loss": 0.2067, "step": 1379 }, { "epoch": 3.494296577946768, "grad_norm": 0.5558981895446777, "learning_rate": 6.01526717557252e-05, "loss": 0.1913, "step": 1380 }, { "epoch": 3.496831432192649, "grad_norm": 0.42501258850097656, "learning_rate": 6.005089058524174e-05, "loss": 0.1781, "step": 1381 }, { "epoch": 3.49936628643853, "grad_norm": 0.3618164658546448, "learning_rate": 5.994910941475828e-05, "loss": 0.1472, "step": 1382 }, { "epoch": 3.5019011406844105, "grad_norm": 0.5384409427642822, "learning_rate": 5.984732824427482e-05, "loss": 0.2063, "step": 1383 }, { "epoch": 3.5044359949302915, "grad_norm": 0.5103084444999695, "learning_rate": 5.974554707379135e-05, "loss": 0.1737, "step": 1384 }, { "epoch": 3.5069708491761724, "grad_norm": 0.37908968329429626, "learning_rate": 5.964376590330789e-05, "loss": 0.1599, "step": 1385 }, { "epoch": 3.5095057034220534, "grad_norm": 0.5049726963043213, "learning_rate": 5.954198473282443e-05, "loss": 0.1891, "step": 1386 }, { "epoch": 3.512040557667934, "grad_norm": 0.4436114430427551, "learning_rate": 5.944020356234097e-05, "loss": 0.1667, "step": 1387 }, { "epoch": 3.514575411913815, "grad_norm": 0.6733534336090088, "learning_rate": 5.933842239185751e-05, "loss": 0.2714, "step": 1388 }, { "epoch": 3.517110266159696, "grad_norm": 0.7258228659629822, "learning_rate": 5.9236641221374046e-05, "loss": 0.258, "step": 1389 }, { "epoch": 3.5196451204055768, "grad_norm": 0.6425923705101013, "learning_rate": 5.9134860050890586e-05, "loss": 0.1791, "step": 1390 }, { "epoch": 3.5221799746514577, "grad_norm": 0.45786988735198975, "learning_rate": 5.9033078880407125e-05, "loss": 0.1989, "step": 1391 }, { "epoch": 3.5247148288973387, "grad_norm": 0.43258994817733765, "learning_rate": 5.893129770992367e-05, "loss": 0.166, "step": 1392 }, { "epoch": 3.527249683143219, "grad_norm": 0.36486050486564636, "learning_rate": 5.882951653944021e-05, "loss": 0.1634, "step": 1393 }, { "epoch": 3.5297845373891, "grad_norm": 0.5883339047431946, "learning_rate": 5.872773536895675e-05, "loss": 0.2236, "step": 1394 }, { "epoch": 3.532319391634981, "grad_norm": 0.6296584010124207, "learning_rate": 5.862595419847329e-05, "loss": 0.1866, "step": 1395 }, { "epoch": 3.5348542458808616, "grad_norm": 0.4262075126171112, "learning_rate": 5.852417302798983e-05, "loss": 0.1707, "step": 1396 }, { "epoch": 3.5373891001267426, "grad_norm": 0.459573894739151, "learning_rate": 5.842239185750637e-05, "loss": 0.1654, "step": 1397 }, { "epoch": 3.5399239543726235, "grad_norm": 0.47115570306777954, "learning_rate": 5.83206106870229e-05, "loss": 0.1936, "step": 1398 }, { "epoch": 3.5424588086185045, "grad_norm": 0.41362589597702026, "learning_rate": 5.821882951653944e-05, "loss": 0.1897, "step": 1399 }, { "epoch": 3.5449936628643854, "grad_norm": 0.4314422607421875, "learning_rate": 5.811704834605598e-05, "loss": 0.172, "step": 1400 }, { "epoch": 3.5475285171102664, "grad_norm": 0.48116129636764526, "learning_rate": 5.801526717557252e-05, "loss": 0.1721, "step": 1401 }, { "epoch": 3.550063371356147, "grad_norm": 0.3902725279331207, "learning_rate": 5.791348600508906e-05, "loss": 0.1886, "step": 1402 }, { "epoch": 3.552598225602028, "grad_norm": 0.37996864318847656, "learning_rate": 5.78117048346056e-05, "loss": 0.1705, "step": 1403 }, { "epoch": 3.555133079847909, "grad_norm": 0.589279294013977, "learning_rate": 5.770992366412214e-05, "loss": 0.1848, "step": 1404 }, { "epoch": 3.5576679340937893, "grad_norm": 0.4233790636062622, "learning_rate": 5.760814249363868e-05, "loss": 0.18, "step": 1405 }, { "epoch": 3.5602027883396703, "grad_norm": 0.3760955333709717, "learning_rate": 5.750636132315522e-05, "loss": 0.1743, "step": 1406 }, { "epoch": 3.5627376425855513, "grad_norm": 0.552793562412262, "learning_rate": 5.740458015267176e-05, "loss": 0.2315, "step": 1407 }, { "epoch": 3.565272496831432, "grad_norm": 0.5440211892127991, "learning_rate": 5.73027989821883e-05, "loss": 0.186, "step": 1408 }, { "epoch": 3.567807351077313, "grad_norm": 0.5183967351913452, "learning_rate": 5.720101781170484e-05, "loss": 0.1626, "step": 1409 }, { "epoch": 3.570342205323194, "grad_norm": 0.47962069511413574, "learning_rate": 5.709923664122138e-05, "loss": 0.1813, "step": 1410 }, { "epoch": 3.5728770595690746, "grad_norm": 0.8065668940544128, "learning_rate": 5.699745547073792e-05, "loss": 0.2537, "step": 1411 }, { "epoch": 3.5754119138149556, "grad_norm": 0.46018585562705994, "learning_rate": 5.689567430025445e-05, "loss": 0.1756, "step": 1412 }, { "epoch": 3.5779467680608366, "grad_norm": 0.5229590535163879, "learning_rate": 5.679389312977099e-05, "loss": 0.1873, "step": 1413 }, { "epoch": 3.5804816223067175, "grad_norm": 0.510209321975708, "learning_rate": 5.669211195928753e-05, "loss": 0.167, "step": 1414 }, { "epoch": 3.583016476552598, "grad_norm": 0.4264031648635864, "learning_rate": 5.659033078880407e-05, "loss": 0.1705, "step": 1415 }, { "epoch": 3.585551330798479, "grad_norm": 0.6208323240280151, "learning_rate": 5.648854961832062e-05, "loss": 0.2268, "step": 1416 }, { "epoch": 3.58808618504436, "grad_norm": 0.3730670213699341, "learning_rate": 5.6386768447837154e-05, "loss": 0.1676, "step": 1417 }, { "epoch": 3.590621039290241, "grad_norm": 0.52936190366745, "learning_rate": 5.628498727735369e-05, "loss": 0.2055, "step": 1418 }, { "epoch": 3.593155893536122, "grad_norm": 0.44800981879234314, "learning_rate": 5.618320610687023e-05, "loss": 0.1782, "step": 1419 }, { "epoch": 3.5956907477820024, "grad_norm": 0.37429654598236084, "learning_rate": 5.608142493638677e-05, "loss": 0.1566, "step": 1420 }, { "epoch": 3.5982256020278833, "grad_norm": 0.5618942975997925, "learning_rate": 5.597964376590331e-05, "loss": 0.2249, "step": 1421 }, { "epoch": 3.6007604562737643, "grad_norm": 0.6893648505210876, "learning_rate": 5.587786259541985e-05, "loss": 0.2104, "step": 1422 }, { "epoch": 3.6032953105196452, "grad_norm": 0.4185943603515625, "learning_rate": 5.577608142493639e-05, "loss": 0.1729, "step": 1423 }, { "epoch": 3.6058301647655258, "grad_norm": 0.46326011419296265, "learning_rate": 5.567430025445293e-05, "loss": 0.1888, "step": 1424 }, { "epoch": 3.6083650190114067, "grad_norm": 0.4564262628555298, "learning_rate": 5.557251908396947e-05, "loss": 0.1957, "step": 1425 }, { "epoch": 3.6108998732572877, "grad_norm": 0.654411256313324, "learning_rate": 5.5470737913486e-05, "loss": 0.2101, "step": 1426 }, { "epoch": 3.6134347275031686, "grad_norm": 0.4059501886367798, "learning_rate": 5.536895674300254e-05, "loss": 0.1638, "step": 1427 }, { "epoch": 3.6159695817490496, "grad_norm": 0.4155724346637726, "learning_rate": 5.526717557251909e-05, "loss": 0.1799, "step": 1428 }, { "epoch": 3.6185044359949305, "grad_norm": 0.4041290581226349, "learning_rate": 5.516539440203563e-05, "loss": 0.1755, "step": 1429 }, { "epoch": 3.621039290240811, "grad_norm": 0.3458746373653412, "learning_rate": 5.506361323155217e-05, "loss": 0.1474, "step": 1430 }, { "epoch": 3.623574144486692, "grad_norm": 0.5046303272247314, "learning_rate": 5.496183206106871e-05, "loss": 0.2554, "step": 1431 }, { "epoch": 3.626108998732573, "grad_norm": 0.4284549951553345, "learning_rate": 5.4860050890585244e-05, "loss": 0.1855, "step": 1432 }, { "epoch": 3.6286438529784535, "grad_norm": 0.5116839408874512, "learning_rate": 5.475826972010178e-05, "loss": 0.1777, "step": 1433 }, { "epoch": 3.6311787072243344, "grad_norm": 0.4303711950778961, "learning_rate": 5.465648854961832e-05, "loss": 0.1792, "step": 1434 }, { "epoch": 3.6337135614702154, "grad_norm": 0.4602053463459015, "learning_rate": 5.455470737913486e-05, "loss": 0.1716, "step": 1435 }, { "epoch": 3.6362484157160964, "grad_norm": 0.47606271505355835, "learning_rate": 5.44529262086514e-05, "loss": 0.2063, "step": 1436 }, { "epoch": 3.6387832699619773, "grad_norm": 0.5861607193946838, "learning_rate": 5.435114503816794e-05, "loss": 0.2133, "step": 1437 }, { "epoch": 3.6413181242078583, "grad_norm": 0.42663708329200745, "learning_rate": 5.424936386768448e-05, "loss": 0.1662, "step": 1438 }, { "epoch": 3.643852978453739, "grad_norm": 0.6255937218666077, "learning_rate": 5.414758269720102e-05, "loss": 0.1875, "step": 1439 }, { "epoch": 3.6463878326996197, "grad_norm": 0.5422307252883911, "learning_rate": 5.404580152671755e-05, "loss": 0.1624, "step": 1440 }, { "epoch": 3.6489226869455007, "grad_norm": 0.540477991104126, "learning_rate": 5.3944020356234104e-05, "loss": 0.2489, "step": 1441 }, { "epoch": 3.6514575411913817, "grad_norm": 0.5656100511550903, "learning_rate": 5.3842239185750643e-05, "loss": 0.2289, "step": 1442 }, { "epoch": 3.653992395437262, "grad_norm": 0.5202456712722778, "learning_rate": 5.374045801526718e-05, "loss": 0.23, "step": 1443 }, { "epoch": 3.656527249683143, "grad_norm": 0.5069813132286072, "learning_rate": 5.363867684478372e-05, "loss": 0.1845, "step": 1444 }, { "epoch": 3.659062103929024, "grad_norm": 0.5711066126823425, "learning_rate": 5.353689567430026e-05, "loss": 0.2076, "step": 1445 }, { "epoch": 3.661596958174905, "grad_norm": 0.5115897059440613, "learning_rate": 5.3435114503816794e-05, "loss": 0.1696, "step": 1446 }, { "epoch": 3.664131812420786, "grad_norm": 0.6119818687438965, "learning_rate": 5.333333333333333e-05, "loss": 0.1905, "step": 1447 }, { "epoch": 3.6666666666666665, "grad_norm": 0.7333729863166809, "learning_rate": 5.323155216284987e-05, "loss": 0.2208, "step": 1448 }, { "epoch": 3.6692015209125475, "grad_norm": 0.5657917857170105, "learning_rate": 5.312977099236641e-05, "loss": 0.218, "step": 1449 }, { "epoch": 3.6717363751584284, "grad_norm": 0.5568459033966064, "learning_rate": 5.302798982188295e-05, "loss": 0.1957, "step": 1450 }, { "epoch": 3.6742712294043094, "grad_norm": 0.40060222148895264, "learning_rate": 5.292620865139949e-05, "loss": 0.1634, "step": 1451 }, { "epoch": 3.67680608365019, "grad_norm": 0.5395296216011047, "learning_rate": 5.282442748091603e-05, "loss": 0.2284, "step": 1452 }, { "epoch": 3.679340937896071, "grad_norm": 0.395298570394516, "learning_rate": 5.2722646310432576e-05, "loss": 0.1717, "step": 1453 }, { "epoch": 3.681875792141952, "grad_norm": 0.4693946838378906, "learning_rate": 5.2620865139949115e-05, "loss": 0.1719, "step": 1454 }, { "epoch": 3.6844106463878328, "grad_norm": 0.5206104516983032, "learning_rate": 5.2519083969465654e-05, "loss": 0.2158, "step": 1455 }, { "epoch": 3.6869455006337137, "grad_norm": 0.5576691031455994, "learning_rate": 5.2417302798982194e-05, "loss": 0.2031, "step": 1456 }, { "epoch": 3.6894803548795947, "grad_norm": 0.5826637148857117, "learning_rate": 5.231552162849873e-05, "loss": 0.2785, "step": 1457 }, { "epoch": 3.692015209125475, "grad_norm": 0.5928865075111389, "learning_rate": 5.221374045801527e-05, "loss": 0.1765, "step": 1458 }, { "epoch": 3.694550063371356, "grad_norm": 0.5932832956314087, "learning_rate": 5.211195928753181e-05, "loss": 0.1767, "step": 1459 }, { "epoch": 3.697084917617237, "grad_norm": 0.4178262948989868, "learning_rate": 5.2010178117048344e-05, "loss": 0.1636, "step": 1460 }, { "epoch": 3.6996197718631176, "grad_norm": 0.6029627919197083, "learning_rate": 5.1908396946564884e-05, "loss": 0.2086, "step": 1461 }, { "epoch": 3.7021546261089986, "grad_norm": 0.48641863465309143, "learning_rate": 5.180661577608142e-05, "loss": 0.1613, "step": 1462 }, { "epoch": 3.7046894803548795, "grad_norm": 0.40176740288734436, "learning_rate": 5.170483460559796e-05, "loss": 0.1647, "step": 1463 }, { "epoch": 3.7072243346007605, "grad_norm": 0.42600035667419434, "learning_rate": 5.16030534351145e-05, "loss": 0.1818, "step": 1464 }, { "epoch": 3.7097591888466415, "grad_norm": 0.48061972856521606, "learning_rate": 5.150127226463105e-05, "loss": 0.187, "step": 1465 }, { "epoch": 3.7122940430925224, "grad_norm": 0.4085710346698761, "learning_rate": 5.139949109414759e-05, "loss": 0.1562, "step": 1466 }, { "epoch": 3.714828897338403, "grad_norm": 0.4378439486026764, "learning_rate": 5.1297709923664126e-05, "loss": 0.1723, "step": 1467 }, { "epoch": 3.717363751584284, "grad_norm": 0.5806863307952881, "learning_rate": 5.1195928753180665e-05, "loss": 0.2069, "step": 1468 }, { "epoch": 3.719898605830165, "grad_norm": 0.4711120128631592, "learning_rate": 5.1094147582697205e-05, "loss": 0.1851, "step": 1469 }, { "epoch": 3.7224334600760454, "grad_norm": 0.47227099537849426, "learning_rate": 5.0992366412213744e-05, "loss": 0.1885, "step": 1470 }, { "epoch": 3.7249683143219263, "grad_norm": 0.4405531585216522, "learning_rate": 5.0890585241730283e-05, "loss": 0.1662, "step": 1471 }, { "epoch": 3.7275031685678073, "grad_norm": 0.5168079733848572, "learning_rate": 5.078880407124682e-05, "loss": 0.2002, "step": 1472 }, { "epoch": 3.7300380228136882, "grad_norm": 0.3839830160140991, "learning_rate": 5.068702290076336e-05, "loss": 0.168, "step": 1473 }, { "epoch": 3.732572877059569, "grad_norm": 0.338012158870697, "learning_rate": 5.0585241730279895e-05, "loss": 0.1596, "step": 1474 }, { "epoch": 3.73510773130545, "grad_norm": 0.5466023087501526, "learning_rate": 5.0483460559796434e-05, "loss": 0.2379, "step": 1475 }, { "epoch": 3.7376425855513307, "grad_norm": 0.44543328881263733, "learning_rate": 5.038167938931297e-05, "loss": 0.1778, "step": 1476 }, { "epoch": 3.7401774397972116, "grad_norm": 0.4166903793811798, "learning_rate": 5.0279898218829526e-05, "loss": 0.1554, "step": 1477 }, { "epoch": 3.7427122940430926, "grad_norm": 0.3806212544441223, "learning_rate": 5.0178117048346065e-05, "loss": 0.1648, "step": 1478 }, { "epoch": 3.7452471482889735, "grad_norm": 0.5990723967552185, "learning_rate": 5.00763358778626e-05, "loss": 0.2348, "step": 1479 }, { "epoch": 3.747782002534854, "grad_norm": 0.715096116065979, "learning_rate": 4.997455470737914e-05, "loss": 0.2201, "step": 1480 }, { "epoch": 3.750316856780735, "grad_norm": 0.6297019124031067, "learning_rate": 4.9872773536895677e-05, "loss": 0.2398, "step": 1481 }, { "epoch": 3.752851711026616, "grad_norm": 0.6131380200386047, "learning_rate": 4.9770992366412216e-05, "loss": 0.2128, "step": 1482 }, { "epoch": 3.755386565272497, "grad_norm": 0.5018277764320374, "learning_rate": 4.9669211195928755e-05, "loss": 0.1913, "step": 1483 }, { "epoch": 3.757921419518378, "grad_norm": 0.516939103603363, "learning_rate": 4.9567430025445294e-05, "loss": 0.1958, "step": 1484 }, { "epoch": 3.7604562737642584, "grad_norm": 0.4485652446746826, "learning_rate": 4.9465648854961834e-05, "loss": 0.1678, "step": 1485 }, { "epoch": 3.7629911280101394, "grad_norm": 0.6227991580963135, "learning_rate": 4.936386768447838e-05, "loss": 0.2403, "step": 1486 }, { "epoch": 3.7655259822560203, "grad_norm": 0.42331916093826294, "learning_rate": 4.926208651399491e-05, "loss": 0.1673, "step": 1487 }, { "epoch": 3.7680608365019013, "grad_norm": 0.5072351098060608, "learning_rate": 4.916030534351145e-05, "loss": 0.204, "step": 1488 }, { "epoch": 3.770595690747782, "grad_norm": 0.445578008890152, "learning_rate": 4.905852417302799e-05, "loss": 0.1908, "step": 1489 }, { "epoch": 3.7731305449936627, "grad_norm": 0.49046698212623596, "learning_rate": 4.895674300254453e-05, "loss": 0.1615, "step": 1490 }, { "epoch": 3.7756653992395437, "grad_norm": 0.37768882513046265, "learning_rate": 4.885496183206107e-05, "loss": 0.1604, "step": 1491 }, { "epoch": 3.7782002534854247, "grad_norm": 0.38343289494514465, "learning_rate": 4.8753180661577616e-05, "loss": 0.1709, "step": 1492 }, { "epoch": 3.7807351077313056, "grad_norm": 0.4102202355861664, "learning_rate": 4.8651399491094155e-05, "loss": 0.1629, "step": 1493 }, { "epoch": 3.7832699619771866, "grad_norm": 0.4545007050037384, "learning_rate": 4.854961832061069e-05, "loss": 0.1709, "step": 1494 }, { "epoch": 3.785804816223067, "grad_norm": 0.48300206661224365, "learning_rate": 4.844783715012723e-05, "loss": 0.2211, "step": 1495 }, { "epoch": 3.788339670468948, "grad_norm": 0.5301868319511414, "learning_rate": 4.8346055979643766e-05, "loss": 0.2053, "step": 1496 }, { "epoch": 3.790874524714829, "grad_norm": 0.48716598749160767, "learning_rate": 4.8244274809160306e-05, "loss": 0.2392, "step": 1497 }, { "epoch": 3.7934093789607095, "grad_norm": 0.6201879978179932, "learning_rate": 4.8142493638676845e-05, "loss": 0.2267, "step": 1498 }, { "epoch": 3.7959442332065905, "grad_norm": 0.46254560351371765, "learning_rate": 4.804071246819339e-05, "loss": 0.1824, "step": 1499 }, { "epoch": 3.7984790874524714, "grad_norm": 0.6153382658958435, "learning_rate": 4.793893129770993e-05, "loss": 0.2095, "step": 1500 }, { "epoch": 3.8010139416983524, "grad_norm": 0.6054911613464355, "learning_rate": 4.783715012722646e-05, "loss": 0.2291, "step": 1501 }, { "epoch": 3.8035487959442333, "grad_norm": 0.3899902403354645, "learning_rate": 4.7735368956743e-05, "loss": 0.1507, "step": 1502 }, { "epoch": 3.8060836501901143, "grad_norm": 0.4634632170200348, "learning_rate": 4.763358778625954e-05, "loss": 0.1436, "step": 1503 }, { "epoch": 3.808618504435995, "grad_norm": 0.6829271912574768, "learning_rate": 4.753180661577608e-05, "loss": 0.2611, "step": 1504 }, { "epoch": 3.8111533586818758, "grad_norm": 0.553393542766571, "learning_rate": 4.743002544529263e-05, "loss": 0.1862, "step": 1505 }, { "epoch": 3.8136882129277567, "grad_norm": 0.4285520315170288, "learning_rate": 4.7328244274809166e-05, "loss": 0.1522, "step": 1506 }, { "epoch": 3.8162230671736372, "grad_norm": 0.5505307912826538, "learning_rate": 4.7226463104325705e-05, "loss": 0.2056, "step": 1507 }, { "epoch": 3.818757921419518, "grad_norm": 0.635071873664856, "learning_rate": 4.712468193384224e-05, "loss": 0.1899, "step": 1508 }, { "epoch": 3.821292775665399, "grad_norm": 0.4297153353691101, "learning_rate": 4.702290076335878e-05, "loss": 0.1632, "step": 1509 }, { "epoch": 3.82382762991128, "grad_norm": 0.5538508892059326, "learning_rate": 4.6921119592875317e-05, "loss": 0.1965, "step": 1510 }, { "epoch": 3.826362484157161, "grad_norm": 0.6736975908279419, "learning_rate": 4.681933842239186e-05, "loss": 0.2334, "step": 1511 }, { "epoch": 3.828897338403042, "grad_norm": 0.49381881952285767, "learning_rate": 4.67175572519084e-05, "loss": 0.2074, "step": 1512 }, { "epoch": 3.8314321926489225, "grad_norm": 0.4285455346107483, "learning_rate": 4.661577608142494e-05, "loss": 0.176, "step": 1513 }, { "epoch": 3.8339670468948035, "grad_norm": 0.5771308541297913, "learning_rate": 4.651399491094148e-05, "loss": 0.229, "step": 1514 }, { "epoch": 3.8365019011406845, "grad_norm": 0.4749429225921631, "learning_rate": 4.641221374045801e-05, "loss": 0.1968, "step": 1515 }, { "epoch": 3.8390367553865654, "grad_norm": 0.48094430565834045, "learning_rate": 4.631043256997455e-05, "loss": 0.1982, "step": 1516 }, { "epoch": 3.841571609632446, "grad_norm": 0.49878042936325073, "learning_rate": 4.62086513994911e-05, "loss": 0.1552, "step": 1517 }, { "epoch": 3.844106463878327, "grad_norm": 0.4872034192085266, "learning_rate": 4.610687022900764e-05, "loss": 0.1808, "step": 1518 }, { "epoch": 3.846641318124208, "grad_norm": 0.4905577600002289, "learning_rate": 4.600508905852418e-05, "loss": 0.1703, "step": 1519 }, { "epoch": 3.849176172370089, "grad_norm": 0.49980783462524414, "learning_rate": 4.5903307888040716e-05, "loss": 0.1727, "step": 1520 }, { "epoch": 3.8517110266159698, "grad_norm": 0.5426180958747864, "learning_rate": 4.5801526717557256e-05, "loss": 0.2192, "step": 1521 }, { "epoch": 3.8542458808618507, "grad_norm": 0.6399853825569153, "learning_rate": 4.569974554707379e-05, "loss": 0.2387, "step": 1522 }, { "epoch": 3.8567807351077312, "grad_norm": 0.5311464667320251, "learning_rate": 4.5597964376590334e-05, "loss": 0.1976, "step": 1523 }, { "epoch": 3.859315589353612, "grad_norm": 0.5433202981948853, "learning_rate": 4.5496183206106874e-05, "loss": 0.1916, "step": 1524 }, { "epoch": 3.861850443599493, "grad_norm": 0.4024597704410553, "learning_rate": 4.539440203562341e-05, "loss": 0.1643, "step": 1525 }, { "epoch": 3.8643852978453737, "grad_norm": 0.347566157579422, "learning_rate": 4.529262086513995e-05, "loss": 0.1676, "step": 1526 }, { "epoch": 3.8669201520912546, "grad_norm": 0.45405861735343933, "learning_rate": 4.519083969465649e-05, "loss": 0.1963, "step": 1527 }, { "epoch": 3.8694550063371356, "grad_norm": 0.6430472731590271, "learning_rate": 4.508905852417303e-05, "loss": 0.2322, "step": 1528 }, { "epoch": 3.8719898605830165, "grad_norm": 0.4391939043998718, "learning_rate": 4.498727735368957e-05, "loss": 0.1871, "step": 1529 }, { "epoch": 3.8745247148288975, "grad_norm": 0.47301623225212097, "learning_rate": 4.488549618320611e-05, "loss": 0.1549, "step": 1530 }, { "epoch": 3.8770595690747784, "grad_norm": 0.4237573742866516, "learning_rate": 4.478371501272265e-05, "loss": 0.1548, "step": 1531 }, { "epoch": 3.879594423320659, "grad_norm": 0.5859849452972412, "learning_rate": 4.468193384223919e-05, "loss": 0.2023, "step": 1532 }, { "epoch": 3.88212927756654, "grad_norm": 0.45050573348999023, "learning_rate": 4.458015267175573e-05, "loss": 0.165, "step": 1533 }, { "epoch": 3.884664131812421, "grad_norm": 0.5347339510917664, "learning_rate": 4.447837150127227e-05, "loss": 0.1854, "step": 1534 }, { "epoch": 3.8871989860583014, "grad_norm": 0.375836580991745, "learning_rate": 4.4376590330788806e-05, "loss": 0.152, "step": 1535 }, { "epoch": 3.8897338403041823, "grad_norm": 0.5403718948364258, "learning_rate": 4.4274809160305345e-05, "loss": 0.2065, "step": 1536 }, { "epoch": 3.8922686945500633, "grad_norm": 0.5624736547470093, "learning_rate": 4.4173027989821885e-05, "loss": 0.1857, "step": 1537 }, { "epoch": 3.8948035487959443, "grad_norm": 0.5971560478210449, "learning_rate": 4.4071246819338424e-05, "loss": 0.1928, "step": 1538 }, { "epoch": 3.897338403041825, "grad_norm": 0.5225517153739929, "learning_rate": 4.396946564885496e-05, "loss": 0.2054, "step": 1539 }, { "epoch": 3.899873257287706, "grad_norm": 0.47341519594192505, "learning_rate": 4.38676844783715e-05, "loss": 0.1786, "step": 1540 }, { "epoch": 3.9024081115335867, "grad_norm": 0.3734676241874695, "learning_rate": 4.376590330788805e-05, "loss": 0.1447, "step": 1541 }, { "epoch": 3.9049429657794676, "grad_norm": 0.5003755688667297, "learning_rate": 4.366412213740458e-05, "loss": 0.1734, "step": 1542 }, { "epoch": 3.9074778200253486, "grad_norm": 0.41165000200271606, "learning_rate": 4.356234096692112e-05, "loss": 0.172, "step": 1543 }, { "epoch": 3.9100126742712296, "grad_norm": 0.45096197724342346, "learning_rate": 4.346055979643766e-05, "loss": 0.1726, "step": 1544 }, { "epoch": 3.91254752851711, "grad_norm": 0.5445842146873474, "learning_rate": 4.33587786259542e-05, "loss": 0.206, "step": 1545 }, { "epoch": 3.915082382762991, "grad_norm": 0.5139321088790894, "learning_rate": 4.325699745547074e-05, "loss": 0.1803, "step": 1546 }, { "epoch": 3.917617237008872, "grad_norm": 0.5652433633804321, "learning_rate": 4.3155216284987285e-05, "loss": 0.2051, "step": 1547 }, { "epoch": 3.920152091254753, "grad_norm": 0.38091734051704407, "learning_rate": 4.3053435114503824e-05, "loss": 0.1541, "step": 1548 }, { "epoch": 3.922686945500634, "grad_norm": 0.3614705801010132, "learning_rate": 4.2951653944020356e-05, "loss": 0.147, "step": 1549 }, { "epoch": 3.9252217997465144, "grad_norm": 0.4551761746406555, "learning_rate": 4.2849872773536896e-05, "loss": 0.1685, "step": 1550 }, { "epoch": 3.9277566539923954, "grad_norm": 0.5226624011993408, "learning_rate": 4.2748091603053435e-05, "loss": 0.1727, "step": 1551 }, { "epoch": 3.9302915082382763, "grad_norm": 0.3541867136955261, "learning_rate": 4.2646310432569974e-05, "loss": 0.1488, "step": 1552 }, { "epoch": 3.9328263624841573, "grad_norm": 0.4599204659461975, "learning_rate": 4.254452926208652e-05, "loss": 0.1536, "step": 1553 }, { "epoch": 3.935361216730038, "grad_norm": 0.45082637667655945, "learning_rate": 4.244274809160306e-05, "loss": 0.1671, "step": 1554 }, { "epoch": 3.9378960709759188, "grad_norm": 0.6053276658058167, "learning_rate": 4.23409669211196e-05, "loss": 0.2043, "step": 1555 }, { "epoch": 3.9404309252217997, "grad_norm": 0.506443440914154, "learning_rate": 4.223918575063613e-05, "loss": 0.1893, "step": 1556 }, { "epoch": 3.9429657794676807, "grad_norm": 0.6029784679412842, "learning_rate": 4.213740458015267e-05, "loss": 0.201, "step": 1557 }, { "epoch": 3.9455006337135616, "grad_norm": 0.3993350863456726, "learning_rate": 4.203562340966921e-05, "loss": 0.1637, "step": 1558 }, { "epoch": 3.9480354879594426, "grad_norm": 0.5887712836265564, "learning_rate": 4.193384223918575e-05, "loss": 0.2207, "step": 1559 }, { "epoch": 3.950570342205323, "grad_norm": 0.5538966059684753, "learning_rate": 4.1832061068702296e-05, "loss": 0.1674, "step": 1560 }, { "epoch": 3.953105196451204, "grad_norm": 0.4831174910068512, "learning_rate": 4.1730279898218835e-05, "loss": 0.1694, "step": 1561 }, { "epoch": 3.955640050697085, "grad_norm": 0.39700761437416077, "learning_rate": 4.1628498727735374e-05, "loss": 0.1695, "step": 1562 }, { "epoch": 3.9581749049429655, "grad_norm": 0.5388202667236328, "learning_rate": 4.152671755725191e-05, "loss": 0.1769, "step": 1563 }, { "epoch": 3.9607097591888465, "grad_norm": 0.5717085599899292, "learning_rate": 4.1424936386768446e-05, "loss": 0.2602, "step": 1564 }, { "epoch": 3.9632446134347274, "grad_norm": 0.4135623872280121, "learning_rate": 4.1323155216284985e-05, "loss": 0.1512, "step": 1565 }, { "epoch": 3.9657794676806084, "grad_norm": 0.478411465883255, "learning_rate": 4.122137404580153e-05, "loss": 0.1967, "step": 1566 }, { "epoch": 3.9683143219264894, "grad_norm": 0.4836915135383606, "learning_rate": 4.111959287531807e-05, "loss": 0.2297, "step": 1567 }, { "epoch": 3.9708491761723703, "grad_norm": 0.6355355978012085, "learning_rate": 4.101781170483461e-05, "loss": 0.2291, "step": 1568 }, { "epoch": 3.973384030418251, "grad_norm": 0.42811089754104614, "learning_rate": 4.091603053435115e-05, "loss": 0.1518, "step": 1569 }, { "epoch": 3.975918884664132, "grad_norm": 0.5778828859329224, "learning_rate": 4.081424936386768e-05, "loss": 0.1638, "step": 1570 }, { "epoch": 3.9784537389100127, "grad_norm": 0.4650358259677887, "learning_rate": 4.071246819338422e-05, "loss": 0.1658, "step": 1571 }, { "epoch": 3.9809885931558933, "grad_norm": 0.5939072966575623, "learning_rate": 4.061068702290077e-05, "loss": 0.2276, "step": 1572 }, { "epoch": 3.983523447401774, "grad_norm": 0.5296881794929504, "learning_rate": 4.050890585241731e-05, "loss": 0.1895, "step": 1573 }, { "epoch": 3.986058301647655, "grad_norm": 0.4479645788669586, "learning_rate": 4.0407124681933846e-05, "loss": 0.168, "step": 1574 }, { "epoch": 3.988593155893536, "grad_norm": 0.6041486859321594, "learning_rate": 4.0305343511450385e-05, "loss": 0.2225, "step": 1575 }, { "epoch": 3.991128010139417, "grad_norm": 1.0764771699905396, "learning_rate": 4.0203562340966925e-05, "loss": 0.1736, "step": 1576 }, { "epoch": 3.993662864385298, "grad_norm": 0.4830266535282135, "learning_rate": 4.010178117048346e-05, "loss": 0.2017, "step": 1577 }, { "epoch": 3.9961977186311786, "grad_norm": 0.4032004773616791, "learning_rate": 4e-05, "loss": 0.1723, "step": 1578 }, { "epoch": 3.9987325728770595, "grad_norm": 0.4441380798816681, "learning_rate": 3.989821882951654e-05, "loss": 0.1714, "step": 1579 }, { "epoch": 4.0, "grad_norm": 0.673060953617096, "learning_rate": 3.979643765903308e-05, "loss": 0.1651, "step": 1580 }, { "epoch": 4.002534854245881, "grad_norm": 0.5185714960098267, "learning_rate": 3.969465648854962e-05, "loss": 0.1877, "step": 1581 }, { "epoch": 4.005069708491762, "grad_norm": 0.4302978217601776, "learning_rate": 3.959287531806616e-05, "loss": 0.1575, "step": 1582 }, { "epoch": 4.007604562737643, "grad_norm": 0.45982813835144043, "learning_rate": 3.94910941475827e-05, "loss": 0.1615, "step": 1583 }, { "epoch": 4.010139416983524, "grad_norm": 0.4118313789367676, "learning_rate": 3.938931297709924e-05, "loss": 0.1508, "step": 1584 }, { "epoch": 4.012674271229404, "grad_norm": 0.6039855480194092, "learning_rate": 3.928753180661578e-05, "loss": 0.1782, "step": 1585 }, { "epoch": 4.015209125475285, "grad_norm": 0.4311355948448181, "learning_rate": 3.918575063613232e-05, "loss": 0.1488, "step": 1586 }, { "epoch": 4.017743979721166, "grad_norm": 0.7398537993431091, "learning_rate": 3.908396946564886e-05, "loss": 0.1879, "step": 1587 }, { "epoch": 4.020278833967047, "grad_norm": 0.37064164876937866, "learning_rate": 3.8982188295165396e-05, "loss": 0.1257, "step": 1588 }, { "epoch": 4.022813688212928, "grad_norm": 0.46931344270706177, "learning_rate": 3.8880407124681936e-05, "loss": 0.1579, "step": 1589 }, { "epoch": 4.025348542458809, "grad_norm": 0.4544156789779663, "learning_rate": 3.8778625954198475e-05, "loss": 0.134, "step": 1590 }, { "epoch": 4.02788339670469, "grad_norm": 0.5562132000923157, "learning_rate": 3.8676844783715014e-05, "loss": 0.1488, "step": 1591 }, { "epoch": 4.030418250950571, "grad_norm": 0.5679481625556946, "learning_rate": 3.8575063613231554e-05, "loss": 0.1322, "step": 1592 }, { "epoch": 4.032953105196452, "grad_norm": 0.6101714372634888, "learning_rate": 3.847328244274809e-05, "loss": 0.1534, "step": 1593 }, { "epoch": 4.035487959442332, "grad_norm": 0.8060622215270996, "learning_rate": 3.837150127226463e-05, "loss": 0.1986, "step": 1594 }, { "epoch": 4.038022813688213, "grad_norm": 0.5501425266265869, "learning_rate": 3.826972010178117e-05, "loss": 0.1444, "step": 1595 }, { "epoch": 4.0405576679340935, "grad_norm": 0.5117461085319519, "learning_rate": 3.816793893129771e-05, "loss": 0.1259, "step": 1596 }, { "epoch": 4.0430925221799745, "grad_norm": 0.571770429611206, "learning_rate": 3.806615776081425e-05, "loss": 0.1413, "step": 1597 }, { "epoch": 4.0456273764258555, "grad_norm": 0.7756439447402954, "learning_rate": 3.796437659033079e-05, "loss": 0.1874, "step": 1598 }, { "epoch": 4.048162230671736, "grad_norm": 0.6393389701843262, "learning_rate": 3.786259541984733e-05, "loss": 0.1226, "step": 1599 }, { "epoch": 4.050697084917617, "grad_norm": 0.7177454233169556, "learning_rate": 3.776081424936387e-05, "loss": 0.1382, "step": 1600 }, { "epoch": 4.053231939163498, "grad_norm": 0.6561391353607178, "learning_rate": 3.765903307888041e-05, "loss": 0.1557, "step": 1601 }, { "epoch": 4.055766793409379, "grad_norm": 0.8319444060325623, "learning_rate": 3.7557251908396954e-05, "loss": 0.1608, "step": 1602 }, { "epoch": 4.05830164765526, "grad_norm": 0.7468693852424622, "learning_rate": 3.745547073791349e-05, "loss": 0.1442, "step": 1603 }, { "epoch": 4.06083650190114, "grad_norm": 0.623657763004303, "learning_rate": 3.7353689567430025e-05, "loss": 0.1395, "step": 1604 }, { "epoch": 4.063371356147021, "grad_norm": 0.5870152115821838, "learning_rate": 3.7251908396946565e-05, "loss": 0.1322, "step": 1605 }, { "epoch": 4.065906210392902, "grad_norm": 0.6840811371803284, "learning_rate": 3.7150127226463104e-05, "loss": 0.132, "step": 1606 }, { "epoch": 4.068441064638783, "grad_norm": 0.6177504658699036, "learning_rate": 3.704834605597964e-05, "loss": 0.1265, "step": 1607 }, { "epoch": 4.070975918884664, "grad_norm": 0.6908831000328064, "learning_rate": 3.694656488549619e-05, "loss": 0.1593, "step": 1608 }, { "epoch": 4.073510773130545, "grad_norm": 0.787434458732605, "learning_rate": 3.684478371501273e-05, "loss": 0.1184, "step": 1609 }, { "epoch": 4.076045627376426, "grad_norm": 0.8011195063591003, "learning_rate": 3.674300254452927e-05, "loss": 0.1341, "step": 1610 }, { "epoch": 4.078580481622307, "grad_norm": 0.5523831248283386, "learning_rate": 3.66412213740458e-05, "loss": 0.1283, "step": 1611 }, { "epoch": 4.081115335868188, "grad_norm": 0.6396963596343994, "learning_rate": 3.653944020356234e-05, "loss": 0.1424, "step": 1612 }, { "epoch": 4.083650190114068, "grad_norm": 0.7471883893013, "learning_rate": 3.643765903307888e-05, "loss": 0.1627, "step": 1613 }, { "epoch": 4.086185044359949, "grad_norm": 0.5498061776161194, "learning_rate": 3.633587786259542e-05, "loss": 0.1478, "step": 1614 }, { "epoch": 4.08871989860583, "grad_norm": 0.6853391528129578, "learning_rate": 3.6234096692111965e-05, "loss": 0.1588, "step": 1615 }, { "epoch": 4.091254752851711, "grad_norm": 0.6638361811637878, "learning_rate": 3.6132315521628504e-05, "loss": 0.1695, "step": 1616 }, { "epoch": 4.093789607097592, "grad_norm": 0.6155263781547546, "learning_rate": 3.603053435114504e-05, "loss": 0.1355, "step": 1617 }, { "epoch": 4.096324461343473, "grad_norm": 0.574590265750885, "learning_rate": 3.5928753180661576e-05, "loss": 0.1498, "step": 1618 }, { "epoch": 4.098859315589354, "grad_norm": 0.5972251296043396, "learning_rate": 3.5826972010178115e-05, "loss": 0.1684, "step": 1619 }, { "epoch": 4.101394169835235, "grad_norm": 0.668618381023407, "learning_rate": 3.5725190839694654e-05, "loss": 0.1377, "step": 1620 }, { "epoch": 4.103929024081116, "grad_norm": 0.6238232851028442, "learning_rate": 3.56234096692112e-05, "loss": 0.2025, "step": 1621 }, { "epoch": 4.106463878326996, "grad_norm": 0.9182467460632324, "learning_rate": 3.552162849872774e-05, "loss": 0.1539, "step": 1622 }, { "epoch": 4.108998732572877, "grad_norm": 0.6368919014930725, "learning_rate": 3.541984732824428e-05, "loss": 0.1421, "step": 1623 }, { "epoch": 4.111533586818758, "grad_norm": 0.7871132493019104, "learning_rate": 3.531806615776082e-05, "loss": 0.1482, "step": 1624 }, { "epoch": 4.114068441064639, "grad_norm": 0.7697343230247498, "learning_rate": 3.521628498727735e-05, "loss": 0.1607, "step": 1625 }, { "epoch": 4.11660329531052, "grad_norm": 0.5805296897888184, "learning_rate": 3.511450381679389e-05, "loss": 0.1497, "step": 1626 }, { "epoch": 4.119138149556401, "grad_norm": 0.6484183073043823, "learning_rate": 3.5012722646310436e-05, "loss": 0.1827, "step": 1627 }, { "epoch": 4.1216730038022815, "grad_norm": 1.0351064205169678, "learning_rate": 3.4910941475826976e-05, "loss": 0.2331, "step": 1628 }, { "epoch": 4.1242078580481625, "grad_norm": 0.620452344417572, "learning_rate": 3.4809160305343515e-05, "loss": 0.1516, "step": 1629 }, { "epoch": 4.126742712294043, "grad_norm": 0.6269112229347229, "learning_rate": 3.4707379134860054e-05, "loss": 0.1322, "step": 1630 }, { "epoch": 4.129277566539924, "grad_norm": 0.7780957221984863, "learning_rate": 3.4605597964376594e-05, "loss": 0.1974, "step": 1631 }, { "epoch": 4.1318124207858045, "grad_norm": 0.6183624267578125, "learning_rate": 3.4503816793893126e-05, "loss": 0.1423, "step": 1632 }, { "epoch": 4.134347275031685, "grad_norm": 0.715943455696106, "learning_rate": 3.440203562340967e-05, "loss": 0.1422, "step": 1633 }, { "epoch": 4.136882129277566, "grad_norm": 0.6383997201919556, "learning_rate": 3.430025445292621e-05, "loss": 0.1566, "step": 1634 }, { "epoch": 4.139416983523447, "grad_norm": 0.6354379653930664, "learning_rate": 3.419847328244275e-05, "loss": 0.14, "step": 1635 }, { "epoch": 4.141951837769328, "grad_norm": 0.5692049264907837, "learning_rate": 3.409669211195929e-05, "loss": 0.1315, "step": 1636 }, { "epoch": 4.144486692015209, "grad_norm": 0.5286855697631836, "learning_rate": 3.399491094147583e-05, "loss": 0.119, "step": 1637 }, { "epoch": 4.14702154626109, "grad_norm": 0.6007808446884155, "learning_rate": 3.389312977099237e-05, "loss": 0.1368, "step": 1638 }, { "epoch": 4.149556400506971, "grad_norm": 0.8727791905403137, "learning_rate": 3.379134860050891e-05, "loss": 0.1635, "step": 1639 }, { "epoch": 4.152091254752852, "grad_norm": 0.7203207015991211, "learning_rate": 3.368956743002545e-05, "loss": 0.1668, "step": 1640 }, { "epoch": 4.154626108998732, "grad_norm": 0.7178492546081543, "learning_rate": 3.358778625954199e-05, "loss": 0.1601, "step": 1641 }, { "epoch": 4.157160963244613, "grad_norm": 0.6133365035057068, "learning_rate": 3.3486005089058526e-05, "loss": 0.1438, "step": 1642 }, { "epoch": 4.159695817490494, "grad_norm": 0.690122127532959, "learning_rate": 3.3384223918575065e-05, "loss": 0.1592, "step": 1643 }, { "epoch": 4.162230671736375, "grad_norm": 0.5469484925270081, "learning_rate": 3.3282442748091605e-05, "loss": 0.1499, "step": 1644 }, { "epoch": 4.164765525982256, "grad_norm": 0.7380850911140442, "learning_rate": 3.3180661577608144e-05, "loss": 0.1724, "step": 1645 }, { "epoch": 4.167300380228137, "grad_norm": 0.6949165463447571, "learning_rate": 3.307888040712468e-05, "loss": 0.1642, "step": 1646 }, { "epoch": 4.169835234474018, "grad_norm": 0.6445840001106262, "learning_rate": 3.297709923664122e-05, "loss": 0.1576, "step": 1647 }, { "epoch": 4.172370088719899, "grad_norm": 0.577178418636322, "learning_rate": 3.287531806615776e-05, "loss": 0.1482, "step": 1648 }, { "epoch": 4.17490494296578, "grad_norm": 0.5232000350952148, "learning_rate": 3.27735368956743e-05, "loss": 0.1385, "step": 1649 }, { "epoch": 4.17743979721166, "grad_norm": 0.8429796695709229, "learning_rate": 3.267175572519084e-05, "loss": 0.2456, "step": 1650 }, { "epoch": 4.179974651457541, "grad_norm": 0.5647293925285339, "learning_rate": 3.256997455470738e-05, "loss": 0.1482, "step": 1651 }, { "epoch": 4.182509505703422, "grad_norm": 0.7679947018623352, "learning_rate": 3.246819338422392e-05, "loss": 0.1705, "step": 1652 }, { "epoch": 4.185044359949303, "grad_norm": 0.7913497686386108, "learning_rate": 3.236641221374046e-05, "loss": 0.2133, "step": 1653 }, { "epoch": 4.187579214195184, "grad_norm": 0.5105036497116089, "learning_rate": 3.2264631043257e-05, "loss": 0.1335, "step": 1654 }, { "epoch": 4.190114068441065, "grad_norm": 0.6503207087516785, "learning_rate": 3.216284987277354e-05, "loss": 0.1872, "step": 1655 }, { "epoch": 4.192648922686946, "grad_norm": 0.9579104781150818, "learning_rate": 3.2061068702290076e-05, "loss": 0.1985, "step": 1656 }, { "epoch": 4.195183776932827, "grad_norm": 0.5334345698356628, "learning_rate": 3.195928753180662e-05, "loss": 0.137, "step": 1657 }, { "epoch": 4.197718631178708, "grad_norm": 0.7031605243682861, "learning_rate": 3.185750636132316e-05, "loss": 0.1574, "step": 1658 }, { "epoch": 4.200253485424588, "grad_norm": 0.6237590909004211, "learning_rate": 3.1755725190839694e-05, "loss": 0.1686, "step": 1659 }, { "epoch": 4.202788339670469, "grad_norm": 0.827680230140686, "learning_rate": 3.1653944020356234e-05, "loss": 0.1765, "step": 1660 }, { "epoch": 4.20532319391635, "grad_norm": 0.6170578002929688, "learning_rate": 3.155216284987277e-05, "loss": 0.1699, "step": 1661 }, { "epoch": 4.2078580481622305, "grad_norm": 0.600803017616272, "learning_rate": 3.145038167938931e-05, "loss": 0.1345, "step": 1662 }, { "epoch": 4.2103929024081115, "grad_norm": 0.5505921840667725, "learning_rate": 3.134860050890586e-05, "loss": 0.1418, "step": 1663 }, { "epoch": 4.212927756653992, "grad_norm": 0.5893916487693787, "learning_rate": 3.12468193384224e-05, "loss": 0.1414, "step": 1664 }, { "epoch": 4.215462610899873, "grad_norm": 0.7622592449188232, "learning_rate": 3.114503816793894e-05, "loss": 0.1568, "step": 1665 }, { "epoch": 4.217997465145754, "grad_norm": 0.6462287306785583, "learning_rate": 3.104325699745547e-05, "loss": 0.1641, "step": 1666 }, { "epoch": 4.220532319391635, "grad_norm": 0.4971311092376709, "learning_rate": 3.094147582697201e-05, "loss": 0.1276, "step": 1667 }, { "epoch": 4.223067173637516, "grad_norm": 0.7270475029945374, "learning_rate": 3.083969465648855e-05, "loss": 0.1603, "step": 1668 }, { "epoch": 4.225602027883396, "grad_norm": 0.5765766501426697, "learning_rate": 3.0737913486005094e-05, "loss": 0.1341, "step": 1669 }, { "epoch": 4.228136882129277, "grad_norm": 0.577694296836853, "learning_rate": 3.0636132315521633e-05, "loss": 0.1415, "step": 1670 }, { "epoch": 4.230671736375158, "grad_norm": 0.6085098385810852, "learning_rate": 3.053435114503817e-05, "loss": 0.1359, "step": 1671 }, { "epoch": 4.233206590621039, "grad_norm": 0.6224119663238525, "learning_rate": 3.043256997455471e-05, "loss": 0.1494, "step": 1672 }, { "epoch": 4.23574144486692, "grad_norm": 0.4535973072052002, "learning_rate": 3.0330788804071248e-05, "loss": 0.1415, "step": 1673 }, { "epoch": 4.238276299112801, "grad_norm": 0.6283777356147766, "learning_rate": 3.0229007633587787e-05, "loss": 0.1569, "step": 1674 }, { "epoch": 4.240811153358682, "grad_norm": 0.6005566120147705, "learning_rate": 3.0127226463104323e-05, "loss": 0.1385, "step": 1675 }, { "epoch": 4.243346007604563, "grad_norm": 0.6437854766845703, "learning_rate": 3.002544529262087e-05, "loss": 0.1584, "step": 1676 }, { "epoch": 4.245880861850444, "grad_norm": 0.5184986591339111, "learning_rate": 2.992366412213741e-05, "loss": 0.1384, "step": 1677 }, { "epoch": 4.248415716096324, "grad_norm": 0.5969160199165344, "learning_rate": 2.9821882951653945e-05, "loss": 0.1609, "step": 1678 }, { "epoch": 4.250950570342205, "grad_norm": 0.85272616147995, "learning_rate": 2.9720101781170484e-05, "loss": 0.178, "step": 1679 }, { "epoch": 4.253485424588086, "grad_norm": 0.5351912379264832, "learning_rate": 2.9618320610687023e-05, "loss": 0.1465, "step": 1680 }, { "epoch": 4.256020278833967, "grad_norm": 0.5821883678436279, "learning_rate": 2.9516539440203562e-05, "loss": 0.135, "step": 1681 }, { "epoch": 4.258555133079848, "grad_norm": 0.5453548431396484, "learning_rate": 2.9414758269720105e-05, "loss": 0.1287, "step": 1682 }, { "epoch": 4.261089987325729, "grad_norm": 0.6280243396759033, "learning_rate": 2.9312977099236644e-05, "loss": 0.152, "step": 1683 }, { "epoch": 4.26362484157161, "grad_norm": 0.5709437131881714, "learning_rate": 2.9211195928753184e-05, "loss": 0.1487, "step": 1684 }, { "epoch": 4.266159695817491, "grad_norm": 0.4667048752307892, "learning_rate": 2.910941475826972e-05, "loss": 0.129, "step": 1685 }, { "epoch": 4.268694550063372, "grad_norm": 0.5744767189025879, "learning_rate": 2.900763358778626e-05, "loss": 0.1668, "step": 1686 }, { "epoch": 4.271229404309253, "grad_norm": 0.552631139755249, "learning_rate": 2.89058524173028e-05, "loss": 0.128, "step": 1687 }, { "epoch": 4.273764258555133, "grad_norm": 0.46616679430007935, "learning_rate": 2.880407124681934e-05, "loss": 0.1168, "step": 1688 }, { "epoch": 4.276299112801014, "grad_norm": 0.7842658758163452, "learning_rate": 2.870229007633588e-05, "loss": 0.1617, "step": 1689 }, { "epoch": 4.278833967046895, "grad_norm": 0.5530945062637329, "learning_rate": 2.860050890585242e-05, "loss": 0.1619, "step": 1690 }, { "epoch": 4.281368821292776, "grad_norm": 0.9341786503791809, "learning_rate": 2.849872773536896e-05, "loss": 0.231, "step": 1691 }, { "epoch": 4.283903675538657, "grad_norm": 0.8043704032897949, "learning_rate": 2.8396946564885495e-05, "loss": 0.1826, "step": 1692 }, { "epoch": 4.2864385297845375, "grad_norm": 0.4446638524532318, "learning_rate": 2.8295165394402034e-05, "loss": 0.1413, "step": 1693 }, { "epoch": 4.2889733840304185, "grad_norm": 0.6845833659172058, "learning_rate": 2.8193384223918577e-05, "loss": 0.1577, "step": 1694 }, { "epoch": 4.2915082382762995, "grad_norm": 0.6702572107315063, "learning_rate": 2.8091603053435116e-05, "loss": 0.1714, "step": 1695 }, { "epoch": 4.29404309252218, "grad_norm": 0.6405001282691956, "learning_rate": 2.7989821882951656e-05, "loss": 0.1527, "step": 1696 }, { "epoch": 4.2965779467680605, "grad_norm": 0.6155828833580017, "learning_rate": 2.7888040712468195e-05, "loss": 0.1471, "step": 1697 }, { "epoch": 4.299112801013941, "grad_norm": 0.5606924295425415, "learning_rate": 2.7786259541984734e-05, "loss": 0.1331, "step": 1698 }, { "epoch": 4.301647655259822, "grad_norm": 0.7498462200164795, "learning_rate": 2.768447837150127e-05, "loss": 0.1713, "step": 1699 }, { "epoch": 4.304182509505703, "grad_norm": 0.6262723803520203, "learning_rate": 2.7582697201017816e-05, "loss": 0.1585, "step": 1700 }, { "epoch": 4.306717363751584, "grad_norm": 0.6729116439819336, "learning_rate": 2.7480916030534355e-05, "loss": 0.1347, "step": 1701 }, { "epoch": 4.309252217997465, "grad_norm": 0.7870539426803589, "learning_rate": 2.737913486005089e-05, "loss": 0.1512, "step": 1702 }, { "epoch": 4.311787072243346, "grad_norm": 0.4943903684616089, "learning_rate": 2.727735368956743e-05, "loss": 0.1274, "step": 1703 }, { "epoch": 4.314321926489227, "grad_norm": 0.4763108193874359, "learning_rate": 2.717557251908397e-05, "loss": 0.1228, "step": 1704 }, { "epoch": 4.316856780735108, "grad_norm": 0.6400578618049622, "learning_rate": 2.707379134860051e-05, "loss": 0.1558, "step": 1705 }, { "epoch": 4.319391634980988, "grad_norm": 0.5445212125778198, "learning_rate": 2.6972010178117052e-05, "loss": 0.1328, "step": 1706 }, { "epoch": 4.321926489226869, "grad_norm": 0.6329374313354492, "learning_rate": 2.687022900763359e-05, "loss": 0.1615, "step": 1707 }, { "epoch": 4.32446134347275, "grad_norm": 0.5299343466758728, "learning_rate": 2.676844783715013e-05, "loss": 0.122, "step": 1708 }, { "epoch": 4.326996197718631, "grad_norm": 0.6486507058143616, "learning_rate": 2.6666666666666667e-05, "loss": 0.1553, "step": 1709 }, { "epoch": 4.329531051964512, "grad_norm": 0.6306889653205872, "learning_rate": 2.6564885496183206e-05, "loss": 0.1638, "step": 1710 }, { "epoch": 4.332065906210393, "grad_norm": 0.6417018175125122, "learning_rate": 2.6463104325699745e-05, "loss": 0.1404, "step": 1711 }, { "epoch": 4.334600760456274, "grad_norm": 0.7283552289009094, "learning_rate": 2.6361323155216288e-05, "loss": 0.1837, "step": 1712 }, { "epoch": 4.337135614702155, "grad_norm": 0.7142099142074585, "learning_rate": 2.6259541984732827e-05, "loss": 0.1535, "step": 1713 }, { "epoch": 4.339670468948036, "grad_norm": 0.6059632897377014, "learning_rate": 2.6157760814249367e-05, "loss": 0.1551, "step": 1714 }, { "epoch": 4.342205323193916, "grad_norm": 0.6492133140563965, "learning_rate": 2.6055979643765906e-05, "loss": 0.1413, "step": 1715 }, { "epoch": 4.344740177439797, "grad_norm": 0.7166099548339844, "learning_rate": 2.5954198473282442e-05, "loss": 0.1534, "step": 1716 }, { "epoch": 4.347275031685678, "grad_norm": 0.6357300877571106, "learning_rate": 2.585241730279898e-05, "loss": 0.1445, "step": 1717 }, { "epoch": 4.349809885931559, "grad_norm": 0.6684461236000061, "learning_rate": 2.5750636132315524e-05, "loss": 0.1469, "step": 1718 }, { "epoch": 4.35234474017744, "grad_norm": 0.7808713912963867, "learning_rate": 2.5648854961832063e-05, "loss": 0.1892, "step": 1719 }, { "epoch": 4.354879594423321, "grad_norm": 0.6660336852073669, "learning_rate": 2.5547073791348602e-05, "loss": 0.1545, "step": 1720 }, { "epoch": 4.357414448669202, "grad_norm": 0.7266603112220764, "learning_rate": 2.5445292620865142e-05, "loss": 0.1346, "step": 1721 }, { "epoch": 4.359949302915083, "grad_norm": 0.5710493326187134, "learning_rate": 2.534351145038168e-05, "loss": 0.1199, "step": 1722 }, { "epoch": 4.362484157160964, "grad_norm": 0.6178765296936035, "learning_rate": 2.5241730279898217e-05, "loss": 0.1416, "step": 1723 }, { "epoch": 4.365019011406844, "grad_norm": 0.5881832242012024, "learning_rate": 2.5139949109414763e-05, "loss": 0.1389, "step": 1724 }, { "epoch": 4.367553865652725, "grad_norm": 0.5589767694473267, "learning_rate": 2.50381679389313e-05, "loss": 0.1356, "step": 1725 }, { "epoch": 4.370088719898606, "grad_norm": 0.611072301864624, "learning_rate": 2.4936386768447838e-05, "loss": 0.1618, "step": 1726 }, { "epoch": 4.3726235741444865, "grad_norm": 1.0045723915100098, "learning_rate": 2.4834605597964378e-05, "loss": 0.2004, "step": 1727 }, { "epoch": 4.3751584283903675, "grad_norm": 1.0154621601104736, "learning_rate": 2.4732824427480917e-05, "loss": 0.1593, "step": 1728 }, { "epoch": 4.3776932826362485, "grad_norm": 0.7933842539787292, "learning_rate": 2.4631043256997456e-05, "loss": 0.183, "step": 1729 }, { "epoch": 4.380228136882129, "grad_norm": 0.8141732811927795, "learning_rate": 2.4529262086513996e-05, "loss": 0.1412, "step": 1730 }, { "epoch": 4.38276299112801, "grad_norm": 0.6575155854225159, "learning_rate": 2.4427480916030535e-05, "loss": 0.1592, "step": 1731 }, { "epoch": 4.385297845373891, "grad_norm": 0.7710108757019043, "learning_rate": 2.4325699745547078e-05, "loss": 0.2306, "step": 1732 }, { "epoch": 4.387832699619771, "grad_norm": 0.6438276767730713, "learning_rate": 2.4223918575063613e-05, "loss": 0.143, "step": 1733 }, { "epoch": 4.390367553865652, "grad_norm": 0.7019467949867249, "learning_rate": 2.4122137404580153e-05, "loss": 0.1641, "step": 1734 }, { "epoch": 4.392902408111533, "grad_norm": 0.598584771156311, "learning_rate": 2.4020356234096695e-05, "loss": 0.1456, "step": 1735 }, { "epoch": 4.395437262357414, "grad_norm": 0.6024305820465088, "learning_rate": 2.391857506361323e-05, "loss": 0.1287, "step": 1736 }, { "epoch": 4.397972116603295, "grad_norm": 0.8446558713912964, "learning_rate": 2.381679389312977e-05, "loss": 0.1705, "step": 1737 }, { "epoch": 4.400506970849176, "grad_norm": 0.5697831511497498, "learning_rate": 2.3715012722646313e-05, "loss": 0.1386, "step": 1738 }, { "epoch": 4.403041825095057, "grad_norm": 0.6655327677726746, "learning_rate": 2.3613231552162853e-05, "loss": 0.186, "step": 1739 }, { "epoch": 4.405576679340938, "grad_norm": 1.1001065969467163, "learning_rate": 2.351145038167939e-05, "loss": 0.2531, "step": 1740 }, { "epoch": 4.408111533586819, "grad_norm": 0.5302372574806213, "learning_rate": 2.340966921119593e-05, "loss": 0.1342, "step": 1741 }, { "epoch": 4.4106463878327, "grad_norm": 0.6450605392456055, "learning_rate": 2.330788804071247e-05, "loss": 0.1499, "step": 1742 }, { "epoch": 4.41318124207858, "grad_norm": 0.5733135342597961, "learning_rate": 2.3206106870229007e-05, "loss": 0.166, "step": 1743 }, { "epoch": 4.415716096324461, "grad_norm": 0.609865665435791, "learning_rate": 2.310432569974555e-05, "loss": 0.1306, "step": 1744 }, { "epoch": 4.418250950570342, "grad_norm": 0.5957082509994507, "learning_rate": 2.300254452926209e-05, "loss": 0.1309, "step": 1745 }, { "epoch": 4.420785804816223, "grad_norm": 0.5951780080795288, "learning_rate": 2.2900763358778628e-05, "loss": 0.1366, "step": 1746 }, { "epoch": 4.423320659062104, "grad_norm": 0.7225191593170166, "learning_rate": 2.2798982188295167e-05, "loss": 0.1825, "step": 1747 }, { "epoch": 4.425855513307985, "grad_norm": 0.6427996158599854, "learning_rate": 2.2697201017811707e-05, "loss": 0.1326, "step": 1748 }, { "epoch": 4.428390367553866, "grad_norm": 0.49267786741256714, "learning_rate": 2.2595419847328246e-05, "loss": 0.1367, "step": 1749 }, { "epoch": 4.430925221799747, "grad_norm": 0.5365452766418457, "learning_rate": 2.2493638676844785e-05, "loss": 0.1456, "step": 1750 }, { "epoch": 4.433460076045628, "grad_norm": 0.65265291929245, "learning_rate": 2.2391857506361324e-05, "loss": 0.1379, "step": 1751 }, { "epoch": 4.435994930291509, "grad_norm": 0.5401502847671509, "learning_rate": 2.2290076335877864e-05, "loss": 0.1293, "step": 1752 }, { "epoch": 4.438529784537389, "grad_norm": 0.6832171678543091, "learning_rate": 2.2188295165394403e-05, "loss": 0.1448, "step": 1753 }, { "epoch": 4.44106463878327, "grad_norm": 0.8080681562423706, "learning_rate": 2.2086513994910942e-05, "loss": 0.1832, "step": 1754 }, { "epoch": 4.443599493029151, "grad_norm": 0.6201688051223755, "learning_rate": 2.198473282442748e-05, "loss": 0.159, "step": 1755 }, { "epoch": 4.446134347275032, "grad_norm": 0.8549275994300842, "learning_rate": 2.1882951653944024e-05, "loss": 0.2103, "step": 1756 }, { "epoch": 4.448669201520913, "grad_norm": 0.5879942178726196, "learning_rate": 2.178117048346056e-05, "loss": 0.1524, "step": 1757 }, { "epoch": 4.451204055766794, "grad_norm": 0.6592312455177307, "learning_rate": 2.16793893129771e-05, "loss": 0.1535, "step": 1758 }, { "epoch": 4.4537389100126745, "grad_norm": 0.6493979096412659, "learning_rate": 2.1577608142493642e-05, "loss": 0.1451, "step": 1759 }, { "epoch": 4.4562737642585555, "grad_norm": 0.7973134517669678, "learning_rate": 2.1475826972010178e-05, "loss": 0.1519, "step": 1760 }, { "epoch": 4.458808618504436, "grad_norm": 0.7703438401222229, "learning_rate": 2.1374045801526718e-05, "loss": 0.1653, "step": 1761 }, { "epoch": 4.4613434727503165, "grad_norm": 1.0013222694396973, "learning_rate": 2.127226463104326e-05, "loss": 0.2064, "step": 1762 }, { "epoch": 4.4638783269961975, "grad_norm": 0.7007017135620117, "learning_rate": 2.11704834605598e-05, "loss": 0.1401, "step": 1763 }, { "epoch": 4.466413181242078, "grad_norm": 0.5366234183311462, "learning_rate": 2.1068702290076335e-05, "loss": 0.1389, "step": 1764 }, { "epoch": 4.468948035487959, "grad_norm": 0.7167120575904846, "learning_rate": 2.0966921119592875e-05, "loss": 0.1817, "step": 1765 }, { "epoch": 4.47148288973384, "grad_norm": 0.7901313900947571, "learning_rate": 2.0865139949109417e-05, "loss": 0.1817, "step": 1766 }, { "epoch": 4.474017743979721, "grad_norm": 0.6681633591651917, "learning_rate": 2.0763358778625953e-05, "loss": 0.1458, "step": 1767 }, { "epoch": 4.476552598225602, "grad_norm": 0.5067597031593323, "learning_rate": 2.0661577608142493e-05, "loss": 0.1301, "step": 1768 }, { "epoch": 4.479087452471483, "grad_norm": 0.6582893133163452, "learning_rate": 2.0559796437659035e-05, "loss": 0.1576, "step": 1769 }, { "epoch": 4.481622306717364, "grad_norm": 0.6628451943397522, "learning_rate": 2.0458015267175575e-05, "loss": 0.168, "step": 1770 }, { "epoch": 4.484157160963244, "grad_norm": 0.5435721278190613, "learning_rate": 2.035623409669211e-05, "loss": 0.1476, "step": 1771 }, { "epoch": 4.486692015209125, "grad_norm": 0.6182110905647278, "learning_rate": 2.0254452926208653e-05, "loss": 0.1441, "step": 1772 }, { "epoch": 4.489226869455006, "grad_norm": 0.9246516823768616, "learning_rate": 2.0152671755725193e-05, "loss": 0.1747, "step": 1773 }, { "epoch": 4.491761723700887, "grad_norm": 0.5967719554901123, "learning_rate": 2.005089058524173e-05, "loss": 0.1461, "step": 1774 }, { "epoch": 4.494296577946768, "grad_norm": 0.5998682379722595, "learning_rate": 1.994910941475827e-05, "loss": 0.1276, "step": 1775 }, { "epoch": 4.496831432192649, "grad_norm": 0.6168457865715027, "learning_rate": 1.984732824427481e-05, "loss": 0.1407, "step": 1776 }, { "epoch": 4.49936628643853, "grad_norm": 0.6580602526664734, "learning_rate": 1.974554707379135e-05, "loss": 0.149, "step": 1777 }, { "epoch": 4.501901140684411, "grad_norm": 0.5117031335830688, "learning_rate": 1.964376590330789e-05, "loss": 0.1397, "step": 1778 }, { "epoch": 4.504435994930292, "grad_norm": 0.4603317975997925, "learning_rate": 1.954198473282443e-05, "loss": 0.1211, "step": 1779 }, { "epoch": 4.506970849176172, "grad_norm": 0.5981631278991699, "learning_rate": 1.9440203562340968e-05, "loss": 0.1371, "step": 1780 }, { "epoch": 4.509505703422053, "grad_norm": 0.6693590879440308, "learning_rate": 1.9338422391857507e-05, "loss": 0.1495, "step": 1781 }, { "epoch": 4.512040557667934, "grad_norm": 0.5286784172058105, "learning_rate": 1.9236641221374046e-05, "loss": 0.1304, "step": 1782 }, { "epoch": 4.514575411913815, "grad_norm": 0.7040352821350098, "learning_rate": 1.9134860050890586e-05, "loss": 0.1584, "step": 1783 }, { "epoch": 4.517110266159696, "grad_norm": 0.6396339535713196, "learning_rate": 1.9033078880407125e-05, "loss": 0.1529, "step": 1784 }, { "epoch": 4.519645120405577, "grad_norm": 0.6708245873451233, "learning_rate": 1.8931297709923664e-05, "loss": 0.1477, "step": 1785 }, { "epoch": 4.522179974651458, "grad_norm": 0.6562108993530273, "learning_rate": 1.8829516539440204e-05, "loss": 0.1499, "step": 1786 }, { "epoch": 4.524714828897339, "grad_norm": 0.5181876420974731, "learning_rate": 1.8727735368956746e-05, "loss": 0.1398, "step": 1787 }, { "epoch": 4.52724968314322, "grad_norm": 0.5952017307281494, "learning_rate": 1.8625954198473282e-05, "loss": 0.1438, "step": 1788 }, { "epoch": 4.5297845373891, "grad_norm": 0.6668636202812195, "learning_rate": 1.852417302798982e-05, "loss": 0.1805, "step": 1789 }, { "epoch": 4.532319391634981, "grad_norm": 0.5433321595191956, "learning_rate": 1.8422391857506364e-05, "loss": 0.1397, "step": 1790 }, { "epoch": 4.534854245880862, "grad_norm": 0.5353025197982788, "learning_rate": 1.83206106870229e-05, "loss": 0.1419, "step": 1791 }, { "epoch": 4.537389100126743, "grad_norm": 0.6123271584510803, "learning_rate": 1.821882951653944e-05, "loss": 0.1493, "step": 1792 }, { "epoch": 4.5399239543726235, "grad_norm": 0.6581493616104126, "learning_rate": 1.8117048346055982e-05, "loss": 0.1467, "step": 1793 }, { "epoch": 4.5424588086185045, "grad_norm": 0.5537798404693604, "learning_rate": 1.801526717557252e-05, "loss": 0.1467, "step": 1794 }, { "epoch": 4.544993662864385, "grad_norm": 0.7163582444190979, "learning_rate": 1.7913486005089058e-05, "loss": 0.1736, "step": 1795 }, { "epoch": 4.547528517110266, "grad_norm": 0.694922149181366, "learning_rate": 1.78117048346056e-05, "loss": 0.1516, "step": 1796 }, { "epoch": 4.550063371356147, "grad_norm": 0.7119778394699097, "learning_rate": 1.770992366412214e-05, "loss": 0.1899, "step": 1797 }, { "epoch": 4.552598225602027, "grad_norm": 0.7570186853408813, "learning_rate": 1.7608142493638675e-05, "loss": 0.1951, "step": 1798 }, { "epoch": 4.555133079847908, "grad_norm": 0.6789132356643677, "learning_rate": 1.7506361323155218e-05, "loss": 0.1475, "step": 1799 }, { "epoch": 4.557667934093789, "grad_norm": 0.5750378966331482, "learning_rate": 1.7404580152671757e-05, "loss": 0.1431, "step": 1800 }, { "epoch": 4.56020278833967, "grad_norm": 0.6066502332687378, "learning_rate": 1.7302798982188297e-05, "loss": 0.16, "step": 1801 }, { "epoch": 4.562737642585551, "grad_norm": 0.5730226039886475, "learning_rate": 1.7201017811704836e-05, "loss": 0.1455, "step": 1802 }, { "epoch": 4.565272496831432, "grad_norm": 0.5752687454223633, "learning_rate": 1.7099236641221375e-05, "loss": 0.1281, "step": 1803 }, { "epoch": 4.567807351077313, "grad_norm": 0.5497205853462219, "learning_rate": 1.6997455470737915e-05, "loss": 0.1431, "step": 1804 }, { "epoch": 4.570342205323194, "grad_norm": 0.7738269567489624, "learning_rate": 1.6895674300254454e-05, "loss": 0.1523, "step": 1805 }, { "epoch": 4.572877059569075, "grad_norm": 0.5750918388366699, "learning_rate": 1.6793893129770993e-05, "loss": 0.1466, "step": 1806 }, { "epoch": 4.575411913814955, "grad_norm": 0.5575040578842163, "learning_rate": 1.6692111959287533e-05, "loss": 0.1267, "step": 1807 }, { "epoch": 4.577946768060836, "grad_norm": 0.509616494178772, "learning_rate": 1.6590330788804072e-05, "loss": 0.1434, "step": 1808 }, { "epoch": 4.580481622306717, "grad_norm": 0.643009603023529, "learning_rate": 1.648854961832061e-05, "loss": 0.136, "step": 1809 }, { "epoch": 4.583016476552598, "grad_norm": 0.5133553743362427, "learning_rate": 1.638676844783715e-05, "loss": 0.1223, "step": 1810 }, { "epoch": 4.585551330798479, "grad_norm": 0.7505659461021423, "learning_rate": 1.628498727735369e-05, "loss": 0.1607, "step": 1811 }, { "epoch": 4.58808618504436, "grad_norm": 0.6981300711631775, "learning_rate": 1.618320610687023e-05, "loss": 0.1525, "step": 1812 }, { "epoch": 4.590621039290241, "grad_norm": 0.4981435537338257, "learning_rate": 1.608142493638677e-05, "loss": 0.1236, "step": 1813 }, { "epoch": 4.593155893536122, "grad_norm": 0.6467440724372864, "learning_rate": 1.597964376590331e-05, "loss": 0.153, "step": 1814 }, { "epoch": 4.595690747782003, "grad_norm": 0.6843181848526001, "learning_rate": 1.5877862595419847e-05, "loss": 0.1604, "step": 1815 }, { "epoch": 4.598225602027884, "grad_norm": 0.49898776412010193, "learning_rate": 1.5776081424936386e-05, "loss": 0.1165, "step": 1816 }, { "epoch": 4.600760456273765, "grad_norm": 0.6252351403236389, "learning_rate": 1.567430025445293e-05, "loss": 0.1228, "step": 1817 }, { "epoch": 4.603295310519645, "grad_norm": 0.5452350974082947, "learning_rate": 1.557251908396947e-05, "loss": 0.1245, "step": 1818 }, { "epoch": 4.605830164765526, "grad_norm": 0.6847854852676392, "learning_rate": 1.5470737913486004e-05, "loss": 0.1462, "step": 1819 }, { "epoch": 4.608365019011407, "grad_norm": 0.49941131472587585, "learning_rate": 1.5368956743002547e-05, "loss": 0.1268, "step": 1820 }, { "epoch": 4.610899873257288, "grad_norm": 0.581243097782135, "learning_rate": 1.5267175572519086e-05, "loss": 0.1296, "step": 1821 }, { "epoch": 4.613434727503169, "grad_norm": 0.8345553874969482, "learning_rate": 1.5165394402035624e-05, "loss": 0.1307, "step": 1822 }, { "epoch": 4.61596958174905, "grad_norm": 0.6534408926963806, "learning_rate": 1.5063613231552162e-05, "loss": 0.1446, "step": 1823 }, { "epoch": 4.6185044359949305, "grad_norm": 0.7743064165115356, "learning_rate": 1.4961832061068704e-05, "loss": 0.2027, "step": 1824 }, { "epoch": 4.6210392902408115, "grad_norm": 0.6709569096565247, "learning_rate": 1.4860050890585242e-05, "loss": 0.1427, "step": 1825 }, { "epoch": 4.6235741444866925, "grad_norm": 0.6598264575004578, "learning_rate": 1.4758269720101781e-05, "loss": 0.1399, "step": 1826 }, { "epoch": 4.6261089987325725, "grad_norm": 0.49041053652763367, "learning_rate": 1.4656488549618322e-05, "loss": 0.133, "step": 1827 }, { "epoch": 4.6286438529784535, "grad_norm": 0.6697686910629272, "learning_rate": 1.455470737913486e-05, "loss": 0.1735, "step": 1828 }, { "epoch": 4.6311787072243344, "grad_norm": 0.5481597781181335, "learning_rate": 1.44529262086514e-05, "loss": 0.1244, "step": 1829 }, { "epoch": 4.633713561470215, "grad_norm": 0.6251161694526672, "learning_rate": 1.435114503816794e-05, "loss": 0.1436, "step": 1830 }, { "epoch": 4.636248415716096, "grad_norm": 0.7515272498130798, "learning_rate": 1.424936386768448e-05, "loss": 0.1493, "step": 1831 }, { "epoch": 4.638783269961977, "grad_norm": 0.8478451371192932, "learning_rate": 1.4147582697201017e-05, "loss": 0.1519, "step": 1832 }, { "epoch": 4.641318124207858, "grad_norm": 0.5417062640190125, "learning_rate": 1.4045801526717558e-05, "loss": 0.1318, "step": 1833 }, { "epoch": 4.643852978453739, "grad_norm": 0.6493893265724182, "learning_rate": 1.3944020356234097e-05, "loss": 0.1546, "step": 1834 }, { "epoch": 4.64638783269962, "grad_norm": 0.8475616574287415, "learning_rate": 1.3842239185750635e-05, "loss": 0.172, "step": 1835 }, { "epoch": 4.6489226869455, "grad_norm": 0.5484082698822021, "learning_rate": 1.3740458015267178e-05, "loss": 0.1203, "step": 1836 }, { "epoch": 4.651457541191381, "grad_norm": 0.6533843874931335, "learning_rate": 1.3638676844783715e-05, "loss": 0.1501, "step": 1837 }, { "epoch": 4.653992395437262, "grad_norm": 0.7521854043006897, "learning_rate": 1.3536895674300255e-05, "loss": 0.1955, "step": 1838 }, { "epoch": 4.656527249683143, "grad_norm": 0.6500900983810425, "learning_rate": 1.3435114503816796e-05, "loss": 0.14, "step": 1839 }, { "epoch": 4.659062103929024, "grad_norm": 0.7133599519729614, "learning_rate": 1.3333333333333333e-05, "loss": 0.1707, "step": 1840 }, { "epoch": 4.661596958174905, "grad_norm": 0.7065775394439697, "learning_rate": 1.3231552162849873e-05, "loss": 0.144, "step": 1841 }, { "epoch": 4.664131812420786, "grad_norm": 0.7716514468193054, "learning_rate": 1.3129770992366414e-05, "loss": 0.1792, "step": 1842 }, { "epoch": 4.666666666666667, "grad_norm": 0.9312828779220581, "learning_rate": 1.3027989821882953e-05, "loss": 0.2139, "step": 1843 }, { "epoch": 4.669201520912548, "grad_norm": 0.5163487792015076, "learning_rate": 1.292620865139949e-05, "loss": 0.139, "step": 1844 }, { "epoch": 4.671736375158428, "grad_norm": 0.7424818277359009, "learning_rate": 1.2824427480916032e-05, "loss": 0.1533, "step": 1845 }, { "epoch": 4.674271229404309, "grad_norm": 0.5935065150260925, "learning_rate": 1.2722646310432571e-05, "loss": 0.1319, "step": 1846 }, { "epoch": 4.67680608365019, "grad_norm": 0.7372322678565979, "learning_rate": 1.2620865139949108e-05, "loss": 0.1832, "step": 1847 }, { "epoch": 4.679340937896071, "grad_norm": 0.5936238765716553, "learning_rate": 1.251908396946565e-05, "loss": 0.1357, "step": 1848 }, { "epoch": 4.681875792141952, "grad_norm": 0.6689032316207886, "learning_rate": 1.2417302798982189e-05, "loss": 0.1709, "step": 1849 }, { "epoch": 4.684410646387833, "grad_norm": 0.6519850492477417, "learning_rate": 1.2315521628498728e-05, "loss": 0.1438, "step": 1850 }, { "epoch": 4.686945500633714, "grad_norm": 0.5853939056396484, "learning_rate": 1.2213740458015267e-05, "loss": 0.134, "step": 1851 }, { "epoch": 4.689480354879595, "grad_norm": 0.5059859752655029, "learning_rate": 1.2111959287531807e-05, "loss": 0.1088, "step": 1852 }, { "epoch": 4.692015209125476, "grad_norm": 0.6989784240722656, "learning_rate": 1.2010178117048348e-05, "loss": 0.1527, "step": 1853 }, { "epoch": 4.694550063371356, "grad_norm": 0.5851006507873535, "learning_rate": 1.1908396946564885e-05, "loss": 0.143, "step": 1854 }, { "epoch": 4.697084917617237, "grad_norm": 0.5606602430343628, "learning_rate": 1.1806615776081426e-05, "loss": 0.1288, "step": 1855 }, { "epoch": 4.699619771863118, "grad_norm": 0.6175526976585388, "learning_rate": 1.1704834605597966e-05, "loss": 0.1564, "step": 1856 }, { "epoch": 4.702154626108999, "grad_norm": 0.5776654481887817, "learning_rate": 1.1603053435114503e-05, "loss": 0.1323, "step": 1857 }, { "epoch": 4.7046894803548795, "grad_norm": 0.5664159059524536, "learning_rate": 1.1501272264631044e-05, "loss": 0.1371, "step": 1858 }, { "epoch": 4.7072243346007605, "grad_norm": 0.7187889218330383, "learning_rate": 1.1399491094147584e-05, "loss": 0.1476, "step": 1859 }, { "epoch": 4.7097591888466415, "grad_norm": 0.5795005559921265, "learning_rate": 1.1297709923664123e-05, "loss": 0.1373, "step": 1860 }, { "epoch": 4.712294043092522, "grad_norm": 0.5491251945495605, "learning_rate": 1.1195928753180662e-05, "loss": 0.1192, "step": 1861 }, { "epoch": 4.714828897338403, "grad_norm": 0.4715762734413147, "learning_rate": 1.1094147582697202e-05, "loss": 0.1106, "step": 1862 }, { "epoch": 4.7173637515842834, "grad_norm": 0.6300286054611206, "learning_rate": 1.099236641221374e-05, "loss": 0.138, "step": 1863 }, { "epoch": 4.719898605830164, "grad_norm": 0.7265313267707825, "learning_rate": 1.089058524173028e-05, "loss": 0.2246, "step": 1864 }, { "epoch": 4.722433460076045, "grad_norm": 0.7080928087234497, "learning_rate": 1.0788804071246821e-05, "loss": 0.1335, "step": 1865 }, { "epoch": 4.724968314321926, "grad_norm": 0.605714738368988, "learning_rate": 1.0687022900763359e-05, "loss": 0.1412, "step": 1866 }, { "epoch": 4.727503168567807, "grad_norm": 0.6648192405700684, "learning_rate": 1.05852417302799e-05, "loss": 0.1648, "step": 1867 }, { "epoch": 4.730038022813688, "grad_norm": 0.6057281494140625, "learning_rate": 1.0483460559796437e-05, "loss": 0.1266, "step": 1868 }, { "epoch": 4.732572877059569, "grad_norm": 0.6135514974594116, "learning_rate": 1.0381679389312977e-05, "loss": 0.1457, "step": 1869 }, { "epoch": 4.73510773130545, "grad_norm": 0.6599459052085876, "learning_rate": 1.0279898218829518e-05, "loss": 0.1558, "step": 1870 }, { "epoch": 4.737642585551331, "grad_norm": 0.5975873470306396, "learning_rate": 1.0178117048346055e-05, "loss": 0.134, "step": 1871 }, { "epoch": 4.740177439797211, "grad_norm": 0.6581792235374451, "learning_rate": 1.0076335877862596e-05, "loss": 0.1463, "step": 1872 }, { "epoch": 4.742712294043092, "grad_norm": 0.5627064108848572, "learning_rate": 9.974554707379136e-06, "loss": 0.1238, "step": 1873 }, { "epoch": 4.745247148288973, "grad_norm": 0.6461361050605774, "learning_rate": 9.872773536895675e-06, "loss": 0.1621, "step": 1874 }, { "epoch": 4.747782002534854, "grad_norm": 0.5615333914756775, "learning_rate": 9.770992366412214e-06, "loss": 0.1387, "step": 1875 }, { "epoch": 4.750316856780735, "grad_norm": 0.6830117702484131, "learning_rate": 9.669211195928754e-06, "loss": 0.1397, "step": 1876 }, { "epoch": 4.752851711026616, "grad_norm": 0.731072187423706, "learning_rate": 9.567430025445293e-06, "loss": 0.1508, "step": 1877 }, { "epoch": 4.755386565272497, "grad_norm": 0.7469286918640137, "learning_rate": 9.465648854961832e-06, "loss": 0.1944, "step": 1878 }, { "epoch": 4.757921419518378, "grad_norm": 0.700532078742981, "learning_rate": 9.363867684478373e-06, "loss": 0.1697, "step": 1879 }, { "epoch": 4.760456273764259, "grad_norm": 0.7140323519706726, "learning_rate": 9.26208651399491e-06, "loss": 0.1597, "step": 1880 }, { "epoch": 4.76299112801014, "grad_norm": 0.6711133718490601, "learning_rate": 9.16030534351145e-06, "loss": 0.1731, "step": 1881 }, { "epoch": 4.765525982256021, "grad_norm": 0.43002957105636597, "learning_rate": 9.058524173027991e-06, "loss": 0.1181, "step": 1882 }, { "epoch": 4.768060836501901, "grad_norm": 0.669159471988678, "learning_rate": 8.956743002544529e-06, "loss": 0.1578, "step": 1883 }, { "epoch": 4.770595690747782, "grad_norm": 0.5030307769775391, "learning_rate": 8.85496183206107e-06, "loss": 0.1213, "step": 1884 }, { "epoch": 4.773130544993663, "grad_norm": 0.7841615080833435, "learning_rate": 8.753180661577609e-06, "loss": 0.1619, "step": 1885 }, { "epoch": 4.775665399239544, "grad_norm": 0.5570418834686279, "learning_rate": 8.651399491094148e-06, "loss": 0.1308, "step": 1886 }, { "epoch": 4.778200253485425, "grad_norm": 0.6690031886100769, "learning_rate": 8.549618320610688e-06, "loss": 0.1413, "step": 1887 }, { "epoch": 4.780735107731306, "grad_norm": 0.524140477180481, "learning_rate": 8.447837150127227e-06, "loss": 0.1354, "step": 1888 }, { "epoch": 4.783269961977187, "grad_norm": 0.5612379908561707, "learning_rate": 8.346055979643766e-06, "loss": 0.1375, "step": 1889 }, { "epoch": 4.7858048162230675, "grad_norm": 0.851925790309906, "learning_rate": 8.244274809160306e-06, "loss": 0.1783, "step": 1890 }, { "epoch": 4.7883396704689485, "grad_norm": 0.8507834672927856, "learning_rate": 8.142493638676845e-06, "loss": 0.1743, "step": 1891 }, { "epoch": 4.7908745247148286, "grad_norm": 0.8136033415794373, "learning_rate": 8.040712468193384e-06, "loss": 0.1381, "step": 1892 }, { "epoch": 4.7934093789607095, "grad_norm": 0.7247329354286194, "learning_rate": 7.938931297709924e-06, "loss": 0.1793, "step": 1893 }, { "epoch": 4.7959442332065905, "grad_norm": 0.5494823455810547, "learning_rate": 7.837150127226465e-06, "loss": 0.1231, "step": 1894 }, { "epoch": 4.798479087452471, "grad_norm": 0.6107218861579895, "learning_rate": 7.735368956743002e-06, "loss": 0.1358, "step": 1895 }, { "epoch": 4.801013941698352, "grad_norm": 0.6297575235366821, "learning_rate": 7.633587786259543e-06, "loss": 0.1699, "step": 1896 }, { "epoch": 4.803548795944233, "grad_norm": 0.8669266700744629, "learning_rate": 7.531806615776081e-06, "loss": 0.1982, "step": 1897 }, { "epoch": 4.806083650190114, "grad_norm": 0.583975076675415, "learning_rate": 7.430025445292621e-06, "loss": 0.1517, "step": 1898 }, { "epoch": 4.808618504435995, "grad_norm": 0.6059403419494629, "learning_rate": 7.328244274809161e-06, "loss": 0.138, "step": 1899 }, { "epoch": 4.811153358681876, "grad_norm": 1.0802148580551147, "learning_rate": 7.2264631043257e-06, "loss": 0.1677, "step": 1900 }, { "epoch": 4.813688212927756, "grad_norm": 0.5637528300285339, "learning_rate": 7.12468193384224e-06, "loss": 0.1517, "step": 1901 }, { "epoch": 4.816223067173637, "grad_norm": 0.6925719976425171, "learning_rate": 7.022900763358779e-06, "loss": 0.1636, "step": 1902 }, { "epoch": 4.818757921419518, "grad_norm": 0.6529707908630371, "learning_rate": 6.9211195928753175e-06, "loss": 0.1587, "step": 1903 }, { "epoch": 4.821292775665399, "grad_norm": 1.1477290391921997, "learning_rate": 6.819338422391858e-06, "loss": 0.1655, "step": 1904 }, { "epoch": 4.82382762991128, "grad_norm": 0.7867985367774963, "learning_rate": 6.717557251908398e-06, "loss": 0.1955, "step": 1905 }, { "epoch": 4.826362484157161, "grad_norm": 0.617871105670929, "learning_rate": 6.615776081424936e-06, "loss": 0.1554, "step": 1906 }, { "epoch": 4.828897338403042, "grad_norm": 0.5985192656517029, "learning_rate": 6.5139949109414765e-06, "loss": 0.1484, "step": 1907 }, { "epoch": 4.831432192648923, "grad_norm": 0.6069400310516357, "learning_rate": 6.412213740458016e-06, "loss": 0.1326, "step": 1908 }, { "epoch": 4.833967046894804, "grad_norm": 0.9009010195732117, "learning_rate": 6.310432569974554e-06, "loss": 0.1999, "step": 1909 }, { "epoch": 4.836501901140684, "grad_norm": 0.5913792848587036, "learning_rate": 6.208651399491094e-06, "loss": 0.1381, "step": 1910 }, { "epoch": 4.839036755386565, "grad_norm": 0.5730859637260437, "learning_rate": 6.106870229007634e-06, "loss": 0.1346, "step": 1911 }, { "epoch": 4.841571609632446, "grad_norm": 0.6579172611236572, "learning_rate": 6.005089058524174e-06, "loss": 0.1572, "step": 1912 }, { "epoch": 4.844106463878327, "grad_norm": 0.5854265093803406, "learning_rate": 5.903307888040713e-06, "loss": 0.1359, "step": 1913 }, { "epoch": 4.846641318124208, "grad_norm": 0.7668277025222778, "learning_rate": 5.801526717557252e-06, "loss": 0.1728, "step": 1914 }, { "epoch": 4.849176172370089, "grad_norm": 0.8092861175537109, "learning_rate": 5.699745547073792e-06, "loss": 0.1741, "step": 1915 }, { "epoch": 4.85171102661597, "grad_norm": 0.6868001818656921, "learning_rate": 5.597964376590331e-06, "loss": 0.1604, "step": 1916 }, { "epoch": 4.854245880861851, "grad_norm": 0.6506228446960449, "learning_rate": 5.49618320610687e-06, "loss": 0.1459, "step": 1917 }, { "epoch": 4.856780735107732, "grad_norm": 0.6033440232276917, "learning_rate": 5.394402035623411e-06, "loss": 0.1435, "step": 1918 }, { "epoch": 4.859315589353612, "grad_norm": 0.7446348071098328, "learning_rate": 5.29262086513995e-06, "loss": 0.165, "step": 1919 }, { "epoch": 4.861850443599493, "grad_norm": 0.5380656123161316, "learning_rate": 5.190839694656488e-06, "loss": 0.1504, "step": 1920 }, { "epoch": 4.864385297845374, "grad_norm": 0.6752755641937256, "learning_rate": 5.089058524173028e-06, "loss": 0.1616, "step": 1921 }, { "epoch": 4.866920152091255, "grad_norm": 0.6897322535514832, "learning_rate": 4.987277353689568e-06, "loss": 0.1409, "step": 1922 }, { "epoch": 4.869455006337136, "grad_norm": 0.5405673980712891, "learning_rate": 4.885496183206107e-06, "loss": 0.1215, "step": 1923 }, { "epoch": 4.8719898605830165, "grad_norm": 0.6921371221542358, "learning_rate": 4.7837150127226464e-06, "loss": 0.1554, "step": 1924 }, { "epoch": 4.8745247148288975, "grad_norm": 0.6672477722167969, "learning_rate": 4.681933842239187e-06, "loss": 0.1685, "step": 1925 }, { "epoch": 4.8770595690747784, "grad_norm": 0.5887411236763, "learning_rate": 4.580152671755725e-06, "loss": 0.1495, "step": 1926 }, { "epoch": 4.879594423320659, "grad_norm": 0.8119281530380249, "learning_rate": 4.478371501272264e-06, "loss": 0.1778, "step": 1927 }, { "epoch": 4.8821292775665395, "grad_norm": 0.6423155665397644, "learning_rate": 4.3765903307888045e-06, "loss": 0.1532, "step": 1928 }, { "epoch": 4.88466413181242, "grad_norm": 0.576859712600708, "learning_rate": 4.274809160305344e-06, "loss": 0.1474, "step": 1929 }, { "epoch": 4.887198986058301, "grad_norm": 0.668792188167572, "learning_rate": 4.173027989821883e-06, "loss": 0.1583, "step": 1930 }, { "epoch": 4.889733840304182, "grad_norm": 0.727428138256073, "learning_rate": 4.0712468193384225e-06, "loss": 0.1759, "step": 1931 }, { "epoch": 4.892268694550063, "grad_norm": 0.7260742783546448, "learning_rate": 3.969465648854962e-06, "loss": 0.1665, "step": 1932 }, { "epoch": 4.894803548795944, "grad_norm": 0.6192269921302795, "learning_rate": 3.867684478371501e-06, "loss": 0.1377, "step": 1933 }, { "epoch": 4.897338403041825, "grad_norm": 0.7672135233879089, "learning_rate": 3.7659033078880404e-06, "loss": 0.1696, "step": 1934 }, { "epoch": 4.899873257287706, "grad_norm": 0.5162369012832642, "learning_rate": 3.6641221374045806e-06, "loss": 0.1384, "step": 1935 }, { "epoch": 4.902408111533587, "grad_norm": 0.6594913601875305, "learning_rate": 3.56234096692112e-06, "loss": 0.1714, "step": 1936 }, { "epoch": 4.904942965779467, "grad_norm": 0.7748851776123047, "learning_rate": 3.4605597964376588e-06, "loss": 0.2014, "step": 1937 }, { "epoch": 4.907477820025348, "grad_norm": 0.6400601267814636, "learning_rate": 3.358778625954199e-06, "loss": 0.1522, "step": 1938 }, { "epoch": 4.910012674271229, "grad_norm": 0.5443174839019775, "learning_rate": 3.2569974554707382e-06, "loss": 0.1276, "step": 1939 }, { "epoch": 4.91254752851711, "grad_norm": 0.6544225811958313, "learning_rate": 3.155216284987277e-06, "loss": 0.1441, "step": 1940 }, { "epoch": 4.915082382762991, "grad_norm": 0.6579450368881226, "learning_rate": 3.053435114503817e-06, "loss": 0.1688, "step": 1941 }, { "epoch": 4.917617237008872, "grad_norm": 0.594393253326416, "learning_rate": 2.9516539440203566e-06, "loss": 0.1586, "step": 1942 }, { "epoch": 4.920152091254753, "grad_norm": 0.6417977213859558, "learning_rate": 2.849872773536896e-06, "loss": 0.1389, "step": 1943 }, { "epoch": 4.922686945500634, "grad_norm": 0.5247513055801392, "learning_rate": 2.748091603053435e-06, "loss": 0.1282, "step": 1944 }, { "epoch": 4.925221799746515, "grad_norm": 0.6372106075286865, "learning_rate": 2.646310432569975e-06, "loss": 0.1391, "step": 1945 }, { "epoch": 4.927756653992396, "grad_norm": 0.5967155694961548, "learning_rate": 2.544529262086514e-06, "loss": 0.1358, "step": 1946 }, { "epoch": 4.930291508238277, "grad_norm": 0.6050627827644348, "learning_rate": 2.4427480916030536e-06, "loss": 0.1449, "step": 1947 }, { "epoch": 4.932826362484157, "grad_norm": 0.7595526576042175, "learning_rate": 2.3409669211195933e-06, "loss": 0.1838, "step": 1948 }, { "epoch": 4.935361216730038, "grad_norm": 0.7220463156700134, "learning_rate": 2.239185750636132e-06, "loss": 0.1695, "step": 1949 }, { "epoch": 4.937896070975919, "grad_norm": 0.4891555905342102, "learning_rate": 2.137404580152672e-06, "loss": 0.1394, "step": 1950 }, { "epoch": 4.9404309252218, "grad_norm": 0.5262938141822815, "learning_rate": 2.0356234096692112e-06, "loss": 0.1452, "step": 1951 }, { "epoch": 4.942965779467681, "grad_norm": 0.7193884253501892, "learning_rate": 1.9338422391857505e-06, "loss": 0.176, "step": 1952 }, { "epoch": 4.945500633713562, "grad_norm": 0.7117200493812561, "learning_rate": 1.8320610687022903e-06, "loss": 0.1697, "step": 1953 }, { "epoch": 4.948035487959443, "grad_norm": 0.7884610891342163, "learning_rate": 1.7302798982188294e-06, "loss": 0.1864, "step": 1954 }, { "epoch": 4.9505703422053235, "grad_norm": 0.8606098890304565, "learning_rate": 1.6284987277353691e-06, "loss": 0.1568, "step": 1955 }, { "epoch": 4.9531051964512045, "grad_norm": 0.5030885338783264, "learning_rate": 1.5267175572519084e-06, "loss": 0.1306, "step": 1956 }, { "epoch": 4.955640050697085, "grad_norm": 0.5155559182167053, "learning_rate": 1.424936386768448e-06, "loss": 0.1311, "step": 1957 }, { "epoch": 4.9581749049429655, "grad_norm": 0.4945980906486511, "learning_rate": 1.3231552162849875e-06, "loss": 0.1212, "step": 1958 }, { "epoch": 4.9607097591888465, "grad_norm": 0.79302978515625, "learning_rate": 1.2213740458015268e-06, "loss": 0.1763, "step": 1959 }, { "epoch": 4.9632446134347274, "grad_norm": 0.6397921442985535, "learning_rate": 1.119592875318066e-06, "loss": 0.1416, "step": 1960 }, { "epoch": 4.965779467680608, "grad_norm": 0.6680799722671509, "learning_rate": 1.0178117048346056e-06, "loss": 0.1519, "step": 1961 }, { "epoch": 4.968314321926489, "grad_norm": 0.5919336080551147, "learning_rate": 9.160305343511451e-07, "loss": 0.16, "step": 1962 }, { "epoch": 4.97084917617237, "grad_norm": 0.5929127335548401, "learning_rate": 8.142493638676846e-07, "loss": 0.143, "step": 1963 }, { "epoch": 4.973384030418251, "grad_norm": 0.5678686499595642, "learning_rate": 7.12468193384224e-07, "loss": 0.1236, "step": 1964 }, { "epoch": 4.975918884664132, "grad_norm": 0.5478057861328125, "learning_rate": 6.106870229007634e-07, "loss": 0.1407, "step": 1965 }, { "epoch": 4.978453738910012, "grad_norm": 0.6003939509391785, "learning_rate": 5.089058524173028e-07, "loss": 0.1315, "step": 1966 }, { "epoch": 4.980988593155893, "grad_norm": 0.5943416357040405, "learning_rate": 4.071246819338423e-07, "loss": 0.1451, "step": 1967 }, { "epoch": 4.983523447401774, "grad_norm": 0.5419045090675354, "learning_rate": 3.053435114503817e-07, "loss": 0.1338, "step": 1968 }, { "epoch": 4.986058301647655, "grad_norm": 0.5665134787559509, "learning_rate": 2.0356234096692114e-07, "loss": 0.1347, "step": 1969 }, { "epoch": 4.988593155893536, "grad_norm": 0.5646002292633057, "learning_rate": 1.0178117048346057e-07, "loss": 0.1352, "step": 1970 } ], "logging_steps": 1, "max_steps": 1970, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5558390987853286e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }