Pascal-Base-Lora / trainer_state.json
Upload folder using huggingface_hub (65592cd, verified)
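The `log_history` array in the JSON below records one entry per optimizer step, each carrying the fractional epoch, gradient norm, learning rate, and training loss. A minimal sketch for inspecting it locally, assuming the file has been downloaded as trainer_state.json (matplotlib is only needed for the plot):

import json
import matplotlib.pyplot as plt

# Load the Trainer state and pull out the per-step training-loss curve.
with open("trainer_state.json") as f:
    state = json.load(f)

entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.show()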
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1004,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00099601593625498,
"grad_norm": 0.4463295638561249,
"learning_rate": 9.900990099009901e-08,
"loss": 2.6197,
"step": 1
},
{
"epoch": 0.00199203187250996,
"grad_norm": 0.8444207906723022,
"learning_rate": 1.9801980198019803e-07,
"loss": 2.7588,
"step": 2
},
{
"epoch": 0.00298804780876494,
"grad_norm": 0.6591606140136719,
"learning_rate": 2.9702970297029703e-07,
"loss": 2.9263,
"step": 3
},
{
"epoch": 0.00398406374501992,
"grad_norm": 0.5656299591064453,
"learning_rate": 3.9603960396039606e-07,
"loss": 2.4296,
"step": 4
},
{
"epoch": 0.0049800796812749,
"grad_norm": 0.5051721930503845,
"learning_rate": 4.950495049504951e-07,
"loss": 2.3939,
"step": 5
},
{
"epoch": 0.00597609561752988,
"grad_norm": 0.7762399911880493,
"learning_rate": 5.940594059405941e-07,
"loss": 2.6638,
"step": 6
},
{
"epoch": 0.0069721115537848604,
"grad_norm": 0.5301679968833923,
"learning_rate": 6.930693069306931e-07,
"loss": 2.2199,
"step": 7
},
{
"epoch": 0.00796812749003984,
"grad_norm": 0.6617525219917297,
"learning_rate": 7.920792079207921e-07,
"loss": 2.8019,
"step": 8
},
{
"epoch": 0.008964143426294821,
"grad_norm": 0.7944237589836121,
"learning_rate": 8.910891089108911e-07,
"loss": 2.7146,
"step": 9
},
{
"epoch": 0.0099601593625498,
"grad_norm": 0.6918312907218933,
"learning_rate": 9.900990099009902e-07,
"loss": 2.6128,
"step": 10
},
{
"epoch": 0.010956175298804782,
"grad_norm": 0.550072193145752,
"learning_rate": 1.0891089108910893e-06,
"loss": 2.5252,
"step": 11
},
{
"epoch": 0.01195219123505976,
"grad_norm": 0.4420550763607025,
"learning_rate": 1.1881188118811881e-06,
"loss": 2.6964,
"step": 12
},
{
"epoch": 0.012948207171314742,
"grad_norm": 0.4683515429496765,
"learning_rate": 1.2871287128712872e-06,
"loss": 2.6433,
"step": 13
},
{
"epoch": 0.013944223107569721,
"grad_norm": 0.5689812898635864,
"learning_rate": 1.3861386138613863e-06,
"loss": 2.3309,
"step": 14
},
{
"epoch": 0.014940239043824702,
"grad_norm": 0.5711223483085632,
"learning_rate": 1.4851485148514852e-06,
"loss": 2.4396,
"step": 15
},
{
"epoch": 0.01593625498007968,
"grad_norm": 0.4562544822692871,
"learning_rate": 1.5841584158415842e-06,
"loss": 2.234,
"step": 16
},
{
"epoch": 0.01693227091633466,
"grad_norm": 0.33882570266723633,
"learning_rate": 1.6831683168316833e-06,
"loss": 2.5468,
"step": 17
},
{
"epoch": 0.017928286852589643,
"grad_norm": 0.46446338295936584,
"learning_rate": 1.7821782178217822e-06,
"loss": 2.6143,
"step": 18
},
{
"epoch": 0.018924302788844622,
"grad_norm": 0.625619649887085,
"learning_rate": 1.8811881188118813e-06,
"loss": 2.5565,
"step": 19
},
{
"epoch": 0.0199203187250996,
"grad_norm": 0.5139931440353394,
"learning_rate": 1.9801980198019803e-06,
"loss": 2.8371,
"step": 20
},
{
"epoch": 0.02091633466135458,
"grad_norm": 0.45826011896133423,
"learning_rate": 2.0792079207920794e-06,
"loss": 2.5883,
"step": 21
},
{
"epoch": 0.021912350597609563,
"grad_norm": 0.5945838093757629,
"learning_rate": 2.1782178217821785e-06,
"loss": 2.6956,
"step": 22
},
{
"epoch": 0.022908366533864542,
"grad_norm": 0.6705940961837769,
"learning_rate": 2.2772277227722776e-06,
"loss": 3.1889,
"step": 23
},
{
"epoch": 0.02390438247011952,
"grad_norm": 0.489014208316803,
"learning_rate": 2.3762376237623762e-06,
"loss": 2.646,
"step": 24
},
{
"epoch": 0.0249003984063745,
"grad_norm": 0.5540168285369873,
"learning_rate": 2.4752475247524753e-06,
"loss": 2.6909,
"step": 25
},
{
"epoch": 0.025896414342629483,
"grad_norm": 0.34993091225624084,
"learning_rate": 2.5742574257425744e-06,
"loss": 2.5703,
"step": 26
},
{
"epoch": 0.026892430278884463,
"grad_norm": 0.6204649209976196,
"learning_rate": 2.6732673267326735e-06,
"loss": 2.6973,
"step": 27
},
{
"epoch": 0.027888446215139442,
"grad_norm": 0.6948006749153137,
"learning_rate": 2.7722772277227726e-06,
"loss": 2.8826,
"step": 28
},
{
"epoch": 0.02888446215139442,
"grad_norm": 0.4865665137767792,
"learning_rate": 2.8712871287128712e-06,
"loss": 2.6566,
"step": 29
},
{
"epoch": 0.029880478087649404,
"grad_norm": 0.7654755711555481,
"learning_rate": 2.9702970297029703e-06,
"loss": 2.9627,
"step": 30
},
{
"epoch": 0.030876494023904383,
"grad_norm": 0.636715829372406,
"learning_rate": 3.0693069306930694e-06,
"loss": 2.3846,
"step": 31
},
{
"epoch": 0.03187250996015936,
"grad_norm": 0.3698335289955139,
"learning_rate": 3.1683168316831685e-06,
"loss": 2.6724,
"step": 32
},
{
"epoch": 0.03286852589641434,
"grad_norm": 0.7592146396636963,
"learning_rate": 3.2673267326732676e-06,
"loss": 2.9239,
"step": 33
},
{
"epoch": 0.03386454183266932,
"grad_norm": 0.7194887399673462,
"learning_rate": 3.3663366336633666e-06,
"loss": 2.4727,
"step": 34
},
{
"epoch": 0.0348605577689243,
"grad_norm": 0.4150688350200653,
"learning_rate": 3.4653465346534653e-06,
"loss": 2.6407,
"step": 35
},
{
"epoch": 0.035856573705179286,
"grad_norm": 0.4450097382068634,
"learning_rate": 3.5643564356435644e-06,
"loss": 2.6709,
"step": 36
},
{
"epoch": 0.036852589641434265,
"grad_norm": 0.4754781424999237,
"learning_rate": 3.6633663366336635e-06,
"loss": 2.6848,
"step": 37
},
{
"epoch": 0.037848605577689244,
"grad_norm": 0.41208407282829285,
"learning_rate": 3.7623762376237625e-06,
"loss": 2.5176,
"step": 38
},
{
"epoch": 0.03884462151394422,
"grad_norm": 0.41328731179237366,
"learning_rate": 3.861386138613862e-06,
"loss": 2.3275,
"step": 39
},
{
"epoch": 0.0398406374501992,
"grad_norm": 0.5368106365203857,
"learning_rate": 3.960396039603961e-06,
"loss": 2.4877,
"step": 40
},
{
"epoch": 0.04083665338645418,
"grad_norm": 0.37100547552108765,
"learning_rate": 4.05940594059406e-06,
"loss": 2.5933,
"step": 41
},
{
"epoch": 0.04183266932270916,
"grad_norm": 0.4816776216030121,
"learning_rate": 4.158415841584159e-06,
"loss": 2.844,
"step": 42
},
{
"epoch": 0.04282868525896414,
"grad_norm": 0.4209342896938324,
"learning_rate": 4.2574257425742575e-06,
"loss": 2.5422,
"step": 43
},
{
"epoch": 0.043824701195219126,
"grad_norm": 0.6714078783988953,
"learning_rate": 4.356435643564357e-06,
"loss": 2.7081,
"step": 44
},
{
"epoch": 0.044820717131474105,
"grad_norm": 0.26568883657455444,
"learning_rate": 4.455445544554456e-06,
"loss": 2.9217,
"step": 45
},
{
"epoch": 0.045816733067729085,
"grad_norm": 0.37946802377700806,
"learning_rate": 4.554455445544555e-06,
"loss": 2.4118,
"step": 46
},
{
"epoch": 0.046812749003984064,
"grad_norm": 0.6484291553497314,
"learning_rate": 4.653465346534654e-06,
"loss": 2.547,
"step": 47
},
{
"epoch": 0.04780876494023904,
"grad_norm": 0.29198533296585083,
"learning_rate": 4.7524752475247525e-06,
"loss": 2.519,
"step": 48
},
{
"epoch": 0.04880478087649402,
"grad_norm": 0.7583147883415222,
"learning_rate": 4.851485148514852e-06,
"loss": 2.9767,
"step": 49
},
{
"epoch": 0.049800796812749,
"grad_norm": 0.5130609273910522,
"learning_rate": 4.950495049504951e-06,
"loss": 2.7004,
"step": 50
},
{
"epoch": 0.05079681274900399,
"grad_norm": 0.31620916724205017,
"learning_rate": 5.04950495049505e-06,
"loss": 2.4462,
"step": 51
},
{
"epoch": 0.05179282868525897,
"grad_norm": 0.8395189046859741,
"learning_rate": 5.148514851485149e-06,
"loss": 2.5588,
"step": 52
},
{
"epoch": 0.052788844621513946,
"grad_norm": 0.387138307094574,
"learning_rate": 5.247524752475248e-06,
"loss": 2.1448,
"step": 53
},
{
"epoch": 0.053784860557768925,
"grad_norm": 1.2158163785934448,
"learning_rate": 5.346534653465347e-06,
"loss": 2.3136,
"step": 54
},
{
"epoch": 0.054780876494023904,
"grad_norm": 0.2387009710073471,
"learning_rate": 5.4455445544554465e-06,
"loss": 2.1985,
"step": 55
},
{
"epoch": 0.055776892430278883,
"grad_norm": 0.3181290924549103,
"learning_rate": 5.544554455445545e-06,
"loss": 2.3419,
"step": 56
},
{
"epoch": 0.05677290836653386,
"grad_norm": 0.37027299404144287,
"learning_rate": 5.643564356435644e-06,
"loss": 2.6245,
"step": 57
},
{
"epoch": 0.05776892430278884,
"grad_norm": 1.2025309801101685,
"learning_rate": 5.7425742574257425e-06,
"loss": 3.9658,
"step": 58
},
{
"epoch": 0.05876494023904383,
"grad_norm": 0.2420024573802948,
"learning_rate": 5.841584158415842e-06,
"loss": 2.6491,
"step": 59
},
{
"epoch": 0.05976095617529881,
"grad_norm": 0.5019764304161072,
"learning_rate": 5.940594059405941e-06,
"loss": 2.5349,
"step": 60
},
{
"epoch": 0.060756972111553786,
"grad_norm": 0.42150792479515076,
"learning_rate": 6.03960396039604e-06,
"loss": 3.0192,
"step": 61
},
{
"epoch": 0.061752988047808766,
"grad_norm": 0.21461670100688934,
"learning_rate": 6.138613861386139e-06,
"loss": 2.4131,
"step": 62
},
{
"epoch": 0.06274900398406374,
"grad_norm": 0.27715393900871277,
"learning_rate": 6.237623762376238e-06,
"loss": 2.0699,
"step": 63
},
{
"epoch": 0.06374501992031872,
"grad_norm": 0.23175019025802612,
"learning_rate": 6.336633663366337e-06,
"loss": 2.454,
"step": 64
},
{
"epoch": 0.0647410358565737,
"grad_norm": 1.4909917116165161,
"learning_rate": 6.4356435643564364e-06,
"loss": 2.394,
"step": 65
},
{
"epoch": 0.06573705179282868,
"grad_norm": 0.25836002826690674,
"learning_rate": 6.534653465346535e-06,
"loss": 2.1991,
"step": 66
},
{
"epoch": 0.06673306772908366,
"grad_norm": 0.24367666244506836,
"learning_rate": 6.633663366336635e-06,
"loss": 2.1038,
"step": 67
},
{
"epoch": 0.06772908366533864,
"grad_norm": 0.2366018295288086,
"learning_rate": 6.732673267326733e-06,
"loss": 2.4102,
"step": 68
},
{
"epoch": 0.06872509960159362,
"grad_norm": 0.2741665542125702,
"learning_rate": 6.831683168316833e-06,
"loss": 2.4163,
"step": 69
},
{
"epoch": 0.0697211155378486,
"grad_norm": 1.1350017786026,
"learning_rate": 6.930693069306931e-06,
"loss": 3.1909,
"step": 70
},
{
"epoch": 0.07071713147410359,
"grad_norm": 0.7466657161712646,
"learning_rate": 7.02970297029703e-06,
"loss": 3.0505,
"step": 71
},
{
"epoch": 0.07171314741035857,
"grad_norm": 0.6016573309898376,
"learning_rate": 7.128712871287129e-06,
"loss": 2.523,
"step": 72
},
{
"epoch": 0.07270916334661355,
"grad_norm": 0.29950985312461853,
"learning_rate": 7.227722772277228e-06,
"loss": 2.2338,
"step": 73
},
{
"epoch": 0.07370517928286853,
"grad_norm": 0.3367365598678589,
"learning_rate": 7.326732673267327e-06,
"loss": 2.3668,
"step": 74
},
{
"epoch": 0.07470119521912351,
"grad_norm": 0.23957543075084686,
"learning_rate": 7.425742574257426e-06,
"loss": 2.3929,
"step": 75
},
{
"epoch": 0.07569721115537849,
"grad_norm": 0.2996574342250824,
"learning_rate": 7.524752475247525e-06,
"loss": 2.4929,
"step": 76
},
{
"epoch": 0.07669322709163347,
"grad_norm": 1.1166422367095947,
"learning_rate": 7.6237623762376246e-06,
"loss": 2.2256,
"step": 77
},
{
"epoch": 0.07768924302788845,
"grad_norm": 0.3733150362968445,
"learning_rate": 7.722772277227724e-06,
"loss": 2.4912,
"step": 78
},
{
"epoch": 0.07868525896414343,
"grad_norm": 0.3624296486377716,
"learning_rate": 7.821782178217822e-06,
"loss": 2.7605,
"step": 79
},
{
"epoch": 0.0796812749003984,
"grad_norm": 0.47846829891204834,
"learning_rate": 7.920792079207921e-06,
"loss": 2.7636,
"step": 80
},
{
"epoch": 0.08067729083665338,
"grad_norm": 0.3782709836959839,
"learning_rate": 8.019801980198021e-06,
"loss": 2.4481,
"step": 81
},
{
"epoch": 0.08167330677290836,
"grad_norm": 0.5004844665527344,
"learning_rate": 8.11881188118812e-06,
"loss": 2.4166,
"step": 82
},
{
"epoch": 0.08266932270916334,
"grad_norm": 0.20830737054347992,
"learning_rate": 8.217821782178218e-06,
"loss": 2.4728,
"step": 83
},
{
"epoch": 0.08366533864541832,
"grad_norm": 0.2479114830493927,
"learning_rate": 8.316831683168318e-06,
"loss": 2.2449,
"step": 84
},
{
"epoch": 0.0846613545816733,
"grad_norm": 0.42911332845687866,
"learning_rate": 8.415841584158416e-06,
"loss": 2.2295,
"step": 85
},
{
"epoch": 0.08565737051792828,
"grad_norm": 0.44220131635665894,
"learning_rate": 8.514851485148515e-06,
"loss": 2.4495,
"step": 86
},
{
"epoch": 0.08665338645418327,
"grad_norm": 0.23947738111019135,
"learning_rate": 8.613861386138615e-06,
"loss": 2.1415,
"step": 87
},
{
"epoch": 0.08764940239043825,
"grad_norm": 0.42801541090011597,
"learning_rate": 8.712871287128714e-06,
"loss": 2.3226,
"step": 88
},
{
"epoch": 0.08864541832669323,
"grad_norm": 0.39098042249679565,
"learning_rate": 8.811881188118812e-06,
"loss": 2.3063,
"step": 89
},
{
"epoch": 0.08964143426294821,
"grad_norm": 0.29923197627067566,
"learning_rate": 8.910891089108911e-06,
"loss": 2.6163,
"step": 90
},
{
"epoch": 0.09063745019920319,
"grad_norm": 0.2684191167354584,
"learning_rate": 9.009900990099011e-06,
"loss": 2.6845,
"step": 91
},
{
"epoch": 0.09163346613545817,
"grad_norm": 0.27097082138061523,
"learning_rate": 9.10891089108911e-06,
"loss": 2.598,
"step": 92
},
{
"epoch": 0.09262948207171315,
"grad_norm": 0.2647894024848938,
"learning_rate": 9.20792079207921e-06,
"loss": 2.2659,
"step": 93
},
{
"epoch": 0.09362549800796813,
"grad_norm": 0.38580745458602905,
"learning_rate": 9.306930693069308e-06,
"loss": 2.5818,
"step": 94
},
{
"epoch": 0.0946215139442231,
"grad_norm": 0.23376502096652985,
"learning_rate": 9.405940594059405e-06,
"loss": 2.207,
"step": 95
},
{
"epoch": 0.09561752988047809,
"grad_norm": 0.7030872702598572,
"learning_rate": 9.504950495049505e-06,
"loss": 2.1312,
"step": 96
},
{
"epoch": 0.09661354581673307,
"grad_norm": 0.23579809069633484,
"learning_rate": 9.603960396039604e-06,
"loss": 2.0685,
"step": 97
},
{
"epoch": 0.09760956175298804,
"grad_norm": 0.3901154100894928,
"learning_rate": 9.702970297029704e-06,
"loss": 2.662,
"step": 98
},
{
"epoch": 0.09860557768924302,
"grad_norm": 0.2687411606311798,
"learning_rate": 9.801980198019802e-06,
"loss": 2.9062,
"step": 99
},
{
"epoch": 0.099601593625498,
"grad_norm": 0.1916651427745819,
"learning_rate": 9.900990099009901e-06,
"loss": 2.1874,
"step": 100
},
{
"epoch": 0.10059760956175298,
"grad_norm": 0.3915342688560486,
"learning_rate": 1e-05,
"loss": 2.4196,
"step": 101
},
{
"epoch": 0.10159362549800798,
"grad_norm": 0.4256736636161804,
"learning_rate": 9.999969740355927e-06,
"loss": 2.3229,
"step": 102
},
{
"epoch": 0.10258964143426295,
"grad_norm": 0.24454592168331146,
"learning_rate": 9.999878961789962e-06,
"loss": 2.4725,
"step": 103
},
{
"epoch": 0.10358565737051793,
"grad_norm": 0.4549209475517273,
"learning_rate": 9.999727665400876e-06,
"loss": 2.6802,
"step": 104
},
{
"epoch": 0.10458167330677291,
"grad_norm": 0.27968448400497437,
"learning_rate": 9.999515853019941e-06,
"loss": 2.154,
"step": 105
},
{
"epoch": 0.10557768924302789,
"grad_norm": 0.30781856179237366,
"learning_rate": 9.999243527210904e-06,
"loss": 2.1358,
"step": 106
},
{
"epoch": 0.10657370517928287,
"grad_norm": 0.22190262377262115,
"learning_rate": 9.998910691269957e-06,
"loss": 2.1951,
"step": 107
},
{
"epoch": 0.10756972111553785,
"grad_norm": 0.36984801292419434,
"learning_rate": 9.998517349225698e-06,
"loss": 2.2283,
"step": 108
},
{
"epoch": 0.10856573705179283,
"grad_norm": 0.24837668240070343,
"learning_rate": 9.998063505839084e-06,
"loss": 2.4556,
"step": 109
},
{
"epoch": 0.10956175298804781,
"grad_norm": 0.17527616024017334,
"learning_rate": 9.99754916660337e-06,
"loss": 2.2414,
"step": 110
},
{
"epoch": 0.11055776892430279,
"grad_norm": 0.2681446969509125,
"learning_rate": 9.996974337744047e-06,
"loss": 2.6504,
"step": 111
},
{
"epoch": 0.11155378486055777,
"grad_norm": 0.41997164487838745,
"learning_rate": 9.99633902621876e-06,
"loss": 2.4704,
"step": 112
},
{
"epoch": 0.11254980079681275,
"grad_norm": 0.43319636583328247,
"learning_rate": 9.995643239717228e-06,
"loss": 2.4391,
"step": 113
},
{
"epoch": 0.11354581673306773,
"grad_norm": 0.3344462811946869,
"learning_rate": 9.994886986661155e-06,
"loss": 2.4113,
"step": 114
},
{
"epoch": 0.1145418326693227,
"grad_norm": 0.2086816132068634,
"learning_rate": 9.994070276204115e-06,
"loss": 2.1469,
"step": 115
},
{
"epoch": 0.11553784860557768,
"grad_norm": 0.35499969124794006,
"learning_rate": 9.993193118231463e-06,
"loss": 2.6212,
"step": 116
},
{
"epoch": 0.11653386454183266,
"grad_norm": 0.4640713036060333,
"learning_rate": 9.992255523360187e-06,
"loss": 2.5926,
"step": 117
},
{
"epoch": 0.11752988047808766,
"grad_norm": 0.3302168548107147,
"learning_rate": 9.991257502938805e-06,
"loss": 2.0769,
"step": 118
},
{
"epoch": 0.11852589641434264,
"grad_norm": 0.4918990135192871,
"learning_rate": 9.990199069047216e-06,
"loss": 2.4818,
"step": 119
},
{
"epoch": 0.11952191235059761,
"grad_norm": 0.25973260402679443,
"learning_rate": 9.989080234496548e-06,
"loss": 2.3236,
"step": 120
},
{
"epoch": 0.1205179282868526,
"grad_norm": 0.26280921697616577,
"learning_rate": 9.98790101282902e-06,
"loss": 2.4276,
"step": 121
},
{
"epoch": 0.12151394422310757,
"grad_norm": 0.21018964052200317,
"learning_rate": 9.986661418317759e-06,
"loss": 2.3006,
"step": 122
},
{
"epoch": 0.12250996015936255,
"grad_norm": 0.30688363313674927,
"learning_rate": 9.985361465966644e-06,
"loss": 2.3402,
"step": 123
},
{
"epoch": 0.12350597609561753,
"grad_norm": 0.27469494938850403,
"learning_rate": 9.984001171510112e-06,
"loss": 2.7987,
"step": 124
},
{
"epoch": 0.12450199203187251,
"grad_norm": 0.2825527489185333,
"learning_rate": 9.982580551412972e-06,
"loss": 2.4642,
"step": 125
},
{
"epoch": 0.1254980079681275,
"grad_norm": 0.6109227538108826,
"learning_rate": 9.98109962287021e-06,
"loss": 2.6041,
"step": 126
},
{
"epoch": 0.12649402390438247,
"grad_norm": 0.3983345031738281,
"learning_rate": 9.979558403806773e-06,
"loss": 2.4599,
"step": 127
},
{
"epoch": 0.12749003984063745,
"grad_norm": 0.3466341495513916,
"learning_rate": 9.977956912877356e-06,
"loss": 2.1902,
"step": 128
},
{
"epoch": 0.12848605577689243,
"grad_norm": 0.3762282729148865,
"learning_rate": 9.97629516946618e-06,
"loss": 2.2643,
"step": 129
},
{
"epoch": 0.1294820717131474,
"grad_norm": 0.523991048336029,
"learning_rate": 9.974573193686747e-06,
"loss": 2.196,
"step": 130
},
{
"epoch": 0.13047808764940239,
"grad_norm": 0.23254041373729706,
"learning_rate": 9.97279100638161e-06,
"loss": 2.4402,
"step": 131
},
{
"epoch": 0.13147410358565736,
"grad_norm": 0.3137255311012268,
"learning_rate": 9.970948629122108e-06,
"loss": 2.5905,
"step": 132
},
{
"epoch": 0.13247011952191234,
"grad_norm": 0.21106691658496857,
"learning_rate": 9.969046084208116e-06,
"loss": 2.3683,
"step": 133
},
{
"epoch": 0.13346613545816732,
"grad_norm": 0.4183836877346039,
"learning_rate": 9.967083394667763e-06,
"loss": 2.0614,
"step": 134
},
{
"epoch": 0.1344621513944223,
"grad_norm": 0.4468408226966858,
"learning_rate": 9.965060584257165e-06,
"loss": 2.4639,
"step": 135
},
{
"epoch": 0.13545816733067728,
"grad_norm": 0.22207094728946686,
"learning_rate": 9.962977677460132e-06,
"loss": 2.2261,
"step": 136
},
{
"epoch": 0.13645418326693226,
"grad_norm": 0.2465856373310089,
"learning_rate": 9.960834699487873e-06,
"loss": 2.2444,
"step": 137
},
{
"epoch": 0.13745019920318724,
"grad_norm": 0.3648821711540222,
"learning_rate": 9.958631676278686e-06,
"loss": 2.6132,
"step": 138
},
{
"epoch": 0.13844621513944222,
"grad_norm": 0.21419532597064972,
"learning_rate": 9.956368634497648e-06,
"loss": 2.4863,
"step": 139
},
{
"epoch": 0.1394422310756972,
"grad_norm": 0.5358874797821045,
"learning_rate": 9.9540456015363e-06,
"loss": 2.401,
"step": 140
},
{
"epoch": 0.14043824701195218,
"grad_norm": 0.25208160281181335,
"learning_rate": 9.951662605512298e-06,
"loss": 2.5901,
"step": 141
},
{
"epoch": 0.14143426294820718,
"grad_norm": 0.2659305930137634,
"learning_rate": 9.949219675269089e-06,
"loss": 2.3229,
"step": 142
},
{
"epoch": 0.14243027888446216,
"grad_norm": 0.5667638778686523,
"learning_rate": 9.946716840375552e-06,
"loss": 2.4998,
"step": 143
},
{
"epoch": 0.14342629482071714,
"grad_norm": 0.315893292427063,
"learning_rate": 9.944154131125643e-06,
"loss": 2.4017,
"step": 144
},
{
"epoch": 0.14442231075697212,
"grad_norm": 0.43832895159721375,
"learning_rate": 9.941531578538032e-06,
"loss": 2.473,
"step": 145
},
{
"epoch": 0.1454183266932271,
"grad_norm": 0.2750052511692047,
"learning_rate": 9.938849214355722e-06,
"loss": 2.3208,
"step": 146
},
{
"epoch": 0.14641434262948208,
"grad_norm": 0.26100143790245056,
"learning_rate": 9.936107071045665e-06,
"loss": 2.2151,
"step": 147
},
{
"epoch": 0.14741035856573706,
"grad_norm": 0.3080121576786041,
"learning_rate": 9.933305181798374e-06,
"loss": 2.0868,
"step": 148
},
{
"epoch": 0.14840637450199204,
"grad_norm": 0.24721984565258026,
"learning_rate": 9.93044358052752e-06,
"loss": 2.3312,
"step": 149
},
{
"epoch": 0.14940239043824702,
"grad_norm": 0.5916289687156677,
"learning_rate": 9.927522301869515e-06,
"loss": 2.6542,
"step": 150
},
{
"epoch": 0.150398406374502,
"grad_norm": 0.22790588438510895,
"learning_rate": 9.924541381183099e-06,
"loss": 2.3121,
"step": 151
},
{
"epoch": 0.15139442231075698,
"grad_norm": 0.3201110064983368,
"learning_rate": 9.921500854548916e-06,
"loss": 2.8776,
"step": 152
},
{
"epoch": 0.15239043824701196,
"grad_norm": 0.3063032627105713,
"learning_rate": 9.918400758769063e-06,
"loss": 2.3604,
"step": 153
},
{
"epoch": 0.15338645418326693,
"grad_norm": 0.41891732811927795,
"learning_rate": 9.915241131366657e-06,
"loss": 2.1036,
"step": 154
},
{
"epoch": 0.15438247011952191,
"grad_norm": 1.0909143686294556,
"learning_rate": 9.912022010585385e-06,
"loss": 2.5247,
"step": 155
},
{
"epoch": 0.1553784860557769,
"grad_norm": 0.26673582196235657,
"learning_rate": 9.90874343538902e-06,
"loss": 2.159,
"step": 156
},
{
"epoch": 0.15637450199203187,
"grad_norm": 0.3614170551300049,
"learning_rate": 9.905405445460972e-06,
"loss": 2.4383,
"step": 157
},
{
"epoch": 0.15737051792828685,
"grad_norm": 0.5078898668289185,
"learning_rate": 9.902008081203796e-06,
"loss": 2.2543,
"step": 158
},
{
"epoch": 0.15836653386454183,
"grad_norm": 0.3936934769153595,
"learning_rate": 9.898551383738707e-06,
"loss": 2.538,
"step": 159
},
{
"epoch": 0.1593625498007968,
"grad_norm": 0.6516975164413452,
"learning_rate": 9.895035394905073e-06,
"loss": 2.5301,
"step": 160
},
{
"epoch": 0.1603585657370518,
"grad_norm": 0.26518794894218445,
"learning_rate": 9.89146015725993e-06,
"loss": 2.2118,
"step": 161
},
{
"epoch": 0.16135458167330677,
"grad_norm": 0.41366127133369446,
"learning_rate": 9.887825714077439e-06,
"loss": 2.4799,
"step": 162
},
{
"epoch": 0.16235059760956175,
"grad_norm": 0.5400006771087646,
"learning_rate": 9.884132109348386e-06,
"loss": 2.5019,
"step": 163
},
{
"epoch": 0.16334661354581673,
"grad_norm": 0.36508408188819885,
"learning_rate": 9.880379387779637e-06,
"loss": 2.7938,
"step": 164
},
{
"epoch": 0.1643426294820717,
"grad_norm": 0.4240388572216034,
"learning_rate": 9.876567594793597e-06,
"loss": 2.5002,
"step": 165
},
{
"epoch": 0.16533864541832669,
"grad_norm": 0.277864933013916,
"learning_rate": 9.87269677652767e-06,
"loss": 2.5436,
"step": 166
},
{
"epoch": 0.16633466135458166,
"grad_norm": 0.25840163230895996,
"learning_rate": 9.868766979833686e-06,
"loss": 2.2811,
"step": 167
},
{
"epoch": 0.16733067729083664,
"grad_norm": 0.32198214530944824,
"learning_rate": 9.864778252277344e-06,
"loss": 2.3215,
"step": 168
},
{
"epoch": 0.16832669322709162,
"grad_norm": 0.613046407699585,
"learning_rate": 9.86073064213764e-06,
"loss": 2.8133,
"step": 169
},
{
"epoch": 0.1693227091633466,
"grad_norm": 0.38470038771629333,
"learning_rate": 9.856624198406262e-06,
"loss": 2.4133,
"step": 170
},
{
"epoch": 0.17031872509960158,
"grad_norm": 0.3675747811794281,
"learning_rate": 9.852458970787027e-06,
"loss": 2.0617,
"step": 171
},
{
"epoch": 0.17131474103585656,
"grad_norm": 0.26074767112731934,
"learning_rate": 9.848235009695255e-06,
"loss": 2.132,
"step": 172
},
{
"epoch": 0.17231075697211157,
"grad_norm": 0.5974801778793335,
"learning_rate": 9.84395236625717e-06,
"loss": 2.3888,
"step": 173
},
{
"epoch": 0.17330677290836655,
"grad_norm": 0.2652048170566559,
"learning_rate": 9.839611092309278e-06,
"loss": 2.4468,
"step": 174
},
{
"epoch": 0.17430278884462153,
"grad_norm": 0.6336271166801453,
"learning_rate": 9.835211240397743e-06,
"loss": 2.3256,
"step": 175
},
{
"epoch": 0.1752988047808765,
"grad_norm": 0.3853505849838257,
"learning_rate": 9.830752863777741e-06,
"loss": 2.5527,
"step": 176
},
{
"epoch": 0.17629482071713148,
"grad_norm": 0.25374558568000793,
"learning_rate": 9.826236016412833e-06,
"loss": 2.5593,
"step": 177
},
{
"epoch": 0.17729083665338646,
"grad_norm": 0.29101264476776123,
"learning_rate": 9.821660752974294e-06,
"loss": 2.6399,
"step": 178
},
{
"epoch": 0.17828685258964144,
"grad_norm": 0.7464101910591125,
"learning_rate": 9.817027128840462e-06,
"loss": 2.1674,
"step": 179
},
{
"epoch": 0.17928286852589642,
"grad_norm": 0.28557366132736206,
"learning_rate": 9.812335200096064e-06,
"loss": 2.7127,
"step": 180
},
{
"epoch": 0.1802788844621514,
"grad_norm": 0.5655897259712219,
"learning_rate": 9.807585023531536e-06,
"loss": 2.0397,
"step": 181
},
{
"epoch": 0.18127490039840638,
"grad_norm": 0.2831386625766754,
"learning_rate": 9.802776656642341e-06,
"loss": 2.1947,
"step": 182
},
{
"epoch": 0.18227091633466136,
"grad_norm": 0.30917420983314514,
"learning_rate": 9.797910157628265e-06,
"loss": 2.3951,
"step": 183
},
{
"epoch": 0.18326693227091634,
"grad_norm": 0.3886703550815582,
"learning_rate": 9.792985585392722e-06,
"loss": 2.4107,
"step": 184
},
{
"epoch": 0.18426294820717132,
"grad_norm": 0.51981121301651,
"learning_rate": 9.78800299954203e-06,
"loss": 2.3655,
"step": 185
},
{
"epoch": 0.1852589641434263,
"grad_norm": 0.31090375781059265,
"learning_rate": 9.782962460384701e-06,
"loss": 1.984,
"step": 186
},
{
"epoch": 0.18625498007968128,
"grad_norm": 0.4561314880847931,
"learning_rate": 9.777864028930705e-06,
"loss": 3.0161,
"step": 187
},
{
"epoch": 0.18725099601593626,
"grad_norm": 0.3265978693962097,
"learning_rate": 9.772707766890726e-06,
"loss": 2.6738,
"step": 188
},
{
"epoch": 0.18824701195219123,
"grad_norm": 0.5627899765968323,
"learning_rate": 9.767493736675429e-06,
"loss": 2.4544,
"step": 189
},
{
"epoch": 0.1892430278884462,
"grad_norm": 0.3551636338233948,
"learning_rate": 9.762222001394692e-06,
"loss": 2.4696,
"step": 190
},
{
"epoch": 0.1902390438247012,
"grad_norm": 0.27445298433303833,
"learning_rate": 9.756892624856848e-06,
"loss": 2.5626,
"step": 191
},
{
"epoch": 0.19123505976095617,
"grad_norm": 0.4346907436847687,
"learning_rate": 9.751505671567914e-06,
"loss": 2.6588,
"step": 192
},
{
"epoch": 0.19223107569721115,
"grad_norm": 1.7177170515060425,
"learning_rate": 9.746061206730801e-06,
"loss": 3.3538,
"step": 193
},
{
"epoch": 0.19322709163346613,
"grad_norm": 0.294007807970047,
"learning_rate": 9.740559296244543e-06,
"loss": 2.7963,
"step": 194
},
{
"epoch": 0.1942231075697211,
"grad_norm": 0.3322044014930725,
"learning_rate": 9.735000006703475e-06,
"loss": 2.1763,
"step": 195
},
{
"epoch": 0.1952191235059761,
"grad_norm": 0.2852723002433777,
"learning_rate": 9.72938340539645e-06,
"loss": 2.2182,
"step": 196
},
{
"epoch": 0.19621513944223107,
"grad_norm": 0.2600834369659424,
"learning_rate": 9.723709560306009e-06,
"loss": 2.7632,
"step": 197
},
{
"epoch": 0.19721115537848605,
"grad_norm": 0.27677562832832336,
"learning_rate": 9.717978540107566e-06,
"loss": 2.3831,
"step": 198
},
{
"epoch": 0.19820717131474103,
"grad_norm": 0.4312080144882202,
"learning_rate": 9.712190414168573e-06,
"loss": 2.4096,
"step": 199
},
{
"epoch": 0.199203187250996,
"grad_norm": 0.7516922950744629,
"learning_rate": 9.706345252547681e-06,
"loss": 3.0072,
"step": 200
},
{
"epoch": 0.20019920318725098,
"grad_norm": 0.47257497906684875,
"learning_rate": 9.700443125993897e-06,
"loss": 2.4537,
"step": 201
},
{
"epoch": 0.20119521912350596,
"grad_norm": 0.5163850784301758,
"learning_rate": 9.694484105945719e-06,
"loss": 2.4488,
"step": 202
},
{
"epoch": 0.20219123505976094,
"grad_norm": 0.2632780373096466,
"learning_rate": 9.688468264530278e-06,
"loss": 2.5477,
"step": 203
},
{
"epoch": 0.20318725099601595,
"grad_norm": 1.0932762622833252,
"learning_rate": 9.682395674562459e-06,
"loss": 2.8381,
"step": 204
},
{
"epoch": 0.20418326693227093,
"grad_norm": 0.568217396736145,
"learning_rate": 9.676266409544031e-06,
"loss": 2.2398,
"step": 205
},
{
"epoch": 0.2051792828685259,
"grad_norm": 0.5864899754524231,
"learning_rate": 9.670080543662742e-06,
"loss": 2.5067,
"step": 206
},
{
"epoch": 0.2061752988047809,
"grad_norm": 0.38742220401763916,
"learning_rate": 9.663838151791431e-06,
"loss": 2.3831,
"step": 207
},
{
"epoch": 0.20717131474103587,
"grad_norm": 0.441034197807312,
"learning_rate": 9.657539309487123e-06,
"loss": 2.3785,
"step": 208
},
{
"epoch": 0.20816733067729085,
"grad_norm": 1.6438182592391968,
"learning_rate": 9.651184092990109e-06,
"loss": 3.6952,
"step": 209
},
{
"epoch": 0.20916334661354583,
"grad_norm": 0.35267430543899536,
"learning_rate": 9.644772579223023e-06,
"loss": 2.4354,
"step": 210
},
{
"epoch": 0.2101593625498008,
"grad_norm": 0.27790936827659607,
"learning_rate": 9.638304845789916e-06,
"loss": 2.3407,
"step": 211
},
{
"epoch": 0.21115537848605578,
"grad_norm": 0.4317843019962311,
"learning_rate": 9.631780970975311e-06,
"loss": 2.2805,
"step": 212
},
{
"epoch": 0.21215139442231076,
"grad_norm": 0.35801681876182556,
"learning_rate": 9.625201033743262e-06,
"loss": 2.3219,
"step": 213
},
{
"epoch": 0.21314741035856574,
"grad_norm": 0.3666556477546692,
"learning_rate": 9.618565113736388e-06,
"loss": 2.1962,
"step": 214
},
{
"epoch": 0.21414342629482072,
"grad_norm": 0.36347630620002747,
"learning_rate": 9.611873291274927e-06,
"loss": 2.1945,
"step": 215
},
{
"epoch": 0.2151394422310757,
"grad_norm": 0.47142109274864197,
"learning_rate": 9.60512564735574e-06,
"loss": 2.1907,
"step": 216
},
{
"epoch": 0.21613545816733068,
"grad_norm": 0.3300761580467224,
"learning_rate": 9.598322263651352e-06,
"loss": 2.0638,
"step": 217
},
{
"epoch": 0.21713147410358566,
"grad_norm": 0.3918429911136627,
"learning_rate": 9.591463222508947e-06,
"loss": 2.4349,
"step": 218
},
{
"epoch": 0.21812749003984064,
"grad_norm": 0.3837280869483948,
"learning_rate": 9.584548606949384e-06,
"loss": 2.2359,
"step": 219
},
{
"epoch": 0.21912350597609562,
"grad_norm": 0.47225990891456604,
"learning_rate": 9.577578500666187e-06,
"loss": 2.4696,
"step": 220
},
{
"epoch": 0.2201195219123506,
"grad_norm": 0.3448033630847931,
"learning_rate": 9.570552988024527e-06,
"loss": 2.3639,
"step": 221
},
{
"epoch": 0.22111553784860558,
"grad_norm": 0.39937150478363037,
"learning_rate": 9.563472154060212e-06,
"loss": 2.3513,
"step": 222
},
{
"epoch": 0.22211155378486055,
"grad_norm": 0.3486849367618561,
"learning_rate": 9.556336084478645e-06,
"loss": 2.3674,
"step": 223
},
{
"epoch": 0.22310756972111553,
"grad_norm": 0.4388813376426697,
"learning_rate": 9.5491448656538e-06,
"loss": 2.4748,
"step": 224
},
{
"epoch": 0.2241035856573705,
"grad_norm": 0.4307428300380707,
"learning_rate": 9.541898584627164e-06,
"loss": 2.1206,
"step": 225
},
{
"epoch": 0.2250996015936255,
"grad_norm": 0.5265683531761169,
"learning_rate": 9.534597329106688e-06,
"loss": 2.589,
"step": 226
},
{
"epoch": 0.22609561752988047,
"grad_norm": 0.5943540930747986,
"learning_rate": 9.527241187465735e-06,
"loss": 2.8641,
"step": 227
},
{
"epoch": 0.22709163346613545,
"grad_norm": 0.3554113209247589,
"learning_rate": 9.519830248741991e-06,
"loss": 2.0978,
"step": 228
},
{
"epoch": 0.22808764940239043,
"grad_norm": 0.43764352798461914,
"learning_rate": 9.512364602636405e-06,
"loss": 2.2777,
"step": 229
},
{
"epoch": 0.2290836653386454,
"grad_norm": 0.27372264862060547,
"learning_rate": 9.504844339512096e-06,
"loss": 2.3366,
"step": 230
},
{
"epoch": 0.2300796812749004,
"grad_norm": 0.5419708490371704,
"learning_rate": 9.497269550393257e-06,
"loss": 2.5115,
"step": 231
},
{
"epoch": 0.23107569721115537,
"grad_norm": 0.3294195234775543,
"learning_rate": 9.489640326964058e-06,
"loss": 2.3812,
"step": 232
},
{
"epoch": 0.23207171314741035,
"grad_norm": 0.3676604926586151,
"learning_rate": 9.481956761567531e-06,
"loss": 2.1645,
"step": 233
},
{
"epoch": 0.23306772908366533,
"grad_norm": 0.24499647319316864,
"learning_rate": 9.47421894720446e-06,
"loss": 2.436,
"step": 234
},
{
"epoch": 0.2340637450199203,
"grad_norm": 0.4562065601348877,
"learning_rate": 9.466426977532246e-06,
"loss": 2.4614,
"step": 235
},
{
"epoch": 0.2350597609561753,
"grad_norm": 0.4152824580669403,
"learning_rate": 9.458580946863784e-06,
"loss": 2.6406,
"step": 236
},
{
"epoch": 0.2360557768924303,
"grad_norm": 0.2812240421772003,
"learning_rate": 9.45068095016631e-06,
"loss": 2.4129,
"step": 237
},
{
"epoch": 0.23705179282868527,
"grad_norm": 0.37695300579071045,
"learning_rate": 9.442727083060258e-06,
"loss": 2.8288,
"step": 238
},
{
"epoch": 0.23804780876494025,
"grad_norm": 0.29094114899635315,
"learning_rate": 9.434719441818106e-06,
"loss": 2.2392,
"step": 239
},
{
"epoch": 0.23904382470119523,
"grad_norm": 0.6004308462142944,
"learning_rate": 9.426658123363202e-06,
"loss": 2.6978,
"step": 240
},
{
"epoch": 0.2400398406374502,
"grad_norm": 0.824376106262207,
"learning_rate": 9.418543225268598e-06,
"loss": 2.8384,
"step": 241
},
{
"epoch": 0.2410358565737052,
"grad_norm": 0.37361350655555725,
"learning_rate": 9.410374845755862e-06,
"loss": 2.7737,
"step": 242
},
{
"epoch": 0.24203187250996017,
"grad_norm": 0.3311799168586731,
"learning_rate": 9.402153083693898e-06,
"loss": 2.6569,
"step": 243
},
{
"epoch": 0.24302788844621515,
"grad_norm": 0.31674501299858093,
"learning_rate": 9.393878038597748e-06,
"loss": 2.4791,
"step": 244
},
{
"epoch": 0.24402390438247012,
"grad_norm": 0.5512855052947998,
"learning_rate": 9.385549810627374e-06,
"loss": 2.0412,
"step": 245
},
{
"epoch": 0.2450199203187251,
"grad_norm": 0.48202139139175415,
"learning_rate": 9.377168500586465e-06,
"loss": 2.4472,
"step": 246
},
{
"epoch": 0.24601593625498008,
"grad_norm": 0.29134997725486755,
"learning_rate": 9.3687342099212e-06,
"loss": 2.1911,
"step": 247
},
{
"epoch": 0.24701195219123506,
"grad_norm": 0.27262917160987854,
"learning_rate": 9.36024704071904e-06,
"loss": 2.3633,
"step": 248
},
{
"epoch": 0.24800796812749004,
"grad_norm": 0.2618001699447632,
"learning_rate": 9.351707095707465e-06,
"loss": 2.3556,
"step": 249
},
{
"epoch": 0.24900398406374502,
"grad_norm": 0.4861814081668854,
"learning_rate": 9.343114478252758e-06,
"loss": 2.2809,
"step": 250
},
{
"epoch": 0.25,
"grad_norm": 0.3331791162490845,
"learning_rate": 9.334469292358736e-06,
"loss": 2.2747,
"step": 251
},
{
"epoch": 0.250996015936255,
"grad_norm": 0.3599317669868469,
"learning_rate": 9.3257716426655e-06,
"loss": 2.3204,
"step": 252
},
{
"epoch": 0.25199203187250996,
"grad_norm": 0.28026479482650757,
"learning_rate": 9.317021634448162e-06,
"loss": 2.5997,
"step": 253
},
{
"epoch": 0.25298804780876494,
"grad_norm": 0.4968087077140808,
"learning_rate": 9.308219373615574e-06,
"loss": 2.4623,
"step": 254
},
{
"epoch": 0.2539840637450199,
"grad_norm": 0.5899234414100647,
"learning_rate": 9.299364966709051e-06,
"loss": 2.4678,
"step": 255
},
{
"epoch": 0.2549800796812749,
"grad_norm": 0.27741050720214844,
"learning_rate": 9.290458520901072e-06,
"loss": 2.4373,
"step": 256
},
{
"epoch": 0.2559760956175299,
"grad_norm": 0.44141483306884766,
"learning_rate": 9.28150014399399e-06,
"loss": 2.3013,
"step": 257
},
{
"epoch": 0.25697211155378485,
"grad_norm": 0.4108343720436096,
"learning_rate": 9.272489944418724e-06,
"loss": 2.6281,
"step": 258
},
{
"epoch": 0.25796812749003983,
"grad_norm": 0.4309611916542053,
"learning_rate": 9.263428031233444e-06,
"loss": 2.6192,
"step": 259
},
{
"epoch": 0.2589641434262948,
"grad_norm": 0.3191240727901459,
"learning_rate": 9.25431451412226e-06,
"loss": 2.3667,
"step": 260
},
{
"epoch": 0.2599601593625498,
"grad_norm": 0.4311404824256897,
"learning_rate": 9.245149503393884e-06,
"loss": 2.5286,
"step": 261
},
{
"epoch": 0.26095617529880477,
"grad_norm": 0.8753085136413574,
"learning_rate": 9.235933109980302e-06,
"loss": 2.6609,
"step": 262
},
{
"epoch": 0.26195219123505975,
"grad_norm": 0.679023265838623,
"learning_rate": 9.226665445435428e-06,
"loss": 2.4715,
"step": 263
},
{
"epoch": 0.26294820717131473,
"grad_norm": 0.4910929501056671,
"learning_rate": 9.217346621933753e-06,
"loss": 2.0939,
"step": 264
},
{
"epoch": 0.2639442231075697,
"grad_norm": 1.3370636701583862,
"learning_rate": 9.207976752268992e-06,
"loss": 2.4367,
"step": 265
},
{
"epoch": 0.2649402390438247,
"grad_norm": 0.4148995578289032,
"learning_rate": 9.19855594985271e-06,
"loss": 2.5403,
"step": 266
},
{
"epoch": 0.26593625498007967,
"grad_norm": 0.5130553841590881,
"learning_rate": 9.189084328712961e-06,
"loss": 2.654,
"step": 267
},
{
"epoch": 0.26693227091633465,
"grad_norm": 0.5000612139701843,
"learning_rate": 9.179562003492898e-06,
"loss": 2.2451,
"step": 268
},
{
"epoch": 0.2679282868525896,
"grad_norm": 1.3651481866836548,
"learning_rate": 9.16998908944939e-06,
"loss": 2.3566,
"step": 269
},
{
"epoch": 0.2689243027888446,
"grad_norm": 0.4688972532749176,
"learning_rate": 9.160365702451625e-06,
"loss": 2.4274,
"step": 270
},
{
"epoch": 0.2699203187250996,
"grad_norm": 0.44729602336883545,
"learning_rate": 9.150691958979712e-06,
"loss": 2.3431,
"step": 271
},
{
"epoch": 0.27091633466135456,
"grad_norm": 0.4126404821872711,
"learning_rate": 9.14096797612326e-06,
"loss": 2.4358,
"step": 272
},
{
"epoch": 0.27191235059760954,
"grad_norm": 0.8661454319953918,
"learning_rate": 9.131193871579975e-06,
"loss": 2.6125,
"step": 273
},
{
"epoch": 0.2729083665338645,
"grad_norm": 0.35947325825691223,
"learning_rate": 9.121369763654228e-06,
"loss": 2.2618,
"step": 274
},
{
"epoch": 0.2739043824701195,
"grad_norm": 0.3399883210659027,
"learning_rate": 9.111495771255623e-06,
"loss": 2.1576,
"step": 275
},
{
"epoch": 0.2749003984063745,
"grad_norm": 0.4308667778968811,
"learning_rate": 9.101572013897555e-06,
"loss": 2.0927,
"step": 276
},
{
"epoch": 0.27589641434262946,
"grad_norm": 0.3370983302593231,
"learning_rate": 9.091598611695774e-06,
"loss": 2.3073,
"step": 277
},
{
"epoch": 0.27689243027888444,
"grad_norm": 0.30387991666793823,
"learning_rate": 9.081575685366919e-06,
"loss": 2.5888,
"step": 278
},
{
"epoch": 0.2778884462151394,
"grad_norm": 0.6190817952156067,
"learning_rate": 9.071503356227063e-06,
"loss": 2.5076,
"step": 279
},
{
"epoch": 0.2788844621513944,
"grad_norm": 0.43932202458381653,
"learning_rate": 9.061381746190243e-06,
"loss": 2.3828,
"step": 280
},
{
"epoch": 0.2798804780876494,
"grad_norm": 0.4108044505119324,
"learning_rate": 9.051210977766987e-06,
"loss": 2.4859,
"step": 281
},
{
"epoch": 0.28087649402390436,
"grad_norm": 0.7741344571113586,
"learning_rate": 9.040991174062827e-06,
"loss": 2.332,
"step": 282
},
{
"epoch": 0.2818725099601594,
"grad_norm": 0.42518022656440735,
"learning_rate": 9.030722458776815e-06,
"loss": 2.5427,
"step": 283
},
{
"epoch": 0.28286852589641437,
"grad_norm": 0.3040229082107544,
"learning_rate": 9.020404956200016e-06,
"loss": 2.5173,
"step": 284
},
{
"epoch": 0.28386454183266935,
"grad_norm": 0.4257875680923462,
"learning_rate": 9.010038791214012e-06,
"loss": 2.3872,
"step": 285
},
{
"epoch": 0.2848605577689243,
"grad_norm": 0.6529532074928284,
"learning_rate": 8.999624089289389e-06,
"loss": 2.2936,
"step": 286
},
{
"epoch": 0.2858565737051793,
"grad_norm": 0.2723180055618286,
"learning_rate": 8.989160976484218e-06,
"loss": 2.5736,
"step": 287
},
{
"epoch": 0.2868525896414343,
"grad_norm": 0.3371571898460388,
"learning_rate": 8.978649579442525e-06,
"loss": 2.3213,
"step": 288
},
{
"epoch": 0.28784860557768926,
"grad_norm": 0.4722624719142914,
"learning_rate": 8.968090025392763e-06,
"loss": 2.4039,
"step": 289
},
{
"epoch": 0.28884462151394424,
"grad_norm": 0.6963698863983154,
"learning_rate": 8.957482442146271e-06,
"loss": 2.4849,
"step": 290
},
{
"epoch": 0.2898406374501992,
"grad_norm": 0.41670724749565125,
"learning_rate": 8.946826958095726e-06,
"loss": 2.4297,
"step": 291
},
{
"epoch": 0.2908366533864542,
"grad_norm": 0.44924449920654297,
"learning_rate": 8.936123702213593e-06,
"loss": 2.29,
"step": 292
},
{
"epoch": 0.2918326693227092,
"grad_norm": 0.5405289530754089,
"learning_rate": 8.925372804050554e-06,
"loss": 2.4732,
"step": 293
},
{
"epoch": 0.29282868525896416,
"grad_norm": 0.5333283543586731,
"learning_rate": 8.914574393733953e-06,
"loss": 2.3553,
"step": 294
},
{
"epoch": 0.29382470119521914,
"grad_norm": 0.4173821806907654,
"learning_rate": 8.903728601966206e-06,
"loss": 2.4729,
"step": 295
},
{
"epoch": 0.2948207171314741,
"grad_norm": 0.6668480634689331,
"learning_rate": 8.892835560023236e-06,
"loss": 2.7302,
"step": 296
},
{
"epoch": 0.2958167330677291,
"grad_norm": 0.5601832270622253,
"learning_rate": 8.881895399752873e-06,
"loss": 2.8201,
"step": 297
},
{
"epoch": 0.2968127490039841,
"grad_norm": 0.7715175151824951,
"learning_rate": 8.870908253573255e-06,
"loss": 2.439,
"step": 298
},
{
"epoch": 0.29780876494023906,
"grad_norm": 0.6411163210868835,
"learning_rate": 8.85987425447124e-06,
"loss": 2.2098,
"step": 299
},
{
"epoch": 0.29880478087649404,
"grad_norm": 1.8174595832824707,
"learning_rate": 8.848793536000779e-06,
"loss": 2.862,
"step": 300
},
{
"epoch": 0.299800796812749,
"grad_norm": 0.4861983060836792,
"learning_rate": 8.837666232281312e-06,
"loss": 1.964,
"step": 301
},
{
"epoch": 0.300796812749004,
"grad_norm": 0.629531979560852,
"learning_rate": 8.826492477996138e-06,
"loss": 2.4866,
"step": 302
},
{
"epoch": 0.301792828685259,
"grad_norm": 0.33506232500076294,
"learning_rate": 8.81527240839079e-06,
"loss": 2.1813,
"step": 303
},
{
"epoch": 0.30278884462151395,
"grad_norm": 0.5065098404884338,
"learning_rate": 8.80400615927139e-06,
"loss": 2.2313,
"step": 304
},
{
"epoch": 0.30378486055776893,
"grad_norm": 0.31633898615837097,
"learning_rate": 8.792693867003017e-06,
"loss": 2.5764,
"step": 305
},
{
"epoch": 0.3047808764940239,
"grad_norm": 0.6082801818847656,
"learning_rate": 8.781335668508044e-06,
"loss": 2.1408,
"step": 306
},
{
"epoch": 0.3057768924302789,
"grad_norm": 0.3309324383735657,
"learning_rate": 8.76993170126449e-06,
"loss": 2.5198,
"step": 307
},
{
"epoch": 0.30677290836653387,
"grad_norm": 0.38401421904563904,
"learning_rate": 8.758482103304348e-06,
"loss": 2.2784,
"step": 308
},
{
"epoch": 0.30776892430278885,
"grad_norm": 0.4021207392215729,
"learning_rate": 8.746987013211924e-06,
"loss": 2.1789,
"step": 309
},
{
"epoch": 0.30876494023904383,
"grad_norm": 1.5585757493972778,
"learning_rate": 8.735446570122151e-06,
"loss": 2.429,
"step": 310
},
{
"epoch": 0.3097609561752988,
"grad_norm": 0.5734106302261353,
"learning_rate": 8.72386091371891e-06,
"loss": 2.3943,
"step": 311
},
{
"epoch": 0.3107569721115538,
"grad_norm": 0.5181722044944763,
"learning_rate": 8.712230184233337e-06,
"loss": 2.4501,
"step": 312
},
{
"epoch": 0.31175298804780877,
"grad_norm": 0.42989951372146606,
"learning_rate": 8.700554522442124e-06,
"loss": 2.3612,
"step": 313
},
{
"epoch": 0.31274900398406374,
"grad_norm": 0.8372073769569397,
"learning_rate": 8.688834069665819e-06,
"loss": 2.6138,
"step": 314
},
{
"epoch": 0.3137450199203187,
"grad_norm": 0.5447811484336853,
"learning_rate": 8.677068967767117e-06,
"loss": 2.4036,
"step": 315
},
{
"epoch": 0.3147410358565737,
"grad_norm": 0.28892412781715393,
"learning_rate": 8.665259359149132e-06,
"loss": 2.4249,
"step": 316
},
{
"epoch": 0.3157370517928287,
"grad_norm": 0.33981916308403015,
"learning_rate": 8.653405386753688e-06,
"loss": 2.8255,
"step": 317
},
{
"epoch": 0.31673306772908366,
"grad_norm": 0.37266361713409424,
"learning_rate": 8.64150719405958e-06,
"loss": 2.4619,
"step": 318
},
{
"epoch": 0.31772908366533864,
"grad_norm": 0.611991286277771,
"learning_rate": 8.629564925080838e-06,
"loss": 2.6266,
"step": 319
},
{
"epoch": 0.3187250996015936,
"grad_norm": 0.7753398418426514,
"learning_rate": 8.617578724364984e-06,
"loss": 2.5113,
"step": 320
},
{
"epoch": 0.3197211155378486,
"grad_norm": 0.7812793850898743,
"learning_rate": 8.605548736991284e-06,
"loss": 2.1736,
"step": 321
},
{
"epoch": 0.3207171314741036,
"grad_norm": 0.5114774703979492,
"learning_rate": 8.593475108568995e-06,
"loss": 2.7169,
"step": 322
},
{
"epoch": 0.32171314741035856,
"grad_norm": 0.3701231777667999,
"learning_rate": 8.581357985235595e-06,
"loss": 2.2407,
"step": 323
},
{
"epoch": 0.32270916334661354,
"grad_norm": 1.135130763053894,
"learning_rate": 8.569197513655022e-06,
"loss": 2.0901,
"step": 324
},
{
"epoch": 0.3237051792828685,
"grad_norm": 0.2848869562149048,
"learning_rate": 8.55699384101589e-06,
"loss": 2.592,
"step": 325
},
{
"epoch": 0.3247011952191235,
"grad_norm": 0.5609592795372009,
"learning_rate": 8.544747115029717e-06,
"loss": 2.3673,
"step": 326
},
{
"epoch": 0.3256972111553785,
"grad_norm": 0.3470471203327179,
"learning_rate": 8.53245748392913e-06,
"loss": 2.377,
"step": 327
},
{
"epoch": 0.32669322709163345,
"grad_norm": 0.9178757667541504,
"learning_rate": 8.520125096466072e-06,
"loss": 2.7617,
"step": 328
},
{
"epoch": 0.32768924302788843,
"grad_norm": 0.46402791142463684,
"learning_rate": 8.50775010191001e-06,
"loss": 2.2848,
"step": 329
},
{
"epoch": 0.3286852589641434,
"grad_norm": 0.4459151327610016,
"learning_rate": 8.495332650046112e-06,
"loss": 2.1105,
"step": 330
},
{
"epoch": 0.3296812749003984,
"grad_norm": 0.7026370763778687,
"learning_rate": 8.48287289117345e-06,
"loss": 2.6823,
"step": 331
},
{
"epoch": 0.33067729083665337,
"grad_norm": 0.7429327964782715,
"learning_rate": 8.470370976103171e-06,
"loss": 2.2689,
"step": 332
},
{
"epoch": 0.33167330677290835,
"grad_norm": 0.37948814034461975,
"learning_rate": 8.457827056156673e-06,
"loss": 2.628,
"step": 333
},
{
"epoch": 0.33266932270916333,
"grad_norm": 0.3749179244041443,
"learning_rate": 8.44524128316378e-06,
"loss": 2.4552,
"step": 334
},
{
"epoch": 0.3336653386454183,
"grad_norm": 0.421303391456604,
"learning_rate": 8.432613809460895e-06,
"loss": 2.1797,
"step": 335
},
{
"epoch": 0.3346613545816733,
"grad_norm": 0.5645405054092407,
"learning_rate": 8.419944787889162e-06,
"loss": 2.4308,
"step": 336
},
{
"epoch": 0.33565737051792827,
"grad_norm": 0.7171806693077087,
"learning_rate": 8.407234371792614e-06,
"loss": 2.3868,
"step": 337
},
{
"epoch": 0.33665338645418325,
"grad_norm": 0.7937443852424622,
"learning_rate": 8.394482715016318e-06,
"loss": 2.433,
"step": 338
},
{
"epoch": 0.3376494023904382,
"grad_norm": 0.5325895547866821,
"learning_rate": 8.381689971904514e-06,
"loss": 2.5116,
"step": 339
},
{
"epoch": 0.3386454183266932,
"grad_norm": 0.3550787568092346,
"learning_rate": 8.368856297298742e-06,
"loss": 2.4187,
"step": 340
},
{
"epoch": 0.3396414342629482,
"grad_norm": 0.5118217468261719,
"learning_rate": 8.355981846535972e-06,
"loss": 2.1325,
"step": 341
},
{
"epoch": 0.34063745019920316,
"grad_norm": 0.35231295228004456,
"learning_rate": 8.343066775446724e-06,
"loss": 2.3751,
"step": 342
},
{
"epoch": 0.34163346613545814,
"grad_norm": 0.6949347853660583,
"learning_rate": 8.330111240353178e-06,
"loss": 2.5615,
"step": 343
},
{
"epoch": 0.3426294820717131,
"grad_norm": 0.5718231797218323,
"learning_rate": 8.317115398067289e-06,
"loss": 2.2858,
"step": 344
},
{
"epoch": 0.3436254980079681,
"grad_norm": 0.6337103843688965,
"learning_rate": 8.30407940588888e-06,
"loss": 2.5088,
"step": 345
},
{
"epoch": 0.34462151394422313,
"grad_norm": 0.4129788875579834,
"learning_rate": 8.29100342160374e-06,
"loss": 2.2748,
"step": 346
},
{
"epoch": 0.3456175298804781,
"grad_norm": 0.3462570607662201,
"learning_rate": 8.27788760348173e-06,
"loss": 2.3205,
"step": 347
},
{
"epoch": 0.3466135458167331,
"grad_norm": 0.4417884349822998,
"learning_rate": 8.26473211027484e-06,
"loss": 2.3901,
"step": 348
},
{
"epoch": 0.34760956175298807,
"grad_norm": 0.45579978823661804,
"learning_rate": 8.251537101215287e-06,
"loss": 2.2336,
"step": 349
},
{
"epoch": 0.34860557768924305,
"grad_norm": 0.6957226991653442,
"learning_rate": 8.238302736013587e-06,
"loss": 2.7518,
"step": 350
},
{
"epoch": 0.34960159362549803,
"grad_norm": 0.696114718914032,
"learning_rate": 8.225029174856602e-06,
"loss": 2.0373,
"step": 351
},
{
"epoch": 0.350597609561753,
"grad_norm": 0.40747031569480896,
"learning_rate": 8.211716578405635e-06,
"loss": 2.4626,
"step": 352
},
{
"epoch": 0.351593625498008,
"grad_norm": 0.47290411591529846,
"learning_rate": 8.198365107794457e-06,
"loss": 2.5871,
"step": 353
},
{
"epoch": 0.35258964143426297,
"grad_norm": 0.592217206954956,
"learning_rate": 8.184974924627365e-06,
"loss": 2.3886,
"step": 354
},
{
"epoch": 0.35358565737051795,
"grad_norm": 0.8985310196876526,
"learning_rate": 8.171546190977231e-06,
"loss": 2.6021,
"step": 355
},
{
"epoch": 0.3545816733067729,
"grad_norm": 0.7999231815338135,
"learning_rate": 8.158079069383535e-06,
"loss": 2.2757,
"step": 356
},
{
"epoch": 0.3555776892430279,
"grad_norm": 0.6230331063270569,
"learning_rate": 8.1445737228504e-06,
"loss": 2.4343,
"step": 357
},
{
"epoch": 0.3565737051792829,
"grad_norm": 0.43640607595443726,
"learning_rate": 8.131030314844617e-06,
"loss": 1.9181,
"step": 358
},
{
"epoch": 0.35756972111553786,
"grad_norm": 0.5745819211006165,
"learning_rate": 8.117449009293668e-06,
"loss": 2.9202,
"step": 359
},
{
"epoch": 0.35856573705179284,
"grad_norm": 0.46643760800361633,
"learning_rate": 8.103829970583742e-06,
"loss": 2.5197,
"step": 360
},
{
"epoch": 0.3595617529880478,
"grad_norm": 0.5368001461029053,
"learning_rate": 8.090173363557748e-06,
"loss": 2.3562,
"step": 361
},
{
"epoch": 0.3605577689243028,
"grad_norm": 0.5117561221122742,
"learning_rate": 8.076479353513308e-06,
"loss": 2.2398,
"step": 362
},
{
"epoch": 0.3615537848605578,
"grad_norm": 1.2853957414627075,
"learning_rate": 8.06274810620077e-06,
"loss": 2.041,
"step": 363
},
{
"epoch": 0.36254980079681276,
"grad_norm": 0.5592566132545471,
"learning_rate": 8.0489797878212e-06,
"loss": 2.614,
"step": 364
},
{
"epoch": 0.36354581673306774,
"grad_norm": 0.3889990448951721,
"learning_rate": 8.035174565024362e-06,
"loss": 2.4095,
"step": 365
},
{
"epoch": 0.3645418326693227,
"grad_norm": 0.39642640948295593,
"learning_rate": 8.021332604906709e-06,
"loss": 1.9971,
"step": 366
},
{
"epoch": 0.3655378486055777,
"grad_norm": 0.5725635290145874,
"learning_rate": 8.007454075009352e-06,
"loss": 2.4322,
"step": 367
},
{
"epoch": 0.3665338645418327,
"grad_norm": 0.538329541683197,
"learning_rate": 7.993539143316044e-06,
"loss": 2.3927,
"step": 368
},
{
"epoch": 0.36752988047808766,
"grad_norm": 0.8502817153930664,
"learning_rate": 7.979587978251136e-06,
"loss": 2.6487,
"step": 369
},
{
"epoch": 0.36852589641434264,
"grad_norm": 0.5096033811569214,
"learning_rate": 7.965600748677545e-06,
"loss": 2.4393,
"step": 370
},
{
"epoch": 0.3695219123505976,
"grad_norm": 0.39716872572898865,
"learning_rate": 7.951577623894701e-06,
"loss": 2.9555,
"step": 371
},
{
"epoch": 0.3705179282868526,
"grad_norm": 0.6120476126670837,
"learning_rate": 7.937518773636518e-06,
"loss": 2.1758,
"step": 372
},
{
"epoch": 0.3715139442231076,
"grad_norm": 0.4519326984882355,
"learning_rate": 7.923424368069312e-06,
"loss": 2.4508,
"step": 373
},
{
"epoch": 0.37250996015936255,
"grad_norm": 0.4252610504627228,
"learning_rate": 7.909294577789765e-06,
"loss": 2.134,
"step": 374
},
{
"epoch": 0.37350597609561753,
"grad_norm": 0.5511481761932373,
"learning_rate": 7.895129573822844e-06,
"loss": 2.4735,
"step": 375
},
{
"epoch": 0.3745019920318725,
"grad_norm": 1.1340324878692627,
"learning_rate": 7.880929527619742e-06,
"loss": 2.7602,
"step": 376
},
{
"epoch": 0.3754980079681275,
"grad_norm": 0.45862501859664917,
"learning_rate": 7.866694611055796e-06,
"loss": 2.5242,
"step": 377
},
{
"epoch": 0.37649402390438247,
"grad_norm": 0.48843666911125183,
"learning_rate": 7.852424996428412e-06,
"loss": 2.3878,
"step": 378
},
{
"epoch": 0.37749003984063745,
"grad_norm": 1.07990562915802,
"learning_rate": 7.838120856454967e-06,
"loss": 2.2745,
"step": 379
},
{
"epoch": 0.3784860557768924,
"grad_norm": 0.466766893863678,
"learning_rate": 7.823782364270743e-06,
"loss": 2.5844,
"step": 380
},
{
"epoch": 0.3794820717131474,
"grad_norm": 0.6437628865242004,
"learning_rate": 7.809409693426803e-06,
"loss": 2.381,
"step": 381
},
{
"epoch": 0.3804780876494024,
"grad_norm": 0.6378084421157837,
"learning_rate": 7.79500301788791e-06,
"loss": 2.4076,
"step": 382
},
{
"epoch": 0.38147410358565736,
"grad_norm": 0.6559402346611023,
"learning_rate": 7.780562512030414e-06,
"loss": 2.2115,
"step": 383
},
{
"epoch": 0.38247011952191234,
"grad_norm": 0.4882892668247223,
"learning_rate": 7.766088350640141e-06,
"loss": 2.179,
"step": 384
},
{
"epoch": 0.3834661354581673,
"grad_norm": 0.37981244921684265,
"learning_rate": 7.75158070891028e-06,
"loss": 2.3806,
"step": 385
},
{
"epoch": 0.3844621513944223,
"grad_norm": 0.6670547723770142,
"learning_rate": 7.737039762439263e-06,
"loss": 2.0258,
"step": 386
},
{
"epoch": 0.3854581673306773,
"grad_norm": 0.5870895981788635,
"learning_rate": 7.722465687228634e-06,
"loss": 2.4609,
"step": 387
},
{
"epoch": 0.38645418326693226,
"grad_norm": 0.5210617184638977,
"learning_rate": 7.707858659680924e-06,
"loss": 2.0152,
"step": 388
},
{
"epoch": 0.38745019920318724,
"grad_norm": 0.8872121572494507,
"learning_rate": 7.693218856597515e-06,
"loss": 2.4107,
"step": 389
},
{
"epoch": 0.3884462151394422,
"grad_norm": 0.408750057220459,
"learning_rate": 7.6785464551765e-06,
"loss": 2.4381,
"step": 390
},
{
"epoch": 0.3894422310756972,
"grad_norm": 0.4789107143878937,
"learning_rate": 7.663841633010539e-06,
"loss": 2.3057,
"step": 391
},
{
"epoch": 0.3904382470119522,
"grad_norm": 0.37599480152130127,
"learning_rate": 7.649104568084701e-06,
"loss": 2.4101,
"step": 392
},
{
"epoch": 0.39143426294820716,
"grad_norm": 0.6352246403694153,
"learning_rate": 7.634335438774325e-06,
"loss": 2.2202,
"step": 393
},
{
"epoch": 0.39243027888446214,
"grad_norm": 0.5684521198272705,
"learning_rate": 7.619534423842852e-06,
"loss": 2.7473,
"step": 394
},
{
"epoch": 0.3934262948207171,
"grad_norm": 0.5998817682266235,
"learning_rate": 7.604701702439652e-06,
"loss": 2.109,
"step": 395
},
{
"epoch": 0.3944223107569721,
"grad_norm": 0.69579017162323,
"learning_rate": 7.589837454097879e-06,
"loss": 2.6706,
"step": 396
},
{
"epoch": 0.3954183266932271,
"grad_norm": 0.5617753267288208,
"learning_rate": 7.574941858732279e-06,
"loss": 2.4175,
"step": 397
},
{
"epoch": 0.39641434262948205,
"grad_norm": 0.509094774723053,
"learning_rate": 7.560015096637015e-06,
"loss": 2.4367,
"step": 398
},
{
"epoch": 0.39741035856573703,
"grad_norm": 0.5854381322860718,
"learning_rate": 7.54505734848349e-06,
"loss": 2.3366,
"step": 399
},
{
"epoch": 0.398406374501992,
"grad_norm": 0.7329273819923401,
"learning_rate": 7.53006879531816e-06,
"loss": 2.2252,
"step": 400
},
{
"epoch": 0.399402390438247,
"grad_norm": 0.7855085730552673,
"learning_rate": 7.515049618560337e-06,
"loss": 2.2244,
"step": 401
},
{
"epoch": 0.40039840637450197,
"grad_norm": 0.608400821685791,
"learning_rate": 7.500000000000001e-06,
"loss": 2.2232,
"step": 402
},
{
"epoch": 0.40139442231075695,
"grad_norm": 0.5910929441452026,
"learning_rate": 7.484920121795589e-06,
"loss": 2.4034,
"step": 403
},
{
"epoch": 0.40239043824701193,
"grad_norm": 0.5254145264625549,
"learning_rate": 7.469810166471802e-06,
"loss": 2.3053,
"step": 404
},
{
"epoch": 0.4033864541832669,
"grad_norm": 0.5197448134422302,
"learning_rate": 7.454670316917387e-06,
"loss": 2.2416,
"step": 405
},
{
"epoch": 0.4043824701195219,
"grad_norm": 0.759675145149231,
"learning_rate": 7.4395007563829295e-06,
"loss": 2.4197,
"step": 406
},
{
"epoch": 0.40537848605577687,
"grad_norm": 0.4646972417831421,
"learning_rate": 7.424301668478626e-06,
"loss": 2.5638,
"step": 407
},
{
"epoch": 0.4063745019920319,
"grad_norm": 0.5564824938774109,
"learning_rate": 7.4090732371720774e-06,
"loss": 2.5038,
"step": 408
},
{
"epoch": 0.4073705179282869,
"grad_norm": 0.8897591829299927,
"learning_rate": 7.393815646786047e-06,
"loss": 2.6751,
"step": 409
},
{
"epoch": 0.40836653386454186,
"grad_norm": 0.8237727284431458,
"learning_rate": 7.378529081996233e-06,
"loss": 2.2649,
"step": 410
},
{
"epoch": 0.40936254980079684,
"grad_norm": 0.4922022521495819,
"learning_rate": 7.363213727829045e-06,
"loss": 2.2894,
"step": 411
},
{
"epoch": 0.4103585657370518,
"grad_norm": 0.48476412892341614,
"learning_rate": 7.347869769659346e-06,
"loss": 2.4317,
"step": 412
},
{
"epoch": 0.4113545816733068,
"grad_norm": 0.7627730965614319,
"learning_rate": 7.332497393208221e-06,
"loss": 3.0171,
"step": 413
},
{
"epoch": 0.4123505976095618,
"grad_norm": 1.4489892721176147,
"learning_rate": 7.317096784540728e-06,
"loss": 3.0745,
"step": 414
},
{
"epoch": 0.41334661354581675,
"grad_norm": 0.44949018955230713,
"learning_rate": 7.301668130063639e-06,
"loss": 2.4086,
"step": 415
},
{
"epoch": 0.41434262948207173,
"grad_norm": 0.9026828408241272,
"learning_rate": 7.286211616523193e-06,
"loss": 2.4219,
"step": 416
},
{
"epoch": 0.4153386454183267,
"grad_norm": 0.5528742074966431,
"learning_rate": 7.2707274310028306e-06,
"loss": 2.069,
"step": 417
},
{
"epoch": 0.4163346613545817,
"grad_norm": 0.8069695830345154,
"learning_rate": 7.255215760920925e-06,
"loss": 2.2901,
"step": 418
},
{
"epoch": 0.41733067729083667,
"grad_norm": 0.9854758977890015,
"learning_rate": 7.239676794028526e-06,
"loss": 2.533,
"step": 419
},
{
"epoch": 0.41832669322709165,
"grad_norm": 0.8304996490478516,
"learning_rate": 7.224110718407075e-06,
"loss": 2.4076,
"step": 420
},
{
"epoch": 0.41932270916334663,
"grad_norm": 0.5824740529060364,
"learning_rate": 7.208517722466135e-06,
"loss": 2.2298,
"step": 421
},
{
"epoch": 0.4203187250996016,
"grad_norm": 0.740998387336731,
"learning_rate": 7.192897994941111e-06,
"loss": 2.2991,
"step": 422
},
{
"epoch": 0.4213147410358566,
"grad_norm": 0.9516714215278625,
"learning_rate": 7.177251724890957e-06,
"loss": 2.2348,
"step": 423
},
{
"epoch": 0.42231075697211157,
"grad_norm": 1.330517053604126,
"learning_rate": 7.1615791016959024e-06,
"loss": 2.5969,
"step": 424
},
{
"epoch": 0.42330677290836655,
"grad_norm": 0.8636577129364014,
"learning_rate": 7.145880315055145e-06,
"loss": 2.4203,
"step": 425
},
{
"epoch": 0.4243027888446215,
"grad_norm": 0.40839532017707825,
"learning_rate": 7.1301555549845634e-06,
"loss": 2.428,
"step": 426
},
{
"epoch": 0.4252988047808765,
"grad_norm": 0.6337350010871887,
"learning_rate": 7.114405011814415e-06,
"loss": 2.3018,
"step": 427
},
{
"epoch": 0.4262948207171315,
"grad_norm": 0.8653415441513062,
"learning_rate": 7.098628876187031e-06,
"loss": 2.8947,
"step": 428
},
{
"epoch": 0.42729083665338646,
"grad_norm": 0.7558097839355469,
"learning_rate": 7.082827339054513e-06,
"loss": 2.2061,
"step": 429
},
{
"epoch": 0.42828685258964144,
"grad_norm": 0.8000844717025757,
"learning_rate": 7.067000591676416e-06,
"loss": 2.4076,
"step": 430
},
{
"epoch": 0.4292828685258964,
"grad_norm": 0.7074631452560425,
"learning_rate": 7.051148825617435e-06,
"loss": 2.7278,
"step": 431
},
{
"epoch": 0.4302788844621514,
"grad_norm": 1.1026921272277832,
"learning_rate": 7.035272232745093e-06,
"loss": 2.5602,
"step": 432
},
{
"epoch": 0.4312749003984064,
"grad_norm": 0.8589175939559937,
"learning_rate": 7.019371005227407e-06,
"loss": 2.3789,
"step": 433
},
{
"epoch": 0.43227091633466136,
"grad_norm": 0.8680172562599182,
"learning_rate": 7.003445335530572e-06,
"loss": 2.6989,
"step": 434
},
{
"epoch": 0.43326693227091634,
"grad_norm": 0.7565051317214966,
"learning_rate": 6.987495416416627e-06,
"loss": 2.3688,
"step": 435
},
{
"epoch": 0.4342629482071713,
"grad_norm": 1.050288438796997,
"learning_rate": 6.9715214409411204e-06,
"loss": 2.1948,
"step": 436
},
{
"epoch": 0.4352589641434263,
"grad_norm": 0.5728120803833008,
"learning_rate": 6.95552360245078e-06,
"loss": 2.2716,
"step": 437
},
{
"epoch": 0.4362549800796813,
"grad_norm": 1.0542654991149902,
"learning_rate": 6.939502094581164e-06,
"loss": 2.5845,
"step": 438
},
{
"epoch": 0.43725099601593626,
"grad_norm": 0.7242105603218079,
"learning_rate": 6.923457111254322e-06,
"loss": 2.2048,
"step": 439
},
{
"epoch": 0.43824701195219123,
"grad_norm": 1.0688732862472534,
"learning_rate": 6.9073888466764495e-06,
"loss": 2.1883,
"step": 440
},
{
"epoch": 0.4392430278884462,
"grad_norm": 0.8276563286781311,
"learning_rate": 6.891297495335531e-06,
"loss": 2.2532,
"step": 441
},
{
"epoch": 0.4402390438247012,
"grad_norm": 0.7350841760635376,
"learning_rate": 6.875183251998993e-06,
"loss": 2.2517,
"step": 442
},
{
"epoch": 0.44123505976095617,
"grad_norm": 0.7074620127677917,
"learning_rate": 6.859046311711344e-06,
"loss": 2.1943,
"step": 443
},
{
"epoch": 0.44223107569721115,
"grad_norm": 0.9273977279663086,
"learning_rate": 6.84288686979181e-06,
"loss": 2.4566,
"step": 444
},
{
"epoch": 0.44322709163346613,
"grad_norm": 0.54817795753479,
"learning_rate": 6.8267051218319766e-06,
"loss": 2.1578,
"step": 445
},
{
"epoch": 0.4442231075697211,
"grad_norm": 0.8614233136177063,
"learning_rate": 6.810501263693416e-06,
"loss": 2.3546,
"step": 446
},
{
"epoch": 0.4452191235059761,
"grad_norm": 0.5919457077980042,
"learning_rate": 6.7942754915053225e-06,
"loss": 2.4907,
"step": 447
},
{
"epoch": 0.44621513944223107,
"grad_norm": 0.7229816317558289,
"learning_rate": 6.77802800166213e-06,
"loss": 2.2884,
"step": 448
},
{
"epoch": 0.44721115537848605,
"grad_norm": 1.1251389980316162,
"learning_rate": 6.761758990821143e-06,
"loss": 2.508,
"step": 449
},
{
"epoch": 0.448207171314741,
"grad_norm": 0.7033310532569885,
"learning_rate": 6.745468655900156e-06,
"loss": 2.4315,
"step": 450
},
{
"epoch": 0.449203187250996,
"grad_norm": 1.2203772068023682,
"learning_rate": 6.7291571940750575e-06,
"loss": 2.6156,
"step": 451
},
{
"epoch": 0.450199203187251,
"grad_norm": 0.7150283455848694,
"learning_rate": 6.712824802777465e-06,
"loss": 2.2121,
"step": 452
},
{
"epoch": 0.45119521912350596,
"grad_norm": 0.9422833323478699,
"learning_rate": 6.696471679692313e-06,
"loss": 2.2294,
"step": 453
},
{
"epoch": 0.45219123505976094,
"grad_norm": 0.6846040487289429,
"learning_rate": 6.680098022755478e-06,
"loss": 2.5567,
"step": 454
},
{
"epoch": 0.4531872509960159,
"grad_norm": 0.5450727343559265,
"learning_rate": 6.66370403015137e-06,
"loss": 2.4599,
"step": 455
},
{
"epoch": 0.4541832669322709,
"grad_norm": 0.5914618968963623,
"learning_rate": 6.647289900310545e-06,
"loss": 2.5134,
"step": 456
},
{
"epoch": 0.4551792828685259,
"grad_norm": 0.8993861079216003,
"learning_rate": 6.63085583190729e-06,
"loss": 2.5729,
"step": 457
},
{
"epoch": 0.45617529880478086,
"grad_norm": 0.5783509016036987,
"learning_rate": 6.614402023857231e-06,
"loss": 2.5881,
"step": 458
},
{
"epoch": 0.45717131474103584,
"grad_norm": 0.6340298652648926,
"learning_rate": 6.597928675314918e-06,
"loss": 2.4509,
"step": 459
},
{
"epoch": 0.4581673306772908,
"grad_norm": 0.7813217043876648,
"learning_rate": 6.581435985671418e-06,
"loss": 2.242,
"step": 460
},
{
"epoch": 0.4591633466135458,
"grad_norm": 0.8051680326461792,
"learning_rate": 6.564924154551895e-06,
"loss": 2.398,
"step": 461
},
{
"epoch": 0.4601593625498008,
"grad_norm": 0.6447633504867554,
"learning_rate": 6.548393381813205e-06,
"loss": 2.9214,
"step": 462
},
{
"epoch": 0.46115537848605576,
"grad_norm": 0.5684821605682373,
"learning_rate": 6.5318438675414665e-06,
"loss": 2.2545,
"step": 463
},
{
"epoch": 0.46215139442231074,
"grad_norm": 0.6067225337028503,
"learning_rate": 6.515275812049644e-06,
"loss": 2.5541,
"step": 464
},
{
"epoch": 0.4631474103585657,
"grad_norm": 0.5634474754333496,
"learning_rate": 6.498689415875121e-06,
"loss": 2.581,
"step": 465
},
{
"epoch": 0.4641434262948207,
"grad_norm": 0.4764470160007477,
"learning_rate": 6.48208487977728e-06,
"loss": 2.2492,
"step": 466
},
{
"epoch": 0.4651394422310757,
"grad_norm": 0.6636649370193481,
"learning_rate": 6.4654624047350575e-06,
"loss": 2.418,
"step": 467
},
{
"epoch": 0.46613545816733065,
"grad_norm": 0.6568376421928406,
"learning_rate": 6.448822191944526e-06,
"loss": 2.4608,
"step": 468
},
{
"epoch": 0.46713147410358563,
"grad_norm": 0.5867657661437988,
"learning_rate": 6.432164442816452e-06,
"loss": 2.7101,
"step": 469
},
{
"epoch": 0.4681274900398406,
"grad_norm": 0.5845500826835632,
"learning_rate": 6.41548935897386e-06,
"loss": 2.1822,
"step": 470
},
{
"epoch": 0.46912350597609564,
"grad_norm": 0.9894917011260986,
"learning_rate": 6.398797142249591e-06,
"loss": 2.4232,
"step": 471
},
{
"epoch": 0.4701195219123506,
"grad_norm": 0.5987226366996765,
"learning_rate": 6.3820879946838585e-06,
"loss": 2.6506,
"step": 472
},
{
"epoch": 0.4711155378486056,
"grad_norm": 1.552355408668518,
"learning_rate": 6.365362118521807e-06,
"loss": 3.0233,
"step": 473
},
{
"epoch": 0.4721115537848606,
"grad_norm": 0.6667497158050537,
"learning_rate": 6.348619716211058e-06,
"loss": 2.6748,
"step": 474
},
{
"epoch": 0.47310756972111556,
"grad_norm": 0.970600962638855,
"learning_rate": 6.33186099039927e-06,
"loss": 2.4542,
"step": 475
},
{
"epoch": 0.47410358565737054,
"grad_norm": 0.6683152914047241,
"learning_rate": 6.31508614393167e-06,
"loss": 2.4034,
"step": 476
},
{
"epoch": 0.4750996015936255,
"grad_norm": 0.9013263583183289,
"learning_rate": 6.2982953798486124e-06,
"loss": 2.4747,
"step": 477
},
{
"epoch": 0.4760956175298805,
"grad_norm": 0.8827518820762634,
"learning_rate": 6.2814889013831174e-06,
"loss": 2.5649,
"step": 478
},
{
"epoch": 0.4770916334661355,
"grad_norm": 0.8040870428085327,
"learning_rate": 6.264666911958404e-06,
"loss": 2.2855,
"step": 479
},
{
"epoch": 0.47808764940239046,
"grad_norm": 0.9028819799423218,
"learning_rate": 6.247829615185441e-06,
"loss": 2.3607,
"step": 480
},
{
"epoch": 0.47908366533864544,
"grad_norm": 0.8722829818725586,
"learning_rate": 6.230977214860468e-06,
"loss": 2.2346,
"step": 481
},
{
"epoch": 0.4800796812749004,
"grad_norm": 0.6933993697166443,
"learning_rate": 6.214109914962542e-06,
"loss": 2.7604,
"step": 482
},
{
"epoch": 0.4810756972111554,
"grad_norm": 0.6176011562347412,
"learning_rate": 6.1972279196510565e-06,
"loss": 2.8278,
"step": 483
},
{
"epoch": 0.4820717131474104,
"grad_norm": 0.7204033732414246,
"learning_rate": 6.180331433263283e-06,
"loss": 2.4275,
"step": 484
},
{
"epoch": 0.48306772908366535,
"grad_norm": 1.1777395009994507,
"learning_rate": 6.1634206603118844e-06,
"loss": 2.501,
"step": 485
},
{
"epoch": 0.48406374501992033,
"grad_norm": 1.3905079364776611,
"learning_rate": 6.146495805482451e-06,
"loss": 2.9668,
"step": 486
},
{
"epoch": 0.4850597609561753,
"grad_norm": 1.8294525146484375,
"learning_rate": 6.129557073631013e-06,
"loss": 2.0932,
"step": 487
},
{
"epoch": 0.4860557768924303,
"grad_norm": 1.0543792247772217,
"learning_rate": 6.112604669781572e-06,
"loss": 2.1609,
"step": 488
},
{
"epoch": 0.48705179282868527,
"grad_norm": 0.6805559992790222,
"learning_rate": 6.09563879912361e-06,
"loss": 2.5514,
"step": 489
},
{
"epoch": 0.48804780876494025,
"grad_norm": 0.913993239402771,
"learning_rate": 6.07865966700961e-06,
"loss": 2.1365,
"step": 490
},
{
"epoch": 0.48904382470119523,
"grad_norm": 1.127585768699646,
"learning_rate": 6.06166747895257e-06,
"loss": 2.5846,
"step": 491
},
{
"epoch": 0.4900398406374502,
"grad_norm": 0.5136232376098633,
"learning_rate": 6.044662440623512e-06,
"loss": 2.3032,
"step": 492
},
{
"epoch": 0.4910358565737052,
"grad_norm": 1.1162834167480469,
"learning_rate": 6.027644757849004e-06,
"loss": 2.4506,
"step": 493
},
{
"epoch": 0.49203187250996017,
"grad_norm": 0.8343062996864319,
"learning_rate": 6.0106146366086514e-06,
"loss": 2.5259,
"step": 494
},
{
"epoch": 0.49302788844621515,
"grad_norm": 1.2255266904830933,
"learning_rate": 5.99357228303262e-06,
"loss": 2.4964,
"step": 495
},
{
"epoch": 0.4940239043824701,
"grad_norm": 0.8280947804450989,
"learning_rate": 5.976517903399128e-06,
"loss": 2.3576,
"step": 496
},
{
"epoch": 0.4950199203187251,
"grad_norm": 0.5358011722564697,
"learning_rate": 5.959451704131962e-06,
"loss": 2.4103,
"step": 497
},
{
"epoch": 0.4960159362549801,
"grad_norm": 0.7300974130630493,
"learning_rate": 5.9423738917979655e-06,
"loss": 2.4792,
"step": 498
},
{
"epoch": 0.49701195219123506,
"grad_norm": 1.1368849277496338,
"learning_rate": 5.92528467310455e-06,
"loss": 2.7343,
"step": 499
},
{
"epoch": 0.49800796812749004,
"grad_norm": 1.1836261749267578,
"learning_rate": 5.908184254897183e-06,
"loss": 2.263,
"step": 500
},
{
"epoch": 0.499003984063745,
"grad_norm": 0.7548766136169434,
"learning_rate": 5.891072844156895e-06,
"loss": 2.5232,
"step": 501
},
{
"epoch": 0.5,
"grad_norm": 0.5676743984222412,
"learning_rate": 5.87395064799776e-06,
"loss": 2.532,
"step": 502
},
{
"epoch": 0.500996015936255,
"grad_norm": 0.6243408918380737,
"learning_rate": 5.856817873664409e-06,
"loss": 2.6337,
"step": 503
},
{
"epoch": 0.50199203187251,
"grad_norm": 0.6920816898345947,
"learning_rate": 5.839674728529499e-06,
"loss": 2.3761,
"step": 504
},
{
"epoch": 0.5029880478087649,
"grad_norm": 0.773199737071991,
"learning_rate": 5.8225214200912195e-06,
"loss": 2.3334,
"step": 505
},
{
"epoch": 0.5039840637450199,
"grad_norm": 0.8243865370750427,
"learning_rate": 5.8053581559707754e-06,
"loss": 2.2201,
"step": 506
},
{
"epoch": 0.5049800796812749,
"grad_norm": 0.6483944654464722,
"learning_rate": 5.788185143909868e-06,
"loss": 2.2019,
"step": 507
},
{
"epoch": 0.5059760956175299,
"grad_norm": 1.1562165021896362,
"learning_rate": 5.7710025917681954e-06,
"loss": 2.6738,
"step": 508
},
{
"epoch": 0.5069721115537849,
"grad_norm": 0.7714682221412659,
"learning_rate": 5.753810707520918e-06,
"loss": 2.5662,
"step": 509
},
{
"epoch": 0.5079681274900398,
"grad_norm": 1.8939898014068604,
"learning_rate": 5.736609699256158e-06,
"loss": 2.4841,
"step": 510
},
{
"epoch": 0.5089641434262948,
"grad_norm": 1.0769683122634888,
"learning_rate": 5.719399775172468e-06,
"loss": 2.2339,
"step": 511
},
{
"epoch": 0.5099601593625498,
"grad_norm": 0.7231705188751221,
"learning_rate": 5.702181143576323e-06,
"loss": 2.2754,
"step": 512
},
{
"epoch": 0.5109561752988048,
"grad_norm": 1.0154212713241577,
"learning_rate": 5.684954012879583e-06,
"loss": 2.5029,
"step": 513
},
{
"epoch": 0.5119521912350598,
"grad_norm": 1.130210041999817,
"learning_rate": 5.66771859159699e-06,
"loss": 2.5272,
"step": 514
},
{
"epoch": 0.5129482071713147,
"grad_norm": 0.6206554770469666,
"learning_rate": 5.6504750883436275e-06,
"loss": 2.3235,
"step": 515
},
{
"epoch": 0.5139442231075697,
"grad_norm": 1.0481131076812744,
"learning_rate": 5.633223711832403e-06,
"loss": 2.2866,
"step": 516
},
{
"epoch": 0.5149402390438247,
"grad_norm": 0.7321045398712158,
"learning_rate": 5.615964670871524e-06,
"loss": 2.4402,
"step": 517
},
{
"epoch": 0.5159362549800797,
"grad_norm": 0.7684382796287537,
"learning_rate": 5.5986981743619615e-06,
"loss": 2.3404,
"step": 518
},
{
"epoch": 0.5169322709163346,
"grad_norm": 1.1246601343154907,
"learning_rate": 5.581424431294936e-06,
"loss": 2.2032,
"step": 519
},
{
"epoch": 0.5179282868525896,
"grad_norm": 0.617168128490448,
"learning_rate": 5.56414365074937e-06,
"loss": 2.5379,
"step": 520
},
{
"epoch": 0.5189243027888446,
"grad_norm": 0.7718055248260498,
"learning_rate": 5.546856041889374e-06,
"loss": 2.4803,
"step": 521
},
{
"epoch": 0.5199203187250996,
"grad_norm": 1.5422130823135376,
"learning_rate": 5.5295618139617e-06,
"loss": 2.3014,
"step": 522
},
{
"epoch": 0.5209163346613546,
"grad_norm": 0.5689607262611389,
"learning_rate": 5.512261176293226e-06,
"loss": 2.1094,
"step": 523
},
{
"epoch": 0.5219123505976095,
"grad_norm": 0.6268129944801331,
"learning_rate": 5.494954338288404e-06,
"loss": 2.4562,
"step": 524
},
{
"epoch": 0.5229083665338645,
"grad_norm": 1.1070692539215088,
"learning_rate": 5.477641509426739e-06,
"loss": 2.0365,
"step": 525
},
{
"epoch": 0.5239043824701195,
"grad_norm": 1.2830649614334106,
"learning_rate": 5.460322899260245e-06,
"loss": 2.2455,
"step": 526
},
{
"epoch": 0.5249003984063745,
"grad_norm": 0.6706056594848633,
"learning_rate": 5.442998717410916e-06,
"loss": 2.3095,
"step": 527
},
{
"epoch": 0.5258964143426295,
"grad_norm": 0.6565206050872803,
"learning_rate": 5.425669173568179e-06,
"loss": 2.5873,
"step": 528
},
{
"epoch": 0.5268924302788844,
"grad_norm": 0.8396487236022949,
"learning_rate": 5.408334477486369e-06,
"loss": 2.4121,
"step": 529
},
{
"epoch": 0.5278884462151394,
"grad_norm": 0.8442867398262024,
"learning_rate": 5.390994838982178e-06,
"loss": 2.1112,
"step": 530
},
{
"epoch": 0.5288844621513944,
"grad_norm": 0.6712931990623474,
"learning_rate": 5.373650467932122e-06,
"loss": 2.5735,
"step": 531
},
{
"epoch": 0.5298804780876494,
"grad_norm": 0.8547630906105042,
"learning_rate": 5.356301574269998e-06,
"loss": 2.5899,
"step": 532
},
{
"epoch": 0.5308764940239044,
"grad_norm": 1.1971904039382935,
"learning_rate": 5.338948367984347e-06,
"loss": 2.7327,
"step": 533
},
{
"epoch": 0.5318725099601593,
"grad_norm": 0.8025546669960022,
"learning_rate": 5.321591059115906e-06,
"loss": 2.3269,
"step": 534
},
{
"epoch": 0.5328685258964143,
"grad_norm": 0.9815241098403931,
"learning_rate": 5.30422985775507e-06,
"loss": 2.4555,
"step": 535
},
{
"epoch": 0.5338645418326693,
"grad_norm": 0.9032710194587708,
"learning_rate": 5.286864974039349e-06,
"loss": 2.4246,
"step": 536
},
{
"epoch": 0.5348605577689243,
"grad_norm": 1.508058786392212,
"learning_rate": 5.269496618150823e-06,
"loss": 2.1206,
"step": 537
},
{
"epoch": 0.5358565737051793,
"grad_norm": 1.0455362796783447,
"learning_rate": 5.2521250003136005e-06,
"loss": 2.4406,
"step": 538
},
{
"epoch": 0.5368525896414342,
"grad_norm": 0.8184682726860046,
"learning_rate": 5.234750330791268e-06,
"loss": 2.1303,
"step": 539
},
{
"epoch": 0.5378486055776892,
"grad_norm": 0.8355326056480408,
"learning_rate": 5.217372819884353e-06,
"loss": 2.3468,
"step": 540
},
{
"epoch": 0.5388446215139442,
"grad_norm": 1.0594871044158936,
"learning_rate": 5.199992677927775e-06,
"loss": 2.2558,
"step": 541
},
{
"epoch": 0.5398406374501992,
"grad_norm": 1.3626909255981445,
"learning_rate": 5.182610115288296e-06,
"loss": 2.4741,
"step": 542
},
{
"epoch": 0.5408366533864541,
"grad_norm": 0.6529922485351562,
"learning_rate": 5.165225342361978e-06,
"loss": 2.3131,
"step": 543
},
{
"epoch": 0.5418326693227091,
"grad_norm": 0.8616756796836853,
"learning_rate": 5.147838569571642e-06,
"loss": 2.2786,
"step": 544
},
{
"epoch": 0.5428286852589641,
"grad_norm": 1.9104212522506714,
"learning_rate": 5.1304500073643045e-06,
"loss": 2.2784,
"step": 545
},
{
"epoch": 0.5438247011952191,
"grad_norm": 0.7414434552192688,
"learning_rate": 5.1130598662086514e-06,
"loss": 2.1551,
"step": 546
},
{
"epoch": 0.5448207171314741,
"grad_norm": 0.889681339263916,
"learning_rate": 5.095668356592474e-06,
"loss": 1.9545,
"step": 547
},
{
"epoch": 0.545816733067729,
"grad_norm": 0.8562681674957275,
"learning_rate": 5.078275689020129e-06,
"loss": 2.1665,
"step": 548
},
{
"epoch": 0.546812749003984,
"grad_norm": 0.810234010219574,
"learning_rate": 5.060882074009988e-06,
"loss": 2.5324,
"step": 549
},
{
"epoch": 0.547808764940239,
"grad_norm": 1.867493748664856,
"learning_rate": 5.043487722091891e-06,
"loss": 2.6972,
"step": 550
},
{
"epoch": 0.548804780876494,
"grad_norm": 0.8987257480621338,
"learning_rate": 5.026092843804599e-06,
"loss": 2.3632,
"step": 551
},
{
"epoch": 0.549800796812749,
"grad_norm": 0.9021519422531128,
"learning_rate": 5.0086976496932374e-06,
"loss": 2.2825,
"step": 552
},
{
"epoch": 0.5507968127490039,
"grad_norm": 1.0589499473571777,
"learning_rate": 4.991302350306764e-06,
"loss": 2.6046,
"step": 553
},
{
"epoch": 0.5517928286852589,
"grad_norm": 0.8575243949890137,
"learning_rate": 4.973907156195405e-06,
"loss": 2.6962,
"step": 554
},
{
"epoch": 0.5527888446215139,
"grad_norm": 1.7027395963668823,
"learning_rate": 4.956512277908109e-06,
"loss": 2.4405,
"step": 555
},
{
"epoch": 0.5537848605577689,
"grad_norm": 0.8842843770980835,
"learning_rate": 4.939117925990013e-06,
"loss": 2.4694,
"step": 556
},
{
"epoch": 0.5547808764940239,
"grad_norm": 0.9840981364250183,
"learning_rate": 4.921724310979872e-06,
"loss": 2.5034,
"step": 557
},
{
"epoch": 0.5557768924302788,
"grad_norm": 0.8198688626289368,
"learning_rate": 4.904331643407527e-06,
"loss": 2.5664,
"step": 558
},
{
"epoch": 0.5567729083665338,
"grad_norm": 1.1393426656723022,
"learning_rate": 4.886940133791349e-06,
"loss": 2.4415,
"step": 559
},
{
"epoch": 0.5577689243027888,
"grad_norm": 0.835932731628418,
"learning_rate": 4.869549992635697e-06,
"loss": 2.3719,
"step": 560
},
{
"epoch": 0.5587649402390438,
"grad_norm": 0.952167272567749,
"learning_rate": 4.8521614304283615e-06,
"loss": 2.3092,
"step": 561
},
{
"epoch": 0.5597609561752988,
"grad_norm": 0.8146136999130249,
"learning_rate": 4.834774657638023e-06,
"loss": 2.5503,
"step": 562
},
{
"epoch": 0.5607569721115537,
"grad_norm": 0.7990023493766785,
"learning_rate": 4.817389884711706e-06,
"loss": 2.3319,
"step": 563
},
{
"epoch": 0.5617529880478087,
"grad_norm": 0.5947994589805603,
"learning_rate": 4.800007322072226e-06,
"loss": 2.1591,
"step": 564
},
{
"epoch": 0.5627490039840638,
"grad_norm": 0.8183361291885376,
"learning_rate": 4.7826271801156485e-06,
"loss": 2.431,
"step": 565
},
{
"epoch": 0.5637450199203188,
"grad_norm": 0.896101713180542,
"learning_rate": 4.765249669208733e-06,
"loss": 2.3313,
"step": 566
},
{
"epoch": 0.5647410358565738,
"grad_norm": 1.7690149545669556,
"learning_rate": 4.747874999686401e-06,
"loss": 2.1365,
"step": 567
},
{
"epoch": 0.5657370517928287,
"grad_norm": 0.8712881803512573,
"learning_rate": 4.730503381849179e-06,
"loss": 2.4139,
"step": 568
},
{
"epoch": 0.5667330677290837,
"grad_norm": 1.0946391820907593,
"learning_rate": 4.713135025960652e-06,
"loss": 2.1844,
"step": 569
},
{
"epoch": 0.5677290836653387,
"grad_norm": 0.9877662658691406,
"learning_rate": 4.695770142244931e-06,
"loss": 2.4189,
"step": 570
},
{
"epoch": 0.5687250996015937,
"grad_norm": 3.4181249141693115,
"learning_rate": 4.6784089408840955e-06,
"loss": 2.7926,
"step": 571
},
{
"epoch": 0.5697211155378487,
"grad_norm": 0.7407424449920654,
"learning_rate": 4.661051632015655e-06,
"loss": 2.4036,
"step": 572
},
{
"epoch": 0.5707171314741036,
"grad_norm": 0.8603093028068542,
"learning_rate": 4.643698425730004e-06,
"loss": 2.0895,
"step": 573
},
{
"epoch": 0.5717131474103586,
"grad_norm": 1.5615819692611694,
"learning_rate": 4.626349532067879e-06,
"loss": 2.6668,
"step": 574
},
{
"epoch": 0.5727091633466136,
"grad_norm": 0.9211438298225403,
"learning_rate": 4.609005161017824e-06,
"loss": 2.7534,
"step": 575
},
{
"epoch": 0.5737051792828686,
"grad_norm": 0.6863355040550232,
"learning_rate": 4.591665522513633e-06,
"loss": 2.4823,
"step": 576
},
{
"epoch": 0.5747011952191236,
"grad_norm": 1.1497968435287476,
"learning_rate": 4.574330826431822e-06,
"loss": 2.7213,
"step": 577
},
{
"epoch": 0.5756972111553785,
"grad_norm": 0.6396723389625549,
"learning_rate": 4.557001282589086e-06,
"loss": 2.342,
"step": 578
},
{
"epoch": 0.5766932270916335,
"grad_norm": 0.7893930673599243,
"learning_rate": 4.5396771007397565e-06,
"loss": 2.426,
"step": 579
},
{
"epoch": 0.5776892430278885,
"grad_norm": 1.0288350582122803,
"learning_rate": 4.5223584905732635e-06,
"loss": 2.3415,
"step": 580
},
{
"epoch": 0.5786852589641435,
"grad_norm": 1.215003252029419,
"learning_rate": 4.505045661711596e-06,
"loss": 2.2311,
"step": 581
},
{
"epoch": 0.5796812749003984,
"grad_norm": 1.2418211698532104,
"learning_rate": 4.487738823706775e-06,
"loss": 2.6043,
"step": 582
},
{
"epoch": 0.5806772908366534,
"grad_norm": 0.658145546913147,
"learning_rate": 4.470438186038301e-06,
"loss": 2.5859,
"step": 583
},
{
"epoch": 0.5816733067729084,
"grad_norm": 0.9392750859260559,
"learning_rate": 4.4531439581106295e-06,
"loss": 2.4185,
"step": 584
},
{
"epoch": 0.5826693227091634,
"grad_norm": 0.726355254650116,
"learning_rate": 4.43585634925063e-06,
"loss": 2.656,
"step": 585
},
{
"epoch": 0.5836653386454184,
"grad_norm": 0.8283027410507202,
"learning_rate": 4.418575568705066e-06,
"loss": 2.6963,
"step": 586
},
{
"epoch": 0.5846613545816733,
"grad_norm": 2.0478458404541016,
"learning_rate": 4.401301825638039e-06,
"loss": 2.8958,
"step": 587
},
{
"epoch": 0.5856573705179283,
"grad_norm": 0.9227387309074402,
"learning_rate": 4.3840353291284776e-06,
"loss": 2.5498,
"step": 588
},
{
"epoch": 0.5866533864541833,
"grad_norm": 1.2917126417160034,
"learning_rate": 4.366776288167598e-06,
"loss": 2.4996,
"step": 589
},
{
"epoch": 0.5876494023904383,
"grad_norm": 0.9034551978111267,
"learning_rate": 4.349524911656373e-06,
"loss": 2.4281,
"step": 590
},
{
"epoch": 0.5886454183266933,
"grad_norm": 1.010878324508667,
"learning_rate": 4.332281408403011e-06,
"loss": 2.2912,
"step": 591
},
{
"epoch": 0.5896414342629482,
"grad_norm": 1.7873375415802002,
"learning_rate": 4.315045987120417e-06,
"loss": 2.5171,
"step": 592
},
{
"epoch": 0.5906374501992032,
"grad_norm": 0.8005262613296509,
"learning_rate": 4.297818856423679e-06,
"loss": 2.33,
"step": 593
},
{
"epoch": 0.5916334661354582,
"grad_norm": 0.6728765368461609,
"learning_rate": 4.280600224827533e-06,
"loss": 2.1524,
"step": 594
},
{
"epoch": 0.5926294820717132,
"grad_norm": 0.8610662221908569,
"learning_rate": 4.2633903007438445e-06,
"loss": 2.7524,
"step": 595
},
{
"epoch": 0.5936254980079682,
"grad_norm": 1.0232973098754883,
"learning_rate": 4.2461892924790825e-06,
"loss": 2.6512,
"step": 596
},
{
"epoch": 0.5946215139442231,
"grad_norm": 1.0610368251800537,
"learning_rate": 4.228997408231806e-06,
"loss": 2.4996,
"step": 597
},
{
"epoch": 0.5956175298804781,
"grad_norm": 1.2796133756637573,
"learning_rate": 4.2118148560901325e-06,
"loss": 2.2488,
"step": 598
},
{
"epoch": 0.5966135458167331,
"grad_norm": 1.5423349142074585,
"learning_rate": 4.194641844029227e-06,
"loss": 2.6293,
"step": 599
},
{
"epoch": 0.5976095617529881,
"grad_norm": 1.5228114128112793,
"learning_rate": 4.1774785799087805e-06,
"loss": 2.3751,
"step": 600
},
{
"epoch": 0.598605577689243,
"grad_norm": 0.9803175330162048,
"learning_rate": 4.160325271470502e-06,
"loss": 2.5003,
"step": 601
},
{
"epoch": 0.599601593625498,
"grad_norm": 1.0139139890670776,
"learning_rate": 4.143182126335594e-06,
"loss": 2.5435,
"step": 602
},
{
"epoch": 0.600597609561753,
"grad_norm": 0.8577011227607727,
"learning_rate": 4.12604935200224e-06,
"loss": 2.6227,
"step": 603
},
{
"epoch": 0.601593625498008,
"grad_norm": 1.015549659729004,
"learning_rate": 4.108927155843108e-06,
"loss": 2.3803,
"step": 604
},
{
"epoch": 0.602589641434263,
"grad_norm": 1.5675932168960571,
"learning_rate": 4.091815745102818e-06,
"loss": 2.364,
"step": 605
},
{
"epoch": 0.603585657370518,
"grad_norm": 0.7266266345977783,
"learning_rate": 4.074715326895453e-06,
"loss": 2.6198,
"step": 606
},
{
"epoch": 0.6045816733067729,
"grad_norm": 0.617874026298523,
"learning_rate": 4.0576261082020345e-06,
"loss": 2.6864,
"step": 607
},
{
"epoch": 0.6055776892430279,
"grad_norm": 1.3024420738220215,
"learning_rate": 4.040548295868039e-06,
"loss": 2.4742,
"step": 608
},
{
"epoch": 0.6065737051792829,
"grad_norm": 0.7109612822532654,
"learning_rate": 4.023482096600873e-06,
"loss": 2.6456,
"step": 609
},
{
"epoch": 0.6075697211155379,
"grad_norm": 1.2119102478027344,
"learning_rate": 4.006427716967382e-06,
"loss": 2.3745,
"step": 610
},
{
"epoch": 0.6085657370517928,
"grad_norm": 1.3014880418777466,
"learning_rate": 3.9893853633913485e-06,
"loss": 2.6845,
"step": 611
},
{
"epoch": 0.6095617529880478,
"grad_norm": 1.3628534078598022,
"learning_rate": 3.9723552421509975e-06,
"loss": 2.6973,
"step": 612
},
{
"epoch": 0.6105577689243028,
"grad_norm": 0.9777284860610962,
"learning_rate": 3.955337559376489e-06,
"loss": 2.4989,
"step": 613
},
{
"epoch": 0.6115537848605578,
"grad_norm": 0.6070024371147156,
"learning_rate": 3.938332521047434e-06,
"loss": 2.0082,
"step": 614
},
{
"epoch": 0.6125498007968128,
"grad_norm": 0.6223677396774292,
"learning_rate": 3.921340332990392e-06,
"loss": 2.2016,
"step": 615
},
{
"epoch": 0.6135458167330677,
"grad_norm": 1.2076197862625122,
"learning_rate": 3.904361200876391e-06,
"loss": 2.7328,
"step": 616
},
{
"epoch": 0.6145418326693227,
"grad_norm": 0.7502063512802124,
"learning_rate": 3.887395330218429e-06,
"loss": 2.1634,
"step": 617
},
{
"epoch": 0.6155378486055777,
"grad_norm": 1.090084195137024,
"learning_rate": 3.8704429263689865e-06,
"loss": 2.2409,
"step": 618
},
{
"epoch": 0.6165338645418327,
"grad_norm": 1.7830555438995361,
"learning_rate": 3.853504194517551e-06,
"loss": 2.5541,
"step": 619
},
{
"epoch": 0.6175298804780877,
"grad_norm": 1.0715655088424683,
"learning_rate": 3.836579339688116e-06,
"loss": 2.7304,
"step": 620
},
{
"epoch": 0.6185258964143426,
"grad_norm": 0.7255896925926208,
"learning_rate": 3.819668566736719e-06,
"loss": 2.5671,
"step": 621
},
{
"epoch": 0.6195219123505976,
"grad_norm": 1.475665807723999,
"learning_rate": 3.802772080348943e-06,
"loss": 2.2374,
"step": 622
},
{
"epoch": 0.6205179282868526,
"grad_norm": 1.1244341135025024,
"learning_rate": 3.7858900850374596e-06,
"loss": 2.2705,
"step": 623
},
{
"epoch": 0.6215139442231076,
"grad_norm": 1.270950436592102,
"learning_rate": 3.769022785139534e-06,
"loss": 2.427,
"step": 624
},
{
"epoch": 0.6225099601593626,
"grad_norm": 0.9996942281723022,
"learning_rate": 3.752170384814562e-06,
"loss": 2.3181,
"step": 625
},
{
"epoch": 0.6235059760956175,
"grad_norm": 0.9702761173248291,
"learning_rate": 3.7353330880415963e-06,
"loss": 2.4871,
"step": 626
},
{
"epoch": 0.6245019920318725,
"grad_norm": 0.7174897193908691,
"learning_rate": 3.7185110986168842e-06,
"loss": 2.6481,
"step": 627
},
{
"epoch": 0.6254980079681275,
"grad_norm": 1.0198302268981934,
"learning_rate": 3.701704620151389e-06,
"loss": 2.4368,
"step": 628
},
{
"epoch": 0.6264940239043825,
"grad_norm": 0.6317278742790222,
"learning_rate": 3.6849138560683305e-06,
"loss": 2.2506,
"step": 629
},
{
"epoch": 0.6274900398406374,
"grad_norm": 1.6083205938339233,
"learning_rate": 3.6681390096007315e-06,
"loss": 2.441,
"step": 630
},
{
"epoch": 0.6284860557768924,
"grad_norm": 1.1788543462753296,
"learning_rate": 3.651380283788942e-06,
"loss": 2.0867,
"step": 631
},
{
"epoch": 0.6294820717131474,
"grad_norm": 1.6041985750198364,
"learning_rate": 3.634637881478196e-06,
"loss": 2.7786,
"step": 632
},
{
"epoch": 0.6304780876494024,
"grad_norm": 0.7498704195022583,
"learning_rate": 3.617912005316142e-06,
"loss": 2.5885,
"step": 633
},
{
"epoch": 0.6314741035856574,
"grad_norm": 1.2260042428970337,
"learning_rate": 3.6012028577504106e-06,
"loss": 2.5491,
"step": 634
},
{
"epoch": 0.6324701195219123,
"grad_norm": 0.766639232635498,
"learning_rate": 3.5845106410261417e-06,
"loss": 2.6436,
"step": 635
},
{
"epoch": 0.6334661354581673,
"grad_norm": 0.8522284626960754,
"learning_rate": 3.56783555718355e-06,
"loss": 2.361,
"step": 636
},
{
"epoch": 0.6344621513944223,
"grad_norm": 1.09912109375,
"learning_rate": 3.551177808055476e-06,
"loss": 2.5303,
"step": 637
},
{
"epoch": 0.6354581673306773,
"grad_norm": 1.4560422897338867,
"learning_rate": 3.534537595264944e-06,
"loss": 2.6122,
"step": 638
},
{
"epoch": 0.6364541832669323,
"grad_norm": 0.858035147190094,
"learning_rate": 3.5179151202227214e-06,
"loss": 2.3591,
"step": 639
},
{
"epoch": 0.6374501992031872,
"grad_norm": 4.184999942779541,
"learning_rate": 3.5013105841248794e-06,
"loss": 2.3339,
"step": 640
},
{
"epoch": 0.6384462151394422,
"grad_norm": 1.2636277675628662,
"learning_rate": 3.4847241879503574e-06,
"loss": 2.6084,
"step": 641
},
{
"epoch": 0.6394422310756972,
"grad_norm": 1.3735069036483765,
"learning_rate": 3.4681561324585356e-06,
"loss": 2.4582,
"step": 642
},
{
"epoch": 0.6404382470119522,
"grad_norm": 1.3198506832122803,
"learning_rate": 3.451606618186796e-06,
"loss": 2.2207,
"step": 643
},
{
"epoch": 0.6414342629482072,
"grad_norm": 0.895077109336853,
"learning_rate": 3.435075845448105e-06,
"loss": 2.141,
"step": 644
},
{
"epoch": 0.6424302788844621,
"grad_norm": 1.5022435188293457,
"learning_rate": 3.418564014328583e-06,
"loss": 2.5608,
"step": 645
},
{
"epoch": 0.6434262948207171,
"grad_norm": 0.9838452935218811,
"learning_rate": 3.402071324685082e-06,
"loss": 2.372,
"step": 646
},
{
"epoch": 0.6444223107569721,
"grad_norm": 0.6322600841522217,
"learning_rate": 3.3855979761427705e-06,
"loss": 2.5499,
"step": 647
},
{
"epoch": 0.6454183266932271,
"grad_norm": 1.3608890771865845,
"learning_rate": 3.3691441680927105e-06,
"loss": 2.4689,
"step": 648
},
{
"epoch": 0.646414342629482,
"grad_norm": 0.9520907998085022,
"learning_rate": 3.352710099689457e-06,
"loss": 2.5012,
"step": 649
},
{
"epoch": 0.647410358565737,
"grad_norm": 3.0419979095458984,
"learning_rate": 3.3362959698486307e-06,
"loss": 2.2773,
"step": 650
},
{
"epoch": 0.648406374501992,
"grad_norm": 1.0915313959121704,
"learning_rate": 3.3199019772445253e-06,
"loss": 2.3744,
"step": 651
},
{
"epoch": 0.649402390438247,
"grad_norm": 1.543050765991211,
"learning_rate": 3.3035283203076877e-06,
"loss": 2.3499,
"step": 652
},
{
"epoch": 0.650398406374502,
"grad_norm": 1.0574357509613037,
"learning_rate": 3.287175197222537e-06,
"loss": 2.516,
"step": 653
},
{
"epoch": 0.651394422310757,
"grad_norm": 1.3127410411834717,
"learning_rate": 3.2708428059249437e-06,
"loss": 2.4012,
"step": 654
},
{
"epoch": 0.6523904382470119,
"grad_norm": 0.9456487894058228,
"learning_rate": 3.254531344099847e-06,
"loss": 2.773,
"step": 655
},
{
"epoch": 0.6533864541832669,
"grad_norm": 1.1509116888046265,
"learning_rate": 3.2382410091788567e-06,
"loss": 2.7622,
"step": 656
},
{
"epoch": 0.6543824701195219,
"grad_norm": 1.0328110456466675,
"learning_rate": 3.221971998337872e-06,
"loss": 2.5343,
"step": 657
},
{
"epoch": 0.6553784860557769,
"grad_norm": 1.723029613494873,
"learning_rate": 3.2057245084946796e-06,
"loss": 2.8968,
"step": 658
},
{
"epoch": 0.6563745019920318,
"grad_norm": 1.13263738155365,
"learning_rate": 3.189498736306584e-06,
"loss": 2.4689,
"step": 659
},
{
"epoch": 0.6573705179282868,
"grad_norm": 1.4411126375198364,
"learning_rate": 3.173294878168025e-06,
"loss": 2.0715,
"step": 660
},
{
"epoch": 0.6583665338645418,
"grad_norm": 2.4463119506835938,
"learning_rate": 3.1571131302081916e-06,
"loss": 2.5004,
"step": 661
},
{
"epoch": 0.6593625498007968,
"grad_norm": 1.063270926475525,
"learning_rate": 3.140953688288658e-06,
"loss": 2.4079,
"step": 662
},
{
"epoch": 0.6603585657370518,
"grad_norm": 1.860757827758789,
"learning_rate": 3.1248167480010083e-06,
"loss": 2.6755,
"step": 663
},
{
"epoch": 0.6613545816733067,
"grad_norm": 1.5435043573379517,
"learning_rate": 3.1087025046644704e-06,
"loss": 2.7499,
"step": 664
},
{
"epoch": 0.6623505976095617,
"grad_norm": 0.7970728278160095,
"learning_rate": 3.0926111533235526e-06,
"loss": 2.1911,
"step": 665
},
{
"epoch": 0.6633466135458167,
"grad_norm": 1.1135482788085938,
"learning_rate": 3.0765428887456794e-06,
"loss": 2.6387,
"step": 666
},
{
"epoch": 0.6643426294820717,
"grad_norm": 1.2876728773117065,
"learning_rate": 3.0604979054188367e-06,
"loss": 2.6715,
"step": 667
},
{
"epoch": 0.6653386454183267,
"grad_norm": 0.6579734683036804,
"learning_rate": 3.044476397549221e-06,
"loss": 2.1833,
"step": 668
},
{
"epoch": 0.6663346613545816,
"grad_norm": 1.7546638250350952,
"learning_rate": 3.0284785590588804e-06,
"loss": 2.5761,
"step": 669
},
{
"epoch": 0.6673306772908366,
"grad_norm": 1.1617887020111084,
"learning_rate": 3.012504583583374e-06,
"loss": 2.4205,
"step": 670
},
{
"epoch": 0.6683266932270916,
"grad_norm": 1.4457294940948486,
"learning_rate": 2.9965546644694287e-06,
"loss": 2.178,
"step": 671
},
{
"epoch": 0.6693227091633466,
"grad_norm": 0.9334515333175659,
"learning_rate": 2.9806289947725947e-06,
"loss": 2.5343,
"step": 672
},
{
"epoch": 0.6703187250996016,
"grad_norm": 1.115212082862854,
"learning_rate": 2.9647277672549093e-06,
"loss": 2.1731,
"step": 673
},
{
"epoch": 0.6713147410358565,
"grad_norm": 1.1038217544555664,
"learning_rate": 2.948851174382565e-06,
"loss": 2.3589,
"step": 674
},
{
"epoch": 0.6723107569721115,
"grad_norm": 1.4897500276565552,
"learning_rate": 2.9329994083235857e-06,
"loss": 2.4302,
"step": 675
},
{
"epoch": 0.6733067729083665,
"grad_norm": 1.7196754217147827,
"learning_rate": 2.9171726609454875e-06,
"loss": 2.5387,
"step": 676
},
{
"epoch": 0.6743027888446215,
"grad_norm": 1.271872878074646,
"learning_rate": 2.9013711238129693e-06,
"loss": 2.1938,
"step": 677
},
{
"epoch": 0.6752988047808764,
"grad_norm": 1.0383085012435913,
"learning_rate": 2.885594988185587e-06,
"loss": 2.5842,
"step": 678
},
{
"epoch": 0.6762948207171314,
"grad_norm": 1.9233471155166626,
"learning_rate": 2.8698444450154395e-06,
"loss": 2.4421,
"step": 679
},
{
"epoch": 0.6772908366533864,
"grad_norm": 0.9812890291213989,
"learning_rate": 2.8541196849448582e-06,
"loss": 2.1876,
"step": 680
},
{
"epoch": 0.6782868525896414,
"grad_norm": 1.217011570930481,
"learning_rate": 2.8384208983040997e-06,
"loss": 2.5115,
"step": 681
},
{
"epoch": 0.6792828685258964,
"grad_norm": 1.4093648195266724,
"learning_rate": 2.8227482751090445e-06,
"loss": 2.5296,
"step": 682
},
{
"epoch": 0.6802788844621513,
"grad_norm": 1.0479772090911865,
"learning_rate": 2.8071020050588927e-06,
"loss": 2.3801,
"step": 683
},
{
"epoch": 0.6812749003984063,
"grad_norm": 0.761779248714447,
"learning_rate": 2.7914822775338678e-06,
"loss": 2.397,
"step": 684
},
{
"epoch": 0.6822709163346613,
"grad_norm": 0.7536188364028931,
"learning_rate": 2.775889281592927e-06,
"loss": 2.2802,
"step": 685
},
{
"epoch": 0.6832669322709163,
"grad_norm": 1.1621276140213013,
"learning_rate": 2.760323205971476e-06,
"loss": 2.3802,
"step": 686
},
{
"epoch": 0.6842629482071713,
"grad_norm": 1.2401965856552124,
"learning_rate": 2.744784239079077e-06,
"loss": 1.9567,
"step": 687
},
{
"epoch": 0.6852589641434262,
"grad_norm": 0.9456545114517212,
"learning_rate": 2.7292725689971732e-06,
"loss": 2.547,
"step": 688
},
{
"epoch": 0.6862549800796812,
"grad_norm": 1.3253943920135498,
"learning_rate": 2.7137883834768076e-06,
"loss": 2.2105,
"step": 689
},
{
"epoch": 0.6872509960159362,
"grad_norm": 1.525397777557373,
"learning_rate": 2.6983318699363627e-06,
"loss": 2.3682,
"step": 690
},
{
"epoch": 0.6882470119521913,
"grad_norm": 0.9517590403556824,
"learning_rate": 2.6829032154592745e-06,
"loss": 2.2159,
"step": 691
},
{
"epoch": 0.6892430278884463,
"grad_norm": 0.8040021061897278,
"learning_rate": 2.6675026067917808e-06,
"loss": 2.3967,
"step": 692
},
{
"epoch": 0.6902390438247012,
"grad_norm": 1.6833242177963257,
"learning_rate": 2.652130230340655e-06,
"loss": 2.9864,
"step": 693
},
{
"epoch": 0.6912350597609562,
"grad_norm": 1.104771614074707,
"learning_rate": 2.636786272170956e-06,
"loss": 2.5124,
"step": 694
},
{
"epoch": 0.6922310756972112,
"grad_norm": 1.1763907670974731,
"learning_rate": 2.621470918003768e-06,
"loss": 2.6426,
"step": 695
},
{
"epoch": 0.6932270916334662,
"grad_norm": 2.304222345352173,
"learning_rate": 2.6061843532139563e-06,
"loss": 2.4522,
"step": 696
},
{
"epoch": 0.6942231075697212,
"grad_norm": 0.8599796891212463,
"learning_rate": 2.5909267628279234e-06,
"loss": 2.6796,
"step": 697
},
{
"epoch": 0.6952191235059761,
"grad_norm": 1.0061733722686768,
"learning_rate": 2.5756983315213748e-06,
"loss": 2.5076,
"step": 698
},
{
"epoch": 0.6962151394422311,
"grad_norm": 1.392606258392334,
"learning_rate": 2.560499243617074e-06,
"loss": 2.5134,
"step": 699
},
{
"epoch": 0.6972111553784861,
"grad_norm": 1.2116351127624512,
"learning_rate": 2.5453296830826135e-06,
"loss": 2.0634,
"step": 700
},
{
"epoch": 0.6982071713147411,
"grad_norm": 0.7071558237075806,
"learning_rate": 2.5301898335281994e-06,
"loss": 2.1104,
"step": 701
},
{
"epoch": 0.6992031872509961,
"grad_norm": 1.8307946920394897,
"learning_rate": 2.5150798782044123e-06,
"loss": 2.8147,
"step": 702
},
{
"epoch": 0.700199203187251,
"grad_norm": 0.9716182351112366,
"learning_rate": 2.5000000000000015e-06,
"loss": 2.4836,
"step": 703
},
{
"epoch": 0.701195219123506,
"grad_norm": 0.7655389308929443,
"learning_rate": 2.4849503814396624e-06,
"loss": 2.2803,
"step": 704
},
{
"epoch": 0.702191235059761,
"grad_norm": 1.1354485750198364,
"learning_rate": 2.469931204681841e-06,
"loss": 2.5936,
"step": 705
},
{
"epoch": 0.703187250996016,
"grad_norm": 0.9272159337997437,
"learning_rate": 2.4549426515165116e-06,
"loss": 2.6629,
"step": 706
},
{
"epoch": 0.704183266932271,
"grad_norm": 1.20318603515625,
"learning_rate": 2.439984903362988e-06,
"loss": 2.341,
"step": 707
},
{
"epoch": 0.7051792828685259,
"grad_norm": 1.0813405513763428,
"learning_rate": 2.425058141267722e-06,
"loss": 2.5484,
"step": 708
},
{
"epoch": 0.7061752988047809,
"grad_norm": 0.6365978121757507,
"learning_rate": 2.4101625459021212e-06,
"loss": 2.2276,
"step": 709
},
{
"epoch": 0.7071713147410359,
"grad_norm": 1.4600951671600342,
"learning_rate": 2.3952982975603494e-06,
"loss": 2.7489,
"step": 710
},
{
"epoch": 0.7081673306772909,
"grad_norm": 1.0905722379684448,
"learning_rate": 2.3804655761571517e-06,
"loss": 2.7045,
"step": 711
},
{
"epoch": 0.7091633466135459,
"grad_norm": 1.2118492126464844,
"learning_rate": 2.3656645612256747e-06,
"loss": 2.2625,
"step": 712
},
{
"epoch": 0.7101593625498008,
"grad_norm": 2.2730562686920166,
"learning_rate": 2.3508954319153e-06,
"loss": 2.9233,
"step": 713
},
{
"epoch": 0.7111553784860558,
"grad_norm": 1.1532260179519653,
"learning_rate": 2.3361583669894634e-06,
"loss": 2.6882,
"step": 714
},
{
"epoch": 0.7121513944223108,
"grad_norm": 1.74001944065094,
"learning_rate": 2.321453544823499e-06,
"loss": 2.296,
"step": 715
},
{
"epoch": 0.7131474103585658,
"grad_norm": 1.6281747817993164,
"learning_rate": 2.306781143402485e-06,
"loss": 2.5453,
"step": 716
},
{
"epoch": 0.7141434262948207,
"grad_norm": 1.2188794612884521,
"learning_rate": 2.2921413403190774e-06,
"loss": 2.3351,
"step": 717
},
{
"epoch": 0.7151394422310757,
"grad_norm": 1.180245280265808,
"learning_rate": 2.2775343127713685e-06,
"loss": 2.8909,
"step": 718
},
{
"epoch": 0.7161354581673307,
"grad_norm": 1.207853078842163,
"learning_rate": 2.2629602375607373e-06,
"loss": 2.2249,
"step": 719
},
{
"epoch": 0.7171314741035857,
"grad_norm": 1.1911535263061523,
"learning_rate": 2.24841929108972e-06,
"loss": 1.9728,
"step": 720
},
{
"epoch": 0.7181274900398407,
"grad_norm": 2.913970470428467,
"learning_rate": 2.23391164935986e-06,
"loss": 3.3,
"step": 721
},
{
"epoch": 0.7191235059760956,
"grad_norm": 1.0136604309082031,
"learning_rate": 2.219437487969588e-06,
"loss": 2.3078,
"step": 722
},
{
"epoch": 0.7201195219123506,
"grad_norm": 1.5370888710021973,
"learning_rate": 2.20499698211209e-06,
"loss": 2.4226,
"step": 723
},
{
"epoch": 0.7211155378486056,
"grad_norm": 0.8609825968742371,
"learning_rate": 2.190590306573198e-06,
"loss": 2.5837,
"step": 724
},
{
"epoch": 0.7221115537848606,
"grad_norm": 0.7969903945922852,
"learning_rate": 2.1762176357292582e-06,
"loss": 2.5065,
"step": 725
},
{
"epoch": 0.7231075697211156,
"grad_norm": 0.6895061135292053,
"learning_rate": 2.1618791435450334e-06,
"loss": 2.4443,
"step": 726
},
{
"epoch": 0.7241035856573705,
"grad_norm": 1.005803108215332,
"learning_rate": 2.1475750035715914e-06,
"loss": 2.8449,
"step": 727
},
{
"epoch": 0.7250996015936255,
"grad_norm": 1.464055061340332,
"learning_rate": 2.1333053889442033e-06,
"loss": 2.436,
"step": 728
},
{
"epoch": 0.7260956175298805,
"grad_norm": 0.7166134715080261,
"learning_rate": 2.1190704723802587e-06,
"loss": 2.6141,
"step": 729
},
{
"epoch": 0.7270916334661355,
"grad_norm": 1.2269198894500732,
"learning_rate": 2.104870426177157e-06,
"loss": 2.0039,
"step": 730
},
{
"epoch": 0.7280876494023905,
"grad_norm": 1.233473539352417,
"learning_rate": 2.0907054222102367e-06,
"loss": 2.4503,
"step": 731
},
{
"epoch": 0.7290836653386454,
"grad_norm": 0.8751947283744812,
"learning_rate": 2.0765756319306897e-06,
"loss": 2.482,
"step": 732
},
{
"epoch": 0.7300796812749004,
"grad_norm": 1.954285979270935,
"learning_rate": 2.0624812263634847e-06,
"loss": 2.6237,
"step": 733
},
{
"epoch": 0.7310756972111554,
"grad_norm": 0.7244362235069275,
"learning_rate": 2.048422376105299e-06,
"loss": 2.3371,
"step": 734
},
{
"epoch": 0.7320717131474104,
"grad_norm": 0.7712534666061401,
"learning_rate": 2.034399251322458e-06,
"loss": 2.6775,
"step": 735
},
{
"epoch": 0.7330677290836654,
"grad_norm": 1.0466793775558472,
"learning_rate": 2.020412021748866e-06,
"loss": 2.2773,
"step": 736
},
{
"epoch": 0.7340637450199203,
"grad_norm": 1.417794942855835,
"learning_rate": 2.0064608566839584e-06,
"loss": 2.7359,
"step": 737
},
{
"epoch": 0.7350597609561753,
"grad_norm": 0.5706871747970581,
"learning_rate": 1.9925459249906488e-06,
"loss": 2.246,
"step": 738
},
{
"epoch": 0.7360557768924303,
"grad_norm": 1.112219214439392,
"learning_rate": 1.978667395093293e-06,
"loss": 2.5444,
"step": 739
},
{
"epoch": 0.7370517928286853,
"grad_norm": 1.5537924766540527,
"learning_rate": 1.964825434975639e-06,
"loss": 2.2497,
"step": 740
},
{
"epoch": 0.7380478087649402,
"grad_norm": 0.7418034672737122,
"learning_rate": 1.9510202121788003e-06,
"loss": 2.4711,
"step": 741
},
{
"epoch": 0.7390438247011952,
"grad_norm": 1.0376439094543457,
"learning_rate": 1.9372518937992306e-06,
"loss": 2.1369,
"step": 742
},
{
"epoch": 0.7400398406374502,
"grad_norm": 1.6938295364379883,
"learning_rate": 1.923520646486695e-06,
"loss": 2.7013,
"step": 743
},
{
"epoch": 0.7410358565737052,
"grad_norm": 1.1227657794952393,
"learning_rate": 1.9098266364422554e-06,
"loss": 2.1956,
"step": 744
},
{
"epoch": 0.7420318725099602,
"grad_norm": 0.8521560430526733,
"learning_rate": 1.8961700294162578e-06,
"loss": 2.7621,
"step": 745
},
{
"epoch": 0.7430278884462151,
"grad_norm": 1.3367222547531128,
"learning_rate": 1.8825509907063328e-06,
"loss": 2.3669,
"step": 746
},
{
"epoch": 0.7440239043824701,
"grad_norm": 1.0971968173980713,
"learning_rate": 1.8689696851553847e-06,
"loss": 2.2727,
"step": 747
},
{
"epoch": 0.7450199203187251,
"grad_norm": 0.7232230305671692,
"learning_rate": 1.8554262771496017e-06,
"loss": 2.4247,
"step": 748
},
{
"epoch": 0.7460159362549801,
"grad_norm": 0.779901921749115,
"learning_rate": 1.8419209306164653e-06,
"loss": 2.4956,
"step": 749
},
{
"epoch": 0.7470119521912351,
"grad_norm": 0.9150820970535278,
"learning_rate": 1.82845380902277e-06,
"loss": 2.6319,
"step": 750
},
{
"epoch": 0.74800796812749,
"grad_norm": 1.0264116525650024,
"learning_rate": 1.8150250753726363e-06,
"loss": 2.537,
"step": 751
},
{
"epoch": 0.749003984063745,
"grad_norm": 1.3325294256210327,
"learning_rate": 1.8016348922055448e-06,
"loss": 2.5891,
"step": 752
},
{
"epoch": 0.75,
"grad_norm": 0.9217858910560608,
"learning_rate": 1.7882834215943645e-06,
"loss": 2.3572,
"step": 753
},
{
"epoch": 0.750996015936255,
"grad_norm": 1.020738959312439,
"learning_rate": 1.7749708251433983e-06,
"loss": 2.4734,
"step": 754
},
{
"epoch": 0.75199203187251,
"grad_norm": 0.9455721378326416,
"learning_rate": 1.7616972639864166e-06,
"loss": 2.4533,
"step": 755
},
{
"epoch": 0.7529880478087649,
"grad_norm": 1.7625263929367065,
"learning_rate": 1.7484628987847125e-06,
"loss": 2.5292,
"step": 756
},
{
"epoch": 0.7539840637450199,
"grad_norm": 1.2456424236297607,
"learning_rate": 1.7352678897251606e-06,
"loss": 2.5379,
"step": 757
},
{
"epoch": 0.7549800796812749,
"grad_norm": 1.9081121683120728,
"learning_rate": 1.7221123965182712e-06,
"loss": 2.413,
"step": 758
},
{
"epoch": 0.7559760956175299,
"grad_norm": 1.062225341796875,
"learning_rate": 1.7089965783962608e-06,
"loss": 2.458,
"step": 759
},
{
"epoch": 0.7569721115537849,
"grad_norm": 1.1116987466812134,
"learning_rate": 1.6959205941111228e-06,
"loss": 2.4556,
"step": 760
},
{
"epoch": 0.7579681274900398,
"grad_norm": 1.6234967708587646,
"learning_rate": 1.6828846019327128e-06,
"loss": 2.5499,
"step": 761
},
{
"epoch": 0.7589641434262948,
"grad_norm": 0.741877555847168,
"learning_rate": 1.6698887596468232e-06,
"loss": 2.1629,
"step": 762
},
{
"epoch": 0.7599601593625498,
"grad_norm": 2.112726926803589,
"learning_rate": 1.6569332245532777e-06,
"loss": 2.85,
"step": 763
},
{
"epoch": 0.7609561752988048,
"grad_norm": 0.9073076844215393,
"learning_rate": 1.6440181534640277e-06,
"loss": 2.4195,
"step": 764
},
{
"epoch": 0.7619521912350598,
"grad_norm": 1.2635924816131592,
"learning_rate": 1.6311437027012582e-06,
"loss": 2.3853,
"step": 765
},
{
"epoch": 0.7629482071713147,
"grad_norm": 1.6688510179519653,
"learning_rate": 1.618310028095486e-06,
"loss": 2.186,
"step": 766
},
{
"epoch": 0.7639442231075697,
"grad_norm": 1.076957106590271,
"learning_rate": 1.6055172849836826e-06,
"loss": 2.3887,
"step": 767
},
{
"epoch": 0.7649402390438247,
"grad_norm": 1.0081124305725098,
"learning_rate": 1.5927656282073861e-06,
"loss": 2.4315,
"step": 768
},
{
"epoch": 0.7659362549800797,
"grad_norm": 1.2986465692520142,
"learning_rate": 1.5800552121108392e-06,
"loss": 2.6633,
"step": 769
},
{
"epoch": 0.7669322709163346,
"grad_norm": 0.7207338213920593,
"learning_rate": 1.567386190539107e-06,
"loss": 2.6924,
"step": 770
},
{
"epoch": 0.7679282868525896,
"grad_norm": 0.6458574533462524,
"learning_rate": 1.5547587168362204e-06,
"loss": 2.7688,
"step": 771
},
{
"epoch": 0.7689243027888446,
"grad_norm": 1.0633124113082886,
"learning_rate": 1.5421729438433274e-06,
"loss": 2.1328,
"step": 772
},
{
"epoch": 0.7699203187250996,
"grad_norm": 2.143666982650757,
"learning_rate": 1.5296290238968303e-06,
"loss": 2.29,
"step": 773
},
{
"epoch": 0.7709163346613546,
"grad_norm": 0.5651401281356812,
"learning_rate": 1.517127108826551e-06,
"loss": 2.4732,
"step": 774
},
{
"epoch": 0.7719123505976095,
"grad_norm": 0.8489325642585754,
"learning_rate": 1.5046673499538893e-06,
"loss": 2.3174,
"step": 775
},
{
"epoch": 0.7729083665338645,
"grad_norm": 1.1251336336135864,
"learning_rate": 1.4922498980899907e-06,
"loss": 2.2915,
"step": 776
},
{
"epoch": 0.7739043824701195,
"grad_norm": 0.7484387755393982,
"learning_rate": 1.4798749035339278e-06,
"loss": 2.3685,
"step": 777
},
{
"epoch": 0.7749003984063745,
"grad_norm": 1.1463130712509155,
"learning_rate": 1.4675425160708723e-06,
"loss": 2.468,
"step": 778
},
{
"epoch": 0.7758964143426295,
"grad_norm": 1.5645790100097656,
"learning_rate": 1.4552528849702852e-06,
"loss": 2.6442,
"step": 779
},
{
"epoch": 0.7768924302788844,
"grad_norm": 1.8811829090118408,
"learning_rate": 1.4430061589841122e-06,
"loss": 2.5609,
"step": 780
},
{
"epoch": 0.7778884462151394,
"grad_norm": 0.8737534284591675,
"learning_rate": 1.4308024863449805e-06,
"loss": 2.6824,
"step": 781
},
{
"epoch": 0.7788844621513944,
"grad_norm": 1.1957892179489136,
"learning_rate": 1.4186420147644053e-06,
"loss": 2.3529,
"step": 782
},
{
"epoch": 0.7798804780876494,
"grad_norm": 1.2302711009979248,
"learning_rate": 1.4065248914310066e-06,
"loss": 2.513,
"step": 783
},
{
"epoch": 0.7808764940239044,
"grad_norm": 0.5240752100944519,
"learning_rate": 1.3944512630087182e-06,
"loss": 2.4043,
"step": 784
},
{
"epoch": 0.7818725099601593,
"grad_norm": 1.9195410013198853,
"learning_rate": 1.3824212756350196e-06,
"loss": 2.8095,
"step": 785
},
{
"epoch": 0.7828685258964143,
"grad_norm": 0.9604887962341309,
"learning_rate": 1.3704350749191642e-06,
"loss": 2.3252,
"step": 786
},
{
"epoch": 0.7838645418326693,
"grad_norm": 0.9721193313598633,
"learning_rate": 1.3584928059404207e-06,
"loss": 2.4578,
"step": 787
},
{
"epoch": 0.7848605577689243,
"grad_norm": 1.9194726943969727,
"learning_rate": 1.3465946132463125e-06,
"loss": 2.623,
"step": 788
},
{
"epoch": 0.7858565737051793,
"grad_norm": 1.482784390449524,
"learning_rate": 1.3347406408508695e-06,
"loss": 2.7708,
"step": 789
},
{
"epoch": 0.7868525896414342,
"grad_norm": 0.7451381683349609,
"learning_rate": 1.3229310322328847e-06,
"loss": 2.4386,
"step": 790
},
{
"epoch": 0.7878486055776892,
"grad_norm": 0.6679832339286804,
"learning_rate": 1.3111659303341824e-06,
"loss": 2.37,
"step": 791
},
{
"epoch": 0.7888446215139442,
"grad_norm": 0.8974138498306274,
"learning_rate": 1.2994454775578785e-06,
"loss": 2.2855,
"step": 792
},
{
"epoch": 0.7898406374501992,
"grad_norm": 1.3459084033966064,
"learning_rate": 1.2877698157666663e-06,
"loss": 2.7191,
"step": 793
},
{
"epoch": 0.7908366533864541,
"grad_norm": 1.0975403785705566,
"learning_rate": 1.2761390862810907e-06,
"loss": 2.2521,
"step": 794
},
{
"epoch": 0.7918326693227091,
"grad_norm": 0.9908530712127686,
"learning_rate": 1.2645534298778506e-06,
"loss": 2.603,
"step": 795
},
{
"epoch": 0.7928286852589641,
"grad_norm": 0.7879658937454224,
"learning_rate": 1.253012986788078e-06,
"loss": 2.6744,
"step": 796
},
{
"epoch": 0.7938247011952191,
"grad_norm": 0.9611647129058838,
"learning_rate": 1.2415178966956531e-06,
"loss": 2.3191,
"step": 797
},
{
"epoch": 0.7948207171314741,
"grad_norm": 2.035386085510254,
"learning_rate": 1.2300682987355122e-06,
"loss": 2.46,
"step": 798
},
{
"epoch": 0.795816733067729,
"grad_norm": 1.4089851379394531,
"learning_rate": 1.2186643314919571e-06,
"loss": 2.164,
"step": 799
},
{
"epoch": 0.796812749003984,
"grad_norm": 0.944324254989624,
"learning_rate": 1.2073061329969843e-06,
"loss": 2.4692,
"step": 800
},
{
"epoch": 0.797808764940239,
"grad_norm": 1.0982811450958252,
"learning_rate": 1.1959938407286099e-06,
"loss": 2.2721,
"step": 801
},
{
"epoch": 0.798804780876494,
"grad_norm": 0.8596687316894531,
"learning_rate": 1.1847275916092116e-06,
"loss": 2.3065,
"step": 802
},
{
"epoch": 0.799800796812749,
"grad_norm": 0.8045834898948669,
"learning_rate": 1.1735075220038634e-06,
"loss": 2.1781,
"step": 803
},
{
"epoch": 0.8007968127490039,
"grad_norm": 1.5618336200714111,
"learning_rate": 1.1623337677186902e-06,
"loss": 2.4166,
"step": 804
},
{
"epoch": 0.8017928286852589,
"grad_norm": 1.3380889892578125,
"learning_rate": 1.151206463999222e-06,
"loss": 2.5112,
"step": 805
},
{
"epoch": 0.8027888446215139,
"grad_norm": 1.37197744846344,
"learning_rate": 1.1401257455287612e-06,
"loss": 2.4657,
"step": 806
},
{
"epoch": 0.8037848605577689,
"grad_norm": 1.3427671194076538,
"learning_rate": 1.1290917464267458e-06,
"loss": 2.5771,
"step": 807
},
{
"epoch": 0.8047808764940239,
"grad_norm": 0.5480353832244873,
"learning_rate": 1.1181046002471292e-06,
"loss": 2.2615,
"step": 808
},
{
"epoch": 0.8057768924302788,
"grad_norm": 1.03799569606781,
"learning_rate": 1.107164439976764e-06,
"loss": 2.1258,
"step": 809
},
{
"epoch": 0.8067729083665338,
"grad_norm": 0.874638020992279,
"learning_rate": 1.0962713980337947e-06,
"loss": 2.3982,
"step": 810
},
{
"epoch": 0.8077689243027888,
"grad_norm": 0.7668205499649048,
"learning_rate": 1.085425606266049e-06,
"loss": 2.5981,
"step": 811
},
{
"epoch": 0.8087649402390438,
"grad_norm": 1.033339262008667,
"learning_rate": 1.0746271959494453e-06,
"loss": 2.425,
"step": 812
},
{
"epoch": 0.8097609561752988,
"grad_norm": 0.7686687707901001,
"learning_rate": 1.063876297786407e-06,
"loss": 2.287,
"step": 813
},
{
"epoch": 0.8107569721115537,
"grad_norm": 0.8868098855018616,
"learning_rate": 1.0531730419042736e-06,
"loss": 2.4047,
"step": 814
},
{
"epoch": 0.8117529880478087,
"grad_norm": 0.9515554308891296,
"learning_rate": 1.04251755785373e-06,
"loss": 2.2864,
"step": 815
},
{
"epoch": 0.8127490039840638,
"grad_norm": 0.96048903465271,
"learning_rate": 1.0319099746072375e-06,
"loss": 2.305,
"step": 816
},
{
"epoch": 0.8137450199203188,
"grad_norm": 1.2892875671386719,
"learning_rate": 1.0213504205574758e-06,
"loss": 2.8046,
"step": 817
},
{
"epoch": 0.8147410358565738,
"grad_norm": 1.2894792556762695,
"learning_rate": 1.0108390235157828e-06,
"loss": 2.3662,
"step": 818
},
{
"epoch": 0.8157370517928287,
"grad_norm": 0.894437849521637,
"learning_rate": 1.0003759107106116e-06,
"loss": 2.3213,
"step": 819
},
{
"epoch": 0.8167330677290837,
"grad_norm": 0.8480390906333923,
"learning_rate": 9.899612087859883e-07,
"loss": 2.2743,
"step": 820
},
{
"epoch": 0.8177290836653387,
"grad_norm": 0.6957425475120544,
"learning_rate": 9.795950437999852e-07,
"loss": 2.6014,
"step": 821
},
{
"epoch": 0.8187250996015937,
"grad_norm": 1.1951571702957153,
"learning_rate": 9.692775412231863e-07,
"loss": 2.5359,
"step": 822
},
{
"epoch": 0.8197211155378487,
"grad_norm": 1.5518149137496948,
"learning_rate": 9.590088259371738e-07,
"loss": 2.5717,
"step": 823
},
{
"epoch": 0.8207171314741036,
"grad_norm": 0.9850301146507263,
"learning_rate": 9.487890222330137e-07,
"loss": 2.3225,
"step": 824
},
{
"epoch": 0.8217131474103586,
"grad_norm": 1.0830625295639038,
"learning_rate": 9.386182538097582e-07,
"loss": 2.49,
"step": 825
},
{
"epoch": 0.8227091633466136,
"grad_norm": 1.1903777122497559,
"learning_rate": 9.284966437729387e-07,
"loss": 2.3532,
"step": 826
},
{
"epoch": 0.8237051792828686,
"grad_norm": 1.2485320568084717,
"learning_rate": 9.184243146330829e-07,
"loss": 2.286,
"step": 827
},
{
"epoch": 0.8247011952191236,
"grad_norm": 1.5328834056854248,
"learning_rate": 9.084013883042276e-07,
"loss": 2.5148,
"step": 828
},
{
"epoch": 0.8256972111553785,
"grad_norm": 1.7866473197937012,
"learning_rate": 8.984279861024453e-07,
"loss": 2.7636,
"step": 829
},
{
"epoch": 0.8266932270916335,
"grad_norm": 0.8635814785957336,
"learning_rate": 8.885042287443785e-07,
"loss": 2.579,
"step": 830
},
{
"epoch": 0.8276892430278885,
"grad_norm": 1.480765461921692,
"learning_rate": 8.786302363457733e-07,
"loss": 2.6228,
"step": 831
},
{
"epoch": 0.8286852589641435,
"grad_norm": 0.6803283095359802,
"learning_rate": 8.688061284200266e-07,
"loss": 2.4377,
"step": 832
},
{
"epoch": 0.8296812749003984,
"grad_norm": 1.2872114181518555,
"learning_rate": 8.590320238767425e-07,
"loss": 2.4269,
"step": 833
},
{
"epoch": 0.8306772908366534,
"grad_norm": 1.054971694946289,
"learning_rate": 8.493080410202914e-07,
"loss": 2.4407,
"step": 834
},
{
"epoch": 0.8316733067729084,
"grad_norm": 0.8980826735496521,
"learning_rate": 8.396342975483751e-07,
"loss": 2.352,
"step": 835
},
{
"epoch": 0.8326693227091634,
"grad_norm": 0.7561918497085571,
"learning_rate": 8.30010910550611e-07,
"loss": 2.3562,
"step": 836
},
{
"epoch": 0.8336653386454184,
"grad_norm": 0.6130694150924683,
"learning_rate": 8.204379965071036e-07,
"loss": 2.3059,
"step": 837
},
{
"epoch": 0.8346613545816733,
"grad_norm": 1.1262505054473877,
"learning_rate": 8.109156712870397e-07,
"loss": 2.7065,
"step": 838
},
{
"epoch": 0.8356573705179283,
"grad_norm": 2.2383527755737305,
"learning_rate": 8.014440501472909e-07,
"loss": 2.6112,
"step": 839
},
{
"epoch": 0.8366533864541833,
"grad_norm": 1.0032474994659424,
"learning_rate": 7.920232477310102e-07,
"loss": 2.4155,
"step": 840
},
{
"epoch": 0.8376494023904383,
"grad_norm": 1.5110204219818115,
"learning_rate": 7.826533780662481e-07,
"loss": 2.2312,
"step": 841
},
{
"epoch": 0.8386454183266933,
"grad_norm": 1.206811785697937,
"learning_rate": 7.733345545645726e-07,
"loss": 2.7882,
"step": 842
},
{
"epoch": 0.8396414342629482,
"grad_norm": 1.1954952478408813,
"learning_rate": 7.640668900196985e-07,
"loss": 2.5765,
"step": 843
},
{
"epoch": 0.8406374501992032,
"grad_norm": 0.8079789280891418,
"learning_rate": 7.54850496606117e-07,
"loss": 2.0988,
"step": 844
},
{
"epoch": 0.8416334661354582,
"grad_norm": 1.204300880432129,
"learning_rate": 7.456854858777418e-07,
"loss": 2.6726,
"step": 845
},
{
"epoch": 0.8426294820717132,
"grad_norm": 2.1050732135772705,
"learning_rate": 7.365719687665568e-07,
"loss": 2.4657,
"step": 846
},
{
"epoch": 0.8436254980079682,
"grad_norm": 1.1028344631195068,
"learning_rate": 7.27510055581278e-07,
"loss": 2.7165,
"step": 847
},
{
"epoch": 0.8446215139442231,
"grad_norm": 0.9454997777938843,
"learning_rate": 7.184998560060114e-07,
"loss": 2.4538,
"step": 848
},
{
"epoch": 0.8456175298804781,
"grad_norm": 1.000157117843628,
"learning_rate": 7.095414790989292e-07,
"loss": 2.2186,
"step": 849
},
{
"epoch": 0.8466135458167331,
"grad_norm": 1.7517778873443604,
"learning_rate": 7.006350332909495e-07,
"loss": 2.4932,
"step": 850
},
{
"epoch": 0.8476095617529881,
"grad_norm": 1.2720731496810913,
"learning_rate": 6.917806263844268e-07,
"loss": 2.287,
"step": 851
},
{
"epoch": 0.848605577689243,
"grad_norm": 1.1185457706451416,
"learning_rate": 6.829783655518402e-07,
"loss": 2.4009,
"step": 852
},
{
"epoch": 0.849601593625498,
"grad_norm": 1.228535771369934,
"learning_rate": 6.742283573345004e-07,
"loss": 2.4729,
"step": 853
},
{
"epoch": 0.850597609561753,
"grad_norm": 1.3845924139022827,
"learning_rate": 6.655307076412637e-07,
"loss": 2.5723,
"step": 854
},
{
"epoch": 0.851593625498008,
"grad_norm": 1.2278180122375488,
"learning_rate": 6.568855217472425e-07,
"loss": 2.4958,
"step": 855
},
{
"epoch": 0.852589641434263,
"grad_norm": 0.7195264101028442,
"learning_rate": 6.482929042925363e-07,
"loss": 2.5879,
"step": 856
},
{
"epoch": 0.853585657370518,
"grad_norm": 1.1685850620269775,
"learning_rate": 6.397529592809615e-07,
"loss": 2.3411,
"step": 857
},
{
"epoch": 0.8545816733067729,
"grad_norm": 0.935212254524231,
"learning_rate": 6.312657900788e-07,
"loss": 2.3266,
"step": 858
},
{
"epoch": 0.8555776892430279,
"grad_norm": 1.0523936748504639,
"learning_rate": 6.228314994135376e-07,
"loss": 2.5915,
"step": 859
},
{
"epoch": 0.8565737051792829,
"grad_norm": 0.7740164399147034,
"learning_rate": 6.14450189372628e-07,
"loss": 2.7521,
"step": 860
},
{
"epoch": 0.8575697211155379,
"grad_norm": 2.0991780757904053,
"learning_rate": 6.061219614022535e-07,
"loss": 2.2285,
"step": 861
},
{
"epoch": 0.8585657370517928,
"grad_norm": 0.990088164806366,
"learning_rate": 5.978469163061018e-07,
"loss": 2.3091,
"step": 862
},
{
"epoch": 0.8595617529880478,
"grad_norm": 1.3237099647521973,
"learning_rate": 5.896251542441395e-07,
"loss": 2.4856,
"step": 863
},
{
"epoch": 0.8605577689243028,
"grad_norm": 0.9690184593200684,
"learning_rate": 5.814567747314049e-07,
"loss": 2.3608,
"step": 864
},
{
"epoch": 0.8615537848605578,
"grad_norm": 2.3030495643615723,
"learning_rate": 5.733418766367988e-07,
"loss": 2.4188,
"step": 865
},
{
"epoch": 0.8625498007968128,
"grad_norm": 1.1400495767593384,
"learning_rate": 5.652805581818943e-07,
"loss": 2.2339,
"step": 866
},
{
"epoch": 0.8635458167330677,
"grad_norm": 1.5023630857467651,
"learning_rate": 5.572729169397422e-07,
"loss": 2.1393,
"step": 867
},
{
"epoch": 0.8645418326693227,
"grad_norm": 0.9161491394042969,
"learning_rate": 5.493190498336903e-07,
"loss": 2.3602,
"step": 868
},
{
"epoch": 0.8655378486055777,
"grad_norm": 0.9129965901374817,
"learning_rate": 5.414190531362162e-07,
"loss": 2.3639,
"step": 869
},
{
"epoch": 0.8665338645418327,
"grad_norm": 1.9090954065322876,
"learning_rate": 5.335730224677538e-07,
"loss": 2.2505,
"step": 870
},
{
"epoch": 0.8675298804780877,
"grad_norm": 0.8173048496246338,
"learning_rate": 5.25781052795541e-07,
"loss": 2.2072,
"step": 871
},
{
"epoch": 0.8685258964143426,
"grad_norm": 2.2648603916168213,
"learning_rate": 5.180432384324691e-07,
"loss": 2.458,
"step": 872
},
{
"epoch": 0.8695219123505976,
"grad_norm": 0.906757652759552,
"learning_rate": 5.103596730359428e-07,
"loss": 2.5185,
"step": 873
},
{
"epoch": 0.8705179282868526,
"grad_norm": 0.9674282073974609,
"learning_rate": 5.027304496067431e-07,
"loss": 2.3208,
"step": 874
},
{
"epoch": 0.8715139442231076,
"grad_norm": 1.4019418954849243,
"learning_rate": 4.951556604879049e-07,
"loss": 2.531,
"step": 875
},
{
"epoch": 0.8725099601593626,
"grad_norm": 1.1170932054519653,
"learning_rate": 4.876353973635955e-07,
"loss": 2.2724,
"step": 876
},
{
"epoch": 0.8735059760956175,
"grad_norm": 0.8795150518417358,
"learning_rate": 4.8016975125801e-07,
"loss": 2.4447,
"step": 877
},
{
"epoch": 0.8745019920318725,
"grad_norm": 1.4134328365325928,
"learning_rate": 4.727588125342669e-07,
"loss": 2.4728,
"step": 878
},
{
"epoch": 0.8754980079681275,
"grad_norm": 0.8537651300430298,
"learning_rate": 4.6540267089331294e-07,
"loss": 1.8693,
"step": 879
},
{
"epoch": 0.8764940239043825,
"grad_norm": 0.973147988319397,
"learning_rate": 4.581014153728386e-07,
"loss": 2.6805,
"step": 880
},
{
"epoch": 0.8774900398406374,
"grad_norm": 2.0054306983947754,
"learning_rate": 4.508551343462014e-07,
"loss": 2.8345,
"step": 881
},
{
"epoch": 0.8784860557768924,
"grad_norm": 1.3600786924362183,
"learning_rate": 4.4366391552135567e-07,
"loss": 2.4685,
"step": 882
},
{
"epoch": 0.8794820717131474,
"grad_norm": 1.042197823524475,
"learning_rate": 4.3652784593978927e-07,
"loss": 2.2609,
"step": 883
},
{
"epoch": 0.8804780876494024,
"grad_norm": 1.342214822769165,
"learning_rate": 4.29447011975474e-07,
"loss": 2.3443,
"step": 884
},
{
"epoch": 0.8814741035856574,
"grad_norm": 1.0559214353561401,
"learning_rate": 4.224214993338149e-07,
"loss": 2.5197,
"step": 885
},
{
"epoch": 0.8824701195219123,
"grad_norm": 0.9082587361335754,
"learning_rate": 4.154513930506171e-07,
"loss": 2.4609,
"step": 886
},
{
"epoch": 0.8834661354581673,
"grad_norm": 1.555578589439392,
"learning_rate": 4.0853677749105426e-07,
"loss": 2.5883,
"step": 887
},
{
"epoch": 0.8844621513944223,
"grad_norm": 0.6700481176376343,
"learning_rate": 4.0167773634865017e-07,
"loss": 2.4754,
"step": 888
},
{
"epoch": 0.8854581673306773,
"grad_norm": 1.7728710174560547,
"learning_rate": 3.9487435264426056e-07,
"loss": 2.5669,
"step": 889
},
{
"epoch": 0.8864541832669323,
"grad_norm": 0.8938197493553162,
"learning_rate": 3.8812670872507454e-07,
"loss": 2.7976,
"step": 890
},
{
"epoch": 0.8874501992031872,
"grad_norm": 1.6636402606964111,
"learning_rate": 3.8143488626361135e-07,
"loss": 2.4172,
"step": 891
},
{
"epoch": 0.8884462151394422,
"grad_norm": 1.0676062107086182,
"learning_rate": 3.747989662567403e-07,
"loss": 2.582,
"step": 892
},
{
"epoch": 0.8894422310756972,
"grad_norm": 1.280415654182434,
"learning_rate": 3.6821902902469066e-07,
"loss": 2.2295,
"step": 893
},
{
"epoch": 0.8904382470119522,
"grad_norm": 1.0814982652664185,
"learning_rate": 3.6169515421008494e-07,
"loss": 2.7001,
"step": 894
},
{
"epoch": 0.8914342629482072,
"grad_norm": 1.077160120010376,
"learning_rate": 3.5522742077697734e-07,
"loss": 2.49,
"step": 895
},
{
"epoch": 0.8924302788844621,
"grad_norm": 1.2483303546905518,
"learning_rate": 3.4881590700989175e-07,
"loss": 2.3397,
"step": 896
},
{
"epoch": 0.8934262948207171,
"grad_norm": 0.6463543772697449,
"learning_rate": 3.4246069051287747e-07,
"loss": 2.3176,
"step": 897
},
{
"epoch": 0.8944223107569721,
"grad_norm": 0.8918944001197815,
"learning_rate": 3.3616184820856936e-07,
"loss": 2.5445,
"step": 898
},
{
"epoch": 0.8954183266932271,
"grad_norm": 0.963518500328064,
"learning_rate": 3.299194563372604e-07,
"loss": 2.3521,
"step": 899
},
{
"epoch": 0.896414342629482,
"grad_norm": 1.8490091562271118,
"learning_rate": 3.237335904559713e-07,
"loss": 1.8583,
"step": 900
},
{
"epoch": 0.897410358565737,
"grad_norm": 1.2349917888641357,
"learning_rate": 3.176043254375422e-07,
"loss": 2.5093,
"step": 901
},
{
"epoch": 0.898406374501992,
"grad_norm": 1.3500816822052002,
"learning_rate": 3.1153173546972395e-07,
"loss": 2.3507,
"step": 902
},
{
"epoch": 0.899402390438247,
"grad_norm": 1.2740628719329834,
"learning_rate": 3.055158940542818e-07,
"loss": 2.4008,
"step": 903
},
{
"epoch": 0.900398406374502,
"grad_norm": 33.82315444946289,
"learning_rate": 2.9955687400610336e-07,
"loss": 2.3794,
"step": 904
},
{
"epoch": 0.901394422310757,
"grad_norm": 0.8968676328659058,
"learning_rate": 2.9365474745231935e-07,
"loss": 2.3304,
"step": 905
},
{
"epoch": 0.9023904382470119,
"grad_norm": 1.474859356880188,
"learning_rate": 2.878095858314278e-07,
"loss": 2.2761,
"step": 906
},
{
"epoch": 0.9033864541832669,
"grad_norm": 1.0338733196258545,
"learning_rate": 2.820214598924348e-07,
"loss": 2.222,
"step": 907
},
{
"epoch": 0.9043824701195219,
"grad_norm": 1.2461026906967163,
"learning_rate": 2.7629043969399193e-07,
"loss": 2.129,
"step": 908
},
{
"epoch": 0.9053784860557769,
"grad_norm": 1.0711873769760132,
"learning_rate": 2.7061659460355047e-07,
"loss": 2.5671,
"step": 909
},
{
"epoch": 0.9063745019920318,
"grad_norm": 2.0772130489349365,
"learning_rate": 2.6499999329652525e-07,
"loss": 2.4514,
"step": 910
},
{
"epoch": 0.9073705179282868,
"grad_norm": 0.8682186603546143,
"learning_rate": 2.594407037554586e-07,
"loss": 2.4655,
"step": 911
},
{
"epoch": 0.9083665338645418,
"grad_norm": 2.363909959793091,
"learning_rate": 2.539387932691995e-07,
"loss": 3.0087,
"step": 912
},
{
"epoch": 0.9093625498007968,
"grad_norm": 1.1324294805526733,
"learning_rate": 2.4849432843208786e-07,
"loss": 2.6718,
"step": 913
},
{
"epoch": 0.9103585657370518,
"grad_norm": 1.2291409969329834,
"learning_rate": 2.431073751431529e-07,
"loss": 2.4459,
"step": 914
},
{
"epoch": 0.9113545816733067,
"grad_norm": 1.1384942531585693,
"learning_rate": 2.377779986053097e-07,
"loss": 2.3891,
"step": 915
},
{
"epoch": 0.9123505976095617,
"grad_norm": 0.9154942631721497,
"learning_rate": 2.3250626332457226e-07,
"loss": 2.3965,
"step": 916
},
{
"epoch": 0.9133466135458167,
"grad_norm": 3.3389575481414795,
"learning_rate": 2.2729223310927473e-07,
"loss": 2.4395,
"step": 917
},
{
"epoch": 0.9143426294820717,
"grad_norm": 1.6811953783035278,
"learning_rate": 2.2213597106929608e-07,
"loss": 2.6017,
"step": 918
},
{
"epoch": 0.9153386454183267,
"grad_norm": 0.8396251201629639,
"learning_rate": 2.1703753961529906e-07,
"loss": 2.7736,
"step": 919
},
{
"epoch": 0.9163346613545816,
"grad_norm": 1.4626351594924927,
"learning_rate": 2.1199700045797077e-07,
"loss": 2.2861,
"step": 920
},
{
"epoch": 0.9173306772908366,
"grad_norm": 1.5617847442626953,
"learning_rate": 2.070144146072789e-07,
"loss": 2.6273,
"step": 921
},
{
"epoch": 0.9183266932270916,
"grad_norm": 1.4151337146759033,
"learning_rate": 2.0208984237173546e-07,
"loss": 2.1617,
"step": 922
},
{
"epoch": 0.9193227091633466,
"grad_norm": 0.9167352318763733,
"learning_rate": 1.9722334335766092e-07,
"loss": 2.3805,
"step": 923
},
{
"epoch": 0.9203187250996016,
"grad_norm": 0.840559720993042,
"learning_rate": 1.9241497646846463e-07,
"loss": 2.3247,
"step": 924
},
{
"epoch": 0.9213147410358565,
"grad_norm": 0.9646689891815186,
"learning_rate": 1.876647999039377e-07,
"loss": 2.5301,
"step": 925
},
{
"epoch": 0.9223107569721115,
"grad_norm": 0.7615554928779602,
"learning_rate": 1.829728711595391e-07,
"loss": 2.3153,
"step": 926
},
{
"epoch": 0.9233067729083665,
"grad_norm": 1.138389229774475,
"learning_rate": 1.7833924702570725e-07,
"loss": 2.573,
"step": 927
},
{
"epoch": 0.9243027888446215,
"grad_norm": 1.0490334033966064,
"learning_rate": 1.7376398358716852e-07,
"loss": 2.6447,
"step": 928
},
{
"epoch": 0.9252988047808764,
"grad_norm": 0.9057884812355042,
"learning_rate": 1.6924713622225975e-07,
"loss": 2.6325,
"step": 929
},
{
"epoch": 0.9262948207171314,
"grad_norm": 2.5391931533813477,
"learning_rate": 1.6478875960225904e-07,
"loss": 2.7326,
"step": 930
},
{
"epoch": 0.9272908366533864,
"grad_norm": 1.9867279529571533,
"learning_rate": 1.6038890769072223e-07,
"loss": 2.4295,
"step": 931
},
{
"epoch": 0.9282868525896414,
"grad_norm": 0.7426679730415344,
"learning_rate": 1.5604763374283073e-07,
"loss": 2.3705,
"step": 932
},
{
"epoch": 0.9292828685258964,
"grad_norm": 1.033311367034912,
"learning_rate": 1.5176499030474578e-07,
"loss": 2.2442,
"step": 933
},
{
"epoch": 0.9302788844621513,
"grad_norm": 1.394702672958374,
"learning_rate": 1.4754102921297363e-07,
"loss": 2.2417,
"step": 934
},
{
"epoch": 0.9312749003984063,
"grad_norm": 1.109711766242981,
"learning_rate": 1.4337580159373864e-07,
"loss": 2.1194,
"step": 935
},
{
"epoch": 0.9322709163346613,
"grad_norm": 0.810978353023529,
"learning_rate": 1.3926935786236218e-07,
"loss": 2.251,
"step": 936
},
{
"epoch": 0.9332669322709163,
"grad_norm": 0.9075368642807007,
"learning_rate": 1.3522174772265585e-07,
"loss": 2.3295,
"step": 937
},
{
"epoch": 0.9342629482071713,
"grad_norm": 1.0174400806427002,
"learning_rate": 1.3123302016631477e-07,
"loss": 2.4231,
"step": 938
},
{
"epoch": 0.9352589641434262,
"grad_norm": 1.4848259687423706,
"learning_rate": 1.2730322347233037e-07,
"loss": 2.327,
"step": 939
},
{
"epoch": 0.9362549800796812,
"grad_norm": 0.7531813979148865,
"learning_rate": 1.2343240520640287e-07,
"loss": 2.5221,
"step": 940
},
{
"epoch": 0.9372509960159362,
"grad_norm": 2.318554162979126,
"learning_rate": 1.196206122203647e-07,
"loss": 2.3946,
"step": 941
},
{
"epoch": 0.9382470119521913,
"grad_norm": 0.8973721861839294,
"learning_rate": 1.158678906516153e-07,
"loss": 2.1919,
"step": 942
},
{
"epoch": 0.9392430278884463,
"grad_norm": 1.4439376592636108,
"learning_rate": 1.1217428592256218e-07,
"loss": 2.3653,
"step": 943
},
{
"epoch": 0.9402390438247012,
"grad_norm": 1.6569935083389282,
"learning_rate": 1.0853984274007246e-07,
"loss": 2.6982,
"step": 944
},
{
"epoch": 0.9412350597609562,
"grad_norm": 1.5078299045562744,
"learning_rate": 1.0496460509492767e-07,
"loss": 2.4708,
"step": 945
},
{
"epoch": 0.9422310756972112,
"grad_norm": 2.2158310413360596,
"learning_rate": 1.0144861626129599e-07,
"loss": 2.5999,
"step": 946
},
{
"epoch": 0.9432270916334662,
"grad_norm": 1.4059021472930908,
"learning_rate": 9.799191879620474e-08,
"loss": 2.4373,
"step": 947
},
{
"epoch": 0.9442231075697212,
"grad_norm": 2.9164271354675293,
"learning_rate": 9.459455453902866e-08,
"loss": 2.6697,
"step": 948
},
{
"epoch": 0.9452191235059761,
"grad_norm": 1.275817632675171,
"learning_rate": 9.125656461098142e-08,
"loss": 2.48,
"step": 949
},
{
"epoch": 0.9462151394422311,
"grad_norm": 0.821499764919281,
"learning_rate": 8.797798941461655e-08,
"loss": 2.4301,
"step": 950
},
{
"epoch": 0.9472111553784861,
"grad_norm": 1.4214954376220703,
"learning_rate": 8.475886863334282e-08,
"loss": 2.2847,
"step": 951
},
{
"epoch": 0.9482071713147411,
"grad_norm": 0.9953071475028992,
"learning_rate": 8.15992412309391e-08,
"loss": 2.464,
"step": 952
},
{
"epoch": 0.9492031872509961,
"grad_norm": 1.4973928928375244,
"learning_rate": 7.84991454510864e-08,
"loss": 2.0293,
"step": 953
},
{
"epoch": 0.950199203187251,
"grad_norm": 1.326232671737671,
"learning_rate": 7.545861881690097e-08,
"loss": 2.4445,
"step": 954
},
{
"epoch": 0.951195219123506,
"grad_norm": 1.0779331922531128,
"learning_rate": 7.247769813048644e-08,
"loss": 2.7232,
"step": 955
},
{
"epoch": 0.952191235059761,
"grad_norm": 1.0593082904815674,
"learning_rate": 6.955641947248127e-08,
"loss": 2.8634,
"step": 956
},
{
"epoch": 0.953187250996016,
"grad_norm": 0.8761929869651794,
"learning_rate": 6.669481820162638e-08,
"loss": 2.3328,
"step": 957
},
{
"epoch": 0.954183266932271,
"grad_norm": 0.9143054485321045,
"learning_rate": 6.389292895433608e-08,
"loss": 2.2261,
"step": 958
},
{
"epoch": 0.9551792828685259,
"grad_norm": 0.8204777240753174,
"learning_rate": 6.115078564427946e-08,
"loss": 2.4155,
"step": 959
},
{
"epoch": 0.9561752988047809,
"grad_norm": 0.7546234726905823,
"learning_rate": 5.8468421461968517e-08,
"loss": 2.267,
"step": 960
},
{
"epoch": 0.9571713147410359,
"grad_norm": 1.0376356840133667,
"learning_rate": 5.584586887435739e-08,
"loss": 2.4496,
"step": 961
},
{
"epoch": 0.9581673306772909,
"grad_norm": 0.6573870182037354,
"learning_rate": 5.3283159624448745e-08,
"loss": 2.4904,
"step": 962
},
{
"epoch": 0.9591633466135459,
"grad_norm": 1.3613762855529785,
"learning_rate": 5.0780324730911877e-08,
"loss": 2.4824,
"step": 963
},
{
"epoch": 0.9601593625498008,
"grad_norm": 1.4304169416427612,
"learning_rate": 4.833739448770247e-08,
"loss": 2.9062,
"step": 964
},
{
"epoch": 0.9611553784860558,
"grad_norm": 0.9547715783119202,
"learning_rate": 4.5954398463700647e-08,
"loss": 2.4977,
"step": 965
},
{
"epoch": 0.9621513944223108,
"grad_norm": 1.3909553289413452,
"learning_rate": 4.3631365502351805e-08,
"loss": 2.2116,
"step": 966
},
{
"epoch": 0.9631474103585658,
"grad_norm": 0.7387050986289978,
"learning_rate": 4.136832372131583e-08,
"loss": 2.5225,
"step": 967
},
{
"epoch": 0.9641434262948207,
"grad_norm": 1.2469770908355713,
"learning_rate": 3.916530051212841e-08,
"loss": 2.4759,
"step": 968
},
{
"epoch": 0.9651394422310757,
"grad_norm": 1.3780826330184937,
"learning_rate": 3.702232253986804e-08,
"loss": 2.0538,
"step": 969
},
{
"epoch": 0.9661354581673307,
"grad_norm": 0.9699292778968811,
"learning_rate": 3.4939415742835655e-08,
"loss": 2.3441,
"step": 970
},
{
"epoch": 0.9671314741035857,
"grad_norm": 1.147615671157837,
"learning_rate": 3.2916605332238284e-08,
"loss": 2.4042,
"step": 971
},
{
"epoch": 0.9681274900398407,
"grad_norm": 1.1738359928131104,
"learning_rate": 3.095391579188589e-08,
"loss": 2.539,
"step": 972
},
{
"epoch": 0.9691235059760956,
"grad_norm": 0.640042781829834,
"learning_rate": 2.9051370877892226e-08,
"loss": 2.3044,
"step": 973
},
{
"epoch": 0.9701195219123506,
"grad_norm": 0.7774790525436401,
"learning_rate": 2.7208993618390578e-08,
"loss": 2.2616,
"step": 974
},
{
"epoch": 0.9711155378486056,
"grad_norm": 1.0389803647994995,
"learning_rate": 2.5426806313252895e-08,
"loss": 2.4425,
"step": 975
},
{
"epoch": 0.9721115537848606,
"grad_norm": 1.3041914701461792,
"learning_rate": 2.370483053382111e-08,
"loss": 2.39,
"step": 976
},
{
"epoch": 0.9731075697211156,
"grad_norm": 0.6934490203857422,
"learning_rate": 2.2043087122644023e-08,
"loss": 2.2232,
"step": 977
},
{
"epoch": 0.9741035856573705,
"grad_norm": 0.6868986487388611,
"learning_rate": 2.0441596193227497e-08,
"loss": 2.4806,
"step": 978
},
{
"epoch": 0.9750996015936255,
"grad_norm": 1.1924256086349487,
"learning_rate": 1.8900377129790205e-08,
"loss": 2.4314,
"step": 979
},
{
"epoch": 0.9760956175298805,
"grad_norm": 0.7938891053199768,
"learning_rate": 1.741944858702771e-08,
"loss": 2.4715,
"step": 980
},
{
"epoch": 0.9770916334661355,
"grad_norm": 0.9900745749473572,
"learning_rate": 1.5998828489888762e-08,
"loss": 2.0915,
"step": 981
},
{
"epoch": 0.9780876494023905,
"grad_norm": 3.0510518550872803,
"learning_rate": 1.4638534033356578e-08,
"loss": 3.3239,
"step": 982
},
{
"epoch": 0.9790836653386454,
"grad_norm": 1.1174182891845703,
"learning_rate": 1.333858168224178e-08,
"loss": 2.1631,
"step": 983
},
{
"epoch": 0.9800796812749004,
"grad_norm": 0.7828091979026794,
"learning_rate": 1.2098987170982013e-08,
"loss": 2.2998,
"step": 984
},
{
"epoch": 0.9810756972111554,
"grad_norm": 0.8114204406738281,
"learning_rate": 1.0919765503453195e-08,
"loss": 2.3996,
"step": 985
},
{
"epoch": 0.9820717131474104,
"grad_norm": 0.625230073928833,
"learning_rate": 9.800930952786336e-09,
"loss": 2.6785,
"step": 986
},
{
"epoch": 0.9830677290836654,
"grad_norm": 1.6107351779937744,
"learning_rate": 8.742497061195455e-09,
"loss": 2.6999,
"step": 987
},
{
"epoch": 0.9840637450199203,
"grad_norm": 1.5219416618347168,
"learning_rate": 7.744476639813814e-09,
"loss": 2.3396,
"step": 988
},
{
"epoch": 0.9850597609561753,
"grad_norm": 1.0381386280059814,
"learning_rate": 6.806881768539053e-09,
"loss": 2.2097,
"step": 989
},
{
"epoch": 0.9860557768924303,
"grad_norm": 1.1309791803359985,
"learning_rate": 5.929723795884967e-09,
"loss": 2.4901,
"step": 990
},
{
"epoch": 0.9870517928286853,
"grad_norm": 0.979224443435669,
"learning_rate": 5.113013338847173e-09,
"loss": 2.355,
"step": 991
},
{
"epoch": 0.9880478087649402,
"grad_norm": 0.9343250393867493,
"learning_rate": 4.356760282773209e-09,
"loss": 2.682,
"step": 992
},
{
"epoch": 0.9890438247011952,
"grad_norm": 4.211667060852051,
"learning_rate": 3.660973781242083e-09,
"loss": 3.9138,
"step": 993
},
{
"epoch": 0.9900398406374502,
"grad_norm": 0.9402066469192505,
"learning_rate": 3.0256622559543537e-09,
"loss": 2.1841,
"step": 994
},
{
"epoch": 0.9910358565737052,
"grad_norm": 1.136916995048523,
"learning_rate": 2.4508333966305473e-09,
"loss": 2.2469,
"step": 995
},
{
"epoch": 0.9920318725099602,
"grad_norm": 1.080809473991394,
"learning_rate": 1.936494160916791e-09,
"loss": 2.4922,
"step": 996
},
{
"epoch": 0.9930278884462151,
"grad_norm": 0.9956486821174622,
"learning_rate": 1.4826507743032071e-09,
"loss": 2.5901,
"step": 997
},
{
"epoch": 0.9940239043824701,
"grad_norm": 2.1008529663085938,
"learning_rate": 1.089308730043981e-09,
"loss": 2.8828,
"step": 998
},
{
"epoch": 0.9950199203187251,
"grad_norm": 1.3167147636413574,
"learning_rate": 7.564727890968515e-10,
"loss": 2.3331,
"step": 999
},
{
"epoch": 0.9960159362549801,
"grad_norm": 1.1669059991836548,
"learning_rate": 4.841469800592746e-10,
"loss": 1.9942,
"step": 1000
},
{
"epoch": 0.9970119521912351,
"grad_norm": 0.9161001443862915,
"learning_rate": 2.723345991245685e-10,
"loss": 2.2932,
"step": 1001
},
{
"epoch": 0.99800796812749,
"grad_norm": 0.8516436815261841,
"learning_rate": 1.210382100397256e-10,
"loss": 2.2065,
"step": 1002
},
{
"epoch": 0.999003984063745,
"grad_norm": 0.7772925496101379,
"learning_rate": 3.0259644074326355e-11,
"loss": 2.4638,
"step": 1003
},
{
"epoch": 1.0,
"grad_norm": 1.244535207748413,
"learning_rate": 0.0,
"loss": 2.3463,
"step": 1004
}
],
"logging_steps": 1,
"max_steps": 1004,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.5282748355775386e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}