diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,64840 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 9258, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003240440699935191, + "grad_norm": 9.704249382019043, + "learning_rate": 0.0, + "loss": 0.6798, + "step": 1 + }, + { + "epoch": 0.0006480881399870382, + "grad_norm": 8.982847213745117, + "learning_rate": 1.798561151079137e-08, + "loss": 0.6615, + "step": 2 + }, + { + "epoch": 0.0009721322099805574, + "grad_norm": 9.398996353149414, + "learning_rate": 3.597122302158274e-08, + "loss": 0.6612, + "step": 3 + }, + { + "epoch": 0.0012961762799740765, + "grad_norm": 9.116962432861328, + "learning_rate": 5.395683453237411e-08, + "loss": 0.6734, + "step": 4 + }, + { + "epoch": 0.0016202203499675956, + "grad_norm": 9.689047813415527, + "learning_rate": 7.194244604316547e-08, + "loss": 0.6595, + "step": 5 + }, + { + "epoch": 0.0019442644199611147, + "grad_norm": 9.494297981262207, + "learning_rate": 8.992805755395684e-08, + "loss": 0.6694, + "step": 6 + }, + { + "epoch": 0.002268308489954634, + "grad_norm": 8.994853019714355, + "learning_rate": 1.0791366906474822e-07, + "loss": 0.6465, + "step": 7 + }, + { + "epoch": 0.002592352559948153, + "grad_norm": 9.176651954650879, + "learning_rate": 1.2589928057553958e-07, + "loss": 0.6729, + "step": 8 + }, + { + "epoch": 0.002916396629941672, + "grad_norm": 9.519684791564941, + "learning_rate": 1.4388489208633095e-07, + "loss": 0.6765, + "step": 9 + }, + { + "epoch": 0.0032404406999351912, + "grad_norm": 8.757326126098633, + "learning_rate": 1.618705035971223e-07, + "loss": 0.654, + "step": 10 + }, + { + "epoch": 0.0035644847699287103, + "grad_norm": 9.101982116699219, + "learning_rate": 1.7985611510791368e-07, + "loss": 0.6451, + "step": 11 + }, + { + "epoch": 0.0038885288399222295, + "grad_norm": 8.611491203308105, + "learning_rate": 1.9784172661870504e-07, + "loss": 0.6484, + "step": 12 + }, + { + "epoch": 0.004212572909915748, + "grad_norm": 9.016146659851074, + "learning_rate": 2.1582733812949643e-07, + "loss": 0.6544, + "step": 13 + }, + { + "epoch": 0.004536616979909268, + "grad_norm": 9.414257049560547, + "learning_rate": 2.338129496402878e-07, + "loss": 0.6798, + "step": 14 + }, + { + "epoch": 0.004860661049902786, + "grad_norm": 8.944530487060547, + "learning_rate": 2.5179856115107916e-07, + "loss": 0.6736, + "step": 15 + }, + { + "epoch": 0.005184705119896306, + "grad_norm": 7.937344074249268, + "learning_rate": 2.697841726618705e-07, + "loss": 0.6335, + "step": 16 + }, + { + "epoch": 0.005508749189889825, + "grad_norm": 8.436976432800293, + "learning_rate": 2.877697841726619e-07, + "loss": 0.6361, + "step": 17 + }, + { + "epoch": 0.005832793259883344, + "grad_norm": 8.096851348876953, + "learning_rate": 3.057553956834533e-07, + "loss": 0.6176, + "step": 18 + }, + { + "epoch": 0.006156837329876863, + "grad_norm": 8.40873908996582, + "learning_rate": 3.237410071942446e-07, + "loss": 0.6179, + "step": 19 + }, + { + "epoch": 0.0064808813998703824, + "grad_norm": 7.971982479095459, + "learning_rate": 3.41726618705036e-07, + "loss": 0.6322, + "step": 20 + }, + { + "epoch": 0.006804925469863901, + "grad_norm": 8.637293815612793, + "learning_rate": 3.5971223021582736e-07, + "loss": 0.634, + "step": 21 + }, + { + "epoch": 0.007128969539857421, + "grad_norm": 8.830659866333008, + "learning_rate": 3.7769784172661875e-07, + "loss": 0.5987, + "step": 22 + }, + { + "epoch": 0.007453013609850939, + "grad_norm": 6.82891321182251, + "learning_rate": 3.956834532374101e-07, + "loss": 0.5337, + "step": 23 + }, + { + "epoch": 0.007777057679844459, + "grad_norm": 6.408069133758545, + "learning_rate": 4.136690647482015e-07, + "loss": 0.5231, + "step": 24 + }, + { + "epoch": 0.008101101749837978, + "grad_norm": 6.321592807769775, + "learning_rate": 4.3165467625899287e-07, + "loss": 0.5346, + "step": 25 + }, + { + "epoch": 0.008425145819831496, + "grad_norm": 5.974789619445801, + "learning_rate": 4.496402877697842e-07, + "loss": 0.5239, + "step": 26 + }, + { + "epoch": 0.008749189889825017, + "grad_norm": 6.678211688995361, + "learning_rate": 4.676258992805756e-07, + "loss": 0.5051, + "step": 27 + }, + { + "epoch": 0.009073233959818535, + "grad_norm": 6.219130992889404, + "learning_rate": 4.85611510791367e-07, + "loss": 0.5005, + "step": 28 + }, + { + "epoch": 0.009397278029812054, + "grad_norm": 6.158231735229492, + "learning_rate": 5.035971223021583e-07, + "loss": 0.492, + "step": 29 + }, + { + "epoch": 0.009721322099805573, + "grad_norm": 5.8069071769714355, + "learning_rate": 5.215827338129497e-07, + "loss": 0.4757, + "step": 30 + }, + { + "epoch": 0.010045366169799093, + "grad_norm": 6.29368257522583, + "learning_rate": 5.39568345323741e-07, + "loss": 0.3917, + "step": 31 + }, + { + "epoch": 0.010369410239792612, + "grad_norm": 4.626503944396973, + "learning_rate": 5.575539568345325e-07, + "loss": 0.3609, + "step": 32 + }, + { + "epoch": 0.01069345430978613, + "grad_norm": 4.250292778015137, + "learning_rate": 5.755395683453238e-07, + "loss": 0.3723, + "step": 33 + }, + { + "epoch": 0.01101749837977965, + "grad_norm": 4.216755390167236, + "learning_rate": 5.935251798561151e-07, + "loss": 0.3696, + "step": 34 + }, + { + "epoch": 0.01134154244977317, + "grad_norm": 3.7793445587158203, + "learning_rate": 6.115107913669066e-07, + "loss": 0.3648, + "step": 35 + }, + { + "epoch": 0.011665586519766688, + "grad_norm": 3.3505563735961914, + "learning_rate": 6.294964028776979e-07, + "loss": 0.343, + "step": 36 + }, + { + "epoch": 0.011989630589760207, + "grad_norm": 2.7636797428131104, + "learning_rate": 6.474820143884893e-07, + "loss": 0.3059, + "step": 37 + }, + { + "epoch": 0.012313674659753726, + "grad_norm": 2.806562662124634, + "learning_rate": 6.654676258992807e-07, + "loss": 0.3271, + "step": 38 + }, + { + "epoch": 0.012637718729747246, + "grad_norm": 2.9375364780426025, + "learning_rate": 6.83453237410072e-07, + "loss": 0.3347, + "step": 39 + }, + { + "epoch": 0.012961762799740765, + "grad_norm": 2.538407325744629, + "learning_rate": 7.014388489208633e-07, + "loss": 0.3039, + "step": 40 + }, + { + "epoch": 0.013285806869734284, + "grad_norm": 2.7263214588165283, + "learning_rate": 7.194244604316547e-07, + "loss": 0.3153, + "step": 41 + }, + { + "epoch": 0.013609850939727802, + "grad_norm": 2.802217721939087, + "learning_rate": 7.37410071942446e-07, + "loss": 0.3141, + "step": 42 + }, + { + "epoch": 0.013933895009721323, + "grad_norm": 2.8862125873565674, + "learning_rate": 7.553956834532375e-07, + "loss": 0.2925, + "step": 43 + }, + { + "epoch": 0.014257939079714841, + "grad_norm": 3.05582594871521, + "learning_rate": 7.733812949640289e-07, + "loss": 0.3012, + "step": 44 + }, + { + "epoch": 0.01458198314970836, + "grad_norm": 3.127528667449951, + "learning_rate": 7.913669064748202e-07, + "loss": 0.3073, + "step": 45 + }, + { + "epoch": 0.014906027219701879, + "grad_norm": 2.8370361328125, + "learning_rate": 8.093525179856115e-07, + "loss": 0.29, + "step": 46 + }, + { + "epoch": 0.0152300712896954, + "grad_norm": 2.813434362411499, + "learning_rate": 8.27338129496403e-07, + "loss": 0.2866, + "step": 47 + }, + { + "epoch": 0.015554115359688918, + "grad_norm": 2.4069221019744873, + "learning_rate": 8.453237410071943e-07, + "loss": 0.2856, + "step": 48 + }, + { + "epoch": 0.015878159429682438, + "grad_norm": 2.02687406539917, + "learning_rate": 8.633093525179857e-07, + "loss": 0.2856, + "step": 49 + }, + { + "epoch": 0.016202203499675955, + "grad_norm": 2.191495418548584, + "learning_rate": 8.81294964028777e-07, + "loss": 0.2905, + "step": 50 + }, + { + "epoch": 0.016526247569669476, + "grad_norm": 2.063936233520508, + "learning_rate": 8.992805755395684e-07, + "loss": 0.2556, + "step": 51 + }, + { + "epoch": 0.016850291639662993, + "grad_norm": 2.148212194442749, + "learning_rate": 9.172661870503598e-07, + "loss": 0.2684, + "step": 52 + }, + { + "epoch": 0.017174335709656513, + "grad_norm": 2.0932915210723877, + "learning_rate": 9.352517985611512e-07, + "loss": 0.2865, + "step": 53 + }, + { + "epoch": 0.017498379779650033, + "grad_norm": 2.168978452682495, + "learning_rate": 9.532374100719425e-07, + "loss": 0.2919, + "step": 54 + }, + { + "epoch": 0.01782242384964355, + "grad_norm": 1.9223214387893677, + "learning_rate": 9.71223021582734e-07, + "loss": 0.2609, + "step": 55 + }, + { + "epoch": 0.01814646791963707, + "grad_norm": 1.9711604118347168, + "learning_rate": 9.892086330935252e-07, + "loss": 0.2645, + "step": 56 + }, + { + "epoch": 0.01847051198963059, + "grad_norm": 2.067918062210083, + "learning_rate": 1.0071942446043167e-06, + "loss": 0.2945, + "step": 57 + }, + { + "epoch": 0.018794556059624108, + "grad_norm": 1.9994969367980957, + "learning_rate": 1.025179856115108e-06, + "loss": 0.2597, + "step": 58 + }, + { + "epoch": 0.01911860012961763, + "grad_norm": 1.9406379461288452, + "learning_rate": 1.0431654676258993e-06, + "loss": 0.2505, + "step": 59 + }, + { + "epoch": 0.019442644199611146, + "grad_norm": 1.9860442876815796, + "learning_rate": 1.0611510791366908e-06, + "loss": 0.2872, + "step": 60 + }, + { + "epoch": 0.019766688269604666, + "grad_norm": 2.0322153568267822, + "learning_rate": 1.079136690647482e-06, + "loss": 0.2602, + "step": 61 + }, + { + "epoch": 0.020090732339598186, + "grad_norm": 1.8435291051864624, + "learning_rate": 1.0971223021582735e-06, + "loss": 0.248, + "step": 62 + }, + { + "epoch": 0.020414776409591703, + "grad_norm": 1.874155879020691, + "learning_rate": 1.115107913669065e-06, + "loss": 0.2502, + "step": 63 + }, + { + "epoch": 0.020738820479585224, + "grad_norm": 1.779914140701294, + "learning_rate": 1.1330935251798561e-06, + "loss": 0.2317, + "step": 64 + }, + { + "epoch": 0.021062864549578744, + "grad_norm": 2.0697224140167236, + "learning_rate": 1.1510791366906476e-06, + "loss": 0.2508, + "step": 65 + }, + { + "epoch": 0.02138690861957226, + "grad_norm": 1.8721024990081787, + "learning_rate": 1.1690647482014388e-06, + "loss": 0.2524, + "step": 66 + }, + { + "epoch": 0.02171095268956578, + "grad_norm": 1.8915222883224487, + "learning_rate": 1.1870503597122303e-06, + "loss": 0.2572, + "step": 67 + }, + { + "epoch": 0.0220349967595593, + "grad_norm": 1.9579988718032837, + "learning_rate": 1.2050359712230217e-06, + "loss": 0.233, + "step": 68 + }, + { + "epoch": 0.02235904082955282, + "grad_norm": 1.8511942625045776, + "learning_rate": 1.2230215827338131e-06, + "loss": 0.2311, + "step": 69 + }, + { + "epoch": 0.02268308489954634, + "grad_norm": 1.8020142316818237, + "learning_rate": 1.2410071942446044e-06, + "loss": 0.2366, + "step": 70 + }, + { + "epoch": 0.023007128969539856, + "grad_norm": 1.8556476831436157, + "learning_rate": 1.2589928057553958e-06, + "loss": 0.2255, + "step": 71 + }, + { + "epoch": 0.023331173039533377, + "grad_norm": 1.8471894264221191, + "learning_rate": 1.2769784172661873e-06, + "loss": 0.2414, + "step": 72 + }, + { + "epoch": 0.023655217109526897, + "grad_norm": 1.9478641748428345, + "learning_rate": 1.2949640287769785e-06, + "loss": 0.2471, + "step": 73 + }, + { + "epoch": 0.023979261179520414, + "grad_norm": 1.799368143081665, + "learning_rate": 1.3129496402877697e-06, + "loss": 0.2506, + "step": 74 + }, + { + "epoch": 0.024303305249513935, + "grad_norm": 1.9307256937026978, + "learning_rate": 1.3309352517985614e-06, + "loss": 0.2624, + "step": 75 + }, + { + "epoch": 0.02462734931950745, + "grad_norm": 1.8890374898910522, + "learning_rate": 1.3489208633093526e-06, + "loss": 0.2626, + "step": 76 + }, + { + "epoch": 0.024951393389500972, + "grad_norm": 1.7502024173736572, + "learning_rate": 1.366906474820144e-06, + "loss": 0.2349, + "step": 77 + }, + { + "epoch": 0.025275437459494492, + "grad_norm": 1.7886251211166382, + "learning_rate": 1.3848920863309353e-06, + "loss": 0.249, + "step": 78 + }, + { + "epoch": 0.02559948152948801, + "grad_norm": 1.7713961601257324, + "learning_rate": 1.4028776978417265e-06, + "loss": 0.2432, + "step": 79 + }, + { + "epoch": 0.02592352559948153, + "grad_norm": 1.7449482679367065, + "learning_rate": 1.4208633093525182e-06, + "loss": 0.2425, + "step": 80 + }, + { + "epoch": 0.02624756966947505, + "grad_norm": 1.9202888011932373, + "learning_rate": 1.4388489208633094e-06, + "loss": 0.2579, + "step": 81 + }, + { + "epoch": 0.026571613739468567, + "grad_norm": 1.7376736402511597, + "learning_rate": 1.4568345323741009e-06, + "loss": 0.2077, + "step": 82 + }, + { + "epoch": 0.026895657809462088, + "grad_norm": 1.8440021276474, + "learning_rate": 1.474820143884892e-06, + "loss": 0.2394, + "step": 83 + }, + { + "epoch": 0.027219701879455604, + "grad_norm": 1.7778856754302979, + "learning_rate": 1.4928057553956835e-06, + "loss": 0.2037, + "step": 84 + }, + { + "epoch": 0.027543745949449125, + "grad_norm": 1.9841382503509521, + "learning_rate": 1.510791366906475e-06, + "loss": 0.2461, + "step": 85 + }, + { + "epoch": 0.027867790019442645, + "grad_norm": 1.8014554977416992, + "learning_rate": 1.5287769784172662e-06, + "loss": 0.2258, + "step": 86 + }, + { + "epoch": 0.028191834089436162, + "grad_norm": 1.8475525379180908, + "learning_rate": 1.5467625899280579e-06, + "loss": 0.2142, + "step": 87 + }, + { + "epoch": 0.028515878159429683, + "grad_norm": 1.8251590728759766, + "learning_rate": 1.5647482014388491e-06, + "loss": 0.2222, + "step": 88 + }, + { + "epoch": 0.028839922229423203, + "grad_norm": 1.7561265230178833, + "learning_rate": 1.5827338129496403e-06, + "loss": 0.2283, + "step": 89 + }, + { + "epoch": 0.02916396629941672, + "grad_norm": 1.9121075868606567, + "learning_rate": 1.6007194244604318e-06, + "loss": 0.2415, + "step": 90 + }, + { + "epoch": 0.02948801036941024, + "grad_norm": 1.8700357675552368, + "learning_rate": 1.618705035971223e-06, + "loss": 0.2329, + "step": 91 + }, + { + "epoch": 0.029812054439403757, + "grad_norm": 1.8043524026870728, + "learning_rate": 1.6366906474820147e-06, + "loss": 0.2327, + "step": 92 + }, + { + "epoch": 0.030136098509397278, + "grad_norm": 1.8841935396194458, + "learning_rate": 1.654676258992806e-06, + "loss": 0.2269, + "step": 93 + }, + { + "epoch": 0.0304601425793908, + "grad_norm": 1.8864487409591675, + "learning_rate": 1.6726618705035971e-06, + "loss": 0.2223, + "step": 94 + }, + { + "epoch": 0.030784186649384315, + "grad_norm": 1.8025115728378296, + "learning_rate": 1.6906474820143886e-06, + "loss": 0.2257, + "step": 95 + }, + { + "epoch": 0.031108230719377836, + "grad_norm": 1.8227530717849731, + "learning_rate": 1.7086330935251798e-06, + "loss": 0.2419, + "step": 96 + }, + { + "epoch": 0.031432274789371356, + "grad_norm": 1.745831847190857, + "learning_rate": 1.7266187050359715e-06, + "loss": 0.2227, + "step": 97 + }, + { + "epoch": 0.031756318859364877, + "grad_norm": 1.9700807332992554, + "learning_rate": 1.7446043165467627e-06, + "loss": 0.2159, + "step": 98 + }, + { + "epoch": 0.03208036292935839, + "grad_norm": 1.744411826133728, + "learning_rate": 1.762589928057554e-06, + "loss": 0.2259, + "step": 99 + }, + { + "epoch": 0.03240440699935191, + "grad_norm": 1.7854716777801514, + "learning_rate": 1.7805755395683456e-06, + "loss": 0.2127, + "step": 100 + }, + { + "epoch": 0.03272845106934543, + "grad_norm": 1.7840704917907715, + "learning_rate": 1.7985611510791368e-06, + "loss": 0.2399, + "step": 101 + }, + { + "epoch": 0.03305249513933895, + "grad_norm": 1.8360185623168945, + "learning_rate": 1.8165467625899283e-06, + "loss": 0.2069, + "step": 102 + }, + { + "epoch": 0.03337653920933247, + "grad_norm": 1.716846227645874, + "learning_rate": 1.8345323741007195e-06, + "loss": 0.2235, + "step": 103 + }, + { + "epoch": 0.033700583279325985, + "grad_norm": 1.7586473226547241, + "learning_rate": 1.8525179856115107e-06, + "loss": 0.1966, + "step": 104 + }, + { + "epoch": 0.034024627349319506, + "grad_norm": 1.776848316192627, + "learning_rate": 1.8705035971223024e-06, + "loss": 0.2095, + "step": 105 + }, + { + "epoch": 0.034348671419313026, + "grad_norm": 2.3353116512298584, + "learning_rate": 1.8884892086330936e-06, + "loss": 0.2112, + "step": 106 + }, + { + "epoch": 0.034672715489306546, + "grad_norm": 1.820418357849121, + "learning_rate": 1.906474820143885e-06, + "loss": 0.2251, + "step": 107 + }, + { + "epoch": 0.03499675955930007, + "grad_norm": 1.8014744520187378, + "learning_rate": 1.9244604316546765e-06, + "loss": 0.2182, + "step": 108 + }, + { + "epoch": 0.03532080362929359, + "grad_norm": 1.767818570137024, + "learning_rate": 1.942446043165468e-06, + "loss": 0.207, + "step": 109 + }, + { + "epoch": 0.0356448476992871, + "grad_norm": 1.7806322574615479, + "learning_rate": 1.960431654676259e-06, + "loss": 0.2138, + "step": 110 + }, + { + "epoch": 0.03596889176928062, + "grad_norm": 1.7334893941879272, + "learning_rate": 1.9784172661870504e-06, + "loss": 0.2135, + "step": 111 + }, + { + "epoch": 0.03629293583927414, + "grad_norm": 1.7799729108810425, + "learning_rate": 1.996402877697842e-06, + "loss": 0.2122, + "step": 112 + }, + { + "epoch": 0.03661697990926766, + "grad_norm": 1.7285608053207397, + "learning_rate": 2.0143884892086333e-06, + "loss": 0.2204, + "step": 113 + }, + { + "epoch": 0.03694102397926118, + "grad_norm": 1.7246173620224, + "learning_rate": 2.0323741007194248e-06, + "loss": 0.2261, + "step": 114 + }, + { + "epoch": 0.037265068049254696, + "grad_norm": 1.7286440134048462, + "learning_rate": 2.050359712230216e-06, + "loss": 0.2343, + "step": 115 + }, + { + "epoch": 0.037589112119248216, + "grad_norm": 1.736197829246521, + "learning_rate": 2.0683453237410072e-06, + "loss": 0.2161, + "step": 116 + }, + { + "epoch": 0.03791315618924174, + "grad_norm": 1.6097439527511597, + "learning_rate": 2.0863309352517987e-06, + "loss": 0.2004, + "step": 117 + }, + { + "epoch": 0.03823720025923526, + "grad_norm": 1.6844645738601685, + "learning_rate": 2.10431654676259e-06, + "loss": 0.1974, + "step": 118 + }, + { + "epoch": 0.03856124432922878, + "grad_norm": 1.8262434005737305, + "learning_rate": 2.1223021582733816e-06, + "loss": 0.2268, + "step": 119 + }, + { + "epoch": 0.03888528839922229, + "grad_norm": 2.0028789043426514, + "learning_rate": 2.140287769784173e-06, + "loss": 0.2118, + "step": 120 + }, + { + "epoch": 0.03920933246921581, + "grad_norm": 1.803137183189392, + "learning_rate": 2.158273381294964e-06, + "loss": 0.212, + "step": 121 + }, + { + "epoch": 0.03953337653920933, + "grad_norm": 1.7705177068710327, + "learning_rate": 2.1762589928057555e-06, + "loss": 0.2231, + "step": 122 + }, + { + "epoch": 0.03985742060920285, + "grad_norm": 1.6794323921203613, + "learning_rate": 2.194244604316547e-06, + "loss": 0.2138, + "step": 123 + }, + { + "epoch": 0.04018146467919637, + "grad_norm": 1.6739569902420044, + "learning_rate": 2.2122302158273384e-06, + "loss": 0.2289, + "step": 124 + }, + { + "epoch": 0.04050550874918989, + "grad_norm": 1.684012532234192, + "learning_rate": 2.23021582733813e-06, + "loss": 0.2059, + "step": 125 + }, + { + "epoch": 0.04082955281918341, + "grad_norm": 1.6604381799697876, + "learning_rate": 2.248201438848921e-06, + "loss": 0.211, + "step": 126 + }, + { + "epoch": 0.04115359688917693, + "grad_norm": 1.837553858757019, + "learning_rate": 2.2661870503597123e-06, + "loss": 0.2355, + "step": 127 + }, + { + "epoch": 0.04147764095917045, + "grad_norm": 1.6966252326965332, + "learning_rate": 2.2841726618705037e-06, + "loss": 0.1959, + "step": 128 + }, + { + "epoch": 0.04180168502916397, + "grad_norm": 2.023315906524658, + "learning_rate": 2.302158273381295e-06, + "loss": 0.2232, + "step": 129 + }, + { + "epoch": 0.04212572909915749, + "grad_norm": 1.8194133043289185, + "learning_rate": 2.3201438848920866e-06, + "loss": 0.2147, + "step": 130 + }, + { + "epoch": 0.042449773169151, + "grad_norm": 1.71575927734375, + "learning_rate": 2.3381294964028776e-06, + "loss": 0.2104, + "step": 131 + }, + { + "epoch": 0.04277381723914452, + "grad_norm": 1.6668249368667603, + "learning_rate": 2.3561151079136695e-06, + "loss": 0.2085, + "step": 132 + }, + { + "epoch": 0.04309786130913804, + "grad_norm": 1.722672700881958, + "learning_rate": 2.3741007194244605e-06, + "loss": 0.1993, + "step": 133 + }, + { + "epoch": 0.04342190537913156, + "grad_norm": 1.736364722251892, + "learning_rate": 2.392086330935252e-06, + "loss": 0.2156, + "step": 134 + }, + { + "epoch": 0.043745949449125084, + "grad_norm": 1.9815757274627686, + "learning_rate": 2.4100719424460434e-06, + "loss": 0.2081, + "step": 135 + }, + { + "epoch": 0.0440699935191186, + "grad_norm": 1.8326865434646606, + "learning_rate": 2.4280575539568344e-06, + "loss": 0.2107, + "step": 136 + }, + { + "epoch": 0.04439403758911212, + "grad_norm": 2.0552074909210205, + "learning_rate": 2.4460431654676263e-06, + "loss": 0.2275, + "step": 137 + }, + { + "epoch": 0.04471808165910564, + "grad_norm": 1.8263514041900635, + "learning_rate": 2.4640287769784173e-06, + "loss": 0.1993, + "step": 138 + }, + { + "epoch": 0.04504212572909916, + "grad_norm": 1.835706353187561, + "learning_rate": 2.4820143884892088e-06, + "loss": 0.2067, + "step": 139 + }, + { + "epoch": 0.04536616979909268, + "grad_norm": 1.6910218000411987, + "learning_rate": 2.5e-06, + "loss": 0.2078, + "step": 140 + }, + { + "epoch": 0.04569021386908619, + "grad_norm": 1.7438052892684937, + "learning_rate": 2.5179856115107916e-06, + "loss": 0.2288, + "step": 141 + }, + { + "epoch": 0.04601425793907971, + "grad_norm": 1.739857792854309, + "learning_rate": 2.5359712230215827e-06, + "loss": 0.237, + "step": 142 + }, + { + "epoch": 0.04633830200907323, + "grad_norm": 1.7011078596115112, + "learning_rate": 2.5539568345323745e-06, + "loss": 0.2159, + "step": 143 + }, + { + "epoch": 0.046662346079066754, + "grad_norm": 1.6127219200134277, + "learning_rate": 2.571942446043166e-06, + "loss": 0.2079, + "step": 144 + }, + { + "epoch": 0.046986390149060274, + "grad_norm": 1.6532703638076782, + "learning_rate": 2.589928057553957e-06, + "loss": 0.2019, + "step": 145 + }, + { + "epoch": 0.047310434219053794, + "grad_norm": 1.6992716789245605, + "learning_rate": 2.6079136690647484e-06, + "loss": 0.1957, + "step": 146 + }, + { + "epoch": 0.04763447828904731, + "grad_norm": 2.0122122764587402, + "learning_rate": 2.6258992805755395e-06, + "loss": 0.2147, + "step": 147 + }, + { + "epoch": 0.04795852235904083, + "grad_norm": 1.7450064420700073, + "learning_rate": 2.6438848920863313e-06, + "loss": 0.2197, + "step": 148 + }, + { + "epoch": 0.04828256642903435, + "grad_norm": 1.612120270729065, + "learning_rate": 2.6618705035971228e-06, + "loss": 0.1964, + "step": 149 + }, + { + "epoch": 0.04860661049902787, + "grad_norm": 1.8553955554962158, + "learning_rate": 2.679856115107914e-06, + "loss": 0.1978, + "step": 150 + }, + { + "epoch": 0.04893065456902139, + "grad_norm": 1.8830714225769043, + "learning_rate": 2.6978417266187052e-06, + "loss": 0.2102, + "step": 151 + }, + { + "epoch": 0.0492546986390149, + "grad_norm": 1.759135127067566, + "learning_rate": 2.7158273381294963e-06, + "loss": 0.2221, + "step": 152 + }, + { + "epoch": 0.04957874270900842, + "grad_norm": 1.7659202814102173, + "learning_rate": 2.733812949640288e-06, + "loss": 0.2033, + "step": 153 + }, + { + "epoch": 0.049902786779001944, + "grad_norm": 1.6272228956222534, + "learning_rate": 2.7517985611510796e-06, + "loss": 0.2066, + "step": 154 + }, + { + "epoch": 0.050226830848995464, + "grad_norm": 1.6313395500183105, + "learning_rate": 2.7697841726618706e-06, + "loss": 0.2106, + "step": 155 + }, + { + "epoch": 0.050550874918988985, + "grad_norm": 2.1572105884552, + "learning_rate": 2.787769784172662e-06, + "loss": 0.2298, + "step": 156 + }, + { + "epoch": 0.0508749189889825, + "grad_norm": 1.8465955257415771, + "learning_rate": 2.805755395683453e-06, + "loss": 0.2172, + "step": 157 + }, + { + "epoch": 0.05119896305897602, + "grad_norm": 1.7251900434494019, + "learning_rate": 2.823741007194245e-06, + "loss": 0.2014, + "step": 158 + }, + { + "epoch": 0.05152300712896954, + "grad_norm": 1.6110676527023315, + "learning_rate": 2.8417266187050364e-06, + "loss": 0.2063, + "step": 159 + }, + { + "epoch": 0.05184705119896306, + "grad_norm": 1.714495301246643, + "learning_rate": 2.8597122302158274e-06, + "loss": 0.2074, + "step": 160 + }, + { + "epoch": 0.05217109526895658, + "grad_norm": 1.6673401594161987, + "learning_rate": 2.877697841726619e-06, + "loss": 0.201, + "step": 161 + }, + { + "epoch": 0.0524951393389501, + "grad_norm": 1.5677164793014526, + "learning_rate": 2.89568345323741e-06, + "loss": 0.2016, + "step": 162 + }, + { + "epoch": 0.052819183408943614, + "grad_norm": 1.8159863948822021, + "learning_rate": 2.9136690647482017e-06, + "loss": 0.2317, + "step": 163 + }, + { + "epoch": 0.053143227478937134, + "grad_norm": 1.7845139503479004, + "learning_rate": 2.931654676258993e-06, + "loss": 0.219, + "step": 164 + }, + { + "epoch": 0.053467271548930655, + "grad_norm": 1.5887399911880493, + "learning_rate": 2.949640287769784e-06, + "loss": 0.2021, + "step": 165 + }, + { + "epoch": 0.053791315618924175, + "grad_norm": 1.6343255043029785, + "learning_rate": 2.9676258992805756e-06, + "loss": 0.201, + "step": 166 + }, + { + "epoch": 0.054115359688917695, + "grad_norm": 1.9027572870254517, + "learning_rate": 2.985611510791367e-06, + "loss": 0.2172, + "step": 167 + }, + { + "epoch": 0.05443940375891121, + "grad_norm": 1.7987514734268188, + "learning_rate": 3.0035971223021585e-06, + "loss": 0.2225, + "step": 168 + }, + { + "epoch": 0.05476344782890473, + "grad_norm": 1.7745798826217651, + "learning_rate": 3.02158273381295e-06, + "loss": 0.2089, + "step": 169 + }, + { + "epoch": 0.05508749189889825, + "grad_norm": 1.7217110395431519, + "learning_rate": 3.0395683453237414e-06, + "loss": 0.2015, + "step": 170 + }, + { + "epoch": 0.05541153596889177, + "grad_norm": 1.7840914726257324, + "learning_rate": 3.0575539568345324e-06, + "loss": 0.2309, + "step": 171 + }, + { + "epoch": 0.05573558003888529, + "grad_norm": 1.6944200992584229, + "learning_rate": 3.075539568345324e-06, + "loss": 0.2101, + "step": 172 + }, + { + "epoch": 0.056059624108878804, + "grad_norm": 1.6712582111358643, + "learning_rate": 3.0935251798561158e-06, + "loss": 0.2138, + "step": 173 + }, + { + "epoch": 0.056383668178872325, + "grad_norm": 1.7150169610977173, + "learning_rate": 3.1115107913669068e-06, + "loss": 0.2295, + "step": 174 + }, + { + "epoch": 0.056707712248865845, + "grad_norm": 1.8611018657684326, + "learning_rate": 3.1294964028776982e-06, + "loss": 0.2064, + "step": 175 + }, + { + "epoch": 0.057031756318859365, + "grad_norm": 1.6113686561584473, + "learning_rate": 3.1474820143884892e-06, + "loss": 0.2012, + "step": 176 + }, + { + "epoch": 0.057355800388852886, + "grad_norm": 2.0248565673828125, + "learning_rate": 3.1654676258992807e-06, + "loss": 0.2188, + "step": 177 + }, + { + "epoch": 0.057679844458846406, + "grad_norm": 1.7024465799331665, + "learning_rate": 3.1834532374100726e-06, + "loss": 0.1992, + "step": 178 + }, + { + "epoch": 0.05800388852883992, + "grad_norm": 1.689802885055542, + "learning_rate": 3.2014388489208636e-06, + "loss": 0.2075, + "step": 179 + }, + { + "epoch": 0.05832793259883344, + "grad_norm": 1.6592146158218384, + "learning_rate": 3.219424460431655e-06, + "loss": 0.2079, + "step": 180 + }, + { + "epoch": 0.05865197666882696, + "grad_norm": 1.5495208501815796, + "learning_rate": 3.237410071942446e-06, + "loss": 0.178, + "step": 181 + }, + { + "epoch": 0.05897602073882048, + "grad_norm": 1.6852517127990723, + "learning_rate": 3.2553956834532375e-06, + "loss": 0.213, + "step": 182 + }, + { + "epoch": 0.059300064808814, + "grad_norm": 1.772308349609375, + "learning_rate": 3.2733812949640294e-06, + "loss": 0.2156, + "step": 183 + }, + { + "epoch": 0.059624108878807515, + "grad_norm": 1.5808868408203125, + "learning_rate": 3.2913669064748204e-06, + "loss": 0.2204, + "step": 184 + }, + { + "epoch": 0.059948152948801035, + "grad_norm": 1.6497232913970947, + "learning_rate": 3.309352517985612e-06, + "loss": 0.1995, + "step": 185 + }, + { + "epoch": 0.060272197018794556, + "grad_norm": 1.7812473773956299, + "learning_rate": 3.327338129496403e-06, + "loss": 0.2339, + "step": 186 + }, + { + "epoch": 0.060596241088788076, + "grad_norm": 1.710668921470642, + "learning_rate": 3.3453237410071943e-06, + "loss": 0.2096, + "step": 187 + }, + { + "epoch": 0.0609202851587816, + "grad_norm": 1.5381975173950195, + "learning_rate": 3.363309352517986e-06, + "loss": 0.1935, + "step": 188 + }, + { + "epoch": 0.06124432922877511, + "grad_norm": 1.5938481092453003, + "learning_rate": 3.381294964028777e-06, + "loss": 0.1973, + "step": 189 + }, + { + "epoch": 0.06156837329876863, + "grad_norm": 1.6954132318496704, + "learning_rate": 3.3992805755395686e-06, + "loss": 0.21, + "step": 190 + }, + { + "epoch": 0.06189241736876215, + "grad_norm": 1.6907073259353638, + "learning_rate": 3.4172661870503596e-06, + "loss": 0.1942, + "step": 191 + }, + { + "epoch": 0.06221646143875567, + "grad_norm": 1.7327340841293335, + "learning_rate": 3.435251798561151e-06, + "loss": 0.2102, + "step": 192 + }, + { + "epoch": 0.06254050550874919, + "grad_norm": 1.7439619302749634, + "learning_rate": 3.453237410071943e-06, + "loss": 0.2111, + "step": 193 + }, + { + "epoch": 0.06286454957874271, + "grad_norm": 1.8764787912368774, + "learning_rate": 3.471223021582734e-06, + "loss": 0.2021, + "step": 194 + }, + { + "epoch": 0.06318859364873623, + "grad_norm": 1.610394835472107, + "learning_rate": 3.4892086330935254e-06, + "loss": 0.1965, + "step": 195 + }, + { + "epoch": 0.06351263771872975, + "grad_norm": 1.7442972660064697, + "learning_rate": 3.507194244604317e-06, + "loss": 0.214, + "step": 196 + }, + { + "epoch": 0.06383668178872326, + "grad_norm": 1.5908910036087036, + "learning_rate": 3.525179856115108e-06, + "loss": 0.2101, + "step": 197 + }, + { + "epoch": 0.06416072585871678, + "grad_norm": 1.7243609428405762, + "learning_rate": 3.5431654676258998e-06, + "loss": 0.229, + "step": 198 + }, + { + "epoch": 0.0644847699287103, + "grad_norm": 1.6044604778289795, + "learning_rate": 3.561151079136691e-06, + "loss": 0.2052, + "step": 199 + }, + { + "epoch": 0.06480881399870382, + "grad_norm": 1.5355489253997803, + "learning_rate": 3.5791366906474822e-06, + "loss": 0.2168, + "step": 200 + }, + { + "epoch": 0.06513285806869734, + "grad_norm": 1.5240367650985718, + "learning_rate": 3.5971223021582737e-06, + "loss": 0.1988, + "step": 201 + }, + { + "epoch": 0.06545690213869086, + "grad_norm": 1.5550951957702637, + "learning_rate": 3.6151079136690647e-06, + "loss": 0.1931, + "step": 202 + }, + { + "epoch": 0.06578094620868438, + "grad_norm": 1.6683835983276367, + "learning_rate": 3.6330935251798566e-06, + "loss": 0.2151, + "step": 203 + }, + { + "epoch": 0.0661049902786779, + "grad_norm": 1.51242995262146, + "learning_rate": 3.651079136690648e-06, + "loss": 0.1894, + "step": 204 + }, + { + "epoch": 0.06642903434867142, + "grad_norm": 1.6069519519805908, + "learning_rate": 3.669064748201439e-06, + "loss": 0.2019, + "step": 205 + }, + { + "epoch": 0.06675307841866494, + "grad_norm": 1.5711833238601685, + "learning_rate": 3.6870503597122305e-06, + "loss": 0.1927, + "step": 206 + }, + { + "epoch": 0.06707712248865846, + "grad_norm": 1.57998526096344, + "learning_rate": 3.7050359712230215e-06, + "loss": 0.2146, + "step": 207 + }, + { + "epoch": 0.06740116655865197, + "grad_norm": 2.213890552520752, + "learning_rate": 3.7230215827338134e-06, + "loss": 0.2017, + "step": 208 + }, + { + "epoch": 0.06772521062864549, + "grad_norm": 2.3263840675354004, + "learning_rate": 3.741007194244605e-06, + "loss": 0.1953, + "step": 209 + }, + { + "epoch": 0.06804925469863901, + "grad_norm": 1.5306557416915894, + "learning_rate": 3.758992805755396e-06, + "loss": 0.2037, + "step": 210 + }, + { + "epoch": 0.06837329876863253, + "grad_norm": 1.7475553750991821, + "learning_rate": 3.7769784172661873e-06, + "loss": 0.2144, + "step": 211 + }, + { + "epoch": 0.06869734283862605, + "grad_norm": 1.5679391622543335, + "learning_rate": 3.794964028776979e-06, + "loss": 0.1995, + "step": 212 + }, + { + "epoch": 0.06902138690861957, + "grad_norm": 1.5701528787612915, + "learning_rate": 3.81294964028777e-06, + "loss": 0.1902, + "step": 213 + }, + { + "epoch": 0.06934543097861309, + "grad_norm": 1.6837743520736694, + "learning_rate": 3.830935251798562e-06, + "loss": 0.1998, + "step": 214 + }, + { + "epoch": 0.06966947504860661, + "grad_norm": 1.5841065645217896, + "learning_rate": 3.848920863309353e-06, + "loss": 0.2024, + "step": 215 + }, + { + "epoch": 0.06999351911860013, + "grad_norm": 1.5467228889465332, + "learning_rate": 3.866906474820144e-06, + "loss": 0.1954, + "step": 216 + }, + { + "epoch": 0.07031756318859365, + "grad_norm": 1.5360314846038818, + "learning_rate": 3.884892086330936e-06, + "loss": 0.1948, + "step": 217 + }, + { + "epoch": 0.07064160725858717, + "grad_norm": 1.6404602527618408, + "learning_rate": 3.902877697841727e-06, + "loss": 0.2148, + "step": 218 + }, + { + "epoch": 0.07096565132858068, + "grad_norm": 1.636122465133667, + "learning_rate": 3.920863309352518e-06, + "loss": 0.2039, + "step": 219 + }, + { + "epoch": 0.0712896953985742, + "grad_norm": 1.574469804763794, + "learning_rate": 3.938848920863309e-06, + "loss": 0.2205, + "step": 220 + }, + { + "epoch": 0.07161373946856772, + "grad_norm": 1.6298938989639282, + "learning_rate": 3.956834532374101e-06, + "loss": 0.2021, + "step": 221 + }, + { + "epoch": 0.07193778353856124, + "grad_norm": 1.631216287612915, + "learning_rate": 3.974820143884892e-06, + "loss": 0.2062, + "step": 222 + }, + { + "epoch": 0.07226182760855476, + "grad_norm": 1.546505331993103, + "learning_rate": 3.992805755395684e-06, + "loss": 0.2061, + "step": 223 + }, + { + "epoch": 0.07258587167854828, + "grad_norm": 1.629062294960022, + "learning_rate": 4.010791366906475e-06, + "loss": 0.2172, + "step": 224 + }, + { + "epoch": 0.0729099157485418, + "grad_norm": 1.4732308387756348, + "learning_rate": 4.028776978417267e-06, + "loss": 0.212, + "step": 225 + }, + { + "epoch": 0.07323395981853532, + "grad_norm": 1.3658744096755981, + "learning_rate": 4.046762589928058e-06, + "loss": 0.1766, + "step": 226 + }, + { + "epoch": 0.07355800388852884, + "grad_norm": 1.543238639831543, + "learning_rate": 4.0647482014388495e-06, + "loss": 0.2065, + "step": 227 + }, + { + "epoch": 0.07388204795852236, + "grad_norm": 1.507412075996399, + "learning_rate": 4.082733812949641e-06, + "loss": 0.1977, + "step": 228 + }, + { + "epoch": 0.07420609202851587, + "grad_norm": 1.5816974639892578, + "learning_rate": 4.100719424460432e-06, + "loss": 0.2148, + "step": 229 + }, + { + "epoch": 0.07453013609850939, + "grad_norm": 1.497773289680481, + "learning_rate": 4.118705035971223e-06, + "loss": 0.2019, + "step": 230 + }, + { + "epoch": 0.07485418016850291, + "grad_norm": 1.5865081548690796, + "learning_rate": 4.1366906474820145e-06, + "loss": 0.2074, + "step": 231 + }, + { + "epoch": 0.07517822423849643, + "grad_norm": 1.6510618925094604, + "learning_rate": 4.154676258992807e-06, + "loss": 0.2251, + "step": 232 + }, + { + "epoch": 0.07550226830848995, + "grad_norm": 1.5500881671905518, + "learning_rate": 4.172661870503597e-06, + "loss": 0.2134, + "step": 233 + }, + { + "epoch": 0.07582631237848347, + "grad_norm": 1.786974549293518, + "learning_rate": 4.190647482014389e-06, + "loss": 0.2086, + "step": 234 + }, + { + "epoch": 0.076150356448477, + "grad_norm": 1.4506052732467651, + "learning_rate": 4.20863309352518e-06, + "loss": 0.2073, + "step": 235 + }, + { + "epoch": 0.07647440051847051, + "grad_norm": 1.7376664876937866, + "learning_rate": 4.226618705035972e-06, + "loss": 0.235, + "step": 236 + }, + { + "epoch": 0.07679844458846403, + "grad_norm": 1.600990891456604, + "learning_rate": 4.244604316546763e-06, + "loss": 0.2214, + "step": 237 + }, + { + "epoch": 0.07712248865845756, + "grad_norm": 1.6190606355667114, + "learning_rate": 4.2625899280575546e-06, + "loss": 0.2103, + "step": 238 + }, + { + "epoch": 0.07744653272845108, + "grad_norm": 1.4629621505737305, + "learning_rate": 4.280575539568346e-06, + "loss": 0.2059, + "step": 239 + }, + { + "epoch": 0.07777057679844458, + "grad_norm": 1.7007418870925903, + "learning_rate": 4.298561151079137e-06, + "loss": 0.2195, + "step": 240 + }, + { + "epoch": 0.0780946208684381, + "grad_norm": 1.6682326793670654, + "learning_rate": 4.316546762589928e-06, + "loss": 0.2291, + "step": 241 + }, + { + "epoch": 0.07841866493843162, + "grad_norm": 1.6797760725021362, + "learning_rate": 4.33453237410072e-06, + "loss": 0.2099, + "step": 242 + }, + { + "epoch": 0.07874270900842514, + "grad_norm": 1.5488982200622559, + "learning_rate": 4.352517985611511e-06, + "loss": 0.1945, + "step": 243 + }, + { + "epoch": 0.07906675307841866, + "grad_norm": 1.6398717164993286, + "learning_rate": 4.370503597122302e-06, + "loss": 0.2127, + "step": 244 + }, + { + "epoch": 0.07939079714841218, + "grad_norm": 1.6234924793243408, + "learning_rate": 4.388489208633094e-06, + "loss": 0.22, + "step": 245 + }, + { + "epoch": 0.0797148412184057, + "grad_norm": 1.6633092164993286, + "learning_rate": 4.406474820143885e-06, + "loss": 0.1997, + "step": 246 + }, + { + "epoch": 0.08003888528839923, + "grad_norm": 1.5225783586502075, + "learning_rate": 4.424460431654677e-06, + "loss": 0.2063, + "step": 247 + }, + { + "epoch": 0.08036292935839275, + "grad_norm": 1.5646318197250366, + "learning_rate": 4.442446043165468e-06, + "loss": 0.2013, + "step": 248 + }, + { + "epoch": 0.08068697342838627, + "grad_norm": 1.5946208238601685, + "learning_rate": 4.46043165467626e-06, + "loss": 0.1889, + "step": 249 + }, + { + "epoch": 0.08101101749837979, + "grad_norm": 1.4035966396331787, + "learning_rate": 4.478417266187051e-06, + "loss": 0.1801, + "step": 250 + }, + { + "epoch": 0.08133506156837329, + "grad_norm": 1.5441200733184814, + "learning_rate": 4.496402877697842e-06, + "loss": 0.2025, + "step": 251 + }, + { + "epoch": 0.08165910563836681, + "grad_norm": 1.6632438898086548, + "learning_rate": 4.514388489208634e-06, + "loss": 0.1919, + "step": 252 + }, + { + "epoch": 0.08198314970836033, + "grad_norm": 1.5009981393814087, + "learning_rate": 4.5323741007194245e-06, + "loss": 0.2122, + "step": 253 + }, + { + "epoch": 0.08230719377835385, + "grad_norm": 1.409143090248108, + "learning_rate": 4.550359712230216e-06, + "loss": 0.1963, + "step": 254 + }, + { + "epoch": 0.08263123784834737, + "grad_norm": 1.8114739656448364, + "learning_rate": 4.5683453237410074e-06, + "loss": 0.2144, + "step": 255 + }, + { + "epoch": 0.0829552819183409, + "grad_norm": 1.529091238975525, + "learning_rate": 4.586330935251799e-06, + "loss": 0.2014, + "step": 256 + }, + { + "epoch": 0.08327932598833442, + "grad_norm": 1.4776802062988281, + "learning_rate": 4.60431654676259e-06, + "loss": 0.1995, + "step": 257 + }, + { + "epoch": 0.08360337005832794, + "grad_norm": 1.6742963790893555, + "learning_rate": 4.622302158273382e-06, + "loss": 0.1949, + "step": 258 + }, + { + "epoch": 0.08392741412832146, + "grad_norm": 1.5035185813903809, + "learning_rate": 4.640287769784173e-06, + "loss": 0.2033, + "step": 259 + }, + { + "epoch": 0.08425145819831498, + "grad_norm": 1.47641122341156, + "learning_rate": 4.658273381294965e-06, + "loss": 0.2082, + "step": 260 + }, + { + "epoch": 0.08457550226830848, + "grad_norm": 1.4812216758728027, + "learning_rate": 4.676258992805755e-06, + "loss": 0.2137, + "step": 261 + }, + { + "epoch": 0.084899546338302, + "grad_norm": 1.5529314279556274, + "learning_rate": 4.6942446043165475e-06, + "loss": 0.2112, + "step": 262 + }, + { + "epoch": 0.08522359040829552, + "grad_norm": 1.3942975997924805, + "learning_rate": 4.712230215827339e-06, + "loss": 0.1954, + "step": 263 + }, + { + "epoch": 0.08554763447828904, + "grad_norm": 1.439681053161621, + "learning_rate": 4.73021582733813e-06, + "loss": 0.1926, + "step": 264 + }, + { + "epoch": 0.08587167854828257, + "grad_norm": 1.56781005859375, + "learning_rate": 4.748201438848921e-06, + "loss": 0.2144, + "step": 265 + }, + { + "epoch": 0.08619572261827609, + "grad_norm": 1.6256016492843628, + "learning_rate": 4.7661870503597125e-06, + "loss": 0.2052, + "step": 266 + }, + { + "epoch": 0.0865197666882696, + "grad_norm": 1.5104886293411255, + "learning_rate": 4.784172661870504e-06, + "loss": 0.2063, + "step": 267 + }, + { + "epoch": 0.08684381075826313, + "grad_norm": 1.588659405708313, + "learning_rate": 4.802158273381295e-06, + "loss": 0.1996, + "step": 268 + }, + { + "epoch": 0.08716785482825665, + "grad_norm": 1.7575246095657349, + "learning_rate": 4.820143884892087e-06, + "loss": 0.2145, + "step": 269 + }, + { + "epoch": 0.08749189889825017, + "grad_norm": 1.5401091575622559, + "learning_rate": 4.838129496402878e-06, + "loss": 0.2057, + "step": 270 + }, + { + "epoch": 0.08781594296824369, + "grad_norm": 1.5030951499938965, + "learning_rate": 4.856115107913669e-06, + "loss": 0.1922, + "step": 271 + }, + { + "epoch": 0.0881399870382372, + "grad_norm": 1.4104645252227783, + "learning_rate": 4.874100719424461e-06, + "loss": 0.201, + "step": 272 + }, + { + "epoch": 0.08846403110823071, + "grad_norm": 1.5870946645736694, + "learning_rate": 4.892086330935253e-06, + "loss": 0.209, + "step": 273 + }, + { + "epoch": 0.08878807517822424, + "grad_norm": 1.4110219478607178, + "learning_rate": 4.910071942446043e-06, + "loss": 0.2037, + "step": 274 + }, + { + "epoch": 0.08911211924821776, + "grad_norm": 1.5320945978164673, + "learning_rate": 4.928057553956835e-06, + "loss": 0.2145, + "step": 275 + }, + { + "epoch": 0.08943616331821128, + "grad_norm": 1.565433382987976, + "learning_rate": 4.946043165467626e-06, + "loss": 0.2311, + "step": 276 + }, + { + "epoch": 0.0897602073882048, + "grad_norm": 1.4688873291015625, + "learning_rate": 4.9640287769784175e-06, + "loss": 0.2169, + "step": 277 + }, + { + "epoch": 0.09008425145819832, + "grad_norm": 1.3488482236862183, + "learning_rate": 4.982014388489209e-06, + "loss": 0.1774, + "step": 278 + }, + { + "epoch": 0.09040829552819184, + "grad_norm": 1.523825764656067, + "learning_rate": 5e-06, + "loss": 0.1916, + "step": 279 + }, + { + "epoch": 0.09073233959818536, + "grad_norm": 1.4836606979370117, + "learning_rate": 4.999999847012101e-06, + "loss": 0.2061, + "step": 280 + }, + { + "epoch": 0.09105638366817888, + "grad_norm": 1.5318763256072998, + "learning_rate": 4.9999993880484235e-06, + "loss": 0.1889, + "step": 281 + }, + { + "epoch": 0.09138042773817238, + "grad_norm": 1.487181544303894, + "learning_rate": 4.999998623109022e-06, + "loss": 0.2027, + "step": 282 + }, + { + "epoch": 0.0917044718081659, + "grad_norm": 1.6822458505630493, + "learning_rate": 4.99999755219399e-06, + "loss": 0.2108, + "step": 283 + }, + { + "epoch": 0.09202851587815943, + "grad_norm": 1.552655816078186, + "learning_rate": 4.9999961753034595e-06, + "loss": 0.2097, + "step": 284 + }, + { + "epoch": 0.09235255994815295, + "grad_norm": 1.4975074529647827, + "learning_rate": 4.9999944924376e-06, + "loss": 0.1946, + "step": 285 + }, + { + "epoch": 0.09267660401814647, + "grad_norm": 1.4986190795898438, + "learning_rate": 4.999992503596616e-06, + "loss": 0.2013, + "step": 286 + }, + { + "epoch": 0.09300064808813999, + "grad_norm": 1.628769040107727, + "learning_rate": 4.999990208780751e-06, + "loss": 0.2232, + "step": 287 + }, + { + "epoch": 0.09332469215813351, + "grad_norm": 1.578641414642334, + "learning_rate": 4.999987607990287e-06, + "loss": 0.2099, + "step": 288 + }, + { + "epoch": 0.09364873622812703, + "grad_norm": 1.4884730577468872, + "learning_rate": 4.999984701225542e-06, + "loss": 0.2151, + "step": 289 + }, + { + "epoch": 0.09397278029812055, + "grad_norm": 1.5223939418792725, + "learning_rate": 4.9999814884868705e-06, + "loss": 0.2058, + "step": 290 + }, + { + "epoch": 0.09429682436811407, + "grad_norm": 1.3765919208526611, + "learning_rate": 4.999977969774666e-06, + "loss": 0.1892, + "step": 291 + }, + { + "epoch": 0.09462086843810759, + "grad_norm": 1.4410372972488403, + "learning_rate": 4.99997414508936e-06, + "loss": 0.2049, + "step": 292 + }, + { + "epoch": 0.0949449125081011, + "grad_norm": 1.507271409034729, + "learning_rate": 4.999970014431421e-06, + "loss": 0.199, + "step": 293 + }, + { + "epoch": 0.09526895657809462, + "grad_norm": 1.416298747062683, + "learning_rate": 4.999965577801354e-06, + "loss": 0.1996, + "step": 294 + }, + { + "epoch": 0.09559300064808814, + "grad_norm": 1.3537237644195557, + "learning_rate": 4.999960835199701e-06, + "loss": 0.1947, + "step": 295 + }, + { + "epoch": 0.09591704471808166, + "grad_norm": 1.4421045780181885, + "learning_rate": 4.999955786627042e-06, + "loss": 0.1857, + "step": 296 + }, + { + "epoch": 0.09624108878807518, + "grad_norm": 1.487221598625183, + "learning_rate": 4.999950432083998e-06, + "loss": 0.212, + "step": 297 + }, + { + "epoch": 0.0965651328580687, + "grad_norm": 1.4410208463668823, + "learning_rate": 4.999944771571222e-06, + "loss": 0.1949, + "step": 298 + }, + { + "epoch": 0.09688917692806222, + "grad_norm": 1.4714524745941162, + "learning_rate": 4.999938805089407e-06, + "loss": 0.2045, + "step": 299 + }, + { + "epoch": 0.09721322099805574, + "grad_norm": 1.4626051187515259, + "learning_rate": 4.999932532639285e-06, + "loss": 0.2116, + "step": 300 + }, + { + "epoch": 0.09753726506804926, + "grad_norm": 1.4867863655090332, + "learning_rate": 4.99992595422162e-06, + "loss": 0.2194, + "step": 301 + }, + { + "epoch": 0.09786130913804278, + "grad_norm": 1.3757691383361816, + "learning_rate": 4.9999190698372216e-06, + "loss": 0.1946, + "step": 302 + }, + { + "epoch": 0.0981853532080363, + "grad_norm": 1.4051896333694458, + "learning_rate": 4.9999118794869285e-06, + "loss": 0.2055, + "step": 303 + }, + { + "epoch": 0.0985093972780298, + "grad_norm": 1.3222601413726807, + "learning_rate": 4.999904383171623e-06, + "loss": 0.1899, + "step": 304 + }, + { + "epoch": 0.09883344134802333, + "grad_norm": 1.4336018562316895, + "learning_rate": 4.999896580892221e-06, + "loss": 0.2018, + "step": 305 + }, + { + "epoch": 0.09915748541801685, + "grad_norm": 1.3262324333190918, + "learning_rate": 4.99988847264968e-06, + "loss": 0.1848, + "step": 306 + }, + { + "epoch": 0.09948152948801037, + "grad_norm": 1.374320387840271, + "learning_rate": 4.99988005844499e-06, + "loss": 0.1948, + "step": 307 + }, + { + "epoch": 0.09980557355800389, + "grad_norm": 1.347687005996704, + "learning_rate": 4.999871338279181e-06, + "loss": 0.189, + "step": 308 + }, + { + "epoch": 0.10012961762799741, + "grad_norm": 1.3867069482803345, + "learning_rate": 4.999862312153322e-06, + "loss": 0.182, + "step": 309 + }, + { + "epoch": 0.10045366169799093, + "grad_norm": 1.3333046436309814, + "learning_rate": 4.999852980068516e-06, + "loss": 0.1862, + "step": 310 + }, + { + "epoch": 0.10077770576798445, + "grad_norm": 1.2988485097885132, + "learning_rate": 4.9998433420259055e-06, + "loss": 0.19, + "step": 311 + }, + { + "epoch": 0.10110174983797797, + "grad_norm": 1.490419864654541, + "learning_rate": 4.99983339802667e-06, + "loss": 0.2118, + "step": 312 + }, + { + "epoch": 0.10142579390797149, + "grad_norm": 1.5834200382232666, + "learning_rate": 4.999823148072027e-06, + "loss": 0.1984, + "step": 313 + }, + { + "epoch": 0.101749837977965, + "grad_norm": 1.3932247161865234, + "learning_rate": 4.999812592163232e-06, + "loss": 0.1853, + "step": 314 + }, + { + "epoch": 0.10207388204795852, + "grad_norm": 1.5058470964431763, + "learning_rate": 4.9998017303015735e-06, + "loss": 0.2086, + "step": 315 + }, + { + "epoch": 0.10239792611795204, + "grad_norm": 1.5837807655334473, + "learning_rate": 4.999790562488385e-06, + "loss": 0.1863, + "step": 316 + }, + { + "epoch": 0.10272197018794556, + "grad_norm": 1.9013391733169556, + "learning_rate": 4.999779088725031e-06, + "loss": 0.1974, + "step": 317 + }, + { + "epoch": 0.10304601425793908, + "grad_norm": 1.4886730909347534, + "learning_rate": 4.999767309012916e-06, + "loss": 0.2042, + "step": 318 + }, + { + "epoch": 0.1033700583279326, + "grad_norm": 1.4408074617385864, + "learning_rate": 4.999755223353483e-06, + "loss": 0.1949, + "step": 319 + }, + { + "epoch": 0.10369410239792612, + "grad_norm": 1.3255444765090942, + "learning_rate": 4.9997428317482086e-06, + "loss": 0.189, + "step": 320 + }, + { + "epoch": 0.10401814646791964, + "grad_norm": 1.4605129957199097, + "learning_rate": 4.999730134198612e-06, + "loss": 0.2046, + "step": 321 + }, + { + "epoch": 0.10434219053791316, + "grad_norm": 1.4560701847076416, + "learning_rate": 4.999717130706247e-06, + "loss": 0.1994, + "step": 322 + }, + { + "epoch": 0.10466623460790668, + "grad_norm": 1.5182634592056274, + "learning_rate": 4.999703821272702e-06, + "loss": 0.2241, + "step": 323 + }, + { + "epoch": 0.1049902786779002, + "grad_norm": 1.4704344272613525, + "learning_rate": 4.99969020589961e-06, + "loss": 0.2173, + "step": 324 + }, + { + "epoch": 0.10531432274789371, + "grad_norm": 1.4940931797027588, + "learning_rate": 4.999676284588635e-06, + "loss": 0.2165, + "step": 325 + }, + { + "epoch": 0.10563836681788723, + "grad_norm": 1.3513116836547852, + "learning_rate": 4.999662057341482e-06, + "loss": 0.1954, + "step": 326 + }, + { + "epoch": 0.10596241088788075, + "grad_norm": 1.4675105810165405, + "learning_rate": 4.999647524159892e-06, + "loss": 0.1961, + "step": 327 + }, + { + "epoch": 0.10628645495787427, + "grad_norm": 1.3306858539581299, + "learning_rate": 4.9996326850456435e-06, + "loss": 0.1886, + "step": 328 + }, + { + "epoch": 0.10661049902786779, + "grad_norm": 1.3358745574951172, + "learning_rate": 4.999617540000552e-06, + "loss": 0.1833, + "step": 329 + }, + { + "epoch": 0.10693454309786131, + "grad_norm": 1.5109752416610718, + "learning_rate": 4.999602089026472e-06, + "loss": 0.2237, + "step": 330 + }, + { + "epoch": 0.10725858716785483, + "grad_norm": 1.499991536140442, + "learning_rate": 4.999586332125294e-06, + "loss": 0.2212, + "step": 331 + }, + { + "epoch": 0.10758263123784835, + "grad_norm": 1.4174326658248901, + "learning_rate": 4.9995702692989476e-06, + "loss": 0.1931, + "step": 332 + }, + { + "epoch": 0.10790667530784187, + "grad_norm": 1.3674904108047485, + "learning_rate": 4.999553900549398e-06, + "loss": 0.1928, + "step": 333 + }, + { + "epoch": 0.10823071937783539, + "grad_norm": 1.3582217693328857, + "learning_rate": 4.999537225878648e-06, + "loss": 0.1884, + "step": 334 + }, + { + "epoch": 0.10855476344782891, + "grad_norm": 1.3601632118225098, + "learning_rate": 4.999520245288739e-06, + "loss": 0.1849, + "step": 335 + }, + { + "epoch": 0.10887880751782242, + "grad_norm": 1.4074798822402954, + "learning_rate": 4.999502958781749e-06, + "loss": 0.2129, + "step": 336 + }, + { + "epoch": 0.10920285158781594, + "grad_norm": 1.4298856258392334, + "learning_rate": 4.999485366359794e-06, + "loss": 0.2149, + "step": 337 + }, + { + "epoch": 0.10952689565780946, + "grad_norm": 1.4826171398162842, + "learning_rate": 4.999467468025028e-06, + "loss": 0.216, + "step": 338 + }, + { + "epoch": 0.10985093972780298, + "grad_norm": 1.3365073204040527, + "learning_rate": 4.99944926377964e-06, + "loss": 0.1888, + "step": 339 + }, + { + "epoch": 0.1101749837977965, + "grad_norm": 1.359991192817688, + "learning_rate": 4.999430753625858e-06, + "loss": 0.1882, + "step": 340 + }, + { + "epoch": 0.11049902786779002, + "grad_norm": 1.4163484573364258, + "learning_rate": 4.999411937565949e-06, + "loss": 0.2003, + "step": 341 + }, + { + "epoch": 0.11082307193778354, + "grad_norm": 1.3787403106689453, + "learning_rate": 4.999392815602214e-06, + "loss": 0.207, + "step": 342 + }, + { + "epoch": 0.11114711600777706, + "grad_norm": 1.3745408058166504, + "learning_rate": 4.999373387736996e-06, + "loss": 0.1864, + "step": 343 + }, + { + "epoch": 0.11147116007777058, + "grad_norm": 1.473127007484436, + "learning_rate": 4.999353653972669e-06, + "loss": 0.188, + "step": 344 + }, + { + "epoch": 0.1117952041477641, + "grad_norm": 1.4026800394058228, + "learning_rate": 4.999333614311652e-06, + "loss": 0.1809, + "step": 345 + }, + { + "epoch": 0.11211924821775761, + "grad_norm": 1.5205342769622803, + "learning_rate": 4.999313268756396e-06, + "loss": 0.1965, + "step": 346 + }, + { + "epoch": 0.11244329228775113, + "grad_norm": 1.5150617361068726, + "learning_rate": 4.99929261730939e-06, + "loss": 0.214, + "step": 347 + }, + { + "epoch": 0.11276733635774465, + "grad_norm": 1.642708659172058, + "learning_rate": 4.999271659973164e-06, + "loss": 0.2094, + "step": 348 + }, + { + "epoch": 0.11309138042773817, + "grad_norm": 1.6056296825408936, + "learning_rate": 4.999250396750281e-06, + "loss": 0.2051, + "step": 349 + }, + { + "epoch": 0.11341542449773169, + "grad_norm": 1.3636996746063232, + "learning_rate": 4.999228827643344e-06, + "loss": 0.2132, + "step": 350 + }, + { + "epoch": 0.11373946856772521, + "grad_norm": 1.3864322900772095, + "learning_rate": 4.999206952654993e-06, + "loss": 0.1984, + "step": 351 + }, + { + "epoch": 0.11406351263771873, + "grad_norm": 1.4255362749099731, + "learning_rate": 4.999184771787905e-06, + "loss": 0.1827, + "step": 352 + }, + { + "epoch": 0.11438755670771225, + "grad_norm": 1.4326763153076172, + "learning_rate": 4.999162285044795e-06, + "loss": 0.19, + "step": 353 + }, + { + "epoch": 0.11471160077770577, + "grad_norm": 1.3926764726638794, + "learning_rate": 4.9991394924284155e-06, + "loss": 0.1942, + "step": 354 + }, + { + "epoch": 0.11503564484769929, + "grad_norm": 1.4934500455856323, + "learning_rate": 4.999116393941556e-06, + "loss": 0.1966, + "step": 355 + }, + { + "epoch": 0.11535968891769281, + "grad_norm": 1.5054737329483032, + "learning_rate": 4.999092989587042e-06, + "loss": 0.1813, + "step": 356 + }, + { + "epoch": 0.11568373298768632, + "grad_norm": 1.4996047019958496, + "learning_rate": 4.9990692793677395e-06, + "loss": 0.1848, + "step": 357 + }, + { + "epoch": 0.11600777705767984, + "grad_norm": 1.534684181213379, + "learning_rate": 4.999045263286551e-06, + "loss": 0.1881, + "step": 358 + }, + { + "epoch": 0.11633182112767336, + "grad_norm": 1.467447280883789, + "learning_rate": 4.999020941346414e-06, + "loss": 0.18, + "step": 359 + }, + { + "epoch": 0.11665586519766688, + "grad_norm": 1.4896132946014404, + "learning_rate": 4.998996313550306e-06, + "loss": 0.1949, + "step": 360 + }, + { + "epoch": 0.1169799092676604, + "grad_norm": 1.3926950693130493, + "learning_rate": 4.998971379901242e-06, + "loss": 0.2029, + "step": 361 + }, + { + "epoch": 0.11730395333765392, + "grad_norm": 1.5150505304336548, + "learning_rate": 4.998946140402273e-06, + "loss": 0.2077, + "step": 362 + }, + { + "epoch": 0.11762799740764744, + "grad_norm": 1.2856528759002686, + "learning_rate": 4.998920595056488e-06, + "loss": 0.1987, + "step": 363 + }, + { + "epoch": 0.11795204147764096, + "grad_norm": 1.3384044170379639, + "learning_rate": 4.998894743867013e-06, + "loss": 0.192, + "step": 364 + }, + { + "epoch": 0.11827608554763448, + "grad_norm": 1.3708319664001465, + "learning_rate": 4.998868586837013e-06, + "loss": 0.1769, + "step": 365 + }, + { + "epoch": 0.118600129617628, + "grad_norm": 1.3513811826705933, + "learning_rate": 4.998842123969689e-06, + "loss": 0.1963, + "step": 366 + }, + { + "epoch": 0.11892417368762152, + "grad_norm": 1.3095284700393677, + "learning_rate": 4.998815355268279e-06, + "loss": 0.194, + "step": 367 + }, + { + "epoch": 0.11924821775761503, + "grad_norm": 1.3722801208496094, + "learning_rate": 4.998788280736061e-06, + "loss": 0.2084, + "step": 368 + }, + { + "epoch": 0.11957226182760855, + "grad_norm": 1.3854695558547974, + "learning_rate": 4.998760900376347e-06, + "loss": 0.186, + "step": 369 + }, + { + "epoch": 0.11989630589760207, + "grad_norm": 1.3411400318145752, + "learning_rate": 4.99873321419249e-06, + "loss": 0.1975, + "step": 370 + }, + { + "epoch": 0.12022034996759559, + "grad_norm": 1.2395044565200806, + "learning_rate": 4.998705222187875e-06, + "loss": 0.1871, + "step": 371 + }, + { + "epoch": 0.12054439403758911, + "grad_norm": 1.3219211101531982, + "learning_rate": 4.998676924365931e-06, + "loss": 0.1842, + "step": 372 + }, + { + "epoch": 0.12086843810758263, + "grad_norm": 1.3255659341812134, + "learning_rate": 4.998648320730121e-06, + "loss": 0.1973, + "step": 373 + }, + { + "epoch": 0.12119248217757615, + "grad_norm": 1.2957849502563477, + "learning_rate": 4.998619411283945e-06, + "loss": 0.1992, + "step": 374 + }, + { + "epoch": 0.12151652624756967, + "grad_norm": 1.3687437772750854, + "learning_rate": 4.998590196030942e-06, + "loss": 0.2024, + "step": 375 + }, + { + "epoch": 0.1218405703175632, + "grad_norm": 1.472497582435608, + "learning_rate": 4.998560674974686e-06, + "loss": 0.2079, + "step": 376 + }, + { + "epoch": 0.12216461438755671, + "grad_norm": 1.4358025789260864, + "learning_rate": 4.998530848118792e-06, + "loss": 0.2117, + "step": 377 + }, + { + "epoch": 0.12248865845755022, + "grad_norm": 1.23166823387146, + "learning_rate": 4.99850071546691e-06, + "loss": 0.1909, + "step": 378 + }, + { + "epoch": 0.12281270252754374, + "grad_norm": 1.4100823402404785, + "learning_rate": 4.998470277022728e-06, + "loss": 0.2054, + "step": 379 + }, + { + "epoch": 0.12313674659753726, + "grad_norm": 1.3292949199676514, + "learning_rate": 4.99843953278997e-06, + "loss": 0.2005, + "step": 380 + }, + { + "epoch": 0.12346079066753078, + "grad_norm": 1.3157209157943726, + "learning_rate": 4.998408482772401e-06, + "loss": 0.2004, + "step": 381 + }, + { + "epoch": 0.1237848347375243, + "grad_norm": 1.3142346143722534, + "learning_rate": 4.99837712697382e-06, + "loss": 0.1836, + "step": 382 + }, + { + "epoch": 0.12410887880751782, + "grad_norm": 1.3334403038024902, + "learning_rate": 4.998345465398066e-06, + "loss": 0.2, + "step": 383 + }, + { + "epoch": 0.12443292287751134, + "grad_norm": 1.2839090824127197, + "learning_rate": 4.998313498049011e-06, + "loss": 0.1899, + "step": 384 + }, + { + "epoch": 0.12475696694750486, + "grad_norm": 1.3406338691711426, + "learning_rate": 4.9982812249305704e-06, + "loss": 0.1834, + "step": 385 + }, + { + "epoch": 0.12508101101749838, + "grad_norm": 1.3714925050735474, + "learning_rate": 4.998248646046693e-06, + "loss": 0.2022, + "step": 386 + }, + { + "epoch": 0.1254050550874919, + "grad_norm": 1.3312101364135742, + "learning_rate": 4.998215761401366e-06, + "loss": 0.1975, + "step": 387 + }, + { + "epoch": 0.12572909915748542, + "grad_norm": 1.2964175939559937, + "learning_rate": 4.9981825709986145e-06, + "loss": 0.1953, + "step": 388 + }, + { + "epoch": 0.12605314322747893, + "grad_norm": 1.2531015872955322, + "learning_rate": 4.9981490748425e-06, + "loss": 0.1928, + "step": 389 + }, + { + "epoch": 0.12637718729747247, + "grad_norm": 1.4965262413024902, + "learning_rate": 4.998115272937123e-06, + "loss": 0.1909, + "step": 390 + }, + { + "epoch": 0.12670123136746597, + "grad_norm": 1.3003981113433838, + "learning_rate": 4.998081165286621e-06, + "loss": 0.1953, + "step": 391 + }, + { + "epoch": 0.1270252754374595, + "grad_norm": 1.2948582172393799, + "learning_rate": 4.9980467518951666e-06, + "loss": 0.1945, + "step": 392 + }, + { + "epoch": 0.127349319507453, + "grad_norm": 1.4373679161071777, + "learning_rate": 4.998012032766974e-06, + "loss": 0.1925, + "step": 393 + }, + { + "epoch": 0.12767336357744652, + "grad_norm": 1.4847952127456665, + "learning_rate": 4.997977007906291e-06, + "loss": 0.2031, + "step": 394 + }, + { + "epoch": 0.12799740764744005, + "grad_norm": 1.2792102098464966, + "learning_rate": 4.997941677317403e-06, + "loss": 0.184, + "step": 395 + }, + { + "epoch": 0.12832145171743356, + "grad_norm": 1.2261104583740234, + "learning_rate": 4.997906041004637e-06, + "loss": 0.1957, + "step": 396 + }, + { + "epoch": 0.1286454957874271, + "grad_norm": 1.330273985862732, + "learning_rate": 4.997870098972353e-06, + "loss": 0.206, + "step": 397 + }, + { + "epoch": 0.1289695398574206, + "grad_norm": 1.3119021654129028, + "learning_rate": 4.99783385122495e-06, + "loss": 0.176, + "step": 398 + }, + { + "epoch": 0.12929358392741414, + "grad_norm": 1.3380613327026367, + "learning_rate": 4.997797297766864e-06, + "loss": 0.1909, + "step": 399 + }, + { + "epoch": 0.12961762799740764, + "grad_norm": 1.312547206878662, + "learning_rate": 4.9977604386025704e-06, + "loss": 0.1905, + "step": 400 + }, + { + "epoch": 0.12994167206740118, + "grad_norm": 1.2903507947921753, + "learning_rate": 4.997723273736579e-06, + "loss": 0.1803, + "step": 401 + }, + { + "epoch": 0.13026571613739468, + "grad_norm": 1.2392823696136475, + "learning_rate": 4.9976858031734375e-06, + "loss": 0.1833, + "step": 402 + }, + { + "epoch": 0.13058976020738822, + "grad_norm": 1.2063300609588623, + "learning_rate": 4.9976480269177345e-06, + "loss": 0.1778, + "step": 403 + }, + { + "epoch": 0.13091380427738172, + "grad_norm": 1.3096263408660889, + "learning_rate": 4.997609944974092e-06, + "loss": 0.1878, + "step": 404 + }, + { + "epoch": 0.13123784834737523, + "grad_norm": 1.3855416774749756, + "learning_rate": 4.99757155734717e-06, + "loss": 0.1939, + "step": 405 + }, + { + "epoch": 0.13156189241736876, + "grad_norm": 1.3690385818481445, + "learning_rate": 4.997532864041669e-06, + "loss": 0.1958, + "step": 406 + }, + { + "epoch": 0.13188593648736227, + "grad_norm": 1.4282350540161133, + "learning_rate": 4.997493865062323e-06, + "loss": 0.1979, + "step": 407 + }, + { + "epoch": 0.1322099805573558, + "grad_norm": 1.3318852186203003, + "learning_rate": 4.9974545604139055e-06, + "loss": 0.1975, + "step": 408 + }, + { + "epoch": 0.1325340246273493, + "grad_norm": 1.3509126901626587, + "learning_rate": 4.997414950101227e-06, + "loss": 0.1985, + "step": 409 + }, + { + "epoch": 0.13285806869734285, + "grad_norm": 1.3912403583526611, + "learning_rate": 4.997375034129135e-06, + "loss": 0.2103, + "step": 410 + }, + { + "epoch": 0.13318211276733635, + "grad_norm": 1.2692376375198364, + "learning_rate": 4.997334812502516e-06, + "loss": 0.1849, + "step": 411 + }, + { + "epoch": 0.1335061568373299, + "grad_norm": 1.3442357778549194, + "learning_rate": 4.9972942852262915e-06, + "loss": 0.1898, + "step": 412 + }, + { + "epoch": 0.1338302009073234, + "grad_norm": 1.2724248170852661, + "learning_rate": 4.997253452305423e-06, + "loss": 0.1917, + "step": 413 + }, + { + "epoch": 0.13415424497731693, + "grad_norm": 1.4180892705917358, + "learning_rate": 4.9972123137449065e-06, + "loss": 0.2009, + "step": 414 + }, + { + "epoch": 0.13447828904731043, + "grad_norm": 1.3496129512786865, + "learning_rate": 4.997170869549778e-06, + "loss": 0.1932, + "step": 415 + }, + { + "epoch": 0.13480233311730394, + "grad_norm": 1.1647557020187378, + "learning_rate": 4.99712911972511e-06, + "loss": 0.1763, + "step": 416 + }, + { + "epoch": 0.13512637718729748, + "grad_norm": 1.286051630973816, + "learning_rate": 4.99708706427601e-06, + "loss": 0.185, + "step": 417 + }, + { + "epoch": 0.13545042125729098, + "grad_norm": 1.241931438446045, + "learning_rate": 4.997044703207629e-06, + "loss": 0.2015, + "step": 418 + }, + { + "epoch": 0.13577446532728452, + "grad_norm": 1.4348373413085938, + "learning_rate": 4.9970020365251485e-06, + "loss": 0.1929, + "step": 419 + }, + { + "epoch": 0.13609850939727802, + "grad_norm": 1.2149462699890137, + "learning_rate": 4.996959064233792e-06, + "loss": 0.183, + "step": 420 + }, + { + "epoch": 0.13642255346727156, + "grad_norm": 1.2824324369430542, + "learning_rate": 4.996915786338818e-06, + "loss": 0.2014, + "step": 421 + }, + { + "epoch": 0.13674659753726506, + "grad_norm": 1.2929573059082031, + "learning_rate": 4.9968722028455245e-06, + "loss": 0.1894, + "step": 422 + }, + { + "epoch": 0.1370706416072586, + "grad_norm": 1.458095669746399, + "learning_rate": 4.996828313759245e-06, + "loss": 0.1986, + "step": 423 + }, + { + "epoch": 0.1373946856772521, + "grad_norm": 1.3376439809799194, + "learning_rate": 4.99678411908535e-06, + "loss": 0.1878, + "step": 424 + }, + { + "epoch": 0.13771872974724564, + "grad_norm": 1.4877251386642456, + "learning_rate": 4.996739618829251e-06, + "loss": 0.2029, + "step": 425 + }, + { + "epoch": 0.13804277381723914, + "grad_norm": 1.4747685194015503, + "learning_rate": 4.996694812996391e-06, + "loss": 0.1788, + "step": 426 + }, + { + "epoch": 0.13836681788723265, + "grad_norm": 1.5499141216278076, + "learning_rate": 4.996649701592258e-06, + "loss": 0.2201, + "step": 427 + }, + { + "epoch": 0.13869086195722619, + "grad_norm": 1.3866907358169556, + "learning_rate": 4.99660428462237e-06, + "loss": 0.1858, + "step": 428 + }, + { + "epoch": 0.1390149060272197, + "grad_norm": 1.3738040924072266, + "learning_rate": 4.996558562092286e-06, + "loss": 0.1759, + "step": 429 + }, + { + "epoch": 0.13933895009721323, + "grad_norm": 1.41068696975708, + "learning_rate": 4.996512534007602e-06, + "loss": 0.2034, + "step": 430 + }, + { + "epoch": 0.13966299416720673, + "grad_norm": 1.2787730693817139, + "learning_rate": 4.996466200373954e-06, + "loss": 0.1811, + "step": 431 + }, + { + "epoch": 0.13998703823720027, + "grad_norm": 1.2944291830062866, + "learning_rate": 4.99641956119701e-06, + "loss": 0.1965, + "step": 432 + }, + { + "epoch": 0.14031108230719377, + "grad_norm": 1.319860577583313, + "learning_rate": 4.996372616482478e-06, + "loss": 0.1961, + "step": 433 + }, + { + "epoch": 0.1406351263771873, + "grad_norm": 1.2821969985961914, + "learning_rate": 4.996325366236105e-06, + "loss": 0.2068, + "step": 434 + }, + { + "epoch": 0.14095917044718081, + "grad_norm": 1.3399871587753296, + "learning_rate": 4.996277810463675e-06, + "loss": 0.2055, + "step": 435 + }, + { + "epoch": 0.14128321451717435, + "grad_norm": 1.2458884716033936, + "learning_rate": 4.996229949171004e-06, + "loss": 0.1983, + "step": 436 + }, + { + "epoch": 0.14160725858716786, + "grad_norm": 1.2212367057800293, + "learning_rate": 4.996181782363955e-06, + "loss": 0.1952, + "step": 437 + }, + { + "epoch": 0.14193130265716136, + "grad_norm": 1.197187066078186, + "learning_rate": 4.99613331004842e-06, + "loss": 0.1869, + "step": 438 + }, + { + "epoch": 0.1422553467271549, + "grad_norm": 1.1908378601074219, + "learning_rate": 4.996084532230332e-06, + "loss": 0.1987, + "step": 439 + }, + { + "epoch": 0.1425793907971484, + "grad_norm": 1.388664960861206, + "learning_rate": 4.996035448915661e-06, + "loss": 0.2039, + "step": 440 + }, + { + "epoch": 0.14290343486714194, + "grad_norm": 1.3541980981826782, + "learning_rate": 4.995986060110415e-06, + "loss": 0.192, + "step": 441 + }, + { + "epoch": 0.14322747893713544, + "grad_norm": 1.3946161270141602, + "learning_rate": 4.995936365820638e-06, + "loss": 0.1903, + "step": 442 + }, + { + "epoch": 0.14355152300712898, + "grad_norm": 1.1276389360427856, + "learning_rate": 4.9958863660524125e-06, + "loss": 0.1585, + "step": 443 + }, + { + "epoch": 0.14387556707712248, + "grad_norm": 1.1817028522491455, + "learning_rate": 4.995836060811859e-06, + "loss": 0.1906, + "step": 444 + }, + { + "epoch": 0.14419961114711602, + "grad_norm": 1.2277320623397827, + "learning_rate": 4.995785450105131e-06, + "loss": 0.1823, + "step": 445 + }, + { + "epoch": 0.14452365521710953, + "grad_norm": 1.2285058498382568, + "learning_rate": 4.995734533938427e-06, + "loss": 0.1835, + "step": 446 + }, + { + "epoch": 0.14484769928710303, + "grad_norm": 1.2986756563186646, + "learning_rate": 4.995683312317975e-06, + "loss": 0.1847, + "step": 447 + }, + { + "epoch": 0.14517174335709657, + "grad_norm": 1.2609082460403442, + "learning_rate": 4.995631785250046e-06, + "loss": 0.182, + "step": 448 + }, + { + "epoch": 0.14549578742709007, + "grad_norm": 1.2614150047302246, + "learning_rate": 4.9955799527409465e-06, + "loss": 0.1937, + "step": 449 + }, + { + "epoch": 0.1458198314970836, + "grad_norm": 1.3610444068908691, + "learning_rate": 4.99552781479702e-06, + "loss": 0.1908, + "step": 450 + }, + { + "epoch": 0.14614387556707711, + "grad_norm": 1.259647250175476, + "learning_rate": 4.995475371424648e-06, + "loss": 0.1909, + "step": 451 + }, + { + "epoch": 0.14646791963707065, + "grad_norm": 1.150729775428772, + "learning_rate": 4.995422622630247e-06, + "loss": 0.1734, + "step": 452 + }, + { + "epoch": 0.14679196370706415, + "grad_norm": 1.2474488019943237, + "learning_rate": 4.995369568420276e-06, + "loss": 0.1765, + "step": 453 + }, + { + "epoch": 0.1471160077770577, + "grad_norm": 1.356665015220642, + "learning_rate": 4.995316208801226e-06, + "loss": 0.1949, + "step": 454 + }, + { + "epoch": 0.1474400518470512, + "grad_norm": 1.299708366394043, + "learning_rate": 4.99526254377963e-06, + "loss": 0.1886, + "step": 455 + }, + { + "epoch": 0.14776409591704473, + "grad_norm": 1.213100552558899, + "learning_rate": 4.995208573362053e-06, + "loss": 0.1885, + "step": 456 + }, + { + "epoch": 0.14808813998703824, + "grad_norm": 1.3678295612335205, + "learning_rate": 4.995154297555103e-06, + "loss": 0.192, + "step": 457 + }, + { + "epoch": 0.14841218405703174, + "grad_norm": 1.2744077444076538, + "learning_rate": 4.995099716365421e-06, + "loss": 0.1979, + "step": 458 + }, + { + "epoch": 0.14873622812702528, + "grad_norm": 1.3041316270828247, + "learning_rate": 4.995044829799689e-06, + "loss": 0.1951, + "step": 459 + }, + { + "epoch": 0.14906027219701878, + "grad_norm": 1.2977657318115234, + "learning_rate": 4.994989637864624e-06, + "loss": 0.1824, + "step": 460 + }, + { + "epoch": 0.14938431626701232, + "grad_norm": 1.3233217000961304, + "learning_rate": 4.99493414056698e-06, + "loss": 0.1786, + "step": 461 + }, + { + "epoch": 0.14970836033700582, + "grad_norm": 1.3560962677001953, + "learning_rate": 4.99487833791355e-06, + "loss": 0.1838, + "step": 462 + }, + { + "epoch": 0.15003240440699936, + "grad_norm": 1.3699971437454224, + "learning_rate": 4.9948222299111644e-06, + "loss": 0.1924, + "step": 463 + }, + { + "epoch": 0.15035644847699287, + "grad_norm": 1.2370259761810303, + "learning_rate": 4.994765816566689e-06, + "loss": 0.189, + "step": 464 + }, + { + "epoch": 0.1506804925469864, + "grad_norm": 1.428999423980713, + "learning_rate": 4.994709097887029e-06, + "loss": 0.1995, + "step": 465 + }, + { + "epoch": 0.1510045366169799, + "grad_norm": 1.3747769594192505, + "learning_rate": 4.994652073879127e-06, + "loss": 0.1891, + "step": 466 + }, + { + "epoch": 0.15132858068697344, + "grad_norm": 1.290266752243042, + "learning_rate": 4.994594744549961e-06, + "loss": 0.1868, + "step": 467 + }, + { + "epoch": 0.15165262475696695, + "grad_norm": 1.3300879001617432, + "learning_rate": 4.994537109906546e-06, + "loss": 0.1826, + "step": 468 + }, + { + "epoch": 0.15197666882696045, + "grad_norm": 1.2477858066558838, + "learning_rate": 4.99447916995594e-06, + "loss": 0.1817, + "step": 469 + }, + { + "epoch": 0.152300712896954, + "grad_norm": 1.1481152772903442, + "learning_rate": 4.99442092470523e-06, + "loss": 0.1688, + "step": 470 + }, + { + "epoch": 0.1526247569669475, + "grad_norm": 1.2472997903823853, + "learning_rate": 4.994362374161548e-06, + "loss": 0.1946, + "step": 471 + }, + { + "epoch": 0.15294880103694103, + "grad_norm": 1.2494356632232666, + "learning_rate": 4.994303518332059e-06, + "loss": 0.1998, + "step": 472 + }, + { + "epoch": 0.15327284510693454, + "grad_norm": 1.3602643013000488, + "learning_rate": 4.994244357223965e-06, + "loss": 0.1881, + "step": 473 + }, + { + "epoch": 0.15359688917692807, + "grad_norm": 1.2709087133407593, + "learning_rate": 4.994184890844509e-06, + "loss": 0.1818, + "step": 474 + }, + { + "epoch": 0.15392093324692158, + "grad_norm": 1.2899342775344849, + "learning_rate": 4.9941251192009665e-06, + "loss": 0.2027, + "step": 475 + }, + { + "epoch": 0.1542449773169151, + "grad_norm": 1.3323594331741333, + "learning_rate": 4.994065042300655e-06, + "loss": 0.2063, + "step": 476 + }, + { + "epoch": 0.15456902138690862, + "grad_norm": 1.3085150718688965, + "learning_rate": 4.994004660150927e-06, + "loss": 0.1992, + "step": 477 + }, + { + "epoch": 0.15489306545690215, + "grad_norm": 1.2538460493087769, + "learning_rate": 4.993943972759173e-06, + "loss": 0.1807, + "step": 478 + }, + { + "epoch": 0.15521710952689566, + "grad_norm": 1.2298994064331055, + "learning_rate": 4.993882980132819e-06, + "loss": 0.177, + "step": 479 + }, + { + "epoch": 0.15554115359688916, + "grad_norm": 1.2511969804763794, + "learning_rate": 4.993821682279332e-06, + "loss": 0.1908, + "step": 480 + }, + { + "epoch": 0.1558651976668827, + "grad_norm": 1.2835829257965088, + "learning_rate": 4.993760079206212e-06, + "loss": 0.189, + "step": 481 + }, + { + "epoch": 0.1561892417368762, + "grad_norm": 1.252970814704895, + "learning_rate": 4.993698170920999e-06, + "loss": 0.1988, + "step": 482 + }, + { + "epoch": 0.15651328580686974, + "grad_norm": 1.2566627264022827, + "learning_rate": 4.993635957431273e-06, + "loss": 0.1787, + "step": 483 + }, + { + "epoch": 0.15683732987686325, + "grad_norm": 1.2273926734924316, + "learning_rate": 4.993573438744645e-06, + "loss": 0.196, + "step": 484 + }, + { + "epoch": 0.15716137394685678, + "grad_norm": 1.194635033607483, + "learning_rate": 4.993510614868767e-06, + "loss": 0.1762, + "step": 485 + }, + { + "epoch": 0.1574854180168503, + "grad_norm": 1.2113969326019287, + "learning_rate": 4.99344748581133e-06, + "loss": 0.1882, + "step": 486 + }, + { + "epoch": 0.15780946208684382, + "grad_norm": 1.3154219388961792, + "learning_rate": 4.993384051580059e-06, + "loss": 0.2097, + "step": 487 + }, + { + "epoch": 0.15813350615683733, + "grad_norm": 1.3013001680374146, + "learning_rate": 4.993320312182718e-06, + "loss": 0.1935, + "step": 488 + }, + { + "epoch": 0.15845755022683086, + "grad_norm": 1.2724648714065552, + "learning_rate": 4.993256267627108e-06, + "loss": 0.1931, + "step": 489 + }, + { + "epoch": 0.15878159429682437, + "grad_norm": 1.1998355388641357, + "learning_rate": 4.993191917921066e-06, + "loss": 0.1756, + "step": 490 + }, + { + "epoch": 0.15910563836681788, + "grad_norm": 1.2753697633743286, + "learning_rate": 4.9931272630724704e-06, + "loss": 0.1835, + "step": 491 + }, + { + "epoch": 0.1594296824368114, + "grad_norm": 1.2968438863754272, + "learning_rate": 4.993062303089233e-06, + "loss": 0.1967, + "step": 492 + }, + { + "epoch": 0.15975372650680492, + "grad_norm": 1.2662702798843384, + "learning_rate": 4.992997037979304e-06, + "loss": 0.185, + "step": 493 + }, + { + "epoch": 0.16007777057679845, + "grad_norm": 1.1670453548431396, + "learning_rate": 4.992931467750673e-06, + "loss": 0.1666, + "step": 494 + }, + { + "epoch": 0.16040181464679196, + "grad_norm": 1.1736934185028076, + "learning_rate": 4.992865592411362e-06, + "loss": 0.1824, + "step": 495 + }, + { + "epoch": 0.1607258587167855, + "grad_norm": 1.1742326021194458, + "learning_rate": 4.992799411969436e-06, + "loss": 0.1718, + "step": 496 + }, + { + "epoch": 0.161049902786779, + "grad_norm": 1.3553086519241333, + "learning_rate": 4.992732926432995e-06, + "loss": 0.2112, + "step": 497 + }, + { + "epoch": 0.16137394685677253, + "grad_norm": 1.3199315071105957, + "learning_rate": 4.9926661358101745e-06, + "loss": 0.1954, + "step": 498 + }, + { + "epoch": 0.16169799092676604, + "grad_norm": 1.2116035223007202, + "learning_rate": 4.9925990401091505e-06, + "loss": 0.1846, + "step": 499 + }, + { + "epoch": 0.16202203499675957, + "grad_norm": 1.3013466596603394, + "learning_rate": 4.992531639338133e-06, + "loss": 0.1938, + "step": 500 + }, + { + "epoch": 0.16234607906675308, + "grad_norm": 1.2134751081466675, + "learning_rate": 4.992463933505374e-06, + "loss": 0.1813, + "step": 501 + }, + { + "epoch": 0.16267012313674659, + "grad_norm": 1.275068759918213, + "learning_rate": 4.9923959226191574e-06, + "loss": 0.1942, + "step": 502 + }, + { + "epoch": 0.16299416720674012, + "grad_norm": 1.2700871229171753, + "learning_rate": 4.992327606687808e-06, + "loss": 0.1936, + "step": 503 + }, + { + "epoch": 0.16331821127673363, + "grad_norm": 1.2754777669906616, + "learning_rate": 4.992258985719688e-06, + "loss": 0.2123, + "step": 504 + }, + { + "epoch": 0.16364225534672716, + "grad_norm": 1.177857518196106, + "learning_rate": 4.992190059723194e-06, + "loss": 0.1782, + "step": 505 + }, + { + "epoch": 0.16396629941672067, + "grad_norm": 1.1868280172348022, + "learning_rate": 4.992120828706763e-06, + "loss": 0.1863, + "step": 506 + }, + { + "epoch": 0.1642903434867142, + "grad_norm": 1.2085050344467163, + "learning_rate": 4.99205129267887e-06, + "loss": 0.1805, + "step": 507 + }, + { + "epoch": 0.1646143875567077, + "grad_norm": 1.1440081596374512, + "learning_rate": 4.991981451648022e-06, + "loss": 0.1719, + "step": 508 + }, + { + "epoch": 0.16493843162670124, + "grad_norm": 1.2123503684997559, + "learning_rate": 4.9919113056227685e-06, + "loss": 0.1929, + "step": 509 + }, + { + "epoch": 0.16526247569669475, + "grad_norm": 1.1267462968826294, + "learning_rate": 4.991840854611696e-06, + "loss": 0.1631, + "step": 510 + }, + { + "epoch": 0.16558651976668826, + "grad_norm": 1.198703646659851, + "learning_rate": 4.991770098623425e-06, + "loss": 0.1747, + "step": 511 + }, + { + "epoch": 0.1659105638366818, + "grad_norm": 1.1974960565567017, + "learning_rate": 4.9916990376666156e-06, + "loss": 0.1703, + "step": 512 + }, + { + "epoch": 0.1662346079066753, + "grad_norm": 1.3251335620880127, + "learning_rate": 4.991627671749966e-06, + "loss": 0.1917, + "step": 513 + }, + { + "epoch": 0.16655865197666883, + "grad_norm": 1.2980823516845703, + "learning_rate": 4.9915560008822105e-06, + "loss": 0.1798, + "step": 514 + }, + { + "epoch": 0.16688269604666234, + "grad_norm": 1.24545419216156, + "learning_rate": 4.99148402507212e-06, + "loss": 0.183, + "step": 515 + }, + { + "epoch": 0.16720674011665587, + "grad_norm": 1.3871142864227295, + "learning_rate": 4.991411744328505e-06, + "loss": 0.196, + "step": 516 + }, + { + "epoch": 0.16753078418664938, + "grad_norm": 1.186227560043335, + "learning_rate": 4.991339158660211e-06, + "loss": 0.1788, + "step": 517 + }, + { + "epoch": 0.1678548282566429, + "grad_norm": 1.1392605304718018, + "learning_rate": 4.991266268076121e-06, + "loss": 0.1847, + "step": 518 + }, + { + "epoch": 0.16817887232663642, + "grad_norm": 1.200305461883545, + "learning_rate": 4.991193072585158e-06, + "loss": 0.1906, + "step": 519 + }, + { + "epoch": 0.16850291639662995, + "grad_norm": 1.3353350162506104, + "learning_rate": 4.99111957219628e-06, + "loss": 0.1819, + "step": 520 + }, + { + "epoch": 0.16882696046662346, + "grad_norm": 1.2189663648605347, + "learning_rate": 4.991045766918482e-06, + "loss": 0.1764, + "step": 521 + }, + { + "epoch": 0.16915100453661697, + "grad_norm": 1.2358533143997192, + "learning_rate": 4.990971656760797e-06, + "loss": 0.1869, + "step": 522 + }, + { + "epoch": 0.1694750486066105, + "grad_norm": 1.2056154012680054, + "learning_rate": 4.990897241732296e-06, + "loss": 0.2036, + "step": 523 + }, + { + "epoch": 0.169799092676604, + "grad_norm": 1.3262749910354614, + "learning_rate": 4.990822521842086e-06, + "loss": 0.2051, + "step": 524 + }, + { + "epoch": 0.17012313674659754, + "grad_norm": 1.2622315883636475, + "learning_rate": 4.990747497099312e-06, + "loss": 0.1693, + "step": 525 + }, + { + "epoch": 0.17044718081659105, + "grad_norm": 1.2696737051010132, + "learning_rate": 4.990672167513158e-06, + "loss": 0.2031, + "step": 526 + }, + { + "epoch": 0.17077122488658458, + "grad_norm": 1.2847379446029663, + "learning_rate": 4.990596533092841e-06, + "loss": 0.1903, + "step": 527 + }, + { + "epoch": 0.1710952689565781, + "grad_norm": 1.1462384462356567, + "learning_rate": 4.9905205938476195e-06, + "loss": 0.1963, + "step": 528 + }, + { + "epoch": 0.17141931302657162, + "grad_norm": 1.2983579635620117, + "learning_rate": 4.990444349786788e-06, + "loss": 0.1967, + "step": 529 + }, + { + "epoch": 0.17174335709656513, + "grad_norm": 1.3803335428237915, + "learning_rate": 4.990367800919677e-06, + "loss": 0.1682, + "step": 530 + }, + { + "epoch": 0.17206740116655866, + "grad_norm": 1.2707942724227905, + "learning_rate": 4.990290947255656e-06, + "loss": 0.1938, + "step": 531 + }, + { + "epoch": 0.17239144523655217, + "grad_norm": 1.0509445667266846, + "learning_rate": 4.9902137888041304e-06, + "loss": 0.1651, + "step": 532 + }, + { + "epoch": 0.17271548930654568, + "grad_norm": 1.1488968133926392, + "learning_rate": 4.990136325574545e-06, + "loss": 0.168, + "step": 533 + }, + { + "epoch": 0.1730395333765392, + "grad_norm": 1.2975245714187622, + "learning_rate": 4.990058557576379e-06, + "loss": 0.1844, + "step": 534 + }, + { + "epoch": 0.17336357744653272, + "grad_norm": 1.2090622186660767, + "learning_rate": 4.989980484819152e-06, + "loss": 0.1867, + "step": 535 + }, + { + "epoch": 0.17368762151652625, + "grad_norm": 1.2510167360305786, + "learning_rate": 4.9899021073124175e-06, + "loss": 0.1929, + "step": 536 + }, + { + "epoch": 0.17401166558651976, + "grad_norm": 1.250388503074646, + "learning_rate": 4.989823425065769e-06, + "loss": 0.1863, + "step": 537 + }, + { + "epoch": 0.1743357096565133, + "grad_norm": 1.1425954103469849, + "learning_rate": 4.989744438088838e-06, + "loss": 0.1763, + "step": 538 + }, + { + "epoch": 0.1746597537265068, + "grad_norm": 1.3070205450057983, + "learning_rate": 4.98966514639129e-06, + "loss": 0.1941, + "step": 539 + }, + { + "epoch": 0.17498379779650033, + "grad_norm": 1.1972076892852783, + "learning_rate": 4.98958554998283e-06, + "loss": 0.1877, + "step": 540 + }, + { + "epoch": 0.17530784186649384, + "grad_norm": 1.2954251766204834, + "learning_rate": 4.989505648873198e-06, + "loss": 0.2039, + "step": 541 + }, + { + "epoch": 0.17563188593648738, + "grad_norm": 1.1820374727249146, + "learning_rate": 4.989425443072177e-06, + "loss": 0.1837, + "step": 542 + }, + { + "epoch": 0.17595593000648088, + "grad_norm": 1.264702558517456, + "learning_rate": 4.9893449325895804e-06, + "loss": 0.2013, + "step": 543 + }, + { + "epoch": 0.1762799740764744, + "grad_norm": 1.1946943998336792, + "learning_rate": 4.989264117435263e-06, + "loss": 0.1873, + "step": 544 + }, + { + "epoch": 0.17660401814646792, + "grad_norm": 1.220134973526001, + "learning_rate": 4.9891829976191155e-06, + "loss": 0.2003, + "step": 545 + }, + { + "epoch": 0.17692806221646143, + "grad_norm": 1.138335943222046, + "learning_rate": 4.9891015731510665e-06, + "loss": 0.1678, + "step": 546 + }, + { + "epoch": 0.17725210628645496, + "grad_norm": 1.111392855644226, + "learning_rate": 4.989019844041081e-06, + "loss": 0.163, + "step": 547 + }, + { + "epoch": 0.17757615035644847, + "grad_norm": 1.2142761945724487, + "learning_rate": 4.988937810299161e-06, + "loss": 0.183, + "step": 548 + }, + { + "epoch": 0.177900194426442, + "grad_norm": 1.1644195318222046, + "learning_rate": 4.98885547193535e-06, + "loss": 0.1832, + "step": 549 + }, + { + "epoch": 0.1782242384964355, + "grad_norm": 1.2614682912826538, + "learning_rate": 4.988772828959722e-06, + "loss": 0.1738, + "step": 550 + }, + { + "epoch": 0.17854828256642905, + "grad_norm": 1.1590503454208374, + "learning_rate": 4.988689881382392e-06, + "loss": 0.1929, + "step": 551 + }, + { + "epoch": 0.17887232663642255, + "grad_norm": 1.0835431814193726, + "learning_rate": 4.988606629213515e-06, + "loss": 0.1567, + "step": 552 + }, + { + "epoch": 0.17919637070641609, + "grad_norm": 1.1390941143035889, + "learning_rate": 4.9885230724632775e-06, + "loss": 0.1757, + "step": 553 + }, + { + "epoch": 0.1795204147764096, + "grad_norm": 1.2216302156448364, + "learning_rate": 4.9884392111419056e-06, + "loss": 0.1826, + "step": 554 + }, + { + "epoch": 0.1798444588464031, + "grad_norm": 1.1715116500854492, + "learning_rate": 4.988355045259665e-06, + "loss": 0.1802, + "step": 555 + }, + { + "epoch": 0.18016850291639663, + "grad_norm": 1.1871827840805054, + "learning_rate": 4.988270574826857e-06, + "loss": 0.179, + "step": 556 + }, + { + "epoch": 0.18049254698639014, + "grad_norm": 1.1360725164413452, + "learning_rate": 4.9881857998538175e-06, + "loss": 0.1763, + "step": 557 + }, + { + "epoch": 0.18081659105638367, + "grad_norm": 1.1245923042297363, + "learning_rate": 4.988100720350924e-06, + "loss": 0.1901, + "step": 558 + }, + { + "epoch": 0.18114063512637718, + "grad_norm": 1.2312275171279907, + "learning_rate": 4.988015336328589e-06, + "loss": 0.1939, + "step": 559 + }, + { + "epoch": 0.18146467919637072, + "grad_norm": 1.1649733781814575, + "learning_rate": 4.987929647797263e-06, + "loss": 0.1601, + "step": 560 + }, + { + "epoch": 0.18178872326636422, + "grad_norm": 1.1686375141143799, + "learning_rate": 4.987843654767432e-06, + "loss": 0.1818, + "step": 561 + }, + { + "epoch": 0.18211276733635776, + "grad_norm": 1.3112282752990723, + "learning_rate": 4.987757357249623e-06, + "loss": 0.2011, + "step": 562 + }, + { + "epoch": 0.18243681140635126, + "grad_norm": 1.2649235725402832, + "learning_rate": 4.987670755254397e-06, + "loss": 0.2082, + "step": 563 + }, + { + "epoch": 0.18276085547634477, + "grad_norm": 1.2089769840240479, + "learning_rate": 4.987583848792353e-06, + "loss": 0.1771, + "step": 564 + }, + { + "epoch": 0.1830848995463383, + "grad_norm": 1.185096263885498, + "learning_rate": 4.987496637874127e-06, + "loss": 0.1883, + "step": 565 + }, + { + "epoch": 0.1834089436163318, + "grad_norm": 1.1989091634750366, + "learning_rate": 4.987409122510394e-06, + "loss": 0.1969, + "step": 566 + }, + { + "epoch": 0.18373298768632534, + "grad_norm": 1.0628764629364014, + "learning_rate": 4.9873213027118635e-06, + "loss": 0.1644, + "step": 567 + }, + { + "epoch": 0.18405703175631885, + "grad_norm": 1.2635180950164795, + "learning_rate": 4.987233178489285e-06, + "loss": 0.1812, + "step": 568 + }, + { + "epoch": 0.18438107582631239, + "grad_norm": 1.1539015769958496, + "learning_rate": 4.987144749853444e-06, + "loss": 0.1954, + "step": 569 + }, + { + "epoch": 0.1847051198963059, + "grad_norm": 1.1005996465682983, + "learning_rate": 4.987056016815163e-06, + "loss": 0.1718, + "step": 570 + }, + { + "epoch": 0.18502916396629943, + "grad_norm": 1.2113686800003052, + "learning_rate": 4.986966979385302e-06, + "loss": 0.1957, + "step": 571 + }, + { + "epoch": 0.18535320803629293, + "grad_norm": 1.204167366027832, + "learning_rate": 4.986877637574758e-06, + "loss": 0.1787, + "step": 572 + }, + { + "epoch": 0.18567725210628647, + "grad_norm": 1.0967299938201904, + "learning_rate": 4.986787991394467e-06, + "loss": 0.1563, + "step": 573 + }, + { + "epoch": 0.18600129617627997, + "grad_norm": 1.188849925994873, + "learning_rate": 4.9866980408554e-06, + "loss": 0.1832, + "step": 574 + }, + { + "epoch": 0.18632534024627348, + "grad_norm": 1.205378770828247, + "learning_rate": 4.986607785968565e-06, + "loss": 0.1817, + "step": 575 + }, + { + "epoch": 0.18664938431626701, + "grad_norm": 1.2052730321884155, + "learning_rate": 4.986517226745009e-06, + "loss": 0.1909, + "step": 576 + }, + { + "epoch": 0.18697342838626052, + "grad_norm": 1.2092132568359375, + "learning_rate": 4.9864263631958165e-06, + "loss": 0.1742, + "step": 577 + }, + { + "epoch": 0.18729747245625405, + "grad_norm": 1.1333421468734741, + "learning_rate": 4.986335195332107e-06, + "loss": 0.1745, + "step": 578 + }, + { + "epoch": 0.18762151652624756, + "grad_norm": 1.1709595918655396, + "learning_rate": 4.986243723165039e-06, + "loss": 0.1843, + "step": 579 + }, + { + "epoch": 0.1879455605962411, + "grad_norm": 1.1630189418792725, + "learning_rate": 4.9861519467058094e-06, + "loss": 0.1949, + "step": 580 + }, + { + "epoch": 0.1882696046662346, + "grad_norm": 1.1940983533859253, + "learning_rate": 4.986059865965649e-06, + "loss": 0.1836, + "step": 581 + }, + { + "epoch": 0.18859364873622814, + "grad_norm": 1.1740580797195435, + "learning_rate": 4.985967480955827e-06, + "loss": 0.1966, + "step": 582 + }, + { + "epoch": 0.18891769280622164, + "grad_norm": 1.1827421188354492, + "learning_rate": 4.9858747916876515e-06, + "loss": 0.1855, + "step": 583 + }, + { + "epoch": 0.18924173687621518, + "grad_norm": 1.240294337272644, + "learning_rate": 4.985781798172467e-06, + "loss": 0.1844, + "step": 584 + }, + { + "epoch": 0.18956578094620868, + "grad_norm": 1.2834677696228027, + "learning_rate": 4.9856885004216545e-06, + "loss": 0.1923, + "step": 585 + }, + { + "epoch": 0.1898898250162022, + "grad_norm": 1.1834359169006348, + "learning_rate": 4.985594898446633e-06, + "loss": 0.1759, + "step": 586 + }, + { + "epoch": 0.19021386908619572, + "grad_norm": 1.2249093055725098, + "learning_rate": 4.9855009922588585e-06, + "loss": 0.1893, + "step": 587 + }, + { + "epoch": 0.19053791315618923, + "grad_norm": 1.1315933465957642, + "learning_rate": 4.985406781869824e-06, + "loss": 0.1697, + "step": 588 + }, + { + "epoch": 0.19086195722618277, + "grad_norm": 1.2335212230682373, + "learning_rate": 4.98531226729106e-06, + "loss": 0.2065, + "step": 589 + }, + { + "epoch": 0.19118600129617627, + "grad_norm": 1.1094428300857544, + "learning_rate": 4.985217448534134e-06, + "loss": 0.1651, + "step": 590 + }, + { + "epoch": 0.1915100453661698, + "grad_norm": 1.2721391916275024, + "learning_rate": 4.985122325610651e-06, + "loss": 0.1967, + "step": 591 + }, + { + "epoch": 0.1918340894361633, + "grad_norm": 1.1110994815826416, + "learning_rate": 4.985026898532253e-06, + "loss": 0.1709, + "step": 592 + }, + { + "epoch": 0.19215813350615685, + "grad_norm": 1.2032331228256226, + "learning_rate": 4.98493116731062e-06, + "loss": 0.174, + "step": 593 + }, + { + "epoch": 0.19248217757615035, + "grad_norm": 1.286197543144226, + "learning_rate": 4.984835131957468e-06, + "loss": 0.1896, + "step": 594 + }, + { + "epoch": 0.1928062216461439, + "grad_norm": 1.1432693004608154, + "learning_rate": 4.98473879248455e-06, + "loss": 0.1721, + "step": 595 + }, + { + "epoch": 0.1931302657161374, + "grad_norm": 1.290723443031311, + "learning_rate": 4.984642148903659e-06, + "loss": 0.1959, + "step": 596 + }, + { + "epoch": 0.1934543097861309, + "grad_norm": 1.2293765544891357, + "learning_rate": 4.984545201226623e-06, + "loss": 0.1815, + "step": 597 + }, + { + "epoch": 0.19377835385612444, + "grad_norm": 1.2856359481811523, + "learning_rate": 4.984447949465305e-06, + "loss": 0.1954, + "step": 598 + }, + { + "epoch": 0.19410239792611794, + "grad_norm": 1.1901377439498901, + "learning_rate": 4.98435039363161e-06, + "loss": 0.1781, + "step": 599 + }, + { + "epoch": 0.19442644199611148, + "grad_norm": 1.2254037857055664, + "learning_rate": 4.984252533737477e-06, + "loss": 0.1898, + "step": 600 + }, + { + "epoch": 0.19475048606610498, + "grad_norm": 1.1281192302703857, + "learning_rate": 4.984154369794883e-06, + "loss": 0.1809, + "step": 601 + }, + { + "epoch": 0.19507453013609852, + "grad_norm": 1.2336041927337646, + "learning_rate": 4.984055901815844e-06, + "loss": 0.1738, + "step": 602 + }, + { + "epoch": 0.19539857420609202, + "grad_norm": 1.119174838066101, + "learning_rate": 4.983957129812409e-06, + "loss": 0.1735, + "step": 603 + }, + { + "epoch": 0.19572261827608556, + "grad_norm": 1.2780518531799316, + "learning_rate": 4.9838580537966676e-06, + "loss": 0.1965, + "step": 604 + }, + { + "epoch": 0.19604666234607906, + "grad_norm": 1.16103196144104, + "learning_rate": 4.983758673780747e-06, + "loss": 0.1845, + "step": 605 + }, + { + "epoch": 0.1963707064160726, + "grad_norm": 1.1239101886749268, + "learning_rate": 4.9836589897768084e-06, + "loss": 0.1696, + "step": 606 + }, + { + "epoch": 0.1966947504860661, + "grad_norm": 1.3181592226028442, + "learning_rate": 4.983559001797054e-06, + "loss": 0.2048, + "step": 607 + }, + { + "epoch": 0.1970187945560596, + "grad_norm": 1.1426509618759155, + "learning_rate": 4.983458709853719e-06, + "loss": 0.177, + "step": 608 + }, + { + "epoch": 0.19734283862605315, + "grad_norm": 1.1979957818984985, + "learning_rate": 4.9833581139590814e-06, + "loss": 0.1853, + "step": 609 + }, + { + "epoch": 0.19766688269604665, + "grad_norm": 1.1111805438995361, + "learning_rate": 4.983257214125451e-06, + "loss": 0.1932, + "step": 610 + }, + { + "epoch": 0.1979909267660402, + "grad_norm": 1.2329010963439941, + "learning_rate": 4.9831560103651765e-06, + "loss": 0.204, + "step": 611 + }, + { + "epoch": 0.1983149708360337, + "grad_norm": 1.1231783628463745, + "learning_rate": 4.983054502690646e-06, + "loss": 0.1691, + "step": 612 + }, + { + "epoch": 0.19863901490602723, + "grad_norm": 1.1197031736373901, + "learning_rate": 4.9829526911142825e-06, + "loss": 0.1593, + "step": 613 + }, + { + "epoch": 0.19896305897602073, + "grad_norm": 1.194003939628601, + "learning_rate": 4.982850575648545e-06, + "loss": 0.183, + "step": 614 + }, + { + "epoch": 0.19928710304601427, + "grad_norm": 1.1786737442016602, + "learning_rate": 4.982748156305934e-06, + "loss": 0.1864, + "step": 615 + }, + { + "epoch": 0.19961114711600778, + "grad_norm": 1.1826181411743164, + "learning_rate": 4.982645433098984e-06, + "loss": 0.1753, + "step": 616 + }, + { + "epoch": 0.1999351911860013, + "grad_norm": 1.1305351257324219, + "learning_rate": 4.982542406040266e-06, + "loss": 0.1964, + "step": 617 + }, + { + "epoch": 0.20025923525599482, + "grad_norm": 1.084776759147644, + "learning_rate": 4.98243907514239e-06, + "loss": 0.1831, + "step": 618 + }, + { + "epoch": 0.20058327932598832, + "grad_norm": 1.1831001043319702, + "learning_rate": 4.982335440418004e-06, + "loss": 0.1833, + "step": 619 + }, + { + "epoch": 0.20090732339598186, + "grad_norm": 1.1076908111572266, + "learning_rate": 4.98223150187979e-06, + "loss": 0.1732, + "step": 620 + }, + { + "epoch": 0.20123136746597536, + "grad_norm": 1.15507173538208, + "learning_rate": 4.982127259540471e-06, + "loss": 0.1737, + "step": 621 + }, + { + "epoch": 0.2015554115359689, + "grad_norm": 1.2487183809280396, + "learning_rate": 4.9820227134128045e-06, + "loss": 0.1937, + "step": 622 + }, + { + "epoch": 0.2018794556059624, + "grad_norm": 1.2484679222106934, + "learning_rate": 4.981917863509585e-06, + "loss": 0.1968, + "step": 623 + }, + { + "epoch": 0.20220349967595594, + "grad_norm": 1.2015026807785034, + "learning_rate": 4.981812709843646e-06, + "loss": 0.1881, + "step": 624 + }, + { + "epoch": 0.20252754374594945, + "grad_norm": 1.1933735609054565, + "learning_rate": 4.981707252427857e-06, + "loss": 0.187, + "step": 625 + }, + { + "epoch": 0.20285158781594298, + "grad_norm": 1.1229411363601685, + "learning_rate": 4.981601491275125e-06, + "loss": 0.183, + "step": 626 + }, + { + "epoch": 0.2031756318859365, + "grad_norm": 1.1262580156326294, + "learning_rate": 4.981495426398395e-06, + "loss": 0.1798, + "step": 627 + }, + { + "epoch": 0.20349967595593, + "grad_norm": 1.1283111572265625, + "learning_rate": 4.981389057810647e-06, + "loss": 0.1835, + "step": 628 + }, + { + "epoch": 0.20382372002592353, + "grad_norm": 1.294592261314392, + "learning_rate": 4.9812823855248996e-06, + "loss": 0.1819, + "step": 629 + }, + { + "epoch": 0.20414776409591703, + "grad_norm": 1.1340450048446655, + "learning_rate": 4.98117540955421e-06, + "loss": 0.1735, + "step": 630 + }, + { + "epoch": 0.20447180816591057, + "grad_norm": 1.2071174383163452, + "learning_rate": 4.981068129911669e-06, + "loss": 0.1872, + "step": 631 + }, + { + "epoch": 0.20479585223590407, + "grad_norm": 1.153228998184204, + "learning_rate": 4.980960546610408e-06, + "loss": 0.19, + "step": 632 + }, + { + "epoch": 0.2051198963058976, + "grad_norm": 1.1905288696289062, + "learning_rate": 4.980852659663593e-06, + "loss": 0.1668, + "step": 633 + }, + { + "epoch": 0.20544394037589112, + "grad_norm": 1.1860299110412598, + "learning_rate": 4.9807444690844296e-06, + "loss": 0.1827, + "step": 634 + }, + { + "epoch": 0.20576798444588465, + "grad_norm": 1.1758476495742798, + "learning_rate": 4.980635974886158e-06, + "loss": 0.199, + "step": 635 + }, + { + "epoch": 0.20609202851587816, + "grad_norm": 1.2125898599624634, + "learning_rate": 4.980527177082058e-06, + "loss": 0.2062, + "step": 636 + }, + { + "epoch": 0.2064160725858717, + "grad_norm": 1.203070878982544, + "learning_rate": 4.980418075685445e-06, + "loss": 0.1927, + "step": 637 + }, + { + "epoch": 0.2067401166558652, + "grad_norm": 1.188751459121704, + "learning_rate": 4.980308670709671e-06, + "loss": 0.1865, + "step": 638 + }, + { + "epoch": 0.2070641607258587, + "grad_norm": 1.1638119220733643, + "learning_rate": 4.980198962168128e-06, + "loss": 0.1928, + "step": 639 + }, + { + "epoch": 0.20738820479585224, + "grad_norm": 1.1287533044815063, + "learning_rate": 4.9800889500742415e-06, + "loss": 0.1873, + "step": 640 + }, + { + "epoch": 0.20771224886584574, + "grad_norm": 1.1128509044647217, + "learning_rate": 4.979978634441477e-06, + "loss": 0.1735, + "step": 641 + }, + { + "epoch": 0.20803629293583928, + "grad_norm": 1.259926676750183, + "learning_rate": 4.979868015283336e-06, + "loss": 0.2012, + "step": 642 + }, + { + "epoch": 0.20836033700583279, + "grad_norm": 1.155649185180664, + "learning_rate": 4.979757092613357e-06, + "loss": 0.1834, + "step": 643 + }, + { + "epoch": 0.20868438107582632, + "grad_norm": 1.2149689197540283, + "learning_rate": 4.979645866445114e-06, + "loss": 0.2143, + "step": 644 + }, + { + "epoch": 0.20900842514581983, + "grad_norm": 1.041944146156311, + "learning_rate": 4.9795343367922235e-06, + "loss": 0.1683, + "step": 645 + }, + { + "epoch": 0.20933246921581336, + "grad_norm": 1.1962753534317017, + "learning_rate": 4.979422503668334e-06, + "loss": 0.172, + "step": 646 + }, + { + "epoch": 0.20965651328580687, + "grad_norm": 1.273494005203247, + "learning_rate": 4.979310367087132e-06, + "loss": 0.1867, + "step": 647 + }, + { + "epoch": 0.2099805573558004, + "grad_norm": 1.0932351350784302, + "learning_rate": 4.979197927062343e-06, + "loss": 0.187, + "step": 648 + }, + { + "epoch": 0.2103046014257939, + "grad_norm": 1.18181312084198, + "learning_rate": 4.979085183607728e-06, + "loss": 0.1884, + "step": 649 + }, + { + "epoch": 0.21062864549578741, + "grad_norm": 1.156614065170288, + "learning_rate": 4.978972136737086e-06, + "loss": 0.1961, + "step": 650 + }, + { + "epoch": 0.21095268956578095, + "grad_norm": 1.1621402502059937, + "learning_rate": 4.978858786464252e-06, + "loss": 0.1901, + "step": 651 + }, + { + "epoch": 0.21127673363577446, + "grad_norm": 1.298018455505371, + "learning_rate": 4.978745132803101e-06, + "loss": 0.1982, + "step": 652 + }, + { + "epoch": 0.211600777705768, + "grad_norm": 1.1199668645858765, + "learning_rate": 4.9786311757675425e-06, + "loss": 0.1726, + "step": 653 + }, + { + "epoch": 0.2119248217757615, + "grad_norm": 1.1561723947525024, + "learning_rate": 4.978516915371522e-06, + "loss": 0.1814, + "step": 654 + }, + { + "epoch": 0.21224886584575503, + "grad_norm": 1.095726728439331, + "learning_rate": 4.978402351629024e-06, + "loss": 0.1768, + "step": 655 + }, + { + "epoch": 0.21257290991574854, + "grad_norm": 1.2473907470703125, + "learning_rate": 4.9782874845540715e-06, + "loss": 0.2086, + "step": 656 + }, + { + "epoch": 0.21289695398574207, + "grad_norm": 1.1992051601409912, + "learning_rate": 4.978172314160724e-06, + "loss": 0.1901, + "step": 657 + }, + { + "epoch": 0.21322099805573558, + "grad_norm": 1.1524806022644043, + "learning_rate": 4.9780568404630746e-06, + "loss": 0.1879, + "step": 658 + }, + { + "epoch": 0.2135450421257291, + "grad_norm": 1.2208423614501953, + "learning_rate": 4.977941063475258e-06, + "loss": 0.1853, + "step": 659 + }, + { + "epoch": 0.21386908619572262, + "grad_norm": 1.1771621704101562, + "learning_rate": 4.977824983211443e-06, + "loss": 0.2026, + "step": 660 + }, + { + "epoch": 0.21419313026571613, + "grad_norm": 1.1515275239944458, + "learning_rate": 4.977708599685837e-06, + "loss": 0.1769, + "step": 661 + }, + { + "epoch": 0.21451717433570966, + "grad_norm": 1.1625581979751587, + "learning_rate": 4.977591912912685e-06, + "loss": 0.1933, + "step": 662 + }, + { + "epoch": 0.21484121840570317, + "grad_norm": 1.1556047201156616, + "learning_rate": 4.977474922906268e-06, + "loss": 0.1885, + "step": 663 + }, + { + "epoch": 0.2151652624756967, + "grad_norm": 1.1594843864440918, + "learning_rate": 4.977357629680903e-06, + "loss": 0.1899, + "step": 664 + }, + { + "epoch": 0.2154893065456902, + "grad_norm": 1.0708738565444946, + "learning_rate": 4.977240033250948e-06, + "loss": 0.1737, + "step": 665 + }, + { + "epoch": 0.21581335061568374, + "grad_norm": 1.0543522834777832, + "learning_rate": 4.977122133630795e-06, + "loss": 0.1781, + "step": 666 + }, + { + "epoch": 0.21613739468567725, + "grad_norm": 1.1143603324890137, + "learning_rate": 4.9770039308348725e-06, + "loss": 0.1808, + "step": 667 + }, + { + "epoch": 0.21646143875567078, + "grad_norm": 1.2150770425796509, + "learning_rate": 4.9768854248776475e-06, + "loss": 0.1799, + "step": 668 + }, + { + "epoch": 0.2167854828256643, + "grad_norm": 1.2431052923202515, + "learning_rate": 4.976766615773626e-06, + "loss": 0.1911, + "step": 669 + }, + { + "epoch": 0.21710952689565782, + "grad_norm": 1.1677272319793701, + "learning_rate": 4.976647503537347e-06, + "loss": 0.1765, + "step": 670 + }, + { + "epoch": 0.21743357096565133, + "grad_norm": 1.0950998067855835, + "learning_rate": 4.9765280881833885e-06, + "loss": 0.162, + "step": 671 + }, + { + "epoch": 0.21775761503564484, + "grad_norm": 1.2298495769500732, + "learning_rate": 4.976408369726368e-06, + "loss": 0.1857, + "step": 672 + }, + { + "epoch": 0.21808165910563837, + "grad_norm": 1.0828574895858765, + "learning_rate": 4.976288348180935e-06, + "loss": 0.1803, + "step": 673 + }, + { + "epoch": 0.21840570317563188, + "grad_norm": 1.1887153387069702, + "learning_rate": 4.976168023561782e-06, + "loss": 0.1801, + "step": 674 + }, + { + "epoch": 0.2187297472456254, + "grad_norm": 1.1923915147781372, + "learning_rate": 4.976047395883634e-06, + "loss": 0.1932, + "step": 675 + }, + { + "epoch": 0.21905379131561892, + "grad_norm": 1.193912386894226, + "learning_rate": 4.975926465161254e-06, + "loss": 0.1954, + "step": 676 + }, + { + "epoch": 0.21937783538561245, + "grad_norm": 1.124260425567627, + "learning_rate": 4.975805231409444e-06, + "loss": 0.1728, + "step": 677 + }, + { + "epoch": 0.21970187945560596, + "grad_norm": 1.070874571800232, + "learning_rate": 4.975683694643041e-06, + "loss": 0.1753, + "step": 678 + }, + { + "epoch": 0.2200259235255995, + "grad_norm": 1.173336386680603, + "learning_rate": 4.97556185487692e-06, + "loss": 0.1731, + "step": 679 + }, + { + "epoch": 0.220349967595593, + "grad_norm": 1.197383999824524, + "learning_rate": 4.9754397121259935e-06, + "loss": 0.1874, + "step": 680 + }, + { + "epoch": 0.22067401166558653, + "grad_norm": 1.1929588317871094, + "learning_rate": 4.975317266405211e-06, + "loss": 0.1887, + "step": 681 + }, + { + "epoch": 0.22099805573558004, + "grad_norm": 1.1419634819030762, + "learning_rate": 4.975194517729557e-06, + "loss": 0.1744, + "step": 682 + }, + { + "epoch": 0.22132209980557355, + "grad_norm": 1.1331706047058105, + "learning_rate": 4.975071466114057e-06, + "loss": 0.1772, + "step": 683 + }, + { + "epoch": 0.22164614387556708, + "grad_norm": 1.1220134496688843, + "learning_rate": 4.974948111573768e-06, + "loss": 0.191, + "step": 684 + }, + { + "epoch": 0.2219701879455606, + "grad_norm": 1.1688811779022217, + "learning_rate": 4.9748244541237915e-06, + "loss": 0.167, + "step": 685 + }, + { + "epoch": 0.22229423201555412, + "grad_norm": 1.1049573421478271, + "learning_rate": 4.97470049377926e-06, + "loss": 0.175, + "step": 686 + }, + { + "epoch": 0.22261827608554763, + "grad_norm": 1.1855946779251099, + "learning_rate": 4.974576230555344e-06, + "loss": 0.1788, + "step": 687 + }, + { + "epoch": 0.22294232015554116, + "grad_norm": 1.2324986457824707, + "learning_rate": 4.974451664467253e-06, + "loss": 0.1661, + "step": 688 + }, + { + "epoch": 0.22326636422553467, + "grad_norm": 1.261545181274414, + "learning_rate": 4.974326795530234e-06, + "loss": 0.1904, + "step": 689 + }, + { + "epoch": 0.2235904082955282, + "grad_norm": 1.0972044467926025, + "learning_rate": 4.974201623759568e-06, + "loss": 0.1738, + "step": 690 + }, + { + "epoch": 0.2239144523655217, + "grad_norm": 1.1361660957336426, + "learning_rate": 4.974076149170575e-06, + "loss": 0.1679, + "step": 691 + }, + { + "epoch": 0.22423849643551522, + "grad_norm": 1.222582221031189, + "learning_rate": 4.973950371778612e-06, + "loss": 0.1853, + "step": 692 + }, + { + "epoch": 0.22456254050550875, + "grad_norm": 1.1474609375, + "learning_rate": 4.973824291599074e-06, + "loss": 0.1789, + "step": 693 + }, + { + "epoch": 0.22488658457550226, + "grad_norm": 1.0796842575073242, + "learning_rate": 4.973697908647391e-06, + "loss": 0.17, + "step": 694 + }, + { + "epoch": 0.2252106286454958, + "grad_norm": 1.19746732711792, + "learning_rate": 4.973571222939031e-06, + "loss": 0.1819, + "step": 695 + }, + { + "epoch": 0.2255346727154893, + "grad_norm": 1.0817978382110596, + "learning_rate": 4.973444234489499e-06, + "loss": 0.191, + "step": 696 + }, + { + "epoch": 0.22585871678548283, + "grad_norm": 1.1662395000457764, + "learning_rate": 4.973316943314338e-06, + "loss": 0.189, + "step": 697 + }, + { + "epoch": 0.22618276085547634, + "grad_norm": 1.2320014238357544, + "learning_rate": 4.9731893494291275e-06, + "loss": 0.1777, + "step": 698 + }, + { + "epoch": 0.22650680492546987, + "grad_norm": 1.099103331565857, + "learning_rate": 4.973061452849481e-06, + "loss": 0.1662, + "step": 699 + }, + { + "epoch": 0.22683084899546338, + "grad_norm": 1.195725917816162, + "learning_rate": 4.972933253591056e-06, + "loss": 0.1798, + "step": 700 + }, + { + "epoch": 0.22715489306545691, + "grad_norm": 1.120126724243164, + "learning_rate": 4.972804751669539e-06, + "loss": 0.1895, + "step": 701 + }, + { + "epoch": 0.22747893713545042, + "grad_norm": 1.2362133264541626, + "learning_rate": 4.972675947100659e-06, + "loss": 0.1854, + "step": 702 + }, + { + "epoch": 0.22780298120544393, + "grad_norm": 1.2055583000183105, + "learning_rate": 4.972546839900181e-06, + "loss": 0.1935, + "step": 703 + }, + { + "epoch": 0.22812702527543746, + "grad_norm": 1.1982338428497314, + "learning_rate": 4.972417430083906e-06, + "loss": 0.1887, + "step": 704 + }, + { + "epoch": 0.22845106934543097, + "grad_norm": 1.174058198928833, + "learning_rate": 4.972287717667672e-06, + "loss": 0.1722, + "step": 705 + }, + { + "epoch": 0.2287751134154245, + "grad_norm": 1.1342368125915527, + "learning_rate": 4.972157702667356e-06, + "loss": 0.1741, + "step": 706 + }, + { + "epoch": 0.229099157485418, + "grad_norm": 1.2124993801116943, + "learning_rate": 4.972027385098868e-06, + "loss": 0.1814, + "step": 707 + }, + { + "epoch": 0.22942320155541154, + "grad_norm": 1.1575313806533813, + "learning_rate": 4.97189676497816e-06, + "loss": 0.1804, + "step": 708 + }, + { + "epoch": 0.22974724562540505, + "grad_norm": 1.0987682342529297, + "learning_rate": 4.971765842321218e-06, + "loss": 0.1745, + "step": 709 + }, + { + "epoch": 0.23007128969539858, + "grad_norm": 1.1387704610824585, + "learning_rate": 4.971634617144065e-06, + "loss": 0.1791, + "step": 710 + }, + { + "epoch": 0.2303953337653921, + "grad_norm": 1.0869576930999756, + "learning_rate": 4.971503089462762e-06, + "loss": 0.168, + "step": 711 + }, + { + "epoch": 0.23071937783538563, + "grad_norm": 1.177750587463379, + "learning_rate": 4.9713712592934075e-06, + "loss": 0.1879, + "step": 712 + }, + { + "epoch": 0.23104342190537913, + "grad_norm": 1.0906344652175903, + "learning_rate": 4.971239126652135e-06, + "loss": 0.1755, + "step": 713 + }, + { + "epoch": 0.23136746597537264, + "grad_norm": 1.2198588848114014, + "learning_rate": 4.971106691555116e-06, + "loss": 0.198, + "step": 714 + }, + { + "epoch": 0.23169151004536617, + "grad_norm": 1.144925594329834, + "learning_rate": 4.9709739540185616e-06, + "loss": 0.1679, + "step": 715 + }, + { + "epoch": 0.23201555411535968, + "grad_norm": 1.0273765325546265, + "learning_rate": 4.970840914058716e-06, + "loss": 0.1588, + "step": 716 + }, + { + "epoch": 0.2323395981853532, + "grad_norm": 1.1109790802001953, + "learning_rate": 4.970707571691862e-06, + "loss": 0.1926, + "step": 717 + }, + { + "epoch": 0.23266364225534672, + "grad_norm": 1.127719759941101, + "learning_rate": 4.970573926934319e-06, + "loss": 0.1947, + "step": 718 + }, + { + "epoch": 0.23298768632534025, + "grad_norm": 1.1377744674682617, + "learning_rate": 4.970439979802445e-06, + "loss": 0.1811, + "step": 719 + }, + { + "epoch": 0.23331173039533376, + "grad_norm": 1.1827045679092407, + "learning_rate": 4.970305730312632e-06, + "loss": 0.1919, + "step": 720 + }, + { + "epoch": 0.2336357744653273, + "grad_norm": 1.0300381183624268, + "learning_rate": 4.9701711784813135e-06, + "loss": 0.1602, + "step": 721 + }, + { + "epoch": 0.2339598185353208, + "grad_norm": 1.158111810684204, + "learning_rate": 4.970036324324955e-06, + "loss": 0.1767, + "step": 722 + }, + { + "epoch": 0.23428386260531434, + "grad_norm": 1.1824687719345093, + "learning_rate": 4.969901167860063e-06, + "loss": 0.1798, + "step": 723 + }, + { + "epoch": 0.23460790667530784, + "grad_norm": 1.1232125759124756, + "learning_rate": 4.969765709103177e-06, + "loss": 0.1692, + "step": 724 + }, + { + "epoch": 0.23493195074530135, + "grad_norm": 1.1801738739013672, + "learning_rate": 4.9696299480708785e-06, + "loss": 0.1813, + "step": 725 + }, + { + "epoch": 0.23525599481529488, + "grad_norm": 1.1177990436553955, + "learning_rate": 4.969493884779783e-06, + "loss": 0.1867, + "step": 726 + }, + { + "epoch": 0.2355800388852884, + "grad_norm": 1.1591545343399048, + "learning_rate": 4.969357519246542e-06, + "loss": 0.1828, + "step": 727 + }, + { + "epoch": 0.23590408295528192, + "grad_norm": 1.1052945852279663, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.1872, + "step": 728 + }, + { + "epoch": 0.23622812702527543, + "grad_norm": 1.1025199890136719, + "learning_rate": 4.96908388152042e-06, + "loss": 0.1709, + "step": 729 + }, + { + "epoch": 0.23655217109526896, + "grad_norm": 1.0618928670883179, + "learning_rate": 4.968946609361031e-06, + "loss": 0.1665, + "step": 730 + }, + { + "epoch": 0.23687621516526247, + "grad_norm": 1.199231743812561, + "learning_rate": 4.968809035026477e-06, + "loss": 0.1885, + "step": 731 + }, + { + "epoch": 0.237200259235256, + "grad_norm": 1.0359363555908203, + "learning_rate": 4.968671158533599e-06, + "loss": 0.1678, + "step": 732 + }, + { + "epoch": 0.2375243033052495, + "grad_norm": 1.1739076375961304, + "learning_rate": 4.968532979899269e-06, + "loss": 0.1826, + "step": 733 + }, + { + "epoch": 0.23784834737524305, + "grad_norm": 1.2278923988342285, + "learning_rate": 4.9683944991403985e-06, + "loss": 0.1802, + "step": 734 + }, + { + "epoch": 0.23817239144523655, + "grad_norm": 1.165088415145874, + "learning_rate": 4.968255716273938e-06, + "loss": 0.1694, + "step": 735 + }, + { + "epoch": 0.23849643551523006, + "grad_norm": 1.1245152950286865, + "learning_rate": 4.968116631316873e-06, + "loss": 0.1769, + "step": 736 + }, + { + "epoch": 0.2388204795852236, + "grad_norm": 1.0869536399841309, + "learning_rate": 4.967977244286225e-06, + "loss": 0.1569, + "step": 737 + }, + { + "epoch": 0.2391445236552171, + "grad_norm": 1.4542876482009888, + "learning_rate": 4.967837555199054e-06, + "loss": 0.1847, + "step": 738 + }, + { + "epoch": 0.23946856772521063, + "grad_norm": 1.2349416017532349, + "learning_rate": 4.967697564072457e-06, + "loss": 0.1916, + "step": 739 + }, + { + "epoch": 0.23979261179520414, + "grad_norm": 1.1966452598571777, + "learning_rate": 4.9675572709235665e-06, + "loss": 0.1919, + "step": 740 + }, + { + "epoch": 0.24011665586519768, + "grad_norm": 1.1190122365951538, + "learning_rate": 4.967416675769555e-06, + "loss": 0.182, + "step": 741 + }, + { + "epoch": 0.24044069993519118, + "grad_norm": 1.224090814590454, + "learning_rate": 4.967275778627628e-06, + "loss": 0.1923, + "step": 742 + }, + { + "epoch": 0.24076474400518472, + "grad_norm": 1.151762843132019, + "learning_rate": 4.967134579515032e-06, + "loss": 0.1895, + "step": 743 + }, + { + "epoch": 0.24108878807517822, + "grad_norm": 1.1457250118255615, + "learning_rate": 4.966993078449046e-06, + "loss": 0.1753, + "step": 744 + }, + { + "epoch": 0.24141283214517173, + "grad_norm": 1.1521213054656982, + "learning_rate": 4.96685127544699e-06, + "loss": 0.1725, + "step": 745 + }, + { + "epoch": 0.24173687621516526, + "grad_norm": 1.084601640701294, + "learning_rate": 4.966709170526219e-06, + "loss": 0.1835, + "step": 746 + }, + { + "epoch": 0.24206092028515877, + "grad_norm": 1.0784162282943726, + "learning_rate": 4.966566763704124e-06, + "loss": 0.1975, + "step": 747 + }, + { + "epoch": 0.2423849643551523, + "grad_norm": 1.2336294651031494, + "learning_rate": 4.966424054998137e-06, + "loss": 0.1887, + "step": 748 + }, + { + "epoch": 0.2427090084251458, + "grad_norm": 1.1423250436782837, + "learning_rate": 4.966281044425722e-06, + "loss": 0.182, + "step": 749 + }, + { + "epoch": 0.24303305249513935, + "grad_norm": 1.0632977485656738, + "learning_rate": 4.9661377320043815e-06, + "loss": 0.1719, + "step": 750 + }, + { + "epoch": 0.24335709656513285, + "grad_norm": 1.178665280342102, + "learning_rate": 4.965994117751658e-06, + "loss": 0.1949, + "step": 751 + }, + { + "epoch": 0.2436811406351264, + "grad_norm": 1.1657874584197998, + "learning_rate": 4.965850201685126e-06, + "loss": 0.1817, + "step": 752 + }, + { + "epoch": 0.2440051847051199, + "grad_norm": 1.0570935010910034, + "learning_rate": 4.965705983822401e-06, + "loss": 0.1731, + "step": 753 + }, + { + "epoch": 0.24432922877511343, + "grad_norm": 1.027815818786621, + "learning_rate": 4.965561464181134e-06, + "loss": 0.164, + "step": 754 + }, + { + "epoch": 0.24465327284510693, + "grad_norm": 1.262858271598816, + "learning_rate": 4.965416642779012e-06, + "loss": 0.1866, + "step": 755 + }, + { + "epoch": 0.24497731691510044, + "grad_norm": 1.0563353300094604, + "learning_rate": 4.96527151963376e-06, + "loss": 0.1751, + "step": 756 + }, + { + "epoch": 0.24530136098509397, + "grad_norm": 0.9677377939224243, + "learning_rate": 4.9651260947631395e-06, + "loss": 0.1582, + "step": 757 + }, + { + "epoch": 0.24562540505508748, + "grad_norm": 1.0785053968429565, + "learning_rate": 4.9649803681849495e-06, + "loss": 0.1863, + "step": 758 + }, + { + "epoch": 0.24594944912508102, + "grad_norm": 1.1994812488555908, + "learning_rate": 4.9648343399170254e-06, + "loss": 0.1812, + "step": 759 + }, + { + "epoch": 0.24627349319507452, + "grad_norm": 1.0674124956130981, + "learning_rate": 4.964688009977239e-06, + "loss": 0.1596, + "step": 760 + }, + { + "epoch": 0.24659753726506806, + "grad_norm": 1.0390790700912476, + "learning_rate": 4.9645413783835006e-06, + "loss": 0.1614, + "step": 761 + }, + { + "epoch": 0.24692158133506156, + "grad_norm": 1.1015980243682861, + "learning_rate": 4.964394445153756e-06, + "loss": 0.1716, + "step": 762 + }, + { + "epoch": 0.2472456254050551, + "grad_norm": 1.1416356563568115, + "learning_rate": 4.964247210305989e-06, + "loss": 0.1701, + "step": 763 + }, + { + "epoch": 0.2475696694750486, + "grad_norm": 1.1309244632720947, + "learning_rate": 4.964099673858219e-06, + "loss": 0.1721, + "step": 764 + }, + { + "epoch": 0.24789371354504214, + "grad_norm": 1.2402899265289307, + "learning_rate": 4.963951835828503e-06, + "loss": 0.1881, + "step": 765 + }, + { + "epoch": 0.24821775761503564, + "grad_norm": 1.167535424232483, + "learning_rate": 4.963803696234935e-06, + "loss": 0.1874, + "step": 766 + }, + { + "epoch": 0.24854180168502915, + "grad_norm": 1.0330973863601685, + "learning_rate": 4.9636552550956465e-06, + "loss": 0.1794, + "step": 767 + }, + { + "epoch": 0.24886584575502269, + "grad_norm": 1.0965633392333984, + "learning_rate": 4.963506512428804e-06, + "loss": 0.1952, + "step": 768 + }, + { + "epoch": 0.2491898898250162, + "grad_norm": 1.0930819511413574, + "learning_rate": 4.963357468252614e-06, + "loss": 0.1896, + "step": 769 + }, + { + "epoch": 0.24951393389500973, + "grad_norm": 1.1125822067260742, + "learning_rate": 4.9632081225853165e-06, + "loss": 0.1961, + "step": 770 + }, + { + "epoch": 0.24983797796500323, + "grad_norm": 1.1762914657592773, + "learning_rate": 4.9630584754451906e-06, + "loss": 0.1833, + "step": 771 + }, + { + "epoch": 0.25016202203499677, + "grad_norm": 1.0865849256515503, + "learning_rate": 4.962908526850552e-06, + "loss": 0.1759, + "step": 772 + }, + { + "epoch": 0.2504860661049903, + "grad_norm": 1.1505688428878784, + "learning_rate": 4.962758276819752e-06, + "loss": 0.1835, + "step": 773 + }, + { + "epoch": 0.2508101101749838, + "grad_norm": 1.1822468042373657, + "learning_rate": 4.9626077253711805e-06, + "loss": 0.1771, + "step": 774 + }, + { + "epoch": 0.2511341542449773, + "grad_norm": 1.0646265745162964, + "learning_rate": 4.962456872523263e-06, + "loss": 0.1753, + "step": 775 + }, + { + "epoch": 0.25145819831497085, + "grad_norm": 1.2307766675949097, + "learning_rate": 4.962305718294462e-06, + "loss": 0.1814, + "step": 776 + }, + { + "epoch": 0.2517822423849643, + "grad_norm": 1.13232421875, + "learning_rate": 4.96215426270328e-06, + "loss": 0.1847, + "step": 777 + }, + { + "epoch": 0.25210628645495786, + "grad_norm": 1.1121189594268799, + "learning_rate": 4.962002505768251e-06, + "loss": 0.1823, + "step": 778 + }, + { + "epoch": 0.2524303305249514, + "grad_norm": 1.018474817276001, + "learning_rate": 4.961850447507948e-06, + "loss": 0.1638, + "step": 779 + }, + { + "epoch": 0.25275437459494493, + "grad_norm": 1.1667485237121582, + "learning_rate": 4.961698087940984e-06, + "loss": 0.187, + "step": 780 + }, + { + "epoch": 0.2530784186649384, + "grad_norm": 1.1762861013412476, + "learning_rate": 4.961545427086006e-06, + "loss": 0.1791, + "step": 781 + }, + { + "epoch": 0.25340246273493194, + "grad_norm": 1.076377034187317, + "learning_rate": 4.961392464961695e-06, + "loss": 0.1729, + "step": 782 + }, + { + "epoch": 0.2537265068049255, + "grad_norm": 1.1636652946472168, + "learning_rate": 4.961239201586776e-06, + "loss": 0.2067, + "step": 783 + }, + { + "epoch": 0.254050550874919, + "grad_norm": 1.1299415826797485, + "learning_rate": 4.961085636980005e-06, + "loss": 0.1841, + "step": 784 + }, + { + "epoch": 0.2543745949449125, + "grad_norm": 1.199076771736145, + "learning_rate": 4.960931771160177e-06, + "loss": 0.193, + "step": 785 + }, + { + "epoch": 0.254698639014906, + "grad_norm": 1.0781837701797485, + "learning_rate": 4.960777604146124e-06, + "loss": 0.1701, + "step": 786 + }, + { + "epoch": 0.25502268308489956, + "grad_norm": 1.0722193717956543, + "learning_rate": 4.9606231359567146e-06, + "loss": 0.1797, + "step": 787 + }, + { + "epoch": 0.25534672715489304, + "grad_norm": 1.0604737997055054, + "learning_rate": 4.960468366610854e-06, + "loss": 0.1689, + "step": 788 + }, + { + "epoch": 0.2556707712248866, + "grad_norm": 1.0014824867248535, + "learning_rate": 4.960313296127485e-06, + "loss": 0.1559, + "step": 789 + }, + { + "epoch": 0.2559948152948801, + "grad_norm": 1.0583043098449707, + "learning_rate": 4.960157924525585e-06, + "loss": 0.1781, + "step": 790 + }, + { + "epoch": 0.25631885936487364, + "grad_norm": 1.066277265548706, + "learning_rate": 4.960002251824172e-06, + "loss": 0.1762, + "step": 791 + }, + { + "epoch": 0.2566429034348671, + "grad_norm": 1.135353922843933, + "learning_rate": 4.959846278042298e-06, + "loss": 0.1952, + "step": 792 + }, + { + "epoch": 0.25696694750486065, + "grad_norm": 1.230164647102356, + "learning_rate": 4.959690003199052e-06, + "loss": 0.1929, + "step": 793 + }, + { + "epoch": 0.2572909915748542, + "grad_norm": 1.0832064151763916, + "learning_rate": 4.959533427313562e-06, + "loss": 0.1643, + "step": 794 + }, + { + "epoch": 0.2576150356448477, + "grad_norm": 1.0479978322982788, + "learning_rate": 4.95937655040499e-06, + "loss": 0.1645, + "step": 795 + }, + { + "epoch": 0.2579390797148412, + "grad_norm": 1.0821533203125, + "learning_rate": 4.959219372492539e-06, + "loss": 0.1767, + "step": 796 + }, + { + "epoch": 0.25826312378483474, + "grad_norm": 1.1445401906967163, + "learning_rate": 4.9590618935954415e-06, + "loss": 0.1909, + "step": 797 + }, + { + "epoch": 0.25858716785482827, + "grad_norm": 1.0938869714736938, + "learning_rate": 4.958904113732975e-06, + "loss": 0.1642, + "step": 798 + }, + { + "epoch": 0.25891121192482175, + "grad_norm": 1.0983704328536987, + "learning_rate": 4.958746032924449e-06, + "loss": 0.1801, + "step": 799 + }, + { + "epoch": 0.2592352559948153, + "grad_norm": 1.0518333911895752, + "learning_rate": 4.95858765118921e-06, + "loss": 0.1786, + "step": 800 + }, + { + "epoch": 0.2595593000648088, + "grad_norm": 1.1234681606292725, + "learning_rate": 4.9584289685466444e-06, + "loss": 0.1826, + "step": 801 + }, + { + "epoch": 0.25988334413480235, + "grad_norm": 1.0565016269683838, + "learning_rate": 4.958269985016172e-06, + "loss": 0.165, + "step": 802 + }, + { + "epoch": 0.26020738820479583, + "grad_norm": 1.0922951698303223, + "learning_rate": 4.958110700617251e-06, + "loss": 0.1702, + "step": 803 + }, + { + "epoch": 0.26053143227478937, + "grad_norm": 1.077223539352417, + "learning_rate": 4.957951115369378e-06, + "loss": 0.1686, + "step": 804 + }, + { + "epoch": 0.2608554763447829, + "grad_norm": 1.0731221437454224, + "learning_rate": 4.957791229292082e-06, + "loss": 0.1793, + "step": 805 + }, + { + "epoch": 0.26117952041477643, + "grad_norm": 1.3346308469772339, + "learning_rate": 4.957631042404934e-06, + "loss": 0.1741, + "step": 806 + }, + { + "epoch": 0.2615035644847699, + "grad_norm": 1.1206022500991821, + "learning_rate": 4.957470554727536e-06, + "loss": 0.1904, + "step": 807 + }, + { + "epoch": 0.26182760855476345, + "grad_norm": 1.0771183967590332, + "learning_rate": 4.9573097662795344e-06, + "loss": 0.1658, + "step": 808 + }, + { + "epoch": 0.262151652624757, + "grad_norm": 1.1210774183273315, + "learning_rate": 4.957148677080605e-06, + "loss": 0.1886, + "step": 809 + }, + { + "epoch": 0.26247569669475046, + "grad_norm": 1.1223516464233398, + "learning_rate": 4.956987287150465e-06, + "loss": 0.1809, + "step": 810 + }, + { + "epoch": 0.262799740764744, + "grad_norm": 1.106168508529663, + "learning_rate": 4.956825596508867e-06, + "loss": 0.17, + "step": 811 + }, + { + "epoch": 0.26312378483473753, + "grad_norm": 1.1239475011825562, + "learning_rate": 4.956663605175599e-06, + "loss": 0.192, + "step": 812 + }, + { + "epoch": 0.26344782890473106, + "grad_norm": 1.1581978797912598, + "learning_rate": 4.956501313170487e-06, + "loss": 0.1939, + "step": 813 + }, + { + "epoch": 0.26377187297472454, + "grad_norm": 1.0884262323379517, + "learning_rate": 4.956338720513397e-06, + "loss": 0.1865, + "step": 814 + }, + { + "epoch": 0.2640959170447181, + "grad_norm": 1.0481289625167847, + "learning_rate": 4.956175827224226e-06, + "loss": 0.1759, + "step": 815 + }, + { + "epoch": 0.2644199611147116, + "grad_norm": 1.0824658870697021, + "learning_rate": 4.956012633322912e-06, + "loss": 0.1657, + "step": 816 + }, + { + "epoch": 0.26474400518470514, + "grad_norm": 1.0250821113586426, + "learning_rate": 4.955849138829428e-06, + "loss": 0.1563, + "step": 817 + }, + { + "epoch": 0.2650680492546986, + "grad_norm": 1.0275894403457642, + "learning_rate": 4.955685343763782e-06, + "loss": 0.172, + "step": 818 + }, + { + "epoch": 0.26539209332469216, + "grad_norm": 1.0740388631820679, + "learning_rate": 4.9555212481460245e-06, + "loss": 0.1767, + "step": 819 + }, + { + "epoch": 0.2657161373946857, + "grad_norm": 1.1377601623535156, + "learning_rate": 4.955356851996236e-06, + "loss": 0.1786, + "step": 820 + }, + { + "epoch": 0.26604018146467917, + "grad_norm": 1.0854454040527344, + "learning_rate": 4.955192155334539e-06, + "loss": 0.182, + "step": 821 + }, + { + "epoch": 0.2663642255346727, + "grad_norm": 1.1744272708892822, + "learning_rate": 4.955027158181092e-06, + "loss": 0.1882, + "step": 822 + }, + { + "epoch": 0.26668826960466624, + "grad_norm": 1.0906330347061157, + "learning_rate": 4.9548618605560855e-06, + "loss": 0.1832, + "step": 823 + }, + { + "epoch": 0.2670123136746598, + "grad_norm": 1.0744832754135132, + "learning_rate": 4.954696262479753e-06, + "loss": 0.1705, + "step": 824 + }, + { + "epoch": 0.26733635774465325, + "grad_norm": 1.241999864578247, + "learning_rate": 4.954530363972361e-06, + "loss": 0.1893, + "step": 825 + }, + { + "epoch": 0.2676604018146468, + "grad_norm": 1.0803509950637817, + "learning_rate": 4.954364165054214e-06, + "loss": 0.1735, + "step": 826 + }, + { + "epoch": 0.2679844458846403, + "grad_norm": 1.1593369245529175, + "learning_rate": 4.9541976657456535e-06, + "loss": 0.166, + "step": 827 + }, + { + "epoch": 0.26830848995463386, + "grad_norm": 1.1206544637680054, + "learning_rate": 4.954030866067057e-06, + "loss": 0.1675, + "step": 828 + }, + { + "epoch": 0.26863253402462733, + "grad_norm": 1.1313645839691162, + "learning_rate": 4.95386376603884e-06, + "loss": 0.1818, + "step": 829 + }, + { + "epoch": 0.26895657809462087, + "grad_norm": 1.0753988027572632, + "learning_rate": 4.953696365681452e-06, + "loss": 0.1762, + "step": 830 + }, + { + "epoch": 0.2692806221646144, + "grad_norm": 1.1497502326965332, + "learning_rate": 4.953528665015383e-06, + "loss": 0.173, + "step": 831 + }, + { + "epoch": 0.2696046662346079, + "grad_norm": 1.0890166759490967, + "learning_rate": 4.953360664061159e-06, + "loss": 0.1823, + "step": 832 + }, + { + "epoch": 0.2699287103046014, + "grad_norm": 1.1478276252746582, + "learning_rate": 4.953192362839338e-06, + "loss": 0.1752, + "step": 833 + }, + { + "epoch": 0.27025275437459495, + "grad_norm": 1.0240693092346191, + "learning_rate": 4.953023761370521e-06, + "loss": 0.1663, + "step": 834 + }, + { + "epoch": 0.2705767984445885, + "grad_norm": 1.083274006843567, + "learning_rate": 4.952854859675343e-06, + "loss": 0.1947, + "step": 835 + }, + { + "epoch": 0.27090084251458196, + "grad_norm": 1.0814635753631592, + "learning_rate": 4.952685657774476e-06, + "loss": 0.1757, + "step": 836 + }, + { + "epoch": 0.2712248865845755, + "grad_norm": 1.0839900970458984, + "learning_rate": 4.952516155688628e-06, + "loss": 0.1824, + "step": 837 + }, + { + "epoch": 0.27154893065456903, + "grad_norm": 1.1437290906906128, + "learning_rate": 4.9523463534385444e-06, + "loss": 0.1772, + "step": 838 + }, + { + "epoch": 0.27187297472456257, + "grad_norm": 1.006161093711853, + "learning_rate": 4.952176251045008e-06, + "loss": 0.1686, + "step": 839 + }, + { + "epoch": 0.27219701879455604, + "grad_norm": 1.0428589582443237, + "learning_rate": 4.952005848528838e-06, + "loss": 0.1675, + "step": 840 + }, + { + "epoch": 0.2725210628645496, + "grad_norm": 1.1923826932907104, + "learning_rate": 4.951835145910888e-06, + "loss": 0.1846, + "step": 841 + }, + { + "epoch": 0.2728451069345431, + "grad_norm": 1.1122922897338867, + "learning_rate": 4.951664143212053e-06, + "loss": 0.1819, + "step": 842 + }, + { + "epoch": 0.2731691510045366, + "grad_norm": 1.1033662557601929, + "learning_rate": 4.95149284045326e-06, + "loss": 0.161, + "step": 843 + }, + { + "epoch": 0.2734931950745301, + "grad_norm": 1.053223967552185, + "learning_rate": 4.951321237655477e-06, + "loss": 0.1905, + "step": 844 + }, + { + "epoch": 0.27381723914452366, + "grad_norm": 1.1286718845367432, + "learning_rate": 4.951149334839703e-06, + "loss": 0.1857, + "step": 845 + }, + { + "epoch": 0.2741412832145172, + "grad_norm": 1.0990140438079834, + "learning_rate": 4.950977132026981e-06, + "loss": 0.1918, + "step": 846 + }, + { + "epoch": 0.2744653272845107, + "grad_norm": 1.0953593254089355, + "learning_rate": 4.9508046292383846e-06, + "loss": 0.1701, + "step": 847 + }, + { + "epoch": 0.2747893713545042, + "grad_norm": 1.0887120962142944, + "learning_rate": 4.950631826495027e-06, + "loss": 0.1678, + "step": 848 + }, + { + "epoch": 0.27511341542449774, + "grad_norm": 1.1221007108688354, + "learning_rate": 4.950458723818058e-06, + "loss": 0.1741, + "step": 849 + }, + { + "epoch": 0.2754374594944913, + "grad_norm": 1.0366301536560059, + "learning_rate": 4.950285321228664e-06, + "loss": 0.1573, + "step": 850 + }, + { + "epoch": 0.27576150356448476, + "grad_norm": 1.098291277885437, + "learning_rate": 4.950111618748067e-06, + "loss": 0.1849, + "step": 851 + }, + { + "epoch": 0.2760855476344783, + "grad_norm": 1.1351509094238281, + "learning_rate": 4.949937616397527e-06, + "loss": 0.1765, + "step": 852 + }, + { + "epoch": 0.2764095917044718, + "grad_norm": 1.0571086406707764, + "learning_rate": 4.949763314198339e-06, + "loss": 0.1761, + "step": 853 + }, + { + "epoch": 0.2767336357744653, + "grad_norm": 1.0983611345291138, + "learning_rate": 4.949588712171838e-06, + "loss": 0.1784, + "step": 854 + }, + { + "epoch": 0.27705767984445884, + "grad_norm": 1.1928962469100952, + "learning_rate": 4.949413810339392e-06, + "loss": 0.1795, + "step": 855 + }, + { + "epoch": 0.27738172391445237, + "grad_norm": 1.1288187503814697, + "learning_rate": 4.949238608722408e-06, + "loss": 0.1989, + "step": 856 + }, + { + "epoch": 0.2777057679844459, + "grad_norm": 1.1986991167068481, + "learning_rate": 4.949063107342329e-06, + "loss": 0.196, + "step": 857 + }, + { + "epoch": 0.2780298120544394, + "grad_norm": 1.0543721914291382, + "learning_rate": 4.948887306220634e-06, + "loss": 0.1713, + "step": 858 + }, + { + "epoch": 0.2783538561244329, + "grad_norm": 1.0088876485824585, + "learning_rate": 4.94871120537884e-06, + "loss": 0.1732, + "step": 859 + }, + { + "epoch": 0.27867790019442645, + "grad_norm": 1.0996417999267578, + "learning_rate": 4.9485348048385e-06, + "loss": 0.1724, + "step": 860 + }, + { + "epoch": 0.27900194426442, + "grad_norm": 1.0093612670898438, + "learning_rate": 4.9483581046212025e-06, + "loss": 0.1711, + "step": 861 + }, + { + "epoch": 0.27932598833441347, + "grad_norm": 1.0933109521865845, + "learning_rate": 4.948181104748576e-06, + "loss": 0.1835, + "step": 862 + }, + { + "epoch": 0.279650032404407, + "grad_norm": 1.0886743068695068, + "learning_rate": 4.948003805242282e-06, + "loss": 0.1663, + "step": 863 + }, + { + "epoch": 0.27997407647440054, + "grad_norm": 1.0580806732177734, + "learning_rate": 4.9478262061240216e-06, + "loss": 0.179, + "step": 864 + }, + { + "epoch": 0.280298120544394, + "grad_norm": 1.0770388841629028, + "learning_rate": 4.947648307415529e-06, + "loss": 0.1539, + "step": 865 + }, + { + "epoch": 0.28062216461438755, + "grad_norm": 1.1204828023910522, + "learning_rate": 4.947470109138579e-06, + "loss": 0.1519, + "step": 866 + }, + { + "epoch": 0.2809462086843811, + "grad_norm": 1.175567388534546, + "learning_rate": 4.947291611314981e-06, + "loss": 0.1882, + "step": 867 + }, + { + "epoch": 0.2812702527543746, + "grad_norm": 1.0687494277954102, + "learning_rate": 4.9471128139665826e-06, + "loss": 0.1696, + "step": 868 + }, + { + "epoch": 0.2815942968243681, + "grad_norm": 1.1533077955245972, + "learning_rate": 4.9469337171152645e-06, + "loss": 0.185, + "step": 869 + }, + { + "epoch": 0.28191834089436163, + "grad_norm": 1.1578819751739502, + "learning_rate": 4.946754320782948e-06, + "loss": 0.186, + "step": 870 + }, + { + "epoch": 0.28224238496435516, + "grad_norm": 1.2411943674087524, + "learning_rate": 4.946574624991589e-06, + "loss": 0.1921, + "step": 871 + }, + { + "epoch": 0.2825664290343487, + "grad_norm": 0.9703081250190735, + "learning_rate": 4.946394629763181e-06, + "loss": 0.1565, + "step": 872 + }, + { + "epoch": 0.2828904731043422, + "grad_norm": 1.061703085899353, + "learning_rate": 4.946214335119752e-06, + "loss": 0.1664, + "step": 873 + }, + { + "epoch": 0.2832145171743357, + "grad_norm": 1.0972583293914795, + "learning_rate": 4.94603374108337e-06, + "loss": 0.1915, + "step": 874 + }, + { + "epoch": 0.28353856124432925, + "grad_norm": 1.1283047199249268, + "learning_rate": 4.945852847676138e-06, + "loss": 0.1836, + "step": 875 + }, + { + "epoch": 0.2838626053143227, + "grad_norm": 1.1044758558273315, + "learning_rate": 4.945671654920195e-06, + "loss": 0.17, + "step": 876 + }, + { + "epoch": 0.28418664938431626, + "grad_norm": 1.046217679977417, + "learning_rate": 4.945490162837718e-06, + "loss": 0.1671, + "step": 877 + }, + { + "epoch": 0.2845106934543098, + "grad_norm": 1.0820338726043701, + "learning_rate": 4.945308371450919e-06, + "loss": 0.173, + "step": 878 + }, + { + "epoch": 0.2848347375243033, + "grad_norm": 1.1840169429779053, + "learning_rate": 4.945126280782047e-06, + "loss": 0.1952, + "step": 879 + }, + { + "epoch": 0.2851587815942968, + "grad_norm": 1.0911513566970825, + "learning_rate": 4.944943890853389e-06, + "loss": 0.1781, + "step": 880 + }, + { + "epoch": 0.28548282566429034, + "grad_norm": 1.0861856937408447, + "learning_rate": 4.944761201687268e-06, + "loss": 0.1661, + "step": 881 + }, + { + "epoch": 0.2858068697342839, + "grad_norm": 1.0911214351654053, + "learning_rate": 4.944578213306043e-06, + "loss": 0.189, + "step": 882 + }, + { + "epoch": 0.2861309138042774, + "grad_norm": 1.0918217897415161, + "learning_rate": 4.94439492573211e-06, + "loss": 0.1879, + "step": 883 + }, + { + "epoch": 0.2864549578742709, + "grad_norm": 1.1766140460968018, + "learning_rate": 4.944211338987901e-06, + "loss": 0.1774, + "step": 884 + }, + { + "epoch": 0.2867790019442644, + "grad_norm": 1.0490213632583618, + "learning_rate": 4.944027453095887e-06, + "loss": 0.1666, + "step": 885 + }, + { + "epoch": 0.28710304601425796, + "grad_norm": 1.2016412019729614, + "learning_rate": 4.943843268078572e-06, + "loss": 0.1794, + "step": 886 + }, + { + "epoch": 0.28742709008425144, + "grad_norm": 1.0137540102005005, + "learning_rate": 4.9436587839585e-06, + "loss": 0.1671, + "step": 887 + }, + { + "epoch": 0.28775113415424497, + "grad_norm": 1.1008466482162476, + "learning_rate": 4.9434740007582485e-06, + "loss": 0.1864, + "step": 888 + }, + { + "epoch": 0.2880751782242385, + "grad_norm": 1.0474399328231812, + "learning_rate": 4.943288918500434e-06, + "loss": 0.1699, + "step": 889 + }, + { + "epoch": 0.28839922229423204, + "grad_norm": 0.9579008221626282, + "learning_rate": 4.94310353720771e-06, + "loss": 0.1554, + "step": 890 + }, + { + "epoch": 0.2887232663642255, + "grad_norm": 1.095854640007019, + "learning_rate": 4.942917856902763e-06, + "loss": 0.187, + "step": 891 + }, + { + "epoch": 0.28904731043421905, + "grad_norm": 1.0819822549819946, + "learning_rate": 4.942731877608319e-06, + "loss": 0.1728, + "step": 892 + }, + { + "epoch": 0.2893713545042126, + "grad_norm": 1.2087551355361938, + "learning_rate": 4.942545599347142e-06, + "loss": 0.1852, + "step": 893 + }, + { + "epoch": 0.28969539857420606, + "grad_norm": 1.020182490348816, + "learning_rate": 4.942359022142028e-06, + "loss": 0.1591, + "step": 894 + }, + { + "epoch": 0.2900194426441996, + "grad_norm": 1.0779310464859009, + "learning_rate": 4.942172146015814e-06, + "loss": 0.1695, + "step": 895 + }, + { + "epoch": 0.29034348671419313, + "grad_norm": 1.0906720161437988, + "learning_rate": 4.941984970991372e-06, + "loss": 0.1781, + "step": 896 + }, + { + "epoch": 0.29066753078418667, + "grad_norm": 0.9882294535636902, + "learning_rate": 4.9417974970916096e-06, + "loss": 0.164, + "step": 897 + }, + { + "epoch": 0.29099157485418015, + "grad_norm": 1.0383557081222534, + "learning_rate": 4.9416097243394725e-06, + "loss": 0.183, + "step": 898 + }, + { + "epoch": 0.2913156189241737, + "grad_norm": 1.0150779485702515, + "learning_rate": 4.94142165275794e-06, + "loss": 0.1712, + "step": 899 + }, + { + "epoch": 0.2916396629941672, + "grad_norm": 1.071306824684143, + "learning_rate": 4.941233282370034e-06, + "loss": 0.1839, + "step": 900 + }, + { + "epoch": 0.29196370706416075, + "grad_norm": 0.9918048977851868, + "learning_rate": 4.941044613198807e-06, + "loss": 0.172, + "step": 901 + }, + { + "epoch": 0.29228775113415423, + "grad_norm": 0.9918202757835388, + "learning_rate": 4.940855645267349e-06, + "loss": 0.1625, + "step": 902 + }, + { + "epoch": 0.29261179520414776, + "grad_norm": 1.1438778638839722, + "learning_rate": 4.94066637859879e-06, + "loss": 0.1832, + "step": 903 + }, + { + "epoch": 0.2929358392741413, + "grad_norm": 1.005126953125, + "learning_rate": 4.940476813216294e-06, + "loss": 0.1769, + "step": 904 + }, + { + "epoch": 0.2932598833441348, + "grad_norm": 1.1077789068222046, + "learning_rate": 4.940286949143061e-06, + "loss": 0.179, + "step": 905 + }, + { + "epoch": 0.2935839274141283, + "grad_norm": 1.115727186203003, + "learning_rate": 4.940096786402331e-06, + "loss": 0.1653, + "step": 906 + }, + { + "epoch": 0.29390797148412184, + "grad_norm": 1.1475930213928223, + "learning_rate": 4.939906325017374e-06, + "loss": 0.183, + "step": 907 + }, + { + "epoch": 0.2942320155541154, + "grad_norm": 1.0840696096420288, + "learning_rate": 4.939715565011504e-06, + "loss": 0.1747, + "step": 908 + }, + { + "epoch": 0.29455605962410886, + "grad_norm": 1.0846959352493286, + "learning_rate": 4.939524506408068e-06, + "loss": 0.1763, + "step": 909 + }, + { + "epoch": 0.2948801036941024, + "grad_norm": 1.0708680152893066, + "learning_rate": 4.939333149230447e-06, + "loss": 0.1832, + "step": 910 + }, + { + "epoch": 0.2952041477640959, + "grad_norm": 1.0281907320022583, + "learning_rate": 4.9391414935020656e-06, + "loss": 0.1721, + "step": 911 + }, + { + "epoch": 0.29552819183408946, + "grad_norm": 0.8992030620574951, + "learning_rate": 4.938949539246376e-06, + "loss": 0.1537, + "step": 912 + }, + { + "epoch": 0.29585223590408294, + "grad_norm": 1.1132668256759644, + "learning_rate": 4.938757286486874e-06, + "loss": 0.1725, + "step": 913 + }, + { + "epoch": 0.2961762799740765, + "grad_norm": 1.146109700202942, + "learning_rate": 4.93856473524709e-06, + "loss": 0.179, + "step": 914 + }, + { + "epoch": 0.29650032404407, + "grad_norm": 0.9988548755645752, + "learning_rate": 4.938371885550589e-06, + "loss": 0.1581, + "step": 915 + }, + { + "epoch": 0.2968243681140635, + "grad_norm": 1.0798379182815552, + "learning_rate": 4.938178737420974e-06, + "loss": 0.1761, + "step": 916 + }, + { + "epoch": 0.297148412184057, + "grad_norm": 1.0951101779937744, + "learning_rate": 4.937985290881886e-06, + "loss": 0.1619, + "step": 917 + }, + { + "epoch": 0.29747245625405055, + "grad_norm": 0.9965536594390869, + "learning_rate": 4.9377915459569995e-06, + "loss": 0.1564, + "step": 918 + }, + { + "epoch": 0.2977965003240441, + "grad_norm": 1.1427702903747559, + "learning_rate": 4.937597502670027e-06, + "loss": 0.1853, + "step": 919 + }, + { + "epoch": 0.29812054439403757, + "grad_norm": 1.0869373083114624, + "learning_rate": 4.9374031610447185e-06, + "loss": 0.1657, + "step": 920 + }, + { + "epoch": 0.2984445884640311, + "grad_norm": 1.1138787269592285, + "learning_rate": 4.937208521104858e-06, + "loss": 0.1665, + "step": 921 + }, + { + "epoch": 0.29876863253402464, + "grad_norm": 1.1613776683807373, + "learning_rate": 4.937013582874269e-06, + "loss": 0.1818, + "step": 922 + }, + { + "epoch": 0.29909267660401817, + "grad_norm": 1.084401249885559, + "learning_rate": 4.93681834637681e-06, + "loss": 0.1811, + "step": 923 + }, + { + "epoch": 0.29941672067401165, + "grad_norm": 1.1521601676940918, + "learning_rate": 4.936622811636376e-06, + "loss": 0.1704, + "step": 924 + }, + { + "epoch": 0.2997407647440052, + "grad_norm": 1.006294846534729, + "learning_rate": 4.936426978676897e-06, + "loss": 0.1649, + "step": 925 + }, + { + "epoch": 0.3000648088139987, + "grad_norm": 1.1440045833587646, + "learning_rate": 4.936230847522343e-06, + "loss": 0.179, + "step": 926 + }, + { + "epoch": 0.3003888528839922, + "grad_norm": 1.0526880025863647, + "learning_rate": 4.936034418196718e-06, + "loss": 0.1828, + "step": 927 + }, + { + "epoch": 0.30071289695398573, + "grad_norm": 1.721049427986145, + "learning_rate": 4.935837690724063e-06, + "loss": 0.176, + "step": 928 + }, + { + "epoch": 0.30103694102397927, + "grad_norm": 1.0776753425598145, + "learning_rate": 4.935640665128454e-06, + "loss": 0.1637, + "step": 929 + }, + { + "epoch": 0.3013609850939728, + "grad_norm": 1.0923445224761963, + "learning_rate": 4.935443341434008e-06, + "loss": 0.1707, + "step": 930 + }, + { + "epoch": 0.3016850291639663, + "grad_norm": 1.0009198188781738, + "learning_rate": 4.935245719664873e-06, + "loss": 0.1715, + "step": 931 + }, + { + "epoch": 0.3020090732339598, + "grad_norm": 1.0673625469207764, + "learning_rate": 4.935047799845238e-06, + "loss": 0.1711, + "step": 932 + }, + { + "epoch": 0.30233311730395335, + "grad_norm": 1.2885209321975708, + "learning_rate": 4.9348495819993235e-06, + "loss": 0.1728, + "step": 933 + }, + { + "epoch": 0.3026571613739469, + "grad_norm": 1.1276648044586182, + "learning_rate": 4.9346510661513924e-06, + "loss": 0.1873, + "step": 934 + }, + { + "epoch": 0.30298120544394036, + "grad_norm": 1.1598179340362549, + "learning_rate": 4.93445225232574e-06, + "loss": 0.1845, + "step": 935 + }, + { + "epoch": 0.3033052495139339, + "grad_norm": 1.1591609716415405, + "learning_rate": 4.9342531405467e-06, + "loss": 0.1778, + "step": 936 + }, + { + "epoch": 0.30362929358392743, + "grad_norm": 1.0711930990219116, + "learning_rate": 4.934053730838639e-06, + "loss": 0.1662, + "step": 937 + }, + { + "epoch": 0.3039533376539209, + "grad_norm": 1.1723405122756958, + "learning_rate": 4.9338540232259664e-06, + "loss": 0.177, + "step": 938 + }, + { + "epoch": 0.30427738172391444, + "grad_norm": 1.033252239227295, + "learning_rate": 4.9336540177331225e-06, + "loss": 0.1706, + "step": 939 + }, + { + "epoch": 0.304601425793908, + "grad_norm": 1.1548298597335815, + "learning_rate": 4.9334537143845876e-06, + "loss": 0.1886, + "step": 940 + }, + { + "epoch": 0.3049254698639015, + "grad_norm": 1.0255663394927979, + "learning_rate": 4.933253113204874e-06, + "loss": 0.18, + "step": 941 + }, + { + "epoch": 0.305249513933895, + "grad_norm": 1.3654695749282837, + "learning_rate": 4.933052214218535e-06, + "loss": 0.1961, + "step": 942 + }, + { + "epoch": 0.3055735580038885, + "grad_norm": 1.3150925636291504, + "learning_rate": 4.93285101745016e-06, + "loss": 0.178, + "step": 943 + }, + { + "epoch": 0.30589760207388206, + "grad_norm": 1.116111397743225, + "learning_rate": 4.932649522924372e-06, + "loss": 0.1777, + "step": 944 + }, + { + "epoch": 0.3062216461438756, + "grad_norm": 1.0210191011428833, + "learning_rate": 4.932447730665832e-06, + "loss": 0.1585, + "step": 945 + }, + { + "epoch": 0.30654569021386907, + "grad_norm": 0.9345679879188538, + "learning_rate": 4.932245640699238e-06, + "loss": 0.1651, + "step": 946 + }, + { + "epoch": 0.3068697342838626, + "grad_norm": 1.1611742973327637, + "learning_rate": 4.932043253049323e-06, + "loss": 0.1863, + "step": 947 + }, + { + "epoch": 0.30719377835385614, + "grad_norm": 1.1058744192123413, + "learning_rate": 4.931840567740858e-06, + "loss": 0.1713, + "step": 948 + }, + { + "epoch": 0.3075178224238496, + "grad_norm": 1.1998214721679688, + "learning_rate": 4.93163758479865e-06, + "loss": 0.2018, + "step": 949 + }, + { + "epoch": 0.30784186649384315, + "grad_norm": 1.105220079421997, + "learning_rate": 4.931434304247541e-06, + "loss": 0.1756, + "step": 950 + }, + { + "epoch": 0.3081659105638367, + "grad_norm": 1.0230122804641724, + "learning_rate": 4.931230726112412e-06, + "loss": 0.1769, + "step": 951 + }, + { + "epoch": 0.3084899546338302, + "grad_norm": 1.0697253942489624, + "learning_rate": 4.9310268504181764e-06, + "loss": 0.1809, + "step": 952 + }, + { + "epoch": 0.3088139987038237, + "grad_norm": 1.0562597513198853, + "learning_rate": 4.930822677189791e-06, + "loss": 0.161, + "step": 953 + }, + { + "epoch": 0.30913804277381723, + "grad_norm": 0.9493795037269592, + "learning_rate": 4.93061820645224e-06, + "loss": 0.1547, + "step": 954 + }, + { + "epoch": 0.30946208684381077, + "grad_norm": 0.9875866770744324, + "learning_rate": 4.930413438230552e-06, + "loss": 0.17, + "step": 955 + }, + { + "epoch": 0.3097861309138043, + "grad_norm": 0.9486485123634338, + "learning_rate": 4.930208372549787e-06, + "loss": 0.1583, + "step": 956 + }, + { + "epoch": 0.3101101749837978, + "grad_norm": 1.0512800216674805, + "learning_rate": 4.930003009435043e-06, + "loss": 0.1717, + "step": 957 + }, + { + "epoch": 0.3104342190537913, + "grad_norm": 0.9418579936027527, + "learning_rate": 4.9297973489114565e-06, + "loss": 0.1573, + "step": 958 + }, + { + "epoch": 0.31075826312378485, + "grad_norm": 1.1517508029937744, + "learning_rate": 4.929591391004196e-06, + "loss": 0.1868, + "step": 959 + }, + { + "epoch": 0.31108230719377833, + "grad_norm": 1.0254417657852173, + "learning_rate": 4.929385135738469e-06, + "loss": 0.1613, + "step": 960 + }, + { + "epoch": 0.31140635126377186, + "grad_norm": 1.1666157245635986, + "learning_rate": 4.92917858313952e-06, + "loss": 0.1892, + "step": 961 + }, + { + "epoch": 0.3117303953337654, + "grad_norm": 1.1078205108642578, + "learning_rate": 4.928971733232628e-06, + "loss": 0.1685, + "step": 962 + }, + { + "epoch": 0.31205443940375893, + "grad_norm": 1.0600954294204712, + "learning_rate": 4.928764586043111e-06, + "loss": 0.1754, + "step": 963 + }, + { + "epoch": 0.3123784834737524, + "grad_norm": 1.190826177597046, + "learning_rate": 4.9285571415963205e-06, + "loss": 0.1771, + "step": 964 + }, + { + "epoch": 0.31270252754374595, + "grad_norm": 1.0361576080322266, + "learning_rate": 4.928349399917646e-06, + "loss": 0.1838, + "step": 965 + }, + { + "epoch": 0.3130265716137395, + "grad_norm": 0.980185329914093, + "learning_rate": 4.928141361032513e-06, + "loss": 0.1643, + "step": 966 + }, + { + "epoch": 0.313350615683733, + "grad_norm": 1.0426617860794067, + "learning_rate": 4.927933024966385e-06, + "loss": 0.1656, + "step": 967 + }, + { + "epoch": 0.3136746597537265, + "grad_norm": 1.0412538051605225, + "learning_rate": 4.927724391744758e-06, + "loss": 0.1661, + "step": 968 + }, + { + "epoch": 0.31399870382372, + "grad_norm": 1.0205472707748413, + "learning_rate": 4.927515461393167e-06, + "loss": 0.1811, + "step": 969 + }, + { + "epoch": 0.31432274789371356, + "grad_norm": 0.9813237190246582, + "learning_rate": 4.927306233937185e-06, + "loss": 0.1561, + "step": 970 + }, + { + "epoch": 0.31464679196370704, + "grad_norm": 1.0640448331832886, + "learning_rate": 4.927096709402417e-06, + "loss": 0.1701, + "step": 971 + }, + { + "epoch": 0.3149708360337006, + "grad_norm": 1.0201969146728516, + "learning_rate": 4.926886887814509e-06, + "loss": 0.1727, + "step": 972 + }, + { + "epoch": 0.3152948801036941, + "grad_norm": 1.0877408981323242, + "learning_rate": 4.926676769199139e-06, + "loss": 0.171, + "step": 973 + }, + { + "epoch": 0.31561892417368764, + "grad_norm": 1.0435376167297363, + "learning_rate": 4.9264663535820256e-06, + "loss": 0.1806, + "step": 974 + }, + { + "epoch": 0.3159429682436811, + "grad_norm": 1.0373156070709229, + "learning_rate": 4.926255640988919e-06, + "loss": 0.178, + "step": 975 + }, + { + "epoch": 0.31626701231367466, + "grad_norm": 1.0691494941711426, + "learning_rate": 4.926044631445611e-06, + "loss": 0.175, + "step": 976 + }, + { + "epoch": 0.3165910563836682, + "grad_norm": 0.9887287616729736, + "learning_rate": 4.925833324977926e-06, + "loss": 0.1791, + "step": 977 + }, + { + "epoch": 0.3169151004536617, + "grad_norm": 1.038283109664917, + "learning_rate": 4.925621721611726e-06, + "loss": 0.1734, + "step": 978 + }, + { + "epoch": 0.3172391445236552, + "grad_norm": 1.0908006429672241, + "learning_rate": 4.925409821372908e-06, + "loss": 0.1838, + "step": 979 + }, + { + "epoch": 0.31756318859364874, + "grad_norm": 1.0472310781478882, + "learning_rate": 4.925197624287409e-06, + "loss": 0.191, + "step": 980 + }, + { + "epoch": 0.31788723266364227, + "grad_norm": 1.0285770893096924, + "learning_rate": 4.924985130381198e-06, + "loss": 0.1593, + "step": 981 + }, + { + "epoch": 0.31821127673363575, + "grad_norm": 1.0358308553695679, + "learning_rate": 4.924772339680283e-06, + "loss": 0.1876, + "step": 982 + }, + { + "epoch": 0.3185353208036293, + "grad_norm": 1.0257309675216675, + "learning_rate": 4.9245592522107065e-06, + "loss": 0.1766, + "step": 983 + }, + { + "epoch": 0.3188593648736228, + "grad_norm": 0.9954764246940613, + "learning_rate": 4.92434586799855e-06, + "loss": 0.1617, + "step": 984 + }, + { + "epoch": 0.31918340894361635, + "grad_norm": 1.1356722116470337, + "learning_rate": 4.924132187069928e-06, + "loss": 0.1799, + "step": 985 + }, + { + "epoch": 0.31950745301360983, + "grad_norm": 0.9893943071365356, + "learning_rate": 4.923918209450994e-06, + "loss": 0.1634, + "step": 986 + }, + { + "epoch": 0.31983149708360337, + "grad_norm": 1.1390126943588257, + "learning_rate": 4.9237039351679365e-06, + "loss": 0.1855, + "step": 987 + }, + { + "epoch": 0.3201555411535969, + "grad_norm": 1.0541259050369263, + "learning_rate": 4.923489364246981e-06, + "loss": 0.1807, + "step": 988 + }, + { + "epoch": 0.32047958522359044, + "grad_norm": 1.0718092918395996, + "learning_rate": 4.923274496714387e-06, + "loss": 0.1763, + "step": 989 + }, + { + "epoch": 0.3208036292935839, + "grad_norm": 1.102406620979309, + "learning_rate": 4.923059332596456e-06, + "loss": 0.173, + "step": 990 + }, + { + "epoch": 0.32112767336357745, + "grad_norm": 1.1300830841064453, + "learning_rate": 4.922843871919518e-06, + "loss": 0.1818, + "step": 991 + }, + { + "epoch": 0.321451717433571, + "grad_norm": 1.021759033203125, + "learning_rate": 4.922628114709945e-06, + "loss": 0.1656, + "step": 992 + }, + { + "epoch": 0.32177576150356446, + "grad_norm": 1.0214760303497314, + "learning_rate": 4.922412060994145e-06, + "loss": 0.1733, + "step": 993 + }, + { + "epoch": 0.322099805573558, + "grad_norm": 1.0010415315628052, + "learning_rate": 4.922195710798559e-06, + "loss": 0.1529, + "step": 994 + }, + { + "epoch": 0.32242384964355153, + "grad_norm": 1.140674114227295, + "learning_rate": 4.9219790641496656e-06, + "loss": 0.177, + "step": 995 + }, + { + "epoch": 0.32274789371354506, + "grad_norm": 0.9754520654678345, + "learning_rate": 4.9217621210739826e-06, + "loss": 0.1698, + "step": 996 + }, + { + "epoch": 0.32307193778353854, + "grad_norm": 1.0422990322113037, + "learning_rate": 4.921544881598059e-06, + "loss": 0.1808, + "step": 997 + }, + { + "epoch": 0.3233959818535321, + "grad_norm": 1.1023666858673096, + "learning_rate": 4.921327345748486e-06, + "loss": 0.1743, + "step": 998 + }, + { + "epoch": 0.3237200259235256, + "grad_norm": 1.0531352758407593, + "learning_rate": 4.921109513551885e-06, + "loss": 0.1463, + "step": 999 + }, + { + "epoch": 0.32404406999351915, + "grad_norm": 0.9822701811790466, + "learning_rate": 4.920891385034918e-06, + "loss": 0.156, + "step": 1000 + }, + { + "epoch": 0.3243681140635126, + "grad_norm": 1.0433939695358276, + "learning_rate": 4.920672960224282e-06, + "loss": 0.1799, + "step": 1001 + }, + { + "epoch": 0.32469215813350616, + "grad_norm": 1.1296837329864502, + "learning_rate": 4.920454239146709e-06, + "loss": 0.1774, + "step": 1002 + }, + { + "epoch": 0.3250162022034997, + "grad_norm": 1.0009548664093018, + "learning_rate": 4.92023522182897e-06, + "loss": 0.1501, + "step": 1003 + }, + { + "epoch": 0.32534024627349317, + "grad_norm": 1.0985316038131714, + "learning_rate": 4.9200159082978685e-06, + "loss": 0.1746, + "step": 1004 + }, + { + "epoch": 0.3256642903434867, + "grad_norm": 1.0474634170532227, + "learning_rate": 4.919796298580247e-06, + "loss": 0.1661, + "step": 1005 + }, + { + "epoch": 0.32598833441348024, + "grad_norm": 1.036765694618225, + "learning_rate": 4.919576392702984e-06, + "loss": 0.1812, + "step": 1006 + }, + { + "epoch": 0.3263123784834738, + "grad_norm": 1.0654313564300537, + "learning_rate": 4.9193561906929945e-06, + "loss": 0.1777, + "step": 1007 + }, + { + "epoch": 0.32663642255346725, + "grad_norm": 1.0503085851669312, + "learning_rate": 4.919135692577229e-06, + "loss": 0.1525, + "step": 1008 + }, + { + "epoch": 0.3269604666234608, + "grad_norm": 1.0202323198318481, + "learning_rate": 4.918914898382673e-06, + "loss": 0.1728, + "step": 1009 + }, + { + "epoch": 0.3272845106934543, + "grad_norm": 1.02118718624115, + "learning_rate": 4.91869380813635e-06, + "loss": 0.1763, + "step": 1010 + }, + { + "epoch": 0.3276085547634478, + "grad_norm": 0.99228435754776, + "learning_rate": 4.91847242186532e-06, + "loss": 0.1563, + "step": 1011 + }, + { + "epoch": 0.32793259883344134, + "grad_norm": 1.0361772775650024, + "learning_rate": 4.918250739596678e-06, + "loss": 0.1779, + "step": 1012 + }, + { + "epoch": 0.32825664290343487, + "grad_norm": 1.1137558221817017, + "learning_rate": 4.918028761357557e-06, + "loss": 0.1614, + "step": 1013 + }, + { + "epoch": 0.3285806869734284, + "grad_norm": 0.9790223240852356, + "learning_rate": 4.917806487175123e-06, + "loss": 0.1627, + "step": 1014 + }, + { + "epoch": 0.3289047310434219, + "grad_norm": 1.0379266738891602, + "learning_rate": 4.917583917076581e-06, + "loss": 0.1651, + "step": 1015 + }, + { + "epoch": 0.3292287751134154, + "grad_norm": 1.0073673725128174, + "learning_rate": 4.917361051089172e-06, + "loss": 0.1838, + "step": 1016 + }, + { + "epoch": 0.32955281918340895, + "grad_norm": 1.0644619464874268, + "learning_rate": 4.917137889240172e-06, + "loss": 0.1724, + "step": 1017 + }, + { + "epoch": 0.3298768632534025, + "grad_norm": 1.0988903045654297, + "learning_rate": 4.916914431556895e-06, + "loss": 0.1787, + "step": 1018 + }, + { + "epoch": 0.33020090732339596, + "grad_norm": 1.063129186630249, + "learning_rate": 4.916690678066688e-06, + "loss": 0.181, + "step": 1019 + }, + { + "epoch": 0.3305249513933895, + "grad_norm": 1.057511806488037, + "learning_rate": 4.916466628796938e-06, + "loss": 0.174, + "step": 1020 + }, + { + "epoch": 0.33084899546338303, + "grad_norm": 0.9521802663803101, + "learning_rate": 4.9162422837750654e-06, + "loss": 0.1579, + "step": 1021 + }, + { + "epoch": 0.3311730395333765, + "grad_norm": 1.0636956691741943, + "learning_rate": 4.916017643028529e-06, + "loss": 0.1627, + "step": 1022 + }, + { + "epoch": 0.33149708360337005, + "grad_norm": 1.0339527130126953, + "learning_rate": 4.915792706584821e-06, + "loss": 0.1567, + "step": 1023 + }, + { + "epoch": 0.3318211276733636, + "grad_norm": 0.9493985772132874, + "learning_rate": 4.9155674744714725e-06, + "loss": 0.1567, + "step": 1024 + }, + { + "epoch": 0.3321451717433571, + "grad_norm": 1.0365159511566162, + "learning_rate": 4.91534194671605e-06, + "loss": 0.1746, + "step": 1025 + }, + { + "epoch": 0.3324692158133506, + "grad_norm": 1.083169937133789, + "learning_rate": 4.915116123346155e-06, + "loss": 0.1913, + "step": 1026 + }, + { + "epoch": 0.33279325988334413, + "grad_norm": 1.019592046737671, + "learning_rate": 4.9148900043894275e-06, + "loss": 0.1843, + "step": 1027 + }, + { + "epoch": 0.33311730395333766, + "grad_norm": 1.0441523790359497, + "learning_rate": 4.914663589873541e-06, + "loss": 0.1679, + "step": 1028 + }, + { + "epoch": 0.3334413480233312, + "grad_norm": 1.0396127700805664, + "learning_rate": 4.914436879826207e-06, + "loss": 0.1785, + "step": 1029 + }, + { + "epoch": 0.3337653920933247, + "grad_norm": 1.0079026222229004, + "learning_rate": 4.9142098742751726e-06, + "loss": 0.17, + "step": 1030 + }, + { + "epoch": 0.3340894361633182, + "grad_norm": 1.001125454902649, + "learning_rate": 4.9139825732482205e-06, + "loss": 0.1643, + "step": 1031 + }, + { + "epoch": 0.33441348023331174, + "grad_norm": 1.0608137845993042, + "learning_rate": 4.91375497677317e-06, + "loss": 0.1898, + "step": 1032 + }, + { + "epoch": 0.3347375243033052, + "grad_norm": 0.992143452167511, + "learning_rate": 4.913527084877879e-06, + "loss": 0.166, + "step": 1033 + }, + { + "epoch": 0.33506156837329876, + "grad_norm": 1.0682899951934814, + "learning_rate": 4.913298897590237e-06, + "loss": 0.1838, + "step": 1034 + }, + { + "epoch": 0.3353856124432923, + "grad_norm": 1.0318293571472168, + "learning_rate": 4.913070414938172e-06, + "loss": 0.1728, + "step": 1035 + }, + { + "epoch": 0.3357096565132858, + "grad_norm": 1.0967031717300415, + "learning_rate": 4.912841636949649e-06, + "loss": 0.1753, + "step": 1036 + }, + { + "epoch": 0.3360337005832793, + "grad_norm": 1.0476478338241577, + "learning_rate": 4.912612563652667e-06, + "loss": 0.1706, + "step": 1037 + }, + { + "epoch": 0.33635774465327284, + "grad_norm": 1.0481637716293335, + "learning_rate": 4.912383195075264e-06, + "loss": 0.1742, + "step": 1038 + }, + { + "epoch": 0.3366817887232664, + "grad_norm": 1.0776970386505127, + "learning_rate": 4.912153531245511e-06, + "loss": 0.1827, + "step": 1039 + }, + { + "epoch": 0.3370058327932599, + "grad_norm": 1.0306576490402222, + "learning_rate": 4.9119235721915174e-06, + "loss": 0.1608, + "step": 1040 + }, + { + "epoch": 0.3373298768632534, + "grad_norm": 1.0382986068725586, + "learning_rate": 4.911693317941428e-06, + "loss": 0.1674, + "step": 1041 + }, + { + "epoch": 0.3376539209332469, + "grad_norm": 1.069576621055603, + "learning_rate": 4.911462768523423e-06, + "loss": 0.1754, + "step": 1042 + }, + { + "epoch": 0.33797796500324045, + "grad_norm": 0.9400745630264282, + "learning_rate": 4.9112319239657204e-06, + "loss": 0.1719, + "step": 1043 + }, + { + "epoch": 0.33830200907323393, + "grad_norm": 1.0903481245040894, + "learning_rate": 4.911000784296572e-06, + "loss": 0.1731, + "step": 1044 + }, + { + "epoch": 0.33862605314322747, + "grad_norm": 1.0023761987686157, + "learning_rate": 4.910769349544269e-06, + "loss": 0.1606, + "step": 1045 + }, + { + "epoch": 0.338950097213221, + "grad_norm": 1.0253771543502808, + "learning_rate": 4.9105376197371355e-06, + "loss": 0.1739, + "step": 1046 + }, + { + "epoch": 0.33927414128321454, + "grad_norm": 1.0307811498641968, + "learning_rate": 4.9103055949035326e-06, + "loss": 0.1889, + "step": 1047 + }, + { + "epoch": 0.339598185353208, + "grad_norm": 1.1487541198730469, + "learning_rate": 4.910073275071858e-06, + "loss": 0.1858, + "step": 1048 + }, + { + "epoch": 0.33992222942320155, + "grad_norm": 1.0212219953536987, + "learning_rate": 4.909840660270547e-06, + "loss": 0.163, + "step": 1049 + }, + { + "epoch": 0.3402462734931951, + "grad_norm": 0.9798551201820374, + "learning_rate": 4.909607750528068e-06, + "loss": 0.1625, + "step": 1050 + }, + { + "epoch": 0.3405703175631886, + "grad_norm": 0.9840319156646729, + "learning_rate": 4.909374545872927e-06, + "loss": 0.1456, + "step": 1051 + }, + { + "epoch": 0.3408943616331821, + "grad_norm": 1.056037187576294, + "learning_rate": 4.909141046333666e-06, + "loss": 0.1666, + "step": 1052 + }, + { + "epoch": 0.34121840570317563, + "grad_norm": 1.102063775062561, + "learning_rate": 4.908907251938864e-06, + "loss": 0.1763, + "step": 1053 + }, + { + "epoch": 0.34154244977316917, + "grad_norm": 0.9911783337593079, + "learning_rate": 4.908673162717133e-06, + "loss": 0.1614, + "step": 1054 + }, + { + "epoch": 0.34186649384316264, + "grad_norm": 1.1577961444854736, + "learning_rate": 4.908438778697125e-06, + "loss": 0.1811, + "step": 1055 + }, + { + "epoch": 0.3421905379131562, + "grad_norm": 1.0934644937515259, + "learning_rate": 4.908204099907527e-06, + "loss": 0.1796, + "step": 1056 + }, + { + "epoch": 0.3425145819831497, + "grad_norm": 1.0998642444610596, + "learning_rate": 4.907969126377059e-06, + "loss": 0.1804, + "step": 1057 + }, + { + "epoch": 0.34283862605314325, + "grad_norm": 0.9529743790626526, + "learning_rate": 4.907733858134482e-06, + "loss": 0.1647, + "step": 1058 + }, + { + "epoch": 0.3431626701231367, + "grad_norm": 1.0420947074890137, + "learning_rate": 4.907498295208589e-06, + "loss": 0.1743, + "step": 1059 + }, + { + "epoch": 0.34348671419313026, + "grad_norm": 0.9765776991844177, + "learning_rate": 4.907262437628211e-06, + "loss": 0.1688, + "step": 1060 + }, + { + "epoch": 0.3438107582631238, + "grad_norm": 1.0862605571746826, + "learning_rate": 4.907026285422215e-06, + "loss": 0.1809, + "step": 1061 + }, + { + "epoch": 0.34413480233311733, + "grad_norm": 0.9697964191436768, + "learning_rate": 4.906789838619504e-06, + "loss": 0.1777, + "step": 1062 + }, + { + "epoch": 0.3444588464031108, + "grad_norm": 0.9970915913581848, + "learning_rate": 4.906553097249015e-06, + "loss": 0.1543, + "step": 1063 + }, + { + "epoch": 0.34478289047310434, + "grad_norm": 1.0512598752975464, + "learning_rate": 4.906316061339724e-06, + "loss": 0.1766, + "step": 1064 + }, + { + "epoch": 0.3451069345430979, + "grad_norm": 1.0634959936141968, + "learning_rate": 4.9060787309206436e-06, + "loss": 0.1799, + "step": 1065 + }, + { + "epoch": 0.34543097861309136, + "grad_norm": 1.0844204425811768, + "learning_rate": 4.905841106020818e-06, + "loss": 0.1902, + "step": 1066 + }, + { + "epoch": 0.3457550226830849, + "grad_norm": 1.1198972463607788, + "learning_rate": 4.905603186669332e-06, + "loss": 0.1662, + "step": 1067 + }, + { + "epoch": 0.3460790667530784, + "grad_norm": 1.0448524951934814, + "learning_rate": 4.905364972895304e-06, + "loss": 0.1798, + "step": 1068 + }, + { + "epoch": 0.34640311082307196, + "grad_norm": 1.0317894220352173, + "learning_rate": 4.9051264647278886e-06, + "loss": 0.172, + "step": 1069 + }, + { + "epoch": 0.34672715489306544, + "grad_norm": 1.1010342836380005, + "learning_rate": 4.904887662196277e-06, + "loss": 0.1537, + "step": 1070 + }, + { + "epoch": 0.34705119896305897, + "grad_norm": 1.052812099456787, + "learning_rate": 4.904648565329697e-06, + "loss": 0.1891, + "step": 1071 + }, + { + "epoch": 0.3473752430330525, + "grad_norm": 1.0273829698562622, + "learning_rate": 4.904409174157412e-06, + "loss": 0.1735, + "step": 1072 + }, + { + "epoch": 0.34769928710304604, + "grad_norm": 0.980993926525116, + "learning_rate": 4.90416948870872e-06, + "loss": 0.1677, + "step": 1073 + }, + { + "epoch": 0.3480233311730395, + "grad_norm": 0.9855329394340515, + "learning_rate": 4.903929509012957e-06, + "loss": 0.1524, + "step": 1074 + }, + { + "epoch": 0.34834737524303305, + "grad_norm": 1.0246031284332275, + "learning_rate": 4.9036892350994935e-06, + "loss": 0.166, + "step": 1075 + }, + { + "epoch": 0.3486714193130266, + "grad_norm": 0.99644935131073, + "learning_rate": 4.9034486669977375e-06, + "loss": 0.162, + "step": 1076 + }, + { + "epoch": 0.34899546338302007, + "grad_norm": 1.0822203159332275, + "learning_rate": 4.903207804737132e-06, + "loss": 0.1737, + "step": 1077 + }, + { + "epoch": 0.3493195074530136, + "grad_norm": 1.0989410877227783, + "learning_rate": 4.902966648347156e-06, + "loss": 0.169, + "step": 1078 + }, + { + "epoch": 0.34964355152300713, + "grad_norm": 1.1245644092559814, + "learning_rate": 4.902725197857325e-06, + "loss": 0.1771, + "step": 1079 + }, + { + "epoch": 0.34996759559300067, + "grad_norm": 1.121975302696228, + "learning_rate": 4.902483453297189e-06, + "loss": 0.1796, + "step": 1080 + }, + { + "epoch": 0.35029163966299415, + "grad_norm": 1.0160001516342163, + "learning_rate": 4.902241414696337e-06, + "loss": 0.1639, + "step": 1081 + }, + { + "epoch": 0.3506156837329877, + "grad_norm": 0.9480010271072388, + "learning_rate": 4.901999082084391e-06, + "loss": 0.1544, + "step": 1082 + }, + { + "epoch": 0.3509397278029812, + "grad_norm": 1.0363365411758423, + "learning_rate": 4.901756455491011e-06, + "loss": 0.186, + "step": 1083 + }, + { + "epoch": 0.35126377187297475, + "grad_norm": 1.0564618110656738, + "learning_rate": 4.901513534945891e-06, + "loss": 0.1787, + "step": 1084 + }, + { + "epoch": 0.35158781594296823, + "grad_norm": 1.0492441654205322, + "learning_rate": 4.901270320478763e-06, + "loss": 0.1695, + "step": 1085 + }, + { + "epoch": 0.35191186001296176, + "grad_norm": 1.025549054145813, + "learning_rate": 4.901026812119394e-06, + "loss": 0.171, + "step": 1086 + }, + { + "epoch": 0.3522359040829553, + "grad_norm": 1.0320312976837158, + "learning_rate": 4.9007830098975875e-06, + "loss": 0.1716, + "step": 1087 + }, + { + "epoch": 0.3525599481529488, + "grad_norm": 1.0225930213928223, + "learning_rate": 4.900538913843181e-06, + "loss": 0.1717, + "step": 1088 + }, + { + "epoch": 0.3528839922229423, + "grad_norm": 1.103419542312622, + "learning_rate": 4.900294523986051e-06, + "loss": 0.1929, + "step": 1089 + }, + { + "epoch": 0.35320803629293585, + "grad_norm": 1.0638277530670166, + "learning_rate": 4.900049840356107e-06, + "loss": 0.183, + "step": 1090 + }, + { + "epoch": 0.3535320803629294, + "grad_norm": 0.9254642724990845, + "learning_rate": 4.899804862983298e-06, + "loss": 0.1541, + "step": 1091 + }, + { + "epoch": 0.35385612443292286, + "grad_norm": 0.9626860618591309, + "learning_rate": 4.899559591897604e-06, + "loss": 0.169, + "step": 1092 + }, + { + "epoch": 0.3541801685029164, + "grad_norm": 1.0504587888717651, + "learning_rate": 4.899314027129047e-06, + "loss": 0.1759, + "step": 1093 + }, + { + "epoch": 0.3545042125729099, + "grad_norm": 1.0354413986206055, + "learning_rate": 4.89906816870768e-06, + "loss": 0.173, + "step": 1094 + }, + { + "epoch": 0.35482825664290346, + "grad_norm": 0.9573935270309448, + "learning_rate": 4.898822016663595e-06, + "loss": 0.1683, + "step": 1095 + }, + { + "epoch": 0.35515230071289694, + "grad_norm": 1.1483545303344727, + "learning_rate": 4.898575571026916e-06, + "loss": 0.1916, + "step": 1096 + }, + { + "epoch": 0.3554763447828905, + "grad_norm": 0.990126371383667, + "learning_rate": 4.898328831827808e-06, + "loss": 0.164, + "step": 1097 + }, + { + "epoch": 0.355800388852884, + "grad_norm": 1.0703517198562622, + "learning_rate": 4.898081799096467e-06, + "loss": 0.1678, + "step": 1098 + }, + { + "epoch": 0.3561244329228775, + "grad_norm": 1.0092120170593262, + "learning_rate": 4.897834472863131e-06, + "loss": 0.1688, + "step": 1099 + }, + { + "epoch": 0.356448476992871, + "grad_norm": 1.0366566181182861, + "learning_rate": 4.897586853158067e-06, + "loss": 0.1699, + "step": 1100 + }, + { + "epoch": 0.35677252106286456, + "grad_norm": 1.0042214393615723, + "learning_rate": 4.897338940011583e-06, + "loss": 0.1557, + "step": 1101 + }, + { + "epoch": 0.3570965651328581, + "grad_norm": 0.9984369277954102, + "learning_rate": 4.897090733454021e-06, + "loss": 0.1863, + "step": 1102 + }, + { + "epoch": 0.35742060920285157, + "grad_norm": 1.1319072246551514, + "learning_rate": 4.896842233515759e-06, + "loss": 0.1892, + "step": 1103 + }, + { + "epoch": 0.3577446532728451, + "grad_norm": 1.0137627124786377, + "learning_rate": 4.89659344022721e-06, + "loss": 0.1744, + "step": 1104 + }, + { + "epoch": 0.35806869734283864, + "grad_norm": 1.036480188369751, + "learning_rate": 4.896344353618826e-06, + "loss": 0.1684, + "step": 1105 + }, + { + "epoch": 0.35839274141283217, + "grad_norm": 1.005391001701355, + "learning_rate": 4.896094973721091e-06, + "loss": 0.1568, + "step": 1106 + }, + { + "epoch": 0.35871678548282565, + "grad_norm": 0.9591735601425171, + "learning_rate": 4.8958453005645265e-06, + "loss": 0.1481, + "step": 1107 + }, + { + "epoch": 0.3590408295528192, + "grad_norm": 1.1032018661499023, + "learning_rate": 4.895595334179692e-06, + "loss": 0.1889, + "step": 1108 + }, + { + "epoch": 0.3593648736228127, + "grad_norm": 1.04655122756958, + "learning_rate": 4.89534507459718e-06, + "loss": 0.1749, + "step": 1109 + }, + { + "epoch": 0.3596889176928062, + "grad_norm": 1.006601095199585, + "learning_rate": 4.895094521847617e-06, + "loss": 0.1757, + "step": 1110 + }, + { + "epoch": 0.36001296176279973, + "grad_norm": 0.9353299736976624, + "learning_rate": 4.894843675961673e-06, + "loss": 0.1597, + "step": 1111 + }, + { + "epoch": 0.36033700583279327, + "grad_norm": 0.9540934562683105, + "learning_rate": 4.894592536970047e-06, + "loss": 0.1551, + "step": 1112 + }, + { + "epoch": 0.3606610499027868, + "grad_norm": 1.0541253089904785, + "learning_rate": 4.894341104903476e-06, + "loss": 0.1736, + "step": 1113 + }, + { + "epoch": 0.3609850939727803, + "grad_norm": 0.9918221235275269, + "learning_rate": 4.894089379792731e-06, + "loss": 0.1707, + "step": 1114 + }, + { + "epoch": 0.3613091380427738, + "grad_norm": 1.0815917253494263, + "learning_rate": 4.893837361668624e-06, + "loss": 0.1773, + "step": 1115 + }, + { + "epoch": 0.36163318211276735, + "grad_norm": 1.0560083389282227, + "learning_rate": 4.8935850505619985e-06, + "loss": 0.1782, + "step": 1116 + }, + { + "epoch": 0.3619572261827609, + "grad_norm": 1.0156229734420776, + "learning_rate": 4.8933324465037334e-06, + "loss": 0.1783, + "step": 1117 + }, + { + "epoch": 0.36228127025275436, + "grad_norm": 1.0813552141189575, + "learning_rate": 4.893079549524747e-06, + "loss": 0.1656, + "step": 1118 + }, + { + "epoch": 0.3626053143227479, + "grad_norm": 1.0563653707504272, + "learning_rate": 4.89282635965599e-06, + "loss": 0.1606, + "step": 1119 + }, + { + "epoch": 0.36292935839274143, + "grad_norm": 0.9194615483283997, + "learning_rate": 4.8925728769284504e-06, + "loss": 0.1498, + "step": 1120 + }, + { + "epoch": 0.3632534024627349, + "grad_norm": 1.0284903049468994, + "learning_rate": 4.892319101373154e-06, + "loss": 0.1706, + "step": 1121 + }, + { + "epoch": 0.36357744653272844, + "grad_norm": 1.003746747970581, + "learning_rate": 4.892065033021158e-06, + "loss": 0.1718, + "step": 1122 + }, + { + "epoch": 0.363901490602722, + "grad_norm": 1.0391348600387573, + "learning_rate": 4.8918106719035594e-06, + "loss": 0.1683, + "step": 1123 + }, + { + "epoch": 0.3642255346727155, + "grad_norm": 1.1661279201507568, + "learning_rate": 4.891556018051489e-06, + "loss": 0.1863, + "step": 1124 + }, + { + "epoch": 0.364549578742709, + "grad_norm": 1.0064976215362549, + "learning_rate": 4.891301071496113e-06, + "loss": 0.1622, + "step": 1125 + }, + { + "epoch": 0.3648736228127025, + "grad_norm": 1.1605420112609863, + "learning_rate": 4.891045832268637e-06, + "loss": 0.1911, + "step": 1126 + }, + { + "epoch": 0.36519766688269606, + "grad_norm": 1.0069886445999146, + "learning_rate": 4.890790300400297e-06, + "loss": 0.1785, + "step": 1127 + }, + { + "epoch": 0.36552171095268954, + "grad_norm": 1.0143624544143677, + "learning_rate": 4.8905344759223696e-06, + "loss": 0.1756, + "step": 1128 + }, + { + "epoch": 0.3658457550226831, + "grad_norm": 1.0237759351730347, + "learning_rate": 4.890278358866165e-06, + "loss": 0.1686, + "step": 1129 + }, + { + "epoch": 0.3661697990926766, + "grad_norm": 1.0224316120147705, + "learning_rate": 4.890021949263027e-06, + "loss": 0.176, + "step": 1130 + }, + { + "epoch": 0.36649384316267014, + "grad_norm": 1.0045219659805298, + "learning_rate": 4.889765247144341e-06, + "loss": 0.1667, + "step": 1131 + }, + { + "epoch": 0.3668178872326636, + "grad_norm": 1.0561575889587402, + "learning_rate": 4.889508252541524e-06, + "loss": 0.1809, + "step": 1132 + }, + { + "epoch": 0.36714193130265715, + "grad_norm": 0.9486965537071228, + "learning_rate": 4.889250965486029e-06, + "loss": 0.1544, + "step": 1133 + }, + { + "epoch": 0.3674659753726507, + "grad_norm": 0.9930503964424133, + "learning_rate": 4.888993386009345e-06, + "loss": 0.1568, + "step": 1134 + }, + { + "epoch": 0.3677900194426442, + "grad_norm": 1.032828450202942, + "learning_rate": 4.888735514142998e-06, + "loss": 0.1623, + "step": 1135 + }, + { + "epoch": 0.3681140635126377, + "grad_norm": 0.9454177021980286, + "learning_rate": 4.8884773499185485e-06, + "loss": 0.1597, + "step": 1136 + }, + { + "epoch": 0.36843810758263124, + "grad_norm": 1.0029901266098022, + "learning_rate": 4.8882188933675935e-06, + "loss": 0.1599, + "step": 1137 + }, + { + "epoch": 0.36876215165262477, + "grad_norm": 1.074245572090149, + "learning_rate": 4.887960144521766e-06, + "loss": 0.1737, + "step": 1138 + }, + { + "epoch": 0.36908619572261825, + "grad_norm": 1.103922724723816, + "learning_rate": 4.887701103412734e-06, + "loss": 0.1617, + "step": 1139 + }, + { + "epoch": 0.3694102397926118, + "grad_norm": 1.051020860671997, + "learning_rate": 4.8874417700722025e-06, + "loss": 0.1682, + "step": 1140 + }, + { + "epoch": 0.3697342838626053, + "grad_norm": 0.9718478322029114, + "learning_rate": 4.887182144531909e-06, + "loss": 0.1616, + "step": 1141 + }, + { + "epoch": 0.37005832793259885, + "grad_norm": 1.0041842460632324, + "learning_rate": 4.886922226823632e-06, + "loss": 0.1524, + "step": 1142 + }, + { + "epoch": 0.37038237200259233, + "grad_norm": 1.0239226818084717, + "learning_rate": 4.8866620169791815e-06, + "loss": 0.1788, + "step": 1143 + }, + { + "epoch": 0.37070641607258586, + "grad_norm": 0.9961230754852295, + "learning_rate": 4.886401515030404e-06, + "loss": 0.1594, + "step": 1144 + }, + { + "epoch": 0.3710304601425794, + "grad_norm": 1.0124776363372803, + "learning_rate": 4.886140721009184e-06, + "loss": 0.1747, + "step": 1145 + }, + { + "epoch": 0.37135450421257293, + "grad_norm": 0.9768791794776917, + "learning_rate": 4.885879634947439e-06, + "loss": 0.1564, + "step": 1146 + }, + { + "epoch": 0.3716785482825664, + "grad_norm": 1.1680079698562622, + "learning_rate": 4.885618256877123e-06, + "loss": 0.1892, + "step": 1147 + }, + { + "epoch": 0.37200259235255995, + "grad_norm": 0.9732511043548584, + "learning_rate": 4.885356586830229e-06, + "loss": 0.1536, + "step": 1148 + }, + { + "epoch": 0.3723266364225535, + "grad_norm": 1.0803674459457397, + "learning_rate": 4.8850946248387795e-06, + "loss": 0.1577, + "step": 1149 + }, + { + "epoch": 0.37265068049254696, + "grad_norm": 1.022645115852356, + "learning_rate": 4.884832370934838e-06, + "loss": 0.1623, + "step": 1150 + }, + { + "epoch": 0.3729747245625405, + "grad_norm": 0.979411244392395, + "learning_rate": 4.8845698251505e-06, + "loss": 0.1505, + "step": 1151 + }, + { + "epoch": 0.37329876863253403, + "grad_norm": 1.043480396270752, + "learning_rate": 4.8843069875179005e-06, + "loss": 0.1792, + "step": 1152 + }, + { + "epoch": 0.37362281270252756, + "grad_norm": 1.0261013507843018, + "learning_rate": 4.884043858069208e-06, + "loss": 0.1688, + "step": 1153 + }, + { + "epoch": 0.37394685677252104, + "grad_norm": 1.000368356704712, + "learning_rate": 4.883780436836627e-06, + "loss": 0.168, + "step": 1154 + }, + { + "epoch": 0.3742709008425146, + "grad_norm": 0.9413356781005859, + "learning_rate": 4.883516723852396e-06, + "loss": 0.1667, + "step": 1155 + }, + { + "epoch": 0.3745949449125081, + "grad_norm": 1.012265920639038, + "learning_rate": 4.883252719148794e-06, + "loss": 0.1638, + "step": 1156 + }, + { + "epoch": 0.37491898898250164, + "grad_norm": 0.9922842979431152, + "learning_rate": 4.8829884227581294e-06, + "loss": 0.1693, + "step": 1157 + }, + { + "epoch": 0.3752430330524951, + "grad_norm": 0.9883171319961548, + "learning_rate": 4.88272383471275e-06, + "loss": 0.1535, + "step": 1158 + }, + { + "epoch": 0.37556707712248866, + "grad_norm": 0.9470762610435486, + "learning_rate": 4.8824589550450415e-06, + "loss": 0.1475, + "step": 1159 + }, + { + "epoch": 0.3758911211924822, + "grad_norm": 0.9897817373275757, + "learning_rate": 4.882193783787421e-06, + "loss": 0.1802, + "step": 1160 + }, + { + "epoch": 0.37621516526247567, + "grad_norm": 1.0278658866882324, + "learning_rate": 4.881928320972342e-06, + "loss": 0.1799, + "step": 1161 + }, + { + "epoch": 0.3765392093324692, + "grad_norm": 1.0314370393753052, + "learning_rate": 4.881662566632296e-06, + "loss": 0.1789, + "step": 1162 + }, + { + "epoch": 0.37686325340246274, + "grad_norm": 0.9717196226119995, + "learning_rate": 4.881396520799808e-06, + "loss": 0.1569, + "step": 1163 + }, + { + "epoch": 0.3771872974724563, + "grad_norm": 1.0661332607269287, + "learning_rate": 4.8811301835074384e-06, + "loss": 0.1821, + "step": 1164 + }, + { + "epoch": 0.37751134154244975, + "grad_norm": 0.9644685983657837, + "learning_rate": 4.880863554787787e-06, + "loss": 0.1586, + "step": 1165 + }, + { + "epoch": 0.3778353856124433, + "grad_norm": 1.0331933498382568, + "learning_rate": 4.880596634673484e-06, + "loss": 0.1865, + "step": 1166 + }, + { + "epoch": 0.3781594296824368, + "grad_norm": 1.0733047723770142, + "learning_rate": 4.8803294231972e-06, + "loss": 0.1742, + "step": 1167 + }, + { + "epoch": 0.37848347375243035, + "grad_norm": 1.0722155570983887, + "learning_rate": 4.8800619203916376e-06, + "loss": 0.1963, + "step": 1168 + }, + { + "epoch": 0.37880751782242383, + "grad_norm": 1.0205607414245605, + "learning_rate": 4.8797941262895365e-06, + "loss": 0.1777, + "step": 1169 + }, + { + "epoch": 0.37913156189241737, + "grad_norm": 0.9205136299133301, + "learning_rate": 4.8795260409236725e-06, + "loss": 0.1503, + "step": 1170 + }, + { + "epoch": 0.3794556059624109, + "grad_norm": 0.9154426455497742, + "learning_rate": 4.879257664326856e-06, + "loss": 0.1568, + "step": 1171 + }, + { + "epoch": 0.3797796500324044, + "grad_norm": 1.0617220401763916, + "learning_rate": 4.8789889965319355e-06, + "loss": 0.1827, + "step": 1172 + }, + { + "epoch": 0.3801036941023979, + "grad_norm": 0.9944412112236023, + "learning_rate": 4.878720037571792e-06, + "loss": 0.1659, + "step": 1173 + }, + { + "epoch": 0.38042773817239145, + "grad_norm": 0.992470383644104, + "learning_rate": 4.878450787479344e-06, + "loss": 0.17, + "step": 1174 + }, + { + "epoch": 0.380751782242385, + "grad_norm": 1.038560152053833, + "learning_rate": 4.878181246287544e-06, + "loss": 0.169, + "step": 1175 + }, + { + "epoch": 0.38107582631237846, + "grad_norm": 0.9579697847366333, + "learning_rate": 4.877911414029382e-06, + "loss": 0.1578, + "step": 1176 + }, + { + "epoch": 0.381399870382372, + "grad_norm": 0.9731485247612, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.1648, + "step": 1177 + }, + { + "epoch": 0.38172391445236553, + "grad_norm": 0.9061117172241211, + "learning_rate": 4.877370876446109e-06, + "loss": 0.1609, + "step": 1178 + }, + { + "epoch": 0.38204795852235907, + "grad_norm": 0.983727753162384, + "learning_rate": 4.877100171187154e-06, + "loss": 0.1774, + "step": 1179 + }, + { + "epoch": 0.38237200259235254, + "grad_norm": 1.0166122913360596, + "learning_rate": 4.876829174994149e-06, + "loss": 0.1586, + "step": 1180 + }, + { + "epoch": 0.3826960466623461, + "grad_norm": 0.9968889355659485, + "learning_rate": 4.8765578879002625e-06, + "loss": 0.1544, + "step": 1181 + }, + { + "epoch": 0.3830200907323396, + "grad_norm": 0.990476667881012, + "learning_rate": 4.8762863099386984e-06, + "loss": 0.1724, + "step": 1182 + }, + { + "epoch": 0.3833441348023331, + "grad_norm": 0.9911254644393921, + "learning_rate": 4.876014441142693e-06, + "loss": 0.173, + "step": 1183 + }, + { + "epoch": 0.3836681788723266, + "grad_norm": 0.9807798266410828, + "learning_rate": 4.8757422815455215e-06, + "loss": 0.171, + "step": 1184 + }, + { + "epoch": 0.38399222294232016, + "grad_norm": 0.9989405870437622, + "learning_rate": 4.875469831180495e-06, + "loss": 0.1648, + "step": 1185 + }, + { + "epoch": 0.3843162670123137, + "grad_norm": 1.0662082433700562, + "learning_rate": 4.875197090080957e-06, + "loss": 0.1713, + "step": 1186 + }, + { + "epoch": 0.3846403110823072, + "grad_norm": 1.0581663846969604, + "learning_rate": 4.874924058280288e-06, + "loss": 0.1865, + "step": 1187 + }, + { + "epoch": 0.3849643551523007, + "grad_norm": 0.9903859496116638, + "learning_rate": 4.874650735811906e-06, + "loss": 0.1794, + "step": 1188 + }, + { + "epoch": 0.38528839922229424, + "grad_norm": 0.9108482003211975, + "learning_rate": 4.874377122709263e-06, + "loss": 0.1642, + "step": 1189 + }, + { + "epoch": 0.3856124432922878, + "grad_norm": 1.0256187915802002, + "learning_rate": 4.874103219005845e-06, + "loss": 0.1617, + "step": 1190 + }, + { + "epoch": 0.38593648736228126, + "grad_norm": 0.9225676655769348, + "learning_rate": 4.873829024735176e-06, + "loss": 0.1685, + "step": 1191 + }, + { + "epoch": 0.3862605314322748, + "grad_norm": 0.9761823415756226, + "learning_rate": 4.873554539930815e-06, + "loss": 0.1672, + "step": 1192 + }, + { + "epoch": 0.3865845755022683, + "grad_norm": 1.0413146018981934, + "learning_rate": 4.873279764626357e-06, + "loss": 0.1751, + "step": 1193 + }, + { + "epoch": 0.3869086195722618, + "grad_norm": 1.0643138885498047, + "learning_rate": 4.87300469885543e-06, + "loss": 0.1738, + "step": 1194 + }, + { + "epoch": 0.38723266364225534, + "grad_norm": 1.077222228050232, + "learning_rate": 4.872729342651701e-06, + "loss": 0.1918, + "step": 1195 + }, + { + "epoch": 0.38755670771224887, + "grad_norm": 1.0266451835632324, + "learning_rate": 4.87245369604887e-06, + "loss": 0.1765, + "step": 1196 + }, + { + "epoch": 0.3878807517822424, + "grad_norm": 1.0106679201126099, + "learning_rate": 4.872177759080673e-06, + "loss": 0.1745, + "step": 1197 + }, + { + "epoch": 0.3882047958522359, + "grad_norm": 0.9687255024909973, + "learning_rate": 4.8719015317808835e-06, + "loss": 0.1704, + "step": 1198 + }, + { + "epoch": 0.3885288399222294, + "grad_norm": 0.9420855045318604, + "learning_rate": 4.871625014183308e-06, + "loss": 0.1563, + "step": 1199 + }, + { + "epoch": 0.38885288399222295, + "grad_norm": 0.9460660815238953, + "learning_rate": 4.8713482063217895e-06, + "loss": 0.1617, + "step": 1200 + }, + { + "epoch": 0.3891769280622165, + "grad_norm": 1.044751524925232, + "learning_rate": 4.871071108230208e-06, + "loss": 0.1814, + "step": 1201 + }, + { + "epoch": 0.38950097213220997, + "grad_norm": 0.9933145046234131, + "learning_rate": 4.8707937199424756e-06, + "loss": 0.1715, + "step": 1202 + }, + { + "epoch": 0.3898250162022035, + "grad_norm": 1.1108256578445435, + "learning_rate": 4.870516041492543e-06, + "loss": 0.186, + "step": 1203 + }, + { + "epoch": 0.39014906027219703, + "grad_norm": 0.9695666432380676, + "learning_rate": 4.870238072914396e-06, + "loss": 0.1576, + "step": 1204 + }, + { + "epoch": 0.3904731043421905, + "grad_norm": 0.9143527150154114, + "learning_rate": 4.869959814242054e-06, + "loss": 0.1625, + "step": 1205 + }, + { + "epoch": 0.39079714841218405, + "grad_norm": 0.9999296069145203, + "learning_rate": 4.8696812655095744e-06, + "loss": 0.1693, + "step": 1206 + }, + { + "epoch": 0.3911211924821776, + "grad_norm": 0.982399582862854, + "learning_rate": 4.869402426751048e-06, + "loss": 0.1725, + "step": 1207 + }, + { + "epoch": 0.3914452365521711, + "grad_norm": 1.0316600799560547, + "learning_rate": 4.8691232980006015e-06, + "loss": 0.1759, + "step": 1208 + }, + { + "epoch": 0.3917692806221646, + "grad_norm": 0.9703241586685181, + "learning_rate": 4.868843879292399e-06, + "loss": 0.1744, + "step": 1209 + }, + { + "epoch": 0.39209332469215813, + "grad_norm": 0.9049566388130188, + "learning_rate": 4.868564170660637e-06, + "loss": 0.1503, + "step": 1210 + }, + { + "epoch": 0.39241736876215166, + "grad_norm": 1.0091451406478882, + "learning_rate": 4.868284172139551e-06, + "loss": 0.1625, + "step": 1211 + }, + { + "epoch": 0.3927414128321452, + "grad_norm": 0.9870232343673706, + "learning_rate": 4.868003883763408e-06, + "loss": 0.1686, + "step": 1212 + }, + { + "epoch": 0.3930654569021387, + "grad_norm": 1.030659556388855, + "learning_rate": 4.867723305566514e-06, + "loss": 0.1903, + "step": 1213 + }, + { + "epoch": 0.3933895009721322, + "grad_norm": 1.165837287902832, + "learning_rate": 4.86744243758321e-06, + "loss": 0.1819, + "step": 1214 + }, + { + "epoch": 0.39371354504212575, + "grad_norm": 0.9671489596366882, + "learning_rate": 4.8671612798478685e-06, + "loss": 0.1603, + "step": 1215 + }, + { + "epoch": 0.3940375891121192, + "grad_norm": 1.0202622413635254, + "learning_rate": 4.866879832394903e-06, + "loss": 0.1697, + "step": 1216 + }, + { + "epoch": 0.39436163318211276, + "grad_norm": 1.043723702430725, + "learning_rate": 4.86659809525876e-06, + "loss": 0.181, + "step": 1217 + }, + { + "epoch": 0.3946856772521063, + "grad_norm": 0.9605787396430969, + "learning_rate": 4.866316068473919e-06, + "loss": 0.1566, + "step": 1218 + }, + { + "epoch": 0.3950097213220998, + "grad_norm": 1.0066555738449097, + "learning_rate": 4.8660337520749e-06, + "loss": 0.1648, + "step": 1219 + }, + { + "epoch": 0.3953337653920933, + "grad_norm": 1.0389704704284668, + "learning_rate": 4.865751146096255e-06, + "loss": 0.1872, + "step": 1220 + }, + { + "epoch": 0.39565780946208684, + "grad_norm": 1.052191138267517, + "learning_rate": 4.865468250572571e-06, + "loss": 0.1532, + "step": 1221 + }, + { + "epoch": 0.3959818535320804, + "grad_norm": 0.9602256417274475, + "learning_rate": 4.865185065538472e-06, + "loss": 0.1608, + "step": 1222 + }, + { + "epoch": 0.3963058976020739, + "grad_norm": 1.0805506706237793, + "learning_rate": 4.86490159102862e-06, + "loss": 0.1763, + "step": 1223 + }, + { + "epoch": 0.3966299416720674, + "grad_norm": 1.0248594284057617, + "learning_rate": 4.8646178270777055e-06, + "loss": 0.1774, + "step": 1224 + }, + { + "epoch": 0.3969539857420609, + "grad_norm": 1.0524420738220215, + "learning_rate": 4.864333773720461e-06, + "loss": 0.1564, + "step": 1225 + }, + { + "epoch": 0.39727802981205446, + "grad_norm": 1.0076290369033813, + "learning_rate": 4.8640494309916506e-06, + "loss": 0.165, + "step": 1226 + }, + { + "epoch": 0.39760207388204793, + "grad_norm": 1.1074403524398804, + "learning_rate": 4.863764798926076e-06, + "loss": 0.1941, + "step": 1227 + }, + { + "epoch": 0.39792611795204147, + "grad_norm": 1.0143996477127075, + "learning_rate": 4.863479877558573e-06, + "loss": 0.1663, + "step": 1228 + }, + { + "epoch": 0.398250162022035, + "grad_norm": 0.888302206993103, + "learning_rate": 4.863194666924013e-06, + "loss": 0.1523, + "step": 1229 + }, + { + "epoch": 0.39857420609202854, + "grad_norm": 1.0188536643981934, + "learning_rate": 4.862909167057304e-06, + "loss": 0.18, + "step": 1230 + }, + { + "epoch": 0.398898250162022, + "grad_norm": 1.0219933986663818, + "learning_rate": 4.862623377993387e-06, + "loss": 0.1524, + "step": 1231 + }, + { + "epoch": 0.39922229423201555, + "grad_norm": 1.0216704607009888, + "learning_rate": 4.862337299767241e-06, + "loss": 0.1629, + "step": 1232 + }, + { + "epoch": 0.3995463383020091, + "grad_norm": 0.9859946370124817, + "learning_rate": 4.862050932413878e-06, + "loss": 0.1643, + "step": 1233 + }, + { + "epoch": 0.3998703823720026, + "grad_norm": 1.0218664407730103, + "learning_rate": 4.8617642759683474e-06, + "loss": 0.1714, + "step": 1234 + }, + { + "epoch": 0.4001944264419961, + "grad_norm": 0.9597703218460083, + "learning_rate": 4.861477330465734e-06, + "loss": 0.1628, + "step": 1235 + }, + { + "epoch": 0.40051847051198963, + "grad_norm": 1.0279196500778198, + "learning_rate": 4.861190095941155e-06, + "loss": 0.1744, + "step": 1236 + }, + { + "epoch": 0.40084251458198317, + "grad_norm": 0.953804075717926, + "learning_rate": 4.860902572429767e-06, + "loss": 0.1734, + "step": 1237 + }, + { + "epoch": 0.40116655865197665, + "grad_norm": 1.052389144897461, + "learning_rate": 4.86061475996676e-06, + "loss": 0.1778, + "step": 1238 + }, + { + "epoch": 0.4014906027219702, + "grad_norm": 0.9892643094062805, + "learning_rate": 4.860326658587358e-06, + "loss": 0.179, + "step": 1239 + }, + { + "epoch": 0.4018146467919637, + "grad_norm": 1.012811303138733, + "learning_rate": 4.860038268326823e-06, + "loss": 0.1713, + "step": 1240 + }, + { + "epoch": 0.40213869086195725, + "grad_norm": 0.994625449180603, + "learning_rate": 4.85974958922045e-06, + "loss": 0.1785, + "step": 1241 + }, + { + "epoch": 0.4024627349319507, + "grad_norm": 0.9277936220169067, + "learning_rate": 4.859460621303572e-06, + "loss": 0.1622, + "step": 1242 + }, + { + "epoch": 0.40278677900194426, + "grad_norm": 0.9426575303077698, + "learning_rate": 4.859171364611556e-06, + "loss": 0.1676, + "step": 1243 + }, + { + "epoch": 0.4031108230719378, + "grad_norm": 0.9697012901306152, + "learning_rate": 4.8588818191798035e-06, + "loss": 0.1644, + "step": 1244 + }, + { + "epoch": 0.40343486714193133, + "grad_norm": 0.9683858752250671, + "learning_rate": 4.858591985043751e-06, + "loss": 0.1693, + "step": 1245 + }, + { + "epoch": 0.4037589112119248, + "grad_norm": 0.9764914512634277, + "learning_rate": 4.858301862238874e-06, + "loss": 0.1727, + "step": 1246 + }, + { + "epoch": 0.40408295528191834, + "grad_norm": 1.0774421691894531, + "learning_rate": 4.858011450800678e-06, + "loss": 0.1741, + "step": 1247 + }, + { + "epoch": 0.4044069993519119, + "grad_norm": 1.0993943214416504, + "learning_rate": 4.857720750764708e-06, + "loss": 0.1666, + "step": 1248 + }, + { + "epoch": 0.40473104342190536, + "grad_norm": 1.1373517513275146, + "learning_rate": 4.857429762166543e-06, + "loss": 0.1678, + "step": 1249 + }, + { + "epoch": 0.4050550874918989, + "grad_norm": 1.045294165611267, + "learning_rate": 4.857138485041797e-06, + "loss": 0.1626, + "step": 1250 + }, + { + "epoch": 0.4053791315618924, + "grad_norm": 1.0252169370651245, + "learning_rate": 4.856846919426118e-06, + "loss": 0.1762, + "step": 1251 + }, + { + "epoch": 0.40570317563188596, + "grad_norm": 0.9912242293357849, + "learning_rate": 4.856555065355193e-06, + "loss": 0.1658, + "step": 1252 + }, + { + "epoch": 0.40602721970187944, + "grad_norm": 1.0291924476623535, + "learning_rate": 4.856262922864741e-06, + "loss": 0.1616, + "step": 1253 + }, + { + "epoch": 0.406351263771873, + "grad_norm": 1.0011651515960693, + "learning_rate": 4.855970491990518e-06, + "loss": 0.161, + "step": 1254 + }, + { + "epoch": 0.4066753078418665, + "grad_norm": 0.9770960211753845, + "learning_rate": 4.855677772768315e-06, + "loss": 0.1673, + "step": 1255 + }, + { + "epoch": 0.40699935191186, + "grad_norm": 0.967759370803833, + "learning_rate": 4.855384765233956e-06, + "loss": 0.1582, + "step": 1256 + }, + { + "epoch": 0.4073233959818535, + "grad_norm": 1.005577564239502, + "learning_rate": 4.8550914694233045e-06, + "loss": 0.1626, + "step": 1257 + }, + { + "epoch": 0.40764744005184705, + "grad_norm": 1.0803974866867065, + "learning_rate": 4.854797885372255e-06, + "loss": 0.1827, + "step": 1258 + }, + { + "epoch": 0.4079714841218406, + "grad_norm": 1.0000061988830566, + "learning_rate": 4.854504013116741e-06, + "loss": 0.1501, + "step": 1259 + }, + { + "epoch": 0.40829552819183407, + "grad_norm": 0.997345507144928, + "learning_rate": 4.8542098526927304e-06, + "loss": 0.1597, + "step": 1260 + }, + { + "epoch": 0.4086195722618276, + "grad_norm": 0.9689996242523193, + "learning_rate": 4.853915404136223e-06, + "loss": 0.1626, + "step": 1261 + }, + { + "epoch": 0.40894361633182114, + "grad_norm": 0.95046067237854, + "learning_rate": 4.853620667483259e-06, + "loss": 0.1603, + "step": 1262 + }, + { + "epoch": 0.40926766040181467, + "grad_norm": 1.1230173110961914, + "learning_rate": 4.853325642769908e-06, + "loss": 0.1964, + "step": 1263 + }, + { + "epoch": 0.40959170447180815, + "grad_norm": 0.9287686347961426, + "learning_rate": 4.853030330032283e-06, + "loss": 0.1575, + "step": 1264 + }, + { + "epoch": 0.4099157485418017, + "grad_norm": 1.087897777557373, + "learning_rate": 4.852734729306523e-06, + "loss": 0.1707, + "step": 1265 + }, + { + "epoch": 0.4102397926117952, + "grad_norm": 0.95196533203125, + "learning_rate": 4.852438840628808e-06, + "loss": 0.1537, + "step": 1266 + }, + { + "epoch": 0.4105638366817887, + "grad_norm": 0.9249547123908997, + "learning_rate": 4.852142664035353e-06, + "loss": 0.1565, + "step": 1267 + }, + { + "epoch": 0.41088788075178223, + "grad_norm": 1.0064045190811157, + "learning_rate": 4.8518461995624064e-06, + "loss": 0.1623, + "step": 1268 + }, + { + "epoch": 0.41121192482177576, + "grad_norm": 1.031893014907837, + "learning_rate": 4.851549447246253e-06, + "loss": 0.1737, + "step": 1269 + }, + { + "epoch": 0.4115359688917693, + "grad_norm": 0.9710052609443665, + "learning_rate": 4.851252407123211e-06, + "loss": 0.1637, + "step": 1270 + }, + { + "epoch": 0.4118600129617628, + "grad_norm": 1.0140632390975952, + "learning_rate": 4.850955079229637e-06, + "loss": 0.1629, + "step": 1271 + }, + { + "epoch": 0.4121840570317563, + "grad_norm": 0.9252100586891174, + "learning_rate": 4.850657463601921e-06, + "loss": 0.1543, + "step": 1272 + }, + { + "epoch": 0.41250810110174985, + "grad_norm": 0.9155288338661194, + "learning_rate": 4.850359560276486e-06, + "loss": 0.1565, + "step": 1273 + }, + { + "epoch": 0.4128321451717434, + "grad_norm": 1.0258336067199707, + "learning_rate": 4.850061369289795e-06, + "loss": 0.179, + "step": 1274 + }, + { + "epoch": 0.41315618924173686, + "grad_norm": 1.0771592855453491, + "learning_rate": 4.8497628906783425e-06, + "loss": 0.1706, + "step": 1275 + }, + { + "epoch": 0.4134802333117304, + "grad_norm": 1.0786904096603394, + "learning_rate": 4.84946412447866e-06, + "loss": 0.1765, + "step": 1276 + }, + { + "epoch": 0.41380427738172393, + "grad_norm": 0.9597084522247314, + "learning_rate": 4.849165070727313e-06, + "loss": 0.1464, + "step": 1277 + }, + { + "epoch": 0.4141283214517174, + "grad_norm": 0.9457292556762695, + "learning_rate": 4.848865729460903e-06, + "loss": 0.1499, + "step": 1278 + }, + { + "epoch": 0.41445236552171094, + "grad_norm": 0.9749992489814758, + "learning_rate": 4.848566100716066e-06, + "loss": 0.1672, + "step": 1279 + }, + { + "epoch": 0.4147764095917045, + "grad_norm": 1.054645299911499, + "learning_rate": 4.848266184529475e-06, + "loss": 0.1816, + "step": 1280 + }, + { + "epoch": 0.415100453661698, + "grad_norm": 0.9836113452911377, + "learning_rate": 4.847965980937836e-06, + "loss": 0.1528, + "step": 1281 + }, + { + "epoch": 0.4154244977316915, + "grad_norm": 0.9920374155044556, + "learning_rate": 4.847665489977891e-06, + "loss": 0.1672, + "step": 1282 + }, + { + "epoch": 0.415748541801685, + "grad_norm": 0.9647442102432251, + "learning_rate": 4.847364711686417e-06, + "loss": 0.1694, + "step": 1283 + }, + { + "epoch": 0.41607258587167856, + "grad_norm": 0.9995275139808655, + "learning_rate": 4.847063646100226e-06, + "loss": 0.1708, + "step": 1284 + }, + { + "epoch": 0.4163966299416721, + "grad_norm": 1.002206563949585, + "learning_rate": 4.846762293256167e-06, + "loss": 0.1611, + "step": 1285 + }, + { + "epoch": 0.41672067401166557, + "grad_norm": 0.9814795851707458, + "learning_rate": 4.846460653191121e-06, + "loss": 0.1717, + "step": 1286 + }, + { + "epoch": 0.4170447180816591, + "grad_norm": 0.9551466703414917, + "learning_rate": 4.846158725942006e-06, + "loss": 0.1716, + "step": 1287 + }, + { + "epoch": 0.41736876215165264, + "grad_norm": 0.985569953918457, + "learning_rate": 4.845856511545777e-06, + "loss": 0.1756, + "step": 1288 + }, + { + "epoch": 0.4176928062216461, + "grad_norm": 0.9430775046348572, + "learning_rate": 4.84555401003942e-06, + "loss": 0.1591, + "step": 1289 + }, + { + "epoch": 0.41801685029163965, + "grad_norm": 1.0009950399398804, + "learning_rate": 4.845251221459958e-06, + "loss": 0.1838, + "step": 1290 + }, + { + "epoch": 0.4183408943616332, + "grad_norm": 0.9523888230323792, + "learning_rate": 4.844948145844452e-06, + "loss": 0.1475, + "step": 1291 + }, + { + "epoch": 0.4186649384316267, + "grad_norm": 1.0713485479354858, + "learning_rate": 4.844644783229993e-06, + "loss": 0.169, + "step": 1292 + }, + { + "epoch": 0.4189889825016202, + "grad_norm": 1.074339509010315, + "learning_rate": 4.844341133653709e-06, + "loss": 0.1923, + "step": 1293 + }, + { + "epoch": 0.41931302657161373, + "grad_norm": 0.9841317534446716, + "learning_rate": 4.844037197152767e-06, + "loss": 0.1743, + "step": 1294 + }, + { + "epoch": 0.41963707064160727, + "grad_norm": 1.0041999816894531, + "learning_rate": 4.843732973764363e-06, + "loss": 0.174, + "step": 1295 + }, + { + "epoch": 0.4199611147116008, + "grad_norm": 0.9640102982521057, + "learning_rate": 4.8434284635257335e-06, + "loss": 0.1655, + "step": 1296 + }, + { + "epoch": 0.4202851587815943, + "grad_norm": 0.9690929651260376, + "learning_rate": 4.843123666474146e-06, + "loss": 0.1666, + "step": 1297 + }, + { + "epoch": 0.4206092028515878, + "grad_norm": 1.0001012086868286, + "learning_rate": 4.842818582646904e-06, + "loss": 0.1749, + "step": 1298 + }, + { + "epoch": 0.42093324692158135, + "grad_norm": 1.098995566368103, + "learning_rate": 4.842513212081348e-06, + "loss": 0.1761, + "step": 1299 + }, + { + "epoch": 0.42125729099157483, + "grad_norm": 0.9785960912704468, + "learning_rate": 4.8422075548148525e-06, + "loss": 0.1538, + "step": 1300 + }, + { + "epoch": 0.42158133506156836, + "grad_norm": 0.9744332432746887, + "learning_rate": 4.841901610884826e-06, + "loss": 0.1822, + "step": 1301 + }, + { + "epoch": 0.4219053791315619, + "grad_norm": 0.9197339415550232, + "learning_rate": 4.841595380328714e-06, + "loss": 0.164, + "step": 1302 + }, + { + "epoch": 0.42222942320155543, + "grad_norm": 0.9323499202728271, + "learning_rate": 4.841288863183996e-06, + "loss": 0.1605, + "step": 1303 + }, + { + "epoch": 0.4225534672715489, + "grad_norm": 1.0368022918701172, + "learning_rate": 4.840982059488186e-06, + "loss": 0.1796, + "step": 1304 + }, + { + "epoch": 0.42287751134154244, + "grad_norm": 0.9731484651565552, + "learning_rate": 4.840674969278836e-06, + "loss": 0.163, + "step": 1305 + }, + { + "epoch": 0.423201555411536, + "grad_norm": 0.9851950407028198, + "learning_rate": 4.8403675925935275e-06, + "loss": 0.1697, + "step": 1306 + }, + { + "epoch": 0.4235255994815295, + "grad_norm": 0.8935414552688599, + "learning_rate": 4.8400599294698825e-06, + "loss": 0.1496, + "step": 1307 + }, + { + "epoch": 0.423849643551523, + "grad_norm": 0.9661771059036255, + "learning_rate": 4.839751979945556e-06, + "loss": 0.1557, + "step": 1308 + }, + { + "epoch": 0.4241736876215165, + "grad_norm": 1.0194766521453857, + "learning_rate": 4.839443744058238e-06, + "loss": 0.1668, + "step": 1309 + }, + { + "epoch": 0.42449773169151006, + "grad_norm": 0.9830193519592285, + "learning_rate": 4.839135221845654e-06, + "loss": 0.1632, + "step": 1310 + }, + { + "epoch": 0.42482177576150354, + "grad_norm": 1.0080420970916748, + "learning_rate": 4.838826413345561e-06, + "loss": 0.1706, + "step": 1311 + }, + { + "epoch": 0.4251458198314971, + "grad_norm": 1.0843628644943237, + "learning_rate": 4.838517318595758e-06, + "loss": 0.177, + "step": 1312 + }, + { + "epoch": 0.4254698639014906, + "grad_norm": 0.9669689536094666, + "learning_rate": 4.838207937634074e-06, + "loss": 0.1624, + "step": 1313 + }, + { + "epoch": 0.42579390797148414, + "grad_norm": 0.9528621435165405, + "learning_rate": 4.837898270498374e-06, + "loss": 0.1645, + "step": 1314 + }, + { + "epoch": 0.4261179520414776, + "grad_norm": 1.02474844455719, + "learning_rate": 4.837588317226558e-06, + "loss": 0.1746, + "step": 1315 + }, + { + "epoch": 0.42644199611147116, + "grad_norm": 1.0107295513153076, + "learning_rate": 4.837278077856562e-06, + "loss": 0.1806, + "step": 1316 + }, + { + "epoch": 0.4267660401814647, + "grad_norm": 1.029313564300537, + "learning_rate": 4.836967552426355e-06, + "loss": 0.1587, + "step": 1317 + }, + { + "epoch": 0.4270900842514582, + "grad_norm": 0.9927104115486145, + "learning_rate": 4.836656740973944e-06, + "loss": 0.1679, + "step": 1318 + }, + { + "epoch": 0.4274141283214517, + "grad_norm": 0.93535977602005, + "learning_rate": 4.836345643537368e-06, + "loss": 0.1695, + "step": 1319 + }, + { + "epoch": 0.42773817239144524, + "grad_norm": 0.9303317070007324, + "learning_rate": 4.836034260154704e-06, + "loss": 0.1575, + "step": 1320 + }, + { + "epoch": 0.42806221646143877, + "grad_norm": 0.9873595237731934, + "learning_rate": 4.83572259086406e-06, + "loss": 0.1645, + "step": 1321 + }, + { + "epoch": 0.42838626053143225, + "grad_norm": 1.003503441810608, + "learning_rate": 4.835410635703582e-06, + "loss": 0.1627, + "step": 1322 + }, + { + "epoch": 0.4287103046014258, + "grad_norm": 0.9385678768157959, + "learning_rate": 4.835098394711451e-06, + "loss": 0.1565, + "step": 1323 + }, + { + "epoch": 0.4290343486714193, + "grad_norm": 0.9281715154647827, + "learning_rate": 4.834785867925883e-06, + "loss": 0.1657, + "step": 1324 + }, + { + "epoch": 0.42935839274141285, + "grad_norm": 0.9996689558029175, + "learning_rate": 4.8344730553851275e-06, + "loss": 0.1681, + "step": 1325 + }, + { + "epoch": 0.42968243681140633, + "grad_norm": 1.0321195125579834, + "learning_rate": 4.834159957127468e-06, + "loss": 0.1747, + "step": 1326 + }, + { + "epoch": 0.43000648088139987, + "grad_norm": 1.0136144161224365, + "learning_rate": 4.833846573191227e-06, + "loss": 0.1617, + "step": 1327 + }, + { + "epoch": 0.4303305249513934, + "grad_norm": 0.9769629836082458, + "learning_rate": 4.833532903614758e-06, + "loss": 0.1769, + "step": 1328 + }, + { + "epoch": 0.43065456902138693, + "grad_norm": 1.0463759899139404, + "learning_rate": 4.833218948436453e-06, + "loss": 0.169, + "step": 1329 + }, + { + "epoch": 0.4309786130913804, + "grad_norm": 0.9294410347938538, + "learning_rate": 4.832904707694736e-06, + "loss": 0.1636, + "step": 1330 + }, + { + "epoch": 0.43130265716137395, + "grad_norm": 0.8959236741065979, + "learning_rate": 4.832590181428066e-06, + "loss": 0.1675, + "step": 1331 + }, + { + "epoch": 0.4316267012313675, + "grad_norm": 1.002616047859192, + "learning_rate": 4.832275369674939e-06, + "loss": 0.1457, + "step": 1332 + }, + { + "epoch": 0.43195074530136096, + "grad_norm": 1.065735101699829, + "learning_rate": 4.831960272473886e-06, + "loss": 0.1665, + "step": 1333 + }, + { + "epoch": 0.4322747893713545, + "grad_norm": 1.0663241147994995, + "learning_rate": 4.831644889863471e-06, + "loss": 0.1823, + "step": 1334 + }, + { + "epoch": 0.43259883344134803, + "grad_norm": 0.9952039122581482, + "learning_rate": 4.831329221882291e-06, + "loss": 0.1589, + "step": 1335 + }, + { + "epoch": 0.43292287751134156, + "grad_norm": 0.9810091853141785, + "learning_rate": 4.831013268568986e-06, + "loss": 0.1681, + "step": 1336 + }, + { + "epoch": 0.43324692158133504, + "grad_norm": 1.0748008489608765, + "learning_rate": 4.830697029962222e-06, + "loss": 0.1676, + "step": 1337 + }, + { + "epoch": 0.4335709656513286, + "grad_norm": 0.9215734601020813, + "learning_rate": 4.830380506100704e-06, + "loss": 0.1638, + "step": 1338 + }, + { + "epoch": 0.4338950097213221, + "grad_norm": 1.0052947998046875, + "learning_rate": 4.830063697023173e-06, + "loss": 0.1735, + "step": 1339 + }, + { + "epoch": 0.43421905379131565, + "grad_norm": 0.974852979183197, + "learning_rate": 4.829746602768401e-06, + "loss": 0.1587, + "step": 1340 + }, + { + "epoch": 0.4345430978613091, + "grad_norm": 1.002942681312561, + "learning_rate": 4.8294292233752e-06, + "loss": 0.1504, + "step": 1341 + }, + { + "epoch": 0.43486714193130266, + "grad_norm": 0.9386513233184814, + "learning_rate": 4.829111558882411e-06, + "loss": 0.1623, + "step": 1342 + }, + { + "epoch": 0.4351911860012962, + "grad_norm": 0.9189992547035217, + "learning_rate": 4.828793609328916e-06, + "loss": 0.1595, + "step": 1343 + }, + { + "epoch": 0.43551523007128967, + "grad_norm": 1.1138416528701782, + "learning_rate": 4.828475374753627e-06, + "loss": 0.1723, + "step": 1344 + }, + { + "epoch": 0.4358392741412832, + "grad_norm": 1.056380033493042, + "learning_rate": 4.828156855195493e-06, + "loss": 0.1802, + "step": 1345 + }, + { + "epoch": 0.43616331821127674, + "grad_norm": 0.9843893051147461, + "learning_rate": 4.827838050693499e-06, + "loss": 0.1635, + "step": 1346 + }, + { + "epoch": 0.4364873622812703, + "grad_norm": 0.991206169128418, + "learning_rate": 4.827518961286663e-06, + "loss": 0.1829, + "step": 1347 + }, + { + "epoch": 0.43681140635126375, + "grad_norm": 1.008387804031372, + "learning_rate": 4.827199587014038e-06, + "loss": 0.1648, + "step": 1348 + }, + { + "epoch": 0.4371354504212573, + "grad_norm": 1.040220856666565, + "learning_rate": 4.826879927914713e-06, + "loss": 0.1898, + "step": 1349 + }, + { + "epoch": 0.4374594944912508, + "grad_norm": 0.9465731978416443, + "learning_rate": 4.82655998402781e-06, + "loss": 0.156, + "step": 1350 + }, + { + "epoch": 0.43778353856124436, + "grad_norm": 1.0382673740386963, + "learning_rate": 4.826239755392488e-06, + "loss": 0.1802, + "step": 1351 + }, + { + "epoch": 0.43810758263123784, + "grad_norm": 1.010624647140503, + "learning_rate": 4.8259192420479395e-06, + "loss": 0.179, + "step": 1352 + }, + { + "epoch": 0.43843162670123137, + "grad_norm": 1.0043545961380005, + "learning_rate": 4.825598444033393e-06, + "loss": 0.1821, + "step": 1353 + }, + { + "epoch": 0.4387556707712249, + "grad_norm": 0.9797499179840088, + "learning_rate": 4.82527736138811e-06, + "loss": 0.1774, + "step": 1354 + }, + { + "epoch": 0.4390797148412184, + "grad_norm": 1.036889910697937, + "learning_rate": 4.824955994151389e-06, + "loss": 0.1719, + "step": 1355 + }, + { + "epoch": 0.4394037589112119, + "grad_norm": 1.041792869567871, + "learning_rate": 4.824634342362561e-06, + "loss": 0.1888, + "step": 1356 + }, + { + "epoch": 0.43972780298120545, + "grad_norm": 0.9451294541358948, + "learning_rate": 4.824312406060995e-06, + "loss": 0.1678, + "step": 1357 + }, + { + "epoch": 0.440051847051199, + "grad_norm": 1.011522889137268, + "learning_rate": 4.82399018528609e-06, + "loss": 0.1658, + "step": 1358 + }, + { + "epoch": 0.44037589112119246, + "grad_norm": 0.9879494905471802, + "learning_rate": 4.823667680077285e-06, + "loss": 0.1664, + "step": 1359 + }, + { + "epoch": 0.440699935191186, + "grad_norm": 1.0607211589813232, + "learning_rate": 4.8233448904740505e-06, + "loss": 0.184, + "step": 1360 + }, + { + "epoch": 0.44102397926117953, + "grad_norm": 1.0475084781646729, + "learning_rate": 4.823021816515893e-06, + "loss": 0.17, + "step": 1361 + }, + { + "epoch": 0.44134802333117307, + "grad_norm": 0.9797631502151489, + "learning_rate": 4.8226984582423545e-06, + "loss": 0.1547, + "step": 1362 + }, + { + "epoch": 0.44167206740116655, + "grad_norm": 0.9320208430290222, + "learning_rate": 4.82237481569301e-06, + "loss": 0.1627, + "step": 1363 + }, + { + "epoch": 0.4419961114711601, + "grad_norm": 1.0192317962646484, + "learning_rate": 4.822050888907469e-06, + "loss": 0.1582, + "step": 1364 + }, + { + "epoch": 0.4423201555411536, + "grad_norm": 0.9490489363670349, + "learning_rate": 4.82172667792538e-06, + "loss": 0.1567, + "step": 1365 + }, + { + "epoch": 0.4426441996111471, + "grad_norm": 0.9601473212242126, + "learning_rate": 4.821402182786421e-06, + "loss": 0.1719, + "step": 1366 + }, + { + "epoch": 0.4429682436811406, + "grad_norm": 0.9360396265983582, + "learning_rate": 4.8210774035303085e-06, + "loss": 0.165, + "step": 1367 + }, + { + "epoch": 0.44329228775113416, + "grad_norm": 1.0148745775222778, + "learning_rate": 4.82075234019679e-06, + "loss": 0.1722, + "step": 1368 + }, + { + "epoch": 0.4436163318211277, + "grad_norm": 1.180428147315979, + "learning_rate": 4.820426992825653e-06, + "loss": 0.1852, + "step": 1369 + }, + { + "epoch": 0.4439403758911212, + "grad_norm": 0.9918110966682434, + "learning_rate": 4.820101361456715e-06, + "loss": 0.1789, + "step": 1370 + }, + { + "epoch": 0.4442644199611147, + "grad_norm": 0.9531265497207642, + "learning_rate": 4.819775446129832e-06, + "loss": 0.1599, + "step": 1371 + }, + { + "epoch": 0.44458846403110824, + "grad_norm": 0.9279592633247375, + "learning_rate": 4.8194492468848895e-06, + "loss": 0.1569, + "step": 1372 + }, + { + "epoch": 0.4449125081011017, + "grad_norm": 0.9266502857208252, + "learning_rate": 4.8191227637618145e-06, + "loss": 0.1635, + "step": 1373 + }, + { + "epoch": 0.44523655217109526, + "grad_norm": 0.9631732106208801, + "learning_rate": 4.818795996800564e-06, + "loss": 0.1661, + "step": 1374 + }, + { + "epoch": 0.4455605962410888, + "grad_norm": 0.9835265874862671, + "learning_rate": 4.8184689460411306e-06, + "loss": 0.1763, + "step": 1375 + }, + { + "epoch": 0.4458846403110823, + "grad_norm": 0.9746513366699219, + "learning_rate": 4.818141611523543e-06, + "loss": 0.1729, + "step": 1376 + }, + { + "epoch": 0.4462086843810758, + "grad_norm": 0.9658451080322266, + "learning_rate": 4.817813993287863e-06, + "loss": 0.1622, + "step": 1377 + }, + { + "epoch": 0.44653272845106934, + "grad_norm": 0.898226261138916, + "learning_rate": 4.817486091374189e-06, + "loss": 0.1486, + "step": 1378 + }, + { + "epoch": 0.4468567725210629, + "grad_norm": 1.0004597902297974, + "learning_rate": 4.817157905822652e-06, + "loss": 0.1764, + "step": 1379 + }, + { + "epoch": 0.4471808165910564, + "grad_norm": 1.0172264575958252, + "learning_rate": 4.816829436673421e-06, + "loss": 0.1729, + "step": 1380 + }, + { + "epoch": 0.4475048606610499, + "grad_norm": 0.9500515460968018, + "learning_rate": 4.816500683966694e-06, + "loss": 0.1579, + "step": 1381 + }, + { + "epoch": 0.4478289047310434, + "grad_norm": 1.0111055374145508, + "learning_rate": 4.816171647742708e-06, + "loss": 0.1705, + "step": 1382 + }, + { + "epoch": 0.44815294880103695, + "grad_norm": 1.0100187063217163, + "learning_rate": 4.815842328041736e-06, + "loss": 0.1729, + "step": 1383 + }, + { + "epoch": 0.44847699287103043, + "grad_norm": 0.8995622992515564, + "learning_rate": 4.815512724904081e-06, + "loss": 0.1504, + "step": 1384 + }, + { + "epoch": 0.44880103694102397, + "grad_norm": 1.2931605577468872, + "learning_rate": 4.815182838370085e-06, + "loss": 0.1682, + "step": 1385 + }, + { + "epoch": 0.4491250810110175, + "grad_norm": 1.0237258672714233, + "learning_rate": 4.814852668480122e-06, + "loss": 0.161, + "step": 1386 + }, + { + "epoch": 0.44944912508101104, + "grad_norm": 0.9830361008644104, + "learning_rate": 4.814522215274603e-06, + "loss": 0.1693, + "step": 1387 + }, + { + "epoch": 0.4497731691510045, + "grad_norm": 1.0150436162948608, + "learning_rate": 4.81419147879397e-06, + "loss": 0.1773, + "step": 1388 + }, + { + "epoch": 0.45009721322099805, + "grad_norm": 0.9565200209617615, + "learning_rate": 4.813860459078703e-06, + "loss": 0.1611, + "step": 1389 + }, + { + "epoch": 0.4504212572909916, + "grad_norm": 1.0266962051391602, + "learning_rate": 4.813529156169317e-06, + "loss": 0.1715, + "step": 1390 + }, + { + "epoch": 0.4507453013609851, + "grad_norm": 0.9856990575790405, + "learning_rate": 4.813197570106357e-06, + "loss": 0.1586, + "step": 1391 + }, + { + "epoch": 0.4510693454309786, + "grad_norm": 0.9939296245574951, + "learning_rate": 4.8128657009304096e-06, + "loss": 0.155, + "step": 1392 + }, + { + "epoch": 0.45139338950097213, + "grad_norm": 0.9616846442222595, + "learning_rate": 4.8125335486820905e-06, + "loss": 0.1571, + "step": 1393 + }, + { + "epoch": 0.45171743357096567, + "grad_norm": 0.8807049989700317, + "learning_rate": 4.8122011134020505e-06, + "loss": 0.1526, + "step": 1394 + }, + { + "epoch": 0.45204147764095914, + "grad_norm": 1.0040156841278076, + "learning_rate": 4.8118683951309795e-06, + "loss": 0.1561, + "step": 1395 + }, + { + "epoch": 0.4523655217109527, + "grad_norm": 1.0367448329925537, + "learning_rate": 4.811535393909598e-06, + "loss": 0.1871, + "step": 1396 + }, + { + "epoch": 0.4526895657809462, + "grad_norm": 1.0239020586013794, + "learning_rate": 4.811202109778661e-06, + "loss": 0.1636, + "step": 1397 + }, + { + "epoch": 0.45301360985093975, + "grad_norm": 0.949949324131012, + "learning_rate": 4.810868542778959e-06, + "loss": 0.1662, + "step": 1398 + }, + { + "epoch": 0.4533376539209332, + "grad_norm": 1.001170039176941, + "learning_rate": 4.81053469295132e-06, + "loss": 0.1654, + "step": 1399 + }, + { + "epoch": 0.45366169799092676, + "grad_norm": 0.9990749955177307, + "learning_rate": 4.810200560336601e-06, + "loss": 0.1718, + "step": 1400 + }, + { + "epoch": 0.4539857420609203, + "grad_norm": 0.9623527526855469, + "learning_rate": 4.809866144975699e-06, + "loss": 0.1649, + "step": 1401 + }, + { + "epoch": 0.45430978613091383, + "grad_norm": 0.9943154454231262, + "learning_rate": 4.809531446909541e-06, + "loss": 0.1808, + "step": 1402 + }, + { + "epoch": 0.4546338302009073, + "grad_norm": 0.9126044511795044, + "learning_rate": 4.8091964661790926e-06, + "loss": 0.1587, + "step": 1403 + }, + { + "epoch": 0.45495787427090084, + "grad_norm": 0.9064966440200806, + "learning_rate": 4.808861202825351e-06, + "loss": 0.1654, + "step": 1404 + }, + { + "epoch": 0.4552819183408944, + "grad_norm": 1.045502781867981, + "learning_rate": 4.80852565688935e-06, + "loss": 0.1482, + "step": 1405 + }, + { + "epoch": 0.45560596241088785, + "grad_norm": 0.9608927965164185, + "learning_rate": 4.808189828412157e-06, + "loss": 0.1692, + "step": 1406 + }, + { + "epoch": 0.4559300064808814, + "grad_norm": 1.019291639328003, + "learning_rate": 4.807853717434874e-06, + "loss": 0.1792, + "step": 1407 + }, + { + "epoch": 0.4562540505508749, + "grad_norm": 1.4600938558578491, + "learning_rate": 4.807517323998637e-06, + "loss": 0.1613, + "step": 1408 + }, + { + "epoch": 0.45657809462086846, + "grad_norm": 0.9859667420387268, + "learning_rate": 4.8071806481446194e-06, + "loss": 0.162, + "step": 1409 + }, + { + "epoch": 0.45690213869086194, + "grad_norm": 0.9860358238220215, + "learning_rate": 4.806843689914025e-06, + "loss": 0.1731, + "step": 1410 + }, + { + "epoch": 0.45722618276085547, + "grad_norm": 0.9931458830833435, + "learning_rate": 4.806506449348094e-06, + "loss": 0.1724, + "step": 1411 + }, + { + "epoch": 0.457550226830849, + "grad_norm": 0.9938206672668457, + "learning_rate": 4.8061689264881036e-06, + "loss": 0.1695, + "step": 1412 + }, + { + "epoch": 0.45787427090084254, + "grad_norm": 0.9255531430244446, + "learning_rate": 4.805831121375361e-06, + "loss": 0.1694, + "step": 1413 + }, + { + "epoch": 0.458198314970836, + "grad_norm": 0.9822893738746643, + "learning_rate": 4.805493034051212e-06, + "loss": 0.1648, + "step": 1414 + }, + { + "epoch": 0.45852235904082955, + "grad_norm": 0.9348422288894653, + "learning_rate": 4.805154664557034e-06, + "loss": 0.1477, + "step": 1415 + }, + { + "epoch": 0.4588464031108231, + "grad_norm": 0.9459620118141174, + "learning_rate": 4.804816012934242e-06, + "loss": 0.1728, + "step": 1416 + }, + { + "epoch": 0.45917044718081657, + "grad_norm": 0.8711515069007874, + "learning_rate": 4.8044770792242815e-06, + "loss": 0.1472, + "step": 1417 + }, + { + "epoch": 0.4594944912508101, + "grad_norm": 1.0231870412826538, + "learning_rate": 4.8041378634686355e-06, + "loss": 0.16, + "step": 1418 + }, + { + "epoch": 0.45981853532080363, + "grad_norm": 0.9538334608078003, + "learning_rate": 4.803798365708821e-06, + "loss": 0.1615, + "step": 1419 + }, + { + "epoch": 0.46014257939079717, + "grad_norm": 1.0552150011062622, + "learning_rate": 4.803458585986389e-06, + "loss": 0.186, + "step": 1420 + }, + { + "epoch": 0.46046662346079065, + "grad_norm": 1.0478894710540771, + "learning_rate": 4.803118524342925e-06, + "loss": 0.1639, + "step": 1421 + }, + { + "epoch": 0.4607906675307842, + "grad_norm": 0.9332813024520874, + "learning_rate": 4.80277818082005e-06, + "loss": 0.1566, + "step": 1422 + }, + { + "epoch": 0.4611147116007777, + "grad_norm": 1.0140115022659302, + "learning_rate": 4.802437555459418e-06, + "loss": 0.1678, + "step": 1423 + }, + { + "epoch": 0.46143875567077125, + "grad_norm": 1.0074522495269775, + "learning_rate": 4.802096648302718e-06, + "loss": 0.206, + "step": 1424 + }, + { + "epoch": 0.46176279974076473, + "grad_norm": 1.0931317806243896, + "learning_rate": 4.801755459391675e-06, + "loss": 0.1796, + "step": 1425 + }, + { + "epoch": 0.46208684381075826, + "grad_norm": 0.940055787563324, + "learning_rate": 4.801413988768047e-06, + "loss": 0.1609, + "step": 1426 + }, + { + "epoch": 0.4624108878807518, + "grad_norm": 0.938195526599884, + "learning_rate": 4.801072236473625e-06, + "loss": 0.1637, + "step": 1427 + }, + { + "epoch": 0.4627349319507453, + "grad_norm": 0.9516801834106445, + "learning_rate": 4.800730202550237e-06, + "loss": 0.1676, + "step": 1428 + }, + { + "epoch": 0.4630589760207388, + "grad_norm": 0.93316251039505, + "learning_rate": 4.800387887039747e-06, + "loss": 0.158, + "step": 1429 + }, + { + "epoch": 0.46338302009073234, + "grad_norm": 0.9559198021888733, + "learning_rate": 4.800045289984047e-06, + "loss": 0.1562, + "step": 1430 + }, + { + "epoch": 0.4637070641607259, + "grad_norm": 0.9279831051826477, + "learning_rate": 4.799702411425071e-06, + "loss": 0.1626, + "step": 1431 + }, + { + "epoch": 0.46403110823071936, + "grad_norm": 0.9327981472015381, + "learning_rate": 4.7993592514047825e-06, + "loss": 0.1579, + "step": 1432 + }, + { + "epoch": 0.4643551523007129, + "grad_norm": 1.0297558307647705, + "learning_rate": 4.7990158099651815e-06, + "loss": 0.1646, + "step": 1433 + }, + { + "epoch": 0.4646791963707064, + "grad_norm": 0.8880211114883423, + "learning_rate": 4.798672087148301e-06, + "loss": 0.1631, + "step": 1434 + }, + { + "epoch": 0.46500324044069996, + "grad_norm": 0.8490897417068481, + "learning_rate": 4.79832808299621e-06, + "loss": 0.1439, + "step": 1435 + }, + { + "epoch": 0.46532728451069344, + "grad_norm": 0.9423319101333618, + "learning_rate": 4.797983797551011e-06, + "loss": 0.1675, + "step": 1436 + }, + { + "epoch": 0.465651328580687, + "grad_norm": 1.0397905111312866, + "learning_rate": 4.7976392308548416e-06, + "loss": 0.1734, + "step": 1437 + }, + { + "epoch": 0.4659753726506805, + "grad_norm": 0.9998886585235596, + "learning_rate": 4.797294382949873e-06, + "loss": 0.1602, + "step": 1438 + }, + { + "epoch": 0.466299416720674, + "grad_norm": 0.8817910552024841, + "learning_rate": 4.796949253878311e-06, + "loss": 0.1516, + "step": 1439 + }, + { + "epoch": 0.4666234607906675, + "grad_norm": 0.9884592890739441, + "learning_rate": 4.796603843682397e-06, + "loss": 0.1666, + "step": 1440 + }, + { + "epoch": 0.46694750486066106, + "grad_norm": 0.9682788848876953, + "learning_rate": 4.796258152404406e-06, + "loss": 0.1719, + "step": 1441 + }, + { + "epoch": 0.4672715489306546, + "grad_norm": 0.9318127036094666, + "learning_rate": 4.795912180086646e-06, + "loss": 0.1572, + "step": 1442 + }, + { + "epoch": 0.46759559300064807, + "grad_norm": 0.9371210336685181, + "learning_rate": 4.795565926771461e-06, + "loss": 0.1684, + "step": 1443 + }, + { + "epoch": 0.4679196370706416, + "grad_norm": 0.968223512172699, + "learning_rate": 4.79521939250123e-06, + "loss": 0.1627, + "step": 1444 + }, + { + "epoch": 0.46824368114063514, + "grad_norm": 0.9787799715995789, + "learning_rate": 4.7948725773183645e-06, + "loss": 0.1781, + "step": 1445 + }, + { + "epoch": 0.46856772521062867, + "grad_norm": 1.0054574012756348, + "learning_rate": 4.794525481265312e-06, + "loss": 0.1825, + "step": 1446 + }, + { + "epoch": 0.46889176928062215, + "grad_norm": 0.9471763372421265, + "learning_rate": 4.794178104384554e-06, + "loss": 0.1568, + "step": 1447 + }, + { + "epoch": 0.4692158133506157, + "grad_norm": 1.0143144130706787, + "learning_rate": 4.7938304467186036e-06, + "loss": 0.1659, + "step": 1448 + }, + { + "epoch": 0.4695398574206092, + "grad_norm": 0.891257107257843, + "learning_rate": 4.793482508310014e-06, + "loss": 0.15, + "step": 1449 + }, + { + "epoch": 0.4698639014906027, + "grad_norm": 0.9457550048828125, + "learning_rate": 4.793134289201367e-06, + "loss": 0.1551, + "step": 1450 + }, + { + "epoch": 0.47018794556059623, + "grad_norm": 0.9223624467849731, + "learning_rate": 4.792785789435283e-06, + "loss": 0.149, + "step": 1451 + }, + { + "epoch": 0.47051198963058977, + "grad_norm": 0.9519339799880981, + "learning_rate": 4.792437009054413e-06, + "loss": 0.1596, + "step": 1452 + }, + { + "epoch": 0.4708360337005833, + "grad_norm": 0.9895346760749817, + "learning_rate": 4.792087948101447e-06, + "loss": 0.1659, + "step": 1453 + }, + { + "epoch": 0.4711600777705768, + "grad_norm": 0.9164407849311829, + "learning_rate": 4.791738606619105e-06, + "loss": 0.1615, + "step": 1454 + }, + { + "epoch": 0.4714841218405703, + "grad_norm": 0.9475098252296448, + "learning_rate": 4.791388984650143e-06, + "loss": 0.148, + "step": 1455 + }, + { + "epoch": 0.47180816591056385, + "grad_norm": 0.9886651635169983, + "learning_rate": 4.791039082237352e-06, + "loss": 0.1867, + "step": 1456 + }, + { + "epoch": 0.4721322099805574, + "grad_norm": 0.8633338809013367, + "learning_rate": 4.790688899423556e-06, + "loss": 0.1466, + "step": 1457 + }, + { + "epoch": 0.47245625405055086, + "grad_norm": 1.012751817703247, + "learning_rate": 4.7903384362516135e-06, + "loss": 0.1747, + "step": 1458 + }, + { + "epoch": 0.4727802981205444, + "grad_norm": 1.0237362384796143, + "learning_rate": 4.78998769276442e-06, + "loss": 0.1719, + "step": 1459 + }, + { + "epoch": 0.47310434219053793, + "grad_norm": 0.9652113318443298, + "learning_rate": 4.7896366690049016e-06, + "loss": 0.1583, + "step": 1460 + }, + { + "epoch": 0.4734283862605314, + "grad_norm": 0.9437934756278992, + "learning_rate": 4.789285365016019e-06, + "loss": 0.1584, + "step": 1461 + }, + { + "epoch": 0.47375243033052494, + "grad_norm": 0.9734069108963013, + "learning_rate": 4.788933780840771e-06, + "loss": 0.1593, + "step": 1462 + }, + { + "epoch": 0.4740764744005185, + "grad_norm": 0.9889543652534485, + "learning_rate": 4.788581916522186e-06, + "loss": 0.1707, + "step": 1463 + }, + { + "epoch": 0.474400518470512, + "grad_norm": 1.0652320384979248, + "learning_rate": 4.78822977210333e-06, + "loss": 0.1699, + "step": 1464 + }, + { + "epoch": 0.4747245625405055, + "grad_norm": 0.91072016954422, + "learning_rate": 4.787877347627302e-06, + "loss": 0.1548, + "step": 1465 + }, + { + "epoch": 0.475048606610499, + "grad_norm": 0.9999649524688721, + "learning_rate": 4.787524643137235e-06, + "loss": 0.1639, + "step": 1466 + }, + { + "epoch": 0.47537265068049256, + "grad_norm": 1.094711422920227, + "learning_rate": 4.7871716586762965e-06, + "loss": 0.182, + "step": 1467 + }, + { + "epoch": 0.4756966947504861, + "grad_norm": 0.98603755235672, + "learning_rate": 4.786818394287688e-06, + "loss": 0.1679, + "step": 1468 + }, + { + "epoch": 0.47602073882047957, + "grad_norm": 0.9653013348579407, + "learning_rate": 4.786464850014646e-06, + "loss": 0.1611, + "step": 1469 + }, + { + "epoch": 0.4763447828904731, + "grad_norm": 0.971269965171814, + "learning_rate": 4.786111025900442e-06, + "loss": 0.1724, + "step": 1470 + }, + { + "epoch": 0.47666882696046664, + "grad_norm": 0.9750480055809021, + "learning_rate": 4.785756921988379e-06, + "loss": 0.1637, + "step": 1471 + }, + { + "epoch": 0.4769928710304601, + "grad_norm": 0.9388756155967712, + "learning_rate": 4.785402538321798e-06, + "loss": 0.1637, + "step": 1472 + }, + { + "epoch": 0.47731691510045365, + "grad_norm": 1.0086320638656616, + "learning_rate": 4.785047874944069e-06, + "loss": 0.1616, + "step": 1473 + }, + { + "epoch": 0.4776409591704472, + "grad_norm": 1.0045452117919922, + "learning_rate": 4.784692931898601e-06, + "loss": 0.1541, + "step": 1474 + }, + { + "epoch": 0.4779650032404407, + "grad_norm": 0.9847915768623352, + "learning_rate": 4.7843377092288365e-06, + "loss": 0.164, + "step": 1475 + }, + { + "epoch": 0.4782890473104342, + "grad_norm": 0.996334969997406, + "learning_rate": 4.7839822069782505e-06, + "loss": 0.1659, + "step": 1476 + }, + { + "epoch": 0.47861309138042774, + "grad_norm": 0.9724141359329224, + "learning_rate": 4.783626425190353e-06, + "loss": 0.1675, + "step": 1477 + }, + { + "epoch": 0.47893713545042127, + "grad_norm": 0.9559745192527771, + "learning_rate": 4.783270363908687e-06, + "loss": 0.1688, + "step": 1478 + }, + { + "epoch": 0.4792611795204148, + "grad_norm": 1.0229589939117432, + "learning_rate": 4.782914023176834e-06, + "loss": 0.1793, + "step": 1479 + }, + { + "epoch": 0.4795852235904083, + "grad_norm": 0.9505227208137512, + "learning_rate": 4.782557403038404e-06, + "loss": 0.1658, + "step": 1480 + }, + { + "epoch": 0.4799092676604018, + "grad_norm": 0.9692978262901306, + "learning_rate": 4.7822005035370455e-06, + "loss": 0.1596, + "step": 1481 + }, + { + "epoch": 0.48023331173039535, + "grad_norm": 1.0005214214324951, + "learning_rate": 4.781843324716437e-06, + "loss": 0.1836, + "step": 1482 + }, + { + "epoch": 0.48055735580038883, + "grad_norm": 0.8998204469680786, + "learning_rate": 4.7814858666202975e-06, + "loss": 0.1557, + "step": 1483 + }, + { + "epoch": 0.48088139987038236, + "grad_norm": 0.9466003179550171, + "learning_rate": 4.781128129292374e-06, + "loss": 0.1658, + "step": 1484 + }, + { + "epoch": 0.4812054439403759, + "grad_norm": 0.9802458882331848, + "learning_rate": 4.7807701127764506e-06, + "loss": 0.1604, + "step": 1485 + }, + { + "epoch": 0.48152948801036943, + "grad_norm": 0.9373664855957031, + "learning_rate": 4.780411817116344e-06, + "loss": 0.1781, + "step": 1486 + }, + { + "epoch": 0.4818535320803629, + "grad_norm": 0.9663504958152771, + "learning_rate": 4.780053242355908e-06, + "loss": 0.1761, + "step": 1487 + }, + { + "epoch": 0.48217757615035645, + "grad_norm": 0.8889632821083069, + "learning_rate": 4.779694388539027e-06, + "loss": 0.1677, + "step": 1488 + }, + { + "epoch": 0.48250162022035, + "grad_norm": 0.9445527791976929, + "learning_rate": 4.779335255709623e-06, + "loss": 0.1644, + "step": 1489 + }, + { + "epoch": 0.48282566429034346, + "grad_norm": 0.9753222465515137, + "learning_rate": 4.778975843911649e-06, + "loss": 0.165, + "step": 1490 + }, + { + "epoch": 0.483149708360337, + "grad_norm": 0.9640643000602722, + "learning_rate": 4.778616153189093e-06, + "loss": 0.1621, + "step": 1491 + }, + { + "epoch": 0.48347375243033053, + "grad_norm": 0.9978510737419128, + "learning_rate": 4.7782561835859795e-06, + "loss": 0.1775, + "step": 1492 + }, + { + "epoch": 0.48379779650032406, + "grad_norm": 0.9495340585708618, + "learning_rate": 4.777895935146364e-06, + "loss": 0.1687, + "step": 1493 + }, + { + "epoch": 0.48412184057031754, + "grad_norm": 0.9323464035987854, + "learning_rate": 4.777535407914338e-06, + "loss": 0.1569, + "step": 1494 + }, + { + "epoch": 0.4844458846403111, + "grad_norm": 0.881629228591919, + "learning_rate": 4.777174601934026e-06, + "loss": 0.1582, + "step": 1495 + }, + { + "epoch": 0.4847699287103046, + "grad_norm": 0.9408942461013794, + "learning_rate": 4.776813517249588e-06, + "loss": 0.1677, + "step": 1496 + }, + { + "epoch": 0.48509397278029814, + "grad_norm": 1.020785927772522, + "learning_rate": 4.776452153905216e-06, + "loss": 0.186, + "step": 1497 + }, + { + "epoch": 0.4854180168502916, + "grad_norm": 0.990865170955658, + "learning_rate": 4.776090511945139e-06, + "loss": 0.1713, + "step": 1498 + }, + { + "epoch": 0.48574206092028516, + "grad_norm": 0.942308783531189, + "learning_rate": 4.775728591413616e-06, + "loss": 0.1633, + "step": 1499 + }, + { + "epoch": 0.4860661049902787, + "grad_norm": 0.9622128009796143, + "learning_rate": 4.775366392354946e-06, + "loss": 0.1612, + "step": 1500 + }, + { + "epoch": 0.48639014906027217, + "grad_norm": 0.9915785193443298, + "learning_rate": 4.775003914813456e-06, + "loss": 0.1655, + "step": 1501 + }, + { + "epoch": 0.4867141931302657, + "grad_norm": 1.0087889432907104, + "learning_rate": 4.7746411588335105e-06, + "loss": 0.1766, + "step": 1502 + }, + { + "epoch": 0.48703823720025924, + "grad_norm": 0.9488150477409363, + "learning_rate": 4.774278124459509e-06, + "loss": 0.1667, + "step": 1503 + }, + { + "epoch": 0.4873622812702528, + "grad_norm": 0.8961800336837769, + "learning_rate": 4.773914811735879e-06, + "loss": 0.1537, + "step": 1504 + }, + { + "epoch": 0.48768632534024625, + "grad_norm": 0.9038843512535095, + "learning_rate": 4.773551220707091e-06, + "loss": 0.1574, + "step": 1505 + }, + { + "epoch": 0.4880103694102398, + "grad_norm": 0.9799198508262634, + "learning_rate": 4.773187351417643e-06, + "loss": 0.1741, + "step": 1506 + }, + { + "epoch": 0.4883344134802333, + "grad_norm": 1.0378434658050537, + "learning_rate": 4.772823203912069e-06, + "loss": 0.169, + "step": 1507 + }, + { + "epoch": 0.48865845755022685, + "grad_norm": 0.9953622817993164, + "learning_rate": 4.772458778234938e-06, + "loss": 0.181, + "step": 1508 + }, + { + "epoch": 0.48898250162022033, + "grad_norm": 1.0660994052886963, + "learning_rate": 4.772094074430852e-06, + "loss": 0.1734, + "step": 1509 + }, + { + "epoch": 0.48930654569021387, + "grad_norm": 0.8679867386817932, + "learning_rate": 4.771729092544446e-06, + "loss": 0.1565, + "step": 1510 + }, + { + "epoch": 0.4896305897602074, + "grad_norm": 0.9982333183288574, + "learning_rate": 4.771363832620391e-06, + "loss": 0.1826, + "step": 1511 + }, + { + "epoch": 0.4899546338302009, + "grad_norm": 1.1239075660705566, + "learning_rate": 4.770998294703392e-06, + "loss": 0.1739, + "step": 1512 + }, + { + "epoch": 0.4902786779001944, + "grad_norm": 0.9775390028953552, + "learning_rate": 4.7706324788381865e-06, + "loss": 0.1711, + "step": 1513 + }, + { + "epoch": 0.49060272197018795, + "grad_norm": 1.0100210905075073, + "learning_rate": 4.770266385069547e-06, + "loss": 0.1774, + "step": 1514 + }, + { + "epoch": 0.4909267660401815, + "grad_norm": 0.9602349996566772, + "learning_rate": 4.769900013442279e-06, + "loss": 0.1631, + "step": 1515 + }, + { + "epoch": 0.49125081011017496, + "grad_norm": 1.0033910274505615, + "learning_rate": 4.769533364001225e-06, + "loss": 0.1764, + "step": 1516 + }, + { + "epoch": 0.4915748541801685, + "grad_norm": 0.9373995661735535, + "learning_rate": 4.769166436791257e-06, + "loss": 0.1545, + "step": 1517 + }, + { + "epoch": 0.49189889825016203, + "grad_norm": 0.8880525231361389, + "learning_rate": 4.768799231857285e-06, + "loss": 0.1442, + "step": 1518 + }, + { + "epoch": 0.49222294232015557, + "grad_norm": 0.9575677514076233, + "learning_rate": 4.768431749244251e-06, + "loss": 0.1735, + "step": 1519 + }, + { + "epoch": 0.49254698639014904, + "grad_norm": 0.9560174942016602, + "learning_rate": 4.76806398899713e-06, + "loss": 0.166, + "step": 1520 + }, + { + "epoch": 0.4928710304601426, + "grad_norm": 0.9476724863052368, + "learning_rate": 4.767695951160934e-06, + "loss": 0.1689, + "step": 1521 + }, + { + "epoch": 0.4931950745301361, + "grad_norm": 0.9879964590072632, + "learning_rate": 4.767327635780707e-06, + "loss": 0.1708, + "step": 1522 + }, + { + "epoch": 0.4935191186001296, + "grad_norm": 0.9266806840896606, + "learning_rate": 4.7669590429015265e-06, + "loss": 0.1576, + "step": 1523 + }, + { + "epoch": 0.4938431626701231, + "grad_norm": 1.020934820175171, + "learning_rate": 4.7665901725685045e-06, + "loss": 0.169, + "step": 1524 + }, + { + "epoch": 0.49416720674011666, + "grad_norm": 0.8984940052032471, + "learning_rate": 4.766221024826788e-06, + "loss": 0.1624, + "step": 1525 + }, + { + "epoch": 0.4944912508101102, + "grad_norm": 0.9921505451202393, + "learning_rate": 4.765851599721557e-06, + "loss": 0.1668, + "step": 1526 + }, + { + "epoch": 0.4948152948801037, + "grad_norm": 0.965148389339447, + "learning_rate": 4.765481897298025e-06, + "loss": 0.1684, + "step": 1527 + }, + { + "epoch": 0.4951393389500972, + "grad_norm": 0.9692872166633606, + "learning_rate": 4.76511191760144e-06, + "loss": 0.154, + "step": 1528 + }, + { + "epoch": 0.49546338302009074, + "grad_norm": 1.029179573059082, + "learning_rate": 4.764741660677085e-06, + "loss": 0.1872, + "step": 1529 + }, + { + "epoch": 0.4957874270900843, + "grad_norm": 0.9152058959007263, + "learning_rate": 4.764371126570275e-06, + "loss": 0.1618, + "step": 1530 + }, + { + "epoch": 0.49611147116007775, + "grad_norm": 0.9520514607429504, + "learning_rate": 4.76400031532636e-06, + "loss": 0.165, + "step": 1531 + }, + { + "epoch": 0.4964355152300713, + "grad_norm": 0.9545121192932129, + "learning_rate": 4.763629226990724e-06, + "loss": 0.1744, + "step": 1532 + }, + { + "epoch": 0.4967595593000648, + "grad_norm": 0.9610336422920227, + "learning_rate": 4.763257861608783e-06, + "loss": 0.1603, + "step": 1533 + }, + { + "epoch": 0.4970836033700583, + "grad_norm": 0.9071905016899109, + "learning_rate": 4.762886219225991e-06, + "loss": 0.152, + "step": 1534 + }, + { + "epoch": 0.49740764744005184, + "grad_norm": 0.9342719912528992, + "learning_rate": 4.762514299887831e-06, + "loss": 0.1625, + "step": 1535 + }, + { + "epoch": 0.49773169151004537, + "grad_norm": 1.0375604629516602, + "learning_rate": 4.762142103639824e-06, + "loss": 0.18, + "step": 1536 + }, + { + "epoch": 0.4980557355800389, + "grad_norm": 0.9291142225265503, + "learning_rate": 4.761769630527523e-06, + "loss": 0.1569, + "step": 1537 + }, + { + "epoch": 0.4983797796500324, + "grad_norm": 0.9139031767845154, + "learning_rate": 4.761396880596515e-06, + "loss": 0.1517, + "step": 1538 + }, + { + "epoch": 0.4987038237200259, + "grad_norm": 0.8803319334983826, + "learning_rate": 4.76102385389242e-06, + "loss": 0.1512, + "step": 1539 + }, + { + "epoch": 0.49902786779001945, + "grad_norm": 0.8804055452346802, + "learning_rate": 4.760650550460895e-06, + "loss": 0.1537, + "step": 1540 + }, + { + "epoch": 0.499351911860013, + "grad_norm": 0.8991281986236572, + "learning_rate": 4.760276970347627e-06, + "loss": 0.1597, + "step": 1541 + }, + { + "epoch": 0.49967595593000647, + "grad_norm": 0.9166383743286133, + "learning_rate": 4.759903113598338e-06, + "loss": 0.1686, + "step": 1542 + }, + { + "epoch": 0.5, + "grad_norm": 0.854396641254425, + "learning_rate": 4.759528980258786e-06, + "loss": 0.1364, + "step": 1543 + }, + { + "epoch": 0.5003240440699935, + "grad_norm": 1.0105361938476562, + "learning_rate": 4.759154570374761e-06, + "loss": 0.1794, + "step": 1544 + }, + { + "epoch": 0.5006480881399871, + "grad_norm": 0.9273484349250793, + "learning_rate": 4.758779883992087e-06, + "loss": 0.1607, + "step": 1545 + }, + { + "epoch": 0.5009721322099806, + "grad_norm": 1.0614210367202759, + "learning_rate": 4.758404921156622e-06, + "loss": 0.1927, + "step": 1546 + }, + { + "epoch": 0.501296176279974, + "grad_norm": 0.9139055609703064, + "learning_rate": 4.7580296819142565e-06, + "loss": 0.1525, + "step": 1547 + }, + { + "epoch": 0.5016202203499676, + "grad_norm": 0.9235586524009705, + "learning_rate": 4.757654166310919e-06, + "loss": 0.1526, + "step": 1548 + }, + { + "epoch": 0.5019442644199611, + "grad_norm": 0.9597269296646118, + "learning_rate": 4.757278374392567e-06, + "loss": 0.1707, + "step": 1549 + }, + { + "epoch": 0.5022683084899546, + "grad_norm": 0.9178736805915833, + "learning_rate": 4.7569023062051936e-06, + "loss": 0.159, + "step": 1550 + }, + { + "epoch": 0.5025923525599482, + "grad_norm": 0.9147335886955261, + "learning_rate": 4.756525961794826e-06, + "loss": 0.1573, + "step": 1551 + }, + { + "epoch": 0.5029163966299417, + "grad_norm": 0.9195179343223572, + "learning_rate": 4.756149341207526e-06, + "loss": 0.165, + "step": 1552 + }, + { + "epoch": 0.5032404406999352, + "grad_norm": 0.9070149660110474, + "learning_rate": 4.755772444489388e-06, + "loss": 0.1631, + "step": 1553 + }, + { + "epoch": 0.5035644847699287, + "grad_norm": 0.9737628102302551, + "learning_rate": 4.75539527168654e-06, + "loss": 0.173, + "step": 1554 + }, + { + "epoch": 0.5038885288399222, + "grad_norm": 1.0073634386062622, + "learning_rate": 4.755017822845145e-06, + "loss": 0.1799, + "step": 1555 + }, + { + "epoch": 0.5042125729099157, + "grad_norm": 0.8783349394798279, + "learning_rate": 4.754640098011399e-06, + "loss": 0.1545, + "step": 1556 + }, + { + "epoch": 0.5045366169799093, + "grad_norm": 0.896638035774231, + "learning_rate": 4.754262097231531e-06, + "loss": 0.1603, + "step": 1557 + }, + { + "epoch": 0.5048606610499028, + "grad_norm": 0.9870060086250305, + "learning_rate": 4.753883820551806e-06, + "loss": 0.1601, + "step": 1558 + }, + { + "epoch": 0.5051847051198963, + "grad_norm": 0.9582774043083191, + "learning_rate": 4.75350526801852e-06, + "loss": 0.1682, + "step": 1559 + }, + { + "epoch": 0.5055087491898899, + "grad_norm": 0.9573325514793396, + "learning_rate": 4.753126439678005e-06, + "loss": 0.163, + "step": 1560 + }, + { + "epoch": 0.5058327932598834, + "grad_norm": 0.9994848370552063, + "learning_rate": 4.752747335576626e-06, + "loss": 0.1694, + "step": 1561 + }, + { + "epoch": 0.5061568373298768, + "grad_norm": 0.9292284250259399, + "learning_rate": 4.752367955760781e-06, + "loss": 0.1619, + "step": 1562 + }, + { + "epoch": 0.5064808813998704, + "grad_norm": 0.9492481350898743, + "learning_rate": 4.751988300276903e-06, + "loss": 0.1673, + "step": 1563 + }, + { + "epoch": 0.5068049254698639, + "grad_norm": 0.9279813170433044, + "learning_rate": 4.751608369171458e-06, + "loss": 0.1544, + "step": 1564 + }, + { + "epoch": 0.5071289695398574, + "grad_norm": 0.9612503051757812, + "learning_rate": 4.751228162490946e-06, + "loss": 0.1599, + "step": 1565 + }, + { + "epoch": 0.507453013609851, + "grad_norm": 1.0000669956207275, + "learning_rate": 4.750847680281901e-06, + "loss": 0.1655, + "step": 1566 + }, + { + "epoch": 0.5077770576798445, + "grad_norm": 0.9342029690742493, + "learning_rate": 4.750466922590888e-06, + "loss": 0.1537, + "step": 1567 + }, + { + "epoch": 0.508101101749838, + "grad_norm": 0.8906809091567993, + "learning_rate": 4.750085889464512e-06, + "loss": 0.162, + "step": 1568 + }, + { + "epoch": 0.5084251458198314, + "grad_norm": 1.016288161277771, + "learning_rate": 4.749704580949404e-06, + "loss": 0.167, + "step": 1569 + }, + { + "epoch": 0.508749189889825, + "grad_norm": 1.0042933225631714, + "learning_rate": 4.749322997092235e-06, + "loss": 0.1605, + "step": 1570 + }, + { + "epoch": 0.5090732339598185, + "grad_norm": 1.0259679555892944, + "learning_rate": 4.748941137939706e-06, + "loss": 0.1699, + "step": 1571 + }, + { + "epoch": 0.509397278029812, + "grad_norm": 0.8490434885025024, + "learning_rate": 4.748559003538553e-06, + "loss": 0.1486, + "step": 1572 + }, + { + "epoch": 0.5097213220998056, + "grad_norm": 0.9260954260826111, + "learning_rate": 4.748176593935546e-06, + "loss": 0.1751, + "step": 1573 + }, + { + "epoch": 0.5100453661697991, + "grad_norm": 0.8708962798118591, + "learning_rate": 4.7477939091774885e-06, + "loss": 0.154, + "step": 1574 + }, + { + "epoch": 0.5103694102397927, + "grad_norm": 1.035266399383545, + "learning_rate": 4.7474109493112154e-06, + "loss": 0.1629, + "step": 1575 + }, + { + "epoch": 0.5106934543097861, + "grad_norm": 1.014716625213623, + "learning_rate": 4.7470277143836e-06, + "loss": 0.1741, + "step": 1576 + }, + { + "epoch": 0.5110174983797796, + "grad_norm": 0.9917327761650085, + "learning_rate": 4.746644204441545e-06, + "loss": 0.1772, + "step": 1577 + }, + { + "epoch": 0.5113415424497731, + "grad_norm": 1.0170183181762695, + "learning_rate": 4.746260419531989e-06, + "loss": 0.1718, + "step": 1578 + }, + { + "epoch": 0.5116655865197667, + "grad_norm": 0.9468106031417847, + "learning_rate": 4.745876359701902e-06, + "loss": 0.1652, + "step": 1579 + }, + { + "epoch": 0.5119896305897602, + "grad_norm": 0.9813267588615417, + "learning_rate": 4.745492024998291e-06, + "loss": 0.17, + "step": 1580 + }, + { + "epoch": 0.5123136746597537, + "grad_norm": 1.0143340826034546, + "learning_rate": 4.745107415468194e-06, + "loss": 0.1665, + "step": 1581 + }, + { + "epoch": 0.5126377187297473, + "grad_norm": 0.9156917929649353, + "learning_rate": 4.744722531158683e-06, + "loss": 0.1556, + "step": 1582 + }, + { + "epoch": 0.5129617627997408, + "grad_norm": 0.8644934892654419, + "learning_rate": 4.744337372116866e-06, + "loss": 0.1604, + "step": 1583 + }, + { + "epoch": 0.5132858068697342, + "grad_norm": 0.9044967293739319, + "learning_rate": 4.743951938389881e-06, + "loss": 0.1516, + "step": 1584 + }, + { + "epoch": 0.5136098509397278, + "grad_norm": 1.0245859622955322, + "learning_rate": 4.743566230024902e-06, + "loss": 0.1574, + "step": 1585 + }, + { + "epoch": 0.5139338950097213, + "grad_norm": 1.0180058479309082, + "learning_rate": 4.7431802470691355e-06, + "loss": 0.1725, + "step": 1586 + }, + { + "epoch": 0.5142579390797148, + "grad_norm": 0.9526636600494385, + "learning_rate": 4.7427939895698235e-06, + "loss": 0.1586, + "step": 1587 + }, + { + "epoch": 0.5145819831497084, + "grad_norm": 0.9528282284736633, + "learning_rate": 4.742407457574238e-06, + "loss": 0.1678, + "step": 1588 + }, + { + "epoch": 0.5149060272197019, + "grad_norm": 0.8863286375999451, + "learning_rate": 4.7420206511296885e-06, + "loss": 0.149, + "step": 1589 + }, + { + "epoch": 0.5152300712896954, + "grad_norm": 0.8912634253501892, + "learning_rate": 4.7416335702835155e-06, + "loss": 0.1614, + "step": 1590 + }, + { + "epoch": 0.5155541153596889, + "grad_norm": 1.000412940979004, + "learning_rate": 4.741246215083094e-06, + "loss": 0.1785, + "step": 1591 + }, + { + "epoch": 0.5158781594296824, + "grad_norm": 1.007895827293396, + "learning_rate": 4.740858585575832e-06, + "loss": 0.1659, + "step": 1592 + }, + { + "epoch": 0.5162022034996759, + "grad_norm": 0.9396275281906128, + "learning_rate": 4.7404706818091736e-06, + "loss": 0.1656, + "step": 1593 + }, + { + "epoch": 0.5165262475696695, + "grad_norm": 1.000503420829773, + "learning_rate": 4.740082503830593e-06, + "loss": 0.1299, + "step": 1594 + }, + { + "epoch": 0.516850291639663, + "grad_norm": 0.9946775436401367, + "learning_rate": 4.7396940516875996e-06, + "loss": 0.1719, + "step": 1595 + }, + { + "epoch": 0.5171743357096565, + "grad_norm": 0.8963751792907715, + "learning_rate": 4.739305325427736e-06, + "loss": 0.1536, + "step": 1596 + }, + { + "epoch": 0.5174983797796501, + "grad_norm": 1.0010433197021484, + "learning_rate": 4.738916325098579e-06, + "loss": 0.1629, + "step": 1597 + }, + { + "epoch": 0.5178224238496435, + "grad_norm": 0.9205980896949768, + "learning_rate": 4.738527050747738e-06, + "loss": 0.1569, + "step": 1598 + }, + { + "epoch": 0.518146467919637, + "grad_norm": 0.9161039590835571, + "learning_rate": 4.738137502422856e-06, + "loss": 0.1559, + "step": 1599 + }, + { + "epoch": 0.5184705119896306, + "grad_norm": 0.9791836738586426, + "learning_rate": 4.737747680171611e-06, + "loss": 0.1663, + "step": 1600 + }, + { + "epoch": 0.5187945560596241, + "grad_norm": 0.9533772468566895, + "learning_rate": 4.737357584041713e-06, + "loss": 0.1763, + "step": 1601 + }, + { + "epoch": 0.5191186001296176, + "grad_norm": 0.8820740580558777, + "learning_rate": 4.7369672140809065e-06, + "loss": 0.1528, + "step": 1602 + }, + { + "epoch": 0.5194426441996112, + "grad_norm": 0.9435256123542786, + "learning_rate": 4.736576570336968e-06, + "loss": 0.1746, + "step": 1603 + }, + { + "epoch": 0.5197666882696047, + "grad_norm": 0.9638010263442993, + "learning_rate": 4.736185652857709e-06, + "loss": 0.1581, + "step": 1604 + }, + { + "epoch": 0.5200907323395982, + "grad_norm": 0.9515169262886047, + "learning_rate": 4.7357944616909745e-06, + "loss": 0.159, + "step": 1605 + }, + { + "epoch": 0.5204147764095917, + "grad_norm": 0.9088611602783203, + "learning_rate": 4.735402996884642e-06, + "loss": 0.1648, + "step": 1606 + }, + { + "epoch": 0.5207388204795852, + "grad_norm": 0.8556360006332397, + "learning_rate": 4.7350112584866225e-06, + "loss": 0.1603, + "step": 1607 + }, + { + "epoch": 0.5210628645495787, + "grad_norm": 0.9666315913200378, + "learning_rate": 4.734619246544862e-06, + "loss": 0.1751, + "step": 1608 + }, + { + "epoch": 0.5213869086195723, + "grad_norm": 0.9698085188865662, + "learning_rate": 4.734226961107338e-06, + "loss": 0.1634, + "step": 1609 + }, + { + "epoch": 0.5217109526895658, + "grad_norm": 0.9516555666923523, + "learning_rate": 4.733834402222064e-06, + "loss": 0.1646, + "step": 1610 + }, + { + "epoch": 0.5220349967595593, + "grad_norm": 1.0506342649459839, + "learning_rate": 4.7334415699370825e-06, + "loss": 0.1772, + "step": 1611 + }, + { + "epoch": 0.5223590408295529, + "grad_norm": 0.9331313967704773, + "learning_rate": 4.733048464300476e-06, + "loss": 0.1487, + "step": 1612 + }, + { + "epoch": 0.5226830848995463, + "grad_norm": 1.0250540971755981, + "learning_rate": 4.732655085360355e-06, + "loss": 0.179, + "step": 1613 + }, + { + "epoch": 0.5230071289695398, + "grad_norm": 0.9178912043571472, + "learning_rate": 4.7322614331648645e-06, + "loss": 0.1594, + "step": 1614 + }, + { + "epoch": 0.5233311730395334, + "grad_norm": 0.9130881428718567, + "learning_rate": 4.731867507762184e-06, + "loss": 0.1538, + "step": 1615 + }, + { + "epoch": 0.5236552171095269, + "grad_norm": 0.8410662412643433, + "learning_rate": 4.731473309200528e-06, + "loss": 0.1489, + "step": 1616 + }, + { + "epoch": 0.5239792611795204, + "grad_norm": 0.925051748752594, + "learning_rate": 4.731078837528141e-06, + "loss": 0.1653, + "step": 1617 + }, + { + "epoch": 0.524303305249514, + "grad_norm": 0.8911797404289246, + "learning_rate": 4.730684092793302e-06, + "loss": 0.1587, + "step": 1618 + }, + { + "epoch": 0.5246273493195075, + "grad_norm": 0.9711928367614746, + "learning_rate": 4.730289075044326e-06, + "loss": 0.1706, + "step": 1619 + }, + { + "epoch": 0.5249513933895009, + "grad_norm": 0.9292394518852234, + "learning_rate": 4.729893784329557e-06, + "loss": 0.1465, + "step": 1620 + }, + { + "epoch": 0.5252754374594945, + "grad_norm": 0.9164044857025146, + "learning_rate": 4.729498220697377e-06, + "loss": 0.1635, + "step": 1621 + }, + { + "epoch": 0.525599481529488, + "grad_norm": 0.8762049674987793, + "learning_rate": 4.729102384196197e-06, + "loss": 0.1554, + "step": 1622 + }, + { + "epoch": 0.5259235255994815, + "grad_norm": 0.8459444046020508, + "learning_rate": 4.728706274874465e-06, + "loss": 0.1619, + "step": 1623 + }, + { + "epoch": 0.5262475696694751, + "grad_norm": 0.9501531720161438, + "learning_rate": 4.72830989278066e-06, + "loss": 0.1756, + "step": 1624 + }, + { + "epoch": 0.5265716137394686, + "grad_norm": 0.9610527157783508, + "learning_rate": 4.727913237963296e-06, + "loss": 0.1707, + "step": 1625 + }, + { + "epoch": 0.5268956578094621, + "grad_norm": 0.9790906310081482, + "learning_rate": 4.72751631047092e-06, + "loss": 0.1887, + "step": 1626 + }, + { + "epoch": 0.5272197018794557, + "grad_norm": 1.0699243545532227, + "learning_rate": 4.727119110352112e-06, + "loss": 0.1571, + "step": 1627 + }, + { + "epoch": 0.5275437459494491, + "grad_norm": 0.8611209988594055, + "learning_rate": 4.726721637655484e-06, + "loss": 0.1649, + "step": 1628 + }, + { + "epoch": 0.5278677900194426, + "grad_norm": 0.9416497349739075, + "learning_rate": 4.7263238924296835e-06, + "loss": 0.162, + "step": 1629 + }, + { + "epoch": 0.5281918340894362, + "grad_norm": 0.8431905508041382, + "learning_rate": 4.725925874723393e-06, + "loss": 0.1535, + "step": 1630 + }, + { + "epoch": 0.5285158781594297, + "grad_norm": 1.0187968015670776, + "learning_rate": 4.725527584585322e-06, + "loss": 0.1622, + "step": 1631 + }, + { + "epoch": 0.5288399222294232, + "grad_norm": 0.9045505523681641, + "learning_rate": 4.725129022064221e-06, + "loss": 0.1442, + "step": 1632 + }, + { + "epoch": 0.5291639662994168, + "grad_norm": 0.9165852665901184, + "learning_rate": 4.724730187208868e-06, + "loss": 0.1612, + "step": 1633 + }, + { + "epoch": 0.5294880103694103, + "grad_norm": 1.0090407133102417, + "learning_rate": 4.724331080068077e-06, + "loss": 0.181, + "step": 1634 + }, + { + "epoch": 0.5298120544394037, + "grad_norm": 0.9618867635726929, + "learning_rate": 4.723931700690695e-06, + "loss": 0.1576, + "step": 1635 + }, + { + "epoch": 0.5301360985093972, + "grad_norm": 0.8928550481796265, + "learning_rate": 4.7235320491256026e-06, + "loss": 0.1512, + "step": 1636 + }, + { + "epoch": 0.5304601425793908, + "grad_norm": 0.9328296184539795, + "learning_rate": 4.723132125421712e-06, + "loss": 0.1681, + "step": 1637 + }, + { + "epoch": 0.5307841866493843, + "grad_norm": 0.9727100133895874, + "learning_rate": 4.722731929627971e-06, + "loss": 0.1634, + "step": 1638 + }, + { + "epoch": 0.5311082307193778, + "grad_norm": 0.88592129945755, + "learning_rate": 4.722331461793361e-06, + "loss": 0.1529, + "step": 1639 + }, + { + "epoch": 0.5314322747893714, + "grad_norm": 0.94303959608078, + "learning_rate": 4.721930721966893e-06, + "loss": 0.148, + "step": 1640 + }, + { + "epoch": 0.5317563188593649, + "grad_norm": 0.8974708914756775, + "learning_rate": 4.7215297101976145e-06, + "loss": 0.1583, + "step": 1641 + }, + { + "epoch": 0.5320803629293583, + "grad_norm": 0.9344980120658875, + "learning_rate": 4.721128426534605e-06, + "loss": 0.1624, + "step": 1642 + }, + { + "epoch": 0.5324044069993519, + "grad_norm": 1.0182918310165405, + "learning_rate": 4.720726871026978e-06, + "loss": 0.1764, + "step": 1643 + }, + { + "epoch": 0.5327284510693454, + "grad_norm": 0.9536001682281494, + "learning_rate": 4.720325043723881e-06, + "loss": 0.1679, + "step": 1644 + }, + { + "epoch": 0.5330524951393389, + "grad_norm": 0.8858970403671265, + "learning_rate": 4.719922944674494e-06, + "loss": 0.1511, + "step": 1645 + }, + { + "epoch": 0.5333765392093325, + "grad_norm": 0.9926290512084961, + "learning_rate": 4.719520573928028e-06, + "loss": 0.1723, + "step": 1646 + }, + { + "epoch": 0.533700583279326, + "grad_norm": 0.9548216462135315, + "learning_rate": 4.71911793153373e-06, + "loss": 0.1684, + "step": 1647 + }, + { + "epoch": 0.5340246273493195, + "grad_norm": 1.0088659524917603, + "learning_rate": 4.7187150175408805e-06, + "loss": 0.1768, + "step": 1648 + }, + { + "epoch": 0.5343486714193131, + "grad_norm": 0.9483785033226013, + "learning_rate": 4.718311831998792e-06, + "loss": 0.1788, + "step": 1649 + }, + { + "epoch": 0.5346727154893065, + "grad_norm": 0.8539042472839355, + "learning_rate": 4.71790837495681e-06, + "loss": 0.1612, + "step": 1650 + }, + { + "epoch": 0.5349967595593, + "grad_norm": 0.8781387209892273, + "learning_rate": 4.717504646464314e-06, + "loss": 0.1654, + "step": 1651 + }, + { + "epoch": 0.5353208036292936, + "grad_norm": 0.888820469379425, + "learning_rate": 4.717100646570716e-06, + "loss": 0.1558, + "step": 1652 + }, + { + "epoch": 0.5356448476992871, + "grad_norm": 0.8900648951530457, + "learning_rate": 4.7166963753254616e-06, + "loss": 0.1484, + "step": 1653 + }, + { + "epoch": 0.5359688917692806, + "grad_norm": 0.8877013325691223, + "learning_rate": 4.716291832778031e-06, + "loss": 0.1614, + "step": 1654 + }, + { + "epoch": 0.5362929358392742, + "grad_norm": 0.9067593216896057, + "learning_rate": 4.715887018977935e-06, + "loss": 0.1511, + "step": 1655 + }, + { + "epoch": 0.5366169799092677, + "grad_norm": 0.9863800406455994, + "learning_rate": 4.715481933974719e-06, + "loss": 0.1576, + "step": 1656 + }, + { + "epoch": 0.5369410239792611, + "grad_norm": 0.9320042729377747, + "learning_rate": 4.715076577817963e-06, + "loss": 0.164, + "step": 1657 + }, + { + "epoch": 0.5372650680492547, + "grad_norm": 1.0090923309326172, + "learning_rate": 4.714670950557276e-06, + "loss": 0.1868, + "step": 1658 + }, + { + "epoch": 0.5375891121192482, + "grad_norm": 0.9123954772949219, + "learning_rate": 4.714265052242306e-06, + "loss": 0.1631, + "step": 1659 + }, + { + "epoch": 0.5379131561892417, + "grad_norm": 0.9024103879928589, + "learning_rate": 4.7138588829227285e-06, + "loss": 0.1576, + "step": 1660 + }, + { + "epoch": 0.5382372002592353, + "grad_norm": 0.9303012490272522, + "learning_rate": 4.713452442648255e-06, + "loss": 0.1581, + "step": 1661 + }, + { + "epoch": 0.5385612443292288, + "grad_norm": 0.9538670778274536, + "learning_rate": 4.7130457314686316e-06, + "loss": 0.1716, + "step": 1662 + }, + { + "epoch": 0.5388852883992223, + "grad_norm": 0.8686206936836243, + "learning_rate": 4.712638749433634e-06, + "loss": 0.1473, + "step": 1663 + }, + { + "epoch": 0.5392093324692158, + "grad_norm": 0.9704967141151428, + "learning_rate": 4.7122314965930724e-06, + "loss": 0.1639, + "step": 1664 + }, + { + "epoch": 0.5395333765392093, + "grad_norm": 0.953475832939148, + "learning_rate": 4.711823972996793e-06, + "loss": 0.1729, + "step": 1665 + }, + { + "epoch": 0.5398574206092028, + "grad_norm": 0.9771159291267395, + "learning_rate": 4.711416178694671e-06, + "loss": 0.1882, + "step": 1666 + }, + { + "epoch": 0.5401814646791964, + "grad_norm": 0.9252657890319824, + "learning_rate": 4.711008113736617e-06, + "loss": 0.16, + "step": 1667 + }, + { + "epoch": 0.5405055087491899, + "grad_norm": 0.9023372530937195, + "learning_rate": 4.710599778172575e-06, + "loss": 0.161, + "step": 1668 + }, + { + "epoch": 0.5408295528191834, + "grad_norm": 0.9718496799468994, + "learning_rate": 4.7101911720525186e-06, + "loss": 0.1513, + "step": 1669 + }, + { + "epoch": 0.541153596889177, + "grad_norm": 0.952275276184082, + "learning_rate": 4.70978229542646e-06, + "loss": 0.1693, + "step": 1670 + }, + { + "epoch": 0.5414776409591704, + "grad_norm": 0.8786360025405884, + "learning_rate": 4.709373148344441e-06, + "loss": 0.1574, + "step": 1671 + }, + { + "epoch": 0.5418016850291639, + "grad_norm": 0.9894282221794128, + "learning_rate": 4.708963730856536e-06, + "loss": 0.1632, + "step": 1672 + }, + { + "epoch": 0.5421257290991575, + "grad_norm": 0.9938233494758606, + "learning_rate": 4.708554043012857e-06, + "loss": 0.1622, + "step": 1673 + }, + { + "epoch": 0.542449773169151, + "grad_norm": 1.0080715417861938, + "learning_rate": 4.708144084863541e-06, + "loss": 0.1603, + "step": 1674 + }, + { + "epoch": 0.5427738172391445, + "grad_norm": 1.285111665725708, + "learning_rate": 4.707733856458767e-06, + "loss": 0.1904, + "step": 1675 + }, + { + "epoch": 0.5430978613091381, + "grad_norm": 0.9874075055122375, + "learning_rate": 4.707323357848741e-06, + "loss": 0.1682, + "step": 1676 + }, + { + "epoch": 0.5434219053791316, + "grad_norm": 0.9716241955757141, + "learning_rate": 4.706912589083704e-06, + "loss": 0.1737, + "step": 1677 + }, + { + "epoch": 0.5437459494491251, + "grad_norm": 0.8894767761230469, + "learning_rate": 4.706501550213932e-06, + "loss": 0.1582, + "step": 1678 + }, + { + "epoch": 0.5440699935191186, + "grad_norm": 1.0196465253829956, + "learning_rate": 4.70609024128973e-06, + "loss": 0.1846, + "step": 1679 + }, + { + "epoch": 0.5443940375891121, + "grad_norm": 0.9901070594787598, + "learning_rate": 4.7056786623614395e-06, + "loss": 0.1629, + "step": 1680 + }, + { + "epoch": 0.5447180816591056, + "grad_norm": 0.9512284994125366, + "learning_rate": 4.705266813479434e-06, + "loss": 0.1527, + "step": 1681 + }, + { + "epoch": 0.5450421257290992, + "grad_norm": 0.9086491465568542, + "learning_rate": 4.704854694694117e-06, + "loss": 0.1581, + "step": 1682 + }, + { + "epoch": 0.5453661697990927, + "grad_norm": 0.9667937755584717, + "learning_rate": 4.704442306055932e-06, + "loss": 0.1622, + "step": 1683 + }, + { + "epoch": 0.5456902138690862, + "grad_norm": 0.9949867129325867, + "learning_rate": 4.704029647615348e-06, + "loss": 0.1639, + "step": 1684 + }, + { + "epoch": 0.5460142579390798, + "grad_norm": 0.9589911103248596, + "learning_rate": 4.703616719422873e-06, + "loss": 0.1692, + "step": 1685 + }, + { + "epoch": 0.5463383020090732, + "grad_norm": 0.9363300204277039, + "learning_rate": 4.703203521529044e-06, + "loss": 0.1641, + "step": 1686 + }, + { + "epoch": 0.5466623460790667, + "grad_norm": 0.9348986148834229, + "learning_rate": 4.702790053984432e-06, + "loss": 0.1566, + "step": 1687 + }, + { + "epoch": 0.5469863901490603, + "grad_norm": 1.3050991296768188, + "learning_rate": 4.702376316839642e-06, + "loss": 0.1701, + "step": 1688 + }, + { + "epoch": 0.5473104342190538, + "grad_norm": 0.8916712403297424, + "learning_rate": 4.701962310145312e-06, + "loss": 0.1613, + "step": 1689 + }, + { + "epoch": 0.5476344782890473, + "grad_norm": 0.8960726857185364, + "learning_rate": 4.7015480339521115e-06, + "loss": 0.1574, + "step": 1690 + }, + { + "epoch": 0.5479585223590409, + "grad_norm": 0.9802014231681824, + "learning_rate": 4.701133488310744e-06, + "loss": 0.1691, + "step": 1691 + }, + { + "epoch": 0.5482825664290344, + "grad_norm": 0.8750230669975281, + "learning_rate": 4.700718673271947e-06, + "loss": 0.1495, + "step": 1692 + }, + { + "epoch": 0.5486066104990278, + "grad_norm": 0.9495435953140259, + "learning_rate": 4.700303588886489e-06, + "loss": 0.1607, + "step": 1693 + }, + { + "epoch": 0.5489306545690213, + "grad_norm": 0.9570188522338867, + "learning_rate": 4.699888235205172e-06, + "loss": 0.1641, + "step": 1694 + }, + { + "epoch": 0.5492546986390149, + "grad_norm": 0.9574050903320312, + "learning_rate": 4.699472612278831e-06, + "loss": 0.1766, + "step": 1695 + }, + { + "epoch": 0.5495787427090084, + "grad_norm": 0.9535946846008301, + "learning_rate": 4.699056720158336e-06, + "loss": 0.1609, + "step": 1696 + }, + { + "epoch": 0.549902786779002, + "grad_norm": 0.9584629535675049, + "learning_rate": 4.698640558894586e-06, + "loss": 0.165, + "step": 1697 + }, + { + "epoch": 0.5502268308489955, + "grad_norm": 0.932038426399231, + "learning_rate": 4.698224128538517e-06, + "loss": 0.162, + "step": 1698 + }, + { + "epoch": 0.550550874918989, + "grad_norm": 0.9042884111404419, + "learning_rate": 4.6978074291410936e-06, + "loss": 0.1565, + "step": 1699 + }, + { + "epoch": 0.5508749189889826, + "grad_norm": 0.9677349925041199, + "learning_rate": 4.697390460753318e-06, + "loss": 0.1815, + "step": 1700 + }, + { + "epoch": 0.551198963058976, + "grad_norm": 0.867965817451477, + "learning_rate": 4.696973223426224e-06, + "loss": 0.1427, + "step": 1701 + }, + { + "epoch": 0.5515230071289695, + "grad_norm": 0.8912596106529236, + "learning_rate": 4.696555717210873e-06, + "loss": 0.1511, + "step": 1702 + }, + { + "epoch": 0.551847051198963, + "grad_norm": 0.9456349015235901, + "learning_rate": 4.6961379421583685e-06, + "loss": 0.1618, + "step": 1703 + }, + { + "epoch": 0.5521710952689566, + "grad_norm": 0.9106193780899048, + "learning_rate": 4.695719898319839e-06, + "loss": 0.1529, + "step": 1704 + }, + { + "epoch": 0.5524951393389501, + "grad_norm": 0.9977267980575562, + "learning_rate": 4.695301585746451e-06, + "loss": 0.1705, + "step": 1705 + }, + { + "epoch": 0.5528191834089436, + "grad_norm": 1.021049976348877, + "learning_rate": 4.6948830044894016e-06, + "loss": 0.1711, + "step": 1706 + }, + { + "epoch": 0.5531432274789372, + "grad_norm": 0.953581690788269, + "learning_rate": 4.6944641545999194e-06, + "loss": 0.1588, + "step": 1707 + }, + { + "epoch": 0.5534672715489306, + "grad_norm": 0.9142120480537415, + "learning_rate": 4.694045036129269e-06, + "loss": 0.1667, + "step": 1708 + }, + { + "epoch": 0.5537913156189241, + "grad_norm": 0.9687793850898743, + "learning_rate": 4.693625649128746e-06, + "loss": 0.1669, + "step": 1709 + }, + { + "epoch": 0.5541153596889177, + "grad_norm": 0.9391654133796692, + "learning_rate": 4.69320599364968e-06, + "loss": 0.1739, + "step": 1710 + }, + { + "epoch": 0.5544394037589112, + "grad_norm": 0.8609366416931152, + "learning_rate": 4.692786069743432e-06, + "loss": 0.1523, + "step": 1711 + }, + { + "epoch": 0.5547634478289047, + "grad_norm": 0.9310189485549927, + "learning_rate": 4.692365877461397e-06, + "loss": 0.1693, + "step": 1712 + }, + { + "epoch": 0.5550874918988983, + "grad_norm": 0.925957202911377, + "learning_rate": 4.691945416855002e-06, + "loss": 0.164, + "step": 1713 + }, + { + "epoch": 0.5554115359688918, + "grad_norm": 0.9316194653511047, + "learning_rate": 4.6915246879757084e-06, + "loss": 0.1649, + "step": 1714 + }, + { + "epoch": 0.5557355800388852, + "grad_norm": 0.9110631346702576, + "learning_rate": 4.691103690875007e-06, + "loss": 0.1579, + "step": 1715 + }, + { + "epoch": 0.5560596241088788, + "grad_norm": 0.9508496522903442, + "learning_rate": 4.690682425604427e-06, + "loss": 0.1705, + "step": 1716 + }, + { + "epoch": 0.5563836681788723, + "grad_norm": 0.9079408645629883, + "learning_rate": 4.690260892215525e-06, + "loss": 0.1683, + "step": 1717 + }, + { + "epoch": 0.5567077122488658, + "grad_norm": 0.9047849178314209, + "learning_rate": 4.689839090759893e-06, + "loss": 0.1633, + "step": 1718 + }, + { + "epoch": 0.5570317563188594, + "grad_norm": 0.9550807476043701, + "learning_rate": 4.689417021289157e-06, + "loss": 0.1776, + "step": 1719 + }, + { + "epoch": 0.5573558003888529, + "grad_norm": 0.9622770547866821, + "learning_rate": 4.68899468385497e-06, + "loss": 0.1701, + "step": 1720 + }, + { + "epoch": 0.5576798444588464, + "grad_norm": 0.8848745226860046, + "learning_rate": 4.688572078509027e-06, + "loss": 0.1558, + "step": 1721 + }, + { + "epoch": 0.55800388852884, + "grad_norm": 0.9164038896560669, + "learning_rate": 4.688149205303048e-06, + "loss": 0.1597, + "step": 1722 + }, + { + "epoch": 0.5583279325988334, + "grad_norm": 0.9261012673377991, + "learning_rate": 4.687726064288789e-06, + "loss": 0.1656, + "step": 1723 + }, + { + "epoch": 0.5586519766688269, + "grad_norm": 0.8843210339546204, + "learning_rate": 4.6873026555180386e-06, + "loss": 0.1481, + "step": 1724 + }, + { + "epoch": 0.5589760207388205, + "grad_norm": 0.8937526345252991, + "learning_rate": 4.6868789790426185e-06, + "loss": 0.1594, + "step": 1725 + }, + { + "epoch": 0.559300064808814, + "grad_norm": 0.8904445767402649, + "learning_rate": 4.6864550349143815e-06, + "loss": 0.1554, + "step": 1726 + }, + { + "epoch": 0.5596241088788075, + "grad_norm": 0.9143627882003784, + "learning_rate": 4.686030823185215e-06, + "loss": 0.1535, + "step": 1727 + }, + { + "epoch": 0.5599481529488011, + "grad_norm": 0.9604513645172119, + "learning_rate": 4.685606343907038e-06, + "loss": 0.1797, + "step": 1728 + }, + { + "epoch": 0.5602721970187946, + "grad_norm": 0.9768103957176208, + "learning_rate": 4.685181597131802e-06, + "loss": 0.1694, + "step": 1729 + }, + { + "epoch": 0.560596241088788, + "grad_norm": 0.9122162461280823, + "learning_rate": 4.684756582911494e-06, + "loss": 0.1681, + "step": 1730 + }, + { + "epoch": 0.5609202851587816, + "grad_norm": 0.9020214676856995, + "learning_rate": 4.6843313012981295e-06, + "loss": 0.1618, + "step": 1731 + }, + { + "epoch": 0.5612443292287751, + "grad_norm": 0.9545856714248657, + "learning_rate": 4.6839057523437606e-06, + "loss": 0.1671, + "step": 1732 + }, + { + "epoch": 0.5615683732987686, + "grad_norm": 0.8946810364723206, + "learning_rate": 4.683479936100468e-06, + "loss": 0.1694, + "step": 1733 + }, + { + "epoch": 0.5618924173687622, + "grad_norm": 0.9462122321128845, + "learning_rate": 4.68305385262037e-06, + "loss": 0.162, + "step": 1734 + }, + { + "epoch": 0.5622164614387557, + "grad_norm": 0.8827371001243591, + "learning_rate": 4.682627501955614e-06, + "loss": 0.1497, + "step": 1735 + }, + { + "epoch": 0.5625405055087492, + "grad_norm": 0.9082047939300537, + "learning_rate": 4.682200884158381e-06, + "loss": 0.1565, + "step": 1736 + }, + { + "epoch": 0.5628645495787427, + "grad_norm": 0.9250028729438782, + "learning_rate": 4.6817739992808855e-06, + "loss": 0.1735, + "step": 1737 + }, + { + "epoch": 0.5631885936487362, + "grad_norm": 0.9381402730941772, + "learning_rate": 4.681346847375373e-06, + "loss": 0.1711, + "step": 1738 + }, + { + "epoch": 0.5635126377187297, + "grad_norm": 0.9299399852752686, + "learning_rate": 4.6809194284941236e-06, + "loss": 0.1763, + "step": 1739 + }, + { + "epoch": 0.5638366817887233, + "grad_norm": 0.8783348202705383, + "learning_rate": 4.6804917426894495e-06, + "loss": 0.1583, + "step": 1740 + }, + { + "epoch": 0.5641607258587168, + "grad_norm": 0.8714194297790527, + "learning_rate": 4.6800637900136944e-06, + "loss": 0.143, + "step": 1741 + }, + { + "epoch": 0.5644847699287103, + "grad_norm": 1.0816848278045654, + "learning_rate": 4.679635570519236e-06, + "loss": 0.1597, + "step": 1742 + }, + { + "epoch": 0.5648088139987039, + "grad_norm": 0.9509788155555725, + "learning_rate": 4.6792070842584855e-06, + "loss": 0.1745, + "step": 1743 + }, + { + "epoch": 0.5651328580686974, + "grad_norm": 0.9559815526008606, + "learning_rate": 4.678778331283883e-06, + "loss": 0.1757, + "step": 1744 + }, + { + "epoch": 0.5654569021386908, + "grad_norm": 0.8636212944984436, + "learning_rate": 4.678349311647905e-06, + "loss": 0.1552, + "step": 1745 + }, + { + "epoch": 0.5657809462086844, + "grad_norm": 0.9815044403076172, + "learning_rate": 4.67792002540306e-06, + "loss": 0.161, + "step": 1746 + }, + { + "epoch": 0.5661049902786779, + "grad_norm": 0.9376473426818848, + "learning_rate": 4.677490472601888e-06, + "loss": 0.1514, + "step": 1747 + }, + { + "epoch": 0.5664290343486714, + "grad_norm": 1.0068804025650024, + "learning_rate": 4.677060653296961e-06, + "loss": 0.1676, + "step": 1748 + }, + { + "epoch": 0.566753078418665, + "grad_norm": 0.8940487504005432, + "learning_rate": 4.676630567540886e-06, + "loss": 0.1468, + "step": 1749 + }, + { + "epoch": 0.5670771224886585, + "grad_norm": 0.8850117921829224, + "learning_rate": 4.6762002153863e-06, + "loss": 0.1557, + "step": 1750 + }, + { + "epoch": 0.567401166558652, + "grad_norm": 0.9821330904960632, + "learning_rate": 4.675769596885877e-06, + "loss": 0.1742, + "step": 1751 + }, + { + "epoch": 0.5677252106286454, + "grad_norm": 0.9198369979858398, + "learning_rate": 4.675338712092316e-06, + "loss": 0.1653, + "step": 1752 + }, + { + "epoch": 0.568049254698639, + "grad_norm": 0.8890524506568909, + "learning_rate": 4.674907561058358e-06, + "loss": 0.1525, + "step": 1753 + }, + { + "epoch": 0.5683732987686325, + "grad_norm": 0.9032002091407776, + "learning_rate": 4.674476143836768e-06, + "loss": 0.1542, + "step": 1754 + }, + { + "epoch": 0.568697342838626, + "grad_norm": 0.9556316137313843, + "learning_rate": 4.674044460480348e-06, + "loss": 0.1637, + "step": 1755 + }, + { + "epoch": 0.5690213869086196, + "grad_norm": 0.9863228797912598, + "learning_rate": 4.673612511041933e-06, + "loss": 0.1714, + "step": 1756 + }, + { + "epoch": 0.5693454309786131, + "grad_norm": 0.9153149127960205, + "learning_rate": 4.673180295574389e-06, + "loss": 0.1523, + "step": 1757 + }, + { + "epoch": 0.5696694750486067, + "grad_norm": 1.0959794521331787, + "learning_rate": 4.672747814130615e-06, + "loss": 0.1797, + "step": 1758 + }, + { + "epoch": 0.5699935191186001, + "grad_norm": 0.9302141070365906, + "learning_rate": 4.672315066763542e-06, + "loss": 0.1592, + "step": 1759 + }, + { + "epoch": 0.5703175631885936, + "grad_norm": 0.8578373193740845, + "learning_rate": 4.671882053526135e-06, + "loss": 0.1507, + "step": 1760 + }, + { + "epoch": 0.5706416072585871, + "grad_norm": 0.8829067945480347, + "learning_rate": 4.671448774471389e-06, + "loss": 0.1525, + "step": 1761 + }, + { + "epoch": 0.5709656513285807, + "grad_norm": 0.9160066843032837, + "learning_rate": 4.671015229652335e-06, + "loss": 0.1695, + "step": 1762 + }, + { + "epoch": 0.5712896953985742, + "grad_norm": 0.9778125286102295, + "learning_rate": 4.670581419122034e-06, + "loss": 0.1595, + "step": 1763 + }, + { + "epoch": 0.5716137394685677, + "grad_norm": 0.9264950156211853, + "learning_rate": 4.67014734293358e-06, + "loss": 0.1729, + "step": 1764 + }, + { + "epoch": 0.5719377835385613, + "grad_norm": 0.8530830144882202, + "learning_rate": 4.6697130011401e-06, + "loss": 0.1556, + "step": 1765 + }, + { + "epoch": 0.5722618276085548, + "grad_norm": 0.9190473556518555, + "learning_rate": 4.669278393794753e-06, + "loss": 0.1623, + "step": 1766 + }, + { + "epoch": 0.5725858716785482, + "grad_norm": 0.9246593117713928, + "learning_rate": 4.6688435209507305e-06, + "loss": 0.1595, + "step": 1767 + }, + { + "epoch": 0.5729099157485418, + "grad_norm": 0.9285086989402771, + "learning_rate": 4.668408382661257e-06, + "loss": 0.1518, + "step": 1768 + }, + { + "epoch": 0.5732339598185353, + "grad_norm": 0.8804833889007568, + "learning_rate": 4.66797297897959e-06, + "loss": 0.1574, + "step": 1769 + }, + { + "epoch": 0.5735580038885288, + "grad_norm": 0.9340384602546692, + "learning_rate": 4.667537309959018e-06, + "loss": 0.1584, + "step": 1770 + }, + { + "epoch": 0.5738820479585224, + "grad_norm": 0.9027433395385742, + "learning_rate": 4.667101375652862e-06, + "loss": 0.1642, + "step": 1771 + }, + { + "epoch": 0.5742060920285159, + "grad_norm": 0.9212940335273743, + "learning_rate": 4.666665176114477e-06, + "loss": 0.1612, + "step": 1772 + }, + { + "epoch": 0.5745301360985094, + "grad_norm": 0.9400113224983215, + "learning_rate": 4.666228711397249e-06, + "loss": 0.1651, + "step": 1773 + }, + { + "epoch": 0.5748541801685029, + "grad_norm": 0.9218817353248596, + "learning_rate": 4.665791981554598e-06, + "loss": 0.1614, + "step": 1774 + }, + { + "epoch": 0.5751782242384964, + "grad_norm": 0.9443331360816956, + "learning_rate": 4.665354986639975e-06, + "loss": 0.1615, + "step": 1775 + }, + { + "epoch": 0.5755022683084899, + "grad_norm": 0.8934869170188904, + "learning_rate": 4.664917726706864e-06, + "loss": 0.1417, + "step": 1776 + }, + { + "epoch": 0.5758263123784835, + "grad_norm": 0.9580870270729065, + "learning_rate": 4.6644802018087806e-06, + "loss": 0.1595, + "step": 1777 + }, + { + "epoch": 0.576150356448477, + "grad_norm": 0.9523159861564636, + "learning_rate": 4.664042411999276e-06, + "loss": 0.1667, + "step": 1778 + }, + { + "epoch": 0.5764744005184705, + "grad_norm": 0.9081785082817078, + "learning_rate": 4.663604357331928e-06, + "loss": 0.1654, + "step": 1779 + }, + { + "epoch": 0.5767984445884641, + "grad_norm": 0.9075709581375122, + "learning_rate": 4.6631660378603526e-06, + "loss": 0.1674, + "step": 1780 + }, + { + "epoch": 0.5771224886584575, + "grad_norm": 0.8578202724456787, + "learning_rate": 4.662727453638195e-06, + "loss": 0.1501, + "step": 1781 + }, + { + "epoch": 0.577446532728451, + "grad_norm": 0.859342098236084, + "learning_rate": 4.662288604719134e-06, + "loss": 0.1467, + "step": 1782 + }, + { + "epoch": 0.5777705767984446, + "grad_norm": 0.9672659635543823, + "learning_rate": 4.66184949115688e-06, + "loss": 0.1663, + "step": 1783 + }, + { + "epoch": 0.5780946208684381, + "grad_norm": 0.9163477420806885, + "learning_rate": 4.661410113005177e-06, + "loss": 0.1571, + "step": 1784 + }, + { + "epoch": 0.5784186649384316, + "grad_norm": 0.9425603151321411, + "learning_rate": 4.6609704703178e-06, + "loss": 0.1613, + "step": 1785 + }, + { + "epoch": 0.5787427090084252, + "grad_norm": 0.8870068192481995, + "learning_rate": 4.660530563148557e-06, + "loss": 0.1512, + "step": 1786 + }, + { + "epoch": 0.5790667530784187, + "grad_norm": 0.8830798268318176, + "learning_rate": 4.66009039155129e-06, + "loss": 0.1444, + "step": 1787 + }, + { + "epoch": 0.5793907971484121, + "grad_norm": 0.8862148523330688, + "learning_rate": 4.659649955579869e-06, + "loss": 0.1597, + "step": 1788 + }, + { + "epoch": 0.5797148412184057, + "grad_norm": 0.978209912776947, + "learning_rate": 4.659209255288201e-06, + "loss": 0.1543, + "step": 1789 + }, + { + "epoch": 0.5800388852883992, + "grad_norm": 1.030468225479126, + "learning_rate": 4.658768290730222e-06, + "loss": 0.1787, + "step": 1790 + }, + { + "epoch": 0.5803629293583927, + "grad_norm": 1.020377278327942, + "learning_rate": 4.658327061959904e-06, + "loss": 0.1736, + "step": 1791 + }, + { + "epoch": 0.5806869734283863, + "grad_norm": 0.8843832015991211, + "learning_rate": 4.6578855690312474e-06, + "loss": 0.1502, + "step": 1792 + }, + { + "epoch": 0.5810110174983798, + "grad_norm": 0.9210875034332275, + "learning_rate": 4.657443811998287e-06, + "loss": 0.1556, + "step": 1793 + }, + { + "epoch": 0.5813350615683733, + "grad_norm": 0.9051605463027954, + "learning_rate": 4.65700179091509e-06, + "loss": 0.1414, + "step": 1794 + }, + { + "epoch": 0.5816591056383669, + "grad_norm": 0.9195812940597534, + "learning_rate": 4.656559505835755e-06, + "loss": 0.1558, + "step": 1795 + }, + { + "epoch": 0.5819831497083603, + "grad_norm": 0.8479184508323669, + "learning_rate": 4.656116956814414e-06, + "loss": 0.1533, + "step": 1796 + }, + { + "epoch": 0.5823071937783538, + "grad_norm": 0.9624475240707397, + "learning_rate": 4.655674143905229e-06, + "loss": 0.1715, + "step": 1797 + }, + { + "epoch": 0.5826312378483474, + "grad_norm": 0.8872055411338806, + "learning_rate": 4.655231067162398e-06, + "loss": 0.1449, + "step": 1798 + }, + { + "epoch": 0.5829552819183409, + "grad_norm": 0.7795613408088684, + "learning_rate": 4.65478772664015e-06, + "loss": 0.1366, + "step": 1799 + }, + { + "epoch": 0.5832793259883344, + "grad_norm": 1.0152753591537476, + "learning_rate": 4.654344122392742e-06, + "loss": 0.1559, + "step": 1800 + }, + { + "epoch": 0.583603370058328, + "grad_norm": 0.9903998374938965, + "learning_rate": 4.6539002544744705e-06, + "loss": 0.1492, + "step": 1801 + }, + { + "epoch": 0.5839274141283215, + "grad_norm": 0.9595767259597778, + "learning_rate": 4.653456122939659e-06, + "loss": 0.1616, + "step": 1802 + }, + { + "epoch": 0.5842514581983149, + "grad_norm": 0.8353689312934875, + "learning_rate": 4.653011727842665e-06, + "loss": 0.1418, + "step": 1803 + }, + { + "epoch": 0.5845755022683085, + "grad_norm": 0.9445038437843323, + "learning_rate": 4.652567069237877e-06, + "loss": 0.1664, + "step": 1804 + }, + { + "epoch": 0.584899546338302, + "grad_norm": 0.9006994962692261, + "learning_rate": 4.652122147179721e-06, + "loss": 0.1495, + "step": 1805 + }, + { + "epoch": 0.5852235904082955, + "grad_norm": 0.9477143883705139, + "learning_rate": 4.651676961722647e-06, + "loss": 0.1673, + "step": 1806 + }, + { + "epoch": 0.5855476344782891, + "grad_norm": 0.9442145824432373, + "learning_rate": 4.651231512921142e-06, + "loss": 0.1603, + "step": 1807 + }, + { + "epoch": 0.5858716785482826, + "grad_norm": 0.9480714797973633, + "learning_rate": 4.650785800829726e-06, + "loss": 0.1666, + "step": 1808 + }, + { + "epoch": 0.5861957226182761, + "grad_norm": 0.8913865089416504, + "learning_rate": 4.650339825502949e-06, + "loss": 0.1503, + "step": 1809 + }, + { + "epoch": 0.5865197666882696, + "grad_norm": 0.8414224982261658, + "learning_rate": 4.6498935869953945e-06, + "loss": 0.1514, + "step": 1810 + }, + { + "epoch": 0.5868438107582631, + "grad_norm": 0.9178133606910706, + "learning_rate": 4.649447085361677e-06, + "loss": 0.1674, + "step": 1811 + }, + { + "epoch": 0.5871678548282566, + "grad_norm": 0.8867731690406799, + "learning_rate": 4.649000320656445e-06, + "loss": 0.1516, + "step": 1812 + }, + { + "epoch": 0.5874918988982502, + "grad_norm": 0.9027390480041504, + "learning_rate": 4.648553292934377e-06, + "loss": 0.1693, + "step": 1813 + }, + { + "epoch": 0.5878159429682437, + "grad_norm": 0.9258837699890137, + "learning_rate": 4.648106002250186e-06, + "loss": 0.1694, + "step": 1814 + }, + { + "epoch": 0.5881399870382372, + "grad_norm": 0.8763899207115173, + "learning_rate": 4.647658448658616e-06, + "loss": 0.1591, + "step": 1815 + }, + { + "epoch": 0.5884640311082308, + "grad_norm": 0.8758101463317871, + "learning_rate": 4.647210632214443e-06, + "loss": 0.1601, + "step": 1816 + }, + { + "epoch": 0.5887880751782243, + "grad_norm": 0.9090110659599304, + "learning_rate": 4.646762552972475e-06, + "loss": 0.1536, + "step": 1817 + }, + { + "epoch": 0.5891121192482177, + "grad_norm": 0.9815348982810974, + "learning_rate": 4.646314210987552e-06, + "loss": 0.1717, + "step": 1818 + }, + { + "epoch": 0.5894361633182112, + "grad_norm": 0.9743311405181885, + "learning_rate": 4.645865606314548e-06, + "loss": 0.1639, + "step": 1819 + }, + { + "epoch": 0.5897602073882048, + "grad_norm": 0.9204444289207458, + "learning_rate": 4.645416739008367e-06, + "loss": 0.1537, + "step": 1820 + }, + { + "epoch": 0.5900842514581983, + "grad_norm": 0.9639553427696228, + "learning_rate": 4.644967609123947e-06, + "loss": 0.1704, + "step": 1821 + }, + { + "epoch": 0.5904082955281919, + "grad_norm": 1.0289804935455322, + "learning_rate": 4.644518216716256e-06, + "loss": 0.1718, + "step": 1822 + }, + { + "epoch": 0.5907323395981854, + "grad_norm": 0.8550962805747986, + "learning_rate": 4.644068561840297e-06, + "loss": 0.1629, + "step": 1823 + }, + { + "epoch": 0.5910563836681789, + "grad_norm": 0.8934510350227356, + "learning_rate": 4.643618644551101e-06, + "loss": 0.164, + "step": 1824 + }, + { + "epoch": 0.5913804277381723, + "grad_norm": 1.0084798336029053, + "learning_rate": 4.643168464903736e-06, + "loss": 0.1552, + "step": 1825 + }, + { + "epoch": 0.5917044718081659, + "grad_norm": 0.8919445276260376, + "learning_rate": 4.642718022953297e-06, + "loss": 0.1692, + "step": 1826 + }, + { + "epoch": 0.5920285158781594, + "grad_norm": 0.883470892906189, + "learning_rate": 4.642267318754915e-06, + "loss": 0.1646, + "step": 1827 + }, + { + "epoch": 0.592352559948153, + "grad_norm": 0.8612281680107117, + "learning_rate": 4.641816352363753e-06, + "loss": 0.1591, + "step": 1828 + }, + { + "epoch": 0.5926766040181465, + "grad_norm": 0.9234902858734131, + "learning_rate": 4.641365123835004e-06, + "loss": 0.1766, + "step": 1829 + }, + { + "epoch": 0.59300064808814, + "grad_norm": 0.9338532090187073, + "learning_rate": 4.640913633223893e-06, + "loss": 0.1626, + "step": 1830 + }, + { + "epoch": 0.5933246921581335, + "grad_norm": 0.8946052193641663, + "learning_rate": 4.64046188058568e-06, + "loss": 0.1668, + "step": 1831 + }, + { + "epoch": 0.593648736228127, + "grad_norm": 0.8641635179519653, + "learning_rate": 4.6400098659756525e-06, + "loss": 0.1568, + "step": 1832 + }, + { + "epoch": 0.5939727802981205, + "grad_norm": 0.8863862156867981, + "learning_rate": 4.639557589449135e-06, + "loss": 0.1596, + "step": 1833 + }, + { + "epoch": 0.594296824368114, + "grad_norm": 0.895317018032074, + "learning_rate": 4.639105051061481e-06, + "loss": 0.1609, + "step": 1834 + }, + { + "epoch": 0.5946208684381076, + "grad_norm": 0.9277777671813965, + "learning_rate": 4.638652250868078e-06, + "loss": 0.1574, + "step": 1835 + }, + { + "epoch": 0.5949449125081011, + "grad_norm": 0.9136897921562195, + "learning_rate": 4.6381991889243416e-06, + "loss": 0.162, + "step": 1836 + }, + { + "epoch": 0.5952689565780946, + "grad_norm": 1.0817289352416992, + "learning_rate": 4.637745865285725e-06, + "loss": 0.1736, + "step": 1837 + }, + { + "epoch": 0.5955930006480882, + "grad_norm": 0.915195107460022, + "learning_rate": 4.637292280007709e-06, + "loss": 0.1501, + "step": 1838 + }, + { + "epoch": 0.5959170447180817, + "grad_norm": 0.9343981146812439, + "learning_rate": 4.6368384331458085e-06, + "loss": 0.1637, + "step": 1839 + }, + { + "epoch": 0.5962410887880751, + "grad_norm": 0.9210292100906372, + "learning_rate": 4.63638432475557e-06, + "loss": 0.1636, + "step": 1840 + }, + { + "epoch": 0.5965651328580687, + "grad_norm": 0.8031634092330933, + "learning_rate": 4.635929954892572e-06, + "loss": 0.1447, + "step": 1841 + }, + { + "epoch": 0.5968891769280622, + "grad_norm": 0.9125548601150513, + "learning_rate": 4.6354753236124254e-06, + "loss": 0.157, + "step": 1842 + }, + { + "epoch": 0.5972132209980557, + "grad_norm": 0.9125232100486755, + "learning_rate": 4.635020430970771e-06, + "loss": 0.1625, + "step": 1843 + }, + { + "epoch": 0.5975372650680493, + "grad_norm": 1.0112113952636719, + "learning_rate": 4.6345652770232856e-06, + "loss": 0.1616, + "step": 1844 + }, + { + "epoch": 0.5978613091380428, + "grad_norm": 0.8509640693664551, + "learning_rate": 4.6341098618256745e-06, + "loss": 0.1402, + "step": 1845 + }, + { + "epoch": 0.5981853532080363, + "grad_norm": 0.8826951384544373, + "learning_rate": 4.633654185433676e-06, + "loss": 0.1684, + "step": 1846 + }, + { + "epoch": 0.5985093972780298, + "grad_norm": 0.9403404593467712, + "learning_rate": 4.63319824790306e-06, + "loss": 0.1612, + "step": 1847 + }, + { + "epoch": 0.5988334413480233, + "grad_norm": 0.8916503190994263, + "learning_rate": 4.6327420492896295e-06, + "loss": 0.1527, + "step": 1848 + }, + { + "epoch": 0.5991574854180168, + "grad_norm": 0.9854899644851685, + "learning_rate": 4.632285589649219e-06, + "loss": 0.168, + "step": 1849 + }, + { + "epoch": 0.5994815294880104, + "grad_norm": 0.9854278564453125, + "learning_rate": 4.631828869037694e-06, + "loss": 0.1587, + "step": 1850 + }, + { + "epoch": 0.5998055735580039, + "grad_norm": 0.8996358513832092, + "learning_rate": 4.631371887510954e-06, + "loss": 0.1694, + "step": 1851 + }, + { + "epoch": 0.6001296176279974, + "grad_norm": 0.8837627172470093, + "learning_rate": 4.630914645124928e-06, + "loss": 0.1648, + "step": 1852 + }, + { + "epoch": 0.600453661697991, + "grad_norm": 0.9942870736122131, + "learning_rate": 4.630457141935577e-06, + "loss": 0.1545, + "step": 1853 + }, + { + "epoch": 0.6007777057679844, + "grad_norm": 0.9469466209411621, + "learning_rate": 4.629999377998898e-06, + "loss": 0.1825, + "step": 1854 + }, + { + "epoch": 0.6011017498379779, + "grad_norm": 0.858815610408783, + "learning_rate": 4.629541353370914e-06, + "loss": 0.1495, + "step": 1855 + }, + { + "epoch": 0.6014257939079715, + "grad_norm": 0.8882916569709778, + "learning_rate": 4.629083068107684e-06, + "loss": 0.1618, + "step": 1856 + }, + { + "epoch": 0.601749837977965, + "grad_norm": 0.9027432203292847, + "learning_rate": 4.628624522265298e-06, + "loss": 0.1667, + "step": 1857 + }, + { + "epoch": 0.6020738820479585, + "grad_norm": 0.9251678586006165, + "learning_rate": 4.628165715899877e-06, + "loss": 0.1621, + "step": 1858 + }, + { + "epoch": 0.6023979261179521, + "grad_norm": 1.0207189321517944, + "learning_rate": 4.627706649067575e-06, + "loss": 0.1701, + "step": 1859 + }, + { + "epoch": 0.6027219701879456, + "grad_norm": 0.8564452528953552, + "learning_rate": 4.627247321824576e-06, + "loss": 0.1558, + "step": 1860 + }, + { + "epoch": 0.6030460142579391, + "grad_norm": 0.8749232888221741, + "learning_rate": 4.6267877342271e-06, + "loss": 0.1466, + "step": 1861 + }, + { + "epoch": 0.6033700583279326, + "grad_norm": 0.9126169085502625, + "learning_rate": 4.626327886331392e-06, + "loss": 0.1704, + "step": 1862 + }, + { + "epoch": 0.6036941023979261, + "grad_norm": 0.863114595413208, + "learning_rate": 4.625867778193737e-06, + "loss": 0.1567, + "step": 1863 + }, + { + "epoch": 0.6040181464679196, + "grad_norm": 0.8384044170379639, + "learning_rate": 4.625407409870444e-06, + "loss": 0.1432, + "step": 1864 + }, + { + "epoch": 0.6043421905379132, + "grad_norm": 0.8923614621162415, + "learning_rate": 4.624946781417861e-06, + "loss": 0.1513, + "step": 1865 + }, + { + "epoch": 0.6046662346079067, + "grad_norm": 0.9929710626602173, + "learning_rate": 4.624485892892363e-06, + "loss": 0.1813, + "step": 1866 + }, + { + "epoch": 0.6049902786779002, + "grad_norm": 0.8921619057655334, + "learning_rate": 4.624024744350358e-06, + "loss": 0.157, + "step": 1867 + }, + { + "epoch": 0.6053143227478938, + "grad_norm": 0.9352006316184998, + "learning_rate": 4.623563335848286e-06, + "loss": 0.1704, + "step": 1868 + }, + { + "epoch": 0.6056383668178872, + "grad_norm": 0.9039468765258789, + "learning_rate": 4.62310166744262e-06, + "loss": 0.1439, + "step": 1869 + }, + { + "epoch": 0.6059624108878807, + "grad_norm": 0.8728505969047546, + "learning_rate": 4.622639739189863e-06, + "loss": 0.1464, + "step": 1870 + }, + { + "epoch": 0.6062864549578743, + "grad_norm": 0.9441261887550354, + "learning_rate": 4.62217755114655e-06, + "loss": 0.1716, + "step": 1871 + }, + { + "epoch": 0.6066104990278678, + "grad_norm": 0.8899536728858948, + "learning_rate": 4.62171510336925e-06, + "loss": 0.1627, + "step": 1872 + }, + { + "epoch": 0.6069345430978613, + "grad_norm": 0.9109644889831543, + "learning_rate": 4.621252395914561e-06, + "loss": 0.1657, + "step": 1873 + }, + { + "epoch": 0.6072585871678549, + "grad_norm": 0.8352120518684387, + "learning_rate": 4.620789428839114e-06, + "loss": 0.1511, + "step": 1874 + }, + { + "epoch": 0.6075826312378484, + "grad_norm": 0.8798180222511292, + "learning_rate": 4.620326202199572e-06, + "loss": 0.148, + "step": 1875 + }, + { + "epoch": 0.6079066753078418, + "grad_norm": 0.8908928036689758, + "learning_rate": 4.619862716052629e-06, + "loss": 0.167, + "step": 1876 + }, + { + "epoch": 0.6082307193778353, + "grad_norm": 0.9110801219940186, + "learning_rate": 4.6193989704550105e-06, + "loss": 0.1693, + "step": 1877 + }, + { + "epoch": 0.6085547634478289, + "grad_norm": 0.9067312479019165, + "learning_rate": 4.6189349654634766e-06, + "loss": 0.1601, + "step": 1878 + }, + { + "epoch": 0.6088788075178224, + "grad_norm": 0.9549569487571716, + "learning_rate": 4.618470701134815e-06, + "loss": 0.1732, + "step": 1879 + }, + { + "epoch": 0.609202851587816, + "grad_norm": 0.8671174645423889, + "learning_rate": 4.618006177525849e-06, + "loss": 0.1459, + "step": 1880 + }, + { + "epoch": 0.6095268956578095, + "grad_norm": 0.8848288059234619, + "learning_rate": 4.61754139469343e-06, + "loss": 0.1562, + "step": 1881 + }, + { + "epoch": 0.609850939727803, + "grad_norm": 0.9089393019676208, + "learning_rate": 4.6170763526944425e-06, + "loss": 0.1545, + "step": 1882 + }, + { + "epoch": 0.6101749837977966, + "grad_norm": 0.8325822949409485, + "learning_rate": 4.616611051585806e-06, + "loss": 0.1411, + "step": 1883 + }, + { + "epoch": 0.61049902786779, + "grad_norm": 0.8850380778312683, + "learning_rate": 4.6161454914244665e-06, + "loss": 0.1566, + "step": 1884 + }, + { + "epoch": 0.6108230719377835, + "grad_norm": 0.889552891254425, + "learning_rate": 4.615679672267405e-06, + "loss": 0.152, + "step": 1885 + }, + { + "epoch": 0.611147116007777, + "grad_norm": 0.8596039414405823, + "learning_rate": 4.615213594171633e-06, + "loss": 0.1443, + "step": 1886 + }, + { + "epoch": 0.6114711600777706, + "grad_norm": 0.9552878737449646, + "learning_rate": 4.614747257194194e-06, + "loss": 0.1743, + "step": 1887 + }, + { + "epoch": 0.6117952041477641, + "grad_norm": 0.8836682438850403, + "learning_rate": 4.614280661392163e-06, + "loss": 0.1666, + "step": 1888 + }, + { + "epoch": 0.6121192482177576, + "grad_norm": 0.8738223910331726, + "learning_rate": 4.613813806822647e-06, + "loss": 0.1571, + "step": 1889 + }, + { + "epoch": 0.6124432922877512, + "grad_norm": 0.9057800769805908, + "learning_rate": 4.613346693542784e-06, + "loss": 0.1436, + "step": 1890 + }, + { + "epoch": 0.6127673363577446, + "grad_norm": 0.8955968022346497, + "learning_rate": 4.6128793216097445e-06, + "loss": 0.1544, + "step": 1891 + }, + { + "epoch": 0.6130913804277381, + "grad_norm": 0.9024104475975037, + "learning_rate": 4.61241169108073e-06, + "loss": 0.1527, + "step": 1892 + }, + { + "epoch": 0.6134154244977317, + "grad_norm": 0.807380735874176, + "learning_rate": 4.611943802012975e-06, + "loss": 0.148, + "step": 1893 + }, + { + "epoch": 0.6137394685677252, + "grad_norm": 0.9021716117858887, + "learning_rate": 4.611475654463743e-06, + "loss": 0.1506, + "step": 1894 + }, + { + "epoch": 0.6140635126377187, + "grad_norm": 0.9311186671257019, + "learning_rate": 4.6110072484903326e-06, + "loss": 0.1724, + "step": 1895 + }, + { + "epoch": 0.6143875567077123, + "grad_norm": 0.8692731261253357, + "learning_rate": 4.610538584150071e-06, + "loss": 0.155, + "step": 1896 + }, + { + "epoch": 0.6147116007777058, + "grad_norm": 0.9471341967582703, + "learning_rate": 4.610069661500317e-06, + "loss": 0.1704, + "step": 1897 + }, + { + "epoch": 0.6150356448476992, + "grad_norm": 0.8956695199012756, + "learning_rate": 4.609600480598464e-06, + "loss": 0.1688, + "step": 1898 + }, + { + "epoch": 0.6153596889176928, + "grad_norm": 0.9151939749717712, + "learning_rate": 4.6091310415019355e-06, + "loss": 0.1757, + "step": 1899 + }, + { + "epoch": 0.6156837329876863, + "grad_norm": 0.8341889381408691, + "learning_rate": 4.608661344268185e-06, + "loss": 0.1417, + "step": 1900 + }, + { + "epoch": 0.6160077770576798, + "grad_norm": 0.9647911787033081, + "learning_rate": 4.608191388954699e-06, + "loss": 0.1558, + "step": 1901 + }, + { + "epoch": 0.6163318211276734, + "grad_norm": 0.9372779726982117, + "learning_rate": 4.607721175618997e-06, + "loss": 0.1571, + "step": 1902 + }, + { + "epoch": 0.6166558651976669, + "grad_norm": 0.9415113925933838, + "learning_rate": 4.6072507043186265e-06, + "loss": 0.1635, + "step": 1903 + }, + { + "epoch": 0.6169799092676604, + "grad_norm": 0.9065474271774292, + "learning_rate": 4.60677997511117e-06, + "loss": 0.1594, + "step": 1904 + }, + { + "epoch": 0.6173039533376539, + "grad_norm": 0.9385161995887756, + "learning_rate": 4.606308988054239e-06, + "loss": 0.1619, + "step": 1905 + }, + { + "epoch": 0.6176279974076474, + "grad_norm": 0.9063534140586853, + "learning_rate": 4.605837743205479e-06, + "loss": 0.1594, + "step": 1906 + }, + { + "epoch": 0.6179520414776409, + "grad_norm": 0.908365786075592, + "learning_rate": 4.605366240622565e-06, + "loss": 0.1643, + "step": 1907 + }, + { + "epoch": 0.6182760855476345, + "grad_norm": 0.924689769744873, + "learning_rate": 4.604894480363205e-06, + "loss": 0.1593, + "step": 1908 + }, + { + "epoch": 0.618600129617628, + "grad_norm": 0.8969070911407471, + "learning_rate": 4.604422462485138e-06, + "loss": 0.1576, + "step": 1909 + }, + { + "epoch": 0.6189241736876215, + "grad_norm": 0.9201893210411072, + "learning_rate": 4.603950187046134e-06, + "loss": 0.1711, + "step": 1910 + }, + { + "epoch": 0.6192482177576151, + "grad_norm": 0.9483135342597961, + "learning_rate": 4.603477654103994e-06, + "loss": 0.178, + "step": 1911 + }, + { + "epoch": 0.6195722618276086, + "grad_norm": 0.8227450847625732, + "learning_rate": 4.603004863716553e-06, + "loss": 0.1516, + "step": 1912 + }, + { + "epoch": 0.619896305897602, + "grad_norm": 0.8497554659843445, + "learning_rate": 4.602531815941676e-06, + "loss": 0.1537, + "step": 1913 + }, + { + "epoch": 0.6202203499675956, + "grad_norm": 0.8721224069595337, + "learning_rate": 4.602058510837257e-06, + "loss": 0.1622, + "step": 1914 + }, + { + "epoch": 0.6205443940375891, + "grad_norm": 0.9595826864242554, + "learning_rate": 4.6015849484612265e-06, + "loss": 0.1696, + "step": 1915 + }, + { + "epoch": 0.6208684381075826, + "grad_norm": 0.8733614087104797, + "learning_rate": 4.601111128871544e-06, + "loss": 0.1496, + "step": 1916 + }, + { + "epoch": 0.6211924821775762, + "grad_norm": 0.8793935775756836, + "learning_rate": 4.600637052126199e-06, + "loss": 0.1503, + "step": 1917 + }, + { + "epoch": 0.6215165262475697, + "grad_norm": 0.9011240005493164, + "learning_rate": 4.600162718283215e-06, + "loss": 0.1603, + "step": 1918 + }, + { + "epoch": 0.6218405703175632, + "grad_norm": 1.0156891345977783, + "learning_rate": 4.599688127400645e-06, + "loss": 0.1696, + "step": 1919 + }, + { + "epoch": 0.6221646143875567, + "grad_norm": 0.9367722868919373, + "learning_rate": 4.599213279536575e-06, + "loss": 0.1665, + "step": 1920 + }, + { + "epoch": 0.6224886584575502, + "grad_norm": 0.9469424486160278, + "learning_rate": 4.598738174749121e-06, + "loss": 0.1747, + "step": 1921 + }, + { + "epoch": 0.6228127025275437, + "grad_norm": 0.9037867784500122, + "learning_rate": 4.598262813096432e-06, + "loss": 0.1555, + "step": 1922 + }, + { + "epoch": 0.6231367465975373, + "grad_norm": 0.954498291015625, + "learning_rate": 4.597787194636688e-06, + "loss": 0.1571, + "step": 1923 + }, + { + "epoch": 0.6234607906675308, + "grad_norm": 0.9314236044883728, + "learning_rate": 4.597311319428099e-06, + "loss": 0.156, + "step": 1924 + }, + { + "epoch": 0.6237848347375243, + "grad_norm": 0.9310925006866455, + "learning_rate": 4.596835187528908e-06, + "loss": 0.1643, + "step": 1925 + }, + { + "epoch": 0.6241088788075179, + "grad_norm": 0.7893508076667786, + "learning_rate": 4.59635879899739e-06, + "loss": 0.1468, + "step": 1926 + }, + { + "epoch": 0.6244329228775113, + "grad_norm": 0.8389633893966675, + "learning_rate": 4.595882153891849e-06, + "loss": 0.1462, + "step": 1927 + }, + { + "epoch": 0.6247569669475048, + "grad_norm": 0.9003433585166931, + "learning_rate": 4.595405252270622e-06, + "loss": 0.1646, + "step": 1928 + }, + { + "epoch": 0.6250810110174984, + "grad_norm": 0.9383062720298767, + "learning_rate": 4.594928094192076e-06, + "loss": 0.1631, + "step": 1929 + }, + { + "epoch": 0.6254050550874919, + "grad_norm": 0.8632388710975647, + "learning_rate": 4.594450679714613e-06, + "loss": 0.1489, + "step": 1930 + }, + { + "epoch": 0.6257290991574854, + "grad_norm": 0.8126484751701355, + "learning_rate": 4.593973008896662e-06, + "loss": 0.1439, + "step": 1931 + }, + { + "epoch": 0.626053143227479, + "grad_norm": 0.9024901986122131, + "learning_rate": 4.593495081796686e-06, + "loss": 0.1499, + "step": 1932 + }, + { + "epoch": 0.6263771872974725, + "grad_norm": 0.8754759430885315, + "learning_rate": 4.59301689847318e-06, + "loss": 0.1414, + "step": 1933 + }, + { + "epoch": 0.626701231367466, + "grad_norm": 0.93331378698349, + "learning_rate": 4.592538458984666e-06, + "loss": 0.1634, + "step": 1934 + }, + { + "epoch": 0.6270252754374595, + "grad_norm": 0.9175758957862854, + "learning_rate": 4.5920597633897015e-06, + "loss": 0.1518, + "step": 1935 + }, + { + "epoch": 0.627349319507453, + "grad_norm": 1.0189933776855469, + "learning_rate": 4.5915808117468766e-06, + "loss": 0.1678, + "step": 1936 + }, + { + "epoch": 0.6276733635774465, + "grad_norm": 0.9787598252296448, + "learning_rate": 4.591101604114807e-06, + "loss": 0.1695, + "step": 1937 + }, + { + "epoch": 0.62799740764744, + "grad_norm": 0.8894520998001099, + "learning_rate": 4.590622140552144e-06, + "loss": 0.1647, + "step": 1938 + }, + { + "epoch": 0.6283214517174336, + "grad_norm": 0.9015518426895142, + "learning_rate": 4.5901424211175715e-06, + "loss": 0.1558, + "step": 1939 + }, + { + "epoch": 0.6286454957874271, + "grad_norm": 0.9240670800209045, + "learning_rate": 4.5896624458698e-06, + "loss": 0.1564, + "step": 1940 + }, + { + "epoch": 0.6289695398574207, + "grad_norm": 0.9072241187095642, + "learning_rate": 4.5891822148675745e-06, + "loss": 0.1511, + "step": 1941 + }, + { + "epoch": 0.6292935839274141, + "grad_norm": 0.8959493637084961, + "learning_rate": 4.588701728169671e-06, + "loss": 0.1642, + "step": 1942 + }, + { + "epoch": 0.6296176279974076, + "grad_norm": 0.924720823764801, + "learning_rate": 4.5882209858348956e-06, + "loss": 0.1605, + "step": 1943 + }, + { + "epoch": 0.6299416720674011, + "grad_norm": 0.9390182495117188, + "learning_rate": 4.587739987922087e-06, + "loss": 0.1647, + "step": 1944 + }, + { + "epoch": 0.6302657161373947, + "grad_norm": 0.8489257097244263, + "learning_rate": 4.587258734490115e-06, + "loss": 0.1432, + "step": 1945 + }, + { + "epoch": 0.6305897602073882, + "grad_norm": 0.9559697508811951, + "learning_rate": 4.586777225597881e-06, + "loss": 0.1582, + "step": 1946 + }, + { + "epoch": 0.6309138042773818, + "grad_norm": 0.9959377646446228, + "learning_rate": 4.586295461304315e-06, + "loss": 0.1829, + "step": 1947 + }, + { + "epoch": 0.6312378483473753, + "grad_norm": 0.8491382002830505, + "learning_rate": 4.585813441668383e-06, + "loss": 0.1528, + "step": 1948 + }, + { + "epoch": 0.6315618924173687, + "grad_norm": 0.8617604374885559, + "learning_rate": 4.585331166749077e-06, + "loss": 0.1583, + "step": 1949 + }, + { + "epoch": 0.6318859364873622, + "grad_norm": 0.8658198118209839, + "learning_rate": 4.584848636605423e-06, + "loss": 0.1505, + "step": 1950 + }, + { + "epoch": 0.6322099805573558, + "grad_norm": 0.894616961479187, + "learning_rate": 4.58436585129648e-06, + "loss": 0.1539, + "step": 1951 + }, + { + "epoch": 0.6325340246273493, + "grad_norm": 0.9101769328117371, + "learning_rate": 4.583882810881334e-06, + "loss": 0.1623, + "step": 1952 + }, + { + "epoch": 0.6328580686973428, + "grad_norm": 0.8776459097862244, + "learning_rate": 4.583399515419106e-06, + "loss": 0.1501, + "step": 1953 + }, + { + "epoch": 0.6331821127673364, + "grad_norm": 0.9368626475334167, + "learning_rate": 4.582915964968946e-06, + "loss": 0.1716, + "step": 1954 + }, + { + "epoch": 0.6335061568373299, + "grad_norm": 0.931447446346283, + "learning_rate": 4.582432159590037e-06, + "loss": 0.1792, + "step": 1955 + }, + { + "epoch": 0.6338302009073234, + "grad_norm": 0.8453537821769714, + "learning_rate": 4.58194809934159e-06, + "loss": 0.1514, + "step": 1956 + }, + { + "epoch": 0.6341542449773169, + "grad_norm": 0.8716861605644226, + "learning_rate": 4.5814637842828506e-06, + "loss": 0.1556, + "step": 1957 + }, + { + "epoch": 0.6344782890473104, + "grad_norm": 0.8787641525268555, + "learning_rate": 4.580979214473095e-06, + "loss": 0.1608, + "step": 1958 + }, + { + "epoch": 0.6348023331173039, + "grad_norm": 0.9362958073616028, + "learning_rate": 4.580494389971628e-06, + "loss": 0.1673, + "step": 1959 + }, + { + "epoch": 0.6351263771872975, + "grad_norm": 0.9452981352806091, + "learning_rate": 4.580009310837789e-06, + "loss": 0.1627, + "step": 1960 + }, + { + "epoch": 0.635450421257291, + "grad_norm": 0.885234534740448, + "learning_rate": 4.579523977130946e-06, + "loss": 0.1511, + "step": 1961 + }, + { + "epoch": 0.6357744653272845, + "grad_norm": 0.8860384225845337, + "learning_rate": 4.579038388910499e-06, + "loss": 0.1612, + "step": 1962 + }, + { + "epoch": 0.6360985093972781, + "grad_norm": 0.8743710517883301, + "learning_rate": 4.578552546235882e-06, + "loss": 0.1653, + "step": 1963 + }, + { + "epoch": 0.6364225534672715, + "grad_norm": 0.8612011075019836, + "learning_rate": 4.578066449166554e-06, + "loss": 0.1495, + "step": 1964 + }, + { + "epoch": 0.636746597537265, + "grad_norm": 0.8718269467353821, + "learning_rate": 4.57758009776201e-06, + "loss": 0.1518, + "step": 1965 + }, + { + "epoch": 0.6370706416072586, + "grad_norm": 0.8312247395515442, + "learning_rate": 4.577093492081774e-06, + "loss": 0.1475, + "step": 1966 + }, + { + "epoch": 0.6373946856772521, + "grad_norm": 0.8253561854362488, + "learning_rate": 4.576606632185403e-06, + "loss": 0.1415, + "step": 1967 + }, + { + "epoch": 0.6377187297472456, + "grad_norm": 0.8615391850471497, + "learning_rate": 4.576119518132483e-06, + "loss": 0.1562, + "step": 1968 + }, + { + "epoch": 0.6380427738172392, + "grad_norm": 0.8764085173606873, + "learning_rate": 4.575632149982631e-06, + "loss": 0.1444, + "step": 1969 + }, + { + "epoch": 0.6383668178872327, + "grad_norm": 0.8978087306022644, + "learning_rate": 4.5751445277955e-06, + "loss": 0.1504, + "step": 1970 + }, + { + "epoch": 0.6386908619572261, + "grad_norm": 0.9768132567405701, + "learning_rate": 4.574656651630767e-06, + "loss": 0.1644, + "step": 1971 + }, + { + "epoch": 0.6390149060272197, + "grad_norm": 0.9122047424316406, + "learning_rate": 4.574168521548144e-06, + "loss": 0.1627, + "step": 1972 + }, + { + "epoch": 0.6393389500972132, + "grad_norm": 0.8582742810249329, + "learning_rate": 4.573680137607373e-06, + "loss": 0.1547, + "step": 1973 + }, + { + "epoch": 0.6396629941672067, + "grad_norm": 0.9364274144172668, + "learning_rate": 4.573191499868228e-06, + "loss": 0.1497, + "step": 1974 + }, + { + "epoch": 0.6399870382372003, + "grad_norm": 1.0029749870300293, + "learning_rate": 4.572702608390513e-06, + "loss": 0.1576, + "step": 1975 + }, + { + "epoch": 0.6403110823071938, + "grad_norm": 0.9168883562088013, + "learning_rate": 4.572213463234065e-06, + "loss": 0.165, + "step": 1976 + }, + { + "epoch": 0.6406351263771873, + "grad_norm": 0.9613797664642334, + "learning_rate": 4.5717240644587495e-06, + "loss": 0.1644, + "step": 1977 + }, + { + "epoch": 0.6409591704471809, + "grad_norm": 0.8673607110977173, + "learning_rate": 4.571234412124464e-06, + "loss": 0.1622, + "step": 1978 + }, + { + "epoch": 0.6412832145171743, + "grad_norm": 0.8543764352798462, + "learning_rate": 4.570744506291138e-06, + "loss": 0.1593, + "step": 1979 + }, + { + "epoch": 0.6416072585871678, + "grad_norm": 0.9339621663093567, + "learning_rate": 4.570254347018731e-06, + "loss": 0.1754, + "step": 1980 + }, + { + "epoch": 0.6419313026571614, + "grad_norm": 0.8893762826919556, + "learning_rate": 4.5697639343672325e-06, + "loss": 0.1639, + "step": 1981 + }, + { + "epoch": 0.6422553467271549, + "grad_norm": 0.8264583945274353, + "learning_rate": 4.569273268396667e-06, + "loss": 0.1303, + "step": 1982 + }, + { + "epoch": 0.6425793907971484, + "grad_norm": 0.9855983257293701, + "learning_rate": 4.568782349167084e-06, + "loss": 0.1679, + "step": 1983 + }, + { + "epoch": 0.642903434867142, + "grad_norm": 0.8214596509933472, + "learning_rate": 4.56829117673857e-06, + "loss": 0.1484, + "step": 1984 + }, + { + "epoch": 0.6432274789371355, + "grad_norm": 0.8593002557754517, + "learning_rate": 4.567799751171237e-06, + "loss": 0.1418, + "step": 1985 + }, + { + "epoch": 0.6435515230071289, + "grad_norm": 0.959836483001709, + "learning_rate": 4.567308072525233e-06, + "loss": 0.1593, + "step": 1986 + }, + { + "epoch": 0.6438755670771225, + "grad_norm": 0.9470639228820801, + "learning_rate": 4.566816140860735e-06, + "loss": 0.1392, + "step": 1987 + }, + { + "epoch": 0.644199611147116, + "grad_norm": 0.9426745176315308, + "learning_rate": 4.566323956237948e-06, + "loss": 0.1654, + "step": 1988 + }, + { + "epoch": 0.6445236552171095, + "grad_norm": 0.9331299066543579, + "learning_rate": 4.565831518717114e-06, + "loss": 0.1481, + "step": 1989 + }, + { + "epoch": 0.6448476992871031, + "grad_norm": 0.9420998692512512, + "learning_rate": 4.5653388283585e-06, + "loss": 0.1599, + "step": 1990 + }, + { + "epoch": 0.6451717433570966, + "grad_norm": 0.8443018198013306, + "learning_rate": 4.564845885222407e-06, + "loss": 0.145, + "step": 1991 + }, + { + "epoch": 0.6454957874270901, + "grad_norm": 0.8921066522598267, + "learning_rate": 4.564352689369168e-06, + "loss": 0.1553, + "step": 1992 + }, + { + "epoch": 0.6458198314970836, + "grad_norm": 1.0385701656341553, + "learning_rate": 4.563859240859144e-06, + "loss": 0.1769, + "step": 1993 + }, + { + "epoch": 0.6461438755670771, + "grad_norm": 0.9667747616767883, + "learning_rate": 4.563365539752728e-06, + "loss": 0.1691, + "step": 1994 + }, + { + "epoch": 0.6464679196370706, + "grad_norm": 0.9109490513801575, + "learning_rate": 4.5628715861103455e-06, + "loss": 0.1491, + "step": 1995 + }, + { + "epoch": 0.6467919637070642, + "grad_norm": 0.8816606998443604, + "learning_rate": 4.562377379992451e-06, + "loss": 0.151, + "step": 1996 + }, + { + "epoch": 0.6471160077770577, + "grad_norm": 0.8787545561790466, + "learning_rate": 4.56188292145953e-06, + "loss": 0.1628, + "step": 1997 + }, + { + "epoch": 0.6474400518470512, + "grad_norm": 0.9294182062149048, + "learning_rate": 4.561388210572101e-06, + "loss": 0.1683, + "step": 1998 + }, + { + "epoch": 0.6477640959170448, + "grad_norm": 0.9135559797286987, + "learning_rate": 4.56089324739071e-06, + "loss": 0.1687, + "step": 1999 + }, + { + "epoch": 0.6480881399870383, + "grad_norm": 0.9418561458587646, + "learning_rate": 4.560398031975937e-06, + "loss": 0.1566, + "step": 2000 + }, + { + "epoch": 0.6484121840570317, + "grad_norm": 0.8324270844459534, + "learning_rate": 4.55990256438839e-06, + "loss": 0.1436, + "step": 2001 + }, + { + "epoch": 0.6487362281270252, + "grad_norm": 0.9083436131477356, + "learning_rate": 4.559406844688711e-06, + "loss": 0.1526, + "step": 2002 + }, + { + "epoch": 0.6490602721970188, + "grad_norm": 0.8812525272369385, + "learning_rate": 4.558910872937572e-06, + "loss": 0.1622, + "step": 2003 + }, + { + "epoch": 0.6493843162670123, + "grad_norm": 0.8936917185783386, + "learning_rate": 4.558414649195673e-06, + "loss": 0.1517, + "step": 2004 + }, + { + "epoch": 0.6497083603370059, + "grad_norm": 0.8962358832359314, + "learning_rate": 4.557918173523747e-06, + "loss": 0.155, + "step": 2005 + }, + { + "epoch": 0.6500324044069994, + "grad_norm": 0.8386431336402893, + "learning_rate": 4.55742144598256e-06, + "loss": 0.153, + "step": 2006 + }, + { + "epoch": 0.6503564484769929, + "grad_norm": 0.8774044513702393, + "learning_rate": 4.5569244666329055e-06, + "loss": 0.154, + "step": 2007 + }, + { + "epoch": 0.6506804925469863, + "grad_norm": 0.8934404253959656, + "learning_rate": 4.5564272355356085e-06, + "loss": 0.1502, + "step": 2008 + }, + { + "epoch": 0.6510045366169799, + "grad_norm": 0.8520406484603882, + "learning_rate": 4.555929752751526e-06, + "loss": 0.1455, + "step": 2009 + }, + { + "epoch": 0.6513285806869734, + "grad_norm": 0.827907383441925, + "learning_rate": 4.5554320183415435e-06, + "loss": 0.1503, + "step": 2010 + }, + { + "epoch": 0.651652624756967, + "grad_norm": 0.9456222057342529, + "learning_rate": 4.5549340323665815e-06, + "loss": 0.1573, + "step": 2011 + }, + { + "epoch": 0.6519766688269605, + "grad_norm": 0.8829612731933594, + "learning_rate": 4.554435794887586e-06, + "loss": 0.163, + "step": 2012 + }, + { + "epoch": 0.652300712896954, + "grad_norm": 0.8887278437614441, + "learning_rate": 4.553937305965539e-06, + "loss": 0.1396, + "step": 2013 + }, + { + "epoch": 0.6526247569669476, + "grad_norm": 0.9172626733779907, + "learning_rate": 4.553438565661448e-06, + "loss": 0.1656, + "step": 2014 + }, + { + "epoch": 0.652948801036941, + "grad_norm": 0.8400613069534302, + "learning_rate": 4.552939574036356e-06, + "loss": 0.165, + "step": 2015 + }, + { + "epoch": 0.6532728451069345, + "grad_norm": 0.879304826259613, + "learning_rate": 4.552440331151334e-06, + "loss": 0.1559, + "step": 2016 + }, + { + "epoch": 0.653596889176928, + "grad_norm": 0.8935216069221497, + "learning_rate": 4.551940837067486e-06, + "loss": 0.164, + "step": 2017 + }, + { + "epoch": 0.6539209332469216, + "grad_norm": 0.8760375380516052, + "learning_rate": 4.551441091845942e-06, + "loss": 0.1501, + "step": 2018 + }, + { + "epoch": 0.6542449773169151, + "grad_norm": 0.8720047473907471, + "learning_rate": 4.550941095547869e-06, + "loss": 0.1545, + "step": 2019 + }, + { + "epoch": 0.6545690213869086, + "grad_norm": 0.8929414749145508, + "learning_rate": 4.55044084823446e-06, + "loss": 0.1543, + "step": 2020 + }, + { + "epoch": 0.6548930654569022, + "grad_norm": 0.9258971214294434, + "learning_rate": 4.5499403499669415e-06, + "loss": 0.164, + "step": 2021 + }, + { + "epoch": 0.6552171095268956, + "grad_norm": 0.8609963655471802, + "learning_rate": 4.549439600806568e-06, + "loss": 0.1483, + "step": 2022 + }, + { + "epoch": 0.6555411535968891, + "grad_norm": 0.9027585387229919, + "learning_rate": 4.548938600814629e-06, + "loss": 0.1656, + "step": 2023 + }, + { + "epoch": 0.6558651976668827, + "grad_norm": 0.8795916438102722, + "learning_rate": 4.5484373500524395e-06, + "loss": 0.1619, + "step": 2024 + }, + { + "epoch": 0.6561892417368762, + "grad_norm": 0.8832913041114807, + "learning_rate": 4.547935848581349e-06, + "loss": 0.1583, + "step": 2025 + }, + { + "epoch": 0.6565132858068697, + "grad_norm": 0.8522735834121704, + "learning_rate": 4.5474340964627365e-06, + "loss": 0.1629, + "step": 2026 + }, + { + "epoch": 0.6568373298768633, + "grad_norm": 0.8502964973449707, + "learning_rate": 4.5469320937580105e-06, + "loss": 0.1511, + "step": 2027 + }, + { + "epoch": 0.6571613739468568, + "grad_norm": 0.8551455736160278, + "learning_rate": 4.546429840528612e-06, + "loss": 0.147, + "step": 2028 + }, + { + "epoch": 0.6574854180168503, + "grad_norm": 0.9344658851623535, + "learning_rate": 4.545927336836013e-06, + "loss": 0.1699, + "step": 2029 + }, + { + "epoch": 0.6578094620868438, + "grad_norm": 0.8650590777397156, + "learning_rate": 4.545424582741714e-06, + "loss": 0.1465, + "step": 2030 + }, + { + "epoch": 0.6581335061568373, + "grad_norm": 0.9409444332122803, + "learning_rate": 4.544921578307246e-06, + "loss": 0.1746, + "step": 2031 + }, + { + "epoch": 0.6584575502268308, + "grad_norm": 0.8694846630096436, + "learning_rate": 4.544418323594175e-06, + "loss": 0.1583, + "step": 2032 + }, + { + "epoch": 0.6587815942968244, + "grad_norm": 0.9473041892051697, + "learning_rate": 4.543914818664092e-06, + "loss": 0.1662, + "step": 2033 + }, + { + "epoch": 0.6591056383668179, + "grad_norm": 0.792140007019043, + "learning_rate": 4.543411063578621e-06, + "loss": 0.1422, + "step": 2034 + }, + { + "epoch": 0.6594296824368114, + "grad_norm": 0.8943572640419006, + "learning_rate": 4.5429070583994185e-06, + "loss": 0.145, + "step": 2035 + }, + { + "epoch": 0.659753726506805, + "grad_norm": 0.8899739384651184, + "learning_rate": 4.542402803188168e-06, + "loss": 0.1653, + "step": 2036 + }, + { + "epoch": 0.6600777705767984, + "grad_norm": 0.8766211271286011, + "learning_rate": 4.5418982980065874e-06, + "loss": 0.1566, + "step": 2037 + }, + { + "epoch": 0.6604018146467919, + "grad_norm": 0.8960506319999695, + "learning_rate": 4.541393542916423e-06, + "loss": 0.1631, + "step": 2038 + }, + { + "epoch": 0.6607258587167855, + "grad_norm": 0.8128004670143127, + "learning_rate": 4.540888537979449e-06, + "loss": 0.1522, + "step": 2039 + }, + { + "epoch": 0.661049902786779, + "grad_norm": 0.9594781398773193, + "learning_rate": 4.540383283257477e-06, + "loss": 0.1645, + "step": 2040 + }, + { + "epoch": 0.6613739468567725, + "grad_norm": 0.9011057615280151, + "learning_rate": 4.539877778812342e-06, + "loss": 0.1463, + "step": 2041 + }, + { + "epoch": 0.6616979909267661, + "grad_norm": 0.8882570862770081, + "learning_rate": 4.539372024705916e-06, + "loss": 0.1532, + "step": 2042 + }, + { + "epoch": 0.6620220349967596, + "grad_norm": 0.8770557045936584, + "learning_rate": 4.538866021000096e-06, + "loss": 0.1573, + "step": 2043 + }, + { + "epoch": 0.662346079066753, + "grad_norm": 0.9828415513038635, + "learning_rate": 4.538359767756813e-06, + "loss": 0.1709, + "step": 2044 + }, + { + "epoch": 0.6626701231367466, + "grad_norm": 0.9431034326553345, + "learning_rate": 4.537853265038027e-06, + "loss": 0.1647, + "step": 2045 + }, + { + "epoch": 0.6629941672067401, + "grad_norm": 0.9096002578735352, + "learning_rate": 4.537346512905729e-06, + "loss": 0.1665, + "step": 2046 + }, + { + "epoch": 0.6633182112767336, + "grad_norm": 0.8507137298583984, + "learning_rate": 4.536839511421941e-06, + "loss": 0.1435, + "step": 2047 + }, + { + "epoch": 0.6636422553467272, + "grad_norm": 0.9147508144378662, + "learning_rate": 4.536332260648716e-06, + "loss": 0.1585, + "step": 2048 + }, + { + "epoch": 0.6639662994167207, + "grad_norm": 0.9033685326576233, + "learning_rate": 4.535824760648135e-06, + "loss": 0.161, + "step": 2049 + }, + { + "epoch": 0.6642903434867142, + "grad_norm": 0.8964657783508301, + "learning_rate": 4.535317011482311e-06, + "loss": 0.1644, + "step": 2050 + }, + { + "epoch": 0.6646143875567078, + "grad_norm": 0.8217459321022034, + "learning_rate": 4.534809013213389e-06, + "loss": 0.1467, + "step": 2051 + }, + { + "epoch": 0.6649384316267012, + "grad_norm": 0.871848464012146, + "learning_rate": 4.534300765903542e-06, + "loss": 0.1422, + "step": 2052 + }, + { + "epoch": 0.6652624756966947, + "grad_norm": 0.9146044254302979, + "learning_rate": 4.533792269614974e-06, + "loss": 0.1733, + "step": 2053 + }, + { + "epoch": 0.6655865197666883, + "grad_norm": 0.8984795808792114, + "learning_rate": 4.533283524409922e-06, + "loss": 0.1619, + "step": 2054 + }, + { + "epoch": 0.6659105638366818, + "grad_norm": 0.8733825087547302, + "learning_rate": 4.53277453035065e-06, + "loss": 0.1575, + "step": 2055 + }, + { + "epoch": 0.6662346079066753, + "grad_norm": 0.8426579833030701, + "learning_rate": 4.532265287499454e-06, + "loss": 0.1419, + "step": 2056 + }, + { + "epoch": 0.6665586519766689, + "grad_norm": 0.8925921320915222, + "learning_rate": 4.531755795918661e-06, + "loss": 0.1527, + "step": 2057 + }, + { + "epoch": 0.6668826960466624, + "grad_norm": 0.9321977496147156, + "learning_rate": 4.531246055670627e-06, + "loss": 0.1731, + "step": 2058 + }, + { + "epoch": 0.6672067401166558, + "grad_norm": 0.8613082766532898, + "learning_rate": 4.53073606681774e-06, + "loss": 0.1434, + "step": 2059 + }, + { + "epoch": 0.6675307841866494, + "grad_norm": 0.906181275844574, + "learning_rate": 4.530225829422418e-06, + "loss": 0.1543, + "step": 2060 + }, + { + "epoch": 0.6678548282566429, + "grad_norm": 0.9868515729904175, + "learning_rate": 4.529715343547107e-06, + "loss": 0.1832, + "step": 2061 + }, + { + "epoch": 0.6681788723266364, + "grad_norm": 0.8776630163192749, + "learning_rate": 4.5292046092542885e-06, + "loss": 0.1487, + "step": 2062 + }, + { + "epoch": 0.66850291639663, + "grad_norm": 0.8811656832695007, + "learning_rate": 4.52869362660647e-06, + "loss": 0.1584, + "step": 2063 + }, + { + "epoch": 0.6688269604666235, + "grad_norm": 0.8803176879882812, + "learning_rate": 4.5281823956661905e-06, + "loss": 0.1545, + "step": 2064 + }, + { + "epoch": 0.669151004536617, + "grad_norm": 0.8835181593894958, + "learning_rate": 4.527670916496021e-06, + "loss": 0.1579, + "step": 2065 + }, + { + "epoch": 0.6694750486066104, + "grad_norm": 0.9012269973754883, + "learning_rate": 4.52715918915856e-06, + "loss": 0.16, + "step": 2066 + }, + { + "epoch": 0.669799092676604, + "grad_norm": 0.8266863226890564, + "learning_rate": 4.526647213716438e-06, + "loss": 0.1454, + "step": 2067 + }, + { + "epoch": 0.6701231367465975, + "grad_norm": 0.9318245649337769, + "learning_rate": 4.526134990232317e-06, + "loss": 0.1644, + "step": 2068 + }, + { + "epoch": 0.670447180816591, + "grad_norm": 0.8861281275749207, + "learning_rate": 4.525622518768888e-06, + "loss": 0.1601, + "step": 2069 + }, + { + "epoch": 0.6707712248865846, + "grad_norm": 0.8727645874023438, + "learning_rate": 4.5251097993888726e-06, + "loss": 0.1538, + "step": 2070 + }, + { + "epoch": 0.6710952689565781, + "grad_norm": 0.8234463930130005, + "learning_rate": 4.524596832155022e-06, + "loss": 0.1543, + "step": 2071 + }, + { + "epoch": 0.6714193130265717, + "grad_norm": 0.8433378338813782, + "learning_rate": 4.524083617130118e-06, + "loss": 0.1475, + "step": 2072 + }, + { + "epoch": 0.6717433570965652, + "grad_norm": 0.9181938171386719, + "learning_rate": 4.523570154376975e-06, + "loss": 0.1601, + "step": 2073 + }, + { + "epoch": 0.6720674011665586, + "grad_norm": 0.8135218620300293, + "learning_rate": 4.5230564439584335e-06, + "loss": 0.1459, + "step": 2074 + }, + { + "epoch": 0.6723914452365521, + "grad_norm": 0.8569528460502625, + "learning_rate": 4.522542485937369e-06, + "loss": 0.152, + "step": 2075 + }, + { + "epoch": 0.6727154893065457, + "grad_norm": 0.875869870185852, + "learning_rate": 4.522028280376683e-06, + "loss": 0.1576, + "step": 2076 + }, + { + "epoch": 0.6730395333765392, + "grad_norm": 0.9111397862434387, + "learning_rate": 4.521513827339311e-06, + "loss": 0.1613, + "step": 2077 + }, + { + "epoch": 0.6733635774465327, + "grad_norm": 0.9533864855766296, + "learning_rate": 4.5209991268882165e-06, + "loss": 0.1723, + "step": 2078 + }, + { + "epoch": 0.6736876215165263, + "grad_norm": 0.9166319966316223, + "learning_rate": 4.520484179086394e-06, + "loss": 0.1597, + "step": 2079 + }, + { + "epoch": 0.6740116655865198, + "grad_norm": 0.8753485083580017, + "learning_rate": 4.519968983996867e-06, + "loss": 0.1572, + "step": 2080 + }, + { + "epoch": 0.6743357096565132, + "grad_norm": 0.9850584864616394, + "learning_rate": 4.519453541682691e-06, + "loss": 0.1632, + "step": 2081 + }, + { + "epoch": 0.6746597537265068, + "grad_norm": 0.8764849901199341, + "learning_rate": 4.518937852206952e-06, + "loss": 0.1594, + "step": 2082 + }, + { + "epoch": 0.6749837977965003, + "grad_norm": 0.8675936460494995, + "learning_rate": 4.518421915632764e-06, + "loss": 0.1493, + "step": 2083 + }, + { + "epoch": 0.6753078418664938, + "grad_norm": 0.8667171001434326, + "learning_rate": 4.5179057320232735e-06, + "loss": 0.134, + "step": 2084 + }, + { + "epoch": 0.6756318859364874, + "grad_norm": 0.8813750147819519, + "learning_rate": 4.517389301441657e-06, + "loss": 0.1566, + "step": 2085 + }, + { + "epoch": 0.6759559300064809, + "grad_norm": 0.8764129281044006, + "learning_rate": 4.51687262395112e-06, + "loss": 0.15, + "step": 2086 + }, + { + "epoch": 0.6762799740764744, + "grad_norm": 0.8690442442893982, + "learning_rate": 4.516355699614897e-06, + "loss": 0.1537, + "step": 2087 + }, + { + "epoch": 0.6766040181464679, + "grad_norm": 0.9069793224334717, + "learning_rate": 4.515838528496257e-06, + "loss": 0.1676, + "step": 2088 + }, + { + "epoch": 0.6769280622164614, + "grad_norm": 0.8847238421440125, + "learning_rate": 4.5153211106584965e-06, + "loss": 0.1581, + "step": 2089 + }, + { + "epoch": 0.6772521062864549, + "grad_norm": 0.8980826139450073, + "learning_rate": 4.514803446164941e-06, + "loss": 0.1544, + "step": 2090 + }, + { + "epoch": 0.6775761503564485, + "grad_norm": 0.9628041386604309, + "learning_rate": 4.514285535078949e-06, + "loss": 0.1632, + "step": 2091 + }, + { + "epoch": 0.677900194426442, + "grad_norm": 0.8706278204917908, + "learning_rate": 4.513767377463908e-06, + "loss": 0.1526, + "step": 2092 + }, + { + "epoch": 0.6782242384964355, + "grad_norm": 0.8483056426048279, + "learning_rate": 4.513248973383234e-06, + "loss": 0.1558, + "step": 2093 + }, + { + "epoch": 0.6785482825664291, + "grad_norm": 0.9565305709838867, + "learning_rate": 4.512730322900375e-06, + "loss": 0.1809, + "step": 2094 + }, + { + "epoch": 0.6788723266364226, + "grad_norm": 0.9398934841156006, + "learning_rate": 4.51221142607881e-06, + "loss": 0.1675, + "step": 2095 + }, + { + "epoch": 0.679196370706416, + "grad_norm": 0.821510910987854, + "learning_rate": 4.511692282982047e-06, + "loss": 0.1437, + "step": 2096 + }, + { + "epoch": 0.6795204147764096, + "grad_norm": 0.8104325532913208, + "learning_rate": 4.511172893673621e-06, + "loss": 0.1493, + "step": 2097 + }, + { + "epoch": 0.6798444588464031, + "grad_norm": 0.8659233450889587, + "learning_rate": 4.510653258217103e-06, + "loss": 0.162, + "step": 2098 + }, + { + "epoch": 0.6801685029163966, + "grad_norm": 0.8639686703681946, + "learning_rate": 4.5101333766760926e-06, + "loss": 0.1578, + "step": 2099 + }, + { + "epoch": 0.6804925469863902, + "grad_norm": 0.8878895044326782, + "learning_rate": 4.509613249114215e-06, + "loss": 0.1631, + "step": 2100 + }, + { + "epoch": 0.6808165910563837, + "grad_norm": 0.8527207970619202, + "learning_rate": 4.509092875595131e-06, + "loss": 0.1542, + "step": 2101 + }, + { + "epoch": 0.6811406351263772, + "grad_norm": 0.8527224659919739, + "learning_rate": 4.508572256182528e-06, + "loss": 0.1533, + "step": 2102 + }, + { + "epoch": 0.6814646791963707, + "grad_norm": 0.840473473072052, + "learning_rate": 4.508051390940125e-06, + "loss": 0.1435, + "step": 2103 + }, + { + "epoch": 0.6817887232663642, + "grad_norm": 0.8408612012863159, + "learning_rate": 4.507530279931673e-06, + "loss": 0.1476, + "step": 2104 + }, + { + "epoch": 0.6821127673363577, + "grad_norm": 1.008131980895996, + "learning_rate": 4.5070089232209465e-06, + "loss": 0.1717, + "step": 2105 + }, + { + "epoch": 0.6824368114063513, + "grad_norm": 0.8935782313346863, + "learning_rate": 4.506487320871758e-06, + "loss": 0.1552, + "step": 2106 + }, + { + "epoch": 0.6827608554763448, + "grad_norm": 0.8052367568016052, + "learning_rate": 4.5059654729479474e-06, + "loss": 0.1458, + "step": 2107 + }, + { + "epoch": 0.6830848995463383, + "grad_norm": 0.9380152225494385, + "learning_rate": 4.505443379513381e-06, + "loss": 0.16, + "step": 2108 + }, + { + "epoch": 0.6834089436163319, + "grad_norm": 0.9281724095344543, + "learning_rate": 4.5049210406319585e-06, + "loss": 0.1549, + "step": 2109 + }, + { + "epoch": 0.6837329876863253, + "grad_norm": 0.8564958572387695, + "learning_rate": 4.5043984563676105e-06, + "loss": 0.1492, + "step": 2110 + }, + { + "epoch": 0.6840570317563188, + "grad_norm": 0.9268948435783386, + "learning_rate": 4.503875626784295e-06, + "loss": 0.1694, + "step": 2111 + }, + { + "epoch": 0.6843810758263124, + "grad_norm": 0.8347535133361816, + "learning_rate": 4.503352551946003e-06, + "loss": 0.1386, + "step": 2112 + }, + { + "epoch": 0.6847051198963059, + "grad_norm": 1.0070157051086426, + "learning_rate": 4.5028292319167515e-06, + "loss": 0.1743, + "step": 2113 + }, + { + "epoch": 0.6850291639662994, + "grad_norm": 0.9377530813217163, + "learning_rate": 4.502305666760592e-06, + "loss": 0.159, + "step": 2114 + }, + { + "epoch": 0.685353208036293, + "grad_norm": 0.869188666343689, + "learning_rate": 4.501781856541601e-06, + "loss": 0.1466, + "step": 2115 + }, + { + "epoch": 0.6856772521062865, + "grad_norm": 0.8491179943084717, + "learning_rate": 4.50125780132389e-06, + "loss": 0.1486, + "step": 2116 + }, + { + "epoch": 0.68600129617628, + "grad_norm": 0.9667462110519409, + "learning_rate": 4.500733501171599e-06, + "loss": 0.1893, + "step": 2117 + }, + { + "epoch": 0.6863253402462735, + "grad_norm": 0.8738124370574951, + "learning_rate": 4.500208956148895e-06, + "loss": 0.1741, + "step": 2118 + }, + { + "epoch": 0.686649384316267, + "grad_norm": 0.8235524892807007, + "learning_rate": 4.499684166319978e-06, + "loss": 0.153, + "step": 2119 + }, + { + "epoch": 0.6869734283862605, + "grad_norm": 0.8553236722946167, + "learning_rate": 4.499159131749079e-06, + "loss": 0.149, + "step": 2120 + }, + { + "epoch": 0.687297472456254, + "grad_norm": 0.9009736180305481, + "learning_rate": 4.498633852500455e-06, + "loss": 0.1642, + "step": 2121 + }, + { + "epoch": 0.6876215165262476, + "grad_norm": 0.9023570418357849, + "learning_rate": 4.498108328638395e-06, + "loss": 0.1455, + "step": 2122 + }, + { + "epoch": 0.6879455605962411, + "grad_norm": 0.9002334475517273, + "learning_rate": 4.4975825602272185e-06, + "loss": 0.1607, + "step": 2123 + }, + { + "epoch": 0.6882696046662347, + "grad_norm": 0.9422554969787598, + "learning_rate": 4.497056547331276e-06, + "loss": 0.1688, + "step": 2124 + }, + { + "epoch": 0.6885936487362281, + "grad_norm": 0.8739017248153687, + "learning_rate": 4.496530290014945e-06, + "loss": 0.1644, + "step": 2125 + }, + { + "epoch": 0.6889176928062216, + "grad_norm": 0.9366029500961304, + "learning_rate": 4.496003788342633e-06, + "loss": 0.1601, + "step": 2126 + }, + { + "epoch": 0.6892417368762151, + "grad_norm": 0.899392306804657, + "learning_rate": 4.495477042378781e-06, + "loss": 0.1522, + "step": 2127 + }, + { + "epoch": 0.6895657809462087, + "grad_norm": 0.8177936673164368, + "learning_rate": 4.494950052187857e-06, + "loss": 0.1438, + "step": 2128 + }, + { + "epoch": 0.6898898250162022, + "grad_norm": 0.857815682888031, + "learning_rate": 4.494422817834359e-06, + "loss": 0.1548, + "step": 2129 + }, + { + "epoch": 0.6902138690861958, + "grad_norm": 0.8262993097305298, + "learning_rate": 4.493895339382815e-06, + "loss": 0.1498, + "step": 2130 + }, + { + "epoch": 0.6905379131561893, + "grad_norm": 0.8081302046775818, + "learning_rate": 4.493367616897785e-06, + "loss": 0.148, + "step": 2131 + }, + { + "epoch": 0.6908619572261827, + "grad_norm": 0.8847200870513916, + "learning_rate": 4.4928396504438555e-06, + "loss": 0.1638, + "step": 2132 + }, + { + "epoch": 0.6911860012961762, + "grad_norm": 0.8429677486419678, + "learning_rate": 4.4923114400856445e-06, + "loss": 0.1586, + "step": 2133 + }, + { + "epoch": 0.6915100453661698, + "grad_norm": 0.9239067435264587, + "learning_rate": 4.491782985887802e-06, + "loss": 0.1628, + "step": 2134 + }, + { + "epoch": 0.6918340894361633, + "grad_norm": 0.8306019902229309, + "learning_rate": 4.491254287915003e-06, + "loss": 0.1402, + "step": 2135 + }, + { + "epoch": 0.6921581335061568, + "grad_norm": 0.9087281823158264, + "learning_rate": 4.490725346231954e-06, + "loss": 0.168, + "step": 2136 + }, + { + "epoch": 0.6924821775761504, + "grad_norm": 0.8837271928787231, + "learning_rate": 4.4901961609033965e-06, + "loss": 0.1625, + "step": 2137 + }, + { + "epoch": 0.6928062216461439, + "grad_norm": 0.8362112045288086, + "learning_rate": 4.489666731994095e-06, + "loss": 0.1521, + "step": 2138 + }, + { + "epoch": 0.6931302657161373, + "grad_norm": 0.8731306195259094, + "learning_rate": 4.489137059568847e-06, + "loss": 0.1504, + "step": 2139 + }, + { + "epoch": 0.6934543097861309, + "grad_norm": 0.9544700980186462, + "learning_rate": 4.48860714369248e-06, + "loss": 0.1722, + "step": 2140 + }, + { + "epoch": 0.6937783538561244, + "grad_norm": 0.8797779083251953, + "learning_rate": 4.488076984429849e-06, + "loss": 0.1574, + "step": 2141 + }, + { + "epoch": 0.6941023979261179, + "grad_norm": 0.8555868864059448, + "learning_rate": 4.4875465818458404e-06, + "loss": 0.1483, + "step": 2142 + }, + { + "epoch": 0.6944264419961115, + "grad_norm": 0.9178819060325623, + "learning_rate": 4.4870159360053725e-06, + "loss": 0.1646, + "step": 2143 + }, + { + "epoch": 0.694750486066105, + "grad_norm": 0.8088469505310059, + "learning_rate": 4.4864850469733886e-06, + "loss": 0.1365, + "step": 2144 + }, + { + "epoch": 0.6950745301360985, + "grad_norm": 0.9104022979736328, + "learning_rate": 4.485953914814867e-06, + "loss": 0.1477, + "step": 2145 + }, + { + "epoch": 0.6953985742060921, + "grad_norm": 0.9033350348472595, + "learning_rate": 4.485422539594811e-06, + "loss": 0.1532, + "step": 2146 + }, + { + "epoch": 0.6957226182760855, + "grad_norm": 0.9154496192932129, + "learning_rate": 4.4848909213782566e-06, + "loss": 0.1596, + "step": 2147 + }, + { + "epoch": 0.696046662346079, + "grad_norm": 0.8063889741897583, + "learning_rate": 4.484359060230269e-06, + "loss": 0.1341, + "step": 2148 + }, + { + "epoch": 0.6963707064160726, + "grad_norm": 0.8335157632827759, + "learning_rate": 4.483826956215942e-06, + "loss": 0.1493, + "step": 2149 + }, + { + "epoch": 0.6966947504860661, + "grad_norm": 0.8637626767158508, + "learning_rate": 4.4832946094004e-06, + "loss": 0.1513, + "step": 2150 + }, + { + "epoch": 0.6970187945560596, + "grad_norm": 0.9334248900413513, + "learning_rate": 4.482762019848799e-06, + "loss": 0.1634, + "step": 2151 + }, + { + "epoch": 0.6973428386260532, + "grad_norm": 0.8626821041107178, + "learning_rate": 4.48222918762632e-06, + "loss": 0.1487, + "step": 2152 + }, + { + "epoch": 0.6976668826960467, + "grad_norm": 0.9040731191635132, + "learning_rate": 4.481696112798179e-06, + "loss": 0.158, + "step": 2153 + }, + { + "epoch": 0.6979909267660401, + "grad_norm": 0.8181210160255432, + "learning_rate": 4.481162795429618e-06, + "loss": 0.147, + "step": 2154 + }, + { + "epoch": 0.6983149708360337, + "grad_norm": 0.9151508808135986, + "learning_rate": 4.480629235585909e-06, + "loss": 0.1527, + "step": 2155 + }, + { + "epoch": 0.6986390149060272, + "grad_norm": 0.909498929977417, + "learning_rate": 4.480095433332357e-06, + "loss": 0.1604, + "step": 2156 + }, + { + "epoch": 0.6989630589760207, + "grad_norm": 0.9044917821884155, + "learning_rate": 4.4795613887342916e-06, + "loss": 0.1577, + "step": 2157 + }, + { + "epoch": 0.6992871030460143, + "grad_norm": 0.8315892219543457, + "learning_rate": 4.479027101857076e-06, + "loss": 0.1459, + "step": 2158 + }, + { + "epoch": 0.6996111471160078, + "grad_norm": 0.9254205822944641, + "learning_rate": 4.4784925727661025e-06, + "loss": 0.1633, + "step": 2159 + }, + { + "epoch": 0.6999351911860013, + "grad_norm": 0.9383320808410645, + "learning_rate": 4.47795780152679e-06, + "loss": 0.1617, + "step": 2160 + }, + { + "epoch": 0.7002592352559948, + "grad_norm": 0.8765949606895447, + "learning_rate": 4.477422788204592e-06, + "loss": 0.1571, + "step": 2161 + }, + { + "epoch": 0.7005832793259883, + "grad_norm": 0.8948960900306702, + "learning_rate": 4.476887532864986e-06, + "loss": 0.1367, + "step": 2162 + }, + { + "epoch": 0.7009073233959818, + "grad_norm": 0.9318114519119263, + "learning_rate": 4.476352035573486e-06, + "loss": 0.1636, + "step": 2163 + }, + { + "epoch": 0.7012313674659754, + "grad_norm": 0.8219728469848633, + "learning_rate": 4.475816296395627e-06, + "loss": 0.1455, + "step": 2164 + }, + { + "epoch": 0.7015554115359689, + "grad_norm": 0.952003538608551, + "learning_rate": 4.475280315396982e-06, + "loss": 0.1677, + "step": 2165 + }, + { + "epoch": 0.7018794556059624, + "grad_norm": 0.9319960474967957, + "learning_rate": 4.474744092643149e-06, + "loss": 0.157, + "step": 2166 + }, + { + "epoch": 0.702203499675956, + "grad_norm": 0.8745998740196228, + "learning_rate": 4.474207628199756e-06, + "loss": 0.1497, + "step": 2167 + }, + { + "epoch": 0.7025275437459495, + "grad_norm": 0.8776024580001831, + "learning_rate": 4.47367092213246e-06, + "loss": 0.152, + "step": 2168 + }, + { + "epoch": 0.7028515878159429, + "grad_norm": 0.845687985420227, + "learning_rate": 4.473133974506951e-06, + "loss": 0.1473, + "step": 2169 + }, + { + "epoch": 0.7031756318859365, + "grad_norm": 0.9918162226676941, + "learning_rate": 4.472596785388944e-06, + "loss": 0.1871, + "step": 2170 + }, + { + "epoch": 0.70349967595593, + "grad_norm": 0.9364629983901978, + "learning_rate": 4.472059354844187e-06, + "loss": 0.16, + "step": 2171 + }, + { + "epoch": 0.7038237200259235, + "grad_norm": 0.8628015518188477, + "learning_rate": 4.4715216829384566e-06, + "loss": 0.1676, + "step": 2172 + }, + { + "epoch": 0.7041477640959171, + "grad_norm": 0.9118626713752747, + "learning_rate": 4.470983769737557e-06, + "loss": 0.1758, + "step": 2173 + }, + { + "epoch": 0.7044718081659106, + "grad_norm": 0.9084286689758301, + "learning_rate": 4.470445615307325e-06, + "loss": 0.1684, + "step": 2174 + }, + { + "epoch": 0.7047958522359041, + "grad_norm": 0.8909494280815125, + "learning_rate": 4.4699072197136255e-06, + "loss": 0.1707, + "step": 2175 + }, + { + "epoch": 0.7051198963058976, + "grad_norm": 0.9201353192329407, + "learning_rate": 4.469368583022352e-06, + "loss": 0.167, + "step": 2176 + }, + { + "epoch": 0.7054439403758911, + "grad_norm": 0.9684481024742126, + "learning_rate": 4.468829705299429e-06, + "loss": 0.1635, + "step": 2177 + }, + { + "epoch": 0.7057679844458846, + "grad_norm": 0.9111968278884888, + "learning_rate": 4.4682905866108094e-06, + "loss": 0.1672, + "step": 2178 + }, + { + "epoch": 0.7060920285158782, + "grad_norm": 0.9797288775444031, + "learning_rate": 4.467751227022478e-06, + "loss": 0.1512, + "step": 2179 + }, + { + "epoch": 0.7064160725858717, + "grad_norm": 0.8910669088363647, + "learning_rate": 4.467211626600444e-06, + "loss": 0.1626, + "step": 2180 + }, + { + "epoch": 0.7067401166558652, + "grad_norm": 0.8507121205329895, + "learning_rate": 4.466671785410752e-06, + "loss": 0.1566, + "step": 2181 + }, + { + "epoch": 0.7070641607258588, + "grad_norm": 0.8694670796394348, + "learning_rate": 4.4661317035194716e-06, + "loss": 0.155, + "step": 2182 + }, + { + "epoch": 0.7073882047958522, + "grad_norm": 0.9297208189964294, + "learning_rate": 4.4655913809927045e-06, + "loss": 0.1554, + "step": 2183 + }, + { + "epoch": 0.7077122488658457, + "grad_norm": 1.0536330938339233, + "learning_rate": 4.4650508178965814e-06, + "loss": 0.1626, + "step": 2184 + }, + { + "epoch": 0.7080362929358393, + "grad_norm": 0.979632556438446, + "learning_rate": 4.464510014297261e-06, + "loss": 0.1503, + "step": 2185 + }, + { + "epoch": 0.7083603370058328, + "grad_norm": 0.869288980960846, + "learning_rate": 4.4639689702609326e-06, + "loss": 0.1545, + "step": 2186 + }, + { + "epoch": 0.7086843810758263, + "grad_norm": 0.9722819328308105, + "learning_rate": 4.463427685853815e-06, + "loss": 0.1575, + "step": 2187 + }, + { + "epoch": 0.7090084251458199, + "grad_norm": 0.958224892616272, + "learning_rate": 4.462886161142157e-06, + "loss": 0.1641, + "step": 2188 + }, + { + "epoch": 0.7093324692158134, + "grad_norm": 0.9742621183395386, + "learning_rate": 4.4623443961922334e-06, + "loss": 0.1651, + "step": 2189 + }, + { + "epoch": 0.7096565132858069, + "grad_norm": 0.92420494556427, + "learning_rate": 4.461802391070354e-06, + "loss": 0.1536, + "step": 2190 + }, + { + "epoch": 0.7099805573558003, + "grad_norm": 0.9209905862808228, + "learning_rate": 4.4612601458428525e-06, + "loss": 0.153, + "step": 2191 + }, + { + "epoch": 0.7103046014257939, + "grad_norm": 1.007217526435852, + "learning_rate": 4.460717660576097e-06, + "loss": 0.1683, + "step": 2192 + }, + { + "epoch": 0.7106286454957874, + "grad_norm": 0.8647215366363525, + "learning_rate": 4.46017493533648e-06, + "loss": 0.153, + "step": 2193 + }, + { + "epoch": 0.710952689565781, + "grad_norm": 0.9210792183876038, + "learning_rate": 4.459631970190428e-06, + "loss": 0.1606, + "step": 2194 + }, + { + "epoch": 0.7112767336357745, + "grad_norm": 0.8727303743362427, + "learning_rate": 4.4590887652043925e-06, + "loss": 0.1507, + "step": 2195 + }, + { + "epoch": 0.711600777705768, + "grad_norm": 1.004441261291504, + "learning_rate": 4.458545320444857e-06, + "loss": 0.1669, + "step": 2196 + }, + { + "epoch": 0.7119248217757616, + "grad_norm": 0.9051446914672852, + "learning_rate": 4.458001635978335e-06, + "loss": 0.1516, + "step": 2197 + }, + { + "epoch": 0.712248865845755, + "grad_norm": 0.879884660243988, + "learning_rate": 4.457457711871369e-06, + "loss": 0.1618, + "step": 2198 + }, + { + "epoch": 0.7125729099157485, + "grad_norm": 0.9076919555664062, + "learning_rate": 4.4569135481905274e-06, + "loss": 0.1543, + "step": 2199 + }, + { + "epoch": 0.712896953985742, + "grad_norm": 0.8741816878318787, + "learning_rate": 4.456369145002412e-06, + "loss": 0.1498, + "step": 2200 + }, + { + "epoch": 0.7132209980557356, + "grad_norm": 0.9390980005264282, + "learning_rate": 4.455824502373653e-06, + "loss": 0.1487, + "step": 2201 + }, + { + "epoch": 0.7135450421257291, + "grad_norm": 0.8313080072402954, + "learning_rate": 4.455279620370908e-06, + "loss": 0.1516, + "step": 2202 + }, + { + "epoch": 0.7138690861957226, + "grad_norm": 0.8874267339706421, + "learning_rate": 4.454734499060867e-06, + "loss": 0.161, + "step": 2203 + }, + { + "epoch": 0.7141931302657162, + "grad_norm": 0.8838033676147461, + "learning_rate": 4.454189138510246e-06, + "loss": 0.1545, + "step": 2204 + }, + { + "epoch": 0.7145171743357096, + "grad_norm": 1.1029689311981201, + "learning_rate": 4.453643538785793e-06, + "loss": 0.1689, + "step": 2205 + }, + { + "epoch": 0.7148412184057031, + "grad_norm": 0.9252147674560547, + "learning_rate": 4.453097699954282e-06, + "loss": 0.1704, + "step": 2206 + }, + { + "epoch": 0.7151652624756967, + "grad_norm": 0.8694695234298706, + "learning_rate": 4.452551622082522e-06, + "loss": 0.1456, + "step": 2207 + }, + { + "epoch": 0.7154893065456902, + "grad_norm": 1.0107353925704956, + "learning_rate": 4.452005305237344e-06, + "loss": 0.1657, + "step": 2208 + }, + { + "epoch": 0.7158133506156837, + "grad_norm": 0.841675877571106, + "learning_rate": 4.451458749485614e-06, + "loss": 0.1541, + "step": 2209 + }, + { + "epoch": 0.7161373946856773, + "grad_norm": 0.9358766674995422, + "learning_rate": 4.4509119548942245e-06, + "loss": 0.1616, + "step": 2210 + }, + { + "epoch": 0.7164614387556708, + "grad_norm": 0.8442674875259399, + "learning_rate": 4.450364921530099e-06, + "loss": 0.155, + "step": 2211 + }, + { + "epoch": 0.7167854828256643, + "grad_norm": 0.9010113477706909, + "learning_rate": 4.449817649460187e-06, + "loss": 0.1539, + "step": 2212 + }, + { + "epoch": 0.7171095268956578, + "grad_norm": 0.8915073275566101, + "learning_rate": 4.449270138751471e-06, + "loss": 0.1473, + "step": 2213 + }, + { + "epoch": 0.7174335709656513, + "grad_norm": 0.9104732275009155, + "learning_rate": 4.4487223894709606e-06, + "loss": 0.1461, + "step": 2214 + }, + { + "epoch": 0.7177576150356448, + "grad_norm": 0.8660483956336975, + "learning_rate": 4.448174401685694e-06, + "loss": 0.1528, + "step": 2215 + }, + { + "epoch": 0.7180816591056384, + "grad_norm": 0.897094190120697, + "learning_rate": 4.447626175462741e-06, + "loss": 0.1566, + "step": 2216 + }, + { + "epoch": 0.7184057031756319, + "grad_norm": 0.9597735404968262, + "learning_rate": 4.447077710869199e-06, + "loss": 0.1585, + "step": 2217 + }, + { + "epoch": 0.7187297472456254, + "grad_norm": 0.880084753036499, + "learning_rate": 4.4465290079721935e-06, + "loss": 0.1567, + "step": 2218 + }, + { + "epoch": 0.719053791315619, + "grad_norm": 0.8820602297782898, + "learning_rate": 4.445980066838882e-06, + "loss": 0.1564, + "step": 2219 + }, + { + "epoch": 0.7193778353856124, + "grad_norm": 0.9745096564292908, + "learning_rate": 4.4454308875364486e-06, + "loss": 0.1735, + "step": 2220 + }, + { + "epoch": 0.7197018794556059, + "grad_norm": 0.8503068089485168, + "learning_rate": 4.444881470132108e-06, + "loss": 0.1488, + "step": 2221 + }, + { + "epoch": 0.7200259235255995, + "grad_norm": 0.8765119314193726, + "learning_rate": 4.444331814693103e-06, + "loss": 0.1539, + "step": 2222 + }, + { + "epoch": 0.720349967595593, + "grad_norm": 0.8700873255729675, + "learning_rate": 4.443781921286706e-06, + "loss": 0.1485, + "step": 2223 + }, + { + "epoch": 0.7206740116655865, + "grad_norm": 0.8490604162216187, + "learning_rate": 4.4432317899802205e-06, + "loss": 0.157, + "step": 2224 + }, + { + "epoch": 0.7209980557355801, + "grad_norm": 0.8531603813171387, + "learning_rate": 4.442681420840974e-06, + "loss": 0.1606, + "step": 2225 + }, + { + "epoch": 0.7213220998055736, + "grad_norm": 0.8722713589668274, + "learning_rate": 4.44213081393633e-06, + "loss": 0.1389, + "step": 2226 + }, + { + "epoch": 0.721646143875567, + "grad_norm": 0.9597254991531372, + "learning_rate": 4.441579969333675e-06, + "loss": 0.1544, + "step": 2227 + }, + { + "epoch": 0.7219701879455606, + "grad_norm": 0.896874725818634, + "learning_rate": 4.441028887100427e-06, + "loss": 0.1721, + "step": 2228 + }, + { + "epoch": 0.7222942320155541, + "grad_norm": 0.8581936955451965, + "learning_rate": 4.4404775673040346e-06, + "loss": 0.159, + "step": 2229 + }, + { + "epoch": 0.7226182760855476, + "grad_norm": 0.8402219414710999, + "learning_rate": 4.4399260100119726e-06, + "loss": 0.1543, + "step": 2230 + }, + { + "epoch": 0.7229423201555412, + "grad_norm": 0.9325534105300903, + "learning_rate": 4.439374215291748e-06, + "loss": 0.1589, + "step": 2231 + }, + { + "epoch": 0.7232663642255347, + "grad_norm": 0.805816650390625, + "learning_rate": 4.438822183210894e-06, + "loss": 0.1422, + "step": 2232 + }, + { + "epoch": 0.7235904082955282, + "grad_norm": 0.8733918070793152, + "learning_rate": 4.438269913836972e-06, + "loss": 0.1603, + "step": 2233 + }, + { + "epoch": 0.7239144523655218, + "grad_norm": 0.9027920961380005, + "learning_rate": 4.437717407237578e-06, + "loss": 0.1584, + "step": 2234 + }, + { + "epoch": 0.7242384964355152, + "grad_norm": 0.8571335077285767, + "learning_rate": 4.437164663480332e-06, + "loss": 0.1588, + "step": 2235 + }, + { + "epoch": 0.7245625405055087, + "grad_norm": 0.8303107619285583, + "learning_rate": 4.436611682632884e-06, + "loss": 0.153, + "step": 2236 + }, + { + "epoch": 0.7248865845755023, + "grad_norm": 0.8509790897369385, + "learning_rate": 4.436058464762915e-06, + "loss": 0.1602, + "step": 2237 + }, + { + "epoch": 0.7252106286454958, + "grad_norm": 0.8622452616691589, + "learning_rate": 4.435505009938131e-06, + "loss": 0.1532, + "step": 2238 + }, + { + "epoch": 0.7255346727154893, + "grad_norm": 0.8336579203605652, + "learning_rate": 4.434951318226272e-06, + "loss": 0.1349, + "step": 2239 + }, + { + "epoch": 0.7258587167854829, + "grad_norm": 0.9228907227516174, + "learning_rate": 4.434397389695102e-06, + "loss": 0.1675, + "step": 2240 + }, + { + "epoch": 0.7261827608554764, + "grad_norm": 0.8288503289222717, + "learning_rate": 4.433843224412419e-06, + "loss": 0.1423, + "step": 2241 + }, + { + "epoch": 0.7265068049254698, + "grad_norm": 0.9091185927391052, + "learning_rate": 4.4332888224460466e-06, + "loss": 0.1498, + "step": 2242 + }, + { + "epoch": 0.7268308489954634, + "grad_norm": 0.8939113616943359, + "learning_rate": 4.432734183863837e-06, + "loss": 0.152, + "step": 2243 + }, + { + "epoch": 0.7271548930654569, + "grad_norm": 0.8943285942077637, + "learning_rate": 4.432179308733674e-06, + "loss": 0.1574, + "step": 2244 + }, + { + "epoch": 0.7274789371354504, + "grad_norm": 0.8595091104507446, + "learning_rate": 4.43162419712347e-06, + "loss": 0.1569, + "step": 2245 + }, + { + "epoch": 0.727802981205444, + "grad_norm": 0.874617874622345, + "learning_rate": 4.431068849101162e-06, + "loss": 0.1532, + "step": 2246 + }, + { + "epoch": 0.7281270252754375, + "grad_norm": 0.9029631018638611, + "learning_rate": 4.4305132647347215e-06, + "loss": 0.1656, + "step": 2247 + }, + { + "epoch": 0.728451069345431, + "grad_norm": 0.9923983812332153, + "learning_rate": 4.429957444092146e-06, + "loss": 0.1887, + "step": 2248 + }, + { + "epoch": 0.7287751134154244, + "grad_norm": 0.8730635046958923, + "learning_rate": 4.429401387241464e-06, + "loss": 0.1687, + "step": 2249 + }, + { + "epoch": 0.729099157485418, + "grad_norm": 0.9126926064491272, + "learning_rate": 4.428845094250729e-06, + "loss": 0.1624, + "step": 2250 + }, + { + "epoch": 0.7294232015554115, + "grad_norm": 0.8735573887825012, + "learning_rate": 4.428288565188028e-06, + "loss": 0.1564, + "step": 2251 + }, + { + "epoch": 0.729747245625405, + "grad_norm": 0.8695078492164612, + "learning_rate": 4.427731800121473e-06, + "loss": 0.1528, + "step": 2252 + }, + { + "epoch": 0.7300712896953986, + "grad_norm": 0.8787115812301636, + "learning_rate": 4.427174799119208e-06, + "loss": 0.156, + "step": 2253 + }, + { + "epoch": 0.7303953337653921, + "grad_norm": 0.8693944215774536, + "learning_rate": 4.426617562249405e-06, + "loss": 0.1605, + "step": 2254 + }, + { + "epoch": 0.7307193778353857, + "grad_norm": 0.9276489019393921, + "learning_rate": 4.426060089580262e-06, + "loss": 0.1643, + "step": 2255 + }, + { + "epoch": 0.7310434219053791, + "grad_norm": 0.8228742480278015, + "learning_rate": 4.42550238118001e-06, + "loss": 0.1482, + "step": 2256 + }, + { + "epoch": 0.7313674659753726, + "grad_norm": 0.9286083579063416, + "learning_rate": 4.424944437116907e-06, + "loss": 0.1643, + "step": 2257 + }, + { + "epoch": 0.7316915100453661, + "grad_norm": 0.8886469602584839, + "learning_rate": 4.424386257459241e-06, + "loss": 0.169, + "step": 2258 + }, + { + "epoch": 0.7320155541153597, + "grad_norm": 0.9932120442390442, + "learning_rate": 4.423827842275325e-06, + "loss": 0.1628, + "step": 2259 + }, + { + "epoch": 0.7323395981853532, + "grad_norm": 0.8697348237037659, + "learning_rate": 4.4232691916335055e-06, + "loss": 0.1387, + "step": 2260 + }, + { + "epoch": 0.7326636422553467, + "grad_norm": 0.9891642928123474, + "learning_rate": 4.422710305602156e-06, + "loss": 0.1627, + "step": 2261 + }, + { + "epoch": 0.7329876863253403, + "grad_norm": 0.8648738861083984, + "learning_rate": 4.422151184249679e-06, + "loss": 0.1644, + "step": 2262 + }, + { + "epoch": 0.7333117303953338, + "grad_norm": 0.8476166129112244, + "learning_rate": 4.421591827644503e-06, + "loss": 0.1588, + "step": 2263 + }, + { + "epoch": 0.7336357744653272, + "grad_norm": 0.9727128744125366, + "learning_rate": 4.4210322358550915e-06, + "loss": 0.167, + "step": 2264 + }, + { + "epoch": 0.7339598185353208, + "grad_norm": 0.8248828649520874, + "learning_rate": 4.420472408949931e-06, + "loss": 0.141, + "step": 2265 + }, + { + "epoch": 0.7342838626053143, + "grad_norm": 0.8558880090713501, + "learning_rate": 4.419912346997539e-06, + "loss": 0.1369, + "step": 2266 + }, + { + "epoch": 0.7346079066753078, + "grad_norm": 0.9087838530540466, + "learning_rate": 4.419352050066462e-06, + "loss": 0.1654, + "step": 2267 + }, + { + "epoch": 0.7349319507453014, + "grad_norm": 0.8426334857940674, + "learning_rate": 4.418791518225275e-06, + "loss": 0.1451, + "step": 2268 + }, + { + "epoch": 0.7352559948152949, + "grad_norm": 0.8864561915397644, + "learning_rate": 4.418230751542581e-06, + "loss": 0.1666, + "step": 2269 + }, + { + "epoch": 0.7355800388852884, + "grad_norm": 0.8837295174598694, + "learning_rate": 4.417669750087014e-06, + "loss": 0.1488, + "step": 2270 + }, + { + "epoch": 0.7359040829552819, + "grad_norm": 0.8623096942901611, + "learning_rate": 4.417108513927233e-06, + "loss": 0.1403, + "step": 2271 + }, + { + "epoch": 0.7362281270252754, + "grad_norm": 0.841560423374176, + "learning_rate": 4.416547043131929e-06, + "loss": 0.1416, + "step": 2272 + }, + { + "epoch": 0.7365521710952689, + "grad_norm": 0.935539960861206, + "learning_rate": 4.41598533776982e-06, + "loss": 0.1668, + "step": 2273 + }, + { + "epoch": 0.7368762151652625, + "grad_norm": 0.8458617329597473, + "learning_rate": 4.415423397909655e-06, + "loss": 0.1599, + "step": 2274 + }, + { + "epoch": 0.737200259235256, + "grad_norm": 0.7807754278182983, + "learning_rate": 4.414861223620209e-06, + "loss": 0.1348, + "step": 2275 + }, + { + "epoch": 0.7375243033052495, + "grad_norm": 0.8110257983207703, + "learning_rate": 4.414298814970286e-06, + "loss": 0.1532, + "step": 2276 + }, + { + "epoch": 0.7378483473752431, + "grad_norm": 0.8469443917274475, + "learning_rate": 4.41373617202872e-06, + "loss": 0.15, + "step": 2277 + }, + { + "epoch": 0.7381723914452365, + "grad_norm": 0.8829174041748047, + "learning_rate": 4.413173294864373e-06, + "loss": 0.1585, + "step": 2278 + }, + { + "epoch": 0.73849643551523, + "grad_norm": 0.7983418703079224, + "learning_rate": 4.412610183546135e-06, + "loss": 0.1429, + "step": 2279 + }, + { + "epoch": 0.7388204795852236, + "grad_norm": 0.8902999758720398, + "learning_rate": 4.412046838142927e-06, + "loss": 0.1519, + "step": 2280 + }, + { + "epoch": 0.7391445236552171, + "grad_norm": 0.8876487612724304, + "learning_rate": 4.411483258723695e-06, + "loss": 0.155, + "step": 2281 + }, + { + "epoch": 0.7394685677252106, + "grad_norm": 0.8737650513648987, + "learning_rate": 4.410919445357418e-06, + "loss": 0.1547, + "step": 2282 + }, + { + "epoch": 0.7397926117952042, + "grad_norm": 0.8292451500892639, + "learning_rate": 4.410355398113099e-06, + "loss": 0.1442, + "step": 2283 + }, + { + "epoch": 0.7401166558651977, + "grad_norm": 0.8985834121704102, + "learning_rate": 4.409791117059773e-06, + "loss": 0.1526, + "step": 2284 + }, + { + "epoch": 0.7404406999351912, + "grad_norm": 0.841582715511322, + "learning_rate": 4.409226602266503e-06, + "loss": 0.1523, + "step": 2285 + }, + { + "epoch": 0.7407647440051847, + "grad_norm": 0.8735675811767578, + "learning_rate": 4.408661853802379e-06, + "loss": 0.1638, + "step": 2286 + }, + { + "epoch": 0.7410887880751782, + "grad_norm": 0.9003371596336365, + "learning_rate": 4.408096871736522e-06, + "loss": 0.1668, + "step": 2287 + }, + { + "epoch": 0.7414128321451717, + "grad_norm": 0.8409478068351746, + "learning_rate": 4.407531656138079e-06, + "loss": 0.1567, + "step": 2288 + }, + { + "epoch": 0.7417368762151653, + "grad_norm": 0.8526241183280945, + "learning_rate": 4.406966207076229e-06, + "loss": 0.1491, + "step": 2289 + }, + { + "epoch": 0.7420609202851588, + "grad_norm": 0.8444311618804932, + "learning_rate": 4.406400524620174e-06, + "loss": 0.1456, + "step": 2290 + }, + { + "epoch": 0.7423849643551523, + "grad_norm": 0.8632143139839172, + "learning_rate": 4.405834608839152e-06, + "loss": 0.1535, + "step": 2291 + }, + { + "epoch": 0.7427090084251459, + "grad_norm": 0.8817870020866394, + "learning_rate": 4.405268459802423e-06, + "loss": 0.1597, + "step": 2292 + }, + { + "epoch": 0.7430330524951393, + "grad_norm": 0.8612931370735168, + "learning_rate": 4.404702077579279e-06, + "loss": 0.1703, + "step": 2293 + }, + { + "epoch": 0.7433570965651328, + "grad_norm": 0.8779714703559875, + "learning_rate": 4.4041354622390395e-06, + "loss": 0.146, + "step": 2294 + }, + { + "epoch": 0.7436811406351264, + "grad_norm": 0.9611407518386841, + "learning_rate": 4.403568613851054e-06, + "loss": 0.1677, + "step": 2295 + }, + { + "epoch": 0.7440051847051199, + "grad_norm": 0.8865842223167419, + "learning_rate": 4.403001532484697e-06, + "loss": 0.159, + "step": 2296 + }, + { + "epoch": 0.7443292287751134, + "grad_norm": 0.8867451548576355, + "learning_rate": 4.4024342182093745e-06, + "loss": 0.1642, + "step": 2297 + }, + { + "epoch": 0.744653272845107, + "grad_norm": 0.828064501285553, + "learning_rate": 4.401866671094522e-06, + "loss": 0.15, + "step": 2298 + }, + { + "epoch": 0.7449773169151005, + "grad_norm": 0.8317776918411255, + "learning_rate": 4.4012988912096e-06, + "loss": 0.1406, + "step": 2299 + }, + { + "epoch": 0.7453013609850939, + "grad_norm": 0.8799874186515808, + "learning_rate": 4.4007308786241e-06, + "loss": 0.1477, + "step": 2300 + }, + { + "epoch": 0.7456254050550875, + "grad_norm": 0.8931517004966736, + "learning_rate": 4.40016263340754e-06, + "loss": 0.1444, + "step": 2301 + }, + { + "epoch": 0.745949449125081, + "grad_norm": 0.8301799893379211, + "learning_rate": 4.399594155629469e-06, + "loss": 0.1455, + "step": 2302 + }, + { + "epoch": 0.7462734931950745, + "grad_norm": 0.8843263387680054, + "learning_rate": 4.3990254453594634e-06, + "loss": 0.146, + "step": 2303 + }, + { + "epoch": 0.7465975372650681, + "grad_norm": 0.8951470255851746, + "learning_rate": 4.398456502667127e-06, + "loss": 0.1623, + "step": 2304 + }, + { + "epoch": 0.7469215813350616, + "grad_norm": 0.9557040333747864, + "learning_rate": 4.397887327622093e-06, + "loss": 0.1693, + "step": 2305 + }, + { + "epoch": 0.7472456254050551, + "grad_norm": 0.8738280534744263, + "learning_rate": 4.397317920294023e-06, + "loss": 0.1553, + "step": 2306 + }, + { + "epoch": 0.7475696694750487, + "grad_norm": 0.9042795896530151, + "learning_rate": 4.396748280752608e-06, + "loss": 0.1601, + "step": 2307 + }, + { + "epoch": 0.7478937135450421, + "grad_norm": 0.8857934474945068, + "learning_rate": 4.396178409067564e-06, + "loss": 0.1633, + "step": 2308 + }, + { + "epoch": 0.7482177576150356, + "grad_norm": 0.9887219071388245, + "learning_rate": 4.395608305308639e-06, + "loss": 0.1663, + "step": 2309 + }, + { + "epoch": 0.7485418016850292, + "grad_norm": 0.9533881545066833, + "learning_rate": 4.395037969545609e-06, + "loss": 0.1609, + "step": 2310 + }, + { + "epoch": 0.7488658457550227, + "grad_norm": 0.8651255965232849, + "learning_rate": 4.394467401848277e-06, + "loss": 0.1505, + "step": 2311 + }, + { + "epoch": 0.7491898898250162, + "grad_norm": 0.8285210132598877, + "learning_rate": 4.393896602286475e-06, + "loss": 0.1569, + "step": 2312 + }, + { + "epoch": 0.7495139338950098, + "grad_norm": 0.8599982857704163, + "learning_rate": 4.3933255709300635e-06, + "loss": 0.166, + "step": 2313 + }, + { + "epoch": 0.7498379779650033, + "grad_norm": 0.8919265270233154, + "learning_rate": 4.3927543078489295e-06, + "loss": 0.1662, + "step": 2314 + }, + { + "epoch": 0.7501620220349967, + "grad_norm": 0.8909933567047119, + "learning_rate": 4.392182813112993e-06, + "loss": 0.1619, + "step": 2315 + }, + { + "epoch": 0.7504860661049902, + "grad_norm": 0.9427803158760071, + "learning_rate": 4.391611086792198e-06, + "loss": 0.1523, + "step": 2316 + }, + { + "epoch": 0.7508101101749838, + "grad_norm": 0.8598654270172119, + "learning_rate": 4.391039128956517e-06, + "loss": 0.1614, + "step": 2317 + }, + { + "epoch": 0.7511341542449773, + "grad_norm": 0.9585804343223572, + "learning_rate": 4.390466939675954e-06, + "loss": 0.1602, + "step": 2318 + }, + { + "epoch": 0.7514581983149708, + "grad_norm": 0.7937201857566833, + "learning_rate": 4.389894519020539e-06, + "loss": 0.1382, + "step": 2319 + }, + { + "epoch": 0.7517822423849644, + "grad_norm": 0.8627025485038757, + "learning_rate": 4.38932186706033e-06, + "loss": 0.1467, + "step": 2320 + }, + { + "epoch": 0.7521062864549579, + "grad_norm": 0.9446385502815247, + "learning_rate": 4.388748983865414e-06, + "loss": 0.1666, + "step": 2321 + }, + { + "epoch": 0.7524303305249513, + "grad_norm": 0.869536280632019, + "learning_rate": 4.388175869505908e-06, + "loss": 0.148, + "step": 2322 + }, + { + "epoch": 0.7527543745949449, + "grad_norm": 0.8737410306930542, + "learning_rate": 4.387602524051954e-06, + "loss": 0.1463, + "step": 2323 + }, + { + "epoch": 0.7530784186649384, + "grad_norm": 0.8531762957572937, + "learning_rate": 4.387028947573724e-06, + "loss": 0.136, + "step": 2324 + }, + { + "epoch": 0.7534024627349319, + "grad_norm": 0.9215272068977356, + "learning_rate": 4.3864551401414195e-06, + "loss": 0.166, + "step": 2325 + }, + { + "epoch": 0.7537265068049255, + "grad_norm": 0.8515351414680481, + "learning_rate": 4.385881101825268e-06, + "loss": 0.16, + "step": 2326 + }, + { + "epoch": 0.754050550874919, + "grad_norm": 0.8563395738601685, + "learning_rate": 4.385306832695526e-06, + "loss": 0.1488, + "step": 2327 + }, + { + "epoch": 0.7543745949449125, + "grad_norm": 0.8613479733467102, + "learning_rate": 4.384732332822479e-06, + "loss": 0.146, + "step": 2328 + }, + { + "epoch": 0.7546986390149061, + "grad_norm": 0.8307859897613525, + "learning_rate": 4.38415760227644e-06, + "loss": 0.1554, + "step": 2329 + }, + { + "epoch": 0.7550226830848995, + "grad_norm": 0.8669750690460205, + "learning_rate": 4.38358264112775e-06, + "loss": 0.1494, + "step": 2330 + }, + { + "epoch": 0.755346727154893, + "grad_norm": 0.986054003238678, + "learning_rate": 4.3830074494467815e-06, + "loss": 0.1739, + "step": 2331 + }, + { + "epoch": 0.7556707712248866, + "grad_norm": 0.8789486885070801, + "learning_rate": 4.382432027303928e-06, + "loss": 0.1444, + "step": 2332 + }, + { + "epoch": 0.7559948152948801, + "grad_norm": 0.8285205960273743, + "learning_rate": 4.381856374769617e-06, + "loss": 0.1608, + "step": 2333 + }, + { + "epoch": 0.7563188593648736, + "grad_norm": 0.9019232392311096, + "learning_rate": 4.3812804919143055e-06, + "loss": 0.1461, + "step": 2334 + }, + { + "epoch": 0.7566429034348672, + "grad_norm": 0.9552561640739441, + "learning_rate": 4.380704378808473e-06, + "loss": 0.1699, + "step": 2335 + }, + { + "epoch": 0.7569669475048607, + "grad_norm": 0.9247875213623047, + "learning_rate": 4.380128035522632e-06, + "loss": 0.1642, + "step": 2336 + }, + { + "epoch": 0.7572909915748541, + "grad_norm": 0.9469590783119202, + "learning_rate": 4.379551462127319e-06, + "loss": 0.1793, + "step": 2337 + }, + { + "epoch": 0.7576150356448477, + "grad_norm": 0.9114015698432922, + "learning_rate": 4.3789746586931034e-06, + "loss": 0.1636, + "step": 2338 + }, + { + "epoch": 0.7579390797148412, + "grad_norm": 0.8937296271324158, + "learning_rate": 4.37839762529058e-06, + "loss": 0.1426, + "step": 2339 + }, + { + "epoch": 0.7582631237848347, + "grad_norm": 0.9419205188751221, + "learning_rate": 4.3778203619903716e-06, + "loss": 0.1729, + "step": 2340 + }, + { + "epoch": 0.7585871678548283, + "grad_norm": 0.8397945165634155, + "learning_rate": 4.3772428688631285e-06, + "loss": 0.1488, + "step": 2341 + }, + { + "epoch": 0.7589112119248218, + "grad_norm": 0.8474755883216858, + "learning_rate": 4.376665145979532e-06, + "loss": 0.1493, + "step": 2342 + }, + { + "epoch": 0.7592352559948153, + "grad_norm": 0.9139355421066284, + "learning_rate": 4.376087193410289e-06, + "loss": 0.1618, + "step": 2343 + }, + { + "epoch": 0.7595593000648088, + "grad_norm": 0.9013419151306152, + "learning_rate": 4.375509011226135e-06, + "loss": 0.1483, + "step": 2344 + }, + { + "epoch": 0.7598833441348023, + "grad_norm": 0.881345808506012, + "learning_rate": 4.374930599497835e-06, + "loss": 0.1484, + "step": 2345 + }, + { + "epoch": 0.7602073882047958, + "grad_norm": 0.882448673248291, + "learning_rate": 4.37435195829618e-06, + "loss": 0.1491, + "step": 2346 + }, + { + "epoch": 0.7605314322747894, + "grad_norm": 0.9138561487197876, + "learning_rate": 4.373773087691992e-06, + "loss": 0.1575, + "step": 2347 + }, + { + "epoch": 0.7608554763447829, + "grad_norm": 0.7963294982910156, + "learning_rate": 4.373193987756116e-06, + "loss": 0.1482, + "step": 2348 + }, + { + "epoch": 0.7611795204147764, + "grad_norm": 0.8454092144966125, + "learning_rate": 4.3726146585594296e-06, + "loss": 0.1556, + "step": 2349 + }, + { + "epoch": 0.76150356448477, + "grad_norm": 0.9024544358253479, + "learning_rate": 4.372035100172838e-06, + "loss": 0.1663, + "step": 2350 + }, + { + "epoch": 0.7618276085547635, + "grad_norm": 0.8852647542953491, + "learning_rate": 4.371455312667272e-06, + "loss": 0.1622, + "step": 2351 + }, + { + "epoch": 0.7621516526247569, + "grad_norm": 0.9493899345397949, + "learning_rate": 4.370875296113694e-06, + "loss": 0.1781, + "step": 2352 + }, + { + "epoch": 0.7624756966947505, + "grad_norm": 0.847676694393158, + "learning_rate": 4.370295050583091e-06, + "loss": 0.1501, + "step": 2353 + }, + { + "epoch": 0.762799740764744, + "grad_norm": 0.9140247702598572, + "learning_rate": 4.3697145761464785e-06, + "loss": 0.1599, + "step": 2354 + }, + { + "epoch": 0.7631237848347375, + "grad_norm": 0.8939958214759827, + "learning_rate": 4.369133872874903e-06, + "loss": 0.1621, + "step": 2355 + }, + { + "epoch": 0.7634478289047311, + "grad_norm": 0.8538257479667664, + "learning_rate": 4.368552940839436e-06, + "loss": 0.1685, + "step": 2356 + }, + { + "epoch": 0.7637718729747246, + "grad_norm": 0.9288912415504456, + "learning_rate": 4.367971780111179e-06, + "loss": 0.1538, + "step": 2357 + }, + { + "epoch": 0.7640959170447181, + "grad_norm": 0.830361545085907, + "learning_rate": 4.367390390761258e-06, + "loss": 0.1318, + "step": 2358 + }, + { + "epoch": 0.7644199611147116, + "grad_norm": 0.8867520093917847, + "learning_rate": 4.3668087728608314e-06, + "loss": 0.1494, + "step": 2359 + }, + { + "epoch": 0.7647440051847051, + "grad_norm": 0.887715220451355, + "learning_rate": 4.366226926481083e-06, + "loss": 0.1604, + "step": 2360 + }, + { + "epoch": 0.7650680492546986, + "grad_norm": 0.9055389761924744, + "learning_rate": 4.365644851693226e-06, + "loss": 0.1605, + "step": 2361 + }, + { + "epoch": 0.7653920933246922, + "grad_norm": 0.8388006687164307, + "learning_rate": 4.3650625485685e-06, + "loss": 0.1508, + "step": 2362 + }, + { + "epoch": 0.7657161373946857, + "grad_norm": 0.8231101036071777, + "learning_rate": 4.364480017178172e-06, + "loss": 0.147, + "step": 2363 + }, + { + "epoch": 0.7660401814646792, + "grad_norm": 0.9128398895263672, + "learning_rate": 4.36389725759354e-06, + "loss": 0.169, + "step": 2364 + }, + { + "epoch": 0.7663642255346728, + "grad_norm": 0.8485229015350342, + "learning_rate": 4.363314269885928e-06, + "loss": 0.1539, + "step": 2365 + }, + { + "epoch": 0.7666882696046662, + "grad_norm": 0.9628329277038574, + "learning_rate": 4.362731054126687e-06, + "loss": 0.1779, + "step": 2366 + }, + { + "epoch": 0.7670123136746597, + "grad_norm": 0.855089008808136, + "learning_rate": 4.362147610387198e-06, + "loss": 0.1575, + "step": 2367 + }, + { + "epoch": 0.7673363577446533, + "grad_norm": 0.8463426828384399, + "learning_rate": 4.361563938738869e-06, + "loss": 0.1525, + "step": 2368 + }, + { + "epoch": 0.7676604018146468, + "grad_norm": 0.8714507818222046, + "learning_rate": 4.3609800392531345e-06, + "loss": 0.163, + "step": 2369 + }, + { + "epoch": 0.7679844458846403, + "grad_norm": 0.9294506907463074, + "learning_rate": 4.36039591200146e-06, + "loss": 0.176, + "step": 2370 + }, + { + "epoch": 0.7683084899546339, + "grad_norm": 0.8144478797912598, + "learning_rate": 4.359811557055335e-06, + "loss": 0.1452, + "step": 2371 + }, + { + "epoch": 0.7686325340246274, + "grad_norm": 0.9185246825218201, + "learning_rate": 4.3592269744862794e-06, + "loss": 0.1645, + "step": 2372 + }, + { + "epoch": 0.7689565780946209, + "grad_norm": 0.8482814431190491, + "learning_rate": 4.3586421643658404e-06, + "loss": 0.1533, + "step": 2373 + }, + { + "epoch": 0.7692806221646143, + "grad_norm": 0.8046854138374329, + "learning_rate": 4.3580571267655945e-06, + "loss": 0.1535, + "step": 2374 + }, + { + "epoch": 0.7696046662346079, + "grad_norm": 0.858644425868988, + "learning_rate": 4.357471861757144e-06, + "loss": 0.1616, + "step": 2375 + }, + { + "epoch": 0.7699287103046014, + "grad_norm": 0.8516557812690735, + "learning_rate": 4.3568863694121185e-06, + "loss": 0.1598, + "step": 2376 + }, + { + "epoch": 0.770252754374595, + "grad_norm": 0.868341863155365, + "learning_rate": 4.356300649802178e-06, + "loss": 0.1553, + "step": 2377 + }, + { + "epoch": 0.7705767984445885, + "grad_norm": 0.8729805946350098, + "learning_rate": 4.355714702999008e-06, + "loss": 0.1544, + "step": 2378 + }, + { + "epoch": 0.770900842514582, + "grad_norm": 0.847970187664032, + "learning_rate": 4.355128529074323e-06, + "loss": 0.1426, + "step": 2379 + }, + { + "epoch": 0.7712248865845756, + "grad_norm": 0.916247546672821, + "learning_rate": 4.354542128099866e-06, + "loss": 0.1604, + "step": 2380 + }, + { + "epoch": 0.771548930654569, + "grad_norm": 0.8671932816505432, + "learning_rate": 4.353955500147405e-06, + "loss": 0.1557, + "step": 2381 + }, + { + "epoch": 0.7718729747245625, + "grad_norm": 0.9008615016937256, + "learning_rate": 4.353368645288738e-06, + "loss": 0.1552, + "step": 2382 + }, + { + "epoch": 0.772197018794556, + "grad_norm": 0.8007634878158569, + "learning_rate": 4.352781563595691e-06, + "loss": 0.1498, + "step": 2383 + }, + { + "epoch": 0.7725210628645496, + "grad_norm": 0.9064083099365234, + "learning_rate": 4.352194255140118e-06, + "loss": 0.1662, + "step": 2384 + }, + { + "epoch": 0.7728451069345431, + "grad_norm": 0.8306066989898682, + "learning_rate": 4.351606719993899e-06, + "loss": 0.147, + "step": 2385 + }, + { + "epoch": 0.7731691510045366, + "grad_norm": 0.8699860572814941, + "learning_rate": 4.351018958228941e-06, + "loss": 0.1704, + "step": 2386 + }, + { + "epoch": 0.7734931950745302, + "grad_norm": 0.9070653319358826, + "learning_rate": 4.350430969917182e-06, + "loss": 0.1594, + "step": 2387 + }, + { + "epoch": 0.7738172391445236, + "grad_norm": 0.9005059599876404, + "learning_rate": 4.349842755130587e-06, + "loss": 0.1675, + "step": 2388 + }, + { + "epoch": 0.7741412832145171, + "grad_norm": 0.8573256731033325, + "learning_rate": 4.349254313941146e-06, + "loss": 0.1646, + "step": 2389 + }, + { + "epoch": 0.7744653272845107, + "grad_norm": 0.8498200178146362, + "learning_rate": 4.3486656464208785e-06, + "loss": 0.1666, + "step": 2390 + }, + { + "epoch": 0.7747893713545042, + "grad_norm": 0.7561559081077576, + "learning_rate": 4.348076752641834e-06, + "loss": 0.13, + "step": 2391 + }, + { + "epoch": 0.7751134154244977, + "grad_norm": 0.8790591359138489, + "learning_rate": 4.347487632676084e-06, + "loss": 0.1481, + "step": 2392 + }, + { + "epoch": 0.7754374594944913, + "grad_norm": 0.8516284823417664, + "learning_rate": 4.346898286595733e-06, + "loss": 0.1547, + "step": 2393 + }, + { + "epoch": 0.7757615035644848, + "grad_norm": 0.803137481212616, + "learning_rate": 4.3463087144729115e-06, + "loss": 0.1431, + "step": 2394 + }, + { + "epoch": 0.7760855476344782, + "grad_norm": 0.796620786190033, + "learning_rate": 4.3457189163797776e-06, + "loss": 0.1441, + "step": 2395 + }, + { + "epoch": 0.7764095917044718, + "grad_norm": 0.9133745431900024, + "learning_rate": 4.345128892388515e-06, + "loss": 0.1605, + "step": 2396 + }, + { + "epoch": 0.7767336357744653, + "grad_norm": 0.8595781326293945, + "learning_rate": 4.344538642571339e-06, + "loss": 0.1483, + "step": 2397 + }, + { + "epoch": 0.7770576798444588, + "grad_norm": 0.8615599274635315, + "learning_rate": 4.3439481670004895e-06, + "loss": 0.1482, + "step": 2398 + }, + { + "epoch": 0.7773817239144524, + "grad_norm": 0.852824866771698, + "learning_rate": 4.343357465748235e-06, + "loss": 0.1526, + "step": 2399 + }, + { + "epoch": 0.7777057679844459, + "grad_norm": 0.7731544375419617, + "learning_rate": 4.342766538886872e-06, + "loss": 0.1371, + "step": 2400 + }, + { + "epoch": 0.7780298120544394, + "grad_norm": 0.920595645904541, + "learning_rate": 4.342175386488724e-06, + "loss": 0.1728, + "step": 2401 + }, + { + "epoch": 0.778353856124433, + "grad_norm": 0.8988751769065857, + "learning_rate": 4.341584008626143e-06, + "loss": 0.1701, + "step": 2402 + }, + { + "epoch": 0.7786779001944264, + "grad_norm": 0.8690599799156189, + "learning_rate": 4.340992405371506e-06, + "loss": 0.1398, + "step": 2403 + }, + { + "epoch": 0.7790019442644199, + "grad_norm": 0.8576830625534058, + "learning_rate": 4.340400576797221e-06, + "loss": 0.1348, + "step": 2404 + }, + { + "epoch": 0.7793259883344135, + "grad_norm": 0.8800456523895264, + "learning_rate": 4.339808522975722e-06, + "loss": 0.1526, + "step": 2405 + }, + { + "epoch": 0.779650032404407, + "grad_norm": 0.8863540291786194, + "learning_rate": 4.339216243979471e-06, + "loss": 0.1441, + "step": 2406 + }, + { + "epoch": 0.7799740764744005, + "grad_norm": 0.92470383644104, + "learning_rate": 4.3386237398809576e-06, + "loss": 0.1441, + "step": 2407 + }, + { + "epoch": 0.7802981205443941, + "grad_norm": 0.9302868247032166, + "learning_rate": 4.338031010752696e-06, + "loss": 0.1704, + "step": 2408 + }, + { + "epoch": 0.7806221646143876, + "grad_norm": 0.9741601347923279, + "learning_rate": 4.337438056667233e-06, + "loss": 0.1597, + "step": 2409 + }, + { + "epoch": 0.780946208684381, + "grad_norm": 0.8258799314498901, + "learning_rate": 4.336844877697139e-06, + "loss": 0.1507, + "step": 2410 + }, + { + "epoch": 0.7812702527543746, + "grad_norm": 0.8625882863998413, + "learning_rate": 4.336251473915015e-06, + "loss": 0.154, + "step": 2411 + }, + { + "epoch": 0.7815942968243681, + "grad_norm": 0.929656445980072, + "learning_rate": 4.335657845393486e-06, + "loss": 0.1698, + "step": 2412 + }, + { + "epoch": 0.7819183408943616, + "grad_norm": 0.8270826935768127, + "learning_rate": 4.335063992205207e-06, + "loss": 0.1537, + "step": 2413 + }, + { + "epoch": 0.7822423849643552, + "grad_norm": 0.8252003788948059, + "learning_rate": 4.3344699144228605e-06, + "loss": 0.1365, + "step": 2414 + }, + { + "epoch": 0.7825664290343487, + "grad_norm": 0.8627497553825378, + "learning_rate": 4.333875612119156e-06, + "loss": 0.1558, + "step": 2415 + }, + { + "epoch": 0.7828904731043422, + "grad_norm": 0.905093252658844, + "learning_rate": 4.333281085366829e-06, + "loss": 0.1434, + "step": 2416 + }, + { + "epoch": 0.7832145171743357, + "grad_norm": 0.8470600247383118, + "learning_rate": 4.332686334238646e-06, + "loss": 0.1496, + "step": 2417 + }, + { + "epoch": 0.7835385612443292, + "grad_norm": 0.9364388585090637, + "learning_rate": 4.332091358807397e-06, + "loss": 0.1678, + "step": 2418 + }, + { + "epoch": 0.7838626053143227, + "grad_norm": 0.7869712710380554, + "learning_rate": 4.3314961591459015e-06, + "loss": 0.1384, + "step": 2419 + }, + { + "epoch": 0.7841866493843163, + "grad_norm": 0.824519157409668, + "learning_rate": 4.330900735327006e-06, + "loss": 0.1475, + "step": 2420 + }, + { + "epoch": 0.7845106934543098, + "grad_norm": 0.8825136423110962, + "learning_rate": 4.330305087423585e-06, + "loss": 0.1589, + "step": 2421 + }, + { + "epoch": 0.7848347375243033, + "grad_norm": 0.8884319067001343, + "learning_rate": 4.329709215508541e-06, + "loss": 0.1592, + "step": 2422 + }, + { + "epoch": 0.7851587815942969, + "grad_norm": 0.8827762007713318, + "learning_rate": 4.329113119654801e-06, + "loss": 0.1526, + "step": 2423 + }, + { + "epoch": 0.7854828256642904, + "grad_norm": 0.8353465795516968, + "learning_rate": 4.328516799935323e-06, + "loss": 0.1563, + "step": 2424 + }, + { + "epoch": 0.7858068697342838, + "grad_norm": 0.8486441373825073, + "learning_rate": 4.327920256423089e-06, + "loss": 0.1582, + "step": 2425 + }, + { + "epoch": 0.7861309138042774, + "grad_norm": 0.9012899398803711, + "learning_rate": 4.3273234891911135e-06, + "loss": 0.1608, + "step": 2426 + }, + { + "epoch": 0.7864549578742709, + "grad_norm": 0.8898869752883911, + "learning_rate": 4.3267264983124304e-06, + "loss": 0.1568, + "step": 2427 + }, + { + "epoch": 0.7867790019442644, + "grad_norm": 0.9775742888450623, + "learning_rate": 4.326129283860109e-06, + "loss": 0.1559, + "step": 2428 + }, + { + "epoch": 0.787103046014258, + "grad_norm": 0.8252977132797241, + "learning_rate": 4.3255318459072415e-06, + "loss": 0.1414, + "step": 2429 + }, + { + "epoch": 0.7874270900842515, + "grad_norm": 0.962891697883606, + "learning_rate": 4.324934184526949e-06, + "loss": 0.1728, + "step": 2430 + }, + { + "epoch": 0.787751134154245, + "grad_norm": 0.8263359069824219, + "learning_rate": 4.324336299792378e-06, + "loss": 0.1541, + "step": 2431 + }, + { + "epoch": 0.7880751782242384, + "grad_norm": 0.8529121279716492, + "learning_rate": 4.3237381917767054e-06, + "loss": 0.1476, + "step": 2432 + }, + { + "epoch": 0.788399222294232, + "grad_norm": 0.8236402869224548, + "learning_rate": 4.323139860553133e-06, + "loss": 0.1552, + "step": 2433 + }, + { + "epoch": 0.7887232663642255, + "grad_norm": 0.8603233098983765, + "learning_rate": 4.3225413061948915e-06, + "loss": 0.1409, + "step": 2434 + }, + { + "epoch": 0.789047310434219, + "grad_norm": 0.8792175650596619, + "learning_rate": 4.321942528775238e-06, + "loss": 0.1503, + "step": 2435 + }, + { + "epoch": 0.7893713545042126, + "grad_norm": 0.8012834787368774, + "learning_rate": 4.3213435283674556e-06, + "loss": 0.1428, + "step": 2436 + }, + { + "epoch": 0.7896953985742061, + "grad_norm": 0.8402407169342041, + "learning_rate": 4.320744305044858e-06, + "loss": 0.1547, + "step": 2437 + }, + { + "epoch": 0.7900194426441997, + "grad_norm": 0.8767852187156677, + "learning_rate": 4.320144858880784e-06, + "loss": 0.156, + "step": 2438 + }, + { + "epoch": 0.7903434867141931, + "grad_norm": 0.7511712312698364, + "learning_rate": 4.319545189948599e-06, + "loss": 0.1397, + "step": 2439 + }, + { + "epoch": 0.7906675307841866, + "grad_norm": 0.7794116735458374, + "learning_rate": 4.318945298321698e-06, + "loss": 0.1305, + "step": 2440 + }, + { + "epoch": 0.7909915748541801, + "grad_norm": 0.8676954507827759, + "learning_rate": 4.3183451840735e-06, + "loss": 0.1573, + "step": 2441 + }, + { + "epoch": 0.7913156189241737, + "grad_norm": 0.8750085234642029, + "learning_rate": 4.3177448472774566e-06, + "loss": 0.1627, + "step": 2442 + }, + { + "epoch": 0.7916396629941672, + "grad_norm": 0.9190241694450378, + "learning_rate": 4.317144288007039e-06, + "loss": 0.144, + "step": 2443 + }, + { + "epoch": 0.7919637070641607, + "grad_norm": 0.8547205924987793, + "learning_rate": 4.316543506335752e-06, + "loss": 0.144, + "step": 2444 + }, + { + "epoch": 0.7922877511341543, + "grad_norm": 0.8410033583641052, + "learning_rate": 4.315942502337126e-06, + "loss": 0.1499, + "step": 2445 + }, + { + "epoch": 0.7926117952041478, + "grad_norm": 0.8728411197662354, + "learning_rate": 4.315341276084717e-06, + "loss": 0.1593, + "step": 2446 + }, + { + "epoch": 0.7929358392741412, + "grad_norm": 0.8590639233589172, + "learning_rate": 4.3147398276521105e-06, + "loss": 0.1559, + "step": 2447 + }, + { + "epoch": 0.7932598833441348, + "grad_norm": 0.788457989692688, + "learning_rate": 4.314138157112916e-06, + "loss": 0.1345, + "step": 2448 + }, + { + "epoch": 0.7935839274141283, + "grad_norm": 0.9031588435173035, + "learning_rate": 4.313536264540774e-06, + "loss": 0.1688, + "step": 2449 + }, + { + "epoch": 0.7939079714841218, + "grad_norm": 0.8987663388252258, + "learning_rate": 4.312934150009351e-06, + "loss": 0.1625, + "step": 2450 + }, + { + "epoch": 0.7942320155541154, + "grad_norm": 0.8170994520187378, + "learning_rate": 4.3123318135923355e-06, + "loss": 0.1302, + "step": 2451 + }, + { + "epoch": 0.7945560596241089, + "grad_norm": 0.8526625633239746, + "learning_rate": 4.311729255363453e-06, + "loss": 0.1456, + "step": 2452 + }, + { + "epoch": 0.7948801036941024, + "grad_norm": 0.7795225381851196, + "learning_rate": 4.3111264753964475e-06, + "loss": 0.1478, + "step": 2453 + }, + { + "epoch": 0.7952041477640959, + "grad_norm": 0.8411941528320312, + "learning_rate": 4.310523473765095e-06, + "loss": 0.1452, + "step": 2454 + }, + { + "epoch": 0.7955281918340894, + "grad_norm": 0.9050775766372681, + "learning_rate": 4.309920250543196e-06, + "loss": 0.1628, + "step": 2455 + }, + { + "epoch": 0.7958522359040829, + "grad_norm": 0.8332045674324036, + "learning_rate": 4.30931680580458e-06, + "loss": 0.1504, + "step": 2456 + }, + { + "epoch": 0.7961762799740765, + "grad_norm": 0.8788651823997498, + "learning_rate": 4.308713139623103e-06, + "loss": 0.1613, + "step": 2457 + }, + { + "epoch": 0.79650032404407, + "grad_norm": 0.8998255729675293, + "learning_rate": 4.308109252072647e-06, + "loss": 0.1557, + "step": 2458 + }, + { + "epoch": 0.7968243681140635, + "grad_norm": 0.8916454315185547, + "learning_rate": 4.307505143227122e-06, + "loss": 0.1641, + "step": 2459 + }, + { + "epoch": 0.7971484121840571, + "grad_norm": 0.8175183534622192, + "learning_rate": 4.306900813160466e-06, + "loss": 0.1479, + "step": 2460 + }, + { + "epoch": 0.7974724562540505, + "grad_norm": 0.8839316964149475, + "learning_rate": 4.306296261946643e-06, + "loss": 0.1449, + "step": 2461 + }, + { + "epoch": 0.797796500324044, + "grad_norm": 0.8067646026611328, + "learning_rate": 4.305691489659643e-06, + "loss": 0.1492, + "step": 2462 + }, + { + "epoch": 0.7981205443940376, + "grad_norm": 0.9073911905288696, + "learning_rate": 4.3050864963734854e-06, + "loss": 0.1603, + "step": 2463 + }, + { + "epoch": 0.7984445884640311, + "grad_norm": 0.893830418586731, + "learning_rate": 4.304481282162215e-06, + "loss": 0.16, + "step": 2464 + }, + { + "epoch": 0.7987686325340246, + "grad_norm": 0.9255582690238953, + "learning_rate": 4.3038758470999056e-06, + "loss": 0.1771, + "step": 2465 + }, + { + "epoch": 0.7990926766040182, + "grad_norm": 0.9087891578674316, + "learning_rate": 4.303270191260654e-06, + "loss": 0.138, + "step": 2466 + }, + { + "epoch": 0.7994167206740117, + "grad_norm": 0.8127539753913879, + "learning_rate": 4.302664314718588e-06, + "loss": 0.1409, + "step": 2467 + }, + { + "epoch": 0.7997407647440052, + "grad_norm": 0.8776745796203613, + "learning_rate": 4.302058217547862e-06, + "loss": 0.1619, + "step": 2468 + }, + { + "epoch": 0.8000648088139987, + "grad_norm": 0.8019484281539917, + "learning_rate": 4.301451899822655e-06, + "loss": 0.1527, + "step": 2469 + }, + { + "epoch": 0.8003888528839922, + "grad_norm": 1.013856291770935, + "learning_rate": 4.3008453616171746e-06, + "loss": 0.1623, + "step": 2470 + }, + { + "epoch": 0.8007128969539857, + "grad_norm": 0.9002841711044312, + "learning_rate": 4.300238603005656e-06, + "loss": 0.1626, + "step": 2471 + }, + { + "epoch": 0.8010369410239793, + "grad_norm": 0.8387237787246704, + "learning_rate": 4.299631624062359e-06, + "loss": 0.1596, + "step": 2472 + }, + { + "epoch": 0.8013609850939728, + "grad_norm": 0.8484622240066528, + "learning_rate": 4.299024424861574e-06, + "loss": 0.1568, + "step": 2473 + }, + { + "epoch": 0.8016850291639663, + "grad_norm": 0.7935303449630737, + "learning_rate": 4.298417005477616e-06, + "loss": 0.1405, + "step": 2474 + }, + { + "epoch": 0.8020090732339599, + "grad_norm": 0.8886042237281799, + "learning_rate": 4.2978093659848255e-06, + "loss": 0.1485, + "step": 2475 + }, + { + "epoch": 0.8023331173039533, + "grad_norm": 0.8659217953681946, + "learning_rate": 4.2972015064575726e-06, + "loss": 0.1561, + "step": 2476 + }, + { + "epoch": 0.8026571613739468, + "grad_norm": 0.8989936709403992, + "learning_rate": 4.2965934269702535e-06, + "loss": 0.1637, + "step": 2477 + }, + { + "epoch": 0.8029812054439404, + "grad_norm": 0.9043972492218018, + "learning_rate": 4.295985127597291e-06, + "loss": 0.1596, + "step": 2478 + }, + { + "epoch": 0.8033052495139339, + "grad_norm": 0.9152305126190186, + "learning_rate": 4.295376608413136e-06, + "loss": 0.169, + "step": 2479 + }, + { + "epoch": 0.8036292935839274, + "grad_norm": 0.8600202798843384, + "learning_rate": 4.294767869492265e-06, + "loss": 0.1575, + "step": 2480 + }, + { + "epoch": 0.803953337653921, + "grad_norm": 0.8142397999763489, + "learning_rate": 4.294158910909181e-06, + "loss": 0.1401, + "step": 2481 + }, + { + "epoch": 0.8042773817239145, + "grad_norm": 0.8893610835075378, + "learning_rate": 4.293549732738415e-06, + "loss": 0.1619, + "step": 2482 + }, + { + "epoch": 0.8046014257939079, + "grad_norm": 0.872360110282898, + "learning_rate": 4.2929403350545255e-06, + "loss": 0.1542, + "step": 2483 + }, + { + "epoch": 0.8049254698639015, + "grad_norm": 0.8567931652069092, + "learning_rate": 4.292330717932095e-06, + "loss": 0.1521, + "step": 2484 + }, + { + "epoch": 0.805249513933895, + "grad_norm": 0.8640723824501038, + "learning_rate": 4.2917208814457364e-06, + "loss": 0.1532, + "step": 2485 + }, + { + "epoch": 0.8055735580038885, + "grad_norm": 0.8216172456741333, + "learning_rate": 4.291110825670087e-06, + "loss": 0.1353, + "step": 2486 + }, + { + "epoch": 0.8058976020738821, + "grad_norm": 0.8190692067146301, + "learning_rate": 4.290500550679811e-06, + "loss": 0.1593, + "step": 2487 + }, + { + "epoch": 0.8062216461438756, + "grad_norm": 0.9348410367965698, + "learning_rate": 4.289890056549603e-06, + "loss": 0.1559, + "step": 2488 + }, + { + "epoch": 0.8065456902138691, + "grad_norm": 0.7952721118927002, + "learning_rate": 4.289279343354178e-06, + "loss": 0.1409, + "step": 2489 + }, + { + "epoch": 0.8068697342838627, + "grad_norm": 0.9495409727096558, + "learning_rate": 4.288668411168283e-06, + "loss": 0.1705, + "step": 2490 + }, + { + "epoch": 0.8071937783538561, + "grad_norm": 0.9091714024543762, + "learning_rate": 4.28805726006669e-06, + "loss": 0.1559, + "step": 2491 + }, + { + "epoch": 0.8075178224238496, + "grad_norm": 0.8902946710586548, + "learning_rate": 4.287445890124198e-06, + "loss": 0.1521, + "step": 2492 + }, + { + "epoch": 0.8078418664938432, + "grad_norm": 0.889609158039093, + "learning_rate": 4.286834301415634e-06, + "loss": 0.1493, + "step": 2493 + }, + { + "epoch": 0.8081659105638367, + "grad_norm": 0.9025107026100159, + "learning_rate": 4.286222494015848e-06, + "loss": 0.1489, + "step": 2494 + }, + { + "epoch": 0.8084899546338302, + "grad_norm": 0.8463943600654602, + "learning_rate": 4.285610467999722e-06, + "loss": 0.1407, + "step": 2495 + }, + { + "epoch": 0.8088139987038238, + "grad_norm": 0.9571568965911865, + "learning_rate": 4.28499822344216e-06, + "loss": 0.1551, + "step": 2496 + }, + { + "epoch": 0.8091380427738173, + "grad_norm": 0.8634231090545654, + "learning_rate": 4.2843857604180955e-06, + "loss": 0.1561, + "step": 2497 + }, + { + "epoch": 0.8094620868438107, + "grad_norm": 0.9505967497825623, + "learning_rate": 4.283773079002488e-06, + "loss": 0.1591, + "step": 2498 + }, + { + "epoch": 0.8097861309138042, + "grad_norm": 0.8230482339859009, + "learning_rate": 4.283160179270325e-06, + "loss": 0.1439, + "step": 2499 + }, + { + "epoch": 0.8101101749837978, + "grad_norm": 0.8318188786506653, + "learning_rate": 4.282547061296618e-06, + "loss": 0.1527, + "step": 2500 + }, + { + "epoch": 0.8104342190537913, + "grad_norm": 0.7912052869796753, + "learning_rate": 4.281933725156406e-06, + "loss": 0.1449, + "step": 2501 + }, + { + "epoch": 0.8107582631237849, + "grad_norm": 0.9153440594673157, + "learning_rate": 4.281320170924758e-06, + "loss": 0.1436, + "step": 2502 + }, + { + "epoch": 0.8110823071937784, + "grad_norm": 0.8174635171890259, + "learning_rate": 4.280706398676764e-06, + "loss": 0.1495, + "step": 2503 + }, + { + "epoch": 0.8114063512637719, + "grad_norm": 0.8115018010139465, + "learning_rate": 4.2800924084875465e-06, + "loss": 0.1494, + "step": 2504 + }, + { + "epoch": 0.8117303953337653, + "grad_norm": 0.82187819480896, + "learning_rate": 4.27947820043225e-06, + "loss": 0.1493, + "step": 2505 + }, + { + "epoch": 0.8120544394037589, + "grad_norm": 0.8348486423492432, + "learning_rate": 4.278863774586049e-06, + "loss": 0.1596, + "step": 2506 + }, + { + "epoch": 0.8123784834737524, + "grad_norm": 0.8637591600418091, + "learning_rate": 4.2782491310241426e-06, + "loss": 0.1504, + "step": 2507 + }, + { + "epoch": 0.812702527543746, + "grad_norm": 0.8602386713027954, + "learning_rate": 4.2776342698217575e-06, + "loss": 0.1504, + "step": 2508 + }, + { + "epoch": 0.8130265716137395, + "grad_norm": 0.9002025723457336, + "learning_rate": 4.277019191054146e-06, + "loss": 0.1616, + "step": 2509 + }, + { + "epoch": 0.813350615683733, + "grad_norm": 0.8553583025932312, + "learning_rate": 4.276403894796589e-06, + "loss": 0.1496, + "step": 2510 + }, + { + "epoch": 0.8136746597537265, + "grad_norm": 0.8963319659233093, + "learning_rate": 4.275788381124393e-06, + "loss": 0.1569, + "step": 2511 + }, + { + "epoch": 0.81399870382372, + "grad_norm": 0.8563810586929321, + "learning_rate": 4.275172650112889e-06, + "loss": 0.1492, + "step": 2512 + }, + { + "epoch": 0.8143227478937135, + "grad_norm": 0.8267882466316223, + "learning_rate": 4.274556701837438e-06, + "loss": 0.1425, + "step": 2513 + }, + { + "epoch": 0.814646791963707, + "grad_norm": 0.8709789514541626, + "learning_rate": 4.273940536373426e-06, + "loss": 0.1552, + "step": 2514 + }, + { + "epoch": 0.8149708360337006, + "grad_norm": 0.8784293532371521, + "learning_rate": 4.273324153796264e-06, + "loss": 0.1548, + "step": 2515 + }, + { + "epoch": 0.8152948801036941, + "grad_norm": 0.8772052526473999, + "learning_rate": 4.2727075541813945e-06, + "loss": 0.1597, + "step": 2516 + }, + { + "epoch": 0.8156189241736876, + "grad_norm": 0.7947052717208862, + "learning_rate": 4.27209073760428e-06, + "loss": 0.1505, + "step": 2517 + }, + { + "epoch": 0.8159429682436812, + "grad_norm": 0.8064588904380798, + "learning_rate": 4.271473704140415e-06, + "loss": 0.1517, + "step": 2518 + }, + { + "epoch": 0.8162670123136747, + "grad_norm": 0.8790615797042847, + "learning_rate": 4.270856453865318e-06, + "loss": 0.156, + "step": 2519 + }, + { + "epoch": 0.8165910563836681, + "grad_norm": 0.790266215801239, + "learning_rate": 4.270238986854534e-06, + "loss": 0.1371, + "step": 2520 + }, + { + "epoch": 0.8169151004536617, + "grad_norm": 0.9105173945426941, + "learning_rate": 4.2696213031836355e-06, + "loss": 0.1632, + "step": 2521 + }, + { + "epoch": 0.8172391445236552, + "grad_norm": 0.8295355439186096, + "learning_rate": 4.2690034029282214e-06, + "loss": 0.1537, + "step": 2522 + }, + { + "epoch": 0.8175631885936487, + "grad_norm": 0.8722296357154846, + "learning_rate": 4.268385286163915e-06, + "loss": 0.1675, + "step": 2523 + }, + { + "epoch": 0.8178872326636423, + "grad_norm": 0.8620377779006958, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1493, + "step": 2524 + }, + { + "epoch": 0.8182112767336358, + "grad_norm": 0.8079911470413208, + "learning_rate": 4.267148403411261e-06, + "loss": 0.1395, + "step": 2525 + }, + { + "epoch": 0.8185353208036293, + "grad_norm": 0.8399182558059692, + "learning_rate": 4.266529637574297e-06, + "loss": 0.1499, + "step": 2526 + }, + { + "epoch": 0.8188593648736228, + "grad_norm": 0.8858416676521301, + "learning_rate": 4.265910655531206e-06, + "loss": 0.1418, + "step": 2527 + }, + { + "epoch": 0.8191834089436163, + "grad_norm": 0.820108950138092, + "learning_rate": 4.265291457357746e-06, + "loss": 0.1467, + "step": 2528 + }, + { + "epoch": 0.8195074530136098, + "grad_norm": 0.8316980004310608, + "learning_rate": 4.2646720431297006e-06, + "loss": 0.1446, + "step": 2529 + }, + { + "epoch": 0.8198314970836034, + "grad_norm": 0.8442394137382507, + "learning_rate": 4.2640524129228815e-06, + "loss": 0.1541, + "step": 2530 + }, + { + "epoch": 0.8201555411535969, + "grad_norm": 0.8731728196144104, + "learning_rate": 4.263432566813123e-06, + "loss": 0.1597, + "step": 2531 + }, + { + "epoch": 0.8204795852235904, + "grad_norm": 0.8163385987281799, + "learning_rate": 4.262812504876291e-06, + "loss": 0.1365, + "step": 2532 + }, + { + "epoch": 0.820803629293584, + "grad_norm": 0.9060762524604797, + "learning_rate": 4.262192227188273e-06, + "loss": 0.1584, + "step": 2533 + }, + { + "epoch": 0.8211276733635774, + "grad_norm": 0.8710756301879883, + "learning_rate": 4.261571733824986e-06, + "loss": 0.1541, + "step": 2534 + }, + { + "epoch": 0.8214517174335709, + "grad_norm": 0.8472952842712402, + "learning_rate": 4.260951024862372e-06, + "loss": 0.15, + "step": 2535 + }, + { + "epoch": 0.8217757615035645, + "grad_norm": 0.8542965650558472, + "learning_rate": 4.2603301003763994e-06, + "loss": 0.1476, + "step": 2536 + }, + { + "epoch": 0.822099805573558, + "grad_norm": 0.8944793343544006, + "learning_rate": 4.259708960443065e-06, + "loss": 0.156, + "step": 2537 + }, + { + "epoch": 0.8224238496435515, + "grad_norm": 0.8457514643669128, + "learning_rate": 4.259087605138388e-06, + "loss": 0.1559, + "step": 2538 + }, + { + "epoch": 0.8227478937135451, + "grad_norm": 0.8715274333953857, + "learning_rate": 4.2584660345384176e-06, + "loss": 0.1461, + "step": 2539 + }, + { + "epoch": 0.8230719377835386, + "grad_norm": 0.8897663354873657, + "learning_rate": 4.257844248719229e-06, + "loss": 0.1685, + "step": 2540 + }, + { + "epoch": 0.8233959818535321, + "grad_norm": 0.8603126406669617, + "learning_rate": 4.25722224775692e-06, + "loss": 0.1481, + "step": 2541 + }, + { + "epoch": 0.8237200259235256, + "grad_norm": 0.8388012647628784, + "learning_rate": 4.25660003172762e-06, + "loss": 0.1575, + "step": 2542 + }, + { + "epoch": 0.8240440699935191, + "grad_norm": 0.8356682062149048, + "learning_rate": 4.255977600707481e-06, + "loss": 0.1575, + "step": 2543 + }, + { + "epoch": 0.8243681140635126, + "grad_norm": 0.8801903128623962, + "learning_rate": 4.255354954772684e-06, + "loss": 0.1464, + "step": 2544 + }, + { + "epoch": 0.8246921581335062, + "grad_norm": 0.8981055021286011, + "learning_rate": 4.2547320939994315e-06, + "loss": 0.1595, + "step": 2545 + }, + { + "epoch": 0.8250162022034997, + "grad_norm": 0.815737783908844, + "learning_rate": 4.25410901846396e-06, + "loss": 0.1444, + "step": 2546 + }, + { + "epoch": 0.8253402462734932, + "grad_norm": 0.9538549780845642, + "learning_rate": 4.253485728242525e-06, + "loss": 0.1531, + "step": 2547 + }, + { + "epoch": 0.8256642903434868, + "grad_norm": 0.8105667233467102, + "learning_rate": 4.252862223411412e-06, + "loss": 0.1458, + "step": 2548 + }, + { + "epoch": 0.8259883344134802, + "grad_norm": 0.7784798741340637, + "learning_rate": 4.252238504046931e-06, + "loss": 0.1307, + "step": 2549 + }, + { + "epoch": 0.8263123784834737, + "grad_norm": 0.9695181846618652, + "learning_rate": 4.251614570225421e-06, + "loss": 0.1674, + "step": 2550 + }, + { + "epoch": 0.8266364225534673, + "grad_norm": 0.8391134738922119, + "learning_rate": 4.250990422023243e-06, + "loss": 0.153, + "step": 2551 + }, + { + "epoch": 0.8269604666234608, + "grad_norm": 0.882595419883728, + "learning_rate": 4.250366059516791e-06, + "loss": 0.154, + "step": 2552 + }, + { + "epoch": 0.8272845106934543, + "grad_norm": 0.8489596247673035, + "learning_rate": 4.249741482782476e-06, + "loss": 0.1565, + "step": 2553 + }, + { + "epoch": 0.8276085547634479, + "grad_norm": 0.878032922744751, + "learning_rate": 4.249116691896743e-06, + "loss": 0.1506, + "step": 2554 + }, + { + "epoch": 0.8279325988334414, + "grad_norm": 0.8520094752311707, + "learning_rate": 4.248491686936059e-06, + "loss": 0.1559, + "step": 2555 + }, + { + "epoch": 0.8282566429034348, + "grad_norm": 0.8968580961227417, + "learning_rate": 4.2478664679769196e-06, + "loss": 0.1574, + "step": 2556 + }, + { + "epoch": 0.8285806869734283, + "grad_norm": 0.8315073251724243, + "learning_rate": 4.247241035095846e-06, + "loss": 0.1415, + "step": 2557 + }, + { + "epoch": 0.8289047310434219, + "grad_norm": 0.919426441192627, + "learning_rate": 4.246615388369384e-06, + "loss": 0.162, + "step": 2558 + }, + { + "epoch": 0.8292287751134154, + "grad_norm": 0.8385602235794067, + "learning_rate": 4.245989527874107e-06, + "loss": 0.1462, + "step": 2559 + }, + { + "epoch": 0.829552819183409, + "grad_norm": 0.892264723777771, + "learning_rate": 4.245363453686614e-06, + "loss": 0.1554, + "step": 2560 + }, + { + "epoch": 0.8298768632534025, + "grad_norm": 0.9176088571548462, + "learning_rate": 4.24473716588353e-06, + "loss": 0.1717, + "step": 2561 + }, + { + "epoch": 0.830200907323396, + "grad_norm": 0.8820486068725586, + "learning_rate": 4.2441106645415085e-06, + "loss": 0.1563, + "step": 2562 + }, + { + "epoch": 0.8305249513933896, + "grad_norm": 0.8998873233795166, + "learning_rate": 4.243483949737225e-06, + "loss": 0.175, + "step": 2563 + }, + { + "epoch": 0.830848995463383, + "grad_norm": 0.824447512626648, + "learning_rate": 4.242857021547385e-06, + "loss": 0.1551, + "step": 2564 + }, + { + "epoch": 0.8311730395333765, + "grad_norm": 0.9089164733886719, + "learning_rate": 4.242229880048718e-06, + "loss": 0.1599, + "step": 2565 + }, + { + "epoch": 0.83149708360337, + "grad_norm": 0.9251378178596497, + "learning_rate": 4.241602525317979e-06, + "loss": 0.1716, + "step": 2566 + }, + { + "epoch": 0.8318211276733636, + "grad_norm": 0.8123130202293396, + "learning_rate": 4.240974957431951e-06, + "loss": 0.1416, + "step": 2567 + }, + { + "epoch": 0.8321451717433571, + "grad_norm": 0.8112072348594666, + "learning_rate": 4.240347176467442e-06, + "loss": 0.1446, + "step": 2568 + }, + { + "epoch": 0.8324692158133506, + "grad_norm": 0.8598057627677917, + "learning_rate": 4.2397191825012865e-06, + "loss": 0.1575, + "step": 2569 + }, + { + "epoch": 0.8327932598833442, + "grad_norm": 0.8484219312667847, + "learning_rate": 4.239090975610346e-06, + "loss": 0.1522, + "step": 2570 + }, + { + "epoch": 0.8331173039533376, + "grad_norm": 0.8238083124160767, + "learning_rate": 4.2384625558715045e-06, + "loss": 0.1481, + "step": 2571 + }, + { + "epoch": 0.8334413480233311, + "grad_norm": 0.8477686643600464, + "learning_rate": 4.237833923361676e-06, + "loss": 0.1528, + "step": 2572 + }, + { + "epoch": 0.8337653920933247, + "grad_norm": 1.2084839344024658, + "learning_rate": 4.237205078157799e-06, + "loss": 0.1555, + "step": 2573 + }, + { + "epoch": 0.8340894361633182, + "grad_norm": 0.822715163230896, + "learning_rate": 4.236576020336838e-06, + "loss": 0.1515, + "step": 2574 + }, + { + "epoch": 0.8344134802333117, + "grad_norm": 0.8149126172065735, + "learning_rate": 4.235946749975783e-06, + "loss": 0.1515, + "step": 2575 + }, + { + "epoch": 0.8347375243033053, + "grad_norm": 0.810035228729248, + "learning_rate": 4.235317267151652e-06, + "loss": 0.1642, + "step": 2576 + }, + { + "epoch": 0.8350615683732988, + "grad_norm": 0.8240023255348206, + "learning_rate": 4.234687571941486e-06, + "loss": 0.1445, + "step": 2577 + }, + { + "epoch": 0.8353856124432922, + "grad_norm": 0.8536616563796997, + "learning_rate": 4.234057664422354e-06, + "loss": 0.1533, + "step": 2578 + }, + { + "epoch": 0.8357096565132858, + "grad_norm": 0.8576487302780151, + "learning_rate": 4.2334275446713515e-06, + "loss": 0.1539, + "step": 2579 + }, + { + "epoch": 0.8360337005832793, + "grad_norm": 0.8609020709991455, + "learning_rate": 4.232797212765598e-06, + "loss": 0.1555, + "step": 2580 + }, + { + "epoch": 0.8363577446532728, + "grad_norm": 0.8141218423843384, + "learning_rate": 4.2321666687822405e-06, + "loss": 0.1578, + "step": 2581 + }, + { + "epoch": 0.8366817887232664, + "grad_norm": 0.9742339849472046, + "learning_rate": 4.231535912798452e-06, + "loss": 0.1601, + "step": 2582 + }, + { + "epoch": 0.8370058327932599, + "grad_norm": 0.8200106024742126, + "learning_rate": 4.23090494489143e-06, + "loss": 0.1406, + "step": 2583 + }, + { + "epoch": 0.8373298768632534, + "grad_norm": 0.7919231057167053, + "learning_rate": 4.230273765138399e-06, + "loss": 0.1371, + "step": 2584 + }, + { + "epoch": 0.837653920933247, + "grad_norm": 0.8794668316841125, + "learning_rate": 4.229642373616609e-06, + "loss": 0.1628, + "step": 2585 + }, + { + "epoch": 0.8379779650032404, + "grad_norm": 0.8123350143432617, + "learning_rate": 4.229010770403337e-06, + "loss": 0.137, + "step": 2586 + }, + { + "epoch": 0.8383020090732339, + "grad_norm": 0.7920833230018616, + "learning_rate": 4.228378955575885e-06, + "loss": 0.1309, + "step": 2587 + }, + { + "epoch": 0.8386260531432275, + "grad_norm": 0.8339619636535645, + "learning_rate": 4.227746929211582e-06, + "loss": 0.1561, + "step": 2588 + }, + { + "epoch": 0.838950097213221, + "grad_norm": 0.8547282218933105, + "learning_rate": 4.227114691387779e-06, + "loss": 0.1472, + "step": 2589 + }, + { + "epoch": 0.8392741412832145, + "grad_norm": 0.9381993412971497, + "learning_rate": 4.226482242181859e-06, + "loss": 0.1607, + "step": 2590 + }, + { + "epoch": 0.8395981853532081, + "grad_norm": 0.8304243683815002, + "learning_rate": 4.225849581671225e-06, + "loss": 0.1461, + "step": 2591 + }, + { + "epoch": 0.8399222294232016, + "grad_norm": 0.8055948615074158, + "learning_rate": 4.225216709933309e-06, + "loss": 0.1383, + "step": 2592 + }, + { + "epoch": 0.840246273493195, + "grad_norm": 0.8732760548591614, + "learning_rate": 4.2245836270455706e-06, + "loss": 0.1596, + "step": 2593 + }, + { + "epoch": 0.8405703175631886, + "grad_norm": 0.8491640090942383, + "learning_rate": 4.223950333085492e-06, + "loss": 0.1584, + "step": 2594 + }, + { + "epoch": 0.8408943616331821, + "grad_norm": 0.7880257964134216, + "learning_rate": 4.223316828130581e-06, + "loss": 0.1416, + "step": 2595 + }, + { + "epoch": 0.8412184057031756, + "grad_norm": 0.8852708339691162, + "learning_rate": 4.222683112258372e-06, + "loss": 0.1435, + "step": 2596 + }, + { + "epoch": 0.8415424497731692, + "grad_norm": 0.8148348927497864, + "learning_rate": 4.222049185546428e-06, + "loss": 0.139, + "step": 2597 + }, + { + "epoch": 0.8418664938431627, + "grad_norm": 0.9881069660186768, + "learning_rate": 4.221415048072335e-06, + "loss": 0.1942, + "step": 2598 + }, + { + "epoch": 0.8421905379131562, + "grad_norm": 0.9064881801605225, + "learning_rate": 4.220780699913704e-06, + "loss": 0.1671, + "step": 2599 + }, + { + "epoch": 0.8425145819831497, + "grad_norm": 0.8283922672271729, + "learning_rate": 4.220146141148174e-06, + "loss": 0.1446, + "step": 2600 + }, + { + "epoch": 0.8428386260531432, + "grad_norm": 0.8480839729309082, + "learning_rate": 4.219511371853408e-06, + "loss": 0.1572, + "step": 2601 + }, + { + "epoch": 0.8431626701231367, + "grad_norm": 0.8674578666687012, + "learning_rate": 4.2188763921070974e-06, + "loss": 0.1604, + "step": 2602 + }, + { + "epoch": 0.8434867141931303, + "grad_norm": 0.8412938714027405, + "learning_rate": 4.2182412019869556e-06, + "loss": 0.1419, + "step": 2603 + }, + { + "epoch": 0.8438107582631238, + "grad_norm": 0.9011821150779724, + "learning_rate": 4.217605801570725e-06, + "loss": 0.1778, + "step": 2604 + }, + { + "epoch": 0.8441348023331173, + "grad_norm": 0.8756242990493774, + "learning_rate": 4.216970190936171e-06, + "loss": 0.1635, + "step": 2605 + }, + { + "epoch": 0.8444588464031109, + "grad_norm": 0.8517950177192688, + "learning_rate": 4.2163343701610884e-06, + "loss": 0.1594, + "step": 2606 + }, + { + "epoch": 0.8447828904731044, + "grad_norm": 0.9082374572753906, + "learning_rate": 4.215698339323294e-06, + "loss": 0.1674, + "step": 2607 + }, + { + "epoch": 0.8451069345430978, + "grad_norm": 0.8264471888542175, + "learning_rate": 4.215062098500632e-06, + "loss": 0.1598, + "step": 2608 + }, + { + "epoch": 0.8454309786130914, + "grad_norm": 0.8358623385429382, + "learning_rate": 4.214425647770972e-06, + "loss": 0.1533, + "step": 2609 + }, + { + "epoch": 0.8457550226830849, + "grad_norm": 0.8975405097007751, + "learning_rate": 4.213788987212211e-06, + "loss": 0.1782, + "step": 2610 + }, + { + "epoch": 0.8460790667530784, + "grad_norm": 0.8398000001907349, + "learning_rate": 4.213152116902267e-06, + "loss": 0.1499, + "step": 2611 + }, + { + "epoch": 0.846403110823072, + "grad_norm": 0.7483087182044983, + "learning_rate": 4.212515036919089e-06, + "loss": 0.1413, + "step": 2612 + }, + { + "epoch": 0.8467271548930655, + "grad_norm": 0.8305369019508362, + "learning_rate": 4.211877747340649e-06, + "loss": 0.1461, + "step": 2613 + }, + { + "epoch": 0.847051198963059, + "grad_norm": 0.8465678691864014, + "learning_rate": 4.211240248244945e-06, + "loss": 0.1555, + "step": 2614 + }, + { + "epoch": 0.8473752430330524, + "grad_norm": 0.8199893236160278, + "learning_rate": 4.21060253971e-06, + "loss": 0.1233, + "step": 2615 + }, + { + "epoch": 0.847699287103046, + "grad_norm": 0.898139476776123, + "learning_rate": 4.2099646218138655e-06, + "loss": 0.162, + "step": 2616 + }, + { + "epoch": 0.8480233311730395, + "grad_norm": 0.8000852465629578, + "learning_rate": 4.209326494634614e-06, + "loss": 0.1361, + "step": 2617 + }, + { + "epoch": 0.848347375243033, + "grad_norm": 0.8270335793495178, + "learning_rate": 4.208688158250348e-06, + "loss": 0.136, + "step": 2618 + }, + { + "epoch": 0.8486714193130266, + "grad_norm": 0.8229568600654602, + "learning_rate": 4.2080496127391914e-06, + "loss": 0.1506, + "step": 2619 + }, + { + "epoch": 0.8489954633830201, + "grad_norm": 0.9595044255256653, + "learning_rate": 4.207410858179298e-06, + "loss": 0.16, + "step": 2620 + }, + { + "epoch": 0.8493195074530137, + "grad_norm": 0.8614192008972168, + "learning_rate": 4.206771894648846e-06, + "loss": 0.1548, + "step": 2621 + }, + { + "epoch": 0.8496435515230071, + "grad_norm": 0.9117885231971741, + "learning_rate": 4.206132722226035e-06, + "loss": 0.1691, + "step": 2622 + }, + { + "epoch": 0.8499675955930006, + "grad_norm": 0.8764152526855469, + "learning_rate": 4.205493340989096e-06, + "loss": 0.1532, + "step": 2623 + }, + { + "epoch": 0.8502916396629941, + "grad_norm": 0.8756179213523865, + "learning_rate": 4.204853751016282e-06, + "loss": 0.1601, + "step": 2624 + }, + { + "epoch": 0.8506156837329877, + "grad_norm": 0.8578720092773438, + "learning_rate": 4.204213952385875e-06, + "loss": 0.1421, + "step": 2625 + }, + { + "epoch": 0.8509397278029812, + "grad_norm": 0.8636815547943115, + "learning_rate": 4.203573945176177e-06, + "loss": 0.1495, + "step": 2626 + }, + { + "epoch": 0.8512637718729748, + "grad_norm": 0.8231790065765381, + "learning_rate": 4.202933729465519e-06, + "loss": 0.1492, + "step": 2627 + }, + { + "epoch": 0.8515878159429683, + "grad_norm": 0.9206662774085999, + "learning_rate": 4.20229330533226e-06, + "loss": 0.1589, + "step": 2628 + }, + { + "epoch": 0.8519118600129617, + "grad_norm": 0.8775075078010559, + "learning_rate": 4.201652672854779e-06, + "loss": 0.155, + "step": 2629 + }, + { + "epoch": 0.8522359040829552, + "grad_norm": 0.8573769927024841, + "learning_rate": 4.201011832111485e-06, + "loss": 0.1506, + "step": 2630 + }, + { + "epoch": 0.8525599481529488, + "grad_norm": 0.816596269607544, + "learning_rate": 4.2003707831808086e-06, + "loss": 0.1404, + "step": 2631 + }, + { + "epoch": 0.8528839922229423, + "grad_norm": 0.8326970934867859, + "learning_rate": 4.199729526141209e-06, + "loss": 0.1511, + "step": 2632 + }, + { + "epoch": 0.8532080362929358, + "grad_norm": 0.8921494483947754, + "learning_rate": 4.199088061071172e-06, + "loss": 0.1717, + "step": 2633 + }, + { + "epoch": 0.8535320803629294, + "grad_norm": 0.8992860913276672, + "learning_rate": 4.198446388049203e-06, + "loss": 0.1559, + "step": 2634 + }, + { + "epoch": 0.8538561244329229, + "grad_norm": 0.7511637806892395, + "learning_rate": 4.197804507153838e-06, + "loss": 0.14, + "step": 2635 + }, + { + "epoch": 0.8541801685029164, + "grad_norm": 0.8324440121650696, + "learning_rate": 4.197162418463639e-06, + "loss": 0.1485, + "step": 2636 + }, + { + "epoch": 0.8545042125729099, + "grad_norm": 0.7635222673416138, + "learning_rate": 4.1965201220571895e-06, + "loss": 0.1551, + "step": 2637 + }, + { + "epoch": 0.8548282566429034, + "grad_norm": 0.8577344417572021, + "learning_rate": 4.1958776180131e-06, + "loss": 0.1605, + "step": 2638 + }, + { + "epoch": 0.8551523007128969, + "grad_norm": 0.8160384893417358, + "learning_rate": 4.1952349064100074e-06, + "loss": 0.1504, + "step": 2639 + }, + { + "epoch": 0.8554763447828905, + "grad_norm": 0.870599091053009, + "learning_rate": 4.194591987326574e-06, + "loss": 0.149, + "step": 2640 + }, + { + "epoch": 0.855800388852884, + "grad_norm": 0.8268269300460815, + "learning_rate": 4.193948860841485e-06, + "loss": 0.1513, + "step": 2641 + }, + { + "epoch": 0.8561244329228775, + "grad_norm": 0.8784127235412598, + "learning_rate": 4.193305527033456e-06, + "loss": 0.1622, + "step": 2642 + }, + { + "epoch": 0.8564484769928711, + "grad_norm": 0.8954482674598694, + "learning_rate": 4.192661985981221e-06, + "loss": 0.1544, + "step": 2643 + }, + { + "epoch": 0.8567725210628645, + "grad_norm": 0.8588570952415466, + "learning_rate": 4.192018237763547e-06, + "loss": 0.1479, + "step": 2644 + }, + { + "epoch": 0.857096565132858, + "grad_norm": 0.8091132640838623, + "learning_rate": 4.19137428245922e-06, + "loss": 0.1484, + "step": 2645 + }, + { + "epoch": 0.8574206092028516, + "grad_norm": 0.8757805824279785, + "learning_rate": 4.190730120147054e-06, + "loss": 0.1573, + "step": 2646 + }, + { + "epoch": 0.8577446532728451, + "grad_norm": 0.8502841591835022, + "learning_rate": 4.190085750905889e-06, + "loss": 0.1473, + "step": 2647 + }, + { + "epoch": 0.8580686973428386, + "grad_norm": 0.7808975577354431, + "learning_rate": 4.189441174814589e-06, + "loss": 0.1461, + "step": 2648 + }, + { + "epoch": 0.8583927414128322, + "grad_norm": 0.7952367067337036, + "learning_rate": 4.188796391952046e-06, + "loss": 0.1446, + "step": 2649 + }, + { + "epoch": 0.8587167854828257, + "grad_norm": 0.8280566334724426, + "learning_rate": 4.188151402397172e-06, + "loss": 0.1459, + "step": 2650 + }, + { + "epoch": 0.8590408295528191, + "grad_norm": 0.8707353472709656, + "learning_rate": 4.187506206228909e-06, + "loss": 0.1494, + "step": 2651 + }, + { + "epoch": 0.8593648736228127, + "grad_norm": 0.7923481464385986, + "learning_rate": 4.1868608035262225e-06, + "loss": 0.1584, + "step": 2652 + }, + { + "epoch": 0.8596889176928062, + "grad_norm": 0.7720541954040527, + "learning_rate": 4.186215194368105e-06, + "loss": 0.1404, + "step": 2653 + }, + { + "epoch": 0.8600129617627997, + "grad_norm": 0.7756115794181824, + "learning_rate": 4.18556937883357e-06, + "loss": 0.1396, + "step": 2654 + }, + { + "epoch": 0.8603370058327933, + "grad_norm": 0.8358388543128967, + "learning_rate": 4.184923357001661e-06, + "loss": 0.1575, + "step": 2655 + }, + { + "epoch": 0.8606610499027868, + "grad_norm": 0.8395894765853882, + "learning_rate": 4.184277128951445e-06, + "loss": 0.15, + "step": 2656 + }, + { + "epoch": 0.8609850939727803, + "grad_norm": 0.8533385396003723, + "learning_rate": 4.1836306947620135e-06, + "loss": 0.1574, + "step": 2657 + }, + { + "epoch": 0.8613091380427739, + "grad_norm": 0.9090582132339478, + "learning_rate": 4.182984054512483e-06, + "loss": 0.1658, + "step": 2658 + }, + { + "epoch": 0.8616331821127673, + "grad_norm": 0.8781484961509705, + "learning_rate": 4.182337208281998e-06, + "loss": 0.1631, + "step": 2659 + }, + { + "epoch": 0.8619572261827608, + "grad_norm": 0.825088381767273, + "learning_rate": 4.181690156149724e-06, + "loss": 0.1494, + "step": 2660 + }, + { + "epoch": 0.8622812702527544, + "grad_norm": 0.8371817469596863, + "learning_rate": 4.1810428981948555e-06, + "loss": 0.1548, + "step": 2661 + }, + { + "epoch": 0.8626053143227479, + "grad_norm": 0.872991681098938, + "learning_rate": 4.1803954344966095e-06, + "loss": 0.1492, + "step": 2662 + }, + { + "epoch": 0.8629293583927414, + "grad_norm": 0.8750609755516052, + "learning_rate": 4.17974776513423e-06, + "loss": 0.1539, + "step": 2663 + }, + { + "epoch": 0.863253402462735, + "grad_norm": 0.8449800610542297, + "learning_rate": 4.179099890186985e-06, + "loss": 0.1604, + "step": 2664 + }, + { + "epoch": 0.8635774465327285, + "grad_norm": 0.9044042229652405, + "learning_rate": 4.178451809734168e-06, + "loss": 0.1558, + "step": 2665 + }, + { + "epoch": 0.8639014906027219, + "grad_norm": 0.8914145827293396, + "learning_rate": 4.1778035238550995e-06, + "loss": 0.1664, + "step": 2666 + }, + { + "epoch": 0.8642255346727155, + "grad_norm": 0.8576059937477112, + "learning_rate": 4.177155032629122e-06, + "loss": 0.1622, + "step": 2667 + }, + { + "epoch": 0.864549578742709, + "grad_norm": 0.8929911255836487, + "learning_rate": 4.176506336135603e-06, + "loss": 0.1594, + "step": 2668 + }, + { + "epoch": 0.8648736228127025, + "grad_norm": 0.9083852171897888, + "learning_rate": 4.175857434453939e-06, + "loss": 0.156, + "step": 2669 + }, + { + "epoch": 0.8651976668826961, + "grad_norm": 0.8650022745132446, + "learning_rate": 4.175208327663549e-06, + "loss": 0.1646, + "step": 2670 + }, + { + "epoch": 0.8655217109526896, + "grad_norm": 0.8090750575065613, + "learning_rate": 4.174559015843878e-06, + "loss": 0.1537, + "step": 2671 + }, + { + "epoch": 0.8658457550226831, + "grad_norm": 0.7599000930786133, + "learning_rate": 4.173909499074392e-06, + "loss": 0.1317, + "step": 2672 + }, + { + "epoch": 0.8661697990926766, + "grad_norm": 0.8080236911773682, + "learning_rate": 4.173259777434589e-06, + "loss": 0.1463, + "step": 2673 + }, + { + "epoch": 0.8664938431626701, + "grad_norm": 0.8762059807777405, + "learning_rate": 4.1726098510039894e-06, + "loss": 0.1599, + "step": 2674 + }, + { + "epoch": 0.8668178872326636, + "grad_norm": 0.8226339817047119, + "learning_rate": 4.171959719862134e-06, + "loss": 0.1489, + "step": 2675 + }, + { + "epoch": 0.8671419313026572, + "grad_norm": 0.8414006233215332, + "learning_rate": 4.171309384088596e-06, + "loss": 0.1611, + "step": 2676 + }, + { + "epoch": 0.8674659753726507, + "grad_norm": 0.862315833568573, + "learning_rate": 4.170658843762968e-06, + "loss": 0.1656, + "step": 2677 + }, + { + "epoch": 0.8677900194426442, + "grad_norm": 0.827226459980011, + "learning_rate": 4.170008098964871e-06, + "loss": 0.154, + "step": 2678 + }, + { + "epoch": 0.8681140635126378, + "grad_norm": 0.8278335928916931, + "learning_rate": 4.169357149773949e-06, + "loss": 0.1657, + "step": 2679 + }, + { + "epoch": 0.8684381075826313, + "grad_norm": 0.8246443867683411, + "learning_rate": 4.168705996269874e-06, + "loss": 0.1479, + "step": 2680 + }, + { + "epoch": 0.8687621516526247, + "grad_norm": 0.8692947626113892, + "learning_rate": 4.168054638532338e-06, + "loss": 0.159, + "step": 2681 + }, + { + "epoch": 0.8690861957226182, + "grad_norm": 0.7972445487976074, + "learning_rate": 4.167403076641063e-06, + "loss": 0.1534, + "step": 2682 + }, + { + "epoch": 0.8694102397926118, + "grad_norm": 0.8467311263084412, + "learning_rate": 4.166751310675793e-06, + "loss": 0.131, + "step": 2683 + }, + { + "epoch": 0.8697342838626053, + "grad_norm": 0.8024827837944031, + "learning_rate": 4.166099340716298e-06, + "loss": 0.1496, + "step": 2684 + }, + { + "epoch": 0.8700583279325989, + "grad_norm": 0.7870508432388306, + "learning_rate": 4.165447166842373e-06, + "loss": 0.1382, + "step": 2685 + }, + { + "epoch": 0.8703823720025924, + "grad_norm": 0.8258494138717651, + "learning_rate": 4.164794789133837e-06, + "loss": 0.1617, + "step": 2686 + }, + { + "epoch": 0.8707064160725859, + "grad_norm": 0.8265944719314575, + "learning_rate": 4.164142207670536e-06, + "loss": 0.1364, + "step": 2687 + }, + { + "epoch": 0.8710304601425793, + "grad_norm": 0.8695728182792664, + "learning_rate": 4.163489422532338e-06, + "loss": 0.1628, + "step": 2688 + }, + { + "epoch": 0.8713545042125729, + "grad_norm": 0.7880622744560242, + "learning_rate": 4.162836433799139e-06, + "loss": 0.1428, + "step": 2689 + }, + { + "epoch": 0.8716785482825664, + "grad_norm": 0.7860510945320129, + "learning_rate": 4.162183241550858e-06, + "loss": 0.1382, + "step": 2690 + }, + { + "epoch": 0.87200259235256, + "grad_norm": 0.8720436692237854, + "learning_rate": 4.161529845867439e-06, + "loss": 0.1501, + "step": 2691 + }, + { + "epoch": 0.8723266364225535, + "grad_norm": 0.8479650020599365, + "learning_rate": 4.160876246828853e-06, + "loss": 0.1517, + "step": 2692 + }, + { + "epoch": 0.872650680492547, + "grad_norm": 0.801552951335907, + "learning_rate": 4.160222444515092e-06, + "loss": 0.1478, + "step": 2693 + }, + { + "epoch": 0.8729747245625405, + "grad_norm": 0.900095522403717, + "learning_rate": 4.159568439006176e-06, + "loss": 0.1571, + "step": 2694 + }, + { + "epoch": 0.873298768632534, + "grad_norm": 0.9002645015716553, + "learning_rate": 4.1589142303821485e-06, + "loss": 0.1473, + "step": 2695 + }, + { + "epoch": 0.8736228127025275, + "grad_norm": 0.9044235944747925, + "learning_rate": 4.158259818723079e-06, + "loss": 0.1545, + "step": 2696 + }, + { + "epoch": 0.873946856772521, + "grad_norm": 0.8760396838188171, + "learning_rate": 4.157605204109062e-06, + "loss": 0.1445, + "step": 2697 + }, + { + "epoch": 0.8742709008425146, + "grad_norm": 0.8378698825836182, + "learning_rate": 4.156950386620214e-06, + "loss": 0.1615, + "step": 2698 + }, + { + "epoch": 0.8745949449125081, + "grad_norm": 0.7960793972015381, + "learning_rate": 4.156295366336679e-06, + "loss": 0.1423, + "step": 2699 + }, + { + "epoch": 0.8749189889825016, + "grad_norm": 0.8463109135627747, + "learning_rate": 4.155640143338625e-06, + "loss": 0.1558, + "step": 2700 + }, + { + "epoch": 0.8752430330524952, + "grad_norm": 0.7898882031440735, + "learning_rate": 4.154984717706246e-06, + "loss": 0.146, + "step": 2701 + }, + { + "epoch": 0.8755670771224887, + "grad_norm": 0.8624311685562134, + "learning_rate": 4.15432908951976e-06, + "loss": 0.1512, + "step": 2702 + }, + { + "epoch": 0.8758911211924821, + "grad_norm": 0.8539666533470154, + "learning_rate": 4.153673258859406e-06, + "loss": 0.1485, + "step": 2703 + }, + { + "epoch": 0.8762151652624757, + "grad_norm": 0.8110834956169128, + "learning_rate": 4.153017225805456e-06, + "loss": 0.1387, + "step": 2704 + }, + { + "epoch": 0.8765392093324692, + "grad_norm": 0.8246792554855347, + "learning_rate": 4.1523609904382e-06, + "loss": 0.1467, + "step": 2705 + }, + { + "epoch": 0.8768632534024627, + "grad_norm": 0.8625628352165222, + "learning_rate": 4.1517045528379544e-06, + "loss": 0.1599, + "step": 2706 + }, + { + "epoch": 0.8771872974724563, + "grad_norm": 0.8822122812271118, + "learning_rate": 4.151047913085061e-06, + "loss": 0.1621, + "step": 2707 + }, + { + "epoch": 0.8775113415424498, + "grad_norm": 0.8297098278999329, + "learning_rate": 4.150391071259886e-06, + "loss": 0.1463, + "step": 2708 + }, + { + "epoch": 0.8778353856124433, + "grad_norm": 0.8685880303382874, + "learning_rate": 4.149734027442821e-06, + "loss": 0.1599, + "step": 2709 + }, + { + "epoch": 0.8781594296824368, + "grad_norm": 0.9043188095092773, + "learning_rate": 4.149076781714283e-06, + "loss": 0.1672, + "step": 2710 + }, + { + "epoch": 0.8784834737524303, + "grad_norm": 0.7538388967514038, + "learning_rate": 4.1484193341547106e-06, + "loss": 0.1413, + "step": 2711 + }, + { + "epoch": 0.8788075178224238, + "grad_norm": 0.8316788673400879, + "learning_rate": 4.147761684844569e-06, + "loss": 0.1477, + "step": 2712 + }, + { + "epoch": 0.8791315618924174, + "grad_norm": 0.8419067859649658, + "learning_rate": 4.147103833864349e-06, + "loss": 0.155, + "step": 2713 + }, + { + "epoch": 0.8794556059624109, + "grad_norm": 0.8238720297813416, + "learning_rate": 4.146445781294566e-06, + "loss": 0.1492, + "step": 2714 + }, + { + "epoch": 0.8797796500324044, + "grad_norm": 0.7852333188056946, + "learning_rate": 4.145787527215757e-06, + "loss": 0.1459, + "step": 2715 + }, + { + "epoch": 0.880103694102398, + "grad_norm": 0.8900299668312073, + "learning_rate": 4.145129071708487e-06, + "loss": 0.1781, + "step": 2716 + }, + { + "epoch": 0.8804277381723914, + "grad_norm": 0.8969780802726746, + "learning_rate": 4.144470414853345e-06, + "loss": 0.1668, + "step": 2717 + }, + { + "epoch": 0.8807517822423849, + "grad_norm": 0.880719006061554, + "learning_rate": 4.143811556730944e-06, + "loss": 0.1623, + "step": 2718 + }, + { + "epoch": 0.8810758263123785, + "grad_norm": 0.8145759105682373, + "learning_rate": 4.143152497421922e-06, + "loss": 0.1431, + "step": 2719 + }, + { + "epoch": 0.881399870382372, + "grad_norm": 0.8346846103668213, + "learning_rate": 4.142493237006941e-06, + "loss": 0.1461, + "step": 2720 + }, + { + "epoch": 0.8817239144523655, + "grad_norm": 0.8116910457611084, + "learning_rate": 4.141833775566688e-06, + "loss": 0.1587, + "step": 2721 + }, + { + "epoch": 0.8820479585223591, + "grad_norm": 0.843694806098938, + "learning_rate": 4.1411741131818765e-06, + "loss": 0.1443, + "step": 2722 + }, + { + "epoch": 0.8823720025923526, + "grad_norm": 0.9246916174888611, + "learning_rate": 4.14051424993324e-06, + "loss": 0.1565, + "step": 2723 + }, + { + "epoch": 0.8826960466623461, + "grad_norm": 0.7806346416473389, + "learning_rate": 4.1398541859015405e-06, + "loss": 0.1419, + "step": 2724 + }, + { + "epoch": 0.8830200907323396, + "grad_norm": 0.9195976853370667, + "learning_rate": 4.139193921167565e-06, + "loss": 0.1572, + "step": 2725 + }, + { + "epoch": 0.8833441348023331, + "grad_norm": 0.9154192209243774, + "learning_rate": 4.138533455812121e-06, + "loss": 0.1791, + "step": 2726 + }, + { + "epoch": 0.8836681788723266, + "grad_norm": 0.7888971567153931, + "learning_rate": 4.137872789916044e-06, + "loss": 0.1437, + "step": 2727 + }, + { + "epoch": 0.8839922229423202, + "grad_norm": 0.8127666711807251, + "learning_rate": 4.137211923560195e-06, + "loss": 0.1536, + "step": 2728 + }, + { + "epoch": 0.8843162670123137, + "grad_norm": 0.8747718930244446, + "learning_rate": 4.136550856825455e-06, + "loss": 0.1627, + "step": 2729 + }, + { + "epoch": 0.8846403110823072, + "grad_norm": 0.8618316650390625, + "learning_rate": 4.135889589792733e-06, + "loss": 0.1604, + "step": 2730 + }, + { + "epoch": 0.8849643551523008, + "grad_norm": 0.8301213383674622, + "learning_rate": 4.135228122542962e-06, + "loss": 0.1573, + "step": 2731 + }, + { + "epoch": 0.8852883992222942, + "grad_norm": 0.8168521523475647, + "learning_rate": 4.1345664551570985e-06, + "loss": 0.1595, + "step": 2732 + }, + { + "epoch": 0.8856124432922877, + "grad_norm": 0.8900003433227539, + "learning_rate": 4.133904587716126e-06, + "loss": 0.1471, + "step": 2733 + }, + { + "epoch": 0.8859364873622813, + "grad_norm": 0.7271791696548462, + "learning_rate": 4.133242520301049e-06, + "loss": 0.1235, + "step": 2734 + }, + { + "epoch": 0.8862605314322748, + "grad_norm": 0.8689054250717163, + "learning_rate": 4.132580252992898e-06, + "loss": 0.1585, + "step": 2735 + }, + { + "epoch": 0.8865845755022683, + "grad_norm": 0.8039261102676392, + "learning_rate": 4.131917785872728e-06, + "loss": 0.1519, + "step": 2736 + }, + { + "epoch": 0.8869086195722619, + "grad_norm": 0.819491446018219, + "learning_rate": 4.13125511902162e-06, + "loss": 0.1567, + "step": 2737 + }, + { + "epoch": 0.8872326636422554, + "grad_norm": 0.9310662746429443, + "learning_rate": 4.130592252520677e-06, + "loss": 0.1694, + "step": 2738 + }, + { + "epoch": 0.8875567077122488, + "grad_norm": 0.79558265209198, + "learning_rate": 4.129929186451028e-06, + "loss": 0.155, + "step": 2739 + }, + { + "epoch": 0.8878807517822424, + "grad_norm": 0.7396335601806641, + "learning_rate": 4.129265920893826e-06, + "loss": 0.139, + "step": 2740 + }, + { + "epoch": 0.8882047958522359, + "grad_norm": 0.803795576095581, + "learning_rate": 4.128602455930247e-06, + "loss": 0.1545, + "step": 2741 + }, + { + "epoch": 0.8885288399222294, + "grad_norm": 0.8279784321784973, + "learning_rate": 4.127938791641493e-06, + "loss": 0.1427, + "step": 2742 + }, + { + "epoch": 0.888852883992223, + "grad_norm": 0.8597333431243896, + "learning_rate": 4.127274928108792e-06, + "loss": 0.1665, + "step": 2743 + }, + { + "epoch": 0.8891769280622165, + "grad_norm": 0.9069391489028931, + "learning_rate": 4.126610865413392e-06, + "loss": 0.1663, + "step": 2744 + }, + { + "epoch": 0.88950097213221, + "grad_norm": 0.9247065186500549, + "learning_rate": 4.125946603636569e-06, + "loss": 0.162, + "step": 2745 + }, + { + "epoch": 0.8898250162022034, + "grad_norm": 0.85749751329422, + "learning_rate": 4.125282142859622e-06, + "loss": 0.1452, + "step": 2746 + }, + { + "epoch": 0.890149060272197, + "grad_norm": 0.8194043636322021, + "learning_rate": 4.124617483163876e-06, + "loss": 0.1525, + "step": 2747 + }, + { + "epoch": 0.8904731043421905, + "grad_norm": 0.8424063324928284, + "learning_rate": 4.123952624630676e-06, + "loss": 0.1455, + "step": 2748 + }, + { + "epoch": 0.890797148412184, + "grad_norm": 0.8458553552627563, + "learning_rate": 4.123287567341396e-06, + "loss": 0.1526, + "step": 2749 + }, + { + "epoch": 0.8911211924821776, + "grad_norm": 0.822384774684906, + "learning_rate": 4.122622311377433e-06, + "loss": 0.1426, + "step": 2750 + }, + { + "epoch": 0.8914452365521711, + "grad_norm": 0.903175413608551, + "learning_rate": 4.121956856820207e-06, + "loss": 0.1571, + "step": 2751 + }, + { + "epoch": 0.8917692806221647, + "grad_norm": 0.8061724305152893, + "learning_rate": 4.1212912037511634e-06, + "loss": 0.1398, + "step": 2752 + }, + { + "epoch": 0.8920933246921582, + "grad_norm": 0.8506471514701843, + "learning_rate": 4.1206253522517725e-06, + "loss": 0.1521, + "step": 2753 + }, + { + "epoch": 0.8924173687621516, + "grad_norm": 0.8087679743766785, + "learning_rate": 4.119959302403527e-06, + "loss": 0.1345, + "step": 2754 + }, + { + "epoch": 0.8927414128321451, + "grad_norm": 0.8700354695320129, + "learning_rate": 4.119293054287945e-06, + "loss": 0.1518, + "step": 2755 + }, + { + "epoch": 0.8930654569021387, + "grad_norm": 0.824869692325592, + "learning_rate": 4.118626607986569e-06, + "loss": 0.1392, + "step": 2756 + }, + { + "epoch": 0.8933895009721322, + "grad_norm": 0.8283060193061829, + "learning_rate": 4.1179599635809654e-06, + "loss": 0.1466, + "step": 2757 + }, + { + "epoch": 0.8937135450421257, + "grad_norm": 0.9671257138252258, + "learning_rate": 4.1172931211527254e-06, + "loss": 0.1638, + "step": 2758 + }, + { + "epoch": 0.8940375891121193, + "grad_norm": 0.9112168550491333, + "learning_rate": 4.116626080783464e-06, + "loss": 0.1666, + "step": 2759 + }, + { + "epoch": 0.8943616331821128, + "grad_norm": 0.9166633486747742, + "learning_rate": 4.1159588425548215e-06, + "loss": 0.1697, + "step": 2760 + }, + { + "epoch": 0.8946856772521062, + "grad_norm": 0.836449384689331, + "learning_rate": 4.11529140654846e-06, + "loss": 0.1504, + "step": 2761 + }, + { + "epoch": 0.8950097213220998, + "grad_norm": 0.7775622606277466, + "learning_rate": 4.114623772846067e-06, + "loss": 0.1301, + "step": 2762 + }, + { + "epoch": 0.8953337653920933, + "grad_norm": 0.8575242757797241, + "learning_rate": 4.113955941529355e-06, + "loss": 0.1571, + "step": 2763 + }, + { + "epoch": 0.8956578094620868, + "grad_norm": 0.968856930732727, + "learning_rate": 4.113287912680061e-06, + "loss": 0.1669, + "step": 2764 + }, + { + "epoch": 0.8959818535320804, + "grad_norm": 0.8376212120056152, + "learning_rate": 4.112619686379944e-06, + "loss": 0.1504, + "step": 2765 + }, + { + "epoch": 0.8963058976020739, + "grad_norm": 0.8225062489509583, + "learning_rate": 4.111951262710788e-06, + "loss": 0.1574, + "step": 2766 + }, + { + "epoch": 0.8966299416720674, + "grad_norm": 0.7696053981781006, + "learning_rate": 4.111282641754403e-06, + "loss": 0.1356, + "step": 2767 + }, + { + "epoch": 0.8969539857420609, + "grad_norm": 0.8558640480041504, + "learning_rate": 4.110613823592621e-06, + "loss": 0.1712, + "step": 2768 + }, + { + "epoch": 0.8972780298120544, + "grad_norm": 0.8523713946342468, + "learning_rate": 4.109944808307298e-06, + "loss": 0.1461, + "step": 2769 + }, + { + "epoch": 0.8976020738820479, + "grad_norm": 0.7888774275779724, + "learning_rate": 4.109275595980316e-06, + "loss": 0.1427, + "step": 2770 + }, + { + "epoch": 0.8979261179520415, + "grad_norm": 0.8624205589294434, + "learning_rate": 4.108606186693582e-06, + "loss": 0.1652, + "step": 2771 + }, + { + "epoch": 0.898250162022035, + "grad_norm": 0.8590965867042542, + "learning_rate": 4.1079365805290214e-06, + "loss": 0.1639, + "step": 2772 + }, + { + "epoch": 0.8985742060920285, + "grad_norm": 0.8436129093170166, + "learning_rate": 4.10726677756859e-06, + "loss": 0.1465, + "step": 2773 + }, + { + "epoch": 0.8988982501620221, + "grad_norm": 0.791901171207428, + "learning_rate": 4.106596777894265e-06, + "loss": 0.1377, + "step": 2774 + }, + { + "epoch": 0.8992222942320156, + "grad_norm": 0.8803940415382385, + "learning_rate": 4.105926581588046e-06, + "loss": 0.1592, + "step": 2775 + }, + { + "epoch": 0.899546338302009, + "grad_norm": 0.8036686778068542, + "learning_rate": 4.105256188731962e-06, + "loss": 0.1501, + "step": 2776 + }, + { + "epoch": 0.8998703823720026, + "grad_norm": 0.8757483959197998, + "learning_rate": 4.104585599408059e-06, + "loss": 0.1658, + "step": 2777 + }, + { + "epoch": 0.9001944264419961, + "grad_norm": 0.8514494299888611, + "learning_rate": 4.1039148136984134e-06, + "loss": 0.1547, + "step": 2778 + }, + { + "epoch": 0.9005184705119896, + "grad_norm": 0.9344333410263062, + "learning_rate": 4.103243831685121e-06, + "loss": 0.175, + "step": 2779 + }, + { + "epoch": 0.9008425145819832, + "grad_norm": 0.8821116089820862, + "learning_rate": 4.102572653450304e-06, + "loss": 0.1616, + "step": 2780 + }, + { + "epoch": 0.9011665586519767, + "grad_norm": 0.7999144196510315, + "learning_rate": 4.101901279076108e-06, + "loss": 0.1549, + "step": 2781 + }, + { + "epoch": 0.9014906027219702, + "grad_norm": 0.8356010317802429, + "learning_rate": 4.101229708644704e-06, + "loss": 0.1421, + "step": 2782 + }, + { + "epoch": 0.9018146467919637, + "grad_norm": 0.7832415699958801, + "learning_rate": 4.100557942238284e-06, + "loss": 0.1453, + "step": 2783 + }, + { + "epoch": 0.9021386908619572, + "grad_norm": 0.7718949913978577, + "learning_rate": 4.099885979939068e-06, + "loss": 0.1266, + "step": 2784 + }, + { + "epoch": 0.9024627349319507, + "grad_norm": 0.8561499118804932, + "learning_rate": 4.099213821829295e-06, + "loss": 0.1477, + "step": 2785 + }, + { + "epoch": 0.9027867790019443, + "grad_norm": 0.9258077144622803, + "learning_rate": 4.098541467991231e-06, + "loss": 0.1668, + "step": 2786 + }, + { + "epoch": 0.9031108230719378, + "grad_norm": 0.9254985451698303, + "learning_rate": 4.097868918507168e-06, + "loss": 0.152, + "step": 2787 + }, + { + "epoch": 0.9034348671419313, + "grad_norm": 0.8128352165222168, + "learning_rate": 4.097196173459417e-06, + "loss": 0.1525, + "step": 2788 + }, + { + "epoch": 0.9037589112119249, + "grad_norm": 0.885798454284668, + "learning_rate": 4.0965232329303175e-06, + "loss": 0.1561, + "step": 2789 + }, + { + "epoch": 0.9040829552819183, + "grad_norm": 0.8290806412696838, + "learning_rate": 4.095850097002228e-06, + "loss": 0.1504, + "step": 2790 + }, + { + "epoch": 0.9044069993519118, + "grad_norm": 0.8356497883796692, + "learning_rate": 4.095176765757537e-06, + "loss": 0.157, + "step": 2791 + }, + { + "epoch": 0.9047310434219054, + "grad_norm": 0.8767359256744385, + "learning_rate": 4.094503239278652e-06, + "loss": 0.1465, + "step": 2792 + }, + { + "epoch": 0.9050550874918989, + "grad_norm": 0.8219475746154785, + "learning_rate": 4.0938295176480055e-06, + "loss": 0.1561, + "step": 2793 + }, + { + "epoch": 0.9053791315618924, + "grad_norm": 0.8670582175254822, + "learning_rate": 4.093155600948057e-06, + "loss": 0.1659, + "step": 2794 + }, + { + "epoch": 0.905703175631886, + "grad_norm": 0.7694851756095886, + "learning_rate": 4.092481489261285e-06, + "loss": 0.135, + "step": 2795 + }, + { + "epoch": 0.9060272197018795, + "grad_norm": 0.819607138633728, + "learning_rate": 4.0918071826701966e-06, + "loss": 0.1543, + "step": 2796 + }, + { + "epoch": 0.906351263771873, + "grad_norm": 0.8917694091796875, + "learning_rate": 4.091132681257317e-06, + "loss": 0.1595, + "step": 2797 + }, + { + "epoch": 0.9066753078418665, + "grad_norm": 0.8101462721824646, + "learning_rate": 4.090457985105202e-06, + "loss": 0.1539, + "step": 2798 + }, + { + "epoch": 0.90699935191186, + "grad_norm": 0.8657563328742981, + "learning_rate": 4.089783094296425e-06, + "loss": 0.1734, + "step": 2799 + }, + { + "epoch": 0.9073233959818535, + "grad_norm": 0.8259778022766113, + "learning_rate": 4.089108008913589e-06, + "loss": 0.1483, + "step": 2800 + }, + { + "epoch": 0.907647440051847, + "grad_norm": 0.7770955562591553, + "learning_rate": 4.088432729039316e-06, + "loss": 0.1395, + "step": 2801 + }, + { + "epoch": 0.9079714841218406, + "grad_norm": 0.7907054424285889, + "learning_rate": 4.087757254756254e-06, + "loss": 0.1453, + "step": 2802 + }, + { + "epoch": 0.9082955281918341, + "grad_norm": 0.7664549350738525, + "learning_rate": 4.087081586147075e-06, + "loss": 0.1341, + "step": 2803 + }, + { + "epoch": 0.9086195722618277, + "grad_norm": 0.7957342267036438, + "learning_rate": 4.086405723294474e-06, + "loss": 0.1443, + "step": 2804 + }, + { + "epoch": 0.9089436163318211, + "grad_norm": 0.8862059712409973, + "learning_rate": 4.0857296662811696e-06, + "loss": 0.155, + "step": 2805 + }, + { + "epoch": 0.9092676604018146, + "grad_norm": 0.7839810252189636, + "learning_rate": 4.085053415189905e-06, + "loss": 0.1337, + "step": 2806 + }, + { + "epoch": 0.9095917044718081, + "grad_norm": 0.8341511487960815, + "learning_rate": 4.084376970103448e-06, + "loss": 0.143, + "step": 2807 + }, + { + "epoch": 0.9099157485418017, + "grad_norm": 0.8733060956001282, + "learning_rate": 4.0837003311045865e-06, + "loss": 0.1542, + "step": 2808 + }, + { + "epoch": 0.9102397926117952, + "grad_norm": 0.821233332157135, + "learning_rate": 4.083023498276136e-06, + "loss": 0.1525, + "step": 2809 + }, + { + "epoch": 0.9105638366817888, + "grad_norm": 0.8411147594451904, + "learning_rate": 4.082346471700935e-06, + "loss": 0.1471, + "step": 2810 + }, + { + "epoch": 0.9108878807517823, + "grad_norm": 0.8866571187973022, + "learning_rate": 4.081669251461844e-06, + "loss": 0.1625, + "step": 2811 + }, + { + "epoch": 0.9112119248217757, + "grad_norm": 0.8973330855369568, + "learning_rate": 4.080991837641748e-06, + "loss": 0.1481, + "step": 2812 + }, + { + "epoch": 0.9115359688917692, + "grad_norm": 0.794649600982666, + "learning_rate": 4.080314230323556e-06, + "loss": 0.136, + "step": 2813 + }, + { + "epoch": 0.9118600129617628, + "grad_norm": 0.8813613057136536, + "learning_rate": 4.079636429590201e-06, + "loss": 0.1581, + "step": 2814 + }, + { + "epoch": 0.9121840570317563, + "grad_norm": 0.8289700150489807, + "learning_rate": 4.07895843552464e-06, + "loss": 0.1558, + "step": 2815 + }, + { + "epoch": 0.9125081011017498, + "grad_norm": 0.8710315227508545, + "learning_rate": 4.078280248209851e-06, + "loss": 0.1582, + "step": 2816 + }, + { + "epoch": 0.9128321451717434, + "grad_norm": 0.7751961350440979, + "learning_rate": 4.077601867728839e-06, + "loss": 0.1398, + "step": 2817 + }, + { + "epoch": 0.9131561892417369, + "grad_norm": 0.7780879139900208, + "learning_rate": 4.07692329416463e-06, + "loss": 0.1455, + "step": 2818 + }, + { + "epoch": 0.9134802333117304, + "grad_norm": 0.8030531406402588, + "learning_rate": 4.0762445276002765e-06, + "loss": 0.1471, + "step": 2819 + }, + { + "epoch": 0.9138042773817239, + "grad_norm": 0.8089961409568787, + "learning_rate": 4.075565568118852e-06, + "loss": 0.1606, + "step": 2820 + }, + { + "epoch": 0.9141283214517174, + "grad_norm": 0.7848464846611023, + "learning_rate": 4.074886415803454e-06, + "loss": 0.1508, + "step": 2821 + }, + { + "epoch": 0.9144523655217109, + "grad_norm": 0.9256161451339722, + "learning_rate": 4.074207070737205e-06, + "loss": 0.1608, + "step": 2822 + }, + { + "epoch": 0.9147764095917045, + "grad_norm": 0.8239982724189758, + "learning_rate": 4.07352753300325e-06, + "loss": 0.1536, + "step": 2823 + }, + { + "epoch": 0.915100453661698, + "grad_norm": 0.823776364326477, + "learning_rate": 4.072847802684758e-06, + "loss": 0.1555, + "step": 2824 + }, + { + "epoch": 0.9154244977316915, + "grad_norm": 0.8191870450973511, + "learning_rate": 4.072167879864922e-06, + "loss": 0.1474, + "step": 2825 + }, + { + "epoch": 0.9157485418016851, + "grad_norm": 0.8104268908500671, + "learning_rate": 4.071487764626957e-06, + "loss": 0.1426, + "step": 2826 + }, + { + "epoch": 0.9160725858716785, + "grad_norm": 0.8624962568283081, + "learning_rate": 4.070807457054102e-06, + "loss": 0.1455, + "step": 2827 + }, + { + "epoch": 0.916396629941672, + "grad_norm": 0.8465151786804199, + "learning_rate": 4.070126957229622e-06, + "loss": 0.1671, + "step": 2828 + }, + { + "epoch": 0.9167206740116656, + "grad_norm": 0.845751166343689, + "learning_rate": 4.069446265236801e-06, + "loss": 0.1483, + "step": 2829 + }, + { + "epoch": 0.9170447180816591, + "grad_norm": 0.8201125264167786, + "learning_rate": 4.068765381158951e-06, + "loss": 0.1479, + "step": 2830 + }, + { + "epoch": 0.9173687621516526, + "grad_norm": 0.9279409646987915, + "learning_rate": 4.068084305079406e-06, + "loss": 0.1677, + "step": 2831 + }, + { + "epoch": 0.9176928062216462, + "grad_norm": 0.8532978892326355, + "learning_rate": 4.067403037081522e-06, + "loss": 0.1493, + "step": 2832 + }, + { + "epoch": 0.9180168502916397, + "grad_norm": 0.8532582521438599, + "learning_rate": 4.06672157724868e-06, + "loss": 0.139, + "step": 2833 + }, + { + "epoch": 0.9183408943616331, + "grad_norm": 0.822012722492218, + "learning_rate": 4.066039925664283e-06, + "loss": 0.1565, + "step": 2834 + }, + { + "epoch": 0.9186649384316267, + "grad_norm": 0.8295703530311584, + "learning_rate": 4.06535808241176e-06, + "loss": 0.1484, + "step": 2835 + }, + { + "epoch": 0.9189889825016202, + "grad_norm": 0.9345645904541016, + "learning_rate": 4.064676047574561e-06, + "loss": 0.1516, + "step": 2836 + }, + { + "epoch": 0.9193130265716137, + "grad_norm": 0.8906318545341492, + "learning_rate": 4.063993821236162e-06, + "loss": 0.1548, + "step": 2837 + }, + { + "epoch": 0.9196370706416073, + "grad_norm": 0.853904664516449, + "learning_rate": 4.063311403480061e-06, + "loss": 0.1597, + "step": 2838 + }, + { + "epoch": 0.9199611147116008, + "grad_norm": 0.8370136618614197, + "learning_rate": 4.0626287943897765e-06, + "loss": 0.1402, + "step": 2839 + }, + { + "epoch": 0.9202851587815943, + "grad_norm": 0.8235974907875061, + "learning_rate": 4.061945994048855e-06, + "loss": 0.1506, + "step": 2840 + }, + { + "epoch": 0.9206092028515879, + "grad_norm": 0.8228686451911926, + "learning_rate": 4.061263002540865e-06, + "loss": 0.1543, + "step": 2841 + }, + { + "epoch": 0.9209332469215813, + "grad_norm": 0.8150495886802673, + "learning_rate": 4.060579819949398e-06, + "loss": 0.1528, + "step": 2842 + }, + { + "epoch": 0.9212572909915748, + "grad_norm": 0.7938353419303894, + "learning_rate": 4.059896446358068e-06, + "loss": 0.1377, + "step": 2843 + }, + { + "epoch": 0.9215813350615684, + "grad_norm": 0.8098077774047852, + "learning_rate": 4.059212881850515e-06, + "loss": 0.1437, + "step": 2844 + }, + { + "epoch": 0.9219053791315619, + "grad_norm": 0.9222689270973206, + "learning_rate": 4.0585291265103985e-06, + "loss": 0.1548, + "step": 2845 + }, + { + "epoch": 0.9222294232015554, + "grad_norm": 0.8263746500015259, + "learning_rate": 4.057845180421405e-06, + "loss": 0.1441, + "step": 2846 + }, + { + "epoch": 0.922553467271549, + "grad_norm": 0.8179762363433838, + "learning_rate": 4.057161043667243e-06, + "loss": 0.1468, + "step": 2847 + }, + { + "epoch": 0.9228775113415425, + "grad_norm": 0.8503578901290894, + "learning_rate": 4.056476716331643e-06, + "loss": 0.1513, + "step": 2848 + }, + { + "epoch": 0.9232015554115359, + "grad_norm": 0.8281416893005371, + "learning_rate": 4.05579219849836e-06, + "loss": 0.1535, + "step": 2849 + }, + { + "epoch": 0.9235255994815295, + "grad_norm": 0.8391620516777039, + "learning_rate": 4.055107490251175e-06, + "loss": 0.1475, + "step": 2850 + }, + { + "epoch": 0.923849643551523, + "grad_norm": 0.805622935295105, + "learning_rate": 4.054422591673887e-06, + "loss": 0.1424, + "step": 2851 + }, + { + "epoch": 0.9241736876215165, + "grad_norm": 0.8314092755317688, + "learning_rate": 4.0537375028503225e-06, + "loss": 0.1421, + "step": 2852 + }, + { + "epoch": 0.9244977316915101, + "grad_norm": 0.9176031947135925, + "learning_rate": 4.053052223864328e-06, + "loss": 0.152, + "step": 2853 + }, + { + "epoch": 0.9248217757615036, + "grad_norm": 0.8071728944778442, + "learning_rate": 4.052366754799776e-06, + "loss": 0.1525, + "step": 2854 + }, + { + "epoch": 0.9251458198314971, + "grad_norm": 0.8542532920837402, + "learning_rate": 4.051681095740561e-06, + "loss": 0.1671, + "step": 2855 + }, + { + "epoch": 0.9254698639014906, + "grad_norm": 0.8364072442054749, + "learning_rate": 4.050995246770602e-06, + "loss": 0.1663, + "step": 2856 + }, + { + "epoch": 0.9257939079714841, + "grad_norm": 0.8671528100967407, + "learning_rate": 4.05030920797384e-06, + "loss": 0.1529, + "step": 2857 + }, + { + "epoch": 0.9261179520414776, + "grad_norm": 0.8020675778388977, + "learning_rate": 4.049622979434239e-06, + "loss": 0.1581, + "step": 2858 + }, + { + "epoch": 0.9264419961114712, + "grad_norm": 0.8543144464492798, + "learning_rate": 4.0489365612357854e-06, + "loss": 0.1452, + "step": 2859 + }, + { + "epoch": 0.9267660401814647, + "grad_norm": 0.8329032063484192, + "learning_rate": 4.0482499534624934e-06, + "loss": 0.1609, + "step": 2860 + }, + { + "epoch": 0.9270900842514582, + "grad_norm": 0.8056915998458862, + "learning_rate": 4.047563156198394e-06, + "loss": 0.1452, + "step": 2861 + }, + { + "epoch": 0.9274141283214518, + "grad_norm": 0.8419206738471985, + "learning_rate": 4.046876169527547e-06, + "loss": 0.1529, + "step": 2862 + }, + { + "epoch": 0.9277381723914452, + "grad_norm": 0.8076557517051697, + "learning_rate": 4.04618899353403e-06, + "loss": 0.151, + "step": 2863 + }, + { + "epoch": 0.9280622164614387, + "grad_norm": 0.8330927491188049, + "learning_rate": 4.04550162830195e-06, + "loss": 0.1523, + "step": 2864 + }, + { + "epoch": 0.9283862605314323, + "grad_norm": 0.8939826488494873, + "learning_rate": 4.044814073915432e-06, + "loss": 0.1744, + "step": 2865 + }, + { + "epoch": 0.9287103046014258, + "grad_norm": 0.8498444557189941, + "learning_rate": 4.044126330458626e-06, + "loss": 0.1471, + "step": 2866 + }, + { + "epoch": 0.9290343486714193, + "grad_norm": 0.7920177578926086, + "learning_rate": 4.0434383980157055e-06, + "loss": 0.1439, + "step": 2867 + }, + { + "epoch": 0.9293583927414129, + "grad_norm": 0.8161009550094604, + "learning_rate": 4.042750276670867e-06, + "loss": 0.1435, + "step": 2868 + }, + { + "epoch": 0.9296824368114064, + "grad_norm": 0.7840574383735657, + "learning_rate": 4.04206196650833e-06, + "loss": 0.1332, + "step": 2869 + }, + { + "epoch": 0.9300064808813999, + "grad_norm": 0.808705747127533, + "learning_rate": 4.041373467612337e-06, + "loss": 0.149, + "step": 2870 + }, + { + "epoch": 0.9303305249513933, + "grad_norm": 0.7589499354362488, + "learning_rate": 4.0406847800671515e-06, + "loss": 0.1438, + "step": 2871 + }, + { + "epoch": 0.9306545690213869, + "grad_norm": 0.9110842943191528, + "learning_rate": 4.0399959039570646e-06, + "loss": 0.172, + "step": 2872 + }, + { + "epoch": 0.9309786130913804, + "grad_norm": 0.802574634552002, + "learning_rate": 4.039306839366387e-06, + "loss": 0.145, + "step": 2873 + }, + { + "epoch": 0.931302657161374, + "grad_norm": 0.8501918315887451, + "learning_rate": 4.038617586379455e-06, + "loss": 0.1559, + "step": 2874 + }, + { + "epoch": 0.9316267012313675, + "grad_norm": 0.8628923296928406, + "learning_rate": 4.0379281450806255e-06, + "loss": 0.1472, + "step": 2875 + }, + { + "epoch": 0.931950745301361, + "grad_norm": 0.8358166813850403, + "learning_rate": 4.037238515554278e-06, + "loss": 0.1493, + "step": 2876 + }, + { + "epoch": 0.9322747893713546, + "grad_norm": 0.7423512935638428, + "learning_rate": 4.0365486978848176e-06, + "loss": 0.1399, + "step": 2877 + }, + { + "epoch": 0.932598833441348, + "grad_norm": 0.7642088532447815, + "learning_rate": 4.035858692156673e-06, + "loss": 0.1341, + "step": 2878 + }, + { + "epoch": 0.9329228775113415, + "grad_norm": 0.8165916800498962, + "learning_rate": 4.035168498454292e-06, + "loss": 0.142, + "step": 2879 + }, + { + "epoch": 0.933246921581335, + "grad_norm": 0.8518221974372864, + "learning_rate": 4.034478116862149e-06, + "loss": 0.1422, + "step": 2880 + }, + { + "epoch": 0.9335709656513286, + "grad_norm": 0.7966205477714539, + "learning_rate": 4.033787547464738e-06, + "loss": 0.1502, + "step": 2881 + }, + { + "epoch": 0.9338950097213221, + "grad_norm": 0.7498399019241333, + "learning_rate": 4.033096790346581e-06, + "loss": 0.1329, + "step": 2882 + }, + { + "epoch": 0.9342190537913156, + "grad_norm": 0.8181966543197632, + "learning_rate": 4.032405845592218e-06, + "loss": 0.1494, + "step": 2883 + }, + { + "epoch": 0.9345430978613092, + "grad_norm": 0.8142485618591309, + "learning_rate": 4.0317147132862135e-06, + "loss": 0.1386, + "step": 2884 + }, + { + "epoch": 0.9348671419313026, + "grad_norm": 0.8145033717155457, + "learning_rate": 4.031023393513157e-06, + "loss": 0.139, + "step": 2885 + }, + { + "epoch": 0.9351911860012961, + "grad_norm": 0.8427969217300415, + "learning_rate": 4.030331886357659e-06, + "loss": 0.1607, + "step": 2886 + }, + { + "epoch": 0.9355152300712897, + "grad_norm": 0.7301238179206848, + "learning_rate": 4.029640191904352e-06, + "loss": 0.1297, + "step": 2887 + }, + { + "epoch": 0.9358392741412832, + "grad_norm": 0.8938672542572021, + "learning_rate": 4.028948310237893e-06, + "loss": 0.1588, + "step": 2888 + }, + { + "epoch": 0.9361633182112767, + "grad_norm": 0.8654724955558777, + "learning_rate": 4.0282562414429635e-06, + "loss": 0.1537, + "step": 2889 + }, + { + "epoch": 0.9364873622812703, + "grad_norm": 0.8152555823326111, + "learning_rate": 4.027563985604264e-06, + "loss": 0.1551, + "step": 2890 + }, + { + "epoch": 0.9368114063512638, + "grad_norm": 0.8133431077003479, + "learning_rate": 4.026871542806521e-06, + "loss": 0.1523, + "step": 2891 + }, + { + "epoch": 0.9371354504212573, + "grad_norm": 0.9307568073272705, + "learning_rate": 4.026178913134482e-06, + "loss": 0.1737, + "step": 2892 + }, + { + "epoch": 0.9374594944912508, + "grad_norm": 0.8865219950675964, + "learning_rate": 4.02548609667292e-06, + "loss": 0.1419, + "step": 2893 + }, + { + "epoch": 0.9377835385612443, + "grad_norm": 0.8078862428665161, + "learning_rate": 4.024793093506626e-06, + "loss": 0.1528, + "step": 2894 + }, + { + "epoch": 0.9381075826312378, + "grad_norm": 0.7754785418510437, + "learning_rate": 4.024099903720419e-06, + "loss": 0.1438, + "step": 2895 + }, + { + "epoch": 0.9384316267012314, + "grad_norm": 0.7733836770057678, + "learning_rate": 4.023406527399137e-06, + "loss": 0.1337, + "step": 2896 + }, + { + "epoch": 0.9387556707712249, + "grad_norm": 0.795275092124939, + "learning_rate": 4.022712964627645e-06, + "loss": 0.1536, + "step": 2897 + }, + { + "epoch": 0.9390797148412184, + "grad_norm": 0.8315867781639099, + "learning_rate": 4.022019215490827e-06, + "loss": 0.1476, + "step": 2898 + }, + { + "epoch": 0.939403758911212, + "grad_norm": 0.8151440024375916, + "learning_rate": 4.021325280073592e-06, + "loss": 0.1532, + "step": 2899 + }, + { + "epoch": 0.9397278029812054, + "grad_norm": 0.8618231415748596, + "learning_rate": 4.0206311584608705e-06, + "loss": 0.1506, + "step": 2900 + }, + { + "epoch": 0.9400518470511989, + "grad_norm": 0.84187912940979, + "learning_rate": 4.019936850737615e-06, + "loss": 0.1488, + "step": 2901 + }, + { + "epoch": 0.9403758911211925, + "grad_norm": 0.8414024710655212, + "learning_rate": 4.019242356988803e-06, + "loss": 0.1523, + "step": 2902 + }, + { + "epoch": 0.940699935191186, + "grad_norm": 0.8652313947677612, + "learning_rate": 4.018547677299434e-06, + "loss": 0.1571, + "step": 2903 + }, + { + "epoch": 0.9410239792611795, + "grad_norm": 0.9478034973144531, + "learning_rate": 4.01785281175453e-06, + "loss": 0.1612, + "step": 2904 + }, + { + "epoch": 0.9413480233311731, + "grad_norm": 0.8561654686927795, + "learning_rate": 4.017157760439136e-06, + "loss": 0.1425, + "step": 2905 + }, + { + "epoch": 0.9416720674011666, + "grad_norm": 0.840130627155304, + "learning_rate": 4.01646252343832e-06, + "loss": 0.1546, + "step": 2906 + }, + { + "epoch": 0.94199611147116, + "grad_norm": 0.7839550971984863, + "learning_rate": 4.015767100837171e-06, + "loss": 0.1372, + "step": 2907 + }, + { + "epoch": 0.9423201555411536, + "grad_norm": 0.8164642453193665, + "learning_rate": 4.015071492720802e-06, + "loss": 0.1482, + "step": 2908 + }, + { + "epoch": 0.9426441996111471, + "grad_norm": 0.900124728679657, + "learning_rate": 4.014375699174351e-06, + "loss": 0.1543, + "step": 2909 + }, + { + "epoch": 0.9429682436811406, + "grad_norm": 0.8421492576599121, + "learning_rate": 4.013679720282973e-06, + "loss": 0.1596, + "step": 2910 + }, + { + "epoch": 0.9432922877511342, + "grad_norm": 0.8553417921066284, + "learning_rate": 4.012983556131852e-06, + "loss": 0.1473, + "step": 2911 + }, + { + "epoch": 0.9436163318211277, + "grad_norm": 0.9092195630073547, + "learning_rate": 4.01228720680619e-06, + "loss": 0.149, + "step": 2912 + }, + { + "epoch": 0.9439403758911212, + "grad_norm": 0.8383063673973083, + "learning_rate": 4.011590672391213e-06, + "loss": 0.1466, + "step": 2913 + }, + { + "epoch": 0.9442644199611148, + "grad_norm": 0.8686926364898682, + "learning_rate": 4.010893952972173e-06, + "loss": 0.1468, + "step": 2914 + }, + { + "epoch": 0.9445884640311082, + "grad_norm": 0.844262957572937, + "learning_rate": 4.010197048634338e-06, + "loss": 0.1513, + "step": 2915 + }, + { + "epoch": 0.9449125081011017, + "grad_norm": 0.7631595730781555, + "learning_rate": 4.009499959463005e-06, + "loss": 0.1396, + "step": 2916 + }, + { + "epoch": 0.9452365521710953, + "grad_norm": 0.8633599877357483, + "learning_rate": 4.00880268554349e-06, + "loss": 0.1629, + "step": 2917 + }, + { + "epoch": 0.9455605962410888, + "grad_norm": 0.8235397338867188, + "learning_rate": 4.008105226961132e-06, + "loss": 0.1392, + "step": 2918 + }, + { + "epoch": 0.9458846403110823, + "grad_norm": 0.747604489326477, + "learning_rate": 4.007407583801295e-06, + "loss": 0.1274, + "step": 2919 + }, + { + "epoch": 0.9462086843810759, + "grad_norm": 0.8771180510520935, + "learning_rate": 4.006709756149362e-06, + "loss": 0.1656, + "step": 2920 + }, + { + "epoch": 0.9465327284510694, + "grad_norm": 0.8580746650695801, + "learning_rate": 4.006011744090741e-06, + "loss": 0.1534, + "step": 2921 + }, + { + "epoch": 0.9468567725210628, + "grad_norm": 0.8321346044540405, + "learning_rate": 4.005313547710861e-06, + "loss": 0.1457, + "step": 2922 + }, + { + "epoch": 0.9471808165910564, + "grad_norm": 0.7928733825683594, + "learning_rate": 4.004615167095176e-06, + "loss": 0.1487, + "step": 2923 + }, + { + "epoch": 0.9475048606610499, + "grad_norm": 0.914631724357605, + "learning_rate": 4.003916602329161e-06, + "loss": 0.1602, + "step": 2924 + }, + { + "epoch": 0.9478289047310434, + "grad_norm": 0.8683075904846191, + "learning_rate": 4.0032178534983115e-06, + "loss": 0.1569, + "step": 2925 + }, + { + "epoch": 0.948152948801037, + "grad_norm": 0.817596435546875, + "learning_rate": 4.00251892068815e-06, + "loss": 0.1536, + "step": 2926 + }, + { + "epoch": 0.9484769928710305, + "grad_norm": 0.8051871657371521, + "learning_rate": 4.001819803984218e-06, + "loss": 0.1396, + "step": 2927 + }, + { + "epoch": 0.948801036941024, + "grad_norm": 0.8614630103111267, + "learning_rate": 4.00112050347208e-06, + "loss": 0.1549, + "step": 2928 + }, + { + "epoch": 0.9491250810110174, + "grad_norm": 0.9327095150947571, + "learning_rate": 4.000421019237326e-06, + "loss": 0.1754, + "step": 2929 + }, + { + "epoch": 0.949449125081011, + "grad_norm": 0.8932909369468689, + "learning_rate": 3.999721351365563e-06, + "loss": 0.1588, + "step": 2930 + }, + { + "epoch": 0.9497731691510045, + "grad_norm": 0.7977306246757507, + "learning_rate": 3.999021499942425e-06, + "loss": 0.1565, + "step": 2931 + }, + { + "epoch": 0.950097213220998, + "grad_norm": 0.800845205783844, + "learning_rate": 3.998321465053568e-06, + "loss": 0.1423, + "step": 2932 + }, + { + "epoch": 0.9504212572909916, + "grad_norm": 0.7385462522506714, + "learning_rate": 3.9976212467846674e-06, + "loss": 0.1341, + "step": 2933 + }, + { + "epoch": 0.9507453013609851, + "grad_norm": 0.8102892637252808, + "learning_rate": 3.996920845221425e-06, + "loss": 0.1369, + "step": 2934 + }, + { + "epoch": 0.9510693454309787, + "grad_norm": 0.8875524401664734, + "learning_rate": 3.996220260449563e-06, + "loss": 0.1652, + "step": 2935 + }, + { + "epoch": 0.9513933895009722, + "grad_norm": 0.7795872092247009, + "learning_rate": 3.9955194925548245e-06, + "loss": 0.1426, + "step": 2936 + }, + { + "epoch": 0.9517174335709656, + "grad_norm": 0.9427581429481506, + "learning_rate": 3.994818541622979e-06, + "loss": 0.1694, + "step": 2937 + }, + { + "epoch": 0.9520414776409591, + "grad_norm": 0.8169898986816406, + "learning_rate": 3.994117407739814e-06, + "loss": 0.1452, + "step": 2938 + }, + { + "epoch": 0.9523655217109527, + "grad_norm": 0.8433765769004822, + "learning_rate": 3.993416090991143e-06, + "loss": 0.1441, + "step": 2939 + }, + { + "epoch": 0.9526895657809462, + "grad_norm": 0.9561168551445007, + "learning_rate": 3.992714591462799e-06, + "loss": 0.1592, + "step": 2940 + }, + { + "epoch": 0.9530136098509397, + "grad_norm": 0.7818630933761597, + "learning_rate": 3.992012909240641e-06, + "loss": 0.1417, + "step": 2941 + }, + { + "epoch": 0.9533376539209333, + "grad_norm": 0.8289167284965515, + "learning_rate": 3.991311044410546e-06, + "loss": 0.15, + "step": 2942 + }, + { + "epoch": 0.9536616979909268, + "grad_norm": 0.8170865774154663, + "learning_rate": 3.990608997058416e-06, + "loss": 0.1371, + "step": 2943 + }, + { + "epoch": 0.9539857420609202, + "grad_norm": 0.8745966553688049, + "learning_rate": 3.989906767270175e-06, + "loss": 0.1588, + "step": 2944 + }, + { + "epoch": 0.9543097861309138, + "grad_norm": 0.8419452905654907, + "learning_rate": 3.989204355131769e-06, + "loss": 0.1459, + "step": 2945 + }, + { + "epoch": 0.9546338302009073, + "grad_norm": 0.8660749793052673, + "learning_rate": 3.988501760729168e-06, + "loss": 0.1471, + "step": 2946 + }, + { + "epoch": 0.9549578742709008, + "grad_norm": 0.9283533096313477, + "learning_rate": 3.98779898414836e-06, + "loss": 0.1659, + "step": 2947 + }, + { + "epoch": 0.9552819183408944, + "grad_norm": 0.8726081252098083, + "learning_rate": 3.98709602547536e-06, + "loss": 0.1589, + "step": 2948 + }, + { + "epoch": 0.9556059624108879, + "grad_norm": 0.7964595556259155, + "learning_rate": 3.986392884796202e-06, + "loss": 0.1459, + "step": 2949 + }, + { + "epoch": 0.9559300064808814, + "grad_norm": 0.8376349210739136, + "learning_rate": 3.9856895621969435e-06, + "loss": 0.1475, + "step": 2950 + }, + { + "epoch": 0.9562540505508749, + "grad_norm": 0.8806761503219604, + "learning_rate": 3.984986057763667e-06, + "loss": 0.1524, + "step": 2951 + }, + { + "epoch": 0.9565780946208684, + "grad_norm": 0.9261873960494995, + "learning_rate": 3.984282371582472e-06, + "loss": 0.133, + "step": 2952 + }, + { + "epoch": 0.9569021386908619, + "grad_norm": 0.8796440958976746, + "learning_rate": 3.983578503739483e-06, + "loss": 0.151, + "step": 2953 + }, + { + "epoch": 0.9572261827608555, + "grad_norm": 0.8551722168922424, + "learning_rate": 3.982874454320849e-06, + "loss": 0.1509, + "step": 2954 + }, + { + "epoch": 0.957550226830849, + "grad_norm": 0.932380199432373, + "learning_rate": 3.982170223412735e-06, + "loss": 0.1656, + "step": 2955 + }, + { + "epoch": 0.9578742709008425, + "grad_norm": 0.784862756729126, + "learning_rate": 3.981465811101335e-06, + "loss": 0.1379, + "step": 2956 + }, + { + "epoch": 0.9581983149708361, + "grad_norm": 0.9122695922851562, + "learning_rate": 3.9807612174728615e-06, + "loss": 0.1561, + "step": 2957 + }, + { + "epoch": 0.9585223590408296, + "grad_norm": 0.8077194094657898, + "learning_rate": 3.98005644261355e-06, + "loss": 0.15, + "step": 2958 + }, + { + "epoch": 0.958846403110823, + "grad_norm": 0.8609473705291748, + "learning_rate": 3.979351486609659e-06, + "loss": 0.1593, + "step": 2959 + }, + { + "epoch": 0.9591704471808166, + "grad_norm": 0.7915871739387512, + "learning_rate": 3.978646349547466e-06, + "loss": 0.1534, + "step": 2960 + }, + { + "epoch": 0.9594944912508101, + "grad_norm": 0.8479771614074707, + "learning_rate": 3.977941031513275e-06, + "loss": 0.1421, + "step": 2961 + }, + { + "epoch": 0.9598185353208036, + "grad_norm": 0.8406703472137451, + "learning_rate": 3.977235532593408e-06, + "loss": 0.1376, + "step": 2962 + }, + { + "epoch": 0.9601425793907972, + "grad_norm": 0.9173598289489746, + "learning_rate": 3.976529852874214e-06, + "loss": 0.1566, + "step": 2963 + }, + { + "epoch": 0.9604666234607907, + "grad_norm": 0.8462458252906799, + "learning_rate": 3.975823992442058e-06, + "loss": 0.1556, + "step": 2964 + }, + { + "epoch": 0.9607906675307842, + "grad_norm": 0.8729339241981506, + "learning_rate": 3.975117951383334e-06, + "loss": 0.1509, + "step": 2965 + }, + { + "epoch": 0.9611147116007777, + "grad_norm": 0.9013268351554871, + "learning_rate": 3.974411729784453e-06, + "loss": 0.1626, + "step": 2966 + }, + { + "epoch": 0.9614387556707712, + "grad_norm": 0.7783372402191162, + "learning_rate": 3.973705327731849e-06, + "loss": 0.144, + "step": 2967 + }, + { + "epoch": 0.9617627997407647, + "grad_norm": 0.7804936766624451, + "learning_rate": 3.97299874531198e-06, + "loss": 0.1509, + "step": 2968 + }, + { + "epoch": 0.9620868438107583, + "grad_norm": 1.2105624675750732, + "learning_rate": 3.972291982611325e-06, + "loss": 0.2081, + "step": 2969 + }, + { + "epoch": 0.9624108878807518, + "grad_norm": 0.8226312398910522, + "learning_rate": 3.971585039716382e-06, + "loss": 0.1561, + "step": 2970 + }, + { + "epoch": 0.9627349319507453, + "grad_norm": 0.8245861530303955, + "learning_rate": 3.970877916713678e-06, + "loss": 0.1532, + "step": 2971 + }, + { + "epoch": 0.9630589760207389, + "grad_norm": 0.8438794016838074, + "learning_rate": 3.9701706136897564e-06, + "loss": 0.1597, + "step": 2972 + }, + { + "epoch": 0.9633830200907323, + "grad_norm": 0.8172274231910706, + "learning_rate": 3.969463130731183e-06, + "loss": 0.1548, + "step": 2973 + }, + { + "epoch": 0.9637070641607258, + "grad_norm": 0.8268179893493652, + "learning_rate": 3.968755467924549e-06, + "loss": 0.1507, + "step": 2974 + }, + { + "epoch": 0.9640311082307194, + "grad_norm": 0.7694240212440491, + "learning_rate": 3.968047625356463e-06, + "loss": 0.136, + "step": 2975 + }, + { + "epoch": 0.9643551523007129, + "grad_norm": 0.8401308059692383, + "learning_rate": 3.96733960311356e-06, + "loss": 0.1548, + "step": 2976 + }, + { + "epoch": 0.9646791963707064, + "grad_norm": 0.8167865872383118, + "learning_rate": 3.966631401282495e-06, + "loss": 0.1379, + "step": 2977 + }, + { + "epoch": 0.9650032404407, + "grad_norm": 0.8723135590553284, + "learning_rate": 3.965923019949944e-06, + "loss": 0.1586, + "step": 2978 + }, + { + "epoch": 0.9653272845106935, + "grad_norm": 0.8042678833007812, + "learning_rate": 3.965214459202607e-06, + "loss": 0.146, + "step": 2979 + }, + { + "epoch": 0.9656513285806869, + "grad_norm": 0.8387596607208252, + "learning_rate": 3.964505719127205e-06, + "loss": 0.1541, + "step": 2980 + }, + { + "epoch": 0.9659753726506805, + "grad_norm": 0.8149929642677307, + "learning_rate": 3.963796799810479e-06, + "loss": 0.1546, + "step": 2981 + }, + { + "epoch": 0.966299416720674, + "grad_norm": 0.8523327708244324, + "learning_rate": 3.9630877013391964e-06, + "loss": 0.164, + "step": 2982 + }, + { + "epoch": 0.9666234607906675, + "grad_norm": 0.7993361353874207, + "learning_rate": 3.962378423800143e-06, + "loss": 0.1415, + "step": 2983 + }, + { + "epoch": 0.9669475048606611, + "grad_norm": 0.7707695364952087, + "learning_rate": 3.961668967280128e-06, + "loss": 0.1429, + "step": 2984 + }, + { + "epoch": 0.9672715489306546, + "grad_norm": 0.7808859348297119, + "learning_rate": 3.96095933186598e-06, + "loss": 0.1367, + "step": 2985 + }, + { + "epoch": 0.9675955930006481, + "grad_norm": 0.8761918544769287, + "learning_rate": 3.960249517644553e-06, + "loss": 0.1578, + "step": 2986 + }, + { + "epoch": 0.9679196370706417, + "grad_norm": 0.8220571279525757, + "learning_rate": 3.959539524702722e-06, + "loss": 0.1414, + "step": 2987 + }, + { + "epoch": 0.9682436811406351, + "grad_norm": 0.8590094447135925, + "learning_rate": 3.958829353127383e-06, + "loss": 0.1556, + "step": 2988 + }, + { + "epoch": 0.9685677252106286, + "grad_norm": 0.8472662568092346, + "learning_rate": 3.958119003005453e-06, + "loss": 0.1653, + "step": 2989 + }, + { + "epoch": 0.9688917692806222, + "grad_norm": 0.745740532875061, + "learning_rate": 3.9574084744238735e-06, + "loss": 0.1381, + "step": 2990 + }, + { + "epoch": 0.9692158133506157, + "grad_norm": 0.8592861890792847, + "learning_rate": 3.956697767469606e-06, + "loss": 0.1637, + "step": 2991 + }, + { + "epoch": 0.9695398574206092, + "grad_norm": 0.8486888408660889, + "learning_rate": 3.955986882229632e-06, + "loss": 0.1527, + "step": 2992 + }, + { + "epoch": 0.9698639014906028, + "grad_norm": 0.9307520985603333, + "learning_rate": 3.95527581879096e-06, + "loss": 0.1672, + "step": 2993 + }, + { + "epoch": 0.9701879455605963, + "grad_norm": 0.8392777442932129, + "learning_rate": 3.954564577240615e-06, + "loss": 0.139, + "step": 2994 + }, + { + "epoch": 0.9705119896305897, + "grad_norm": 0.8226277828216553, + "learning_rate": 3.9538531576656465e-06, + "loss": 0.1563, + "step": 2995 + }, + { + "epoch": 0.9708360337005832, + "grad_norm": 0.8464024662971497, + "learning_rate": 3.953141560153128e-06, + "loss": 0.16, + "step": 2996 + }, + { + "epoch": 0.9711600777705768, + "grad_norm": 0.9020957350730896, + "learning_rate": 3.952429784790148e-06, + "loss": 0.1497, + "step": 2997 + }, + { + "epoch": 0.9714841218405703, + "grad_norm": 0.8413184881210327, + "learning_rate": 3.951717831663825e-06, + "loss": 0.1578, + "step": 2998 + }, + { + "epoch": 0.9718081659105638, + "grad_norm": 0.784361720085144, + "learning_rate": 3.951005700861291e-06, + "loss": 0.1408, + "step": 2999 + }, + { + "epoch": 0.9721322099805574, + "grad_norm": 0.7845505475997925, + "learning_rate": 3.9502933924697076e-06, + "loss": 0.1428, + "step": 3000 + }, + { + "epoch": 0.9724562540505509, + "grad_norm": 0.8205187320709229, + "learning_rate": 3.949580906576252e-06, + "loss": 0.1572, + "step": 3001 + }, + { + "epoch": 0.9727802981205443, + "grad_norm": 0.8463999032974243, + "learning_rate": 3.948868243268127e-06, + "loss": 0.1402, + "step": 3002 + }, + { + "epoch": 0.9731043421905379, + "grad_norm": 0.8993105292320251, + "learning_rate": 3.948155402632554e-06, + "loss": 0.1724, + "step": 3003 + }, + { + "epoch": 0.9734283862605314, + "grad_norm": 0.7958623170852661, + "learning_rate": 3.94744238475678e-06, + "loss": 0.1367, + "step": 3004 + }, + { + "epoch": 0.9737524303305249, + "grad_norm": 0.8215851783752441, + "learning_rate": 3.94672918972807e-06, + "loss": 0.1452, + "step": 3005 + }, + { + "epoch": 0.9740764744005185, + "grad_norm": 0.75538569688797, + "learning_rate": 3.946015817633714e-06, + "loss": 0.1349, + "step": 3006 + }, + { + "epoch": 0.974400518470512, + "grad_norm": 0.749919593334198, + "learning_rate": 3.945302268561019e-06, + "loss": 0.1394, + "step": 3007 + }, + { + "epoch": 0.9747245625405055, + "grad_norm": 0.8285024166107178, + "learning_rate": 3.944588542597319e-06, + "loss": 0.1553, + "step": 3008 + }, + { + "epoch": 0.9750486066104991, + "grad_norm": 0.8027199506759644, + "learning_rate": 3.943874639829964e-06, + "loss": 0.1453, + "step": 3009 + }, + { + "epoch": 0.9753726506804925, + "grad_norm": 0.7956819534301758, + "learning_rate": 3.943160560346332e-06, + "loss": 0.1393, + "step": 3010 + }, + { + "epoch": 0.975696694750486, + "grad_norm": 0.8369397521018982, + "learning_rate": 3.942446304233819e-06, + "loss": 0.1461, + "step": 3011 + }, + { + "epoch": 0.9760207388204796, + "grad_norm": 0.8727890253067017, + "learning_rate": 3.941731871579842e-06, + "loss": 0.1707, + "step": 3012 + }, + { + "epoch": 0.9763447828904731, + "grad_norm": 0.846352756023407, + "learning_rate": 3.94101726247184e-06, + "loss": 0.1551, + "step": 3013 + }, + { + "epoch": 0.9766688269604666, + "grad_norm": 0.7778693437576294, + "learning_rate": 3.9403024769972766e-06, + "loss": 0.1372, + "step": 3014 + }, + { + "epoch": 0.9769928710304602, + "grad_norm": 0.8392274975776672, + "learning_rate": 3.939587515243632e-06, + "loss": 0.1581, + "step": 3015 + }, + { + "epoch": 0.9773169151004537, + "grad_norm": 0.8242346048355103, + "learning_rate": 3.938872377298413e-06, + "loss": 0.1449, + "step": 3016 + }, + { + "epoch": 0.9776409591704471, + "grad_norm": 0.8143014907836914, + "learning_rate": 3.938157063249144e-06, + "loss": 0.1478, + "step": 3017 + }, + { + "epoch": 0.9779650032404407, + "grad_norm": 0.8591554760932922, + "learning_rate": 3.937441573183373e-06, + "loss": 0.1429, + "step": 3018 + }, + { + "epoch": 0.9782890473104342, + "grad_norm": 0.8570237159729004, + "learning_rate": 3.936725907188668e-06, + "loss": 0.1607, + "step": 3019 + }, + { + "epoch": 0.9786130913804277, + "grad_norm": 0.8121863007545471, + "learning_rate": 3.936010065352622e-06, + "loss": 0.1423, + "step": 3020 + }, + { + "epoch": 0.9789371354504213, + "grad_norm": 0.8878906965255737, + "learning_rate": 3.935294047762844e-06, + "loss": 0.1491, + "step": 3021 + }, + { + "epoch": 0.9792611795204148, + "grad_norm": 0.8019052147865295, + "learning_rate": 3.93457785450697e-06, + "loss": 0.1525, + "step": 3022 + }, + { + "epoch": 0.9795852235904083, + "grad_norm": 0.8467800617218018, + "learning_rate": 3.933861485672656e-06, + "loss": 0.1611, + "step": 3023 + }, + { + "epoch": 0.9799092676604018, + "grad_norm": 0.816117525100708, + "learning_rate": 3.933144941347574e-06, + "loss": 0.1548, + "step": 3024 + }, + { + "epoch": 0.9802333117303953, + "grad_norm": 0.8132639527320862, + "learning_rate": 3.932428221619427e-06, + "loss": 0.1482, + "step": 3025 + }, + { + "epoch": 0.9805573558003888, + "grad_norm": 0.845690906047821, + "learning_rate": 3.931711326575933e-06, + "loss": 0.1483, + "step": 3026 + }, + { + "epoch": 0.9808813998703824, + "grad_norm": 0.8532651662826538, + "learning_rate": 3.9309942563048315e-06, + "loss": 0.1628, + "step": 3027 + }, + { + "epoch": 0.9812054439403759, + "grad_norm": 0.8248721957206726, + "learning_rate": 3.930277010893887e-06, + "loss": 0.1588, + "step": 3028 + }, + { + "epoch": 0.9815294880103694, + "grad_norm": 0.8405593037605286, + "learning_rate": 3.929559590430881e-06, + "loss": 0.1566, + "step": 3029 + }, + { + "epoch": 0.981853532080363, + "grad_norm": 0.6945734620094299, + "learning_rate": 3.928841995003622e-06, + "loss": 0.1208, + "step": 3030 + }, + { + "epoch": 0.9821775761503565, + "grad_norm": 0.8376073241233826, + "learning_rate": 3.928124224699935e-06, + "loss": 0.1516, + "step": 3031 + }, + { + "epoch": 0.9825016202203499, + "grad_norm": 0.855110228061676, + "learning_rate": 3.927406279607668e-06, + "loss": 0.1549, + "step": 3032 + }, + { + "epoch": 0.9828256642903435, + "grad_norm": 0.900316596031189, + "learning_rate": 3.92668815981469e-06, + "loss": 0.164, + "step": 3033 + }, + { + "epoch": 0.983149708360337, + "grad_norm": 0.8319350481033325, + "learning_rate": 3.925969865408893e-06, + "loss": 0.1537, + "step": 3034 + }, + { + "epoch": 0.9834737524303305, + "grad_norm": 0.8483204245567322, + "learning_rate": 3.925251396478189e-06, + "loss": 0.1537, + "step": 3035 + }, + { + "epoch": 0.9837977965003241, + "grad_norm": 0.8711523413658142, + "learning_rate": 3.9245327531105115e-06, + "loss": 0.1462, + "step": 3036 + }, + { + "epoch": 0.9841218405703176, + "grad_norm": 0.7892616987228394, + "learning_rate": 3.923813935393816e-06, + "loss": 0.1363, + "step": 3037 + }, + { + "epoch": 0.9844458846403111, + "grad_norm": 0.8650854229927063, + "learning_rate": 3.923094943416078e-06, + "loss": 0.1653, + "step": 3038 + }, + { + "epoch": 0.9847699287103046, + "grad_norm": 0.8720736503601074, + "learning_rate": 3.922375777265296e-06, + "loss": 0.159, + "step": 3039 + }, + { + "epoch": 0.9850939727802981, + "grad_norm": 0.8047921061515808, + "learning_rate": 3.921656437029488e-06, + "loss": 0.1389, + "step": 3040 + }, + { + "epoch": 0.9854180168502916, + "grad_norm": 0.8041824102401733, + "learning_rate": 3.9209369227966945e-06, + "loss": 0.1431, + "step": 3041 + }, + { + "epoch": 0.9857420609202852, + "grad_norm": 0.8418930768966675, + "learning_rate": 3.920217234654978e-06, + "loss": 0.1605, + "step": 3042 + }, + { + "epoch": 0.9860661049902787, + "grad_norm": 0.7825391888618469, + "learning_rate": 3.919497372692421e-06, + "loss": 0.1443, + "step": 3043 + }, + { + "epoch": 0.9863901490602722, + "grad_norm": 0.704852819442749, + "learning_rate": 3.918777336997127e-06, + "loss": 0.1255, + "step": 3044 + }, + { + "epoch": 0.9867141931302658, + "grad_norm": 0.8802457451820374, + "learning_rate": 3.918057127657222e-06, + "loss": 0.1746, + "step": 3045 + }, + { + "epoch": 0.9870382372002592, + "grad_norm": 0.9226857423782349, + "learning_rate": 3.9173367447608525e-06, + "loss": 0.1442, + "step": 3046 + }, + { + "epoch": 0.9873622812702527, + "grad_norm": 0.8810186386108398, + "learning_rate": 3.916616188396185e-06, + "loss": 0.1583, + "step": 3047 + }, + { + "epoch": 0.9876863253402463, + "grad_norm": 0.8760223388671875, + "learning_rate": 3.915895458651411e-06, + "loss": 0.1478, + "step": 3048 + }, + { + "epoch": 0.9880103694102398, + "grad_norm": 0.8516616225242615, + "learning_rate": 3.9151745556147404e-06, + "loss": 0.151, + "step": 3049 + }, + { + "epoch": 0.9883344134802333, + "grad_norm": 0.8152266144752502, + "learning_rate": 3.914453479374403e-06, + "loss": 0.1543, + "step": 3050 + }, + { + "epoch": 0.9886584575502269, + "grad_norm": 0.7873851656913757, + "learning_rate": 3.913732230018654e-06, + "loss": 0.1565, + "step": 3051 + }, + { + "epoch": 0.9889825016202204, + "grad_norm": 0.7410478591918945, + "learning_rate": 3.913010807635765e-06, + "loss": 0.1332, + "step": 3052 + }, + { + "epoch": 0.9893065456902139, + "grad_norm": 0.8501771092414856, + "learning_rate": 3.9122892123140324e-06, + "loss": 0.1471, + "step": 3053 + }, + { + "epoch": 0.9896305897602073, + "grad_norm": 0.8704087734222412, + "learning_rate": 3.911567444141771e-06, + "loss": 0.1569, + "step": 3054 + }, + { + "epoch": 0.9899546338302009, + "grad_norm": 0.826313316822052, + "learning_rate": 3.910845503207322e-06, + "loss": 0.1441, + "step": 3055 + }, + { + "epoch": 0.9902786779001944, + "grad_norm": 0.7761698961257935, + "learning_rate": 3.9101233895990396e-06, + "loss": 0.1338, + "step": 3056 + }, + { + "epoch": 0.990602721970188, + "grad_norm": 0.7883891463279724, + "learning_rate": 3.909401103405307e-06, + "loss": 0.1445, + "step": 3057 + }, + { + "epoch": 0.9909267660401815, + "grad_norm": 0.8291919231414795, + "learning_rate": 3.908678644714522e-06, + "loss": 0.1392, + "step": 3058 + }, + { + "epoch": 0.991250810110175, + "grad_norm": 0.8155975937843323, + "learning_rate": 3.907956013615108e-06, + "loss": 0.1501, + "step": 3059 + }, + { + "epoch": 0.9915748541801686, + "grad_norm": 0.7870499491691589, + "learning_rate": 3.907233210195508e-06, + "loss": 0.1436, + "step": 3060 + }, + { + "epoch": 0.991898898250162, + "grad_norm": 0.8479496836662292, + "learning_rate": 3.906510234544186e-06, + "loss": 0.1534, + "step": 3061 + }, + { + "epoch": 0.9922229423201555, + "grad_norm": 0.7778910994529724, + "learning_rate": 3.905787086749628e-06, + "loss": 0.144, + "step": 3062 + }, + { + "epoch": 0.992546986390149, + "grad_norm": 0.7847429513931274, + "learning_rate": 3.90506376690034e-06, + "loss": 0.133, + "step": 3063 + }, + { + "epoch": 0.9928710304601426, + "grad_norm": 0.8366702198982239, + "learning_rate": 3.904340275084848e-06, + "loss": 0.1524, + "step": 3064 + }, + { + "epoch": 0.9931950745301361, + "grad_norm": 0.7599114179611206, + "learning_rate": 3.9036166113917015e-06, + "loss": 0.1384, + "step": 3065 + }, + { + "epoch": 0.9935191186001296, + "grad_norm": 0.7668454051017761, + "learning_rate": 3.90289277590947e-06, + "loss": 0.1434, + "step": 3066 + }, + { + "epoch": 0.9938431626701232, + "grad_norm": 0.7598092555999756, + "learning_rate": 3.902168768726745e-06, + "loss": 0.1231, + "step": 3067 + }, + { + "epoch": 0.9941672067401166, + "grad_norm": 0.8079851865768433, + "learning_rate": 3.9014445899321355e-06, + "loss": 0.1425, + "step": 3068 + }, + { + "epoch": 0.9944912508101101, + "grad_norm": 0.833429217338562, + "learning_rate": 3.900720239614275e-06, + "loss": 0.1472, + "step": 3069 + }, + { + "epoch": 0.9948152948801037, + "grad_norm": 0.7490724325180054, + "learning_rate": 3.899995717861818e-06, + "loss": 0.1418, + "step": 3070 + }, + { + "epoch": 0.9951393389500972, + "grad_norm": 0.785024106502533, + "learning_rate": 3.899271024763438e-06, + "loss": 0.1423, + "step": 3071 + }, + { + "epoch": 0.9954633830200907, + "grad_norm": 0.7634562849998474, + "learning_rate": 3.89854616040783e-06, + "loss": 0.1292, + "step": 3072 + }, + { + "epoch": 0.9957874270900843, + "grad_norm": 0.8108155131340027, + "learning_rate": 3.897821124883711e-06, + "loss": 0.1467, + "step": 3073 + }, + { + "epoch": 0.9961114711600778, + "grad_norm": 0.9205564856529236, + "learning_rate": 3.897095918279818e-06, + "loss": 0.1715, + "step": 3074 + }, + { + "epoch": 0.9964355152300713, + "grad_norm": 0.8294411897659302, + "learning_rate": 3.896370540684911e-06, + "loss": 0.1392, + "step": 3075 + }, + { + "epoch": 0.9967595593000648, + "grad_norm": 0.827343761920929, + "learning_rate": 3.895644992187767e-06, + "loss": 0.1623, + "step": 3076 + }, + { + "epoch": 0.9970836033700583, + "grad_norm": 0.7698377370834351, + "learning_rate": 3.894919272877187e-06, + "loss": 0.1421, + "step": 3077 + }, + { + "epoch": 0.9974076474400518, + "grad_norm": 0.7445449829101562, + "learning_rate": 3.894193382841991e-06, + "loss": 0.1374, + "step": 3078 + }, + { + "epoch": 0.9977316915100454, + "grad_norm": 0.7607712745666504, + "learning_rate": 3.893467322171022e-06, + "loss": 0.1403, + "step": 3079 + }, + { + "epoch": 0.9980557355800389, + "grad_norm": 0.8286640644073486, + "learning_rate": 3.892741090953143e-06, + "loss": 0.1451, + "step": 3080 + }, + { + "epoch": 0.9983797796500324, + "grad_norm": 0.7811787128448486, + "learning_rate": 3.892014689277238e-06, + "loss": 0.1364, + "step": 3081 + }, + { + "epoch": 0.998703823720026, + "grad_norm": 0.798345148563385, + "learning_rate": 3.891288117232209e-06, + "loss": 0.1535, + "step": 3082 + }, + { + "epoch": 0.9990278677900194, + "grad_norm": 0.8324628472328186, + "learning_rate": 3.890561374906985e-06, + "loss": 0.1366, + "step": 3083 + }, + { + "epoch": 0.9993519118600129, + "grad_norm": 0.7567122578620911, + "learning_rate": 3.889834462390509e-06, + "loss": 0.1342, + "step": 3084 + }, + { + "epoch": 0.9996759559300065, + "grad_norm": 0.8021467328071594, + "learning_rate": 3.889107379771749e-06, + "loss": 0.1404, + "step": 3085 + }, + { + "epoch": 1.0, + "grad_norm": 0.9288654923439026, + "learning_rate": 3.888380127139695e-06, + "loss": 0.1531, + "step": 3086 + }, + { + "epoch": 1.0003240440699934, + "grad_norm": 0.7892737984657288, + "learning_rate": 3.887652704583354e-06, + "loss": 0.1268, + "step": 3087 + }, + { + "epoch": 1.000648088139987, + "grad_norm": 0.722515344619751, + "learning_rate": 3.886925112191754e-06, + "loss": 0.1064, + "step": 3088 + }, + { + "epoch": 1.0009721322099805, + "grad_norm": 0.743994951248169, + "learning_rate": 3.886197350053948e-06, + "loss": 0.1129, + "step": 3089 + }, + { + "epoch": 1.0012961762799741, + "grad_norm": 0.786417543888092, + "learning_rate": 3.885469418259005e-06, + "loss": 0.1188, + "step": 3090 + }, + { + "epoch": 1.0016202203499676, + "grad_norm": 0.7891602516174316, + "learning_rate": 3.8847413168960175e-06, + "loss": 0.1138, + "step": 3091 + }, + { + "epoch": 1.0019442644199612, + "grad_norm": 0.8058298230171204, + "learning_rate": 3.884013046054098e-06, + "loss": 0.1159, + "step": 3092 + }, + { + "epoch": 1.0022683084899546, + "grad_norm": 0.7223353385925293, + "learning_rate": 3.8832846058223814e-06, + "loss": 0.1061, + "step": 3093 + }, + { + "epoch": 1.002592352559948, + "grad_norm": 0.8861434459686279, + "learning_rate": 3.882555996290019e-06, + "loss": 0.1278, + "step": 3094 + }, + { + "epoch": 1.0029163966299417, + "grad_norm": 0.7787957787513733, + "learning_rate": 3.881827217546187e-06, + "loss": 0.1085, + "step": 3095 + }, + { + "epoch": 1.0032404406999351, + "grad_norm": 0.8384479880332947, + "learning_rate": 3.881098269680081e-06, + "loss": 0.1168, + "step": 3096 + }, + { + "epoch": 1.0035644847699288, + "grad_norm": 0.8060561418533325, + "learning_rate": 3.880369152780916e-06, + "loss": 0.1152, + "step": 3097 + }, + { + "epoch": 1.0038885288399222, + "grad_norm": 0.8682321906089783, + "learning_rate": 3.879639866937931e-06, + "loss": 0.1127, + "step": 3098 + }, + { + "epoch": 1.0042125729099158, + "grad_norm": 0.8855298757553101, + "learning_rate": 3.8789104122403815e-06, + "loss": 0.1269, + "step": 3099 + }, + { + "epoch": 1.0045366169799093, + "grad_norm": 0.8725508451461792, + "learning_rate": 3.878180788777546e-06, + "loss": 0.1207, + "step": 3100 + }, + { + "epoch": 1.0048606610499027, + "grad_norm": 0.8179731965065002, + "learning_rate": 3.877450996638725e-06, + "loss": 0.1175, + "step": 3101 + }, + { + "epoch": 1.0051847051198963, + "grad_norm": 0.8003975749015808, + "learning_rate": 3.876721035913236e-06, + "loss": 0.1099, + "step": 3102 + }, + { + "epoch": 1.0055087491898898, + "grad_norm": 0.8058536648750305, + "learning_rate": 3.87599090669042e-06, + "loss": 0.1163, + "step": 3103 + }, + { + "epoch": 1.0058327932598834, + "grad_norm": 0.774759829044342, + "learning_rate": 3.875260609059638e-06, + "loss": 0.1048, + "step": 3104 + }, + { + "epoch": 1.0061568373298768, + "grad_norm": 0.8399088382720947, + "learning_rate": 3.87453014311027e-06, + "loss": 0.121, + "step": 3105 + }, + { + "epoch": 1.0064808813998705, + "grad_norm": 0.8479682803153992, + "learning_rate": 3.87379950893172e-06, + "loss": 0.1118, + "step": 3106 + }, + { + "epoch": 1.0068049254698639, + "grad_norm": 0.8828272819519043, + "learning_rate": 3.8730687066134086e-06, + "loss": 0.132, + "step": 3107 + }, + { + "epoch": 1.0071289695398573, + "grad_norm": 0.7902854084968567, + "learning_rate": 3.8723377362447805e-06, + "loss": 0.1061, + "step": 3108 + }, + { + "epoch": 1.007453013609851, + "grad_norm": 0.8610204458236694, + "learning_rate": 3.871606597915298e-06, + "loss": 0.1216, + "step": 3109 + }, + { + "epoch": 1.0077770576798444, + "grad_norm": 0.8818372488021851, + "learning_rate": 3.870875291714448e-06, + "loss": 0.1242, + "step": 3110 + }, + { + "epoch": 1.008101101749838, + "grad_norm": 0.8453108668327332, + "learning_rate": 3.870143817731732e-06, + "loss": 0.1203, + "step": 3111 + }, + { + "epoch": 1.0084251458198314, + "grad_norm": 0.7680081725120544, + "learning_rate": 3.8694121760566765e-06, + "loss": 0.1126, + "step": 3112 + }, + { + "epoch": 1.008749189889825, + "grad_norm": 0.8433241248130798, + "learning_rate": 3.868680366778828e-06, + "loss": 0.1191, + "step": 3113 + }, + { + "epoch": 1.0090732339598185, + "grad_norm": 0.8732882142066956, + "learning_rate": 3.867948389987752e-06, + "loss": 0.1221, + "step": 3114 + }, + { + "epoch": 1.0093972780298122, + "grad_norm": 0.8452468514442444, + "learning_rate": 3.8672162457730365e-06, + "loss": 0.1186, + "step": 3115 + }, + { + "epoch": 1.0097213220998056, + "grad_norm": 0.7950354814529419, + "learning_rate": 3.866483934224288e-06, + "loss": 0.1136, + "step": 3116 + }, + { + "epoch": 1.010045366169799, + "grad_norm": 0.8962631225585938, + "learning_rate": 3.865751455431134e-06, + "loss": 0.1232, + "step": 3117 + }, + { + "epoch": 1.0103694102397927, + "grad_norm": 0.7999354600906372, + "learning_rate": 3.865018809483224e-06, + "loss": 0.1216, + "step": 3118 + }, + { + "epoch": 1.010693454309786, + "grad_norm": 0.8645737171173096, + "learning_rate": 3.864285996470226e-06, + "loss": 0.1284, + "step": 3119 + }, + { + "epoch": 1.0110174983797797, + "grad_norm": 0.8569557070732117, + "learning_rate": 3.863553016481829e-06, + "loss": 0.1151, + "step": 3120 + }, + { + "epoch": 1.0113415424497731, + "grad_norm": 0.812362015247345, + "learning_rate": 3.862819869607743e-06, + "loss": 0.1143, + "step": 3121 + }, + { + "epoch": 1.0116655865197668, + "grad_norm": 0.8417925834655762, + "learning_rate": 3.862086555937699e-06, + "loss": 0.122, + "step": 3122 + }, + { + "epoch": 1.0119896305897602, + "grad_norm": 0.8303039073944092, + "learning_rate": 3.861353075561446e-06, + "loss": 0.1218, + "step": 3123 + }, + { + "epoch": 1.0123136746597536, + "grad_norm": 0.809136688709259, + "learning_rate": 3.860619428568756e-06, + "loss": 0.1159, + "step": 3124 + }, + { + "epoch": 1.0126377187297473, + "grad_norm": 0.7998272180557251, + "learning_rate": 3.859885615049419e-06, + "loss": 0.1116, + "step": 3125 + }, + { + "epoch": 1.0129617627997407, + "grad_norm": 0.8065343499183655, + "learning_rate": 3.8591516350932476e-06, + "loss": 0.1142, + "step": 3126 + }, + { + "epoch": 1.0132858068697344, + "grad_norm": 0.7445245981216431, + "learning_rate": 3.8584174887900735e-06, + "loss": 0.1069, + "step": 3127 + }, + { + "epoch": 1.0136098509397278, + "grad_norm": 0.8598371744155884, + "learning_rate": 3.8576831762297495e-06, + "loss": 0.1211, + "step": 3128 + }, + { + "epoch": 1.0139338950097214, + "grad_norm": 0.7716249227523804, + "learning_rate": 3.856948697502148e-06, + "loss": 0.1109, + "step": 3129 + }, + { + "epoch": 1.0142579390797148, + "grad_norm": 0.7559572458267212, + "learning_rate": 3.8562140526971625e-06, + "loss": 0.1062, + "step": 3130 + }, + { + "epoch": 1.0145819831497083, + "grad_norm": 0.8070071935653687, + "learning_rate": 3.855479241904705e-06, + "loss": 0.1126, + "step": 3131 + }, + { + "epoch": 1.014906027219702, + "grad_norm": 0.8478087782859802, + "learning_rate": 3.8547442652147115e-06, + "loss": 0.1198, + "step": 3132 + }, + { + "epoch": 1.0152300712896953, + "grad_norm": 0.8656642436981201, + "learning_rate": 3.854009122717135e-06, + "loss": 0.1206, + "step": 3133 + }, + { + "epoch": 1.015554115359689, + "grad_norm": 0.8849382996559143, + "learning_rate": 3.8532738145019484e-06, + "loss": 0.1165, + "step": 3134 + }, + { + "epoch": 1.0158781594296824, + "grad_norm": 0.8415027856826782, + "learning_rate": 3.852538340659149e-06, + "loss": 0.1157, + "step": 3135 + }, + { + "epoch": 1.016202203499676, + "grad_norm": 0.8097540736198425, + "learning_rate": 3.85180270127875e-06, + "loss": 0.1136, + "step": 3136 + }, + { + "epoch": 1.0165262475696695, + "grad_norm": 0.8937448263168335, + "learning_rate": 3.851066896450787e-06, + "loss": 0.1208, + "step": 3137 + }, + { + "epoch": 1.016850291639663, + "grad_norm": 0.7370225191116333, + "learning_rate": 3.850330926265314e-06, + "loss": 0.1026, + "step": 3138 + }, + { + "epoch": 1.0171743357096565, + "grad_norm": 0.8523708581924438, + "learning_rate": 3.849594790812409e-06, + "loss": 0.1166, + "step": 3139 + }, + { + "epoch": 1.01749837977965, + "grad_norm": 0.7796236872673035, + "learning_rate": 3.848858490182167e-06, + "loss": 0.1078, + "step": 3140 + }, + { + "epoch": 1.0178224238496436, + "grad_norm": 0.7496045827865601, + "learning_rate": 3.8481220244647025e-06, + "loss": 0.1047, + "step": 3141 + }, + { + "epoch": 1.018146467919637, + "grad_norm": 0.7842292785644531, + "learning_rate": 3.847385393750154e-06, + "loss": 0.1118, + "step": 3142 + }, + { + "epoch": 1.0184705119896307, + "grad_norm": 0.8823502659797668, + "learning_rate": 3.846648598128677e-06, + "loss": 0.1218, + "step": 3143 + }, + { + "epoch": 1.018794556059624, + "grad_norm": 0.7918041348457336, + "learning_rate": 3.8459116376904475e-06, + "loss": 0.1173, + "step": 3144 + }, + { + "epoch": 1.0191186001296175, + "grad_norm": 0.8599136471748352, + "learning_rate": 3.8451745125256635e-06, + "loss": 0.1154, + "step": 3145 + }, + { + "epoch": 1.0194426441996112, + "grad_norm": 0.8427619338035583, + "learning_rate": 3.8444372227245415e-06, + "loss": 0.1199, + "step": 3146 + }, + { + "epoch": 1.0197666882696046, + "grad_norm": 0.8241166472434998, + "learning_rate": 3.843699768377318e-06, + "loss": 0.1149, + "step": 3147 + }, + { + "epoch": 1.0200907323395982, + "grad_norm": 0.8806132674217224, + "learning_rate": 3.842962149574252e-06, + "loss": 0.1315, + "step": 3148 + }, + { + "epoch": 1.0204147764095917, + "grad_norm": 0.8508759140968323, + "learning_rate": 3.842224366405619e-06, + "loss": 0.1203, + "step": 3149 + }, + { + "epoch": 1.0207388204795853, + "grad_norm": 0.869519829750061, + "learning_rate": 3.841486418961717e-06, + "loss": 0.1294, + "step": 3150 + }, + { + "epoch": 1.0210628645495787, + "grad_norm": 0.7835947275161743, + "learning_rate": 3.840748307332865e-06, + "loss": 0.1105, + "step": 3151 + }, + { + "epoch": 1.0213869086195722, + "grad_norm": 0.8605433702468872, + "learning_rate": 3.840010031609398e-06, + "loss": 0.1303, + "step": 3152 + }, + { + "epoch": 1.0217109526895658, + "grad_norm": 0.7887565493583679, + "learning_rate": 3.8392715918816755e-06, + "loss": 0.1112, + "step": 3153 + }, + { + "epoch": 1.0220349967595592, + "grad_norm": 0.7533249258995056, + "learning_rate": 3.838532988240077e-06, + "loss": 0.1026, + "step": 3154 + }, + { + "epoch": 1.0223590408295529, + "grad_norm": 0.8589524626731873, + "learning_rate": 3.837794220774998e-06, + "loss": 0.1211, + "step": 3155 + }, + { + "epoch": 1.0226830848995463, + "grad_norm": 0.8438388109207153, + "learning_rate": 3.8370552895768565e-06, + "loss": 0.1182, + "step": 3156 + }, + { + "epoch": 1.02300712896954, + "grad_norm": 0.8679496645927429, + "learning_rate": 3.836316194736093e-06, + "loss": 0.124, + "step": 3157 + }, + { + "epoch": 1.0233311730395334, + "grad_norm": 0.9431676268577576, + "learning_rate": 3.835576936343162e-06, + "loss": 0.1277, + "step": 3158 + }, + { + "epoch": 1.023655217109527, + "grad_norm": 0.797709584236145, + "learning_rate": 3.8348375144885445e-06, + "loss": 0.1122, + "step": 3159 + }, + { + "epoch": 1.0239792611795204, + "grad_norm": 0.8156585693359375, + "learning_rate": 3.834097929262737e-06, + "loss": 0.1212, + "step": 3160 + }, + { + "epoch": 1.0243033052495139, + "grad_norm": 0.7179230451583862, + "learning_rate": 3.833358180756258e-06, + "loss": 0.1027, + "step": 3161 + }, + { + "epoch": 1.0246273493195075, + "grad_norm": 0.880363404750824, + "learning_rate": 3.832618269059645e-06, + "loss": 0.1201, + "step": 3162 + }, + { + "epoch": 1.024951393389501, + "grad_norm": 0.826892077922821, + "learning_rate": 3.831878194263458e-06, + "loss": 0.1147, + "step": 3163 + }, + { + "epoch": 1.0252754374594946, + "grad_norm": 0.8258995413780212, + "learning_rate": 3.831137956458272e-06, + "loss": 0.1219, + "step": 3164 + }, + { + "epoch": 1.025599481529488, + "grad_norm": 0.8825317621231079, + "learning_rate": 3.830397555734687e-06, + "loss": 0.1307, + "step": 3165 + }, + { + "epoch": 1.0259235255994816, + "grad_norm": 0.8365684151649475, + "learning_rate": 3.8296569921833214e-06, + "loss": 0.1133, + "step": 3166 + }, + { + "epoch": 1.026247569669475, + "grad_norm": 0.8617843985557556, + "learning_rate": 3.8289162658948114e-06, + "loss": 0.1262, + "step": 3167 + }, + { + "epoch": 1.0265716137394685, + "grad_norm": 0.74489825963974, + "learning_rate": 3.828175376959815e-06, + "loss": 0.1099, + "step": 3168 + }, + { + "epoch": 1.0268956578094621, + "grad_norm": 0.7609895467758179, + "learning_rate": 3.827434325469011e-06, + "loss": 0.1073, + "step": 3169 + }, + { + "epoch": 1.0272197018794555, + "grad_norm": 0.8371672034263611, + "learning_rate": 3.8266931115130955e-06, + "loss": 0.1189, + "step": 3170 + }, + { + "epoch": 1.0275437459494492, + "grad_norm": 0.847689688205719, + "learning_rate": 3.8259517351827866e-06, + "loss": 0.1252, + "step": 3171 + }, + { + "epoch": 1.0278677900194426, + "grad_norm": 0.8477571606636047, + "learning_rate": 3.825210196568823e-06, + "loss": 0.1165, + "step": 3172 + }, + { + "epoch": 1.0281918340894363, + "grad_norm": 0.7831157445907593, + "learning_rate": 3.824468495761958e-06, + "loss": 0.1125, + "step": 3173 + }, + { + "epoch": 1.0285158781594297, + "grad_norm": 0.8094596266746521, + "learning_rate": 3.823726632852972e-06, + "loss": 0.1223, + "step": 3174 + }, + { + "epoch": 1.028839922229423, + "grad_norm": 0.8329768776893616, + "learning_rate": 3.822984607932661e-06, + "loss": 0.1106, + "step": 3175 + }, + { + "epoch": 1.0291639662994168, + "grad_norm": 0.8299722671508789, + "learning_rate": 3.8222424210918404e-06, + "loss": 0.1174, + "step": 3176 + }, + { + "epoch": 1.0294880103694102, + "grad_norm": 0.7693575620651245, + "learning_rate": 3.821500072421349e-06, + "loss": 0.1169, + "step": 3177 + }, + { + "epoch": 1.0298120544394038, + "grad_norm": 0.7669704556465149, + "learning_rate": 3.820757562012042e-06, + "loss": 0.1147, + "step": 3178 + }, + { + "epoch": 1.0301360985093972, + "grad_norm": 0.8572734594345093, + "learning_rate": 3.820014889954794e-06, + "loss": 0.1253, + "step": 3179 + }, + { + "epoch": 1.030460142579391, + "grad_norm": 0.8561815619468689, + "learning_rate": 3.819272056340504e-06, + "loss": 0.1232, + "step": 3180 + }, + { + "epoch": 1.0307841866493843, + "grad_norm": 0.7440727949142456, + "learning_rate": 3.818529061260084e-06, + "loss": 0.107, + "step": 3181 + }, + { + "epoch": 1.0311082307193777, + "grad_norm": 0.8579633235931396, + "learning_rate": 3.817785904804473e-06, + "loss": 0.1214, + "step": 3182 + }, + { + "epoch": 1.0314322747893714, + "grad_norm": 0.8033884763717651, + "learning_rate": 3.817042587064623e-06, + "loss": 0.108, + "step": 3183 + }, + { + "epoch": 1.0317563188593648, + "grad_norm": 0.8052843809127808, + "learning_rate": 3.81629910813151e-06, + "loss": 0.117, + "step": 3184 + }, + { + "epoch": 1.0320803629293585, + "grad_norm": 0.8760218620300293, + "learning_rate": 3.815555468096131e-06, + "loss": 0.1211, + "step": 3185 + }, + { + "epoch": 1.0324044069993519, + "grad_norm": 0.8402146100997925, + "learning_rate": 3.814811667049497e-06, + "loss": 0.1151, + "step": 3186 + }, + { + "epoch": 1.0327284510693455, + "grad_norm": 0.8820028901100159, + "learning_rate": 3.814067705082643e-06, + "loss": 0.1171, + "step": 3187 + }, + { + "epoch": 1.033052495139339, + "grad_norm": 0.8354962468147278, + "learning_rate": 3.8133235822866234e-06, + "loss": 0.1094, + "step": 3188 + }, + { + "epoch": 1.0333765392093324, + "grad_norm": 0.8769159913063049, + "learning_rate": 3.812579298752511e-06, + "loss": 0.118, + "step": 3189 + }, + { + "epoch": 1.033700583279326, + "grad_norm": 0.8448501229286194, + "learning_rate": 3.8118348545714e-06, + "loss": 0.1182, + "step": 3190 + }, + { + "epoch": 1.0340246273493194, + "grad_norm": 0.8219982385635376, + "learning_rate": 3.8110902498344023e-06, + "loss": 0.107, + "step": 3191 + }, + { + "epoch": 1.034348671419313, + "grad_norm": 0.8816295266151428, + "learning_rate": 3.8103454846326493e-06, + "loss": 0.1138, + "step": 3192 + }, + { + "epoch": 1.0346727154893065, + "grad_norm": 0.8417076468467712, + "learning_rate": 3.809600559057295e-06, + "loss": 0.1157, + "step": 3193 + }, + { + "epoch": 1.0349967595593002, + "grad_norm": 0.8487566709518433, + "learning_rate": 3.80885547319951e-06, + "loss": 0.1152, + "step": 3194 + }, + { + "epoch": 1.0353208036292936, + "grad_norm": 0.7868528366088867, + "learning_rate": 3.808110227150485e-06, + "loss": 0.1064, + "step": 3195 + }, + { + "epoch": 1.035644847699287, + "grad_norm": 0.8207704424858093, + "learning_rate": 3.8073648210014323e-06, + "loss": 0.1108, + "step": 3196 + }, + { + "epoch": 1.0359688917692806, + "grad_norm": 0.9189059734344482, + "learning_rate": 3.806619254843582e-06, + "loss": 0.1235, + "step": 3197 + }, + { + "epoch": 1.036292935839274, + "grad_norm": 0.851455807685852, + "learning_rate": 3.8058735287681835e-06, + "loss": 0.1101, + "step": 3198 + }, + { + "epoch": 1.0366169799092677, + "grad_norm": 0.9308487772941589, + "learning_rate": 3.8051276428665074e-06, + "loss": 0.1269, + "step": 3199 + }, + { + "epoch": 1.0369410239792611, + "grad_norm": 0.8116505146026611, + "learning_rate": 3.8043815972298424e-06, + "loss": 0.1193, + "step": 3200 + }, + { + "epoch": 1.0372650680492548, + "grad_norm": 0.8720618486404419, + "learning_rate": 3.8036353919494973e-06, + "loss": 0.1135, + "step": 3201 + }, + { + "epoch": 1.0375891121192482, + "grad_norm": 0.8109154105186462, + "learning_rate": 3.8028890271168e-06, + "loss": 0.1125, + "step": 3202 + }, + { + "epoch": 1.0379131561892416, + "grad_norm": 0.8150811195373535, + "learning_rate": 3.8021425028230994e-06, + "loss": 0.1062, + "step": 3203 + }, + { + "epoch": 1.0382372002592353, + "grad_norm": 0.7590900659561157, + "learning_rate": 3.801395819159761e-06, + "loss": 0.106, + "step": 3204 + }, + { + "epoch": 1.0385612443292287, + "grad_norm": 0.8100997805595398, + "learning_rate": 3.8006489762181744e-06, + "loss": 0.1129, + "step": 3205 + }, + { + "epoch": 1.0388852883992223, + "grad_norm": 0.8717626333236694, + "learning_rate": 3.7999019740897423e-06, + "loss": 0.1146, + "step": 3206 + }, + { + "epoch": 1.0392093324692158, + "grad_norm": 1.0034371614456177, + "learning_rate": 3.799154812865894e-06, + "loss": 0.1263, + "step": 3207 + }, + { + "epoch": 1.0395333765392094, + "grad_norm": 0.8377565741539001, + "learning_rate": 3.7984074926380733e-06, + "loss": 0.1188, + "step": 3208 + }, + { + "epoch": 1.0398574206092028, + "grad_norm": 0.881369948387146, + "learning_rate": 3.7976600134977455e-06, + "loss": 0.117, + "step": 3209 + }, + { + "epoch": 1.0401814646791965, + "grad_norm": 0.8010326027870178, + "learning_rate": 3.7969123755363935e-06, + "loss": 0.1139, + "step": 3210 + }, + { + "epoch": 1.04050550874919, + "grad_norm": 0.9041508436203003, + "learning_rate": 3.7961645788455225e-06, + "loss": 0.1213, + "step": 3211 + }, + { + "epoch": 1.0408295528191833, + "grad_norm": 0.7512882351875305, + "learning_rate": 3.7954166235166545e-06, + "loss": 0.1059, + "step": 3212 + }, + { + "epoch": 1.041153596889177, + "grad_norm": 0.8295339345932007, + "learning_rate": 3.794668509641332e-06, + "loss": 0.1212, + "step": 3213 + }, + { + "epoch": 1.0414776409591704, + "grad_norm": 0.8528342247009277, + "learning_rate": 3.793920237311118e-06, + "loss": 0.1162, + "step": 3214 + }, + { + "epoch": 1.041801685029164, + "grad_norm": 0.7838711142539978, + "learning_rate": 3.793171806617593e-06, + "loss": 0.1125, + "step": 3215 + }, + { + "epoch": 1.0421257290991575, + "grad_norm": 0.8150649666786194, + "learning_rate": 3.7924232176523574e-06, + "loss": 0.1084, + "step": 3216 + }, + { + "epoch": 1.042449773169151, + "grad_norm": 0.7916179299354553, + "learning_rate": 3.7916744705070318e-06, + "loss": 0.1068, + "step": 3217 + }, + { + "epoch": 1.0427738172391445, + "grad_norm": 0.8389194011688232, + "learning_rate": 3.790925565273255e-06, + "loss": 0.1163, + "step": 3218 + }, + { + "epoch": 1.043097861309138, + "grad_norm": 0.9675598740577698, + "learning_rate": 3.790176502042686e-06, + "loss": 0.1275, + "step": 3219 + }, + { + "epoch": 1.0434219053791316, + "grad_norm": 0.8746939301490784, + "learning_rate": 3.789427280907004e-06, + "loss": 0.1245, + "step": 3220 + }, + { + "epoch": 1.043745949449125, + "grad_norm": 0.7958021759986877, + "learning_rate": 3.7886779019579045e-06, + "loss": 0.1178, + "step": 3221 + }, + { + "epoch": 1.0440699935191187, + "grad_norm": 0.8526121973991394, + "learning_rate": 3.787928365287106e-06, + "loss": 0.1142, + "step": 3222 + }, + { + "epoch": 1.044394037589112, + "grad_norm": 0.8836873769760132, + "learning_rate": 3.7871786709863435e-06, + "loss": 0.1262, + "step": 3223 + }, + { + "epoch": 1.0447180816591057, + "grad_norm": 0.8168222308158875, + "learning_rate": 3.7864288191473718e-06, + "loss": 0.1201, + "step": 3224 + }, + { + "epoch": 1.0450421257290992, + "grad_norm": 0.86906898021698, + "learning_rate": 3.7856788098619667e-06, + "loss": 0.1158, + "step": 3225 + }, + { + "epoch": 1.0453661697990926, + "grad_norm": 0.898335337638855, + "learning_rate": 3.7849286432219216e-06, + "loss": 0.1174, + "step": 3226 + }, + { + "epoch": 1.0456902138690862, + "grad_norm": 0.8181985020637512, + "learning_rate": 3.78417831931905e-06, + "loss": 0.1177, + "step": 3227 + }, + { + "epoch": 1.0460142579390797, + "grad_norm": 0.8221048712730408, + "learning_rate": 3.783427838245184e-06, + "loss": 0.1144, + "step": 3228 + }, + { + "epoch": 1.0463383020090733, + "grad_norm": 0.8519485592842102, + "learning_rate": 3.7826772000921742e-06, + "loss": 0.1199, + "step": 3229 + }, + { + "epoch": 1.0466623460790667, + "grad_norm": 0.8599885702133179, + "learning_rate": 3.781926404951893e-06, + "loss": 0.1152, + "step": 3230 + }, + { + "epoch": 1.0469863901490604, + "grad_norm": 0.8111092448234558, + "learning_rate": 3.7811754529162294e-06, + "loss": 0.1144, + "step": 3231 + }, + { + "epoch": 1.0473104342190538, + "grad_norm": 0.8220904469490051, + "learning_rate": 3.7804243440770936e-06, + "loss": 0.1152, + "step": 3232 + }, + { + "epoch": 1.0476344782890472, + "grad_norm": 0.8551547527313232, + "learning_rate": 3.779673078526414e-06, + "loss": 0.1169, + "step": 3233 + }, + { + "epoch": 1.0479585223590409, + "grad_norm": 0.8881143927574158, + "learning_rate": 3.7789216563561373e-06, + "loss": 0.1331, + "step": 3234 + }, + { + "epoch": 1.0482825664290343, + "grad_norm": 0.8921029567718506, + "learning_rate": 3.778170077658231e-06, + "loss": 0.1165, + "step": 3235 + }, + { + "epoch": 1.048606610499028, + "grad_norm": 0.7967270612716675, + "learning_rate": 3.77741834252468e-06, + "loss": 0.1103, + "step": 3236 + }, + { + "epoch": 1.0489306545690213, + "grad_norm": 0.8333545327186584, + "learning_rate": 3.7766664510474903e-06, + "loss": 0.1178, + "step": 3237 + }, + { + "epoch": 1.049254698639015, + "grad_norm": 0.8133196830749512, + "learning_rate": 3.775914403318687e-06, + "loss": 0.1088, + "step": 3238 + }, + { + "epoch": 1.0495787427090084, + "grad_norm": 0.836877703666687, + "learning_rate": 3.7751621994303123e-06, + "loss": 0.121, + "step": 3239 + }, + { + "epoch": 1.0499027867790018, + "grad_norm": 0.8581812977790833, + "learning_rate": 3.7744098394744287e-06, + "loss": 0.1215, + "step": 3240 + }, + { + "epoch": 1.0502268308489955, + "grad_norm": 0.8775882720947266, + "learning_rate": 3.7736573235431174e-06, + "loss": 0.1312, + "step": 3241 + }, + { + "epoch": 1.050550874918989, + "grad_norm": 0.8495439291000366, + "learning_rate": 3.7729046517284805e-06, + "loss": 0.1217, + "step": 3242 + }, + { + "epoch": 1.0508749189889826, + "grad_norm": 0.8486558794975281, + "learning_rate": 3.7721518241226375e-06, + "loss": 0.112, + "step": 3243 + }, + { + "epoch": 1.051198963058976, + "grad_norm": 0.7727014422416687, + "learning_rate": 3.771398840817725e-06, + "loss": 0.1024, + "step": 3244 + }, + { + "epoch": 1.0515230071289696, + "grad_norm": 0.9136723279953003, + "learning_rate": 3.770645701905904e-06, + "loss": 0.1325, + "step": 3245 + }, + { + "epoch": 1.051847051198963, + "grad_norm": 0.807131290435791, + "learning_rate": 3.7698924074793484e-06, + "loss": 0.1109, + "step": 3246 + }, + { + "epoch": 1.0521710952689567, + "grad_norm": 0.7704994678497314, + "learning_rate": 3.7691389576302567e-06, + "loss": 0.1101, + "step": 3247 + }, + { + "epoch": 1.0524951393389501, + "grad_norm": 0.775662362575531, + "learning_rate": 3.768385352450842e-06, + "loss": 0.1074, + "step": 3248 + }, + { + "epoch": 1.0528191834089435, + "grad_norm": 0.8145730495452881, + "learning_rate": 3.7676315920333396e-06, + "loss": 0.1181, + "step": 3249 + }, + { + "epoch": 1.0531432274789372, + "grad_norm": 0.8446400165557861, + "learning_rate": 3.7668776764700023e-06, + "loss": 0.1255, + "step": 3250 + }, + { + "epoch": 1.0534672715489306, + "grad_norm": 0.8573490977287292, + "learning_rate": 3.766123605853101e-06, + "loss": 0.1225, + "step": 3251 + }, + { + "epoch": 1.0537913156189243, + "grad_norm": 0.9274383187294006, + "learning_rate": 3.765369380274928e-06, + "loss": 0.1274, + "step": 3252 + }, + { + "epoch": 1.0541153596889177, + "grad_norm": 0.8625757098197937, + "learning_rate": 3.7646149998277924e-06, + "loss": 0.1156, + "step": 3253 + }, + { + "epoch": 1.054439403758911, + "grad_norm": 0.896124541759491, + "learning_rate": 3.7638604646040232e-06, + "loss": 0.1276, + "step": 3254 + }, + { + "epoch": 1.0547634478289047, + "grad_norm": 0.8273859024047852, + "learning_rate": 3.763105774695968e-06, + "loss": 0.1146, + "step": 3255 + }, + { + "epoch": 1.0550874918988982, + "grad_norm": 0.821397602558136, + "learning_rate": 3.7623509301959935e-06, + "loss": 0.113, + "step": 3256 + }, + { + "epoch": 1.0554115359688918, + "grad_norm": 0.7991721630096436, + "learning_rate": 3.7615959311964865e-06, + "loss": 0.1137, + "step": 3257 + }, + { + "epoch": 1.0557355800388852, + "grad_norm": 0.7959055304527283, + "learning_rate": 3.760840777789851e-06, + "loss": 0.1141, + "step": 3258 + }, + { + "epoch": 1.0560596241088789, + "grad_norm": 0.8501467108726501, + "learning_rate": 3.7600854700685095e-06, + "loss": 0.1181, + "step": 3259 + }, + { + "epoch": 1.0563836681788723, + "grad_norm": 0.8222145438194275, + "learning_rate": 3.759330008124905e-06, + "loss": 0.1161, + "step": 3260 + }, + { + "epoch": 1.056707712248866, + "grad_norm": 0.7849128842353821, + "learning_rate": 3.7585743920514985e-06, + "loss": 0.1145, + "step": 3261 + }, + { + "epoch": 1.0570317563188594, + "grad_norm": 0.7778906226158142, + "learning_rate": 3.757818621940771e-06, + "loss": 0.1163, + "step": 3262 + }, + { + "epoch": 1.0573558003888528, + "grad_norm": 0.9160462021827698, + "learning_rate": 3.7570626978852203e-06, + "loss": 0.1322, + "step": 3263 + }, + { + "epoch": 1.0576798444588464, + "grad_norm": 0.7777760028839111, + "learning_rate": 3.7563066199773645e-06, + "loss": 0.1106, + "step": 3264 + }, + { + "epoch": 1.0580038885288399, + "grad_norm": 0.8116745948791504, + "learning_rate": 3.7555503883097414e-06, + "loss": 0.12, + "step": 3265 + }, + { + "epoch": 1.0583279325988335, + "grad_norm": 0.8378960490226746, + "learning_rate": 3.7547940029749054e-06, + "loss": 0.1151, + "step": 3266 + }, + { + "epoch": 1.058651976668827, + "grad_norm": 0.8337701559066772, + "learning_rate": 3.75403746406543e-06, + "loss": 0.1212, + "step": 3267 + }, + { + "epoch": 1.0589760207388206, + "grad_norm": 0.8173056840896606, + "learning_rate": 3.7532807716739082e-06, + "loss": 0.1166, + "step": 3268 + }, + { + "epoch": 1.059300064808814, + "grad_norm": 0.7868010401725769, + "learning_rate": 3.752523925892954e-06, + "loss": 0.1182, + "step": 3269 + }, + { + "epoch": 1.0596241088788074, + "grad_norm": 0.7789453268051147, + "learning_rate": 3.7517669268151967e-06, + "loss": 0.0995, + "step": 3270 + }, + { + "epoch": 1.059948152948801, + "grad_norm": 0.8659542798995972, + "learning_rate": 3.751009774533285e-06, + "loss": 0.1272, + "step": 3271 + }, + { + "epoch": 1.0602721970187945, + "grad_norm": 0.8237266540527344, + "learning_rate": 3.7502524691398877e-06, + "loss": 0.1153, + "step": 3272 + }, + { + "epoch": 1.0605962410887881, + "grad_norm": 0.8071155548095703, + "learning_rate": 3.7494950107276917e-06, + "loss": 0.1126, + "step": 3273 + }, + { + "epoch": 1.0609202851587816, + "grad_norm": 0.8354878425598145, + "learning_rate": 3.7487373993894027e-06, + "loss": 0.1151, + "step": 3274 + }, + { + "epoch": 1.0612443292287752, + "grad_norm": 0.8763176798820496, + "learning_rate": 3.7479796352177445e-06, + "loss": 0.116, + "step": 3275 + }, + { + "epoch": 1.0615683732987686, + "grad_norm": 0.891167402267456, + "learning_rate": 3.7472217183054605e-06, + "loss": 0.1303, + "step": 3276 + }, + { + "epoch": 1.061892417368762, + "grad_norm": 0.7942646145820618, + "learning_rate": 3.7464636487453122e-06, + "loss": 0.1087, + "step": 3277 + }, + { + "epoch": 1.0622164614387557, + "grad_norm": 0.9152526259422302, + "learning_rate": 3.74570542663008e-06, + "loss": 0.1235, + "step": 3278 + }, + { + "epoch": 1.0625405055087491, + "grad_norm": 0.7322328090667725, + "learning_rate": 3.744947052052562e-06, + "loss": 0.1007, + "step": 3279 + }, + { + "epoch": 1.0628645495787428, + "grad_norm": 0.844443678855896, + "learning_rate": 3.7441885251055774e-06, + "loss": 0.1207, + "step": 3280 + }, + { + "epoch": 1.0631885936487362, + "grad_norm": 0.813022792339325, + "learning_rate": 3.7434298458819622e-06, + "loss": 0.1127, + "step": 3281 + }, + { + "epoch": 1.0635126377187298, + "grad_norm": 0.8554878234863281, + "learning_rate": 3.7426710144745717e-06, + "loss": 0.1213, + "step": 3282 + }, + { + "epoch": 1.0638366817887233, + "grad_norm": 0.831193745136261, + "learning_rate": 3.7419120309762787e-06, + "loss": 0.1169, + "step": 3283 + }, + { + "epoch": 1.0641607258587167, + "grad_norm": 0.8382897973060608, + "learning_rate": 3.7411528954799752e-06, + "loss": 0.123, + "step": 3284 + }, + { + "epoch": 1.0644847699287103, + "grad_norm": 0.8006016612052917, + "learning_rate": 3.740393608078573e-06, + "loss": 0.116, + "step": 3285 + }, + { + "epoch": 1.0648088139987038, + "grad_norm": 0.9068976640701294, + "learning_rate": 3.739634168865001e-06, + "loss": 0.1199, + "step": 3286 + }, + { + "epoch": 1.0651328580686974, + "grad_norm": 0.7854377627372742, + "learning_rate": 3.738874577932208e-06, + "loss": 0.1088, + "step": 3287 + }, + { + "epoch": 1.0654569021386908, + "grad_norm": 0.81007981300354, + "learning_rate": 3.738114835373159e-06, + "loss": 0.1219, + "step": 3288 + }, + { + "epoch": 1.0657809462086845, + "grad_norm": 0.8615376353263855, + "learning_rate": 3.73735494128084e-06, + "loss": 0.1216, + "step": 3289 + }, + { + "epoch": 1.0661049902786779, + "grad_norm": 0.7818930745124817, + "learning_rate": 3.736594895748255e-06, + "loss": 0.1137, + "step": 3290 + }, + { + "epoch": 1.0664290343486713, + "grad_norm": 0.8900367617607117, + "learning_rate": 3.7358346988684258e-06, + "loss": 0.1304, + "step": 3291 + }, + { + "epoch": 1.066753078418665, + "grad_norm": 0.8263948559761047, + "learning_rate": 3.735074350734393e-06, + "loss": 0.1197, + "step": 3292 + }, + { + "epoch": 1.0670771224886584, + "grad_norm": 0.8178905248641968, + "learning_rate": 3.734313851439217e-06, + "loss": 0.1183, + "step": 3293 + }, + { + "epoch": 1.067401166558652, + "grad_norm": 0.8497899174690247, + "learning_rate": 3.7335532010759747e-06, + "loss": 0.1206, + "step": 3294 + }, + { + "epoch": 1.0677252106286454, + "grad_norm": 0.7759349942207336, + "learning_rate": 3.732792399737761e-06, + "loss": 0.1035, + "step": 3295 + }, + { + "epoch": 1.068049254698639, + "grad_norm": 0.9050149321556091, + "learning_rate": 3.7320314475176933e-06, + "loss": 0.1125, + "step": 3296 + }, + { + "epoch": 1.0683732987686325, + "grad_norm": 0.8377945423126221, + "learning_rate": 3.731270344508903e-06, + "loss": 0.1145, + "step": 3297 + }, + { + "epoch": 1.0686973428386262, + "grad_norm": 0.8067528605461121, + "learning_rate": 3.7305090908045422e-06, + "loss": 0.1138, + "step": 3298 + }, + { + "epoch": 1.0690213869086196, + "grad_norm": 0.7546223402023315, + "learning_rate": 3.7297476864977805e-06, + "loss": 0.1017, + "step": 3299 + }, + { + "epoch": 1.069345430978613, + "grad_norm": 0.8082880973815918, + "learning_rate": 3.7289861316818077e-06, + "loss": 0.1181, + "step": 3300 + }, + { + "epoch": 1.0696694750486067, + "grad_norm": 0.8667186498641968, + "learning_rate": 3.728224426449829e-06, + "loss": 0.1225, + "step": 3301 + }, + { + "epoch": 1.0699935191186, + "grad_norm": 0.8660303354263306, + "learning_rate": 3.7274625708950706e-06, + "loss": 0.1213, + "step": 3302 + }, + { + "epoch": 1.0703175631885937, + "grad_norm": 0.847798764705658, + "learning_rate": 3.7267005651107763e-06, + "loss": 0.1202, + "step": 3303 + }, + { + "epoch": 1.0706416072585871, + "grad_norm": 0.8011866211891174, + "learning_rate": 3.7259384091902085e-06, + "loss": 0.105, + "step": 3304 + }, + { + "epoch": 1.0709656513285806, + "grad_norm": 0.8448381423950195, + "learning_rate": 3.7251761032266475e-06, + "loss": 0.1194, + "step": 3305 + }, + { + "epoch": 1.0712896953985742, + "grad_norm": 0.854239284992218, + "learning_rate": 3.7244136473133924e-06, + "loss": 0.1184, + "step": 3306 + }, + { + "epoch": 1.0716137394685676, + "grad_norm": 0.7776972651481628, + "learning_rate": 3.7236510415437598e-06, + "loss": 0.1097, + "step": 3307 + }, + { + "epoch": 1.0719377835385613, + "grad_norm": 0.8738824129104614, + "learning_rate": 3.7228882860110856e-06, + "loss": 0.1238, + "step": 3308 + }, + { + "epoch": 1.0722618276085547, + "grad_norm": 0.7855773568153381, + "learning_rate": 3.7221253808087234e-06, + "loss": 0.1102, + "step": 3309 + }, + { + "epoch": 1.0725858716785484, + "grad_norm": 0.8029606342315674, + "learning_rate": 3.721362326030046e-06, + "loss": 0.1086, + "step": 3310 + }, + { + "epoch": 1.0729099157485418, + "grad_norm": 0.8558063507080078, + "learning_rate": 3.720599121768443e-06, + "loss": 0.1188, + "step": 3311 + }, + { + "epoch": 1.0732339598185354, + "grad_norm": 0.8302557468414307, + "learning_rate": 3.7198357681173247e-06, + "loss": 0.114, + "step": 3312 + }, + { + "epoch": 1.0735580038885288, + "grad_norm": 0.8412912487983704, + "learning_rate": 3.7190722651701166e-06, + "loss": 0.1216, + "step": 3313 + }, + { + "epoch": 1.0738820479585223, + "grad_norm": 0.8054918646812439, + "learning_rate": 3.718308613020265e-06, + "loss": 0.1124, + "step": 3314 + }, + { + "epoch": 1.074206092028516, + "grad_norm": 0.8104496002197266, + "learning_rate": 3.717544811761233e-06, + "loss": 0.1142, + "step": 3315 + }, + { + "epoch": 1.0745301360985093, + "grad_norm": 0.8747921586036682, + "learning_rate": 3.716780861486503e-06, + "loss": 0.1192, + "step": 3316 + }, + { + "epoch": 1.074854180168503, + "grad_norm": 0.7890901565551758, + "learning_rate": 3.716016762289576e-06, + "loss": 0.1104, + "step": 3317 + }, + { + "epoch": 1.0751782242384964, + "grad_norm": 0.9319736957550049, + "learning_rate": 3.7152525142639682e-06, + "loss": 0.1261, + "step": 3318 + }, + { + "epoch": 1.07550226830849, + "grad_norm": 0.8697876930236816, + "learning_rate": 3.7144881175032178e-06, + "loss": 0.1239, + "step": 3319 + }, + { + "epoch": 1.0758263123784835, + "grad_norm": 0.7517074942588806, + "learning_rate": 3.713723572100878e-06, + "loss": 0.1055, + "step": 3320 + }, + { + "epoch": 1.076150356448477, + "grad_norm": 0.8587399125099182, + "learning_rate": 3.7129588781505232e-06, + "loss": 0.1217, + "step": 3321 + }, + { + "epoch": 1.0764744005184705, + "grad_norm": 0.7583026885986328, + "learning_rate": 3.7121940357457438e-06, + "loss": 0.1058, + "step": 3322 + }, + { + "epoch": 1.076798444588464, + "grad_norm": 0.8852120637893677, + "learning_rate": 3.7114290449801493e-06, + "loss": 0.1194, + "step": 3323 + }, + { + "epoch": 1.0771224886584576, + "grad_norm": 0.8140954971313477, + "learning_rate": 3.7106639059473675e-06, + "loss": 0.1113, + "step": 3324 + }, + { + "epoch": 1.077446532728451, + "grad_norm": 0.9091130495071411, + "learning_rate": 3.7098986187410447e-06, + "loss": 0.1293, + "step": 3325 + }, + { + "epoch": 1.0777705767984447, + "grad_norm": 0.8124028444290161, + "learning_rate": 3.7091331834548427e-06, + "loss": 0.1265, + "step": 3326 + }, + { + "epoch": 1.078094620868438, + "grad_norm": 0.8255280256271362, + "learning_rate": 3.7083676001824443e-06, + "loss": 0.1163, + "step": 3327 + }, + { + "epoch": 1.0784186649384315, + "grad_norm": 0.9926854968070984, + "learning_rate": 3.70760186901755e-06, + "loss": 0.1202, + "step": 3328 + }, + { + "epoch": 1.0787427090084252, + "grad_norm": 0.8930680155754089, + "learning_rate": 3.706835990053877e-06, + "loss": 0.1185, + "step": 3329 + }, + { + "epoch": 1.0790667530784186, + "grad_norm": 0.8199201822280884, + "learning_rate": 3.7060699633851615e-06, + "loss": 0.1073, + "step": 3330 + }, + { + "epoch": 1.0793907971484122, + "grad_norm": 0.8079055547714233, + "learning_rate": 3.7053037891051596e-06, + "loss": 0.1172, + "step": 3331 + }, + { + "epoch": 1.0797148412184057, + "grad_norm": 0.8031049370765686, + "learning_rate": 3.704537467307641e-06, + "loss": 0.1104, + "step": 3332 + }, + { + "epoch": 1.0800388852883993, + "grad_norm": 0.8727356791496277, + "learning_rate": 3.7037709980863974e-06, + "loss": 0.1254, + "step": 3333 + }, + { + "epoch": 1.0803629293583927, + "grad_norm": 0.8698641657829285, + "learning_rate": 3.703004381535237e-06, + "loss": 0.1155, + "step": 3334 + }, + { + "epoch": 1.0806869734283864, + "grad_norm": 0.8368037939071655, + "learning_rate": 3.7022376177479863e-06, + "loss": 0.1196, + "step": 3335 + }, + { + "epoch": 1.0810110174983798, + "grad_norm": 0.7489742040634155, + "learning_rate": 3.7014707068184895e-06, + "loss": 0.1045, + "step": 3336 + }, + { + "epoch": 1.0813350615683732, + "grad_norm": 0.8222491145133972, + "learning_rate": 3.70070364884061e-06, + "loss": 0.1212, + "step": 3337 + }, + { + "epoch": 1.0816591056383669, + "grad_norm": 0.761885941028595, + "learning_rate": 3.6999364439082274e-06, + "loss": 0.1163, + "step": 3338 + }, + { + "epoch": 1.0819831497083603, + "grad_norm": 0.8204994201660156, + "learning_rate": 3.6991690921152407e-06, + "loss": 0.1124, + "step": 3339 + }, + { + "epoch": 1.082307193778354, + "grad_norm": 1.0559394359588623, + "learning_rate": 3.698401593555565e-06, + "loss": 0.1241, + "step": 3340 + }, + { + "epoch": 1.0826312378483474, + "grad_norm": 0.7862476110458374, + "learning_rate": 3.697633948323136e-06, + "loss": 0.1147, + "step": 3341 + }, + { + "epoch": 1.0829552819183408, + "grad_norm": 0.871893584728241, + "learning_rate": 3.6968661565119062e-06, + "loss": 0.1168, + "step": 3342 + }, + { + "epoch": 1.0832793259883344, + "grad_norm": 0.9019219279289246, + "learning_rate": 3.6960982182158458e-06, + "loss": 0.1303, + "step": 3343 + }, + { + "epoch": 1.0836033700583279, + "grad_norm": 0.8220975995063782, + "learning_rate": 3.6953301335289415e-06, + "loss": 0.1243, + "step": 3344 + }, + { + "epoch": 1.0839274141283215, + "grad_norm": 0.8278359770774841, + "learning_rate": 3.6945619025452006e-06, + "loss": 0.1202, + "step": 3345 + }, + { + "epoch": 1.084251458198315, + "grad_norm": 0.7727410793304443, + "learning_rate": 3.6937935253586475e-06, + "loss": 0.1061, + "step": 3346 + }, + { + "epoch": 1.0845755022683086, + "grad_norm": 0.836865246295929, + "learning_rate": 3.6930250020633237e-06, + "loss": 0.1206, + "step": 3347 + }, + { + "epoch": 1.084899546338302, + "grad_norm": 0.8528717756271362, + "learning_rate": 3.692256332753289e-06, + "loss": 0.1193, + "step": 3348 + }, + { + "epoch": 1.0852235904082956, + "grad_norm": 0.860572338104248, + "learning_rate": 3.691487517522621e-06, + "loss": 0.1228, + "step": 3349 + }, + { + "epoch": 1.085547634478289, + "grad_norm": 0.8165038824081421, + "learning_rate": 3.690718556465416e-06, + "loss": 0.1206, + "step": 3350 + }, + { + "epoch": 1.0858716785482825, + "grad_norm": 0.8471930623054504, + "learning_rate": 3.689949449675786e-06, + "loss": 0.1296, + "step": 3351 + }, + { + "epoch": 1.0861957226182761, + "grad_norm": 0.7674320936203003, + "learning_rate": 3.689180197247863e-06, + "loss": 0.1062, + "step": 3352 + }, + { + "epoch": 1.0865197666882696, + "grad_norm": 0.8666447401046753, + "learning_rate": 3.688410799275796e-06, + "loss": 0.1242, + "step": 3353 + }, + { + "epoch": 1.0868438107582632, + "grad_norm": 0.8304182291030884, + "learning_rate": 3.6876412558537524e-06, + "loss": 0.1169, + "step": 3354 + }, + { + "epoch": 1.0871678548282566, + "grad_norm": 0.838219404220581, + "learning_rate": 3.686871567075916e-06, + "loss": 0.1192, + "step": 3355 + }, + { + "epoch": 1.0874918988982503, + "grad_norm": 0.859727144241333, + "learning_rate": 3.6861017330364897e-06, + "loss": 0.1202, + "step": 3356 + }, + { + "epoch": 1.0878159429682437, + "grad_norm": 0.7956854701042175, + "learning_rate": 3.685331753829693e-06, + "loss": 0.11, + "step": 3357 + }, + { + "epoch": 1.088139987038237, + "grad_norm": 0.7603848576545715, + "learning_rate": 3.684561629549765e-06, + "loss": 0.1057, + "step": 3358 + }, + { + "epoch": 1.0884640311082308, + "grad_norm": 0.84071284532547, + "learning_rate": 3.6837913602909615e-06, + "loss": 0.1142, + "step": 3359 + }, + { + "epoch": 1.0887880751782242, + "grad_norm": 0.8513961434364319, + "learning_rate": 3.6830209461475554e-06, + "loss": 0.1139, + "step": 3360 + }, + { + "epoch": 1.0891121192482178, + "grad_norm": 0.845880389213562, + "learning_rate": 3.6822503872138377e-06, + "loss": 0.1218, + "step": 3361 + }, + { + "epoch": 1.0894361633182112, + "grad_norm": 0.7607066631317139, + "learning_rate": 3.6814796835841172e-06, + "loss": 0.1039, + "step": 3362 + }, + { + "epoch": 1.089760207388205, + "grad_norm": 0.9276735782623291, + "learning_rate": 3.6807088353527216e-06, + "loss": 0.1254, + "step": 3363 + }, + { + "epoch": 1.0900842514581983, + "grad_norm": 0.768365740776062, + "learning_rate": 3.6799378426139942e-06, + "loss": 0.1011, + "step": 3364 + }, + { + "epoch": 1.0904082955281917, + "grad_norm": 0.8057131767272949, + "learning_rate": 3.679166705462298e-06, + "loss": 0.1149, + "step": 3365 + }, + { + "epoch": 1.0907323395981854, + "grad_norm": 0.8209421038627625, + "learning_rate": 3.6783954239920118e-06, + "loss": 0.1144, + "step": 3366 + }, + { + "epoch": 1.0910563836681788, + "grad_norm": 0.8330515027046204, + "learning_rate": 3.677623998297534e-06, + "loss": 0.1146, + "step": 3367 + }, + { + "epoch": 1.0913804277381725, + "grad_norm": 0.7507935762405396, + "learning_rate": 3.6768524284732794e-06, + "loss": 0.1078, + "step": 3368 + }, + { + "epoch": 1.0917044718081659, + "grad_norm": 0.9468154907226562, + "learning_rate": 3.6760807146136796e-06, + "loss": 0.1243, + "step": 3369 + }, + { + "epoch": 1.0920285158781595, + "grad_norm": 0.906830906867981, + "learning_rate": 3.675308856813186e-06, + "loss": 0.122, + "step": 3370 + }, + { + "epoch": 1.092352559948153, + "grad_norm": 0.8906958699226379, + "learning_rate": 3.6745368551662663e-06, + "loss": 0.1191, + "step": 3371 + }, + { + "epoch": 1.0926766040181464, + "grad_norm": 0.8567639589309692, + "learning_rate": 3.6737647097674056e-06, + "loss": 0.1204, + "step": 3372 + }, + { + "epoch": 1.09300064808814, + "grad_norm": 0.7823848128318787, + "learning_rate": 3.6729924207111077e-06, + "loss": 0.1133, + "step": 3373 + }, + { + "epoch": 1.0933246921581334, + "grad_norm": 0.8717378973960876, + "learning_rate": 3.6722199880918928e-06, + "loss": 0.1261, + "step": 3374 + }, + { + "epoch": 1.093648736228127, + "grad_norm": 0.785054087638855, + "learning_rate": 3.6714474120042993e-06, + "loss": 0.1128, + "step": 3375 + }, + { + "epoch": 1.0939727802981205, + "grad_norm": 0.8238503336906433, + "learning_rate": 3.6706746925428833e-06, + "loss": 0.1205, + "step": 3376 + }, + { + "epoch": 1.0942968243681142, + "grad_norm": 0.7985695600509644, + "learning_rate": 3.6699018298022173e-06, + "loss": 0.113, + "step": 3377 + }, + { + "epoch": 1.0946208684381076, + "grad_norm": 0.8084473013877869, + "learning_rate": 3.6691288238768928e-06, + "loss": 0.1211, + "step": 3378 + }, + { + "epoch": 1.094944912508101, + "grad_norm": 0.7871827483177185, + "learning_rate": 3.6683556748615196e-06, + "loss": 0.1149, + "step": 3379 + }, + { + "epoch": 1.0952689565780946, + "grad_norm": 0.7735625505447388, + "learning_rate": 3.667582382850721e-06, + "loss": 0.1085, + "step": 3380 + }, + { + "epoch": 1.095593000648088, + "grad_norm": 0.8151714205741882, + "learning_rate": 3.6668089479391433e-06, + "loss": 0.1145, + "step": 3381 + }, + { + "epoch": 1.0959170447180817, + "grad_norm": 0.8279069662094116, + "learning_rate": 3.666035370221445e-06, + "loss": 0.11, + "step": 3382 + }, + { + "epoch": 1.0962410887880751, + "grad_norm": 0.8695924282073975, + "learning_rate": 3.665261649792305e-06, + "loss": 0.1205, + "step": 3383 + }, + { + "epoch": 1.0965651328580688, + "grad_norm": 0.7513477802276611, + "learning_rate": 3.66448778674642e-06, + "loss": 0.1042, + "step": 3384 + }, + { + "epoch": 1.0968891769280622, + "grad_norm": 0.7814244031906128, + "learning_rate": 3.663713781178504e-06, + "loss": 0.105, + "step": 3385 + }, + { + "epoch": 1.0972132209980558, + "grad_norm": 0.8193655610084534, + "learning_rate": 3.6629396331832854e-06, + "loss": 0.1184, + "step": 3386 + }, + { + "epoch": 1.0975372650680493, + "grad_norm": 0.8398494124412537, + "learning_rate": 3.6621653428555144e-06, + "loss": 0.1234, + "step": 3387 + }, + { + "epoch": 1.0978613091380427, + "grad_norm": 0.8566270470619202, + "learning_rate": 3.661390910289956e-06, + "loss": 0.1151, + "step": 3388 + }, + { + "epoch": 1.0981853532080363, + "grad_norm": 0.8087670207023621, + "learning_rate": 3.6606163355813935e-06, + "loss": 0.114, + "step": 3389 + }, + { + "epoch": 1.0985093972780298, + "grad_norm": 0.8933915495872498, + "learning_rate": 3.6598416188246265e-06, + "loss": 0.1168, + "step": 3390 + }, + { + "epoch": 1.0988334413480234, + "grad_norm": 0.834700345993042, + "learning_rate": 3.6590667601144748e-06, + "loss": 0.1172, + "step": 3391 + }, + { + "epoch": 1.0991574854180168, + "grad_norm": 0.7665829658508301, + "learning_rate": 3.6582917595457718e-06, + "loss": 0.1144, + "step": 3392 + }, + { + "epoch": 1.0994815294880103, + "grad_norm": 0.8383209705352783, + "learning_rate": 3.6575166172133703e-06, + "loss": 0.1162, + "step": 3393 + }, + { + "epoch": 1.099805573558004, + "grad_norm": 0.8014138340950012, + "learning_rate": 3.6567413332121402e-06, + "loss": 0.1169, + "step": 3394 + }, + { + "epoch": 1.1001296176279973, + "grad_norm": 0.8607217669487, + "learning_rate": 3.655965907636969e-06, + "loss": 0.1237, + "step": 3395 + }, + { + "epoch": 1.100453661697991, + "grad_norm": 0.7848190665245056, + "learning_rate": 3.6551903405827615e-06, + "loss": 0.1114, + "step": 3396 + }, + { + "epoch": 1.1007777057679844, + "grad_norm": 0.8079403638839722, + "learning_rate": 3.6544146321444397e-06, + "loss": 0.1114, + "step": 3397 + }, + { + "epoch": 1.101101749837978, + "grad_norm": 0.8031437397003174, + "learning_rate": 3.653638782416943e-06, + "loss": 0.1104, + "step": 3398 + }, + { + "epoch": 1.1014257939079715, + "grad_norm": 0.8697689175605774, + "learning_rate": 3.6528627914952263e-06, + "loss": 0.1196, + "step": 3399 + }, + { + "epoch": 1.101749837977965, + "grad_norm": 0.8094655871391296, + "learning_rate": 3.652086659474265e-06, + "loss": 0.1161, + "step": 3400 + }, + { + "epoch": 1.1020738820479585, + "grad_norm": 0.8157943487167358, + "learning_rate": 3.6513103864490497e-06, + "loss": 0.1213, + "step": 3401 + }, + { + "epoch": 1.102397926117952, + "grad_norm": 0.806490957736969, + "learning_rate": 3.650533972514589e-06, + "loss": 0.1206, + "step": 3402 + }, + { + "epoch": 1.1027219701879456, + "grad_norm": 0.8227203488349915, + "learning_rate": 3.6497574177659073e-06, + "loss": 0.1115, + "step": 3403 + }, + { + "epoch": 1.103046014257939, + "grad_norm": 0.8440227508544922, + "learning_rate": 3.6489807222980487e-06, + "loss": 0.126, + "step": 3404 + }, + { + "epoch": 1.1033700583279327, + "grad_norm": 0.8067148923873901, + "learning_rate": 3.648203886206073e-06, + "loss": 0.1138, + "step": 3405 + }, + { + "epoch": 1.103694102397926, + "grad_norm": 0.8669979572296143, + "learning_rate": 3.6474269095850568e-06, + "loss": 0.1259, + "step": 3406 + }, + { + "epoch": 1.1040181464679197, + "grad_norm": 0.8610507249832153, + "learning_rate": 3.646649792530094e-06, + "loss": 0.115, + "step": 3407 + }, + { + "epoch": 1.1043421905379132, + "grad_norm": 0.8911367058753967, + "learning_rate": 3.645872535136298e-06, + "loss": 0.1235, + "step": 3408 + }, + { + "epoch": 1.1046662346079066, + "grad_norm": 0.7985128164291382, + "learning_rate": 3.6450951374987958e-06, + "loss": 0.1059, + "step": 3409 + }, + { + "epoch": 1.1049902786779002, + "grad_norm": 0.8526930809020996, + "learning_rate": 3.6443175997127354e-06, + "loss": 0.1152, + "step": 3410 + }, + { + "epoch": 1.1053143227478937, + "grad_norm": 0.8289358019828796, + "learning_rate": 3.6435399218732776e-06, + "loss": 0.1218, + "step": 3411 + }, + { + "epoch": 1.1056383668178873, + "grad_norm": 0.8703218102455139, + "learning_rate": 3.642762104075604e-06, + "loss": 0.1181, + "step": 3412 + }, + { + "epoch": 1.1059624108878807, + "grad_norm": 0.834434986114502, + "learning_rate": 3.641984146414912e-06, + "loss": 0.1183, + "step": 3413 + }, + { + "epoch": 1.1062864549578744, + "grad_norm": 0.8061531782150269, + "learning_rate": 3.6412060489864155e-06, + "loss": 0.119, + "step": 3414 + }, + { + "epoch": 1.1066104990278678, + "grad_norm": 0.8192474246025085, + "learning_rate": 3.640427811885346e-06, + "loss": 0.1126, + "step": 3415 + }, + { + "epoch": 1.1069345430978612, + "grad_norm": 0.8862019181251526, + "learning_rate": 3.639649435206953e-06, + "loss": 0.1197, + "step": 3416 + }, + { + "epoch": 1.1072585871678549, + "grad_norm": 0.8898884057998657, + "learning_rate": 3.6388709190465018e-06, + "loss": 0.123, + "step": 3417 + }, + { + "epoch": 1.1075826312378483, + "grad_norm": 0.8126233220100403, + "learning_rate": 3.638092263499274e-06, + "loss": 0.1195, + "step": 3418 + }, + { + "epoch": 1.107906675307842, + "grad_norm": 0.7620636224746704, + "learning_rate": 3.6373134686605722e-06, + "loss": 0.1032, + "step": 3419 + }, + { + "epoch": 1.1082307193778353, + "grad_norm": 0.7888167500495911, + "learning_rate": 3.6365345346257112e-06, + "loss": 0.1081, + "step": 3420 + }, + { + "epoch": 1.108554763447829, + "grad_norm": 0.9147345423698425, + "learning_rate": 3.635755461490026e-06, + "loss": 0.133, + "step": 3421 + }, + { + "epoch": 1.1088788075178224, + "grad_norm": 0.823191225528717, + "learning_rate": 3.634976249348867e-06, + "loss": 0.1205, + "step": 3422 + }, + { + "epoch": 1.1092028515878158, + "grad_norm": 0.8202910423278809, + "learning_rate": 3.6341968982976027e-06, + "loss": 0.1154, + "step": 3423 + }, + { + "epoch": 1.1095268956578095, + "grad_norm": 0.8147830367088318, + "learning_rate": 3.6334174084316186e-06, + "loss": 0.1082, + "step": 3424 + }, + { + "epoch": 1.109850939727803, + "grad_norm": 0.8129696846008301, + "learning_rate": 3.632637779846315e-06, + "loss": 0.1149, + "step": 3425 + }, + { + "epoch": 1.1101749837977966, + "grad_norm": 0.9183993339538574, + "learning_rate": 3.6318580126371124e-06, + "loss": 0.1357, + "step": 3426 + }, + { + "epoch": 1.11049902786779, + "grad_norm": 0.7583103179931641, + "learning_rate": 3.631078106899446e-06, + "loss": 0.1048, + "step": 3427 + }, + { + "epoch": 1.1108230719377836, + "grad_norm": 0.8466932773590088, + "learning_rate": 3.630298062728769e-06, + "loss": 0.1206, + "step": 3428 + }, + { + "epoch": 1.111147116007777, + "grad_norm": 0.8395487070083618, + "learning_rate": 3.6295178802205515e-06, + "loss": 0.1238, + "step": 3429 + }, + { + "epoch": 1.1114711600777705, + "grad_norm": 0.8088542222976685, + "learning_rate": 3.62873755947028e-06, + "loss": 0.1162, + "step": 3430 + }, + { + "epoch": 1.1117952041477641, + "grad_norm": 0.9063001275062561, + "learning_rate": 3.6279571005734583e-06, + "loss": 0.1284, + "step": 3431 + }, + { + "epoch": 1.1121192482177575, + "grad_norm": 0.774341881275177, + "learning_rate": 3.6271765036256064e-06, + "loss": 0.0993, + "step": 3432 + }, + { + "epoch": 1.1124432922877512, + "grad_norm": 0.8491561412811279, + "learning_rate": 3.6263957687222633e-06, + "loss": 0.1114, + "step": 3433 + }, + { + "epoch": 1.1127673363577446, + "grad_norm": 0.8198556900024414, + "learning_rate": 3.625614895958982e-06, + "loss": 0.113, + "step": 3434 + }, + { + "epoch": 1.1130913804277383, + "grad_norm": 0.7903474569320679, + "learning_rate": 3.624833885431334e-06, + "loss": 0.113, + "step": 3435 + }, + { + "epoch": 1.1134154244977317, + "grad_norm": 1.7318416833877563, + "learning_rate": 3.624052737234908e-06, + "loss": 0.1048, + "step": 3436 + }, + { + "epoch": 1.1137394685677253, + "grad_norm": 0.8492423295974731, + "learning_rate": 3.6232714514653082e-06, + "loss": 0.1185, + "step": 3437 + }, + { + "epoch": 1.1140635126377187, + "grad_norm": 0.7925660014152527, + "learning_rate": 3.6224900282181574e-06, + "loss": 0.1124, + "step": 3438 + }, + { + "epoch": 1.1143875567077122, + "grad_norm": 0.847726047039032, + "learning_rate": 3.6217084675890935e-06, + "loss": 0.1211, + "step": 3439 + }, + { + "epoch": 1.1147116007777058, + "grad_norm": 0.8837020397186279, + "learning_rate": 3.6209267696737723e-06, + "loss": 0.1312, + "step": 3440 + }, + { + "epoch": 1.1150356448476992, + "grad_norm": 0.8512900471687317, + "learning_rate": 3.6201449345678657e-06, + "loss": 0.1186, + "step": 3441 + }, + { + "epoch": 1.1153596889176929, + "grad_norm": 0.7909263968467712, + "learning_rate": 3.6193629623670627e-06, + "loss": 0.1098, + "step": 3442 + }, + { + "epoch": 1.1156837329876863, + "grad_norm": 0.7999937534332275, + "learning_rate": 3.6185808531670695e-06, + "loss": 0.1119, + "step": 3443 + }, + { + "epoch": 1.1160077770576797, + "grad_norm": 0.8300805687904358, + "learning_rate": 3.617798607063609e-06, + "loss": 0.1162, + "step": 3444 + }, + { + "epoch": 1.1163318211276734, + "grad_norm": 0.7801008820533752, + "learning_rate": 3.61701622415242e-06, + "loss": 0.1079, + "step": 3445 + }, + { + "epoch": 1.1166558651976668, + "grad_norm": 0.8645122051239014, + "learning_rate": 3.616233704529259e-06, + "loss": 0.1198, + "step": 3446 + }, + { + "epoch": 1.1169799092676604, + "grad_norm": 0.7718315124511719, + "learning_rate": 3.6154510482898973e-06, + "loss": 0.1117, + "step": 3447 + }, + { + "epoch": 1.1173039533376539, + "grad_norm": 0.8447999954223633, + "learning_rate": 3.6146682555301266e-06, + "loss": 0.13, + "step": 3448 + }, + { + "epoch": 1.1176279974076475, + "grad_norm": 0.8865242004394531, + "learning_rate": 3.613885326345752e-06, + "loss": 0.1235, + "step": 3449 + }, + { + "epoch": 1.117952041477641, + "grad_norm": 0.8368310332298279, + "learning_rate": 3.6131022608325973e-06, + "loss": 0.115, + "step": 3450 + }, + { + "epoch": 1.1182760855476346, + "grad_norm": 0.8560866713523865, + "learning_rate": 3.6123190590865e-06, + "loss": 0.1211, + "step": 3451 + }, + { + "epoch": 1.118600129617628, + "grad_norm": 0.8064238429069519, + "learning_rate": 3.6115357212033196e-06, + "loss": 0.1116, + "step": 3452 + }, + { + "epoch": 1.1189241736876214, + "grad_norm": 0.808874249458313, + "learning_rate": 3.610752247278927e-06, + "loss": 0.1171, + "step": 3453 + }, + { + "epoch": 1.119248217757615, + "grad_norm": 0.814598023891449, + "learning_rate": 3.609968637409212e-06, + "loss": 0.1143, + "step": 3454 + }, + { + "epoch": 1.1195722618276085, + "grad_norm": 0.8557067513465881, + "learning_rate": 3.6091848916900816e-06, + "loss": 0.1184, + "step": 3455 + }, + { + "epoch": 1.1198963058976021, + "grad_norm": 0.8691409230232239, + "learning_rate": 3.6084010102174576e-06, + "loss": 0.1223, + "step": 3456 + }, + { + "epoch": 1.1202203499675956, + "grad_norm": 0.8462197780609131, + "learning_rate": 3.6076169930872805e-06, + "loss": 0.1197, + "step": 3457 + }, + { + "epoch": 1.1205443940375892, + "grad_norm": 0.9942762851715088, + "learning_rate": 3.606832840395506e-06, + "loss": 0.1159, + "step": 3458 + }, + { + "epoch": 1.1208684381075826, + "grad_norm": 0.8324168920516968, + "learning_rate": 3.6060485522381067e-06, + "loss": 0.12, + "step": 3459 + }, + { + "epoch": 1.121192482177576, + "grad_norm": 0.8624020218849182, + "learning_rate": 3.605264128711072e-06, + "loss": 0.124, + "step": 3460 + }, + { + "epoch": 1.1215165262475697, + "grad_norm": 0.7688712477684021, + "learning_rate": 3.6044795699104074e-06, + "loss": 0.1082, + "step": 3461 + }, + { + "epoch": 1.1218405703175631, + "grad_norm": 0.8165678977966309, + "learning_rate": 3.6036948759321357e-06, + "loss": 0.1111, + "step": 3462 + }, + { + "epoch": 1.1221646143875568, + "grad_norm": 0.8439948558807373, + "learning_rate": 3.6029100468722954e-06, + "loss": 0.1068, + "step": 3463 + }, + { + "epoch": 1.1224886584575502, + "grad_norm": 0.863638699054718, + "learning_rate": 3.602125082826944e-06, + "loss": 0.1218, + "step": 3464 + }, + { + "epoch": 1.1228127025275438, + "grad_norm": 0.8242613673210144, + "learning_rate": 3.60133998389215e-06, + "loss": 0.1168, + "step": 3465 + }, + { + "epoch": 1.1231367465975373, + "grad_norm": 0.8749831318855286, + "learning_rate": 3.600554750164005e-06, + "loss": 0.12, + "step": 3466 + }, + { + "epoch": 1.1234607906675307, + "grad_norm": 0.853952169418335, + "learning_rate": 3.5997693817386128e-06, + "loss": 0.1169, + "step": 3467 + }, + { + "epoch": 1.1237848347375243, + "grad_norm": 0.8400661945343018, + "learning_rate": 3.598983878712094e-06, + "loss": 0.1206, + "step": 3468 + }, + { + "epoch": 1.1241088788075178, + "grad_norm": 0.7379463315010071, + "learning_rate": 3.598198241180588e-06, + "loss": 0.1027, + "step": 3469 + }, + { + "epoch": 1.1244329228775114, + "grad_norm": 0.8917940258979797, + "learning_rate": 3.597412469240248e-06, + "loss": 0.1169, + "step": 3470 + }, + { + "epoch": 1.1247569669475048, + "grad_norm": 0.8408751487731934, + "learning_rate": 3.5966265629872466e-06, + "loss": 0.1189, + "step": 3471 + }, + { + "epoch": 1.1250810110174985, + "grad_norm": 0.8500704765319824, + "learning_rate": 3.595840522517769e-06, + "loss": 0.123, + "step": 3472 + }, + { + "epoch": 1.125405055087492, + "grad_norm": 0.7802397608757019, + "learning_rate": 3.5950543479280205e-06, + "loss": 0.1088, + "step": 3473 + }, + { + "epoch": 1.1257290991574855, + "grad_norm": 0.8065983653068542, + "learning_rate": 3.5942680393142203e-06, + "loss": 0.1171, + "step": 3474 + }, + { + "epoch": 1.126053143227479, + "grad_norm": 0.7985681891441345, + "learning_rate": 3.593481596772606e-06, + "loss": 0.1143, + "step": 3475 + }, + { + "epoch": 1.1263771872974724, + "grad_norm": 0.8253538608551025, + "learning_rate": 3.5926950203994303e-06, + "loss": 0.1088, + "step": 3476 + }, + { + "epoch": 1.126701231367466, + "grad_norm": 0.7611861824989319, + "learning_rate": 3.5919083102909615e-06, + "loss": 0.1148, + "step": 3477 + }, + { + "epoch": 1.1270252754374595, + "grad_norm": 0.8118978142738342, + "learning_rate": 3.591121466543487e-06, + "loss": 0.1166, + "step": 3478 + }, + { + "epoch": 1.127349319507453, + "grad_norm": 0.8605949282646179, + "learning_rate": 3.5903344892533067e-06, + "loss": 0.1075, + "step": 3479 + }, + { + "epoch": 1.1276733635774465, + "grad_norm": 0.8666369915008545, + "learning_rate": 3.5895473785167407e-06, + "loss": 0.1152, + "step": 3480 + }, + { + "epoch": 1.12799740764744, + "grad_norm": 0.8612379431724548, + "learning_rate": 3.5887601344301228e-06, + "loss": 0.1238, + "step": 3481 + }, + { + "epoch": 1.1283214517174336, + "grad_norm": 0.8836613893508911, + "learning_rate": 3.587972757089805e-06, + "loss": 0.1226, + "step": 3482 + }, + { + "epoch": 1.128645495787427, + "grad_norm": 0.8429580926895142, + "learning_rate": 3.587185246592154e-06, + "loss": 0.1138, + "step": 3483 + }, + { + "epoch": 1.1289695398574207, + "grad_norm": 0.8613108396530151, + "learning_rate": 3.5863976030335535e-06, + "loss": 0.1205, + "step": 3484 + }, + { + "epoch": 1.129293583927414, + "grad_norm": 0.8061350584030151, + "learning_rate": 3.5856098265104033e-06, + "loss": 0.1081, + "step": 3485 + }, + { + "epoch": 1.1296176279974077, + "grad_norm": 0.755104124546051, + "learning_rate": 3.58482191711912e-06, + "loss": 0.1155, + "step": 3486 + }, + { + "epoch": 1.1299416720674011, + "grad_norm": 0.9006972908973694, + "learning_rate": 3.5840338749561365e-06, + "loss": 0.1212, + "step": 3487 + }, + { + "epoch": 1.1302657161373948, + "grad_norm": 0.7697506546974182, + "learning_rate": 3.5832457001179e-06, + "loss": 0.1079, + "step": 3488 + }, + { + "epoch": 1.1305897602073882, + "grad_norm": 0.920031726360321, + "learning_rate": 3.582457392700878e-06, + "loss": 0.1285, + "step": 3489 + }, + { + "epoch": 1.1309138042773816, + "grad_norm": 0.8720796704292297, + "learning_rate": 3.5816689528015485e-06, + "loss": 0.1183, + "step": 3490 + }, + { + "epoch": 1.1312378483473753, + "grad_norm": 0.8361836075782776, + "learning_rate": 3.580880380516411e-06, + "loss": 0.1163, + "step": 3491 + }, + { + "epoch": 1.1315618924173687, + "grad_norm": 0.900898814201355, + "learning_rate": 3.5800916759419784e-06, + "loss": 0.1269, + "step": 3492 + }, + { + "epoch": 1.1318859364873624, + "grad_norm": 0.9403854608535767, + "learning_rate": 3.579302839174781e-06, + "loss": 0.1318, + "step": 3493 + }, + { + "epoch": 1.1322099805573558, + "grad_norm": 0.8255032300949097, + "learning_rate": 3.578513870311365e-06, + "loss": 0.1165, + "step": 3494 + }, + { + "epoch": 1.1325340246273492, + "grad_norm": 0.8372628092765808, + "learning_rate": 3.577724769448292e-06, + "loss": 0.1245, + "step": 3495 + }, + { + "epoch": 1.1328580686973428, + "grad_norm": 0.801400363445282, + "learning_rate": 3.57693553668214e-06, + "loss": 0.1133, + "step": 3496 + }, + { + "epoch": 1.1331821127673363, + "grad_norm": 0.832352340221405, + "learning_rate": 3.5761461721095037e-06, + "loss": 0.1279, + "step": 3497 + }, + { + "epoch": 1.13350615683733, + "grad_norm": 0.9028059840202332, + "learning_rate": 3.575356675826995e-06, + "loss": 0.1193, + "step": 3498 + }, + { + "epoch": 1.1338302009073233, + "grad_norm": 0.837671160697937, + "learning_rate": 3.574567047931238e-06, + "loss": 0.1244, + "step": 3499 + }, + { + "epoch": 1.134154244977317, + "grad_norm": 0.7758857011795044, + "learning_rate": 3.5737772885188777e-06, + "loss": 0.1149, + "step": 3500 + }, + { + "epoch": 1.1344782890473104, + "grad_norm": 0.8156300783157349, + "learning_rate": 3.5729873976865726e-06, + "loss": 0.1178, + "step": 3501 + }, + { + "epoch": 1.134802333117304, + "grad_norm": 0.8366619944572449, + "learning_rate": 3.5721973755309963e-06, + "loss": 0.1186, + "step": 3502 + }, + { + "epoch": 1.1351263771872975, + "grad_norm": 0.7583264708518982, + "learning_rate": 3.5714072221488414e-06, + "loss": 0.1086, + "step": 3503 + }, + { + "epoch": 1.135450421257291, + "grad_norm": 0.8399912714958191, + "learning_rate": 3.5706169376368143e-06, + "loss": 0.1206, + "step": 3504 + }, + { + "epoch": 1.1357744653272845, + "grad_norm": 0.8327789306640625, + "learning_rate": 3.5698265220916388e-06, + "loss": 0.123, + "step": 3505 + }, + { + "epoch": 1.136098509397278, + "grad_norm": 0.8301259875297546, + "learning_rate": 3.5690359756100532e-06, + "loss": 0.1242, + "step": 3506 + }, + { + "epoch": 1.1364225534672716, + "grad_norm": 0.8315457701683044, + "learning_rate": 3.5682452982888143e-06, + "loss": 0.1241, + "step": 3507 + }, + { + "epoch": 1.136746597537265, + "grad_norm": 0.8232241868972778, + "learning_rate": 3.5674544902246916e-06, + "loss": 0.121, + "step": 3508 + }, + { + "epoch": 1.1370706416072587, + "grad_norm": 0.8107770085334778, + "learning_rate": 3.566663551514473e-06, + "loss": 0.1149, + "step": 3509 + }, + { + "epoch": 1.137394685677252, + "grad_norm": 0.7625928521156311, + "learning_rate": 3.5658724822549624e-06, + "loss": 0.1046, + "step": 3510 + }, + { + "epoch": 1.1377187297472457, + "grad_norm": 0.8264610767364502, + "learning_rate": 3.5650812825429774e-06, + "loss": 0.1142, + "step": 3511 + }, + { + "epoch": 1.1380427738172392, + "grad_norm": 0.8182802200317383, + "learning_rate": 3.5642899524753548e-06, + "loss": 0.1156, + "step": 3512 + }, + { + "epoch": 1.1383668178872326, + "grad_norm": 0.8160684704780579, + "learning_rate": 3.5634984921489455e-06, + "loss": 0.1112, + "step": 3513 + }, + { + "epoch": 1.1386908619572262, + "grad_norm": 0.8345882296562195, + "learning_rate": 3.562706901660616e-06, + "loss": 0.1198, + "step": 3514 + }, + { + "epoch": 1.1390149060272197, + "grad_norm": 0.7987375259399414, + "learning_rate": 3.561915181107249e-06, + "loss": 0.1113, + "step": 3515 + }, + { + "epoch": 1.1393389500972133, + "grad_norm": 0.8715707659721375, + "learning_rate": 3.561123330585744e-06, + "loss": 0.1209, + "step": 3516 + }, + { + "epoch": 1.1396629941672067, + "grad_norm": 0.9261237978935242, + "learning_rate": 3.560331350193016e-06, + "loss": 0.1182, + "step": 3517 + }, + { + "epoch": 1.1399870382372002, + "grad_norm": 0.8216531872749329, + "learning_rate": 3.5595392400259963e-06, + "loss": 0.1148, + "step": 3518 + }, + { + "epoch": 1.1403110823071938, + "grad_norm": 0.7825983166694641, + "learning_rate": 3.55874700018163e-06, + "loss": 0.1094, + "step": 3519 + }, + { + "epoch": 1.1406351263771872, + "grad_norm": 0.932933509349823, + "learning_rate": 3.5579546307568807e-06, + "loss": 0.1231, + "step": 3520 + }, + { + "epoch": 1.1409591704471809, + "grad_norm": 0.8715762495994568, + "learning_rate": 3.557162131848726e-06, + "loss": 0.1248, + "step": 3521 + }, + { + "epoch": 1.1412832145171743, + "grad_norm": 0.8369512557983398, + "learning_rate": 3.5563695035541607e-06, + "loss": 0.1064, + "step": 3522 + }, + { + "epoch": 1.141607258587168, + "grad_norm": 0.739835262298584, + "learning_rate": 3.5555767459701946e-06, + "loss": 0.1117, + "step": 3523 + }, + { + "epoch": 1.1419313026571614, + "grad_norm": 0.8349111676216125, + "learning_rate": 3.554783859193853e-06, + "loss": 0.1152, + "step": 3524 + }, + { + "epoch": 1.142255346727155, + "grad_norm": 0.8169111013412476, + "learning_rate": 3.5539908433221793e-06, + "loss": 0.1142, + "step": 3525 + }, + { + "epoch": 1.1425793907971484, + "grad_norm": 0.804642915725708, + "learning_rate": 3.553197698452229e-06, + "loss": 0.1133, + "step": 3526 + }, + { + "epoch": 1.1429034348671419, + "grad_norm": 0.8334620594978333, + "learning_rate": 3.5524044246810764e-06, + "loss": 0.1196, + "step": 3527 + }, + { + "epoch": 1.1432274789371355, + "grad_norm": 0.8267092704772949, + "learning_rate": 3.5516110221058096e-06, + "loss": 0.118, + "step": 3528 + }, + { + "epoch": 1.143551523007129, + "grad_norm": 0.7908319234848022, + "learning_rate": 3.550817490823535e-06, + "loss": 0.1122, + "step": 3529 + }, + { + "epoch": 1.1438755670771226, + "grad_norm": 0.8376598954200745, + "learning_rate": 3.5500238309313717e-06, + "loss": 0.1122, + "step": 3530 + }, + { + "epoch": 1.144199611147116, + "grad_norm": 0.8465738296508789, + "learning_rate": 3.5492300425264574e-06, + "loss": 0.1159, + "step": 3531 + }, + { + "epoch": 1.1445236552171094, + "grad_norm": 0.8763962388038635, + "learning_rate": 3.5484361257059425e-06, + "loss": 0.1256, + "step": 3532 + }, + { + "epoch": 1.144847699287103, + "grad_norm": 0.8678879737854004, + "learning_rate": 3.5476420805669953e-06, + "loss": 0.1215, + "step": 3533 + }, + { + "epoch": 1.1451717433570965, + "grad_norm": 0.8365668058395386, + "learning_rate": 3.5468479072067996e-06, + "loss": 0.1156, + "step": 3534 + }, + { + "epoch": 1.1454957874270901, + "grad_norm": 0.817199170589447, + "learning_rate": 3.5460536057225542e-06, + "loss": 0.1117, + "step": 3535 + }, + { + "epoch": 1.1458198314970836, + "grad_norm": 0.8301751017570496, + "learning_rate": 3.545259176211474e-06, + "loss": 0.1111, + "step": 3536 + }, + { + "epoch": 1.1461438755670772, + "grad_norm": 0.845215916633606, + "learning_rate": 3.5444646187707897e-06, + "loss": 0.1216, + "step": 3537 + }, + { + "epoch": 1.1464679196370706, + "grad_norm": 0.8913041353225708, + "learning_rate": 3.5436699334977476e-06, + "loss": 0.1364, + "step": 3538 + }, + { + "epoch": 1.1467919637070643, + "grad_norm": 0.7893886566162109, + "learning_rate": 3.5428751204896083e-06, + "loss": 0.1106, + "step": 3539 + }, + { + "epoch": 1.1471160077770577, + "grad_norm": 0.8129891753196716, + "learning_rate": 3.542080179843651e-06, + "loss": 0.1157, + "step": 3540 + }, + { + "epoch": 1.1474400518470511, + "grad_norm": 0.8534232974052429, + "learning_rate": 3.5412851116571673e-06, + "loss": 0.1198, + "step": 3541 + }, + { + "epoch": 1.1477640959170448, + "grad_norm": 0.8511255383491516, + "learning_rate": 3.5404899160274664e-06, + "loss": 0.1205, + "step": 3542 + }, + { + "epoch": 1.1480881399870382, + "grad_norm": 0.8838348984718323, + "learning_rate": 3.5396945930518722e-06, + "loss": 0.1295, + "step": 3543 + }, + { + "epoch": 1.1484121840570318, + "grad_norm": 0.8620544672012329, + "learning_rate": 3.538899142827726e-06, + "loss": 0.1107, + "step": 3544 + }, + { + "epoch": 1.1487362281270252, + "grad_norm": 0.8188989162445068, + "learning_rate": 3.538103565452381e-06, + "loss": 0.1173, + "step": 3545 + }, + { + "epoch": 1.1490602721970187, + "grad_norm": 0.8156138062477112, + "learning_rate": 3.537307861023209e-06, + "loss": 0.1171, + "step": 3546 + }, + { + "epoch": 1.1493843162670123, + "grad_norm": 0.7772921323776245, + "learning_rate": 3.536512029637597e-06, + "loss": 0.1117, + "step": 3547 + }, + { + "epoch": 1.1497083603370057, + "grad_norm": 0.9473240971565247, + "learning_rate": 3.5357160713929473e-06, + "loss": 0.123, + "step": 3548 + }, + { + "epoch": 1.1500324044069994, + "grad_norm": 0.8777554035186768, + "learning_rate": 3.534919986386676e-06, + "loss": 0.1224, + "step": 3549 + }, + { + "epoch": 1.1503564484769928, + "grad_norm": 0.825102686882019, + "learning_rate": 3.5341237747162183e-06, + "loss": 0.1251, + "step": 3550 + }, + { + "epoch": 1.1506804925469865, + "grad_norm": 0.8877979516983032, + "learning_rate": 3.533327436479021e-06, + "loss": 0.1228, + "step": 3551 + }, + { + "epoch": 1.1510045366169799, + "grad_norm": 0.9140046238899231, + "learning_rate": 3.53253097177255e-06, + "loss": 0.13, + "step": 3552 + }, + { + "epoch": 1.1513285806869735, + "grad_norm": 0.9374979734420776, + "learning_rate": 3.531734380694282e-06, + "loss": 0.1281, + "step": 3553 + }, + { + "epoch": 1.151652624756967, + "grad_norm": 0.8636061549186707, + "learning_rate": 3.5309376633417146e-06, + "loss": 0.1286, + "step": 3554 + }, + { + "epoch": 1.1519766688269604, + "grad_norm": 0.7899109721183777, + "learning_rate": 3.530140819812357e-06, + "loss": 0.1173, + "step": 3555 + }, + { + "epoch": 1.152300712896954, + "grad_norm": 0.819510281085968, + "learning_rate": 3.5293438502037363e-06, + "loss": 0.1186, + "step": 3556 + }, + { + "epoch": 1.1526247569669474, + "grad_norm": 0.8103784322738647, + "learning_rate": 3.5285467546133926e-06, + "loss": 0.1182, + "step": 3557 + }, + { + "epoch": 1.152948801036941, + "grad_norm": 0.8102884888648987, + "learning_rate": 3.5277495331388835e-06, + "loss": 0.1167, + "step": 3558 + }, + { + "epoch": 1.1532728451069345, + "grad_norm": 0.8475920557975769, + "learning_rate": 3.526952185877781e-06, + "loss": 0.1234, + "step": 3559 + }, + { + "epoch": 1.1535968891769282, + "grad_norm": 0.8501052856445312, + "learning_rate": 3.526154712927672e-06, + "loss": 0.1218, + "step": 3560 + }, + { + "epoch": 1.1539209332469216, + "grad_norm": 0.7630634307861328, + "learning_rate": 3.525357114386161e-06, + "loss": 0.1099, + "step": 3561 + }, + { + "epoch": 1.1542449773169152, + "grad_norm": 0.8789275884628296, + "learning_rate": 3.524559390350865e-06, + "loss": 0.12, + "step": 3562 + }, + { + "epoch": 1.1545690213869086, + "grad_norm": 0.7459178566932678, + "learning_rate": 3.523761540919418e-06, + "loss": 0.1022, + "step": 3563 + }, + { + "epoch": 1.154893065456902, + "grad_norm": 0.7126516103744507, + "learning_rate": 3.5229635661894696e-06, + "loss": 0.1033, + "step": 3564 + }, + { + "epoch": 1.1552171095268957, + "grad_norm": 0.7776930928230286, + "learning_rate": 3.5221654662586837e-06, + "loss": 0.1093, + "step": 3565 + }, + { + "epoch": 1.1555411535968891, + "grad_norm": 0.7800213694572449, + "learning_rate": 3.521367241224739e-06, + "loss": 0.1102, + "step": 3566 + }, + { + "epoch": 1.1558651976668828, + "grad_norm": 0.7869005799293518, + "learning_rate": 3.5205688911853326e-06, + "loss": 0.105, + "step": 3567 + }, + { + "epoch": 1.1561892417368762, + "grad_norm": 0.8360563516616821, + "learning_rate": 3.5197704162381742e-06, + "loss": 0.1201, + "step": 3568 + }, + { + "epoch": 1.1565132858068696, + "grad_norm": 0.8422601222991943, + "learning_rate": 3.5189718164809884e-06, + "loss": 0.1286, + "step": 3569 + }, + { + "epoch": 1.1568373298768633, + "grad_norm": 0.8678613901138306, + "learning_rate": 3.5181730920115165e-06, + "loss": 0.1163, + "step": 3570 + }, + { + "epoch": 1.1571613739468567, + "grad_norm": 0.8137229681015015, + "learning_rate": 3.517374242927514e-06, + "loss": 0.1132, + "step": 3571 + }, + { + "epoch": 1.1574854180168503, + "grad_norm": 0.7828996181488037, + "learning_rate": 3.516575269326755e-06, + "loss": 0.1156, + "step": 3572 + }, + { + "epoch": 1.1578094620868438, + "grad_norm": 0.7514994740486145, + "learning_rate": 3.515776171307023e-06, + "loss": 0.1053, + "step": 3573 + }, + { + "epoch": 1.1581335061568374, + "grad_norm": 0.7747782468795776, + "learning_rate": 3.5149769489661216e-06, + "loss": 0.1069, + "step": 3574 + }, + { + "epoch": 1.1584575502268308, + "grad_norm": 0.7957112193107605, + "learning_rate": 3.5141776024018676e-06, + "loss": 0.1064, + "step": 3575 + }, + { + "epoch": 1.1587815942968245, + "grad_norm": 0.8027356863021851, + "learning_rate": 3.513378131712092e-06, + "loss": 0.1126, + "step": 3576 + }, + { + "epoch": 1.159105638366818, + "grad_norm": 0.9086324572563171, + "learning_rate": 3.5125785369946442e-06, + "loss": 0.1256, + "step": 3577 + }, + { + "epoch": 1.1594296824368113, + "grad_norm": 0.7713324427604675, + "learning_rate": 3.5117788183473856e-06, + "loss": 0.1096, + "step": 3578 + }, + { + "epoch": 1.159753726506805, + "grad_norm": 0.8583053350448608, + "learning_rate": 3.5109789758681944e-06, + "loss": 0.1113, + "step": 3579 + }, + { + "epoch": 1.1600777705767984, + "grad_norm": 0.831057071685791, + "learning_rate": 3.5101790096549643e-06, + "loss": 0.1163, + "step": 3580 + }, + { + "epoch": 1.160401814646792, + "grad_norm": 0.7998328804969788, + "learning_rate": 3.509378919805602e-06, + "loss": 0.1171, + "step": 3581 + }, + { + "epoch": 1.1607258587167855, + "grad_norm": 0.8440855741500854, + "learning_rate": 3.5085787064180317e-06, + "loss": 0.122, + "step": 3582 + }, + { + "epoch": 1.1610499027867789, + "grad_norm": 0.8473719358444214, + "learning_rate": 3.5077783695901917e-06, + "loss": 0.1072, + "step": 3583 + }, + { + "epoch": 1.1613739468567725, + "grad_norm": 0.7954153418540955, + "learning_rate": 3.506977909420035e-06, + "loss": 0.1138, + "step": 3584 + }, + { + "epoch": 1.161697990926766, + "grad_norm": 0.836928129196167, + "learning_rate": 3.506177326005531e-06, + "loss": 0.1263, + "step": 3585 + }, + { + "epoch": 1.1620220349967596, + "grad_norm": 0.8505675196647644, + "learning_rate": 3.5053766194446626e-06, + "loss": 0.1155, + "step": 3586 + }, + { + "epoch": 1.162346079066753, + "grad_norm": 0.8441529273986816, + "learning_rate": 3.504575789835428e-06, + "loss": 0.1185, + "step": 3587 + }, + { + "epoch": 1.1626701231367467, + "grad_norm": 0.8499163389205933, + "learning_rate": 3.503774837275843e-06, + "loss": 0.115, + "step": 3588 + }, + { + "epoch": 1.16299416720674, + "grad_norm": 0.8068932294845581, + "learning_rate": 3.5029737618639344e-06, + "loss": 0.1064, + "step": 3589 + }, + { + "epoch": 1.1633182112767337, + "grad_norm": 0.8755378723144531, + "learning_rate": 3.5021725636977466e-06, + "loss": 0.1189, + "step": 3590 + }, + { + "epoch": 1.1636422553467272, + "grad_norm": 0.8903499841690063, + "learning_rate": 3.5013712428753392e-06, + "loss": 0.1253, + "step": 3591 + }, + { + "epoch": 1.1639662994167206, + "grad_norm": 0.7750681042671204, + "learning_rate": 3.500569799494786e-06, + "loss": 0.1071, + "step": 3592 + }, + { + "epoch": 1.1642903434867142, + "grad_norm": 0.7885310649871826, + "learning_rate": 3.4997682336541756e-06, + "loss": 0.1155, + "step": 3593 + }, + { + "epoch": 1.1646143875567077, + "grad_norm": 0.8886268138885498, + "learning_rate": 3.498966545451612e-06, + "loss": 0.1259, + "step": 3594 + }, + { + "epoch": 1.1649384316267013, + "grad_norm": 0.8036131858825684, + "learning_rate": 3.4981647349852137e-06, + "loss": 0.1145, + "step": 3595 + }, + { + "epoch": 1.1652624756966947, + "grad_norm": 0.7636457681655884, + "learning_rate": 3.4973628023531146e-06, + "loss": 0.0985, + "step": 3596 + }, + { + "epoch": 1.1655865197666881, + "grad_norm": 0.8249123692512512, + "learning_rate": 3.496560747653464e-06, + "loss": 0.1203, + "step": 3597 + }, + { + "epoch": 1.1659105638366818, + "grad_norm": 0.8131135702133179, + "learning_rate": 3.4957585709844254e-06, + "loss": 0.1138, + "step": 3598 + }, + { + "epoch": 1.1662346079066752, + "grad_norm": 0.8088524341583252, + "learning_rate": 3.494956272444177e-06, + "loss": 0.1147, + "step": 3599 + }, + { + "epoch": 1.1665586519766689, + "grad_norm": 0.8252901434898376, + "learning_rate": 3.494153852130913e-06, + "loss": 0.1206, + "step": 3600 + }, + { + "epoch": 1.1668826960466623, + "grad_norm": 0.7895000576972961, + "learning_rate": 3.4933513101428416e-06, + "loss": 0.1061, + "step": 3601 + }, + { + "epoch": 1.167206740116656, + "grad_norm": 0.8485018014907837, + "learning_rate": 3.4925486465781865e-06, + "loss": 0.1161, + "step": 3602 + }, + { + "epoch": 1.1675307841866494, + "grad_norm": 0.9247622489929199, + "learning_rate": 3.4917458615351853e-06, + "loss": 0.1252, + "step": 3603 + }, + { + "epoch": 1.167854828256643, + "grad_norm": 0.8563806414604187, + "learning_rate": 3.490942955112092e-06, + "loss": 0.114, + "step": 3604 + }, + { + "epoch": 1.1681788723266364, + "grad_norm": 0.9615631699562073, + "learning_rate": 3.490139927407174e-06, + "loss": 0.1387, + "step": 3605 + }, + { + "epoch": 1.1685029163966298, + "grad_norm": 0.8398370742797852, + "learning_rate": 3.4893367785187137e-06, + "loss": 0.122, + "step": 3606 + }, + { + "epoch": 1.1688269604666235, + "grad_norm": 0.8580317497253418, + "learning_rate": 3.4885335085450095e-06, + "loss": 0.1137, + "step": 3607 + }, + { + "epoch": 1.169151004536617, + "grad_norm": 0.8144624829292297, + "learning_rate": 3.4877301175843735e-06, + "loss": 0.1166, + "step": 3608 + }, + { + "epoch": 1.1694750486066106, + "grad_norm": 0.8784302473068237, + "learning_rate": 3.486926605735133e-06, + "loss": 0.123, + "step": 3609 + }, + { + "epoch": 1.169799092676604, + "grad_norm": 0.8242217898368835, + "learning_rate": 3.486122973095631e-06, + "loss": 0.1254, + "step": 3610 + }, + { + "epoch": 1.1701231367465976, + "grad_norm": 0.8689938187599182, + "learning_rate": 3.4853192197642226e-06, + "loss": 0.1201, + "step": 3611 + }, + { + "epoch": 1.170447180816591, + "grad_norm": 0.804336667060852, + "learning_rate": 3.48451534583928e-06, + "loss": 0.1263, + "step": 3612 + }, + { + "epoch": 1.1707712248865847, + "grad_norm": 0.8793946504592896, + "learning_rate": 3.4837113514191907e-06, + "loss": 0.1355, + "step": 3613 + }, + { + "epoch": 1.1710952689565781, + "grad_norm": 0.8660229444503784, + "learning_rate": 3.482907236602354e-06, + "loss": 0.1216, + "step": 3614 + }, + { + "epoch": 1.1714193130265715, + "grad_norm": 0.7956066727638245, + "learning_rate": 3.4821030014871886e-06, + "loss": 0.1165, + "step": 3615 + }, + { + "epoch": 1.1717433570965652, + "grad_norm": 0.8330211639404297, + "learning_rate": 3.481298646172122e-06, + "loss": 0.1127, + "step": 3616 + }, + { + "epoch": 1.1720674011665586, + "grad_norm": 0.7992603182792664, + "learning_rate": 3.480494170755602e-06, + "loss": 0.112, + "step": 3617 + }, + { + "epoch": 1.1723914452365523, + "grad_norm": 0.7594689726829529, + "learning_rate": 3.479689575336086e-06, + "loss": 0.1078, + "step": 3618 + }, + { + "epoch": 1.1727154893065457, + "grad_norm": 0.8167518973350525, + "learning_rate": 3.4788848600120507e-06, + "loss": 0.1056, + "step": 3619 + }, + { + "epoch": 1.173039533376539, + "grad_norm": 0.8602342009544373, + "learning_rate": 3.4780800248819847e-06, + "loss": 0.118, + "step": 3620 + }, + { + "epoch": 1.1733635774465327, + "grad_norm": 0.8619323968887329, + "learning_rate": 3.4772750700443923e-06, + "loss": 0.1171, + "step": 3621 + }, + { + "epoch": 1.1736876215165262, + "grad_norm": 0.8086685538291931, + "learning_rate": 3.476469995597792e-06, + "loss": 0.1093, + "step": 3622 + }, + { + "epoch": 1.1740116655865198, + "grad_norm": 0.8759011030197144, + "learning_rate": 3.4756648016407175e-06, + "loss": 0.1131, + "step": 3623 + }, + { + "epoch": 1.1743357096565132, + "grad_norm": 0.8564737439155579, + "learning_rate": 3.4748594882717163e-06, + "loss": 0.1182, + "step": 3624 + }, + { + "epoch": 1.1746597537265069, + "grad_norm": 0.8285084962844849, + "learning_rate": 3.474054055589351e-06, + "loss": 0.1075, + "step": 3625 + }, + { + "epoch": 1.1749837977965003, + "grad_norm": 0.932595431804657, + "learning_rate": 3.473248503692199e-06, + "loss": 0.1289, + "step": 3626 + }, + { + "epoch": 1.175307841866494, + "grad_norm": 0.844597578048706, + "learning_rate": 3.472442832678852e-06, + "loss": 0.121, + "step": 3627 + }, + { + "epoch": 1.1756318859364874, + "grad_norm": 0.9832401275634766, + "learning_rate": 3.471637042647916e-06, + "loss": 0.1271, + "step": 3628 + }, + { + "epoch": 1.1759559300064808, + "grad_norm": 0.880580484867096, + "learning_rate": 3.470831133698013e-06, + "loss": 0.1166, + "step": 3629 + }, + { + "epoch": 1.1762799740764744, + "grad_norm": 0.8617246150970459, + "learning_rate": 3.470025105927777e-06, + "loss": 0.1141, + "step": 3630 + }, + { + "epoch": 1.1766040181464679, + "grad_norm": 0.8869532346725464, + "learning_rate": 3.4692189594358578e-06, + "loss": 0.1227, + "step": 3631 + }, + { + "epoch": 1.1769280622164615, + "grad_norm": 0.8361830711364746, + "learning_rate": 3.468412694320921e-06, + "loss": 0.1171, + "step": 3632 + }, + { + "epoch": 1.177252106286455, + "grad_norm": 0.8260972499847412, + "learning_rate": 3.467606310681646e-06, + "loss": 0.1133, + "step": 3633 + }, + { + "epoch": 1.1775761503564484, + "grad_norm": 0.8462033867835999, + "learning_rate": 3.4667998086167253e-06, + "loss": 0.1181, + "step": 3634 + }, + { + "epoch": 1.177900194426442, + "grad_norm": 0.826924204826355, + "learning_rate": 3.465993188224868e-06, + "loss": 0.1122, + "step": 3635 + }, + { + "epoch": 1.1782242384964354, + "grad_norm": 0.7624189853668213, + "learning_rate": 3.4651864496047952e-06, + "loss": 0.1142, + "step": 3636 + }, + { + "epoch": 1.178548282566429, + "grad_norm": 0.8189399242401123, + "learning_rate": 3.464379592855246e-06, + "loss": 0.1198, + "step": 3637 + }, + { + "epoch": 1.1788723266364225, + "grad_norm": 0.8579363822937012, + "learning_rate": 3.4635726180749698e-06, + "loss": 0.1197, + "step": 3638 + }, + { + "epoch": 1.1791963707064161, + "grad_norm": 0.8414760231971741, + "learning_rate": 3.4627655253627324e-06, + "loss": 0.1269, + "step": 3639 + }, + { + "epoch": 1.1795204147764096, + "grad_norm": 0.8462185263633728, + "learning_rate": 3.461958314817316e-06, + "loss": 0.1205, + "step": 3640 + }, + { + "epoch": 1.1798444588464032, + "grad_norm": 0.8419263958930969, + "learning_rate": 3.4611509865375143e-06, + "loss": 0.1262, + "step": 3641 + }, + { + "epoch": 1.1801685029163966, + "grad_norm": 0.8019840121269226, + "learning_rate": 3.4603435406221356e-06, + "loss": 0.1171, + "step": 3642 + }, + { + "epoch": 1.18049254698639, + "grad_norm": 0.850952684879303, + "learning_rate": 3.4595359771700055e-06, + "loss": 0.1245, + "step": 3643 + }, + { + "epoch": 1.1808165910563837, + "grad_norm": 0.7618928551673889, + "learning_rate": 3.4587282962799602e-06, + "loss": 0.1056, + "step": 3644 + }, + { + "epoch": 1.1811406351263771, + "grad_norm": 0.7782850861549377, + "learning_rate": 3.4579204980508525e-06, + "loss": 0.1109, + "step": 3645 + }, + { + "epoch": 1.1814646791963708, + "grad_norm": 0.8100502490997314, + "learning_rate": 3.45711258258155e-06, + "loss": 0.1163, + "step": 3646 + }, + { + "epoch": 1.1817887232663642, + "grad_norm": 0.9370281100273132, + "learning_rate": 3.4563045499709324e-06, + "loss": 0.1319, + "step": 3647 + }, + { + "epoch": 1.1821127673363578, + "grad_norm": 0.8365497589111328, + "learning_rate": 3.455496400317896e-06, + "loss": 0.1188, + "step": 3648 + }, + { + "epoch": 1.1824368114063513, + "grad_norm": 0.8149347305297852, + "learning_rate": 3.45468813372135e-06, + "loss": 0.1127, + "step": 3649 + }, + { + "epoch": 1.1827608554763447, + "grad_norm": 0.815455436706543, + "learning_rate": 3.453879750280218e-06, + "loss": 0.1072, + "step": 3650 + }, + { + "epoch": 1.1830848995463383, + "grad_norm": 0.8272018432617188, + "learning_rate": 3.4530712500934393e-06, + "loss": 0.1172, + "step": 3651 + }, + { + "epoch": 1.1834089436163318, + "grad_norm": 0.8645578622817993, + "learning_rate": 3.4522626332599657e-06, + "loss": 0.1261, + "step": 3652 + }, + { + "epoch": 1.1837329876863254, + "grad_norm": 0.8879179954528809, + "learning_rate": 3.451453899878765e-06, + "loss": 0.1229, + "step": 3653 + }, + { + "epoch": 1.1840570317563188, + "grad_norm": 0.8350304961204529, + "learning_rate": 3.450645050048817e-06, + "loss": 0.116, + "step": 3654 + }, + { + "epoch": 1.1843810758263125, + "grad_norm": 0.7642476558685303, + "learning_rate": 3.449836083869118e-06, + "loss": 0.1096, + "step": 3655 + }, + { + "epoch": 1.184705119896306, + "grad_norm": 0.7937090396881104, + "learning_rate": 3.449027001438678e-06, + "loss": 0.1167, + "step": 3656 + }, + { + "epoch": 1.1850291639662993, + "grad_norm": 0.9045467972755432, + "learning_rate": 3.44821780285652e-06, + "loss": 0.1261, + "step": 3657 + }, + { + "epoch": 1.185353208036293, + "grad_norm": 0.8645219206809998, + "learning_rate": 3.4474084882216826e-06, + "loss": 0.1329, + "step": 3658 + }, + { + "epoch": 1.1856772521062864, + "grad_norm": 0.8591427206993103, + "learning_rate": 3.4465990576332177e-06, + "loss": 0.1254, + "step": 3659 + }, + { + "epoch": 1.18600129617628, + "grad_norm": 0.8770203590393066, + "learning_rate": 3.445789511190192e-06, + "loss": 0.122, + "step": 3660 + }, + { + "epoch": 1.1863253402462735, + "grad_norm": 0.8709147572517395, + "learning_rate": 3.4449798489916856e-06, + "loss": 0.1274, + "step": 3661 + }, + { + "epoch": 1.186649384316267, + "grad_norm": 0.8532130122184753, + "learning_rate": 3.444170071136794e-06, + "loss": 0.1276, + "step": 3662 + }, + { + "epoch": 1.1869734283862605, + "grad_norm": 0.831161618232727, + "learning_rate": 3.4433601777246263e-06, + "loss": 0.122, + "step": 3663 + }, + { + "epoch": 1.1872974724562542, + "grad_norm": 0.7930514812469482, + "learning_rate": 3.442550168854305e-06, + "loss": 0.1182, + "step": 3664 + }, + { + "epoch": 1.1876215165262476, + "grad_norm": 0.7588174939155579, + "learning_rate": 3.4417400446249684e-06, + "loss": 0.1083, + "step": 3665 + }, + { + "epoch": 1.187945560596241, + "grad_norm": 0.8680461645126343, + "learning_rate": 3.440929805135766e-06, + "loss": 0.1298, + "step": 3666 + }, + { + "epoch": 1.1882696046662347, + "grad_norm": 0.8464110493659973, + "learning_rate": 3.440119450485865e-06, + "loss": 0.1155, + "step": 3667 + }, + { + "epoch": 1.188593648736228, + "grad_norm": 0.8591915369033813, + "learning_rate": 3.439308980774444e-06, + "loss": 0.1267, + "step": 3668 + }, + { + "epoch": 1.1889176928062217, + "grad_norm": 0.7415616512298584, + "learning_rate": 3.438498396100697e-06, + "loss": 0.0989, + "step": 3669 + }, + { + "epoch": 1.1892417368762151, + "grad_norm": 0.8387213945388794, + "learning_rate": 3.4376876965638317e-06, + "loss": 0.1218, + "step": 3670 + }, + { + "epoch": 1.1895657809462086, + "grad_norm": 0.8068972826004028, + "learning_rate": 3.4368768822630705e-06, + "loss": 0.112, + "step": 3671 + }, + { + "epoch": 1.1898898250162022, + "grad_norm": 0.8905644416809082, + "learning_rate": 3.4360659532976475e-06, + "loss": 0.1264, + "step": 3672 + }, + { + "epoch": 1.1902138690861956, + "grad_norm": 0.769451916217804, + "learning_rate": 3.435254909766814e-06, + "loss": 0.1135, + "step": 3673 + }, + { + "epoch": 1.1905379131561893, + "grad_norm": 0.7571532726287842, + "learning_rate": 3.4344437517698336e-06, + "loss": 0.1142, + "step": 3674 + }, + { + "epoch": 1.1908619572261827, + "grad_norm": 0.8844805955886841, + "learning_rate": 3.433632479405984e-06, + "loss": 0.1057, + "step": 3675 + }, + { + "epoch": 1.1911860012961764, + "grad_norm": 0.8947396874427795, + "learning_rate": 3.4328210927745577e-06, + "loss": 0.1295, + "step": 3676 + }, + { + "epoch": 1.1915100453661698, + "grad_norm": 0.7603358030319214, + "learning_rate": 3.4320095919748596e-06, + "loss": 0.1088, + "step": 3677 + }, + { + "epoch": 1.1918340894361634, + "grad_norm": 0.8614794611930847, + "learning_rate": 3.43119797710621e-06, + "loss": 0.1288, + "step": 3678 + }, + { + "epoch": 1.1921581335061568, + "grad_norm": 0.8844104409217834, + "learning_rate": 3.4303862482679435e-06, + "loss": 0.1293, + "step": 3679 + }, + { + "epoch": 1.1924821775761503, + "grad_norm": 0.8741898536682129, + "learning_rate": 3.429574405559406e-06, + "loss": 0.1255, + "step": 3680 + }, + { + "epoch": 1.192806221646144, + "grad_norm": 0.8146804571151733, + "learning_rate": 3.4287624490799605e-06, + "loss": 0.1139, + "step": 3681 + }, + { + "epoch": 1.1931302657161373, + "grad_norm": 0.8065559267997742, + "learning_rate": 3.4279503789289824e-06, + "loss": 0.1118, + "step": 3682 + }, + { + "epoch": 1.193454309786131, + "grad_norm": 0.8196713328361511, + "learning_rate": 3.4271381952058607e-06, + "loss": 0.118, + "step": 3683 + }, + { + "epoch": 1.1937783538561244, + "grad_norm": 0.7994847893714905, + "learning_rate": 3.42632589801e-06, + "loss": 0.1161, + "step": 3684 + }, + { + "epoch": 1.1941023979261178, + "grad_norm": 0.9280275702476501, + "learning_rate": 3.425513487440817e-06, + "loss": 0.1256, + "step": 3685 + }, + { + "epoch": 1.1944264419961115, + "grad_norm": 0.8903563022613525, + "learning_rate": 3.4247009635977425e-06, + "loss": 0.1233, + "step": 3686 + }, + { + "epoch": 1.194750486066105, + "grad_norm": 0.7753077745437622, + "learning_rate": 3.4238883265802215e-06, + "loss": 0.1099, + "step": 3687 + }, + { + "epoch": 1.1950745301360985, + "grad_norm": 0.8520569205284119, + "learning_rate": 3.4230755764877133e-06, + "loss": 0.1233, + "step": 3688 + }, + { + "epoch": 1.195398574206092, + "grad_norm": 0.8427757620811462, + "learning_rate": 3.4222627134196917e-06, + "loss": 0.1226, + "step": 3689 + }, + { + "epoch": 1.1957226182760856, + "grad_norm": 0.7773596048355103, + "learning_rate": 3.4214497374756415e-06, + "loss": 0.1099, + "step": 3690 + }, + { + "epoch": 1.196046662346079, + "grad_norm": 0.7584930658340454, + "learning_rate": 3.4206366487550637e-06, + "loss": 0.1096, + "step": 3691 + }, + { + "epoch": 1.1963707064160727, + "grad_norm": 0.8986695408821106, + "learning_rate": 3.419823447357472e-06, + "loss": 0.1293, + "step": 3692 + }, + { + "epoch": 1.196694750486066, + "grad_norm": 0.8784900307655334, + "learning_rate": 3.4190101333823956e-06, + "loss": 0.1246, + "step": 3693 + }, + { + "epoch": 1.1970187945560595, + "grad_norm": 0.8179463148117065, + "learning_rate": 3.4181967069293754e-06, + "loss": 0.1115, + "step": 3694 + }, + { + "epoch": 1.1973428386260532, + "grad_norm": 0.7988253831863403, + "learning_rate": 3.417383168097967e-06, + "loss": 0.1103, + "step": 3695 + }, + { + "epoch": 1.1976668826960466, + "grad_norm": 0.826006293296814, + "learning_rate": 3.41656951698774e-06, + "loss": 0.1143, + "step": 3696 + }, + { + "epoch": 1.1979909267660402, + "grad_norm": 0.8301533460617065, + "learning_rate": 3.4157557536982773e-06, + "loss": 0.115, + "step": 3697 + }, + { + "epoch": 1.1983149708360337, + "grad_norm": 0.7946401238441467, + "learning_rate": 3.414941878329175e-06, + "loss": 0.1146, + "step": 3698 + }, + { + "epoch": 1.1986390149060273, + "grad_norm": 0.836628258228302, + "learning_rate": 3.4141278909800444e-06, + "loss": 0.1147, + "step": 3699 + }, + { + "epoch": 1.1989630589760207, + "grad_norm": 0.8173867464065552, + "learning_rate": 3.41331379175051e-06, + "loss": 0.1129, + "step": 3700 + }, + { + "epoch": 1.1992871030460144, + "grad_norm": 0.8635963797569275, + "learning_rate": 3.4124995807402082e-06, + "loss": 0.1219, + "step": 3701 + }, + { + "epoch": 1.1996111471160078, + "grad_norm": 0.8261188864707947, + "learning_rate": 3.4116852580487925e-06, + "loss": 0.112, + "step": 3702 + }, + { + "epoch": 1.1999351911860012, + "grad_norm": 0.8579382300376892, + "learning_rate": 3.4108708237759258e-06, + "loss": 0.1121, + "step": 3703 + }, + { + "epoch": 1.2002592352559949, + "grad_norm": 0.8296414017677307, + "learning_rate": 3.4100562780212887e-06, + "loss": 0.1129, + "step": 3704 + }, + { + "epoch": 1.2005832793259883, + "grad_norm": 0.8090779185295105, + "learning_rate": 3.4092416208845723e-06, + "loss": 0.1141, + "step": 3705 + }, + { + "epoch": 1.200907323395982, + "grad_norm": 0.8289951682090759, + "learning_rate": 3.4084268524654847e-06, + "loss": 0.1223, + "step": 3706 + }, + { + "epoch": 1.2012313674659754, + "grad_norm": 0.8589972257614136, + "learning_rate": 3.407611972863744e-06, + "loss": 0.1205, + "step": 3707 + }, + { + "epoch": 1.2015554115359688, + "grad_norm": 0.8730632662773132, + "learning_rate": 3.406796982179085e-06, + "loss": 0.1253, + "step": 3708 + }, + { + "epoch": 1.2018794556059624, + "grad_norm": 0.8103494644165039, + "learning_rate": 3.4059818805112534e-06, + "loss": 0.1139, + "step": 3709 + }, + { + "epoch": 1.2022034996759559, + "grad_norm": 0.7576596736907959, + "learning_rate": 3.4051666679600105e-06, + "loss": 0.104, + "step": 3710 + }, + { + "epoch": 1.2025275437459495, + "grad_norm": 0.8215952515602112, + "learning_rate": 3.40435134462513e-06, + "loss": 0.1136, + "step": 3711 + }, + { + "epoch": 1.202851587815943, + "grad_norm": 0.7837064862251282, + "learning_rate": 3.403535910606399e-06, + "loss": 0.1106, + "step": 3712 + }, + { + "epoch": 1.2031756318859366, + "grad_norm": 0.7939320206642151, + "learning_rate": 3.4027203660036202e-06, + "loss": 0.1104, + "step": 3713 + }, + { + "epoch": 1.20349967595593, + "grad_norm": 0.7713475227355957, + "learning_rate": 3.4019047109166077e-06, + "loss": 0.1086, + "step": 3714 + }, + { + "epoch": 1.2038237200259236, + "grad_norm": 0.8262404203414917, + "learning_rate": 3.401088945445189e-06, + "loss": 0.1188, + "step": 3715 + }, + { + "epoch": 1.204147764095917, + "grad_norm": 0.8336588144302368, + "learning_rate": 3.4002730696892073e-06, + "loss": 0.1182, + "step": 3716 + }, + { + "epoch": 1.2044718081659105, + "grad_norm": 0.9291484951972961, + "learning_rate": 3.3994570837485163e-06, + "loss": 0.1332, + "step": 3717 + }, + { + "epoch": 1.2047958522359041, + "grad_norm": 0.749964714050293, + "learning_rate": 3.3986409877229863e-06, + "loss": 0.1097, + "step": 3718 + }, + { + "epoch": 1.2051198963058976, + "grad_norm": 0.8636124134063721, + "learning_rate": 3.3978247817124986e-06, + "loss": 0.1273, + "step": 3719 + }, + { + "epoch": 1.2054439403758912, + "grad_norm": 0.7298382520675659, + "learning_rate": 3.39700846581695e-06, + "loss": 0.1015, + "step": 3720 + }, + { + "epoch": 1.2057679844458846, + "grad_norm": 0.832988440990448, + "learning_rate": 3.3961920401362488e-06, + "loss": 0.1156, + "step": 3721 + }, + { + "epoch": 1.206092028515878, + "grad_norm": 0.8105771541595459, + "learning_rate": 3.3953755047703174e-06, + "loss": 0.1164, + "step": 3722 + }, + { + "epoch": 1.2064160725858717, + "grad_norm": 0.8326833844184875, + "learning_rate": 3.394558859819092e-06, + "loss": 0.1274, + "step": 3723 + }, + { + "epoch": 1.2067401166558651, + "grad_norm": 0.8200793862342834, + "learning_rate": 3.393742105382522e-06, + "loss": 0.1213, + "step": 3724 + }, + { + "epoch": 1.2070641607258588, + "grad_norm": 0.849841296672821, + "learning_rate": 3.3929252415605708e-06, + "loss": 0.1198, + "step": 3725 + }, + { + "epoch": 1.2073882047958522, + "grad_norm": 0.8289008736610413, + "learning_rate": 3.3921082684532143e-06, + "loss": 0.1142, + "step": 3726 + }, + { + "epoch": 1.2077122488658458, + "grad_norm": 0.7795475721359253, + "learning_rate": 3.391291186160441e-06, + "loss": 0.1029, + "step": 3727 + }, + { + "epoch": 1.2080362929358393, + "grad_norm": 0.8882423639297485, + "learning_rate": 3.3904739947822556e-06, + "loss": 0.1276, + "step": 3728 + }, + { + "epoch": 1.208360337005833, + "grad_norm": 0.7692775130271912, + "learning_rate": 3.3896566944186737e-06, + "loss": 0.1042, + "step": 3729 + }, + { + "epoch": 1.2086843810758263, + "grad_norm": 0.8318784236907959, + "learning_rate": 3.388839285169725e-06, + "loss": 0.1188, + "step": 3730 + }, + { + "epoch": 1.2090084251458197, + "grad_norm": 0.798616349697113, + "learning_rate": 3.3880217671354527e-06, + "loss": 0.1135, + "step": 3731 + }, + { + "epoch": 1.2093324692158134, + "grad_norm": 0.8450761437416077, + "learning_rate": 3.3872041404159124e-06, + "loss": 0.1233, + "step": 3732 + }, + { + "epoch": 1.2096565132858068, + "grad_norm": 0.8565431833267212, + "learning_rate": 3.3863864051111744e-06, + "loss": 0.1238, + "step": 3733 + }, + { + "epoch": 1.2099805573558005, + "grad_norm": 0.7984777092933655, + "learning_rate": 3.385568561321321e-06, + "loss": 0.1144, + "step": 3734 + }, + { + "epoch": 1.2103046014257939, + "grad_norm": 0.8732336759567261, + "learning_rate": 3.3847506091464487e-06, + "loss": 0.124, + "step": 3735 + }, + { + "epoch": 1.2106286454957873, + "grad_norm": 0.8275569081306458, + "learning_rate": 3.383932548686667e-06, + "loss": 0.1198, + "step": 3736 + }, + { + "epoch": 1.210952689565781, + "grad_norm": 0.8002777099609375, + "learning_rate": 3.3831143800420983e-06, + "loss": 0.116, + "step": 3737 + }, + { + "epoch": 1.2112767336357744, + "grad_norm": 0.8314655423164368, + "learning_rate": 3.3822961033128793e-06, + "loss": 0.1161, + "step": 3738 + }, + { + "epoch": 1.211600777705768, + "grad_norm": 0.8205680251121521, + "learning_rate": 3.3814777185991577e-06, + "loss": 0.1094, + "step": 3739 + }, + { + "epoch": 1.2119248217757614, + "grad_norm": 0.8222754597663879, + "learning_rate": 3.380659226001097e-06, + "loss": 0.1182, + "step": 3740 + }, + { + "epoch": 1.212248865845755, + "grad_norm": 0.8440394997596741, + "learning_rate": 3.3798406256188725e-06, + "loss": 0.1242, + "step": 3741 + }, + { + "epoch": 1.2125729099157485, + "grad_norm": 0.8333975672721863, + "learning_rate": 3.3790219175526733e-06, + "loss": 0.115, + "step": 3742 + }, + { + "epoch": 1.2128969539857422, + "grad_norm": 0.7962969541549683, + "learning_rate": 3.3782031019027006e-06, + "loss": 0.1148, + "step": 3743 + }, + { + "epoch": 1.2132209980557356, + "grad_norm": 0.8393021821975708, + "learning_rate": 3.3773841787691708e-06, + "loss": 0.1196, + "step": 3744 + }, + { + "epoch": 1.213545042125729, + "grad_norm": 0.8597471117973328, + "learning_rate": 3.3765651482523097e-06, + "loss": 0.1186, + "step": 3745 + }, + { + "epoch": 1.2138690861957226, + "grad_norm": 0.837240993976593, + "learning_rate": 3.375746010452361e-06, + "loss": 0.1261, + "step": 3746 + }, + { + "epoch": 1.214193130265716, + "grad_norm": 0.8484938144683838, + "learning_rate": 3.374926765469578e-06, + "loss": 0.1129, + "step": 3747 + }, + { + "epoch": 1.2145171743357097, + "grad_norm": 0.8841095566749573, + "learning_rate": 3.3741074134042297e-06, + "loss": 0.1372, + "step": 3748 + }, + { + "epoch": 1.2148412184057031, + "grad_norm": 0.782389223575592, + "learning_rate": 3.3732879543565955e-06, + "loss": 0.1139, + "step": 3749 + }, + { + "epoch": 1.2151652624756968, + "grad_norm": 0.8806767463684082, + "learning_rate": 3.3724683884269702e-06, + "loss": 0.1382, + "step": 3750 + }, + { + "epoch": 1.2154893065456902, + "grad_norm": 0.799345850944519, + "learning_rate": 3.37164871571566e-06, + "loss": 0.1162, + "step": 3751 + }, + { + "epoch": 1.2158133506156839, + "grad_norm": 0.8000692129135132, + "learning_rate": 3.370828936322985e-06, + "loss": 0.1096, + "step": 3752 + }, + { + "epoch": 1.2161373946856773, + "grad_norm": 0.7767576575279236, + "learning_rate": 3.3700090503492795e-06, + "loss": 0.1146, + "step": 3753 + }, + { + "epoch": 1.2164614387556707, + "grad_norm": 0.7903865575790405, + "learning_rate": 3.3691890578948876e-06, + "loss": 0.1145, + "step": 3754 + }, + { + "epoch": 1.2167854828256643, + "grad_norm": 0.8180772066116333, + "learning_rate": 3.36836895906017e-06, + "loss": 0.1103, + "step": 3755 + }, + { + "epoch": 1.2171095268956578, + "grad_norm": 0.907408595085144, + "learning_rate": 3.3675487539454972e-06, + "loss": 0.1282, + "step": 3756 + }, + { + "epoch": 1.2174335709656514, + "grad_norm": 0.7968785166740417, + "learning_rate": 3.3667284426512565e-06, + "loss": 0.1143, + "step": 3757 + }, + { + "epoch": 1.2177576150356448, + "grad_norm": 0.7605866193771362, + "learning_rate": 3.3659080252778446e-06, + "loss": 0.1154, + "step": 3758 + }, + { + "epoch": 1.2180816591056383, + "grad_norm": 0.9358341097831726, + "learning_rate": 3.365087501925673e-06, + "loss": 0.1308, + "step": 3759 + }, + { + "epoch": 1.218405703175632, + "grad_norm": 0.8174268007278442, + "learning_rate": 3.3642668726951657e-06, + "loss": 0.1218, + "step": 3760 + }, + { + "epoch": 1.2187297472456253, + "grad_norm": 0.8260276317596436, + "learning_rate": 3.36344613768676e-06, + "loss": 0.1175, + "step": 3761 + }, + { + "epoch": 1.219053791315619, + "grad_norm": 0.8210480213165283, + "learning_rate": 3.362625297000906e-06, + "loss": 0.1207, + "step": 3762 + }, + { + "epoch": 1.2193778353856124, + "grad_norm": 0.7981109619140625, + "learning_rate": 3.3618043507380673e-06, + "loss": 0.116, + "step": 3763 + }, + { + "epoch": 1.219701879455606, + "grad_norm": 0.7980762720108032, + "learning_rate": 3.3609832989987178e-06, + "loss": 0.1181, + "step": 3764 + }, + { + "epoch": 1.2200259235255995, + "grad_norm": 0.7661181688308716, + "learning_rate": 3.360162141883348e-06, + "loss": 0.1081, + "step": 3765 + }, + { + "epoch": 1.220349967595593, + "grad_norm": 0.8313801884651184, + "learning_rate": 3.3593408794924585e-06, + "loss": 0.116, + "step": 3766 + }, + { + "epoch": 1.2206740116655865, + "grad_norm": 0.7752500176429749, + "learning_rate": 3.358519511926565e-06, + "loss": 0.1102, + "step": 3767 + }, + { + "epoch": 1.22099805573558, + "grad_norm": 0.8230960369110107, + "learning_rate": 3.357698039286194e-06, + "loss": 0.1212, + "step": 3768 + }, + { + "epoch": 1.2213220998055736, + "grad_norm": 0.7976222634315491, + "learning_rate": 3.356876461671887e-06, + "loss": 0.1159, + "step": 3769 + }, + { + "epoch": 1.221646143875567, + "grad_norm": 0.8018410801887512, + "learning_rate": 3.3560547791841957e-06, + "loss": 0.1112, + "step": 3770 + }, + { + "epoch": 1.2219701879455607, + "grad_norm": 0.8810217976570129, + "learning_rate": 3.3552329919236865e-06, + "loss": 0.1152, + "step": 3771 + }, + { + "epoch": 1.222294232015554, + "grad_norm": 0.8110620379447937, + "learning_rate": 3.3544110999909385e-06, + "loss": 0.1117, + "step": 3772 + }, + { + "epoch": 1.2226182760855475, + "grad_norm": 0.8340097069740295, + "learning_rate": 3.3535891034865433e-06, + "loss": 0.1178, + "step": 3773 + }, + { + "epoch": 1.2229423201555412, + "grad_norm": 0.8052523732185364, + "learning_rate": 3.3527670025111046e-06, + "loss": 0.1058, + "step": 3774 + }, + { + "epoch": 1.2232663642255346, + "grad_norm": 0.7342526912689209, + "learning_rate": 3.3519447971652407e-06, + "loss": 0.1037, + "step": 3775 + }, + { + "epoch": 1.2235904082955282, + "grad_norm": 0.8502779006958008, + "learning_rate": 3.351122487549582e-06, + "loss": 0.1146, + "step": 3776 + }, + { + "epoch": 1.2239144523655217, + "grad_norm": 0.8023433685302734, + "learning_rate": 3.3503000737647696e-06, + "loss": 0.1193, + "step": 3777 + }, + { + "epoch": 1.2242384964355153, + "grad_norm": 0.792868971824646, + "learning_rate": 3.349477555911459e-06, + "loss": 0.1089, + "step": 3778 + }, + { + "epoch": 1.2245625405055087, + "grad_norm": 0.8314449191093445, + "learning_rate": 3.3486549340903196e-06, + "loss": 0.1228, + "step": 3779 + }, + { + "epoch": 1.2248865845755024, + "grad_norm": 0.8645145893096924, + "learning_rate": 3.3478322084020322e-06, + "loss": 0.1152, + "step": 3780 + }, + { + "epoch": 1.2252106286454958, + "grad_norm": 0.9096472859382629, + "learning_rate": 3.34700937894729e-06, + "loss": 0.1181, + "step": 3781 + }, + { + "epoch": 1.2255346727154892, + "grad_norm": 0.8502877950668335, + "learning_rate": 3.3461864458267996e-06, + "loss": 0.1129, + "step": 3782 + }, + { + "epoch": 1.2258587167854829, + "grad_norm": 0.8530779480934143, + "learning_rate": 3.3453634091412795e-06, + "loss": 0.1243, + "step": 3783 + }, + { + "epoch": 1.2261827608554763, + "grad_norm": 0.9280954003334045, + "learning_rate": 3.344540268991462e-06, + "loss": 0.1348, + "step": 3784 + }, + { + "epoch": 1.22650680492547, + "grad_norm": 0.785020649433136, + "learning_rate": 3.343717025478092e-06, + "loss": 0.1126, + "step": 3785 + }, + { + "epoch": 1.2268308489954634, + "grad_norm": 0.8465002179145813, + "learning_rate": 3.342893678701925e-06, + "loss": 0.1169, + "step": 3786 + }, + { + "epoch": 1.227154893065457, + "grad_norm": 0.8912972211837769, + "learning_rate": 3.3420702287637325e-06, + "loss": 0.1332, + "step": 3787 + }, + { + "epoch": 1.2274789371354504, + "grad_norm": 0.8117028474807739, + "learning_rate": 3.341246675764295e-06, + "loss": 0.1202, + "step": 3788 + }, + { + "epoch": 1.2278029812054438, + "grad_norm": 0.8363078832626343, + "learning_rate": 3.3404230198044085e-06, + "loss": 0.1237, + "step": 3789 + }, + { + "epoch": 1.2281270252754375, + "grad_norm": 0.7812249660491943, + "learning_rate": 3.3395992609848804e-06, + "loss": 0.1153, + "step": 3790 + }, + { + "epoch": 1.228451069345431, + "grad_norm": 0.858752965927124, + "learning_rate": 3.338775399406531e-06, + "loss": 0.1166, + "step": 3791 + }, + { + "epoch": 1.2287751134154246, + "grad_norm": 0.7888984680175781, + "learning_rate": 3.3379514351701924e-06, + "loss": 0.1111, + "step": 3792 + }, + { + "epoch": 1.229099157485418, + "grad_norm": 0.7987516522407532, + "learning_rate": 3.3371273683767102e-06, + "loss": 0.1086, + "step": 3793 + }, + { + "epoch": 1.2294232015554116, + "grad_norm": 0.8837012648582458, + "learning_rate": 3.3363031991269423e-06, + "loss": 0.1211, + "step": 3794 + }, + { + "epoch": 1.229747245625405, + "grad_norm": 0.8158565759658813, + "learning_rate": 3.3354789275217587e-06, + "loss": 0.1187, + "step": 3795 + }, + { + "epoch": 1.2300712896953985, + "grad_norm": 0.8124872446060181, + "learning_rate": 3.3346545536620425e-06, + "loss": 0.1181, + "step": 3796 + }, + { + "epoch": 1.2303953337653921, + "grad_norm": 0.8039227724075317, + "learning_rate": 3.3338300776486886e-06, + "loss": 0.1138, + "step": 3797 + }, + { + "epoch": 1.2307193778353855, + "grad_norm": 0.8304813504219055, + "learning_rate": 3.3330054995826056e-06, + "loss": 0.1146, + "step": 3798 + }, + { + "epoch": 1.2310434219053792, + "grad_norm": 0.7987209558486938, + "learning_rate": 3.3321808195647144e-06, + "loss": 0.1155, + "step": 3799 + }, + { + "epoch": 1.2313674659753726, + "grad_norm": 0.7513888478279114, + "learning_rate": 3.3313560376959456e-06, + "loss": 0.1082, + "step": 3800 + }, + { + "epoch": 1.2316915100453663, + "grad_norm": 0.851762592792511, + "learning_rate": 3.3305311540772467e-06, + "loss": 0.1147, + "step": 3801 + }, + { + "epoch": 1.2320155541153597, + "grad_norm": 0.7502050399780273, + "learning_rate": 3.3297061688095746e-06, + "loss": 0.1051, + "step": 3802 + }, + { + "epoch": 1.2323395981853533, + "grad_norm": 0.7800151109695435, + "learning_rate": 3.3288810819938995e-06, + "loss": 0.1141, + "step": 3803 + }, + { + "epoch": 1.2326636422553467, + "grad_norm": 0.8294506669044495, + "learning_rate": 3.3280558937312037e-06, + "loss": 0.1031, + "step": 3804 + }, + { + "epoch": 1.2329876863253402, + "grad_norm": 0.8127052783966064, + "learning_rate": 3.327230604122484e-06, + "loss": 0.1157, + "step": 3805 + }, + { + "epoch": 1.2333117303953338, + "grad_norm": 0.8488887548446655, + "learning_rate": 3.326405213268745e-06, + "loss": 0.1138, + "step": 3806 + }, + { + "epoch": 1.2336357744653272, + "grad_norm": 0.8581557869911194, + "learning_rate": 3.3255797212710095e-06, + "loss": 0.1199, + "step": 3807 + }, + { + "epoch": 1.2339598185353209, + "grad_norm": 0.8989530801773071, + "learning_rate": 3.3247541282303082e-06, + "loss": 0.1269, + "step": 3808 + }, + { + "epoch": 1.2342838626053143, + "grad_norm": 0.8879166841506958, + "learning_rate": 3.3239284342476852e-06, + "loss": 0.1292, + "step": 3809 + }, + { + "epoch": 1.2346079066753077, + "grad_norm": 0.847872257232666, + "learning_rate": 3.3231026394241983e-06, + "loss": 0.1283, + "step": 3810 + }, + { + "epoch": 1.2349319507453014, + "grad_norm": 0.7723486423492432, + "learning_rate": 3.3222767438609166e-06, + "loss": 0.1089, + "step": 3811 + }, + { + "epoch": 1.2352559948152948, + "grad_norm": 0.8634940385818481, + "learning_rate": 3.321450747658922e-06, + "loss": 0.1284, + "step": 3812 + }, + { + "epoch": 1.2355800388852884, + "grad_norm": 0.8183860182762146, + "learning_rate": 3.3206246509193076e-06, + "loss": 0.1219, + "step": 3813 + }, + { + "epoch": 1.2359040829552819, + "grad_norm": 0.907188892364502, + "learning_rate": 3.3197984537431797e-06, + "loss": 0.1089, + "step": 3814 + }, + { + "epoch": 1.2362281270252755, + "grad_norm": 0.8658711910247803, + "learning_rate": 3.3189721562316585e-06, + "loss": 0.1258, + "step": 3815 + }, + { + "epoch": 1.236552171095269, + "grad_norm": 0.7768942713737488, + "learning_rate": 3.3181457584858736e-06, + "loss": 0.1107, + "step": 3816 + }, + { + "epoch": 1.2368762151652626, + "grad_norm": 0.8284804821014404, + "learning_rate": 3.3173192606069673e-06, + "loss": 0.1197, + "step": 3817 + }, + { + "epoch": 1.237200259235256, + "grad_norm": 0.8545773029327393, + "learning_rate": 3.316492662696097e-06, + "loss": 0.1232, + "step": 3818 + }, + { + "epoch": 1.2375243033052494, + "grad_norm": 0.8069331645965576, + "learning_rate": 3.3156659648544276e-06, + "loss": 0.1193, + "step": 3819 + }, + { + "epoch": 1.237848347375243, + "grad_norm": 0.8107204437255859, + "learning_rate": 3.314839167183141e-06, + "loss": 0.1113, + "step": 3820 + }, + { + "epoch": 1.2381723914452365, + "grad_norm": 0.7912015914916992, + "learning_rate": 3.3140122697834287e-06, + "loss": 0.1072, + "step": 3821 + }, + { + "epoch": 1.2384964355152301, + "grad_norm": 0.8278244733810425, + "learning_rate": 3.3131852727564947e-06, + "loss": 0.1166, + "step": 3822 + }, + { + "epoch": 1.2388204795852236, + "grad_norm": 0.8750582337379456, + "learning_rate": 3.3123581762035557e-06, + "loss": 0.1253, + "step": 3823 + }, + { + "epoch": 1.239144523655217, + "grad_norm": 0.860623836517334, + "learning_rate": 3.31153098022584e-06, + "loss": 0.1155, + "step": 3824 + }, + { + "epoch": 1.2394685677252106, + "grad_norm": 0.8325600028038025, + "learning_rate": 3.3107036849245883e-06, + "loss": 0.1158, + "step": 3825 + }, + { + "epoch": 1.239792611795204, + "grad_norm": 0.8544103503227234, + "learning_rate": 3.309876290401054e-06, + "loss": 0.1187, + "step": 3826 + }, + { + "epoch": 1.2401166558651977, + "grad_norm": 0.8223888278007507, + "learning_rate": 3.309048796756503e-06, + "loss": 0.1204, + "step": 3827 + }, + { + "epoch": 1.2404406999351911, + "grad_norm": 0.7503345608711243, + "learning_rate": 3.3082212040922103e-06, + "loss": 0.1153, + "step": 3828 + }, + { + "epoch": 1.2407647440051848, + "grad_norm": 0.8012227416038513, + "learning_rate": 3.307393512509466e-06, + "loss": 0.1154, + "step": 3829 + }, + { + "epoch": 1.2410887880751782, + "grad_norm": 0.724138081073761, + "learning_rate": 3.3065657221095732e-06, + "loss": 0.0955, + "step": 3830 + }, + { + "epoch": 1.2414128321451718, + "grad_norm": 0.8285104036331177, + "learning_rate": 3.3057378329938432e-06, + "loss": 0.1181, + "step": 3831 + }, + { + "epoch": 1.2417368762151653, + "grad_norm": 0.8077479004859924, + "learning_rate": 3.304909845263603e-06, + "loss": 0.1138, + "step": 3832 + }, + { + "epoch": 1.2420609202851587, + "grad_norm": 0.7801032662391663, + "learning_rate": 3.3040817590201897e-06, + "loss": 0.1041, + "step": 3833 + }, + { + "epoch": 1.2423849643551523, + "grad_norm": 0.8577712178230286, + "learning_rate": 3.303253574364953e-06, + "loss": 0.1206, + "step": 3834 + }, + { + "epoch": 1.2427090084251458, + "grad_norm": 0.8118460178375244, + "learning_rate": 3.3024252913992548e-06, + "loss": 0.112, + "step": 3835 + }, + { + "epoch": 1.2430330524951394, + "grad_norm": 0.9097766876220703, + "learning_rate": 3.3015969102244704e-06, + "loss": 0.1292, + "step": 3836 + }, + { + "epoch": 1.2433570965651328, + "grad_norm": 0.7517203688621521, + "learning_rate": 3.300768430941983e-06, + "loss": 0.1056, + "step": 3837 + }, + { + "epoch": 1.2436811406351265, + "grad_norm": 0.81373530626297, + "learning_rate": 3.299939853653192e-06, + "loss": 0.1121, + "step": 3838 + }, + { + "epoch": 1.24400518470512, + "grad_norm": 0.8525338768959045, + "learning_rate": 3.299111178459507e-06, + "loss": 0.1279, + "step": 3839 + }, + { + "epoch": 1.2443292287751135, + "grad_norm": 0.8729413747787476, + "learning_rate": 3.29828240546235e-06, + "loss": 0.1248, + "step": 3840 + }, + { + "epoch": 1.244653272845107, + "grad_norm": 0.790283739566803, + "learning_rate": 3.297453534763154e-06, + "loss": 0.1146, + "step": 3841 + }, + { + "epoch": 1.2449773169151004, + "grad_norm": 0.8346278667449951, + "learning_rate": 3.2966245664633654e-06, + "loss": 0.1189, + "step": 3842 + }, + { + "epoch": 1.245301360985094, + "grad_norm": 0.8715471625328064, + "learning_rate": 3.295795500664442e-06, + "loss": 0.1221, + "step": 3843 + }, + { + "epoch": 1.2456254050550875, + "grad_norm": 0.8614826202392578, + "learning_rate": 3.294966337467853e-06, + "loss": 0.1177, + "step": 3844 + }, + { + "epoch": 1.245949449125081, + "grad_norm": 0.7593932151794434, + "learning_rate": 3.2941370769750804e-06, + "loss": 0.1124, + "step": 3845 + }, + { + "epoch": 1.2462734931950745, + "grad_norm": 0.8406314253807068, + "learning_rate": 3.293307719287617e-06, + "loss": 0.1099, + "step": 3846 + }, + { + "epoch": 1.246597537265068, + "grad_norm": 0.8011007308959961, + "learning_rate": 3.2924782645069684e-06, + "loss": 0.1129, + "step": 3847 + }, + { + "epoch": 1.2469215813350616, + "grad_norm": 0.841182291507721, + "learning_rate": 3.291648712734653e-06, + "loss": 0.1089, + "step": 3848 + }, + { + "epoch": 1.247245625405055, + "grad_norm": 0.84003746509552, + "learning_rate": 3.290819064072198e-06, + "loss": 0.1254, + "step": 3849 + }, + { + "epoch": 1.2475696694750487, + "grad_norm": 0.920987606048584, + "learning_rate": 3.289989318621146e-06, + "loss": 0.1157, + "step": 3850 + }, + { + "epoch": 1.247893713545042, + "grad_norm": 0.8493415117263794, + "learning_rate": 3.289159476483049e-06, + "loss": 0.1176, + "step": 3851 + }, + { + "epoch": 1.2482177576150357, + "grad_norm": 0.9096932411193848, + "learning_rate": 3.2883295377594716e-06, + "loss": 0.127, + "step": 3852 + }, + { + "epoch": 1.2485418016850292, + "grad_norm": 0.838018000125885, + "learning_rate": 3.2874995025519897e-06, + "loss": 0.1176, + "step": 3853 + }, + { + "epoch": 1.2488658457550228, + "grad_norm": 0.804726779460907, + "learning_rate": 3.2866693709621933e-06, + "loss": 0.1116, + "step": 3854 + }, + { + "epoch": 1.2491898898250162, + "grad_norm": 0.796380341053009, + "learning_rate": 3.285839143091681e-06, + "loss": 0.1067, + "step": 3855 + }, + { + "epoch": 1.2495139338950096, + "grad_norm": 0.8139790296554565, + "learning_rate": 3.2850088190420647e-06, + "loss": 0.1152, + "step": 3856 + }, + { + "epoch": 1.2498379779650033, + "grad_norm": 0.8421602845191956, + "learning_rate": 3.284178398914969e-06, + "loss": 0.1151, + "step": 3857 + }, + { + "epoch": 1.2501620220349967, + "grad_norm": 0.811719536781311, + "learning_rate": 3.283347882812028e-06, + "loss": 0.1221, + "step": 3858 + }, + { + "epoch": 1.2504860661049904, + "grad_norm": 0.9033892154693604, + "learning_rate": 3.282517270834891e-06, + "loss": 0.1241, + "step": 3859 + }, + { + "epoch": 1.2508101101749838, + "grad_norm": 0.8108022809028625, + "learning_rate": 3.281686563085214e-06, + "loss": 0.1213, + "step": 3860 + }, + { + "epoch": 1.2511341542449772, + "grad_norm": 0.8107488751411438, + "learning_rate": 3.28085575966467e-06, + "loss": 0.109, + "step": 3861 + }, + { + "epoch": 1.2514581983149708, + "grad_norm": 0.8544443249702454, + "learning_rate": 3.2800248606749395e-06, + "loss": 0.1167, + "step": 3862 + }, + { + "epoch": 1.2517822423849643, + "grad_norm": 0.8549750447273254, + "learning_rate": 3.2791938662177174e-06, + "loss": 0.1126, + "step": 3863 + }, + { + "epoch": 1.252106286454958, + "grad_norm": 0.9143353700637817, + "learning_rate": 3.278362776394709e-06, + "loss": 0.1354, + "step": 3864 + }, + { + "epoch": 1.2524303305249513, + "grad_norm": 0.838623583316803, + "learning_rate": 3.277531591307632e-06, + "loss": 0.116, + "step": 3865 + }, + { + "epoch": 1.252754374594945, + "grad_norm": 0.7793409824371338, + "learning_rate": 3.2767003110582164e-06, + "loss": 0.1098, + "step": 3866 + }, + { + "epoch": 1.2530784186649384, + "grad_norm": 0.8405201435089111, + "learning_rate": 3.275868935748201e-06, + "loss": 0.1283, + "step": 3867 + }, + { + "epoch": 1.253402462734932, + "grad_norm": 0.8322092294692993, + "learning_rate": 3.2750374654793387e-06, + "loss": 0.1145, + "step": 3868 + }, + { + "epoch": 1.2537265068049255, + "grad_norm": 0.894817054271698, + "learning_rate": 3.2742059003533933e-06, + "loss": 0.1262, + "step": 3869 + }, + { + "epoch": 1.254050550874919, + "grad_norm": 0.7786268591880798, + "learning_rate": 3.2733742404721413e-06, + "loss": 0.1056, + "step": 3870 + }, + { + "epoch": 1.2543745949449125, + "grad_norm": 0.744128406047821, + "learning_rate": 3.272542485937369e-06, + "loss": 0.1039, + "step": 3871 + }, + { + "epoch": 1.254698639014906, + "grad_norm": 0.8040996789932251, + "learning_rate": 3.2717106368508755e-06, + "loss": 0.1115, + "step": 3872 + }, + { + "epoch": 1.2550226830848996, + "grad_norm": 0.8093754649162292, + "learning_rate": 3.27087869331447e-06, + "loss": 0.108, + "step": 3873 + }, + { + "epoch": 1.255346727154893, + "grad_norm": 0.8474469184875488, + "learning_rate": 3.2700466554299755e-06, + "loss": 0.1125, + "step": 3874 + }, + { + "epoch": 1.2556707712248865, + "grad_norm": 0.8335444331169128, + "learning_rate": 3.2692145232992244e-06, + "loss": 0.121, + "step": 3875 + }, + { + "epoch": 1.25599481529488, + "grad_norm": 0.7793686389923096, + "learning_rate": 3.268382297024063e-06, + "loss": 0.1073, + "step": 3876 + }, + { + "epoch": 1.2563188593648738, + "grad_norm": 0.7746005654335022, + "learning_rate": 3.2675499767063464e-06, + "loss": 0.1127, + "step": 3877 + }, + { + "epoch": 1.2566429034348672, + "grad_norm": 0.8334304094314575, + "learning_rate": 3.266717562447944e-06, + "loss": 0.1209, + "step": 3878 + }, + { + "epoch": 1.2569669475048606, + "grad_norm": 0.8408777117729187, + "learning_rate": 3.2658850543507336e-06, + "loss": 0.1244, + "step": 3879 + }, + { + "epoch": 1.2572909915748542, + "grad_norm": 0.7953009009361267, + "learning_rate": 3.2650524525166064e-06, + "loss": 0.1093, + "step": 3880 + }, + { + "epoch": 1.2576150356448477, + "grad_norm": 0.8646888136863708, + "learning_rate": 3.2642197570474665e-06, + "loss": 0.1205, + "step": 3881 + }, + { + "epoch": 1.2579390797148413, + "grad_norm": 0.8490650057792664, + "learning_rate": 3.263386968045226e-06, + "loss": 0.1093, + "step": 3882 + }, + { + "epoch": 1.2582631237848347, + "grad_norm": 0.8932080268859863, + "learning_rate": 3.262554085611811e-06, + "loss": 0.1282, + "step": 3883 + }, + { + "epoch": 1.2585871678548282, + "grad_norm": 0.7818010449409485, + "learning_rate": 3.261721109849158e-06, + "loss": 0.1078, + "step": 3884 + }, + { + "epoch": 1.2589112119248218, + "grad_norm": 0.7877272963523865, + "learning_rate": 3.2608880408592148e-06, + "loss": 0.1067, + "step": 3885 + }, + { + "epoch": 1.2592352559948152, + "grad_norm": 0.8062349557876587, + "learning_rate": 3.2600548787439413e-06, + "loss": 0.1154, + "step": 3886 + }, + { + "epoch": 1.2595593000648089, + "grad_norm": 0.8956946730613708, + "learning_rate": 3.2592216236053086e-06, + "loss": 0.1166, + "step": 3887 + }, + { + "epoch": 1.2598833441348023, + "grad_norm": 0.8361225724220276, + "learning_rate": 3.2583882755452994e-06, + "loss": 0.1152, + "step": 3888 + }, + { + "epoch": 1.2602073882047957, + "grad_norm": 0.8042587041854858, + "learning_rate": 3.257554834665907e-06, + "loss": 0.1178, + "step": 3889 + }, + { + "epoch": 1.2605314322747894, + "grad_norm": 0.8830363154411316, + "learning_rate": 3.2567213010691367e-06, + "loss": 0.1199, + "step": 3890 + }, + { + "epoch": 1.260855476344783, + "grad_norm": 0.841013491153717, + "learning_rate": 3.255887674857004e-06, + "loss": 0.1174, + "step": 3891 + }, + { + "epoch": 1.2611795204147764, + "grad_norm": 0.7718873620033264, + "learning_rate": 3.2550539561315385e-06, + "loss": 0.1115, + "step": 3892 + }, + { + "epoch": 1.2615035644847699, + "grad_norm": 0.835184633731842, + "learning_rate": 3.2542201449947774e-06, + "loss": 0.122, + "step": 3893 + }, + { + "epoch": 1.2618276085547635, + "grad_norm": 0.7849681973457336, + "learning_rate": 3.2533862415487723e-06, + "loss": 0.111, + "step": 3894 + }, + { + "epoch": 1.262151652624757, + "grad_norm": 0.8814342617988586, + "learning_rate": 3.2525522458955843e-06, + "loss": 0.1277, + "step": 3895 + }, + { + "epoch": 1.2624756966947506, + "grad_norm": 0.7500383257865906, + "learning_rate": 3.251718158137287e-06, + "loss": 0.109, + "step": 3896 + }, + { + "epoch": 1.262799740764744, + "grad_norm": 0.7909573316574097, + "learning_rate": 3.2508839783759642e-06, + "loss": 0.1103, + "step": 3897 + }, + { + "epoch": 1.2631237848347374, + "grad_norm": 0.7755079865455627, + "learning_rate": 3.2500497067137116e-06, + "loss": 0.1096, + "step": 3898 + }, + { + "epoch": 1.263447828904731, + "grad_norm": 0.8379855155944824, + "learning_rate": 3.2492153432526356e-06, + "loss": 0.1125, + "step": 3899 + }, + { + "epoch": 1.2637718729747245, + "grad_norm": 0.8379008769989014, + "learning_rate": 3.2483808880948552e-06, + "loss": 0.1344, + "step": 3900 + }, + { + "epoch": 1.2640959170447181, + "grad_norm": 0.7528431415557861, + "learning_rate": 3.2475463413424983e-06, + "loss": 0.107, + "step": 3901 + }, + { + "epoch": 1.2644199611147116, + "grad_norm": 0.806319534778595, + "learning_rate": 3.246711703097707e-06, + "loss": 0.1239, + "step": 3902 + }, + { + "epoch": 1.2647440051847052, + "grad_norm": 0.8018485307693481, + "learning_rate": 3.2458769734626315e-06, + "loss": 0.126, + "step": 3903 + }, + { + "epoch": 1.2650680492546986, + "grad_norm": 0.817126452922821, + "learning_rate": 3.245042152539435e-06, + "loss": 0.1187, + "step": 3904 + }, + { + "epoch": 1.2653920933246923, + "grad_norm": 0.7525573372840881, + "learning_rate": 3.2442072404302917e-06, + "loss": 0.101, + "step": 3905 + }, + { + "epoch": 1.2657161373946857, + "grad_norm": 0.8159139156341553, + "learning_rate": 3.243372237237386e-06, + "loss": 0.1137, + "step": 3906 + }, + { + "epoch": 1.2660401814646791, + "grad_norm": 0.8026230931282043, + "learning_rate": 3.2425371430629155e-06, + "loss": 0.1156, + "step": 3907 + }, + { + "epoch": 1.2663642255346728, + "grad_norm": 0.8402119874954224, + "learning_rate": 3.241701958009087e-06, + "loss": 0.1192, + "step": 3908 + }, + { + "epoch": 1.2666882696046662, + "grad_norm": 0.8447930216789246, + "learning_rate": 3.2408666821781186e-06, + "loss": 0.1234, + "step": 3909 + }, + { + "epoch": 1.2670123136746598, + "grad_norm": 0.8497375845909119, + "learning_rate": 3.2400313156722414e-06, + "loss": 0.1145, + "step": 3910 + }, + { + "epoch": 1.2673363577446533, + "grad_norm": 0.7882909178733826, + "learning_rate": 3.2391958585936946e-06, + "loss": 0.1088, + "step": 3911 + }, + { + "epoch": 1.2676604018146467, + "grad_norm": 0.8392199873924255, + "learning_rate": 3.2383603110447304e-06, + "loss": 0.1188, + "step": 3912 + }, + { + "epoch": 1.2679844458846403, + "grad_norm": 0.7700950503349304, + "learning_rate": 3.2375246731276122e-06, + "loss": 0.1084, + "step": 3913 + }, + { + "epoch": 1.268308489954634, + "grad_norm": 0.8718739151954651, + "learning_rate": 3.236688944944614e-06, + "loss": 0.1128, + "step": 3914 + }, + { + "epoch": 1.2686325340246274, + "grad_norm": 0.8307285904884338, + "learning_rate": 3.2358531265980207e-06, + "loss": 0.1103, + "step": 3915 + }, + { + "epoch": 1.2689565780946208, + "grad_norm": 0.843315064907074, + "learning_rate": 3.2350172181901283e-06, + "loss": 0.1171, + "step": 3916 + }, + { + "epoch": 1.2692806221646145, + "grad_norm": 0.8017109632492065, + "learning_rate": 3.2341812198232437e-06, + "loss": 0.123, + "step": 3917 + }, + { + "epoch": 1.2696046662346079, + "grad_norm": 0.8025103211402893, + "learning_rate": 3.2333451315996857e-06, + "loss": 0.1172, + "step": 3918 + }, + { + "epoch": 1.2699287103046015, + "grad_norm": 0.8225618600845337, + "learning_rate": 3.232508953621782e-06, + "loss": 0.1195, + "step": 3919 + }, + { + "epoch": 1.270252754374595, + "grad_norm": 0.8533084392547607, + "learning_rate": 3.231672685991874e-06, + "loss": 0.1228, + "step": 3920 + }, + { + "epoch": 1.2705767984445884, + "grad_norm": 0.8867461681365967, + "learning_rate": 3.2308363288123128e-06, + "loss": 0.1232, + "step": 3921 + }, + { + "epoch": 1.270900842514582, + "grad_norm": 0.7814192771911621, + "learning_rate": 3.2299998821854593e-06, + "loss": 0.1085, + "step": 3922 + }, + { + "epoch": 1.2712248865845754, + "grad_norm": 0.804747998714447, + "learning_rate": 3.229163346213688e-06, + "loss": 0.1079, + "step": 3923 + }, + { + "epoch": 1.271548930654569, + "grad_norm": 0.797062873840332, + "learning_rate": 3.228326720999382e-06, + "loss": 0.1124, + "step": 3924 + }, + { + "epoch": 1.2718729747245625, + "grad_norm": 0.8368006944656372, + "learning_rate": 3.2274900066449355e-06, + "loss": 0.1222, + "step": 3925 + }, + { + "epoch": 1.272197018794556, + "grad_norm": 0.8199383616447449, + "learning_rate": 3.2266532032527548e-06, + "loss": 0.1121, + "step": 3926 + }, + { + "epoch": 1.2725210628645496, + "grad_norm": 0.7986612915992737, + "learning_rate": 3.225816310925257e-06, + "loss": 0.1106, + "step": 3927 + }, + { + "epoch": 1.2728451069345432, + "grad_norm": 0.764630913734436, + "learning_rate": 3.224979329764869e-06, + "loss": 0.106, + "step": 3928 + }, + { + "epoch": 1.2731691510045366, + "grad_norm": 0.8350912928581238, + "learning_rate": 3.224142259874029e-06, + "loss": 0.1207, + "step": 3929 + }, + { + "epoch": 1.27349319507453, + "grad_norm": 0.8321165442466736, + "learning_rate": 3.223305101355187e-06, + "loss": 0.1245, + "step": 3930 + }, + { + "epoch": 1.2738172391445237, + "grad_norm": 0.7997496128082275, + "learning_rate": 3.2224678543108024e-06, + "loss": 0.1121, + "step": 3931 + }, + { + "epoch": 1.2741412832145171, + "grad_norm": 0.8008485436439514, + "learning_rate": 3.221630518843347e-06, + "loss": 0.1135, + "step": 3932 + }, + { + "epoch": 1.2744653272845108, + "grad_norm": 0.798004150390625, + "learning_rate": 3.2207930950553017e-06, + "loss": 0.1134, + "step": 3933 + }, + { + "epoch": 1.2747893713545042, + "grad_norm": 0.8458718061447144, + "learning_rate": 3.2199555830491597e-06, + "loss": 0.12, + "step": 3934 + }, + { + "epoch": 1.2751134154244976, + "grad_norm": 0.8262900710105896, + "learning_rate": 3.2191179829274244e-06, + "loss": 0.1205, + "step": 3935 + }, + { + "epoch": 1.2754374594944913, + "grad_norm": 0.8696123361587524, + "learning_rate": 3.2182802947926086e-06, + "loss": 0.1196, + "step": 3936 + }, + { + "epoch": 1.2757615035644847, + "grad_norm": 0.8649618029594421, + "learning_rate": 3.2174425187472387e-06, + "loss": 0.1243, + "step": 3937 + }, + { + "epoch": 1.2760855476344783, + "grad_norm": 0.7876730561256409, + "learning_rate": 3.2166046548938497e-06, + "loss": 0.1118, + "step": 3938 + }, + { + "epoch": 1.2764095917044718, + "grad_norm": 0.8547011017799377, + "learning_rate": 3.215766703334988e-06, + "loss": 0.1213, + "step": 3939 + }, + { + "epoch": 1.2767336357744652, + "grad_norm": 0.9488222599029541, + "learning_rate": 3.214928664173211e-06, + "loss": 0.1214, + "step": 3940 + }, + { + "epoch": 1.2770576798444588, + "grad_norm": 0.8610497117042542, + "learning_rate": 3.2140905375110875e-06, + "loss": 0.1271, + "step": 3941 + }, + { + "epoch": 1.2773817239144525, + "grad_norm": 0.793376088142395, + "learning_rate": 3.2132523234511943e-06, + "loss": 0.1095, + "step": 3942 + }, + { + "epoch": 1.277705767984446, + "grad_norm": 0.8819077014923096, + "learning_rate": 3.2124140220961215e-06, + "loss": 0.1255, + "step": 3943 + }, + { + "epoch": 1.2780298120544393, + "grad_norm": 0.8474619388580322, + "learning_rate": 3.2115756335484694e-06, + "loss": 0.1265, + "step": 3944 + }, + { + "epoch": 1.278353856124433, + "grad_norm": 0.8497169613838196, + "learning_rate": 3.210737157910848e-06, + "loss": 0.1276, + "step": 3945 + }, + { + "epoch": 1.2786779001944264, + "grad_norm": 0.8501853346824646, + "learning_rate": 3.2098985952858796e-06, + "loss": 0.1274, + "step": 3946 + }, + { + "epoch": 1.27900194426442, + "grad_norm": 0.8726108074188232, + "learning_rate": 3.209059945776195e-06, + "loss": 0.1309, + "step": 3947 + }, + { + "epoch": 1.2793259883344135, + "grad_norm": 0.7873273491859436, + "learning_rate": 3.2082212094844374e-06, + "loss": 0.1187, + "step": 3948 + }, + { + "epoch": 1.279650032404407, + "grad_norm": 0.757980227470398, + "learning_rate": 3.20738238651326e-06, + "loss": 0.1116, + "step": 3949 + }, + { + "epoch": 1.2799740764744005, + "grad_norm": 0.8315106630325317, + "learning_rate": 3.206543476965326e-06, + "loss": 0.1271, + "step": 3950 + }, + { + "epoch": 1.280298120544394, + "grad_norm": 0.8244580626487732, + "learning_rate": 3.2057044809433108e-06, + "loss": 0.1221, + "step": 3951 + }, + { + "epoch": 1.2806221646143876, + "grad_norm": 0.8443965315818787, + "learning_rate": 3.2048653985498985e-06, + "loss": 0.1241, + "step": 3952 + }, + { + "epoch": 1.280946208684381, + "grad_norm": 0.7351576089859009, + "learning_rate": 3.204026229887785e-06, + "loss": 0.1099, + "step": 3953 + }, + { + "epoch": 1.2812702527543747, + "grad_norm": 0.8433205485343933, + "learning_rate": 3.203186975059677e-06, + "loss": 0.1226, + "step": 3954 + }, + { + "epoch": 1.281594296824368, + "grad_norm": 0.8397737145423889, + "learning_rate": 3.2023476341682902e-06, + "loss": 0.1119, + "step": 3955 + }, + { + "epoch": 1.2819183408943617, + "grad_norm": 0.7775555849075317, + "learning_rate": 3.2015082073163524e-06, + "loss": 0.1221, + "step": 3956 + }, + { + "epoch": 1.2822423849643552, + "grad_norm": 0.7749071717262268, + "learning_rate": 3.2006686946066012e-06, + "loss": 0.104, + "step": 3957 + }, + { + "epoch": 1.2825664290343486, + "grad_norm": 0.8695152997970581, + "learning_rate": 3.1998290961417844e-06, + "loss": 0.1147, + "step": 3958 + }, + { + "epoch": 1.2828904731043422, + "grad_norm": 0.8355270624160767, + "learning_rate": 3.1989894120246613e-06, + "loss": 0.1153, + "step": 3959 + }, + { + "epoch": 1.2832145171743357, + "grad_norm": 0.8163199424743652, + "learning_rate": 3.1981496423580012e-06, + "loss": 0.1232, + "step": 3960 + }, + { + "epoch": 1.2835385612443293, + "grad_norm": 0.8346755504608154, + "learning_rate": 3.1973097872445828e-06, + "loss": 0.1029, + "step": 3961 + }, + { + "epoch": 1.2838626053143227, + "grad_norm": 0.859024167060852, + "learning_rate": 3.1964698467871976e-06, + "loss": 0.1304, + "step": 3962 + }, + { + "epoch": 1.2841866493843161, + "grad_norm": 0.7759827971458435, + "learning_rate": 3.1956298210886454e-06, + "loss": 0.1074, + "step": 3963 + }, + { + "epoch": 1.2845106934543098, + "grad_norm": 0.8474326729774475, + "learning_rate": 3.1947897102517374e-06, + "loss": 0.1157, + "step": 3964 + }, + { + "epoch": 1.2848347375243034, + "grad_norm": 0.8403254151344299, + "learning_rate": 3.1939495143792944e-06, + "loss": 0.1182, + "step": 3965 + }, + { + "epoch": 1.2851587815942969, + "grad_norm": 0.8455356359481812, + "learning_rate": 3.1931092335741497e-06, + "loss": 0.1211, + "step": 3966 + }, + { + "epoch": 1.2854828256642903, + "grad_norm": 0.9320614337921143, + "learning_rate": 3.192268867939144e-06, + "loss": 0.1256, + "step": 3967 + }, + { + "epoch": 1.285806869734284, + "grad_norm": 0.8611961007118225, + "learning_rate": 3.1914284175771303e-06, + "loss": 0.1158, + "step": 3968 + }, + { + "epoch": 1.2861309138042774, + "grad_norm": 0.8324908018112183, + "learning_rate": 3.1905878825909726e-06, + "loss": 0.1133, + "step": 3969 + }, + { + "epoch": 1.286454957874271, + "grad_norm": 0.8874091506004333, + "learning_rate": 3.189747263083543e-06, + "loss": 0.1223, + "step": 3970 + }, + { + "epoch": 1.2867790019442644, + "grad_norm": 0.7869680523872375, + "learning_rate": 3.188906559157725e-06, + "loss": 0.1079, + "step": 3971 + }, + { + "epoch": 1.2871030460142578, + "grad_norm": 0.8412497639656067, + "learning_rate": 3.1880657709164144e-06, + "loss": 0.1209, + "step": 3972 + }, + { + "epoch": 1.2874270900842515, + "grad_norm": 0.8754928708076477, + "learning_rate": 3.1872248984625135e-06, + "loss": 0.1205, + "step": 3973 + }, + { + "epoch": 1.287751134154245, + "grad_norm": 0.8010230660438538, + "learning_rate": 3.1863839418989385e-06, + "loss": 0.1072, + "step": 3974 + }, + { + "epoch": 1.2880751782242386, + "grad_norm": 0.9506638646125793, + "learning_rate": 3.185542901328613e-06, + "loss": 0.1405, + "step": 3975 + }, + { + "epoch": 1.288399222294232, + "grad_norm": 0.8250179290771484, + "learning_rate": 3.184701776854474e-06, + "loss": 0.1131, + "step": 3976 + }, + { + "epoch": 1.2887232663642254, + "grad_norm": 0.8661324381828308, + "learning_rate": 3.1838605685794665e-06, + "loss": 0.1178, + "step": 3977 + }, + { + "epoch": 1.289047310434219, + "grad_norm": 0.8399199843406677, + "learning_rate": 3.1830192766065445e-06, + "loss": 0.1167, + "step": 3978 + }, + { + "epoch": 1.2893713545042127, + "grad_norm": 0.7626136541366577, + "learning_rate": 3.1821779010386755e-06, + "loss": 0.1067, + "step": 3979 + }, + { + "epoch": 1.2896953985742061, + "grad_norm": 0.8195885419845581, + "learning_rate": 3.181336441978835e-06, + "loss": 0.1162, + "step": 3980 + }, + { + "epoch": 1.2900194426441995, + "grad_norm": 0.7261697053909302, + "learning_rate": 3.18049489953001e-06, + "loss": 0.1101, + "step": 3981 + }, + { + "epoch": 1.2903434867141932, + "grad_norm": 0.7609726190567017, + "learning_rate": 3.1796532737951975e-06, + "loss": 0.1089, + "step": 3982 + }, + { + "epoch": 1.2906675307841866, + "grad_norm": 0.8558682799339294, + "learning_rate": 3.1788115648774033e-06, + "loss": 0.1262, + "step": 3983 + }, + { + "epoch": 1.2909915748541803, + "grad_norm": 0.8411134481430054, + "learning_rate": 3.177969772879645e-06, + "loss": 0.113, + "step": 3984 + }, + { + "epoch": 1.2913156189241737, + "grad_norm": 0.8650814890861511, + "learning_rate": 3.1771278979049496e-06, + "loss": 0.1182, + "step": 3985 + }, + { + "epoch": 1.291639662994167, + "grad_norm": 0.8382723331451416, + "learning_rate": 3.176285940056355e-06, + "loss": 0.122, + "step": 3986 + }, + { + "epoch": 1.2919637070641607, + "grad_norm": 0.7756894826889038, + "learning_rate": 3.1754438994369087e-06, + "loss": 0.1129, + "step": 3987 + }, + { + "epoch": 1.2922877511341542, + "grad_norm": 0.8274821639060974, + "learning_rate": 3.174601776149668e-06, + "loss": 0.122, + "step": 3988 + }, + { + "epoch": 1.2926117952041478, + "grad_norm": 0.8245730400085449, + "learning_rate": 3.1737595702976996e-06, + "loss": 0.1165, + "step": 3989 + }, + { + "epoch": 1.2929358392741412, + "grad_norm": 0.833957850933075, + "learning_rate": 3.1729172819840825e-06, + "loss": 0.1195, + "step": 3990 + }, + { + "epoch": 1.2932598833441347, + "grad_norm": 0.7777155041694641, + "learning_rate": 3.1720749113119045e-06, + "loss": 0.109, + "step": 3991 + }, + { + "epoch": 1.2935839274141283, + "grad_norm": 0.8168273568153381, + "learning_rate": 3.1712324583842637e-06, + "loss": 0.1182, + "step": 3992 + }, + { + "epoch": 1.293907971484122, + "grad_norm": 0.8285285234451294, + "learning_rate": 3.1703899233042675e-06, + "loss": 0.1202, + "step": 3993 + }, + { + "epoch": 1.2942320155541154, + "grad_norm": 0.813170850276947, + "learning_rate": 3.1695473061750353e-06, + "loss": 0.1088, + "step": 3994 + }, + { + "epoch": 1.2945560596241088, + "grad_norm": 0.8755943775177002, + "learning_rate": 3.1687046070996942e-06, + "loss": 0.1231, + "step": 3995 + }, + { + "epoch": 1.2948801036941024, + "grad_norm": 0.848179042339325, + "learning_rate": 3.1678618261813828e-06, + "loss": 0.1127, + "step": 3996 + }, + { + "epoch": 1.2952041477640959, + "grad_norm": 0.8618291020393372, + "learning_rate": 3.167018963523249e-06, + "loss": 0.1241, + "step": 3997 + }, + { + "epoch": 1.2955281918340895, + "grad_norm": 0.8661099076271057, + "learning_rate": 3.1661760192284518e-06, + "loss": 0.1185, + "step": 3998 + }, + { + "epoch": 1.295852235904083, + "grad_norm": 0.8482186794281006, + "learning_rate": 3.165332993400159e-06, + "loss": 0.1263, + "step": 3999 + }, + { + "epoch": 1.2961762799740764, + "grad_norm": 0.8614206910133362, + "learning_rate": 3.1644898861415484e-06, + "loss": 0.1234, + "step": 4000 + }, + { + "epoch": 1.29650032404407, + "grad_norm": 0.8205857276916504, + "learning_rate": 3.163646697555809e-06, + "loss": 0.1192, + "step": 4001 + }, + { + "epoch": 1.2968243681140634, + "grad_norm": 0.8212650418281555, + "learning_rate": 3.1628034277461376e-06, + "loss": 0.1126, + "step": 4002 + }, + { + "epoch": 1.297148412184057, + "grad_norm": 0.9054401516914368, + "learning_rate": 3.161960076815743e-06, + "loss": 0.1238, + "step": 4003 + }, + { + "epoch": 1.2974724562540505, + "grad_norm": 0.7641319632530212, + "learning_rate": 3.1611166448678445e-06, + "loss": 0.1091, + "step": 4004 + }, + { + "epoch": 1.2977965003240441, + "grad_norm": 0.9123237133026123, + "learning_rate": 3.1602731320056675e-06, + "loss": 0.1245, + "step": 4005 + }, + { + "epoch": 1.2981205443940376, + "grad_norm": 0.8741422295570374, + "learning_rate": 3.159429538332452e-06, + "loss": 0.1302, + "step": 4006 + }, + { + "epoch": 1.2984445884640312, + "grad_norm": 0.8232343792915344, + "learning_rate": 3.1585858639514444e-06, + "loss": 0.1146, + "step": 4007 + }, + { + "epoch": 1.2987686325340246, + "grad_norm": 0.8644008636474609, + "learning_rate": 3.1577421089659023e-06, + "loss": 0.1153, + "step": 4008 + }, + { + "epoch": 1.299092676604018, + "grad_norm": 0.8121967315673828, + "learning_rate": 3.1568982734790943e-06, + "loss": 0.1167, + "step": 4009 + }, + { + "epoch": 1.2994167206740117, + "grad_norm": 0.8596533536911011, + "learning_rate": 3.1560543575942958e-06, + "loss": 0.1206, + "step": 4010 + }, + { + "epoch": 1.2997407647440051, + "grad_norm": 0.8099182844161987, + "learning_rate": 3.1552103614147955e-06, + "loss": 0.1108, + "step": 4011 + }, + { + "epoch": 1.3000648088139988, + "grad_norm": 0.8212376832962036, + "learning_rate": 3.1543662850438905e-06, + "loss": 0.1232, + "step": 4012 + }, + { + "epoch": 1.3003888528839922, + "grad_norm": 0.8360647559165955, + "learning_rate": 3.1535221285848866e-06, + "loss": 0.1204, + "step": 4013 + }, + { + "epoch": 1.3007128969539856, + "grad_norm": 0.8495321869850159, + "learning_rate": 3.1526778921411006e-06, + "loss": 0.1182, + "step": 4014 + }, + { + "epoch": 1.3010369410239793, + "grad_norm": 0.7739179730415344, + "learning_rate": 3.151833575815859e-06, + "loss": 0.1107, + "step": 4015 + }, + { + "epoch": 1.301360985093973, + "grad_norm": 0.8643922805786133, + "learning_rate": 3.1509891797124977e-06, + "loss": 0.1296, + "step": 4016 + }, + { + "epoch": 1.3016850291639663, + "grad_norm": 0.8476508855819702, + "learning_rate": 3.150144703934363e-06, + "loss": 0.126, + "step": 4017 + }, + { + "epoch": 1.3020090732339598, + "grad_norm": 0.8171755075454712, + "learning_rate": 3.149300148584811e-06, + "loss": 0.1062, + "step": 4018 + }, + { + "epoch": 1.3023331173039534, + "grad_norm": 0.731436014175415, + "learning_rate": 3.1484555137672063e-06, + "loss": 0.1098, + "step": 4019 + }, + { + "epoch": 1.3026571613739468, + "grad_norm": 0.7991498112678528, + "learning_rate": 3.147610799584924e-06, + "loss": 0.1142, + "step": 4020 + }, + { + "epoch": 1.3029812054439405, + "grad_norm": 0.8196418285369873, + "learning_rate": 3.1467660061413497e-06, + "loss": 0.1178, + "step": 4021 + }, + { + "epoch": 1.303305249513934, + "grad_norm": 0.8183781504631042, + "learning_rate": 3.1459211335398765e-06, + "loss": 0.1203, + "step": 4022 + }, + { + "epoch": 1.3036292935839273, + "grad_norm": 0.8166159391403198, + "learning_rate": 3.14507618188391e-06, + "loss": 0.1229, + "step": 4023 + }, + { + "epoch": 1.303953337653921, + "grad_norm": 0.8409057855606079, + "learning_rate": 3.144231151276864e-06, + "loss": 0.1155, + "step": 4024 + }, + { + "epoch": 1.3042773817239144, + "grad_norm": 0.8440519571304321, + "learning_rate": 3.143386041822162e-06, + "loss": 0.1224, + "step": 4025 + }, + { + "epoch": 1.304601425793908, + "grad_norm": 0.8403790593147278, + "learning_rate": 3.142540853623236e-06, + "loss": 0.1204, + "step": 4026 + }, + { + "epoch": 1.3049254698639015, + "grad_norm": 0.8980937004089355, + "learning_rate": 3.14169558678353e-06, + "loss": 0.1178, + "step": 4027 + }, + { + "epoch": 1.3052495139338949, + "grad_norm": 0.7783441543579102, + "learning_rate": 3.1408502414064963e-06, + "loss": 0.1024, + "step": 4028 + }, + { + "epoch": 1.3055735580038885, + "grad_norm": 0.7647897601127625, + "learning_rate": 3.140004817595597e-06, + "loss": 0.1079, + "step": 4029 + }, + { + "epoch": 1.3058976020738822, + "grad_norm": 0.8126063346862793, + "learning_rate": 3.1391593154543043e-06, + "loss": 0.1178, + "step": 4030 + }, + { + "epoch": 1.3062216461438756, + "grad_norm": 0.9322881698608398, + "learning_rate": 3.138313735086099e-06, + "loss": 0.1259, + "step": 4031 + }, + { + "epoch": 1.306545690213869, + "grad_norm": 0.8398449420928955, + "learning_rate": 3.137468076594471e-06, + "loss": 0.1201, + "step": 4032 + }, + { + "epoch": 1.3068697342838627, + "grad_norm": 0.825153648853302, + "learning_rate": 3.1366223400829215e-06, + "loss": 0.1136, + "step": 4033 + }, + { + "epoch": 1.307193778353856, + "grad_norm": 0.8654718995094299, + "learning_rate": 3.135776525654961e-06, + "loss": 0.1132, + "step": 4034 + }, + { + "epoch": 1.3075178224238497, + "grad_norm": 0.8122419714927673, + "learning_rate": 3.1349306334141084e-06, + "loss": 0.113, + "step": 4035 + }, + { + "epoch": 1.3078418664938432, + "grad_norm": 0.8519636988639832, + "learning_rate": 3.134084663463894e-06, + "loss": 0.1174, + "step": 4036 + }, + { + "epoch": 1.3081659105638366, + "grad_norm": 0.8526609539985657, + "learning_rate": 3.1332386159078536e-06, + "loss": 0.113, + "step": 4037 + }, + { + "epoch": 1.3084899546338302, + "grad_norm": 0.8123700618743896, + "learning_rate": 3.132392490849537e-06, + "loss": 0.1064, + "step": 4038 + }, + { + "epoch": 1.3088139987038236, + "grad_norm": 0.8796939253807068, + "learning_rate": 3.1315462883925026e-06, + "loss": 0.1277, + "step": 4039 + }, + { + "epoch": 1.3091380427738173, + "grad_norm": 0.8536302447319031, + "learning_rate": 3.1307000086403162e-06, + "loss": 0.1161, + "step": 4040 + }, + { + "epoch": 1.3094620868438107, + "grad_norm": 0.8248025178909302, + "learning_rate": 3.1298536516965537e-06, + "loss": 0.1204, + "step": 4041 + }, + { + "epoch": 1.3097861309138044, + "grad_norm": 0.8177707195281982, + "learning_rate": 3.129007217664802e-06, + "loss": 0.113, + "step": 4042 + }, + { + "epoch": 1.3101101749837978, + "grad_norm": 0.8128197193145752, + "learning_rate": 3.1281607066486565e-06, + "loss": 0.1063, + "step": 4043 + }, + { + "epoch": 1.3104342190537914, + "grad_norm": 0.7673478722572327, + "learning_rate": 3.127314118751721e-06, + "loss": 0.1068, + "step": 4044 + }, + { + "epoch": 1.3107582631237849, + "grad_norm": 0.8692690134048462, + "learning_rate": 3.12646745407761e-06, + "loss": 0.1221, + "step": 4045 + }, + { + "epoch": 1.3110823071937783, + "grad_norm": 0.8033125400543213, + "learning_rate": 3.1256207127299475e-06, + "loss": 0.117, + "step": 4046 + }, + { + "epoch": 1.311406351263772, + "grad_norm": 0.8258150219917297, + "learning_rate": 3.124773894812367e-06, + "loss": 0.114, + "step": 4047 + }, + { + "epoch": 1.3117303953337653, + "grad_norm": 0.8598066568374634, + "learning_rate": 3.123927000428509e-06, + "loss": 0.1243, + "step": 4048 + }, + { + "epoch": 1.312054439403759, + "grad_norm": 0.8733276724815369, + "learning_rate": 3.123080029682027e-06, + "loss": 0.1285, + "step": 4049 + }, + { + "epoch": 1.3123784834737524, + "grad_norm": 0.8112250566482544, + "learning_rate": 3.1222329826765806e-06, + "loss": 0.1169, + "step": 4050 + }, + { + "epoch": 1.3127025275437458, + "grad_norm": 0.8250533938407898, + "learning_rate": 3.121385859515842e-06, + "loss": 0.13, + "step": 4051 + }, + { + "epoch": 1.3130265716137395, + "grad_norm": 0.8050197958946228, + "learning_rate": 3.1205386603034886e-06, + "loss": 0.1154, + "step": 4052 + }, + { + "epoch": 1.3133506156837331, + "grad_norm": 0.819603443145752, + "learning_rate": 3.1196913851432108e-06, + "loss": 0.1146, + "step": 4053 + }, + { + "epoch": 1.3136746597537265, + "grad_norm": 0.7729926705360413, + "learning_rate": 3.1188440341387063e-06, + "loss": 0.1006, + "step": 4054 + }, + { + "epoch": 1.31399870382372, + "grad_norm": 0.7933728694915771, + "learning_rate": 3.1179966073936837e-06, + "loss": 0.1112, + "step": 4055 + }, + { + "epoch": 1.3143227478937136, + "grad_norm": 0.7561730146408081, + "learning_rate": 3.117149105011858e-06, + "loss": 0.1139, + "step": 4056 + }, + { + "epoch": 1.314646791963707, + "grad_norm": 0.9096986055374146, + "learning_rate": 3.1163015270969567e-06, + "loss": 0.1286, + "step": 4057 + }, + { + "epoch": 1.3149708360337007, + "grad_norm": 0.8904516100883484, + "learning_rate": 3.115453873752714e-06, + "loss": 0.1251, + "step": 4058 + }, + { + "epoch": 1.315294880103694, + "grad_norm": 1.1391996145248413, + "learning_rate": 3.114606145082876e-06, + "loss": 0.1189, + "step": 4059 + }, + { + "epoch": 1.3156189241736875, + "grad_norm": 0.841249406337738, + "learning_rate": 3.1137583411911954e-06, + "loss": 0.1183, + "step": 4060 + }, + { + "epoch": 1.3159429682436812, + "grad_norm": 0.7857441902160645, + "learning_rate": 3.1129104621814365e-06, + "loss": 0.1144, + "step": 4061 + }, + { + "epoch": 1.3162670123136746, + "grad_norm": 0.8113446235656738, + "learning_rate": 3.1120625081573696e-06, + "loss": 0.1098, + "step": 4062 + }, + { + "epoch": 1.3165910563836682, + "grad_norm": 0.8641226887702942, + "learning_rate": 3.1112144792227774e-06, + "loss": 0.1207, + "step": 4063 + }, + { + "epoch": 1.3169151004536617, + "grad_norm": 0.9279201030731201, + "learning_rate": 3.1103663754814493e-06, + "loss": 0.1371, + "step": 4064 + }, + { + "epoch": 1.317239144523655, + "grad_norm": 0.8208470940589905, + "learning_rate": 3.109518197037186e-06, + "loss": 0.1121, + "step": 4065 + }, + { + "epoch": 1.3175631885936487, + "grad_norm": 0.8214272856712341, + "learning_rate": 3.1086699439937957e-06, + "loss": 0.1141, + "step": 4066 + }, + { + "epoch": 1.3178872326636424, + "grad_norm": 0.8766685724258423, + "learning_rate": 3.1078216164550966e-06, + "loss": 0.1273, + "step": 4067 + }, + { + "epoch": 1.3182112767336358, + "grad_norm": 0.8116971254348755, + "learning_rate": 3.1069732145249166e-06, + "loss": 0.1043, + "step": 4068 + }, + { + "epoch": 1.3185353208036292, + "grad_norm": 0.7683243751525879, + "learning_rate": 3.1061247383070905e-06, + "loss": 0.1127, + "step": 4069 + }, + { + "epoch": 1.3188593648736229, + "grad_norm": 0.8518966436386108, + "learning_rate": 3.1052761879054637e-06, + "loss": 0.116, + "step": 4070 + }, + { + "epoch": 1.3191834089436163, + "grad_norm": 0.8467736840248108, + "learning_rate": 3.1044275634238913e-06, + "loss": 0.1158, + "step": 4071 + }, + { + "epoch": 1.31950745301361, + "grad_norm": 0.8552513718605042, + "learning_rate": 3.103578864966237e-06, + "loss": 0.1209, + "step": 4072 + }, + { + "epoch": 1.3198314970836034, + "grad_norm": 0.928536593914032, + "learning_rate": 3.1027300926363723e-06, + "loss": 0.1257, + "step": 4073 + }, + { + "epoch": 1.3201555411535968, + "grad_norm": 0.9464770555496216, + "learning_rate": 3.1018812465381796e-06, + "loss": 0.1207, + "step": 4074 + }, + { + "epoch": 1.3204795852235904, + "grad_norm": 0.8095433115959167, + "learning_rate": 3.1010323267755486e-06, + "loss": 0.1136, + "step": 4075 + }, + { + "epoch": 1.3208036292935839, + "grad_norm": 0.7639227509498596, + "learning_rate": 3.100183333452379e-06, + "loss": 0.1124, + "step": 4076 + }, + { + "epoch": 1.3211276733635775, + "grad_norm": 0.8170154094696045, + "learning_rate": 3.0993342666725803e-06, + "loss": 0.1136, + "step": 4077 + }, + { + "epoch": 1.321451717433571, + "grad_norm": 0.8701738119125366, + "learning_rate": 3.0984851265400683e-06, + "loss": 0.1177, + "step": 4078 + }, + { + "epoch": 1.3217757615035644, + "grad_norm": 0.8065285086631775, + "learning_rate": 3.097635913158772e-06, + "loss": 0.1141, + "step": 4079 + }, + { + "epoch": 1.322099805573558, + "grad_norm": 0.8607053160667419, + "learning_rate": 3.096786626632624e-06, + "loss": 0.1234, + "step": 4080 + }, + { + "epoch": 1.3224238496435516, + "grad_norm": 0.8417751789093018, + "learning_rate": 3.0959372670655714e-06, + "loss": 0.1201, + "step": 4081 + }, + { + "epoch": 1.322747893713545, + "grad_norm": 0.8256826996803284, + "learning_rate": 3.0950878345615654e-06, + "loss": 0.1238, + "step": 4082 + }, + { + "epoch": 1.3230719377835385, + "grad_norm": 0.8309873938560486, + "learning_rate": 3.0942383292245704e-06, + "loss": 0.1177, + "step": 4083 + }, + { + "epoch": 1.3233959818535321, + "grad_norm": 0.828330934047699, + "learning_rate": 3.0933887511585564e-06, + "loss": 0.117, + "step": 4084 + }, + { + "epoch": 1.3237200259235256, + "grad_norm": 0.8285284638404846, + "learning_rate": 3.0925391004675037e-06, + "loss": 0.1148, + "step": 4085 + }, + { + "epoch": 1.3240440699935192, + "grad_norm": 0.8505363464355469, + "learning_rate": 3.0916893772554006e-06, + "loss": 0.1135, + "step": 4086 + }, + { + "epoch": 1.3243681140635126, + "grad_norm": 0.8414075374603271, + "learning_rate": 3.0908395816262466e-06, + "loss": 0.1201, + "step": 4087 + }, + { + "epoch": 1.324692158133506, + "grad_norm": 0.9821286797523499, + "learning_rate": 3.0899897136840468e-06, + "loss": 0.1199, + "step": 4088 + }, + { + "epoch": 1.3250162022034997, + "grad_norm": 0.8443735837936401, + "learning_rate": 3.0891397735328176e-06, + "loss": 0.1157, + "step": 4089 + }, + { + "epoch": 1.3253402462734931, + "grad_norm": 0.8299155235290527, + "learning_rate": 3.088289761276584e-06, + "loss": 0.1094, + "step": 4090 + }, + { + "epoch": 1.3256642903434868, + "grad_norm": 0.8571391701698303, + "learning_rate": 3.0874396770193785e-06, + "loss": 0.13, + "step": 4091 + }, + { + "epoch": 1.3259883344134802, + "grad_norm": 0.8635405898094177, + "learning_rate": 3.0865895208652436e-06, + "loss": 0.1303, + "step": 4092 + }, + { + "epoch": 1.3263123784834738, + "grad_norm": 0.8105101585388184, + "learning_rate": 3.0857392929182296e-06, + "loss": 0.113, + "step": 4093 + }, + { + "epoch": 1.3266364225534673, + "grad_norm": 0.8883736729621887, + "learning_rate": 3.084888993282397e-06, + "loss": 0.1266, + "step": 4094 + }, + { + "epoch": 1.326960466623461, + "grad_norm": 0.9142129421234131, + "learning_rate": 3.0840386220618137e-06, + "loss": 0.1251, + "step": 4095 + }, + { + "epoch": 1.3272845106934543, + "grad_norm": 0.8064966201782227, + "learning_rate": 3.083188179360556e-06, + "loss": 0.1119, + "step": 4096 + }, + { + "epoch": 1.3276085547634477, + "grad_norm": 0.8080044984817505, + "learning_rate": 3.0823376652827123e-06, + "loss": 0.1174, + "step": 4097 + }, + { + "epoch": 1.3279325988334414, + "grad_norm": 0.8498368263244629, + "learning_rate": 3.0814870799323748e-06, + "loss": 0.1129, + "step": 4098 + }, + { + "epoch": 1.3282566429034348, + "grad_norm": 0.8681304454803467, + "learning_rate": 3.080636423413649e-06, + "loss": 0.1259, + "step": 4099 + }, + { + "epoch": 1.3285806869734285, + "grad_norm": 0.8149116635322571, + "learning_rate": 3.079785695830645e-06, + "loss": 0.1256, + "step": 4100 + }, + { + "epoch": 1.3289047310434219, + "grad_norm": 0.8620883226394653, + "learning_rate": 3.0789348972874844e-06, + "loss": 0.1142, + "step": 4101 + }, + { + "epoch": 1.3292287751134153, + "grad_norm": 0.790808379650116, + "learning_rate": 3.0780840278882974e-06, + "loss": 0.1068, + "step": 4102 + }, + { + "epoch": 1.329552819183409, + "grad_norm": 0.8280158042907715, + "learning_rate": 3.077233087737222e-06, + "loss": 0.1182, + "step": 4103 + }, + { + "epoch": 1.3298768632534026, + "grad_norm": 0.8379877805709839, + "learning_rate": 3.0763820769384038e-06, + "loss": 0.1151, + "step": 4104 + }, + { + "epoch": 1.330200907323396, + "grad_norm": 0.7841100692749023, + "learning_rate": 3.0755309955960007e-06, + "loss": 0.1126, + "step": 4105 + }, + { + "epoch": 1.3305249513933894, + "grad_norm": 0.8145238757133484, + "learning_rate": 3.074679843814174e-06, + "loss": 0.1166, + "step": 4106 + }, + { + "epoch": 1.330848995463383, + "grad_norm": 0.8462419509887695, + "learning_rate": 3.073828621697098e-06, + "loss": 0.1243, + "step": 4107 + }, + { + "epoch": 1.3311730395333765, + "grad_norm": 0.8405357003211975, + "learning_rate": 3.072977329348954e-06, + "loss": 0.1231, + "step": 4108 + }, + { + "epoch": 1.3314970836033702, + "grad_norm": 0.9085520505905151, + "learning_rate": 3.072125966873932e-06, + "loss": 0.1308, + "step": 4109 + }, + { + "epoch": 1.3318211276733636, + "grad_norm": 0.8673784136772156, + "learning_rate": 3.0712745343762295e-06, + "loss": 0.1177, + "step": 4110 + }, + { + "epoch": 1.332145171743357, + "grad_norm": 0.7908552885055542, + "learning_rate": 3.0704230319600547e-06, + "loss": 0.1072, + "step": 4111 + }, + { + "epoch": 1.3324692158133506, + "grad_norm": 0.7814868092536926, + "learning_rate": 3.069571459729623e-06, + "loss": 0.1146, + "step": 4112 + }, + { + "epoch": 1.332793259883344, + "grad_norm": 0.7968869209289551, + "learning_rate": 3.068719817789158e-06, + "loss": 0.1108, + "step": 4113 + }, + { + "epoch": 1.3331173039533377, + "grad_norm": 0.8128607869148254, + "learning_rate": 3.067868106242894e-06, + "loss": 0.1139, + "step": 4114 + }, + { + "epoch": 1.3334413480233311, + "grad_norm": 0.8034228682518005, + "learning_rate": 3.0670163251950703e-06, + "loss": 0.1082, + "step": 4115 + }, + { + "epoch": 1.3337653920933246, + "grad_norm": 0.8152320384979248, + "learning_rate": 3.0661644747499385e-06, + "loss": 0.1158, + "step": 4116 + }, + { + "epoch": 1.3340894361633182, + "grad_norm": 0.8252043128013611, + "learning_rate": 3.0653125550117547e-06, + "loss": 0.1136, + "step": 4117 + }, + { + "epoch": 1.3344134802333119, + "grad_norm": 0.8656401038169861, + "learning_rate": 3.0644605660847875e-06, + "loss": 0.1219, + "step": 4118 + }, + { + "epoch": 1.3347375243033053, + "grad_norm": 0.7695584297180176, + "learning_rate": 3.0636085080733113e-06, + "loss": 0.1087, + "step": 4119 + }, + { + "epoch": 1.3350615683732987, + "grad_norm": 0.8830423355102539, + "learning_rate": 3.0627563810816097e-06, + "loss": 0.1213, + "step": 4120 + }, + { + "epoch": 1.3353856124432923, + "grad_norm": 0.8353223204612732, + "learning_rate": 3.0619041852139746e-06, + "loss": 0.1192, + "step": 4121 + }, + { + "epoch": 1.3357096565132858, + "grad_norm": 0.7975737452507019, + "learning_rate": 3.061051920574708e-06, + "loss": 0.1153, + "step": 4122 + }, + { + "epoch": 1.3360337005832794, + "grad_norm": 0.8361819386482239, + "learning_rate": 3.0601995872681167e-06, + "loss": 0.1178, + "step": 4123 + }, + { + "epoch": 1.3363577446532728, + "grad_norm": 0.7925208806991577, + "learning_rate": 3.0593471853985197e-06, + "loss": 0.1085, + "step": 4124 + }, + { + "epoch": 1.3366817887232663, + "grad_norm": 0.8033835887908936, + "learning_rate": 3.058494715070242e-06, + "loss": 0.1076, + "step": 4125 + }, + { + "epoch": 1.33700583279326, + "grad_norm": 0.8789135813713074, + "learning_rate": 3.0576421763876174e-06, + "loss": 0.1258, + "step": 4126 + }, + { + "epoch": 1.3373298768632533, + "grad_norm": 0.8176473379135132, + "learning_rate": 3.056789569454989e-06, + "loss": 0.1133, + "step": 4127 + }, + { + "epoch": 1.337653920933247, + "grad_norm": 0.8783532381057739, + "learning_rate": 3.055936894376708e-06, + "loss": 0.1224, + "step": 4128 + }, + { + "epoch": 1.3379779650032404, + "grad_norm": 0.8398758769035339, + "learning_rate": 3.055084151257133e-06, + "loss": 0.1184, + "step": 4129 + }, + { + "epoch": 1.3383020090732338, + "grad_norm": 0.7387755513191223, + "learning_rate": 3.054231340200631e-06, + "loss": 0.0991, + "step": 4130 + }, + { + "epoch": 1.3386260531432275, + "grad_norm": 0.7608300447463989, + "learning_rate": 3.053378461311578e-06, + "loss": 0.1077, + "step": 4131 + }, + { + "epoch": 1.3389500972132211, + "grad_norm": 0.781328558921814, + "learning_rate": 3.0525255146943582e-06, + "loss": 0.1077, + "step": 4132 + }, + { + "epoch": 1.3392741412832145, + "grad_norm": 0.8142044544219971, + "learning_rate": 3.0516725004533648e-06, + "loss": 0.1176, + "step": 4133 + }, + { + "epoch": 1.339598185353208, + "grad_norm": 0.7998998165130615, + "learning_rate": 3.0508194186929983e-06, + "loss": 0.1114, + "step": 4134 + }, + { + "epoch": 1.3399222294232016, + "grad_norm": 0.7751430869102478, + "learning_rate": 3.0499662695176675e-06, + "loss": 0.1102, + "step": 4135 + }, + { + "epoch": 1.340246273493195, + "grad_norm": 0.8340958952903748, + "learning_rate": 3.0491130530317887e-06, + "loss": 0.1214, + "step": 4136 + }, + { + "epoch": 1.3405703175631887, + "grad_norm": 0.869506299495697, + "learning_rate": 3.0482597693397887e-06, + "loss": 0.1162, + "step": 4137 + }, + { + "epoch": 1.340894361633182, + "grad_norm": 0.8065547943115234, + "learning_rate": 3.0474064185461e-06, + "loss": 0.1193, + "step": 4138 + }, + { + "epoch": 1.3412184057031755, + "grad_norm": 0.8128240704536438, + "learning_rate": 3.0465530007551646e-06, + "loss": 0.1194, + "step": 4139 + }, + { + "epoch": 1.3415424497731692, + "grad_norm": 0.7759813666343689, + "learning_rate": 3.0456995160714344e-06, + "loss": 0.1123, + "step": 4140 + }, + { + "epoch": 1.3418664938431626, + "grad_norm": 0.7582803964614868, + "learning_rate": 3.044845964599365e-06, + "loss": 0.1106, + "step": 4141 + }, + { + "epoch": 1.3421905379131562, + "grad_norm": 0.7886553406715393, + "learning_rate": 3.043992346443424e-06, + "loss": 0.1079, + "step": 4142 + }, + { + "epoch": 1.3425145819831497, + "grad_norm": 0.778815746307373, + "learning_rate": 3.043138661708086e-06, + "loss": 0.1089, + "step": 4143 + }, + { + "epoch": 1.3428386260531433, + "grad_norm": 0.8232936859130859, + "learning_rate": 3.042284910497834e-06, + "loss": 0.1248, + "step": 4144 + }, + { + "epoch": 1.3431626701231367, + "grad_norm": 0.8517846465110779, + "learning_rate": 3.0414310929171587e-06, + "loss": 0.1183, + "step": 4145 + }, + { + "epoch": 1.3434867141931304, + "grad_norm": 0.8031988143920898, + "learning_rate": 3.04057720907056e-06, + "loss": 0.1153, + "step": 4146 + }, + { + "epoch": 1.3438107582631238, + "grad_norm": 0.8768326640129089, + "learning_rate": 3.039723259062543e-06, + "loss": 0.1196, + "step": 4147 + }, + { + "epoch": 1.3441348023331172, + "grad_norm": 0.8870144486427307, + "learning_rate": 3.0388692429976247e-06, + "loss": 0.12, + "step": 4148 + }, + { + "epoch": 1.3444588464031109, + "grad_norm": 0.8968208432197571, + "learning_rate": 3.038015160980327e-06, + "loss": 0.1258, + "step": 4149 + }, + { + "epoch": 1.3447828904731043, + "grad_norm": 0.8066931366920471, + "learning_rate": 3.0371610131151823e-06, + "loss": 0.1162, + "step": 4150 + }, + { + "epoch": 1.345106934543098, + "grad_norm": 0.8446550965309143, + "learning_rate": 3.0363067995067297e-06, + "loss": 0.1218, + "step": 4151 + }, + { + "epoch": 1.3454309786130914, + "grad_norm": 0.7766255736351013, + "learning_rate": 3.035452520259517e-06, + "loss": 0.1122, + "step": 4152 + }, + { + "epoch": 1.3457550226830848, + "grad_norm": 0.8000936508178711, + "learning_rate": 3.034598175478099e-06, + "loss": 0.1174, + "step": 4153 + }, + { + "epoch": 1.3460790667530784, + "grad_norm": 0.825092077255249, + "learning_rate": 3.03374376526704e-06, + "loss": 0.1217, + "step": 4154 + }, + { + "epoch": 1.346403110823072, + "grad_norm": 0.8683377504348755, + "learning_rate": 3.0328892897309105e-06, + "loss": 0.1277, + "step": 4155 + }, + { + "epoch": 1.3467271548930655, + "grad_norm": 0.8227855563163757, + "learning_rate": 3.0320347489742905e-06, + "loss": 0.1121, + "step": 4156 + }, + { + "epoch": 1.347051198963059, + "grad_norm": 0.8627481460571289, + "learning_rate": 3.031180143101769e-06, + "loss": 0.1163, + "step": 4157 + }, + { + "epoch": 1.3473752430330526, + "grad_norm": 0.8161624073982239, + "learning_rate": 3.03032547221794e-06, + "loss": 0.1209, + "step": 4158 + }, + { + "epoch": 1.347699287103046, + "grad_norm": 0.7790018320083618, + "learning_rate": 3.0294707364274066e-06, + "loss": 0.1144, + "step": 4159 + }, + { + "epoch": 1.3480233311730396, + "grad_norm": 0.8219239115715027, + "learning_rate": 3.028615935834781e-06, + "loss": 0.114, + "step": 4160 + }, + { + "epoch": 1.348347375243033, + "grad_norm": 0.7680139541625977, + "learning_rate": 3.027761070544682e-06, + "loss": 0.1078, + "step": 4161 + }, + { + "epoch": 1.3486714193130265, + "grad_norm": 0.8134229183197021, + "learning_rate": 3.026906140661737e-06, + "loss": 0.1065, + "step": 4162 + }, + { + "epoch": 1.3489954633830201, + "grad_norm": 0.8216208815574646, + "learning_rate": 3.026051146290581e-06, + "loss": 0.1227, + "step": 4163 + }, + { + "epoch": 1.3493195074530135, + "grad_norm": 0.8143205642700195, + "learning_rate": 3.025196087535858e-06, + "loss": 0.1202, + "step": 4164 + }, + { + "epoch": 1.3496435515230072, + "grad_norm": 0.8093990087509155, + "learning_rate": 3.024340964502218e-06, + "loss": 0.1106, + "step": 4165 + }, + { + "epoch": 1.3499675955930006, + "grad_norm": 0.7998570799827576, + "learning_rate": 3.0234857772943197e-06, + "loss": 0.1032, + "step": 4166 + }, + { + "epoch": 1.350291639662994, + "grad_norm": 0.874220073223114, + "learning_rate": 3.0226305260168298e-06, + "loss": 0.129, + "step": 4167 + }, + { + "epoch": 1.3506156837329877, + "grad_norm": 0.8281093239784241, + "learning_rate": 3.0217752107744237e-06, + "loss": 0.1122, + "step": 4168 + }, + { + "epoch": 1.3509397278029813, + "grad_norm": 0.8350166082382202, + "learning_rate": 3.0209198316717825e-06, + "loss": 0.1187, + "step": 4169 + }, + { + "epoch": 1.3512637718729748, + "grad_norm": 0.904016375541687, + "learning_rate": 3.0200643888135973e-06, + "loss": 0.13, + "step": 4170 + }, + { + "epoch": 1.3515878159429682, + "grad_norm": 0.7951503396034241, + "learning_rate": 3.019208882304565e-06, + "loss": 0.1036, + "step": 4171 + }, + { + "epoch": 1.3519118600129618, + "grad_norm": 0.9027987122535706, + "learning_rate": 3.0183533122493917e-06, + "loss": 0.1325, + "step": 4172 + }, + { + "epoch": 1.3522359040829552, + "grad_norm": 0.879817545413971, + "learning_rate": 3.017497678752791e-06, + "loss": 0.1233, + "step": 4173 + }, + { + "epoch": 1.3525599481529489, + "grad_norm": 0.8084651231765747, + "learning_rate": 3.016641981919485e-06, + "loss": 0.1125, + "step": 4174 + }, + { + "epoch": 1.3528839922229423, + "grad_norm": 0.8651638031005859, + "learning_rate": 3.0157862218542004e-06, + "loss": 0.1302, + "step": 4175 + }, + { + "epoch": 1.3532080362929357, + "grad_norm": 0.8165826201438904, + "learning_rate": 3.0149303986616772e-06, + "loss": 0.117, + "step": 4176 + }, + { + "epoch": 1.3535320803629294, + "grad_norm": 0.8501429557800293, + "learning_rate": 3.014074512446657e-06, + "loss": 0.1255, + "step": 4177 + }, + { + "epoch": 1.3538561244329228, + "grad_norm": 0.8126803636550903, + "learning_rate": 3.0132185633138934e-06, + "loss": 0.1159, + "step": 4178 + }, + { + "epoch": 1.3541801685029164, + "grad_norm": 0.8380131721496582, + "learning_rate": 3.0123625513681463e-06, + "loss": 0.1199, + "step": 4179 + }, + { + "epoch": 1.3545042125729099, + "grad_norm": 0.8589929938316345, + "learning_rate": 3.0115064767141827e-06, + "loss": 0.1251, + "step": 4180 + }, + { + "epoch": 1.3548282566429035, + "grad_norm": 0.7794283032417297, + "learning_rate": 3.0106503394567775e-06, + "loss": 0.1142, + "step": 4181 + }, + { + "epoch": 1.355152300712897, + "grad_norm": 0.7469875812530518, + "learning_rate": 3.0097941397007156e-06, + "loss": 0.1103, + "step": 4182 + }, + { + "epoch": 1.3554763447828906, + "grad_norm": 0.8287574648857117, + "learning_rate": 3.008937877550785e-06, + "loss": 0.1221, + "step": 4183 + }, + { + "epoch": 1.355800388852884, + "grad_norm": 0.8686206340789795, + "learning_rate": 3.008081553111786e-06, + "loss": 0.1297, + "step": 4184 + }, + { + "epoch": 1.3561244329228774, + "grad_norm": 0.7942254543304443, + "learning_rate": 3.0072251664885222e-06, + "loss": 0.1146, + "step": 4185 + }, + { + "epoch": 1.356448476992871, + "grad_norm": 0.7875716686248779, + "learning_rate": 3.006368717785809e-06, + "loss": 0.1089, + "step": 4186 + }, + { + "epoch": 1.3567725210628645, + "grad_norm": 0.7777183651924133, + "learning_rate": 3.005512207108467e-06, + "loss": 0.1094, + "step": 4187 + }, + { + "epoch": 1.3570965651328581, + "grad_norm": 0.8916992545127869, + "learning_rate": 3.004655634561325e-06, + "loss": 0.1252, + "step": 4188 + }, + { + "epoch": 1.3574206092028516, + "grad_norm": 0.8388050198554993, + "learning_rate": 3.003799000249218e-06, + "loss": 0.1269, + "step": 4189 + }, + { + "epoch": 1.357744653272845, + "grad_norm": 0.8051396608352661, + "learning_rate": 3.002942304276991e-06, + "loss": 0.1152, + "step": 4190 + }, + { + "epoch": 1.3580686973428386, + "grad_norm": 0.8340529203414917, + "learning_rate": 3.002085546749495e-06, + "loss": 0.1157, + "step": 4191 + }, + { + "epoch": 1.3583927414128323, + "grad_norm": 0.8595792651176453, + "learning_rate": 3.001228727771588e-06, + "loss": 0.129, + "step": 4192 + }, + { + "epoch": 1.3587167854828257, + "grad_norm": 0.7768383026123047, + "learning_rate": 3.000371847448137e-06, + "loss": 0.1055, + "step": 4193 + }, + { + "epoch": 1.3590408295528191, + "grad_norm": 0.8091722130775452, + "learning_rate": 2.9995149058840157e-06, + "loss": 0.1112, + "step": 4194 + }, + { + "epoch": 1.3593648736228128, + "grad_norm": 0.809382975101471, + "learning_rate": 2.998657903184107e-06, + "loss": 0.1147, + "step": 4195 + }, + { + "epoch": 1.3596889176928062, + "grad_norm": 0.8365122675895691, + "learning_rate": 2.9978008394532966e-06, + "loss": 0.1181, + "step": 4196 + }, + { + "epoch": 1.3600129617627998, + "grad_norm": 0.8634129762649536, + "learning_rate": 2.996943714796483e-06, + "loss": 0.1212, + "step": 4197 + }, + { + "epoch": 1.3603370058327933, + "grad_norm": 0.8216782808303833, + "learning_rate": 2.9960865293185697e-06, + "loss": 0.1161, + "step": 4198 + }, + { + "epoch": 1.3606610499027867, + "grad_norm": 0.8733593225479126, + "learning_rate": 2.995229283124468e-06, + "loss": 0.1126, + "step": 4199 + }, + { + "epoch": 1.3609850939727803, + "grad_norm": 0.7830566763877869, + "learning_rate": 2.994371976319096e-06, + "loss": 0.117, + "step": 4200 + }, + { + "epoch": 1.3613091380427738, + "grad_norm": 0.8722835183143616, + "learning_rate": 2.993514609007381e-06, + "loss": 0.1305, + "step": 4201 + }, + { + "epoch": 1.3616331821127674, + "grad_norm": 0.7920193672180176, + "learning_rate": 2.992657181294254e-06, + "loss": 0.109, + "step": 4202 + }, + { + "epoch": 1.3619572261827608, + "grad_norm": 0.7887697815895081, + "learning_rate": 2.9917996932846572e-06, + "loss": 0.1066, + "step": 4203 + }, + { + "epoch": 1.3622812702527543, + "grad_norm": 0.9010506868362427, + "learning_rate": 2.99094214508354e-06, + "loss": 0.1252, + "step": 4204 + }, + { + "epoch": 1.362605314322748, + "grad_norm": 0.7866716384887695, + "learning_rate": 2.990084536795856e-06, + "loss": 0.1037, + "step": 4205 + }, + { + "epoch": 1.3629293583927415, + "grad_norm": 0.7375853657722473, + "learning_rate": 2.989226868526569e-06, + "loss": 0.105, + "step": 4206 + }, + { + "epoch": 1.363253402462735, + "grad_norm": 0.7753633260726929, + "learning_rate": 2.98836914038065e-06, + "loss": 0.111, + "step": 4207 + }, + { + "epoch": 1.3635774465327284, + "grad_norm": 0.811918318271637, + "learning_rate": 2.987511352463076e-06, + "loss": 0.1184, + "step": 4208 + }, + { + "epoch": 1.363901490602722, + "grad_norm": 0.7513830065727234, + "learning_rate": 2.9866535048788314e-06, + "loss": 0.1086, + "step": 4209 + }, + { + "epoch": 1.3642255346727155, + "grad_norm": 0.8139031529426575, + "learning_rate": 2.9857955977329095e-06, + "loss": 0.1147, + "step": 4210 + }, + { + "epoch": 1.364549578742709, + "grad_norm": 0.8000282645225525, + "learning_rate": 2.9849376311303095e-06, + "loss": 0.1095, + "step": 4211 + }, + { + "epoch": 1.3648736228127025, + "grad_norm": 0.8604514598846436, + "learning_rate": 2.984079605176038e-06, + "loss": 0.1242, + "step": 4212 + }, + { + "epoch": 1.365197666882696, + "grad_norm": 0.8277812004089355, + "learning_rate": 2.9832215199751085e-06, + "loss": 0.1174, + "step": 4213 + }, + { + "epoch": 1.3655217109526896, + "grad_norm": 0.809662938117981, + "learning_rate": 2.9823633756325433e-06, + "loss": 0.1101, + "step": 4214 + }, + { + "epoch": 1.365845755022683, + "grad_norm": 0.8288755416870117, + "learning_rate": 2.9815051722533707e-06, + "loss": 0.1194, + "step": 4215 + }, + { + "epoch": 1.3661697990926767, + "grad_norm": 0.7696095108985901, + "learning_rate": 2.9806469099426254e-06, + "loss": 0.1036, + "step": 4216 + }, + { + "epoch": 1.36649384316267, + "grad_norm": 0.8410941958427429, + "learning_rate": 2.9797885888053517e-06, + "loss": 0.1171, + "step": 4217 + }, + { + "epoch": 1.3668178872326635, + "grad_norm": 0.8517903685569763, + "learning_rate": 2.9789302089466e-06, + "loss": 0.1236, + "step": 4218 + }, + { + "epoch": 1.3671419313026572, + "grad_norm": 0.8763234615325928, + "learning_rate": 2.978071770471427e-06, + "loss": 0.1238, + "step": 4219 + }, + { + "epoch": 1.3674659753726508, + "grad_norm": 0.8843665719032288, + "learning_rate": 2.9772132734848974e-06, + "loss": 0.1258, + "step": 4220 + }, + { + "epoch": 1.3677900194426442, + "grad_norm": 0.8795833587646484, + "learning_rate": 2.9763547180920825e-06, + "loss": 0.1202, + "step": 4221 + }, + { + "epoch": 1.3681140635126376, + "grad_norm": 0.7880746126174927, + "learning_rate": 2.9754961043980623e-06, + "loss": 0.1171, + "step": 4222 + }, + { + "epoch": 1.3684381075826313, + "grad_norm": 0.9248366951942444, + "learning_rate": 2.9746374325079213e-06, + "loss": 0.1184, + "step": 4223 + }, + { + "epoch": 1.3687621516526247, + "grad_norm": 0.9103997945785522, + "learning_rate": 2.973778702526754e-06, + "loss": 0.131, + "step": 4224 + }, + { + "epoch": 1.3690861957226184, + "grad_norm": 0.8283572793006897, + "learning_rate": 2.97291991455966e-06, + "loss": 0.1162, + "step": 4225 + }, + { + "epoch": 1.3694102397926118, + "grad_norm": 0.82844078540802, + "learning_rate": 2.9720610687117462e-06, + "loss": 0.1115, + "step": 4226 + }, + { + "epoch": 1.3697342838626052, + "grad_norm": 0.8517773747444153, + "learning_rate": 2.971202165088128e-06, + "loss": 0.1209, + "step": 4227 + }, + { + "epoch": 1.3700583279325989, + "grad_norm": 0.8081420063972473, + "learning_rate": 2.9703432037939255e-06, + "loss": 0.1171, + "step": 4228 + }, + { + "epoch": 1.3703823720025923, + "grad_norm": 0.8540878295898438, + "learning_rate": 2.9694841849342688e-06, + "loss": 0.1222, + "step": 4229 + }, + { + "epoch": 1.370706416072586, + "grad_norm": 0.8613157868385315, + "learning_rate": 2.9686251086142927e-06, + "loss": 0.118, + "step": 4230 + }, + { + "epoch": 1.3710304601425793, + "grad_norm": 0.8323705196380615, + "learning_rate": 2.9677659749391404e-06, + "loss": 0.1134, + "step": 4231 + }, + { + "epoch": 1.371354504212573, + "grad_norm": 0.8358492255210876, + "learning_rate": 2.9669067840139603e-06, + "loss": 0.1126, + "step": 4232 + }, + { + "epoch": 1.3716785482825664, + "grad_norm": 0.8735930323600769, + "learning_rate": 2.9660475359439113e-06, + "loss": 0.1258, + "step": 4233 + }, + { + "epoch": 1.37200259235256, + "grad_norm": 0.889940619468689, + "learning_rate": 2.965188230834154e-06, + "loss": 0.1188, + "step": 4234 + }, + { + "epoch": 1.3723266364225535, + "grad_norm": 0.8550429344177246, + "learning_rate": 2.9643288687898614e-06, + "loss": 0.1218, + "step": 4235 + }, + { + "epoch": 1.372650680492547, + "grad_norm": 0.8896758556365967, + "learning_rate": 2.96346944991621e-06, + "loss": 0.1309, + "step": 4236 + }, + { + "epoch": 1.3729747245625405, + "grad_norm": 0.8091087341308594, + "learning_rate": 2.962609974318385e-06, + "loss": 0.1177, + "step": 4237 + }, + { + "epoch": 1.373298768632534, + "grad_norm": 0.837196946144104, + "learning_rate": 2.961750442101577e-06, + "loss": 0.1116, + "step": 4238 + }, + { + "epoch": 1.3736228127025276, + "grad_norm": 0.8127410411834717, + "learning_rate": 2.9608908533709852e-06, + "loss": 0.1112, + "step": 4239 + }, + { + "epoch": 1.373946856772521, + "grad_norm": 0.8529530167579651, + "learning_rate": 2.9600312082318144e-06, + "loss": 0.1249, + "step": 4240 + }, + { + "epoch": 1.3742709008425145, + "grad_norm": 0.8476062417030334, + "learning_rate": 2.9591715067892777e-06, + "loss": 0.1173, + "step": 4241 + }, + { + "epoch": 1.374594944912508, + "grad_norm": 0.8822113275527954, + "learning_rate": 2.958311749148594e-06, + "loss": 0.1212, + "step": 4242 + }, + { + "epoch": 1.3749189889825018, + "grad_norm": 0.8067193627357483, + "learning_rate": 2.9574519354149884e-06, + "loss": 0.1244, + "step": 4243 + }, + { + "epoch": 1.3752430330524952, + "grad_norm": 0.8280297517776489, + "learning_rate": 2.9565920656936947e-06, + "loss": 0.1161, + "step": 4244 + }, + { + "epoch": 1.3755670771224886, + "grad_norm": 0.791181743144989, + "learning_rate": 2.9557321400899524e-06, + "loss": 0.1201, + "step": 4245 + }, + { + "epoch": 1.3758911211924822, + "grad_norm": 0.8039045333862305, + "learning_rate": 2.9548721587090075e-06, + "loss": 0.1093, + "step": 4246 + }, + { + "epoch": 1.3762151652624757, + "grad_norm": 0.7414494752883911, + "learning_rate": 2.954012121656114e-06, + "loss": 0.0999, + "step": 4247 + }, + { + "epoch": 1.3765392093324693, + "grad_norm": 0.7822027802467346, + "learning_rate": 2.9531520290365316e-06, + "loss": 0.1073, + "step": 4248 + }, + { + "epoch": 1.3768632534024627, + "grad_norm": 0.8837736248970032, + "learning_rate": 2.952291880955529e-06, + "loss": 0.1187, + "step": 4249 + }, + { + "epoch": 1.3771872974724562, + "grad_norm": 0.8309406638145447, + "learning_rate": 2.9514316775183777e-06, + "loss": 0.1246, + "step": 4250 + }, + { + "epoch": 1.3775113415424498, + "grad_norm": 0.7587549090385437, + "learning_rate": 2.950571418830359e-06, + "loss": 0.1067, + "step": 4251 + }, + { + "epoch": 1.3778353856124432, + "grad_norm": 0.8112297654151917, + "learning_rate": 2.949711104996761e-06, + "loss": 0.1227, + "step": 4252 + }, + { + "epoch": 1.3781594296824369, + "grad_norm": 0.7933392524719238, + "learning_rate": 2.948850736122878e-06, + "loss": 0.1188, + "step": 4253 + }, + { + "epoch": 1.3784834737524303, + "grad_norm": 0.8585779666900635, + "learning_rate": 2.947990312314009e-06, + "loss": 0.1321, + "step": 4254 + }, + { + "epoch": 1.3788075178224237, + "grad_norm": 0.8090001940727234, + "learning_rate": 2.9471298336754633e-06, + "loss": 0.1171, + "step": 4255 + }, + { + "epoch": 1.3791315618924174, + "grad_norm": 0.8341668844223022, + "learning_rate": 2.9462693003125544e-06, + "loss": 0.1124, + "step": 4256 + }, + { + "epoch": 1.379455605962411, + "grad_norm": 0.7895886301994324, + "learning_rate": 2.945408712330603e-06, + "loss": 0.1157, + "step": 4257 + }, + { + "epoch": 1.3797796500324044, + "grad_norm": 0.8023675084114075, + "learning_rate": 2.944548069834937e-06, + "loss": 0.1195, + "step": 4258 + }, + { + "epoch": 1.3801036941023979, + "grad_norm": 0.8319817781448364, + "learning_rate": 2.943687372930891e-06, + "loss": 0.1146, + "step": 4259 + }, + { + "epoch": 1.3804277381723915, + "grad_norm": 0.7980121374130249, + "learning_rate": 2.942826621723806e-06, + "loss": 0.1168, + "step": 4260 + }, + { + "epoch": 1.380751782242385, + "grad_norm": 0.8253116607666016, + "learning_rate": 2.9419658163190295e-06, + "loss": 0.1193, + "step": 4261 + }, + { + "epoch": 1.3810758263123786, + "grad_norm": 0.7385011911392212, + "learning_rate": 2.9411049568219153e-06, + "loss": 0.1051, + "step": 4262 + }, + { + "epoch": 1.381399870382372, + "grad_norm": 0.7881051301956177, + "learning_rate": 2.9402440433378247e-06, + "loss": 0.1146, + "step": 4263 + }, + { + "epoch": 1.3817239144523654, + "grad_norm": 0.8268387913703918, + "learning_rate": 2.939383075972125e-06, + "loss": 0.1144, + "step": 4264 + }, + { + "epoch": 1.382047958522359, + "grad_norm": 0.8658704161643982, + "learning_rate": 2.9385220548301906e-06, + "loss": 0.1187, + "step": 4265 + }, + { + "epoch": 1.3823720025923525, + "grad_norm": 0.8008397817611694, + "learning_rate": 2.937660980017402e-06, + "loss": 0.1109, + "step": 4266 + }, + { + "epoch": 1.3826960466623461, + "grad_norm": 0.8876312971115112, + "learning_rate": 2.936799851639146e-06, + "loss": 0.1232, + "step": 4267 + }, + { + "epoch": 1.3830200907323396, + "grad_norm": 0.8302385807037354, + "learning_rate": 2.9359386698008172e-06, + "loss": 0.1146, + "step": 4268 + }, + { + "epoch": 1.383344134802333, + "grad_norm": 0.8758426904678345, + "learning_rate": 2.935077434607815e-06, + "loss": 0.1123, + "step": 4269 + }, + { + "epoch": 1.3836681788723266, + "grad_norm": 0.8773407340049744, + "learning_rate": 2.9342161461655468e-06, + "loss": 0.1195, + "step": 4270 + }, + { + "epoch": 1.3839922229423203, + "grad_norm": 0.7563143372535706, + "learning_rate": 2.9333548045794253e-06, + "loss": 0.1115, + "step": 4271 + }, + { + "epoch": 1.3843162670123137, + "grad_norm": 0.8111708164215088, + "learning_rate": 2.9324934099548713e-06, + "loss": 0.1113, + "step": 4272 + }, + { + "epoch": 1.3846403110823071, + "grad_norm": 0.922406792640686, + "learning_rate": 2.931631962397311e-06, + "loss": 0.1287, + "step": 4273 + }, + { + "epoch": 1.3849643551523008, + "grad_norm": 0.7872484922409058, + "learning_rate": 2.9307704620121775e-06, + "loss": 0.1143, + "step": 4274 + }, + { + "epoch": 1.3852883992222942, + "grad_norm": 0.8987158536911011, + "learning_rate": 2.9299089089049092e-06, + "loss": 0.1223, + "step": 4275 + }, + { + "epoch": 1.3856124432922878, + "grad_norm": 0.7530018091201782, + "learning_rate": 2.929047303180952e-06, + "loss": 0.1081, + "step": 4276 + }, + { + "epoch": 1.3859364873622813, + "grad_norm": 0.8436494469642639, + "learning_rate": 2.9281856449457587e-06, + "loss": 0.1177, + "step": 4277 + }, + { + "epoch": 1.3862605314322747, + "grad_norm": 0.8935341238975525, + "learning_rate": 2.927323934304787e-06, + "loss": 0.1408, + "step": 4278 + }, + { + "epoch": 1.3865845755022683, + "grad_norm": 0.750361979007721, + "learning_rate": 2.926462171363503e-06, + "loss": 0.1105, + "step": 4279 + }, + { + "epoch": 1.3869086195722617, + "grad_norm": 0.8003627061843872, + "learning_rate": 2.9256003562273784e-06, + "loss": 0.1154, + "step": 4280 + }, + { + "epoch": 1.3872326636422554, + "grad_norm": 0.8266944885253906, + "learning_rate": 2.924738489001889e-06, + "loss": 0.1148, + "step": 4281 + }, + { + "epoch": 1.3875567077122488, + "grad_norm": 0.9039225578308105, + "learning_rate": 2.923876569792521e-06, + "loss": 0.1333, + "step": 4282 + }, + { + "epoch": 1.3878807517822425, + "grad_norm": 0.8375152945518494, + "learning_rate": 2.923014598704764e-06, + "loss": 0.1231, + "step": 4283 + }, + { + "epoch": 1.3882047958522359, + "grad_norm": 0.7849745154380798, + "learning_rate": 2.9221525758441155e-06, + "loss": 0.1195, + "step": 4284 + }, + { + "epoch": 1.3885288399222295, + "grad_norm": 0.7857218384742737, + "learning_rate": 2.9212905013160784e-06, + "loss": 0.1123, + "step": 4285 + }, + { + "epoch": 1.388852883992223, + "grad_norm": 0.7229386568069458, + "learning_rate": 2.920428375226163e-06, + "loss": 0.1031, + "step": 4286 + }, + { + "epoch": 1.3891769280622164, + "grad_norm": 0.7687814831733704, + "learning_rate": 2.9195661976798838e-06, + "loss": 0.1049, + "step": 4287 + }, + { + "epoch": 1.38950097213221, + "grad_norm": 0.8102664947509766, + "learning_rate": 2.918703968782764e-06, + "loss": 0.1169, + "step": 4288 + }, + { + "epoch": 1.3898250162022034, + "grad_norm": 0.8313232660293579, + "learning_rate": 2.9178416886403318e-06, + "loss": 0.1241, + "step": 4289 + }, + { + "epoch": 1.390149060272197, + "grad_norm": 0.76463782787323, + "learning_rate": 2.916979357358121e-06, + "loss": 0.1158, + "step": 4290 + }, + { + "epoch": 1.3904731043421905, + "grad_norm": 0.8194743394851685, + "learning_rate": 2.9161169750416746e-06, + "loss": 0.1194, + "step": 4291 + }, + { + "epoch": 1.390797148412184, + "grad_norm": 0.7722929120063782, + "learning_rate": 2.915254541796539e-06, + "loss": 0.113, + "step": 4292 + }, + { + "epoch": 1.3911211924821776, + "grad_norm": 0.8249363303184509, + "learning_rate": 2.914392057728267e-06, + "loss": 0.1223, + "step": 4293 + }, + { + "epoch": 1.3914452365521712, + "grad_norm": 0.8252852559089661, + "learning_rate": 2.913529522942418e-06, + "loss": 0.1171, + "step": 4294 + }, + { + "epoch": 1.3917692806221647, + "grad_norm": 0.795011043548584, + "learning_rate": 2.9126669375445595e-06, + "loss": 0.1163, + "step": 4295 + }, + { + "epoch": 1.392093324692158, + "grad_norm": 0.8001385927200317, + "learning_rate": 2.911804301640263e-06, + "loss": 0.1222, + "step": 4296 + }, + { + "epoch": 1.3924173687621517, + "grad_norm": 0.8127326369285583, + "learning_rate": 2.910941615335106e-06, + "loss": 0.124, + "step": 4297 + }, + { + "epoch": 1.3927414128321451, + "grad_norm": 0.851428747177124, + "learning_rate": 2.9100788787346746e-06, + "loss": 0.12, + "step": 4298 + }, + { + "epoch": 1.3930654569021388, + "grad_norm": 0.8312917947769165, + "learning_rate": 2.9092160919445566e-06, + "loss": 0.1216, + "step": 4299 + }, + { + "epoch": 1.3933895009721322, + "grad_norm": 0.7327541708946228, + "learning_rate": 2.9083532550703515e-06, + "loss": 0.1113, + "step": 4300 + }, + { + "epoch": 1.3937135450421256, + "grad_norm": 0.8703411221504211, + "learning_rate": 2.9074903682176607e-06, + "loss": 0.1268, + "step": 4301 + }, + { + "epoch": 1.3940375891121193, + "grad_norm": 0.8300853967666626, + "learning_rate": 2.906627431492094e-06, + "loss": 0.1112, + "step": 4302 + }, + { + "epoch": 1.3943616331821127, + "grad_norm": 0.8130788207054138, + "learning_rate": 2.9057644449992655e-06, + "loss": 0.1144, + "step": 4303 + }, + { + "epoch": 1.3946856772521063, + "grad_norm": 0.8016307353973389, + "learning_rate": 2.904901408844798e-06, + "loss": 0.1188, + "step": 4304 + }, + { + "epoch": 1.3950097213220998, + "grad_norm": 0.7863853573799133, + "learning_rate": 2.9040383231343173e-06, + "loss": 0.1105, + "step": 4305 + }, + { + "epoch": 1.3953337653920932, + "grad_norm": 0.8026547431945801, + "learning_rate": 2.903175187973457e-06, + "loss": 0.1144, + "step": 4306 + }, + { + "epoch": 1.3956578094620868, + "grad_norm": 0.8447574973106384, + "learning_rate": 2.9023120034678575e-06, + "loss": 0.132, + "step": 4307 + }, + { + "epoch": 1.3959818535320805, + "grad_norm": 0.8244043588638306, + "learning_rate": 2.901448769723163e-06, + "loss": 0.1069, + "step": 4308 + }, + { + "epoch": 1.396305897602074, + "grad_norm": 0.8813815116882324, + "learning_rate": 2.900585486845026e-06, + "loss": 0.127, + "step": 4309 + }, + { + "epoch": 1.3966299416720673, + "grad_norm": 0.8630741834640503, + "learning_rate": 2.8997221549391025e-06, + "loss": 0.1239, + "step": 4310 + }, + { + "epoch": 1.396953985742061, + "grad_norm": 0.75046706199646, + "learning_rate": 2.8988587741110575e-06, + "loss": 0.0973, + "step": 4311 + }, + { + "epoch": 1.3972780298120544, + "grad_norm": 0.7843610048294067, + "learning_rate": 2.8979953444665585e-06, + "loss": 0.1046, + "step": 4312 + }, + { + "epoch": 1.397602073882048, + "grad_norm": 0.8384765386581421, + "learning_rate": 2.8971318661112836e-06, + "loss": 0.1163, + "step": 4313 + }, + { + "epoch": 1.3979261179520415, + "grad_norm": 0.8430674076080322, + "learning_rate": 2.896268339150912e-06, + "loss": 0.114, + "step": 4314 + }, + { + "epoch": 1.398250162022035, + "grad_norm": 0.8547707200050354, + "learning_rate": 2.895404763691132e-06, + "loss": 0.119, + "step": 4315 + }, + { + "epoch": 1.3985742060920285, + "grad_norm": 0.7359633445739746, + "learning_rate": 2.894541139837638e-06, + "loss": 0.1078, + "step": 4316 + }, + { + "epoch": 1.398898250162022, + "grad_norm": 0.8550858497619629, + "learning_rate": 2.8936774676961264e-06, + "loss": 0.1228, + "step": 4317 + }, + { + "epoch": 1.3992222942320156, + "grad_norm": 0.8657211661338806, + "learning_rate": 2.892813747372305e-06, + "loss": 0.1265, + "step": 4318 + }, + { + "epoch": 1.399546338302009, + "grad_norm": 0.8944101929664612, + "learning_rate": 2.891949978971883e-06, + "loss": 0.1146, + "step": 4319 + }, + { + "epoch": 1.3998703823720027, + "grad_norm": 0.7339831590652466, + "learning_rate": 2.8910861626005774e-06, + "loss": 0.0996, + "step": 4320 + }, + { + "epoch": 1.400194426441996, + "grad_norm": 0.839537501335144, + "learning_rate": 2.890222298364112e-06, + "loss": 0.1263, + "step": 4321 + }, + { + "epoch": 1.4005184705119897, + "grad_norm": 0.7799286246299744, + "learning_rate": 2.8893583863682157e-06, + "loss": 0.1191, + "step": 4322 + }, + { + "epoch": 1.4008425145819832, + "grad_norm": 0.9313471913337708, + "learning_rate": 2.888494426718621e-06, + "loss": 0.1292, + "step": 4323 + }, + { + "epoch": 1.4011665586519766, + "grad_norm": 0.8260443210601807, + "learning_rate": 2.8876304195210697e-06, + "loss": 0.1134, + "step": 4324 + }, + { + "epoch": 1.4014906027219702, + "grad_norm": 0.8529420495033264, + "learning_rate": 2.8867663648813077e-06, + "loss": 0.1257, + "step": 4325 + }, + { + "epoch": 1.4018146467919637, + "grad_norm": 0.8417792916297913, + "learning_rate": 2.885902262905087e-06, + "loss": 0.1114, + "step": 4326 + }, + { + "epoch": 1.4021386908619573, + "grad_norm": 0.8513123393058777, + "learning_rate": 2.885038113698165e-06, + "loss": 0.126, + "step": 4327 + }, + { + "epoch": 1.4024627349319507, + "grad_norm": 0.8104535341262817, + "learning_rate": 2.8841739173663057e-06, + "loss": 0.1161, + "step": 4328 + }, + { + "epoch": 1.4027867790019442, + "grad_norm": 0.7754889130592346, + "learning_rate": 2.883309674015278e-06, + "loss": 0.107, + "step": 4329 + }, + { + "epoch": 1.4031108230719378, + "grad_norm": 0.8020490407943726, + "learning_rate": 2.8824453837508563e-06, + "loss": 0.1122, + "step": 4330 + }, + { + "epoch": 1.4034348671419314, + "grad_norm": 0.8663510680198669, + "learning_rate": 2.8815810466788225e-06, + "loss": 0.1237, + "step": 4331 + }, + { + "epoch": 1.4037589112119249, + "grad_norm": 0.9045275449752808, + "learning_rate": 2.8807166629049623e-06, + "loss": 0.132, + "step": 4332 + }, + { + "epoch": 1.4040829552819183, + "grad_norm": 0.7936350703239441, + "learning_rate": 2.8798522325350683e-06, + "loss": 0.1106, + "step": 4333 + }, + { + "epoch": 1.404406999351912, + "grad_norm": 0.8123335242271423, + "learning_rate": 2.8789877556749383e-06, + "loss": 0.107, + "step": 4334 + }, + { + "epoch": 1.4047310434219054, + "grad_norm": 0.8923666477203369, + "learning_rate": 2.8781232324303758e-06, + "loss": 0.1217, + "step": 4335 + }, + { + "epoch": 1.405055087491899, + "grad_norm": 0.7928508520126343, + "learning_rate": 2.8772586629071902e-06, + "loss": 0.1119, + "step": 4336 + }, + { + "epoch": 1.4053791315618924, + "grad_norm": 0.7635928392410278, + "learning_rate": 2.876394047211196e-06, + "loss": 0.1124, + "step": 4337 + }, + { + "epoch": 1.4057031756318858, + "grad_norm": 0.861503005027771, + "learning_rate": 2.875529385448215e-06, + "loss": 0.1146, + "step": 4338 + }, + { + "epoch": 1.4060272197018795, + "grad_norm": 0.8829136490821838, + "learning_rate": 2.8746646777240724e-06, + "loss": 0.1275, + "step": 4339 + }, + { + "epoch": 1.406351263771873, + "grad_norm": 0.8503878712654114, + "learning_rate": 2.8737999241446e-06, + "loss": 0.1244, + "step": 4340 + }, + { + "epoch": 1.4066753078418666, + "grad_norm": 0.7719369530677795, + "learning_rate": 2.8729351248156364e-06, + "loss": 0.1028, + "step": 4341 + }, + { + "epoch": 1.40699935191186, + "grad_norm": 0.8596400022506714, + "learning_rate": 2.872070279843023e-06, + "loss": 0.1246, + "step": 4342 + }, + { + "epoch": 1.4073233959818534, + "grad_norm": 0.8827523589134216, + "learning_rate": 2.8712053893326088e-06, + "loss": 0.1285, + "step": 4343 + }, + { + "epoch": 1.407647440051847, + "grad_norm": 0.8571576476097107, + "learning_rate": 2.8703404533902492e-06, + "loss": 0.1292, + "step": 4344 + }, + { + "epoch": 1.4079714841218407, + "grad_norm": 0.7802562713623047, + "learning_rate": 2.8694754721218027e-06, + "loss": 0.1114, + "step": 4345 + }, + { + "epoch": 1.4082955281918341, + "grad_norm": 0.8333742022514343, + "learning_rate": 2.8686104456331356e-06, + "loss": 0.108, + "step": 4346 + }, + { + "epoch": 1.4086195722618275, + "grad_norm": 0.7979025840759277, + "learning_rate": 2.8677453740301185e-06, + "loss": 0.1129, + "step": 4347 + }, + { + "epoch": 1.4089436163318212, + "grad_norm": 0.8137672543525696, + "learning_rate": 2.8668802574186277e-06, + "loss": 0.1217, + "step": 4348 + }, + { + "epoch": 1.4092676604018146, + "grad_norm": 0.8151755332946777, + "learning_rate": 2.8660150959045456e-06, + "loss": 0.1226, + "step": 4349 + }, + { + "epoch": 1.4095917044718083, + "grad_norm": 0.8417986631393433, + "learning_rate": 2.865149889593758e-06, + "loss": 0.1133, + "step": 4350 + }, + { + "epoch": 1.4099157485418017, + "grad_norm": 0.8378992676734924, + "learning_rate": 2.8642846385921593e-06, + "loss": 0.1168, + "step": 4351 + }, + { + "epoch": 1.410239792611795, + "grad_norm": 0.8362653255462646, + "learning_rate": 2.863419343005647e-06, + "loss": 0.1181, + "step": 4352 + }, + { + "epoch": 1.4105638366817888, + "grad_norm": 0.8783177137374878, + "learning_rate": 2.8625540029401262e-06, + "loss": 0.126, + "step": 4353 + }, + { + "epoch": 1.4108878807517822, + "grad_norm": 0.8419884443283081, + "learning_rate": 2.8616886185015046e-06, + "loss": 0.1275, + "step": 4354 + }, + { + "epoch": 1.4112119248217758, + "grad_norm": 0.8621551394462585, + "learning_rate": 2.860823189795697e-06, + "loss": 0.1217, + "step": 4355 + }, + { + "epoch": 1.4115359688917692, + "grad_norm": 0.8619939684867859, + "learning_rate": 2.859957716928625e-06, + "loss": 0.1229, + "step": 4356 + }, + { + "epoch": 1.4118600129617627, + "grad_norm": 0.7918229699134827, + "learning_rate": 2.8590922000062125e-06, + "loss": 0.1133, + "step": 4357 + }, + { + "epoch": 1.4121840570317563, + "grad_norm": 0.8201887011528015, + "learning_rate": 2.858226639134391e-06, + "loss": 0.114, + "step": 4358 + }, + { + "epoch": 1.41250810110175, + "grad_norm": 0.8521756529808044, + "learning_rate": 2.8573610344190978e-06, + "loss": 0.1302, + "step": 4359 + }, + { + "epoch": 1.4128321451717434, + "grad_norm": 0.8289884328842163, + "learning_rate": 2.8564953859662725e-06, + "loss": 0.1152, + "step": 4360 + }, + { + "epoch": 1.4131561892417368, + "grad_norm": 0.8881711959838867, + "learning_rate": 2.8556296938818632e-06, + "loss": 0.1246, + "step": 4361 + }, + { + "epoch": 1.4134802333117304, + "grad_norm": 0.7568867206573486, + "learning_rate": 2.8547639582718223e-06, + "loss": 0.1027, + "step": 4362 + }, + { + "epoch": 1.4138042773817239, + "grad_norm": 0.8158787488937378, + "learning_rate": 2.853898179242107e-06, + "loss": 0.1159, + "step": 4363 + }, + { + "epoch": 1.4141283214517175, + "grad_norm": 0.7525155544281006, + "learning_rate": 2.8530323568986805e-06, + "loss": 0.1044, + "step": 4364 + }, + { + "epoch": 1.414452365521711, + "grad_norm": 0.8484021425247192, + "learning_rate": 2.8521664913475123e-06, + "loss": 0.1192, + "step": 4365 + }, + { + "epoch": 1.4147764095917044, + "grad_norm": 0.7542396783828735, + "learning_rate": 2.8513005826945733e-06, + "loss": 0.1086, + "step": 4366 + }, + { + "epoch": 1.415100453661698, + "grad_norm": 0.7980123162269592, + "learning_rate": 2.8504346310458446e-06, + "loss": 0.1173, + "step": 4367 + }, + { + "epoch": 1.4154244977316914, + "grad_norm": 0.8765463829040527, + "learning_rate": 2.8495686365073096e-06, + "loss": 0.1198, + "step": 4368 + }, + { + "epoch": 1.415748541801685, + "grad_norm": 0.7539858818054199, + "learning_rate": 2.848702599184957e-06, + "loss": 0.1024, + "step": 4369 + }, + { + "epoch": 1.4160725858716785, + "grad_norm": 0.7840802073478699, + "learning_rate": 2.8478365191847824e-06, + "loss": 0.1206, + "step": 4370 + }, + { + "epoch": 1.4163966299416721, + "grad_norm": 0.8659231066703796, + "learning_rate": 2.8469703966127853e-06, + "loss": 0.1203, + "step": 4371 + }, + { + "epoch": 1.4167206740116656, + "grad_norm": 0.7570154070854187, + "learning_rate": 2.8461042315749706e-06, + "loss": 0.1065, + "step": 4372 + }, + { + "epoch": 1.4170447180816592, + "grad_norm": 0.8071007132530212, + "learning_rate": 2.845238024177348e-06, + "loss": 0.1171, + "step": 4373 + }, + { + "epoch": 1.4173687621516526, + "grad_norm": 0.8437150716781616, + "learning_rate": 2.8443717745259335e-06, + "loss": 0.1185, + "step": 4374 + }, + { + "epoch": 1.417692806221646, + "grad_norm": 0.8691065907478333, + "learning_rate": 2.8435054827267476e-06, + "loss": 0.125, + "step": 4375 + }, + { + "epoch": 1.4180168502916397, + "grad_norm": 0.8441025614738464, + "learning_rate": 2.8426391488858163e-06, + "loss": 0.1209, + "step": 4376 + }, + { + "epoch": 1.4183408943616331, + "grad_norm": 0.8875846266746521, + "learning_rate": 2.8417727731091705e-06, + "loss": 0.1258, + "step": 4377 + }, + { + "epoch": 1.4186649384316268, + "grad_norm": 0.8732943534851074, + "learning_rate": 2.840906355502845e-06, + "loss": 0.1215, + "step": 4378 + }, + { + "epoch": 1.4189889825016202, + "grad_norm": 0.826084554195404, + "learning_rate": 2.840039896172882e-06, + "loss": 0.1104, + "step": 4379 + }, + { + "epoch": 1.4193130265716136, + "grad_norm": 0.8853211402893066, + "learning_rate": 2.8391733952253277e-06, + "loss": 0.1232, + "step": 4380 + }, + { + "epoch": 1.4196370706416073, + "grad_norm": 0.8420249223709106, + "learning_rate": 2.838306852766234e-06, + "loss": 0.118, + "step": 4381 + }, + { + "epoch": 1.419961114711601, + "grad_norm": 0.8085249662399292, + "learning_rate": 2.8374402689016557e-06, + "loss": 0.1096, + "step": 4382 + }, + { + "epoch": 1.4202851587815943, + "grad_norm": 0.8482363224029541, + "learning_rate": 2.8365736437376555e-06, + "loss": 0.1234, + "step": 4383 + }, + { + "epoch": 1.4206092028515878, + "grad_norm": 0.857765257358551, + "learning_rate": 2.8357069773802996e-06, + "loss": 0.1207, + "step": 4384 + }, + { + "epoch": 1.4209332469215814, + "grad_norm": 0.8329556584358215, + "learning_rate": 2.834840269935659e-06, + "loss": 0.1166, + "step": 4385 + }, + { + "epoch": 1.4212572909915748, + "grad_norm": 0.8334675431251526, + "learning_rate": 2.833973521509812e-06, + "loss": 0.123, + "step": 4386 + }, + { + "epoch": 1.4215813350615685, + "grad_norm": 0.7916951179504395, + "learning_rate": 2.833106732208838e-06, + "loss": 0.1183, + "step": 4387 + }, + { + "epoch": 1.421905379131562, + "grad_norm": 0.8398131132125854, + "learning_rate": 2.8322399021388248e-06, + "loss": 0.118, + "step": 4388 + }, + { + "epoch": 1.4222294232015553, + "grad_norm": 0.8872308731079102, + "learning_rate": 2.8313730314058645e-06, + "loss": 0.126, + "step": 4389 + }, + { + "epoch": 1.422553467271549, + "grad_norm": 0.8836546540260315, + "learning_rate": 2.830506120116053e-06, + "loss": 0.1261, + "step": 4390 + }, + { + "epoch": 1.4228775113415424, + "grad_norm": 0.8260588049888611, + "learning_rate": 2.8296391683754916e-06, + "loss": 0.1191, + "step": 4391 + }, + { + "epoch": 1.423201555411536, + "grad_norm": 0.814530611038208, + "learning_rate": 2.8287721762902877e-06, + "loss": 0.1125, + "step": 4392 + }, + { + "epoch": 1.4235255994815295, + "grad_norm": 0.8176182508468628, + "learning_rate": 2.8279051439665516e-06, + "loss": 0.1226, + "step": 4393 + }, + { + "epoch": 1.4238496435515229, + "grad_norm": 0.8547371029853821, + "learning_rate": 2.8270380715104e-06, + "loss": 0.1196, + "step": 4394 + }, + { + "epoch": 1.4241736876215165, + "grad_norm": 0.8309570550918579, + "learning_rate": 2.826170959027956e-06, + "loss": 0.1123, + "step": 4395 + }, + { + "epoch": 1.4244977316915102, + "grad_norm": 0.8684561848640442, + "learning_rate": 2.8253038066253423e-06, + "loss": 0.1257, + "step": 4396 + }, + { + "epoch": 1.4248217757615036, + "grad_norm": 0.8328901529312134, + "learning_rate": 2.8244366144086926e-06, + "loss": 0.1136, + "step": 4397 + }, + { + "epoch": 1.425145819831497, + "grad_norm": 0.8921581506729126, + "learning_rate": 2.823569382484142e-06, + "loss": 0.1238, + "step": 4398 + }, + { + "epoch": 1.4254698639014907, + "grad_norm": 0.8457754850387573, + "learning_rate": 2.822702110957831e-06, + "loss": 0.114, + "step": 4399 + }, + { + "epoch": 1.425793907971484, + "grad_norm": 0.8151159882545471, + "learning_rate": 2.8218347999359066e-06, + "loss": 0.1187, + "step": 4400 + }, + { + "epoch": 1.4261179520414777, + "grad_norm": 0.8689265251159668, + "learning_rate": 2.8209674495245177e-06, + "loss": 0.1183, + "step": 4401 + }, + { + "epoch": 1.4264419961114712, + "grad_norm": 0.8540447950363159, + "learning_rate": 2.82010005982982e-06, + "loss": 0.1209, + "step": 4402 + }, + { + "epoch": 1.4267660401814646, + "grad_norm": 0.9191943407058716, + "learning_rate": 2.819232630957975e-06, + "loss": 0.1325, + "step": 4403 + }, + { + "epoch": 1.4270900842514582, + "grad_norm": 0.7965993881225586, + "learning_rate": 2.818365163015145e-06, + "loss": 0.1152, + "step": 4404 + }, + { + "epoch": 1.4274141283214516, + "grad_norm": 0.8361888527870178, + "learning_rate": 2.8174976561075013e-06, + "loss": 0.1224, + "step": 4405 + }, + { + "epoch": 1.4277381723914453, + "grad_norm": 0.7920622229576111, + "learning_rate": 2.816630110341218e-06, + "loss": 0.1131, + "step": 4406 + }, + { + "epoch": 1.4280622164614387, + "grad_norm": 0.8130832314491272, + "learning_rate": 2.8157625258224746e-06, + "loss": 0.1178, + "step": 4407 + }, + { + "epoch": 1.4283862605314321, + "grad_norm": 0.7392692565917969, + "learning_rate": 2.814894902657456e-06, + "loss": 0.103, + "step": 4408 + }, + { + "epoch": 1.4287103046014258, + "grad_norm": 0.8898904919624329, + "learning_rate": 2.814027240952348e-06, + "loss": 0.1238, + "step": 4409 + }, + { + "epoch": 1.4290343486714194, + "grad_norm": 0.8473485708236694, + "learning_rate": 2.8131595408133467e-06, + "loss": 0.1141, + "step": 4410 + }, + { + "epoch": 1.4293583927414129, + "grad_norm": 0.8053525686264038, + "learning_rate": 2.8122918023466485e-06, + "loss": 0.1173, + "step": 4411 + }, + { + "epoch": 1.4296824368114063, + "grad_norm": 0.8200134634971619, + "learning_rate": 2.811424025658458e-06, + "loss": 0.1042, + "step": 4412 + }, + { + "epoch": 1.4300064808814, + "grad_norm": 0.9330074191093445, + "learning_rate": 2.8105562108549807e-06, + "loss": 0.13, + "step": 4413 + }, + { + "epoch": 1.4303305249513933, + "grad_norm": 0.8630911111831665, + "learning_rate": 2.80968835804243e-06, + "loss": 0.1167, + "step": 4414 + }, + { + "epoch": 1.430654569021387, + "grad_norm": 0.8359479904174805, + "learning_rate": 2.808820467327022e-06, + "loss": 0.1221, + "step": 4415 + }, + { + "epoch": 1.4309786130913804, + "grad_norm": 0.8438312411308289, + "learning_rate": 2.8079525388149787e-06, + "loss": 0.1268, + "step": 4416 + }, + { + "epoch": 1.4313026571613738, + "grad_norm": 0.8100758790969849, + "learning_rate": 2.8070845726125257e-06, + "loss": 0.1063, + "step": 4417 + }, + { + "epoch": 1.4316267012313675, + "grad_norm": 0.9410319924354553, + "learning_rate": 2.8062165688258934e-06, + "loss": 0.1308, + "step": 4418 + }, + { + "epoch": 1.431950745301361, + "grad_norm": 0.7617793083190918, + "learning_rate": 2.8053485275613177e-06, + "loss": 0.1006, + "step": 4419 + }, + { + "epoch": 1.4322747893713546, + "grad_norm": 0.8048340082168579, + "learning_rate": 2.804480448925039e-06, + "loss": 0.1155, + "step": 4420 + }, + { + "epoch": 1.432598833441348, + "grad_norm": 0.7600870728492737, + "learning_rate": 2.8036123330233e-06, + "loss": 0.1066, + "step": 4421 + }, + { + "epoch": 1.4329228775113416, + "grad_norm": 0.8405733704566956, + "learning_rate": 2.802744179962351e-06, + "loss": 0.1167, + "step": 4422 + }, + { + "epoch": 1.433246921581335, + "grad_norm": 0.7904203534126282, + "learning_rate": 2.801875989848446e-06, + "loss": 0.1155, + "step": 4423 + }, + { + "epoch": 1.4335709656513287, + "grad_norm": 0.8001915812492371, + "learning_rate": 2.8010077627878414e-06, + "loss": 0.1098, + "step": 4424 + }, + { + "epoch": 1.4338950097213221, + "grad_norm": 0.8362448811531067, + "learning_rate": 2.8001394988868003e-06, + "loss": 0.1179, + "step": 4425 + }, + { + "epoch": 1.4342190537913155, + "grad_norm": 0.7693252563476562, + "learning_rate": 2.7992711982515908e-06, + "loss": 0.1128, + "step": 4426 + }, + { + "epoch": 1.4345430978613092, + "grad_norm": 0.813224732875824, + "learning_rate": 2.798402860988483e-06, + "loss": 0.1061, + "step": 4427 + }, + { + "epoch": 1.4348671419313026, + "grad_norm": 0.8760613799095154, + "learning_rate": 2.797534487203755e-06, + "loss": 0.1146, + "step": 4428 + }, + { + "epoch": 1.4351911860012962, + "grad_norm": 0.8515808582305908, + "learning_rate": 2.7966660770036845e-06, + "loss": 0.1285, + "step": 4429 + }, + { + "epoch": 1.4355152300712897, + "grad_norm": 0.8039884567260742, + "learning_rate": 2.795797630494559e-06, + "loss": 0.1116, + "step": 4430 + }, + { + "epoch": 1.435839274141283, + "grad_norm": 0.8766940832138062, + "learning_rate": 2.7949291477826666e-06, + "loss": 0.1156, + "step": 4431 + }, + { + "epoch": 1.4361633182112767, + "grad_norm": 0.7784311175346375, + "learning_rate": 2.7940606289743026e-06, + "loss": 0.1047, + "step": 4432 + }, + { + "epoch": 1.4364873622812704, + "grad_norm": 0.8435165882110596, + "learning_rate": 2.793192074175764e-06, + "loss": 0.1232, + "step": 4433 + }, + { + "epoch": 1.4368114063512638, + "grad_norm": 0.8447853326797485, + "learning_rate": 2.792323483493354e-06, + "loss": 0.1173, + "step": 4434 + }, + { + "epoch": 1.4371354504212572, + "grad_norm": 0.7999764680862427, + "learning_rate": 2.791454857033379e-06, + "loss": 0.1116, + "step": 4435 + }, + { + "epoch": 1.4374594944912509, + "grad_norm": 0.8022122979164124, + "learning_rate": 2.790586194902151e-06, + "loss": 0.1163, + "step": 4436 + }, + { + "epoch": 1.4377835385612443, + "grad_norm": 0.7947648763656616, + "learning_rate": 2.789717497205986e-06, + "loss": 0.1068, + "step": 4437 + }, + { + "epoch": 1.438107582631238, + "grad_norm": 0.8844077587127686, + "learning_rate": 2.7888487640512046e-06, + "loss": 0.1274, + "step": 4438 + }, + { + "epoch": 1.4384316267012314, + "grad_norm": 0.7734432220458984, + "learning_rate": 2.78797999554413e-06, + "loss": 0.1074, + "step": 4439 + }, + { + "epoch": 1.4387556707712248, + "grad_norm": 0.8220635056495667, + "learning_rate": 2.787111191791092e-06, + "loss": 0.1178, + "step": 4440 + }, + { + "epoch": 1.4390797148412184, + "grad_norm": 0.7775998115539551, + "learning_rate": 2.7862423528984233e-06, + "loss": 0.1151, + "step": 4441 + }, + { + "epoch": 1.4394037589112119, + "grad_norm": 0.7471057772636414, + "learning_rate": 2.7853734789724618e-06, + "loss": 0.1044, + "step": 4442 + }, + { + "epoch": 1.4397278029812055, + "grad_norm": 0.9184688925743103, + "learning_rate": 2.7845045701195494e-06, + "loss": 0.1394, + "step": 4443 + }, + { + "epoch": 1.440051847051199, + "grad_norm": 0.800000011920929, + "learning_rate": 2.7836356264460316e-06, + "loss": 0.1167, + "step": 4444 + }, + { + "epoch": 1.4403758911211924, + "grad_norm": 0.8667371273040771, + "learning_rate": 2.7827666480582593e-06, + "loss": 0.1202, + "step": 4445 + }, + { + "epoch": 1.440699935191186, + "grad_norm": 0.7411370873451233, + "learning_rate": 2.7818976350625864e-06, + "loss": 0.0965, + "step": 4446 + }, + { + "epoch": 1.4410239792611796, + "grad_norm": 0.8143919706344604, + "learning_rate": 2.781028587565372e-06, + "loss": 0.1213, + "step": 4447 + }, + { + "epoch": 1.441348023331173, + "grad_norm": 0.8431907892227173, + "learning_rate": 2.780159505672979e-06, + "loss": 0.1196, + "step": 4448 + }, + { + "epoch": 1.4416720674011665, + "grad_norm": 0.7820329666137695, + "learning_rate": 2.7792903894917746e-06, + "loss": 0.1174, + "step": 4449 + }, + { + "epoch": 1.4419961114711601, + "grad_norm": 0.8205739855766296, + "learning_rate": 2.7784212391281307e-06, + "loss": 0.1138, + "step": 4450 + }, + { + "epoch": 1.4423201555411536, + "grad_norm": 0.9027796983718872, + "learning_rate": 2.7775520546884216e-06, + "loss": 0.1163, + "step": 4451 + }, + { + "epoch": 1.4426441996111472, + "grad_norm": 0.8040214776992798, + "learning_rate": 2.7766828362790283e-06, + "loss": 0.1204, + "step": 4452 + }, + { + "epoch": 1.4429682436811406, + "grad_norm": 0.8394964933395386, + "learning_rate": 2.7758135840063344e-06, + "loss": 0.1254, + "step": 4453 + }, + { + "epoch": 1.443292287751134, + "grad_norm": 0.8752254843711853, + "learning_rate": 2.7749442979767276e-06, + "loss": 0.1226, + "step": 4454 + }, + { + "epoch": 1.4436163318211277, + "grad_norm": 0.8331737518310547, + "learning_rate": 2.7740749782966016e-06, + "loss": 0.1139, + "step": 4455 + }, + { + "epoch": 1.4439403758911211, + "grad_norm": 0.8385717868804932, + "learning_rate": 2.7732056250723505e-06, + "loss": 0.1175, + "step": 4456 + }, + { + "epoch": 1.4442644199611148, + "grad_norm": 0.8438707590103149, + "learning_rate": 2.7723362384103757e-06, + "loss": 0.1213, + "step": 4457 + }, + { + "epoch": 1.4445884640311082, + "grad_norm": 0.8456485867500305, + "learning_rate": 2.771466818417082e-06, + "loss": 0.1257, + "step": 4458 + }, + { + "epoch": 1.4449125081011016, + "grad_norm": 0.7441908717155457, + "learning_rate": 2.7705973651988777e-06, + "loss": 0.1059, + "step": 4459 + }, + { + "epoch": 1.4452365521710953, + "grad_norm": 0.8134219646453857, + "learning_rate": 2.769727878862175e-06, + "loss": 0.1182, + "step": 4460 + }, + { + "epoch": 1.445560596241089, + "grad_norm": 0.8193073272705078, + "learning_rate": 2.768858359513392e-06, + "loss": 0.116, + "step": 4461 + }, + { + "epoch": 1.4458846403110823, + "grad_norm": 0.9124169945716858, + "learning_rate": 2.767988807258948e-06, + "loss": 0.1266, + "step": 4462 + }, + { + "epoch": 1.4462086843810757, + "grad_norm": 0.7838951945304871, + "learning_rate": 2.7671192222052685e-06, + "loss": 0.1128, + "step": 4463 + }, + { + "epoch": 1.4465327284510694, + "grad_norm": 0.8049332499504089, + "learning_rate": 2.7662496044587817e-06, + "loss": 0.1112, + "step": 4464 + }, + { + "epoch": 1.4468567725210628, + "grad_norm": 0.9545280933380127, + "learning_rate": 2.765379954125921e-06, + "loss": 0.1239, + "step": 4465 + }, + { + "epoch": 1.4471808165910565, + "grad_norm": 0.781912088394165, + "learning_rate": 2.764510271313123e-06, + "loss": 0.1115, + "step": 4466 + }, + { + "epoch": 1.4475048606610499, + "grad_norm": 0.8536630272865295, + "learning_rate": 2.7636405561268286e-06, + "loss": 0.1219, + "step": 4467 + }, + { + "epoch": 1.4478289047310433, + "grad_norm": 0.8723033666610718, + "learning_rate": 2.7627708086734827e-06, + "loss": 0.1229, + "step": 4468 + }, + { + "epoch": 1.448152948801037, + "grad_norm": 0.7570147514343262, + "learning_rate": 2.7619010290595333e-06, + "loss": 0.1005, + "step": 4469 + }, + { + "epoch": 1.4484769928710304, + "grad_norm": 0.806489109992981, + "learning_rate": 2.7610312173914334e-06, + "loss": 0.1101, + "step": 4470 + }, + { + "epoch": 1.448801036941024, + "grad_norm": 0.8391226530075073, + "learning_rate": 2.760161373775639e-06, + "loss": 0.1176, + "step": 4471 + }, + { + "epoch": 1.4491250810110174, + "grad_norm": 0.8136021494865417, + "learning_rate": 2.7592914983186113e-06, + "loss": 0.1177, + "step": 4472 + }, + { + "epoch": 1.449449125081011, + "grad_norm": 0.8229960203170776, + "learning_rate": 2.758421591126814e-06, + "loss": 0.1206, + "step": 4473 + }, + { + "epoch": 1.4497731691510045, + "grad_norm": 0.7568755745887756, + "learning_rate": 2.757551652306717e-06, + "loss": 0.1093, + "step": 4474 + }, + { + "epoch": 1.4500972132209982, + "grad_norm": 0.8672506213188171, + "learning_rate": 2.7566816819647897e-06, + "loss": 0.1248, + "step": 4475 + }, + { + "epoch": 1.4504212572909916, + "grad_norm": 0.774766206741333, + "learning_rate": 2.7558116802075095e-06, + "loss": 0.1124, + "step": 4476 + }, + { + "epoch": 1.450745301360985, + "grad_norm": 0.8504829406738281, + "learning_rate": 2.754941647141357e-06, + "loss": 0.1218, + "step": 4477 + }, + { + "epoch": 1.4510693454309787, + "grad_norm": 0.8221288919448853, + "learning_rate": 2.754071582872814e-06, + "loss": 0.1166, + "step": 4478 + }, + { + "epoch": 1.451393389500972, + "grad_norm": 0.8883230090141296, + "learning_rate": 2.753201487508369e-06, + "loss": 0.1232, + "step": 4479 + }, + { + "epoch": 1.4517174335709657, + "grad_norm": 0.8256147503852844, + "learning_rate": 2.7523313611545133e-06, + "loss": 0.114, + "step": 4480 + }, + { + "epoch": 1.4520414776409591, + "grad_norm": 0.8885819911956787, + "learning_rate": 2.7514612039177422e-06, + "loss": 0.1282, + "step": 4481 + }, + { + "epoch": 1.4523655217109526, + "grad_norm": 0.8273267149925232, + "learning_rate": 2.7505910159045534e-06, + "loss": 0.1132, + "step": 4482 + }, + { + "epoch": 1.4526895657809462, + "grad_norm": 0.8149267435073853, + "learning_rate": 2.74972079722145e-06, + "loss": 0.1222, + "step": 4483 + }, + { + "epoch": 1.4530136098509399, + "grad_norm": 0.8353701233863831, + "learning_rate": 2.7488505479749395e-06, + "loss": 0.1161, + "step": 4484 + }, + { + "epoch": 1.4533376539209333, + "grad_norm": 0.8027179837226868, + "learning_rate": 2.74798026827153e-06, + "loss": 0.116, + "step": 4485 + }, + { + "epoch": 1.4536616979909267, + "grad_norm": 0.8460054993629456, + "learning_rate": 2.747109958217737e-06, + "loss": 0.1116, + "step": 4486 + }, + { + "epoch": 1.4539857420609203, + "grad_norm": 0.8322781324386597, + "learning_rate": 2.746239617920077e-06, + "loss": 0.1108, + "step": 4487 + }, + { + "epoch": 1.4543097861309138, + "grad_norm": 0.7815658450126648, + "learning_rate": 2.745369247485072e-06, + "loss": 0.1023, + "step": 4488 + }, + { + "epoch": 1.4546338302009074, + "grad_norm": 0.7694299221038818, + "learning_rate": 2.7444988470192457e-06, + "loss": 0.11, + "step": 4489 + }, + { + "epoch": 1.4549578742709008, + "grad_norm": 0.8041089177131653, + "learning_rate": 2.743628416629128e-06, + "loss": 0.1098, + "step": 4490 + }, + { + "epoch": 1.4552819183408943, + "grad_norm": 0.7983806729316711, + "learning_rate": 2.7427579564212496e-06, + "loss": 0.1143, + "step": 4491 + }, + { + "epoch": 1.455605962410888, + "grad_norm": 0.7699452638626099, + "learning_rate": 2.7418874665021483e-06, + "loss": 0.1135, + "step": 4492 + }, + { + "epoch": 1.4559300064808813, + "grad_norm": 0.8244098424911499, + "learning_rate": 2.7410169469783632e-06, + "loss": 0.1186, + "step": 4493 + }, + { + "epoch": 1.456254050550875, + "grad_norm": 0.77878338098526, + "learning_rate": 2.7401463979564365e-06, + "loss": 0.1096, + "step": 4494 + }, + { + "epoch": 1.4565780946208684, + "grad_norm": 0.8096309304237366, + "learning_rate": 2.7392758195429153e-06, + "loss": 0.1079, + "step": 4495 + }, + { + "epoch": 1.4569021386908618, + "grad_norm": 0.7888034582138062, + "learning_rate": 2.73840521184435e-06, + "loss": 0.1149, + "step": 4496 + }, + { + "epoch": 1.4572261827608555, + "grad_norm": 0.8182387351989746, + "learning_rate": 2.737534574967295e-06, + "loss": 0.1187, + "step": 4497 + }, + { + "epoch": 1.4575502268308491, + "grad_norm": 0.8907278776168823, + "learning_rate": 2.7366639090183076e-06, + "loss": 0.1335, + "step": 4498 + }, + { + "epoch": 1.4578742709008425, + "grad_norm": 0.8672367334365845, + "learning_rate": 2.7357932141039494e-06, + "loss": 0.1185, + "step": 4499 + }, + { + "epoch": 1.458198314970836, + "grad_norm": 0.8519613146781921, + "learning_rate": 2.7349224903307836e-06, + "loss": 0.1175, + "step": 4500 + }, + { + "epoch": 1.4585223590408296, + "grad_norm": 0.8108680844306946, + "learning_rate": 2.734051737805379e-06, + "loss": 0.1129, + "step": 4501 + }, + { + "epoch": 1.458846403110823, + "grad_norm": 0.8352702260017395, + "learning_rate": 2.733180956634308e-06, + "loss": 0.1165, + "step": 4502 + }, + { + "epoch": 1.4591704471808167, + "grad_norm": 0.8059620261192322, + "learning_rate": 2.7323101469241454e-06, + "loss": 0.1156, + "step": 4503 + }, + { + "epoch": 1.45949449125081, + "grad_norm": 0.8234224319458008, + "learning_rate": 2.7314393087814693e-06, + "loss": 0.1128, + "step": 4504 + }, + { + "epoch": 1.4598185353208035, + "grad_norm": 0.7840672135353088, + "learning_rate": 2.7305684423128633e-06, + "loss": 0.1067, + "step": 4505 + }, + { + "epoch": 1.4601425793907972, + "grad_norm": 0.8008546233177185, + "learning_rate": 2.729697547624911e-06, + "loss": 0.1062, + "step": 4506 + }, + { + "epoch": 1.4604666234607906, + "grad_norm": 0.8446060419082642, + "learning_rate": 2.7288266248242025e-06, + "loss": 0.1157, + "step": 4507 + }, + { + "epoch": 1.4607906675307842, + "grad_norm": 0.7840669751167297, + "learning_rate": 2.7279556740173306e-06, + "loss": 0.1023, + "step": 4508 + }, + { + "epoch": 1.4611147116007777, + "grad_norm": 0.8466615080833435, + "learning_rate": 2.7270846953108913e-06, + "loss": 0.1214, + "step": 4509 + }, + { + "epoch": 1.4614387556707713, + "grad_norm": 0.8785419464111328, + "learning_rate": 2.7262136888114833e-06, + "loss": 0.1154, + "step": 4510 + }, + { + "epoch": 1.4617627997407647, + "grad_norm": 0.7884746789932251, + "learning_rate": 2.72534265462571e-06, + "loss": 0.1046, + "step": 4511 + }, + { + "epoch": 1.4620868438107584, + "grad_norm": 0.8650237321853638, + "learning_rate": 2.7244715928601774e-06, + "loss": 0.1162, + "step": 4512 + }, + { + "epoch": 1.4624108878807518, + "grad_norm": 0.7944271564483643, + "learning_rate": 2.723600503621494e-06, + "loss": 0.116, + "step": 4513 + }, + { + "epoch": 1.4627349319507452, + "grad_norm": 0.8509773015975952, + "learning_rate": 2.7227293870162742e-06, + "loss": 0.1162, + "step": 4514 + }, + { + "epoch": 1.4630589760207389, + "grad_norm": 0.8748196959495544, + "learning_rate": 2.721858243151133e-06, + "loss": 0.1258, + "step": 4515 + }, + { + "epoch": 1.4633830200907323, + "grad_norm": 0.7907067537307739, + "learning_rate": 2.7209870721326915e-06, + "loss": 0.1078, + "step": 4516 + }, + { + "epoch": 1.463707064160726, + "grad_norm": 0.8583985567092896, + "learning_rate": 2.7201158740675714e-06, + "loss": 0.1173, + "step": 4517 + }, + { + "epoch": 1.4640311082307194, + "grad_norm": 0.8395696878433228, + "learning_rate": 2.719244649062399e-06, + "loss": 0.1142, + "step": 4518 + }, + { + "epoch": 1.4643551523007128, + "grad_norm": 0.8139791488647461, + "learning_rate": 2.718373397223804e-06, + "loss": 0.111, + "step": 4519 + }, + { + "epoch": 1.4646791963707064, + "grad_norm": 0.8256497979164124, + "learning_rate": 2.71750211865842e-06, + "loss": 0.1076, + "step": 4520 + }, + { + "epoch": 1.4650032404407, + "grad_norm": 0.8206305503845215, + "learning_rate": 2.7166308134728814e-06, + "loss": 0.1139, + "step": 4521 + }, + { + "epoch": 1.4653272845106935, + "grad_norm": 0.845827043056488, + "learning_rate": 2.715759481773828e-06, + "loss": 0.115, + "step": 4522 + }, + { + "epoch": 1.465651328580687, + "grad_norm": 0.7799459099769592, + "learning_rate": 2.7148881236679035e-06, + "loss": 0.1124, + "step": 4523 + }, + { + "epoch": 1.4659753726506806, + "grad_norm": 0.8159389495849609, + "learning_rate": 2.7140167392617527e-06, + "loss": 0.122, + "step": 4524 + }, + { + "epoch": 1.466299416720674, + "grad_norm": 0.8140308260917664, + "learning_rate": 2.7131453286620253e-06, + "loss": 0.1178, + "step": 4525 + }, + { + "epoch": 1.4666234607906676, + "grad_norm": 0.854189932346344, + "learning_rate": 2.712273891975372e-06, + "loss": 0.1239, + "step": 4526 + }, + { + "epoch": 1.466947504860661, + "grad_norm": 0.7412453889846802, + "learning_rate": 2.7114024293084502e-06, + "loss": 0.0979, + "step": 4527 + }, + { + "epoch": 1.4672715489306545, + "grad_norm": 0.8710846900939941, + "learning_rate": 2.710530940767917e-06, + "loss": 0.1207, + "step": 4528 + }, + { + "epoch": 1.4675955930006481, + "grad_norm": 0.7715765237808228, + "learning_rate": 2.7096594264604357e-06, + "loss": 0.1087, + "step": 4529 + }, + { + "epoch": 1.4679196370706415, + "grad_norm": 0.8217405080795288, + "learning_rate": 2.7087878864926696e-06, + "loss": 0.1121, + "step": 4530 + }, + { + "epoch": 1.4682436811406352, + "grad_norm": 0.8400027751922607, + "learning_rate": 2.707916320971288e-06, + "loss": 0.1123, + "step": 4531 + }, + { + "epoch": 1.4685677252106286, + "grad_norm": 0.8444027304649353, + "learning_rate": 2.7070447300029607e-06, + "loss": 0.1208, + "step": 4532 + }, + { + "epoch": 1.468891769280622, + "grad_norm": 0.8026435375213623, + "learning_rate": 2.706173113694363e-06, + "loss": 0.1087, + "step": 4533 + }, + { + "epoch": 1.4692158133506157, + "grad_norm": 0.8625495433807373, + "learning_rate": 2.705301472152172e-06, + "loss": 0.121, + "step": 4534 + }, + { + "epoch": 1.4695398574206093, + "grad_norm": 0.8635041117668152, + "learning_rate": 2.7044298054830687e-06, + "loss": 0.1159, + "step": 4535 + }, + { + "epoch": 1.4698639014906028, + "grad_norm": 0.8940765857696533, + "learning_rate": 2.703558113793736e-06, + "loss": 0.1188, + "step": 4536 + }, + { + "epoch": 1.4701879455605962, + "grad_norm": 0.816907525062561, + "learning_rate": 2.7026863971908607e-06, + "loss": 0.1149, + "step": 4537 + }, + { + "epoch": 1.4705119896305898, + "grad_norm": 0.8096557259559631, + "learning_rate": 2.7018146557811325e-06, + "loss": 0.1164, + "step": 4538 + }, + { + "epoch": 1.4708360337005832, + "grad_norm": 0.8419772386550903, + "learning_rate": 2.7009428896712443e-06, + "loss": 0.1158, + "step": 4539 + }, + { + "epoch": 1.471160077770577, + "grad_norm": 0.832172155380249, + "learning_rate": 2.700071098967892e-06, + "loss": 0.1164, + "step": 4540 + }, + { + "epoch": 1.4714841218405703, + "grad_norm": 0.769187867641449, + "learning_rate": 2.699199283777773e-06, + "loss": 0.1026, + "step": 4541 + }, + { + "epoch": 1.4718081659105637, + "grad_norm": 0.8220400810241699, + "learning_rate": 2.6983274442075914e-06, + "loss": 0.1177, + "step": 4542 + }, + { + "epoch": 1.4721322099805574, + "grad_norm": 0.8324995040893555, + "learning_rate": 2.69745558036405e-06, + "loss": 0.1198, + "step": 4543 + }, + { + "epoch": 1.4724562540505508, + "grad_norm": 0.7806642055511475, + "learning_rate": 2.6965836923538568e-06, + "loss": 0.1133, + "step": 4544 + }, + { + "epoch": 1.4727802981205445, + "grad_norm": 0.8086503148078918, + "learning_rate": 2.695711780283723e-06, + "loss": 0.1151, + "step": 4545 + }, + { + "epoch": 1.4731043421905379, + "grad_norm": 0.8214067816734314, + "learning_rate": 2.694839844260361e-06, + "loss": 0.1197, + "step": 4546 + }, + { + "epoch": 1.4734283862605313, + "grad_norm": 0.9072923064231873, + "learning_rate": 2.6939678843904897e-06, + "loss": 0.1356, + "step": 4547 + }, + { + "epoch": 1.473752430330525, + "grad_norm": 0.7834147214889526, + "learning_rate": 2.6930959007808268e-06, + "loss": 0.1171, + "step": 4548 + }, + { + "epoch": 1.4740764744005186, + "grad_norm": 1.084670901298523, + "learning_rate": 2.6922238935380946e-06, + "loss": 0.1128, + "step": 4549 + }, + { + "epoch": 1.474400518470512, + "grad_norm": 0.8274711966514587, + "learning_rate": 2.691351862769018e-06, + "loss": 0.1135, + "step": 4550 + }, + { + "epoch": 1.4747245625405054, + "grad_norm": 0.7669548392295837, + "learning_rate": 2.6904798085803276e-06, + "loss": 0.1088, + "step": 4551 + }, + { + "epoch": 1.475048606610499, + "grad_norm": 0.7579964995384216, + "learning_rate": 2.689607731078751e-06, + "loss": 0.1123, + "step": 4552 + }, + { + "epoch": 1.4753726506804925, + "grad_norm": 0.8058258891105652, + "learning_rate": 2.688735630371024e-06, + "loss": 0.1128, + "step": 4553 + }, + { + "epoch": 1.4756966947504861, + "grad_norm": 0.7897571921348572, + "learning_rate": 2.6878635065638843e-06, + "loss": 0.1126, + "step": 4554 + }, + { + "epoch": 1.4760207388204796, + "grad_norm": 0.8144134283065796, + "learning_rate": 2.6869913597640686e-06, + "loss": 0.1117, + "step": 4555 + }, + { + "epoch": 1.476344782890473, + "grad_norm": 0.8493690490722656, + "learning_rate": 2.6861191900783213e-06, + "loss": 0.1187, + "step": 4556 + }, + { + "epoch": 1.4766688269604666, + "grad_norm": 0.7807652950286865, + "learning_rate": 2.685246997613386e-06, + "loss": 0.1168, + "step": 4557 + }, + { + "epoch": 1.47699287103046, + "grad_norm": 0.8303132057189941, + "learning_rate": 2.6843747824760125e-06, + "loss": 0.1106, + "step": 4558 + }, + { + "epoch": 1.4773169151004537, + "grad_norm": 0.7779312133789062, + "learning_rate": 2.6835025447729495e-06, + "loss": 0.112, + "step": 4559 + }, + { + "epoch": 1.4776409591704471, + "grad_norm": 0.7840691804885864, + "learning_rate": 2.682630284610953e-06, + "loss": 0.1129, + "step": 4560 + }, + { + "epoch": 1.4779650032404408, + "grad_norm": 0.8038629293441772, + "learning_rate": 2.6817580020967767e-06, + "loss": 0.1118, + "step": 4561 + }, + { + "epoch": 1.4782890473104342, + "grad_norm": 0.7459808588027954, + "learning_rate": 2.680885697337181e-06, + "loss": 0.1016, + "step": 4562 + }, + { + "epoch": 1.4786130913804278, + "grad_norm": 0.8536557555198669, + "learning_rate": 2.6800133704389263e-06, + "loss": 0.1199, + "step": 4563 + }, + { + "epoch": 1.4789371354504213, + "grad_norm": 0.8047173023223877, + "learning_rate": 2.6791410215087783e-06, + "loss": 0.1109, + "step": 4564 + }, + { + "epoch": 1.4792611795204147, + "grad_norm": 0.7865987420082092, + "learning_rate": 2.678268650653503e-06, + "loss": 0.1076, + "step": 4565 + }, + { + "epoch": 1.4795852235904083, + "grad_norm": 0.9128715991973877, + "learning_rate": 2.6773962579798713e-06, + "loss": 0.124, + "step": 4566 + }, + { + "epoch": 1.4799092676604018, + "grad_norm": 0.8672195076942444, + "learning_rate": 2.6765238435946543e-06, + "loss": 0.1202, + "step": 4567 + }, + { + "epoch": 1.4802333117303954, + "grad_norm": 0.7601926922798157, + "learning_rate": 2.675651407604628e-06, + "loss": 0.1028, + "step": 4568 + }, + { + "epoch": 1.4805573558003888, + "grad_norm": 0.7621402144432068, + "learning_rate": 2.67477895011657e-06, + "loss": 0.1144, + "step": 4569 + }, + { + "epoch": 1.4808813998703823, + "grad_norm": 0.8962377309799194, + "learning_rate": 2.6739064712372596e-06, + "loss": 0.1272, + "step": 4570 + }, + { + "epoch": 1.481205443940376, + "grad_norm": 0.803502082824707, + "learning_rate": 2.6730339710734815e-06, + "loss": 0.1136, + "step": 4571 + }, + { + "epoch": 1.4815294880103695, + "grad_norm": 0.7923281192779541, + "learning_rate": 2.672161449732021e-06, + "loss": 0.1106, + "step": 4572 + }, + { + "epoch": 1.481853532080363, + "grad_norm": 0.8307749032974243, + "learning_rate": 2.671288907319666e-06, + "loss": 0.1184, + "step": 4573 + }, + { + "epoch": 1.4821775761503564, + "grad_norm": 0.8426263332366943, + "learning_rate": 2.670416343943205e-06, + "loss": 0.1167, + "step": 4574 + }, + { + "epoch": 1.48250162022035, + "grad_norm": 0.7670769691467285, + "learning_rate": 2.669543759709434e-06, + "loss": 0.1101, + "step": 4575 + }, + { + "epoch": 1.4828256642903435, + "grad_norm": 0.7515110969543457, + "learning_rate": 2.668671154725149e-06, + "loss": 0.1086, + "step": 4576 + }, + { + "epoch": 1.483149708360337, + "grad_norm": 0.8248686790466309, + "learning_rate": 2.6677985290971464e-06, + "loss": 0.1147, + "step": 4577 + }, + { + "epoch": 1.4834737524303305, + "grad_norm": 0.7573780417442322, + "learning_rate": 2.666925882932229e-06, + "loss": 0.1088, + "step": 4578 + }, + { + "epoch": 1.483797796500324, + "grad_norm": 0.7721401453018188, + "learning_rate": 2.6660532163371995e-06, + "loss": 0.1085, + "step": 4579 + }, + { + "epoch": 1.4841218405703176, + "grad_norm": 0.7526261806488037, + "learning_rate": 2.665180529418863e-06, + "loss": 0.1062, + "step": 4580 + }, + { + "epoch": 1.484445884640311, + "grad_norm": 0.8702643513679504, + "learning_rate": 2.6643078222840295e-06, + "loss": 0.125, + "step": 4581 + }, + { + "epoch": 1.4847699287103047, + "grad_norm": 0.8288251161575317, + "learning_rate": 2.6634350950395096e-06, + "loss": 0.118, + "step": 4582 + }, + { + "epoch": 1.485093972780298, + "grad_norm": 0.8676289319992065, + "learning_rate": 2.662562347792116e-06, + "loss": 0.1221, + "step": 4583 + }, + { + "epoch": 1.4854180168502915, + "grad_norm": 0.8323357105255127, + "learning_rate": 2.6616895806486644e-06, + "loss": 0.1181, + "step": 4584 + }, + { + "epoch": 1.4857420609202852, + "grad_norm": 0.793228268623352, + "learning_rate": 2.6608167937159735e-06, + "loss": 0.113, + "step": 4585 + }, + { + "epoch": 1.4860661049902788, + "grad_norm": 0.7849512100219727, + "learning_rate": 2.6599439871008636e-06, + "loss": 0.1086, + "step": 4586 + }, + { + "epoch": 1.4863901490602722, + "grad_norm": 0.7768422961235046, + "learning_rate": 2.659071160910158e-06, + "loss": 0.104, + "step": 4587 + }, + { + "epoch": 1.4867141931302656, + "grad_norm": 0.817353367805481, + "learning_rate": 2.6581983152506825e-06, + "loss": 0.1171, + "step": 4588 + }, + { + "epoch": 1.4870382372002593, + "grad_norm": 0.8006489872932434, + "learning_rate": 2.6573254502292644e-06, + "loss": 0.1126, + "step": 4589 + }, + { + "epoch": 1.4873622812702527, + "grad_norm": 0.782650887966156, + "learning_rate": 2.656452565952735e-06, + "loss": 0.1123, + "step": 4590 + }, + { + "epoch": 1.4876863253402464, + "grad_norm": 0.8069577217102051, + "learning_rate": 2.6555796625279257e-06, + "loss": 0.1129, + "step": 4591 + }, + { + "epoch": 1.4880103694102398, + "grad_norm": 0.7839584350585938, + "learning_rate": 2.6547067400616717e-06, + "loss": 0.1076, + "step": 4592 + }, + { + "epoch": 1.4883344134802332, + "grad_norm": 0.8803831338882446, + "learning_rate": 2.6538337986608105e-06, + "loss": 0.1188, + "step": 4593 + }, + { + "epoch": 1.4886584575502269, + "grad_norm": 0.7832203507423401, + "learning_rate": 2.6529608384321815e-06, + "loss": 0.1104, + "step": 4594 + }, + { + "epoch": 1.4889825016202203, + "grad_norm": 0.8302662968635559, + "learning_rate": 2.6520878594826268e-06, + "loss": 0.1216, + "step": 4595 + }, + { + "epoch": 1.489306545690214, + "grad_norm": 0.8601539134979248, + "learning_rate": 2.651214861918991e-06, + "loss": 0.1208, + "step": 4596 + }, + { + "epoch": 1.4896305897602073, + "grad_norm": 0.7589988708496094, + "learning_rate": 2.6503418458481188e-06, + "loss": 0.106, + "step": 4597 + }, + { + "epoch": 1.4899546338302008, + "grad_norm": 0.8155503273010254, + "learning_rate": 2.649468811376861e-06, + "loss": 0.1228, + "step": 4598 + }, + { + "epoch": 1.4902786779001944, + "grad_norm": 0.8368939161300659, + "learning_rate": 2.6485957586120664e-06, + "loss": 0.1175, + "step": 4599 + }, + { + "epoch": 1.490602721970188, + "grad_norm": 0.7997182011604309, + "learning_rate": 2.6477226876605903e-06, + "loss": 0.1142, + "step": 4600 + }, + { + "epoch": 1.4909267660401815, + "grad_norm": 0.8174132704734802, + "learning_rate": 2.646849598629287e-06, + "loss": 0.1163, + "step": 4601 + }, + { + "epoch": 1.491250810110175, + "grad_norm": 0.8421667814254761, + "learning_rate": 2.645976491625015e-06, + "loss": 0.1193, + "step": 4602 + }, + { + "epoch": 1.4915748541801686, + "grad_norm": 0.8165988326072693, + "learning_rate": 2.645103366754633e-06, + "loss": 0.1169, + "step": 4603 + }, + { + "epoch": 1.491898898250162, + "grad_norm": 0.8092917203903198, + "learning_rate": 2.6442302241250047e-06, + "loss": 0.1065, + "step": 4604 + }, + { + "epoch": 1.4922229423201556, + "grad_norm": 0.8170256614685059, + "learning_rate": 2.6433570638429923e-06, + "loss": 0.1088, + "step": 4605 + }, + { + "epoch": 1.492546986390149, + "grad_norm": 0.805540919303894, + "learning_rate": 2.6424838860154633e-06, + "loss": 0.1164, + "step": 4606 + }, + { + "epoch": 1.4928710304601425, + "grad_norm": 0.9216932058334351, + "learning_rate": 2.641610690749286e-06, + "loss": 0.1325, + "step": 4607 + }, + { + "epoch": 1.4931950745301361, + "grad_norm": 0.7840494513511658, + "learning_rate": 2.640737478151331e-06, + "loss": 0.1074, + "step": 4608 + }, + { + "epoch": 1.4935191186001295, + "grad_norm": 0.9483461380004883, + "learning_rate": 2.6398642483284716e-06, + "loss": 0.1233, + "step": 4609 + }, + { + "epoch": 1.4938431626701232, + "grad_norm": 0.8062817454338074, + "learning_rate": 2.6389910013875814e-06, + "loss": 0.1109, + "step": 4610 + }, + { + "epoch": 1.4941672067401166, + "grad_norm": 0.7685247659683228, + "learning_rate": 2.638117737435538e-06, + "loss": 0.1091, + "step": 4611 + }, + { + "epoch": 1.4944912508101102, + "grad_norm": 0.8100350499153137, + "learning_rate": 2.637244456579221e-06, + "loss": 0.1165, + "step": 4612 + }, + { + "epoch": 1.4948152948801037, + "grad_norm": 0.7939400672912598, + "learning_rate": 2.6363711589255115e-06, + "loss": 0.1099, + "step": 4613 + }, + { + "epoch": 1.4951393389500973, + "grad_norm": 0.8388355374336243, + "learning_rate": 2.6354978445812923e-06, + "loss": 0.116, + "step": 4614 + }, + { + "epoch": 1.4954633830200907, + "grad_norm": 0.7846883535385132, + "learning_rate": 2.6346245136534483e-06, + "loss": 0.111, + "step": 4615 + }, + { + "epoch": 1.4957874270900842, + "grad_norm": 0.853891134262085, + "learning_rate": 2.6337511662488678e-06, + "loss": 0.1192, + "step": 4616 + }, + { + "epoch": 1.4961114711600778, + "grad_norm": 0.7976066470146179, + "learning_rate": 2.6328778024744384e-06, + "loss": 0.1149, + "step": 4617 + }, + { + "epoch": 1.4964355152300712, + "grad_norm": 0.7755050659179688, + "learning_rate": 2.6320044224370526e-06, + "loss": 0.1081, + "step": 4618 + }, + { + "epoch": 1.4967595593000649, + "grad_norm": 0.9004369974136353, + "learning_rate": 2.6311310262436035e-06, + "loss": 0.1294, + "step": 4619 + }, + { + "epoch": 1.4970836033700583, + "grad_norm": 0.8410223722457886, + "learning_rate": 2.6302576140009866e-06, + "loss": 0.1154, + "step": 4620 + }, + { + "epoch": 1.4974076474400517, + "grad_norm": 0.7805401682853699, + "learning_rate": 2.6293841858160983e-06, + "loss": 0.1081, + "step": 4621 + }, + { + "epoch": 1.4977316915100454, + "grad_norm": 0.7736295461654663, + "learning_rate": 2.6285107417958385e-06, + "loss": 0.1062, + "step": 4622 + }, + { + "epoch": 1.498055735580039, + "grad_norm": 0.8259331583976746, + "learning_rate": 2.6276372820471073e-06, + "loss": 0.114, + "step": 4623 + }, + { + "epoch": 1.4983797796500324, + "grad_norm": 0.8400769829750061, + "learning_rate": 2.6267638066768087e-06, + "loss": 0.1187, + "step": 4624 + }, + { + "epoch": 1.4987038237200259, + "grad_norm": 0.8498058319091797, + "learning_rate": 2.625890315791848e-06, + "loss": 0.1117, + "step": 4625 + }, + { + "epoch": 1.4990278677900195, + "grad_norm": 0.7712481021881104, + "learning_rate": 2.625016809499131e-06, + "loss": 0.1056, + "step": 4626 + }, + { + "epoch": 1.499351911860013, + "grad_norm": 0.7691459059715271, + "learning_rate": 2.6241432879055667e-06, + "loss": 0.1115, + "step": 4627 + }, + { + "epoch": 1.4996759559300066, + "grad_norm": 0.8329920768737793, + "learning_rate": 2.6232697511180654e-06, + "loss": 0.1141, + "step": 4628 + }, + { + "epoch": 1.5, + "grad_norm": 0.8468537926673889, + "learning_rate": 2.6223961992435406e-06, + "loss": 0.1103, + "step": 4629 + }, + { + "epoch": 1.5003240440699934, + "grad_norm": 0.782757580280304, + "learning_rate": 2.6215226323889048e-06, + "loss": 0.1064, + "step": 4630 + }, + { + "epoch": 1.500648088139987, + "grad_norm": 0.8492832779884338, + "learning_rate": 2.620649050661076e-06, + "loss": 0.1166, + "step": 4631 + }, + { + "epoch": 1.5009721322099807, + "grad_norm": 0.8164941668510437, + "learning_rate": 2.6197754541669714e-06, + "loss": 0.1172, + "step": 4632 + }, + { + "epoch": 1.5012961762799741, + "grad_norm": 0.8901158571243286, + "learning_rate": 2.6189018430135106e-06, + "loss": 0.1259, + "step": 4633 + }, + { + "epoch": 1.5016202203499676, + "grad_norm": 0.7792893052101135, + "learning_rate": 2.6180282173076156e-06, + "loss": 0.1082, + "step": 4634 + }, + { + "epoch": 1.501944264419961, + "grad_norm": 0.8170306086540222, + "learning_rate": 2.6171545771562085e-06, + "loss": 0.1148, + "step": 4635 + }, + { + "epoch": 1.5022683084899546, + "grad_norm": 0.8025646209716797, + "learning_rate": 2.6162809226662167e-06, + "loss": 0.1127, + "step": 4636 + }, + { + "epoch": 1.5025923525599483, + "grad_norm": 0.8521740436553955, + "learning_rate": 2.6154072539445645e-06, + "loss": 0.1278, + "step": 4637 + }, + { + "epoch": 1.5029163966299417, + "grad_norm": 0.7798749208450317, + "learning_rate": 2.6145335710981817e-06, + "loss": 0.1058, + "step": 4638 + }, + { + "epoch": 1.5032404406999351, + "grad_norm": 0.8448235988616943, + "learning_rate": 2.613659874233999e-06, + "loss": 0.1181, + "step": 4639 + }, + { + "epoch": 1.5035644847699285, + "grad_norm": 0.8113897442817688, + "learning_rate": 2.612786163458948e-06, + "loss": 0.1179, + "step": 4640 + }, + { + "epoch": 1.5038885288399222, + "grad_norm": 0.9807034730911255, + "learning_rate": 2.611912438879962e-06, + "loss": 0.1293, + "step": 4641 + }, + { + "epoch": 1.5042125729099158, + "grad_norm": 0.753908097743988, + "learning_rate": 2.611038700603977e-06, + "loss": 0.109, + "step": 4642 + }, + { + "epoch": 1.5045366169799093, + "grad_norm": 0.8391751050949097, + "learning_rate": 2.6101649487379304e-06, + "loss": 0.1214, + "step": 4643 + }, + { + "epoch": 1.5048606610499027, + "grad_norm": 0.9241876006126404, + "learning_rate": 2.6092911833887602e-06, + "loss": 0.1219, + "step": 4644 + }, + { + "epoch": 1.5051847051198963, + "grad_norm": 0.7995864748954773, + "learning_rate": 2.6084174046634075e-06, + "loss": 0.1159, + "step": 4645 + }, + { + "epoch": 1.50550874918989, + "grad_norm": 0.7828188538551331, + "learning_rate": 2.607543612668814e-06, + "loss": 0.1143, + "step": 4646 + }, + { + "epoch": 1.5058327932598834, + "grad_norm": 0.7762099504470825, + "learning_rate": 2.6066698075119237e-06, + "loss": 0.1087, + "step": 4647 + }, + { + "epoch": 1.5061568373298768, + "grad_norm": 0.8002234101295471, + "learning_rate": 2.605795989299681e-06, + "loss": 0.1118, + "step": 4648 + }, + { + "epoch": 1.5064808813998702, + "grad_norm": 0.800934910774231, + "learning_rate": 2.604922158139033e-06, + "loss": 0.1068, + "step": 4649 + }, + { + "epoch": 1.5068049254698639, + "grad_norm": 0.8457976579666138, + "learning_rate": 2.6040483141369293e-06, + "loss": 0.1187, + "step": 4650 + }, + { + "epoch": 1.5071289695398575, + "grad_norm": 0.8208997249603271, + "learning_rate": 2.603174457400319e-06, + "loss": 0.115, + "step": 4651 + }, + { + "epoch": 1.507453013609851, + "grad_norm": 0.7759538292884827, + "learning_rate": 2.602300588036154e-06, + "loss": 0.104, + "step": 4652 + }, + { + "epoch": 1.5077770576798444, + "grad_norm": 0.8319365978240967, + "learning_rate": 2.6014267061513875e-06, + "loss": 0.1132, + "step": 4653 + }, + { + "epoch": 1.508101101749838, + "grad_norm": 0.8539026975631714, + "learning_rate": 2.6005528118529738e-06, + "loss": 0.1196, + "step": 4654 + }, + { + "epoch": 1.5084251458198314, + "grad_norm": 0.8303791880607605, + "learning_rate": 2.5996789052478693e-06, + "loss": 0.1211, + "step": 4655 + }, + { + "epoch": 1.508749189889825, + "grad_norm": 0.807961642742157, + "learning_rate": 2.5988049864430314e-06, + "loss": 0.1053, + "step": 4656 + }, + { + "epoch": 1.5090732339598185, + "grad_norm": 0.7976993322372437, + "learning_rate": 2.597931055545421e-06, + "loss": 0.1118, + "step": 4657 + }, + { + "epoch": 1.509397278029812, + "grad_norm": 0.8033688068389893, + "learning_rate": 2.597057112661997e-06, + "loss": 0.1179, + "step": 4658 + }, + { + "epoch": 1.5097213220998056, + "grad_norm": 0.798485517501831, + "learning_rate": 2.5961831578997214e-06, + "loss": 0.1162, + "step": 4659 + }, + { + "epoch": 1.5100453661697992, + "grad_norm": 0.79729825258255, + "learning_rate": 2.5953091913655586e-06, + "loss": 0.1147, + "step": 4660 + }, + { + "epoch": 1.5103694102397927, + "grad_norm": 0.8427738547325134, + "learning_rate": 2.594435213166473e-06, + "loss": 0.1186, + "step": 4661 + }, + { + "epoch": 1.510693454309786, + "grad_norm": 0.7964630126953125, + "learning_rate": 2.593561223409432e-06, + "loss": 0.1198, + "step": 4662 + }, + { + "epoch": 1.5110174983797795, + "grad_norm": 0.7886890172958374, + "learning_rate": 2.592687222201403e-06, + "loss": 0.1071, + "step": 4663 + }, + { + "epoch": 1.5113415424497731, + "grad_norm": 0.8665766716003418, + "learning_rate": 2.5918132096493552e-06, + "loss": 0.124, + "step": 4664 + }, + { + "epoch": 1.5116655865197668, + "grad_norm": 0.7874413728713989, + "learning_rate": 2.5909391858602596e-06, + "loss": 0.1137, + "step": 4665 + }, + { + "epoch": 1.5119896305897602, + "grad_norm": 0.7725497484207153, + "learning_rate": 2.5900651509410875e-06, + "loss": 0.1111, + "step": 4666 + }, + { + "epoch": 1.5123136746597536, + "grad_norm": 0.8019454479217529, + "learning_rate": 2.5891911049988133e-06, + "loss": 0.1153, + "step": 4667 + }, + { + "epoch": 1.5126377187297473, + "grad_norm": 0.8188468813896179, + "learning_rate": 2.5883170481404112e-06, + "loss": 0.1121, + "step": 4668 + }, + { + "epoch": 1.512961762799741, + "grad_norm": 0.788652241230011, + "learning_rate": 2.587442980472858e-06, + "loss": 0.1109, + "step": 4669 + }, + { + "epoch": 1.5132858068697344, + "grad_norm": 0.8207998275756836, + "learning_rate": 2.5865689021031292e-06, + "loss": 0.108, + "step": 4670 + }, + { + "epoch": 1.5136098509397278, + "grad_norm": 0.868406355381012, + "learning_rate": 2.5856948131382055e-06, + "loss": 0.1246, + "step": 4671 + }, + { + "epoch": 1.5139338950097212, + "grad_norm": 0.7927680611610413, + "learning_rate": 2.584820713685066e-06, + "loss": 0.11, + "step": 4672 + }, + { + "epoch": 1.5142579390797148, + "grad_norm": 0.7637700438499451, + "learning_rate": 2.5839466038506927e-06, + "loss": 0.1093, + "step": 4673 + }, + { + "epoch": 1.5145819831497085, + "grad_norm": 0.875170111656189, + "learning_rate": 2.5830724837420675e-06, + "loss": 0.1272, + "step": 4674 + }, + { + "epoch": 1.514906027219702, + "grad_norm": 0.8147414326667786, + "learning_rate": 2.582198353466175e-06, + "loss": 0.1123, + "step": 4675 + }, + { + "epoch": 1.5152300712896953, + "grad_norm": 0.8192980885505676, + "learning_rate": 2.5813242131299986e-06, + "loss": 0.1124, + "step": 4676 + }, + { + "epoch": 1.5155541153596888, + "grad_norm": 0.8011279106140137, + "learning_rate": 2.5804500628405265e-06, + "loss": 0.1178, + "step": 4677 + }, + { + "epoch": 1.5158781594296824, + "grad_norm": 0.8356201648712158, + "learning_rate": 2.5795759027047457e-06, + "loss": 0.1149, + "step": 4678 + }, + { + "epoch": 1.516202203499676, + "grad_norm": 0.8754128217697144, + "learning_rate": 2.578701732829645e-06, + "loss": 0.1221, + "step": 4679 + }, + { + "epoch": 1.5165262475696695, + "grad_norm": 0.7901120781898499, + "learning_rate": 2.5778275533222135e-06, + "loss": 0.1132, + "step": 4680 + }, + { + "epoch": 1.516850291639663, + "grad_norm": 0.8232377767562866, + "learning_rate": 2.5769533642894433e-06, + "loss": 0.1104, + "step": 4681 + }, + { + "epoch": 1.5171743357096565, + "grad_norm": 0.8965370655059814, + "learning_rate": 2.576079165838326e-06, + "loss": 0.125, + "step": 4682 + }, + { + "epoch": 1.5174983797796502, + "grad_norm": 0.830436646938324, + "learning_rate": 2.5752049580758555e-06, + "loss": 0.1243, + "step": 4683 + }, + { + "epoch": 1.5178224238496436, + "grad_norm": 0.8122559785842896, + "learning_rate": 2.5743307411090255e-06, + "loss": 0.1151, + "step": 4684 + }, + { + "epoch": 1.518146467919637, + "grad_norm": 0.7352414727210999, + "learning_rate": 2.5734565150448325e-06, + "loss": 0.1014, + "step": 4685 + }, + { + "epoch": 1.5184705119896305, + "grad_norm": 0.8098716139793396, + "learning_rate": 2.5725822799902738e-06, + "loss": 0.1172, + "step": 4686 + }, + { + "epoch": 1.518794556059624, + "grad_norm": 0.8782811760902405, + "learning_rate": 2.5717080360523464e-06, + "loss": 0.1174, + "step": 4687 + }, + { + "epoch": 1.5191186001296177, + "grad_norm": 0.7974719405174255, + "learning_rate": 2.57083378333805e-06, + "loss": 0.1174, + "step": 4688 + }, + { + "epoch": 1.5194426441996112, + "grad_norm": 0.8503957986831665, + "learning_rate": 2.5699595219543838e-06, + "loss": 0.1233, + "step": 4689 + }, + { + "epoch": 1.5197666882696046, + "grad_norm": 0.7533683776855469, + "learning_rate": 2.5690852520083496e-06, + "loss": 0.1092, + "step": 4690 + }, + { + "epoch": 1.5200907323395982, + "grad_norm": 0.8622933030128479, + "learning_rate": 2.5682109736069492e-06, + "loss": 0.1289, + "step": 4691 + }, + { + "epoch": 1.5204147764095917, + "grad_norm": 0.7616762518882751, + "learning_rate": 2.5673366868571858e-06, + "loss": 0.1098, + "step": 4692 + }, + { + "epoch": 1.5207388204795853, + "grad_norm": 0.8017957806587219, + "learning_rate": 2.566462391866064e-06, + "loss": 0.1185, + "step": 4693 + }, + { + "epoch": 1.5210628645495787, + "grad_norm": 0.8263230323791504, + "learning_rate": 2.5655880887405893e-06, + "loss": 0.12, + "step": 4694 + }, + { + "epoch": 1.5213869086195722, + "grad_norm": 0.7774035930633545, + "learning_rate": 2.564713777587767e-06, + "loss": 0.111, + "step": 4695 + }, + { + "epoch": 1.5217109526895658, + "grad_norm": 0.7920206189155579, + "learning_rate": 2.5638394585146044e-06, + "loss": 0.1148, + "step": 4696 + }, + { + "epoch": 1.5220349967595594, + "grad_norm": 0.8174133896827698, + "learning_rate": 2.56296513162811e-06, + "loss": 0.115, + "step": 4697 + }, + { + "epoch": 1.5223590408295529, + "grad_norm": 0.8442100882530212, + "learning_rate": 2.5620907970352937e-06, + "loss": 0.118, + "step": 4698 + }, + { + "epoch": 1.5226830848995463, + "grad_norm": 0.7956017255783081, + "learning_rate": 2.561216454843165e-06, + "loss": 0.1117, + "step": 4699 + }, + { + "epoch": 1.5230071289695397, + "grad_norm": 0.850167453289032, + "learning_rate": 2.5603421051587344e-06, + "loss": 0.1192, + "step": 4700 + }, + { + "epoch": 1.5233311730395334, + "grad_norm": 0.8265661001205444, + "learning_rate": 2.5594677480890152e-06, + "loss": 0.1171, + "step": 4701 + }, + { + "epoch": 1.523655217109527, + "grad_norm": 0.8158977031707764, + "learning_rate": 2.558593383741018e-06, + "loss": 0.1166, + "step": 4702 + }, + { + "epoch": 1.5239792611795204, + "grad_norm": 0.8036544322967529, + "learning_rate": 2.5577190122217583e-06, + "loss": 0.1111, + "step": 4703 + }, + { + "epoch": 1.5243033052495139, + "grad_norm": 0.7271086573600769, + "learning_rate": 2.55684463363825e-06, + "loss": 0.1089, + "step": 4704 + }, + { + "epoch": 1.5246273493195075, + "grad_norm": 0.8072844743728638, + "learning_rate": 2.5559702480975094e-06, + "loss": 0.1158, + "step": 4705 + }, + { + "epoch": 1.524951393389501, + "grad_norm": 0.8877519369125366, + "learning_rate": 2.5550958557065523e-06, + "loss": 0.1264, + "step": 4706 + }, + { + "epoch": 1.5252754374594946, + "grad_norm": 0.7614017724990845, + "learning_rate": 2.554221456572396e-06, + "loss": 0.1148, + "step": 4707 + }, + { + "epoch": 1.525599481529488, + "grad_norm": 0.8140348196029663, + "learning_rate": 2.553347050802058e-06, + "loss": 0.1167, + "step": 4708 + }, + { + "epoch": 1.5259235255994814, + "grad_norm": 0.8409748077392578, + "learning_rate": 2.552472638502557e-06, + "loss": 0.1194, + "step": 4709 + }, + { + "epoch": 1.526247569669475, + "grad_norm": 0.9039139151573181, + "learning_rate": 2.5515982197809142e-06, + "loss": 0.1203, + "step": 4710 + }, + { + "epoch": 1.5265716137394687, + "grad_norm": 0.8296209573745728, + "learning_rate": 2.5507237947441478e-06, + "loss": 0.1166, + "step": 4711 + }, + { + "epoch": 1.5268956578094621, + "grad_norm": 0.9392574429512024, + "learning_rate": 2.5498493634992803e-06, + "loss": 0.1247, + "step": 4712 + }, + { + "epoch": 1.5272197018794555, + "grad_norm": 0.8392185568809509, + "learning_rate": 2.5489749261533333e-06, + "loss": 0.1126, + "step": 4713 + }, + { + "epoch": 1.527543745949449, + "grad_norm": 0.8599982857704163, + "learning_rate": 2.548100482813329e-06, + "loss": 0.1168, + "step": 4714 + }, + { + "epoch": 1.5278677900194426, + "grad_norm": 0.8504279851913452, + "learning_rate": 2.5472260335862915e-06, + "loss": 0.117, + "step": 4715 + }, + { + "epoch": 1.5281918340894363, + "grad_norm": 0.7987934947013855, + "learning_rate": 2.546351578579245e-06, + "loss": 0.108, + "step": 4716 + }, + { + "epoch": 1.5285158781594297, + "grad_norm": 0.7965323328971863, + "learning_rate": 2.545477117899213e-06, + "loss": 0.118, + "step": 4717 + }, + { + "epoch": 1.528839922229423, + "grad_norm": 0.9307451844215393, + "learning_rate": 2.5446026516532235e-06, + "loss": 0.1273, + "step": 4718 + }, + { + "epoch": 1.5291639662994168, + "grad_norm": 0.8149518966674805, + "learning_rate": 2.5437281799483005e-06, + "loss": 0.1226, + "step": 4719 + }, + { + "epoch": 1.5294880103694104, + "grad_norm": 0.8553066849708557, + "learning_rate": 2.542853702891471e-06, + "loss": 0.1218, + "step": 4720 + }, + { + "epoch": 1.5298120544394038, + "grad_norm": 0.9042952656745911, + "learning_rate": 2.541979220589765e-06, + "loss": 0.1204, + "step": 4721 + }, + { + "epoch": 1.5301360985093972, + "grad_norm": 0.7949498295783997, + "learning_rate": 2.541104733150207e-06, + "loss": 0.1123, + "step": 4722 + }, + { + "epoch": 1.5304601425793907, + "grad_norm": 0.8241522312164307, + "learning_rate": 2.540230240679828e-06, + "loss": 0.1149, + "step": 4723 + }, + { + "epoch": 1.5307841866493843, + "grad_norm": 0.7348130941390991, + "learning_rate": 2.5393557432856575e-06, + "loss": 0.1083, + "step": 4724 + }, + { + "epoch": 1.531108230719378, + "grad_norm": 0.8246307969093323, + "learning_rate": 2.5384812410747244e-06, + "loss": 0.1158, + "step": 4725 + }, + { + "epoch": 1.5314322747893714, + "grad_norm": 0.8028252720832825, + "learning_rate": 2.53760673415406e-06, + "loss": 0.1165, + "step": 4726 + }, + { + "epoch": 1.5317563188593648, + "grad_norm": 0.76541668176651, + "learning_rate": 2.5367322226306956e-06, + "loss": 0.1147, + "step": 4727 + }, + { + "epoch": 1.5320803629293582, + "grad_norm": 0.8166850805282593, + "learning_rate": 2.5358577066116622e-06, + "loss": 0.1162, + "step": 4728 + }, + { + "epoch": 1.5324044069993519, + "grad_norm": 0.7746853232383728, + "learning_rate": 2.534983186203993e-06, + "loss": 0.1112, + "step": 4729 + }, + { + "epoch": 1.5327284510693455, + "grad_norm": 0.8185518383979797, + "learning_rate": 2.5341086615147207e-06, + "loss": 0.1201, + "step": 4730 + }, + { + "epoch": 1.533052495139339, + "grad_norm": 0.7633078694343567, + "learning_rate": 2.5332341326508786e-06, + "loss": 0.1064, + "step": 4731 + }, + { + "epoch": 1.5333765392093324, + "grad_norm": 0.7728112936019897, + "learning_rate": 2.5323595997195005e-06, + "loss": 0.1065, + "step": 4732 + }, + { + "epoch": 1.533700583279326, + "grad_norm": 0.8007860779762268, + "learning_rate": 2.53148506282762e-06, + "loss": 0.1181, + "step": 4733 + }, + { + "epoch": 1.5340246273493197, + "grad_norm": 0.8603343367576599, + "learning_rate": 2.530610522082273e-06, + "loss": 0.1144, + "step": 4734 + }, + { + "epoch": 1.534348671419313, + "grad_norm": 0.8246845006942749, + "learning_rate": 2.529735977590494e-06, + "loss": 0.1179, + "step": 4735 + }, + { + "epoch": 1.5346727154893065, + "grad_norm": 0.8384710550308228, + "learning_rate": 2.52886142945932e-06, + "loss": 0.1242, + "step": 4736 + }, + { + "epoch": 1.5349967595593, + "grad_norm": 0.7882184386253357, + "learning_rate": 2.527986877795786e-06, + "loss": 0.108, + "step": 4737 + }, + { + "epoch": 1.5353208036292936, + "grad_norm": 0.8187207579612732, + "learning_rate": 2.527112322706929e-06, + "loss": 0.1177, + "step": 4738 + }, + { + "epoch": 1.5356448476992872, + "grad_norm": 0.8378166556358337, + "learning_rate": 2.526237764299786e-06, + "loss": 0.1099, + "step": 4739 + }, + { + "epoch": 1.5359688917692806, + "grad_norm": 0.7764092683792114, + "learning_rate": 2.5253632026813945e-06, + "loss": 0.1131, + "step": 4740 + }, + { + "epoch": 1.536292935839274, + "grad_norm": 0.8988218307495117, + "learning_rate": 2.524488637958793e-06, + "loss": 0.1217, + "step": 4741 + }, + { + "epoch": 1.5366169799092677, + "grad_norm": 0.7971240878105164, + "learning_rate": 2.5236140702390194e-06, + "loss": 0.1072, + "step": 4742 + }, + { + "epoch": 1.5369410239792611, + "grad_norm": 0.7888842225074768, + "learning_rate": 2.522739499629112e-06, + "loss": 0.1167, + "step": 4743 + }, + { + "epoch": 1.5372650680492548, + "grad_norm": 0.8259699940681458, + "learning_rate": 2.5218649262361104e-06, + "loss": 0.1183, + "step": 4744 + }, + { + "epoch": 1.5375891121192482, + "grad_norm": 0.8481448888778687, + "learning_rate": 2.520990350167053e-06, + "loss": 0.1183, + "step": 4745 + }, + { + "epoch": 1.5379131561892416, + "grad_norm": 0.7942140102386475, + "learning_rate": 2.5201157715289796e-06, + "loss": 0.1144, + "step": 4746 + }, + { + "epoch": 1.5382372002592353, + "grad_norm": 0.8272114992141724, + "learning_rate": 2.519241190428931e-06, + "loss": 0.1237, + "step": 4747 + }, + { + "epoch": 1.538561244329229, + "grad_norm": 0.841238260269165, + "learning_rate": 2.518366606973947e-06, + "loss": 0.1244, + "step": 4748 + }, + { + "epoch": 1.5388852883992223, + "grad_norm": 0.8029258251190186, + "learning_rate": 2.517492021271068e-06, + "loss": 0.1112, + "step": 4749 + }, + { + "epoch": 1.5392093324692158, + "grad_norm": 0.7682906985282898, + "learning_rate": 2.5166174334273347e-06, + "loss": 0.1106, + "step": 4750 + }, + { + "epoch": 1.5395333765392092, + "grad_norm": 0.7744021415710449, + "learning_rate": 2.5157428435497887e-06, + "loss": 0.1129, + "step": 4751 + }, + { + "epoch": 1.5398574206092028, + "grad_norm": 0.810088038444519, + "learning_rate": 2.5148682517454707e-06, + "loss": 0.1172, + "step": 4752 + }, + { + "epoch": 1.5401814646791965, + "grad_norm": 0.8204680681228638, + "learning_rate": 2.5139936581214235e-06, + "loss": 0.1038, + "step": 4753 + }, + { + "epoch": 1.54050550874919, + "grad_norm": 0.8239283561706543, + "learning_rate": 2.5131190627846875e-06, + "loss": 0.1216, + "step": 4754 + }, + { + "epoch": 1.5408295528191833, + "grad_norm": 0.8344904780387878, + "learning_rate": 2.512244465842305e-06, + "loss": 0.1204, + "step": 4755 + }, + { + "epoch": 1.541153596889177, + "grad_norm": 0.830840528011322, + "learning_rate": 2.5113698674013186e-06, + "loss": 0.1169, + "step": 4756 + }, + { + "epoch": 1.5414776409591704, + "grad_norm": 0.8880487084388733, + "learning_rate": 2.5104952675687706e-06, + "loss": 0.1265, + "step": 4757 + }, + { + "epoch": 1.541801685029164, + "grad_norm": 0.8078792095184326, + "learning_rate": 2.509620666451703e-06, + "loss": 0.1146, + "step": 4758 + }, + { + "epoch": 1.5421257290991575, + "grad_norm": 0.8384366631507874, + "learning_rate": 2.5087460641571594e-06, + "loss": 0.1212, + "step": 4759 + }, + { + "epoch": 1.5424497731691509, + "grad_norm": 0.8105331063270569, + "learning_rate": 2.5078714607921825e-06, + "loss": 0.1169, + "step": 4760 + }, + { + "epoch": 1.5427738172391445, + "grad_norm": 0.8387223482131958, + "learning_rate": 2.506996856463814e-06, + "loss": 0.1172, + "step": 4761 + }, + { + "epoch": 1.5430978613091382, + "grad_norm": 0.9171087741851807, + "learning_rate": 2.506122251279099e-06, + "loss": 0.1261, + "step": 4762 + }, + { + "epoch": 1.5434219053791316, + "grad_norm": 0.8017227053642273, + "learning_rate": 2.5052476453450788e-06, + "loss": 0.1142, + "step": 4763 + }, + { + "epoch": 1.543745949449125, + "grad_norm": 0.8722697496414185, + "learning_rate": 2.504373038768799e-06, + "loss": 0.127, + "step": 4764 + }, + { + "epoch": 1.5440699935191184, + "grad_norm": 0.8668619394302368, + "learning_rate": 2.5034984316573003e-06, + "loss": 0.1272, + "step": 4765 + }, + { + "epoch": 1.544394037589112, + "grad_norm": 0.8164231777191162, + "learning_rate": 2.5026238241176283e-06, + "loss": 0.1133, + "step": 4766 + }, + { + "epoch": 1.5447180816591057, + "grad_norm": 0.8477160930633545, + "learning_rate": 2.5017492162568246e-06, + "loss": 0.1191, + "step": 4767 + }, + { + "epoch": 1.5450421257290992, + "grad_norm": 0.9103198647499084, + "learning_rate": 2.5008746081819345e-06, + "loss": 0.1168, + "step": 4768 + }, + { + "epoch": 1.5453661697990926, + "grad_norm": 0.7986923456192017, + "learning_rate": 2.5e-06, + "loss": 0.1182, + "step": 4769 + }, + { + "epoch": 1.5456902138690862, + "grad_norm": 0.8324925899505615, + "learning_rate": 2.4991253918180668e-06, + "loss": 0.1238, + "step": 4770 + }, + { + "epoch": 1.5460142579390799, + "grad_norm": 0.8424703478813171, + "learning_rate": 2.498250783743176e-06, + "loss": 0.1205, + "step": 4771 + }, + { + "epoch": 1.5463383020090733, + "grad_norm": 0.8214755058288574, + "learning_rate": 2.4973761758823734e-06, + "loss": 0.1153, + "step": 4772 + }, + { + "epoch": 1.5466623460790667, + "grad_norm": 0.850267767906189, + "learning_rate": 2.4965015683427005e-06, + "loss": 0.1092, + "step": 4773 + }, + { + "epoch": 1.5469863901490601, + "grad_norm": 0.7865439653396606, + "learning_rate": 2.4956269612312025e-06, + "loss": 0.1159, + "step": 4774 + }, + { + "epoch": 1.5473104342190538, + "grad_norm": 0.772476851940155, + "learning_rate": 2.494752354654921e-06, + "loss": 0.112, + "step": 4775 + }, + { + "epoch": 1.5476344782890474, + "grad_norm": 0.8782417178153992, + "learning_rate": 2.4938777487209022e-06, + "loss": 0.1306, + "step": 4776 + }, + { + "epoch": 1.5479585223590409, + "grad_norm": 0.809949517250061, + "learning_rate": 2.493003143536187e-06, + "loss": 0.1152, + "step": 4777 + }, + { + "epoch": 1.5482825664290343, + "grad_norm": 0.7971104979515076, + "learning_rate": 2.4921285392078184e-06, + "loss": 0.1096, + "step": 4778 + }, + { + "epoch": 1.5486066104990277, + "grad_norm": 0.8603529930114746, + "learning_rate": 2.491253935842842e-06, + "loss": 0.1264, + "step": 4779 + }, + { + "epoch": 1.5489306545690213, + "grad_norm": 0.882556140422821, + "learning_rate": 2.490379333548297e-06, + "loss": 0.1173, + "step": 4780 + }, + { + "epoch": 1.549254698639015, + "grad_norm": 0.788023829460144, + "learning_rate": 2.4895047324312303e-06, + "loss": 0.1108, + "step": 4781 + }, + { + "epoch": 1.5495787427090084, + "grad_norm": 0.7869258522987366, + "learning_rate": 2.4886301325986827e-06, + "loss": 0.1076, + "step": 4782 + }, + { + "epoch": 1.5499027867790018, + "grad_norm": 0.7824283838272095, + "learning_rate": 2.4877555341576955e-06, + "loss": 0.104, + "step": 4783 + }, + { + "epoch": 1.5502268308489955, + "grad_norm": 0.8321191668510437, + "learning_rate": 2.4868809372153137e-06, + "loss": 0.119, + "step": 4784 + }, + { + "epoch": 1.5505508749189891, + "grad_norm": 0.7406947612762451, + "learning_rate": 2.4860063418785773e-06, + "loss": 0.1015, + "step": 4785 + }, + { + "epoch": 1.5508749189889826, + "grad_norm": 0.8466529846191406, + "learning_rate": 2.4851317482545297e-06, + "loss": 0.12, + "step": 4786 + }, + { + "epoch": 1.551198963058976, + "grad_norm": 0.7870974540710449, + "learning_rate": 2.4842571564502117e-06, + "loss": 0.1097, + "step": 4787 + }, + { + "epoch": 1.5515230071289694, + "grad_norm": 0.8245130777359009, + "learning_rate": 2.4833825665726657e-06, + "loss": 0.1146, + "step": 4788 + }, + { + "epoch": 1.551847051198963, + "grad_norm": 0.7951235175132751, + "learning_rate": 2.482507978728933e-06, + "loss": 0.1213, + "step": 4789 + }, + { + "epoch": 1.5521710952689567, + "grad_norm": 0.7778697609901428, + "learning_rate": 2.4816333930260535e-06, + "loss": 0.11, + "step": 4790 + }, + { + "epoch": 1.5524951393389501, + "grad_norm": 0.7982646822929382, + "learning_rate": 2.4807588095710696e-06, + "loss": 0.11, + "step": 4791 + }, + { + "epoch": 1.5528191834089435, + "grad_norm": 0.8741667866706848, + "learning_rate": 2.4798842284710203e-06, + "loss": 0.1156, + "step": 4792 + }, + { + "epoch": 1.5531432274789372, + "grad_norm": 0.8188022971153259, + "learning_rate": 2.4790096498329477e-06, + "loss": 0.1172, + "step": 4793 + }, + { + "epoch": 1.5534672715489306, + "grad_norm": 0.8138577938079834, + "learning_rate": 2.478135073763891e-06, + "loss": 0.1167, + "step": 4794 + }, + { + "epoch": 1.5537913156189243, + "grad_norm": 0.8361755609512329, + "learning_rate": 2.4772605003708885e-06, + "loss": 0.121, + "step": 4795 + }, + { + "epoch": 1.5541153596889177, + "grad_norm": 0.8536872863769531, + "learning_rate": 2.476385929760981e-06, + "loss": 0.1223, + "step": 4796 + }, + { + "epoch": 1.554439403758911, + "grad_norm": 0.754604697227478, + "learning_rate": 2.475511362041207e-06, + "loss": 0.0987, + "step": 4797 + }, + { + "epoch": 1.5547634478289047, + "grad_norm": 0.8049080967903137, + "learning_rate": 2.4746367973186063e-06, + "loss": 0.1185, + "step": 4798 + }, + { + "epoch": 1.5550874918988984, + "grad_norm": 0.7643322944641113, + "learning_rate": 2.473762235700214e-06, + "loss": 0.1061, + "step": 4799 + }, + { + "epoch": 1.5554115359688918, + "grad_norm": 0.8364583849906921, + "learning_rate": 2.472887677293072e-06, + "loss": 0.1135, + "step": 4800 + }, + { + "epoch": 1.5557355800388852, + "grad_norm": 0.8260913491249084, + "learning_rate": 2.4720131222042156e-06, + "loss": 0.1139, + "step": 4801 + }, + { + "epoch": 1.5560596241088787, + "grad_norm": 0.805766224861145, + "learning_rate": 2.4711385705406805e-06, + "loss": 0.1171, + "step": 4802 + }, + { + "epoch": 1.5563836681788723, + "grad_norm": 0.8735179901123047, + "learning_rate": 2.4702640224095066e-06, + "loss": 0.1242, + "step": 4803 + }, + { + "epoch": 1.556707712248866, + "grad_norm": 0.7554102540016174, + "learning_rate": 2.469389477917727e-06, + "loss": 0.1104, + "step": 4804 + }, + { + "epoch": 1.5570317563188594, + "grad_norm": 0.7954150438308716, + "learning_rate": 2.4685149371723806e-06, + "loss": 0.1133, + "step": 4805 + }, + { + "epoch": 1.5573558003888528, + "grad_norm": 0.7790459394454956, + "learning_rate": 2.467640400280501e-06, + "loss": 0.1042, + "step": 4806 + }, + { + "epoch": 1.5576798444588464, + "grad_norm": 0.8028296232223511, + "learning_rate": 2.466765867349122e-06, + "loss": 0.1188, + "step": 4807 + }, + { + "epoch": 1.55800388852884, + "grad_norm": 0.804160475730896, + "learning_rate": 2.46589133848528e-06, + "loss": 0.1187, + "step": 4808 + }, + { + "epoch": 1.5583279325988335, + "grad_norm": 0.7990769743919373, + "learning_rate": 2.465016813796007e-06, + "loss": 0.121, + "step": 4809 + }, + { + "epoch": 1.558651976668827, + "grad_norm": 0.7471599578857422, + "learning_rate": 2.464142293388338e-06, + "loss": 0.1058, + "step": 4810 + }, + { + "epoch": 1.5589760207388204, + "grad_norm": 0.7916015982627869, + "learning_rate": 2.4632677773693048e-06, + "loss": 0.1057, + "step": 4811 + }, + { + "epoch": 1.559300064808814, + "grad_norm": 0.8223981857299805, + "learning_rate": 2.4623932658459406e-06, + "loss": 0.1215, + "step": 4812 + }, + { + "epoch": 1.5596241088788076, + "grad_norm": 0.7924101948738098, + "learning_rate": 2.461518758925277e-06, + "loss": 0.1134, + "step": 4813 + }, + { + "epoch": 1.559948152948801, + "grad_norm": 0.8664736151695251, + "learning_rate": 2.4606442567143434e-06, + "loss": 0.1199, + "step": 4814 + }, + { + "epoch": 1.5602721970187945, + "grad_norm": 0.7690613865852356, + "learning_rate": 2.4597697593201728e-06, + "loss": 0.1054, + "step": 4815 + }, + { + "epoch": 1.560596241088788, + "grad_norm": 0.7565768957138062, + "learning_rate": 2.4588952668497937e-06, + "loss": 0.1106, + "step": 4816 + }, + { + "epoch": 1.5609202851587816, + "grad_norm": 0.7539846301078796, + "learning_rate": 2.4580207794102364e-06, + "loss": 0.1077, + "step": 4817 + }, + { + "epoch": 1.5612443292287752, + "grad_norm": 0.8341878056526184, + "learning_rate": 2.4571462971085293e-06, + "loss": 0.115, + "step": 4818 + }, + { + "epoch": 1.5615683732987686, + "grad_norm": 0.8036201000213623, + "learning_rate": 2.4562718200517003e-06, + "loss": 0.1145, + "step": 4819 + }, + { + "epoch": 1.561892417368762, + "grad_norm": 0.786769688129425, + "learning_rate": 2.4553973483467778e-06, + "loss": 0.1103, + "step": 4820 + }, + { + "epoch": 1.5622164614387557, + "grad_norm": 0.8251572251319885, + "learning_rate": 2.454522882100787e-06, + "loss": 0.114, + "step": 4821 + }, + { + "epoch": 1.5625405055087493, + "grad_norm": 0.8083153367042542, + "learning_rate": 2.453648421420756e-06, + "loss": 0.1152, + "step": 4822 + }, + { + "epoch": 1.5628645495787428, + "grad_norm": 0.7225403785705566, + "learning_rate": 2.4527739664137085e-06, + "loss": 0.1001, + "step": 4823 + }, + { + "epoch": 1.5631885936487362, + "grad_norm": 0.7972891926765442, + "learning_rate": 2.4518995171866717e-06, + "loss": 0.1162, + "step": 4824 + }, + { + "epoch": 1.5635126377187296, + "grad_norm": 0.8370020985603333, + "learning_rate": 2.451025073846668e-06, + "loss": 0.1191, + "step": 4825 + }, + { + "epoch": 1.5638366817887233, + "grad_norm": 0.7791475057601929, + "learning_rate": 2.45015063650072e-06, + "loss": 0.1151, + "step": 4826 + }, + { + "epoch": 1.564160725858717, + "grad_norm": 0.8660632371902466, + "learning_rate": 2.449276205255853e-06, + "loss": 0.1156, + "step": 4827 + }, + { + "epoch": 1.5644847699287103, + "grad_norm": 0.8165311813354492, + "learning_rate": 2.448401780219087e-06, + "loss": 0.1074, + "step": 4828 + }, + { + "epoch": 1.5648088139987038, + "grad_norm": 0.8229710459709167, + "learning_rate": 2.4475273614974437e-06, + "loss": 0.1185, + "step": 4829 + }, + { + "epoch": 1.5651328580686974, + "grad_norm": 0.9050421714782715, + "learning_rate": 2.4466529491979437e-06, + "loss": 0.1183, + "step": 4830 + }, + { + "epoch": 1.5654569021386908, + "grad_norm": 0.8065712451934814, + "learning_rate": 2.445778543427605e-06, + "loss": 0.1073, + "step": 4831 + }, + { + "epoch": 1.5657809462086845, + "grad_norm": 0.7765482664108276, + "learning_rate": 2.4449041442934485e-06, + "loss": 0.1133, + "step": 4832 + }, + { + "epoch": 1.5661049902786779, + "grad_norm": 0.8295376300811768, + "learning_rate": 2.4440297519024906e-06, + "loss": 0.1208, + "step": 4833 + }, + { + "epoch": 1.5664290343486713, + "grad_norm": 0.7947652339935303, + "learning_rate": 2.4431553663617502e-06, + "loss": 0.109, + "step": 4834 + }, + { + "epoch": 1.566753078418665, + "grad_norm": 0.8637305498123169, + "learning_rate": 2.4422809877782417e-06, + "loss": 0.1207, + "step": 4835 + }, + { + "epoch": 1.5670771224886586, + "grad_norm": 0.7860732674598694, + "learning_rate": 2.4414066162589823e-06, + "loss": 0.1154, + "step": 4836 + }, + { + "epoch": 1.567401166558652, + "grad_norm": 0.7914908528327942, + "learning_rate": 2.4405322519109864e-06, + "loss": 0.1124, + "step": 4837 + }, + { + "epoch": 1.5677252106286454, + "grad_norm": 0.8570221662521362, + "learning_rate": 2.4396578948412664e-06, + "loss": 0.1285, + "step": 4838 + }, + { + "epoch": 1.5680492546986389, + "grad_norm": 0.8589009642601013, + "learning_rate": 2.4387835451568355e-06, + "loss": 0.1199, + "step": 4839 + }, + { + "epoch": 1.5683732987686325, + "grad_norm": 0.798724353313446, + "learning_rate": 2.4379092029647067e-06, + "loss": 0.1061, + "step": 4840 + }, + { + "epoch": 1.5686973428386262, + "grad_norm": 0.772853434085846, + "learning_rate": 2.4370348683718906e-06, + "loss": 0.1117, + "step": 4841 + }, + { + "epoch": 1.5690213869086196, + "grad_norm": 0.8315975069999695, + "learning_rate": 2.436160541485396e-06, + "loss": 0.114, + "step": 4842 + }, + { + "epoch": 1.569345430978613, + "grad_norm": 0.893140971660614, + "learning_rate": 2.4352862224122344e-06, + "loss": 0.1195, + "step": 4843 + }, + { + "epoch": 1.5696694750486067, + "grad_norm": 0.7216249108314514, + "learning_rate": 2.4344119112594124e-06, + "loss": 0.097, + "step": 4844 + }, + { + "epoch": 1.5699935191186, + "grad_norm": 0.7690826654434204, + "learning_rate": 2.4335376081339364e-06, + "loss": 0.114, + "step": 4845 + }, + { + "epoch": 1.5703175631885937, + "grad_norm": 0.7511160969734192, + "learning_rate": 2.4326633131428147e-06, + "loss": 0.1046, + "step": 4846 + }, + { + "epoch": 1.5706416072585871, + "grad_norm": 0.7963749170303345, + "learning_rate": 2.4317890263930516e-06, + "loss": 0.1108, + "step": 4847 + }, + { + "epoch": 1.5709656513285806, + "grad_norm": 0.7963820695877075, + "learning_rate": 2.430914747991651e-06, + "loss": 0.1189, + "step": 4848 + }, + { + "epoch": 1.5712896953985742, + "grad_norm": 0.7745254039764404, + "learning_rate": 2.430040478045617e-06, + "loss": 0.1064, + "step": 4849 + }, + { + "epoch": 1.5716137394685679, + "grad_norm": 0.9512690901756287, + "learning_rate": 2.429166216661951e-06, + "loss": 0.1161, + "step": 4850 + }, + { + "epoch": 1.5719377835385613, + "grad_norm": 0.8008152842521667, + "learning_rate": 2.4282919639476544e-06, + "loss": 0.1096, + "step": 4851 + }, + { + "epoch": 1.5722618276085547, + "grad_norm": 0.8169487714767456, + "learning_rate": 2.4274177200097266e-06, + "loss": 0.1197, + "step": 4852 + }, + { + "epoch": 1.5725858716785481, + "grad_norm": 0.8935542702674866, + "learning_rate": 2.426543484955168e-06, + "loss": 0.1219, + "step": 4853 + }, + { + "epoch": 1.5729099157485418, + "grad_norm": 0.824174165725708, + "learning_rate": 2.425669258890975e-06, + "loss": 0.1223, + "step": 4854 + }, + { + "epoch": 1.5732339598185354, + "grad_norm": 0.8504593372344971, + "learning_rate": 2.4247950419241457e-06, + "loss": 0.1183, + "step": 4855 + }, + { + "epoch": 1.5735580038885288, + "grad_norm": 0.8114284873008728, + "learning_rate": 2.4239208341616755e-06, + "loss": 0.1142, + "step": 4856 + }, + { + "epoch": 1.5738820479585223, + "grad_norm": 0.8362430334091187, + "learning_rate": 2.4230466357105575e-06, + "loss": 0.1182, + "step": 4857 + }, + { + "epoch": 1.574206092028516, + "grad_norm": 0.8267870545387268, + "learning_rate": 2.4221724466777874e-06, + "loss": 0.1129, + "step": 4858 + }, + { + "epoch": 1.5745301360985096, + "grad_norm": 0.829098641872406, + "learning_rate": 2.421298267170356e-06, + "loss": 0.1185, + "step": 4859 + }, + { + "epoch": 1.574854180168503, + "grad_norm": 0.8627114295959473, + "learning_rate": 2.420424097295255e-06, + "loss": 0.1207, + "step": 4860 + }, + { + "epoch": 1.5751782242384964, + "grad_norm": 0.8436676263809204, + "learning_rate": 2.419549937159474e-06, + "loss": 0.1097, + "step": 4861 + }, + { + "epoch": 1.5755022683084898, + "grad_norm": 0.8668633699417114, + "learning_rate": 2.418675786870002e-06, + "loss": 0.1217, + "step": 4862 + }, + { + "epoch": 1.5758263123784835, + "grad_norm": 0.7808005809783936, + "learning_rate": 2.4178016465338266e-06, + "loss": 0.1154, + "step": 4863 + }, + { + "epoch": 1.5761503564484771, + "grad_norm": 0.8143179416656494, + "learning_rate": 2.416927516257933e-06, + "loss": 0.1228, + "step": 4864 + }, + { + "epoch": 1.5764744005184705, + "grad_norm": 0.8253973722457886, + "learning_rate": 2.416053396149308e-06, + "loss": 0.1144, + "step": 4865 + }, + { + "epoch": 1.576798444588464, + "grad_norm": 0.833735466003418, + "learning_rate": 2.415179286314934e-06, + "loss": 0.1172, + "step": 4866 + }, + { + "epoch": 1.5771224886584574, + "grad_norm": 0.8309965133666992, + "learning_rate": 2.414305186861795e-06, + "loss": 0.1228, + "step": 4867 + }, + { + "epoch": 1.577446532728451, + "grad_norm": 0.8067078590393066, + "learning_rate": 2.4134310978968716e-06, + "loss": 0.1125, + "step": 4868 + }, + { + "epoch": 1.5777705767984447, + "grad_norm": 0.8044711947441101, + "learning_rate": 2.412557019527143e-06, + "loss": 0.1147, + "step": 4869 + }, + { + "epoch": 1.578094620868438, + "grad_norm": 0.7666822075843811, + "learning_rate": 2.4116829518595896e-06, + "loss": 0.1057, + "step": 4870 + }, + { + "epoch": 1.5784186649384315, + "grad_norm": 0.7714723348617554, + "learning_rate": 2.410808895001187e-06, + "loss": 0.1156, + "step": 4871 + }, + { + "epoch": 1.5787427090084252, + "grad_norm": 0.828205406665802, + "learning_rate": 2.409934849058913e-06, + "loss": 0.1247, + "step": 4872 + }, + { + "epoch": 1.5790667530784188, + "grad_norm": 0.8341466784477234, + "learning_rate": 2.4090608141397417e-06, + "loss": 0.1127, + "step": 4873 + }, + { + "epoch": 1.5793907971484122, + "grad_norm": 0.7735393047332764, + "learning_rate": 2.408186790350645e-06, + "loss": 0.1051, + "step": 4874 + }, + { + "epoch": 1.5797148412184057, + "grad_norm": 0.7524586915969849, + "learning_rate": 2.4073127777985982e-06, + "loss": 0.0987, + "step": 4875 + }, + { + "epoch": 1.580038885288399, + "grad_norm": 0.7769646644592285, + "learning_rate": 2.406438776590568e-06, + "loss": 0.1116, + "step": 4876 + }, + { + "epoch": 1.5803629293583927, + "grad_norm": 0.7380020022392273, + "learning_rate": 2.4055647868335273e-06, + "loss": 0.1086, + "step": 4877 + }, + { + "epoch": 1.5806869734283864, + "grad_norm": 0.8171250224113464, + "learning_rate": 2.404690808634442e-06, + "loss": 0.12, + "step": 4878 + }, + { + "epoch": 1.5810110174983798, + "grad_norm": 0.8413975834846497, + "learning_rate": 2.4038168421002795e-06, + "loss": 0.1187, + "step": 4879 + }, + { + "epoch": 1.5813350615683732, + "grad_norm": 0.7690439820289612, + "learning_rate": 2.4029428873380044e-06, + "loss": 0.1027, + "step": 4880 + }, + { + "epoch": 1.5816591056383669, + "grad_norm": 0.8208738565444946, + "learning_rate": 2.4020689444545796e-06, + "loss": 0.1173, + "step": 4881 + }, + { + "epoch": 1.5819831497083603, + "grad_norm": 0.8537747263908386, + "learning_rate": 2.401195013556969e-06, + "loss": 0.1144, + "step": 4882 + }, + { + "epoch": 1.582307193778354, + "grad_norm": 0.8122548460960388, + "learning_rate": 2.400321094752131e-06, + "loss": 0.1181, + "step": 4883 + }, + { + "epoch": 1.5826312378483474, + "grad_norm": 0.8134654760360718, + "learning_rate": 2.399447188147027e-06, + "loss": 0.1079, + "step": 4884 + }, + { + "epoch": 1.5829552819183408, + "grad_norm": 0.8235934376716614, + "learning_rate": 2.3985732938486137e-06, + "loss": 0.1192, + "step": 4885 + }, + { + "epoch": 1.5832793259883344, + "grad_norm": 0.8062383532524109, + "learning_rate": 2.3976994119638464e-06, + "loss": 0.1149, + "step": 4886 + }, + { + "epoch": 1.583603370058328, + "grad_norm": 0.8052951693534851, + "learning_rate": 2.3968255425996817e-06, + "loss": 0.1108, + "step": 4887 + }, + { + "epoch": 1.5839274141283215, + "grad_norm": 0.8194572925567627, + "learning_rate": 2.3959516858630707e-06, + "loss": 0.115, + "step": 4888 + }, + { + "epoch": 1.584251458198315, + "grad_norm": 0.7949249744415283, + "learning_rate": 2.3950778418609676e-06, + "loss": 0.1005, + "step": 4889 + }, + { + "epoch": 1.5845755022683083, + "grad_norm": 0.8598880767822266, + "learning_rate": 2.39420401070032e-06, + "loss": 0.1188, + "step": 4890 + }, + { + "epoch": 1.584899546338302, + "grad_norm": 0.7758145928382874, + "learning_rate": 2.3933301924880768e-06, + "loss": 0.1084, + "step": 4891 + }, + { + "epoch": 1.5852235904082956, + "grad_norm": 0.9253159761428833, + "learning_rate": 2.3924563873311868e-06, + "loss": 0.1175, + "step": 4892 + }, + { + "epoch": 1.585547634478289, + "grad_norm": 0.9324074983596802, + "learning_rate": 2.391582595336593e-06, + "loss": 0.1233, + "step": 4893 + }, + { + "epoch": 1.5858716785482825, + "grad_norm": 0.8482661843299866, + "learning_rate": 2.3907088166112406e-06, + "loss": 0.1168, + "step": 4894 + }, + { + "epoch": 1.5861957226182761, + "grad_norm": 0.8450517058372498, + "learning_rate": 2.3898350512620696e-06, + "loss": 0.1251, + "step": 4895 + }, + { + "epoch": 1.5865197666882696, + "grad_norm": 0.8631404042243958, + "learning_rate": 2.3889612993960233e-06, + "loss": 0.1223, + "step": 4896 + }, + { + "epoch": 1.5868438107582632, + "grad_norm": 0.8064488768577576, + "learning_rate": 2.3880875611200387e-06, + "loss": 0.1152, + "step": 4897 + }, + { + "epoch": 1.5871678548282566, + "grad_norm": 0.862596869468689, + "learning_rate": 2.3872138365410525e-06, + "loss": 0.1194, + "step": 4898 + }, + { + "epoch": 1.58749189889825, + "grad_norm": 0.7267577052116394, + "learning_rate": 2.3863401257660016e-06, + "loss": 0.1091, + "step": 4899 + }, + { + "epoch": 1.5878159429682437, + "grad_norm": 0.8517301082611084, + "learning_rate": 2.3854664289018182e-06, + "loss": 0.1258, + "step": 4900 + }, + { + "epoch": 1.5881399870382373, + "grad_norm": 0.8594855666160583, + "learning_rate": 2.3845927460554363e-06, + "loss": 0.1183, + "step": 4901 + }, + { + "epoch": 1.5884640311082308, + "grad_norm": 0.8139765858650208, + "learning_rate": 2.383719077333784e-06, + "loss": 0.1216, + "step": 4902 + }, + { + "epoch": 1.5887880751782242, + "grad_norm": 0.7673820853233337, + "learning_rate": 2.382845422843792e-06, + "loss": 0.1149, + "step": 4903 + }, + { + "epoch": 1.5891121192482176, + "grad_norm": 0.790939211845398, + "learning_rate": 2.381971782692386e-06, + "loss": 0.1075, + "step": 4904 + }, + { + "epoch": 1.5894361633182112, + "grad_norm": 0.7800084948539734, + "learning_rate": 2.3810981569864898e-06, + "loss": 0.1068, + "step": 4905 + }, + { + "epoch": 1.589760207388205, + "grad_norm": 0.8078755736351013, + "learning_rate": 2.38022454583303e-06, + "loss": 0.1169, + "step": 4906 + }, + { + "epoch": 1.5900842514581983, + "grad_norm": 0.7707123756408691, + "learning_rate": 2.379350949338924e-06, + "loss": 0.1197, + "step": 4907 + }, + { + "epoch": 1.5904082955281917, + "grad_norm": 0.7820994257926941, + "learning_rate": 2.378477367611096e-06, + "loss": 0.1103, + "step": 4908 + }, + { + "epoch": 1.5907323395981854, + "grad_norm": 0.8051702976226807, + "learning_rate": 2.377603800756461e-06, + "loss": 0.1158, + "step": 4909 + }, + { + "epoch": 1.591056383668179, + "grad_norm": 0.8018122911453247, + "learning_rate": 2.376730248881935e-06, + "loss": 0.1121, + "step": 4910 + }, + { + "epoch": 1.5913804277381725, + "grad_norm": 0.8079573512077332, + "learning_rate": 2.3758567120944345e-06, + "loss": 0.1095, + "step": 4911 + }, + { + "epoch": 1.5917044718081659, + "grad_norm": 0.7187642455101013, + "learning_rate": 2.3749831905008704e-06, + "loss": 0.1, + "step": 4912 + }, + { + "epoch": 1.5920285158781593, + "grad_norm": 0.8316268920898438, + "learning_rate": 2.374109684208153e-06, + "loss": 0.1127, + "step": 4913 + }, + { + "epoch": 1.592352559948153, + "grad_norm": 0.8374061584472656, + "learning_rate": 2.3732361933231917e-06, + "loss": 0.1121, + "step": 4914 + }, + { + "epoch": 1.5926766040181466, + "grad_norm": 0.7737603187561035, + "learning_rate": 2.3723627179528935e-06, + "loss": 0.1065, + "step": 4915 + }, + { + "epoch": 1.59300064808814, + "grad_norm": 0.7794497609138489, + "learning_rate": 2.371489258204163e-06, + "loss": 0.1103, + "step": 4916 + }, + { + "epoch": 1.5933246921581334, + "grad_norm": 0.8009302020072937, + "learning_rate": 2.3706158141839025e-06, + "loss": 0.1145, + "step": 4917 + }, + { + "epoch": 1.5936487362281269, + "grad_norm": 1.001829743385315, + "learning_rate": 2.3697423859990147e-06, + "loss": 0.1099, + "step": 4918 + }, + { + "epoch": 1.5939727802981205, + "grad_norm": 0.8271702527999878, + "learning_rate": 2.3688689737563965e-06, + "loss": 0.1202, + "step": 4919 + }, + { + "epoch": 1.5942968243681142, + "grad_norm": 0.8045162558555603, + "learning_rate": 2.367995577562948e-06, + "loss": 0.1101, + "step": 4920 + }, + { + "epoch": 1.5946208684381076, + "grad_norm": 0.8168801069259644, + "learning_rate": 2.3671221975255616e-06, + "loss": 0.1161, + "step": 4921 + }, + { + "epoch": 1.594944912508101, + "grad_norm": 0.8279063701629639, + "learning_rate": 2.366248833751133e-06, + "loss": 0.1223, + "step": 4922 + }, + { + "epoch": 1.5952689565780946, + "grad_norm": 0.8146759271621704, + "learning_rate": 2.365375486346552e-06, + "loss": 0.1133, + "step": 4923 + }, + { + "epoch": 1.5955930006480883, + "grad_norm": 0.8911711573600769, + "learning_rate": 2.3645021554187086e-06, + "loss": 0.1244, + "step": 4924 + }, + { + "epoch": 1.5959170447180817, + "grad_norm": 0.7659531235694885, + "learning_rate": 2.3636288410744894e-06, + "loss": 0.1093, + "step": 4925 + }, + { + "epoch": 1.5962410887880751, + "grad_norm": 0.8901681303977966, + "learning_rate": 2.3627555434207787e-06, + "loss": 0.1236, + "step": 4926 + }, + { + "epoch": 1.5965651328580686, + "grad_norm": 0.8137619495391846, + "learning_rate": 2.3618822625644624e-06, + "loss": 0.1165, + "step": 4927 + }, + { + "epoch": 1.5968891769280622, + "grad_norm": 0.749087393283844, + "learning_rate": 2.36100899861242e-06, + "loss": 0.1126, + "step": 4928 + }, + { + "epoch": 1.5972132209980558, + "grad_norm": 0.7478780746459961, + "learning_rate": 2.3601357516715297e-06, + "loss": 0.1111, + "step": 4929 + }, + { + "epoch": 1.5975372650680493, + "grad_norm": 0.8310571908950806, + "learning_rate": 2.35926252184867e-06, + "loss": 0.1178, + "step": 4930 + }, + { + "epoch": 1.5978613091380427, + "grad_norm": 0.7701750993728638, + "learning_rate": 2.3583893092507144e-06, + "loss": 0.1099, + "step": 4931 + }, + { + "epoch": 1.5981853532080363, + "grad_norm": 0.7579521536827087, + "learning_rate": 2.3575161139845375e-06, + "loss": 0.1044, + "step": 4932 + }, + { + "epoch": 1.5985093972780298, + "grad_norm": 0.7884305715560913, + "learning_rate": 2.356642936157008e-06, + "loss": 0.1026, + "step": 4933 + }, + { + "epoch": 1.5988334413480234, + "grad_norm": 0.8843073844909668, + "learning_rate": 2.3557697758749966e-06, + "loss": 0.1258, + "step": 4934 + }, + { + "epoch": 1.5991574854180168, + "grad_norm": 0.8237454891204834, + "learning_rate": 2.3548966332453673e-06, + "loss": 0.1171, + "step": 4935 + }, + { + "epoch": 1.5994815294880103, + "grad_norm": 0.8186402320861816, + "learning_rate": 2.3540235083749853e-06, + "loss": 0.1168, + "step": 4936 + }, + { + "epoch": 1.599805573558004, + "grad_norm": 0.8118522763252258, + "learning_rate": 2.3531504013707134e-06, + "loss": 0.1116, + "step": 4937 + }, + { + "epoch": 1.6001296176279975, + "grad_norm": 0.7732279896736145, + "learning_rate": 2.35227731233941e-06, + "loss": 0.1142, + "step": 4938 + }, + { + "epoch": 1.600453661697991, + "grad_norm": 0.8047366738319397, + "learning_rate": 2.3514042413879344e-06, + "loss": 0.1122, + "step": 4939 + }, + { + "epoch": 1.6007777057679844, + "grad_norm": 0.845105767250061, + "learning_rate": 2.350531188623141e-06, + "loss": 0.1134, + "step": 4940 + }, + { + "epoch": 1.6011017498379778, + "grad_norm": 0.7973531484603882, + "learning_rate": 2.349658154151882e-06, + "loss": 0.1211, + "step": 4941 + }, + { + "epoch": 1.6014257939079715, + "grad_norm": 0.8800602555274963, + "learning_rate": 2.3487851380810106e-06, + "loss": 0.1208, + "step": 4942 + }, + { + "epoch": 1.601749837977965, + "grad_norm": 0.7945040464401245, + "learning_rate": 2.3479121405173736e-06, + "loss": 0.1104, + "step": 4943 + }, + { + "epoch": 1.6020738820479585, + "grad_norm": 0.8744386434555054, + "learning_rate": 2.347039161567819e-06, + "loss": 0.1204, + "step": 4944 + }, + { + "epoch": 1.602397926117952, + "grad_norm": 0.7569018006324768, + "learning_rate": 2.34616620133919e-06, + "loss": 0.1081, + "step": 4945 + }, + { + "epoch": 1.6027219701879456, + "grad_norm": 0.7882606387138367, + "learning_rate": 2.345293259938329e-06, + "loss": 0.1089, + "step": 4946 + }, + { + "epoch": 1.6030460142579392, + "grad_norm": 0.7308951616287231, + "learning_rate": 2.3444203374720755e-06, + "loss": 0.1065, + "step": 4947 + }, + { + "epoch": 1.6033700583279327, + "grad_norm": 0.8240647912025452, + "learning_rate": 2.3435474340472657e-06, + "loss": 0.1205, + "step": 4948 + }, + { + "epoch": 1.603694102397926, + "grad_norm": 0.8487390875816345, + "learning_rate": 2.3426745497707364e-06, + "loss": 0.1189, + "step": 4949 + }, + { + "epoch": 1.6040181464679195, + "grad_norm": 0.7914772033691406, + "learning_rate": 2.341801684749318e-06, + "loss": 0.1119, + "step": 4950 + }, + { + "epoch": 1.6043421905379132, + "grad_norm": 0.7847076058387756, + "learning_rate": 2.3409288390898427e-06, + "loss": 0.1144, + "step": 4951 + }, + { + "epoch": 1.6046662346079068, + "grad_norm": 0.8344978094100952, + "learning_rate": 2.3400560128991377e-06, + "loss": 0.1169, + "step": 4952 + }, + { + "epoch": 1.6049902786779002, + "grad_norm": 0.780342698097229, + "learning_rate": 2.3391832062840273e-06, + "loss": 0.1088, + "step": 4953 + }, + { + "epoch": 1.6053143227478937, + "grad_norm": 0.752775251865387, + "learning_rate": 2.338310419351337e-06, + "loss": 0.1006, + "step": 4954 + }, + { + "epoch": 1.605638366817887, + "grad_norm": 0.8063361644744873, + "learning_rate": 2.3374376522078852e-06, + "loss": 0.1119, + "step": 4955 + }, + { + "epoch": 1.6059624108878807, + "grad_norm": 0.8843941688537598, + "learning_rate": 2.3365649049604917e-06, + "loss": 0.1319, + "step": 4956 + }, + { + "epoch": 1.6062864549578744, + "grad_norm": 0.7759780883789062, + "learning_rate": 2.3356921777159705e-06, + "loss": 0.1078, + "step": 4957 + }, + { + "epoch": 1.6066104990278678, + "grad_norm": 0.8233708143234253, + "learning_rate": 2.334819470581137e-06, + "loss": 0.1178, + "step": 4958 + }, + { + "epoch": 1.6069345430978612, + "grad_norm": 0.8936583399772644, + "learning_rate": 2.3339467836628018e-06, + "loss": 0.1333, + "step": 4959 + }, + { + "epoch": 1.6072585871678549, + "grad_norm": 0.8358684182167053, + "learning_rate": 2.3330741170677713e-06, + "loss": 0.1252, + "step": 4960 + }, + { + "epoch": 1.6075826312378485, + "grad_norm": 0.7811366319656372, + "learning_rate": 2.3322014709028545e-06, + "loss": 0.1106, + "step": 4961 + }, + { + "epoch": 1.607906675307842, + "grad_norm": 0.8231356739997864, + "learning_rate": 2.3313288452748515e-06, + "loss": 0.1177, + "step": 4962 + }, + { + "epoch": 1.6082307193778353, + "grad_norm": 0.8499090075492859, + "learning_rate": 2.3304562402905662e-06, + "loss": 0.1125, + "step": 4963 + }, + { + "epoch": 1.6085547634478288, + "grad_norm": 0.8415418863296509, + "learning_rate": 2.329583656056796e-06, + "loss": 0.125, + "step": 4964 + }, + { + "epoch": 1.6088788075178224, + "grad_norm": 0.8120545744895935, + "learning_rate": 2.3287110926803354e-06, + "loss": 0.1154, + "step": 4965 + }, + { + "epoch": 1.609202851587816, + "grad_norm": 0.8539736270904541, + "learning_rate": 2.32783855026798e-06, + "loss": 0.1258, + "step": 4966 + }, + { + "epoch": 1.6095268956578095, + "grad_norm": 0.9000766277313232, + "learning_rate": 2.3269660289265184e-06, + "loss": 0.1315, + "step": 4967 + }, + { + "epoch": 1.609850939727803, + "grad_norm": 0.8382875919342041, + "learning_rate": 2.3260935287627408e-06, + "loss": 0.1247, + "step": 4968 + }, + { + "epoch": 1.6101749837977966, + "grad_norm": 0.8455458283424377, + "learning_rate": 2.3252210498834306e-06, + "loss": 0.1123, + "step": 4969 + }, + { + "epoch": 1.61049902786779, + "grad_norm": 0.8859215974807739, + "learning_rate": 2.3243485923953725e-06, + "loss": 0.1233, + "step": 4970 + }, + { + "epoch": 1.6108230719377836, + "grad_norm": 0.818753182888031, + "learning_rate": 2.323476156405347e-06, + "loss": 0.1148, + "step": 4971 + }, + { + "epoch": 1.611147116007777, + "grad_norm": 0.8630556464195251, + "learning_rate": 2.3226037420201296e-06, + "loss": 0.1189, + "step": 4972 + }, + { + "epoch": 1.6114711600777705, + "grad_norm": 0.7116557955741882, + "learning_rate": 2.3217313493464977e-06, + "loss": 0.0991, + "step": 4973 + }, + { + "epoch": 1.6117952041477641, + "grad_norm": 0.822060763835907, + "learning_rate": 2.320858978491222e-06, + "loss": 0.1177, + "step": 4974 + }, + { + "epoch": 1.6121192482177578, + "grad_norm": 0.8356267213821411, + "learning_rate": 2.319986629561074e-06, + "loss": 0.1149, + "step": 4975 + }, + { + "epoch": 1.6124432922877512, + "grad_norm": 0.8050525784492493, + "learning_rate": 2.3191143026628206e-06, + "loss": 0.1135, + "step": 4976 + }, + { + "epoch": 1.6127673363577446, + "grad_norm": 0.826977014541626, + "learning_rate": 2.318241997903224e-06, + "loss": 0.1191, + "step": 4977 + }, + { + "epoch": 1.613091380427738, + "grad_norm": 0.7832971215248108, + "learning_rate": 2.3173697153890486e-06, + "loss": 0.1048, + "step": 4978 + }, + { + "epoch": 1.6134154244977317, + "grad_norm": 0.7937376499176025, + "learning_rate": 2.31649745522705e-06, + "loss": 0.1147, + "step": 4979 + }, + { + "epoch": 1.6137394685677253, + "grad_norm": 0.734635591506958, + "learning_rate": 2.3156252175239883e-06, + "loss": 0.1054, + "step": 4980 + }, + { + "epoch": 1.6140635126377187, + "grad_norm": 0.9116652011871338, + "learning_rate": 2.3147530023866136e-06, + "loss": 0.1301, + "step": 4981 + }, + { + "epoch": 1.6143875567077122, + "grad_norm": 0.7477267384529114, + "learning_rate": 2.3138808099216796e-06, + "loss": 0.1043, + "step": 4982 + }, + { + "epoch": 1.6147116007777058, + "grad_norm": 0.8117696046829224, + "learning_rate": 2.3130086402359327e-06, + "loss": 0.1129, + "step": 4983 + }, + { + "epoch": 1.6150356448476992, + "grad_norm": 0.86134934425354, + "learning_rate": 2.312136493436117e-06, + "loss": 0.124, + "step": 4984 + }, + { + "epoch": 1.6153596889176929, + "grad_norm": 0.7768048644065857, + "learning_rate": 2.311264369628976e-06, + "loss": 0.1131, + "step": 4985 + }, + { + "epoch": 1.6156837329876863, + "grad_norm": 0.782417356967926, + "learning_rate": 2.3103922689212494e-06, + "loss": 0.1112, + "step": 4986 + }, + { + "epoch": 1.6160077770576797, + "grad_norm": 0.8572040796279907, + "learning_rate": 2.3095201914196732e-06, + "loss": 0.1081, + "step": 4987 + }, + { + "epoch": 1.6163318211276734, + "grad_norm": 0.8067081570625305, + "learning_rate": 2.308648137230982e-06, + "loss": 0.1144, + "step": 4988 + }, + { + "epoch": 1.616655865197667, + "grad_norm": 0.8214498162269592, + "learning_rate": 2.3077761064619062e-06, + "loss": 0.1079, + "step": 4989 + }, + { + "epoch": 1.6169799092676604, + "grad_norm": 0.7868883609771729, + "learning_rate": 2.3069040992191745e-06, + "loss": 0.106, + "step": 4990 + }, + { + "epoch": 1.6173039533376539, + "grad_norm": 0.7447760105133057, + "learning_rate": 2.3060321156095107e-06, + "loss": 0.0981, + "step": 4991 + }, + { + "epoch": 1.6176279974076473, + "grad_norm": 0.8617889881134033, + "learning_rate": 2.3051601557396393e-06, + "loss": 0.1238, + "step": 4992 + }, + { + "epoch": 1.617952041477641, + "grad_norm": 0.8899185061454773, + "learning_rate": 2.3042882197162776e-06, + "loss": 0.1225, + "step": 4993 + }, + { + "epoch": 1.6182760855476346, + "grad_norm": 0.8807141780853271, + "learning_rate": 2.303416307646144e-06, + "loss": 0.1192, + "step": 4994 + }, + { + "epoch": 1.618600129617628, + "grad_norm": 0.7890043258666992, + "learning_rate": 2.3025444196359513e-06, + "loss": 0.1076, + "step": 4995 + }, + { + "epoch": 1.6189241736876214, + "grad_norm": 0.8740971684455872, + "learning_rate": 2.3016725557924095e-06, + "loss": 0.1248, + "step": 4996 + }, + { + "epoch": 1.619248217757615, + "grad_norm": 0.7967113852500916, + "learning_rate": 2.3008007162222273e-06, + "loss": 0.1136, + "step": 4997 + }, + { + "epoch": 1.6195722618276087, + "grad_norm": 0.7795601487159729, + "learning_rate": 2.2999289010321092e-06, + "loss": 0.1077, + "step": 4998 + }, + { + "epoch": 1.6198963058976021, + "grad_norm": 0.7632074356079102, + "learning_rate": 2.299057110328757e-06, + "loss": 0.1107, + "step": 4999 + }, + { + "epoch": 1.6202203499675956, + "grad_norm": 0.8201764822006226, + "learning_rate": 2.298185344218868e-06, + "loss": 0.1121, + "step": 5000 + }, + { + "epoch": 1.620544394037589, + "grad_norm": 0.8989620804786682, + "learning_rate": 2.29731360280914e-06, + "loss": 0.1314, + "step": 5001 + }, + { + "epoch": 1.6208684381075826, + "grad_norm": 0.7793564200401306, + "learning_rate": 2.2964418862062655e-06, + "loss": 0.1135, + "step": 5002 + }, + { + "epoch": 1.6211924821775763, + "grad_norm": 0.8126095533370972, + "learning_rate": 2.2955701945169317e-06, + "loss": 0.1193, + "step": 5003 + }, + { + "epoch": 1.6215165262475697, + "grad_norm": 0.7722181081771851, + "learning_rate": 2.294698527847829e-06, + "loss": 0.1091, + "step": 5004 + }, + { + "epoch": 1.6218405703175631, + "grad_norm": 0.850853443145752, + "learning_rate": 2.2938268863056373e-06, + "loss": 0.1197, + "step": 5005 + }, + { + "epoch": 1.6221646143875565, + "grad_norm": 0.8804651498794556, + "learning_rate": 2.29295526999704e-06, + "loss": 0.1214, + "step": 5006 + }, + { + "epoch": 1.6224886584575502, + "grad_norm": 0.7808780074119568, + "learning_rate": 2.2920836790287134e-06, + "loss": 0.1112, + "step": 5007 + }, + { + "epoch": 1.6228127025275438, + "grad_norm": 0.7318097352981567, + "learning_rate": 2.291212113507331e-06, + "loss": 0.1057, + "step": 5008 + }, + { + "epoch": 1.6231367465975373, + "grad_norm": 0.8634607195854187, + "learning_rate": 2.290340573539565e-06, + "loss": 0.1229, + "step": 5009 + }, + { + "epoch": 1.6234607906675307, + "grad_norm": 0.6968221664428711, + "learning_rate": 2.2894690592320827e-06, + "loss": 0.0988, + "step": 5010 + }, + { + "epoch": 1.6237848347375243, + "grad_norm": 0.816426694393158, + "learning_rate": 2.2885975706915506e-06, + "loss": 0.1122, + "step": 5011 + }, + { + "epoch": 1.624108878807518, + "grad_norm": 0.8346845507621765, + "learning_rate": 2.287726108024628e-06, + "loss": 0.1226, + "step": 5012 + }, + { + "epoch": 1.6244329228775114, + "grad_norm": 0.8421067595481873, + "learning_rate": 2.2868546713379755e-06, + "loss": 0.115, + "step": 5013 + }, + { + "epoch": 1.6247569669475048, + "grad_norm": 0.8678304553031921, + "learning_rate": 2.285983260738248e-06, + "loss": 0.1262, + "step": 5014 + }, + { + "epoch": 1.6250810110174982, + "grad_norm": 0.7817627191543579, + "learning_rate": 2.285111876332097e-06, + "loss": 0.1139, + "step": 5015 + }, + { + "epoch": 1.625405055087492, + "grad_norm": 0.8300804495811462, + "learning_rate": 2.2842405182261725e-06, + "loss": 0.1191, + "step": 5016 + }, + { + "epoch": 1.6257290991574855, + "grad_norm": 0.783602237701416, + "learning_rate": 2.283369186527119e-06, + "loss": 0.1142, + "step": 5017 + }, + { + "epoch": 1.626053143227479, + "grad_norm": 0.7819046974182129, + "learning_rate": 2.282497881341581e-06, + "loss": 0.1108, + "step": 5018 + }, + { + "epoch": 1.6263771872974724, + "grad_norm": 0.7916659116744995, + "learning_rate": 2.2816266027761965e-06, + "loss": 0.1134, + "step": 5019 + }, + { + "epoch": 1.626701231367466, + "grad_norm": 0.7798147201538086, + "learning_rate": 2.280755350937602e-06, + "loss": 0.1078, + "step": 5020 + }, + { + "epoch": 1.6270252754374595, + "grad_norm": 0.8133070468902588, + "learning_rate": 2.27988412593243e-06, + "loss": 0.116, + "step": 5021 + }, + { + "epoch": 1.627349319507453, + "grad_norm": 0.8206139802932739, + "learning_rate": 2.279012927867309e-06, + "loss": 0.1211, + "step": 5022 + }, + { + "epoch": 1.6276733635774465, + "grad_norm": 0.8827968835830688, + "learning_rate": 2.2781417568488677e-06, + "loss": 0.1236, + "step": 5023 + }, + { + "epoch": 1.62799740764744, + "grad_norm": 0.8431521654129028, + "learning_rate": 2.277270612983726e-06, + "loss": 0.1168, + "step": 5024 + }, + { + "epoch": 1.6283214517174336, + "grad_norm": 0.7578520774841309, + "learning_rate": 2.2763994963785066e-06, + "loss": 0.1064, + "step": 5025 + }, + { + "epoch": 1.6286454957874272, + "grad_norm": 0.7708545923233032, + "learning_rate": 2.2755284071398243e-06, + "loss": 0.1068, + "step": 5026 + }, + { + "epoch": 1.6289695398574207, + "grad_norm": 0.8634827733039856, + "learning_rate": 2.2746573453742905e-06, + "loss": 0.1206, + "step": 5027 + }, + { + "epoch": 1.629293583927414, + "grad_norm": 0.7665538191795349, + "learning_rate": 2.2737863111885175e-06, + "loss": 0.1117, + "step": 5028 + }, + { + "epoch": 1.6296176279974075, + "grad_norm": 0.7242918610572815, + "learning_rate": 2.2729153046891095e-06, + "loss": 0.1006, + "step": 5029 + }, + { + "epoch": 1.6299416720674011, + "grad_norm": 0.9581153988838196, + "learning_rate": 2.2720443259826702e-06, + "loss": 0.1225, + "step": 5030 + }, + { + "epoch": 1.6302657161373948, + "grad_norm": 0.8096961975097656, + "learning_rate": 2.2711733751757983e-06, + "loss": 0.109, + "step": 5031 + }, + { + "epoch": 1.6305897602073882, + "grad_norm": 0.8381730318069458, + "learning_rate": 2.27030245237509e-06, + "loss": 0.1175, + "step": 5032 + }, + { + "epoch": 1.6309138042773816, + "grad_norm": 0.7741770148277283, + "learning_rate": 2.2694315576871384e-06, + "loss": 0.1063, + "step": 5033 + }, + { + "epoch": 1.6312378483473753, + "grad_norm": 0.962214469909668, + "learning_rate": 2.268560691218531e-06, + "loss": 0.1262, + "step": 5034 + }, + { + "epoch": 1.6315618924173687, + "grad_norm": 0.8154774308204651, + "learning_rate": 2.2676898530758554e-06, + "loss": 0.1133, + "step": 5035 + }, + { + "epoch": 1.6318859364873624, + "grad_norm": 0.7884747385978699, + "learning_rate": 2.266819043365692e-06, + "loss": 0.111, + "step": 5036 + }, + { + "epoch": 1.6322099805573558, + "grad_norm": 0.8193156123161316, + "learning_rate": 2.265948262194621e-06, + "loss": 0.1207, + "step": 5037 + }, + { + "epoch": 1.6325340246273492, + "grad_norm": 0.7977531552314758, + "learning_rate": 2.2650775096692176e-06, + "loss": 0.1132, + "step": 5038 + }, + { + "epoch": 1.6328580686973428, + "grad_norm": 0.8377412557601929, + "learning_rate": 2.2642067858960514e-06, + "loss": 0.116, + "step": 5039 + }, + { + "epoch": 1.6331821127673365, + "grad_norm": 0.8771550059318542, + "learning_rate": 2.263336090981693e-06, + "loss": 0.1228, + "step": 5040 + }, + { + "epoch": 1.63350615683733, + "grad_norm": 0.8469054698944092, + "learning_rate": 2.2624654250327054e-06, + "loss": 0.1166, + "step": 5041 + }, + { + "epoch": 1.6338302009073233, + "grad_norm": 0.7819681167602539, + "learning_rate": 2.2615947881556506e-06, + "loss": 0.111, + "step": 5042 + }, + { + "epoch": 1.6341542449773168, + "grad_norm": 0.7739814519882202, + "learning_rate": 2.2607241804570864e-06, + "loss": 0.1136, + "step": 5043 + }, + { + "epoch": 1.6344782890473104, + "grad_norm": 0.7743594646453857, + "learning_rate": 2.2598536020435644e-06, + "loss": 0.1103, + "step": 5044 + }, + { + "epoch": 1.634802333117304, + "grad_norm": 0.8755719065666199, + "learning_rate": 2.258983053021638e-06, + "loss": 0.1293, + "step": 5045 + }, + { + "epoch": 1.6351263771872975, + "grad_norm": 0.8436084985733032, + "learning_rate": 2.2581125334978517e-06, + "loss": 0.1123, + "step": 5046 + }, + { + "epoch": 1.635450421257291, + "grad_norm": 0.7783282995223999, + "learning_rate": 2.257242043578751e-06, + "loss": 0.1098, + "step": 5047 + }, + { + "epoch": 1.6357744653272845, + "grad_norm": 0.7676630616188049, + "learning_rate": 2.2563715833708726e-06, + "loss": 0.109, + "step": 5048 + }, + { + "epoch": 1.6360985093972782, + "grad_norm": 0.8583880662918091, + "learning_rate": 2.255501152980755e-06, + "loss": 0.1196, + "step": 5049 + }, + { + "epoch": 1.6364225534672716, + "grad_norm": 0.8024161458015442, + "learning_rate": 2.2546307525149293e-06, + "loss": 0.1156, + "step": 5050 + }, + { + "epoch": 1.636746597537265, + "grad_norm": 0.8013902306556702, + "learning_rate": 2.253760382079924e-06, + "loss": 0.1083, + "step": 5051 + }, + { + "epoch": 1.6370706416072585, + "grad_norm": 0.8168145418167114, + "learning_rate": 2.2528900417822636e-06, + "loss": 0.1235, + "step": 5052 + }, + { + "epoch": 1.637394685677252, + "grad_norm": 0.8024635910987854, + "learning_rate": 2.2520197317284702e-06, + "loss": 0.1162, + "step": 5053 + }, + { + "epoch": 1.6377187297472457, + "grad_norm": 0.7219820618629456, + "learning_rate": 2.2511494520250613e-06, + "loss": 0.1, + "step": 5054 + }, + { + "epoch": 1.6380427738172392, + "grad_norm": 0.8405579328536987, + "learning_rate": 2.2502792027785508e-06, + "loss": 0.1219, + "step": 5055 + }, + { + "epoch": 1.6383668178872326, + "grad_norm": 0.8250084519386292, + "learning_rate": 2.249408984095447e-06, + "loss": 0.1162, + "step": 5056 + }, + { + "epoch": 1.638690861957226, + "grad_norm": 0.8255623579025269, + "learning_rate": 2.248538796082259e-06, + "loss": 0.1113, + "step": 5057 + }, + { + "epoch": 1.6390149060272197, + "grad_norm": 0.8642399311065674, + "learning_rate": 2.2476686388454867e-06, + "loss": 0.1145, + "step": 5058 + }, + { + "epoch": 1.6393389500972133, + "grad_norm": 0.7692127823829651, + "learning_rate": 2.2467985124916314e-06, + "loss": 0.1139, + "step": 5059 + }, + { + "epoch": 1.6396629941672067, + "grad_norm": 0.866449236869812, + "learning_rate": 2.2459284171271863e-06, + "loss": 0.1216, + "step": 5060 + }, + { + "epoch": 1.6399870382372002, + "grad_norm": 0.8447216153144836, + "learning_rate": 2.2450583528586437e-06, + "loss": 0.1135, + "step": 5061 + }, + { + "epoch": 1.6403110823071938, + "grad_norm": 0.8008463978767395, + "learning_rate": 2.244188319792491e-06, + "loss": 0.109, + "step": 5062 + }, + { + "epoch": 1.6406351263771874, + "grad_norm": 0.8315277695655823, + "learning_rate": 2.243318318035211e-06, + "loss": 0.1131, + "step": 5063 + }, + { + "epoch": 1.6409591704471809, + "grad_norm": 0.7836526036262512, + "learning_rate": 2.2424483476932847e-06, + "loss": 0.1083, + "step": 5064 + }, + { + "epoch": 1.6412832145171743, + "grad_norm": 0.7713335752487183, + "learning_rate": 2.241578408873186e-06, + "loss": 0.1083, + "step": 5065 + }, + { + "epoch": 1.6416072585871677, + "grad_norm": 0.8240923881530762, + "learning_rate": 2.2407085016813895e-06, + "loss": 0.1143, + "step": 5066 + }, + { + "epoch": 1.6419313026571614, + "grad_norm": 0.8240248560905457, + "learning_rate": 2.239838626224361e-06, + "loss": 0.1105, + "step": 5067 + }, + { + "epoch": 1.642255346727155, + "grad_norm": 0.8505247235298157, + "learning_rate": 2.2389687826085675e-06, + "loss": 0.1148, + "step": 5068 + }, + { + "epoch": 1.6425793907971484, + "grad_norm": 0.7987221479415894, + "learning_rate": 2.238098970940468e-06, + "loss": 0.1047, + "step": 5069 + }, + { + "epoch": 1.6429034348671419, + "grad_norm": 0.8047895431518555, + "learning_rate": 2.2372291913265177e-06, + "loss": 0.12, + "step": 5070 + }, + { + "epoch": 1.6432274789371355, + "grad_norm": 0.8510376811027527, + "learning_rate": 2.236359443873172e-06, + "loss": 0.1134, + "step": 5071 + }, + { + "epoch": 1.643551523007129, + "grad_norm": 0.7813231348991394, + "learning_rate": 2.2354897286868773e-06, + "loss": 0.1093, + "step": 5072 + }, + { + "epoch": 1.6438755670771226, + "grad_norm": 0.7600345611572266, + "learning_rate": 2.23462004587408e-06, + "loss": 0.1041, + "step": 5073 + }, + { + "epoch": 1.644199611147116, + "grad_norm": 0.8008465766906738, + "learning_rate": 2.233750395541219e-06, + "loss": 0.1094, + "step": 5074 + }, + { + "epoch": 1.6445236552171094, + "grad_norm": 0.7862761616706848, + "learning_rate": 2.2328807777947323e-06, + "loss": 0.1045, + "step": 5075 + }, + { + "epoch": 1.644847699287103, + "grad_norm": 0.9118563532829285, + "learning_rate": 2.232011192741053e-06, + "loss": 0.1293, + "step": 5076 + }, + { + "epoch": 1.6451717433570967, + "grad_norm": 0.8136278986930847, + "learning_rate": 2.2311416404866085e-06, + "loss": 0.113, + "step": 5077 + }, + { + "epoch": 1.6454957874270901, + "grad_norm": 0.7324303388595581, + "learning_rate": 2.2302721211378254e-06, + "loss": 0.1044, + "step": 5078 + }, + { + "epoch": 1.6458198314970836, + "grad_norm": 0.8289517760276794, + "learning_rate": 2.2294026348011223e-06, + "loss": 0.1239, + "step": 5079 + }, + { + "epoch": 1.646143875567077, + "grad_norm": 0.8013531565666199, + "learning_rate": 2.2285331815829187e-06, + "loss": 0.1104, + "step": 5080 + }, + { + "epoch": 1.6464679196370706, + "grad_norm": 0.791636049747467, + "learning_rate": 2.227663761589625e-06, + "loss": 0.1078, + "step": 5081 + }, + { + "epoch": 1.6467919637070643, + "grad_norm": 0.7839667797088623, + "learning_rate": 2.2267943749276503e-06, + "loss": 0.1129, + "step": 5082 + }, + { + "epoch": 1.6471160077770577, + "grad_norm": 0.674390435218811, + "learning_rate": 2.225925021703399e-06, + "loss": 0.095, + "step": 5083 + }, + { + "epoch": 1.6474400518470511, + "grad_norm": 0.8988636136054993, + "learning_rate": 2.2250557020232724e-06, + "loss": 0.122, + "step": 5084 + }, + { + "epoch": 1.6477640959170448, + "grad_norm": 0.8070955872535706, + "learning_rate": 2.2241864159936664e-06, + "loss": 0.1136, + "step": 5085 + }, + { + "epoch": 1.6480881399870384, + "grad_norm": 0.8262961506843567, + "learning_rate": 2.223317163720973e-06, + "loss": 0.118, + "step": 5086 + }, + { + "epoch": 1.6484121840570318, + "grad_norm": 0.9083569645881653, + "learning_rate": 2.222447945311579e-06, + "loss": 0.1346, + "step": 5087 + }, + { + "epoch": 1.6487362281270252, + "grad_norm": 0.8316741585731506, + "learning_rate": 2.2215787608718706e-06, + "loss": 0.1166, + "step": 5088 + }, + { + "epoch": 1.6490602721970187, + "grad_norm": 0.7819686532020569, + "learning_rate": 2.220709610508226e-06, + "loss": 0.1106, + "step": 5089 + }, + { + "epoch": 1.6493843162670123, + "grad_norm": 0.7735289931297302, + "learning_rate": 2.2198404943270217e-06, + "loss": 0.1057, + "step": 5090 + }, + { + "epoch": 1.649708360337006, + "grad_norm": 0.8702659606933594, + "learning_rate": 2.218971412434628e-06, + "loss": 0.12, + "step": 5091 + }, + { + "epoch": 1.6500324044069994, + "grad_norm": 0.8448833227157593, + "learning_rate": 2.218102364937414e-06, + "loss": 0.1131, + "step": 5092 + }, + { + "epoch": 1.6503564484769928, + "grad_norm": 0.8449171185493469, + "learning_rate": 2.2172333519417415e-06, + "loss": 0.1238, + "step": 5093 + }, + { + "epoch": 1.6506804925469862, + "grad_norm": 0.7623543739318848, + "learning_rate": 2.2163643735539688e-06, + "loss": 0.1144, + "step": 5094 + }, + { + "epoch": 1.6510045366169799, + "grad_norm": 0.7137896418571472, + "learning_rate": 2.2154954298804514e-06, + "loss": 0.0932, + "step": 5095 + }, + { + "epoch": 1.6513285806869735, + "grad_norm": 0.7739506363868713, + "learning_rate": 2.214626521027538e-06, + "loss": 0.115, + "step": 5096 + }, + { + "epoch": 1.651652624756967, + "grad_norm": 0.8516456484794617, + "learning_rate": 2.213757647101577e-06, + "loss": 0.1153, + "step": 5097 + }, + { + "epoch": 1.6519766688269604, + "grad_norm": 0.7973065972328186, + "learning_rate": 2.2128888082089093e-06, + "loss": 0.1111, + "step": 5098 + }, + { + "epoch": 1.652300712896954, + "grad_norm": 0.8301618695259094, + "learning_rate": 2.2120200044558705e-06, + "loss": 0.1171, + "step": 5099 + }, + { + "epoch": 1.6526247569669477, + "grad_norm": 0.8303407430648804, + "learning_rate": 2.2111512359487967e-06, + "loss": 0.1091, + "step": 5100 + }, + { + "epoch": 1.652948801036941, + "grad_norm": 0.8207636475563049, + "learning_rate": 2.2102825027940143e-06, + "loss": 0.118, + "step": 5101 + }, + { + "epoch": 1.6532728451069345, + "grad_norm": 0.7522189617156982, + "learning_rate": 2.2094138050978496e-06, + "loss": 0.1047, + "step": 5102 + }, + { + "epoch": 1.653596889176928, + "grad_norm": 1.0848859548568726, + "learning_rate": 2.2085451429666215e-06, + "loss": 0.118, + "step": 5103 + }, + { + "epoch": 1.6539209332469216, + "grad_norm": 0.8184290528297424, + "learning_rate": 2.207676516506647e-06, + "loss": 0.1085, + "step": 5104 + }, + { + "epoch": 1.6542449773169152, + "grad_norm": 0.8451368808746338, + "learning_rate": 2.206807925824237e-06, + "loss": 0.1212, + "step": 5105 + }, + { + "epoch": 1.6545690213869086, + "grad_norm": 0.8918312788009644, + "learning_rate": 2.205939371025698e-06, + "loss": 0.1222, + "step": 5106 + }, + { + "epoch": 1.654893065456902, + "grad_norm": 0.8274011015892029, + "learning_rate": 2.205070852217334e-06, + "loss": 0.1167, + "step": 5107 + }, + { + "epoch": 1.6552171095268955, + "grad_norm": 0.84331214427948, + "learning_rate": 2.204202369505441e-06, + "loss": 0.1171, + "step": 5108 + }, + { + "epoch": 1.6555411535968891, + "grad_norm": 0.7344668507575989, + "learning_rate": 2.203333922996316e-06, + "loss": 0.0974, + "step": 5109 + }, + { + "epoch": 1.6558651976668828, + "grad_norm": 0.8344709277153015, + "learning_rate": 2.202465512796247e-06, + "loss": 0.1152, + "step": 5110 + }, + { + "epoch": 1.6561892417368762, + "grad_norm": 0.7639608383178711, + "learning_rate": 2.2015971390115172e-06, + "loss": 0.1025, + "step": 5111 + }, + { + "epoch": 1.6565132858068696, + "grad_norm": 0.7677879929542542, + "learning_rate": 2.2007288017484105e-06, + "loss": 0.1072, + "step": 5112 + }, + { + "epoch": 1.6568373298768633, + "grad_norm": 0.8520178198814392, + "learning_rate": 2.1998605011131997e-06, + "loss": 0.1175, + "step": 5113 + }, + { + "epoch": 1.657161373946857, + "grad_norm": 0.8265737891197205, + "learning_rate": 2.19899223721216e-06, + "loss": 0.1137, + "step": 5114 + }, + { + "epoch": 1.6574854180168503, + "grad_norm": 0.7737930417060852, + "learning_rate": 2.1981240101515548e-06, + "loss": 0.1098, + "step": 5115 + }, + { + "epoch": 1.6578094620868438, + "grad_norm": 0.8008612394332886, + "learning_rate": 2.1972558200376497e-06, + "loss": 0.1057, + "step": 5116 + }, + { + "epoch": 1.6581335061568372, + "grad_norm": 0.8462890982627869, + "learning_rate": 2.1963876669767008e-06, + "loss": 0.1181, + "step": 5117 + }, + { + "epoch": 1.6584575502268308, + "grad_norm": 0.8264334797859192, + "learning_rate": 2.1955195510749614e-06, + "loss": 0.124, + "step": 5118 + }, + { + "epoch": 1.6587815942968245, + "grad_norm": 0.8477981090545654, + "learning_rate": 2.1946514724386827e-06, + "loss": 0.1155, + "step": 5119 + }, + { + "epoch": 1.659105638366818, + "grad_norm": 0.7597894668579102, + "learning_rate": 2.1937834311741066e-06, + "loss": 0.1052, + "step": 5120 + }, + { + "epoch": 1.6594296824368113, + "grad_norm": 0.7452861666679382, + "learning_rate": 2.192915427387475e-06, + "loss": 0.1054, + "step": 5121 + }, + { + "epoch": 1.659753726506805, + "grad_norm": 0.7864974141120911, + "learning_rate": 2.1920474611850225e-06, + "loss": 0.109, + "step": 5122 + }, + { + "epoch": 1.6600777705767984, + "grad_norm": 0.769801914691925, + "learning_rate": 2.1911795326729784e-06, + "loss": 0.1022, + "step": 5123 + }, + { + "epoch": 1.660401814646792, + "grad_norm": 0.7793107032775879, + "learning_rate": 2.190311641957571e-06, + "loss": 0.105, + "step": 5124 + }, + { + "epoch": 1.6607258587167855, + "grad_norm": 0.8600584864616394, + "learning_rate": 2.18944378914502e-06, + "loss": 0.1248, + "step": 5125 + }, + { + "epoch": 1.6610499027867789, + "grad_norm": 0.714015007019043, + "learning_rate": 2.188575974341543e-06, + "loss": 0.0992, + "step": 5126 + }, + { + "epoch": 1.6613739468567725, + "grad_norm": 0.8324199914932251, + "learning_rate": 2.1877081976533515e-06, + "loss": 0.12, + "step": 5127 + }, + { + "epoch": 1.6616979909267662, + "grad_norm": 0.7645648121833801, + "learning_rate": 2.186840459186654e-06, + "loss": 0.1041, + "step": 5128 + }, + { + "epoch": 1.6620220349967596, + "grad_norm": 0.9017804265022278, + "learning_rate": 2.185972759047653e-06, + "loss": 0.13, + "step": 5129 + }, + { + "epoch": 1.662346079066753, + "grad_norm": 0.7904400825500488, + "learning_rate": 2.1851050973425454e-06, + "loss": 0.1126, + "step": 5130 + }, + { + "epoch": 1.6626701231367464, + "grad_norm": 0.7225543856620789, + "learning_rate": 2.1842374741775262e-06, + "loss": 0.1025, + "step": 5131 + }, + { + "epoch": 1.66299416720674, + "grad_norm": 0.8221871256828308, + "learning_rate": 2.1833698896587816e-06, + "loss": 0.1158, + "step": 5132 + }, + { + "epoch": 1.6633182112767337, + "grad_norm": 0.8409931659698486, + "learning_rate": 2.1825023438924995e-06, + "loss": 0.1221, + "step": 5133 + }, + { + "epoch": 1.6636422553467272, + "grad_norm": 0.7728251814842224, + "learning_rate": 2.1816348369848555e-06, + "loss": 0.1082, + "step": 5134 + }, + { + "epoch": 1.6639662994167206, + "grad_norm": 0.7423127889633179, + "learning_rate": 2.180767369042026e-06, + "loss": 0.0987, + "step": 5135 + }, + { + "epoch": 1.6642903434867142, + "grad_norm": 0.8032506108283997, + "learning_rate": 2.1798999401701802e-06, + "loss": 0.1088, + "step": 5136 + }, + { + "epoch": 1.6646143875567079, + "grad_norm": 0.8562729954719543, + "learning_rate": 2.1790325504754827e-06, + "loss": 0.1244, + "step": 5137 + }, + { + "epoch": 1.6649384316267013, + "grad_norm": 0.7597046494483948, + "learning_rate": 2.1781652000640947e-06, + "loss": 0.1047, + "step": 5138 + }, + { + "epoch": 1.6652624756966947, + "grad_norm": 0.8073447942733765, + "learning_rate": 2.177297889042169e-06, + "loss": 0.1147, + "step": 5139 + }, + { + "epoch": 1.6655865197666881, + "grad_norm": 0.7578743696212769, + "learning_rate": 2.1764306175158588e-06, + "loss": 0.1047, + "step": 5140 + }, + { + "epoch": 1.6659105638366818, + "grad_norm": 0.8077635765075684, + "learning_rate": 2.1755633855913086e-06, + "loss": 0.1157, + "step": 5141 + }, + { + "epoch": 1.6662346079066754, + "grad_norm": 0.7816113233566284, + "learning_rate": 2.174696193374658e-06, + "loss": 0.1126, + "step": 5142 + }, + { + "epoch": 1.6665586519766689, + "grad_norm": 0.8228937387466431, + "learning_rate": 2.173829040972046e-06, + "loss": 0.1208, + "step": 5143 + }, + { + "epoch": 1.6668826960466623, + "grad_norm": 0.8054512143135071, + "learning_rate": 2.1729619284896e-06, + "loss": 0.1165, + "step": 5144 + }, + { + "epoch": 1.6672067401166557, + "grad_norm": 0.712684690952301, + "learning_rate": 2.1720948560334492e-06, + "loss": 0.0987, + "step": 5145 + }, + { + "epoch": 1.6675307841866494, + "grad_norm": 0.9595630764961243, + "learning_rate": 2.171227823709713e-06, + "loss": 0.1372, + "step": 5146 + }, + { + "epoch": 1.667854828256643, + "grad_norm": 0.8455243706703186, + "learning_rate": 2.1703608316245092e-06, + "loss": 0.1212, + "step": 5147 + }, + { + "epoch": 1.6681788723266364, + "grad_norm": 0.7891972661018372, + "learning_rate": 2.169493879883948e-06, + "loss": 0.1104, + "step": 5148 + }, + { + "epoch": 1.6685029163966298, + "grad_norm": 0.8356092572212219, + "learning_rate": 2.168626968594136e-06, + "loss": 0.1129, + "step": 5149 + }, + { + "epoch": 1.6688269604666235, + "grad_norm": 0.7961825728416443, + "learning_rate": 2.167760097861176e-06, + "loss": 0.1114, + "step": 5150 + }, + { + "epoch": 1.6691510045366171, + "grad_norm": 0.7900418043136597, + "learning_rate": 2.1668932677911624e-06, + "loss": 0.1059, + "step": 5151 + }, + { + "epoch": 1.6694750486066106, + "grad_norm": 0.7676316499710083, + "learning_rate": 2.166026478490189e-06, + "loss": 0.1045, + "step": 5152 + }, + { + "epoch": 1.669799092676604, + "grad_norm": 0.8411489725112915, + "learning_rate": 2.1651597300643418e-06, + "loss": 0.1065, + "step": 5153 + }, + { + "epoch": 1.6701231367465974, + "grad_norm": 0.7867658138275146, + "learning_rate": 2.1642930226197012e-06, + "loss": 0.1154, + "step": 5154 + }, + { + "epoch": 1.670447180816591, + "grad_norm": 0.8002599477767944, + "learning_rate": 2.1634263562623454e-06, + "loss": 0.1102, + "step": 5155 + }, + { + "epoch": 1.6707712248865847, + "grad_norm": 0.703038215637207, + "learning_rate": 2.162559731098345e-06, + "loss": 0.0976, + "step": 5156 + }, + { + "epoch": 1.6710952689565781, + "grad_norm": 0.8850612044334412, + "learning_rate": 2.161693147233767e-06, + "loss": 0.1279, + "step": 5157 + }, + { + "epoch": 1.6714193130265715, + "grad_norm": 0.890364408493042, + "learning_rate": 2.1608266047746723e-06, + "loss": 0.1266, + "step": 5158 + }, + { + "epoch": 1.6717433570965652, + "grad_norm": 0.7292555570602417, + "learning_rate": 2.1599601038271186e-06, + "loss": 0.1019, + "step": 5159 + }, + { + "epoch": 1.6720674011665586, + "grad_norm": 0.8944790959358215, + "learning_rate": 2.1590936444971563e-06, + "loss": 0.123, + "step": 5160 + }, + { + "epoch": 1.6723914452365523, + "grad_norm": 0.8260093331336975, + "learning_rate": 2.1582272268908307e-06, + "loss": 0.122, + "step": 5161 + }, + { + "epoch": 1.6727154893065457, + "grad_norm": 0.7920061945915222, + "learning_rate": 2.1573608511141845e-06, + "loss": 0.1176, + "step": 5162 + }, + { + "epoch": 1.673039533376539, + "grad_norm": 0.8595673441886902, + "learning_rate": 2.1564945172732523e-06, + "loss": 0.1229, + "step": 5163 + }, + { + "epoch": 1.6733635774465327, + "grad_norm": 0.7942947745323181, + "learning_rate": 2.155628225474067e-06, + "loss": 0.1139, + "step": 5164 + }, + { + "epoch": 1.6736876215165264, + "grad_norm": 0.808169960975647, + "learning_rate": 2.154761975822653e-06, + "loss": 0.1129, + "step": 5165 + }, + { + "epoch": 1.6740116655865198, + "grad_norm": 0.7921257019042969, + "learning_rate": 2.1538957684250303e-06, + "loss": 0.1118, + "step": 5166 + }, + { + "epoch": 1.6743357096565132, + "grad_norm": 0.8302102088928223, + "learning_rate": 2.1530296033872155e-06, + "loss": 0.1158, + "step": 5167 + }, + { + "epoch": 1.6746597537265067, + "grad_norm": 0.7993295788764954, + "learning_rate": 2.152163480815218e-06, + "loss": 0.1146, + "step": 5168 + }, + { + "epoch": 1.6749837977965003, + "grad_norm": 0.785004198551178, + "learning_rate": 2.151297400815044e-06, + "loss": 0.115, + "step": 5169 + }, + { + "epoch": 1.675307841866494, + "grad_norm": 0.7763941884040833, + "learning_rate": 2.150431363492691e-06, + "loss": 0.1143, + "step": 5170 + }, + { + "epoch": 1.6756318859364874, + "grad_norm": 0.8561198711395264, + "learning_rate": 2.1495653689541562e-06, + "loss": 0.1224, + "step": 5171 + }, + { + "epoch": 1.6759559300064808, + "grad_norm": 0.7910784482955933, + "learning_rate": 2.1486994173054276e-06, + "loss": 0.1122, + "step": 5172 + }, + { + "epoch": 1.6762799740764744, + "grad_norm": 0.821010947227478, + "learning_rate": 2.1478335086524885e-06, + "loss": 0.1088, + "step": 5173 + }, + { + "epoch": 1.6766040181464679, + "grad_norm": 0.8397283554077148, + "learning_rate": 2.14696764310132e-06, + "loss": 0.1194, + "step": 5174 + }, + { + "epoch": 1.6769280622164615, + "grad_norm": 0.7711231112480164, + "learning_rate": 2.1461018207578932e-06, + "loss": 0.1017, + "step": 5175 + }, + { + "epoch": 1.677252106286455, + "grad_norm": 0.7861514091491699, + "learning_rate": 2.1452360417281786e-06, + "loss": 0.1092, + "step": 5176 + }, + { + "epoch": 1.6775761503564484, + "grad_norm": 0.7565597891807556, + "learning_rate": 2.144370306118138e-06, + "loss": 0.1043, + "step": 5177 + }, + { + "epoch": 1.677900194426442, + "grad_norm": 0.7293825149536133, + "learning_rate": 2.143504614033728e-06, + "loss": 0.1027, + "step": 5178 + }, + { + "epoch": 1.6782242384964356, + "grad_norm": 0.8587285280227661, + "learning_rate": 2.142638965580903e-06, + "loss": 0.1151, + "step": 5179 + }, + { + "epoch": 1.678548282566429, + "grad_norm": 0.8521298766136169, + "learning_rate": 2.141773360865609e-06, + "loss": 0.1197, + "step": 5180 + }, + { + "epoch": 1.6788723266364225, + "grad_norm": 0.8533197641372681, + "learning_rate": 2.1409077999937883e-06, + "loss": 0.1212, + "step": 5181 + }, + { + "epoch": 1.679196370706416, + "grad_norm": 0.8288674354553223, + "learning_rate": 2.1400422830713752e-06, + "loss": 0.1158, + "step": 5182 + }, + { + "epoch": 1.6795204147764096, + "grad_norm": 0.8852332234382629, + "learning_rate": 2.1391768102043032e-06, + "loss": 0.1265, + "step": 5183 + }, + { + "epoch": 1.6798444588464032, + "grad_norm": 0.8235684037208557, + "learning_rate": 2.1383113814984967e-06, + "loss": 0.1112, + "step": 5184 + }, + { + "epoch": 1.6801685029163966, + "grad_norm": 0.7521533370018005, + "learning_rate": 2.137445997059874e-06, + "loss": 0.1086, + "step": 5185 + }, + { + "epoch": 1.68049254698639, + "grad_norm": 0.8826772570610046, + "learning_rate": 2.1365806569943533e-06, + "loss": 0.1226, + "step": 5186 + }, + { + "epoch": 1.6808165910563837, + "grad_norm": 0.8674874901771545, + "learning_rate": 2.1357153614078407e-06, + "loss": 0.1196, + "step": 5187 + }, + { + "epoch": 1.6811406351263773, + "grad_norm": 0.8353680372238159, + "learning_rate": 2.1348501104062423e-06, + "loss": 0.1225, + "step": 5188 + }, + { + "epoch": 1.6814646791963708, + "grad_norm": 0.8163774609565735, + "learning_rate": 2.1339849040954556e-06, + "loss": 0.1132, + "step": 5189 + }, + { + "epoch": 1.6817887232663642, + "grad_norm": 0.7950161695480347, + "learning_rate": 2.133119742581373e-06, + "loss": 0.1117, + "step": 5190 + }, + { + "epoch": 1.6821127673363576, + "grad_norm": 0.8247337341308594, + "learning_rate": 2.1322546259698823e-06, + "loss": 0.1144, + "step": 5191 + }, + { + "epoch": 1.6824368114063513, + "grad_norm": 0.7793399095535278, + "learning_rate": 2.1313895543668644e-06, + "loss": 0.1107, + "step": 5192 + }, + { + "epoch": 1.682760855476345, + "grad_norm": 0.8173040747642517, + "learning_rate": 2.1305245278781977e-06, + "loss": 0.1139, + "step": 5193 + }, + { + "epoch": 1.6830848995463383, + "grad_norm": 0.8045807480812073, + "learning_rate": 2.129659546609751e-06, + "loss": 0.1136, + "step": 5194 + }, + { + "epoch": 1.6834089436163318, + "grad_norm": 0.80312180519104, + "learning_rate": 2.1287946106673916e-06, + "loss": 0.1108, + "step": 5195 + }, + { + "epoch": 1.6837329876863252, + "grad_norm": 0.836544930934906, + "learning_rate": 2.1279297201569787e-06, + "loss": 0.1209, + "step": 5196 + }, + { + "epoch": 1.6840570317563188, + "grad_norm": 0.7888853549957275, + "learning_rate": 2.127064875184365e-06, + "loss": 0.106, + "step": 5197 + }, + { + "epoch": 1.6843810758263125, + "grad_norm": 0.8119872808456421, + "learning_rate": 2.126200075855401e-06, + "loss": 0.12, + "step": 5198 + }, + { + "epoch": 1.684705119896306, + "grad_norm": 0.7502855062484741, + "learning_rate": 2.125335322275928e-06, + "loss": 0.1081, + "step": 5199 + }, + { + "epoch": 1.6850291639662993, + "grad_norm": 0.7704909443855286, + "learning_rate": 2.1244706145517853e-06, + "loss": 0.1046, + "step": 5200 + }, + { + "epoch": 1.685353208036293, + "grad_norm": 0.8156105279922485, + "learning_rate": 2.1236059527888044e-06, + "loss": 0.1142, + "step": 5201 + }, + { + "epoch": 1.6856772521062866, + "grad_norm": 0.8128482103347778, + "learning_rate": 2.1227413370928106e-06, + "loss": 0.1135, + "step": 5202 + }, + { + "epoch": 1.68600129617628, + "grad_norm": 0.7718719840049744, + "learning_rate": 2.1218767675696255e-06, + "loss": 0.1082, + "step": 5203 + }, + { + "epoch": 1.6863253402462735, + "grad_norm": 0.7750940918922424, + "learning_rate": 2.1210122443250625e-06, + "loss": 0.1143, + "step": 5204 + }, + { + "epoch": 1.6866493843162669, + "grad_norm": 0.840144693851471, + "learning_rate": 2.1201477674649326e-06, + "loss": 0.1135, + "step": 5205 + }, + { + "epoch": 1.6869734283862605, + "grad_norm": 0.8162680864334106, + "learning_rate": 2.119283337095038e-06, + "loss": 0.1122, + "step": 5206 + }, + { + "epoch": 1.6872974724562542, + "grad_norm": 0.7928407192230225, + "learning_rate": 2.1184189533211783e-06, + "loss": 0.118, + "step": 5207 + }, + { + "epoch": 1.6876215165262476, + "grad_norm": 0.8174422979354858, + "learning_rate": 2.117554616249145e-06, + "loss": 0.1202, + "step": 5208 + }, + { + "epoch": 1.687945560596241, + "grad_norm": 0.7704837918281555, + "learning_rate": 2.1166903259847228e-06, + "loss": 0.1016, + "step": 5209 + }, + { + "epoch": 1.6882696046662347, + "grad_norm": 0.8021647930145264, + "learning_rate": 2.115826082633695e-06, + "loss": 0.1123, + "step": 5210 + }, + { + "epoch": 1.688593648736228, + "grad_norm": 0.7987725138664246, + "learning_rate": 2.114961886301835e-06, + "loss": 0.1127, + "step": 5211 + }, + { + "epoch": 1.6889176928062217, + "grad_norm": 0.8358885049819946, + "learning_rate": 2.114097737094914e-06, + "loss": 0.1216, + "step": 5212 + }, + { + "epoch": 1.6892417368762151, + "grad_norm": 0.8821001648902893, + "learning_rate": 2.1132336351186923e-06, + "loss": 0.1176, + "step": 5213 + }, + { + "epoch": 1.6895657809462086, + "grad_norm": 0.8151198029518127, + "learning_rate": 2.1123695804789307e-06, + "loss": 0.1104, + "step": 5214 + }, + { + "epoch": 1.6898898250162022, + "grad_norm": 0.8548397421836853, + "learning_rate": 2.11150557328138e-06, + "loss": 0.1244, + "step": 5215 + }, + { + "epoch": 1.6902138690861959, + "grad_norm": 0.7912091016769409, + "learning_rate": 2.110641613631785e-06, + "loss": 0.1117, + "step": 5216 + }, + { + "epoch": 1.6905379131561893, + "grad_norm": 0.8448325395584106, + "learning_rate": 2.109777701635889e-06, + "loss": 0.1182, + "step": 5217 + }, + { + "epoch": 1.6908619572261827, + "grad_norm": 0.8200645446777344, + "learning_rate": 2.1089138373994226e-06, + "loss": 0.1068, + "step": 5218 + }, + { + "epoch": 1.6911860012961761, + "grad_norm": 0.767809271812439, + "learning_rate": 2.108050021028118e-06, + "loss": 0.1083, + "step": 5219 + }, + { + "epoch": 1.6915100453661698, + "grad_norm": 0.7847505807876587, + "learning_rate": 2.1071862526276963e-06, + "loss": 0.1012, + "step": 5220 + }, + { + "epoch": 1.6918340894361634, + "grad_norm": 0.7471839785575867, + "learning_rate": 2.1063225323038744e-06, + "loss": 0.1055, + "step": 5221 + }, + { + "epoch": 1.6921581335061568, + "grad_norm": 0.8450305461883545, + "learning_rate": 2.1054588601623634e-06, + "loss": 0.1166, + "step": 5222 + }, + { + "epoch": 1.6924821775761503, + "grad_norm": 0.8114377856254578, + "learning_rate": 2.104595236308868e-06, + "loss": 0.1092, + "step": 5223 + }, + { + "epoch": 1.692806221646144, + "grad_norm": 0.8460206389427185, + "learning_rate": 2.1037316608490886e-06, + "loss": 0.1171, + "step": 5224 + }, + { + "epoch": 1.6931302657161373, + "grad_norm": 0.8369781374931335, + "learning_rate": 2.1028681338887164e-06, + "loss": 0.1207, + "step": 5225 + }, + { + "epoch": 1.693454309786131, + "grad_norm": 0.7532182931900024, + "learning_rate": 2.102004655533442e-06, + "loss": 0.1019, + "step": 5226 + }, + { + "epoch": 1.6937783538561244, + "grad_norm": 0.7661359906196594, + "learning_rate": 2.101141225888944e-06, + "loss": 0.1109, + "step": 5227 + }, + { + "epoch": 1.6941023979261178, + "grad_norm": 0.7669044137001038, + "learning_rate": 2.100277845060898e-06, + "loss": 0.1078, + "step": 5228 + }, + { + "epoch": 1.6944264419961115, + "grad_norm": 0.8359112739562988, + "learning_rate": 2.0994145131549755e-06, + "loss": 0.1174, + "step": 5229 + }, + { + "epoch": 1.6947504860661051, + "grad_norm": 0.7608208060264587, + "learning_rate": 2.0985512302768366e-06, + "loss": 0.1007, + "step": 5230 + }, + { + "epoch": 1.6950745301360985, + "grad_norm": 0.7669379711151123, + "learning_rate": 2.097687996532143e-06, + "loss": 0.1005, + "step": 5231 + }, + { + "epoch": 1.695398574206092, + "grad_norm": 0.8257039785385132, + "learning_rate": 2.0968248120265433e-06, + "loss": 0.1183, + "step": 5232 + }, + { + "epoch": 1.6957226182760854, + "grad_norm": 0.792262852191925, + "learning_rate": 2.095961676865683e-06, + "loss": 0.1116, + "step": 5233 + }, + { + "epoch": 1.696046662346079, + "grad_norm": 0.7693544626235962, + "learning_rate": 2.095098591155203e-06, + "loss": 0.1106, + "step": 5234 + }, + { + "epoch": 1.6963707064160727, + "grad_norm": 0.8442361354827881, + "learning_rate": 2.094235555000734e-06, + "loss": 0.1116, + "step": 5235 + }, + { + "epoch": 1.696694750486066, + "grad_norm": 0.7691715955734253, + "learning_rate": 2.093372568507907e-06, + "loss": 0.1071, + "step": 5236 + }, + { + "epoch": 1.6970187945560595, + "grad_norm": 0.8054088354110718, + "learning_rate": 2.0925096317823393e-06, + "loss": 0.111, + "step": 5237 + }, + { + "epoch": 1.6973428386260532, + "grad_norm": 0.7808986902236938, + "learning_rate": 2.091646744929649e-06, + "loss": 0.1039, + "step": 5238 + }, + { + "epoch": 1.6976668826960468, + "grad_norm": 0.8026829361915588, + "learning_rate": 2.0907839080554443e-06, + "loss": 0.1158, + "step": 5239 + }, + { + "epoch": 1.6979909267660402, + "grad_norm": 0.7938512563705444, + "learning_rate": 2.0899211212653262e-06, + "loss": 0.1135, + "step": 5240 + }, + { + "epoch": 1.6983149708360337, + "grad_norm": 0.7803632616996765, + "learning_rate": 2.0890583846648945e-06, + "loss": 0.1089, + "step": 5241 + }, + { + "epoch": 1.698639014906027, + "grad_norm": 0.8693332672119141, + "learning_rate": 2.0881956983597375e-06, + "loss": 0.1149, + "step": 5242 + }, + { + "epoch": 1.6989630589760207, + "grad_norm": 0.8382584452629089, + "learning_rate": 2.087333062455441e-06, + "loss": 0.1217, + "step": 5243 + }, + { + "epoch": 1.6992871030460144, + "grad_norm": 0.8314406275749207, + "learning_rate": 2.0864704770575824e-06, + "loss": 0.1168, + "step": 5244 + }, + { + "epoch": 1.6996111471160078, + "grad_norm": 0.848581075668335, + "learning_rate": 2.085607942271734e-06, + "loss": 0.1153, + "step": 5245 + }, + { + "epoch": 1.6999351911860012, + "grad_norm": 0.8251847624778748, + "learning_rate": 2.0847454582034625e-06, + "loss": 0.1206, + "step": 5246 + }, + { + "epoch": 1.7002592352559946, + "grad_norm": 0.750935971736908, + "learning_rate": 2.0838830249583254e-06, + "loss": 0.1015, + "step": 5247 + }, + { + "epoch": 1.7005832793259883, + "grad_norm": 0.8350731134414673, + "learning_rate": 2.0830206426418794e-06, + "loss": 0.1158, + "step": 5248 + }, + { + "epoch": 1.700907323395982, + "grad_norm": 0.8031424880027771, + "learning_rate": 2.0821583113596686e-06, + "loss": 0.1155, + "step": 5249 + }, + { + "epoch": 1.7012313674659754, + "grad_norm": 0.904130220413208, + "learning_rate": 2.081296031217237e-06, + "loss": 0.1181, + "step": 5250 + }, + { + "epoch": 1.7015554115359688, + "grad_norm": 0.7820038795471191, + "learning_rate": 2.080433802320117e-06, + "loss": 0.1104, + "step": 5251 + }, + { + "epoch": 1.7018794556059624, + "grad_norm": 0.7897928953170776, + "learning_rate": 2.0795716247738374e-06, + "loss": 0.1116, + "step": 5252 + }, + { + "epoch": 1.702203499675956, + "grad_norm": 0.7536505460739136, + "learning_rate": 2.078709498683922e-06, + "loss": 0.1046, + "step": 5253 + }, + { + "epoch": 1.7025275437459495, + "grad_norm": 0.7892839312553406, + "learning_rate": 2.0778474241558845e-06, + "loss": 0.1101, + "step": 5254 + }, + { + "epoch": 1.702851587815943, + "grad_norm": 0.745802640914917, + "learning_rate": 2.0769854012952368e-06, + "loss": 0.102, + "step": 5255 + }, + { + "epoch": 1.7031756318859363, + "grad_norm": 0.8427780866622925, + "learning_rate": 2.0761234302074803e-06, + "loss": 0.1191, + "step": 5256 + }, + { + "epoch": 1.70349967595593, + "grad_norm": 0.714269757270813, + "learning_rate": 2.0752615109981116e-06, + "loss": 0.0992, + "step": 5257 + }, + { + "epoch": 1.7038237200259236, + "grad_norm": 0.7593355774879456, + "learning_rate": 2.0743996437726233e-06, + "loss": 0.0979, + "step": 5258 + }, + { + "epoch": 1.704147764095917, + "grad_norm": 0.7677283883094788, + "learning_rate": 2.073537828636497e-06, + "loss": 0.0983, + "step": 5259 + }, + { + "epoch": 1.7044718081659105, + "grad_norm": 0.7660530805587769, + "learning_rate": 2.0726760656952137e-06, + "loss": 0.1055, + "step": 5260 + }, + { + "epoch": 1.7047958522359041, + "grad_norm": 0.8433137536048889, + "learning_rate": 2.0718143550542418e-06, + "loss": 0.113, + "step": 5261 + }, + { + "epoch": 1.7051198963058976, + "grad_norm": 0.8808390498161316, + "learning_rate": 2.0709526968190483e-06, + "loss": 0.1149, + "step": 5262 + }, + { + "epoch": 1.7054439403758912, + "grad_norm": 0.853110671043396, + "learning_rate": 2.070091091095092e-06, + "loss": 0.116, + "step": 5263 + }, + { + "epoch": 1.7057679844458846, + "grad_norm": 0.8006988167762756, + "learning_rate": 2.0692295379878237e-06, + "loss": 0.1183, + "step": 5264 + }, + { + "epoch": 1.706092028515878, + "grad_norm": 0.8483065366744995, + "learning_rate": 2.0683680376026897e-06, + "loss": 0.1277, + "step": 5265 + }, + { + "epoch": 1.7064160725858717, + "grad_norm": 0.8355550169944763, + "learning_rate": 2.0675065900451287e-06, + "loss": 0.1146, + "step": 5266 + }, + { + "epoch": 1.7067401166558653, + "grad_norm": 0.8079224228858948, + "learning_rate": 2.066645195420575e-06, + "loss": 0.1138, + "step": 5267 + }, + { + "epoch": 1.7070641607258588, + "grad_norm": 0.90150386095047, + "learning_rate": 2.0657838538344545e-06, + "loss": 0.1182, + "step": 5268 + }, + { + "epoch": 1.7073882047958522, + "grad_norm": 0.7275080680847168, + "learning_rate": 2.0649225653921855e-06, + "loss": 0.1079, + "step": 5269 + }, + { + "epoch": 1.7077122488658456, + "grad_norm": 0.7734233736991882, + "learning_rate": 2.064061330199184e-06, + "loss": 0.1117, + "step": 5270 + }, + { + "epoch": 1.7080362929358393, + "grad_norm": 0.747999370098114, + "learning_rate": 2.0632001483608544e-06, + "loss": 0.1078, + "step": 5271 + }, + { + "epoch": 1.708360337005833, + "grad_norm": 0.7794125080108643, + "learning_rate": 2.062339019982599e-06, + "loss": 0.1177, + "step": 5272 + }, + { + "epoch": 1.7086843810758263, + "grad_norm": 0.8238285183906555, + "learning_rate": 2.06147794516981e-06, + "loss": 0.1164, + "step": 5273 + }, + { + "epoch": 1.7090084251458197, + "grad_norm": 0.8097456097602844, + "learning_rate": 2.0606169240278752e-06, + "loss": 0.1167, + "step": 5274 + }, + { + "epoch": 1.7093324692158134, + "grad_norm": 0.8134164214134216, + "learning_rate": 2.059755956662176e-06, + "loss": 0.1173, + "step": 5275 + }, + { + "epoch": 1.709656513285807, + "grad_norm": 0.7630842924118042, + "learning_rate": 2.058895043178085e-06, + "loss": 0.108, + "step": 5276 + }, + { + "epoch": 1.7099805573558005, + "grad_norm": 0.7667798399925232, + "learning_rate": 2.0580341836809718e-06, + "loss": 0.1113, + "step": 5277 + }, + { + "epoch": 1.7103046014257939, + "grad_norm": 0.821010947227478, + "learning_rate": 2.0571733782761943e-06, + "loss": 0.1101, + "step": 5278 + }, + { + "epoch": 1.7106286454957873, + "grad_norm": 0.8166499137878418, + "learning_rate": 2.0563126270691097e-06, + "loss": 0.1195, + "step": 5279 + }, + { + "epoch": 1.710952689565781, + "grad_norm": 0.7887958884239197, + "learning_rate": 2.055451930165063e-06, + "loss": 0.105, + "step": 5280 + }, + { + "epoch": 1.7112767336357746, + "grad_norm": 0.8705652356147766, + "learning_rate": 2.054591287669398e-06, + "loss": 0.1162, + "step": 5281 + }, + { + "epoch": 1.711600777705768, + "grad_norm": 0.8159167170524597, + "learning_rate": 2.053730699687447e-06, + "loss": 0.1121, + "step": 5282 + }, + { + "epoch": 1.7119248217757614, + "grad_norm": 0.8327007293701172, + "learning_rate": 2.052870166324537e-06, + "loss": 0.1124, + "step": 5283 + }, + { + "epoch": 1.7122488658457549, + "grad_norm": 0.8116735816001892, + "learning_rate": 2.0520096876859918e-06, + "loss": 0.1156, + "step": 5284 + }, + { + "epoch": 1.7125729099157485, + "grad_norm": 0.7572367191314697, + "learning_rate": 2.051149263877123e-06, + "loss": 0.1156, + "step": 5285 + }, + { + "epoch": 1.7128969539857422, + "grad_norm": 0.8980776071548462, + "learning_rate": 2.0502888950032396e-06, + "loss": 0.1274, + "step": 5286 + }, + { + "epoch": 1.7132209980557356, + "grad_norm": 0.7499247193336487, + "learning_rate": 2.0494285811696417e-06, + "loss": 0.1088, + "step": 5287 + }, + { + "epoch": 1.713545042125729, + "grad_norm": 0.9621228575706482, + "learning_rate": 2.048568322481623e-06, + "loss": 0.1082, + "step": 5288 + }, + { + "epoch": 1.7138690861957226, + "grad_norm": 0.7719082236289978, + "learning_rate": 2.0477081190444724e-06, + "loss": 0.1068, + "step": 5289 + }, + { + "epoch": 1.7141931302657163, + "grad_norm": 0.847591757774353, + "learning_rate": 2.046847970963468e-06, + "loss": 0.1174, + "step": 5290 + }, + { + "epoch": 1.7145171743357097, + "grad_norm": 0.7991353869438171, + "learning_rate": 2.0459878783438867e-06, + "loss": 0.1075, + "step": 5291 + }, + { + "epoch": 1.7148412184057031, + "grad_norm": 0.8025398254394531, + "learning_rate": 2.045127841290993e-06, + "loss": 0.1116, + "step": 5292 + }, + { + "epoch": 1.7151652624756966, + "grad_norm": 0.8297464847564697, + "learning_rate": 2.0442678599100484e-06, + "loss": 0.1166, + "step": 5293 + }, + { + "epoch": 1.7154893065456902, + "grad_norm": 0.9035441279411316, + "learning_rate": 2.043407934306306e-06, + "loss": 0.1201, + "step": 5294 + }, + { + "epoch": 1.7158133506156839, + "grad_norm": 0.809662938117981, + "learning_rate": 2.0425480645850124e-06, + "loss": 0.1137, + "step": 5295 + }, + { + "epoch": 1.7161373946856773, + "grad_norm": 0.8593390583992004, + "learning_rate": 2.041688250851407e-06, + "loss": 0.1164, + "step": 5296 + }, + { + "epoch": 1.7164614387556707, + "grad_norm": 0.8075687885284424, + "learning_rate": 2.0408284932107227e-06, + "loss": 0.1164, + "step": 5297 + }, + { + "epoch": 1.7167854828256643, + "grad_norm": 0.8436287641525269, + "learning_rate": 2.039968791768186e-06, + "loss": 0.1125, + "step": 5298 + }, + { + "epoch": 1.7171095268956578, + "grad_norm": 0.8198383450508118, + "learning_rate": 2.039109146629016e-06, + "loss": 0.1095, + "step": 5299 + }, + { + "epoch": 1.7174335709656514, + "grad_norm": 0.8079723119735718, + "learning_rate": 2.0382495578984236e-06, + "loss": 0.1109, + "step": 5300 + }, + { + "epoch": 1.7177576150356448, + "grad_norm": 0.7642526030540466, + "learning_rate": 2.0373900256816166e-06, + "loss": 0.1099, + "step": 5301 + }, + { + "epoch": 1.7180816591056383, + "grad_norm": 0.7780331373214722, + "learning_rate": 2.0365305500837906e-06, + "loss": 0.1121, + "step": 5302 + }, + { + "epoch": 1.718405703175632, + "grad_norm": 0.8425008058547974, + "learning_rate": 2.0356711312101394e-06, + "loss": 0.1149, + "step": 5303 + }, + { + "epoch": 1.7187297472456255, + "grad_norm": 0.799623429775238, + "learning_rate": 2.0348117691658463e-06, + "loss": 0.1125, + "step": 5304 + }, + { + "epoch": 1.719053791315619, + "grad_norm": 0.7548736333847046, + "learning_rate": 2.03395246405609e-06, + "loss": 0.1032, + "step": 5305 + }, + { + "epoch": 1.7193778353856124, + "grad_norm": 0.7889924645423889, + "learning_rate": 2.03309321598604e-06, + "loss": 0.1098, + "step": 5306 + }, + { + "epoch": 1.7197018794556058, + "grad_norm": 0.7848178744316101, + "learning_rate": 2.03223402506086e-06, + "loss": 0.1077, + "step": 5307 + }, + { + "epoch": 1.7200259235255995, + "grad_norm": 0.7917930483818054, + "learning_rate": 2.031374891385708e-06, + "loss": 0.1116, + "step": 5308 + }, + { + "epoch": 1.720349967595593, + "grad_norm": 0.819743275642395, + "learning_rate": 2.0305158150657316e-06, + "loss": 0.1153, + "step": 5309 + }, + { + "epoch": 1.7206740116655865, + "grad_norm": 0.7816472053527832, + "learning_rate": 2.0296567962060753e-06, + "loss": 0.1096, + "step": 5310 + }, + { + "epoch": 1.72099805573558, + "grad_norm": 0.8776841759681702, + "learning_rate": 2.0287978349118737e-06, + "loss": 0.1221, + "step": 5311 + }, + { + "epoch": 1.7213220998055736, + "grad_norm": 0.8157861828804016, + "learning_rate": 2.0279389312882546e-06, + "loss": 0.1112, + "step": 5312 + }, + { + "epoch": 1.721646143875567, + "grad_norm": 0.7488312125205994, + "learning_rate": 2.027080085440341e-06, + "loss": 0.1014, + "step": 5313 + }, + { + "epoch": 1.7219701879455607, + "grad_norm": 0.8361231088638306, + "learning_rate": 2.0262212974732465e-06, + "loss": 0.1071, + "step": 5314 + }, + { + "epoch": 1.722294232015554, + "grad_norm": 0.8565608859062195, + "learning_rate": 2.0253625674920795e-06, + "loss": 0.1147, + "step": 5315 + }, + { + "epoch": 1.7226182760855475, + "grad_norm": 0.8289151787757874, + "learning_rate": 2.0245038956019386e-06, + "loss": 0.1124, + "step": 5316 + }, + { + "epoch": 1.7229423201555412, + "grad_norm": 0.7731626629829407, + "learning_rate": 2.0236452819079183e-06, + "loss": 0.1099, + "step": 5317 + }, + { + "epoch": 1.7232663642255348, + "grad_norm": 0.8253824710845947, + "learning_rate": 2.0227867265151035e-06, + "loss": 0.1169, + "step": 5318 + }, + { + "epoch": 1.7235904082955282, + "grad_norm": 0.7757041454315186, + "learning_rate": 2.0219282295285734e-06, + "loss": 0.1063, + "step": 5319 + }, + { + "epoch": 1.7239144523655217, + "grad_norm": 0.7756922841072083, + "learning_rate": 2.021069791053401e-06, + "loss": 0.1052, + "step": 5320 + }, + { + "epoch": 1.724238496435515, + "grad_norm": 0.9245814681053162, + "learning_rate": 2.0202114111946483e-06, + "loss": 0.1267, + "step": 5321 + }, + { + "epoch": 1.7245625405055087, + "grad_norm": 0.7585616707801819, + "learning_rate": 2.019353090057375e-06, + "loss": 0.1021, + "step": 5322 + }, + { + "epoch": 1.7248865845755024, + "grad_norm": 0.8026843667030334, + "learning_rate": 2.018494827746631e-06, + "loss": 0.1194, + "step": 5323 + }, + { + "epoch": 1.7252106286454958, + "grad_norm": 0.7823585867881775, + "learning_rate": 2.0176366243674575e-06, + "loss": 0.1073, + "step": 5324 + }, + { + "epoch": 1.7255346727154892, + "grad_norm": 0.8478108048439026, + "learning_rate": 2.0167784800248924e-06, + "loss": 0.1177, + "step": 5325 + }, + { + "epoch": 1.7258587167854829, + "grad_norm": 0.8905182480812073, + "learning_rate": 2.0159203948239624e-06, + "loss": 0.121, + "step": 5326 + }, + { + "epoch": 1.7261827608554765, + "grad_norm": 0.830605149269104, + "learning_rate": 2.015062368869691e-06, + "loss": 0.1163, + "step": 5327 + }, + { + "epoch": 1.72650680492547, + "grad_norm": 0.8245023488998413, + "learning_rate": 2.0142044022670905e-06, + "loss": 0.1167, + "step": 5328 + }, + { + "epoch": 1.7268308489954634, + "grad_norm": 0.8123703002929688, + "learning_rate": 2.013346495121169e-06, + "loss": 0.1167, + "step": 5329 + }, + { + "epoch": 1.7271548930654568, + "grad_norm": 0.8318195939064026, + "learning_rate": 2.012488647536925e-06, + "loss": 0.1167, + "step": 5330 + }, + { + "epoch": 1.7274789371354504, + "grad_norm": 0.8157840967178345, + "learning_rate": 2.0116308596193502e-06, + "loss": 0.1087, + "step": 5331 + }, + { + "epoch": 1.727802981205444, + "grad_norm": 0.8502197265625, + "learning_rate": 2.0107731314734316e-06, + "loss": 0.1228, + "step": 5332 + }, + { + "epoch": 1.7281270252754375, + "grad_norm": 0.8437539935112, + "learning_rate": 2.0099154632041446e-06, + "loss": 0.1133, + "step": 5333 + }, + { + "epoch": 1.728451069345431, + "grad_norm": 0.777830183506012, + "learning_rate": 2.0090578549164614e-06, + "loss": 0.1145, + "step": 5334 + }, + { + "epoch": 1.7287751134154243, + "grad_norm": 0.8191279172897339, + "learning_rate": 2.0082003067153436e-06, + "loss": 0.1099, + "step": 5335 + }, + { + "epoch": 1.729099157485418, + "grad_norm": 0.8253933191299438, + "learning_rate": 2.007342818705747e-06, + "loss": 0.1137, + "step": 5336 + }, + { + "epoch": 1.7294232015554116, + "grad_norm": 0.7542148232460022, + "learning_rate": 2.006485390992621e-06, + "loss": 0.1002, + "step": 5337 + }, + { + "epoch": 1.729747245625405, + "grad_norm": 0.8375315070152283, + "learning_rate": 2.0056280236809044e-06, + "loss": 0.1187, + "step": 5338 + }, + { + "epoch": 1.7300712896953985, + "grad_norm": 0.8146992325782776, + "learning_rate": 2.004770716875533e-06, + "loss": 0.1155, + "step": 5339 + }, + { + "epoch": 1.7303953337653921, + "grad_norm": 0.8450291752815247, + "learning_rate": 2.0039134706814303e-06, + "loss": 0.1193, + "step": 5340 + }, + { + "epoch": 1.7307193778353858, + "grad_norm": 0.7564694881439209, + "learning_rate": 2.0030562852035175e-06, + "loss": 0.1044, + "step": 5341 + }, + { + "epoch": 1.7310434219053792, + "grad_norm": 0.7646568417549133, + "learning_rate": 2.0021991605467043e-06, + "loss": 0.108, + "step": 5342 + }, + { + "epoch": 1.7313674659753726, + "grad_norm": 0.8570701479911804, + "learning_rate": 2.0013420968158944e-06, + "loss": 0.114, + "step": 5343 + }, + { + "epoch": 1.731691510045366, + "grad_norm": 0.7767725586891174, + "learning_rate": 2.0004850941159847e-06, + "loss": 0.1087, + "step": 5344 + }, + { + "epoch": 1.7320155541153597, + "grad_norm": 0.7576577067375183, + "learning_rate": 1.999628152551863e-06, + "loss": 0.1071, + "step": 5345 + }, + { + "epoch": 1.7323395981853533, + "grad_norm": 0.7547813057899475, + "learning_rate": 1.9987712722284132e-06, + "loss": 0.1012, + "step": 5346 + }, + { + "epoch": 1.7326636422553467, + "grad_norm": 0.8170945644378662, + "learning_rate": 1.9979144532505064e-06, + "loss": 0.1113, + "step": 5347 + }, + { + "epoch": 1.7329876863253402, + "grad_norm": 0.8752892017364502, + "learning_rate": 1.9970576957230094e-06, + "loss": 0.1209, + "step": 5348 + }, + { + "epoch": 1.7333117303953338, + "grad_norm": 0.7312746644020081, + "learning_rate": 1.996200999750783e-06, + "loss": 0.1034, + "step": 5349 + }, + { + "epoch": 1.7336357744653272, + "grad_norm": 0.850387692451477, + "learning_rate": 1.995344365438676e-06, + "loss": 0.1226, + "step": 5350 + }, + { + "epoch": 1.7339598185353209, + "grad_norm": 0.8289128541946411, + "learning_rate": 1.994487792891534e-06, + "loss": 0.1199, + "step": 5351 + }, + { + "epoch": 1.7342838626053143, + "grad_norm": 0.7456474304199219, + "learning_rate": 1.993631282214191e-06, + "loss": 0.112, + "step": 5352 + }, + { + "epoch": 1.7346079066753077, + "grad_norm": 0.8394036889076233, + "learning_rate": 1.992774833511478e-06, + "loss": 0.1312, + "step": 5353 + }, + { + "epoch": 1.7349319507453014, + "grad_norm": 0.8686239123344421, + "learning_rate": 1.991918446888216e-06, + "loss": 0.1235, + "step": 5354 + }, + { + "epoch": 1.735255994815295, + "grad_norm": 0.8267319798469543, + "learning_rate": 1.9910621224492154e-06, + "loss": 0.1126, + "step": 5355 + }, + { + "epoch": 1.7355800388852884, + "grad_norm": 0.7727353572845459, + "learning_rate": 1.9902058602992856e-06, + "loss": 0.1056, + "step": 5356 + }, + { + "epoch": 1.7359040829552819, + "grad_norm": 0.7727401852607727, + "learning_rate": 1.989349660543222e-06, + "loss": 0.0959, + "step": 5357 + }, + { + "epoch": 1.7362281270252753, + "grad_norm": 0.8392402529716492, + "learning_rate": 1.988493523285818e-06, + "loss": 0.1186, + "step": 5358 + }, + { + "epoch": 1.736552171095269, + "grad_norm": 0.8880442380905151, + "learning_rate": 1.9876374486318545e-06, + "loss": 0.1243, + "step": 5359 + }, + { + "epoch": 1.7368762151652626, + "grad_norm": 0.756595253944397, + "learning_rate": 1.9867814366861075e-06, + "loss": 0.1028, + "step": 5360 + }, + { + "epoch": 1.737200259235256, + "grad_norm": 0.7915642261505127, + "learning_rate": 1.9859254875533435e-06, + "loss": 0.1118, + "step": 5361 + }, + { + "epoch": 1.7375243033052494, + "grad_norm": 0.8473532199859619, + "learning_rate": 1.9850696013383236e-06, + "loss": 0.1188, + "step": 5362 + }, + { + "epoch": 1.737848347375243, + "grad_norm": 0.7980912327766418, + "learning_rate": 1.9842137781458e-06, + "loss": 0.1066, + "step": 5363 + }, + { + "epoch": 1.7381723914452365, + "grad_norm": 0.7806613445281982, + "learning_rate": 1.9833580180805155e-06, + "loss": 0.1127, + "step": 5364 + }, + { + "epoch": 1.7384964355152301, + "grad_norm": 0.8259003758430481, + "learning_rate": 1.9825023212472095e-06, + "loss": 0.1118, + "step": 5365 + }, + { + "epoch": 1.7388204795852236, + "grad_norm": 0.7611026167869568, + "learning_rate": 1.9816466877506095e-06, + "loss": 0.1056, + "step": 5366 + }, + { + "epoch": 1.739144523655217, + "grad_norm": 0.7931367754936218, + "learning_rate": 1.9807911176954357e-06, + "loss": 0.112, + "step": 5367 + }, + { + "epoch": 1.7394685677252106, + "grad_norm": 0.7729525566101074, + "learning_rate": 1.9799356111864036e-06, + "loss": 0.1119, + "step": 5368 + }, + { + "epoch": 1.7397926117952043, + "grad_norm": 0.78594970703125, + "learning_rate": 1.979080168328218e-06, + "loss": 0.1131, + "step": 5369 + }, + { + "epoch": 1.7401166558651977, + "grad_norm": 0.8103339672088623, + "learning_rate": 1.9782247892255767e-06, + "loss": 0.1094, + "step": 5370 + }, + { + "epoch": 1.7404406999351911, + "grad_norm": 0.8295299410820007, + "learning_rate": 1.9773694739831702e-06, + "loss": 0.1165, + "step": 5371 + }, + { + "epoch": 1.7407647440051845, + "grad_norm": 0.7763726115226746, + "learning_rate": 1.976514222705681e-06, + "loss": 0.1077, + "step": 5372 + }, + { + "epoch": 1.7410887880751782, + "grad_norm": 0.8518121838569641, + "learning_rate": 1.975659035497783e-06, + "loss": 0.116, + "step": 5373 + }, + { + "epoch": 1.7414128321451718, + "grad_norm": 0.7763303518295288, + "learning_rate": 1.9748039124641426e-06, + "loss": 0.1081, + "step": 5374 + }, + { + "epoch": 1.7417368762151653, + "grad_norm": 0.8390318155288696, + "learning_rate": 1.9739488537094197e-06, + "loss": 0.1155, + "step": 5375 + }, + { + "epoch": 1.7420609202851587, + "grad_norm": 0.8260394930839539, + "learning_rate": 1.973093859338263e-06, + "loss": 0.1085, + "step": 5376 + }, + { + "epoch": 1.7423849643551523, + "grad_norm": 0.7894601225852966, + "learning_rate": 1.9722389294553188e-06, + "loss": 0.1084, + "step": 5377 + }, + { + "epoch": 1.742709008425146, + "grad_norm": 0.8820915818214417, + "learning_rate": 1.9713840641652206e-06, + "loss": 0.1255, + "step": 5378 + }, + { + "epoch": 1.7430330524951394, + "grad_norm": 0.8178963661193848, + "learning_rate": 1.970529263572594e-06, + "loss": 0.1186, + "step": 5379 + }, + { + "epoch": 1.7433570965651328, + "grad_norm": 0.836940586566925, + "learning_rate": 1.9696745277820613e-06, + "loss": 0.1125, + "step": 5380 + }, + { + "epoch": 1.7436811406351262, + "grad_norm": 0.9130598306655884, + "learning_rate": 1.9688198568982316e-06, + "loss": 0.1297, + "step": 5381 + }, + { + "epoch": 1.74400518470512, + "grad_norm": 0.7412441968917847, + "learning_rate": 1.96796525102571e-06, + "loss": 0.1025, + "step": 5382 + }, + { + "epoch": 1.7443292287751135, + "grad_norm": 0.7745087146759033, + "learning_rate": 1.96711071026909e-06, + "loss": 0.1054, + "step": 5383 + }, + { + "epoch": 1.744653272845107, + "grad_norm": 0.7893478870391846, + "learning_rate": 1.9662562347329613e-06, + "loss": 0.1165, + "step": 5384 + }, + { + "epoch": 1.7449773169151004, + "grad_norm": 0.8026453256607056, + "learning_rate": 1.9654018245219024e-06, + "loss": 0.111, + "step": 5385 + }, + { + "epoch": 1.7453013609850938, + "grad_norm": 0.8066617846488953, + "learning_rate": 1.9645474797404838e-06, + "loss": 0.1152, + "step": 5386 + }, + { + "epoch": 1.7456254050550875, + "grad_norm": 0.7359598278999329, + "learning_rate": 1.963693200493271e-06, + "loss": 0.1042, + "step": 5387 + }, + { + "epoch": 1.745949449125081, + "grad_norm": 0.8136652708053589, + "learning_rate": 1.962838986884818e-06, + "loss": 0.1208, + "step": 5388 + }, + { + "epoch": 1.7462734931950745, + "grad_norm": 0.8009673953056335, + "learning_rate": 1.9619848390196734e-06, + "loss": 0.1083, + "step": 5389 + }, + { + "epoch": 1.746597537265068, + "grad_norm": 0.749184250831604, + "learning_rate": 1.9611307570023766e-06, + "loss": 0.1048, + "step": 5390 + }, + { + "epoch": 1.7469215813350616, + "grad_norm": 0.8438735008239746, + "learning_rate": 1.960276740937458e-06, + "loss": 0.1162, + "step": 5391 + }, + { + "epoch": 1.7472456254050552, + "grad_norm": 0.7352275848388672, + "learning_rate": 1.959422790929441e-06, + "loss": 0.1023, + "step": 5392 + }, + { + "epoch": 1.7475696694750487, + "grad_norm": 0.8380963206291199, + "learning_rate": 1.9585689070828413e-06, + "loss": 0.1136, + "step": 5393 + }, + { + "epoch": 1.747893713545042, + "grad_norm": 0.8800554275512695, + "learning_rate": 1.9577150895021664e-06, + "loss": 0.1219, + "step": 5394 + }, + { + "epoch": 1.7482177576150355, + "grad_norm": 0.7962919473648071, + "learning_rate": 1.9568613382919142e-06, + "loss": 0.1144, + "step": 5395 + }, + { + "epoch": 1.7485418016850292, + "grad_norm": 0.8999350666999817, + "learning_rate": 1.9560076535565766e-06, + "loss": 0.1117, + "step": 5396 + }, + { + "epoch": 1.7488658457550228, + "grad_norm": 0.8126070499420166, + "learning_rate": 1.9551540354006366e-06, + "loss": 0.1117, + "step": 5397 + }, + { + "epoch": 1.7491898898250162, + "grad_norm": 0.8283818364143372, + "learning_rate": 1.954300483928567e-06, + "loss": 0.1188, + "step": 5398 + }, + { + "epoch": 1.7495139338950096, + "grad_norm": 0.8194707036018372, + "learning_rate": 1.953446999244836e-06, + "loss": 0.1105, + "step": 5399 + }, + { + "epoch": 1.7498379779650033, + "grad_norm": 0.7576519846916199, + "learning_rate": 1.9525935814539e-06, + "loss": 0.1008, + "step": 5400 + }, + { + "epoch": 1.7501620220349967, + "grad_norm": 0.8350682854652405, + "learning_rate": 1.951740230660212e-06, + "loss": 0.1184, + "step": 5401 + }, + { + "epoch": 1.7504860661049904, + "grad_norm": 0.8378635048866272, + "learning_rate": 1.950886946968212e-06, + "loss": 0.1135, + "step": 5402 + }, + { + "epoch": 1.7508101101749838, + "grad_norm": 0.7591724395751953, + "learning_rate": 1.9500337304823333e-06, + "loss": 0.1009, + "step": 5403 + }, + { + "epoch": 1.7511341542449772, + "grad_norm": 0.8226900696754456, + "learning_rate": 1.9491805813070025e-06, + "loss": 0.1155, + "step": 5404 + }, + { + "epoch": 1.7514581983149708, + "grad_norm": 0.8173897862434387, + "learning_rate": 1.948327499546635e-06, + "loss": 0.1143, + "step": 5405 + }, + { + "epoch": 1.7517822423849645, + "grad_norm": 0.735022783279419, + "learning_rate": 1.947474485305642e-06, + "loss": 0.1036, + "step": 5406 + }, + { + "epoch": 1.752106286454958, + "grad_norm": 0.8631489872932434, + "learning_rate": 1.9466215386884223e-06, + "loss": 0.1249, + "step": 5407 + }, + { + "epoch": 1.7524303305249513, + "grad_norm": 0.8307135105133057, + "learning_rate": 1.9457686597993704e-06, + "loss": 0.1137, + "step": 5408 + }, + { + "epoch": 1.7527543745949448, + "grad_norm": 0.8436553478240967, + "learning_rate": 1.9449158487428688e-06, + "loss": 0.1183, + "step": 5409 + }, + { + "epoch": 1.7530784186649384, + "grad_norm": 0.8238299489021301, + "learning_rate": 1.9440631056232926e-06, + "loss": 0.1132, + "step": 5410 + }, + { + "epoch": 1.753402462734932, + "grad_norm": 0.8608657121658325, + "learning_rate": 1.9432104305450117e-06, + "loss": 0.1219, + "step": 5411 + }, + { + "epoch": 1.7537265068049255, + "grad_norm": 0.8268369436264038, + "learning_rate": 1.942357823612383e-06, + "loss": 0.1212, + "step": 5412 + }, + { + "epoch": 1.754050550874919, + "grad_norm": 0.9069793820381165, + "learning_rate": 1.9415052849297585e-06, + "loss": 0.1331, + "step": 5413 + }, + { + "epoch": 1.7543745949449125, + "grad_norm": 0.8022398352622986, + "learning_rate": 1.9406528146014815e-06, + "loss": 0.1126, + "step": 5414 + }, + { + "epoch": 1.7546986390149062, + "grad_norm": 0.8379114270210266, + "learning_rate": 1.939800412731884e-06, + "loss": 0.1239, + "step": 5415 + }, + { + "epoch": 1.7550226830848996, + "grad_norm": 0.8638489842414856, + "learning_rate": 1.9389480794252933e-06, + "loss": 0.1207, + "step": 5416 + }, + { + "epoch": 1.755346727154893, + "grad_norm": 0.7857935428619385, + "learning_rate": 1.9380958147860254e-06, + "loss": 0.1119, + "step": 5417 + }, + { + "epoch": 1.7556707712248865, + "grad_norm": 0.8815799355506897, + "learning_rate": 1.937243618918391e-06, + "loss": 0.1176, + "step": 5418 + }, + { + "epoch": 1.75599481529488, + "grad_norm": 0.7748388648033142, + "learning_rate": 1.936391491926689e-06, + "loss": 0.1063, + "step": 5419 + }, + { + "epoch": 1.7563188593648738, + "grad_norm": 0.812292754650116, + "learning_rate": 1.9355394339152133e-06, + "loss": 0.1105, + "step": 5420 + }, + { + "epoch": 1.7566429034348672, + "grad_norm": 0.7713181376457214, + "learning_rate": 1.9346874449882465e-06, + "loss": 0.1145, + "step": 5421 + }, + { + "epoch": 1.7569669475048606, + "grad_norm": 0.8215756416320801, + "learning_rate": 1.9338355252500624e-06, + "loss": 0.1151, + "step": 5422 + }, + { + "epoch": 1.757290991574854, + "grad_norm": 0.7613236904144287, + "learning_rate": 1.93298367480493e-06, + "loss": 0.1037, + "step": 5423 + }, + { + "epoch": 1.7576150356448477, + "grad_norm": 0.809416651725769, + "learning_rate": 1.932131893757107e-06, + "loss": 0.1012, + "step": 5424 + }, + { + "epoch": 1.7579390797148413, + "grad_norm": 0.8261358141899109, + "learning_rate": 1.9312801822108425e-06, + "loss": 0.1164, + "step": 5425 + }, + { + "epoch": 1.7582631237848347, + "grad_norm": 0.805388331413269, + "learning_rate": 1.9304285402703775e-06, + "loss": 0.114, + "step": 5426 + }, + { + "epoch": 1.7585871678548282, + "grad_norm": 0.8319758176803589, + "learning_rate": 1.929576968039946e-06, + "loss": 0.1232, + "step": 5427 + }, + { + "epoch": 1.7589112119248218, + "grad_norm": 0.8471801280975342, + "learning_rate": 1.928725465623772e-06, + "loss": 0.1236, + "step": 5428 + }, + { + "epoch": 1.7592352559948155, + "grad_norm": 0.8064789772033691, + "learning_rate": 1.927874033126069e-06, + "loss": 0.1191, + "step": 5429 + }, + { + "epoch": 1.7595593000648089, + "grad_norm": 0.8756115436553955, + "learning_rate": 1.927022670651047e-06, + "loss": 0.1263, + "step": 5430 + }, + { + "epoch": 1.7598833441348023, + "grad_norm": 0.7553970813751221, + "learning_rate": 1.9261713783029024e-06, + "loss": 0.1064, + "step": 5431 + }, + { + "epoch": 1.7602073882047957, + "grad_norm": 0.7707648277282715, + "learning_rate": 1.9253201561858266e-06, + "loss": 0.1038, + "step": 5432 + }, + { + "epoch": 1.7605314322747894, + "grad_norm": 0.7251192927360535, + "learning_rate": 1.924469004404001e-06, + "loss": 0.1036, + "step": 5433 + }, + { + "epoch": 1.760855476344783, + "grad_norm": 0.7130362391471863, + "learning_rate": 1.9236179230615967e-06, + "loss": 0.099, + "step": 5434 + }, + { + "epoch": 1.7611795204147764, + "grad_norm": 0.7608776688575745, + "learning_rate": 1.922766912262779e-06, + "loss": 0.1005, + "step": 5435 + }, + { + "epoch": 1.7615035644847699, + "grad_norm": 0.8560105562210083, + "learning_rate": 1.921915972111703e-06, + "loss": 0.1186, + "step": 5436 + }, + { + "epoch": 1.7618276085547635, + "grad_norm": 0.7603005766868591, + "learning_rate": 1.9210651027125164e-06, + "loss": 0.1112, + "step": 5437 + }, + { + "epoch": 1.762151652624757, + "grad_norm": 0.7880005240440369, + "learning_rate": 1.9202143041693554e-06, + "loss": 0.1141, + "step": 5438 + }, + { + "epoch": 1.7624756966947506, + "grad_norm": 0.8264434933662415, + "learning_rate": 1.919363576586352e-06, + "loss": 0.1116, + "step": 5439 + }, + { + "epoch": 1.762799740764744, + "grad_norm": 0.8080835342407227, + "learning_rate": 1.918512920067626e-06, + "loss": 0.1137, + "step": 5440 + }, + { + "epoch": 1.7631237848347374, + "grad_norm": 0.8233956694602966, + "learning_rate": 1.9176623347172885e-06, + "loss": 0.1129, + "step": 5441 + }, + { + "epoch": 1.763447828904731, + "grad_norm": 0.8224033713340759, + "learning_rate": 1.9168118206394443e-06, + "loss": 0.1191, + "step": 5442 + }, + { + "epoch": 1.7637718729747247, + "grad_norm": 0.7948696613311768, + "learning_rate": 1.915961377938187e-06, + "loss": 0.1141, + "step": 5443 + }, + { + "epoch": 1.7640959170447181, + "grad_norm": 0.9159996509552002, + "learning_rate": 1.9151110067176038e-06, + "loss": 0.1365, + "step": 5444 + }, + { + "epoch": 1.7644199611147116, + "grad_norm": 0.7687258720397949, + "learning_rate": 1.914260707081771e-06, + "loss": 0.1043, + "step": 5445 + }, + { + "epoch": 1.764744005184705, + "grad_norm": 0.8619191646575928, + "learning_rate": 1.913410479134757e-06, + "loss": 0.123, + "step": 5446 + }, + { + "epoch": 1.7650680492546986, + "grad_norm": 0.881009578704834, + "learning_rate": 1.9125603229806223e-06, + "loss": 0.1185, + "step": 5447 + }, + { + "epoch": 1.7653920933246923, + "grad_norm": 0.7797643542289734, + "learning_rate": 1.9117102387234165e-06, + "loss": 0.1069, + "step": 5448 + }, + { + "epoch": 1.7657161373946857, + "grad_norm": 0.9099947214126587, + "learning_rate": 1.910860226467183e-06, + "loss": 0.1206, + "step": 5449 + }, + { + "epoch": 1.7660401814646791, + "grad_norm": 0.8067218065261841, + "learning_rate": 1.910010286315953e-06, + "loss": 0.1021, + "step": 5450 + }, + { + "epoch": 1.7663642255346728, + "grad_norm": 0.7724782228469849, + "learning_rate": 1.9091604183737546e-06, + "loss": 0.1118, + "step": 5451 + }, + { + "epoch": 1.7666882696046662, + "grad_norm": 0.8174054622650146, + "learning_rate": 1.9083106227446e-06, + "loss": 0.1098, + "step": 5452 + }, + { + "epoch": 1.7670123136746598, + "grad_norm": 0.8142311573028564, + "learning_rate": 1.907460899532497e-06, + "loss": 0.1042, + "step": 5453 + }, + { + "epoch": 1.7673363577446533, + "grad_norm": 0.8318405151367188, + "learning_rate": 1.9066112488414445e-06, + "loss": 0.111, + "step": 5454 + }, + { + "epoch": 1.7676604018146467, + "grad_norm": 0.8179137110710144, + "learning_rate": 1.90576167077543e-06, + "loss": 0.1048, + "step": 5455 + }, + { + "epoch": 1.7679844458846403, + "grad_norm": 0.7973840832710266, + "learning_rate": 1.904912165438435e-06, + "loss": 0.118, + "step": 5456 + }, + { + "epoch": 1.768308489954634, + "grad_norm": 0.794683039188385, + "learning_rate": 1.9040627329344296e-06, + "loss": 0.1086, + "step": 5457 + }, + { + "epoch": 1.7686325340246274, + "grad_norm": 0.8206287026405334, + "learning_rate": 1.9032133733673764e-06, + "loss": 0.1183, + "step": 5458 + }, + { + "epoch": 1.7689565780946208, + "grad_norm": 0.7799130082130432, + "learning_rate": 1.9023640868412297e-06, + "loss": 0.1059, + "step": 5459 + }, + { + "epoch": 1.7692806221646142, + "grad_norm": 0.7698259949684143, + "learning_rate": 1.9015148734599317e-06, + "loss": 0.1103, + "step": 5460 + }, + { + "epoch": 1.7696046662346079, + "grad_norm": 0.8091194033622742, + "learning_rate": 1.900665733327421e-06, + "loss": 0.115, + "step": 5461 + }, + { + "epoch": 1.7699287103046015, + "grad_norm": 0.8662896752357483, + "learning_rate": 1.899816666547621e-06, + "loss": 0.1244, + "step": 5462 + }, + { + "epoch": 1.770252754374595, + "grad_norm": 0.8698745369911194, + "learning_rate": 1.8989676732244522e-06, + "loss": 0.1276, + "step": 5463 + }, + { + "epoch": 1.7705767984445884, + "grad_norm": 0.863068163394928, + "learning_rate": 1.8981187534618217e-06, + "loss": 0.1183, + "step": 5464 + }, + { + "epoch": 1.770900842514582, + "grad_norm": 0.7592810988426208, + "learning_rate": 1.8972699073636283e-06, + "loss": 0.1053, + "step": 5465 + }, + { + "epoch": 1.7712248865845757, + "grad_norm": 0.8105905652046204, + "learning_rate": 1.8964211350337637e-06, + "loss": 0.112, + "step": 5466 + }, + { + "epoch": 1.771548930654569, + "grad_norm": 0.7767974138259888, + "learning_rate": 1.895572436576109e-06, + "loss": 0.1049, + "step": 5467 + }, + { + "epoch": 1.7718729747245625, + "grad_norm": 0.8776288628578186, + "learning_rate": 1.8947238120945372e-06, + "loss": 0.1234, + "step": 5468 + }, + { + "epoch": 1.772197018794556, + "grad_norm": 0.8377476334571838, + "learning_rate": 1.8938752616929112e-06, + "loss": 0.1157, + "step": 5469 + }, + { + "epoch": 1.7725210628645496, + "grad_norm": 0.8418101072311401, + "learning_rate": 1.8930267854750845e-06, + "loss": 0.1247, + "step": 5470 + }, + { + "epoch": 1.7728451069345432, + "grad_norm": 0.7262491583824158, + "learning_rate": 1.8921783835449042e-06, + "loss": 0.1037, + "step": 5471 + }, + { + "epoch": 1.7731691510045366, + "grad_norm": 0.8218994140625, + "learning_rate": 1.8913300560062047e-06, + "loss": 0.1125, + "step": 5472 + }, + { + "epoch": 1.77349319507453, + "grad_norm": 0.7634402513504028, + "learning_rate": 1.890481802962815e-06, + "loss": 0.1026, + "step": 5473 + }, + { + "epoch": 1.7738172391445235, + "grad_norm": 0.8064968585968018, + "learning_rate": 1.889633624518551e-06, + "loss": 0.116, + "step": 5474 + }, + { + "epoch": 1.7741412832145171, + "grad_norm": 0.8574610948562622, + "learning_rate": 1.8887855207772235e-06, + "loss": 0.1254, + "step": 5475 + }, + { + "epoch": 1.7744653272845108, + "grad_norm": 0.7670876383781433, + "learning_rate": 1.8879374918426312e-06, + "loss": 0.1119, + "step": 5476 + }, + { + "epoch": 1.7747893713545042, + "grad_norm": 0.8037365078926086, + "learning_rate": 1.8870895378185643e-06, + "loss": 0.1106, + "step": 5477 + }, + { + "epoch": 1.7751134154244976, + "grad_norm": 0.7313063740730286, + "learning_rate": 1.886241658808805e-06, + "loss": 0.1012, + "step": 5478 + }, + { + "epoch": 1.7754374594944913, + "grad_norm": 0.8549699187278748, + "learning_rate": 1.8853938549171242e-06, + "loss": 0.1195, + "step": 5479 + }, + { + "epoch": 1.775761503564485, + "grad_norm": 0.8040359020233154, + "learning_rate": 1.8845461262472863e-06, + "loss": 0.1167, + "step": 5480 + }, + { + "epoch": 1.7760855476344783, + "grad_norm": 0.7298676371574402, + "learning_rate": 1.883698472903045e-06, + "loss": 0.1053, + "step": 5481 + }, + { + "epoch": 1.7764095917044718, + "grad_norm": 0.7766726016998291, + "learning_rate": 1.882850894988143e-06, + "loss": 0.1121, + "step": 5482 + }, + { + "epoch": 1.7767336357744652, + "grad_norm": 0.7604732513427734, + "learning_rate": 1.882003392606318e-06, + "loss": 0.1104, + "step": 5483 + }, + { + "epoch": 1.7770576798444588, + "grad_norm": 0.757429301738739, + "learning_rate": 1.8811559658612941e-06, + "loss": 0.1065, + "step": 5484 + }, + { + "epoch": 1.7773817239144525, + "grad_norm": 0.8506981134414673, + "learning_rate": 1.88030861485679e-06, + "loss": 0.1186, + "step": 5485 + }, + { + "epoch": 1.777705767984446, + "grad_norm": 0.772424578666687, + "learning_rate": 1.879461339696512e-06, + "loss": 0.1101, + "step": 5486 + }, + { + "epoch": 1.7780298120544393, + "grad_norm": 0.777412474155426, + "learning_rate": 1.8786141404841587e-06, + "loss": 0.1094, + "step": 5487 + }, + { + "epoch": 1.778353856124433, + "grad_norm": 0.8396077752113342, + "learning_rate": 1.8777670173234198e-06, + "loss": 0.1134, + "step": 5488 + }, + { + "epoch": 1.7786779001944264, + "grad_norm": 0.7566655874252319, + "learning_rate": 1.8769199703179736e-06, + "loss": 0.1046, + "step": 5489 + }, + { + "epoch": 1.77900194426442, + "grad_norm": 0.8267093896865845, + "learning_rate": 1.8760729995714916e-06, + "loss": 0.1147, + "step": 5490 + }, + { + "epoch": 1.7793259883344135, + "grad_norm": 0.8138669729232788, + "learning_rate": 1.8752261051876337e-06, + "loss": 0.117, + "step": 5491 + }, + { + "epoch": 1.779650032404407, + "grad_norm": 0.8392179012298584, + "learning_rate": 1.8743792872700529e-06, + "loss": 0.1089, + "step": 5492 + }, + { + "epoch": 1.7799740764744005, + "grad_norm": 0.7915463447570801, + "learning_rate": 1.873532545922391e-06, + "loss": 0.109, + "step": 5493 + }, + { + "epoch": 1.7802981205443942, + "grad_norm": 0.7900585532188416, + "learning_rate": 1.8726858812482798e-06, + "loss": 0.1122, + "step": 5494 + }, + { + "epoch": 1.7806221646143876, + "grad_norm": 0.7889156937599182, + "learning_rate": 1.871839293351345e-06, + "loss": 0.1055, + "step": 5495 + }, + { + "epoch": 1.780946208684381, + "grad_norm": 0.8391129970550537, + "learning_rate": 1.870992782335198e-06, + "loss": 0.1161, + "step": 5496 + }, + { + "epoch": 1.7812702527543745, + "grad_norm": 0.8119585514068604, + "learning_rate": 1.8701463483034471e-06, + "loss": 0.1159, + "step": 5497 + }, + { + "epoch": 1.781594296824368, + "grad_norm": 0.9002379775047302, + "learning_rate": 1.8692999913596846e-06, + "loss": 0.1303, + "step": 5498 + }, + { + "epoch": 1.7819183408943617, + "grad_norm": 0.8775905966758728, + "learning_rate": 1.8684537116074983e-06, + "loss": 0.1182, + "step": 5499 + }, + { + "epoch": 1.7822423849643552, + "grad_norm": 0.7602542042732239, + "learning_rate": 1.8676075091504637e-06, + "loss": 0.101, + "step": 5500 + }, + { + "epoch": 1.7825664290343486, + "grad_norm": 0.8325319886207581, + "learning_rate": 1.866761384092147e-06, + "loss": 0.1114, + "step": 5501 + }, + { + "epoch": 1.7828904731043422, + "grad_norm": 0.8351833820343018, + "learning_rate": 1.8659153365361076e-06, + "loss": 0.1161, + "step": 5502 + }, + { + "epoch": 1.7832145171743357, + "grad_norm": 0.8513324856758118, + "learning_rate": 1.8650693665858916e-06, + "loss": 0.1202, + "step": 5503 + }, + { + "epoch": 1.7835385612443293, + "grad_norm": 0.7680867910385132, + "learning_rate": 1.8642234743450394e-06, + "loss": 0.1019, + "step": 5504 + }, + { + "epoch": 1.7838626053143227, + "grad_norm": 0.8985635638237, + "learning_rate": 1.8633776599170783e-06, + "loss": 0.1187, + "step": 5505 + }, + { + "epoch": 1.7841866493843161, + "grad_norm": 0.8368175029754639, + "learning_rate": 1.86253192340553e-06, + "loss": 0.1177, + "step": 5506 + }, + { + "epoch": 1.7845106934543098, + "grad_norm": 0.7756860852241516, + "learning_rate": 1.8616862649139024e-06, + "loss": 0.1057, + "step": 5507 + }, + { + "epoch": 1.7848347375243034, + "grad_norm": 0.7564083933830261, + "learning_rate": 1.8608406845456968e-06, + "loss": 0.106, + "step": 5508 + }, + { + "epoch": 1.7851587815942969, + "grad_norm": 0.7947510480880737, + "learning_rate": 1.8599951824044033e-06, + "loss": 0.1036, + "step": 5509 + }, + { + "epoch": 1.7854828256642903, + "grad_norm": 0.8901932239532471, + "learning_rate": 1.8591497585935041e-06, + "loss": 0.1292, + "step": 5510 + }, + { + "epoch": 1.7858068697342837, + "grad_norm": 0.8269218802452087, + "learning_rate": 1.858304413216471e-06, + "loss": 0.1167, + "step": 5511 + }, + { + "epoch": 1.7861309138042774, + "grad_norm": 0.782346785068512, + "learning_rate": 1.8574591463767656e-06, + "loss": 0.1083, + "step": 5512 + }, + { + "epoch": 1.786454957874271, + "grad_norm": 0.8424670696258545, + "learning_rate": 1.8566139581778392e-06, + "loss": 0.1178, + "step": 5513 + }, + { + "epoch": 1.7867790019442644, + "grad_norm": 0.8284109830856323, + "learning_rate": 1.855768848723137e-06, + "loss": 0.1123, + "step": 5514 + }, + { + "epoch": 1.7871030460142578, + "grad_norm": 0.816237211227417, + "learning_rate": 1.85492381811609e-06, + "loss": 0.1181, + "step": 5515 + }, + { + "epoch": 1.7874270900842515, + "grad_norm": 0.8606086373329163, + "learning_rate": 1.854078866460124e-06, + "loss": 0.1094, + "step": 5516 + }, + { + "epoch": 1.7877511341542451, + "grad_norm": 0.8743523359298706, + "learning_rate": 1.8532339938586513e-06, + "loss": 0.1208, + "step": 5517 + }, + { + "epoch": 1.7880751782242386, + "grad_norm": 0.7890545725822449, + "learning_rate": 1.8523892004150765e-06, + "loss": 0.1117, + "step": 5518 + }, + { + "epoch": 1.788399222294232, + "grad_norm": 0.7640580534934998, + "learning_rate": 1.8515444862327947e-06, + "loss": 0.1057, + "step": 5519 + }, + { + "epoch": 1.7887232663642254, + "grad_norm": 0.7732981443405151, + "learning_rate": 1.8506998514151896e-06, + "loss": 0.1088, + "step": 5520 + }, + { + "epoch": 1.789047310434219, + "grad_norm": 0.7903842329978943, + "learning_rate": 1.8498552960656378e-06, + "loss": 0.1075, + "step": 5521 + }, + { + "epoch": 1.7893713545042127, + "grad_norm": 0.8699726462364197, + "learning_rate": 1.8490108202875023e-06, + "loss": 0.1256, + "step": 5522 + }, + { + "epoch": 1.7896953985742061, + "grad_norm": 0.7327967882156372, + "learning_rate": 1.848166424184142e-06, + "loss": 0.1058, + "step": 5523 + }, + { + "epoch": 1.7900194426441995, + "grad_norm": 0.8472350239753723, + "learning_rate": 1.8473221078589006e-06, + "loss": 0.1179, + "step": 5524 + }, + { + "epoch": 1.790343486714193, + "grad_norm": 0.7945756316184998, + "learning_rate": 1.846477871415114e-06, + "loss": 0.108, + "step": 5525 + }, + { + "epoch": 1.7906675307841866, + "grad_norm": 0.8109987378120422, + "learning_rate": 1.8456337149561105e-06, + "loss": 0.1162, + "step": 5526 + }, + { + "epoch": 1.7909915748541803, + "grad_norm": 0.808334469795227, + "learning_rate": 1.8447896385852043e-06, + "loss": 0.1114, + "step": 5527 + }, + { + "epoch": 1.7913156189241737, + "grad_norm": 0.8291633725166321, + "learning_rate": 1.8439456424057044e-06, + "loss": 0.1116, + "step": 5528 + }, + { + "epoch": 1.791639662994167, + "grad_norm": 0.822845995426178, + "learning_rate": 1.8431017265209067e-06, + "loss": 0.1138, + "step": 5529 + }, + { + "epoch": 1.7919637070641607, + "grad_norm": 0.7952374815940857, + "learning_rate": 1.8422578910340985e-06, + "loss": 0.1073, + "step": 5530 + }, + { + "epoch": 1.7922877511341544, + "grad_norm": 0.8398154973983765, + "learning_rate": 1.8414141360485565e-06, + "loss": 0.1159, + "step": 5531 + }, + { + "epoch": 1.7926117952041478, + "grad_norm": 0.7845988273620605, + "learning_rate": 1.840570461667549e-06, + "loss": 0.1043, + "step": 5532 + }, + { + "epoch": 1.7929358392741412, + "grad_norm": 0.8346614241600037, + "learning_rate": 1.8397268679943333e-06, + "loss": 0.1109, + "step": 5533 + }, + { + "epoch": 1.7932598833441347, + "grad_norm": 0.8257557153701782, + "learning_rate": 1.8388833551321562e-06, + "loss": 0.1118, + "step": 5534 + }, + { + "epoch": 1.7935839274141283, + "grad_norm": 0.8189712762832642, + "learning_rate": 1.838039923184257e-06, + "loss": 0.1122, + "step": 5535 + }, + { + "epoch": 1.793907971484122, + "grad_norm": 0.7669243812561035, + "learning_rate": 1.8371965722538636e-06, + "loss": 0.1116, + "step": 5536 + }, + { + "epoch": 1.7942320155541154, + "grad_norm": 0.7271428108215332, + "learning_rate": 1.836353302444192e-06, + "loss": 0.1006, + "step": 5537 + }, + { + "epoch": 1.7945560596241088, + "grad_norm": 0.8294802308082581, + "learning_rate": 1.8355101138584524e-06, + "loss": 0.1216, + "step": 5538 + }, + { + "epoch": 1.7948801036941024, + "grad_norm": 0.747330904006958, + "learning_rate": 1.8346670065998411e-06, + "loss": 0.1061, + "step": 5539 + }, + { + "epoch": 1.7952041477640959, + "grad_norm": 0.8953630328178406, + "learning_rate": 1.8338239807715486e-06, + "loss": 0.1247, + "step": 5540 + }, + { + "epoch": 1.7955281918340895, + "grad_norm": 0.8005939722061157, + "learning_rate": 1.8329810364767511e-06, + "loss": 0.111, + "step": 5541 + }, + { + "epoch": 1.795852235904083, + "grad_norm": 0.7868248224258423, + "learning_rate": 1.8321381738186178e-06, + "loss": 0.1054, + "step": 5542 + }, + { + "epoch": 1.7961762799740764, + "grad_norm": 0.7667168974876404, + "learning_rate": 1.8312953929003068e-06, + "loss": 0.1025, + "step": 5543 + }, + { + "epoch": 1.79650032404407, + "grad_norm": 0.7552728652954102, + "learning_rate": 1.8304526938249653e-06, + "loss": 0.1005, + "step": 5544 + }, + { + "epoch": 1.7968243681140637, + "grad_norm": 0.8472879528999329, + "learning_rate": 1.8296100766957331e-06, + "loss": 0.118, + "step": 5545 + }, + { + "epoch": 1.797148412184057, + "grad_norm": 0.7923120260238647, + "learning_rate": 1.828767541615737e-06, + "loss": 0.1094, + "step": 5546 + }, + { + "epoch": 1.7974724562540505, + "grad_norm": 0.8863158822059631, + "learning_rate": 1.8279250886880962e-06, + "loss": 0.1308, + "step": 5547 + }, + { + "epoch": 1.797796500324044, + "grad_norm": 0.8249463438987732, + "learning_rate": 1.827082718015919e-06, + "loss": 0.1177, + "step": 5548 + }, + { + "epoch": 1.7981205443940376, + "grad_norm": 0.7819499373435974, + "learning_rate": 1.8262404297023013e-06, + "loss": 0.1148, + "step": 5549 + }, + { + "epoch": 1.7984445884640312, + "grad_norm": 0.7605578303337097, + "learning_rate": 1.8253982238503338e-06, + "loss": 0.1037, + "step": 5550 + }, + { + "epoch": 1.7987686325340246, + "grad_norm": 0.778826117515564, + "learning_rate": 1.8245561005630921e-06, + "loss": 0.108, + "step": 5551 + }, + { + "epoch": 1.799092676604018, + "grad_norm": 0.8250377178192139, + "learning_rate": 1.823714059943646e-06, + "loss": 0.1149, + "step": 5552 + }, + { + "epoch": 1.7994167206740117, + "grad_norm": 0.776631772518158, + "learning_rate": 1.8228721020950504e-06, + "loss": 0.1045, + "step": 5553 + }, + { + "epoch": 1.7997407647440054, + "grad_norm": 0.8652671575546265, + "learning_rate": 1.8220302271203557e-06, + "loss": 0.1172, + "step": 5554 + }, + { + "epoch": 1.8000648088139988, + "grad_norm": 0.7777321934700012, + "learning_rate": 1.8211884351225978e-06, + "loss": 0.105, + "step": 5555 + }, + { + "epoch": 1.8003888528839922, + "grad_norm": 0.8324203491210938, + "learning_rate": 1.8203467262048033e-06, + "loss": 0.1198, + "step": 5556 + }, + { + "epoch": 1.8007128969539856, + "grad_norm": 0.7479450702667236, + "learning_rate": 1.819505100469991e-06, + "loss": 0.1027, + "step": 5557 + }, + { + "epoch": 1.8010369410239793, + "grad_norm": 0.8221516013145447, + "learning_rate": 1.8186635580211654e-06, + "loss": 0.1082, + "step": 5558 + }, + { + "epoch": 1.801360985093973, + "grad_norm": 0.8015259504318237, + "learning_rate": 1.8178220989613255e-06, + "loss": 0.1073, + "step": 5559 + }, + { + "epoch": 1.8016850291639663, + "grad_norm": 0.817034125328064, + "learning_rate": 1.8169807233934567e-06, + "loss": 0.1109, + "step": 5560 + }, + { + "epoch": 1.8020090732339598, + "grad_norm": 0.8351563215255737, + "learning_rate": 1.8161394314205343e-06, + "loss": 0.1189, + "step": 5561 + }, + { + "epoch": 1.8023331173039532, + "grad_norm": 0.7715746760368347, + "learning_rate": 1.8152982231455262e-06, + "loss": 0.1047, + "step": 5562 + }, + { + "epoch": 1.8026571613739468, + "grad_norm": 0.8527897596359253, + "learning_rate": 1.8144570986713867e-06, + "loss": 0.1201, + "step": 5563 + }, + { + "epoch": 1.8029812054439405, + "grad_norm": 0.8445289134979248, + "learning_rate": 1.8136160581010624e-06, + "loss": 0.1186, + "step": 5564 + }, + { + "epoch": 1.803305249513934, + "grad_norm": 0.877656102180481, + "learning_rate": 1.8127751015374865e-06, + "loss": 0.1211, + "step": 5565 + }, + { + "epoch": 1.8036292935839273, + "grad_norm": 0.7629379034042358, + "learning_rate": 1.8119342290835864e-06, + "loss": 0.1053, + "step": 5566 + }, + { + "epoch": 1.803953337653921, + "grad_norm": 0.8069267868995667, + "learning_rate": 1.8110934408422758e-06, + "loss": 0.1132, + "step": 5567 + }, + { + "epoch": 1.8042773817239146, + "grad_norm": 0.899695873260498, + "learning_rate": 1.810252736916458e-06, + "loss": 0.1216, + "step": 5568 + }, + { + "epoch": 1.804601425793908, + "grad_norm": 0.8216001391410828, + "learning_rate": 1.8094121174090288e-06, + "loss": 0.1135, + "step": 5569 + }, + { + "epoch": 1.8049254698639015, + "grad_norm": 0.8484271168708801, + "learning_rate": 1.80857158242287e-06, + "loss": 0.1267, + "step": 5570 + }, + { + "epoch": 1.8052495139338949, + "grad_norm": 0.845133364200592, + "learning_rate": 1.8077311320608571e-06, + "loss": 0.1172, + "step": 5571 + }, + { + "epoch": 1.8055735580038885, + "grad_norm": 0.8173267841339111, + "learning_rate": 1.806890766425851e-06, + "loss": 0.1136, + "step": 5572 + }, + { + "epoch": 1.8058976020738822, + "grad_norm": 0.8459123373031616, + "learning_rate": 1.8060504856207062e-06, + "loss": 0.1203, + "step": 5573 + }, + { + "epoch": 1.8062216461438756, + "grad_norm": 0.8414890170097351, + "learning_rate": 1.8052102897482643e-06, + "loss": 0.1127, + "step": 5574 + }, + { + "epoch": 1.806545690213869, + "grad_norm": 0.7746798396110535, + "learning_rate": 1.8043701789113552e-06, + "loss": 0.1093, + "step": 5575 + }, + { + "epoch": 1.8068697342838627, + "grad_norm": 0.8486313223838806, + "learning_rate": 1.8035301532128032e-06, + "loss": 0.1098, + "step": 5576 + }, + { + "epoch": 1.807193778353856, + "grad_norm": 0.8131248950958252, + "learning_rate": 1.8026902127554172e-06, + "loss": 0.1156, + "step": 5577 + }, + { + "epoch": 1.8075178224238497, + "grad_norm": 0.846875786781311, + "learning_rate": 1.8018503576419996e-06, + "loss": 0.1163, + "step": 5578 + }, + { + "epoch": 1.8078418664938432, + "grad_norm": 0.8525400161743164, + "learning_rate": 1.8010105879753398e-06, + "loss": 0.1169, + "step": 5579 + }, + { + "epoch": 1.8081659105638366, + "grad_norm": 0.7956811785697937, + "learning_rate": 1.800170903858216e-06, + "loss": 0.107, + "step": 5580 + }, + { + "epoch": 1.8084899546338302, + "grad_norm": 0.7707834243774414, + "learning_rate": 1.7993313053933998e-06, + "loss": 0.1065, + "step": 5581 + }, + { + "epoch": 1.8088139987038239, + "grad_norm": 0.7616016864776611, + "learning_rate": 1.7984917926836484e-06, + "loss": 0.1078, + "step": 5582 + }, + { + "epoch": 1.8091380427738173, + "grad_norm": 0.8728845715522766, + "learning_rate": 1.7976523658317104e-06, + "loss": 0.1256, + "step": 5583 + }, + { + "epoch": 1.8094620868438107, + "grad_norm": 0.8702101707458496, + "learning_rate": 1.7968130249403238e-06, + "loss": 0.1167, + "step": 5584 + }, + { + "epoch": 1.8097861309138041, + "grad_norm": 0.8324860334396362, + "learning_rate": 1.7959737701122157e-06, + "loss": 0.1242, + "step": 5585 + }, + { + "epoch": 1.8101101749837978, + "grad_norm": 0.8261635303497314, + "learning_rate": 1.7951346014501027e-06, + "loss": 0.1085, + "step": 5586 + }, + { + "epoch": 1.8104342190537914, + "grad_norm": 0.8950084447860718, + "learning_rate": 1.7942955190566899e-06, + "loss": 0.1043, + "step": 5587 + }, + { + "epoch": 1.8107582631237849, + "grad_norm": 0.7961917519569397, + "learning_rate": 1.7934565230346752e-06, + "loss": 0.1083, + "step": 5588 + }, + { + "epoch": 1.8110823071937783, + "grad_norm": 0.7906010746955872, + "learning_rate": 1.7926176134867408e-06, + "loss": 0.1068, + "step": 5589 + }, + { + "epoch": 1.811406351263772, + "grad_norm": 0.841482937335968, + "learning_rate": 1.7917787905155634e-06, + "loss": 0.1244, + "step": 5590 + }, + { + "epoch": 1.8117303953337653, + "grad_norm": 0.802306592464447, + "learning_rate": 1.790940054223806e-06, + "loss": 0.1149, + "step": 5591 + }, + { + "epoch": 1.812054439403759, + "grad_norm": 0.8944019675254822, + "learning_rate": 1.7901014047141208e-06, + "loss": 0.1219, + "step": 5592 + }, + { + "epoch": 1.8123784834737524, + "grad_norm": 0.8466723561286926, + "learning_rate": 1.7892628420891526e-06, + "loss": 0.1077, + "step": 5593 + }, + { + "epoch": 1.8127025275437458, + "grad_norm": 0.8002016544342041, + "learning_rate": 1.788424366451531e-06, + "loss": 0.1126, + "step": 5594 + }, + { + "epoch": 1.8130265716137395, + "grad_norm": 0.7788607478141785, + "learning_rate": 1.7875859779038796e-06, + "loss": 0.1081, + "step": 5595 + }, + { + "epoch": 1.8133506156837331, + "grad_norm": 0.7892195582389832, + "learning_rate": 1.7867476765488061e-06, + "loss": 0.1089, + "step": 5596 + }, + { + "epoch": 1.8136746597537265, + "grad_norm": 0.8240311741828918, + "learning_rate": 1.7859094624889135e-06, + "loss": 0.1165, + "step": 5597 + }, + { + "epoch": 1.81399870382372, + "grad_norm": 0.8330913186073303, + "learning_rate": 1.7850713358267897e-06, + "loss": 0.1157, + "step": 5598 + }, + { + "epoch": 1.8143227478937134, + "grad_norm": 0.8133293390274048, + "learning_rate": 1.7842332966650122e-06, + "loss": 0.1139, + "step": 5599 + }, + { + "epoch": 1.814646791963707, + "grad_norm": 0.8080258369445801, + "learning_rate": 1.7833953451061513e-06, + "loss": 0.1074, + "step": 5600 + }, + { + "epoch": 1.8149708360337007, + "grad_norm": 0.7805373072624207, + "learning_rate": 1.7825574812527617e-06, + "loss": 0.1061, + "step": 5601 + }, + { + "epoch": 1.815294880103694, + "grad_norm": 0.7952004671096802, + "learning_rate": 1.781719705207392e-06, + "loss": 0.1103, + "step": 5602 + }, + { + "epoch": 1.8156189241736875, + "grad_norm": 0.7933663725852966, + "learning_rate": 1.7808820170725772e-06, + "loss": 0.1052, + "step": 5603 + }, + { + "epoch": 1.8159429682436812, + "grad_norm": 0.857387900352478, + "learning_rate": 1.7800444169508414e-06, + "loss": 0.1174, + "step": 5604 + }, + { + "epoch": 1.8162670123136748, + "grad_norm": 0.8322799205780029, + "learning_rate": 1.7792069049446987e-06, + "loss": 0.1169, + "step": 5605 + }, + { + "epoch": 1.8165910563836682, + "grad_norm": 0.7579730749130249, + "learning_rate": 1.7783694811566534e-06, + "loss": 0.1065, + "step": 5606 + }, + { + "epoch": 1.8169151004536617, + "grad_norm": 0.7945507764816284, + "learning_rate": 1.777532145689198e-06, + "loss": 0.1095, + "step": 5607 + }, + { + "epoch": 1.817239144523655, + "grad_norm": 0.7895393967628479, + "learning_rate": 1.7766948986448131e-06, + "loss": 0.1058, + "step": 5608 + }, + { + "epoch": 1.8175631885936487, + "grad_norm": 0.7503500580787659, + "learning_rate": 1.7758577401259716e-06, + "loss": 0.1079, + "step": 5609 + }, + { + "epoch": 1.8178872326636424, + "grad_norm": 0.7820692658424377, + "learning_rate": 1.7750206702351325e-06, + "loss": 0.1079, + "step": 5610 + }, + { + "epoch": 1.8182112767336358, + "grad_norm": 0.8591213226318359, + "learning_rate": 1.7741836890747438e-06, + "loss": 0.1171, + "step": 5611 + }, + { + "epoch": 1.8185353208036292, + "grad_norm": 0.7695344090461731, + "learning_rate": 1.7733467967472459e-06, + "loss": 0.1051, + "step": 5612 + }, + { + "epoch": 1.8188593648736227, + "grad_norm": 0.8064881563186646, + "learning_rate": 1.7725099933550649e-06, + "loss": 0.1082, + "step": 5613 + }, + { + "epoch": 1.8191834089436163, + "grad_norm": 0.8662197589874268, + "learning_rate": 1.7716732790006188e-06, + "loss": 0.1145, + "step": 5614 + }, + { + "epoch": 1.81950745301361, + "grad_norm": 0.8277799487113953, + "learning_rate": 1.7708366537863129e-06, + "loss": 0.1185, + "step": 5615 + }, + { + "epoch": 1.8198314970836034, + "grad_norm": 0.7557022571563721, + "learning_rate": 1.7700001178145409e-06, + "loss": 0.1029, + "step": 5616 + }, + { + "epoch": 1.8201555411535968, + "grad_norm": 0.7699225544929504, + "learning_rate": 1.7691636711876883e-06, + "loss": 0.1046, + "step": 5617 + }, + { + "epoch": 1.8204795852235904, + "grad_norm": 0.7422550916671753, + "learning_rate": 1.768327314008126e-06, + "loss": 0.1037, + "step": 5618 + }, + { + "epoch": 1.820803629293584, + "grad_norm": 0.8538484573364258, + "learning_rate": 1.7674910463782186e-06, + "loss": 0.1219, + "step": 5619 + }, + { + "epoch": 1.8211276733635775, + "grad_norm": 0.7328668236732483, + "learning_rate": 1.766654868400315e-06, + "loss": 0.108, + "step": 5620 + }, + { + "epoch": 1.821451717433571, + "grad_norm": 0.7712154388427734, + "learning_rate": 1.7658187801767568e-06, + "loss": 0.1009, + "step": 5621 + }, + { + "epoch": 1.8217757615035644, + "grad_norm": 0.7927885055541992, + "learning_rate": 1.7649827818098727e-06, + "loss": 0.1069, + "step": 5622 + }, + { + "epoch": 1.822099805573558, + "grad_norm": 0.7272517681121826, + "learning_rate": 1.7641468734019795e-06, + "loss": 0.0989, + "step": 5623 + }, + { + "epoch": 1.8224238496435516, + "grad_norm": 0.7580925822257996, + "learning_rate": 1.7633110550553867e-06, + "loss": 0.1062, + "step": 5624 + }, + { + "epoch": 1.822747893713545, + "grad_norm": 0.8520054817199707, + "learning_rate": 1.7624753268723882e-06, + "loss": 0.1196, + "step": 5625 + }, + { + "epoch": 1.8230719377835385, + "grad_norm": 0.8394516706466675, + "learning_rate": 1.7616396889552706e-06, + "loss": 0.115, + "step": 5626 + }, + { + "epoch": 1.8233959818535321, + "grad_norm": 0.9501622319221497, + "learning_rate": 1.7608041414063065e-06, + "loss": 0.1275, + "step": 5627 + }, + { + "epoch": 1.8237200259235256, + "grad_norm": 0.8643268942832947, + "learning_rate": 1.7599686843277596e-06, + "loss": 0.1206, + "step": 5628 + }, + { + "epoch": 1.8240440699935192, + "grad_norm": 0.8088141083717346, + "learning_rate": 1.7591333178218823e-06, + "loss": 0.1037, + "step": 5629 + }, + { + "epoch": 1.8243681140635126, + "grad_norm": 0.8530080318450928, + "learning_rate": 1.7582980419909135e-06, + "loss": 0.1229, + "step": 5630 + }, + { + "epoch": 1.824692158133506, + "grad_norm": 0.8482834696769714, + "learning_rate": 1.7574628569370855e-06, + "loss": 0.1144, + "step": 5631 + }, + { + "epoch": 1.8250162022034997, + "grad_norm": 0.8686640858650208, + "learning_rate": 1.756627762762614e-06, + "loss": 0.1174, + "step": 5632 + }, + { + "epoch": 1.8253402462734933, + "grad_norm": 0.812243640422821, + "learning_rate": 1.7557927595697094e-06, + "loss": 0.1156, + "step": 5633 + }, + { + "epoch": 1.8256642903434868, + "grad_norm": 0.8343957662582397, + "learning_rate": 1.7549578474605661e-06, + "loss": 0.1134, + "step": 5634 + }, + { + "epoch": 1.8259883344134802, + "grad_norm": 0.7635195255279541, + "learning_rate": 1.754123026537369e-06, + "loss": 0.1032, + "step": 5635 + }, + { + "epoch": 1.8263123784834736, + "grad_norm": 0.8139019012451172, + "learning_rate": 1.7532882969022941e-06, + "loss": 0.1145, + "step": 5636 + }, + { + "epoch": 1.8266364225534673, + "grad_norm": 0.7639754414558411, + "learning_rate": 1.752453658657502e-06, + "loss": 0.1065, + "step": 5637 + }, + { + "epoch": 1.826960466623461, + "grad_norm": 0.7924251556396484, + "learning_rate": 1.7516191119051456e-06, + "loss": 0.1129, + "step": 5638 + }, + { + "epoch": 1.8272845106934543, + "grad_norm": 0.796831488609314, + "learning_rate": 1.7507846567473643e-06, + "loss": 0.112, + "step": 5639 + }, + { + "epoch": 1.8276085547634477, + "grad_norm": 0.7968273758888245, + "learning_rate": 1.749950293286289e-06, + "loss": 0.1103, + "step": 5640 + }, + { + "epoch": 1.8279325988334414, + "grad_norm": 0.7729365825653076, + "learning_rate": 1.7491160216240368e-06, + "loss": 0.1157, + "step": 5641 + }, + { + "epoch": 1.8282566429034348, + "grad_norm": 0.7678266167640686, + "learning_rate": 1.7482818418627134e-06, + "loss": 0.1115, + "step": 5642 + }, + { + "epoch": 1.8285806869734285, + "grad_norm": 0.7949190735816956, + "learning_rate": 1.7474477541044165e-06, + "loss": 0.1096, + "step": 5643 + }, + { + "epoch": 1.8289047310434219, + "grad_norm": 0.7390647530555725, + "learning_rate": 1.746613758451228e-06, + "loss": 0.105, + "step": 5644 + }, + { + "epoch": 1.8292287751134153, + "grad_norm": 0.8184527158737183, + "learning_rate": 1.7457798550052232e-06, + "loss": 0.1188, + "step": 5645 + }, + { + "epoch": 1.829552819183409, + "grad_norm": 0.848542332649231, + "learning_rate": 1.744946043868463e-06, + "loss": 0.116, + "step": 5646 + }, + { + "epoch": 1.8298768632534026, + "grad_norm": 0.8174238204956055, + "learning_rate": 1.7441123251429968e-06, + "loss": 0.1102, + "step": 5647 + }, + { + "epoch": 1.830200907323396, + "grad_norm": 0.7834118008613586, + "learning_rate": 1.7432786989308648e-06, + "loss": 0.1089, + "step": 5648 + }, + { + "epoch": 1.8305249513933894, + "grad_norm": 0.7657251954078674, + "learning_rate": 1.7424451653340934e-06, + "loss": 0.1096, + "step": 5649 + }, + { + "epoch": 1.8308489954633829, + "grad_norm": 0.8082907795906067, + "learning_rate": 1.7416117244547014e-06, + "loss": 0.1201, + "step": 5650 + }, + { + "epoch": 1.8311730395333765, + "grad_norm": 0.8496717810630798, + "learning_rate": 1.7407783763946911e-06, + "loss": 0.1235, + "step": 5651 + }, + { + "epoch": 1.8314970836033702, + "grad_norm": 0.8051583766937256, + "learning_rate": 1.7399451212560593e-06, + "loss": 0.1154, + "step": 5652 + }, + { + "epoch": 1.8318211276733636, + "grad_norm": 0.8396188020706177, + "learning_rate": 1.7391119591407863e-06, + "loss": 0.1161, + "step": 5653 + }, + { + "epoch": 1.832145171743357, + "grad_norm": 0.847876787185669, + "learning_rate": 1.7382788901508426e-06, + "loss": 0.1184, + "step": 5654 + }, + { + "epoch": 1.8324692158133506, + "grad_norm": 0.8422421813011169, + "learning_rate": 1.7374459143881899e-06, + "loss": 0.104, + "step": 5655 + }, + { + "epoch": 1.8327932598833443, + "grad_norm": 0.7739959955215454, + "learning_rate": 1.7366130319547747e-06, + "loss": 0.1131, + "step": 5656 + }, + { + "epoch": 1.8331173039533377, + "grad_norm": 0.7968919277191162, + "learning_rate": 1.735780242952534e-06, + "loss": 0.1105, + "step": 5657 + }, + { + "epoch": 1.8334413480233311, + "grad_norm": 0.8058465123176575, + "learning_rate": 1.7349475474833938e-06, + "loss": 0.1196, + "step": 5658 + }, + { + "epoch": 1.8337653920933246, + "grad_norm": 0.8363388180732727, + "learning_rate": 1.7341149456492672e-06, + "loss": 0.1148, + "step": 5659 + }, + { + "epoch": 1.8340894361633182, + "grad_norm": 0.7761878371238708, + "learning_rate": 1.7332824375520574e-06, + "loss": 0.1007, + "step": 5660 + }, + { + "epoch": 1.8344134802333119, + "grad_norm": 0.8327564597129822, + "learning_rate": 1.7324500232936536e-06, + "loss": 0.1218, + "step": 5661 + }, + { + "epoch": 1.8347375243033053, + "grad_norm": 0.8729468584060669, + "learning_rate": 1.731617702975938e-06, + "loss": 0.1299, + "step": 5662 + }, + { + "epoch": 1.8350615683732987, + "grad_norm": 0.8369905352592468, + "learning_rate": 1.7307854767007756e-06, + "loss": 0.1154, + "step": 5663 + }, + { + "epoch": 1.8353856124432921, + "grad_norm": 0.8091652989387512, + "learning_rate": 1.7299533445700253e-06, + "loss": 0.1108, + "step": 5664 + }, + { + "epoch": 1.8357096565132858, + "grad_norm": 0.8384531736373901, + "learning_rate": 1.7291213066855312e-06, + "loss": 0.1189, + "step": 5665 + }, + { + "epoch": 1.8360337005832794, + "grad_norm": 0.8815416693687439, + "learning_rate": 1.7282893631491253e-06, + "loss": 0.117, + "step": 5666 + }, + { + "epoch": 1.8363577446532728, + "grad_norm": 0.7908858060836792, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.1106, + "step": 5667 + }, + { + "epoch": 1.8366817887232663, + "grad_norm": 0.8137184977531433, + "learning_rate": 1.7266257595278591e-06, + "loss": 0.1109, + "step": 5668 + }, + { + "epoch": 1.83700583279326, + "grad_norm": 0.7663424015045166, + "learning_rate": 1.725794099646607e-06, + "loss": 0.1098, + "step": 5669 + }, + { + "epoch": 1.8373298768632536, + "grad_norm": 0.782175600528717, + "learning_rate": 1.7249625345206623e-06, + "loss": 0.1107, + "step": 5670 + }, + { + "epoch": 1.837653920933247, + "grad_norm": 0.8185158967971802, + "learning_rate": 1.7241310642517998e-06, + "loss": 0.1171, + "step": 5671 + }, + { + "epoch": 1.8379779650032404, + "grad_norm": 0.7788330316543579, + "learning_rate": 1.7232996889417846e-06, + "loss": 0.1057, + "step": 5672 + }, + { + "epoch": 1.8383020090732338, + "grad_norm": 0.7830475568771362, + "learning_rate": 1.7224684086923677e-06, + "loss": 0.1092, + "step": 5673 + }, + { + "epoch": 1.8386260531432275, + "grad_norm": 0.7791877388954163, + "learning_rate": 1.7216372236052914e-06, + "loss": 0.1124, + "step": 5674 + }, + { + "epoch": 1.8389500972132211, + "grad_norm": 0.7709876298904419, + "learning_rate": 1.7208061337822828e-06, + "loss": 0.1102, + "step": 5675 + }, + { + "epoch": 1.8392741412832145, + "grad_norm": 0.8033519387245178, + "learning_rate": 1.7199751393250614e-06, + "loss": 0.1087, + "step": 5676 + }, + { + "epoch": 1.839598185353208, + "grad_norm": 0.8064829707145691, + "learning_rate": 1.7191442403353314e-06, + "loss": 0.1106, + "step": 5677 + }, + { + "epoch": 1.8399222294232016, + "grad_norm": 0.8984112739562988, + "learning_rate": 1.7183134369147866e-06, + "loss": 0.1151, + "step": 5678 + }, + { + "epoch": 1.840246273493195, + "grad_norm": 0.8125988245010376, + "learning_rate": 1.71748272916511e-06, + "loss": 0.1144, + "step": 5679 + }, + { + "epoch": 1.8405703175631887, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.716652117187972e-06, + "loss": 0.108, + "step": 5680 + }, + { + "epoch": 1.840894361633182, + "grad_norm": 0.7781147956848145, + "learning_rate": 1.7158216010850318e-06, + "loss": 0.1156, + "step": 5681 + }, + { + "epoch": 1.8412184057031755, + "grad_norm": 0.8234754800796509, + "learning_rate": 1.7149911809579361e-06, + "loss": 0.1121, + "step": 5682 + }, + { + "epoch": 1.8415424497731692, + "grad_norm": 0.7627200484275818, + "learning_rate": 1.7141608569083195e-06, + "loss": 0.1027, + "step": 5683 + }, + { + "epoch": 1.8418664938431628, + "grad_norm": 0.7426549196243286, + "learning_rate": 1.7133306290378077e-06, + "loss": 0.0976, + "step": 5684 + }, + { + "epoch": 1.8421905379131562, + "grad_norm": 0.8514507412910461, + "learning_rate": 1.7125004974480102e-06, + "loss": 0.1075, + "step": 5685 + }, + { + "epoch": 1.8425145819831497, + "grad_norm": 0.7836448550224304, + "learning_rate": 1.7116704622405295e-06, + "loss": 0.1102, + "step": 5686 + }, + { + "epoch": 1.842838626053143, + "grad_norm": 0.7563350796699524, + "learning_rate": 1.7108405235169511e-06, + "loss": 0.0993, + "step": 5687 + }, + { + "epoch": 1.8431626701231367, + "grad_norm": 0.8214438557624817, + "learning_rate": 1.7100106813788544e-06, + "loss": 0.117, + "step": 5688 + }, + { + "epoch": 1.8434867141931304, + "grad_norm": 0.8818809390068054, + "learning_rate": 1.7091809359278025e-06, + "loss": 0.1211, + "step": 5689 + }, + { + "epoch": 1.8438107582631238, + "grad_norm": 0.8602452278137207, + "learning_rate": 1.7083512872653477e-06, + "loss": 0.1177, + "step": 5690 + }, + { + "epoch": 1.8441348023331172, + "grad_norm": 0.8079312443733215, + "learning_rate": 1.7075217354930324e-06, + "loss": 0.1103, + "step": 5691 + }, + { + "epoch": 1.8444588464031109, + "grad_norm": 0.742443323135376, + "learning_rate": 1.7066922807123834e-06, + "loss": 0.1041, + "step": 5692 + }, + { + "epoch": 1.8447828904731045, + "grad_norm": 0.7868834733963013, + "learning_rate": 1.7058629230249207e-06, + "loss": 0.1048, + "step": 5693 + }, + { + "epoch": 1.845106934543098, + "grad_norm": 0.7643986940383911, + "learning_rate": 1.7050336625321484e-06, + "loss": 0.1067, + "step": 5694 + }, + { + "epoch": 1.8454309786130914, + "grad_norm": 0.8305909037590027, + "learning_rate": 1.704204499335559e-06, + "loss": 0.1121, + "step": 5695 + }, + { + "epoch": 1.8457550226830848, + "grad_norm": 0.8438430428504944, + "learning_rate": 1.7033754335366356e-06, + "loss": 0.1189, + "step": 5696 + }, + { + "epoch": 1.8460790667530784, + "grad_norm": 0.8111346364021301, + "learning_rate": 1.7025464652368464e-06, + "loss": 0.1134, + "step": 5697 + }, + { + "epoch": 1.846403110823072, + "grad_norm": 0.8159440755844116, + "learning_rate": 1.701717594537651e-06, + "loss": 0.1141, + "step": 5698 + }, + { + "epoch": 1.8467271548930655, + "grad_norm": 0.7892287969589233, + "learning_rate": 1.7008888215404933e-06, + "loss": 0.1072, + "step": 5699 + }, + { + "epoch": 1.847051198963059, + "grad_norm": 0.7901312112808228, + "learning_rate": 1.7000601463468088e-06, + "loss": 0.1073, + "step": 5700 + }, + { + "epoch": 1.8473752430330523, + "grad_norm": 0.7535051703453064, + "learning_rate": 1.6992315690580178e-06, + "loss": 0.0971, + "step": 5701 + }, + { + "epoch": 1.847699287103046, + "grad_norm": 0.798143208026886, + "learning_rate": 1.6984030897755304e-06, + "loss": 0.1134, + "step": 5702 + }, + { + "epoch": 1.8480233311730396, + "grad_norm": 0.8003287315368652, + "learning_rate": 1.6975747086007454e-06, + "loss": 0.1073, + "step": 5703 + }, + { + "epoch": 1.848347375243033, + "grad_norm": 0.8201723694801331, + "learning_rate": 1.6967464256350468e-06, + "loss": 0.1188, + "step": 5704 + }, + { + "epoch": 1.8486714193130265, + "grad_norm": 0.8480156660079956, + "learning_rate": 1.6959182409798111e-06, + "loss": 0.116, + "step": 5705 + }, + { + "epoch": 1.8489954633830201, + "grad_norm": 0.783407986164093, + "learning_rate": 1.695090154736398e-06, + "loss": 0.108, + "step": 5706 + }, + { + "epoch": 1.8493195074530138, + "grad_norm": 0.7597818970680237, + "learning_rate": 1.6942621670061574e-06, + "loss": 0.1106, + "step": 5707 + }, + { + "epoch": 1.8496435515230072, + "grad_norm": 0.7512392401695251, + "learning_rate": 1.693434277890428e-06, + "loss": 0.1107, + "step": 5708 + }, + { + "epoch": 1.8499675955930006, + "grad_norm": 0.7646990418434143, + "learning_rate": 1.692606487490534e-06, + "loss": 0.1071, + "step": 5709 + }, + { + "epoch": 1.850291639662994, + "grad_norm": 0.8326711058616638, + "learning_rate": 1.6917787959077907e-06, + "loss": 0.1266, + "step": 5710 + }, + { + "epoch": 1.8506156837329877, + "grad_norm": 0.8679319620132446, + "learning_rate": 1.6909512032434984e-06, + "loss": 0.1271, + "step": 5711 + }, + { + "epoch": 1.8509397278029813, + "grad_norm": 0.733222484588623, + "learning_rate": 1.6901237095989464e-06, + "loss": 0.0994, + "step": 5712 + }, + { + "epoch": 1.8512637718729748, + "grad_norm": 0.8290033340454102, + "learning_rate": 1.6892963150754128e-06, + "loss": 0.1091, + "step": 5713 + }, + { + "epoch": 1.8515878159429682, + "grad_norm": 0.8612515926361084, + "learning_rate": 1.6884690197741608e-06, + "loss": 0.1217, + "step": 5714 + }, + { + "epoch": 1.8519118600129616, + "grad_norm": 0.8125770092010498, + "learning_rate": 1.6876418237964453e-06, + "loss": 0.1133, + "step": 5715 + }, + { + "epoch": 1.8522359040829552, + "grad_norm": 0.7832993268966675, + "learning_rate": 1.6868147272435057e-06, + "loss": 0.107, + "step": 5716 + }, + { + "epoch": 1.8525599481529489, + "grad_norm": 0.8174955248832703, + "learning_rate": 1.6859877302165723e-06, + "loss": 0.1116, + "step": 5717 + }, + { + "epoch": 1.8528839922229423, + "grad_norm": 0.8022476434707642, + "learning_rate": 1.6851608328168589e-06, + "loss": 0.112, + "step": 5718 + }, + { + "epoch": 1.8532080362929357, + "grad_norm": 0.7933598160743713, + "learning_rate": 1.6843340351455728e-06, + "loss": 0.1077, + "step": 5719 + }, + { + "epoch": 1.8535320803629294, + "grad_norm": 0.9144831895828247, + "learning_rate": 1.6835073373039045e-06, + "loss": 0.1126, + "step": 5720 + }, + { + "epoch": 1.853856124432923, + "grad_norm": 0.7254115343093872, + "learning_rate": 1.6826807393930334e-06, + "loss": 0.0942, + "step": 5721 + }, + { + "epoch": 1.8541801685029164, + "grad_norm": 0.8766010999679565, + "learning_rate": 1.6818542415141273e-06, + "loss": 0.1158, + "step": 5722 + }, + { + "epoch": 1.8545042125729099, + "grad_norm": 0.8294383883476257, + "learning_rate": 1.6810278437683419e-06, + "loss": 0.1195, + "step": 5723 + }, + { + "epoch": 1.8548282566429033, + "grad_norm": 0.8535876274108887, + "learning_rate": 1.6802015462568205e-06, + "loss": 0.1182, + "step": 5724 + }, + { + "epoch": 1.855152300712897, + "grad_norm": 0.7217211723327637, + "learning_rate": 1.6793753490806939e-06, + "loss": 0.097, + "step": 5725 + }, + { + "epoch": 1.8554763447828906, + "grad_norm": 0.800193727016449, + "learning_rate": 1.678549252341079e-06, + "loss": 0.115, + "step": 5726 + }, + { + "epoch": 1.855800388852884, + "grad_norm": 0.7480499148368835, + "learning_rate": 1.6777232561390844e-06, + "loss": 0.1029, + "step": 5727 + }, + { + "epoch": 1.8561244329228774, + "grad_norm": 0.7266390919685364, + "learning_rate": 1.6768973605758021e-06, + "loss": 0.1011, + "step": 5728 + }, + { + "epoch": 1.856448476992871, + "grad_norm": 0.8023138642311096, + "learning_rate": 1.6760715657523158e-06, + "loss": 0.1112, + "step": 5729 + }, + { + "epoch": 1.8567725210628645, + "grad_norm": 0.7607341408729553, + "learning_rate": 1.6752458717696928e-06, + "loss": 0.1078, + "step": 5730 + }, + { + "epoch": 1.8570965651328581, + "grad_norm": 0.7758510112762451, + "learning_rate": 1.674420278728991e-06, + "loss": 0.1066, + "step": 5731 + }, + { + "epoch": 1.8574206092028516, + "grad_norm": 0.8236150145530701, + "learning_rate": 1.6735947867312553e-06, + "loss": 0.1162, + "step": 5732 + }, + { + "epoch": 1.857744653272845, + "grad_norm": 0.826967716217041, + "learning_rate": 1.6727693958775172e-06, + "loss": 0.1165, + "step": 5733 + }, + { + "epoch": 1.8580686973428386, + "grad_norm": 0.8800385594367981, + "learning_rate": 1.671944106268797e-06, + "loss": 0.124, + "step": 5734 + }, + { + "epoch": 1.8583927414128323, + "grad_norm": 0.7806604504585266, + "learning_rate": 1.671118918006101e-06, + "loss": 0.1119, + "step": 5735 + }, + { + "epoch": 1.8587167854828257, + "grad_norm": 0.8205379843711853, + "learning_rate": 1.6702938311904262e-06, + "loss": 0.1139, + "step": 5736 + }, + { + "epoch": 1.8590408295528191, + "grad_norm": 0.7946128845214844, + "learning_rate": 1.6694688459227545e-06, + "loss": 0.1129, + "step": 5737 + }, + { + "epoch": 1.8593648736228126, + "grad_norm": 0.733856737613678, + "learning_rate": 1.6686439623040548e-06, + "loss": 0.1001, + "step": 5738 + }, + { + "epoch": 1.8596889176928062, + "grad_norm": 0.8636114001274109, + "learning_rate": 1.6678191804352873e-06, + "loss": 0.1161, + "step": 5739 + }, + { + "epoch": 1.8600129617627998, + "grad_norm": 0.8229653239250183, + "learning_rate": 1.6669945004173944e-06, + "loss": 0.1152, + "step": 5740 + }, + { + "epoch": 1.8603370058327933, + "grad_norm": 0.7979140281677246, + "learning_rate": 1.6661699223513118e-06, + "loss": 0.1155, + "step": 5741 + }, + { + "epoch": 1.8606610499027867, + "grad_norm": 0.8589354753494263, + "learning_rate": 1.6653454463379582e-06, + "loss": 0.1226, + "step": 5742 + }, + { + "epoch": 1.8609850939727803, + "grad_norm": 0.8426529765129089, + "learning_rate": 1.6645210724782423e-06, + "loss": 0.1196, + "step": 5743 + }, + { + "epoch": 1.861309138042774, + "grad_norm": 0.7011165618896484, + "learning_rate": 1.6636968008730586e-06, + "loss": 0.0934, + "step": 5744 + }, + { + "epoch": 1.8616331821127674, + "grad_norm": 0.7157084941864014, + "learning_rate": 1.6628726316232902e-06, + "loss": 0.0959, + "step": 5745 + }, + { + "epoch": 1.8619572261827608, + "grad_norm": 0.8379035592079163, + "learning_rate": 1.6620485648298084e-06, + "loss": 0.1115, + "step": 5746 + }, + { + "epoch": 1.8622812702527543, + "grad_norm": 0.728182315826416, + "learning_rate": 1.6612246005934694e-06, + "loss": 0.1021, + "step": 5747 + }, + { + "epoch": 1.862605314322748, + "grad_norm": 0.832934558391571, + "learning_rate": 1.66040073901512e-06, + "loss": 0.1214, + "step": 5748 + }, + { + "epoch": 1.8629293583927415, + "grad_norm": 0.7622983455657959, + "learning_rate": 1.6595769801955925e-06, + "loss": 0.103, + "step": 5749 + }, + { + "epoch": 1.863253402462735, + "grad_norm": 0.9064056873321533, + "learning_rate": 1.6587533242357053e-06, + "loss": 0.104, + "step": 5750 + }, + { + "epoch": 1.8635774465327284, + "grad_norm": 0.8469486832618713, + "learning_rate": 1.6579297712362686e-06, + "loss": 0.1173, + "step": 5751 + }, + { + "epoch": 1.8639014906027218, + "grad_norm": 0.8399779796600342, + "learning_rate": 1.6571063212980753e-06, + "loss": 0.1172, + "step": 5752 + }, + { + "epoch": 1.8642255346727155, + "grad_norm": 0.9038817286491394, + "learning_rate": 1.6562829745219089e-06, + "loss": 0.1134, + "step": 5753 + }, + { + "epoch": 1.864549578742709, + "grad_norm": 0.7293316125869751, + "learning_rate": 1.6554597310085383e-06, + "loss": 0.0983, + "step": 5754 + }, + { + "epoch": 1.8648736228127025, + "grad_norm": 0.7801869511604309, + "learning_rate": 1.6546365908587213e-06, + "loss": 0.1103, + "step": 5755 + }, + { + "epoch": 1.865197666882696, + "grad_norm": 0.8423866033554077, + "learning_rate": 1.653813554173202e-06, + "loss": 0.1205, + "step": 5756 + }, + { + "epoch": 1.8655217109526896, + "grad_norm": 0.8145995736122131, + "learning_rate": 1.6529906210527107e-06, + "loss": 0.1146, + "step": 5757 + }, + { + "epoch": 1.8658457550226832, + "grad_norm": 0.8038000464439392, + "learning_rate": 1.6521677915979688e-06, + "loss": 0.1106, + "step": 5758 + }, + { + "epoch": 1.8661697990926767, + "grad_norm": 0.8118459582328796, + "learning_rate": 1.6513450659096804e-06, + "loss": 0.1099, + "step": 5759 + }, + { + "epoch": 1.86649384316267, + "grad_norm": 0.9116185307502747, + "learning_rate": 1.6505224440885414e-06, + "loss": 0.122, + "step": 5760 + }, + { + "epoch": 1.8668178872326635, + "grad_norm": 0.7717702984809875, + "learning_rate": 1.649699926235232e-06, + "loss": 0.1033, + "step": 5761 + }, + { + "epoch": 1.8671419313026572, + "grad_norm": 0.8235916495323181, + "learning_rate": 1.6488775124504188e-06, + "loss": 0.1252, + "step": 5762 + }, + { + "epoch": 1.8674659753726508, + "grad_norm": 0.8452646732330322, + "learning_rate": 1.6480552028347597e-06, + "loss": 0.1179, + "step": 5763 + }, + { + "epoch": 1.8677900194426442, + "grad_norm": 0.7542973160743713, + "learning_rate": 1.6472329974888956e-06, + "loss": 0.1077, + "step": 5764 + }, + { + "epoch": 1.8681140635126376, + "grad_norm": 0.8130019307136536, + "learning_rate": 1.6464108965134578e-06, + "loss": 0.1183, + "step": 5765 + }, + { + "epoch": 1.8684381075826313, + "grad_norm": 0.8335098028182983, + "learning_rate": 1.645588900009062e-06, + "loss": 0.1148, + "step": 5766 + }, + { + "epoch": 1.8687621516526247, + "grad_norm": 0.8066989779472351, + "learning_rate": 1.6447670080763146e-06, + "loss": 0.1193, + "step": 5767 + }, + { + "epoch": 1.8690861957226184, + "grad_norm": 0.8658890724182129, + "learning_rate": 1.6439452208158058e-06, + "loss": 0.1165, + "step": 5768 + }, + { + "epoch": 1.8694102397926118, + "grad_norm": 0.786740779876709, + "learning_rate": 1.6431235383281135e-06, + "loss": 0.1164, + "step": 5769 + }, + { + "epoch": 1.8697342838626052, + "grad_norm": 0.8632078766822815, + "learning_rate": 1.6423019607138064e-06, + "loss": 0.1092, + "step": 5770 + }, + { + "epoch": 1.8700583279325989, + "grad_norm": 0.777319610118866, + "learning_rate": 1.641480488073435e-06, + "loss": 0.1151, + "step": 5771 + }, + { + "epoch": 1.8703823720025925, + "grad_norm": 0.7908869385719299, + "learning_rate": 1.6406591205075417e-06, + "loss": 0.1074, + "step": 5772 + }, + { + "epoch": 1.870706416072586, + "grad_norm": 0.7509408593177795, + "learning_rate": 1.639837858116653e-06, + "loss": 0.1022, + "step": 5773 + }, + { + "epoch": 1.8710304601425793, + "grad_norm": 0.8177096247673035, + "learning_rate": 1.6390167010012824e-06, + "loss": 0.1157, + "step": 5774 + }, + { + "epoch": 1.8713545042125728, + "grad_norm": 0.7938538193702698, + "learning_rate": 1.638195649261934e-06, + "loss": 0.1048, + "step": 5775 + }, + { + "epoch": 1.8716785482825664, + "grad_norm": 0.8329545855522156, + "learning_rate": 1.6373747029990943e-06, + "loss": 0.12, + "step": 5776 + }, + { + "epoch": 1.87200259235256, + "grad_norm": 0.7647078633308411, + "learning_rate": 1.6365538623132405e-06, + "loss": 0.0998, + "step": 5777 + }, + { + "epoch": 1.8723266364225535, + "grad_norm": 0.7947881817817688, + "learning_rate": 1.6357331273048343e-06, + "loss": 0.1172, + "step": 5778 + }, + { + "epoch": 1.872650680492547, + "grad_norm": 0.8033885359764099, + "learning_rate": 1.6349124980743278e-06, + "loss": 0.1067, + "step": 5779 + }, + { + "epoch": 1.8729747245625405, + "grad_norm": 0.8645670413970947, + "learning_rate": 1.6340919747221568e-06, + "loss": 0.1126, + "step": 5780 + }, + { + "epoch": 1.873298768632534, + "grad_norm": 0.6997979879379272, + "learning_rate": 1.633271557348744e-06, + "loss": 0.097, + "step": 5781 + }, + { + "epoch": 1.8736228127025276, + "grad_norm": 0.7694612145423889, + "learning_rate": 1.6324512460545034e-06, + "loss": 0.1058, + "step": 5782 + }, + { + "epoch": 1.873946856772521, + "grad_norm": 0.8525017499923706, + "learning_rate": 1.6316310409398306e-06, + "loss": 0.123, + "step": 5783 + }, + { + "epoch": 1.8742709008425145, + "grad_norm": 0.7532781362533569, + "learning_rate": 1.6308109421051132e-06, + "loss": 0.1098, + "step": 5784 + }, + { + "epoch": 1.874594944912508, + "grad_norm": 0.887808084487915, + "learning_rate": 1.6299909496507214e-06, + "loss": 0.1232, + "step": 5785 + }, + { + "epoch": 1.8749189889825018, + "grad_norm": 0.7432640194892883, + "learning_rate": 1.6291710636770152e-06, + "loss": 0.1042, + "step": 5786 + }, + { + "epoch": 1.8752430330524952, + "grad_norm": 0.8639612197875977, + "learning_rate": 1.628351284284341e-06, + "loss": 0.1128, + "step": 5787 + }, + { + "epoch": 1.8755670771224886, + "grad_norm": 0.8335579633712769, + "learning_rate": 1.6275316115730302e-06, + "loss": 0.1175, + "step": 5788 + }, + { + "epoch": 1.875891121192482, + "grad_norm": 0.8358171582221985, + "learning_rate": 1.626712045643405e-06, + "loss": 0.1141, + "step": 5789 + }, + { + "epoch": 1.8762151652624757, + "grad_norm": 0.765986442565918, + "learning_rate": 1.6258925865957703e-06, + "loss": 0.1085, + "step": 5790 + }, + { + "epoch": 1.8765392093324693, + "grad_norm": 0.8088056445121765, + "learning_rate": 1.625073234530422e-06, + "loss": 0.1107, + "step": 5791 + }, + { + "epoch": 1.8768632534024627, + "grad_norm": 0.7557681202888489, + "learning_rate": 1.62425398954764e-06, + "loss": 0.1057, + "step": 5792 + }, + { + "epoch": 1.8771872974724562, + "grad_norm": 0.7989634275436401, + "learning_rate": 1.6234348517476905e-06, + "loss": 0.1215, + "step": 5793 + }, + { + "epoch": 1.8775113415424498, + "grad_norm": 1.2016243934631348, + "learning_rate": 1.6226158212308307e-06, + "loss": 0.1141, + "step": 5794 + }, + { + "epoch": 1.8778353856124435, + "grad_norm": 0.7669769525527954, + "learning_rate": 1.6217968980972998e-06, + "loss": 0.0992, + "step": 5795 + }, + { + "epoch": 1.8781594296824369, + "grad_norm": 0.8094942569732666, + "learning_rate": 1.620978082447327e-06, + "loss": 0.1138, + "step": 5796 + }, + { + "epoch": 1.8784834737524303, + "grad_norm": 0.8035215139389038, + "learning_rate": 1.6201593743811275e-06, + "loss": 0.11, + "step": 5797 + }, + { + "epoch": 1.8788075178224237, + "grad_norm": 0.8170638084411621, + "learning_rate": 1.6193407739989037e-06, + "loss": 0.1106, + "step": 5798 + }, + { + "epoch": 1.8791315618924174, + "grad_norm": 0.7591819763183594, + "learning_rate": 1.6185222814008434e-06, + "loss": 0.1116, + "step": 5799 + }, + { + "epoch": 1.879455605962411, + "grad_norm": 0.8090928196907043, + "learning_rate": 1.6177038966871213e-06, + "loss": 0.1098, + "step": 5800 + }, + { + "epoch": 1.8797796500324044, + "grad_norm": 0.7584290504455566, + "learning_rate": 1.6168856199579025e-06, + "loss": 0.1066, + "step": 5801 + }, + { + "epoch": 1.8801036941023979, + "grad_norm": 0.8811301589012146, + "learning_rate": 1.6160674513133332e-06, + "loss": 0.1213, + "step": 5802 + }, + { + "epoch": 1.8804277381723913, + "grad_norm": 0.8295065760612488, + "learning_rate": 1.615249390853552e-06, + "loss": 0.1057, + "step": 5803 + }, + { + "epoch": 1.880751782242385, + "grad_norm": 0.8181267380714417, + "learning_rate": 1.61443143867868e-06, + "loss": 0.115, + "step": 5804 + }, + { + "epoch": 1.8810758263123786, + "grad_norm": 0.7951749563217163, + "learning_rate": 1.613613594888826e-06, + "loss": 0.1131, + "step": 5805 + }, + { + "epoch": 1.881399870382372, + "grad_norm": 0.7731319665908813, + "learning_rate": 1.612795859584088e-06, + "loss": 0.1056, + "step": 5806 + }, + { + "epoch": 1.8817239144523654, + "grad_norm": 0.8112607598304749, + "learning_rate": 1.611978232864548e-06, + "loss": 0.115, + "step": 5807 + }, + { + "epoch": 1.882047958522359, + "grad_norm": 0.7796767354011536, + "learning_rate": 1.6111607148302758e-06, + "loss": 0.1063, + "step": 5808 + }, + { + "epoch": 1.8823720025923527, + "grad_norm": 0.7607157230377197, + "learning_rate": 1.6103433055813265e-06, + "loss": 0.1002, + "step": 5809 + }, + { + "epoch": 1.8826960466623461, + "grad_norm": 0.7605647444725037, + "learning_rate": 1.6095260052177446e-06, + "loss": 0.1037, + "step": 5810 + }, + { + "epoch": 1.8830200907323396, + "grad_norm": 0.8884912133216858, + "learning_rate": 1.6087088138395598e-06, + "loss": 0.122, + "step": 5811 + }, + { + "epoch": 1.883344134802333, + "grad_norm": 0.7630835175514221, + "learning_rate": 1.6078917315467867e-06, + "loss": 0.1037, + "step": 5812 + }, + { + "epoch": 1.8836681788723266, + "grad_norm": 0.8057610988616943, + "learning_rate": 1.6070747584394303e-06, + "loss": 0.1156, + "step": 5813 + }, + { + "epoch": 1.8839922229423203, + "grad_norm": 0.8627750277519226, + "learning_rate": 1.6062578946174785e-06, + "loss": 0.1265, + "step": 5814 + }, + { + "epoch": 1.8843162670123137, + "grad_norm": 0.8453134894371033, + "learning_rate": 1.605441140180909e-06, + "loss": 0.1056, + "step": 5815 + }, + { + "epoch": 1.8846403110823071, + "grad_norm": 0.7545762658119202, + "learning_rate": 1.6046244952296839e-06, + "loss": 0.1004, + "step": 5816 + }, + { + "epoch": 1.8849643551523008, + "grad_norm": 0.9063136577606201, + "learning_rate": 1.6038079598637523e-06, + "loss": 0.1143, + "step": 5817 + }, + { + "epoch": 1.8852883992222942, + "grad_norm": 1.0915850400924683, + "learning_rate": 1.6029915341830503e-06, + "loss": 0.1489, + "step": 5818 + }, + { + "epoch": 1.8856124432922878, + "grad_norm": 0.7500689625740051, + "learning_rate": 1.6021752182875012e-06, + "loss": 0.1045, + "step": 5819 + }, + { + "epoch": 1.8859364873622813, + "grad_norm": 0.8040003180503845, + "learning_rate": 1.6013590122770143e-06, + "loss": 0.1113, + "step": 5820 + }, + { + "epoch": 1.8862605314322747, + "grad_norm": 0.7638602256774902, + "learning_rate": 1.6005429162514834e-06, + "loss": 0.1029, + "step": 5821 + }, + { + "epoch": 1.8865845755022683, + "grad_norm": 0.7854073643684387, + "learning_rate": 1.5997269303107937e-06, + "loss": 0.1125, + "step": 5822 + }, + { + "epoch": 1.886908619572262, + "grad_norm": 0.808712363243103, + "learning_rate": 1.598911054554812e-06, + "loss": 0.1047, + "step": 5823 + }, + { + "epoch": 1.8872326636422554, + "grad_norm": 0.7914475202560425, + "learning_rate": 1.5980952890833929e-06, + "loss": 0.1034, + "step": 5824 + }, + { + "epoch": 1.8875567077122488, + "grad_norm": 0.8212109208106995, + "learning_rate": 1.5972796339963806e-06, + "loss": 0.1155, + "step": 5825 + }, + { + "epoch": 1.8878807517822422, + "grad_norm": 0.8299123644828796, + "learning_rate": 1.5964640893936015e-06, + "loss": 0.1205, + "step": 5826 + }, + { + "epoch": 1.8882047958522359, + "grad_norm": 0.8492401838302612, + "learning_rate": 1.595648655374871e-06, + "loss": 0.1188, + "step": 5827 + }, + { + "epoch": 1.8885288399222295, + "grad_norm": 0.7753995656967163, + "learning_rate": 1.5948333320399905e-06, + "loss": 0.1108, + "step": 5828 + }, + { + "epoch": 1.888852883992223, + "grad_norm": 0.7334645986557007, + "learning_rate": 1.5940181194887472e-06, + "loss": 0.0975, + "step": 5829 + }, + { + "epoch": 1.8891769280622164, + "grad_norm": 0.7638627290725708, + "learning_rate": 1.5932030178209163e-06, + "loss": 0.1067, + "step": 5830 + }, + { + "epoch": 1.88950097213221, + "grad_norm": 0.7731084227561951, + "learning_rate": 1.592388027136256e-06, + "loss": 0.1142, + "step": 5831 + }, + { + "epoch": 1.8898250162022034, + "grad_norm": 0.792477548122406, + "learning_rate": 1.591573147534516e-06, + "loss": 0.1125, + "step": 5832 + }, + { + "epoch": 1.890149060272197, + "grad_norm": 0.7775505781173706, + "learning_rate": 1.5907583791154275e-06, + "loss": 0.1097, + "step": 5833 + }, + { + "epoch": 1.8904731043421905, + "grad_norm": 0.7763543128967285, + "learning_rate": 1.5899437219787124e-06, + "loss": 0.1057, + "step": 5834 + }, + { + "epoch": 1.890797148412184, + "grad_norm": 0.7588939666748047, + "learning_rate": 1.5891291762240757e-06, + "loss": 0.1072, + "step": 5835 + }, + { + "epoch": 1.8911211924821776, + "grad_norm": 0.839514970779419, + "learning_rate": 1.5883147419512086e-06, + "loss": 0.1125, + "step": 5836 + }, + { + "epoch": 1.8914452365521712, + "grad_norm": 0.8462491035461426, + "learning_rate": 1.5875004192597926e-06, + "loss": 0.114, + "step": 5837 + }, + { + "epoch": 1.8917692806221647, + "grad_norm": 0.7633974552154541, + "learning_rate": 1.5866862082494907e-06, + "loss": 0.1008, + "step": 5838 + }, + { + "epoch": 1.892093324692158, + "grad_norm": 0.7939409017562866, + "learning_rate": 1.5858721090199564e-06, + "loss": 0.11, + "step": 5839 + }, + { + "epoch": 1.8924173687621515, + "grad_norm": 0.806594967842102, + "learning_rate": 1.5850581216708254e-06, + "loss": 0.1167, + "step": 5840 + }, + { + "epoch": 1.8927414128321451, + "grad_norm": 0.8261216878890991, + "learning_rate": 1.5842442463017235e-06, + "loss": 0.1074, + "step": 5841 + }, + { + "epoch": 1.8930654569021388, + "grad_norm": 0.7876153588294983, + "learning_rate": 1.583430483012261e-06, + "loss": 0.1097, + "step": 5842 + }, + { + "epoch": 1.8933895009721322, + "grad_norm": 0.7754987478256226, + "learning_rate": 1.5826168319020332e-06, + "loss": 0.1062, + "step": 5843 + }, + { + "epoch": 1.8937135450421256, + "grad_norm": 0.8216641545295715, + "learning_rate": 1.5818032930706254e-06, + "loss": 0.1115, + "step": 5844 + }, + { + "epoch": 1.8940375891121193, + "grad_norm": 0.8169437050819397, + "learning_rate": 1.5809898666176044e-06, + "loss": 0.112, + "step": 5845 + }, + { + "epoch": 1.894361633182113, + "grad_norm": 0.7897141575813293, + "learning_rate": 1.5801765526425283e-06, + "loss": 0.1121, + "step": 5846 + }, + { + "epoch": 1.8946856772521063, + "grad_norm": 0.8244640827178955, + "learning_rate": 1.5793633512449374e-06, + "loss": 0.1151, + "step": 5847 + }, + { + "epoch": 1.8950097213220998, + "grad_norm": 0.792438805103302, + "learning_rate": 1.578550262524359e-06, + "loss": 0.1074, + "step": 5848 + }, + { + "epoch": 1.8953337653920932, + "grad_norm": 0.796003520488739, + "learning_rate": 1.5777372865803091e-06, + "loss": 0.1065, + "step": 5849 + }, + { + "epoch": 1.8956578094620868, + "grad_norm": 0.8183282017707825, + "learning_rate": 1.5769244235122867e-06, + "loss": 0.1153, + "step": 5850 + }, + { + "epoch": 1.8959818535320805, + "grad_norm": 0.8319751024246216, + "learning_rate": 1.576111673419779e-06, + "loss": 0.1194, + "step": 5851 + }, + { + "epoch": 1.896305897602074, + "grad_norm": 0.7649463415145874, + "learning_rate": 1.5752990364022588e-06, + "loss": 0.1042, + "step": 5852 + }, + { + "epoch": 1.8966299416720673, + "grad_norm": 0.8287633657455444, + "learning_rate": 1.5744865125591837e-06, + "loss": 0.1141, + "step": 5853 + }, + { + "epoch": 1.8969539857420608, + "grad_norm": 0.7885811924934387, + "learning_rate": 1.573674101990001e-06, + "loss": 0.1142, + "step": 5854 + }, + { + "epoch": 1.8972780298120544, + "grad_norm": 0.839322566986084, + "learning_rate": 1.5728618047941393e-06, + "loss": 0.1066, + "step": 5855 + }, + { + "epoch": 1.897602073882048, + "grad_norm": 0.820233166217804, + "learning_rate": 1.5720496210710185e-06, + "loss": 0.1101, + "step": 5856 + }, + { + "epoch": 1.8979261179520415, + "grad_norm": 0.8723364472389221, + "learning_rate": 1.5712375509200397e-06, + "loss": 0.1142, + "step": 5857 + }, + { + "epoch": 1.898250162022035, + "grad_norm": 0.8589418530464172, + "learning_rate": 1.5704255944405947e-06, + "loss": 0.1131, + "step": 5858 + }, + { + "epoch": 1.8985742060920285, + "grad_norm": 0.8543845415115356, + "learning_rate": 1.5696137517320582e-06, + "loss": 0.1165, + "step": 5859 + }, + { + "epoch": 1.8988982501620222, + "grad_norm": 0.78272545337677, + "learning_rate": 1.5688020228937905e-06, + "loss": 0.1114, + "step": 5860 + }, + { + "epoch": 1.8992222942320156, + "grad_norm": 0.879530668258667, + "learning_rate": 1.5679904080251414e-06, + "loss": 0.1238, + "step": 5861 + }, + { + "epoch": 1.899546338302009, + "grad_norm": 0.8408613801002502, + "learning_rate": 1.567178907225443e-06, + "loss": 0.1219, + "step": 5862 + }, + { + "epoch": 1.8998703823720025, + "grad_norm": 0.8422929048538208, + "learning_rate": 1.5663675205940164e-06, + "loss": 0.112, + "step": 5863 + }, + { + "epoch": 1.900194426441996, + "grad_norm": 0.8757169842720032, + "learning_rate": 1.5655562482301664e-06, + "loss": 0.1196, + "step": 5864 + }, + { + "epoch": 1.9005184705119897, + "grad_norm": 0.8293363451957703, + "learning_rate": 1.5647450902331866e-06, + "loss": 0.1174, + "step": 5865 + }, + { + "epoch": 1.9008425145819832, + "grad_norm": 0.7704747319221497, + "learning_rate": 1.5639340467023534e-06, + "loss": 0.1063, + "step": 5866 + }, + { + "epoch": 1.9011665586519766, + "grad_norm": 0.7849206924438477, + "learning_rate": 1.5631231177369305e-06, + "loss": 0.1112, + "step": 5867 + }, + { + "epoch": 1.9014906027219702, + "grad_norm": 0.8597646951675415, + "learning_rate": 1.562312303436169e-06, + "loss": 0.1212, + "step": 5868 + }, + { + "epoch": 1.9018146467919637, + "grad_norm": 0.8411148190498352, + "learning_rate": 1.5615016038993036e-06, + "loss": 0.1117, + "step": 5869 + }, + { + "epoch": 1.9021386908619573, + "grad_norm": 0.8403557538986206, + "learning_rate": 1.5606910192255565e-06, + "loss": 0.1155, + "step": 5870 + }, + { + "epoch": 1.9024627349319507, + "grad_norm": 0.759688138961792, + "learning_rate": 1.5598805495141362e-06, + "loss": 0.1069, + "step": 5871 + }, + { + "epoch": 1.9027867790019442, + "grad_norm": 0.799876868724823, + "learning_rate": 1.5590701948642348e-06, + "loss": 0.1136, + "step": 5872 + }, + { + "epoch": 1.9031108230719378, + "grad_norm": 0.8051289319992065, + "learning_rate": 1.5582599553750332e-06, + "loss": 0.1179, + "step": 5873 + }, + { + "epoch": 1.9034348671419314, + "grad_norm": 0.7589281797409058, + "learning_rate": 1.5574498311456953e-06, + "loss": 0.1034, + "step": 5874 + }, + { + "epoch": 1.9037589112119249, + "grad_norm": 0.797426700592041, + "learning_rate": 1.5566398222753745e-06, + "loss": 0.1056, + "step": 5875 + }, + { + "epoch": 1.9040829552819183, + "grad_norm": 0.7796162366867065, + "learning_rate": 1.5558299288632061e-06, + "loss": 0.105, + "step": 5876 + }, + { + "epoch": 1.9044069993519117, + "grad_norm": 0.8247846364974976, + "learning_rate": 1.555020151008315e-06, + "loss": 0.1214, + "step": 5877 + }, + { + "epoch": 1.9047310434219054, + "grad_norm": 0.7716207504272461, + "learning_rate": 1.5542104888098093e-06, + "loss": 0.1049, + "step": 5878 + }, + { + "epoch": 1.905055087491899, + "grad_norm": 0.8307933807373047, + "learning_rate": 1.553400942366783e-06, + "loss": 0.1152, + "step": 5879 + }, + { + "epoch": 1.9053791315618924, + "grad_norm": 0.8331568241119385, + "learning_rate": 1.5525915117783182e-06, + "loss": 0.1159, + "step": 5880 + }, + { + "epoch": 1.9057031756318858, + "grad_norm": 0.8303659558296204, + "learning_rate": 1.5517821971434804e-06, + "loss": 0.1094, + "step": 5881 + }, + { + "epoch": 1.9060272197018795, + "grad_norm": 0.8347376585006714, + "learning_rate": 1.5509729985613232e-06, + "loss": 0.1157, + "step": 5882 + }, + { + "epoch": 1.9063512637718731, + "grad_norm": 0.8244526386260986, + "learning_rate": 1.5501639161308829e-06, + "loss": 0.1142, + "step": 5883 + }, + { + "epoch": 1.9066753078418666, + "grad_norm": 0.8219770789146423, + "learning_rate": 1.5493549499511834e-06, + "loss": 0.1195, + "step": 5884 + }, + { + "epoch": 1.90699935191186, + "grad_norm": 0.8310133218765259, + "learning_rate": 1.5485461001212365e-06, + "loss": 0.116, + "step": 5885 + }, + { + "epoch": 1.9073233959818534, + "grad_norm": 0.807929515838623, + "learning_rate": 1.5477373667400347e-06, + "loss": 0.1096, + "step": 5886 + }, + { + "epoch": 1.907647440051847, + "grad_norm": 0.7839474678039551, + "learning_rate": 1.5469287499065615e-06, + "loss": 0.1069, + "step": 5887 + }, + { + "epoch": 1.9079714841218407, + "grad_norm": 0.8028229475021362, + "learning_rate": 1.5461202497197821e-06, + "loss": 0.11, + "step": 5888 + }, + { + "epoch": 1.9082955281918341, + "grad_norm": 0.8570075631141663, + "learning_rate": 1.5453118662786509e-06, + "loss": 0.1234, + "step": 5889 + }, + { + "epoch": 1.9086195722618275, + "grad_norm": 0.7751265168190002, + "learning_rate": 1.544503599682105e-06, + "loss": 0.1067, + "step": 5890 + }, + { + "epoch": 1.908943616331821, + "grad_norm": 0.827232837677002, + "learning_rate": 1.5436954500290684e-06, + "loss": 0.1127, + "step": 5891 + }, + { + "epoch": 1.9092676604018146, + "grad_norm": 0.8176938891410828, + "learning_rate": 1.5428874174184509e-06, + "loss": 0.1181, + "step": 5892 + }, + { + "epoch": 1.9095917044718083, + "grad_norm": 0.7876701951026917, + "learning_rate": 1.5420795019491475e-06, + "loss": 0.1095, + "step": 5893 + }, + { + "epoch": 1.9099157485418017, + "grad_norm": 0.781857430934906, + "learning_rate": 1.5412717037200406e-06, + "loss": 0.1003, + "step": 5894 + }, + { + "epoch": 1.910239792611795, + "grad_norm": 0.8700427412986755, + "learning_rate": 1.540464022829996e-06, + "loss": 0.1094, + "step": 5895 + }, + { + "epoch": 1.9105638366817888, + "grad_norm": 0.8291754722595215, + "learning_rate": 1.5396564593778646e-06, + "loss": 0.1135, + "step": 5896 + }, + { + "epoch": 1.9108878807517824, + "grad_norm": 0.800238311290741, + "learning_rate": 1.538849013462487e-06, + "loss": 0.1076, + "step": 5897 + }, + { + "epoch": 1.9112119248217758, + "grad_norm": 0.8612180948257446, + "learning_rate": 1.5380416851826845e-06, + "loss": 0.1196, + "step": 5898 + }, + { + "epoch": 1.9115359688917692, + "grad_norm": 0.8104245662689209, + "learning_rate": 1.537234474637268e-06, + "loss": 0.1122, + "step": 5899 + }, + { + "epoch": 1.9118600129617627, + "grad_norm": 0.8221279382705688, + "learning_rate": 1.5364273819250308e-06, + "loss": 0.1145, + "step": 5900 + }, + { + "epoch": 1.9121840570317563, + "grad_norm": 0.7910533547401428, + "learning_rate": 1.535620407144755e-06, + "loss": 0.114, + "step": 5901 + }, + { + "epoch": 1.91250810110175, + "grad_norm": 0.8792036175727844, + "learning_rate": 1.534813550395205e-06, + "loss": 0.1158, + "step": 5902 + }, + { + "epoch": 1.9128321451717434, + "grad_norm": 0.7558249235153198, + "learning_rate": 1.5340068117751329e-06, + "loss": 0.1039, + "step": 5903 + }, + { + "epoch": 1.9131561892417368, + "grad_norm": 0.8300741910934448, + "learning_rate": 1.5332001913832754e-06, + "loss": 0.1175, + "step": 5904 + }, + { + "epoch": 1.9134802333117304, + "grad_norm": 0.8217713832855225, + "learning_rate": 1.5323936893183542e-06, + "loss": 0.1116, + "step": 5905 + }, + { + "epoch": 1.9138042773817239, + "grad_norm": 0.7724614143371582, + "learning_rate": 1.5315873056790791e-06, + "loss": 0.1054, + "step": 5906 + }, + { + "epoch": 1.9141283214517175, + "grad_norm": 0.8543126583099365, + "learning_rate": 1.5307810405641433e-06, + "loss": 0.1141, + "step": 5907 + }, + { + "epoch": 1.914452365521711, + "grad_norm": 0.848846971988678, + "learning_rate": 1.5299748940722241e-06, + "loss": 0.1095, + "step": 5908 + }, + { + "epoch": 1.9147764095917044, + "grad_norm": 0.8107044696807861, + "learning_rate": 1.5291688663019885e-06, + "loss": 0.1089, + "step": 5909 + }, + { + "epoch": 1.915100453661698, + "grad_norm": 0.777558445930481, + "learning_rate": 1.5283629573520841e-06, + "loss": 0.1079, + "step": 5910 + }, + { + "epoch": 1.9154244977316917, + "grad_norm": 0.9644023776054382, + "learning_rate": 1.5275571673211487e-06, + "loss": 0.1273, + "step": 5911 + }, + { + "epoch": 1.915748541801685, + "grad_norm": 0.7175261974334717, + "learning_rate": 1.5267514963078014e-06, + "loss": 0.0976, + "step": 5912 + }, + { + "epoch": 1.9160725858716785, + "grad_norm": 0.7565993070602417, + "learning_rate": 1.5259459444106497e-06, + "loss": 0.1074, + "step": 5913 + }, + { + "epoch": 1.916396629941672, + "grad_norm": 0.84763503074646, + "learning_rate": 1.5251405117282843e-06, + "loss": 0.1177, + "step": 5914 + }, + { + "epoch": 1.9167206740116656, + "grad_norm": 0.8428905606269836, + "learning_rate": 1.524335198359283e-06, + "loss": 0.1194, + "step": 5915 + }, + { + "epoch": 1.9170447180816592, + "grad_norm": 0.7854635119438171, + "learning_rate": 1.5235300044022088e-06, + "loss": 0.1123, + "step": 5916 + }, + { + "epoch": 1.9173687621516526, + "grad_norm": 0.7823472023010254, + "learning_rate": 1.522724929955608e-06, + "loss": 0.1067, + "step": 5917 + }, + { + "epoch": 1.917692806221646, + "grad_norm": 0.8037146329879761, + "learning_rate": 1.5219199751180162e-06, + "loss": 0.1032, + "step": 5918 + }, + { + "epoch": 1.9180168502916397, + "grad_norm": 0.7649844288825989, + "learning_rate": 1.5211151399879505e-06, + "loss": 0.112, + "step": 5919 + }, + { + "epoch": 1.9183408943616331, + "grad_norm": 0.8710806965827942, + "learning_rate": 1.5203104246639144e-06, + "loss": 0.125, + "step": 5920 + }, + { + "epoch": 1.9186649384316268, + "grad_norm": 0.7725611329078674, + "learning_rate": 1.5195058292443996e-06, + "loss": 0.1097, + "step": 5921 + }, + { + "epoch": 1.9189889825016202, + "grad_norm": 0.8272144198417664, + "learning_rate": 1.518701353827878e-06, + "loss": 0.1106, + "step": 5922 + }, + { + "epoch": 1.9193130265716136, + "grad_norm": 0.8756875395774841, + "learning_rate": 1.5178969985128122e-06, + "loss": 0.1208, + "step": 5923 + }, + { + "epoch": 1.9196370706416073, + "grad_norm": 0.8208725452423096, + "learning_rate": 1.5170927633976457e-06, + "loss": 0.1136, + "step": 5924 + }, + { + "epoch": 1.919961114711601, + "grad_norm": 0.7696161270141602, + "learning_rate": 1.5162886485808102e-06, + "loss": 0.1129, + "step": 5925 + }, + { + "epoch": 1.9202851587815943, + "grad_norm": 0.9552154541015625, + "learning_rate": 1.515484654160721e-06, + "loss": 0.1238, + "step": 5926 + }, + { + "epoch": 1.9206092028515878, + "grad_norm": 0.8473532199859619, + "learning_rate": 1.5146807802357782e-06, + "loss": 0.1185, + "step": 5927 + }, + { + "epoch": 1.9209332469215812, + "grad_norm": 0.8460490107536316, + "learning_rate": 1.5138770269043704e-06, + "loss": 0.1154, + "step": 5928 + }, + { + "epoch": 1.9212572909915748, + "grad_norm": 0.7778971791267395, + "learning_rate": 1.513073394264867e-06, + "loss": 0.1023, + "step": 5929 + }, + { + "epoch": 1.9215813350615685, + "grad_norm": 0.7810094952583313, + "learning_rate": 1.5122698824156271e-06, + "loss": 0.1075, + "step": 5930 + }, + { + "epoch": 1.921905379131562, + "grad_norm": 0.7526049613952637, + "learning_rate": 1.5114664914549903e-06, + "loss": 0.1046, + "step": 5931 + }, + { + "epoch": 1.9222294232015553, + "grad_norm": 0.7893481254577637, + "learning_rate": 1.5106632214812865e-06, + "loss": 0.1111, + "step": 5932 + }, + { + "epoch": 1.922553467271549, + "grad_norm": 0.8875093460083008, + "learning_rate": 1.5098600725928269e-06, + "loss": 0.117, + "step": 5933 + }, + { + "epoch": 1.9228775113415426, + "grad_norm": 0.7732181549072266, + "learning_rate": 1.5090570448879088e-06, + "loss": 0.1081, + "step": 5934 + }, + { + "epoch": 1.923201555411536, + "grad_norm": 0.8007689118385315, + "learning_rate": 1.5082541384648154e-06, + "loss": 0.1089, + "step": 5935 + }, + { + "epoch": 1.9235255994815295, + "grad_norm": 0.7940258383750916, + "learning_rate": 1.5074513534218137e-06, + "loss": 0.1079, + "step": 5936 + }, + { + "epoch": 1.9238496435515229, + "grad_norm": 0.7803616523742676, + "learning_rate": 1.5066486898571588e-06, + "loss": 0.1085, + "step": 5937 + }, + { + "epoch": 1.9241736876215165, + "grad_norm": 0.7531672120094299, + "learning_rate": 1.5058461478690878e-06, + "loss": 0.1014, + "step": 5938 + }, + { + "epoch": 1.9244977316915102, + "grad_norm": 0.8603652715682983, + "learning_rate": 1.5050437275558233e-06, + "loss": 0.1164, + "step": 5939 + }, + { + "epoch": 1.9248217757615036, + "grad_norm": 0.8175323605537415, + "learning_rate": 1.5042414290155754e-06, + "loss": 0.1079, + "step": 5940 + }, + { + "epoch": 1.925145819831497, + "grad_norm": 0.7698320150375366, + "learning_rate": 1.5034392523465364e-06, + "loss": 0.0941, + "step": 5941 + }, + { + "epoch": 1.9254698639014904, + "grad_norm": 0.7824234962463379, + "learning_rate": 1.502637197646886e-06, + "loss": 0.1125, + "step": 5942 + }, + { + "epoch": 1.925793907971484, + "grad_norm": 0.794787585735321, + "learning_rate": 1.5018352650147872e-06, + "loss": 0.1062, + "step": 5943 + }, + { + "epoch": 1.9261179520414777, + "grad_norm": 0.8701479434967041, + "learning_rate": 1.5010334545483885e-06, + "loss": 0.1243, + "step": 5944 + }, + { + "epoch": 1.9264419961114712, + "grad_norm": 0.813524067401886, + "learning_rate": 1.500231766345825e-06, + "loss": 0.1221, + "step": 5945 + }, + { + "epoch": 1.9267660401814646, + "grad_norm": 0.7518372535705566, + "learning_rate": 1.4994302005052141e-06, + "loss": 0.1048, + "step": 5946 + }, + { + "epoch": 1.9270900842514582, + "grad_norm": 0.8398098945617676, + "learning_rate": 1.4986287571246614e-06, + "loss": 0.1189, + "step": 5947 + }, + { + "epoch": 1.9274141283214519, + "grad_norm": 0.782597005367279, + "learning_rate": 1.4978274363022532e-06, + "loss": 0.1132, + "step": 5948 + }, + { + "epoch": 1.9277381723914453, + "grad_norm": 0.7935764789581299, + "learning_rate": 1.4970262381360664e-06, + "loss": 0.1054, + "step": 5949 + }, + { + "epoch": 1.9280622164614387, + "grad_norm": 0.8103215098381042, + "learning_rate": 1.4962251627241583e-06, + "loss": 0.112, + "step": 5950 + }, + { + "epoch": 1.9283862605314321, + "grad_norm": 0.7776728272438049, + "learning_rate": 1.4954242101645722e-06, + "loss": 0.1062, + "step": 5951 + }, + { + "epoch": 1.9287103046014258, + "grad_norm": 0.7960215210914612, + "learning_rate": 1.4946233805553387e-06, + "loss": 0.1115, + "step": 5952 + }, + { + "epoch": 1.9290343486714194, + "grad_norm": 0.8163950443267822, + "learning_rate": 1.4938226739944694e-06, + "loss": 0.1115, + "step": 5953 + }, + { + "epoch": 1.9293583927414129, + "grad_norm": 0.788702130317688, + "learning_rate": 1.4930220905799652e-06, + "loss": 0.1158, + "step": 5954 + }, + { + "epoch": 1.9296824368114063, + "grad_norm": 0.8842043280601501, + "learning_rate": 1.4922216304098085e-06, + "loss": 0.122, + "step": 5955 + }, + { + "epoch": 1.9300064808814, + "grad_norm": 0.8993422985076904, + "learning_rate": 1.4914212935819689e-06, + "loss": 0.1249, + "step": 5956 + }, + { + "epoch": 1.9303305249513933, + "grad_norm": 0.7884942889213562, + "learning_rate": 1.4906210801943985e-06, + "loss": 0.1128, + "step": 5957 + }, + { + "epoch": 1.930654569021387, + "grad_norm": 0.8368211388587952, + "learning_rate": 1.4898209903450361e-06, + "loss": 0.1194, + "step": 5958 + }, + { + "epoch": 1.9309786130913804, + "grad_norm": 0.7727013230323792, + "learning_rate": 1.489021024131806e-06, + "loss": 0.1048, + "step": 5959 + }, + { + "epoch": 1.9313026571613738, + "grad_norm": 0.7856116890907288, + "learning_rate": 1.4882211816526144e-06, + "loss": 0.1057, + "step": 5960 + }, + { + "epoch": 1.9316267012313675, + "grad_norm": 0.8511420488357544, + "learning_rate": 1.4874214630053562e-06, + "loss": 0.1149, + "step": 5961 + }, + { + "epoch": 1.9319507453013611, + "grad_norm": 0.7858923077583313, + "learning_rate": 1.4866218682879088e-06, + "loss": 0.1119, + "step": 5962 + }, + { + "epoch": 1.9322747893713546, + "grad_norm": 0.8455827236175537, + "learning_rate": 1.4858223975981334e-06, + "loss": 0.1151, + "step": 5963 + }, + { + "epoch": 1.932598833441348, + "grad_norm": 0.759452223777771, + "learning_rate": 1.4850230510338792e-06, + "loss": 0.1092, + "step": 5964 + }, + { + "epoch": 1.9329228775113414, + "grad_norm": 0.8318132758140564, + "learning_rate": 1.4842238286929777e-06, + "loss": 0.1105, + "step": 5965 + }, + { + "epoch": 1.933246921581335, + "grad_norm": 0.8317754864692688, + "learning_rate": 1.4834247306732457e-06, + "loss": 0.1139, + "step": 5966 + }, + { + "epoch": 1.9335709656513287, + "grad_norm": 0.7857781052589417, + "learning_rate": 1.4826257570724856e-06, + "loss": 0.1036, + "step": 5967 + }, + { + "epoch": 1.9338950097213221, + "grad_norm": 0.8519753217697144, + "learning_rate": 1.4818269079884845e-06, + "loss": 0.1156, + "step": 5968 + }, + { + "epoch": 1.9342190537913155, + "grad_norm": 0.7977727055549622, + "learning_rate": 1.4810281835190132e-06, + "loss": 0.1126, + "step": 5969 + }, + { + "epoch": 1.9345430978613092, + "grad_norm": 0.846114993095398, + "learning_rate": 1.4802295837618268e-06, + "loss": 0.1158, + "step": 5970 + }, + { + "epoch": 1.9348671419313026, + "grad_norm": 0.8024095296859741, + "learning_rate": 1.479431108814668e-06, + "loss": 0.1027, + "step": 5971 + }, + { + "epoch": 1.9351911860012962, + "grad_norm": 0.7943775653839111, + "learning_rate": 1.4786327587752608e-06, + "loss": 0.1087, + "step": 5972 + }, + { + "epoch": 1.9355152300712897, + "grad_norm": 0.8726453185081482, + "learning_rate": 1.4778345337413174e-06, + "loss": 0.1194, + "step": 5973 + }, + { + "epoch": 1.935839274141283, + "grad_norm": 0.7967553734779358, + "learning_rate": 1.4770364338105315e-06, + "loss": 0.1114, + "step": 5974 + }, + { + "epoch": 1.9361633182112767, + "grad_norm": 0.7967367172241211, + "learning_rate": 1.4762384590805823e-06, + "loss": 0.111, + "step": 5975 + }, + { + "epoch": 1.9364873622812704, + "grad_norm": 0.8494221568107605, + "learning_rate": 1.475440609649136e-06, + "loss": 0.1203, + "step": 5976 + }, + { + "epoch": 1.9368114063512638, + "grad_norm": 0.8069921135902405, + "learning_rate": 1.4746428856138395e-06, + "loss": 0.1079, + "step": 5977 + }, + { + "epoch": 1.9371354504212572, + "grad_norm": 0.7718681693077087, + "learning_rate": 1.4738452870723286e-06, + "loss": 0.1092, + "step": 5978 + }, + { + "epoch": 1.9374594944912507, + "grad_norm": 0.881232500076294, + "learning_rate": 1.4730478141222194e-06, + "loss": 0.1189, + "step": 5979 + }, + { + "epoch": 1.9377835385612443, + "grad_norm": 0.8377132415771484, + "learning_rate": 1.4722504668611172e-06, + "loss": 0.1098, + "step": 5980 + }, + { + "epoch": 1.938107582631238, + "grad_norm": 0.8156023621559143, + "learning_rate": 1.4714532453866084e-06, + "loss": 0.1097, + "step": 5981 + }, + { + "epoch": 1.9384316267012314, + "grad_norm": 0.8057281374931335, + "learning_rate": 1.4706561497962644e-06, + "loss": 0.1108, + "step": 5982 + }, + { + "epoch": 1.9387556707712248, + "grad_norm": 0.8118177056312561, + "learning_rate": 1.4698591801876435e-06, + "loss": 0.1214, + "step": 5983 + }, + { + "epoch": 1.9390797148412184, + "grad_norm": 0.8250911831855774, + "learning_rate": 1.4690623366582856e-06, + "loss": 0.1141, + "step": 5984 + }, + { + "epoch": 1.939403758911212, + "grad_norm": 0.7952162027359009, + "learning_rate": 1.4682656193057189e-06, + "loss": 0.107, + "step": 5985 + }, + { + "epoch": 1.9397278029812055, + "grad_norm": 0.8406686782836914, + "learning_rate": 1.4674690282274517e-06, + "loss": 0.1182, + "step": 5986 + }, + { + "epoch": 1.940051847051199, + "grad_norm": 0.8226285576820374, + "learning_rate": 1.4666725635209794e-06, + "loss": 0.1098, + "step": 5987 + }, + { + "epoch": 1.9403758911211924, + "grad_norm": 0.835455596446991, + "learning_rate": 1.4658762252837821e-06, + "loss": 0.116, + "step": 5988 + }, + { + "epoch": 1.940699935191186, + "grad_norm": 0.7987916469573975, + "learning_rate": 1.4650800136133238e-06, + "loss": 0.1066, + "step": 5989 + }, + { + "epoch": 1.9410239792611796, + "grad_norm": 0.7740179896354675, + "learning_rate": 1.4642839286070537e-06, + "loss": 0.119, + "step": 5990 + }, + { + "epoch": 1.941348023331173, + "grad_norm": 0.8301352262496948, + "learning_rate": 1.4634879703624027e-06, + "loss": 0.1089, + "step": 5991 + }, + { + "epoch": 1.9416720674011665, + "grad_norm": 0.7866767644882202, + "learning_rate": 1.4626921389767915e-06, + "loss": 0.1096, + "step": 5992 + }, + { + "epoch": 1.94199611147116, + "grad_norm": 0.783625066280365, + "learning_rate": 1.4618964345476203e-06, + "loss": 0.1066, + "step": 5993 + }, + { + "epoch": 1.9423201555411536, + "grad_norm": 0.789161741733551, + "learning_rate": 1.4611008571722748e-06, + "loss": 0.1108, + "step": 5994 + }, + { + "epoch": 1.9426441996111472, + "grad_norm": 0.7411786317825317, + "learning_rate": 1.4603054069481282e-06, + "loss": 0.0965, + "step": 5995 + }, + { + "epoch": 1.9429682436811406, + "grad_norm": 0.8316660523414612, + "learning_rate": 1.4595100839725338e-06, + "loss": 0.1132, + "step": 5996 + }, + { + "epoch": 1.943292287751134, + "grad_norm": 0.733279287815094, + "learning_rate": 1.4587148883428337e-06, + "loss": 0.1051, + "step": 5997 + }, + { + "epoch": 1.9436163318211277, + "grad_norm": 0.7861854434013367, + "learning_rate": 1.45791982015635e-06, + "loss": 0.1092, + "step": 5998 + }, + { + "epoch": 1.9439403758911213, + "grad_norm": 0.795117974281311, + "learning_rate": 1.4571248795103921e-06, + "loss": 0.1044, + "step": 5999 + }, + { + "epoch": 1.9442644199611148, + "grad_norm": 0.7990328073501587, + "learning_rate": 1.4563300665022534e-06, + "loss": 0.1061, + "step": 6000 + }, + { + "epoch": 1.9445884640311082, + "grad_norm": 0.8901631236076355, + "learning_rate": 1.4555353812292105e-06, + "loss": 0.1181, + "step": 6001 + }, + { + "epoch": 1.9449125081011016, + "grad_norm": 0.7094663977622986, + "learning_rate": 1.4547408237885262e-06, + "loss": 0.0938, + "step": 6002 + }, + { + "epoch": 1.9452365521710953, + "grad_norm": 0.8745992183685303, + "learning_rate": 1.4539463942774462e-06, + "loss": 0.1169, + "step": 6003 + }, + { + "epoch": 1.945560596241089, + "grad_norm": 0.8128504157066345, + "learning_rate": 1.4531520927932017e-06, + "loss": 0.1117, + "step": 6004 + }, + { + "epoch": 1.9458846403110823, + "grad_norm": 0.8003115057945251, + "learning_rate": 1.452357919433006e-06, + "loss": 0.1132, + "step": 6005 + }, + { + "epoch": 1.9462086843810757, + "grad_norm": 0.7621777057647705, + "learning_rate": 1.4515638742940585e-06, + "loss": 0.1032, + "step": 6006 + }, + { + "epoch": 1.9465327284510694, + "grad_norm": 0.794258713722229, + "learning_rate": 1.4507699574735436e-06, + "loss": 0.1058, + "step": 6007 + }, + { + "epoch": 1.9468567725210628, + "grad_norm": 0.8062702417373657, + "learning_rate": 1.4499761690686287e-06, + "loss": 0.1082, + "step": 6008 + }, + { + "epoch": 1.9471808165910565, + "grad_norm": 0.7925530672073364, + "learning_rate": 1.4491825091764656e-06, + "loss": 0.1128, + "step": 6009 + }, + { + "epoch": 1.9475048606610499, + "grad_norm": 0.7610135078430176, + "learning_rate": 1.4483889778941904e-06, + "loss": 0.1008, + "step": 6010 + }, + { + "epoch": 1.9478289047310433, + "grad_norm": 0.8870550394058228, + "learning_rate": 1.447595575318924e-06, + "loss": 0.1269, + "step": 6011 + }, + { + "epoch": 1.948152948801037, + "grad_norm": 0.7672936320304871, + "learning_rate": 1.4468023015477722e-06, + "loss": 0.1077, + "step": 6012 + }, + { + "epoch": 1.9484769928710306, + "grad_norm": 0.8201618790626526, + "learning_rate": 1.446009156677822e-06, + "loss": 0.1079, + "step": 6013 + }, + { + "epoch": 1.948801036941024, + "grad_norm": 0.8383089303970337, + "learning_rate": 1.4452161408061478e-06, + "loss": 0.1203, + "step": 6014 + }, + { + "epoch": 1.9491250810110174, + "grad_norm": 0.8672509789466858, + "learning_rate": 1.4444232540298064e-06, + "loss": 0.1179, + "step": 6015 + }, + { + "epoch": 1.9494491250810109, + "grad_norm": 0.7988051176071167, + "learning_rate": 1.44363049644584e-06, + "loss": 0.1131, + "step": 6016 + }, + { + "epoch": 1.9497731691510045, + "grad_norm": 0.7616187930107117, + "learning_rate": 1.4428378681512755e-06, + "loss": 0.0999, + "step": 6017 + }, + { + "epoch": 1.9500972132209982, + "grad_norm": 0.7699881792068481, + "learning_rate": 1.4420453692431197e-06, + "loss": 0.1111, + "step": 6018 + }, + { + "epoch": 1.9504212572909916, + "grad_norm": 0.7896310091018677, + "learning_rate": 1.441252999818371e-06, + "loss": 0.1061, + "step": 6019 + }, + { + "epoch": 1.950745301360985, + "grad_norm": 0.7674881219863892, + "learning_rate": 1.440460759974004e-06, + "loss": 0.1035, + "step": 6020 + }, + { + "epoch": 1.9510693454309787, + "grad_norm": 0.8250930309295654, + "learning_rate": 1.4396686498069844e-06, + "loss": 0.1178, + "step": 6021 + }, + { + "epoch": 1.9513933895009723, + "grad_norm": 0.7929973006248474, + "learning_rate": 1.4388766694142553e-06, + "loss": 0.1058, + "step": 6022 + }, + { + "epoch": 1.9517174335709657, + "grad_norm": 0.8182612061500549, + "learning_rate": 1.4380848188927516e-06, + "loss": 0.1129, + "step": 6023 + }, + { + "epoch": 1.9520414776409591, + "grad_norm": 1.7397314310073853, + "learning_rate": 1.4372930983393849e-06, + "loss": 0.1156, + "step": 6024 + }, + { + "epoch": 1.9523655217109526, + "grad_norm": 0.7889402508735657, + "learning_rate": 1.4365015078510553e-06, + "loss": 0.1028, + "step": 6025 + }, + { + "epoch": 1.9526895657809462, + "grad_norm": 0.8027127385139465, + "learning_rate": 1.4357100475246463e-06, + "loss": 0.1084, + "step": 6026 + }, + { + "epoch": 1.9530136098509399, + "grad_norm": 0.7677908539772034, + "learning_rate": 1.4349187174570226e-06, + "loss": 0.1017, + "step": 6027 + }, + { + "epoch": 1.9533376539209333, + "grad_norm": 0.7761096954345703, + "learning_rate": 1.4341275177450389e-06, + "loss": 0.1089, + "step": 6028 + }, + { + "epoch": 1.9536616979909267, + "grad_norm": 0.7991950511932373, + "learning_rate": 1.4333364484855277e-06, + "loss": 0.11, + "step": 6029 + }, + { + "epoch": 1.9539857420609201, + "grad_norm": 0.7403672337532043, + "learning_rate": 1.432545509775309e-06, + "loss": 0.1019, + "step": 6030 + }, + { + "epoch": 1.9543097861309138, + "grad_norm": 0.8399749398231506, + "learning_rate": 1.4317547017111865e-06, + "loss": 0.1143, + "step": 6031 + }, + { + "epoch": 1.9546338302009074, + "grad_norm": 0.8288451433181763, + "learning_rate": 1.4309640243899467e-06, + "loss": 0.1136, + "step": 6032 + }, + { + "epoch": 1.9549578742709008, + "grad_norm": 0.8094178438186646, + "learning_rate": 1.4301734779083614e-06, + "loss": 0.1064, + "step": 6033 + }, + { + "epoch": 1.9552819183408943, + "grad_norm": 0.8080481290817261, + "learning_rate": 1.4293830623631857e-06, + "loss": 0.1077, + "step": 6034 + }, + { + "epoch": 1.955605962410888, + "grad_norm": 0.8804945349693298, + "learning_rate": 1.4285927778511598e-06, + "loss": 0.1185, + "step": 6035 + }, + { + "epoch": 1.9559300064808816, + "grad_norm": 0.7942977547645569, + "learning_rate": 1.4278026244690046e-06, + "loss": 0.1089, + "step": 6036 + }, + { + "epoch": 1.956254050550875, + "grad_norm": 0.7964348196983337, + "learning_rate": 1.427012602313429e-06, + "loss": 0.1145, + "step": 6037 + }, + { + "epoch": 1.9565780946208684, + "grad_norm": 0.8437153697013855, + "learning_rate": 1.4262227114811233e-06, + "loss": 0.1127, + "step": 6038 + }, + { + "epoch": 1.9569021386908618, + "grad_norm": 0.7795710563659668, + "learning_rate": 1.4254329520687626e-06, + "loss": 0.109, + "step": 6039 + }, + { + "epoch": 1.9572261827608555, + "grad_norm": 0.760061502456665, + "learning_rate": 1.4246433241730062e-06, + "loss": 0.111, + "step": 6040 + }, + { + "epoch": 1.9575502268308491, + "grad_norm": 0.8100674152374268, + "learning_rate": 1.4238538278904973e-06, + "loss": 0.1036, + "step": 6041 + }, + { + "epoch": 1.9578742709008425, + "grad_norm": 0.7496659159660339, + "learning_rate": 1.4230644633178603e-06, + "loss": 0.1059, + "step": 6042 + }, + { + "epoch": 1.958198314970836, + "grad_norm": 0.7997440695762634, + "learning_rate": 1.4222752305517093e-06, + "loss": 0.1098, + "step": 6043 + }, + { + "epoch": 1.9585223590408296, + "grad_norm": 0.885632336139679, + "learning_rate": 1.421486129688635e-06, + "loss": 0.1222, + "step": 6044 + }, + { + "epoch": 1.958846403110823, + "grad_norm": 0.7634774446487427, + "learning_rate": 1.4206971608252196e-06, + "loss": 0.1074, + "step": 6045 + }, + { + "epoch": 1.9591704471808167, + "grad_norm": 0.8368650078773499, + "learning_rate": 1.4199083240580218e-06, + "loss": 0.1131, + "step": 6046 + }, + { + "epoch": 1.95949449125081, + "grad_norm": 0.8191322684288025, + "learning_rate": 1.41911961948359e-06, + "loss": 0.1075, + "step": 6047 + }, + { + "epoch": 1.9598185353208035, + "grad_norm": 0.8034201860427856, + "learning_rate": 1.4183310471984532e-06, + "loss": 0.1084, + "step": 6048 + }, + { + "epoch": 1.9601425793907972, + "grad_norm": 0.7873152494430542, + "learning_rate": 1.4175426072991234e-06, + "loss": 0.1093, + "step": 6049 + }, + { + "epoch": 1.9604666234607908, + "grad_norm": 0.7949634790420532, + "learning_rate": 1.416754299882101e-06, + "loss": 0.109, + "step": 6050 + }, + { + "epoch": 1.9607906675307842, + "grad_norm": 0.838327944278717, + "learning_rate": 1.415966125043864e-06, + "loss": 0.1069, + "step": 6051 + }, + { + "epoch": 1.9611147116007777, + "grad_norm": 0.7817742824554443, + "learning_rate": 1.415178082880881e-06, + "loss": 0.1005, + "step": 6052 + }, + { + "epoch": 1.961438755670771, + "grad_norm": 0.7917423844337463, + "learning_rate": 1.4143901734895973e-06, + "loss": 0.1064, + "step": 6053 + }, + { + "epoch": 1.9617627997407647, + "grad_norm": 0.8067330718040466, + "learning_rate": 1.4136023969664471e-06, + "loss": 0.1107, + "step": 6054 + }, + { + "epoch": 1.9620868438107584, + "grad_norm": 0.7707548141479492, + "learning_rate": 1.4128147534078469e-06, + "loss": 0.1064, + "step": 6055 + }, + { + "epoch": 1.9624108878807518, + "grad_norm": 0.8027534484863281, + "learning_rate": 1.4120272429101955e-06, + "loss": 0.1102, + "step": 6056 + }, + { + "epoch": 1.9627349319507452, + "grad_norm": 0.7992814779281616, + "learning_rate": 1.4112398655698772e-06, + "loss": 0.1092, + "step": 6057 + }, + { + "epoch": 1.9630589760207389, + "grad_norm": 0.7964221835136414, + "learning_rate": 1.4104526214832595e-06, + "loss": 0.1078, + "step": 6058 + }, + { + "epoch": 1.9633830200907323, + "grad_norm": 0.8293501138687134, + "learning_rate": 1.4096655107466943e-06, + "loss": 0.113, + "step": 6059 + }, + { + "epoch": 1.963707064160726, + "grad_norm": 0.7957487106323242, + "learning_rate": 1.4088785334565145e-06, + "loss": 0.1143, + "step": 6060 + }, + { + "epoch": 1.9640311082307194, + "grad_norm": 0.8345206379890442, + "learning_rate": 1.4080916897090391e-06, + "loss": 0.1147, + "step": 6061 + }, + { + "epoch": 1.9643551523007128, + "grad_norm": 0.8262994289398193, + "learning_rate": 1.4073049796005705e-06, + "loss": 0.1181, + "step": 6062 + }, + { + "epoch": 1.9646791963707064, + "grad_norm": 0.8111175894737244, + "learning_rate": 1.4065184032273942e-06, + "loss": 0.1137, + "step": 6063 + }, + { + "epoch": 1.9650032404407, + "grad_norm": 0.8582708239555359, + "learning_rate": 1.4057319606857795e-06, + "loss": 0.1181, + "step": 6064 + }, + { + "epoch": 1.9653272845106935, + "grad_norm": 0.7551923394203186, + "learning_rate": 1.4049456520719805e-06, + "loss": 0.1078, + "step": 6065 + }, + { + "epoch": 1.965651328580687, + "grad_norm": 0.7485328316688538, + "learning_rate": 1.404159477482231e-06, + "loss": 0.0979, + "step": 6066 + }, + { + "epoch": 1.9659753726506803, + "grad_norm": 0.7505809664726257, + "learning_rate": 1.403373437012755e-06, + "loss": 0.1012, + "step": 6067 + }, + { + "epoch": 1.966299416720674, + "grad_norm": 0.8598964214324951, + "learning_rate": 1.4025875307597528e-06, + "loss": 0.1112, + "step": 6068 + }, + { + "epoch": 1.9666234607906676, + "grad_norm": 0.7594893574714661, + "learning_rate": 1.4018017588194132e-06, + "loss": 0.098, + "step": 6069 + }, + { + "epoch": 1.966947504860661, + "grad_norm": 0.7838597893714905, + "learning_rate": 1.401016121287907e-06, + "loss": 0.1124, + "step": 6070 + }, + { + "epoch": 1.9672715489306545, + "grad_norm": 0.802146852016449, + "learning_rate": 1.4002306182613885e-06, + "loss": 0.1161, + "step": 6071 + }, + { + "epoch": 1.9675955930006481, + "grad_norm": 0.7836012840270996, + "learning_rate": 1.3994452498359963e-06, + "loss": 0.1055, + "step": 6072 + }, + { + "epoch": 1.9679196370706418, + "grad_norm": 0.7942901253700256, + "learning_rate": 1.39866001610785e-06, + "loss": 0.115, + "step": 6073 + }, + { + "epoch": 1.9682436811406352, + "grad_norm": 0.765285313129425, + "learning_rate": 1.3978749171730577e-06, + "loss": 0.1062, + "step": 6074 + }, + { + "epoch": 1.9685677252106286, + "grad_norm": 0.749472975730896, + "learning_rate": 1.397089953127704e-06, + "loss": 0.1019, + "step": 6075 + }, + { + "epoch": 1.968891769280622, + "grad_norm": 0.8121153116226196, + "learning_rate": 1.3963051240678652e-06, + "loss": 0.1119, + "step": 6076 + }, + { + "epoch": 1.9692158133506157, + "grad_norm": 0.8156499862670898, + "learning_rate": 1.3955204300895937e-06, + "loss": 0.1091, + "step": 6077 + }, + { + "epoch": 1.9695398574206093, + "grad_norm": 0.796762228012085, + "learning_rate": 1.3947358712889292e-06, + "loss": 0.1114, + "step": 6078 + }, + { + "epoch": 1.9698639014906028, + "grad_norm": 0.7823910713195801, + "learning_rate": 1.3939514477618944e-06, + "loss": 0.1058, + "step": 6079 + }, + { + "epoch": 1.9701879455605962, + "grad_norm": 0.798762321472168, + "learning_rate": 1.3931671596044946e-06, + "loss": 0.1052, + "step": 6080 + }, + { + "epoch": 1.9705119896305896, + "grad_norm": 0.761111319065094, + "learning_rate": 1.392383006912721e-06, + "loss": 0.1038, + "step": 6081 + }, + { + "epoch": 1.9708360337005832, + "grad_norm": 0.8409186005592346, + "learning_rate": 1.3915989897825424e-06, + "loss": 0.1129, + "step": 6082 + }, + { + "epoch": 1.971160077770577, + "grad_norm": 0.7887939214706421, + "learning_rate": 1.3908151083099195e-06, + "loss": 0.1089, + "step": 6083 + }, + { + "epoch": 1.9714841218405703, + "grad_norm": 0.8472685217857361, + "learning_rate": 1.3900313625907886e-06, + "loss": 0.116, + "step": 6084 + }, + { + "epoch": 1.9718081659105637, + "grad_norm": 0.8228740096092224, + "learning_rate": 1.3892477527210734e-06, + "loss": 0.1149, + "step": 6085 + }, + { + "epoch": 1.9721322099805574, + "grad_norm": 0.8421080708503723, + "learning_rate": 1.3884642787966806e-06, + "loss": 0.1144, + "step": 6086 + }, + { + "epoch": 1.972456254050551, + "grad_norm": 0.8418703675270081, + "learning_rate": 1.3876809409134994e-06, + "loss": 0.1235, + "step": 6087 + }, + { + "epoch": 1.9727802981205445, + "grad_norm": 0.7663489580154419, + "learning_rate": 1.3868977391674033e-06, + "loss": 0.1077, + "step": 6088 + }, + { + "epoch": 1.9731043421905379, + "grad_norm": 0.7076537013053894, + "learning_rate": 1.386114673654248e-06, + "loss": 0.1008, + "step": 6089 + }, + { + "epoch": 1.9734283862605313, + "grad_norm": 0.904552698135376, + "learning_rate": 1.3853317444698744e-06, + "loss": 0.1294, + "step": 6090 + }, + { + "epoch": 1.973752430330525, + "grad_norm": 0.8387356400489807, + "learning_rate": 1.3845489517101036e-06, + "loss": 0.1183, + "step": 6091 + }, + { + "epoch": 1.9740764744005186, + "grad_norm": 0.824189305305481, + "learning_rate": 1.3837662954707426e-06, + "loss": 0.1127, + "step": 6092 + }, + { + "epoch": 1.974400518470512, + "grad_norm": 0.8098284006118774, + "learning_rate": 1.3829837758475808e-06, + "loss": 0.113, + "step": 6093 + }, + { + "epoch": 1.9747245625405054, + "grad_norm": 0.752780020236969, + "learning_rate": 1.3822013929363914e-06, + "loss": 0.1019, + "step": 6094 + }, + { + "epoch": 1.975048606610499, + "grad_norm": 0.8036346435546875, + "learning_rate": 1.3814191468329307e-06, + "loss": 0.1091, + "step": 6095 + }, + { + "epoch": 1.9753726506804925, + "grad_norm": 0.8747115135192871, + "learning_rate": 1.3806370376329388e-06, + "loss": 0.1105, + "step": 6096 + }, + { + "epoch": 1.9756966947504861, + "grad_norm": 0.8383525013923645, + "learning_rate": 1.3798550654321347e-06, + "loss": 0.1199, + "step": 6097 + }, + { + "epoch": 1.9760207388204796, + "grad_norm": 0.852014422416687, + "learning_rate": 1.379073230326229e-06, + "loss": 0.1255, + "step": 6098 + }, + { + "epoch": 1.976344782890473, + "grad_norm": 0.8316348791122437, + "learning_rate": 1.3782915324109075e-06, + "loss": 0.1131, + "step": 6099 + }, + { + "epoch": 1.9766688269604666, + "grad_norm": 0.8360430598258972, + "learning_rate": 1.3775099717818432e-06, + "loss": 0.1112, + "step": 6100 + }, + { + "epoch": 1.9769928710304603, + "grad_norm": 0.8059060573577881, + "learning_rate": 1.376728548534692e-06, + "loss": 0.1118, + "step": 6101 + }, + { + "epoch": 1.9773169151004537, + "grad_norm": 0.8584257364273071, + "learning_rate": 1.3759472627650926e-06, + "loss": 0.117, + "step": 6102 + }, + { + "epoch": 1.9776409591704471, + "grad_norm": 0.8232131004333496, + "learning_rate": 1.3751661145686673e-06, + "loss": 0.1163, + "step": 6103 + }, + { + "epoch": 1.9779650032404406, + "grad_norm": 0.770917534828186, + "learning_rate": 1.3743851040410183e-06, + "loss": 0.1104, + "step": 6104 + }, + { + "epoch": 1.9782890473104342, + "grad_norm": 0.7977389097213745, + "learning_rate": 1.3736042312777381e-06, + "loss": 0.1052, + "step": 6105 + }, + { + "epoch": 1.9786130913804278, + "grad_norm": 0.7602249979972839, + "learning_rate": 1.3728234963743931e-06, + "loss": 0.1044, + "step": 6106 + }, + { + "epoch": 1.9789371354504213, + "grad_norm": 0.7684023380279541, + "learning_rate": 1.3720428994265427e-06, + "loss": 0.1059, + "step": 6107 + }, + { + "epoch": 1.9792611795204147, + "grad_norm": 0.8109222054481506, + "learning_rate": 1.3712624405297209e-06, + "loss": 0.1185, + "step": 6108 + }, + { + "epoch": 1.9795852235904083, + "grad_norm": 0.7657626271247864, + "learning_rate": 1.3704821197794491e-06, + "loss": 0.1007, + "step": 6109 + }, + { + "epoch": 1.9799092676604018, + "grad_norm": 0.831357479095459, + "learning_rate": 1.369701937271231e-06, + "loss": 0.1095, + "step": 6110 + }, + { + "epoch": 1.9802333117303954, + "grad_norm": 0.832591712474823, + "learning_rate": 1.3689218931005543e-06, + "loss": 0.1175, + "step": 6111 + }, + { + "epoch": 1.9805573558003888, + "grad_norm": 0.7991769313812256, + "learning_rate": 1.368141987362889e-06, + "loss": 0.1078, + "step": 6112 + }, + { + "epoch": 1.9808813998703823, + "grad_norm": 0.8150991797447205, + "learning_rate": 1.3673622201536852e-06, + "loss": 0.1111, + "step": 6113 + }, + { + "epoch": 1.981205443940376, + "grad_norm": 0.849617600440979, + "learning_rate": 1.3665825915683829e-06, + "loss": 0.1197, + "step": 6114 + }, + { + "epoch": 1.9815294880103695, + "grad_norm": 0.8102957606315613, + "learning_rate": 1.3658031017023977e-06, + "loss": 0.1076, + "step": 6115 + }, + { + "epoch": 1.981853532080363, + "grad_norm": 0.8073728680610657, + "learning_rate": 1.3650237506511333e-06, + "loss": 0.1071, + "step": 6116 + }, + { + "epoch": 1.9821775761503564, + "grad_norm": 0.7590360045433044, + "learning_rate": 1.3642445385099746e-06, + "loss": 0.1061, + "step": 6117 + }, + { + "epoch": 1.9825016202203498, + "grad_norm": 0.7506588101387024, + "learning_rate": 1.363465465374289e-06, + "loss": 0.0982, + "step": 6118 + }, + { + "epoch": 1.9828256642903435, + "grad_norm": 0.8684452772140503, + "learning_rate": 1.362686531339428e-06, + "loss": 0.1182, + "step": 6119 + }, + { + "epoch": 1.983149708360337, + "grad_norm": 0.8147026896476746, + "learning_rate": 1.3619077365007266e-06, + "loss": 0.1093, + "step": 6120 + }, + { + "epoch": 1.9834737524303305, + "grad_norm": 0.796436607837677, + "learning_rate": 1.3611290809534997e-06, + "loss": 0.1095, + "step": 6121 + }, + { + "epoch": 1.983797796500324, + "grad_norm": 0.7356460094451904, + "learning_rate": 1.3603505647930481e-06, + "loss": 0.1034, + "step": 6122 + }, + { + "epoch": 1.9841218405703176, + "grad_norm": 0.7705609202384949, + "learning_rate": 1.3595721881146548e-06, + "loss": 0.1086, + "step": 6123 + }, + { + "epoch": 1.9844458846403112, + "grad_norm": 0.7775025963783264, + "learning_rate": 1.3587939510135856e-06, + "loss": 0.1098, + "step": 6124 + }, + { + "epoch": 1.9847699287103047, + "grad_norm": 0.8204125761985779, + "learning_rate": 1.3580158535850884e-06, + "loss": 0.1138, + "step": 6125 + }, + { + "epoch": 1.985093972780298, + "grad_norm": 0.7840316891670227, + "learning_rate": 1.357237895924396e-06, + "loss": 0.1069, + "step": 6126 + }, + { + "epoch": 1.9854180168502915, + "grad_norm": 0.730146586894989, + "learning_rate": 1.3564600781267234e-06, + "loss": 0.1001, + "step": 6127 + }, + { + "epoch": 1.9857420609202852, + "grad_norm": 0.776727557182312, + "learning_rate": 1.3556824002872648e-06, + "loss": 0.109, + "step": 6128 + }, + { + "epoch": 1.9860661049902788, + "grad_norm": 0.8463088274002075, + "learning_rate": 1.3549048625012046e-06, + "loss": 0.1204, + "step": 6129 + }, + { + "epoch": 1.9863901490602722, + "grad_norm": 0.7711324691772461, + "learning_rate": 1.354127464863703e-06, + "loss": 0.1085, + "step": 6130 + }, + { + "epoch": 1.9867141931302656, + "grad_norm": 0.8014845848083496, + "learning_rate": 1.3533502074699065e-06, + "loss": 0.1058, + "step": 6131 + }, + { + "epoch": 1.987038237200259, + "grad_norm": 0.8500660061836243, + "learning_rate": 1.3525730904149443e-06, + "loss": 0.1129, + "step": 6132 + }, + { + "epoch": 1.9873622812702527, + "grad_norm": 0.7671992778778076, + "learning_rate": 1.351796113793928e-06, + "loss": 0.1037, + "step": 6133 + }, + { + "epoch": 1.9876863253402464, + "grad_norm": 0.7462871074676514, + "learning_rate": 1.3510192777019527e-06, + "loss": 0.1031, + "step": 6134 + }, + { + "epoch": 1.9880103694102398, + "grad_norm": 0.7898198366165161, + "learning_rate": 1.3502425822340925e-06, + "loss": 0.1099, + "step": 6135 + }, + { + "epoch": 1.9883344134802332, + "grad_norm": 0.8464086651802063, + "learning_rate": 1.3494660274854122e-06, + "loss": 0.1139, + "step": 6136 + }, + { + "epoch": 1.9886584575502269, + "grad_norm": 0.7823293209075928, + "learning_rate": 1.3486896135509503e-06, + "loss": 0.1068, + "step": 6137 + }, + { + "epoch": 1.9889825016202205, + "grad_norm": 0.7849881052970886, + "learning_rate": 1.3479133405257355e-06, + "loss": 0.1032, + "step": 6138 + }, + { + "epoch": 1.989306545690214, + "grad_norm": 0.7832697629928589, + "learning_rate": 1.3471372085047743e-06, + "loss": 0.1061, + "step": 6139 + }, + { + "epoch": 1.9896305897602073, + "grad_norm": 0.8261182308197021, + "learning_rate": 1.3463612175830578e-06, + "loss": 0.1112, + "step": 6140 + }, + { + "epoch": 1.9899546338302008, + "grad_norm": 0.777273952960968, + "learning_rate": 1.3455853678555605e-06, + "loss": 0.1028, + "step": 6141 + }, + { + "epoch": 1.9902786779001944, + "grad_norm": 0.7343865036964417, + "learning_rate": 1.3448096594172383e-06, + "loss": 0.0994, + "step": 6142 + }, + { + "epoch": 1.990602721970188, + "grad_norm": 0.8520564436912537, + "learning_rate": 1.344034092363032e-06, + "loss": 0.1064, + "step": 6143 + }, + { + "epoch": 1.9909267660401815, + "grad_norm": 0.7399676442146301, + "learning_rate": 1.343258666787861e-06, + "loss": 0.1009, + "step": 6144 + }, + { + "epoch": 1.991250810110175, + "grad_norm": 0.8225117325782776, + "learning_rate": 1.3424833827866312e-06, + "loss": 0.1151, + "step": 6145 + }, + { + "epoch": 1.9915748541801686, + "grad_norm": 0.8109341859817505, + "learning_rate": 1.3417082404542295e-06, + "loss": 0.1099, + "step": 6146 + }, + { + "epoch": 1.991898898250162, + "grad_norm": 0.795943021774292, + "learning_rate": 1.3409332398855263e-06, + "loss": 0.107, + "step": 6147 + }, + { + "epoch": 1.9922229423201556, + "grad_norm": 0.797545313835144, + "learning_rate": 1.3401583811753735e-06, + "loss": 0.1094, + "step": 6148 + }, + { + "epoch": 1.992546986390149, + "grad_norm": 0.7752687931060791, + "learning_rate": 1.339383664418607e-06, + "loss": 0.0992, + "step": 6149 + }, + { + "epoch": 1.9928710304601425, + "grad_norm": 0.7549872398376465, + "learning_rate": 1.3386090897100442e-06, + "loss": 0.0952, + "step": 6150 + }, + { + "epoch": 1.9931950745301361, + "grad_norm": 0.7405936121940613, + "learning_rate": 1.3378346571444866e-06, + "loss": 0.1015, + "step": 6151 + }, + { + "epoch": 1.9935191186001298, + "grad_norm": 0.7260972261428833, + "learning_rate": 1.3370603668167156e-06, + "loss": 0.1032, + "step": 6152 + }, + { + "epoch": 1.9938431626701232, + "grad_norm": 0.8044086694717407, + "learning_rate": 1.3362862188214977e-06, + "loss": 0.1104, + "step": 6153 + }, + { + "epoch": 1.9941672067401166, + "grad_norm": 0.7475235462188721, + "learning_rate": 1.3355122132535806e-06, + "loss": 0.1031, + "step": 6154 + }, + { + "epoch": 1.99449125081011, + "grad_norm": 0.7913709282875061, + "learning_rate": 1.3347383502076955e-06, + "loss": 0.1115, + "step": 6155 + }, + { + "epoch": 1.9948152948801037, + "grad_norm": 0.8162972927093506, + "learning_rate": 1.333964629778556e-06, + "loss": 0.1131, + "step": 6156 + }, + { + "epoch": 1.9951393389500973, + "grad_norm": 0.8377811908721924, + "learning_rate": 1.3331910520608576e-06, + "loss": 0.1199, + "step": 6157 + }, + { + "epoch": 1.9954633830200907, + "grad_norm": 0.8220179677009583, + "learning_rate": 1.3324176171492798e-06, + "loss": 0.117, + "step": 6158 + }, + { + "epoch": 1.9957874270900842, + "grad_norm": 0.8617610335350037, + "learning_rate": 1.3316443251384808e-06, + "loss": 0.1204, + "step": 6159 + }, + { + "epoch": 1.9961114711600778, + "grad_norm": 0.8488910794258118, + "learning_rate": 1.3308711761231074e-06, + "loss": 0.1157, + "step": 6160 + }, + { + "epoch": 1.9964355152300715, + "grad_norm": 0.7747528553009033, + "learning_rate": 1.3300981701977834e-06, + "loss": 0.105, + "step": 6161 + }, + { + "epoch": 1.9967595593000649, + "grad_norm": 0.796110212802887, + "learning_rate": 1.3293253074571178e-06, + "loss": 0.1033, + "step": 6162 + }, + { + "epoch": 1.9970836033700583, + "grad_norm": 0.8537262082099915, + "learning_rate": 1.3285525879957011e-06, + "loss": 0.1172, + "step": 6163 + }, + { + "epoch": 1.9974076474400517, + "grad_norm": 0.7868740558624268, + "learning_rate": 1.3277800119081077e-06, + "loss": 0.1106, + "step": 6164 + }, + { + "epoch": 1.9977316915100454, + "grad_norm": 0.8078975677490234, + "learning_rate": 1.3270075792888937e-06, + "loss": 0.113, + "step": 6165 + }, + { + "epoch": 1.998055735580039, + "grad_norm": 0.8577303886413574, + "learning_rate": 1.3262352902325944e-06, + "loss": 0.1162, + "step": 6166 + }, + { + "epoch": 1.9983797796500324, + "grad_norm": 0.7342814803123474, + "learning_rate": 1.325463144833735e-06, + "loss": 0.1032, + "step": 6167 + }, + { + "epoch": 1.9987038237200259, + "grad_norm": 0.7626562118530273, + "learning_rate": 1.324691143186814e-06, + "loss": 0.1019, + "step": 6168 + }, + { + "epoch": 1.9990278677900193, + "grad_norm": 0.829145073890686, + "learning_rate": 1.323919285386321e-06, + "loss": 0.1126, + "step": 6169 + }, + { + "epoch": 1.999351911860013, + "grad_norm": 0.7580152153968811, + "learning_rate": 1.3231475715267217e-06, + "loss": 0.11, + "step": 6170 + }, + { + "epoch": 1.9996759559300066, + "grad_norm": 0.7596506476402283, + "learning_rate": 1.3223760017024661e-06, + "loss": 0.1, + "step": 6171 + }, + { + "epoch": 2.0, + "grad_norm": 0.8046398758888245, + "learning_rate": 1.3216045760079882e-06, + "loss": 0.1231, + "step": 6172 + }, + { + "epoch": 2.0003240440699934, + "grad_norm": 0.6925712823867798, + "learning_rate": 1.3208332945377022e-06, + "loss": 0.0823, + "step": 6173 + }, + { + "epoch": 2.000648088139987, + "grad_norm": 0.6641772985458374, + "learning_rate": 1.3200621573860068e-06, + "loss": 0.0814, + "step": 6174 + }, + { + "epoch": 2.0009721322099807, + "grad_norm": 0.6670467257499695, + "learning_rate": 1.3192911646472796e-06, + "loss": 0.0787, + "step": 6175 + }, + { + "epoch": 2.001296176279974, + "grad_norm": 0.6554123759269714, + "learning_rate": 1.3185203164158838e-06, + "loss": 0.0778, + "step": 6176 + }, + { + "epoch": 2.0016202203499676, + "grad_norm": 0.7232511639595032, + "learning_rate": 1.3177496127861635e-06, + "loss": 0.0843, + "step": 6177 + }, + { + "epoch": 2.001944264419961, + "grad_norm": 0.6921700835227966, + "learning_rate": 1.3169790538524457e-06, + "loss": 0.0815, + "step": 6178 + }, + { + "epoch": 2.002268308489955, + "grad_norm": 0.6646423935890198, + "learning_rate": 1.316208639709039e-06, + "loss": 0.0795, + "step": 6179 + }, + { + "epoch": 2.0025923525599483, + "grad_norm": 0.6958689093589783, + "learning_rate": 1.3154383704502349e-06, + "loss": 0.082, + "step": 6180 + }, + { + "epoch": 2.0029163966299417, + "grad_norm": 0.6785232424736023, + "learning_rate": 1.3146682461703069e-06, + "loss": 0.0738, + "step": 6181 + }, + { + "epoch": 2.003240440699935, + "grad_norm": 0.6926654577255249, + "learning_rate": 1.3138982669635117e-06, + "loss": 0.0751, + "step": 6182 + }, + { + "epoch": 2.0035644847699285, + "grad_norm": 0.6905853152275085, + "learning_rate": 1.313128432924084e-06, + "loss": 0.0783, + "step": 6183 + }, + { + "epoch": 2.0038885288399224, + "grad_norm": 0.731368362903595, + "learning_rate": 1.3123587441462487e-06, + "loss": 0.0816, + "step": 6184 + }, + { + "epoch": 2.004212572909916, + "grad_norm": 0.7648592591285706, + "learning_rate": 1.3115892007242046e-06, + "loss": 0.0748, + "step": 6185 + }, + { + "epoch": 2.0045366169799093, + "grad_norm": 0.706275999546051, + "learning_rate": 1.3108198027521374e-06, + "loss": 0.0732, + "step": 6186 + }, + { + "epoch": 2.0048606610499027, + "grad_norm": 0.7441841959953308, + "learning_rate": 1.3100505503242156e-06, + "loss": 0.0762, + "step": 6187 + }, + { + "epoch": 2.005184705119896, + "grad_norm": 0.7099791765213013, + "learning_rate": 1.3092814435345845e-06, + "loss": 0.0733, + "step": 6188 + }, + { + "epoch": 2.00550874918989, + "grad_norm": 0.781180739402771, + "learning_rate": 1.3085124824773797e-06, + "loss": 0.0818, + "step": 6189 + }, + { + "epoch": 2.0058327932598834, + "grad_norm": 0.7897968888282776, + "learning_rate": 1.307743667246711e-06, + "loss": 0.084, + "step": 6190 + }, + { + "epoch": 2.006156837329877, + "grad_norm": 0.7949253916740417, + "learning_rate": 1.306974997936677e-06, + "loss": 0.0843, + "step": 6191 + }, + { + "epoch": 2.0064808813998702, + "grad_norm": 0.8092707395553589, + "learning_rate": 1.3062064746413522e-06, + "loss": 0.0783, + "step": 6192 + }, + { + "epoch": 2.006804925469864, + "grad_norm": 0.9016462564468384, + "learning_rate": 1.3054380974547998e-06, + "loss": 0.0845, + "step": 6193 + }, + { + "epoch": 2.0071289695398575, + "grad_norm": 0.8049711585044861, + "learning_rate": 1.3046698664710595e-06, + "loss": 0.076, + "step": 6194 + }, + { + "epoch": 2.007453013609851, + "grad_norm": 0.8476788401603699, + "learning_rate": 1.3039017817841553e-06, + "loss": 0.0785, + "step": 6195 + }, + { + "epoch": 2.0077770576798444, + "grad_norm": 0.860939621925354, + "learning_rate": 1.3031338434880952e-06, + "loss": 0.0837, + "step": 6196 + }, + { + "epoch": 2.008101101749838, + "grad_norm": 0.8430168032646179, + "learning_rate": 1.3023660516768638e-06, + "loss": 0.088, + "step": 6197 + }, + { + "epoch": 2.0084251458198317, + "grad_norm": 0.8520883917808533, + "learning_rate": 1.301598406444436e-06, + "loss": 0.0808, + "step": 6198 + }, + { + "epoch": 2.008749189889825, + "grad_norm": 0.8094761967658997, + "learning_rate": 1.3008309078847605e-06, + "loss": 0.0755, + "step": 6199 + }, + { + "epoch": 2.0090732339598185, + "grad_norm": 0.8996002674102783, + "learning_rate": 1.3000635560917735e-06, + "loss": 0.0789, + "step": 6200 + }, + { + "epoch": 2.009397278029812, + "grad_norm": 0.8273560404777527, + "learning_rate": 1.2992963511593904e-06, + "loss": 0.0801, + "step": 6201 + }, + { + "epoch": 2.0097213220998054, + "grad_norm": 0.9030359983444214, + "learning_rate": 1.2985292931815105e-06, + "loss": 0.0764, + "step": 6202 + }, + { + "epoch": 2.0100453661697992, + "grad_norm": 0.9174606800079346, + "learning_rate": 1.2977623822520141e-06, + "loss": 0.083, + "step": 6203 + }, + { + "epoch": 2.0103694102397927, + "grad_norm": 0.8109633326530457, + "learning_rate": 1.296995618464763e-06, + "loss": 0.0764, + "step": 6204 + }, + { + "epoch": 2.010693454309786, + "grad_norm": 0.7600975632667542, + "learning_rate": 1.2962290019136028e-06, + "loss": 0.0746, + "step": 6205 + }, + { + "epoch": 2.0110174983797795, + "grad_norm": 0.7989012002944946, + "learning_rate": 1.2954625326923602e-06, + "loss": 0.0745, + "step": 6206 + }, + { + "epoch": 2.0113415424497734, + "grad_norm": 0.8223599195480347, + "learning_rate": 1.294696210894842e-06, + "loss": 0.08, + "step": 6207 + }, + { + "epoch": 2.011665586519767, + "grad_norm": 0.786533772945404, + "learning_rate": 1.2939300366148389e-06, + "loss": 0.0758, + "step": 6208 + }, + { + "epoch": 2.01198963058976, + "grad_norm": 0.913048505783081, + "learning_rate": 1.2931640099461237e-06, + "loss": 0.0861, + "step": 6209 + }, + { + "epoch": 2.0123136746597536, + "grad_norm": 0.8590562343597412, + "learning_rate": 1.2923981309824507e-06, + "loss": 0.0855, + "step": 6210 + }, + { + "epoch": 2.012637718729747, + "grad_norm": 0.819560706615448, + "learning_rate": 1.291632399817557e-06, + "loss": 0.0767, + "step": 6211 + }, + { + "epoch": 2.012961762799741, + "grad_norm": 0.797325849533081, + "learning_rate": 1.2908668165451577e-06, + "loss": 0.0761, + "step": 6212 + }, + { + "epoch": 2.0132858068697344, + "grad_norm": 0.8787040710449219, + "learning_rate": 1.290101381258957e-06, + "loss": 0.0814, + "step": 6213 + }, + { + "epoch": 2.0136098509397278, + "grad_norm": 0.823753297328949, + "learning_rate": 1.289336094052632e-06, + "loss": 0.0816, + "step": 6214 + }, + { + "epoch": 2.013933895009721, + "grad_norm": 0.8768038749694824, + "learning_rate": 1.288570955019851e-06, + "loss": 0.0862, + "step": 6215 + }, + { + "epoch": 2.0142579390797146, + "grad_norm": 0.7695865035057068, + "learning_rate": 1.2878059642542566e-06, + "loss": 0.0798, + "step": 6216 + }, + { + "epoch": 2.0145819831497085, + "grad_norm": 0.7729963064193726, + "learning_rate": 1.2870411218494778e-06, + "loss": 0.074, + "step": 6217 + }, + { + "epoch": 2.014906027219702, + "grad_norm": 0.8531014919281006, + "learning_rate": 1.2862764278991236e-06, + "loss": 0.0798, + "step": 6218 + }, + { + "epoch": 2.0152300712896953, + "grad_norm": 0.8081734776496887, + "learning_rate": 1.2855118824967833e-06, + "loss": 0.0779, + "step": 6219 + }, + { + "epoch": 2.0155541153596888, + "grad_norm": 0.8807827830314636, + "learning_rate": 1.2847474857360332e-06, + "loss": 0.0787, + "step": 6220 + }, + { + "epoch": 2.0158781594296826, + "grad_norm": 0.8906676769256592, + "learning_rate": 1.2839832377104245e-06, + "loss": 0.0909, + "step": 6221 + }, + { + "epoch": 2.016202203499676, + "grad_norm": 0.7742086052894592, + "learning_rate": 1.2832191385134972e-06, + "loss": 0.0733, + "step": 6222 + }, + { + "epoch": 2.0165262475696695, + "grad_norm": 0.917332112789154, + "learning_rate": 1.2824551882387664e-06, + "loss": 0.0847, + "step": 6223 + }, + { + "epoch": 2.016850291639663, + "grad_norm": 0.787459135055542, + "learning_rate": 1.2816913869797353e-06, + "loss": 0.0773, + "step": 6224 + }, + { + "epoch": 2.0171743357096563, + "grad_norm": 0.7695854902267456, + "learning_rate": 1.2809277348298838e-06, + "loss": 0.0735, + "step": 6225 + }, + { + "epoch": 2.01749837977965, + "grad_norm": 0.8414837718009949, + "learning_rate": 1.2801642318826759e-06, + "loss": 0.0858, + "step": 6226 + }, + { + "epoch": 2.0178224238496436, + "grad_norm": 0.7882809042930603, + "learning_rate": 1.279400878231557e-06, + "loss": 0.077, + "step": 6227 + }, + { + "epoch": 2.018146467919637, + "grad_norm": 0.883886992931366, + "learning_rate": 1.2786376739699547e-06, + "loss": 0.0865, + "step": 6228 + }, + { + "epoch": 2.0184705119896305, + "grad_norm": 0.7620802521705627, + "learning_rate": 1.2778746191912778e-06, + "loss": 0.0714, + "step": 6229 + }, + { + "epoch": 2.0187945560596243, + "grad_norm": 0.7754296064376831, + "learning_rate": 1.2771117139889155e-06, + "loss": 0.0763, + "step": 6230 + }, + { + "epoch": 2.0191186001296177, + "grad_norm": 0.8185173869132996, + "learning_rate": 1.276348958456241e-06, + "loss": 0.0796, + "step": 6231 + }, + { + "epoch": 2.019442644199611, + "grad_norm": 0.8952303528785706, + "learning_rate": 1.2755863526866087e-06, + "loss": 0.0823, + "step": 6232 + }, + { + "epoch": 2.0197666882696046, + "grad_norm": 0.8771202564239502, + "learning_rate": 1.2748238967733529e-06, + "loss": 0.0848, + "step": 6233 + }, + { + "epoch": 2.020090732339598, + "grad_norm": 0.8356162309646606, + "learning_rate": 1.2740615908097915e-06, + "loss": 0.077, + "step": 6234 + }, + { + "epoch": 2.020414776409592, + "grad_norm": 0.8438013195991516, + "learning_rate": 1.2732994348892237e-06, + "loss": 0.0797, + "step": 6235 + }, + { + "epoch": 2.0207388204795853, + "grad_norm": 0.8168169856071472, + "learning_rate": 1.2725374291049296e-06, + "loss": 0.0781, + "step": 6236 + }, + { + "epoch": 2.0210628645495787, + "grad_norm": 0.8462203741073608, + "learning_rate": 1.2717755735501725e-06, + "loss": 0.0829, + "step": 6237 + }, + { + "epoch": 2.021386908619572, + "grad_norm": 0.8509506583213806, + "learning_rate": 1.2710138683181937e-06, + "loss": 0.079, + "step": 6238 + }, + { + "epoch": 2.0217109526895656, + "grad_norm": 0.808458149433136, + "learning_rate": 1.2702523135022205e-06, + "loss": 0.072, + "step": 6239 + }, + { + "epoch": 2.0220349967595594, + "grad_norm": 0.8539109826087952, + "learning_rate": 1.2694909091954588e-06, + "loss": 0.0844, + "step": 6240 + }, + { + "epoch": 2.022359040829553, + "grad_norm": 0.9025552272796631, + "learning_rate": 1.2687296554910978e-06, + "loss": 0.0766, + "step": 6241 + }, + { + "epoch": 2.0226830848995463, + "grad_norm": 0.8735927939414978, + "learning_rate": 1.2679685524823082e-06, + "loss": 0.078, + "step": 6242 + }, + { + "epoch": 2.0230071289695397, + "grad_norm": 0.8082770705223083, + "learning_rate": 1.2672076002622386e-06, + "loss": 0.0741, + "step": 6243 + }, + { + "epoch": 2.0233311730395336, + "grad_norm": 0.8505511283874512, + "learning_rate": 1.2664467989240265e-06, + "loss": 0.0788, + "step": 6244 + }, + { + "epoch": 2.023655217109527, + "grad_norm": 0.854013979434967, + "learning_rate": 1.2656861485607828e-06, + "loss": 0.0793, + "step": 6245 + }, + { + "epoch": 2.0239792611795204, + "grad_norm": 0.9275755882263184, + "learning_rate": 1.264925649265607e-06, + "loss": 0.081, + "step": 6246 + }, + { + "epoch": 2.024303305249514, + "grad_norm": 0.8417896032333374, + "learning_rate": 1.2641653011315746e-06, + "loss": 0.0758, + "step": 6247 + }, + { + "epoch": 2.0246273493195073, + "grad_norm": 0.8762497901916504, + "learning_rate": 1.2634051042517453e-06, + "loss": 0.0795, + "step": 6248 + }, + { + "epoch": 2.024951393389501, + "grad_norm": 0.861675500869751, + "learning_rate": 1.2626450587191602e-06, + "loss": 0.0775, + "step": 6249 + }, + { + "epoch": 2.0252754374594946, + "grad_norm": 0.9877171516418457, + "learning_rate": 1.2618851646268416e-06, + "loss": 0.0827, + "step": 6250 + }, + { + "epoch": 2.025599481529488, + "grad_norm": 0.8101239800453186, + "learning_rate": 1.2611254220677937e-06, + "loss": 0.0698, + "step": 6251 + }, + { + "epoch": 2.0259235255994814, + "grad_norm": 0.7930020689964294, + "learning_rate": 1.260365831134999e-06, + "loss": 0.0721, + "step": 6252 + }, + { + "epoch": 2.026247569669475, + "grad_norm": 0.9465885162353516, + "learning_rate": 1.259606391921428e-06, + "loss": 0.0766, + "step": 6253 + }, + { + "epoch": 2.0265716137394687, + "grad_norm": 0.8918383717536926, + "learning_rate": 1.2588471045200256e-06, + "loss": 0.0717, + "step": 6254 + }, + { + "epoch": 2.026895657809462, + "grad_norm": 0.8400719165802002, + "learning_rate": 1.2580879690237224e-06, + "loss": 0.0785, + "step": 6255 + }, + { + "epoch": 2.0272197018794555, + "grad_norm": 0.854532778263092, + "learning_rate": 1.257328985525429e-06, + "loss": 0.0799, + "step": 6256 + }, + { + "epoch": 2.027543745949449, + "grad_norm": 0.8931845426559448, + "learning_rate": 1.256570154118038e-06, + "loss": 0.0827, + "step": 6257 + }, + { + "epoch": 2.027867790019443, + "grad_norm": 0.9417962431907654, + "learning_rate": 1.2558114748944226e-06, + "loss": 0.0821, + "step": 6258 + }, + { + "epoch": 2.0281918340894363, + "grad_norm": 0.8137034773826599, + "learning_rate": 1.2550529479474383e-06, + "loss": 0.0775, + "step": 6259 + }, + { + "epoch": 2.0285158781594297, + "grad_norm": 0.8253341913223267, + "learning_rate": 1.2542945733699216e-06, + "loss": 0.0745, + "step": 6260 + }, + { + "epoch": 2.028839922229423, + "grad_norm": 0.9062314629554749, + "learning_rate": 1.2535363512546892e-06, + "loss": 0.0814, + "step": 6261 + }, + { + "epoch": 2.0291639662994165, + "grad_norm": 0.7367941737174988, + "learning_rate": 1.2527782816945405e-06, + "loss": 0.0683, + "step": 6262 + }, + { + "epoch": 2.0294880103694104, + "grad_norm": 0.8357585668563843, + "learning_rate": 1.2520203647822563e-06, + "loss": 0.0791, + "step": 6263 + }, + { + "epoch": 2.029812054439404, + "grad_norm": 0.9064425826072693, + "learning_rate": 1.2512626006105977e-06, + "loss": 0.086, + "step": 6264 + }, + { + "epoch": 2.0301360985093972, + "grad_norm": 0.7778387665748596, + "learning_rate": 1.2505049892723083e-06, + "loss": 0.0769, + "step": 6265 + }, + { + "epoch": 2.0304601425793907, + "grad_norm": 0.8687406778335571, + "learning_rate": 1.2497475308601134e-06, + "loss": 0.0844, + "step": 6266 + }, + { + "epoch": 2.0307841866493845, + "grad_norm": 0.8010364174842834, + "learning_rate": 1.248990225466715e-06, + "loss": 0.077, + "step": 6267 + }, + { + "epoch": 2.031108230719378, + "grad_norm": 0.8573675751686096, + "learning_rate": 1.2482330731848044e-06, + "loss": 0.0789, + "step": 6268 + }, + { + "epoch": 2.0314322747893714, + "grad_norm": 0.8595082759857178, + "learning_rate": 1.2474760741070465e-06, + "loss": 0.0807, + "step": 6269 + }, + { + "epoch": 2.031756318859365, + "grad_norm": 0.9239386320114136, + "learning_rate": 1.246719228326092e-06, + "loss": 0.082, + "step": 6270 + }, + { + "epoch": 2.0320803629293582, + "grad_norm": 0.8387404680252075, + "learning_rate": 1.2459625359345712e-06, + "loss": 0.0746, + "step": 6271 + }, + { + "epoch": 2.032404406999352, + "grad_norm": 0.8717005848884583, + "learning_rate": 1.2452059970250957e-06, + "loss": 0.0818, + "step": 6272 + }, + { + "epoch": 2.0327284510693455, + "grad_norm": 0.9566702842712402, + "learning_rate": 1.2444496116902602e-06, + "loss": 0.0867, + "step": 6273 + }, + { + "epoch": 2.033052495139339, + "grad_norm": 0.7944853901863098, + "learning_rate": 1.2436933800226352e-06, + "loss": 0.0757, + "step": 6274 + }, + { + "epoch": 2.0333765392093324, + "grad_norm": 0.9047906398773193, + "learning_rate": 1.2429373021147808e-06, + "loss": 0.0883, + "step": 6275 + }, + { + "epoch": 2.033700583279326, + "grad_norm": 0.8332332372665405, + "learning_rate": 1.2421813780592294e-06, + "loss": 0.0774, + "step": 6276 + }, + { + "epoch": 2.0340246273493197, + "grad_norm": 0.8721747398376465, + "learning_rate": 1.2414256079485021e-06, + "loss": 0.0821, + "step": 6277 + }, + { + "epoch": 2.034348671419313, + "grad_norm": 0.9223491549491882, + "learning_rate": 1.240669991875096e-06, + "loss": 0.0817, + "step": 6278 + }, + { + "epoch": 2.0346727154893065, + "grad_norm": 0.8711641430854797, + "learning_rate": 1.2399145299314913e-06, + "loss": 0.0867, + "step": 6279 + }, + { + "epoch": 2.0349967595593, + "grad_norm": 0.8048623204231262, + "learning_rate": 1.2391592222101497e-06, + "loss": 0.0727, + "step": 6280 + }, + { + "epoch": 2.035320803629294, + "grad_norm": 0.8184431195259094, + "learning_rate": 1.2384040688035135e-06, + "loss": 0.0767, + "step": 6281 + }, + { + "epoch": 2.035644847699287, + "grad_norm": 0.7848328948020935, + "learning_rate": 1.2376490698040069e-06, + "loss": 0.0702, + "step": 6282 + }, + { + "epoch": 2.0359688917692806, + "grad_norm": 0.8464064002037048, + "learning_rate": 1.236894225304032e-06, + "loss": 0.0756, + "step": 6283 + }, + { + "epoch": 2.036292935839274, + "grad_norm": 0.8497661352157593, + "learning_rate": 1.2361395353959776e-06, + "loss": 0.0767, + "step": 6284 + }, + { + "epoch": 2.0366169799092675, + "grad_norm": 0.8617527484893799, + "learning_rate": 1.2353850001722084e-06, + "loss": 0.0808, + "step": 6285 + }, + { + "epoch": 2.0369410239792614, + "grad_norm": 0.8274610638618469, + "learning_rate": 1.2346306197250727e-06, + "loss": 0.0733, + "step": 6286 + }, + { + "epoch": 2.037265068049255, + "grad_norm": 0.9231866002082825, + "learning_rate": 1.2338763941468993e-06, + "loss": 0.0873, + "step": 6287 + }, + { + "epoch": 2.037589112119248, + "grad_norm": 0.9184801578521729, + "learning_rate": 1.2331223235299983e-06, + "loss": 0.0836, + "step": 6288 + }, + { + "epoch": 2.0379131561892416, + "grad_norm": 0.7702471017837524, + "learning_rate": 1.2323684079666604e-06, + "loss": 0.0724, + "step": 6289 + }, + { + "epoch": 2.038237200259235, + "grad_norm": 0.8048983812332153, + "learning_rate": 1.2316146475491578e-06, + "loss": 0.0746, + "step": 6290 + }, + { + "epoch": 2.038561244329229, + "grad_norm": 0.8604027032852173, + "learning_rate": 1.2308610423697446e-06, + "loss": 0.0788, + "step": 6291 + }, + { + "epoch": 2.0388852883992223, + "grad_norm": 0.8295606374740601, + "learning_rate": 1.2301075925206524e-06, + "loss": 0.0766, + "step": 6292 + }, + { + "epoch": 2.0392093324692158, + "grad_norm": 0.7977343201637268, + "learning_rate": 1.2293542980940974e-06, + "loss": 0.0726, + "step": 6293 + }, + { + "epoch": 2.039533376539209, + "grad_norm": 0.9230608940124512, + "learning_rate": 1.2286011591822756e-06, + "loss": 0.0883, + "step": 6294 + }, + { + "epoch": 2.039857420609203, + "grad_norm": 0.8658449649810791, + "learning_rate": 1.2278481758773636e-06, + "loss": 0.0813, + "step": 6295 + }, + { + "epoch": 2.0401814646791965, + "grad_norm": 0.857531726360321, + "learning_rate": 1.2270953482715197e-06, + "loss": 0.0796, + "step": 6296 + }, + { + "epoch": 2.04050550874919, + "grad_norm": 0.9500970840454102, + "learning_rate": 1.2263426764568835e-06, + "loss": 0.0836, + "step": 6297 + }, + { + "epoch": 2.0408295528191833, + "grad_norm": 0.8607531785964966, + "learning_rate": 1.2255901605255715e-06, + "loss": 0.0777, + "step": 6298 + }, + { + "epoch": 2.0411535968891767, + "grad_norm": 0.8002095818519592, + "learning_rate": 1.224837800569689e-06, + "loss": 0.07, + "step": 6299 + }, + { + "epoch": 2.0414776409591706, + "grad_norm": 0.9007039666175842, + "learning_rate": 1.224085596681314e-06, + "loss": 0.082, + "step": 6300 + }, + { + "epoch": 2.041801685029164, + "grad_norm": 0.8380284309387207, + "learning_rate": 1.22333354895251e-06, + "loss": 0.0774, + "step": 6301 + }, + { + "epoch": 2.0421257290991575, + "grad_norm": 0.8431714773178101, + "learning_rate": 1.2225816574753208e-06, + "loss": 0.0763, + "step": 6302 + }, + { + "epoch": 2.042449773169151, + "grad_norm": 0.8380304574966431, + "learning_rate": 1.2218299223417702e-06, + "loss": 0.0738, + "step": 6303 + }, + { + "epoch": 2.0427738172391443, + "grad_norm": 0.930404782295227, + "learning_rate": 1.2210783436438644e-06, + "loss": 0.0801, + "step": 6304 + }, + { + "epoch": 2.043097861309138, + "grad_norm": 0.8345676064491272, + "learning_rate": 1.2203269214735866e-06, + "loss": 0.0766, + "step": 6305 + }, + { + "epoch": 2.0434219053791316, + "grad_norm": 0.8367553949356079, + "learning_rate": 1.2195756559229072e-06, + "loss": 0.074, + "step": 6306 + }, + { + "epoch": 2.043745949449125, + "grad_norm": 0.9327186346054077, + "learning_rate": 1.2188245470837702e-06, + "loss": 0.0865, + "step": 6307 + }, + { + "epoch": 2.0440699935191184, + "grad_norm": 0.7962414622306824, + "learning_rate": 1.218073595048108e-06, + "loss": 0.0751, + "step": 6308 + }, + { + "epoch": 2.0443940375891123, + "grad_norm": 0.8408752679824829, + "learning_rate": 1.2173227999078264e-06, + "loss": 0.0775, + "step": 6309 + }, + { + "epoch": 2.0447180816591057, + "grad_norm": 0.8453505039215088, + "learning_rate": 1.2165721617548172e-06, + "loss": 0.0811, + "step": 6310 + }, + { + "epoch": 2.045042125729099, + "grad_norm": 0.8683754801750183, + "learning_rate": 1.2158216806809505e-06, + "loss": 0.079, + "step": 6311 + }, + { + "epoch": 2.0453661697990926, + "grad_norm": 0.8050029277801514, + "learning_rate": 1.2150713567780786e-06, + "loss": 0.0739, + "step": 6312 + }, + { + "epoch": 2.045690213869086, + "grad_norm": 0.8375382423400879, + "learning_rate": 1.2143211901380341e-06, + "loss": 0.0755, + "step": 6313 + }, + { + "epoch": 2.04601425793908, + "grad_norm": 0.8136559724807739, + "learning_rate": 1.2135711808526282e-06, + "loss": 0.0765, + "step": 6314 + }, + { + "epoch": 2.0463383020090733, + "grad_norm": 0.8717918395996094, + "learning_rate": 1.2128213290136578e-06, + "loss": 0.0789, + "step": 6315 + }, + { + "epoch": 2.0466623460790667, + "grad_norm": 0.8089046478271484, + "learning_rate": 1.212071634712895e-06, + "loss": 0.069, + "step": 6316 + }, + { + "epoch": 2.04698639014906, + "grad_norm": 0.8941287398338318, + "learning_rate": 1.211322098042096e-06, + "loss": 0.0812, + "step": 6317 + }, + { + "epoch": 2.047310434219054, + "grad_norm": 0.9220302700996399, + "learning_rate": 1.2105727190929967e-06, + "loss": 0.0816, + "step": 6318 + }, + { + "epoch": 2.0476344782890474, + "grad_norm": 0.9743109345436096, + "learning_rate": 1.209823497957314e-06, + "loss": 0.0868, + "step": 6319 + }, + { + "epoch": 2.047958522359041, + "grad_norm": 0.8121354579925537, + "learning_rate": 1.2090744347267452e-06, + "loss": 0.0762, + "step": 6320 + }, + { + "epoch": 2.0482825664290343, + "grad_norm": 0.830605149269104, + "learning_rate": 1.2083255294929697e-06, + "loss": 0.0782, + "step": 6321 + }, + { + "epoch": 2.0486066104990277, + "grad_norm": 0.9353241920471191, + "learning_rate": 1.2075767823476439e-06, + "loss": 0.0848, + "step": 6322 + }, + { + "epoch": 2.0489306545690216, + "grad_norm": 1.0210410356521606, + "learning_rate": 1.2068281933824084e-06, + "loss": 0.0916, + "step": 6323 + }, + { + "epoch": 2.049254698639015, + "grad_norm": 0.8150740265846252, + "learning_rate": 1.2060797626888828e-06, + "loss": 0.0701, + "step": 6324 + }, + { + "epoch": 2.0495787427090084, + "grad_norm": 0.849547266960144, + "learning_rate": 1.2053314903586685e-06, + "loss": 0.0799, + "step": 6325 + }, + { + "epoch": 2.049902786779002, + "grad_norm": 0.8484644889831543, + "learning_rate": 1.2045833764833461e-06, + "loss": 0.0792, + "step": 6326 + }, + { + "epoch": 2.0502268308489953, + "grad_norm": 0.861136257648468, + "learning_rate": 1.2038354211544781e-06, + "loss": 0.0801, + "step": 6327 + }, + { + "epoch": 2.050550874918989, + "grad_norm": 0.8917108178138733, + "learning_rate": 1.2030876244636078e-06, + "loss": 0.0771, + "step": 6328 + }, + { + "epoch": 2.0508749189889826, + "grad_norm": 0.8093529343605042, + "learning_rate": 1.202339986502255e-06, + "loss": 0.0726, + "step": 6329 + }, + { + "epoch": 2.051198963058976, + "grad_norm": 0.8013548254966736, + "learning_rate": 1.2015925073619275e-06, + "loss": 0.0736, + "step": 6330 + }, + { + "epoch": 2.0515230071289694, + "grad_norm": 0.7830600142478943, + "learning_rate": 1.2008451871341056e-06, + "loss": 0.073, + "step": 6331 + }, + { + "epoch": 2.0518470511989633, + "grad_norm": 0.8527432084083557, + "learning_rate": 1.200098025910258e-06, + "loss": 0.0811, + "step": 6332 + }, + { + "epoch": 2.0521710952689567, + "grad_norm": 0.8012470006942749, + "learning_rate": 1.1993510237818269e-06, + "loss": 0.0738, + "step": 6333 + }, + { + "epoch": 2.05249513933895, + "grad_norm": 0.8954493999481201, + "learning_rate": 1.1986041808402393e-06, + "loss": 0.0867, + "step": 6334 + }, + { + "epoch": 2.0528191834089435, + "grad_norm": 0.8696557283401489, + "learning_rate": 1.1978574971769025e-06, + "loss": 0.082, + "step": 6335 + }, + { + "epoch": 2.053143227478937, + "grad_norm": 0.7633064389228821, + "learning_rate": 1.1971109728832003e-06, + "loss": 0.074, + "step": 6336 + }, + { + "epoch": 2.053467271548931, + "grad_norm": 1.0557372570037842, + "learning_rate": 1.196364608050504e-06, + "loss": 0.0797, + "step": 6337 + }, + { + "epoch": 2.0537913156189243, + "grad_norm": 0.772982656955719, + "learning_rate": 1.1956184027701576e-06, + "loss": 0.0729, + "step": 6338 + }, + { + "epoch": 2.0541153596889177, + "grad_norm": 0.8109322190284729, + "learning_rate": 1.1948723571334932e-06, + "loss": 0.076, + "step": 6339 + }, + { + "epoch": 2.054439403758911, + "grad_norm": 0.870343804359436, + "learning_rate": 1.1941264712318167e-06, + "loss": 0.0764, + "step": 6340 + }, + { + "epoch": 2.0547634478289045, + "grad_norm": 0.9311466813087463, + "learning_rate": 1.1933807451564186e-06, + "loss": 0.0773, + "step": 6341 + }, + { + "epoch": 2.0550874918988984, + "grad_norm": 0.8491466641426086, + "learning_rate": 1.192635178998568e-06, + "loss": 0.0706, + "step": 6342 + }, + { + "epoch": 2.055411535968892, + "grad_norm": 0.7975406050682068, + "learning_rate": 1.191889772849515e-06, + "loss": 0.0729, + "step": 6343 + }, + { + "epoch": 2.0557355800388852, + "grad_norm": 0.8268863558769226, + "learning_rate": 1.1911445268004917e-06, + "loss": 0.077, + "step": 6344 + }, + { + "epoch": 2.0560596241088787, + "grad_norm": 0.9013711214065552, + "learning_rate": 1.1903994409427063e-06, + "loss": 0.0801, + "step": 6345 + }, + { + "epoch": 2.0563836681788725, + "grad_norm": 0.7981849908828735, + "learning_rate": 1.1896545153673517e-06, + "loss": 0.0741, + "step": 6346 + }, + { + "epoch": 2.056707712248866, + "grad_norm": 0.8823484182357788, + "learning_rate": 1.1889097501655991e-06, + "loss": 0.0776, + "step": 6347 + }, + { + "epoch": 2.0570317563188594, + "grad_norm": 0.9027769565582275, + "learning_rate": 1.1881651454286008e-06, + "loss": 0.0792, + "step": 6348 + }, + { + "epoch": 2.057355800388853, + "grad_norm": 0.8280683755874634, + "learning_rate": 1.1874207012474891e-06, + "loss": 0.0769, + "step": 6349 + }, + { + "epoch": 2.057679844458846, + "grad_norm": 0.8139511346817017, + "learning_rate": 1.186676417713377e-06, + "loss": 0.0754, + "step": 6350 + }, + { + "epoch": 2.05800388852884, + "grad_norm": 0.8425285816192627, + "learning_rate": 1.1859322949173572e-06, + "loss": 0.0801, + "step": 6351 + }, + { + "epoch": 2.0583279325988335, + "grad_norm": 0.9451056122779846, + "learning_rate": 1.1851883329505043e-06, + "loss": 0.0838, + "step": 6352 + }, + { + "epoch": 2.058651976668827, + "grad_norm": 0.8416407704353333, + "learning_rate": 1.1844445319038694e-06, + "loss": 0.0758, + "step": 6353 + }, + { + "epoch": 2.0589760207388204, + "grad_norm": 0.8476356267929077, + "learning_rate": 1.18370089186849e-06, + "loss": 0.0823, + "step": 6354 + }, + { + "epoch": 2.059300064808814, + "grad_norm": 0.9370405077934265, + "learning_rate": 1.1829574129353777e-06, + "loss": 0.0773, + "step": 6355 + }, + { + "epoch": 2.0596241088788076, + "grad_norm": 0.865960955619812, + "learning_rate": 1.182214095195528e-06, + "loss": 0.074, + "step": 6356 + }, + { + "epoch": 2.059948152948801, + "grad_norm": 0.8611675500869751, + "learning_rate": 1.181470938739917e-06, + "loss": 0.0848, + "step": 6357 + }, + { + "epoch": 2.0602721970187945, + "grad_norm": 0.9232220649719238, + "learning_rate": 1.1807279436594967e-06, + "loss": 0.0815, + "step": 6358 + }, + { + "epoch": 2.060596241088788, + "grad_norm": 0.9129186272621155, + "learning_rate": 1.1799851100452067e-06, + "loss": 0.0798, + "step": 6359 + }, + { + "epoch": 2.060920285158782, + "grad_norm": 0.8891286849975586, + "learning_rate": 1.1792424379879582e-06, + "loss": 0.081, + "step": 6360 + }, + { + "epoch": 2.061244329228775, + "grad_norm": 0.8557460308074951, + "learning_rate": 1.1784999275786515e-06, + "loss": 0.0764, + "step": 6361 + }, + { + "epoch": 2.0615683732987686, + "grad_norm": 0.8832924365997314, + "learning_rate": 1.177757578908159e-06, + "loss": 0.0825, + "step": 6362 + }, + { + "epoch": 2.061892417368762, + "grad_norm": 0.859259843826294, + "learning_rate": 1.17701539206734e-06, + "loss": 0.0855, + "step": 6363 + }, + { + "epoch": 2.0622164614387555, + "grad_norm": 0.804772138595581, + "learning_rate": 1.1762733671470285e-06, + "loss": 0.0733, + "step": 6364 + }, + { + "epoch": 2.0625405055087493, + "grad_norm": 0.8585007190704346, + "learning_rate": 1.1755315042380425e-06, + "loss": 0.0821, + "step": 6365 + }, + { + "epoch": 2.0628645495787428, + "grad_norm": 0.8764128684997559, + "learning_rate": 1.1747898034311782e-06, + "loss": 0.0857, + "step": 6366 + }, + { + "epoch": 2.063188593648736, + "grad_norm": 0.858251690864563, + "learning_rate": 1.1740482648172132e-06, + "loss": 0.0817, + "step": 6367 + }, + { + "epoch": 2.0635126377187296, + "grad_norm": 0.8610502481460571, + "learning_rate": 1.1733068884869053e-06, + "loss": 0.0709, + "step": 6368 + }, + { + "epoch": 2.0638366817887235, + "grad_norm": 0.8864075541496277, + "learning_rate": 1.172565674530989e-06, + "loss": 0.0862, + "step": 6369 + }, + { + "epoch": 2.064160725858717, + "grad_norm": 0.8807039260864258, + "learning_rate": 1.1718246230401856e-06, + "loss": 0.0837, + "step": 6370 + }, + { + "epoch": 2.0644847699287103, + "grad_norm": 0.913631796836853, + "learning_rate": 1.1710837341051892e-06, + "loss": 0.0887, + "step": 6371 + }, + { + "epoch": 2.0648088139987038, + "grad_norm": 0.8298909664154053, + "learning_rate": 1.1703430078166792e-06, + "loss": 0.0807, + "step": 6372 + }, + { + "epoch": 2.065132858068697, + "grad_norm": 0.9504325985908508, + "learning_rate": 1.169602444265313e-06, + "loss": 0.0878, + "step": 6373 + }, + { + "epoch": 2.065456902138691, + "grad_norm": 0.8303631544113159, + "learning_rate": 1.168862043541728e-06, + "loss": 0.0778, + "step": 6374 + }, + { + "epoch": 2.0657809462086845, + "grad_norm": 0.860530436038971, + "learning_rate": 1.1681218057365429e-06, + "loss": 0.081, + "step": 6375 + }, + { + "epoch": 2.066104990278678, + "grad_norm": 0.8380438685417175, + "learning_rate": 1.167381730940356e-06, + "loss": 0.0781, + "step": 6376 + }, + { + "epoch": 2.0664290343486713, + "grad_norm": 0.8239679336547852, + "learning_rate": 1.1666418192437434e-06, + "loss": 0.0773, + "step": 6377 + }, + { + "epoch": 2.0667530784186647, + "grad_norm": 0.8223795294761658, + "learning_rate": 1.1659020707372643e-06, + "loss": 0.0753, + "step": 6378 + }, + { + "epoch": 2.0670771224886586, + "grad_norm": 0.7889320850372314, + "learning_rate": 1.1651624855114565e-06, + "loss": 0.0788, + "step": 6379 + }, + { + "epoch": 2.067401166558652, + "grad_norm": 0.886277973651886, + "learning_rate": 1.1644230636568384e-06, + "loss": 0.0752, + "step": 6380 + }, + { + "epoch": 2.0677252106286454, + "grad_norm": 0.8053793907165527, + "learning_rate": 1.1636838052639081e-06, + "loss": 0.0779, + "step": 6381 + }, + { + "epoch": 2.068049254698639, + "grad_norm": 0.8326542377471924, + "learning_rate": 1.1629447104231435e-06, + "loss": 0.0796, + "step": 6382 + }, + { + "epoch": 2.0683732987686327, + "grad_norm": 0.865424394607544, + "learning_rate": 1.1622057792250033e-06, + "loss": 0.0824, + "step": 6383 + }, + { + "epoch": 2.068697342838626, + "grad_norm": 0.8067560791969299, + "learning_rate": 1.1614670117599231e-06, + "loss": 0.0732, + "step": 6384 + }, + { + "epoch": 2.0690213869086196, + "grad_norm": 0.9387754201889038, + "learning_rate": 1.1607284081183245e-06, + "loss": 0.0818, + "step": 6385 + }, + { + "epoch": 2.069345430978613, + "grad_norm": 0.8114498853683472, + "learning_rate": 1.1599899683906026e-06, + "loss": 0.074, + "step": 6386 + }, + { + "epoch": 2.0696694750486064, + "grad_norm": 0.8556808233261108, + "learning_rate": 1.1592516926671367e-06, + "loss": 0.0783, + "step": 6387 + }, + { + "epoch": 2.0699935191186003, + "grad_norm": 0.904638946056366, + "learning_rate": 1.1585135810382836e-06, + "loss": 0.0811, + "step": 6388 + }, + { + "epoch": 2.0703175631885937, + "grad_norm": 0.8684148788452148, + "learning_rate": 1.1577756335943818e-06, + "loss": 0.0751, + "step": 6389 + }, + { + "epoch": 2.070641607258587, + "grad_norm": 0.9171475768089294, + "learning_rate": 1.1570378504257499e-06, + "loss": 0.0775, + "step": 6390 + }, + { + "epoch": 2.0709656513285806, + "grad_norm": 0.949954092502594, + "learning_rate": 1.156300231622682e-06, + "loss": 0.0844, + "step": 6391 + }, + { + "epoch": 2.071289695398574, + "grad_norm": 0.9226496815681458, + "learning_rate": 1.1555627772754595e-06, + "loss": 0.0763, + "step": 6392 + }, + { + "epoch": 2.071613739468568, + "grad_norm": 0.888752281665802, + "learning_rate": 1.1548254874743365e-06, + "loss": 0.0773, + "step": 6393 + }, + { + "epoch": 2.0719377835385613, + "grad_norm": 0.8955729007720947, + "learning_rate": 1.154088362309553e-06, + "loss": 0.0805, + "step": 6394 + }, + { + "epoch": 2.0722618276085547, + "grad_norm": 0.8570363521575928, + "learning_rate": 1.1533514018713238e-06, + "loss": 0.0815, + "step": 6395 + }, + { + "epoch": 2.072585871678548, + "grad_norm": 0.9176596403121948, + "learning_rate": 1.1526146062498464e-06, + "loss": 0.0831, + "step": 6396 + }, + { + "epoch": 2.072909915748542, + "grad_norm": 0.8681780695915222, + "learning_rate": 1.1518779755352977e-06, + "loss": 0.0785, + "step": 6397 + }, + { + "epoch": 2.0732339598185354, + "grad_norm": 0.8597813844680786, + "learning_rate": 1.1511415098178336e-06, + "loss": 0.0769, + "step": 6398 + }, + { + "epoch": 2.073558003888529, + "grad_norm": 0.8584122061729431, + "learning_rate": 1.1504052091875917e-06, + "loss": 0.0763, + "step": 6399 + }, + { + "epoch": 2.0738820479585223, + "grad_norm": 0.9270933270454407, + "learning_rate": 1.1496690737346864e-06, + "loss": 0.0823, + "step": 6400 + }, + { + "epoch": 2.0742060920285157, + "grad_norm": 0.8876338601112366, + "learning_rate": 1.148933103549214e-06, + "loss": 0.0782, + "step": 6401 + }, + { + "epoch": 2.0745301360985096, + "grad_norm": 0.8715948462486267, + "learning_rate": 1.1481972987212505e-06, + "loss": 0.0806, + "step": 6402 + }, + { + "epoch": 2.074854180168503, + "grad_norm": 0.8495489358901978, + "learning_rate": 1.1474616593408513e-06, + "loss": 0.0784, + "step": 6403 + }, + { + "epoch": 2.0751782242384964, + "grad_norm": 0.8601410388946533, + "learning_rate": 1.1467261854980513e-06, + "loss": 0.0773, + "step": 6404 + }, + { + "epoch": 2.07550226830849, + "grad_norm": 0.9512822031974792, + "learning_rate": 1.1459908772828658e-06, + "loss": 0.0792, + "step": 6405 + }, + { + "epoch": 2.0758263123784833, + "grad_norm": 0.8868715167045593, + "learning_rate": 1.1452557347852885e-06, + "loss": 0.0814, + "step": 6406 + }, + { + "epoch": 2.076150356448477, + "grad_norm": 0.9470981359481812, + "learning_rate": 1.1445207580952956e-06, + "loss": 0.0904, + "step": 6407 + }, + { + "epoch": 2.0764744005184705, + "grad_norm": 0.9002374410629272, + "learning_rate": 1.143785947302839e-06, + "loss": 0.088, + "step": 6408 + }, + { + "epoch": 2.076798444588464, + "grad_norm": 0.7991862893104553, + "learning_rate": 1.143051302497853e-06, + "loss": 0.0712, + "step": 6409 + }, + { + "epoch": 2.0771224886584574, + "grad_norm": 0.8354719281196594, + "learning_rate": 1.1423168237702515e-06, + "loss": 0.0699, + "step": 6410 + }, + { + "epoch": 2.0774465327284513, + "grad_norm": 0.8705977201461792, + "learning_rate": 1.1415825112099274e-06, + "loss": 0.0742, + "step": 6411 + }, + { + "epoch": 2.0777705767984447, + "grad_norm": 0.7786136269569397, + "learning_rate": 1.1408483649067541e-06, + "loss": 0.0743, + "step": 6412 + }, + { + "epoch": 2.078094620868438, + "grad_norm": 0.8944233059883118, + "learning_rate": 1.1401143849505816e-06, + "loss": 0.0845, + "step": 6413 + }, + { + "epoch": 2.0784186649384315, + "grad_norm": 0.8579782843589783, + "learning_rate": 1.1393805714312456e-06, + "loss": 0.0816, + "step": 6414 + }, + { + "epoch": 2.078742709008425, + "grad_norm": 0.8879421949386597, + "learning_rate": 1.138646924438554e-06, + "loss": 0.0819, + "step": 6415 + }, + { + "epoch": 2.079066753078419, + "grad_norm": 0.8416301608085632, + "learning_rate": 1.1379134440623018e-06, + "loss": 0.073, + "step": 6416 + }, + { + "epoch": 2.0793907971484122, + "grad_norm": 0.9148510694503784, + "learning_rate": 1.137180130392257e-06, + "loss": 0.0858, + "step": 6417 + }, + { + "epoch": 2.0797148412184057, + "grad_norm": 0.8215204477310181, + "learning_rate": 1.1364469835181712e-06, + "loss": 0.0745, + "step": 6418 + }, + { + "epoch": 2.080038885288399, + "grad_norm": 0.86520916223526, + "learning_rate": 1.1357140035297745e-06, + "loss": 0.0767, + "step": 6419 + }, + { + "epoch": 2.080362929358393, + "grad_norm": 0.837408721446991, + "learning_rate": 1.1349811905167762e-06, + "loss": 0.0743, + "step": 6420 + }, + { + "epoch": 2.0806869734283864, + "grad_norm": 0.8289994597434998, + "learning_rate": 1.134248544568867e-06, + "loss": 0.0785, + "step": 6421 + }, + { + "epoch": 2.08101101749838, + "grad_norm": 0.9177697896957397, + "learning_rate": 1.1335160657757121e-06, + "loss": 0.0806, + "step": 6422 + }, + { + "epoch": 2.0813350615683732, + "grad_norm": 0.9063735604286194, + "learning_rate": 1.1327837542269645e-06, + "loss": 0.0825, + "step": 6423 + }, + { + "epoch": 2.0816591056383666, + "grad_norm": 0.9097846150398254, + "learning_rate": 1.1320516100122487e-06, + "loss": 0.0817, + "step": 6424 + }, + { + "epoch": 2.0819831497083605, + "grad_norm": 1.008061408996582, + "learning_rate": 1.1313196332211728e-06, + "loss": 0.0807, + "step": 6425 + }, + { + "epoch": 2.082307193778354, + "grad_norm": 0.872922956943512, + "learning_rate": 1.130587823943324e-06, + "loss": 0.0818, + "step": 6426 + }, + { + "epoch": 2.0826312378483474, + "grad_norm": 0.8754384517669678, + "learning_rate": 1.1298561822682687e-06, + "loss": 0.0786, + "step": 6427 + }, + { + "epoch": 2.082955281918341, + "grad_norm": 0.8794349431991577, + "learning_rate": 1.1291247082855528e-06, + "loss": 0.074, + "step": 6428 + }, + { + "epoch": 2.083279325988334, + "grad_norm": 0.82512366771698, + "learning_rate": 1.1283934020847015e-06, + "loss": 0.0758, + "step": 6429 + }, + { + "epoch": 2.083603370058328, + "grad_norm": 0.8695098161697388, + "learning_rate": 1.1276622637552203e-06, + "loss": 0.08, + "step": 6430 + }, + { + "epoch": 2.0839274141283215, + "grad_norm": 0.9111599922180176, + "learning_rate": 1.126931293386592e-06, + "loss": 0.0805, + "step": 6431 + }, + { + "epoch": 2.084251458198315, + "grad_norm": 0.8794967532157898, + "learning_rate": 1.1262004910682811e-06, + "loss": 0.0809, + "step": 6432 + }, + { + "epoch": 2.0845755022683083, + "grad_norm": 0.8865387439727783, + "learning_rate": 1.1254698568897308e-06, + "loss": 0.0822, + "step": 6433 + }, + { + "epoch": 2.084899546338302, + "grad_norm": 0.8278342485427856, + "learning_rate": 1.124739390940363e-06, + "loss": 0.0758, + "step": 6434 + }, + { + "epoch": 2.0852235904082956, + "grad_norm": 0.821538507938385, + "learning_rate": 1.1240090933095806e-06, + "loss": 0.0719, + "step": 6435 + }, + { + "epoch": 2.085547634478289, + "grad_norm": 0.8098189830780029, + "learning_rate": 1.1232789640867644e-06, + "loss": 0.0741, + "step": 6436 + }, + { + "epoch": 2.0858716785482825, + "grad_norm": 0.8238601088523865, + "learning_rate": 1.1225490033612755e-06, + "loss": 0.0785, + "step": 6437 + }, + { + "epoch": 2.086195722618276, + "grad_norm": 0.8251599669456482, + "learning_rate": 1.1218192112224547e-06, + "loss": 0.0754, + "step": 6438 + }, + { + "epoch": 2.0865197666882698, + "grad_norm": 0.9041087627410889, + "learning_rate": 1.1210895877596195e-06, + "loss": 0.0812, + "step": 6439 + }, + { + "epoch": 2.086843810758263, + "grad_norm": 0.8868924975395203, + "learning_rate": 1.12036013306207e-06, + "loss": 0.0779, + "step": 6440 + }, + { + "epoch": 2.0871678548282566, + "grad_norm": 0.9086484313011169, + "learning_rate": 1.1196308472190845e-06, + "loss": 0.0798, + "step": 6441 + }, + { + "epoch": 2.08749189889825, + "grad_norm": 0.9073233604431152, + "learning_rate": 1.1189017303199198e-06, + "loss": 0.0798, + "step": 6442 + }, + { + "epoch": 2.087815942968244, + "grad_norm": 1.000227928161621, + "learning_rate": 1.1181727824538147e-06, + "loss": 0.0824, + "step": 6443 + }, + { + "epoch": 2.0881399870382373, + "grad_norm": 0.980191171169281, + "learning_rate": 1.1174440037099815e-06, + "loss": 0.0814, + "step": 6444 + }, + { + "epoch": 2.0884640311082308, + "grad_norm": 0.863726794719696, + "learning_rate": 1.1167153941776205e-06, + "loss": 0.0773, + "step": 6445 + }, + { + "epoch": 2.088788075178224, + "grad_norm": 0.9686128497123718, + "learning_rate": 1.1159869539459018e-06, + "loss": 0.0832, + "step": 6446 + }, + { + "epoch": 2.0891121192482176, + "grad_norm": 0.8538342118263245, + "learning_rate": 1.1152586831039835e-06, + "loss": 0.0772, + "step": 6447 + }, + { + "epoch": 2.0894361633182115, + "grad_norm": 0.8319849967956543, + "learning_rate": 1.1145305817409962e-06, + "loss": 0.0735, + "step": 6448 + }, + { + "epoch": 2.089760207388205, + "grad_norm": 0.8991792798042297, + "learning_rate": 1.1138026499460532e-06, + "loss": 0.0805, + "step": 6449 + }, + { + "epoch": 2.0900842514581983, + "grad_norm": 0.8539870381355286, + "learning_rate": 1.1130748878082467e-06, + "loss": 0.0689, + "step": 6450 + }, + { + "epoch": 2.0904082955281917, + "grad_norm": 0.8225018382072449, + "learning_rate": 1.1123472954166473e-06, + "loss": 0.0737, + "step": 6451 + }, + { + "epoch": 2.090732339598185, + "grad_norm": 0.8963720798492432, + "learning_rate": 1.1116198728603061e-06, + "loss": 0.0788, + "step": 6452 + }, + { + "epoch": 2.091056383668179, + "grad_norm": 0.8966756463050842, + "learning_rate": 1.1108926202282505e-06, + "loss": 0.0821, + "step": 6453 + }, + { + "epoch": 2.0913804277381725, + "grad_norm": 0.8887714147567749, + "learning_rate": 1.110165537609492e-06, + "loss": 0.0745, + "step": 6454 + }, + { + "epoch": 2.091704471808166, + "grad_norm": 0.9101707935333252, + "learning_rate": 1.1094386250930164e-06, + "loss": 0.0791, + "step": 6455 + }, + { + "epoch": 2.0920285158781593, + "grad_norm": 0.8202990889549255, + "learning_rate": 1.1087118827677915e-06, + "loss": 0.0758, + "step": 6456 + }, + { + "epoch": 2.0923525599481527, + "grad_norm": 0.8937187790870667, + "learning_rate": 1.1079853107227634e-06, + "loss": 0.0853, + "step": 6457 + }, + { + "epoch": 2.0926766040181466, + "grad_norm": 0.9171735644340515, + "learning_rate": 1.1072589090468571e-06, + "loss": 0.0819, + "step": 6458 + }, + { + "epoch": 2.09300064808814, + "grad_norm": 0.9766994714736938, + "learning_rate": 1.1065326778289782e-06, + "loss": 0.0824, + "step": 6459 + }, + { + "epoch": 2.0933246921581334, + "grad_norm": 0.8692915439605713, + "learning_rate": 1.1058066171580092e-06, + "loss": 0.0762, + "step": 6460 + }, + { + "epoch": 2.093648736228127, + "grad_norm": 0.8715638518333435, + "learning_rate": 1.1050807271228146e-06, + "loss": 0.0753, + "step": 6461 + }, + { + "epoch": 2.0939727802981207, + "grad_norm": 0.9047412872314453, + "learning_rate": 1.1043550078122342e-06, + "loss": 0.0803, + "step": 6462 + }, + { + "epoch": 2.094296824368114, + "grad_norm": 0.8451228737831116, + "learning_rate": 1.1036294593150898e-06, + "loss": 0.0753, + "step": 6463 + }, + { + "epoch": 2.0946208684381076, + "grad_norm": 0.8436135649681091, + "learning_rate": 1.1029040817201819e-06, + "loss": 0.0772, + "step": 6464 + }, + { + "epoch": 2.094944912508101, + "grad_norm": 0.8356603980064392, + "learning_rate": 1.1021788751162893e-06, + "loss": 0.0804, + "step": 6465 + }, + { + "epoch": 2.0952689565780944, + "grad_norm": 0.8406388163566589, + "learning_rate": 1.1014538395921704e-06, + "loss": 0.0778, + "step": 6466 + }, + { + "epoch": 2.0955930006480883, + "grad_norm": 0.8398995995521545, + "learning_rate": 1.1007289752365635e-06, + "loss": 0.0731, + "step": 6467 + }, + { + "epoch": 2.0959170447180817, + "grad_norm": 0.8790268898010254, + "learning_rate": 1.1000042821381823e-06, + "loss": 0.075, + "step": 6468 + }, + { + "epoch": 2.096241088788075, + "grad_norm": 0.8896059393882751, + "learning_rate": 1.0992797603857257e-06, + "loss": 0.0794, + "step": 6469 + }, + { + "epoch": 2.0965651328580686, + "grad_norm": 0.9420241713523865, + "learning_rate": 1.0985554100678647e-06, + "loss": 0.0787, + "step": 6470 + }, + { + "epoch": 2.0968891769280624, + "grad_norm": 0.8144295811653137, + "learning_rate": 1.0978312312732562e-06, + "loss": 0.075, + "step": 6471 + }, + { + "epoch": 2.097213220998056, + "grad_norm": 0.8950752019882202, + "learning_rate": 1.09710722409053e-06, + "loss": 0.084, + "step": 6472 + }, + { + "epoch": 2.0975372650680493, + "grad_norm": 0.8596192598342896, + "learning_rate": 1.0963833886082987e-06, + "loss": 0.0787, + "step": 6473 + }, + { + "epoch": 2.0978613091380427, + "grad_norm": 0.8208886981010437, + "learning_rate": 1.0956597249151532e-06, + "loss": 0.0747, + "step": 6474 + }, + { + "epoch": 2.098185353208036, + "grad_norm": 0.8358070254325867, + "learning_rate": 1.0949362330996605e-06, + "loss": 0.0718, + "step": 6475 + }, + { + "epoch": 2.09850939727803, + "grad_norm": 0.8843006491661072, + "learning_rate": 1.094212913250373e-06, + "loss": 0.0749, + "step": 6476 + }, + { + "epoch": 2.0988334413480234, + "grad_norm": 0.8269924521446228, + "learning_rate": 1.0934897654558134e-06, + "loss": 0.0763, + "step": 6477 + }, + { + "epoch": 2.099157485418017, + "grad_norm": 0.8224289417266846, + "learning_rate": 1.0927667898044927e-06, + "loss": 0.076, + "step": 6478 + }, + { + "epoch": 2.0994815294880103, + "grad_norm": 0.7836449146270752, + "learning_rate": 1.092043986384893e-06, + "loss": 0.0702, + "step": 6479 + }, + { + "epoch": 2.0998055735580037, + "grad_norm": 0.9487519264221191, + "learning_rate": 1.091321355285479e-06, + "loss": 0.0868, + "step": 6480 + }, + { + "epoch": 2.1001296176279975, + "grad_norm": 0.8501242995262146, + "learning_rate": 1.0905988965946942e-06, + "loss": 0.0752, + "step": 6481 + }, + { + "epoch": 2.100453661697991, + "grad_norm": 0.9248630404472351, + "learning_rate": 1.0898766104009606e-06, + "loss": 0.087, + "step": 6482 + }, + { + "epoch": 2.1007777057679844, + "grad_norm": 0.8161444067955017, + "learning_rate": 1.0891544967926795e-06, + "loss": 0.0741, + "step": 6483 + }, + { + "epoch": 2.101101749837978, + "grad_norm": 0.8928951025009155, + "learning_rate": 1.0884325558582283e-06, + "loss": 0.0764, + "step": 6484 + }, + { + "epoch": 2.1014257939079717, + "grad_norm": 0.9648940563201904, + "learning_rate": 1.0877107876859688e-06, + "loss": 0.0839, + "step": 6485 + }, + { + "epoch": 2.101749837977965, + "grad_norm": 0.9580192565917969, + "learning_rate": 1.086989192364236e-06, + "loss": 0.0772, + "step": 6486 + }, + { + "epoch": 2.1020738820479585, + "grad_norm": 0.8301624655723572, + "learning_rate": 1.0862677699813471e-06, + "loss": 0.0742, + "step": 6487 + }, + { + "epoch": 2.102397926117952, + "grad_norm": 0.8573307394981384, + "learning_rate": 1.0855465206255972e-06, + "loss": 0.0776, + "step": 6488 + }, + { + "epoch": 2.1027219701879454, + "grad_norm": 0.8289283514022827, + "learning_rate": 1.0848254443852602e-06, + "loss": 0.0796, + "step": 6489 + }, + { + "epoch": 2.1030460142579392, + "grad_norm": 0.8256188035011292, + "learning_rate": 1.084104541348589e-06, + "loss": 0.0715, + "step": 6490 + }, + { + "epoch": 2.1033700583279327, + "grad_norm": 0.9029165506362915, + "learning_rate": 1.0833838116038156e-06, + "loss": 0.0785, + "step": 6491 + }, + { + "epoch": 2.103694102397926, + "grad_norm": 0.8527599573135376, + "learning_rate": 1.0826632552391484e-06, + "loss": 0.0781, + "step": 6492 + }, + { + "epoch": 2.1040181464679195, + "grad_norm": 0.8875585198402405, + "learning_rate": 1.081942872342779e-06, + "loss": 0.0807, + "step": 6493 + }, + { + "epoch": 2.1043421905379134, + "grad_norm": 0.8848504424095154, + "learning_rate": 1.0812226630028738e-06, + "loss": 0.0773, + "step": 6494 + }, + { + "epoch": 2.104666234607907, + "grad_norm": 0.8722944855690002, + "learning_rate": 1.0805026273075797e-06, + "loss": 0.0819, + "step": 6495 + }, + { + "epoch": 2.1049902786779002, + "grad_norm": 0.9383862018585205, + "learning_rate": 1.0797827653450222e-06, + "loss": 0.0837, + "step": 6496 + }, + { + "epoch": 2.1053143227478937, + "grad_norm": 0.8965055346488953, + "learning_rate": 1.0790630772033057e-06, + "loss": 0.0843, + "step": 6497 + }, + { + "epoch": 2.105638366817887, + "grad_norm": 0.8149116635322571, + "learning_rate": 1.0783435629705134e-06, + "loss": 0.0697, + "step": 6498 + }, + { + "epoch": 2.105962410887881, + "grad_norm": 0.8215212225914001, + "learning_rate": 1.0776242227347044e-06, + "loss": 0.0753, + "step": 6499 + }, + { + "epoch": 2.1062864549578744, + "grad_norm": 0.9240198135375977, + "learning_rate": 1.0769050565839228e-06, + "loss": 0.0829, + "step": 6500 + }, + { + "epoch": 2.106610499027868, + "grad_norm": 0.8391976356506348, + "learning_rate": 1.0761860646061838e-06, + "loss": 0.075, + "step": 6501 + }, + { + "epoch": 2.106934543097861, + "grad_norm": 0.8725175857543945, + "learning_rate": 1.0754672468894889e-06, + "loss": 0.0838, + "step": 6502 + }, + { + "epoch": 2.1072585871678546, + "grad_norm": 0.861038088798523, + "learning_rate": 1.0747486035218116e-06, + "loss": 0.0774, + "step": 6503 + }, + { + "epoch": 2.1075826312378485, + "grad_norm": 0.9436843991279602, + "learning_rate": 1.0740301345911075e-06, + "loss": 0.0855, + "step": 6504 + }, + { + "epoch": 2.107906675307842, + "grad_norm": 0.8671337962150574, + "learning_rate": 1.0733118401853112e-06, + "loss": 0.0792, + "step": 6505 + }, + { + "epoch": 2.1082307193778353, + "grad_norm": 0.8089661598205566, + "learning_rate": 1.0725937203923327e-06, + "loss": 0.073, + "step": 6506 + }, + { + "epoch": 2.1085547634478288, + "grad_norm": 0.9231501221656799, + "learning_rate": 1.0718757753000665e-06, + "loss": 0.0797, + "step": 6507 + }, + { + "epoch": 2.108878807517822, + "grad_norm": 0.8823448419570923, + "learning_rate": 1.071158004996378e-06, + "loss": 0.0775, + "step": 6508 + }, + { + "epoch": 2.109202851587816, + "grad_norm": 0.8127901554107666, + "learning_rate": 1.070440409569119e-06, + "loss": 0.0745, + "step": 6509 + }, + { + "epoch": 2.1095268956578095, + "grad_norm": 0.8558254241943359, + "learning_rate": 1.0697229891061141e-06, + "loss": 0.0778, + "step": 6510 + }, + { + "epoch": 2.109850939727803, + "grad_norm": 0.8050652742385864, + "learning_rate": 1.0690057436951689e-06, + "loss": 0.0712, + "step": 6511 + }, + { + "epoch": 2.1101749837977963, + "grad_norm": 0.8715561032295227, + "learning_rate": 1.068288673424068e-06, + "loss": 0.0712, + "step": 6512 + }, + { + "epoch": 2.11049902786779, + "grad_norm": 0.8177485466003418, + "learning_rate": 1.067571778380573e-06, + "loss": 0.0771, + "step": 6513 + }, + { + "epoch": 2.1108230719377836, + "grad_norm": 0.7821464538574219, + "learning_rate": 1.0668550586524256e-06, + "loss": 0.0701, + "step": 6514 + }, + { + "epoch": 2.111147116007777, + "grad_norm": 0.860443651676178, + "learning_rate": 1.066138514327345e-06, + "loss": 0.0755, + "step": 6515 + }, + { + "epoch": 2.1114711600777705, + "grad_norm": 0.8350194692611694, + "learning_rate": 1.0654221454930305e-06, + "loss": 0.0736, + "step": 6516 + }, + { + "epoch": 2.111795204147764, + "grad_norm": 0.9321835041046143, + "learning_rate": 1.0647059522371565e-06, + "loss": 0.0906, + "step": 6517 + }, + { + "epoch": 2.1121192482177578, + "grad_norm": 0.8878209590911865, + "learning_rate": 1.0639899346473792e-06, + "loss": 0.0741, + "step": 6518 + }, + { + "epoch": 2.112443292287751, + "grad_norm": 0.8689842224121094, + "learning_rate": 1.0632740928113323e-06, + "loss": 0.0777, + "step": 6519 + }, + { + "epoch": 2.1127673363577446, + "grad_norm": 0.8841999769210815, + "learning_rate": 1.0625584268166278e-06, + "loss": 0.0792, + "step": 6520 + }, + { + "epoch": 2.113091380427738, + "grad_norm": 0.8802910447120667, + "learning_rate": 1.0618429367508564e-06, + "loss": 0.0782, + "step": 6521 + }, + { + "epoch": 2.113415424497732, + "grad_norm": 0.780536413192749, + "learning_rate": 1.061127622701588e-06, + "loss": 0.0698, + "step": 6522 + }, + { + "epoch": 2.1137394685677253, + "grad_norm": 0.9865183234214783, + "learning_rate": 1.0604124847563674e-06, + "loss": 0.0846, + "step": 6523 + }, + { + "epoch": 2.1140635126377187, + "grad_norm": 0.868303656578064, + "learning_rate": 1.0596975230027243e-06, + "loss": 0.0799, + "step": 6524 + }, + { + "epoch": 2.114387556707712, + "grad_norm": 0.865825355052948, + "learning_rate": 1.05898273752816e-06, + "loss": 0.0759, + "step": 6525 + }, + { + "epoch": 2.1147116007777056, + "grad_norm": 0.8290408849716187, + "learning_rate": 1.0582681284201587e-06, + "loss": 0.0761, + "step": 6526 + }, + { + "epoch": 2.1150356448476995, + "grad_norm": 0.7881754636764526, + "learning_rate": 1.0575536957661814e-06, + "loss": 0.0694, + "step": 6527 + }, + { + "epoch": 2.115359688917693, + "grad_norm": 0.9243281483650208, + "learning_rate": 1.056839439653668e-06, + "loss": 0.0826, + "step": 6528 + }, + { + "epoch": 2.1156837329876863, + "grad_norm": 0.8762599229812622, + "learning_rate": 1.056125360170037e-06, + "loss": 0.0782, + "step": 6529 + }, + { + "epoch": 2.1160077770576797, + "grad_norm": 0.9436377286911011, + "learning_rate": 1.0554114574026823e-06, + "loss": 0.0877, + "step": 6530 + }, + { + "epoch": 2.116331821127673, + "grad_norm": 0.8819335103034973, + "learning_rate": 1.0546977314389822e-06, + "loss": 0.0803, + "step": 6531 + }, + { + "epoch": 2.116655865197667, + "grad_norm": 0.8602144718170166, + "learning_rate": 1.0539841823662867e-06, + "loss": 0.0821, + "step": 6532 + }, + { + "epoch": 2.1169799092676604, + "grad_norm": 0.8705927729606628, + "learning_rate": 1.0532708102719303e-06, + "loss": 0.0798, + "step": 6533 + }, + { + "epoch": 2.117303953337654, + "grad_norm": 0.9389498233795166, + "learning_rate": 1.0525576152432204e-06, + "loss": 0.0813, + "step": 6534 + }, + { + "epoch": 2.1176279974076473, + "grad_norm": 0.9420464634895325, + "learning_rate": 1.051844597367446e-06, + "loss": 0.0812, + "step": 6535 + }, + { + "epoch": 2.117952041477641, + "grad_norm": 0.8560028076171875, + "learning_rate": 1.0511317567318737e-06, + "loss": 0.0802, + "step": 6536 + }, + { + "epoch": 2.1182760855476346, + "grad_norm": 0.8696438670158386, + "learning_rate": 1.0504190934237484e-06, + "loss": 0.0747, + "step": 6537 + }, + { + "epoch": 2.118600129617628, + "grad_norm": 0.9260076284408569, + "learning_rate": 1.0497066075302939e-06, + "loss": 0.0838, + "step": 6538 + }, + { + "epoch": 2.1189241736876214, + "grad_norm": 0.8828380107879639, + "learning_rate": 1.0489942991387088e-06, + "loss": 0.0757, + "step": 6539 + }, + { + "epoch": 2.119248217757615, + "grad_norm": 0.8743027448654175, + "learning_rate": 1.0482821683361767e-06, + "loss": 0.0775, + "step": 6540 + }, + { + "epoch": 2.1195722618276087, + "grad_norm": 0.8967780470848083, + "learning_rate": 1.0475702152098522e-06, + "loss": 0.0787, + "step": 6541 + }, + { + "epoch": 2.119896305897602, + "grad_norm": 0.7822275757789612, + "learning_rate": 1.0468584398468729e-06, + "loss": 0.0706, + "step": 6542 + }, + { + "epoch": 2.1202203499675956, + "grad_norm": 0.8575571775436401, + "learning_rate": 1.0461468423343532e-06, + "loss": 0.0796, + "step": 6543 + }, + { + "epoch": 2.120544394037589, + "grad_norm": 0.8331258893013, + "learning_rate": 1.0454354227593855e-06, + "loss": 0.0745, + "step": 6544 + }, + { + "epoch": 2.120868438107583, + "grad_norm": 0.8735781908035278, + "learning_rate": 1.0447241812090408e-06, + "loss": 0.0797, + "step": 6545 + }, + { + "epoch": 2.1211924821775763, + "grad_norm": 0.8512864112854004, + "learning_rate": 1.0440131177703692e-06, + "loss": 0.0795, + "step": 6546 + }, + { + "epoch": 2.1215165262475697, + "grad_norm": 0.8513296246528625, + "learning_rate": 1.0433022325303956e-06, + "loss": 0.0754, + "step": 6547 + }, + { + "epoch": 2.121840570317563, + "grad_norm": 0.8489466905593872, + "learning_rate": 1.042591525576127e-06, + "loss": 0.0775, + "step": 6548 + }, + { + "epoch": 2.1221646143875565, + "grad_norm": 0.8595914840698242, + "learning_rate": 1.041880996994547e-06, + "loss": 0.0787, + "step": 6549 + }, + { + "epoch": 2.1224886584575504, + "grad_norm": 0.8622096180915833, + "learning_rate": 1.0411706468726173e-06, + "loss": 0.0804, + "step": 6550 + }, + { + "epoch": 2.122812702527544, + "grad_norm": 0.874594509601593, + "learning_rate": 1.040460475297278e-06, + "loss": 0.0791, + "step": 6551 + }, + { + "epoch": 2.1231367465975373, + "grad_norm": 0.8095867037773132, + "learning_rate": 1.039750482355447e-06, + "loss": 0.0788, + "step": 6552 + }, + { + "epoch": 2.1234607906675307, + "grad_norm": 0.8671919107437134, + "learning_rate": 1.0390406681340212e-06, + "loss": 0.0759, + "step": 6553 + }, + { + "epoch": 2.123784834737524, + "grad_norm": 0.8312748074531555, + "learning_rate": 1.0383310327198728e-06, + "loss": 0.077, + "step": 6554 + }, + { + "epoch": 2.124108878807518, + "grad_norm": 0.8587348461151123, + "learning_rate": 1.0376215761998578e-06, + "loss": 0.0744, + "step": 6555 + }, + { + "epoch": 2.1244329228775114, + "grad_norm": 0.8469531536102295, + "learning_rate": 1.0369122986608044e-06, + "loss": 0.0803, + "step": 6556 + }, + { + "epoch": 2.124756966947505, + "grad_norm": 0.8027523756027222, + "learning_rate": 1.0362032001895214e-06, + "loss": 0.0767, + "step": 6557 + }, + { + "epoch": 2.1250810110174982, + "grad_norm": 0.8883047103881836, + "learning_rate": 1.0354942808727962e-06, + "loss": 0.0837, + "step": 6558 + }, + { + "epoch": 2.1254050550874917, + "grad_norm": 0.8956378698348999, + "learning_rate": 1.0347855407973933e-06, + "loss": 0.0811, + "step": 6559 + }, + { + "epoch": 2.1257290991574855, + "grad_norm": 0.813138484954834, + "learning_rate": 1.034076980050057e-06, + "loss": 0.071, + "step": 6560 + }, + { + "epoch": 2.126053143227479, + "grad_norm": 0.8464450836181641, + "learning_rate": 1.0333685987175052e-06, + "loss": 0.0788, + "step": 6561 + }, + { + "epoch": 2.1263771872974724, + "grad_norm": 0.8753923773765564, + "learning_rate": 1.0326603968864407e-06, + "loss": 0.0759, + "step": 6562 + }, + { + "epoch": 2.126701231367466, + "grad_norm": 0.9201774001121521, + "learning_rate": 1.0319523746435367e-06, + "loss": 0.082, + "step": 6563 + }, + { + "epoch": 2.1270252754374597, + "grad_norm": 0.8545686602592468, + "learning_rate": 1.0312445320754522e-06, + "loss": 0.0812, + "step": 6564 + }, + { + "epoch": 2.127349319507453, + "grad_norm": 0.9130364060401917, + "learning_rate": 1.0305368692688175e-06, + "loss": 0.0795, + "step": 6565 + }, + { + "epoch": 2.1276733635774465, + "grad_norm": 0.8865534067153931, + "learning_rate": 1.0298293863102444e-06, + "loss": 0.0792, + "step": 6566 + }, + { + "epoch": 2.12799740764744, + "grad_norm": 0.8899914026260376, + "learning_rate": 1.0291220832863219e-06, + "loss": 0.0793, + "step": 6567 + }, + { + "epoch": 2.1283214517174334, + "grad_norm": 0.8158039450645447, + "learning_rate": 1.0284149602836174e-06, + "loss": 0.0769, + "step": 6568 + }, + { + "epoch": 2.1286454957874272, + "grad_norm": 0.8327584266662598, + "learning_rate": 1.0277080173886766e-06, + "loss": 0.0752, + "step": 6569 + }, + { + "epoch": 2.1289695398574207, + "grad_norm": 0.8870092630386353, + "learning_rate": 1.0270012546880207e-06, + "loss": 0.0802, + "step": 6570 + }, + { + "epoch": 2.129293583927414, + "grad_norm": 0.8395013213157654, + "learning_rate": 1.0262946722681513e-06, + "loss": 0.0752, + "step": 6571 + }, + { + "epoch": 2.1296176279974075, + "grad_norm": 0.8073970079421997, + "learning_rate": 1.0255882702155476e-06, + "loss": 0.0723, + "step": 6572 + }, + { + "epoch": 2.1299416720674014, + "grad_norm": 0.8960281610488892, + "learning_rate": 1.024882048616666e-06, + "loss": 0.0795, + "step": 6573 + }, + { + "epoch": 2.130265716137395, + "grad_norm": 0.8593527674674988, + "learning_rate": 1.0241760075579418e-06, + "loss": 0.0766, + "step": 6574 + }, + { + "epoch": 2.130589760207388, + "grad_norm": 0.8314931988716125, + "learning_rate": 1.0234701471257868e-06, + "loss": 0.0757, + "step": 6575 + }, + { + "epoch": 2.1309138042773816, + "grad_norm": 0.842562735080719, + "learning_rate": 1.0227644674065923e-06, + "loss": 0.0728, + "step": 6576 + }, + { + "epoch": 2.131237848347375, + "grad_norm": 0.9939832091331482, + "learning_rate": 1.0220589684867269e-06, + "loss": 0.0761, + "step": 6577 + }, + { + "epoch": 2.131561892417369, + "grad_norm": 0.8881149291992188, + "learning_rate": 1.021353650452535e-06, + "loss": 0.0752, + "step": 6578 + }, + { + "epoch": 2.1318859364873624, + "grad_norm": 0.8410991430282593, + "learning_rate": 1.0206485133903424e-06, + "loss": 0.0735, + "step": 6579 + }, + { + "epoch": 2.1322099805573558, + "grad_norm": 0.8679512143135071, + "learning_rate": 1.0199435573864502e-06, + "loss": 0.0777, + "step": 6580 + }, + { + "epoch": 2.132534024627349, + "grad_norm": 0.954770028591156, + "learning_rate": 1.0192387825271384e-06, + "loss": 0.0841, + "step": 6581 + }, + { + "epoch": 2.1328580686973426, + "grad_norm": 0.9145404100418091, + "learning_rate": 1.018534188898665e-06, + "loss": 0.0809, + "step": 6582 + }, + { + "epoch": 2.1331821127673365, + "grad_norm": 0.8095046281814575, + "learning_rate": 1.0178297765872651e-06, + "loss": 0.076, + "step": 6583 + }, + { + "epoch": 2.13350615683733, + "grad_norm": 0.9143222570419312, + "learning_rate": 1.0171255456791531e-06, + "loss": 0.0817, + "step": 6584 + }, + { + "epoch": 2.1338302009073233, + "grad_norm": 0.8669990301132202, + "learning_rate": 1.016421496260517e-06, + "loss": 0.0811, + "step": 6585 + }, + { + "epoch": 2.1341542449773168, + "grad_norm": 0.8360460996627808, + "learning_rate": 1.0157176284175293e-06, + "loss": 0.0687, + "step": 6586 + }, + { + "epoch": 2.1344782890473106, + "grad_norm": 0.8764594793319702, + "learning_rate": 1.0150139422363342e-06, + "loss": 0.0768, + "step": 6587 + }, + { + "epoch": 2.134802333117304, + "grad_norm": 0.9600023627281189, + "learning_rate": 1.0143104378030565e-06, + "loss": 0.0873, + "step": 6588 + }, + { + "epoch": 2.1351263771872975, + "grad_norm": 0.9232833385467529, + "learning_rate": 1.013607115203799e-06, + "loss": 0.0801, + "step": 6589 + }, + { + "epoch": 2.135450421257291, + "grad_norm": 0.9177942276000977, + "learning_rate": 1.012903974524641e-06, + "loss": 0.0792, + "step": 6590 + }, + { + "epoch": 2.1357744653272843, + "grad_norm": 0.9922571182250977, + "learning_rate": 1.0122010158516412e-06, + "loss": 0.0898, + "step": 6591 + }, + { + "epoch": 2.136098509397278, + "grad_norm": 0.8474758267402649, + "learning_rate": 1.0114982392708325e-06, + "loss": 0.0775, + "step": 6592 + }, + { + "epoch": 2.1364225534672716, + "grad_norm": 0.8396010398864746, + "learning_rate": 1.010795644868231e-06, + "loss": 0.0799, + "step": 6593 + }, + { + "epoch": 2.136746597537265, + "grad_norm": 0.8584730625152588, + "learning_rate": 1.0100932327298244e-06, + "loss": 0.0732, + "step": 6594 + }, + { + "epoch": 2.1370706416072585, + "grad_norm": 0.8349241018295288, + "learning_rate": 1.0093910029415843e-06, + "loss": 0.0753, + "step": 6595 + }, + { + "epoch": 2.1373946856772523, + "grad_norm": 0.8902902603149414, + "learning_rate": 1.0086889555894545e-06, + "loss": 0.0806, + "step": 6596 + }, + { + "epoch": 2.1377187297472457, + "grad_norm": 0.89394211769104, + "learning_rate": 1.0079870907593592e-06, + "loss": 0.08, + "step": 6597 + }, + { + "epoch": 2.138042773817239, + "grad_norm": 0.8607198596000671, + "learning_rate": 1.0072854085372005e-06, + "loss": 0.0767, + "step": 6598 + }, + { + "epoch": 2.1383668178872326, + "grad_norm": 0.8974934816360474, + "learning_rate": 1.0065839090088572e-06, + "loss": 0.0791, + "step": 6599 + }, + { + "epoch": 2.138690861957226, + "grad_norm": 0.92155522108078, + "learning_rate": 1.0058825922601866e-06, + "loss": 0.0852, + "step": 6600 + }, + { + "epoch": 2.13901490602722, + "grad_norm": 0.7993993759155273, + "learning_rate": 1.005181458377022e-06, + "loss": 0.0703, + "step": 6601 + }, + { + "epoch": 2.1393389500972133, + "grad_norm": 0.8989080786705017, + "learning_rate": 1.0044805074451757e-06, + "loss": 0.0828, + "step": 6602 + }, + { + "epoch": 2.1396629941672067, + "grad_norm": 0.8937354683876038, + "learning_rate": 1.003779739550438e-06, + "loss": 0.0802, + "step": 6603 + }, + { + "epoch": 2.1399870382372, + "grad_norm": 0.8435376882553101, + "learning_rate": 1.003079154778575e-06, + "loss": 0.0763, + "step": 6604 + }, + { + "epoch": 2.1403110823071936, + "grad_norm": 0.8782016038894653, + "learning_rate": 1.0023787532153325e-06, + "loss": 0.08, + "step": 6605 + }, + { + "epoch": 2.1406351263771874, + "grad_norm": 0.8370432257652283, + "learning_rate": 1.0016785349464326e-06, + "loss": 0.0724, + "step": 6606 + }, + { + "epoch": 2.140959170447181, + "grad_norm": 0.86859530210495, + "learning_rate": 1.0009785000575747e-06, + "loss": 0.0791, + "step": 6607 + }, + { + "epoch": 2.1412832145171743, + "grad_norm": 0.8456296324729919, + "learning_rate": 1.0002786486344379e-06, + "loss": 0.0682, + "step": 6608 + }, + { + "epoch": 2.1416072585871677, + "grad_norm": 0.8785818219184875, + "learning_rate": 9.995789807626754e-07, + "loss": 0.0787, + "step": 6609 + }, + { + "epoch": 2.141931302657161, + "grad_norm": 0.876229465007782, + "learning_rate": 9.988794965279203e-07, + "loss": 0.0824, + "step": 6610 + }, + { + "epoch": 2.142255346727155, + "grad_norm": 0.8579014539718628, + "learning_rate": 9.981801960157827e-07, + "loss": 0.08, + "step": 6611 + }, + { + "epoch": 2.1425793907971484, + "grad_norm": 0.844199538230896, + "learning_rate": 9.974810793118505e-07, + "loss": 0.0755, + "step": 6612 + }, + { + "epoch": 2.142903434867142, + "grad_norm": 0.8682866096496582, + "learning_rate": 9.967821465016893e-07, + "loss": 0.0719, + "step": 6613 + }, + { + "epoch": 2.1432274789371353, + "grad_norm": 0.8322014808654785, + "learning_rate": 9.960833976708398e-07, + "loss": 0.0719, + "step": 6614 + }, + { + "epoch": 2.143551523007129, + "grad_norm": 0.9430346488952637, + "learning_rate": 9.953848329048248e-07, + "loss": 0.0817, + "step": 6615 + }, + { + "epoch": 2.1438755670771226, + "grad_norm": 0.9577280879020691, + "learning_rate": 9.94686452289139e-07, + "loss": 0.0837, + "step": 6616 + }, + { + "epoch": 2.144199611147116, + "grad_norm": 0.917645275592804, + "learning_rate": 9.939882559092604e-07, + "loss": 0.0821, + "step": 6617 + }, + { + "epoch": 2.1445236552171094, + "grad_norm": 0.8027808666229248, + "learning_rate": 9.93290243850638e-07, + "loss": 0.0721, + "step": 6618 + }, + { + "epoch": 2.144847699287103, + "grad_norm": 0.862285315990448, + "learning_rate": 9.925924161987057e-07, + "loss": 0.0782, + "step": 6619 + }, + { + "epoch": 2.1451717433570967, + "grad_norm": 0.9676426649093628, + "learning_rate": 9.918947730388682e-07, + "loss": 0.0891, + "step": 6620 + }, + { + "epoch": 2.14549578742709, + "grad_norm": 0.8734449148178101, + "learning_rate": 9.911973144565105e-07, + "loss": 0.0862, + "step": 6621 + }, + { + "epoch": 2.1458198314970836, + "grad_norm": 0.9051758050918579, + "learning_rate": 9.90500040536996e-07, + "loss": 0.0715, + "step": 6622 + }, + { + "epoch": 2.146143875567077, + "grad_norm": 0.976852536201477, + "learning_rate": 9.898029513656618e-07, + "loss": 0.0819, + "step": 6623 + }, + { + "epoch": 2.146467919637071, + "grad_norm": 0.8941224217414856, + "learning_rate": 9.891060470278286e-07, + "loss": 0.0824, + "step": 6624 + }, + { + "epoch": 2.1467919637070643, + "grad_norm": 0.958651065826416, + "learning_rate": 9.884093276087871e-07, + "loss": 0.0833, + "step": 6625 + }, + { + "epoch": 2.1471160077770577, + "grad_norm": 0.9196736812591553, + "learning_rate": 9.877127931938111e-07, + "loss": 0.0773, + "step": 6626 + }, + { + "epoch": 2.147440051847051, + "grad_norm": 0.8458701968193054, + "learning_rate": 9.87016443868149e-07, + "loss": 0.0756, + "step": 6627 + }, + { + "epoch": 2.1477640959170445, + "grad_norm": 0.8649837374687195, + "learning_rate": 9.863202797170273e-07, + "loss": 0.0764, + "step": 6628 + }, + { + "epoch": 2.1480881399870384, + "grad_norm": 1.1385384798049927, + "learning_rate": 9.8562430082565e-07, + "loss": 0.0797, + "step": 6629 + }, + { + "epoch": 2.148412184057032, + "grad_norm": 0.8109810948371887, + "learning_rate": 9.849285072791978e-07, + "loss": 0.0715, + "step": 6630 + }, + { + "epoch": 2.1487362281270252, + "grad_norm": 0.8902484774589539, + "learning_rate": 9.8423289916283e-07, + "loss": 0.078, + "step": 6631 + }, + { + "epoch": 2.1490602721970187, + "grad_norm": 0.8315941095352173, + "learning_rate": 9.835374765616809e-07, + "loss": 0.0742, + "step": 6632 + }, + { + "epoch": 2.149384316267012, + "grad_norm": 0.8010660409927368, + "learning_rate": 9.82842239560864e-07, + "loss": 0.0734, + "step": 6633 + }, + { + "epoch": 2.149708360337006, + "grad_norm": 0.8385948538780212, + "learning_rate": 9.821471882454703e-07, + "loss": 0.0769, + "step": 6634 + }, + { + "epoch": 2.1500324044069994, + "grad_norm": 0.8368885517120361, + "learning_rate": 9.814523227005662e-07, + "loss": 0.0806, + "step": 6635 + }, + { + "epoch": 2.150356448476993, + "grad_norm": 0.9209007620811462, + "learning_rate": 9.807576430111975e-07, + "loss": 0.0696, + "step": 6636 + }, + { + "epoch": 2.1506804925469862, + "grad_norm": 0.8243295550346375, + "learning_rate": 9.800631492623867e-07, + "loss": 0.0712, + "step": 6637 + }, + { + "epoch": 2.15100453661698, + "grad_norm": 0.8927811980247498, + "learning_rate": 9.793688415391304e-07, + "loss": 0.0759, + "step": 6638 + }, + { + "epoch": 2.1513285806869735, + "grad_norm": 0.8509796261787415, + "learning_rate": 9.786747199264088e-07, + "loss": 0.076, + "step": 6639 + }, + { + "epoch": 2.151652624756967, + "grad_norm": 0.8079892992973328, + "learning_rate": 9.779807845091722e-07, + "loss": 0.0723, + "step": 6640 + }, + { + "epoch": 2.1519766688269604, + "grad_norm": 0.993626058101654, + "learning_rate": 9.77287035372355e-07, + "loss": 0.0811, + "step": 6641 + }, + { + "epoch": 2.152300712896954, + "grad_norm": 0.8776823282241821, + "learning_rate": 9.76593472600863e-07, + "loss": 0.0759, + "step": 6642 + }, + { + "epoch": 2.1526247569669477, + "grad_norm": 0.9392797946929932, + "learning_rate": 9.75900096279582e-07, + "loss": 0.0822, + "step": 6643 + }, + { + "epoch": 2.152948801036941, + "grad_norm": 0.8545043468475342, + "learning_rate": 9.752069064933758e-07, + "loss": 0.0795, + "step": 6644 + }, + { + "epoch": 2.1532728451069345, + "grad_norm": 0.9270698428153992, + "learning_rate": 9.745139033270812e-07, + "loss": 0.0839, + "step": 6645 + }, + { + "epoch": 2.153596889176928, + "grad_norm": 0.896757185459137, + "learning_rate": 9.738210868655187e-07, + "loss": 0.0772, + "step": 6646 + }, + { + "epoch": 2.153920933246922, + "grad_norm": 0.8868404626846313, + "learning_rate": 9.73128457193479e-07, + "loss": 0.0796, + "step": 6647 + }, + { + "epoch": 2.154244977316915, + "grad_norm": 0.8353792428970337, + "learning_rate": 9.724360143957367e-07, + "loss": 0.0741, + "step": 6648 + }, + { + "epoch": 2.1545690213869086, + "grad_norm": 0.9920429587364197, + "learning_rate": 9.717437585570375e-07, + "loss": 0.0852, + "step": 6649 + }, + { + "epoch": 2.154893065456902, + "grad_norm": 0.8376013040542603, + "learning_rate": 9.710516897621072e-07, + "loss": 0.0719, + "step": 6650 + }, + { + "epoch": 2.1552171095268955, + "grad_norm": 0.8551664352416992, + "learning_rate": 9.703598080956488e-07, + "loss": 0.0782, + "step": 6651 + }, + { + "epoch": 2.1555411535968894, + "grad_norm": 0.8405658006668091, + "learning_rate": 9.696681136423422e-07, + "loss": 0.0779, + "step": 6652 + }, + { + "epoch": 2.155865197666883, + "grad_norm": 0.9270169138908386, + "learning_rate": 9.689766064868434e-07, + "loss": 0.0805, + "step": 6653 + }, + { + "epoch": 2.156189241736876, + "grad_norm": 0.8944373726844788, + "learning_rate": 9.682852867137865e-07, + "loss": 0.0836, + "step": 6654 + }, + { + "epoch": 2.1565132858068696, + "grad_norm": 0.8218947052955627, + "learning_rate": 9.675941544077833e-07, + "loss": 0.0726, + "step": 6655 + }, + { + "epoch": 2.156837329876863, + "grad_norm": 0.8564376831054688, + "learning_rate": 9.6690320965342e-07, + "loss": 0.0757, + "step": 6656 + }, + { + "epoch": 2.157161373946857, + "grad_norm": 0.8816162347793579, + "learning_rate": 9.66212452535262e-07, + "loss": 0.079, + "step": 6657 + }, + { + "epoch": 2.1574854180168503, + "grad_norm": 0.8961756825447083, + "learning_rate": 9.655218831378518e-07, + "loss": 0.0798, + "step": 6658 + }, + { + "epoch": 2.1578094620868438, + "grad_norm": 0.8354091048240662, + "learning_rate": 9.648315015457083e-07, + "loss": 0.0713, + "step": 6659 + }, + { + "epoch": 2.158133506156837, + "grad_norm": 0.8975980281829834, + "learning_rate": 9.641413078433274e-07, + "loss": 0.0757, + "step": 6660 + }, + { + "epoch": 2.158457550226831, + "grad_norm": 0.8505983352661133, + "learning_rate": 9.63451302115182e-07, + "loss": 0.0717, + "step": 6661 + }, + { + "epoch": 2.1587815942968245, + "grad_norm": 0.8575515747070312, + "learning_rate": 9.627614844457222e-07, + "loss": 0.0771, + "step": 6662 + }, + { + "epoch": 2.159105638366818, + "grad_norm": 0.9212492108345032, + "learning_rate": 9.620718549193764e-07, + "loss": 0.0781, + "step": 6663 + }, + { + "epoch": 2.1594296824368113, + "grad_norm": 0.8683608174324036, + "learning_rate": 9.61382413620546e-07, + "loss": 0.0807, + "step": 6664 + }, + { + "epoch": 2.1597537265068047, + "grad_norm": 0.8901230096817017, + "learning_rate": 9.606931606336134e-07, + "loss": 0.0773, + "step": 6665 + }, + { + "epoch": 2.1600777705767986, + "grad_norm": 0.8463370203971863, + "learning_rate": 9.60004096042936e-07, + "loss": 0.0687, + "step": 6666 + }, + { + "epoch": 2.160401814646792, + "grad_norm": 0.8447796702384949, + "learning_rate": 9.593152199328494e-07, + "loss": 0.0747, + "step": 6667 + }, + { + "epoch": 2.1607258587167855, + "grad_norm": 0.9252559542655945, + "learning_rate": 9.586265323876653e-07, + "loss": 0.0866, + "step": 6668 + }, + { + "epoch": 2.161049902786779, + "grad_norm": 0.8271765112876892, + "learning_rate": 9.579380334916704e-07, + "loss": 0.0729, + "step": 6669 + }, + { + "epoch": 2.1613739468567728, + "grad_norm": 0.9104677438735962, + "learning_rate": 9.572497233291337e-07, + "loss": 0.0845, + "step": 6670 + }, + { + "epoch": 2.161697990926766, + "grad_norm": 0.8818185925483704, + "learning_rate": 9.56561601984294e-07, + "loss": 0.0786, + "step": 6671 + }, + { + "epoch": 2.1620220349967596, + "grad_norm": 0.8366871476173401, + "learning_rate": 9.558736695413745e-07, + "loss": 0.0789, + "step": 6672 + }, + { + "epoch": 2.162346079066753, + "grad_norm": 0.9518574476242065, + "learning_rate": 9.551859260845686e-07, + "loss": 0.0734, + "step": 6673 + }, + { + "epoch": 2.1626701231367464, + "grad_norm": 0.9152442812919617, + "learning_rate": 9.544983716980505e-07, + "loss": 0.0801, + "step": 6674 + }, + { + "epoch": 2.1629941672067403, + "grad_norm": 0.8730852603912354, + "learning_rate": 9.5381100646597e-07, + "loss": 0.0755, + "step": 6675 + }, + { + "epoch": 2.1633182112767337, + "grad_norm": 0.8248627185821533, + "learning_rate": 9.531238304724538e-07, + "loss": 0.074, + "step": 6676 + }, + { + "epoch": 2.163642255346727, + "grad_norm": 0.8867030739784241, + "learning_rate": 9.524368438016071e-07, + "loss": 0.0783, + "step": 6677 + }, + { + "epoch": 2.1639662994167206, + "grad_norm": 0.8418394923210144, + "learning_rate": 9.517500465375071e-07, + "loss": 0.0737, + "step": 6678 + }, + { + "epoch": 2.164290343486714, + "grad_norm": 0.9146268963813782, + "learning_rate": 9.510634387642151e-07, + "loss": 0.0805, + "step": 6679 + }, + { + "epoch": 2.164614387556708, + "grad_norm": 0.907317578792572, + "learning_rate": 9.503770205657625e-07, + "loss": 0.0743, + "step": 6680 + }, + { + "epoch": 2.1649384316267013, + "grad_norm": 0.9126441478729248, + "learning_rate": 9.496907920261609e-07, + "loss": 0.0778, + "step": 6681 + }, + { + "epoch": 2.1652624756966947, + "grad_norm": 0.929598331451416, + "learning_rate": 9.490047532293984e-07, + "loss": 0.0799, + "step": 6682 + }, + { + "epoch": 2.165586519766688, + "grad_norm": 0.8832554817199707, + "learning_rate": 9.48318904259439e-07, + "loss": 0.0764, + "step": 6683 + }, + { + "epoch": 2.1659105638366816, + "grad_norm": 0.8561263084411621, + "learning_rate": 9.476332452002245e-07, + "loss": 0.0788, + "step": 6684 + }, + { + "epoch": 2.1662346079066754, + "grad_norm": 0.8597761392593384, + "learning_rate": 9.469477761356727e-07, + "loss": 0.0754, + "step": 6685 + }, + { + "epoch": 2.166558651976669, + "grad_norm": 0.9028628468513489, + "learning_rate": 9.462624971496793e-07, + "loss": 0.0819, + "step": 6686 + }, + { + "epoch": 2.1668826960466623, + "grad_norm": 0.89371258020401, + "learning_rate": 9.455774083261138e-07, + "loss": 0.0739, + "step": 6687 + }, + { + "epoch": 2.1672067401166557, + "grad_norm": 0.806632399559021, + "learning_rate": 9.448925097488257e-07, + "loss": 0.0744, + "step": 6688 + }, + { + "epoch": 2.1675307841866496, + "grad_norm": 0.9763009548187256, + "learning_rate": 9.442078015016398e-07, + "loss": 0.086, + "step": 6689 + }, + { + "epoch": 2.167854828256643, + "grad_norm": 0.9614084959030151, + "learning_rate": 9.435232836683577e-07, + "loss": 0.089, + "step": 6690 + }, + { + "epoch": 2.1681788723266364, + "grad_norm": 0.9859657287597656, + "learning_rate": 9.42838956332758e-07, + "loss": 0.0826, + "step": 6691 + }, + { + "epoch": 2.16850291639663, + "grad_norm": 0.8706166744232178, + "learning_rate": 9.421548195785962e-07, + "loss": 0.0781, + "step": 6692 + }, + { + "epoch": 2.1688269604666233, + "grad_norm": 0.928634524345398, + "learning_rate": 9.414708734896019e-07, + "loss": 0.0798, + "step": 6693 + }, + { + "epoch": 2.169151004536617, + "grad_norm": 0.9566361904144287, + "learning_rate": 9.407871181494865e-07, + "loss": 0.0887, + "step": 6694 + }, + { + "epoch": 2.1694750486066106, + "grad_norm": 0.8379427790641785, + "learning_rate": 9.401035536419326e-07, + "loss": 0.0746, + "step": 6695 + }, + { + "epoch": 2.169799092676604, + "grad_norm": 0.8841421008110046, + "learning_rate": 9.394201800506028e-07, + "loss": 0.0803, + "step": 6696 + }, + { + "epoch": 2.1701231367465974, + "grad_norm": 0.8510726094245911, + "learning_rate": 9.387369974591353e-07, + "loss": 0.0761, + "step": 6697 + }, + { + "epoch": 2.1704471808165913, + "grad_norm": 0.874298632144928, + "learning_rate": 9.380540059511453e-07, + "loss": 0.0805, + "step": 6698 + }, + { + "epoch": 2.1707712248865847, + "grad_norm": 1.0128662586212158, + "learning_rate": 9.373712056102249e-07, + "loss": 0.0888, + "step": 6699 + }, + { + "epoch": 2.171095268956578, + "grad_norm": 0.842639148235321, + "learning_rate": 9.366885965199398e-07, + "loss": 0.0772, + "step": 6700 + }, + { + "epoch": 2.1714193130265715, + "grad_norm": 0.8456000089645386, + "learning_rate": 9.360061787638383e-07, + "loss": 0.0776, + "step": 6701 + }, + { + "epoch": 2.171743357096565, + "grad_norm": 0.8387840390205383, + "learning_rate": 9.353239524254382e-07, + "loss": 0.0755, + "step": 6702 + }, + { + "epoch": 2.172067401166559, + "grad_norm": 0.8434891700744629, + "learning_rate": 9.346419175882407e-07, + "loss": 0.0772, + "step": 6703 + }, + { + "epoch": 2.1723914452365523, + "grad_norm": 0.8305788040161133, + "learning_rate": 9.339600743357177e-07, + "loss": 0.0731, + "step": 6704 + }, + { + "epoch": 2.1727154893065457, + "grad_norm": 0.8565192222595215, + "learning_rate": 9.332784227513212e-07, + "loss": 0.0745, + "step": 6705 + }, + { + "epoch": 2.173039533376539, + "grad_norm": 0.8777684569358826, + "learning_rate": 9.325969629184789e-07, + "loss": 0.0762, + "step": 6706 + }, + { + "epoch": 2.1733635774465325, + "grad_norm": 0.8476821184158325, + "learning_rate": 9.319156949205943e-07, + "loss": 0.0786, + "step": 6707 + }, + { + "epoch": 2.1736876215165264, + "grad_norm": 0.9548823833465576, + "learning_rate": 9.312346188410496e-07, + "loss": 0.0743, + "step": 6708 + }, + { + "epoch": 2.17401166558652, + "grad_norm": 0.9136479496955872, + "learning_rate": 9.30553734763199e-07, + "loss": 0.0792, + "step": 6709 + }, + { + "epoch": 2.1743357096565132, + "grad_norm": 0.896507203578949, + "learning_rate": 9.298730427703795e-07, + "loss": 0.0808, + "step": 6710 + }, + { + "epoch": 2.1746597537265067, + "grad_norm": 0.9299559593200684, + "learning_rate": 9.291925429458987e-07, + "loss": 0.0768, + "step": 6711 + }, + { + "epoch": 2.1749837977965005, + "grad_norm": 0.9667069315910339, + "learning_rate": 9.285122353730439e-07, + "loss": 0.0793, + "step": 6712 + }, + { + "epoch": 2.175307841866494, + "grad_norm": 0.8196617364883423, + "learning_rate": 9.278321201350784e-07, + "loss": 0.0742, + "step": 6713 + }, + { + "epoch": 2.1756318859364874, + "grad_norm": 0.9729592204093933, + "learning_rate": 9.271521973152418e-07, + "loss": 0.0806, + "step": 6714 + }, + { + "epoch": 2.175955930006481, + "grad_norm": 1.6799136400222778, + "learning_rate": 9.264724669967498e-07, + "loss": 0.1204, + "step": 6715 + }, + { + "epoch": 2.176279974076474, + "grad_norm": 0.9227179288864136, + "learning_rate": 9.257929292627956e-07, + "loss": 0.0829, + "step": 6716 + }, + { + "epoch": 2.176604018146468, + "grad_norm": 0.877239465713501, + "learning_rate": 9.251135841965467e-07, + "loss": 0.0788, + "step": 6717 + }, + { + "epoch": 2.1769280622164615, + "grad_norm": 0.776756227016449, + "learning_rate": 9.244344318811491e-07, + "loss": 0.0711, + "step": 6718 + }, + { + "epoch": 2.177252106286455, + "grad_norm": 0.9062384963035583, + "learning_rate": 9.237554723997242e-07, + "loss": 0.079, + "step": 6719 + }, + { + "epoch": 2.1775761503564484, + "grad_norm": 0.8527910113334656, + "learning_rate": 9.230767058353701e-07, + "loss": 0.0787, + "step": 6720 + }, + { + "epoch": 2.1779001944264422, + "grad_norm": 0.8512275218963623, + "learning_rate": 9.223981322711617e-07, + "loss": 0.0759, + "step": 6721 + }, + { + "epoch": 2.1782242384964356, + "grad_norm": 0.8467569351196289, + "learning_rate": 9.217197517901494e-07, + "loss": 0.0738, + "step": 6722 + }, + { + "epoch": 2.178548282566429, + "grad_norm": 0.8667898178100586, + "learning_rate": 9.210415644753615e-07, + "loss": 0.0753, + "step": 6723 + }, + { + "epoch": 2.1788723266364225, + "grad_norm": 0.9238751530647278, + "learning_rate": 9.203635704097988e-07, + "loss": 0.0782, + "step": 6724 + }, + { + "epoch": 2.179196370706416, + "grad_norm": 0.8899182677268982, + "learning_rate": 9.196857696764446e-07, + "loss": 0.0813, + "step": 6725 + }, + { + "epoch": 2.17952041477641, + "grad_norm": 0.8934040665626526, + "learning_rate": 9.190081623582531e-07, + "loss": 0.0813, + "step": 6726 + }, + { + "epoch": 2.179844458846403, + "grad_norm": 0.8986812829971313, + "learning_rate": 9.183307485381571e-07, + "loss": 0.0833, + "step": 6727 + }, + { + "epoch": 2.1801685029163966, + "grad_norm": 0.9125000834465027, + "learning_rate": 9.176535282990656e-07, + "loss": 0.0782, + "step": 6728 + }, + { + "epoch": 2.18049254698639, + "grad_norm": 0.8446953296661377, + "learning_rate": 9.169765017238641e-07, + "loss": 0.0761, + "step": 6729 + }, + { + "epoch": 2.1808165910563835, + "grad_norm": 0.9098793268203735, + "learning_rate": 9.162996688954148e-07, + "loss": 0.0786, + "step": 6730 + }, + { + "epoch": 2.1811406351263773, + "grad_norm": 0.8848891258239746, + "learning_rate": 9.156230298965529e-07, + "loss": 0.0786, + "step": 6731 + }, + { + "epoch": 2.1814646791963708, + "grad_norm": 0.920576274394989, + "learning_rate": 9.149465848100958e-07, + "loss": 0.0821, + "step": 6732 + }, + { + "epoch": 2.181788723266364, + "grad_norm": 0.8231469988822937, + "learning_rate": 9.142703337188305e-07, + "loss": 0.0769, + "step": 6733 + }, + { + "epoch": 2.1821127673363576, + "grad_norm": 0.8333210349082947, + "learning_rate": 9.135942767055272e-07, + "loss": 0.0774, + "step": 6734 + }, + { + "epoch": 2.182436811406351, + "grad_norm": 0.885479211807251, + "learning_rate": 9.129184138529259e-07, + "loss": 0.077, + "step": 6735 + }, + { + "epoch": 2.182760855476345, + "grad_norm": 0.940333366394043, + "learning_rate": 9.122427452437465e-07, + "loss": 0.0842, + "step": 6736 + }, + { + "epoch": 2.1830848995463383, + "grad_norm": 0.9102086424827576, + "learning_rate": 9.115672709606846e-07, + "loss": 0.0846, + "step": 6737 + }, + { + "epoch": 2.1834089436163318, + "grad_norm": 0.9427852034568787, + "learning_rate": 9.108919910864111e-07, + "loss": 0.0812, + "step": 6738 + }, + { + "epoch": 2.183732987686325, + "grad_norm": 0.8577998876571655, + "learning_rate": 9.102169057035753e-07, + "loss": 0.0743, + "step": 6739 + }, + { + "epoch": 2.184057031756319, + "grad_norm": 0.8352759480476379, + "learning_rate": 9.095420148947984e-07, + "loss": 0.0749, + "step": 6740 + }, + { + "epoch": 2.1843810758263125, + "grad_norm": 0.8783653378486633, + "learning_rate": 9.088673187426836e-07, + "loss": 0.0756, + "step": 6741 + }, + { + "epoch": 2.184705119896306, + "grad_norm": 0.9297550916671753, + "learning_rate": 9.081928173298046e-07, + "loss": 0.0805, + "step": 6742 + }, + { + "epoch": 2.1850291639662993, + "grad_norm": 0.9048861265182495, + "learning_rate": 9.075185107387149e-07, + "loss": 0.0832, + "step": 6743 + }, + { + "epoch": 2.1853532080362927, + "grad_norm": 0.869208574295044, + "learning_rate": 9.068443990519432e-07, + "loss": 0.0796, + "step": 6744 + }, + { + "epoch": 2.1856772521062866, + "grad_norm": 0.8934289813041687, + "learning_rate": 9.061704823519943e-07, + "loss": 0.0795, + "step": 6745 + }, + { + "epoch": 2.18600129617628, + "grad_norm": 0.856629490852356, + "learning_rate": 9.054967607213486e-07, + "loss": 0.0773, + "step": 6746 + }, + { + "epoch": 2.1863253402462735, + "grad_norm": 0.9060944318771362, + "learning_rate": 9.048232342424642e-07, + "loss": 0.0839, + "step": 6747 + }, + { + "epoch": 2.186649384316267, + "grad_norm": 0.9247906804084778, + "learning_rate": 9.04149902997773e-07, + "loss": 0.0853, + "step": 6748 + }, + { + "epoch": 2.1869734283862607, + "grad_norm": 0.8763548135757446, + "learning_rate": 9.034767670696842e-07, + "loss": 0.0825, + "step": 6749 + }, + { + "epoch": 2.187297472456254, + "grad_norm": 0.9106340408325195, + "learning_rate": 9.028038265405836e-07, + "loss": 0.0807, + "step": 6750 + }, + { + "epoch": 2.1876215165262476, + "grad_norm": 0.8381937742233276, + "learning_rate": 9.021310814928328e-07, + "loss": 0.0751, + "step": 6751 + }, + { + "epoch": 2.187945560596241, + "grad_norm": 0.8870315551757812, + "learning_rate": 9.01458532008769e-07, + "loss": 0.0776, + "step": 6752 + }, + { + "epoch": 2.1882696046662344, + "grad_norm": 0.8370722532272339, + "learning_rate": 9.007861781707056e-07, + "loss": 0.0757, + "step": 6753 + }, + { + "epoch": 2.1885936487362283, + "grad_norm": 0.9604190587997437, + "learning_rate": 9.001140200609334e-07, + "loss": 0.0829, + "step": 6754 + }, + { + "epoch": 2.1889176928062217, + "grad_norm": 0.9112382531166077, + "learning_rate": 8.994420577617155e-07, + "loss": 0.0829, + "step": 6755 + }, + { + "epoch": 2.189241736876215, + "grad_norm": 0.8749806880950928, + "learning_rate": 8.987702913552964e-07, + "loss": 0.0755, + "step": 6756 + }, + { + "epoch": 2.1895657809462086, + "grad_norm": 0.8946632146835327, + "learning_rate": 8.980987209238922e-07, + "loss": 0.0771, + "step": 6757 + }, + { + "epoch": 2.189889825016202, + "grad_norm": 0.8695065975189209, + "learning_rate": 8.974273465496966e-07, + "loss": 0.0739, + "step": 6758 + }, + { + "epoch": 2.190213869086196, + "grad_norm": 0.7892318964004517, + "learning_rate": 8.967561683148798e-07, + "loss": 0.0708, + "step": 6759 + }, + { + "epoch": 2.1905379131561893, + "grad_norm": 0.8877748250961304, + "learning_rate": 8.960851863015874e-07, + "loss": 0.0767, + "step": 6760 + }, + { + "epoch": 2.1908619572261827, + "grad_norm": 0.9287748336791992, + "learning_rate": 8.954144005919422e-07, + "loss": 0.0806, + "step": 6761 + }, + { + "epoch": 2.191186001296176, + "grad_norm": 0.8311761617660522, + "learning_rate": 8.947438112680387e-07, + "loss": 0.0729, + "step": 6762 + }, + { + "epoch": 2.19151004536617, + "grad_norm": 0.8091285824775696, + "learning_rate": 8.940734184119542e-07, + "loss": 0.0716, + "step": 6763 + }, + { + "epoch": 2.1918340894361634, + "grad_norm": 0.9159359335899353, + "learning_rate": 8.934032221057354e-07, + "loss": 0.0796, + "step": 6764 + }, + { + "epoch": 2.192158133506157, + "grad_norm": 0.8579771518707275, + "learning_rate": 8.927332224314106e-07, + "loss": 0.0765, + "step": 6765 + }, + { + "epoch": 2.1924821775761503, + "grad_norm": 0.9172267317771912, + "learning_rate": 8.92063419470979e-07, + "loss": 0.0767, + "step": 6766 + }, + { + "epoch": 2.1928062216461437, + "grad_norm": 0.8143701553344727, + "learning_rate": 8.91393813306419e-07, + "loss": 0.0715, + "step": 6767 + }, + { + "epoch": 2.1931302657161376, + "grad_norm": 0.8438109159469604, + "learning_rate": 8.907244040196836e-07, + "loss": 0.0751, + "step": 6768 + }, + { + "epoch": 2.193454309786131, + "grad_norm": 0.960784912109375, + "learning_rate": 8.900551916927022e-07, + "loss": 0.0819, + "step": 6769 + }, + { + "epoch": 2.1937783538561244, + "grad_norm": 0.8121520280838013, + "learning_rate": 8.893861764073808e-07, + "loss": 0.0717, + "step": 6770 + }, + { + "epoch": 2.194102397926118, + "grad_norm": 0.8404099941253662, + "learning_rate": 8.887173582455985e-07, + "loss": 0.0752, + "step": 6771 + }, + { + "epoch": 2.1944264419961117, + "grad_norm": 0.898887574672699, + "learning_rate": 8.88048737289213e-07, + "loss": 0.0817, + "step": 6772 + }, + { + "epoch": 2.194750486066105, + "grad_norm": 0.9333533644676208, + "learning_rate": 8.873803136200574e-07, + "loss": 0.0818, + "step": 6773 + }, + { + "epoch": 2.1950745301360985, + "grad_norm": 0.8780669569969177, + "learning_rate": 8.8671208731994e-07, + "loss": 0.0763, + "step": 6774 + }, + { + "epoch": 2.195398574206092, + "grad_norm": 0.9539688229560852, + "learning_rate": 8.860440584706451e-07, + "loss": 0.0791, + "step": 6775 + }, + { + "epoch": 2.1957226182760854, + "grad_norm": 0.9045437574386597, + "learning_rate": 8.853762271539332e-07, + "loss": 0.0721, + "step": 6776 + }, + { + "epoch": 2.1960466623460793, + "grad_norm": 0.8967257142066956, + "learning_rate": 8.847085934515404e-07, + "loss": 0.0753, + "step": 6777 + }, + { + "epoch": 2.1963707064160727, + "grad_norm": 0.8401694297790527, + "learning_rate": 8.840411574451793e-07, + "loss": 0.0737, + "step": 6778 + }, + { + "epoch": 2.196694750486066, + "grad_norm": 0.7956202030181885, + "learning_rate": 8.833739192165352e-07, + "loss": 0.0689, + "step": 6779 + }, + { + "epoch": 2.1970187945560595, + "grad_norm": 0.8320735096931458, + "learning_rate": 8.827068788472751e-07, + "loss": 0.0773, + "step": 6780 + }, + { + "epoch": 2.197342838626053, + "grad_norm": 0.8158857822418213, + "learning_rate": 8.820400364190351e-07, + "loss": 0.0726, + "step": 6781 + }, + { + "epoch": 2.197666882696047, + "grad_norm": 0.9041940569877625, + "learning_rate": 8.813733920134321e-07, + "loss": 0.08, + "step": 6782 + }, + { + "epoch": 2.1979909267660402, + "grad_norm": 0.8967098593711853, + "learning_rate": 8.807069457120571e-07, + "loss": 0.0809, + "step": 6783 + }, + { + "epoch": 2.1983149708360337, + "grad_norm": 0.908525288105011, + "learning_rate": 8.80040697596474e-07, + "loss": 0.079, + "step": 6784 + }, + { + "epoch": 2.198639014906027, + "grad_norm": 0.8507987856864929, + "learning_rate": 8.79374647748229e-07, + "loss": 0.0752, + "step": 6785 + }, + { + "epoch": 2.1989630589760205, + "grad_norm": 0.9298058152198792, + "learning_rate": 8.787087962488367e-07, + "loss": 0.0854, + "step": 6786 + }, + { + "epoch": 2.1992871030460144, + "grad_norm": 0.9183042645454407, + "learning_rate": 8.780431431797937e-07, + "loss": 0.0795, + "step": 6787 + }, + { + "epoch": 2.199611147116008, + "grad_norm": 0.8680285215377808, + "learning_rate": 8.773776886225668e-07, + "loss": 0.0757, + "step": 6788 + }, + { + "epoch": 2.1999351911860012, + "grad_norm": 0.9786904454231262, + "learning_rate": 8.767124326586043e-07, + "loss": 0.0822, + "step": 6789 + }, + { + "epoch": 2.2002592352559946, + "grad_norm": 0.8955831527709961, + "learning_rate": 8.760473753693243e-07, + "loss": 0.0764, + "step": 6790 + }, + { + "epoch": 2.2005832793259885, + "grad_norm": 0.8659034967422485, + "learning_rate": 8.753825168361249e-07, + "loss": 0.0783, + "step": 6791 + }, + { + "epoch": 2.200907323395982, + "grad_norm": 0.8939731121063232, + "learning_rate": 8.747178571403786e-07, + "loss": 0.0758, + "step": 6792 + }, + { + "epoch": 2.2012313674659754, + "grad_norm": 0.8434916734695435, + "learning_rate": 8.74053396363431e-07, + "loss": 0.0772, + "step": 6793 + }, + { + "epoch": 2.201555411535969, + "grad_norm": 0.8473688960075378, + "learning_rate": 8.733891345866088e-07, + "loss": 0.0751, + "step": 6794 + }, + { + "epoch": 2.201879455605962, + "grad_norm": 0.9216592907905579, + "learning_rate": 8.727250718912089e-07, + "loss": 0.0774, + "step": 6795 + }, + { + "epoch": 2.202203499675956, + "grad_norm": 0.9039542078971863, + "learning_rate": 8.72061208358507e-07, + "loss": 0.0773, + "step": 6796 + }, + { + "epoch": 2.2025275437459495, + "grad_norm": 0.8723130226135254, + "learning_rate": 8.713975440697536e-07, + "loss": 0.0784, + "step": 6797 + }, + { + "epoch": 2.202851587815943, + "grad_norm": 0.9859674572944641, + "learning_rate": 8.707340791061747e-07, + "loss": 0.0848, + "step": 6798 + }, + { + "epoch": 2.2031756318859363, + "grad_norm": 0.9083540439605713, + "learning_rate": 8.700708135489722e-07, + "loss": 0.0791, + "step": 6799 + }, + { + "epoch": 2.20349967595593, + "grad_norm": 0.8758490085601807, + "learning_rate": 8.694077474793227e-07, + "loss": 0.0807, + "step": 6800 + }, + { + "epoch": 2.2038237200259236, + "grad_norm": 0.9906332492828369, + "learning_rate": 8.687448809783799e-07, + "loss": 0.0829, + "step": 6801 + }, + { + "epoch": 2.204147764095917, + "grad_norm": 0.8625491857528687, + "learning_rate": 8.680822141272727e-07, + "loss": 0.0741, + "step": 6802 + }, + { + "epoch": 2.2044718081659105, + "grad_norm": 0.9207634925842285, + "learning_rate": 8.674197470071033e-07, + "loss": 0.0819, + "step": 6803 + }, + { + "epoch": 2.204795852235904, + "grad_norm": 0.8922463655471802, + "learning_rate": 8.667574796989526e-07, + "loss": 0.0779, + "step": 6804 + }, + { + "epoch": 2.2051198963058978, + "grad_norm": 0.8703399896621704, + "learning_rate": 8.66095412283875e-07, + "loss": 0.0763, + "step": 6805 + }, + { + "epoch": 2.205443940375891, + "grad_norm": 0.8407284617424011, + "learning_rate": 8.654335448429016e-07, + "loss": 0.0749, + "step": 6806 + }, + { + "epoch": 2.2057679844458846, + "grad_norm": 0.8342950344085693, + "learning_rate": 8.647718774570385e-07, + "loss": 0.0733, + "step": 6807 + }, + { + "epoch": 2.206092028515878, + "grad_norm": 0.8918887376785278, + "learning_rate": 8.641104102072676e-07, + "loss": 0.0794, + "step": 6808 + }, + { + "epoch": 2.2064160725858715, + "grad_norm": 0.8364049196243286, + "learning_rate": 8.634491431745465e-07, + "loss": 0.0695, + "step": 6809 + }, + { + "epoch": 2.2067401166558653, + "grad_norm": 0.8273484110832214, + "learning_rate": 8.627880764398055e-07, + "loss": 0.0723, + "step": 6810 + }, + { + "epoch": 2.2070641607258588, + "grad_norm": 0.8052671551704407, + "learning_rate": 8.621272100839562e-07, + "loss": 0.0719, + "step": 6811 + }, + { + "epoch": 2.207388204795852, + "grad_norm": 1.0396753549575806, + "learning_rate": 8.614665441878798e-07, + "loss": 0.0945, + "step": 6812 + }, + { + "epoch": 2.2077122488658456, + "grad_norm": 0.8524155616760254, + "learning_rate": 8.60806078832436e-07, + "loss": 0.076, + "step": 6813 + }, + { + "epoch": 2.2080362929358395, + "grad_norm": 0.7909897565841675, + "learning_rate": 8.601458140984606e-07, + "loss": 0.0706, + "step": 6814 + }, + { + "epoch": 2.208360337005833, + "grad_norm": 0.897506833076477, + "learning_rate": 8.594857500667606e-07, + "loss": 0.079, + "step": 6815 + }, + { + "epoch": 2.2086843810758263, + "grad_norm": 0.8871533274650574, + "learning_rate": 8.588258868181251e-07, + "loss": 0.0832, + "step": 6816 + }, + { + "epoch": 2.2090084251458197, + "grad_norm": 0.8382236957550049, + "learning_rate": 8.581662244333116e-07, + "loss": 0.0723, + "step": 6817 + }, + { + "epoch": 2.209332469215813, + "grad_norm": 0.85988450050354, + "learning_rate": 8.575067629930601e-07, + "loss": 0.075, + "step": 6818 + }, + { + "epoch": 2.209656513285807, + "grad_norm": 0.9264865517616272, + "learning_rate": 8.568475025780781e-07, + "loss": 0.081, + "step": 6819 + }, + { + "epoch": 2.2099805573558005, + "grad_norm": 0.9004557728767395, + "learning_rate": 8.561884432690568e-07, + "loss": 0.0802, + "step": 6820 + }, + { + "epoch": 2.210304601425794, + "grad_norm": 0.8373770713806152, + "learning_rate": 8.555295851466556e-07, + "loss": 0.0776, + "step": 6821 + }, + { + "epoch": 2.2106286454957873, + "grad_norm": 1.0056724548339844, + "learning_rate": 8.548709282915135e-07, + "loss": 0.0845, + "step": 6822 + }, + { + "epoch": 2.210952689565781, + "grad_norm": 0.7826520800590515, + "learning_rate": 8.542124727842438e-07, + "loss": 0.0699, + "step": 6823 + }, + { + "epoch": 2.2112767336357746, + "grad_norm": 0.9300687909126282, + "learning_rate": 8.535542187054352e-07, + "loss": 0.0808, + "step": 6824 + }, + { + "epoch": 2.211600777705768, + "grad_norm": 0.8985550999641418, + "learning_rate": 8.528961661356519e-07, + "loss": 0.0791, + "step": 6825 + }, + { + "epoch": 2.2119248217757614, + "grad_norm": 0.8403812646865845, + "learning_rate": 8.52238315155432e-07, + "loss": 0.0727, + "step": 6826 + }, + { + "epoch": 2.212248865845755, + "grad_norm": 0.8673394918441772, + "learning_rate": 8.515806658452908e-07, + "loss": 0.0763, + "step": 6827 + }, + { + "epoch": 2.2125729099157487, + "grad_norm": 0.9250020980834961, + "learning_rate": 8.50923218285718e-07, + "loss": 0.0803, + "step": 6828 + }, + { + "epoch": 2.212896953985742, + "grad_norm": 0.8896529674530029, + "learning_rate": 8.502659725571791e-07, + "loss": 0.0769, + "step": 6829 + }, + { + "epoch": 2.2132209980557356, + "grad_norm": 0.903784990310669, + "learning_rate": 8.496089287401144e-07, + "loss": 0.0782, + "step": 6830 + }, + { + "epoch": 2.213545042125729, + "grad_norm": 0.8488913178443909, + "learning_rate": 8.489520869149398e-07, + "loss": 0.0757, + "step": 6831 + }, + { + "epoch": 2.2138690861957224, + "grad_norm": 0.8580282926559448, + "learning_rate": 8.482954471620464e-07, + "loss": 0.0709, + "step": 6832 + }, + { + "epoch": 2.2141931302657163, + "grad_norm": 0.9183516502380371, + "learning_rate": 8.476390095618015e-07, + "loss": 0.0809, + "step": 6833 + }, + { + "epoch": 2.2145171743357097, + "grad_norm": 0.878735363483429, + "learning_rate": 8.469827741945447e-07, + "loss": 0.0762, + "step": 6834 + }, + { + "epoch": 2.214841218405703, + "grad_norm": 0.88670814037323, + "learning_rate": 8.46326741140594e-07, + "loss": 0.0742, + "step": 6835 + }, + { + "epoch": 2.2151652624756966, + "grad_norm": 0.8494538068771362, + "learning_rate": 8.456709104802413e-07, + "loss": 0.0772, + "step": 6836 + }, + { + "epoch": 2.21548930654569, + "grad_norm": 0.8244561553001404, + "learning_rate": 8.450152822937541e-07, + "loss": 0.074, + "step": 6837 + }, + { + "epoch": 2.215813350615684, + "grad_norm": 0.9178814888000488, + "learning_rate": 8.443598566613756e-07, + "loss": 0.0767, + "step": 6838 + }, + { + "epoch": 2.2161373946856773, + "grad_norm": 0.8819333910942078, + "learning_rate": 8.437046336633212e-07, + "loss": 0.0808, + "step": 6839 + }, + { + "epoch": 2.2164614387556707, + "grad_norm": 0.8642560839653015, + "learning_rate": 8.430496133797872e-07, + "loss": 0.0767, + "step": 6840 + }, + { + "epoch": 2.216785482825664, + "grad_norm": 0.8757349252700806, + "learning_rate": 8.423947958909381e-07, + "loss": 0.0792, + "step": 6841 + }, + { + "epoch": 2.217109526895658, + "grad_norm": 0.939451277256012, + "learning_rate": 8.41740181276921e-07, + "loss": 0.0846, + "step": 6842 + }, + { + "epoch": 2.2174335709656514, + "grad_norm": 0.9106187224388123, + "learning_rate": 8.410857696178518e-07, + "loss": 0.0788, + "step": 6843 + }, + { + "epoch": 2.217757615035645, + "grad_norm": 0.8261914849281311, + "learning_rate": 8.404315609938246e-07, + "loss": 0.0746, + "step": 6844 + }, + { + "epoch": 2.2180816591056383, + "grad_norm": 0.9008349180221558, + "learning_rate": 8.397775554849086e-07, + "loss": 0.081, + "step": 6845 + }, + { + "epoch": 2.2184057031756317, + "grad_norm": 0.9082056283950806, + "learning_rate": 8.391237531711474e-07, + "loss": 0.0825, + "step": 6846 + }, + { + "epoch": 2.2187297472456255, + "grad_norm": 0.9769788980484009, + "learning_rate": 8.384701541325612e-07, + "loss": 0.0867, + "step": 6847 + }, + { + "epoch": 2.219053791315619, + "grad_norm": 0.8257579803466797, + "learning_rate": 8.378167584491417e-07, + "loss": 0.0738, + "step": 6848 + }, + { + "epoch": 2.2193778353856124, + "grad_norm": 0.9038711190223694, + "learning_rate": 8.371635662008615e-07, + "loss": 0.0738, + "step": 6849 + }, + { + "epoch": 2.219701879455606, + "grad_norm": 0.9552277326583862, + "learning_rate": 8.365105774676624e-07, + "loss": 0.078, + "step": 6850 + }, + { + "epoch": 2.2200259235255997, + "grad_norm": 0.9162812232971191, + "learning_rate": 8.358577923294647e-07, + "loss": 0.0839, + "step": 6851 + }, + { + "epoch": 2.220349967595593, + "grad_norm": 0.8380052447319031, + "learning_rate": 8.352052108661634e-07, + "loss": 0.0762, + "step": 6852 + }, + { + "epoch": 2.2206740116655865, + "grad_norm": 0.8755866885185242, + "learning_rate": 8.345528331576275e-07, + "loss": 0.0779, + "step": 6853 + }, + { + "epoch": 2.22099805573558, + "grad_norm": 0.9222939014434814, + "learning_rate": 8.339006592837021e-07, + "loss": 0.0866, + "step": 6854 + }, + { + "epoch": 2.2213220998055734, + "grad_norm": 0.853366494178772, + "learning_rate": 8.33248689324207e-07, + "loss": 0.0782, + "step": 6855 + }, + { + "epoch": 2.2216461438755672, + "grad_norm": 0.8806123733520508, + "learning_rate": 8.325969233589376e-07, + "loss": 0.076, + "step": 6856 + }, + { + "epoch": 2.2219701879455607, + "grad_norm": 0.8320830464363098, + "learning_rate": 8.319453614676626e-07, + "loss": 0.0744, + "step": 6857 + }, + { + "epoch": 2.222294232015554, + "grad_norm": 0.9103955626487732, + "learning_rate": 8.31294003730127e-07, + "loss": 0.0827, + "step": 6858 + }, + { + "epoch": 2.2226182760855475, + "grad_norm": 0.8920407891273499, + "learning_rate": 8.306428502260511e-07, + "loss": 0.0771, + "step": 6859 + }, + { + "epoch": 2.222942320155541, + "grad_norm": 0.8198265433311462, + "learning_rate": 8.299919010351296e-07, + "loss": 0.0722, + "step": 6860 + }, + { + "epoch": 2.223266364225535, + "grad_norm": 0.8588047027587891, + "learning_rate": 8.293411562370327e-07, + "loss": 0.0776, + "step": 6861 + }, + { + "epoch": 2.2235904082955282, + "grad_norm": 0.9530218243598938, + "learning_rate": 8.286906159114058e-07, + "loss": 0.0795, + "step": 6862 + }, + { + "epoch": 2.2239144523655217, + "grad_norm": 0.9290475845336914, + "learning_rate": 8.280402801378662e-07, + "loss": 0.0838, + "step": 6863 + }, + { + "epoch": 2.224238496435515, + "grad_norm": 0.9210637807846069, + "learning_rate": 8.27390148996012e-07, + "loss": 0.0817, + "step": 6864 + }, + { + "epoch": 2.224562540505509, + "grad_norm": 0.8271251320838928, + "learning_rate": 8.267402225654112e-07, + "loss": 0.0722, + "step": 6865 + }, + { + "epoch": 2.2248865845755024, + "grad_norm": 0.9257084131240845, + "learning_rate": 8.260905009256081e-07, + "loss": 0.0842, + "step": 6866 + }, + { + "epoch": 2.225210628645496, + "grad_norm": 0.8730802536010742, + "learning_rate": 8.254409841561234e-07, + "loss": 0.0773, + "step": 6867 + }, + { + "epoch": 2.225534672715489, + "grad_norm": 0.9253896474838257, + "learning_rate": 8.24791672336451e-07, + "loss": 0.0816, + "step": 6868 + }, + { + "epoch": 2.2258587167854826, + "grad_norm": 0.8534382581710815, + "learning_rate": 8.241425655460616e-07, + "loss": 0.0757, + "step": 6869 + }, + { + "epoch": 2.2261827608554765, + "grad_norm": 0.8908569812774658, + "learning_rate": 8.23493663864397e-07, + "loss": 0.0805, + "step": 6870 + }, + { + "epoch": 2.22650680492547, + "grad_norm": 0.8594645261764526, + "learning_rate": 8.228449673708797e-07, + "loss": 0.0773, + "step": 6871 + }, + { + "epoch": 2.2268308489954634, + "grad_norm": 0.8712368607521057, + "learning_rate": 8.221964761449008e-07, + "loss": 0.0772, + "step": 6872 + }, + { + "epoch": 2.2271548930654568, + "grad_norm": 0.8286442160606384, + "learning_rate": 8.215481902658323e-07, + "loss": 0.0725, + "step": 6873 + }, + { + "epoch": 2.2274789371354506, + "grad_norm": 0.8417304754257202, + "learning_rate": 8.209001098130157e-07, + "loss": 0.0778, + "step": 6874 + }, + { + "epoch": 2.227802981205444, + "grad_norm": 0.8697782754898071, + "learning_rate": 8.20252234865771e-07, + "loss": 0.0789, + "step": 6875 + }, + { + "epoch": 2.2281270252754375, + "grad_norm": 0.833076000213623, + "learning_rate": 8.196045655033913e-07, + "loss": 0.0754, + "step": 6876 + }, + { + "epoch": 2.228451069345431, + "grad_norm": 0.8312671780586243, + "learning_rate": 8.189571018051454e-07, + "loss": 0.0749, + "step": 6877 + }, + { + "epoch": 2.2287751134154243, + "grad_norm": 0.9436811208724976, + "learning_rate": 8.183098438502771e-07, + "loss": 0.0847, + "step": 6878 + }, + { + "epoch": 2.229099157485418, + "grad_norm": 0.898650586605072, + "learning_rate": 8.176627917180025e-07, + "loss": 0.0808, + "step": 6879 + }, + { + "epoch": 2.2294232015554116, + "grad_norm": 0.842548131942749, + "learning_rate": 8.170159454875173e-07, + "loss": 0.077, + "step": 6880 + }, + { + "epoch": 2.229747245625405, + "grad_norm": 0.9235697984695435, + "learning_rate": 8.163693052379873e-07, + "loss": 0.0834, + "step": 6881 + }, + { + "epoch": 2.2300712896953985, + "grad_norm": 0.8513306975364685, + "learning_rate": 8.157228710485554e-07, + "loss": 0.0716, + "step": 6882 + }, + { + "epoch": 2.230395333765392, + "grad_norm": 0.8318166136741638, + "learning_rate": 8.15076642998339e-07, + "loss": 0.0729, + "step": 6883 + }, + { + "epoch": 2.2307193778353858, + "grad_norm": 0.9229317903518677, + "learning_rate": 8.144306211664302e-07, + "loss": 0.0802, + "step": 6884 + }, + { + "epoch": 2.231043421905379, + "grad_norm": 0.945536732673645, + "learning_rate": 8.137848056318959e-07, + "loss": 0.0848, + "step": 6885 + }, + { + "epoch": 2.2313674659753726, + "grad_norm": 0.938944399356842, + "learning_rate": 8.131391964737773e-07, + "loss": 0.0819, + "step": 6886 + }, + { + "epoch": 2.231691510045366, + "grad_norm": 0.9072917103767395, + "learning_rate": 8.12493793771092e-07, + "loss": 0.0767, + "step": 6887 + }, + { + "epoch": 2.2320155541153595, + "grad_norm": 0.8586976528167725, + "learning_rate": 8.118485976028292e-07, + "loss": 0.0773, + "step": 6888 + }, + { + "epoch": 2.2323395981853533, + "grad_norm": 0.918808102607727, + "learning_rate": 8.112036080479554e-07, + "loss": 0.0816, + "step": 6889 + }, + { + "epoch": 2.2326636422553467, + "grad_norm": 0.8832508325576782, + "learning_rate": 8.10558825185411e-07, + "loss": 0.0805, + "step": 6890 + }, + { + "epoch": 2.23298768632534, + "grad_norm": 0.9577889442443848, + "learning_rate": 8.099142490941117e-07, + "loss": 0.0767, + "step": 6891 + }, + { + "epoch": 2.2333117303953336, + "grad_norm": 0.8775278925895691, + "learning_rate": 8.09269879852947e-07, + "loss": 0.0769, + "step": 6892 + }, + { + "epoch": 2.2336357744653275, + "grad_norm": 0.8546897172927856, + "learning_rate": 8.086257175407819e-07, + "loss": 0.0775, + "step": 6893 + }, + { + "epoch": 2.233959818535321, + "grad_norm": 0.9491926431655884, + "learning_rate": 8.079817622364539e-07, + "loss": 0.0835, + "step": 6894 + }, + { + "epoch": 2.2342838626053143, + "grad_norm": 0.9256629943847656, + "learning_rate": 8.073380140187795e-07, + "loss": 0.0813, + "step": 6895 + }, + { + "epoch": 2.2346079066753077, + "grad_norm": 0.8432714939117432, + "learning_rate": 8.066944729665455e-07, + "loss": 0.0735, + "step": 6896 + }, + { + "epoch": 2.234931950745301, + "grad_norm": 0.9228755235671997, + "learning_rate": 8.060511391585152e-07, + "loss": 0.0847, + "step": 6897 + }, + { + "epoch": 2.235255994815295, + "grad_norm": 0.9158481955528259, + "learning_rate": 8.054080126734271e-07, + "loss": 0.0815, + "step": 6898 + }, + { + "epoch": 2.2355800388852884, + "grad_norm": 0.9194527864456177, + "learning_rate": 8.047650935899931e-07, + "loss": 0.0792, + "step": 6899 + }, + { + "epoch": 2.235904082955282, + "grad_norm": 0.9050752520561218, + "learning_rate": 8.041223819869015e-07, + "loss": 0.0754, + "step": 6900 + }, + { + "epoch": 2.2362281270252753, + "grad_norm": 0.8906585574150085, + "learning_rate": 8.034798779428113e-07, + "loss": 0.0802, + "step": 6901 + }, + { + "epoch": 2.236552171095269, + "grad_norm": 0.9296242594718933, + "learning_rate": 8.02837581536362e-07, + "loss": 0.0753, + "step": 6902 + }, + { + "epoch": 2.2368762151652626, + "grad_norm": 0.8792465329170227, + "learning_rate": 8.021954928461611e-07, + "loss": 0.0787, + "step": 6903 + }, + { + "epoch": 2.237200259235256, + "grad_norm": 0.9047431349754333, + "learning_rate": 8.015536119507977e-07, + "loss": 0.0749, + "step": 6904 + }, + { + "epoch": 2.2375243033052494, + "grad_norm": 0.9151782393455505, + "learning_rate": 8.009119389288292e-07, + "loss": 0.0756, + "step": 6905 + }, + { + "epoch": 2.237848347375243, + "grad_norm": 0.8479213118553162, + "learning_rate": 8.002704738587911e-07, + "loss": 0.0714, + "step": 6906 + }, + { + "epoch": 2.2381723914452367, + "grad_norm": 0.8420944809913635, + "learning_rate": 7.996292168191919e-07, + "loss": 0.0767, + "step": 6907 + }, + { + "epoch": 2.23849643551523, + "grad_norm": 0.8398191332817078, + "learning_rate": 7.989881678885158e-07, + "loss": 0.0722, + "step": 6908 + }, + { + "epoch": 2.2388204795852236, + "grad_norm": 0.9042385816574097, + "learning_rate": 7.983473271452219e-07, + "loss": 0.0768, + "step": 6909 + }, + { + "epoch": 2.239144523655217, + "grad_norm": 0.8790053129196167, + "learning_rate": 7.977066946677404e-07, + "loss": 0.0777, + "step": 6910 + }, + { + "epoch": 2.2394685677252104, + "grad_norm": 0.8296760320663452, + "learning_rate": 7.970662705344812e-07, + "loss": 0.0712, + "step": 6911 + }, + { + "epoch": 2.2397926117952043, + "grad_norm": 0.8679847717285156, + "learning_rate": 7.964260548238242e-07, + "loss": 0.0754, + "step": 6912 + }, + { + "epoch": 2.2401166558651977, + "grad_norm": 0.8999646902084351, + "learning_rate": 7.957860476141261e-07, + "loss": 0.0784, + "step": 6913 + }, + { + "epoch": 2.240440699935191, + "grad_norm": 0.888016402721405, + "learning_rate": 7.951462489837178e-07, + "loss": 0.083, + "step": 6914 + }, + { + "epoch": 2.2407647440051845, + "grad_norm": 0.9187142252922058, + "learning_rate": 7.945066590109044e-07, + "loss": 0.0781, + "step": 6915 + }, + { + "epoch": 2.2410887880751784, + "grad_norm": 0.8380825519561768, + "learning_rate": 7.938672777739654e-07, + "loss": 0.0762, + "step": 6916 + }, + { + "epoch": 2.241412832145172, + "grad_norm": 0.9074355959892273, + "learning_rate": 7.932281053511559e-07, + "loss": 0.0815, + "step": 6917 + }, + { + "epoch": 2.2417368762151653, + "grad_norm": 0.8893736600875854, + "learning_rate": 7.925891418207024e-07, + "loss": 0.0788, + "step": 6918 + }, + { + "epoch": 2.2420609202851587, + "grad_norm": 0.9274340271949768, + "learning_rate": 7.919503872608092e-07, + "loss": 0.0852, + "step": 6919 + }, + { + "epoch": 2.242384964355152, + "grad_norm": 0.9161635041236877, + "learning_rate": 7.913118417496532e-07, + "loss": 0.0793, + "step": 6920 + }, + { + "epoch": 2.242709008425146, + "grad_norm": 0.8371251821517944, + "learning_rate": 7.906735053653866e-07, + "loss": 0.0734, + "step": 6921 + }, + { + "epoch": 2.2430330524951394, + "grad_norm": 0.880815863609314, + "learning_rate": 7.900353781861353e-07, + "loss": 0.0789, + "step": 6922 + }, + { + "epoch": 2.243357096565133, + "grad_norm": 0.8050634860992432, + "learning_rate": 7.893974602899998e-07, + "loss": 0.0719, + "step": 6923 + }, + { + "epoch": 2.2436811406351262, + "grad_norm": 0.9162620902061462, + "learning_rate": 7.887597517550564e-07, + "loss": 0.0804, + "step": 6924 + }, + { + "epoch": 2.24400518470512, + "grad_norm": 0.8453664183616638, + "learning_rate": 7.881222526593513e-07, + "loss": 0.0709, + "step": 6925 + }, + { + "epoch": 2.2443292287751135, + "grad_norm": 0.8469733595848083, + "learning_rate": 7.87484963080912e-07, + "loss": 0.0742, + "step": 6926 + }, + { + "epoch": 2.244653272845107, + "grad_norm": 0.8893758058547974, + "learning_rate": 7.868478830977331e-07, + "loss": 0.0832, + "step": 6927 + }, + { + "epoch": 2.2449773169151004, + "grad_norm": 0.8658952713012695, + "learning_rate": 7.862110127877903e-07, + "loss": 0.0757, + "step": 6928 + }, + { + "epoch": 2.245301360985094, + "grad_norm": 0.9522693157196045, + "learning_rate": 7.855743522290283e-07, + "loss": 0.0762, + "step": 6929 + }, + { + "epoch": 2.2456254050550877, + "grad_norm": 0.9310401678085327, + "learning_rate": 7.849379014993683e-07, + "loss": 0.0814, + "step": 6930 + }, + { + "epoch": 2.245949449125081, + "grad_norm": 0.878420352935791, + "learning_rate": 7.84301660676707e-07, + "loss": 0.0753, + "step": 6931 + }, + { + "epoch": 2.2462734931950745, + "grad_norm": 0.8977034687995911, + "learning_rate": 7.836656298389114e-07, + "loss": 0.0761, + "step": 6932 + }, + { + "epoch": 2.246597537265068, + "grad_norm": 0.8511933088302612, + "learning_rate": 7.830298090638291e-07, + "loss": 0.077, + "step": 6933 + }, + { + "epoch": 2.2469215813350614, + "grad_norm": 0.9201487302780151, + "learning_rate": 7.823941984292752e-07, + "loss": 0.0838, + "step": 6934 + }, + { + "epoch": 2.2472456254050552, + "grad_norm": 0.8686670660972595, + "learning_rate": 7.817587980130451e-07, + "loss": 0.0747, + "step": 6935 + }, + { + "epoch": 2.2475696694750487, + "grad_norm": 0.9665684103965759, + "learning_rate": 7.811236078929033e-07, + "loss": 0.0828, + "step": 6936 + }, + { + "epoch": 2.247893713545042, + "grad_norm": 0.938249945640564, + "learning_rate": 7.80488628146592e-07, + "loss": 0.0831, + "step": 6937 + }, + { + "epoch": 2.2482177576150355, + "grad_norm": 0.9339914321899414, + "learning_rate": 7.798538588518265e-07, + "loss": 0.0858, + "step": 6938 + }, + { + "epoch": 2.248541801685029, + "grad_norm": 0.8758047819137573, + "learning_rate": 7.792193000862964e-07, + "loss": 0.0719, + "step": 6939 + }, + { + "epoch": 2.248865845755023, + "grad_norm": 0.8981780409812927, + "learning_rate": 7.785849519276661e-07, + "loss": 0.0785, + "step": 6940 + }, + { + "epoch": 2.249189889825016, + "grad_norm": 0.8714284896850586, + "learning_rate": 7.779508144535725e-07, + "loss": 0.0779, + "step": 6941 + }, + { + "epoch": 2.2495139338950096, + "grad_norm": 0.9096783399581909, + "learning_rate": 7.773168877416285e-07, + "loss": 0.076, + "step": 6942 + }, + { + "epoch": 2.249837977965003, + "grad_norm": 0.8372735977172852, + "learning_rate": 7.766831718694204e-07, + "loss": 0.0806, + "step": 6943 + }, + { + "epoch": 2.250162022034997, + "grad_norm": 0.8659960031509399, + "learning_rate": 7.760496669145093e-07, + "loss": 0.0769, + "step": 6944 + }, + { + "epoch": 2.2504860661049904, + "grad_norm": 0.8610251545906067, + "learning_rate": 7.754163729544297e-07, + "loss": 0.081, + "step": 6945 + }, + { + "epoch": 2.250810110174984, + "grad_norm": 0.9168947339057922, + "learning_rate": 7.747832900666907e-07, + "loss": 0.0815, + "step": 6946 + }, + { + "epoch": 2.251134154244977, + "grad_norm": 0.844121515750885, + "learning_rate": 7.741504183287757e-07, + "loss": 0.071, + "step": 6947 + }, + { + "epoch": 2.251458198314971, + "grad_norm": 0.905340850353241, + "learning_rate": 7.73517757818143e-07, + "loss": 0.0842, + "step": 6948 + }, + { + "epoch": 2.2517822423849645, + "grad_norm": 0.9762375950813293, + "learning_rate": 7.728853086122212e-07, + "loss": 0.09, + "step": 6949 + }, + { + "epoch": 2.252106286454958, + "grad_norm": 0.8414440155029297, + "learning_rate": 7.722530707884196e-07, + "loss": 0.0731, + "step": 6950 + }, + { + "epoch": 2.2524303305249513, + "grad_norm": 0.8716568946838379, + "learning_rate": 7.716210444241154e-07, + "loss": 0.0737, + "step": 6951 + }, + { + "epoch": 2.2527543745949448, + "grad_norm": 0.8804923892021179, + "learning_rate": 7.709892295966634e-07, + "loss": 0.0807, + "step": 6952 + }, + { + "epoch": 2.2530784186649386, + "grad_norm": 0.8454858064651489, + "learning_rate": 7.703576263833915e-07, + "loss": 0.0759, + "step": 6953 + }, + { + "epoch": 2.253402462734932, + "grad_norm": 0.8796680569648743, + "learning_rate": 7.697262348616019e-07, + "loss": 0.0783, + "step": 6954 + }, + { + "epoch": 2.2537265068049255, + "grad_norm": 0.8306034803390503, + "learning_rate": 7.690950551085716e-07, + "loss": 0.0716, + "step": 6955 + }, + { + "epoch": 2.254050550874919, + "grad_norm": 0.8274477124214172, + "learning_rate": 7.684640872015484e-07, + "loss": 0.0737, + "step": 6956 + }, + { + "epoch": 2.2543745949449123, + "grad_norm": 0.954513430595398, + "learning_rate": 7.678333312177602e-07, + "loss": 0.0817, + "step": 6957 + }, + { + "epoch": 2.254698639014906, + "grad_norm": 0.8665919899940491, + "learning_rate": 7.672027872344017e-07, + "loss": 0.0754, + "step": 6958 + }, + { + "epoch": 2.2550226830848996, + "grad_norm": 0.8250190615653992, + "learning_rate": 7.665724553286491e-07, + "loss": 0.0759, + "step": 6959 + }, + { + "epoch": 2.255346727154893, + "grad_norm": 0.870201826095581, + "learning_rate": 7.659423355776463e-07, + "loss": 0.0772, + "step": 6960 + }, + { + "epoch": 2.2556707712248865, + "grad_norm": 0.9322313070297241, + "learning_rate": 7.653124280585145e-07, + "loss": 0.0788, + "step": 6961 + }, + { + "epoch": 2.25599481529488, + "grad_norm": 0.9001493453979492, + "learning_rate": 7.646827328483486e-07, + "loss": 0.0808, + "step": 6962 + }, + { + "epoch": 2.2563188593648738, + "grad_norm": 0.9827464818954468, + "learning_rate": 7.64053250024217e-07, + "loss": 0.0774, + "step": 6963 + }, + { + "epoch": 2.256642903434867, + "grad_norm": 0.9482652544975281, + "learning_rate": 7.634239796631629e-07, + "loss": 0.0885, + "step": 6964 + }, + { + "epoch": 2.2569669475048606, + "grad_norm": 0.8908950090408325, + "learning_rate": 7.62794921842201e-07, + "loss": 0.0751, + "step": 6965 + }, + { + "epoch": 2.257290991574854, + "grad_norm": 0.8292078971862793, + "learning_rate": 7.621660766383246e-07, + "loss": 0.0678, + "step": 6966 + }, + { + "epoch": 2.257615035644848, + "grad_norm": 0.9161243438720703, + "learning_rate": 7.615374441284962e-07, + "loss": 0.0778, + "step": 6967 + }, + { + "epoch": 2.2579390797148413, + "grad_norm": 0.9270373582839966, + "learning_rate": 7.60909024389655e-07, + "loss": 0.0809, + "step": 6968 + }, + { + "epoch": 2.2582631237848347, + "grad_norm": 0.8143619298934937, + "learning_rate": 7.602808174987137e-07, + "loss": 0.0735, + "step": 6969 + }, + { + "epoch": 2.258587167854828, + "grad_norm": 0.8208263516426086, + "learning_rate": 7.596528235325582e-07, + "loss": 0.0738, + "step": 6970 + }, + { + "epoch": 2.2589112119248216, + "grad_norm": 0.9438315033912659, + "learning_rate": 7.590250425680496e-07, + "loss": 0.085, + "step": 6971 + }, + { + "epoch": 2.2592352559948155, + "grad_norm": 0.8808038830757141, + "learning_rate": 7.583974746820222e-07, + "loss": 0.0782, + "step": 6972 + }, + { + "epoch": 2.259559300064809, + "grad_norm": 0.8868935704231262, + "learning_rate": 7.577701199512835e-07, + "loss": 0.0781, + "step": 6973 + }, + { + "epoch": 2.2598833441348023, + "grad_norm": 0.8319234848022461, + "learning_rate": 7.571429784526157e-07, + "loss": 0.0716, + "step": 6974 + }, + { + "epoch": 2.2602073882047957, + "grad_norm": 0.9218267798423767, + "learning_rate": 7.565160502627752e-07, + "loss": 0.0759, + "step": 6975 + }, + { + "epoch": 2.2605314322747896, + "grad_norm": 0.9264833927154541, + "learning_rate": 7.558893354584923e-07, + "loss": 0.0782, + "step": 6976 + }, + { + "epoch": 2.260855476344783, + "grad_norm": 0.8697494268417358, + "learning_rate": 7.5526283411647e-07, + "loss": 0.079, + "step": 6977 + }, + { + "epoch": 2.2611795204147764, + "grad_norm": 0.892841100692749, + "learning_rate": 7.546365463133867e-07, + "loss": 0.0778, + "step": 6978 + }, + { + "epoch": 2.26150356448477, + "grad_norm": 0.92507404088974, + "learning_rate": 7.540104721258945e-07, + "loss": 0.0789, + "step": 6979 + }, + { + "epoch": 2.2618276085547633, + "grad_norm": 0.8921122550964355, + "learning_rate": 7.533846116306162e-07, + "loss": 0.0797, + "step": 6980 + }, + { + "epoch": 2.262151652624757, + "grad_norm": 0.9586029648780823, + "learning_rate": 7.527589649041548e-07, + "loss": 0.0803, + "step": 6981 + }, + { + "epoch": 2.2624756966947506, + "grad_norm": 0.822803258895874, + "learning_rate": 7.521335320230804e-07, + "loss": 0.0721, + "step": 6982 + }, + { + "epoch": 2.262799740764744, + "grad_norm": 0.9236354231834412, + "learning_rate": 7.515083130639411e-07, + "loss": 0.0795, + "step": 6983 + }, + { + "epoch": 2.2631237848347374, + "grad_norm": 0.999817967414856, + "learning_rate": 7.508833081032577e-07, + "loss": 0.085, + "step": 6984 + }, + { + "epoch": 2.263447828904731, + "grad_norm": 0.8854063153266907, + "learning_rate": 7.502585172175244e-07, + "loss": 0.0722, + "step": 6985 + }, + { + "epoch": 2.2637718729747247, + "grad_norm": 1.0334446430206299, + "learning_rate": 7.496339404832109e-07, + "loss": 0.0834, + "step": 6986 + }, + { + "epoch": 2.264095917044718, + "grad_norm": 0.8735396862030029, + "learning_rate": 7.490095779767564e-07, + "loss": 0.0726, + "step": 6987 + }, + { + "epoch": 2.2644199611147116, + "grad_norm": 0.943081796169281, + "learning_rate": 7.483854297745805e-07, + "loss": 0.0785, + "step": 6988 + }, + { + "epoch": 2.264744005184705, + "grad_norm": 0.901913583278656, + "learning_rate": 7.47761495953069e-07, + "loss": 0.0777, + "step": 6989 + }, + { + "epoch": 2.2650680492546984, + "grad_norm": 0.870795488357544, + "learning_rate": 7.471377765885893e-07, + "loss": 0.0803, + "step": 6990 + }, + { + "epoch": 2.2653920933246923, + "grad_norm": 0.9563770294189453, + "learning_rate": 7.465142717574761e-07, + "loss": 0.0827, + "step": 6991 + }, + { + "epoch": 2.2657161373946857, + "grad_norm": 0.8497862815856934, + "learning_rate": 7.458909815360407e-07, + "loss": 0.0727, + "step": 6992 + }, + { + "epoch": 2.266040181464679, + "grad_norm": 0.873889684677124, + "learning_rate": 7.45267906000568e-07, + "loss": 0.0787, + "step": 6993 + }, + { + "epoch": 2.2663642255346725, + "grad_norm": 0.8586767911911011, + "learning_rate": 7.446450452273168e-07, + "loss": 0.0758, + "step": 6994 + }, + { + "epoch": 2.2666882696046664, + "grad_norm": 0.9471902251243591, + "learning_rate": 7.440223992925194e-07, + "loss": 0.0819, + "step": 6995 + }, + { + "epoch": 2.26701231367466, + "grad_norm": 0.8423433303833008, + "learning_rate": 7.433999682723805e-07, + "loss": 0.0707, + "step": 6996 + }, + { + "epoch": 2.2673363577446533, + "grad_norm": 0.8941042423248291, + "learning_rate": 7.427777522430804e-07, + "loss": 0.0761, + "step": 6997 + }, + { + "epoch": 2.2676604018146467, + "grad_norm": 0.923392653465271, + "learning_rate": 7.42155751280772e-07, + "loss": 0.074, + "step": 6998 + }, + { + "epoch": 2.2679844458846405, + "grad_norm": 0.867560088634491, + "learning_rate": 7.415339654615824e-07, + "loss": 0.0731, + "step": 6999 + }, + { + "epoch": 2.268308489954634, + "grad_norm": 0.8599602580070496, + "learning_rate": 7.409123948616123e-07, + "loss": 0.0772, + "step": 7000 + }, + { + "epoch": 2.2686325340246274, + "grad_norm": 0.8821036219596863, + "learning_rate": 7.402910395569357e-07, + "loss": 0.0772, + "step": 7001 + }, + { + "epoch": 2.268956578094621, + "grad_norm": 0.9057403802871704, + "learning_rate": 7.396698996236004e-07, + "loss": 0.0803, + "step": 7002 + }, + { + "epoch": 2.2692806221646142, + "grad_norm": 0.87221759557724, + "learning_rate": 7.39048975137629e-07, + "loss": 0.0733, + "step": 7003 + }, + { + "epoch": 2.269604666234608, + "grad_norm": 0.9132773876190186, + "learning_rate": 7.38428266175015e-07, + "loss": 0.0799, + "step": 7004 + }, + { + "epoch": 2.2699287103046015, + "grad_norm": 0.9301595091819763, + "learning_rate": 7.378077728117277e-07, + "loss": 0.0705, + "step": 7005 + }, + { + "epoch": 2.270252754374595, + "grad_norm": 0.8766486644744873, + "learning_rate": 7.371874951237099e-07, + "loss": 0.0772, + "step": 7006 + }, + { + "epoch": 2.2705767984445884, + "grad_norm": 0.8661054968833923, + "learning_rate": 7.365674331868772e-07, + "loss": 0.0746, + "step": 7007 + }, + { + "epoch": 2.270900842514582, + "grad_norm": 0.8911915421485901, + "learning_rate": 7.359475870771202e-07, + "loss": 0.0753, + "step": 7008 + }, + { + "epoch": 2.2712248865845757, + "grad_norm": 0.8957100510597229, + "learning_rate": 7.353279568702995e-07, + "loss": 0.0747, + "step": 7009 + }, + { + "epoch": 2.271548930654569, + "grad_norm": 0.9232888221740723, + "learning_rate": 7.347085426422551e-07, + "loss": 0.0777, + "step": 7010 + }, + { + "epoch": 2.2718729747245625, + "grad_norm": 0.9190528988838196, + "learning_rate": 7.340893444687944e-07, + "loss": 0.0815, + "step": 7011 + }, + { + "epoch": 2.272197018794556, + "grad_norm": 0.8859702348709106, + "learning_rate": 7.334703624257039e-07, + "loss": 0.0778, + "step": 7012 + }, + { + "epoch": 2.2725210628645494, + "grad_norm": 0.8363940119743347, + "learning_rate": 7.328515965887389e-07, + "loss": 0.072, + "step": 7013 + }, + { + "epoch": 2.2728451069345432, + "grad_norm": 0.8972896337509155, + "learning_rate": 7.322330470336314e-07, + "loss": 0.0725, + "step": 7014 + }, + { + "epoch": 2.2731691510045366, + "grad_norm": 0.8751944899559021, + "learning_rate": 7.316147138360855e-07, + "loss": 0.0777, + "step": 7015 + }, + { + "epoch": 2.27349319507453, + "grad_norm": 0.9271724224090576, + "learning_rate": 7.309965970717795e-07, + "loss": 0.0822, + "step": 7016 + }, + { + "epoch": 2.2738172391445235, + "grad_norm": 0.8555557131767273, + "learning_rate": 7.303786968163651e-07, + "loss": 0.0749, + "step": 7017 + }, + { + "epoch": 2.2741412832145174, + "grad_norm": 0.8882536292076111, + "learning_rate": 7.297610131454657e-07, + "loss": 0.0797, + "step": 7018 + }, + { + "epoch": 2.274465327284511, + "grad_norm": 0.8157416582107544, + "learning_rate": 7.291435461346827e-07, + "loss": 0.0772, + "step": 7019 + }, + { + "epoch": 2.274789371354504, + "grad_norm": 1.0363208055496216, + "learning_rate": 7.285262958595846e-07, + "loss": 0.0758, + "step": 7020 + }, + { + "epoch": 2.2751134154244976, + "grad_norm": 0.9158379435539246, + "learning_rate": 7.279092623957204e-07, + "loss": 0.0813, + "step": 7021 + }, + { + "epoch": 2.2754374594944915, + "grad_norm": 0.9131003022193909, + "learning_rate": 7.272924458186064e-07, + "loss": 0.0831, + "step": 7022 + }, + { + "epoch": 2.275761503564485, + "grad_norm": 0.8216702938079834, + "learning_rate": 7.26675846203736e-07, + "loss": 0.0757, + "step": 7023 + }, + { + "epoch": 2.2760855476344783, + "grad_norm": 0.8345896005630493, + "learning_rate": 7.26059463626575e-07, + "loss": 0.0729, + "step": 7024 + }, + { + "epoch": 2.2764095917044718, + "grad_norm": 0.8480775952339172, + "learning_rate": 7.254432981625626e-07, + "loss": 0.0707, + "step": 7025 + }, + { + "epoch": 2.276733635774465, + "grad_norm": 0.8742204308509827, + "learning_rate": 7.248273498871119e-07, + "loss": 0.073, + "step": 7026 + }, + { + "epoch": 2.277057679844459, + "grad_norm": 0.933005154132843, + "learning_rate": 7.242116188756082e-07, + "loss": 0.0865, + "step": 7027 + }, + { + "epoch": 2.2773817239144525, + "grad_norm": 0.880244791507721, + "learning_rate": 7.235961052034113e-07, + "loss": 0.078, + "step": 7028 + }, + { + "epoch": 2.277705767984446, + "grad_norm": 0.8771071434020996, + "learning_rate": 7.22980808945854e-07, + "loss": 0.081, + "step": 7029 + }, + { + "epoch": 2.2780298120544393, + "grad_norm": 0.8915297389030457, + "learning_rate": 7.22365730178243e-07, + "loss": 0.0824, + "step": 7030 + }, + { + "epoch": 2.2783538561244328, + "grad_norm": 0.8193331956863403, + "learning_rate": 7.217508689758576e-07, + "loss": 0.0777, + "step": 7031 + }, + { + "epoch": 2.2786779001944266, + "grad_norm": 0.8842501640319824, + "learning_rate": 7.211362254139512e-07, + "loss": 0.0792, + "step": 7032 + }, + { + "epoch": 2.27900194426442, + "grad_norm": 0.80404132604599, + "learning_rate": 7.205217995677502e-07, + "loss": 0.0728, + "step": 7033 + }, + { + "epoch": 2.2793259883344135, + "grad_norm": 0.8987725973129272, + "learning_rate": 7.199075915124548e-07, + "loss": 0.0773, + "step": 7034 + }, + { + "epoch": 2.279650032404407, + "grad_norm": 0.8705540895462036, + "learning_rate": 7.192936013232368e-07, + "loss": 0.0809, + "step": 7035 + }, + { + "epoch": 2.2799740764744003, + "grad_norm": 0.8713969588279724, + "learning_rate": 7.186798290752436e-07, + "loss": 0.0782, + "step": 7036 + }, + { + "epoch": 2.280298120544394, + "grad_norm": 0.8617414236068726, + "learning_rate": 7.180662748435946e-07, + "loss": 0.0748, + "step": 7037 + }, + { + "epoch": 2.2806221646143876, + "grad_norm": 0.8548529148101807, + "learning_rate": 7.174529387033832e-07, + "loss": 0.0764, + "step": 7038 + }, + { + "epoch": 2.280946208684381, + "grad_norm": 0.858833909034729, + "learning_rate": 7.168398207296764e-07, + "loss": 0.0757, + "step": 7039 + }, + { + "epoch": 2.2812702527543745, + "grad_norm": 0.8891960978507996, + "learning_rate": 7.162269209975117e-07, + "loss": 0.0763, + "step": 7040 + }, + { + "epoch": 2.281594296824368, + "grad_norm": 0.8929044008255005, + "learning_rate": 7.156142395819055e-07, + "loss": 0.0788, + "step": 7041 + }, + { + "epoch": 2.2819183408943617, + "grad_norm": 0.9119802117347717, + "learning_rate": 7.150017765578401e-07, + "loss": 0.0863, + "step": 7042 + }, + { + "epoch": 2.282242384964355, + "grad_norm": 0.882005512714386, + "learning_rate": 7.143895320002789e-07, + "loss": 0.0825, + "step": 7043 + }, + { + "epoch": 2.2825664290343486, + "grad_norm": 0.9207246899604797, + "learning_rate": 7.137775059841523e-07, + "loss": 0.0849, + "step": 7044 + }, + { + "epoch": 2.282890473104342, + "grad_norm": 0.9211490750312805, + "learning_rate": 7.131656985843669e-07, + "loss": 0.0803, + "step": 7045 + }, + { + "epoch": 2.283214517174336, + "grad_norm": 0.909271776676178, + "learning_rate": 7.125541098758021e-07, + "loss": 0.0818, + "step": 7046 + }, + { + "epoch": 2.2835385612443293, + "grad_norm": 0.8487682938575745, + "learning_rate": 7.119427399333104e-07, + "loss": 0.0725, + "step": 7047 + }, + { + "epoch": 2.2838626053143227, + "grad_norm": 1.0248647928237915, + "learning_rate": 7.113315888317182e-07, + "loss": 0.0883, + "step": 7048 + }, + { + "epoch": 2.284186649384316, + "grad_norm": 0.9240375757217407, + "learning_rate": 7.107206566458225e-07, + "loss": 0.0771, + "step": 7049 + }, + { + "epoch": 2.28451069345431, + "grad_norm": 0.8063546419143677, + "learning_rate": 7.101099434503986e-07, + "loss": 0.0724, + "step": 7050 + }, + { + "epoch": 2.2848347375243034, + "grad_norm": 0.9139791131019592, + "learning_rate": 7.09499449320189e-07, + "loss": 0.0846, + "step": 7051 + }, + { + "epoch": 2.285158781594297, + "grad_norm": 0.877644419670105, + "learning_rate": 7.088891743299136e-07, + "loss": 0.0788, + "step": 7052 + }, + { + "epoch": 2.2854828256642903, + "grad_norm": 0.9822368621826172, + "learning_rate": 7.08279118554264e-07, + "loss": 0.0813, + "step": 7053 + }, + { + "epoch": 2.2858068697342837, + "grad_norm": 0.9277673363685608, + "learning_rate": 7.076692820679051e-07, + "loss": 0.0823, + "step": 7054 + }, + { + "epoch": 2.2861309138042776, + "grad_norm": 0.9683315753936768, + "learning_rate": 7.070596649454748e-07, + "loss": 0.076, + "step": 7055 + }, + { + "epoch": 2.286454957874271, + "grad_norm": 0.8959435224533081, + "learning_rate": 7.064502672615847e-07, + "loss": 0.0811, + "step": 7056 + }, + { + "epoch": 2.2867790019442644, + "grad_norm": 0.9008287787437439, + "learning_rate": 7.058410890908196e-07, + "loss": 0.0741, + "step": 7057 + }, + { + "epoch": 2.287103046014258, + "grad_norm": 0.9168748259544373, + "learning_rate": 7.052321305077356e-07, + "loss": 0.0799, + "step": 7058 + }, + { + "epoch": 2.2874270900842513, + "grad_norm": 0.9024057984352112, + "learning_rate": 7.046233915868642e-07, + "loss": 0.0723, + "step": 7059 + }, + { + "epoch": 2.287751134154245, + "grad_norm": 0.9057157039642334, + "learning_rate": 7.04014872402709e-07, + "loss": 0.0794, + "step": 7060 + }, + { + "epoch": 2.2880751782242386, + "grad_norm": 0.8857600688934326, + "learning_rate": 7.034065730297471e-07, + "loss": 0.079, + "step": 7061 + }, + { + "epoch": 2.288399222294232, + "grad_norm": 0.9752936959266663, + "learning_rate": 7.027984935424284e-07, + "loss": 0.0859, + "step": 7062 + }, + { + "epoch": 2.2887232663642254, + "grad_norm": 0.9727655053138733, + "learning_rate": 7.021906340151763e-07, + "loss": 0.0856, + "step": 7063 + }, + { + "epoch": 2.289047310434219, + "grad_norm": 0.8868271708488464, + "learning_rate": 7.015829945223851e-07, + "loss": 0.0802, + "step": 7064 + }, + { + "epoch": 2.2893713545042127, + "grad_norm": 0.9148418307304382, + "learning_rate": 7.009755751384267e-07, + "loss": 0.0787, + "step": 7065 + }, + { + "epoch": 2.289695398574206, + "grad_norm": 0.8575038313865662, + "learning_rate": 7.003683759376415e-07, + "loss": 0.0737, + "step": 7066 + }, + { + "epoch": 2.2900194426441995, + "grad_norm": 0.7997493743896484, + "learning_rate": 6.997613969943451e-07, + "loss": 0.0717, + "step": 7067 + }, + { + "epoch": 2.290343486714193, + "grad_norm": 0.95135498046875, + "learning_rate": 6.99154638382826e-07, + "loss": 0.0864, + "step": 7068 + }, + { + "epoch": 2.290667530784187, + "grad_norm": 0.8583988547325134, + "learning_rate": 6.985481001773456e-07, + "loss": 0.0747, + "step": 7069 + }, + { + "epoch": 2.2909915748541803, + "grad_norm": 0.8334120512008667, + "learning_rate": 6.979417824521393e-07, + "loss": 0.0741, + "step": 7070 + }, + { + "epoch": 2.2913156189241737, + "grad_norm": 0.8967745900154114, + "learning_rate": 6.97335685281412e-07, + "loss": 0.0838, + "step": 7071 + }, + { + "epoch": 2.291639662994167, + "grad_norm": 0.8889458179473877, + "learning_rate": 6.967298087393471e-07, + "loss": 0.0747, + "step": 7072 + }, + { + "epoch": 2.291963707064161, + "grad_norm": 0.9700846076011658, + "learning_rate": 6.96124152900095e-07, + "loss": 0.0783, + "step": 7073 + }, + { + "epoch": 2.2922877511341544, + "grad_norm": 0.8384163975715637, + "learning_rate": 6.955187178377853e-07, + "loss": 0.076, + "step": 7074 + }, + { + "epoch": 2.292611795204148, + "grad_norm": 0.8633262515068054, + "learning_rate": 6.949135036265153e-07, + "loss": 0.0745, + "step": 7075 + }, + { + "epoch": 2.2929358392741412, + "grad_norm": 0.9053198099136353, + "learning_rate": 6.943085103403577e-07, + "loss": 0.0759, + "step": 7076 + }, + { + "epoch": 2.2932598833441347, + "grad_norm": 0.8445454239845276, + "learning_rate": 6.937037380533579e-07, + "loss": 0.0737, + "step": 7077 + }, + { + "epoch": 2.2935839274141285, + "grad_norm": 0.8526014089584351, + "learning_rate": 6.930991868395343e-07, + "loss": 0.0763, + "step": 7078 + }, + { + "epoch": 2.293907971484122, + "grad_norm": 0.8953704833984375, + "learning_rate": 6.924948567728787e-07, + "loss": 0.0803, + "step": 7079 + }, + { + "epoch": 2.2942320155541154, + "grad_norm": 0.8394033312797546, + "learning_rate": 6.918907479273535e-07, + "loss": 0.0736, + "step": 7080 + }, + { + "epoch": 2.294556059624109, + "grad_norm": 0.9278076887130737, + "learning_rate": 6.912868603768979e-07, + "loss": 0.0836, + "step": 7081 + }, + { + "epoch": 2.2948801036941022, + "grad_norm": 0.8417161107063293, + "learning_rate": 6.906831941954206e-07, + "loss": 0.0768, + "step": 7082 + }, + { + "epoch": 2.295204147764096, + "grad_norm": 0.873684287071228, + "learning_rate": 6.900797494568045e-07, + "loss": 0.0789, + "step": 7083 + }, + { + "epoch": 2.2955281918340895, + "grad_norm": 0.9292371869087219, + "learning_rate": 6.894765262349056e-07, + "loss": 0.0797, + "step": 7084 + }, + { + "epoch": 2.295852235904083, + "grad_norm": 0.8600883483886719, + "learning_rate": 6.88873524603553e-07, + "loss": 0.0759, + "step": 7085 + }, + { + "epoch": 2.2961762799740764, + "grad_norm": 0.9637444019317627, + "learning_rate": 6.882707446365477e-07, + "loss": 0.087, + "step": 7086 + }, + { + "epoch": 2.29650032404407, + "grad_norm": 0.9064836502075195, + "learning_rate": 6.876681864076646e-07, + "loss": 0.077, + "step": 7087 + }, + { + "epoch": 2.2968243681140637, + "grad_norm": 0.9461975693702698, + "learning_rate": 6.870658499906505e-07, + "loss": 0.0814, + "step": 7088 + }, + { + "epoch": 2.297148412184057, + "grad_norm": 0.8657816052436829, + "learning_rate": 6.864637354592266e-07, + "loss": 0.0801, + "step": 7089 + }, + { + "epoch": 2.2974724562540505, + "grad_norm": 0.8060582280158997, + "learning_rate": 6.858618428870842e-07, + "loss": 0.0702, + "step": 7090 + }, + { + "epoch": 2.297796500324044, + "grad_norm": 0.905968964099884, + "learning_rate": 6.852601723478902e-07, + "loss": 0.08, + "step": 7091 + }, + { + "epoch": 2.2981205443940373, + "grad_norm": 0.8682804107666016, + "learning_rate": 6.84658723915283e-07, + "loss": 0.0763, + "step": 7092 + }, + { + "epoch": 2.298444588464031, + "grad_norm": 0.8451237678527832, + "learning_rate": 6.840574976628741e-07, + "loss": 0.0792, + "step": 7093 + }, + { + "epoch": 2.2987686325340246, + "grad_norm": 0.880070686340332, + "learning_rate": 6.834564936642488e-07, + "loss": 0.077, + "step": 7094 + }, + { + "epoch": 2.299092676604018, + "grad_norm": 0.9359896779060364, + "learning_rate": 6.828557119929613e-07, + "loss": 0.0838, + "step": 7095 + }, + { + "epoch": 2.2994167206740115, + "grad_norm": 0.8509678840637207, + "learning_rate": 6.822551527225452e-07, + "loss": 0.0741, + "step": 7096 + }, + { + "epoch": 2.2997407647440054, + "grad_norm": 0.8210530281066895, + "learning_rate": 6.816548159264993e-07, + "loss": 0.0713, + "step": 7097 + }, + { + "epoch": 2.3000648088139988, + "grad_norm": 0.8998435139656067, + "learning_rate": 6.810547016783029e-07, + "loss": 0.0779, + "step": 7098 + }, + { + "epoch": 2.300388852883992, + "grad_norm": 0.8631594777107239, + "learning_rate": 6.804548100514013e-07, + "loss": 0.0773, + "step": 7099 + }, + { + "epoch": 2.3007128969539856, + "grad_norm": 0.8914357423782349, + "learning_rate": 6.798551411192165e-07, + "loss": 0.0814, + "step": 7100 + }, + { + "epoch": 2.3010369410239795, + "grad_norm": 0.9626676440238953, + "learning_rate": 6.792556949551426e-07, + "loss": 0.0868, + "step": 7101 + }, + { + "epoch": 2.301360985093973, + "grad_norm": 0.7938775420188904, + "learning_rate": 6.786564716325441e-07, + "loss": 0.0731, + "step": 7102 + }, + { + "epoch": 2.3016850291639663, + "grad_norm": 0.8719715476036072, + "learning_rate": 6.780574712247632e-07, + "loss": 0.0738, + "step": 7103 + }, + { + "epoch": 2.3020090732339598, + "grad_norm": 0.8483251333236694, + "learning_rate": 6.774586938051084e-07, + "loss": 0.0767, + "step": 7104 + }, + { + "epoch": 2.302333117303953, + "grad_norm": 0.8604863286018372, + "learning_rate": 6.768601394468674e-07, + "loss": 0.0791, + "step": 7105 + }, + { + "epoch": 2.302657161373947, + "grad_norm": 0.8614899516105652, + "learning_rate": 6.762618082232952e-07, + "loss": 0.0803, + "step": 7106 + }, + { + "epoch": 2.3029812054439405, + "grad_norm": 0.8272896409034729, + "learning_rate": 6.756637002076225e-07, + "loss": 0.0766, + "step": 7107 + }, + { + "epoch": 2.303305249513934, + "grad_norm": 0.9013105034828186, + "learning_rate": 6.750658154730522e-07, + "loss": 0.0779, + "step": 7108 + }, + { + "epoch": 2.3036292935839273, + "grad_norm": 0.9156434535980225, + "learning_rate": 6.744681540927588e-07, + "loss": 0.0818, + "step": 7109 + }, + { + "epoch": 2.3039533376539207, + "grad_norm": 0.8647904992103577, + "learning_rate": 6.738707161398914e-07, + "loss": 0.0819, + "step": 7110 + }, + { + "epoch": 2.3042773817239146, + "grad_norm": 0.9575430750846863, + "learning_rate": 6.732735016875697e-07, + "loss": 0.0802, + "step": 7111 + }, + { + "epoch": 2.304601425793908, + "grad_norm": 0.8224124312400818, + "learning_rate": 6.726765108088881e-07, + "loss": 0.0706, + "step": 7112 + }, + { + "epoch": 2.3049254698639015, + "grad_norm": 0.8639494180679321, + "learning_rate": 6.720797435769111e-07, + "loss": 0.0799, + "step": 7113 + }, + { + "epoch": 2.305249513933895, + "grad_norm": 0.9154525995254517, + "learning_rate": 6.714832000646778e-07, + "loss": 0.0777, + "step": 7114 + }, + { + "epoch": 2.3055735580038883, + "grad_norm": 0.8499510884284973, + "learning_rate": 6.708868803451992e-07, + "loss": 0.0753, + "step": 7115 + }, + { + "epoch": 2.305897602073882, + "grad_norm": 0.8317762613296509, + "learning_rate": 6.702907844914597e-07, + "loss": 0.0748, + "step": 7116 + }, + { + "epoch": 2.3062216461438756, + "grad_norm": 0.9097682237625122, + "learning_rate": 6.696949125764149e-07, + "loss": 0.082, + "step": 7117 + }, + { + "epoch": 2.306545690213869, + "grad_norm": 0.8897013068199158, + "learning_rate": 6.690992646729949e-07, + "loss": 0.0825, + "step": 7118 + }, + { + "epoch": 2.3068697342838624, + "grad_norm": 0.8689218163490295, + "learning_rate": 6.685038408540989e-07, + "loss": 0.0808, + "step": 7119 + }, + { + "epoch": 2.3071937783538563, + "grad_norm": 0.8567068576812744, + "learning_rate": 6.679086411926039e-07, + "loss": 0.0765, + "step": 7120 + }, + { + "epoch": 2.3075178224238497, + "grad_norm": 0.9129802584648132, + "learning_rate": 6.673136657613547e-07, + "loss": 0.0805, + "step": 7121 + }, + { + "epoch": 2.307841866493843, + "grad_norm": 0.9538443684577942, + "learning_rate": 6.667189146331707e-07, + "loss": 0.0819, + "step": 7122 + }, + { + "epoch": 2.3081659105638366, + "grad_norm": 0.8463201522827148, + "learning_rate": 6.661243878808443e-07, + "loss": 0.0706, + "step": 7123 + }, + { + "epoch": 2.3084899546338304, + "grad_norm": 0.8989773988723755, + "learning_rate": 6.655300855771393e-07, + "loss": 0.0759, + "step": 7124 + }, + { + "epoch": 2.308813998703824, + "grad_norm": 0.9400907158851624, + "learning_rate": 6.649360077947939e-07, + "loss": 0.0796, + "step": 7125 + }, + { + "epoch": 2.3091380427738173, + "grad_norm": 0.8975268602371216, + "learning_rate": 6.643421546065146e-07, + "loss": 0.0813, + "step": 7126 + }, + { + "epoch": 2.3094620868438107, + "grad_norm": 0.9153731465339661, + "learning_rate": 6.637485260849866e-07, + "loss": 0.0739, + "step": 7127 + }, + { + "epoch": 2.309786130913804, + "grad_norm": 0.9203081130981445, + "learning_rate": 6.63155122302861e-07, + "loss": 0.0748, + "step": 7128 + }, + { + "epoch": 2.310110174983798, + "grad_norm": 0.8517372012138367, + "learning_rate": 6.625619433327681e-07, + "loss": 0.0753, + "step": 7129 + }, + { + "epoch": 2.3104342190537914, + "grad_norm": 0.8861249089241028, + "learning_rate": 6.619689892473046e-07, + "loss": 0.078, + "step": 7130 + }, + { + "epoch": 2.310758263123785, + "grad_norm": 0.9046693444252014, + "learning_rate": 6.613762601190435e-07, + "loss": 0.0819, + "step": 7131 + }, + { + "epoch": 2.3110823071937783, + "grad_norm": 0.8858131170272827, + "learning_rate": 6.60783756020529e-07, + "loss": 0.0739, + "step": 7132 + }, + { + "epoch": 2.3114063512637717, + "grad_norm": 0.9197781085968018, + "learning_rate": 6.601914770242776e-07, + "loss": 0.0803, + "step": 7133 + }, + { + "epoch": 2.3117303953337656, + "grad_norm": 0.7731859683990479, + "learning_rate": 6.595994232027794e-07, + "loss": 0.0658, + "step": 7134 + }, + { + "epoch": 2.312054439403759, + "grad_norm": 0.8590306639671326, + "learning_rate": 6.590075946284941e-07, + "loss": 0.0697, + "step": 7135 + }, + { + "epoch": 2.3123784834737524, + "grad_norm": 0.9065210223197937, + "learning_rate": 6.584159913738583e-07, + "loss": 0.0851, + "step": 7136 + }, + { + "epoch": 2.312702527543746, + "grad_norm": 0.8334217667579651, + "learning_rate": 6.578246135112765e-07, + "loss": 0.0666, + "step": 7137 + }, + { + "epoch": 2.3130265716137393, + "grad_norm": 0.8901119232177734, + "learning_rate": 6.572334611131284e-07, + "loss": 0.0822, + "step": 7138 + }, + { + "epoch": 2.313350615683733, + "grad_norm": 0.9318228960037231, + "learning_rate": 6.566425342517652e-07, + "loss": 0.0764, + "step": 7139 + }, + { + "epoch": 2.3136746597537265, + "grad_norm": 0.8824783563613892, + "learning_rate": 6.560518329995108e-07, + "loss": 0.0824, + "step": 7140 + }, + { + "epoch": 2.31399870382372, + "grad_norm": 0.8565016984939575, + "learning_rate": 6.554613574286614e-07, + "loss": 0.0759, + "step": 7141 + }, + { + "epoch": 2.3143227478937134, + "grad_norm": 0.9348742961883545, + "learning_rate": 6.548711076114858e-07, + "loss": 0.0855, + "step": 7142 + }, + { + "epoch": 2.314646791963707, + "grad_norm": 0.8648019433021545, + "learning_rate": 6.542810836202237e-07, + "loss": 0.0764, + "step": 7143 + }, + { + "epoch": 2.3149708360337007, + "grad_norm": 0.9514819979667664, + "learning_rate": 6.536912855270894e-07, + "loss": 0.0861, + "step": 7144 + }, + { + "epoch": 2.315294880103694, + "grad_norm": 0.8484213948249817, + "learning_rate": 6.531017134042678e-07, + "loss": 0.0725, + "step": 7145 + }, + { + "epoch": 2.3156189241736875, + "grad_norm": 0.850391149520874, + "learning_rate": 6.52512367323917e-07, + "loss": 0.0698, + "step": 7146 + }, + { + "epoch": 2.315942968243681, + "grad_norm": 0.8870947360992432, + "learning_rate": 6.519232473581675e-07, + "loss": 0.081, + "step": 7147 + }, + { + "epoch": 2.316267012313675, + "grad_norm": 0.9229629635810852, + "learning_rate": 6.513343535791216e-07, + "loss": 0.0778, + "step": 7148 + }, + { + "epoch": 2.3165910563836682, + "grad_norm": 0.9507783651351929, + "learning_rate": 6.507456860588554e-07, + "loss": 0.0783, + "step": 7149 + }, + { + "epoch": 2.3169151004536617, + "grad_norm": 0.8440315127372742, + "learning_rate": 6.501572448694135e-07, + "loss": 0.0726, + "step": 7150 + }, + { + "epoch": 2.317239144523655, + "grad_norm": 0.8955333232879639, + "learning_rate": 6.495690300828183e-07, + "loss": 0.0771, + "step": 7151 + }, + { + "epoch": 2.317563188593649, + "grad_norm": 0.8333034515380859, + "learning_rate": 6.489810417710596e-07, + "loss": 0.0721, + "step": 7152 + }, + { + "epoch": 2.3178872326636424, + "grad_norm": 0.8312009572982788, + "learning_rate": 6.483932800061021e-07, + "loss": 0.0732, + "step": 7153 + }, + { + "epoch": 2.318211276733636, + "grad_norm": 0.9005463719367981, + "learning_rate": 6.478057448598821e-07, + "loss": 0.0803, + "step": 7154 + }, + { + "epoch": 2.3185353208036292, + "grad_norm": 0.9170337915420532, + "learning_rate": 6.472184364043085e-07, + "loss": 0.0806, + "step": 7155 + }, + { + "epoch": 2.3188593648736227, + "grad_norm": 0.9278320074081421, + "learning_rate": 6.466313547112627e-07, + "loss": 0.0755, + "step": 7156 + }, + { + "epoch": 2.3191834089436165, + "grad_norm": 0.8766685128211975, + "learning_rate": 6.460444998525953e-07, + "loss": 0.0784, + "step": 7157 + }, + { + "epoch": 2.31950745301361, + "grad_norm": 0.9274651408195496, + "learning_rate": 6.454578719001353e-07, + "loss": 0.0803, + "step": 7158 + }, + { + "epoch": 2.3198314970836034, + "grad_norm": 0.9497063755989075, + "learning_rate": 6.448714709256768e-07, + "loss": 0.0826, + "step": 7159 + }, + { + "epoch": 2.320155541153597, + "grad_norm": 0.9060593843460083, + "learning_rate": 6.442852970009925e-07, + "loss": 0.0832, + "step": 7160 + }, + { + "epoch": 2.32047958522359, + "grad_norm": 0.8644150495529175, + "learning_rate": 6.436993501978226e-07, + "loss": 0.0739, + "step": 7161 + }, + { + "epoch": 2.320803629293584, + "grad_norm": 0.9196017980575562, + "learning_rate": 6.431136305878819e-07, + "loss": 0.0771, + "step": 7162 + }, + { + "epoch": 2.3211276733635775, + "grad_norm": 0.8810617327690125, + "learning_rate": 6.425281382428566e-07, + "loss": 0.081, + "step": 7163 + }, + { + "epoch": 2.321451717433571, + "grad_norm": 0.8554599285125732, + "learning_rate": 6.419428732344055e-07, + "loss": 0.0773, + "step": 7164 + }, + { + "epoch": 2.3217757615035644, + "grad_norm": 0.8686216473579407, + "learning_rate": 6.413578356341602e-07, + "loss": 0.075, + "step": 7165 + }, + { + "epoch": 2.3220998055735578, + "grad_norm": 0.8910344839096069, + "learning_rate": 6.407730255137212e-07, + "loss": 0.0846, + "step": 7166 + }, + { + "epoch": 2.3224238496435516, + "grad_norm": 0.909019947052002, + "learning_rate": 6.401884429446667e-07, + "loss": 0.0808, + "step": 7167 + }, + { + "epoch": 2.322747893713545, + "grad_norm": 0.9781714677810669, + "learning_rate": 6.396040879985416e-07, + "loss": 0.0804, + "step": 7168 + }, + { + "epoch": 2.3230719377835385, + "grad_norm": 0.8202382922172546, + "learning_rate": 6.390199607468661e-07, + "loss": 0.0725, + "step": 7169 + }, + { + "epoch": 2.323395981853532, + "grad_norm": 0.8984230160713196, + "learning_rate": 6.384360612611317e-07, + "loss": 0.0815, + "step": 7170 + }, + { + "epoch": 2.323720025923526, + "grad_norm": 0.8639374375343323, + "learning_rate": 6.378523896128022e-07, + "loss": 0.0767, + "step": 7171 + }, + { + "epoch": 2.324044069993519, + "grad_norm": 0.8666595816612244, + "learning_rate": 6.37268945873313e-07, + "loss": 0.0745, + "step": 7172 + }, + { + "epoch": 2.3243681140635126, + "grad_norm": 0.8796083927154541, + "learning_rate": 6.36685730114073e-07, + "loss": 0.0782, + "step": 7173 + }, + { + "epoch": 2.324692158133506, + "grad_norm": 0.956018328666687, + "learning_rate": 6.361027424064609e-07, + "loss": 0.0861, + "step": 7174 + }, + { + "epoch": 2.3250162022035, + "grad_norm": 0.9067574143409729, + "learning_rate": 6.355199828218289e-07, + "loss": 0.0766, + "step": 7175 + }, + { + "epoch": 2.3253402462734933, + "grad_norm": 0.9012507200241089, + "learning_rate": 6.349374514315015e-07, + "loss": 0.0801, + "step": 7176 + }, + { + "epoch": 2.3256642903434868, + "grad_norm": 0.8362342119216919, + "learning_rate": 6.343551483067751e-07, + "loss": 0.0734, + "step": 7177 + }, + { + "epoch": 2.32598833441348, + "grad_norm": 0.8647367358207703, + "learning_rate": 6.337730735189174e-07, + "loss": 0.0754, + "step": 7178 + }, + { + "epoch": 2.3263123784834736, + "grad_norm": 0.9567950367927551, + "learning_rate": 6.331912271391688e-07, + "loss": 0.0785, + "step": 7179 + }, + { + "epoch": 2.3266364225534675, + "grad_norm": 0.8517213463783264, + "learning_rate": 6.326096092387429e-07, + "loss": 0.0775, + "step": 7180 + }, + { + "epoch": 2.326960466623461, + "grad_norm": 0.8583937287330627, + "learning_rate": 6.320282198888217e-07, + "loss": 0.0757, + "step": 7181 + }, + { + "epoch": 2.3272845106934543, + "grad_norm": 0.919492244720459, + "learning_rate": 6.314470591605646e-07, + "loss": 0.0824, + "step": 7182 + }, + { + "epoch": 2.3276085547634477, + "grad_norm": 0.8579474091529846, + "learning_rate": 6.308661271250974e-07, + "loss": 0.0771, + "step": 7183 + }, + { + "epoch": 2.327932598833441, + "grad_norm": 0.9200006723403931, + "learning_rate": 6.302854238535219e-07, + "loss": 0.0801, + "step": 7184 + }, + { + "epoch": 2.328256642903435, + "grad_norm": 0.8708798885345459, + "learning_rate": 6.2970494941691e-07, + "loss": 0.0802, + "step": 7185 + }, + { + "epoch": 2.3285806869734285, + "grad_norm": 0.8774047493934631, + "learning_rate": 6.291247038863066e-07, + "loss": 0.0805, + "step": 7186 + }, + { + "epoch": 2.328904731043422, + "grad_norm": 0.866742730140686, + "learning_rate": 6.285446873327289e-07, + "loss": 0.0756, + "step": 7187 + }, + { + "epoch": 2.3292287751134153, + "grad_norm": 0.9446413516998291, + "learning_rate": 6.279648998271626e-07, + "loss": 0.0804, + "step": 7188 + }, + { + "epoch": 2.3295528191834087, + "grad_norm": 0.8982147574424744, + "learning_rate": 6.273853414405715e-07, + "loss": 0.0781, + "step": 7189 + }, + { + "epoch": 2.3298768632534026, + "grad_norm": 0.8736512660980225, + "learning_rate": 6.268060122438846e-07, + "loss": 0.0783, + "step": 7190 + }, + { + "epoch": 2.330200907323396, + "grad_norm": 0.9103041291236877, + "learning_rate": 6.262269123080095e-07, + "loss": 0.0775, + "step": 7191 + }, + { + "epoch": 2.3305249513933894, + "grad_norm": 0.9528464078903198, + "learning_rate": 6.256480417038202e-07, + "loss": 0.081, + "step": 7192 + }, + { + "epoch": 2.330848995463383, + "grad_norm": 0.82795649766922, + "learning_rate": 6.250694005021651e-07, + "loss": 0.0743, + "step": 7193 + }, + { + "epoch": 2.3311730395333763, + "grad_norm": 1.019910454750061, + "learning_rate": 6.244909887738651e-07, + "loss": 0.0845, + "step": 7194 + }, + { + "epoch": 2.33149708360337, + "grad_norm": 0.8735139966011047, + "learning_rate": 6.239128065897113e-07, + "loss": 0.0805, + "step": 7195 + }, + { + "epoch": 2.3318211276733636, + "grad_norm": 0.8908354043960571, + "learning_rate": 6.233348540204689e-07, + "loss": 0.0829, + "step": 7196 + }, + { + "epoch": 2.332145171743357, + "grad_norm": 0.9360215663909912, + "learning_rate": 6.227571311368724e-07, + "loss": 0.0812, + "step": 7197 + }, + { + "epoch": 2.3324692158133504, + "grad_norm": 0.8365440368652344, + "learning_rate": 6.221796380096298e-07, + "loss": 0.0698, + "step": 7198 + }, + { + "epoch": 2.3327932598833443, + "grad_norm": 0.8395832777023315, + "learning_rate": 6.216023747094207e-07, + "loss": 0.0766, + "step": 7199 + }, + { + "epoch": 2.3331173039533377, + "grad_norm": 0.8404845595359802, + "learning_rate": 6.210253413068964e-07, + "loss": 0.07, + "step": 7200 + }, + { + "epoch": 2.333441348023331, + "grad_norm": 0.9587187767028809, + "learning_rate": 6.20448537872681e-07, + "loss": 0.0864, + "step": 7201 + }, + { + "epoch": 2.3337653920933246, + "grad_norm": 0.8404291868209839, + "learning_rate": 6.198719644773687e-07, + "loss": 0.0724, + "step": 7202 + }, + { + "epoch": 2.3340894361633184, + "grad_norm": 0.8552994132041931, + "learning_rate": 6.192956211915269e-07, + "loss": 0.0766, + "step": 7203 + }, + { + "epoch": 2.334413480233312, + "grad_norm": 0.8210543990135193, + "learning_rate": 6.187195080856953e-07, + "loss": 0.0723, + "step": 7204 + }, + { + "epoch": 2.3347375243033053, + "grad_norm": 0.9449579119682312, + "learning_rate": 6.181436252303829e-07, + "loss": 0.0801, + "step": 7205 + }, + { + "epoch": 2.3350615683732987, + "grad_norm": 0.8949571251869202, + "learning_rate": 6.175679726960731e-07, + "loss": 0.0786, + "step": 7206 + }, + { + "epoch": 2.335385612443292, + "grad_norm": 0.8811914920806885, + "learning_rate": 6.169925505532201e-07, + "loss": 0.0784, + "step": 7207 + }, + { + "epoch": 2.335709656513286, + "grad_norm": 0.8562451004981995, + "learning_rate": 6.164173588722497e-07, + "loss": 0.0728, + "step": 7208 + }, + { + "epoch": 2.3360337005832794, + "grad_norm": 0.895576536655426, + "learning_rate": 6.158423977235611e-07, + "loss": 0.0799, + "step": 7209 + }, + { + "epoch": 2.336357744653273, + "grad_norm": 0.8703263401985168, + "learning_rate": 6.152676671775215e-07, + "loss": 0.0737, + "step": 7210 + }, + { + "epoch": 2.3366817887232663, + "grad_norm": 0.9055336713790894, + "learning_rate": 6.146931673044751e-07, + "loss": 0.0796, + "step": 7211 + }, + { + "epoch": 2.3370058327932597, + "grad_norm": 0.9409792423248291, + "learning_rate": 6.141188981747323e-07, + "loss": 0.0804, + "step": 7212 + }, + { + "epoch": 2.3373298768632536, + "grad_norm": 0.8985840082168579, + "learning_rate": 6.135448598585814e-07, + "loss": 0.081, + "step": 7213 + }, + { + "epoch": 2.337653920933247, + "grad_norm": 0.8504696488380432, + "learning_rate": 6.129710524262758e-07, + "loss": 0.0721, + "step": 7214 + }, + { + "epoch": 2.3379779650032404, + "grad_norm": 0.9190945625305176, + "learning_rate": 6.123974759480469e-07, + "loss": 0.0776, + "step": 7215 + }, + { + "epoch": 2.338302009073234, + "grad_norm": 0.9012274742126465, + "learning_rate": 6.118241304940928e-07, + "loss": 0.0747, + "step": 7216 + }, + { + "epoch": 2.3386260531432272, + "grad_norm": 0.8527746796607971, + "learning_rate": 6.112510161345861e-07, + "loss": 0.0742, + "step": 7217 + }, + { + "epoch": 2.338950097213221, + "grad_norm": 0.9016045928001404, + "learning_rate": 6.106781329396714e-07, + "loss": 0.0717, + "step": 7218 + }, + { + "epoch": 2.3392741412832145, + "grad_norm": 0.8989761471748352, + "learning_rate": 6.101054809794615e-07, + "loss": 0.0788, + "step": 7219 + }, + { + "epoch": 2.339598185353208, + "grad_norm": 0.9128497242927551, + "learning_rate": 6.095330603240468e-07, + "loss": 0.0778, + "step": 7220 + }, + { + "epoch": 2.3399222294232014, + "grad_norm": 1.0037459135055542, + "learning_rate": 6.089608710434836e-07, + "loss": 0.0694, + "step": 7221 + }, + { + "epoch": 2.3402462734931953, + "grad_norm": 0.7942954897880554, + "learning_rate": 6.083889132078033e-07, + "loss": 0.0699, + "step": 7222 + }, + { + "epoch": 2.3405703175631887, + "grad_norm": 0.949350893497467, + "learning_rate": 6.078171868870075e-07, + "loss": 0.0783, + "step": 7223 + }, + { + "epoch": 2.340894361633182, + "grad_norm": 0.8463143706321716, + "learning_rate": 6.072456921510703e-07, + "loss": 0.0696, + "step": 7224 + }, + { + "epoch": 2.3412184057031755, + "grad_norm": 0.9158247113227844, + "learning_rate": 6.066744290699372e-07, + "loss": 0.078, + "step": 7225 + }, + { + "epoch": 2.3415424497731694, + "grad_norm": 0.8678821325302124, + "learning_rate": 6.061033977135253e-07, + "loss": 0.0799, + "step": 7226 + }, + { + "epoch": 2.341866493843163, + "grad_norm": 0.9464151263237, + "learning_rate": 6.055325981517238e-07, + "loss": 0.0845, + "step": 7227 + }, + { + "epoch": 2.3421905379131562, + "grad_norm": 0.8548157811164856, + "learning_rate": 6.049620304543916e-07, + "loss": 0.0744, + "step": 7228 + }, + { + "epoch": 2.3425145819831497, + "grad_norm": 0.9701404571533203, + "learning_rate": 6.043916946913613e-07, + "loss": 0.0882, + "step": 7229 + }, + { + "epoch": 2.342838626053143, + "grad_norm": 0.8720276951789856, + "learning_rate": 6.038215909324372e-07, + "loss": 0.0744, + "step": 7230 + }, + { + "epoch": 2.343162670123137, + "grad_norm": 0.8316831588745117, + "learning_rate": 6.032517192473935e-07, + "loss": 0.0749, + "step": 7231 + }, + { + "epoch": 2.3434867141931304, + "grad_norm": 0.8839612007141113, + "learning_rate": 6.026820797059777e-07, + "loss": 0.0755, + "step": 7232 + }, + { + "epoch": 2.343810758263124, + "grad_norm": 0.9230669736862183, + "learning_rate": 6.021126723779075e-07, + "loss": 0.085, + "step": 7233 + }, + { + "epoch": 2.344134802333117, + "grad_norm": 0.8618535399436951, + "learning_rate": 6.015434973328735e-07, + "loss": 0.0746, + "step": 7234 + }, + { + "epoch": 2.3444588464031106, + "grad_norm": 0.9741597771644592, + "learning_rate": 6.009745546405377e-07, + "loss": 0.0699, + "step": 7235 + }, + { + "epoch": 2.3447828904731045, + "grad_norm": 0.8314618468284607, + "learning_rate": 6.00405844370531e-07, + "loss": 0.0711, + "step": 7236 + }, + { + "epoch": 2.345106934543098, + "grad_norm": 0.9069364070892334, + "learning_rate": 5.998373665924606e-07, + "loss": 0.0817, + "step": 7237 + }, + { + "epoch": 2.3454309786130914, + "grad_norm": 0.8206673264503479, + "learning_rate": 5.992691213759011e-07, + "loss": 0.071, + "step": 7238 + }, + { + "epoch": 2.345755022683085, + "grad_norm": 0.8871822357177734, + "learning_rate": 5.987011087904007e-07, + "loss": 0.0785, + "step": 7239 + }, + { + "epoch": 2.346079066753078, + "grad_norm": 0.8879174590110779, + "learning_rate": 5.981333289054792e-07, + "loss": 0.0764, + "step": 7240 + }, + { + "epoch": 2.346403110823072, + "grad_norm": 0.884364664554596, + "learning_rate": 5.975657817906253e-07, + "loss": 0.0777, + "step": 7241 + }, + { + "epoch": 2.3467271548930655, + "grad_norm": 0.8927558660507202, + "learning_rate": 5.96998467515304e-07, + "loss": 0.0831, + "step": 7242 + }, + { + "epoch": 2.347051198963059, + "grad_norm": 0.8751205205917358, + "learning_rate": 5.964313861489466e-07, + "loss": 0.0721, + "step": 7243 + }, + { + "epoch": 2.3473752430330523, + "grad_norm": 0.9098830819129944, + "learning_rate": 5.958645377609606e-07, + "loss": 0.08, + "step": 7244 + }, + { + "epoch": 2.347699287103046, + "grad_norm": 0.9423938393592834, + "learning_rate": 5.952979224207205e-07, + "loss": 0.0823, + "step": 7245 + }, + { + "epoch": 2.3480233311730396, + "grad_norm": 0.8223200440406799, + "learning_rate": 5.947315401975773e-07, + "loss": 0.0715, + "step": 7246 + }, + { + "epoch": 2.348347375243033, + "grad_norm": 0.8894121646881104, + "learning_rate": 5.941653911608486e-07, + "loss": 0.0779, + "step": 7247 + }, + { + "epoch": 2.3486714193130265, + "grad_norm": 0.8771853446960449, + "learning_rate": 5.935994753798258e-07, + "loss": 0.0764, + "step": 7248 + }, + { + "epoch": 2.34899546338302, + "grad_norm": 0.8687610030174255, + "learning_rate": 5.930337929237726e-07, + "loss": 0.0744, + "step": 7249 + }, + { + "epoch": 2.3493195074530138, + "grad_norm": 0.9485291242599487, + "learning_rate": 5.924683438619208e-07, + "loss": 0.0777, + "step": 7250 + }, + { + "epoch": 2.349643551523007, + "grad_norm": 0.9002734422683716, + "learning_rate": 5.91903128263479e-07, + "loss": 0.072, + "step": 7251 + }, + { + "epoch": 2.3499675955930006, + "grad_norm": 0.8892845511436462, + "learning_rate": 5.913381461976217e-07, + "loss": 0.0741, + "step": 7252 + }, + { + "epoch": 2.350291639662994, + "grad_norm": 0.9703330397605896, + "learning_rate": 5.907733977334978e-07, + "loss": 0.0824, + "step": 7253 + }, + { + "epoch": 2.350615683732988, + "grad_norm": 0.8633927702903748, + "learning_rate": 5.902088829402274e-07, + "loss": 0.07, + "step": 7254 + }, + { + "epoch": 2.3509397278029813, + "grad_norm": 0.8836617469787598, + "learning_rate": 5.896446018869018e-07, + "loss": 0.0734, + "step": 7255 + }, + { + "epoch": 2.3512637718729748, + "grad_norm": 0.9725565314292908, + "learning_rate": 5.890805546425832e-07, + "loss": 0.0798, + "step": 7256 + }, + { + "epoch": 2.351587815942968, + "grad_norm": 0.930892825126648, + "learning_rate": 5.885167412763051e-07, + "loss": 0.0788, + "step": 7257 + }, + { + "epoch": 2.3519118600129616, + "grad_norm": 0.883513867855072, + "learning_rate": 5.879531618570738e-07, + "loss": 0.0811, + "step": 7258 + }, + { + "epoch": 2.3522359040829555, + "grad_norm": 0.9249931573867798, + "learning_rate": 5.873898164538658e-07, + "loss": 0.0767, + "step": 7259 + }, + { + "epoch": 2.352559948152949, + "grad_norm": 0.8391486406326294, + "learning_rate": 5.868267051356283e-07, + "loss": 0.0742, + "step": 7260 + }, + { + "epoch": 2.3528839922229423, + "grad_norm": 0.9068962335586548, + "learning_rate": 5.86263827971281e-07, + "loss": 0.076, + "step": 7261 + }, + { + "epoch": 2.3532080362929357, + "grad_norm": 0.904548704624176, + "learning_rate": 5.857011850297148e-07, + "loss": 0.0786, + "step": 7262 + }, + { + "epoch": 2.353532080362929, + "grad_norm": 0.8462362289428711, + "learning_rate": 5.851387763797916e-07, + "loss": 0.0788, + "step": 7263 + }, + { + "epoch": 2.353856124432923, + "grad_norm": 0.8829976916313171, + "learning_rate": 5.845766020903459e-07, + "loss": 0.0765, + "step": 7264 + }, + { + "epoch": 2.3541801685029164, + "grad_norm": 0.9879518747329712, + "learning_rate": 5.840146622301796e-07, + "loss": 0.0889, + "step": 7265 + }, + { + "epoch": 2.35450421257291, + "grad_norm": 0.907825767993927, + "learning_rate": 5.834529568680722e-07, + "loss": 0.0775, + "step": 7266 + }, + { + "epoch": 2.3548282566429033, + "grad_norm": 0.8694019913673401, + "learning_rate": 5.828914860727674e-07, + "loss": 0.0776, + "step": 7267 + }, + { + "epoch": 2.3551523007128967, + "grad_norm": 0.8807018995285034, + "learning_rate": 5.823302499129873e-07, + "loss": 0.0794, + "step": 7268 + }, + { + "epoch": 2.3554763447828906, + "grad_norm": 0.8571588397026062, + "learning_rate": 5.817692484574197e-07, + "loss": 0.0733, + "step": 7269 + }, + { + "epoch": 2.355800388852884, + "grad_norm": 0.8664126396179199, + "learning_rate": 5.81208481774726e-07, + "loss": 0.075, + "step": 7270 + }, + { + "epoch": 2.3561244329228774, + "grad_norm": 0.8779671788215637, + "learning_rate": 5.806479499335385e-07, + "loss": 0.075, + "step": 7271 + }, + { + "epoch": 2.356448476992871, + "grad_norm": 0.9118459224700928, + "learning_rate": 5.800876530024615e-07, + "loss": 0.0799, + "step": 7272 + }, + { + "epoch": 2.3567725210628647, + "grad_norm": 0.8571300506591797, + "learning_rate": 5.795275910500703e-07, + "loss": 0.074, + "step": 7273 + }, + { + "epoch": 2.357096565132858, + "grad_norm": 0.8532280325889587, + "learning_rate": 5.789677641449087e-07, + "loss": 0.076, + "step": 7274 + }, + { + "epoch": 2.3574206092028516, + "grad_norm": 0.8807914853096008, + "learning_rate": 5.784081723554971e-07, + "loss": 0.0749, + "step": 7275 + }, + { + "epoch": 2.357744653272845, + "grad_norm": 0.9329232573509216, + "learning_rate": 5.778488157503223e-07, + "loss": 0.0825, + "step": 7276 + }, + { + "epoch": 2.358068697342839, + "grad_norm": 0.8534782528877258, + "learning_rate": 5.772896943978446e-07, + "loss": 0.0748, + "step": 7277 + }, + { + "epoch": 2.3583927414128323, + "grad_norm": 0.9644832015037537, + "learning_rate": 5.767308083664949e-07, + "loss": 0.0815, + "step": 7278 + }, + { + "epoch": 2.3587167854828257, + "grad_norm": 0.9723904132843018, + "learning_rate": 5.761721577246754e-07, + "loss": 0.0836, + "step": 7279 + }, + { + "epoch": 2.359040829552819, + "grad_norm": 0.8773126602172852, + "learning_rate": 5.756137425407598e-07, + "loss": 0.0798, + "step": 7280 + }, + { + "epoch": 2.3593648736228126, + "grad_norm": 0.9173557162284851, + "learning_rate": 5.750555628830928e-07, + "loss": 0.0787, + "step": 7281 + }, + { + "epoch": 2.3596889176928064, + "grad_norm": 0.9228742718696594, + "learning_rate": 5.744976188199905e-07, + "loss": 0.0773, + "step": 7282 + }, + { + "epoch": 2.3600129617628, + "grad_norm": 0.9630638360977173, + "learning_rate": 5.739399104197388e-07, + "loss": 0.0814, + "step": 7283 + }, + { + "epoch": 2.3603370058327933, + "grad_norm": 0.8754711747169495, + "learning_rate": 5.733824377505965e-07, + "loss": 0.0739, + "step": 7284 + }, + { + "epoch": 2.3606610499027867, + "grad_norm": 0.828157365322113, + "learning_rate": 5.728252008807925e-07, + "loss": 0.0729, + "step": 7285 + }, + { + "epoch": 2.36098509397278, + "grad_norm": 0.8452654480934143, + "learning_rate": 5.722681998785273e-07, + "loss": 0.0762, + "step": 7286 + }, + { + "epoch": 2.361309138042774, + "grad_norm": 1.0112296342849731, + "learning_rate": 5.717114348119726e-07, + "loss": 0.0846, + "step": 7287 + }, + { + "epoch": 2.3616331821127674, + "grad_norm": 0.8676653504371643, + "learning_rate": 5.711549057492718e-07, + "loss": 0.0693, + "step": 7288 + }, + { + "epoch": 2.361957226182761, + "grad_norm": 0.8431695699691772, + "learning_rate": 5.705986127585364e-07, + "loss": 0.0721, + "step": 7289 + }, + { + "epoch": 2.3622812702527543, + "grad_norm": 0.8941037058830261, + "learning_rate": 5.700425559078543e-07, + "loss": 0.0732, + "step": 7290 + }, + { + "epoch": 2.3626053143227477, + "grad_norm": 0.8676804304122925, + "learning_rate": 5.694867352652791e-07, + "loss": 0.0797, + "step": 7291 + }, + { + "epoch": 2.3629293583927415, + "grad_norm": 0.8437778949737549, + "learning_rate": 5.689311508988385e-07, + "loss": 0.0774, + "step": 7292 + }, + { + "epoch": 2.363253402462735, + "grad_norm": 0.8584735989570618, + "learning_rate": 5.68375802876531e-07, + "loss": 0.0709, + "step": 7293 + }, + { + "epoch": 2.3635774465327284, + "grad_norm": 0.901317834854126, + "learning_rate": 5.678206912663259e-07, + "loss": 0.0752, + "step": 7294 + }, + { + "epoch": 2.363901490602722, + "grad_norm": 0.9817501306533813, + "learning_rate": 5.672658161361636e-07, + "loss": 0.084, + "step": 7295 + }, + { + "epoch": 2.3642255346727157, + "grad_norm": 0.8984825611114502, + "learning_rate": 5.667111775539538e-07, + "loss": 0.0792, + "step": 7296 + }, + { + "epoch": 2.364549578742709, + "grad_norm": 0.903588056564331, + "learning_rate": 5.661567755875816e-07, + "loss": 0.0761, + "step": 7297 + }, + { + "epoch": 2.3648736228127025, + "grad_norm": 0.9453230500221252, + "learning_rate": 5.656026103048975e-07, + "loss": 0.0767, + "step": 7298 + }, + { + "epoch": 2.365197666882696, + "grad_norm": 0.8390913009643555, + "learning_rate": 5.650486817737291e-07, + "loss": 0.0739, + "step": 7299 + }, + { + "epoch": 2.3655217109526894, + "grad_norm": 0.8238028287887573, + "learning_rate": 5.644949900618696e-07, + "loss": 0.0686, + "step": 7300 + }, + { + "epoch": 2.3658457550226832, + "grad_norm": 0.8137726187705994, + "learning_rate": 5.639415352370858e-07, + "loss": 0.0707, + "step": 7301 + }, + { + "epoch": 2.3661697990926767, + "grad_norm": 0.8957241773605347, + "learning_rate": 5.633883173671159e-07, + "loss": 0.0747, + "step": 7302 + }, + { + "epoch": 2.36649384316267, + "grad_norm": 0.9324427843093872, + "learning_rate": 5.628353365196682e-07, + "loss": 0.0825, + "step": 7303 + }, + { + "epoch": 2.3668178872326635, + "grad_norm": 0.8149759769439697, + "learning_rate": 5.622825927624226e-07, + "loss": 0.0717, + "step": 7304 + }, + { + "epoch": 2.3671419313026574, + "grad_norm": 0.8785021901130676, + "learning_rate": 5.617300861630276e-07, + "loss": 0.0781, + "step": 7305 + }, + { + "epoch": 2.367465975372651, + "grad_norm": 0.9200155735015869, + "learning_rate": 5.611778167891077e-07, + "loss": 0.0813, + "step": 7306 + }, + { + "epoch": 2.3677900194426442, + "grad_norm": 0.9415361881256104, + "learning_rate": 5.60625784708253e-07, + "loss": 0.0797, + "step": 7307 + }, + { + "epoch": 2.3681140635126376, + "grad_norm": 0.8852391839027405, + "learning_rate": 5.600739899880275e-07, + "loss": 0.0765, + "step": 7308 + }, + { + "epoch": 2.368438107582631, + "grad_norm": 0.9021180868148804, + "learning_rate": 5.595224326959662e-07, + "loss": 0.0767, + "step": 7309 + }, + { + "epoch": 2.368762151652625, + "grad_norm": 0.809059202671051, + "learning_rate": 5.589711128995734e-07, + "loss": 0.0706, + "step": 7310 + }, + { + "epoch": 2.3690861957226184, + "grad_norm": 0.9455664753913879, + "learning_rate": 5.584200306663259e-07, + "loss": 0.078, + "step": 7311 + }, + { + "epoch": 2.369410239792612, + "grad_norm": 0.8230124115943909, + "learning_rate": 5.578691860636706e-07, + "loss": 0.0735, + "step": 7312 + }, + { + "epoch": 2.369734283862605, + "grad_norm": 0.8970810174942017, + "learning_rate": 5.573185791590266e-07, + "loss": 0.0771, + "step": 7313 + }, + { + "epoch": 2.3700583279325986, + "grad_norm": 0.8559962511062622, + "learning_rate": 5.567682100197808e-07, + "loss": 0.0742, + "step": 7314 + }, + { + "epoch": 2.3703823720025925, + "grad_norm": 0.769929826259613, + "learning_rate": 5.562180787132945e-07, + "loss": 0.0634, + "step": 7315 + }, + { + "epoch": 2.370706416072586, + "grad_norm": 0.8632463216781616, + "learning_rate": 5.55668185306898e-07, + "loss": 0.0743, + "step": 7316 + }, + { + "epoch": 2.3710304601425793, + "grad_norm": 0.857993483543396, + "learning_rate": 5.551185298678929e-07, + "loss": 0.075, + "step": 7317 + }, + { + "epoch": 2.3713545042125728, + "grad_norm": 0.9158240556716919, + "learning_rate": 5.545691124635518e-07, + "loss": 0.077, + "step": 7318 + }, + { + "epoch": 2.371678548282566, + "grad_norm": 0.9267992377281189, + "learning_rate": 5.54019933161119e-07, + "loss": 0.0814, + "step": 7319 + }, + { + "epoch": 2.37200259235256, + "grad_norm": 0.961631715297699, + "learning_rate": 5.534709920278064e-07, + "loss": 0.0796, + "step": 7320 + }, + { + "epoch": 2.3723266364225535, + "grad_norm": 0.8961342573165894, + "learning_rate": 5.52922289130802e-07, + "loss": 0.0798, + "step": 7321 + }, + { + "epoch": 2.372650680492547, + "grad_norm": 0.9417958855628967, + "learning_rate": 5.523738245372596e-07, + "loss": 0.0781, + "step": 7322 + }, + { + "epoch": 2.3729747245625403, + "grad_norm": 0.9764118194580078, + "learning_rate": 5.518255983143061e-07, + "loss": 0.0829, + "step": 7323 + }, + { + "epoch": 2.373298768632534, + "grad_norm": 0.8964194059371948, + "learning_rate": 5.512776105290402e-07, + "loss": 0.077, + "step": 7324 + }, + { + "epoch": 2.3736228127025276, + "grad_norm": 0.8937922716140747, + "learning_rate": 5.507298612485293e-07, + "loss": 0.0792, + "step": 7325 + }, + { + "epoch": 2.373946856772521, + "grad_norm": 0.8468626737594604, + "learning_rate": 5.501823505398137e-07, + "loss": 0.0732, + "step": 7326 + }, + { + "epoch": 2.3742709008425145, + "grad_norm": 0.8795669674873352, + "learning_rate": 5.496350784699015e-07, + "loss": 0.0729, + "step": 7327 + }, + { + "epoch": 2.3745949449125083, + "grad_norm": 0.9298455715179443, + "learning_rate": 5.490880451057759e-07, + "loss": 0.0791, + "step": 7328 + }, + { + "epoch": 2.3749189889825018, + "grad_norm": 0.8860503435134888, + "learning_rate": 5.485412505143858e-07, + "loss": 0.0755, + "step": 7329 + }, + { + "epoch": 2.375243033052495, + "grad_norm": 0.9270402193069458, + "learning_rate": 5.479946947626566e-07, + "loss": 0.0825, + "step": 7330 + }, + { + "epoch": 2.3755670771224886, + "grad_norm": 0.9208892583847046, + "learning_rate": 5.474483779174791e-07, + "loss": 0.0741, + "step": 7331 + }, + { + "epoch": 2.375891121192482, + "grad_norm": 0.8634201884269714, + "learning_rate": 5.469023000457183e-07, + "loss": 0.0761, + "step": 7332 + }, + { + "epoch": 2.376215165262476, + "grad_norm": 0.8918706178665161, + "learning_rate": 5.463564612142083e-07, + "loss": 0.0763, + "step": 7333 + }, + { + "epoch": 2.3765392093324693, + "grad_norm": 0.9424406886100769, + "learning_rate": 5.458108614897545e-07, + "loss": 0.0848, + "step": 7334 + }, + { + "epoch": 2.3768632534024627, + "grad_norm": 0.8506817817687988, + "learning_rate": 5.452655009391341e-07, + "loss": 0.0749, + "step": 7335 + }, + { + "epoch": 2.377187297472456, + "grad_norm": 0.8940818905830383, + "learning_rate": 5.447203796290918e-07, + "loss": 0.082, + "step": 7336 + }, + { + "epoch": 2.3775113415424496, + "grad_norm": 0.9235643148422241, + "learning_rate": 5.441754976263478e-07, + "loss": 0.0852, + "step": 7337 + }, + { + "epoch": 2.3778353856124435, + "grad_norm": 0.875626266002655, + "learning_rate": 5.436308549975883e-07, + "loss": 0.0747, + "step": 7338 + }, + { + "epoch": 2.378159429682437, + "grad_norm": 0.8669414520263672, + "learning_rate": 5.430864518094731e-07, + "loss": 0.0719, + "step": 7339 + }, + { + "epoch": 2.3784834737524303, + "grad_norm": 0.8698946833610535, + "learning_rate": 5.425422881286319e-07, + "loss": 0.0818, + "step": 7340 + }, + { + "epoch": 2.3788075178224237, + "grad_norm": 0.937790036201477, + "learning_rate": 5.419983640216647e-07, + "loss": 0.0846, + "step": 7341 + }, + { + "epoch": 2.379131561892417, + "grad_norm": 0.8682368397712708, + "learning_rate": 5.414546795551429e-07, + "loss": 0.078, + "step": 7342 + }, + { + "epoch": 2.379455605962411, + "grad_norm": 0.8425775766372681, + "learning_rate": 5.409112347956089e-07, + "loss": 0.0746, + "step": 7343 + }, + { + "epoch": 2.3797796500324044, + "grad_norm": 0.9196928143501282, + "learning_rate": 5.403680298095737e-07, + "loss": 0.0747, + "step": 7344 + }, + { + "epoch": 2.380103694102398, + "grad_norm": 0.8911183476448059, + "learning_rate": 5.398250646635209e-07, + "loss": 0.082, + "step": 7345 + }, + { + "epoch": 2.3804277381723913, + "grad_norm": 0.9871136546134949, + "learning_rate": 5.392823394239042e-07, + "loss": 0.083, + "step": 7346 + }, + { + "epoch": 2.380751782242385, + "grad_norm": 0.8932740688323975, + "learning_rate": 5.387398541571479e-07, + "loss": 0.0797, + "step": 7347 + }, + { + "epoch": 2.3810758263123786, + "grad_norm": 0.902044415473938, + "learning_rate": 5.381976089296467e-07, + "loss": 0.0784, + "step": 7348 + }, + { + "epoch": 2.381399870382372, + "grad_norm": 0.8423619866371155, + "learning_rate": 5.376556038077668e-07, + "loss": 0.0747, + "step": 7349 + }, + { + "epoch": 2.3817239144523654, + "grad_norm": 1.0458214282989502, + "learning_rate": 5.371138388578448e-07, + "loss": 0.0808, + "step": 7350 + }, + { + "epoch": 2.3820479585223593, + "grad_norm": 0.9807311296463013, + "learning_rate": 5.365723141461851e-07, + "loss": 0.0848, + "step": 7351 + }, + { + "epoch": 2.3823720025923527, + "grad_norm": 0.8166460990905762, + "learning_rate": 5.360310297390681e-07, + "loss": 0.0718, + "step": 7352 + }, + { + "epoch": 2.382696046662346, + "grad_norm": 0.8361016511917114, + "learning_rate": 5.354899857027398e-07, + "loss": 0.0745, + "step": 7353 + }, + { + "epoch": 2.3830200907323396, + "grad_norm": 0.8787776827812195, + "learning_rate": 5.349491821034192e-07, + "loss": 0.0782, + "step": 7354 + }, + { + "epoch": 2.383344134802333, + "grad_norm": 0.8866326212882996, + "learning_rate": 5.344086190072955e-07, + "loss": 0.0734, + "step": 7355 + }, + { + "epoch": 2.383668178872327, + "grad_norm": 0.9183595776557922, + "learning_rate": 5.338682964805286e-07, + "loss": 0.0826, + "step": 7356 + }, + { + "epoch": 2.3839922229423203, + "grad_norm": 0.8560160398483276, + "learning_rate": 5.333282145892493e-07, + "loss": 0.0719, + "step": 7357 + }, + { + "epoch": 2.3843162670123137, + "grad_norm": 0.8655824065208435, + "learning_rate": 5.327883733995562e-07, + "loss": 0.0793, + "step": 7358 + }, + { + "epoch": 2.384640311082307, + "grad_norm": 0.9186474084854126, + "learning_rate": 5.322487729775233e-07, + "loss": 0.0791, + "step": 7359 + }, + { + "epoch": 2.3849643551523005, + "grad_norm": 0.8937341570854187, + "learning_rate": 5.317094133891903e-07, + "loss": 0.0776, + "step": 7360 + }, + { + "epoch": 2.3852883992222944, + "grad_norm": 0.9676603078842163, + "learning_rate": 5.311702947005718e-07, + "loss": 0.0764, + "step": 7361 + }, + { + "epoch": 2.385612443292288, + "grad_norm": 0.8829269409179688, + "learning_rate": 5.306314169776486e-07, + "loss": 0.0728, + "step": 7362 + }, + { + "epoch": 2.3859364873622813, + "grad_norm": 0.9241729378700256, + "learning_rate": 5.30092780286375e-07, + "loss": 0.0774, + "step": 7363 + }, + { + "epoch": 2.3862605314322747, + "grad_norm": 0.8636072278022766, + "learning_rate": 5.295543846926752e-07, + "loss": 0.077, + "step": 7364 + }, + { + "epoch": 2.386584575502268, + "grad_norm": 0.882157564163208, + "learning_rate": 5.290162302624433e-07, + "loss": 0.0787, + "step": 7365 + }, + { + "epoch": 2.386908619572262, + "grad_norm": 0.9587984681129456, + "learning_rate": 5.284783170615446e-07, + "loss": 0.086, + "step": 7366 + }, + { + "epoch": 2.3872326636422554, + "grad_norm": 0.8575683832168579, + "learning_rate": 5.279406451558136e-07, + "loss": 0.0726, + "step": 7367 + }, + { + "epoch": 2.387556707712249, + "grad_norm": 0.8097689151763916, + "learning_rate": 5.274032146110567e-07, + "loss": 0.071, + "step": 7368 + }, + { + "epoch": 2.3878807517822422, + "grad_norm": 0.9363576769828796, + "learning_rate": 5.268660254930499e-07, + "loss": 0.0774, + "step": 7369 + }, + { + "epoch": 2.3882047958522357, + "grad_norm": 0.8731128573417664, + "learning_rate": 5.263290778675401e-07, + "loss": 0.0754, + "step": 7370 + }, + { + "epoch": 2.3885288399222295, + "grad_norm": 0.9303240180015564, + "learning_rate": 5.257923718002447e-07, + "loss": 0.0767, + "step": 7371 + }, + { + "epoch": 2.388852883992223, + "grad_norm": 0.88297039270401, + "learning_rate": 5.252559073568514e-07, + "loss": 0.0738, + "step": 7372 + }, + { + "epoch": 2.3891769280622164, + "grad_norm": 0.8268675208091736, + "learning_rate": 5.247196846030178e-07, + "loss": 0.0696, + "step": 7373 + }, + { + "epoch": 2.38950097213221, + "grad_norm": 0.9614165425300598, + "learning_rate": 5.241837036043731e-07, + "loss": 0.0773, + "step": 7374 + }, + { + "epoch": 2.3898250162022037, + "grad_norm": 0.9003193974494934, + "learning_rate": 5.236479644265153e-07, + "loss": 0.0764, + "step": 7375 + }, + { + "epoch": 2.390149060272197, + "grad_norm": 0.8745166063308716, + "learning_rate": 5.231124671350141e-07, + "loss": 0.0763, + "step": 7376 + }, + { + "epoch": 2.3904731043421905, + "grad_norm": 0.8800874948501587, + "learning_rate": 5.225772117954089e-07, + "loss": 0.0757, + "step": 7377 + }, + { + "epoch": 2.390797148412184, + "grad_norm": 0.8205560445785522, + "learning_rate": 5.220421984732104e-07, + "loss": 0.0746, + "step": 7378 + }, + { + "epoch": 2.391121192482178, + "grad_norm": 0.9015682935714722, + "learning_rate": 5.215074272338986e-07, + "loss": 0.0766, + "step": 7379 + }, + { + "epoch": 2.3914452365521712, + "grad_norm": 0.8613619208335876, + "learning_rate": 5.20972898142924e-07, + "loss": 0.0771, + "step": 7380 + }, + { + "epoch": 2.3917692806221647, + "grad_norm": 0.8701586127281189, + "learning_rate": 5.204386112657095e-07, + "loss": 0.0797, + "step": 7381 + }, + { + "epoch": 2.392093324692158, + "grad_norm": 0.8837924599647522, + "learning_rate": 5.199045666676436e-07, + "loss": 0.0712, + "step": 7382 + }, + { + "epoch": 2.3924173687621515, + "grad_norm": 0.9213774800300598, + "learning_rate": 5.193707644140913e-07, + "loss": 0.085, + "step": 7383 + }, + { + "epoch": 2.3927414128321454, + "grad_norm": 0.8316073417663574, + "learning_rate": 5.188372045703824e-07, + "loss": 0.069, + "step": 7384 + }, + { + "epoch": 2.393065456902139, + "grad_norm": 0.8606321811676025, + "learning_rate": 5.183038872018215e-07, + "loss": 0.0767, + "step": 7385 + }, + { + "epoch": 2.393389500972132, + "grad_norm": 0.9383647441864014, + "learning_rate": 5.1777081237368e-07, + "loss": 0.0792, + "step": 7386 + }, + { + "epoch": 2.3937135450421256, + "grad_norm": 0.8869799971580505, + "learning_rate": 5.172379801512014e-07, + "loss": 0.0786, + "step": 7387 + }, + { + "epoch": 2.394037589112119, + "grad_norm": 0.8742722272872925, + "learning_rate": 5.167053905996003e-07, + "loss": 0.0808, + "step": 7388 + }, + { + "epoch": 2.394361633182113, + "grad_norm": 0.960090696811676, + "learning_rate": 5.161730437840585e-07, + "loss": 0.0822, + "step": 7389 + }, + { + "epoch": 2.3946856772521063, + "grad_norm": 0.8900645971298218, + "learning_rate": 5.15640939769732e-07, + "loss": 0.0793, + "step": 7390 + }, + { + "epoch": 2.3950097213220998, + "grad_norm": 0.8405351638793945, + "learning_rate": 5.151090786217433e-07, + "loss": 0.0768, + "step": 7391 + }, + { + "epoch": 2.395333765392093, + "grad_norm": 0.8822963237762451, + "learning_rate": 5.145774604051895e-07, + "loss": 0.0766, + "step": 7392 + }, + { + "epoch": 2.3956578094620866, + "grad_norm": 1.0235251188278198, + "learning_rate": 5.140460851851336e-07, + "loss": 0.0812, + "step": 7393 + }, + { + "epoch": 2.3959818535320805, + "grad_norm": 0.9306867122650146, + "learning_rate": 5.135149530266112e-07, + "loss": 0.0772, + "step": 7394 + }, + { + "epoch": 2.396305897602074, + "grad_norm": 0.9007172584533691, + "learning_rate": 5.129840639946279e-07, + "loss": 0.0726, + "step": 7395 + }, + { + "epoch": 2.3966299416720673, + "grad_norm": 0.8892046213150024, + "learning_rate": 5.124534181541596e-07, + "loss": 0.0761, + "step": 7396 + }, + { + "epoch": 2.3969539857420608, + "grad_norm": 0.9353534579277039, + "learning_rate": 5.119230155701515e-07, + "loss": 0.0809, + "step": 7397 + }, + { + "epoch": 2.3972780298120546, + "grad_norm": 0.9226713180541992, + "learning_rate": 5.113928563075213e-07, + "loss": 0.0803, + "step": 7398 + }, + { + "epoch": 2.397602073882048, + "grad_norm": 0.9027561545372009, + "learning_rate": 5.108629404311535e-07, + "loss": 0.0785, + "step": 7399 + }, + { + "epoch": 2.3979261179520415, + "grad_norm": 0.8203698396682739, + "learning_rate": 5.103332680059053e-07, + "loss": 0.0717, + "step": 7400 + }, + { + "epoch": 2.398250162022035, + "grad_norm": 0.9251837134361267, + "learning_rate": 5.098038390966039e-07, + "loss": 0.0805, + "step": 7401 + }, + { + "epoch": 2.3985742060920288, + "grad_norm": 0.871580958366394, + "learning_rate": 5.09274653768046e-07, + "loss": 0.0773, + "step": 7402 + }, + { + "epoch": 2.398898250162022, + "grad_norm": 0.8269453644752502, + "learning_rate": 5.087457120849984e-07, + "loss": 0.0713, + "step": 7403 + }, + { + "epoch": 2.3992222942320156, + "grad_norm": 0.8618307113647461, + "learning_rate": 5.082170141121992e-07, + "loss": 0.0769, + "step": 7404 + }, + { + "epoch": 2.399546338302009, + "grad_norm": 0.8813310861587524, + "learning_rate": 5.076885599143558e-07, + "loss": 0.075, + "step": 7405 + }, + { + "epoch": 2.3998703823720025, + "grad_norm": 0.8791375160217285, + "learning_rate": 5.071603495561444e-07, + "loss": 0.0776, + "step": 7406 + }, + { + "epoch": 2.4001944264419963, + "grad_norm": 0.9294980764389038, + "learning_rate": 5.066323831022155e-07, + "loss": 0.0752, + "step": 7407 + }, + { + "epoch": 2.4005184705119897, + "grad_norm": 0.8981596827507019, + "learning_rate": 5.061046606171849e-07, + "loss": 0.0741, + "step": 7408 + }, + { + "epoch": 2.400842514581983, + "grad_norm": 0.9037610292434692, + "learning_rate": 5.055771821656416e-07, + "loss": 0.0753, + "step": 7409 + }, + { + "epoch": 2.4011665586519766, + "grad_norm": 0.8545142412185669, + "learning_rate": 5.05049947812144e-07, + "loss": 0.0744, + "step": 7410 + }, + { + "epoch": 2.40149060272197, + "grad_norm": 0.9342654347419739, + "learning_rate": 5.045229576212191e-07, + "loss": 0.0821, + "step": 7411 + }, + { + "epoch": 2.401814646791964, + "grad_norm": 0.9164658188819885, + "learning_rate": 5.039962116573676e-07, + "loss": 0.0774, + "step": 7412 + }, + { + "epoch": 2.4021386908619573, + "grad_norm": 0.8833655118942261, + "learning_rate": 5.034697099850557e-07, + "loss": 0.0759, + "step": 7413 + }, + { + "epoch": 2.4024627349319507, + "grad_norm": 0.8713326454162598, + "learning_rate": 5.029434526687249e-07, + "loss": 0.0817, + "step": 7414 + }, + { + "epoch": 2.402786779001944, + "grad_norm": 0.9473174214363098, + "learning_rate": 5.02417439772781e-07, + "loss": 0.0757, + "step": 7415 + }, + { + "epoch": 2.4031108230719376, + "grad_norm": 0.9125643968582153, + "learning_rate": 5.01891671361606e-07, + "loss": 0.0807, + "step": 7416 + }, + { + "epoch": 2.4034348671419314, + "grad_norm": 0.9707140922546387, + "learning_rate": 5.013661474995463e-07, + "loss": 0.084, + "step": 7417 + }, + { + "epoch": 2.403758911211925, + "grad_norm": 0.8440059423446655, + "learning_rate": 5.008408682509219e-07, + "loss": 0.0736, + "step": 7418 + }, + { + "epoch": 2.4040829552819183, + "grad_norm": 0.9085173606872559, + "learning_rate": 5.003158336800218e-07, + "loss": 0.0772, + "step": 7419 + }, + { + "epoch": 2.4044069993519117, + "grad_norm": 0.8836762309074402, + "learning_rate": 4.997910438511052e-07, + "loss": 0.0792, + "step": 7420 + }, + { + "epoch": 2.404731043421905, + "grad_norm": 0.9048094749450684, + "learning_rate": 4.992664988284021e-07, + "loss": 0.077, + "step": 7421 + }, + { + "epoch": 2.405055087491899, + "grad_norm": 0.8553664088249207, + "learning_rate": 4.987421986761101e-07, + "loss": 0.0742, + "step": 7422 + }, + { + "epoch": 2.4053791315618924, + "grad_norm": 0.9607149958610535, + "learning_rate": 4.982181434583996e-07, + "loss": 0.0833, + "step": 7423 + }, + { + "epoch": 2.405703175631886, + "grad_norm": 0.8982037305831909, + "learning_rate": 4.976943332394093e-07, + "loss": 0.0718, + "step": 7424 + }, + { + "epoch": 2.4060272197018793, + "grad_norm": 0.8702731728553772, + "learning_rate": 4.971707680832491e-07, + "loss": 0.0737, + "step": 7425 + }, + { + "epoch": 2.406351263771873, + "grad_norm": 0.8381253480911255, + "learning_rate": 4.966474480539976e-07, + "loss": 0.0773, + "step": 7426 + }, + { + "epoch": 2.4066753078418666, + "grad_norm": 0.851047158241272, + "learning_rate": 4.961243732157048e-07, + "loss": 0.072, + "step": 7427 + }, + { + "epoch": 2.40699935191186, + "grad_norm": 0.8857436776161194, + "learning_rate": 4.956015436323897e-07, + "loss": 0.0812, + "step": 7428 + }, + { + "epoch": 2.4073233959818534, + "grad_norm": 0.9682055115699768, + "learning_rate": 4.950789593680422e-07, + "loss": 0.0776, + "step": 7429 + }, + { + "epoch": 2.4076474400518473, + "grad_norm": 0.8536563515663147, + "learning_rate": 4.945566204866201e-07, + "loss": 0.0733, + "step": 7430 + }, + { + "epoch": 2.4079714841218407, + "grad_norm": 0.9816359281539917, + "learning_rate": 4.940345270520536e-07, + "loss": 0.0837, + "step": 7431 + }, + { + "epoch": 2.408295528191834, + "grad_norm": 0.8802758455276489, + "learning_rate": 4.935126791282419e-07, + "loss": 0.0764, + "step": 7432 + }, + { + "epoch": 2.4086195722618275, + "grad_norm": 0.76444411277771, + "learning_rate": 4.929910767790536e-07, + "loss": 0.0652, + "step": 7433 + }, + { + "epoch": 2.408943616331821, + "grad_norm": 0.9820328950881958, + "learning_rate": 4.92469720068329e-07, + "loss": 0.0814, + "step": 7434 + }, + { + "epoch": 2.409267660401815, + "grad_norm": 0.8797508478164673, + "learning_rate": 4.919486090598749e-07, + "loss": 0.0721, + "step": 7435 + }, + { + "epoch": 2.4095917044718083, + "grad_norm": 0.8678030967712402, + "learning_rate": 4.91427743817473e-07, + "loss": 0.0716, + "step": 7436 + }, + { + "epoch": 2.4099157485418017, + "grad_norm": 0.8386548757553101, + "learning_rate": 4.909071244048694e-07, + "loss": 0.075, + "step": 7437 + }, + { + "epoch": 2.410239792611795, + "grad_norm": 0.8689800500869751, + "learning_rate": 4.903867508857857e-07, + "loss": 0.0774, + "step": 7438 + }, + { + "epoch": 2.4105638366817885, + "grad_norm": 0.9039058685302734, + "learning_rate": 4.898666233239083e-07, + "loss": 0.0809, + "step": 7439 + }, + { + "epoch": 2.4108878807517824, + "grad_norm": 0.8700675368309021, + "learning_rate": 4.893467417828967e-07, + "loss": 0.077, + "step": 7440 + }, + { + "epoch": 2.411211924821776, + "grad_norm": 0.8966511487960815, + "learning_rate": 4.888271063263791e-07, + "loss": 0.0792, + "step": 7441 + }, + { + "epoch": 2.4115359688917692, + "grad_norm": 0.904863178730011, + "learning_rate": 4.883077170179542e-07, + "loss": 0.0772, + "step": 7442 + }, + { + "epoch": 2.4118600129617627, + "grad_norm": 0.9016261696815491, + "learning_rate": 4.877885739211907e-07, + "loss": 0.0735, + "step": 7443 + }, + { + "epoch": 2.412184057031756, + "grad_norm": 0.8624441027641296, + "learning_rate": 4.872696770996246e-07, + "loss": 0.078, + "step": 7444 + }, + { + "epoch": 2.41250810110175, + "grad_norm": 0.8437727093696594, + "learning_rate": 4.867510266167669e-07, + "loss": 0.073, + "step": 7445 + }, + { + "epoch": 2.4128321451717434, + "grad_norm": 0.8561378121376038, + "learning_rate": 4.862326225360927e-07, + "loss": 0.0799, + "step": 7446 + }, + { + "epoch": 2.413156189241737, + "grad_norm": 0.8445996642112732, + "learning_rate": 4.85714464921051e-07, + "loss": 0.072, + "step": 7447 + }, + { + "epoch": 2.4134802333117302, + "grad_norm": 0.8771276473999023, + "learning_rate": 4.851965538350589e-07, + "loss": 0.0772, + "step": 7448 + }, + { + "epoch": 2.413804277381724, + "grad_norm": 0.875706672668457, + "learning_rate": 4.846788893415038e-07, + "loss": 0.0793, + "step": 7449 + }, + { + "epoch": 2.4141283214517175, + "grad_norm": 0.9223209619522095, + "learning_rate": 4.841614715037429e-07, + "loss": 0.0832, + "step": 7450 + }, + { + "epoch": 2.414452365521711, + "grad_norm": 0.8857447504997253, + "learning_rate": 4.83644300385103e-07, + "loss": 0.0809, + "step": 7451 + }, + { + "epoch": 2.4147764095917044, + "grad_norm": 0.9135066866874695, + "learning_rate": 4.831273760488816e-07, + "loss": 0.0811, + "step": 7452 + }, + { + "epoch": 2.4151004536616982, + "grad_norm": 0.8678467869758606, + "learning_rate": 4.82610698558344e-07, + "loss": 0.0724, + "step": 7453 + }, + { + "epoch": 2.4154244977316917, + "grad_norm": 0.9949643015861511, + "learning_rate": 4.820942679767268e-07, + "loss": 0.0755, + "step": 7454 + }, + { + "epoch": 2.415748541801685, + "grad_norm": 0.7823389768600464, + "learning_rate": 4.815780843672366e-07, + "loss": 0.0701, + "step": 7455 + }, + { + "epoch": 2.4160725858716785, + "grad_norm": 0.8314511179924011, + "learning_rate": 4.810621477930488e-07, + "loss": 0.0706, + "step": 7456 + }, + { + "epoch": 2.416396629941672, + "grad_norm": 0.937160849571228, + "learning_rate": 4.805464583173094e-07, + "loss": 0.0782, + "step": 7457 + }, + { + "epoch": 2.416720674011666, + "grad_norm": 0.9175649881362915, + "learning_rate": 4.800310160031335e-07, + "loss": 0.0718, + "step": 7458 + }, + { + "epoch": 2.417044718081659, + "grad_norm": 0.8398677110671997, + "learning_rate": 4.795158209136067e-07, + "loss": 0.0718, + "step": 7459 + }, + { + "epoch": 2.4173687621516526, + "grad_norm": 0.8556437492370605, + "learning_rate": 4.79000873111784e-07, + "loss": 0.0711, + "step": 7460 + }, + { + "epoch": 2.417692806221646, + "grad_norm": 0.8410119414329529, + "learning_rate": 4.784861726606893e-07, + "loss": 0.0727, + "step": 7461 + }, + { + "epoch": 2.4180168502916395, + "grad_norm": 0.8888702392578125, + "learning_rate": 4.779717196233169e-07, + "loss": 0.0771, + "step": 7462 + }, + { + "epoch": 2.4183408943616334, + "grad_norm": 0.881953775882721, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0769, + "step": 7463 + }, + { + "epoch": 2.4186649384316268, + "grad_norm": 0.8769305944442749, + "learning_rate": 4.769435560415666e-07, + "loss": 0.0735, + "step": 7464 + }, + { + "epoch": 2.41898898250162, + "grad_norm": 0.8964110612869263, + "learning_rate": 4.764298456230265e-07, + "loss": 0.076, + "step": 7465 + }, + { + "epoch": 2.4193130265716136, + "grad_norm": 0.8946002125740051, + "learning_rate": 4.7591638286988234e-07, + "loss": 0.0747, + "step": 7466 + }, + { + "epoch": 2.419637070641607, + "grad_norm": 0.9408652782440186, + "learning_rate": 4.754031678449794e-07, + "loss": 0.0811, + "step": 7467 + }, + { + "epoch": 2.419961114711601, + "grad_norm": 0.8090646266937256, + "learning_rate": 4.7489020061112805e-07, + "loss": 0.0685, + "step": 7468 + }, + { + "epoch": 2.4202851587815943, + "grad_norm": 0.7830265164375305, + "learning_rate": 4.743774812311125e-07, + "loss": 0.0681, + "step": 7469 + }, + { + "epoch": 2.4206092028515878, + "grad_norm": 0.8681769967079163, + "learning_rate": 4.7386500976768337e-07, + "loss": 0.0783, + "step": 7470 + }, + { + "epoch": 2.420933246921581, + "grad_norm": 0.9414371252059937, + "learning_rate": 4.733527862835624e-07, + "loss": 0.0794, + "step": 7471 + }, + { + "epoch": 2.4212572909915746, + "grad_norm": 0.9063982963562012, + "learning_rate": 4.728408108414409e-07, + "loss": 0.0799, + "step": 7472 + }, + { + "epoch": 2.4215813350615685, + "grad_norm": 0.8320420980453491, + "learning_rate": 4.7232908350397984e-07, + "loss": 0.0723, + "step": 7473 + }, + { + "epoch": 2.421905379131562, + "grad_norm": 0.9368253350257874, + "learning_rate": 4.7181760433381017e-07, + "loss": 0.0796, + "step": 7474 + }, + { + "epoch": 2.4222294232015553, + "grad_norm": 0.8854978680610657, + "learning_rate": 4.7130637339352995e-07, + "loss": 0.0759, + "step": 7475 + }, + { + "epoch": 2.4225534672715487, + "grad_norm": 0.8954752087593079, + "learning_rate": 4.707953907457119e-07, + "loss": 0.0801, + "step": 7476 + }, + { + "epoch": 2.4228775113415426, + "grad_norm": 0.8707118630409241, + "learning_rate": 4.702846564528929e-07, + "loss": 0.0724, + "step": 7477 + }, + { + "epoch": 2.423201555411536, + "grad_norm": 0.8651489019393921, + "learning_rate": 4.6977417057758297e-07, + "loss": 0.0747, + "step": 7478 + }, + { + "epoch": 2.4235255994815295, + "grad_norm": 0.9725545048713684, + "learning_rate": 4.6926393318226045e-07, + "loss": 0.0816, + "step": 7479 + }, + { + "epoch": 2.423849643551523, + "grad_norm": 0.8611765503883362, + "learning_rate": 4.6875394432937345e-07, + "loss": 0.0763, + "step": 7480 + }, + { + "epoch": 2.4241736876215167, + "grad_norm": 0.9670865535736084, + "learning_rate": 4.6824420408133953e-07, + "loss": 0.085, + "step": 7481 + }, + { + "epoch": 2.42449773169151, + "grad_norm": 0.9267457127571106, + "learning_rate": 4.677347125005463e-07, + "loss": 0.0829, + "step": 7482 + }, + { + "epoch": 2.4248217757615036, + "grad_norm": 0.9122713208198547, + "learning_rate": 4.6722546964935114e-07, + "loss": 0.0747, + "step": 7483 + }, + { + "epoch": 2.425145819831497, + "grad_norm": 0.913609504699707, + "learning_rate": 4.6671647559007884e-07, + "loss": 0.0811, + "step": 7484 + }, + { + "epoch": 2.4254698639014904, + "grad_norm": 0.9412457346916199, + "learning_rate": 4.6620773038502625e-07, + "loss": 0.0756, + "step": 7485 + }, + { + "epoch": 2.4257939079714843, + "grad_norm": 0.9006617665290833, + "learning_rate": 4.656992340964589e-07, + "loss": 0.0763, + "step": 7486 + }, + { + "epoch": 2.4261179520414777, + "grad_norm": 0.8255707621574402, + "learning_rate": 4.651909867866117e-07, + "loss": 0.0725, + "step": 7487 + }, + { + "epoch": 2.426441996111471, + "grad_norm": 0.8910168409347534, + "learning_rate": 4.64682988517689e-07, + "loss": 0.0737, + "step": 7488 + }, + { + "epoch": 2.4267660401814646, + "grad_norm": 0.868298351764679, + "learning_rate": 4.641752393518661e-07, + "loss": 0.0737, + "step": 7489 + }, + { + "epoch": 2.427090084251458, + "grad_norm": 0.9420347809791565, + "learning_rate": 4.6366773935128423e-07, + "loss": 0.0761, + "step": 7490 + }, + { + "epoch": 2.427414128321452, + "grad_norm": 0.8676708340644836, + "learning_rate": 4.631604885780591e-07, + "loss": 0.0755, + "step": 7491 + }, + { + "epoch": 2.4277381723914453, + "grad_norm": 0.8301357626914978, + "learning_rate": 4.6265348709427146e-07, + "loss": 0.07, + "step": 7492 + }, + { + "epoch": 2.4280622164614387, + "grad_norm": 0.8298184871673584, + "learning_rate": 4.621467349619738e-07, + "loss": 0.0741, + "step": 7493 + }, + { + "epoch": 2.428386260531432, + "grad_norm": 0.8731589317321777, + "learning_rate": 4.6164023224318786e-07, + "loss": 0.0742, + "step": 7494 + }, + { + "epoch": 2.4287103046014256, + "grad_norm": 0.960504412651062, + "learning_rate": 4.6113397899990474e-07, + "loss": 0.0854, + "step": 7495 + }, + { + "epoch": 2.4290343486714194, + "grad_norm": 0.8900678157806396, + "learning_rate": 4.6062797529408537e-07, + "loss": 0.0763, + "step": 7496 + }, + { + "epoch": 2.429358392741413, + "grad_norm": 0.9121445417404175, + "learning_rate": 4.6012222118765796e-07, + "loss": 0.0855, + "step": 7497 + }, + { + "epoch": 2.4296824368114063, + "grad_norm": 0.9721124768257141, + "learning_rate": 4.5961671674252447e-07, + "loss": 0.0817, + "step": 7498 + }, + { + "epoch": 2.4300064808813997, + "grad_norm": 0.8571234345436096, + "learning_rate": 4.5911146202055113e-07, + "loss": 0.0742, + "step": 7499 + }, + { + "epoch": 2.4303305249513936, + "grad_norm": 0.9155340790748596, + "learning_rate": 4.5860645708357855e-07, + "loss": 0.0755, + "step": 7500 + }, + { + "epoch": 2.430654569021387, + "grad_norm": 0.8868303298950195, + "learning_rate": 4.581017019934131e-07, + "loss": 0.0793, + "step": 7501 + }, + { + "epoch": 2.4309786130913804, + "grad_norm": 0.884955644607544, + "learning_rate": 4.57597196811832e-07, + "loss": 0.0747, + "step": 7502 + }, + { + "epoch": 2.431302657161374, + "grad_norm": 0.9260040521621704, + "learning_rate": 4.5709294160058204e-07, + "loss": 0.0853, + "step": 7503 + }, + { + "epoch": 2.4316267012313677, + "grad_norm": 0.9470503926277161, + "learning_rate": 4.565889364213791e-07, + "loss": 0.0772, + "step": 7504 + }, + { + "epoch": 2.431950745301361, + "grad_norm": 0.8989836573600769, + "learning_rate": 4.5608518133590933e-07, + "loss": 0.075, + "step": 7505 + }, + { + "epoch": 2.4322747893713546, + "grad_norm": 0.931161105632782, + "learning_rate": 4.5558167640582545e-07, + "loss": 0.0788, + "step": 7506 + }, + { + "epoch": 2.432598833441348, + "grad_norm": 0.9857456088066101, + "learning_rate": 4.550784216927542e-07, + "loss": 0.0817, + "step": 7507 + }, + { + "epoch": 2.4329228775113414, + "grad_norm": 0.8618913888931274, + "learning_rate": 4.5457541725828696e-07, + "loss": 0.0778, + "step": 7508 + }, + { + "epoch": 2.4332469215813353, + "grad_norm": 0.9804918169975281, + "learning_rate": 4.5407266316398745e-07, + "loss": 0.076, + "step": 7509 + }, + { + "epoch": 2.4335709656513287, + "grad_norm": 0.8626995086669922, + "learning_rate": 4.5357015947138786e-07, + "loss": 0.0722, + "step": 7510 + }, + { + "epoch": 2.433895009721322, + "grad_norm": 0.9694250822067261, + "learning_rate": 4.530679062419899e-07, + "loss": 0.0836, + "step": 7511 + }, + { + "epoch": 2.4342190537913155, + "grad_norm": 0.9092100262641907, + "learning_rate": 4.5256590353726426e-07, + "loss": 0.0753, + "step": 7512 + }, + { + "epoch": 2.434543097861309, + "grad_norm": 0.8885576128959656, + "learning_rate": 4.520641514186522e-07, + "loss": 0.0825, + "step": 7513 + }, + { + "epoch": 2.434867141931303, + "grad_norm": 0.8387184739112854, + "learning_rate": 4.5156264994756144e-07, + "loss": 0.0771, + "step": 7514 + }, + { + "epoch": 2.4351911860012962, + "grad_norm": 0.8722485303878784, + "learning_rate": 4.510613991853721e-07, + "loss": 0.0747, + "step": 7515 + }, + { + "epoch": 2.4355152300712897, + "grad_norm": 0.921558678150177, + "learning_rate": 4.5056039919343236e-07, + "loss": 0.0811, + "step": 7516 + }, + { + "epoch": 2.435839274141283, + "grad_norm": 0.9149647951126099, + "learning_rate": 4.5005965003305953e-07, + "loss": 0.0766, + "step": 7517 + }, + { + "epoch": 2.4361633182112765, + "grad_norm": 0.9356411099433899, + "learning_rate": 4.4955915176554065e-07, + "loss": 0.0772, + "step": 7518 + }, + { + "epoch": 2.4364873622812704, + "grad_norm": 0.8513500690460205, + "learning_rate": 4.490589044521315e-07, + "loss": 0.0774, + "step": 7519 + }, + { + "epoch": 2.436811406351264, + "grad_norm": 0.889763355255127, + "learning_rate": 4.4855890815405867e-07, + "loss": 0.0774, + "step": 7520 + }, + { + "epoch": 2.4371354504212572, + "grad_norm": 0.8515815138816833, + "learning_rate": 4.4805916293251486e-07, + "loss": 0.0752, + "step": 7521 + }, + { + "epoch": 2.4374594944912507, + "grad_norm": 0.8402429819107056, + "learning_rate": 4.4755966884866606e-07, + "loss": 0.0702, + "step": 7522 + }, + { + "epoch": 2.4377835385612445, + "grad_norm": 0.8742625117301941, + "learning_rate": 4.470604259636438e-07, + "loss": 0.0729, + "step": 7523 + }, + { + "epoch": 2.438107582631238, + "grad_norm": 0.877052366733551, + "learning_rate": 4.465614343385524e-07, + "loss": 0.0752, + "step": 7524 + }, + { + "epoch": 2.4384316267012314, + "grad_norm": 0.8343896865844727, + "learning_rate": 4.46062694034462e-07, + "loss": 0.0757, + "step": 7525 + }, + { + "epoch": 2.438755670771225, + "grad_norm": 0.8815245628356934, + "learning_rate": 4.455642051124143e-07, + "loss": 0.0768, + "step": 7526 + }, + { + "epoch": 2.439079714841218, + "grad_norm": 0.8473525047302246, + "learning_rate": 4.4506596763341985e-07, + "loss": 0.0715, + "step": 7527 + }, + { + "epoch": 2.439403758911212, + "grad_norm": 0.8910574316978455, + "learning_rate": 4.445679816584567e-07, + "loss": 0.0757, + "step": 7528 + }, + { + "epoch": 2.4397278029812055, + "grad_norm": 0.9735031127929688, + "learning_rate": 4.4407024724847534e-07, + "loss": 0.0831, + "step": 7529 + }, + { + "epoch": 2.440051847051199, + "grad_norm": 0.9022817611694336, + "learning_rate": 4.4357276446439197e-07, + "loss": 0.078, + "step": 7530 + }, + { + "epoch": 2.4403758911211924, + "grad_norm": 0.9188567996025085, + "learning_rate": 4.4307553336709525e-07, + "loss": 0.0773, + "step": 7531 + }, + { + "epoch": 2.440699935191186, + "grad_norm": 0.854328453540802, + "learning_rate": 4.4257855401744044e-07, + "loss": 0.0743, + "step": 7532 + }, + { + "epoch": 2.4410239792611796, + "grad_norm": 0.8932430744171143, + "learning_rate": 4.42081826476253e-07, + "loss": 0.0767, + "step": 7533 + }, + { + "epoch": 2.441348023331173, + "grad_norm": 0.8871773481369019, + "learning_rate": 4.4158535080432803e-07, + "loss": 0.0731, + "step": 7534 + }, + { + "epoch": 2.4416720674011665, + "grad_norm": 0.8521711826324463, + "learning_rate": 4.4108912706242876e-07, + "loss": 0.0697, + "step": 7535 + }, + { + "epoch": 2.44199611147116, + "grad_norm": 0.8758248090744019, + "learning_rate": 4.405931553112894e-07, + "loss": 0.0703, + "step": 7536 + }, + { + "epoch": 2.442320155541154, + "grad_norm": 0.9381188750267029, + "learning_rate": 4.4009743561161e-07, + "loss": 0.0814, + "step": 7537 + }, + { + "epoch": 2.442644199611147, + "grad_norm": 0.888369083404541, + "learning_rate": 4.396019680240643e-07, + "loss": 0.0791, + "step": 7538 + }, + { + "epoch": 2.4429682436811406, + "grad_norm": 0.8075094819068909, + "learning_rate": 4.3910675260929096e-07, + "loss": 0.0729, + "step": 7539 + }, + { + "epoch": 2.443292287751134, + "grad_norm": 0.9342337250709534, + "learning_rate": 4.386117894278999e-07, + "loss": 0.0769, + "step": 7540 + }, + { + "epoch": 2.4436163318211275, + "grad_norm": 0.8695838451385498, + "learning_rate": 4.381170785404704e-07, + "loss": 0.0783, + "step": 7541 + }, + { + "epoch": 2.4439403758911213, + "grad_norm": 1.10112464427948, + "learning_rate": 4.376226200075495e-07, + "loss": 0.0892, + "step": 7542 + }, + { + "epoch": 2.4442644199611148, + "grad_norm": 0.9249840378761292, + "learning_rate": 4.3712841388965476e-07, + "loss": 0.0785, + "step": 7543 + }, + { + "epoch": 2.444588464031108, + "grad_norm": 0.8668658137321472, + "learning_rate": 4.3663446024727247e-07, + "loss": 0.0733, + "step": 7544 + }, + { + "epoch": 2.4449125081011016, + "grad_norm": 0.9287576079368591, + "learning_rate": 4.3614075914085617e-07, + "loss": 0.079, + "step": 7545 + }, + { + "epoch": 2.445236552171095, + "grad_norm": 0.8992236256599426, + "learning_rate": 4.356473106308326e-07, + "loss": 0.0833, + "step": 7546 + }, + { + "epoch": 2.445560596241089, + "grad_norm": 0.9160580039024353, + "learning_rate": 4.351541147775931e-07, + "loss": 0.0805, + "step": 7547 + }, + { + "epoch": 2.4458846403110823, + "grad_norm": 0.8093187808990479, + "learning_rate": 4.346611716415006e-07, + "loss": 0.0675, + "step": 7548 + }, + { + "epoch": 2.4462086843810757, + "grad_norm": 0.9260373115539551, + "learning_rate": 4.341684812828867e-07, + "loss": 0.0807, + "step": 7549 + }, + { + "epoch": 2.446532728451069, + "grad_norm": 0.859801709651947, + "learning_rate": 4.336760437620519e-07, + "loss": 0.0742, + "step": 7550 + }, + { + "epoch": 2.446856772521063, + "grad_norm": 0.8447008728981018, + "learning_rate": 4.331838591392662e-07, + "loss": 0.0717, + "step": 7551 + }, + { + "epoch": 2.4471808165910565, + "grad_norm": 0.9389599561691284, + "learning_rate": 4.326919274747668e-07, + "loss": 0.0794, + "step": 7552 + }, + { + "epoch": 2.44750486066105, + "grad_norm": 0.8455019593238831, + "learning_rate": 4.322002488287635e-07, + "loss": 0.0755, + "step": 7553 + }, + { + "epoch": 2.4478289047310433, + "grad_norm": 0.8933506608009338, + "learning_rate": 4.317088232614308e-07, + "loss": 0.081, + "step": 7554 + }, + { + "epoch": 2.448152948801037, + "grad_norm": 0.9255868792533875, + "learning_rate": 4.3121765083291663e-07, + "loss": 0.0815, + "step": 7555 + }, + { + "epoch": 2.4484769928710306, + "grad_norm": 0.9462663531303406, + "learning_rate": 4.307267316033342e-07, + "loss": 0.0837, + "step": 7556 + }, + { + "epoch": 2.448801036941024, + "grad_norm": 0.9613263607025146, + "learning_rate": 4.3023606563276753e-07, + "loss": 0.0836, + "step": 7557 + }, + { + "epoch": 2.4491250810110174, + "grad_norm": 0.8939090967178345, + "learning_rate": 4.297456529812702e-07, + "loss": 0.0757, + "step": 7558 + }, + { + "epoch": 2.449449125081011, + "grad_norm": 0.9026023149490356, + "learning_rate": 4.292554937088622e-07, + "loss": 0.0752, + "step": 7559 + }, + { + "epoch": 2.4497731691510047, + "grad_norm": 0.9327993392944336, + "learning_rate": 4.287655878755365e-07, + "loss": 0.0784, + "step": 7560 + }, + { + "epoch": 2.450097213220998, + "grad_norm": 0.8403828740119934, + "learning_rate": 4.282759355412505e-07, + "loss": 0.0742, + "step": 7561 + }, + { + "epoch": 2.4504212572909916, + "grad_norm": 0.8635743260383606, + "learning_rate": 4.2778653676593534e-07, + "loss": 0.074, + "step": 7562 + }, + { + "epoch": 2.450745301360985, + "grad_norm": 0.9350886344909668, + "learning_rate": 4.272973916094872e-07, + "loss": 0.0831, + "step": 7563 + }, + { + "epoch": 2.4510693454309784, + "grad_norm": 0.9209923148155212, + "learning_rate": 4.268085001317726e-07, + "loss": 0.0791, + "step": 7564 + }, + { + "epoch": 2.4513933895009723, + "grad_norm": 0.9279548525810242, + "learning_rate": 4.263198623926279e-07, + "loss": 0.0814, + "step": 7565 + }, + { + "epoch": 2.4517174335709657, + "grad_norm": 0.8781946897506714, + "learning_rate": 4.258314784518569e-07, + "loss": 0.0767, + "step": 7566 + }, + { + "epoch": 2.452041477640959, + "grad_norm": 0.8827422857284546, + "learning_rate": 4.253433483692337e-07, + "loss": 0.0772, + "step": 7567 + }, + { + "epoch": 2.4523655217109526, + "grad_norm": 0.8563030958175659, + "learning_rate": 4.248554722045009e-07, + "loss": 0.0717, + "step": 7568 + }, + { + "epoch": 2.452689565780946, + "grad_norm": 0.8876157999038696, + "learning_rate": 4.2436785001736896e-07, + "loss": 0.0746, + "step": 7569 + }, + { + "epoch": 2.45301360985094, + "grad_norm": 0.8552733659744263, + "learning_rate": 4.2388048186751823e-07, + "loss": 0.0748, + "step": 7570 + }, + { + "epoch": 2.4533376539209333, + "grad_norm": 0.9103438258171082, + "learning_rate": 4.233933678145982e-07, + "loss": 0.0773, + "step": 7571 + }, + { + "epoch": 2.4536616979909267, + "grad_norm": 0.985542893409729, + "learning_rate": 4.229065079182268e-07, + "loss": 0.0763, + "step": 7572 + }, + { + "epoch": 2.45398574206092, + "grad_norm": 0.8953328728675842, + "learning_rate": 4.224199022379913e-07, + "loss": 0.0772, + "step": 7573 + }, + { + "epoch": 2.454309786130914, + "grad_norm": 0.8356739282608032, + "learning_rate": 4.2193355083344684e-07, + "loss": 0.0724, + "step": 7574 + }, + { + "epoch": 2.4546338302009074, + "grad_norm": 0.8867468237876892, + "learning_rate": 4.2144745376411946e-07, + "loss": 0.0726, + "step": 7575 + }, + { + "epoch": 2.454957874270901, + "grad_norm": 0.8875829577445984, + "learning_rate": 4.2096161108950015e-07, + "loss": 0.0739, + "step": 7576 + }, + { + "epoch": 2.4552819183408943, + "grad_norm": 0.8456739187240601, + "learning_rate": 4.204760228690546e-07, + "loss": 0.0714, + "step": 7577 + }, + { + "epoch": 2.4556059624108877, + "grad_norm": 0.7902773022651672, + "learning_rate": 4.1999068916221184e-07, + "loss": 0.0668, + "step": 7578 + }, + { + "epoch": 2.4559300064808816, + "grad_norm": 0.8657953143119812, + "learning_rate": 4.1950561002837257e-07, + "loss": 0.0737, + "step": 7579 + }, + { + "epoch": 2.456254050550875, + "grad_norm": 0.8508570194244385, + "learning_rate": 4.1902078552690573e-07, + "loss": 0.072, + "step": 7580 + }, + { + "epoch": 2.4565780946208684, + "grad_norm": 0.867751955986023, + "learning_rate": 4.185362157171496e-07, + "loss": 0.0761, + "step": 7581 + }, + { + "epoch": 2.456902138690862, + "grad_norm": 0.9046393632888794, + "learning_rate": 4.1805190065841107e-07, + "loss": 0.0764, + "step": 7582 + }, + { + "epoch": 2.4572261827608557, + "grad_norm": 0.9099305272102356, + "learning_rate": 4.175678404099637e-07, + "loss": 0.081, + "step": 7583 + }, + { + "epoch": 2.457550226830849, + "grad_norm": 0.8977386951446533, + "learning_rate": 4.1708403503105456e-07, + "loss": 0.0759, + "step": 7584 + }, + { + "epoch": 2.4578742709008425, + "grad_norm": 0.8734598159790039, + "learning_rate": 4.166004845808941e-07, + "loss": 0.0714, + "step": 7585 + }, + { + "epoch": 2.458198314970836, + "grad_norm": 0.8450359106063843, + "learning_rate": 4.1611718911866663e-07, + "loss": 0.0708, + "step": 7586 + }, + { + "epoch": 2.4585223590408294, + "grad_norm": 0.8886892199516296, + "learning_rate": 4.1563414870352093e-07, + "loss": 0.0749, + "step": 7587 + }, + { + "epoch": 2.4588464031108233, + "grad_norm": 0.8860898017883301, + "learning_rate": 4.1515136339457725e-07, + "loss": 0.0758, + "step": 7588 + }, + { + "epoch": 2.4591704471808167, + "grad_norm": 0.8982911109924316, + "learning_rate": 4.146688332509241e-07, + "loss": 0.0784, + "step": 7589 + }, + { + "epoch": 2.45949449125081, + "grad_norm": 0.8104112148284912, + "learning_rate": 4.1418655833161794e-07, + "loss": 0.0663, + "step": 7590 + }, + { + "epoch": 2.4598185353208035, + "grad_norm": 0.8376603126525879, + "learning_rate": 4.137045386956853e-07, + "loss": 0.0751, + "step": 7591 + }, + { + "epoch": 2.460142579390797, + "grad_norm": 0.9051219820976257, + "learning_rate": 4.1322277440211973e-07, + "loss": 0.075, + "step": 7592 + }, + { + "epoch": 2.460466623460791, + "grad_norm": 0.9663034677505493, + "learning_rate": 4.1274126550988505e-07, + "loss": 0.0786, + "step": 7593 + }, + { + "epoch": 2.4607906675307842, + "grad_norm": 0.9784535765647888, + "learning_rate": 4.1226001207791327e-07, + "loss": 0.0798, + "step": 7594 + }, + { + "epoch": 2.4611147116007777, + "grad_norm": 0.93511962890625, + "learning_rate": 4.1177901416510485e-07, + "loss": 0.0793, + "step": 7595 + }, + { + "epoch": 2.461438755670771, + "grad_norm": 0.9316219687461853, + "learning_rate": 4.112982718303299e-07, + "loss": 0.08, + "step": 7596 + }, + { + "epoch": 2.4617627997407645, + "grad_norm": 0.8901696801185608, + "learning_rate": 4.1081778513242606e-07, + "loss": 0.0755, + "step": 7597 + }, + { + "epoch": 2.4620868438107584, + "grad_norm": 0.8933848142623901, + "learning_rate": 4.103375541302007e-07, + "loss": 0.0787, + "step": 7598 + }, + { + "epoch": 2.462410887880752, + "grad_norm": 0.8804818391799927, + "learning_rate": 4.0985757888242965e-07, + "loss": 0.0797, + "step": 7599 + }, + { + "epoch": 2.462734931950745, + "grad_norm": 0.8956480026245117, + "learning_rate": 4.0937785944785617e-07, + "loss": 0.0733, + "step": 7600 + }, + { + "epoch": 2.4630589760207386, + "grad_norm": 0.8888890743255615, + "learning_rate": 4.0889839588519386e-07, + "loss": 0.0777, + "step": 7601 + }, + { + "epoch": 2.4633830200907325, + "grad_norm": 0.9150384068489075, + "learning_rate": 4.0841918825312465e-07, + "loss": 0.0815, + "step": 7602 + }, + { + "epoch": 2.463707064160726, + "grad_norm": 0.8889130353927612, + "learning_rate": 4.0794023661029856e-07, + "loss": 0.0794, + "step": 7603 + }, + { + "epoch": 2.4640311082307194, + "grad_norm": 0.8525159358978271, + "learning_rate": 4.0746154101533485e-07, + "loss": 0.0778, + "step": 7604 + }, + { + "epoch": 2.464355152300713, + "grad_norm": 0.8394224047660828, + "learning_rate": 4.0698310152682107e-07, + "loss": 0.0727, + "step": 7605 + }, + { + "epoch": 2.4646791963707066, + "grad_norm": 0.8760896921157837, + "learning_rate": 4.065049182033146e-07, + "loss": 0.0716, + "step": 7606 + }, + { + "epoch": 2.4650032404407, + "grad_norm": 0.8698969483375549, + "learning_rate": 4.0602699110333795e-07, + "loss": 0.0713, + "step": 7607 + }, + { + "epoch": 2.4653272845106935, + "grad_norm": 0.9061896204948425, + "learning_rate": 4.0554932028538774e-07, + "loss": 0.0797, + "step": 7608 + }, + { + "epoch": 2.465651328580687, + "grad_norm": 0.8686105608940125, + "learning_rate": 4.050719058079244e-07, + "loss": 0.0752, + "step": 7609 + }, + { + "epoch": 2.4659753726506803, + "grad_norm": 0.8417704701423645, + "learning_rate": 4.045947477293791e-07, + "loss": 0.0717, + "step": 7610 + }, + { + "epoch": 2.466299416720674, + "grad_norm": 0.8493486642837524, + "learning_rate": 4.041178461081519e-07, + "loss": 0.0736, + "step": 7611 + }, + { + "epoch": 2.4666234607906676, + "grad_norm": 0.8493620157241821, + "learning_rate": 4.036412010026103e-07, + "loss": 0.0761, + "step": 7612 + }, + { + "epoch": 2.466947504860661, + "grad_norm": 0.8524096012115479, + "learning_rate": 4.0316481247109215e-07, + "loss": 0.0755, + "step": 7613 + }, + { + "epoch": 2.4672715489306545, + "grad_norm": 0.8808268904685974, + "learning_rate": 4.0268868057190075e-07, + "loss": 0.0798, + "step": 7614 + }, + { + "epoch": 2.467595593000648, + "grad_norm": 0.8217577338218689, + "learning_rate": 4.022128053633123e-07, + "loss": 0.0657, + "step": 7615 + }, + { + "epoch": 2.4679196370706418, + "grad_norm": 0.9325581192970276, + "learning_rate": 4.017371869035674e-07, + "loss": 0.0794, + "step": 7616 + }, + { + "epoch": 2.468243681140635, + "grad_norm": 0.8738077878952026, + "learning_rate": 4.01261825250879e-07, + "loss": 0.0732, + "step": 7617 + }, + { + "epoch": 2.4685677252106286, + "grad_norm": 0.8512647747993469, + "learning_rate": 4.0078672046342553e-07, + "loss": 0.0738, + "step": 7618 + }, + { + "epoch": 2.468891769280622, + "grad_norm": 0.9815152287483215, + "learning_rate": 4.0031187259935546e-07, + "loss": 0.0859, + "step": 7619 + }, + { + "epoch": 2.4692158133506155, + "grad_norm": 0.8624076247215271, + "learning_rate": 3.998372817167856e-07, + "loss": 0.0729, + "step": 7620 + }, + { + "epoch": 2.4695398574206093, + "grad_norm": 0.8829675316810608, + "learning_rate": 3.993629478738012e-07, + "loss": 0.0728, + "step": 7621 + }, + { + "epoch": 2.4698639014906028, + "grad_norm": 0.9542384743690491, + "learning_rate": 3.988888711284569e-07, + "loss": 0.0772, + "step": 7622 + }, + { + "epoch": 2.470187945560596, + "grad_norm": 0.899666965007782, + "learning_rate": 3.9841505153877387e-07, + "loss": 0.0792, + "step": 7623 + }, + { + "epoch": 2.4705119896305896, + "grad_norm": 0.8875526785850525, + "learning_rate": 3.9794148916274365e-07, + "loss": 0.0734, + "step": 7624 + }, + { + "epoch": 2.4708360337005835, + "grad_norm": 0.9024166464805603, + "learning_rate": 3.974681840583255e-07, + "loss": 0.0757, + "step": 7625 + }, + { + "epoch": 2.471160077770577, + "grad_norm": 0.9139639139175415, + "learning_rate": 3.969951362834476e-07, + "loss": 0.0785, + "step": 7626 + }, + { + "epoch": 2.4714841218405703, + "grad_norm": 0.9424812197685242, + "learning_rate": 3.965223458960063e-07, + "loss": 0.0789, + "step": 7627 + }, + { + "epoch": 2.4718081659105637, + "grad_norm": 0.8154076933860779, + "learning_rate": 3.9604981295386673e-07, + "loss": 0.0729, + "step": 7628 + }, + { + "epoch": 2.4721322099805576, + "grad_norm": 0.9030548334121704, + "learning_rate": 3.9557753751486237e-07, + "loss": 0.0774, + "step": 7629 + }, + { + "epoch": 2.472456254050551, + "grad_norm": 0.9466914534568787, + "learning_rate": 3.9510551963679534e-07, + "loss": 0.0798, + "step": 7630 + }, + { + "epoch": 2.4727802981205445, + "grad_norm": 0.8710300922393799, + "learning_rate": 3.9463375937743546e-07, + "loss": 0.0785, + "step": 7631 + }, + { + "epoch": 2.473104342190538, + "grad_norm": 0.9093683958053589, + "learning_rate": 3.941622567945216e-07, + "loss": 0.0805, + "step": 7632 + }, + { + "epoch": 2.4734283862605313, + "grad_norm": 0.8014646768569946, + "learning_rate": 3.9369101194576156e-07, + "loss": 0.0689, + "step": 7633 + }, + { + "epoch": 2.473752430330525, + "grad_norm": 0.8499481678009033, + "learning_rate": 3.93220024888831e-07, + "loss": 0.0765, + "step": 7634 + }, + { + "epoch": 2.4740764744005186, + "grad_norm": 0.9159678816795349, + "learning_rate": 3.927492956813747e-07, + "loss": 0.0752, + "step": 7635 + }, + { + "epoch": 2.474400518470512, + "grad_norm": 0.9946656227111816, + "learning_rate": 3.922788243810038e-07, + "loss": 0.0856, + "step": 7636 + }, + { + "epoch": 2.4747245625405054, + "grad_norm": 0.9459431171417236, + "learning_rate": 3.918086110453015e-07, + "loss": 0.0742, + "step": 7637 + }, + { + "epoch": 2.475048606610499, + "grad_norm": 0.9478768706321716, + "learning_rate": 3.9133865573181524e-07, + "loss": 0.0778, + "step": 7638 + }, + { + "epoch": 2.4753726506804927, + "grad_norm": 0.8506389856338501, + "learning_rate": 3.9086895849806547e-07, + "loss": 0.0737, + "step": 7639 + }, + { + "epoch": 2.475696694750486, + "grad_norm": 0.8261045217514038, + "learning_rate": 3.903995194015364e-07, + "loss": 0.0735, + "step": 7640 + }, + { + "epoch": 2.4760207388204796, + "grad_norm": 0.8218585848808289, + "learning_rate": 3.899303384996836e-07, + "loss": 0.0756, + "step": 7641 + }, + { + "epoch": 2.476344782890473, + "grad_norm": 0.9573899507522583, + "learning_rate": 3.894614158499302e-07, + "loss": 0.0828, + "step": 7642 + }, + { + "epoch": 2.4766688269604664, + "grad_norm": 0.8530287146568298, + "learning_rate": 3.889927515096681e-07, + "loss": 0.0715, + "step": 7643 + }, + { + "epoch": 2.4769928710304603, + "grad_norm": 0.9650495648384094, + "learning_rate": 3.885243455362578e-07, + "loss": 0.0821, + "step": 7644 + }, + { + "epoch": 2.4773169151004537, + "grad_norm": 0.877484142780304, + "learning_rate": 3.8805619798702565e-07, + "loss": 0.0707, + "step": 7645 + }, + { + "epoch": 2.477640959170447, + "grad_norm": 0.9321907162666321, + "learning_rate": 3.8758830891927056e-07, + "loss": 0.0803, + "step": 7646 + }, + { + "epoch": 2.4779650032404406, + "grad_norm": 0.9718896746635437, + "learning_rate": 3.8712067839025647e-07, + "loss": 0.0789, + "step": 7647 + }, + { + "epoch": 2.478289047310434, + "grad_norm": 0.942742109298706, + "learning_rate": 3.86653306457217e-07, + "loss": 0.0786, + "step": 7648 + }, + { + "epoch": 2.478613091380428, + "grad_norm": 0.9233473539352417, + "learning_rate": 3.861861931773542e-07, + "loss": 0.0781, + "step": 7649 + }, + { + "epoch": 2.4789371354504213, + "grad_norm": 0.8698999285697937, + "learning_rate": 3.8571933860783785e-07, + "loss": 0.0681, + "step": 7650 + }, + { + "epoch": 2.4792611795204147, + "grad_norm": 0.9471836090087891, + "learning_rate": 3.8525274280580646e-07, + "loss": 0.0738, + "step": 7651 + }, + { + "epoch": 2.479585223590408, + "grad_norm": 0.9754993915557861, + "learning_rate": 3.8478640582836733e-07, + "loss": 0.0794, + "step": 7652 + }, + { + "epoch": 2.479909267660402, + "grad_norm": 0.9015049934387207, + "learning_rate": 3.8432032773259574e-07, + "loss": 0.0806, + "step": 7653 + }, + { + "epoch": 2.4802333117303954, + "grad_norm": 0.8969537019729614, + "learning_rate": 3.838545085755341e-07, + "loss": 0.0801, + "step": 7654 + }, + { + "epoch": 2.480557355800389, + "grad_norm": 0.8950188755989075, + "learning_rate": 3.8338894841419476e-07, + "loss": 0.0739, + "step": 7655 + }, + { + "epoch": 2.4808813998703823, + "grad_norm": 0.8588986992835999, + "learning_rate": 3.8292364730555754e-07, + "loss": 0.0748, + "step": 7656 + }, + { + "epoch": 2.481205443940376, + "grad_norm": 0.9033975601196289, + "learning_rate": 3.8245860530657126e-07, + "loss": 0.0758, + "step": 7657 + }, + { + "epoch": 2.4815294880103695, + "grad_norm": 0.9325078725814819, + "learning_rate": 3.8199382247415236e-07, + "loss": 0.0837, + "step": 7658 + }, + { + "epoch": 2.481853532080363, + "grad_norm": 0.8054845333099365, + "learning_rate": 3.8152929886518587e-07, + "loss": 0.0657, + "step": 7659 + }, + { + "epoch": 2.4821775761503564, + "grad_norm": 0.8823132514953613, + "learning_rate": 3.810650345365241e-07, + "loss": 0.0747, + "step": 7660 + }, + { + "epoch": 2.48250162022035, + "grad_norm": 0.8961814045906067, + "learning_rate": 3.8060102954499024e-07, + "loss": 0.0737, + "step": 7661 + }, + { + "epoch": 2.4828256642903437, + "grad_norm": 0.9573402404785156, + "learning_rate": 3.8013728394737216e-07, + "loss": 0.0805, + "step": 7662 + }, + { + "epoch": 2.483149708360337, + "grad_norm": 0.8844099640846252, + "learning_rate": 3.796737978004289e-07, + "loss": 0.079, + "step": 7663 + }, + { + "epoch": 2.4834737524303305, + "grad_norm": 0.8763425350189209, + "learning_rate": 3.792105711608865e-07, + "loss": 0.076, + "step": 7664 + }, + { + "epoch": 2.483797796500324, + "grad_norm": 0.8611736297607422, + "learning_rate": 3.7874760408543933e-07, + "loss": 0.0784, + "step": 7665 + }, + { + "epoch": 2.4841218405703174, + "grad_norm": 0.9122564792633057, + "learning_rate": 3.7828489663075065e-07, + "loss": 0.0801, + "step": 7666 + }, + { + "epoch": 2.4844458846403112, + "grad_norm": 0.9059411287307739, + "learning_rate": 3.778224488534496e-07, + "loss": 0.0789, + "step": 7667 + }, + { + "epoch": 2.4847699287103047, + "grad_norm": 0.881196916103363, + "learning_rate": 3.773602608101376e-07, + "loss": 0.0751, + "step": 7668 + }, + { + "epoch": 2.485093972780298, + "grad_norm": 0.8495291471481323, + "learning_rate": 3.7689833255737995e-07, + "loss": 0.0765, + "step": 7669 + }, + { + "epoch": 2.4854180168502915, + "grad_norm": 0.9652096033096313, + "learning_rate": 3.764366641517145e-07, + "loss": 0.0818, + "step": 7670 + }, + { + "epoch": 2.485742060920285, + "grad_norm": 0.8796998858451843, + "learning_rate": 3.759752556496421e-07, + "loss": 0.0694, + "step": 7671 + }, + { + "epoch": 2.486066104990279, + "grad_norm": 0.9088475704193115, + "learning_rate": 3.7551410710763764e-07, + "loss": 0.0767, + "step": 7672 + }, + { + "epoch": 2.4863901490602722, + "grad_norm": 0.9164015054702759, + "learning_rate": 3.7505321858213926e-07, + "loss": 0.078, + "step": 7673 + }, + { + "epoch": 2.4867141931302656, + "grad_norm": 0.941716194152832, + "learning_rate": 3.7459259012955606e-07, + "loss": 0.0779, + "step": 7674 + }, + { + "epoch": 2.487038237200259, + "grad_norm": 0.9306308031082153, + "learning_rate": 3.7413222180626455e-07, + "loss": 0.0756, + "step": 7675 + }, + { + "epoch": 2.487362281270253, + "grad_norm": 0.8866762518882751, + "learning_rate": 3.736721136686081e-07, + "loss": 0.0742, + "step": 7676 + }, + { + "epoch": 2.4876863253402464, + "grad_norm": 0.8724895715713501, + "learning_rate": 3.7321226577290147e-07, + "loss": 0.0736, + "step": 7677 + }, + { + "epoch": 2.48801036941024, + "grad_norm": 0.7774869203567505, + "learning_rate": 3.7275267817542425e-07, + "loss": 0.0683, + "step": 7678 + }, + { + "epoch": 2.488334413480233, + "grad_norm": 0.9494317173957825, + "learning_rate": 3.7229335093242587e-07, + "loss": 0.0802, + "step": 7679 + }, + { + "epoch": 2.488658457550227, + "grad_norm": 0.8791287541389465, + "learning_rate": 3.7183428410012326e-07, + "loss": 0.0781, + "step": 7680 + }, + { + "epoch": 2.4889825016202205, + "grad_norm": 0.9582348465919495, + "learning_rate": 3.713754777347023e-07, + "loss": 0.0751, + "step": 7681 + }, + { + "epoch": 2.489306545690214, + "grad_norm": 0.8857023119926453, + "learning_rate": 3.7091693189231615e-07, + "loss": 0.0724, + "step": 7682 + }, + { + "epoch": 2.4896305897602073, + "grad_norm": 0.8695082068443298, + "learning_rate": 3.704586466290863e-07, + "loss": 0.0777, + "step": 7683 + }, + { + "epoch": 2.4899546338302008, + "grad_norm": 1.0032886266708374, + "learning_rate": 3.7000062200110266e-07, + "loss": 0.0815, + "step": 7684 + }, + { + "epoch": 2.4902786779001946, + "grad_norm": 0.9594873189926147, + "learning_rate": 3.6954285806442337e-07, + "loss": 0.0804, + "step": 7685 + }, + { + "epoch": 2.490602721970188, + "grad_norm": 0.854714035987854, + "learning_rate": 3.6908535487507335e-07, + "loss": 0.0725, + "step": 7686 + }, + { + "epoch": 2.4909267660401815, + "grad_norm": 0.8480677604675293, + "learning_rate": 3.68628112489047e-07, + "loss": 0.0729, + "step": 7687 + }, + { + "epoch": 2.491250810110175, + "grad_norm": 0.9920457601547241, + "learning_rate": 3.681711309623065e-07, + "loss": 0.0751, + "step": 7688 + }, + { + "epoch": 2.4915748541801683, + "grad_norm": 0.8475012183189392, + "learning_rate": 3.677144103507818e-07, + "loss": 0.0735, + "step": 7689 + }, + { + "epoch": 2.491898898250162, + "grad_norm": 0.9704048037528992, + "learning_rate": 3.672579507103716e-07, + "loss": 0.0799, + "step": 7690 + }, + { + "epoch": 2.4922229423201556, + "grad_norm": 0.9394801259040833, + "learning_rate": 3.668017520969405e-07, + "loss": 0.0808, + "step": 7691 + }, + { + "epoch": 2.492546986390149, + "grad_norm": 0.9730172753334045, + "learning_rate": 3.663458145663254e-07, + "loss": 0.0847, + "step": 7692 + }, + { + "epoch": 2.4928710304601425, + "grad_norm": 0.9635066986083984, + "learning_rate": 3.65890138174326e-07, + "loss": 0.077, + "step": 7693 + }, + { + "epoch": 2.493195074530136, + "grad_norm": 0.8948413729667664, + "learning_rate": 3.6543472297671495e-07, + "loss": 0.0779, + "step": 7694 + }, + { + "epoch": 2.4935191186001298, + "grad_norm": 0.8416469097137451, + "learning_rate": 3.6497956902922904e-07, + "loss": 0.0733, + "step": 7695 + }, + { + "epoch": 2.493843162670123, + "grad_norm": 0.8402621150016785, + "learning_rate": 3.645246763875754e-07, + "loss": 0.0705, + "step": 7696 + }, + { + "epoch": 2.4941672067401166, + "grad_norm": 0.9553708434104919, + "learning_rate": 3.640700451074289e-07, + "loss": 0.0795, + "step": 7697 + }, + { + "epoch": 2.49449125081011, + "grad_norm": 0.8238723278045654, + "learning_rate": 3.636156752444303e-07, + "loss": 0.0689, + "step": 7698 + }, + { + "epoch": 2.4948152948801035, + "grad_norm": 0.8835716843605042, + "learning_rate": 3.631615668541921e-07, + "loss": 0.0758, + "step": 7699 + }, + { + "epoch": 2.4951393389500973, + "grad_norm": 0.8276665210723877, + "learning_rate": 3.6270771999229124e-07, + "loss": 0.0724, + "step": 7700 + }, + { + "epoch": 2.4954633830200907, + "grad_norm": 0.9201515316963196, + "learning_rate": 3.622541347142758e-07, + "loss": 0.082, + "step": 7701 + }, + { + "epoch": 2.495787427090084, + "grad_norm": 0.9209465384483337, + "learning_rate": 3.618008110756588e-07, + "loss": 0.0825, + "step": 7702 + }, + { + "epoch": 2.4961114711600776, + "grad_norm": 0.9255928993225098, + "learning_rate": 3.6134774913192314e-07, + "loss": 0.0785, + "step": 7703 + }, + { + "epoch": 2.4964355152300715, + "grad_norm": 0.8700760006904602, + "learning_rate": 3.608949489385191e-07, + "loss": 0.0711, + "step": 7704 + }, + { + "epoch": 2.496759559300065, + "grad_norm": 0.8912569284439087, + "learning_rate": 3.6044241055086525e-07, + "loss": 0.0759, + "step": 7705 + }, + { + "epoch": 2.4970836033700583, + "grad_norm": 0.9345059990882874, + "learning_rate": 3.599901340243478e-07, + "loss": 0.0758, + "step": 7706 + }, + { + "epoch": 2.4974076474400517, + "grad_norm": 0.8387685418128967, + "learning_rate": 3.5953811941432104e-07, + "loss": 0.0718, + "step": 7707 + }, + { + "epoch": 2.4977316915100456, + "grad_norm": 0.9619724154472351, + "learning_rate": 3.590863667761077e-07, + "loss": 0.08, + "step": 7708 + }, + { + "epoch": 2.498055735580039, + "grad_norm": 0.8170272707939148, + "learning_rate": 3.5863487616499713e-07, + "loss": 0.068, + "step": 7709 + }, + { + "epoch": 2.4983797796500324, + "grad_norm": 0.9006374478340149, + "learning_rate": 3.581836476362474e-07, + "loss": 0.0817, + "step": 7710 + }, + { + "epoch": 2.498703823720026, + "grad_norm": 0.8181907534599304, + "learning_rate": 3.5773268124508485e-07, + "loss": 0.0697, + "step": 7711 + }, + { + "epoch": 2.4990278677900193, + "grad_norm": 0.8882855176925659, + "learning_rate": 3.5728197704670344e-07, + "loss": 0.0795, + "step": 7712 + }, + { + "epoch": 2.499351911860013, + "grad_norm": 0.9369103908538818, + "learning_rate": 3.5683153509626504e-07, + "loss": 0.0806, + "step": 7713 + }, + { + "epoch": 2.4996759559300066, + "grad_norm": 0.8526079654693604, + "learning_rate": 3.563813554488996e-07, + "loss": 0.0784, + "step": 7714 + }, + { + "epoch": 2.5, + "grad_norm": 0.8463532328605652, + "learning_rate": 3.559314381597034e-07, + "loss": 0.0697, + "step": 7715 + }, + { + "epoch": 2.5003240440699934, + "grad_norm": 1.0224663019180298, + "learning_rate": 3.55481783283744e-07, + "loss": 0.0814, + "step": 7716 + }, + { + "epoch": 2.500648088139987, + "grad_norm": 0.8725597858428955, + "learning_rate": 3.5503239087605337e-07, + "loss": 0.0755, + "step": 7717 + }, + { + "epoch": 2.5009721322099807, + "grad_norm": 0.9240145087242126, + "learning_rate": 3.54583260991633e-07, + "loss": 0.0751, + "step": 7718 + }, + { + "epoch": 2.501296176279974, + "grad_norm": 0.8926657438278198, + "learning_rate": 3.541343936854524e-07, + "loss": 0.0756, + "step": 7719 + }, + { + "epoch": 2.5016202203499676, + "grad_norm": 0.9064965844154358, + "learning_rate": 3.5368578901244843e-07, + "loss": 0.0736, + "step": 7720 + }, + { + "epoch": 2.501944264419961, + "grad_norm": 0.8855481743812561, + "learning_rate": 3.5323744702752657e-07, + "loss": 0.0742, + "step": 7721 + }, + { + "epoch": 2.5022683084899544, + "grad_norm": 0.8861925601959229, + "learning_rate": 3.5278936778555763e-07, + "loss": 0.0752, + "step": 7722 + }, + { + "epoch": 2.5025923525599483, + "grad_norm": 0.8892893195152283, + "learning_rate": 3.523415513413847e-07, + "loss": 0.0763, + "step": 7723 + }, + { + "epoch": 2.5029163966299417, + "grad_norm": 0.9149244427680969, + "learning_rate": 3.518939977498137e-07, + "loss": 0.0763, + "step": 7724 + }, + { + "epoch": 2.503240440699935, + "grad_norm": 0.897413969039917, + "learning_rate": 3.514467070656233e-07, + "loss": 0.0775, + "step": 7725 + }, + { + "epoch": 2.5035644847699285, + "grad_norm": 0.9678398370742798, + "learning_rate": 3.509996793435558e-07, + "loss": 0.0826, + "step": 7726 + }, + { + "epoch": 2.503888528839922, + "grad_norm": 0.8727210164070129, + "learning_rate": 3.505529146383235e-07, + "loss": 0.0726, + "step": 7727 + }, + { + "epoch": 2.504212572909916, + "grad_norm": 0.8535559773445129, + "learning_rate": 3.501064130046064e-07, + "loss": 0.0716, + "step": 7728 + }, + { + "epoch": 2.5045366169799093, + "grad_norm": 0.8813205361366272, + "learning_rate": 3.496601744970518e-07, + "loss": 0.0734, + "step": 7729 + }, + { + "epoch": 2.5048606610499027, + "grad_norm": 0.8789255023002625, + "learning_rate": 3.492141991702752e-07, + "loss": 0.0756, + "step": 7730 + }, + { + "epoch": 2.5051847051198965, + "grad_norm": 0.9259575009346008, + "learning_rate": 3.4876848707885854e-07, + "loss": 0.0787, + "step": 7731 + }, + { + "epoch": 2.50550874918989, + "grad_norm": 0.9527920484542847, + "learning_rate": 3.483230382773545e-07, + "loss": 0.076, + "step": 7732 + }, + { + "epoch": 2.5058327932598834, + "grad_norm": 0.8948280215263367, + "learning_rate": 3.478778528202803e-07, + "loss": 0.0753, + "step": 7733 + }, + { + "epoch": 2.506156837329877, + "grad_norm": 0.9487646222114563, + "learning_rate": 3.474329307621227e-07, + "loss": 0.0781, + "step": 7734 + }, + { + "epoch": 2.5064808813998702, + "grad_norm": 0.9494906663894653, + "learning_rate": 3.469882721573356e-07, + "loss": 0.0825, + "step": 7735 + }, + { + "epoch": 2.506804925469864, + "grad_norm": 0.8777042031288147, + "learning_rate": 3.465438770603416e-07, + "loss": 0.0758, + "step": 7736 + }, + { + "epoch": 2.5071289695398575, + "grad_norm": 0.9567082524299622, + "learning_rate": 3.4609974552552993e-07, + "loss": 0.0814, + "step": 7737 + }, + { + "epoch": 2.507453013609851, + "grad_norm": 0.8560287952423096, + "learning_rate": 3.456558776072585e-07, + "loss": 0.0738, + "step": 7738 + }, + { + "epoch": 2.5077770576798444, + "grad_norm": 0.9227958917617798, + "learning_rate": 3.4521227335985146e-07, + "loss": 0.0746, + "step": 7739 + }, + { + "epoch": 2.508101101749838, + "grad_norm": 0.929495632648468, + "learning_rate": 3.447689328376022e-07, + "loss": 0.0823, + "step": 7740 + }, + { + "epoch": 2.5084251458198317, + "grad_norm": 0.8940978050231934, + "learning_rate": 3.4432585609477125e-07, + "loss": 0.0738, + "step": 7741 + }, + { + "epoch": 2.508749189889825, + "grad_norm": 0.9019840955734253, + "learning_rate": 3.438830431855872e-07, + "loss": 0.0772, + "step": 7742 + }, + { + "epoch": 2.5090732339598185, + "grad_norm": 0.8739924430847168, + "learning_rate": 3.434404941642455e-07, + "loss": 0.0754, + "step": 7743 + }, + { + "epoch": 2.509397278029812, + "grad_norm": 0.8102080225944519, + "learning_rate": 3.4299820908491045e-07, + "loss": 0.071, + "step": 7744 + }, + { + "epoch": 2.5097213220998054, + "grad_norm": 0.9312887191772461, + "learning_rate": 3.4255618800171366e-07, + "loss": 0.0817, + "step": 7745 + }, + { + "epoch": 2.5100453661697992, + "grad_norm": 0.9610565304756165, + "learning_rate": 3.421144309687527e-07, + "loss": 0.084, + "step": 7746 + }, + { + "epoch": 2.5103694102397927, + "grad_norm": 0.9115961790084839, + "learning_rate": 3.4167293804009656e-07, + "loss": 0.0737, + "step": 7747 + }, + { + "epoch": 2.510693454309786, + "grad_norm": 0.8804581165313721, + "learning_rate": 3.412317092697781e-07, + "loss": 0.077, + "step": 7748 + }, + { + "epoch": 2.5110174983797795, + "grad_norm": 0.8988832831382751, + "learning_rate": 3.407907447117997e-07, + "loss": 0.0783, + "step": 7749 + }, + { + "epoch": 2.511341542449773, + "grad_norm": 0.9219678640365601, + "learning_rate": 3.4035004442013157e-07, + "loss": 0.0739, + "step": 7750 + }, + { + "epoch": 2.511665586519767, + "grad_norm": 0.868253767490387, + "learning_rate": 3.399096084487108e-07, + "loss": 0.0753, + "step": 7751 + }, + { + "epoch": 2.51198963058976, + "grad_norm": 0.8362921476364136, + "learning_rate": 3.394694368514434e-07, + "loss": 0.0707, + "step": 7752 + }, + { + "epoch": 2.5123136746597536, + "grad_norm": 0.9166553020477295, + "learning_rate": 3.390295296822002e-07, + "loss": 0.079, + "step": 7753 + }, + { + "epoch": 2.5126377187297475, + "grad_norm": 0.8944741487503052, + "learning_rate": 3.3858988699482397e-07, + "loss": 0.0769, + "step": 7754 + }, + { + "epoch": 2.512961762799741, + "grad_norm": 0.987773060798645, + "learning_rate": 3.381505088431203e-07, + "loss": 0.0813, + "step": 7755 + }, + { + "epoch": 2.5132858068697344, + "grad_norm": 0.9104339480400085, + "learning_rate": 3.377113952808669e-07, + "loss": 0.0811, + "step": 7756 + }, + { + "epoch": 2.5136098509397278, + "grad_norm": 0.932121753692627, + "learning_rate": 3.3727254636180597e-07, + "loss": 0.0782, + "step": 7757 + }, + { + "epoch": 2.513933895009721, + "grad_norm": 0.93451988697052, + "learning_rate": 3.3683396213964826e-07, + "loss": 0.0737, + "step": 7758 + }, + { + "epoch": 2.514257939079715, + "grad_norm": 0.8424649834632874, + "learning_rate": 3.363956426680728e-07, + "loss": 0.073, + "step": 7759 + }, + { + "epoch": 2.5145819831497085, + "grad_norm": 0.8437464237213135, + "learning_rate": 3.3595758800072515e-07, + "loss": 0.0735, + "step": 7760 + }, + { + "epoch": 2.514906027219702, + "grad_norm": 0.877597451210022, + "learning_rate": 3.355197981912198e-07, + "loss": 0.0731, + "step": 7761 + }, + { + "epoch": 2.5152300712896953, + "grad_norm": 0.8775132894515991, + "learning_rate": 3.350822732931361e-07, + "loss": 0.0726, + "step": 7762 + }, + { + "epoch": 2.5155541153596888, + "grad_norm": 0.8069689273834229, + "learning_rate": 3.3464501336002544e-07, + "loss": 0.0703, + "step": 7763 + }, + { + "epoch": 2.5158781594296826, + "grad_norm": 0.9043518900871277, + "learning_rate": 3.342080184454022e-07, + "loss": 0.0787, + "step": 7764 + }, + { + "epoch": 2.516202203499676, + "grad_norm": 0.8696305155754089, + "learning_rate": 3.337712886027511e-07, + "loss": 0.0705, + "step": 7765 + }, + { + "epoch": 2.5165262475696695, + "grad_norm": 0.9685964584350586, + "learning_rate": 3.3333482388552356e-07, + "loss": 0.0765, + "step": 7766 + }, + { + "epoch": 2.516850291639663, + "grad_norm": 1.0007911920547485, + "learning_rate": 3.3289862434713857e-07, + "loss": 0.083, + "step": 7767 + }, + { + "epoch": 2.5171743357096563, + "grad_norm": 0.9157785177230835, + "learning_rate": 3.3246269004098275e-07, + "loss": 0.0797, + "step": 7768 + }, + { + "epoch": 2.51749837977965, + "grad_norm": 0.8096321821212769, + "learning_rate": 3.320270210204107e-07, + "loss": 0.0695, + "step": 7769 + }, + { + "epoch": 2.5178224238496436, + "grad_norm": 0.9370676279067993, + "learning_rate": 3.3159161733874347e-07, + "loss": 0.0757, + "step": 7770 + }, + { + "epoch": 2.518146467919637, + "grad_norm": 0.9458437561988831, + "learning_rate": 3.311564790492702e-07, + "loss": 0.0779, + "step": 7771 + }, + { + "epoch": 2.5184705119896305, + "grad_norm": 0.942054271697998, + "learning_rate": 3.307216062052479e-07, + "loss": 0.0758, + "step": 7772 + }, + { + "epoch": 2.518794556059624, + "grad_norm": 0.8612393736839294, + "learning_rate": 3.3028699885990085e-07, + "loss": 0.0741, + "step": 7773 + }, + { + "epoch": 2.5191186001296177, + "grad_norm": 0.90648353099823, + "learning_rate": 3.298526570664207e-07, + "loss": 0.0795, + "step": 7774 + }, + { + "epoch": 2.519442644199611, + "grad_norm": 0.8954811096191406, + "learning_rate": 3.294185808779665e-07, + "loss": 0.0764, + "step": 7775 + }, + { + "epoch": 2.5197666882696046, + "grad_norm": 0.900047779083252, + "learning_rate": 3.289847703476659e-07, + "loss": 0.0791, + "step": 7776 + }, + { + "epoch": 2.5200907323395985, + "grad_norm": 0.8750745058059692, + "learning_rate": 3.285512255286111e-07, + "loss": 0.0747, + "step": 7777 + }, + { + "epoch": 2.5204147764095914, + "grad_norm": 0.8622852563858032, + "learning_rate": 3.2811794647386625e-07, + "loss": 0.077, + "step": 7778 + }, + { + "epoch": 2.5207388204795853, + "grad_norm": 0.8372640609741211, + "learning_rate": 3.276849332364587e-07, + "loss": 0.0732, + "step": 7779 + }, + { + "epoch": 2.5210628645495787, + "grad_norm": 0.8605118989944458, + "learning_rate": 3.2725218586938584e-07, + "loss": 0.0746, + "step": 7780 + }, + { + "epoch": 2.521386908619572, + "grad_norm": 0.9245160818099976, + "learning_rate": 3.2681970442561134e-07, + "loss": 0.0733, + "step": 7781 + }, + { + "epoch": 2.521710952689566, + "grad_norm": 0.9335709810256958, + "learning_rate": 3.2638748895806705e-07, + "loss": 0.081, + "step": 7782 + }, + { + "epoch": 2.5220349967595594, + "grad_norm": 0.914212167263031, + "learning_rate": 3.259555395196526e-07, + "loss": 0.0799, + "step": 7783 + }, + { + "epoch": 2.522359040829553, + "grad_norm": 0.9667627215385437, + "learning_rate": 3.255238561632326e-07, + "loss": 0.0784, + "step": 7784 + }, + { + "epoch": 2.5226830848995463, + "grad_norm": 0.9132807850837708, + "learning_rate": 3.250924389416432e-07, + "loss": 0.0822, + "step": 7785 + }, + { + "epoch": 2.5230071289695397, + "grad_norm": 0.9615965485572815, + "learning_rate": 3.2466128790768327e-07, + "loss": 0.0795, + "step": 7786 + }, + { + "epoch": 2.5233311730395336, + "grad_norm": 0.8738017678260803, + "learning_rate": 3.2423040311412384e-07, + "loss": 0.0746, + "step": 7787 + }, + { + "epoch": 2.523655217109527, + "grad_norm": 0.8721928000450134, + "learning_rate": 3.2379978461369976e-07, + "loss": 0.0766, + "step": 7788 + }, + { + "epoch": 2.5239792611795204, + "grad_norm": 0.9483674764633179, + "learning_rate": 3.233694324591144e-07, + "loss": 0.0831, + "step": 7789 + }, + { + "epoch": 2.524303305249514, + "grad_norm": 0.8579971194267273, + "learning_rate": 3.229393467030395e-07, + "loss": 0.0734, + "step": 7790 + }, + { + "epoch": 2.5246273493195073, + "grad_norm": 0.9196363687515259, + "learning_rate": 3.225095273981127e-07, + "loss": 0.0777, + "step": 7791 + }, + { + "epoch": 2.524951393389501, + "grad_norm": 0.9274152517318726, + "learning_rate": 3.2207997459694053e-07, + "loss": 0.0823, + "step": 7792 + }, + { + "epoch": 2.5252754374594946, + "grad_norm": 1.0055663585662842, + "learning_rate": 3.2165068835209506e-07, + "loss": 0.0832, + "step": 7793 + }, + { + "epoch": 2.525599481529488, + "grad_norm": 0.8837597370147705, + "learning_rate": 3.2122166871611736e-07, + "loss": 0.0719, + "step": 7794 + }, + { + "epoch": 2.5259235255994814, + "grad_norm": 0.9010149240493774, + "learning_rate": 3.207929157415152e-07, + "loss": 0.0727, + "step": 7795 + }, + { + "epoch": 2.526247569669475, + "grad_norm": 0.9038609862327576, + "learning_rate": 3.2036442948076395e-07, + "loss": 0.0782, + "step": 7796 + }, + { + "epoch": 2.5265716137394687, + "grad_norm": 0.9510405659675598, + "learning_rate": 3.199362099863057e-07, + "loss": 0.077, + "step": 7797 + }, + { + "epoch": 2.526895657809462, + "grad_norm": 0.9556993246078491, + "learning_rate": 3.19508257310551e-07, + "loss": 0.0768, + "step": 7798 + }, + { + "epoch": 2.5272197018794555, + "grad_norm": 0.8914854526519775, + "learning_rate": 3.190805715058765e-07, + "loss": 0.0772, + "step": 7799 + }, + { + "epoch": 2.527543745949449, + "grad_norm": 0.8729838728904724, + "learning_rate": 3.1865315262462783e-07, + "loss": 0.0731, + "step": 7800 + }, + { + "epoch": 2.5278677900194424, + "grad_norm": 0.9543265700340271, + "learning_rate": 3.182260007191157e-07, + "loss": 0.0803, + "step": 7801 + }, + { + "epoch": 2.5281918340894363, + "grad_norm": 0.8208547234535217, + "learning_rate": 3.1779911584161963e-07, + "loss": 0.0695, + "step": 7802 + }, + { + "epoch": 2.5285158781594297, + "grad_norm": 0.8887255191802979, + "learning_rate": 3.173724980443868e-07, + "loss": 0.0766, + "step": 7803 + }, + { + "epoch": 2.528839922229423, + "grad_norm": 0.8995735049247742, + "learning_rate": 3.1694614737963036e-07, + "loss": 0.071, + "step": 7804 + }, + { + "epoch": 2.529163966299417, + "grad_norm": 0.8485842943191528, + "learning_rate": 3.165200638995328e-07, + "loss": 0.0709, + "step": 7805 + }, + { + "epoch": 2.5294880103694104, + "grad_norm": 0.8586344718933105, + "learning_rate": 3.160942476562404e-07, + "loss": 0.0716, + "step": 7806 + }, + { + "epoch": 2.529812054439404, + "grad_norm": 0.8358325958251953, + "learning_rate": 3.1566869870187115e-07, + "loss": 0.0732, + "step": 7807 + }, + { + "epoch": 2.5301360985093972, + "grad_norm": 0.9183731079101562, + "learning_rate": 3.1524341708850633e-07, + "loss": 0.0829, + "step": 7808 + }, + { + "epoch": 2.5304601425793907, + "grad_norm": 0.8194887638092041, + "learning_rate": 3.148184028681983e-07, + "loss": 0.0709, + "step": 7809 + }, + { + "epoch": 2.5307841866493845, + "grad_norm": 0.8380728363990784, + "learning_rate": 3.1439365609296253e-07, + "loss": 0.0782, + "step": 7810 + }, + { + "epoch": 2.531108230719378, + "grad_norm": 0.8186984658241272, + "learning_rate": 3.1396917681478595e-07, + "loss": 0.0712, + "step": 7811 + }, + { + "epoch": 2.5314322747893714, + "grad_norm": 0.8302901387214661, + "learning_rate": 3.13544965085619e-07, + "loss": 0.0681, + "step": 7812 + }, + { + "epoch": 2.531756318859365, + "grad_norm": 0.911170244216919, + "learning_rate": 3.1312102095738205e-07, + "loss": 0.0755, + "step": 7813 + }, + { + "epoch": 2.5320803629293582, + "grad_norm": 0.9269139766693115, + "learning_rate": 3.12697344481962e-07, + "loss": 0.0769, + "step": 7814 + }, + { + "epoch": 2.532404406999352, + "grad_norm": 0.9284564256668091, + "learning_rate": 3.1227393571121117e-07, + "loss": 0.0753, + "step": 7815 + }, + { + "epoch": 2.5327284510693455, + "grad_norm": 0.9173905253410339, + "learning_rate": 3.1185079469695263e-07, + "loss": 0.078, + "step": 7816 + }, + { + "epoch": 2.533052495139339, + "grad_norm": 0.97370845079422, + "learning_rate": 3.1142792149097297e-07, + "loss": 0.0832, + "step": 7817 + }, + { + "epoch": 2.5333765392093324, + "grad_norm": 0.9200071096420288, + "learning_rate": 3.110053161450299e-07, + "loss": 0.0781, + "step": 7818 + }, + { + "epoch": 2.533700583279326, + "grad_norm": 0.9102277755737305, + "learning_rate": 3.105829787108444e-07, + "loss": 0.0759, + "step": 7819 + }, + { + "epoch": 2.5340246273493197, + "grad_norm": 0.8209194540977478, + "learning_rate": 3.10160909240107e-07, + "loss": 0.0703, + "step": 7820 + }, + { + "epoch": 2.534348671419313, + "grad_norm": 0.824000895023346, + "learning_rate": 3.0973910778447523e-07, + "loss": 0.0712, + "step": 7821 + }, + { + "epoch": 2.5346727154893065, + "grad_norm": 0.8389688730239868, + "learning_rate": 3.0931757439557313e-07, + "loss": 0.0693, + "step": 7822 + }, + { + "epoch": 2.5349967595593, + "grad_norm": 0.9663325548171997, + "learning_rate": 3.08896309124993e-07, + "loss": 0.0794, + "step": 7823 + }, + { + "epoch": 2.5353208036292934, + "grad_norm": 0.902708888053894, + "learning_rate": 3.084753120242928e-07, + "loss": 0.0775, + "step": 7824 + }, + { + "epoch": 2.535644847699287, + "grad_norm": 0.8854089379310608, + "learning_rate": 3.0805458314499855e-07, + "loss": 0.0725, + "step": 7825 + }, + { + "epoch": 2.5359688917692806, + "grad_norm": 0.8439732789993286, + "learning_rate": 3.076341225386037e-07, + "loss": 0.0745, + "step": 7826 + }, + { + "epoch": 2.536292935839274, + "grad_norm": 0.915687084197998, + "learning_rate": 3.0721393025656853e-07, + "loss": 0.0714, + "step": 7827 + }, + { + "epoch": 2.536616979909268, + "grad_norm": 0.9396669268608093, + "learning_rate": 3.0679400635032053e-07, + "loss": 0.0804, + "step": 7828 + }, + { + "epoch": 2.536941023979261, + "grad_norm": 0.8426708579063416, + "learning_rate": 3.063743508712544e-07, + "loss": 0.0727, + "step": 7829 + }, + { + "epoch": 2.537265068049255, + "grad_norm": 1.0279992818832397, + "learning_rate": 3.059549638707315e-07, + "loss": 0.0823, + "step": 7830 + }, + { + "epoch": 2.537589112119248, + "grad_norm": 0.9124258160591125, + "learning_rate": 3.0553584540008176e-07, + "loss": 0.0826, + "step": 7831 + }, + { + "epoch": 2.5379131561892416, + "grad_norm": 0.9020891785621643, + "learning_rate": 3.0511699551059927e-07, + "loss": 0.0766, + "step": 7832 + }, + { + "epoch": 2.5382372002592355, + "grad_norm": 0.877143144607544, + "learning_rate": 3.0469841425354945e-07, + "loss": 0.0779, + "step": 7833 + }, + { + "epoch": 2.538561244329229, + "grad_norm": 0.9416757822036743, + "learning_rate": 3.0428010168016107e-07, + "loss": 0.0785, + "step": 7834 + }, + { + "epoch": 2.5388852883992223, + "grad_norm": 0.9122092127799988, + "learning_rate": 3.0386205784163207e-07, + "loss": 0.0755, + "step": 7835 + }, + { + "epoch": 2.5392093324692158, + "grad_norm": 0.8774349689483643, + "learning_rate": 3.0344428278912765e-07, + "loss": 0.0727, + "step": 7836 + }, + { + "epoch": 2.539533376539209, + "grad_norm": 0.8674144744873047, + "learning_rate": 3.030267765737774e-07, + "loss": 0.0749, + "step": 7837 + }, + { + "epoch": 2.539857420609203, + "grad_norm": 0.9305141568183899, + "learning_rate": 3.026095392466824e-07, + "loss": 0.0856, + "step": 7838 + }, + { + "epoch": 2.5401814646791965, + "grad_norm": 0.8678079843521118, + "learning_rate": 3.021925708589066e-07, + "loss": 0.0758, + "step": 7839 + }, + { + "epoch": 2.54050550874919, + "grad_norm": 0.9064074754714966, + "learning_rate": 3.0177587146148435e-07, + "loss": 0.0787, + "step": 7840 + }, + { + "epoch": 2.5408295528191833, + "grad_norm": 0.9406875371932983, + "learning_rate": 3.013594411054144e-07, + "loss": 0.0781, + "step": 7841 + }, + { + "epoch": 2.5411535968891767, + "grad_norm": 0.883948564529419, + "learning_rate": 3.0094327984166506e-07, + "loss": 0.0786, + "step": 7842 + }, + { + "epoch": 2.5414776409591706, + "grad_norm": 0.9419253468513489, + "learning_rate": 3.0052738772116925e-07, + "loss": 0.0809, + "step": 7843 + }, + { + "epoch": 2.541801685029164, + "grad_norm": 0.8636225461959839, + "learning_rate": 3.001117647948287e-07, + "loss": 0.0756, + "step": 7844 + }, + { + "epoch": 2.5421257290991575, + "grad_norm": 0.8698762059211731, + "learning_rate": 2.996964111135123e-07, + "loss": 0.0747, + "step": 7845 + }, + { + "epoch": 2.542449773169151, + "grad_norm": 0.9199694395065308, + "learning_rate": 2.992813267280531e-07, + "loss": 0.0767, + "step": 7846 + }, + { + "epoch": 2.5427738172391443, + "grad_norm": 0.931075394153595, + "learning_rate": 2.988665116892564e-07, + "loss": 0.0843, + "step": 7847 + }, + { + "epoch": 2.543097861309138, + "grad_norm": 0.8880044221878052, + "learning_rate": 2.9845196604788935e-07, + "loss": 0.0744, + "step": 7848 + }, + { + "epoch": 2.5434219053791316, + "grad_norm": 0.9354466199874878, + "learning_rate": 2.980376898546888e-07, + "loss": 0.0771, + "step": 7849 + }, + { + "epoch": 2.543745949449125, + "grad_norm": 0.8996589183807373, + "learning_rate": 2.976236831603588e-07, + "loss": 0.0777, + "step": 7850 + }, + { + "epoch": 2.5440699935191184, + "grad_norm": 0.9524653553962708, + "learning_rate": 2.972099460155689e-07, + "loss": 0.0783, + "step": 7851 + }, + { + "epoch": 2.544394037589112, + "grad_norm": 0.928536057472229, + "learning_rate": 2.9679647847095735e-07, + "loss": 0.0809, + "step": 7852 + }, + { + "epoch": 2.5447180816591057, + "grad_norm": 0.8683675527572632, + "learning_rate": 2.9638328057712775e-07, + "loss": 0.0703, + "step": 7853 + }, + { + "epoch": 2.545042125729099, + "grad_norm": 0.9104581475257874, + "learning_rate": 2.9597035238465214e-07, + "loss": 0.0828, + "step": 7854 + }, + { + "epoch": 2.5453661697990926, + "grad_norm": 0.9258098602294922, + "learning_rate": 2.9555769394406934e-07, + "loss": 0.0819, + "step": 7855 + }, + { + "epoch": 2.5456902138690864, + "grad_norm": 0.8759539723396301, + "learning_rate": 2.9514530530588367e-07, + "loss": 0.0726, + "step": 7856 + }, + { + "epoch": 2.54601425793908, + "grad_norm": 0.9606726169586182, + "learning_rate": 2.947331865205677e-07, + "loss": 0.079, + "step": 7857 + }, + { + "epoch": 2.5463383020090733, + "grad_norm": 0.9050784111022949, + "learning_rate": 2.943213376385612e-07, + "loss": 0.0774, + "step": 7858 + }, + { + "epoch": 2.5466623460790667, + "grad_norm": 0.9557004570960999, + "learning_rate": 2.9390975871027046e-07, + "loss": 0.0804, + "step": 7859 + }, + { + "epoch": 2.54698639014906, + "grad_norm": 0.8810763955116272, + "learning_rate": 2.934984497860691e-07, + "loss": 0.0795, + "step": 7860 + }, + { + "epoch": 2.547310434219054, + "grad_norm": 0.8604434728622437, + "learning_rate": 2.9308741091629596e-07, + "loss": 0.0784, + "step": 7861 + }, + { + "epoch": 2.5476344782890474, + "grad_norm": 0.8944520950317383, + "learning_rate": 2.9267664215126e-07, + "loss": 0.075, + "step": 7862 + }, + { + "epoch": 2.547958522359041, + "grad_norm": 0.9683983325958252, + "learning_rate": 2.9226614354123356e-07, + "loss": 0.0823, + "step": 7863 + }, + { + "epoch": 2.5482825664290343, + "grad_norm": 0.8846125602722168, + "learning_rate": 2.9185591513645947e-07, + "loss": 0.0787, + "step": 7864 + }, + { + "epoch": 2.5486066104990277, + "grad_norm": 0.9127246737480164, + "learning_rate": 2.914459569871447e-07, + "loss": 0.0744, + "step": 7865 + }, + { + "epoch": 2.5489306545690216, + "grad_norm": 0.8919868469238281, + "learning_rate": 2.91036269143464e-07, + "loss": 0.0749, + "step": 7866 + }, + { + "epoch": 2.549254698639015, + "grad_norm": 0.9504753351211548, + "learning_rate": 2.9062685165555963e-07, + "loss": 0.0729, + "step": 7867 + }, + { + "epoch": 2.5495787427090084, + "grad_norm": 1.0182647705078125, + "learning_rate": 2.9021770457354046e-07, + "loss": 0.077, + "step": 7868 + }, + { + "epoch": 2.549902786779002, + "grad_norm": 0.923927366733551, + "learning_rate": 2.8980882794748227e-07, + "loss": 0.0802, + "step": 7869 + }, + { + "epoch": 2.5502268308489953, + "grad_norm": 0.9612078666687012, + "learning_rate": 2.894002218274261e-07, + "loss": 0.0803, + "step": 7870 + }, + { + "epoch": 2.550550874918989, + "grad_norm": 0.9394988417625427, + "learning_rate": 2.8899188626338363e-07, + "loss": 0.0846, + "step": 7871 + }, + { + "epoch": 2.5508749189889826, + "grad_norm": 0.9127728343009949, + "learning_rate": 2.8858382130532965e-07, + "loss": 0.0777, + "step": 7872 + }, + { + "epoch": 2.551198963058976, + "grad_norm": 0.8843898773193359, + "learning_rate": 2.8817602700320747e-07, + "loss": 0.078, + "step": 7873 + }, + { + "epoch": 2.5515230071289694, + "grad_norm": 0.9183049201965332, + "learning_rate": 2.8776850340692777e-07, + "loss": 0.0765, + "step": 7874 + }, + { + "epoch": 2.551847051198963, + "grad_norm": 0.9552394151687622, + "learning_rate": 2.87361250566367e-07, + "loss": 0.0776, + "step": 7875 + }, + { + "epoch": 2.5521710952689567, + "grad_norm": 0.9023454189300537, + "learning_rate": 2.869542685313692e-07, + "loss": 0.0791, + "step": 7876 + }, + { + "epoch": 2.55249513933895, + "grad_norm": 0.908761203289032, + "learning_rate": 2.865475573517451e-07, + "loss": 0.073, + "step": 7877 + }, + { + "epoch": 2.5528191834089435, + "grad_norm": 0.954011082649231, + "learning_rate": 2.8614111707727267e-07, + "loss": 0.0789, + "step": 7878 + }, + { + "epoch": 2.5531432274789374, + "grad_norm": 0.9677690267562866, + "learning_rate": 2.8573494775769485e-07, + "loss": 0.0789, + "step": 7879 + }, + { + "epoch": 2.5534672715489304, + "grad_norm": 0.9998055696487427, + "learning_rate": 2.853290494427238e-07, + "loss": 0.0818, + "step": 7880 + }, + { + "epoch": 2.5537913156189243, + "grad_norm": 0.9223211407661438, + "learning_rate": 2.8492342218203766e-07, + "loss": 0.0832, + "step": 7881 + }, + { + "epoch": 2.5541153596889177, + "grad_norm": 0.8618203997612, + "learning_rate": 2.845180660252808e-07, + "loss": 0.0725, + "step": 7882 + }, + { + "epoch": 2.554439403758911, + "grad_norm": 0.920971691608429, + "learning_rate": 2.8411298102206524e-07, + "loss": 0.08, + "step": 7883 + }, + { + "epoch": 2.554763447828905, + "grad_norm": 0.7992531061172485, + "learning_rate": 2.837081672219694e-07, + "loss": 0.0707, + "step": 7884 + }, + { + "epoch": 2.5550874918988984, + "grad_norm": 0.869141161441803, + "learning_rate": 2.833036246745385e-07, + "loss": 0.0752, + "step": 7885 + }, + { + "epoch": 2.555411535968892, + "grad_norm": 0.9022712707519531, + "learning_rate": 2.828993534292851e-07, + "loss": 0.0765, + "step": 7886 + }, + { + "epoch": 2.5557355800388852, + "grad_norm": 0.938575804233551, + "learning_rate": 2.824953535356872e-07, + "loss": 0.0774, + "step": 7887 + }, + { + "epoch": 2.5560596241088787, + "grad_norm": 0.8692349791526794, + "learning_rate": 2.820916250431907e-07, + "loss": 0.0685, + "step": 7888 + }, + { + "epoch": 2.5563836681788725, + "grad_norm": 0.8855343461036682, + "learning_rate": 2.8168816800120845e-07, + "loss": 0.0721, + "step": 7889 + }, + { + "epoch": 2.556707712248866, + "grad_norm": 0.863601803779602, + "learning_rate": 2.812849824591196e-07, + "loss": 0.0754, + "step": 7890 + }, + { + "epoch": 2.5570317563188594, + "grad_norm": 0.883549690246582, + "learning_rate": 2.808820684662705e-07, + "loss": 0.0752, + "step": 7891 + }, + { + "epoch": 2.557355800388853, + "grad_norm": 0.8747949004173279, + "learning_rate": 2.804794260719726e-07, + "loss": 0.0797, + "step": 7892 + }, + { + "epoch": 2.557679844458846, + "grad_norm": 0.8760228157043457, + "learning_rate": 2.800770553255072e-07, + "loss": 0.0759, + "step": 7893 + }, + { + "epoch": 2.55800388852884, + "grad_norm": 1.0065068006515503, + "learning_rate": 2.796749562761186e-07, + "loss": 0.0788, + "step": 7894 + }, + { + "epoch": 2.5583279325988335, + "grad_norm": 0.9806517958641052, + "learning_rate": 2.7927312897302217e-07, + "loss": 0.0827, + "step": 7895 + }, + { + "epoch": 2.558651976668827, + "grad_norm": 0.9483767747879028, + "learning_rate": 2.7887157346539574e-07, + "loss": 0.0805, + "step": 7896 + }, + { + "epoch": 2.5589760207388204, + "grad_norm": 0.8807239532470703, + "learning_rate": 2.7847028980238666e-07, + "loss": 0.0723, + "step": 7897 + }, + { + "epoch": 2.559300064808814, + "grad_norm": 0.8338807225227356, + "learning_rate": 2.780692780331079e-07, + "loss": 0.0682, + "step": 7898 + }, + { + "epoch": 2.5596241088788076, + "grad_norm": 0.9363901019096375, + "learning_rate": 2.7766853820663963e-07, + "loss": 0.0807, + "step": 7899 + }, + { + "epoch": 2.559948152948801, + "grad_norm": 0.8420773148536682, + "learning_rate": 2.7726807037202903e-07, + "loss": 0.0716, + "step": 7900 + }, + { + "epoch": 2.5602721970187945, + "grad_norm": 0.9283404350280762, + "learning_rate": 2.7686787457828796e-07, + "loss": 0.0797, + "step": 7901 + }, + { + "epoch": 2.560596241088788, + "grad_norm": 0.864181399345398, + "learning_rate": 2.764679508743981e-07, + "loss": 0.0729, + "step": 7902 + }, + { + "epoch": 2.5609202851587813, + "grad_norm": 0.89631187915802, + "learning_rate": 2.7606829930930555e-07, + "loss": 0.0727, + "step": 7903 + }, + { + "epoch": 2.561244329228775, + "grad_norm": 0.8723975419998169, + "learning_rate": 2.7566891993192347e-07, + "loss": 0.0751, + "step": 7904 + }, + { + "epoch": 2.5615683732987686, + "grad_norm": 0.827847957611084, + "learning_rate": 2.752698127911327e-07, + "loss": 0.0709, + "step": 7905 + }, + { + "epoch": 2.561892417368762, + "grad_norm": 0.8541545271873474, + "learning_rate": 2.748709779357794e-07, + "loss": 0.0735, + "step": 7906 + }, + { + "epoch": 2.562216461438756, + "grad_norm": 0.9275196194648743, + "learning_rate": 2.744724154146777e-07, + "loss": 0.0831, + "step": 7907 + }, + { + "epoch": 2.5625405055087493, + "grad_norm": 0.8933717012405396, + "learning_rate": 2.740741252766077e-07, + "loss": 0.0717, + "step": 7908 + }, + { + "epoch": 2.5628645495787428, + "grad_norm": 0.8624947667121887, + "learning_rate": 2.736761075703165e-07, + "loss": 0.0727, + "step": 7909 + }, + { + "epoch": 2.563188593648736, + "grad_norm": 0.874412477016449, + "learning_rate": 2.732783623445168e-07, + "loss": 0.0767, + "step": 7910 + }, + { + "epoch": 2.5635126377187296, + "grad_norm": 0.9391658902168274, + "learning_rate": 2.728808896478891e-07, + "loss": 0.0822, + "step": 7911 + }, + { + "epoch": 2.5638366817887235, + "grad_norm": 0.8865946531295776, + "learning_rate": 2.7248368952908055e-07, + "loss": 0.0781, + "step": 7912 + }, + { + "epoch": 2.564160725858717, + "grad_norm": 0.8825016021728516, + "learning_rate": 2.7208676203670406e-07, + "loss": 0.0784, + "step": 7913 + }, + { + "epoch": 2.5644847699287103, + "grad_norm": 0.8830437660217285, + "learning_rate": 2.716901072193404e-07, + "loss": 0.0709, + "step": 7914 + }, + { + "epoch": 2.5648088139987038, + "grad_norm": 0.9725536704063416, + "learning_rate": 2.71293725125536e-07, + "loss": 0.0849, + "step": 7915 + }, + { + "epoch": 2.565132858068697, + "grad_norm": 0.9594155550003052, + "learning_rate": 2.7089761580380346e-07, + "loss": 0.0754, + "step": 7916 + }, + { + "epoch": 2.565456902138691, + "grad_norm": 0.8194584250450134, + "learning_rate": 2.7050177930262406e-07, + "loss": 0.0711, + "step": 7917 + }, + { + "epoch": 2.5657809462086845, + "grad_norm": 0.8426803350448608, + "learning_rate": 2.701062156704434e-07, + "loss": 0.0744, + "step": 7918 + }, + { + "epoch": 2.566104990278678, + "grad_norm": 0.9032554030418396, + "learning_rate": 2.697109249556748e-07, + "loss": 0.0805, + "step": 7919 + }, + { + "epoch": 2.5664290343486713, + "grad_norm": 0.9481576681137085, + "learning_rate": 2.6931590720669807e-07, + "loss": 0.085, + "step": 7920 + }, + { + "epoch": 2.5667530784186647, + "grad_norm": 0.9876210689544678, + "learning_rate": 2.6892116247185964e-07, + "loss": 0.0689, + "step": 7921 + }, + { + "epoch": 2.5670771224886586, + "grad_norm": 0.8956534266471863, + "learning_rate": 2.6852669079947294e-07, + "loss": 0.0766, + "step": 7922 + }, + { + "epoch": 2.567401166558652, + "grad_norm": 0.9033135771751404, + "learning_rate": 2.681324922378159e-07, + "loss": 0.0742, + "step": 7923 + }, + { + "epoch": 2.5677252106286454, + "grad_norm": 0.8424497246742249, + "learning_rate": 2.6773856683513677e-07, + "loss": 0.0726, + "step": 7924 + }, + { + "epoch": 2.568049254698639, + "grad_norm": 0.8959171772003174, + "learning_rate": 2.673449146396459e-07, + "loss": 0.0754, + "step": 7925 + }, + { + "epoch": 2.5683732987686323, + "grad_norm": 0.9140695929527283, + "learning_rate": 2.6695153569952475e-07, + "loss": 0.0819, + "step": 7926 + }, + { + "epoch": 2.568697342838626, + "grad_norm": 0.8918660283088684, + "learning_rate": 2.665584300629176e-07, + "loss": 0.0733, + "step": 7927 + }, + { + "epoch": 2.5690213869086196, + "grad_norm": 0.819005012512207, + "learning_rate": 2.661655977779373e-07, + "loss": 0.0703, + "step": 7928 + }, + { + "epoch": 2.569345430978613, + "grad_norm": 0.8803487420082092, + "learning_rate": 2.6577303889266244e-07, + "loss": 0.0717, + "step": 7929 + }, + { + "epoch": 2.569669475048607, + "grad_norm": 0.9302169680595398, + "learning_rate": 2.6538075345513864e-07, + "loss": 0.0779, + "step": 7930 + }, + { + "epoch": 2.5699935191186, + "grad_norm": 0.9317216277122498, + "learning_rate": 2.6498874151337865e-07, + "loss": 0.0785, + "step": 7931 + }, + { + "epoch": 2.5703175631885937, + "grad_norm": 0.910908579826355, + "learning_rate": 2.6459700311535885e-07, + "loss": 0.0759, + "step": 7932 + }, + { + "epoch": 2.570641607258587, + "grad_norm": 0.8003696203231812, + "learning_rate": 2.642055383090264e-07, + "loss": 0.0688, + "step": 7933 + }, + { + "epoch": 2.5709656513285806, + "grad_norm": 0.8143637776374817, + "learning_rate": 2.638143471422916e-07, + "loss": 0.0736, + "step": 7934 + }, + { + "epoch": 2.5712896953985744, + "grad_norm": 0.8914068341255188, + "learning_rate": 2.634234296630328e-07, + "loss": 0.0732, + "step": 7935 + }, + { + "epoch": 2.571613739468568, + "grad_norm": 0.8780104517936707, + "learning_rate": 2.6303278591909426e-07, + "loss": 0.0765, + "step": 7936 + }, + { + "epoch": 2.5719377835385613, + "grad_norm": 0.9786894917488098, + "learning_rate": 2.626424159582872e-07, + "loss": 0.0779, + "step": 7937 + }, + { + "epoch": 2.5722618276085547, + "grad_norm": 0.892594575881958, + "learning_rate": 2.622523198283894e-07, + "loss": 0.0763, + "step": 7938 + }, + { + "epoch": 2.572585871678548, + "grad_norm": 0.9260560870170593, + "learning_rate": 2.6186249757714474e-07, + "loss": 0.0812, + "step": 7939 + }, + { + "epoch": 2.572909915748542, + "grad_norm": 0.8858899474143982, + "learning_rate": 2.614729492522633e-07, + "loss": 0.0703, + "step": 7940 + }, + { + "epoch": 2.5732339598185354, + "grad_norm": 0.9430214762687683, + "learning_rate": 2.61083674901422e-07, + "loss": 0.0809, + "step": 7941 + }, + { + "epoch": 2.573558003888529, + "grad_norm": 0.841761589050293, + "learning_rate": 2.6069467457226467e-07, + "loss": 0.073, + "step": 7942 + }, + { + "epoch": 2.5738820479585223, + "grad_norm": 0.9282339215278625, + "learning_rate": 2.6030594831240094e-07, + "loss": 0.0765, + "step": 7943 + }, + { + "epoch": 2.5742060920285157, + "grad_norm": 0.875710666179657, + "learning_rate": 2.599174961694073e-07, + "loss": 0.0785, + "step": 7944 + }, + { + "epoch": 2.5745301360985096, + "grad_norm": 0.8414289355278015, + "learning_rate": 2.595293181908265e-07, + "loss": 0.0697, + "step": 7945 + }, + { + "epoch": 2.574854180168503, + "grad_norm": 0.9343950152397156, + "learning_rate": 2.59141414424168e-07, + "loss": 0.0788, + "step": 7946 + }, + { + "epoch": 2.5751782242384964, + "grad_norm": 0.9397087693214417, + "learning_rate": 2.587537849169064e-07, + "loss": 0.0778, + "step": 7947 + }, + { + "epoch": 2.57550226830849, + "grad_norm": 0.9125270843505859, + "learning_rate": 2.5836642971648534e-07, + "loss": 0.0782, + "step": 7948 + }, + { + "epoch": 2.5758263123784833, + "grad_norm": 0.8976326584815979, + "learning_rate": 2.579793488703122e-07, + "loss": 0.0714, + "step": 7949 + }, + { + "epoch": 2.576150356448477, + "grad_norm": 0.8638294339179993, + "learning_rate": 2.5759254242576246e-07, + "loss": 0.0724, + "step": 7950 + }, + { + "epoch": 2.5764744005184705, + "grad_norm": 0.88809734582901, + "learning_rate": 2.572060104301771e-07, + "loss": 0.077, + "step": 7951 + }, + { + "epoch": 2.576798444588464, + "grad_norm": 0.9091419577598572, + "learning_rate": 2.5681975293086443e-07, + "loss": 0.0752, + "step": 7952 + }, + { + "epoch": 2.5771224886584574, + "grad_norm": 0.9178985953330994, + "learning_rate": 2.564337699750985e-07, + "loss": 0.0796, + "step": 7953 + }, + { + "epoch": 2.577446532728451, + "grad_norm": 0.9933173656463623, + "learning_rate": 2.560480616101191e-07, + "loss": 0.08, + "step": 7954 + }, + { + "epoch": 2.5777705767984447, + "grad_norm": 0.8052213191986084, + "learning_rate": 2.556626278831345e-07, + "loss": 0.0671, + "step": 7955 + }, + { + "epoch": 2.578094620868438, + "grad_norm": 0.916199266910553, + "learning_rate": 2.552774688413165e-07, + "loss": 0.073, + "step": 7956 + }, + { + "epoch": 2.5784186649384315, + "grad_norm": 0.8220997452735901, + "learning_rate": 2.5489258453180676e-07, + "loss": 0.0688, + "step": 7957 + }, + { + "epoch": 2.5787427090084254, + "grad_norm": 0.8971430063247681, + "learning_rate": 2.545079750017099e-07, + "loss": 0.0735, + "step": 7958 + }, + { + "epoch": 2.579066753078419, + "grad_norm": 0.9529185891151428, + "learning_rate": 2.541236402980987e-07, + "loss": 0.0812, + "step": 7959 + }, + { + "epoch": 2.5793907971484122, + "grad_norm": 0.8507676124572754, + "learning_rate": 2.5373958046801207e-07, + "loss": 0.0739, + "step": 7960 + }, + { + "epoch": 2.5797148412184057, + "grad_norm": 1.0006403923034668, + "learning_rate": 2.5335579555845563e-07, + "loss": 0.0805, + "step": 7961 + }, + { + "epoch": 2.580038885288399, + "grad_norm": 0.91471928358078, + "learning_rate": 2.5297228561640075e-07, + "loss": 0.0799, + "step": 7962 + }, + { + "epoch": 2.580362929358393, + "grad_norm": 0.8677946925163269, + "learning_rate": 2.5258905068878433e-07, + "loss": 0.0774, + "step": 7963 + }, + { + "epoch": 2.5806869734283864, + "grad_norm": 0.916735827922821, + "learning_rate": 2.522060908225127e-07, + "loss": 0.074, + "step": 7964 + }, + { + "epoch": 2.58101101749838, + "grad_norm": 0.9161584377288818, + "learning_rate": 2.518234060644545e-07, + "loss": 0.0721, + "step": 7965 + }, + { + "epoch": 2.5813350615683732, + "grad_norm": 0.9978834390640259, + "learning_rate": 2.5144099646144724e-07, + "loss": 0.0773, + "step": 7966 + }, + { + "epoch": 2.5816591056383666, + "grad_norm": 0.843064546585083, + "learning_rate": 2.510588620602947e-07, + "loss": 0.0709, + "step": 7967 + }, + { + "epoch": 2.5819831497083605, + "grad_norm": 0.8937796354293823, + "learning_rate": 2.506770029077657e-07, + "loss": 0.076, + "step": 7968 + }, + { + "epoch": 2.582307193778354, + "grad_norm": 0.8655698299407959, + "learning_rate": 2.502954190505963e-07, + "loss": 0.0783, + "step": 7969 + }, + { + "epoch": 2.5826312378483474, + "grad_norm": 0.8840304017066956, + "learning_rate": 2.499141105354894e-07, + "loss": 0.0797, + "step": 7970 + }, + { + "epoch": 2.582955281918341, + "grad_norm": 0.8878140449523926, + "learning_rate": 2.495330774091126e-07, + "loss": 0.0743, + "step": 7971 + }, + { + "epoch": 2.583279325988334, + "grad_norm": 0.9055677056312561, + "learning_rate": 2.4915231971810064e-07, + "loss": 0.0733, + "step": 7972 + }, + { + "epoch": 2.583603370058328, + "grad_norm": 0.8808376789093018, + "learning_rate": 2.4877183750905475e-07, + "loss": 0.075, + "step": 7973 + }, + { + "epoch": 2.5839274141283215, + "grad_norm": 0.8990373611450195, + "learning_rate": 2.483916308285425e-07, + "loss": 0.0744, + "step": 7974 + }, + { + "epoch": 2.584251458198315, + "grad_norm": 0.8307132720947266, + "learning_rate": 2.4801169972309745e-07, + "loss": 0.066, + "step": 7975 + }, + { + "epoch": 2.5845755022683083, + "grad_norm": 0.9437209963798523, + "learning_rate": 2.4763204423921937e-07, + "loss": 0.0841, + "step": 7976 + }, + { + "epoch": 2.5848995463383018, + "grad_norm": 0.9493826627731323, + "learning_rate": 2.47252664423375e-07, + "loss": 0.0777, + "step": 7977 + }, + { + "epoch": 2.5852235904082956, + "grad_norm": 0.8011552691459656, + "learning_rate": 2.4687356032199516e-07, + "loss": 0.0667, + "step": 7978 + }, + { + "epoch": 2.585547634478289, + "grad_norm": 0.8402678966522217, + "learning_rate": 2.464947319814806e-07, + "loss": 0.0706, + "step": 7979 + }, + { + "epoch": 2.5858716785482825, + "grad_norm": 0.9520148634910583, + "learning_rate": 2.461161794481945e-07, + "loss": 0.0821, + "step": 7980 + }, + { + "epoch": 2.5861957226182763, + "grad_norm": 0.8773624897003174, + "learning_rate": 2.4573790276846947e-07, + "loss": 0.0786, + "step": 7981 + }, + { + "epoch": 2.5865197666882693, + "grad_norm": 0.8360825181007385, + "learning_rate": 2.453599019886016e-07, + "loss": 0.0694, + "step": 7982 + }, + { + "epoch": 2.586843810758263, + "grad_norm": 0.9423222541809082, + "learning_rate": 2.449821771548552e-07, + "loss": 0.0773, + "step": 7983 + }, + { + "epoch": 2.5871678548282566, + "grad_norm": 0.8453056216239929, + "learning_rate": 2.446047283134606e-07, + "loss": 0.0694, + "step": 7984 + }, + { + "epoch": 2.58749189889825, + "grad_norm": 0.8733270764350891, + "learning_rate": 2.4422755551061246e-07, + "loss": 0.0732, + "step": 7985 + }, + { + "epoch": 2.587815942968244, + "grad_norm": 0.879425048828125, + "learning_rate": 2.4385065879247466e-07, + "loss": 0.076, + "step": 7986 + }, + { + "epoch": 2.5881399870382373, + "grad_norm": 0.8516666889190674, + "learning_rate": 2.4347403820517423e-07, + "loss": 0.0745, + "step": 7987 + }, + { + "epoch": 2.5884640311082308, + "grad_norm": 0.8790467977523804, + "learning_rate": 2.4309769379480764e-07, + "loss": 0.0691, + "step": 7988 + }, + { + "epoch": 2.588788075178224, + "grad_norm": 0.9171347618103027, + "learning_rate": 2.427216256074341e-07, + "loss": 0.0751, + "step": 7989 + }, + { + "epoch": 2.5891121192482176, + "grad_norm": 0.9451388120651245, + "learning_rate": 2.423458336890816e-07, + "loss": 0.0833, + "step": 7990 + }, + { + "epoch": 2.5894361633182115, + "grad_norm": 0.8528581857681274, + "learning_rate": 2.4197031808574327e-07, + "loss": 0.0739, + "step": 7991 + }, + { + "epoch": 2.589760207388205, + "grad_norm": 0.8192191123962402, + "learning_rate": 2.4159507884337877e-07, + "loss": 0.0716, + "step": 7992 + }, + { + "epoch": 2.5900842514581983, + "grad_norm": 0.9669711589813232, + "learning_rate": 2.4122011600791334e-07, + "loss": 0.0794, + "step": 7993 + }, + { + "epoch": 2.5904082955281917, + "grad_norm": 0.9040258526802063, + "learning_rate": 2.408454296252397e-07, + "loss": 0.081, + "step": 7994 + }, + { + "epoch": 2.590732339598185, + "grad_norm": 0.9025174975395203, + "learning_rate": 2.404710197412144e-07, + "loss": 0.0758, + "step": 7995 + }, + { + "epoch": 2.591056383668179, + "grad_norm": 0.9596523642539978, + "learning_rate": 2.4009688640166257e-07, + "loss": 0.0827, + "step": 7996 + }, + { + "epoch": 2.5913804277381725, + "grad_norm": 0.915219783782959, + "learning_rate": 2.397230296523742e-07, + "loss": 0.0773, + "step": 7997 + }, + { + "epoch": 2.591704471808166, + "grad_norm": 0.8500992059707642, + "learning_rate": 2.3934944953910576e-07, + "loss": 0.0725, + "step": 7998 + }, + { + "epoch": 2.5920285158781593, + "grad_norm": 0.8785587549209595, + "learning_rate": 2.3897614610757984e-07, + "loss": 0.0747, + "step": 7999 + }, + { + "epoch": 2.5923525599481527, + "grad_norm": 0.8524248003959656, + "learning_rate": 2.386031194034855e-07, + "loss": 0.0693, + "step": 8000 + }, + { + "epoch": 2.5926766040181466, + "grad_norm": 0.9006550312042236, + "learning_rate": 2.3823036947247773e-07, + "loss": 0.0764, + "step": 8001 + }, + { + "epoch": 2.59300064808814, + "grad_norm": 0.8533654808998108, + "learning_rate": 2.3785789636017604e-07, + "loss": 0.0735, + "step": 8002 + }, + { + "epoch": 2.5933246921581334, + "grad_norm": 0.8423067927360535, + "learning_rate": 2.374857001121697e-07, + "loss": 0.0731, + "step": 8003 + }, + { + "epoch": 2.593648736228127, + "grad_norm": 0.9366042017936707, + "learning_rate": 2.371137807740101e-07, + "loss": 0.0797, + "step": 8004 + }, + { + "epoch": 2.5939727802981203, + "grad_norm": 1.1670465469360352, + "learning_rate": 2.3674213839121745e-07, + "loss": 0.0781, + "step": 8005 + }, + { + "epoch": 2.594296824368114, + "grad_norm": 0.9376868009567261, + "learning_rate": 2.3637077300927762e-07, + "loss": 0.0815, + "step": 8006 + }, + { + "epoch": 2.5946208684381076, + "grad_norm": 0.8395928740501404, + "learning_rate": 2.3599968467364037e-07, + "loss": 0.0699, + "step": 8007 + }, + { + "epoch": 2.594944912508101, + "grad_norm": 0.8741575479507446, + "learning_rate": 2.3562887342972574e-07, + "loss": 0.075, + "step": 8008 + }, + { + "epoch": 2.595268956578095, + "grad_norm": 0.9370665550231934, + "learning_rate": 2.3525833932291491e-07, + "loss": 0.0759, + "step": 8009 + }, + { + "epoch": 2.5955930006480883, + "grad_norm": 0.9461312890052795, + "learning_rate": 2.3488808239855998e-07, + "loss": 0.0786, + "step": 8010 + }, + { + "epoch": 2.5959170447180817, + "grad_norm": 0.8575646281242371, + "learning_rate": 2.3451810270197494e-07, + "loss": 0.0711, + "step": 8011 + }, + { + "epoch": 2.596241088788075, + "grad_norm": 0.9234625697135925, + "learning_rate": 2.341484002784436e-07, + "loss": 0.0805, + "step": 8012 + }, + { + "epoch": 2.5965651328580686, + "grad_norm": 0.8345510959625244, + "learning_rate": 2.3377897517321224e-07, + "loss": 0.0734, + "step": 8013 + }, + { + "epoch": 2.5968891769280624, + "grad_norm": 0.8022050261497498, + "learning_rate": 2.3340982743149582e-07, + "loss": 0.0669, + "step": 8014 + }, + { + "epoch": 2.597213220998056, + "grad_norm": 0.9139599800109863, + "learning_rate": 2.3304095709847402e-07, + "loss": 0.0778, + "step": 8015 + }, + { + "epoch": 2.5975372650680493, + "grad_norm": 0.863365888595581, + "learning_rate": 2.3267236421929323e-07, + "loss": 0.0714, + "step": 8016 + }, + { + "epoch": 2.5978613091380427, + "grad_norm": 0.9220430254936218, + "learning_rate": 2.3230404883906626e-07, + "loss": 0.0783, + "step": 8017 + }, + { + "epoch": 2.598185353208036, + "grad_norm": 0.9168446660041809, + "learning_rate": 2.319360110028701e-07, + "loss": 0.0742, + "step": 8018 + }, + { + "epoch": 2.59850939727803, + "grad_norm": 0.9191902875900269, + "learning_rate": 2.3156825075574956e-07, + "loss": 0.0682, + "step": 8019 + }, + { + "epoch": 2.5988334413480234, + "grad_norm": 0.9062588810920715, + "learning_rate": 2.312007681427153e-07, + "loss": 0.0841, + "step": 8020 + }, + { + "epoch": 2.599157485418017, + "grad_norm": 0.8564749956130981, + "learning_rate": 2.30833563208743e-07, + "loss": 0.077, + "step": 8021 + }, + { + "epoch": 2.5994815294880103, + "grad_norm": 0.8729365468025208, + "learning_rate": 2.304666359987756e-07, + "loss": 0.0756, + "step": 8022 + }, + { + "epoch": 2.5998055735580037, + "grad_norm": 0.9407651424407959, + "learning_rate": 2.300999865577211e-07, + "loss": 0.0796, + "step": 8023 + }, + { + "epoch": 2.6001296176279975, + "grad_norm": 0.9075953364372253, + "learning_rate": 2.2973361493045382e-07, + "loss": 0.0782, + "step": 8024 + }, + { + "epoch": 2.600453661697991, + "grad_norm": 0.8575737476348877, + "learning_rate": 2.293675211618146e-07, + "loss": 0.0762, + "step": 8025 + }, + { + "epoch": 2.6007777057679844, + "grad_norm": 0.8725225329399109, + "learning_rate": 2.2900170529660898e-07, + "loss": 0.072, + "step": 8026 + }, + { + "epoch": 2.601101749837978, + "grad_norm": 0.8768883943557739, + "learning_rate": 2.2863616737960976e-07, + "loss": 0.0757, + "step": 8027 + }, + { + "epoch": 2.6014257939079712, + "grad_norm": 0.8702290654182434, + "learning_rate": 2.2827090745555502e-07, + "loss": 0.0729, + "step": 8028 + }, + { + "epoch": 2.601749837977965, + "grad_norm": 0.9248967170715332, + "learning_rate": 2.279059255691493e-07, + "loss": 0.0831, + "step": 8029 + }, + { + "epoch": 2.6020738820479585, + "grad_norm": 0.8153299689292908, + "learning_rate": 2.2754122176506244e-07, + "loss": 0.0705, + "step": 8030 + }, + { + "epoch": 2.602397926117952, + "grad_norm": 0.8644693493843079, + "learning_rate": 2.271767960879312e-07, + "loss": 0.0724, + "step": 8031 + }, + { + "epoch": 2.602721970187946, + "grad_norm": 0.8582781553268433, + "learning_rate": 2.2681264858235797e-07, + "loss": 0.0707, + "step": 8032 + }, + { + "epoch": 2.6030460142579392, + "grad_norm": 0.8408800959587097, + "learning_rate": 2.2644877929290932e-07, + "loss": 0.0761, + "step": 8033 + }, + { + "epoch": 2.6033700583279327, + "grad_norm": 0.8595169186592102, + "learning_rate": 2.2608518826412128e-07, + "loss": 0.0719, + "step": 8034 + }, + { + "epoch": 2.603694102397926, + "grad_norm": 1.0227017402648926, + "learning_rate": 2.2572187554049274e-07, + "loss": 0.0763, + "step": 8035 + }, + { + "epoch": 2.6040181464679195, + "grad_norm": 0.8645336627960205, + "learning_rate": 2.2535884116648976e-07, + "loss": 0.0726, + "step": 8036 + }, + { + "epoch": 2.6043421905379134, + "grad_norm": 0.8675413727760315, + "learning_rate": 2.2499608518654432e-07, + "loss": 0.0754, + "step": 8037 + }, + { + "epoch": 2.604666234607907, + "grad_norm": 0.9440889358520508, + "learning_rate": 2.2463360764505448e-07, + "loss": 0.073, + "step": 8038 + }, + { + "epoch": 2.6049902786779002, + "grad_norm": 0.9420521855354309, + "learning_rate": 2.2427140858638424e-07, + "loss": 0.0767, + "step": 8039 + }, + { + "epoch": 2.6053143227478937, + "grad_norm": 0.8823314905166626, + "learning_rate": 2.2390948805486174e-07, + "loss": 0.0723, + "step": 8040 + }, + { + "epoch": 2.605638366817887, + "grad_norm": 0.7878745794296265, + "learning_rate": 2.2354784609478485e-07, + "loss": 0.0685, + "step": 8041 + }, + { + "epoch": 2.605962410887881, + "grad_norm": 0.9195688366889954, + "learning_rate": 2.2318648275041267e-07, + "loss": 0.0742, + "step": 8042 + }, + { + "epoch": 2.6062864549578744, + "grad_norm": 0.8763304352760315, + "learning_rate": 2.2282539806597476e-07, + "loss": 0.0712, + "step": 8043 + }, + { + "epoch": 2.606610499027868, + "grad_norm": 0.8439639806747437, + "learning_rate": 2.22464592085663e-07, + "loss": 0.0722, + "step": 8044 + }, + { + "epoch": 2.606934543097861, + "grad_norm": 0.8869640231132507, + "learning_rate": 2.2210406485363656e-07, + "loss": 0.0726, + "step": 8045 + }, + { + "epoch": 2.6072585871678546, + "grad_norm": 0.9663493633270264, + "learning_rate": 2.217438164140212e-07, + "loss": 0.0801, + "step": 8046 + }, + { + "epoch": 2.6075826312378485, + "grad_norm": 0.9520936608314514, + "learning_rate": 2.213838468109075e-07, + "loss": 0.0728, + "step": 8047 + }, + { + "epoch": 2.607906675307842, + "grad_norm": 0.9151571989059448, + "learning_rate": 2.210241560883525e-07, + "loss": 0.0816, + "step": 8048 + }, + { + "epoch": 2.6082307193778353, + "grad_norm": 0.9211403131484985, + "learning_rate": 2.206647442903781e-07, + "loss": 0.0747, + "step": 8049 + }, + { + "epoch": 2.6085547634478288, + "grad_norm": 0.8817241787910461, + "learning_rate": 2.2030561146097363e-07, + "loss": 0.0784, + "step": 8050 + }, + { + "epoch": 2.608878807517822, + "grad_norm": 0.8810299634933472, + "learning_rate": 2.199467576440928e-07, + "loss": 0.0756, + "step": 8051 + }, + { + "epoch": 2.609202851587816, + "grad_norm": 0.9325541853904724, + "learning_rate": 2.195881828836563e-07, + "loss": 0.0813, + "step": 8052 + }, + { + "epoch": 2.6095268956578095, + "grad_norm": 0.9286937117576599, + "learning_rate": 2.1922988722355044e-07, + "loss": 0.0797, + "step": 8053 + }, + { + "epoch": 2.609850939727803, + "grad_norm": 0.8863601684570312, + "learning_rate": 2.188718707076265e-07, + "loss": 0.0763, + "step": 8054 + }, + { + "epoch": 2.6101749837977968, + "grad_norm": 0.9030467867851257, + "learning_rate": 2.185141333797025e-07, + "loss": 0.0777, + "step": 8055 + }, + { + "epoch": 2.6104990278677898, + "grad_norm": 0.8651726841926575, + "learning_rate": 2.181566752835626e-07, + "loss": 0.0719, + "step": 8056 + }, + { + "epoch": 2.6108230719377836, + "grad_norm": 0.9293838739395142, + "learning_rate": 2.177994964629554e-07, + "loss": 0.0827, + "step": 8057 + }, + { + "epoch": 2.611147116007777, + "grad_norm": 0.8936684727668762, + "learning_rate": 2.174425969615962e-07, + "loss": 0.0734, + "step": 8058 + }, + { + "epoch": 2.6114711600777705, + "grad_norm": 0.8786556720733643, + "learning_rate": 2.1708597682316645e-07, + "loss": 0.0698, + "step": 8059 + }, + { + "epoch": 2.6117952041477643, + "grad_norm": 0.9012254476547241, + "learning_rate": 2.1672963609131292e-07, + "loss": 0.0762, + "step": 8060 + }, + { + "epoch": 2.6121192482177578, + "grad_norm": 0.8727022409439087, + "learning_rate": 2.1637357480964821e-07, + "loss": 0.0766, + "step": 8061 + }, + { + "epoch": 2.612443292287751, + "grad_norm": 0.8324522376060486, + "learning_rate": 2.1601779302175026e-07, + "loss": 0.0743, + "step": 8062 + }, + { + "epoch": 2.6127673363577446, + "grad_norm": 0.9140172004699707, + "learning_rate": 2.1566229077116445e-07, + "loss": 0.079, + "step": 8063 + }, + { + "epoch": 2.613091380427738, + "grad_norm": 0.9623644948005676, + "learning_rate": 2.1530706810139913e-07, + "loss": 0.0846, + "step": 8064 + }, + { + "epoch": 2.613415424497732, + "grad_norm": 0.9511628746986389, + "learning_rate": 2.1495212505593221e-07, + "loss": 0.0781, + "step": 8065 + }, + { + "epoch": 2.6137394685677253, + "grad_norm": 0.900143563747406, + "learning_rate": 2.1459746167820372e-07, + "loss": 0.0783, + "step": 8066 + }, + { + "epoch": 2.6140635126377187, + "grad_norm": 0.9561235904693604, + "learning_rate": 2.142430780116214e-07, + "loss": 0.079, + "step": 8067 + }, + { + "epoch": 2.614387556707712, + "grad_norm": 0.8517616391181946, + "learning_rate": 2.1388897409955867e-07, + "loss": 0.072, + "step": 8068 + }, + { + "epoch": 2.6147116007777056, + "grad_norm": 0.9002208113670349, + "learning_rate": 2.1353514998535414e-07, + "loss": 0.0777, + "step": 8069 + }, + { + "epoch": 2.6150356448476995, + "grad_norm": 0.918381929397583, + "learning_rate": 2.1318160571231316e-07, + "loss": 0.0763, + "step": 8070 + }, + { + "epoch": 2.615359688917693, + "grad_norm": 0.864824652671814, + "learning_rate": 2.128283413237045e-07, + "loss": 0.0759, + "step": 8071 + }, + { + "epoch": 2.6156837329876863, + "grad_norm": 0.9191448092460632, + "learning_rate": 2.1247535686276632e-07, + "loss": 0.0793, + "step": 8072 + }, + { + "epoch": 2.6160077770576797, + "grad_norm": 0.8766631484031677, + "learning_rate": 2.121226523726988e-07, + "loss": 0.0725, + "step": 8073 + }, + { + "epoch": 2.616331821127673, + "grad_norm": 0.8818141222000122, + "learning_rate": 2.1177022789667045e-07, + "loss": 0.0762, + "step": 8074 + }, + { + "epoch": 2.616655865197667, + "grad_norm": 0.8716321587562561, + "learning_rate": 2.1141808347781428e-07, + "loss": 0.0733, + "step": 8075 + }, + { + "epoch": 2.6169799092676604, + "grad_norm": 0.9057003259658813, + "learning_rate": 2.110662191592297e-07, + "loss": 0.0791, + "step": 8076 + }, + { + "epoch": 2.617303953337654, + "grad_norm": 0.8317652344703674, + "learning_rate": 2.1071463498398114e-07, + "loss": 0.0728, + "step": 8077 + }, + { + "epoch": 2.6176279974076473, + "grad_norm": 0.9148250222206116, + "learning_rate": 2.103633309950995e-07, + "loss": 0.0758, + "step": 8078 + }, + { + "epoch": 2.6179520414776407, + "grad_norm": 0.8296970129013062, + "learning_rate": 2.1001230723558087e-07, + "loss": 0.0729, + "step": 8079 + }, + { + "epoch": 2.6182760855476346, + "grad_norm": 0.9037541747093201, + "learning_rate": 2.0966156374838677e-07, + "loss": 0.0737, + "step": 8080 + }, + { + "epoch": 2.618600129617628, + "grad_norm": 1.0087367296218872, + "learning_rate": 2.0931110057644505e-07, + "loss": 0.0821, + "step": 8081 + }, + { + "epoch": 2.6189241736876214, + "grad_norm": 0.9129602909088135, + "learning_rate": 2.089609177626492e-07, + "loss": 0.0733, + "step": 8082 + }, + { + "epoch": 2.6192482177576153, + "grad_norm": 0.9093072414398193, + "learning_rate": 2.0861101534985774e-07, + "loss": 0.084, + "step": 8083 + }, + { + "epoch": 2.6195722618276087, + "grad_norm": 0.8904097080230713, + "learning_rate": 2.082613933808958e-07, + "loss": 0.077, + "step": 8084 + }, + { + "epoch": 2.619896305897602, + "grad_norm": 0.8340948224067688, + "learning_rate": 2.079120518985539e-07, + "loss": 0.07, + "step": 8085 + }, + { + "epoch": 2.6202203499675956, + "grad_norm": 0.8659706711769104, + "learning_rate": 2.07562990945587e-07, + "loss": 0.0713, + "step": 8086 + }, + { + "epoch": 2.620544394037589, + "grad_norm": 0.8465261459350586, + "learning_rate": 2.0721421056471818e-07, + "loss": 0.0757, + "step": 8087 + }, + { + "epoch": 2.620868438107583, + "grad_norm": 0.9324637055397034, + "learning_rate": 2.0686571079863383e-07, + "loss": 0.0799, + "step": 8088 + }, + { + "epoch": 2.6211924821775763, + "grad_norm": 0.9742748141288757, + "learning_rate": 2.0651749168998703e-07, + "loss": 0.0832, + "step": 8089 + }, + { + "epoch": 2.6215165262475697, + "grad_norm": 0.8751499652862549, + "learning_rate": 2.0616955328139675e-07, + "loss": 0.0719, + "step": 8090 + }, + { + "epoch": 2.621840570317563, + "grad_norm": 0.9169896245002747, + "learning_rate": 2.058218956154473e-07, + "loss": 0.0795, + "step": 8091 + }, + { + "epoch": 2.6221646143875565, + "grad_norm": 0.8359293937683105, + "learning_rate": 2.0547451873468877e-07, + "loss": 0.0737, + "step": 8092 + }, + { + "epoch": 2.6224886584575504, + "grad_norm": 0.8692148327827454, + "learning_rate": 2.051274226816355e-07, + "loss": 0.0772, + "step": 8093 + }, + { + "epoch": 2.622812702527544, + "grad_norm": 0.8982337713241577, + "learning_rate": 2.0478060749877044e-07, + "loss": 0.0739, + "step": 8094 + }, + { + "epoch": 2.6231367465975373, + "grad_norm": 0.8163446187973022, + "learning_rate": 2.0443407322853882e-07, + "loss": 0.0681, + "step": 8095 + }, + { + "epoch": 2.6234607906675307, + "grad_norm": 0.8136507272720337, + "learning_rate": 2.0408781991335446e-07, + "loss": 0.0718, + "step": 8096 + }, + { + "epoch": 2.623784834737524, + "grad_norm": 0.9225520491600037, + "learning_rate": 2.0374184759559463e-07, + "loss": 0.0756, + "step": 8097 + }, + { + "epoch": 2.624108878807518, + "grad_norm": 0.8884847164154053, + "learning_rate": 2.033961563176029e-07, + "loss": 0.0728, + "step": 8098 + }, + { + "epoch": 2.6244329228775114, + "grad_norm": 0.8490235805511475, + "learning_rate": 2.0305074612168906e-07, + "loss": 0.0729, + "step": 8099 + }, + { + "epoch": 2.624756966947505, + "grad_norm": 0.9549583196640015, + "learning_rate": 2.0270561705012765e-07, + "loss": 0.0855, + "step": 8100 + }, + { + "epoch": 2.6250810110174982, + "grad_norm": 0.8930221796035767, + "learning_rate": 2.0236076914515956e-07, + "loss": 0.0778, + "step": 8101 + }, + { + "epoch": 2.6254050550874917, + "grad_norm": 0.8836579918861389, + "learning_rate": 2.020162024489894e-07, + "loss": 0.0766, + "step": 8102 + }, + { + "epoch": 2.6257290991574855, + "grad_norm": 0.888401448726654, + "learning_rate": 2.0167191700379092e-07, + "loss": 0.075, + "step": 8103 + }, + { + "epoch": 2.626053143227479, + "grad_norm": 0.8471822142601013, + "learning_rate": 2.0132791285169985e-07, + "loss": 0.07, + "step": 8104 + }, + { + "epoch": 2.6263771872974724, + "grad_norm": 0.8982986807823181, + "learning_rate": 2.0098419003481946e-07, + "loss": 0.077, + "step": 8105 + }, + { + "epoch": 2.6267012313674662, + "grad_norm": 0.9074251651763916, + "learning_rate": 2.0064074859521777e-07, + "loss": 0.0791, + "step": 8106 + }, + { + "epoch": 2.6270252754374592, + "grad_norm": 0.8768463730812073, + "learning_rate": 2.0029758857492893e-07, + "loss": 0.0765, + "step": 8107 + }, + { + "epoch": 2.627349319507453, + "grad_norm": 0.8786301016807556, + "learning_rate": 1.9995471001595267e-07, + "loss": 0.0731, + "step": 8108 + }, + { + "epoch": 2.6276733635774465, + "grad_norm": 0.9357177019119263, + "learning_rate": 1.9961211296025352e-07, + "loss": 0.0816, + "step": 8109 + }, + { + "epoch": 2.62799740764744, + "grad_norm": 0.900934100151062, + "learning_rate": 1.992697974497629e-07, + "loss": 0.0779, + "step": 8110 + }, + { + "epoch": 2.628321451717434, + "grad_norm": 0.8593557476997375, + "learning_rate": 1.989277635263756e-07, + "loss": 0.0727, + "step": 8111 + }, + { + "epoch": 2.6286454957874272, + "grad_norm": 0.8760018348693848, + "learning_rate": 1.9858601123195403e-07, + "loss": 0.0765, + "step": 8112 + }, + { + "epoch": 2.6289695398574207, + "grad_norm": 0.8476484417915344, + "learning_rate": 1.9824454060832526e-07, + "loss": 0.0741, + "step": 8113 + }, + { + "epoch": 2.629293583927414, + "grad_norm": 0.9107114672660828, + "learning_rate": 1.9790335169728197e-07, + "loss": 0.077, + "step": 8114 + }, + { + "epoch": 2.6296176279974075, + "grad_norm": 0.8905239701271057, + "learning_rate": 1.9756244454058244e-07, + "loss": 0.0753, + "step": 8115 + }, + { + "epoch": 2.6299416720674014, + "grad_norm": 0.9123937487602234, + "learning_rate": 1.9722181917995103e-07, + "loss": 0.0777, + "step": 8116 + }, + { + "epoch": 2.630265716137395, + "grad_norm": 0.8795695304870605, + "learning_rate": 1.9688147565707528e-07, + "loss": 0.0694, + "step": 8117 + }, + { + "epoch": 2.630589760207388, + "grad_norm": 0.886893093585968, + "learning_rate": 1.9654141401361183e-07, + "loss": 0.0743, + "step": 8118 + }, + { + "epoch": 2.6309138042773816, + "grad_norm": 0.8590728640556335, + "learning_rate": 1.9620163429117906e-07, + "loss": 0.0748, + "step": 8119 + }, + { + "epoch": 2.631237848347375, + "grad_norm": 0.8743232488632202, + "learning_rate": 1.958621365313648e-07, + "loss": 0.0776, + "step": 8120 + }, + { + "epoch": 2.631561892417369, + "grad_norm": 0.92427659034729, + "learning_rate": 1.9552292077571894e-07, + "loss": 0.0805, + "step": 8121 + }, + { + "epoch": 2.6318859364873624, + "grad_norm": 0.8138688802719116, + "learning_rate": 1.9518398706575846e-07, + "loss": 0.0691, + "step": 8122 + }, + { + "epoch": 2.6322099805573558, + "grad_norm": 0.8546965718269348, + "learning_rate": 1.948453354429661e-07, + "loss": 0.0748, + "step": 8123 + }, + { + "epoch": 2.632534024627349, + "grad_norm": 0.9168205857276917, + "learning_rate": 1.9450696594878804e-07, + "loss": 0.0775, + "step": 8124 + }, + { + "epoch": 2.6328580686973426, + "grad_norm": 0.8308161497116089, + "learning_rate": 1.941688786246393e-07, + "loss": 0.0719, + "step": 8125 + }, + { + "epoch": 2.6331821127673365, + "grad_norm": 0.9307392239570618, + "learning_rate": 1.9383107351189672e-07, + "loss": 0.0791, + "step": 8126 + }, + { + "epoch": 2.63350615683733, + "grad_norm": 0.8862630128860474, + "learning_rate": 1.9349355065190618e-07, + "loss": 0.0738, + "step": 8127 + }, + { + "epoch": 2.6338302009073233, + "grad_norm": 0.8637521266937256, + "learning_rate": 1.9315631008597596e-07, + "loss": 0.0716, + "step": 8128 + }, + { + "epoch": 2.6341542449773168, + "grad_norm": 0.8959033489227295, + "learning_rate": 1.9281935185538141e-07, + "loss": 0.0738, + "step": 8129 + }, + { + "epoch": 2.63447828904731, + "grad_norm": 0.9268590211868286, + "learning_rate": 1.9248267600136317e-07, + "loss": 0.083, + "step": 8130 + }, + { + "epoch": 2.634802333117304, + "grad_norm": 0.9983721971511841, + "learning_rate": 1.9214628256512656e-07, + "loss": 0.0865, + "step": 8131 + }, + { + "epoch": 2.6351263771872975, + "grad_norm": 0.8777361512184143, + "learning_rate": 1.918101715878437e-07, + "loss": 0.0715, + "step": 8132 + }, + { + "epoch": 2.635450421257291, + "grad_norm": 0.8931231498718262, + "learning_rate": 1.9147434311065028e-07, + "loss": 0.0742, + "step": 8133 + }, + { + "epoch": 2.6357744653272848, + "grad_norm": 0.8385959267616272, + "learning_rate": 1.911387971746495e-07, + "loss": 0.0697, + "step": 8134 + }, + { + "epoch": 2.636098509397278, + "grad_norm": 0.9038499593734741, + "learning_rate": 1.9080353382090798e-07, + "loss": 0.077, + "step": 8135 + }, + { + "epoch": 2.6364225534672716, + "grad_norm": 0.8563414216041565, + "learning_rate": 1.9046855309045957e-07, + "loss": 0.0759, + "step": 8136 + }, + { + "epoch": 2.636746597537265, + "grad_norm": 0.9571034908294678, + "learning_rate": 1.9013385502430175e-07, + "loss": 0.0775, + "step": 8137 + }, + { + "epoch": 2.6370706416072585, + "grad_norm": 0.9299083352088928, + "learning_rate": 1.8979943966339924e-07, + "loss": 0.0743, + "step": 8138 + }, + { + "epoch": 2.6373946856772523, + "grad_norm": 0.8955670595169067, + "learning_rate": 1.8946530704868072e-07, + "loss": 0.0721, + "step": 8139 + }, + { + "epoch": 2.6377187297472457, + "grad_norm": 0.9154726266860962, + "learning_rate": 1.891314572210412e-07, + "loss": 0.0757, + "step": 8140 + }, + { + "epoch": 2.638042773817239, + "grad_norm": 0.9504216313362122, + "learning_rate": 1.887978902213397e-07, + "loss": 0.0827, + "step": 8141 + }, + { + "epoch": 2.6383668178872326, + "grad_norm": 0.8857448697090149, + "learning_rate": 1.8846460609040302e-07, + "loss": 0.0727, + "step": 8142 + }, + { + "epoch": 2.638690861957226, + "grad_norm": 0.9364683628082275, + "learning_rate": 1.881316048690207e-07, + "loss": 0.075, + "step": 8143 + }, + { + "epoch": 2.63901490602722, + "grad_norm": 0.8507384657859802, + "learning_rate": 1.8779888659794937e-07, + "loss": 0.0733, + "step": 8144 + }, + { + "epoch": 2.6393389500972133, + "grad_norm": 0.889306366443634, + "learning_rate": 1.874664513179106e-07, + "loss": 0.0781, + "step": 8145 + }, + { + "epoch": 2.6396629941672067, + "grad_norm": 0.9445775151252747, + "learning_rate": 1.8713429906959097e-07, + "loss": 0.0788, + "step": 8146 + }, + { + "epoch": 2.6399870382372, + "grad_norm": 0.8926195502281189, + "learning_rate": 1.8680242989364327e-07, + "loss": 0.0717, + "step": 8147 + }, + { + "epoch": 2.6403110823071936, + "grad_norm": 0.93767249584198, + "learning_rate": 1.8647084383068393e-07, + "loss": 0.0792, + "step": 8148 + }, + { + "epoch": 2.6406351263771874, + "grad_norm": 0.8567405343055725, + "learning_rate": 1.8613954092129738e-07, + "loss": 0.0762, + "step": 8149 + }, + { + "epoch": 2.640959170447181, + "grad_norm": 0.8923830986022949, + "learning_rate": 1.858085212060304e-07, + "loss": 0.074, + "step": 8150 + }, + { + "epoch": 2.6412832145171743, + "grad_norm": 0.8800657391548157, + "learning_rate": 1.85477784725398e-07, + "loss": 0.0727, + "step": 8151 + }, + { + "epoch": 2.6416072585871677, + "grad_norm": 0.9234258532524109, + "learning_rate": 1.851473315198782e-07, + "loss": 0.0783, + "step": 8152 + }, + { + "epoch": 2.641931302657161, + "grad_norm": 0.9874683618545532, + "learning_rate": 1.848171616299152e-07, + "loss": 0.077, + "step": 8153 + }, + { + "epoch": 2.642255346727155, + "grad_norm": 0.8514154553413391, + "learning_rate": 1.8448727509591951e-07, + "loss": 0.0728, + "step": 8154 + }, + { + "epoch": 2.6425793907971484, + "grad_norm": 0.8893564939498901, + "learning_rate": 1.8415767195826468e-07, + "loss": 0.0739, + "step": 8155 + }, + { + "epoch": 2.642903434867142, + "grad_norm": 0.8647539019584656, + "learning_rate": 1.8382835225729256e-07, + "loss": 0.0739, + "step": 8156 + }, + { + "epoch": 2.6432274789371357, + "grad_norm": 0.8942198157310486, + "learning_rate": 1.834993160333068e-07, + "loss": 0.0779, + "step": 8157 + }, + { + "epoch": 2.6435515230071287, + "grad_norm": 0.9216142296791077, + "learning_rate": 1.831705633265804e-07, + "loss": 0.0782, + "step": 8158 + }, + { + "epoch": 2.6438755670771226, + "grad_norm": 0.9345462322235107, + "learning_rate": 1.8284209417734762e-07, + "loss": 0.0765, + "step": 8159 + }, + { + "epoch": 2.644199611147116, + "grad_norm": 0.8918818831443787, + "learning_rate": 1.8251390862581097e-07, + "loss": 0.0756, + "step": 8160 + }, + { + "epoch": 2.6445236552171094, + "grad_norm": 0.8900420069694519, + "learning_rate": 1.8218600671213698e-07, + "loss": 0.0719, + "step": 8161 + }, + { + "epoch": 2.6448476992871033, + "grad_norm": 1.0553818941116333, + "learning_rate": 1.8185838847645743e-07, + "loss": 0.0802, + "step": 8162 + }, + { + "epoch": 2.6451717433570967, + "grad_norm": 0.8427667617797852, + "learning_rate": 1.8153105395886967e-07, + "loss": 0.0705, + "step": 8163 + }, + { + "epoch": 2.64549578742709, + "grad_norm": 0.8833165764808655, + "learning_rate": 1.8120400319943692e-07, + "loss": 0.0756, + "step": 8164 + }, + { + "epoch": 2.6458198314970836, + "grad_norm": 0.8940858244895935, + "learning_rate": 1.8087723623818608e-07, + "loss": 0.0751, + "step": 8165 + }, + { + "epoch": 2.646143875567077, + "grad_norm": 0.9112303853034973, + "learning_rate": 1.805507531151107e-07, + "loss": 0.0717, + "step": 8166 + }, + { + "epoch": 2.646467919637071, + "grad_norm": 0.8989704251289368, + "learning_rate": 1.8022455387016913e-07, + "loss": 0.0718, + "step": 8167 + }, + { + "epoch": 2.6467919637070643, + "grad_norm": 0.8399885296821594, + "learning_rate": 1.7989863854328492e-07, + "loss": 0.0698, + "step": 8168 + }, + { + "epoch": 2.6471160077770577, + "grad_norm": 0.9487384557723999, + "learning_rate": 1.7957300717434706e-07, + "loss": 0.0804, + "step": 8169 + }, + { + "epoch": 2.647440051847051, + "grad_norm": 0.9109880924224854, + "learning_rate": 1.7924765980320974e-07, + "loss": 0.075, + "step": 8170 + }, + { + "epoch": 2.6477640959170445, + "grad_norm": 0.901547372341156, + "learning_rate": 1.7892259646969278e-07, + "loss": 0.0751, + "step": 8171 + }, + { + "epoch": 2.6480881399870384, + "grad_norm": 0.9755034446716309, + "learning_rate": 1.785978172135791e-07, + "loss": 0.0749, + "step": 8172 + }, + { + "epoch": 2.648412184057032, + "grad_norm": 0.9883907437324524, + "learning_rate": 1.782733220746205e-07, + "loss": 0.0868, + "step": 8173 + }, + { + "epoch": 2.6487362281270252, + "grad_norm": 0.8916382193565369, + "learning_rate": 1.7794911109253105e-07, + "loss": 0.076, + "step": 8174 + }, + { + "epoch": 2.6490602721970187, + "grad_norm": 0.8999601602554321, + "learning_rate": 1.7762518430699122e-07, + "loss": 0.0766, + "step": 8175 + }, + { + "epoch": 2.649384316267012, + "grad_norm": 0.8907502889633179, + "learning_rate": 1.7730154175764623e-07, + "loss": 0.0748, + "step": 8176 + }, + { + "epoch": 2.649708360337006, + "grad_norm": 0.9117937684059143, + "learning_rate": 1.7697818348410722e-07, + "loss": 0.0861, + "step": 8177 + }, + { + "epoch": 2.6500324044069994, + "grad_norm": 0.9239206314086914, + "learning_rate": 1.7665510952595027e-07, + "loss": 0.0715, + "step": 8178 + }, + { + "epoch": 2.650356448476993, + "grad_norm": 0.9190769195556641, + "learning_rate": 1.7633231992271572e-07, + "loss": 0.0763, + "step": 8179 + }, + { + "epoch": 2.6506804925469862, + "grad_norm": 0.8508328795433044, + "learning_rate": 1.7600981471391083e-07, + "loss": 0.0718, + "step": 8180 + }, + { + "epoch": 2.6510045366169797, + "grad_norm": 0.9987004995346069, + "learning_rate": 1.7568759393900597e-07, + "loss": 0.0826, + "step": 8181 + }, + { + "epoch": 2.6513285806869735, + "grad_norm": 0.8714659214019775, + "learning_rate": 1.7536565763743934e-07, + "loss": 0.0751, + "step": 8182 + }, + { + "epoch": 2.651652624756967, + "grad_norm": 0.9042683243751526, + "learning_rate": 1.7504400584861137e-07, + "loss": 0.0777, + "step": 8183 + }, + { + "epoch": 2.6519766688269604, + "grad_norm": 0.8759123086929321, + "learning_rate": 1.7472263861189e-07, + "loss": 0.0709, + "step": 8184 + }, + { + "epoch": 2.6523007128969542, + "grad_norm": 0.9868377447128296, + "learning_rate": 1.7440155596660735e-07, + "loss": 0.0781, + "step": 8185 + }, + { + "epoch": 2.6526247569669477, + "grad_norm": 1.0885810852050781, + "learning_rate": 1.7408075795206037e-07, + "loss": 0.0899, + "step": 8186 + }, + { + "epoch": 2.652948801036941, + "grad_norm": 0.8971470594406128, + "learning_rate": 1.7376024460751262e-07, + "loss": 0.0759, + "step": 8187 + }, + { + "epoch": 2.6532728451069345, + "grad_norm": 0.9231646060943604, + "learning_rate": 1.7344001597219024e-07, + "loss": 0.0789, + "step": 8188 + }, + { + "epoch": 2.653596889176928, + "grad_norm": 0.8947360515594482, + "learning_rate": 1.7312007208528796e-07, + "loss": 0.0782, + "step": 8189 + }, + { + "epoch": 2.653920933246922, + "grad_norm": 0.8938325643539429, + "learning_rate": 1.7280041298596257e-07, + "loss": 0.0746, + "step": 8190 + }, + { + "epoch": 2.654244977316915, + "grad_norm": 0.8647114038467407, + "learning_rate": 1.7248103871333743e-07, + "loss": 0.0722, + "step": 8191 + }, + { + "epoch": 2.6545690213869086, + "grad_norm": 0.9191124439239502, + "learning_rate": 1.7216194930650105e-07, + "loss": 0.0764, + "step": 8192 + }, + { + "epoch": 2.654893065456902, + "grad_norm": 0.9956697225570679, + "learning_rate": 1.7184314480450713e-07, + "loss": 0.0865, + "step": 8193 + }, + { + "epoch": 2.6552171095268955, + "grad_norm": 0.8507396578788757, + "learning_rate": 1.715246252463737e-07, + "loss": 0.0738, + "step": 8194 + }, + { + "epoch": 2.6555411535968894, + "grad_norm": 0.9137740135192871, + "learning_rate": 1.7120639067108508e-07, + "loss": 0.0797, + "step": 8195 + }, + { + "epoch": 2.655865197666883, + "grad_norm": 0.8848572969436646, + "learning_rate": 1.7088844111758956e-07, + "loss": 0.0751, + "step": 8196 + }, + { + "epoch": 2.656189241736876, + "grad_norm": 0.9681692719459534, + "learning_rate": 1.7057077662480131e-07, + "loss": 0.0783, + "step": 8197 + }, + { + "epoch": 2.6565132858068696, + "grad_norm": 0.8566903471946716, + "learning_rate": 1.7025339723159924e-07, + "loss": 0.0746, + "step": 8198 + }, + { + "epoch": 2.656837329876863, + "grad_norm": 0.8328977227210999, + "learning_rate": 1.6993630297682778e-07, + "loss": 0.0696, + "step": 8199 + }, + { + "epoch": 2.657161373946857, + "grad_norm": 0.8919262290000916, + "learning_rate": 1.6961949389929593e-07, + "loss": 0.0793, + "step": 8200 + }, + { + "epoch": 2.6574854180168503, + "grad_norm": 0.898391604423523, + "learning_rate": 1.693029700377785e-07, + "loss": 0.0735, + "step": 8201 + }, + { + "epoch": 2.6578094620868438, + "grad_norm": 0.8991305232048035, + "learning_rate": 1.6898673143101479e-07, + "loss": 0.0758, + "step": 8202 + }, + { + "epoch": 2.658133506156837, + "grad_norm": 0.8869103789329529, + "learning_rate": 1.6867077811770826e-07, + "loss": 0.0771, + "step": 8203 + }, + { + "epoch": 2.6584575502268306, + "grad_norm": 0.8388248682022095, + "learning_rate": 1.683551101365305e-07, + "loss": 0.0676, + "step": 8204 + }, + { + "epoch": 2.6587815942968245, + "grad_norm": 0.8820380568504333, + "learning_rate": 1.6803972752611475e-07, + "loss": 0.0733, + "step": 8205 + }, + { + "epoch": 2.659105638366818, + "grad_norm": 0.9473530650138855, + "learning_rate": 1.6772463032506126e-07, + "loss": 0.0744, + "step": 8206 + }, + { + "epoch": 2.6594296824368113, + "grad_norm": 0.829882025718689, + "learning_rate": 1.6740981857193471e-07, + "loss": 0.0712, + "step": 8207 + }, + { + "epoch": 2.659753726506805, + "grad_norm": 0.9000360369682312, + "learning_rate": 1.6709529230526544e-07, + "loss": 0.0782, + "step": 8208 + }, + { + "epoch": 2.660077770576798, + "grad_norm": 0.9191071391105652, + "learning_rate": 1.667810515635482e-07, + "loss": 0.0772, + "step": 8209 + }, + { + "epoch": 2.660401814646792, + "grad_norm": 0.9518386125564575, + "learning_rate": 1.6646709638524216e-07, + "loss": 0.0757, + "step": 8210 + }, + { + "epoch": 2.6607258587167855, + "grad_norm": 0.9048896431922913, + "learning_rate": 1.6615342680877417e-07, + "loss": 0.0738, + "step": 8211 + }, + { + "epoch": 2.661049902786779, + "grad_norm": 1.0090075731277466, + "learning_rate": 1.6584004287253235e-07, + "loss": 0.0852, + "step": 8212 + }, + { + "epoch": 2.6613739468567728, + "grad_norm": 0.8919034004211426, + "learning_rate": 1.6552694461487385e-07, + "loss": 0.0799, + "step": 8213 + }, + { + "epoch": 2.661697990926766, + "grad_norm": 0.9021881818771362, + "learning_rate": 1.652141320741174e-07, + "loss": 0.0765, + "step": 8214 + }, + { + "epoch": 2.6620220349967596, + "grad_norm": 0.9298197627067566, + "learning_rate": 1.6490160528854855e-07, + "loss": 0.0791, + "step": 8215 + }, + { + "epoch": 2.662346079066753, + "grad_norm": 0.9251485466957092, + "learning_rate": 1.6458936429641803e-07, + "loss": 0.0746, + "step": 8216 + }, + { + "epoch": 2.6626701231367464, + "grad_norm": 0.9725478887557983, + "learning_rate": 1.642774091359406e-07, + "loss": 0.0808, + "step": 8217 + }, + { + "epoch": 2.6629941672067403, + "grad_norm": 0.8733885884284973, + "learning_rate": 1.6396573984529707e-07, + "loss": 0.0739, + "step": 8218 + }, + { + "epoch": 2.6633182112767337, + "grad_norm": 0.9233888387680054, + "learning_rate": 1.6365435646263223e-07, + "loss": 0.0786, + "step": 8219 + }, + { + "epoch": 2.663642255346727, + "grad_norm": 0.8216432332992554, + "learning_rate": 1.6334325902605642e-07, + "loss": 0.0716, + "step": 8220 + }, + { + "epoch": 2.6639662994167206, + "grad_norm": 0.9246054887771606, + "learning_rate": 1.63032447573645e-07, + "loss": 0.0745, + "step": 8221 + }, + { + "epoch": 2.664290343486714, + "grad_norm": 0.9085432291030884, + "learning_rate": 1.6272192214343868e-07, + "loss": 0.0795, + "step": 8222 + }, + { + "epoch": 2.664614387556708, + "grad_norm": 0.9263147711753845, + "learning_rate": 1.6241168277344232e-07, + "loss": 0.0809, + "step": 8223 + }, + { + "epoch": 2.6649384316267013, + "grad_norm": 0.8804634213447571, + "learning_rate": 1.6210172950162639e-07, + "loss": 0.076, + "step": 8224 + }, + { + "epoch": 2.6652624756966947, + "grad_norm": 0.845128059387207, + "learning_rate": 1.617920623659261e-07, + "loss": 0.0737, + "step": 8225 + }, + { + "epoch": 2.665586519766688, + "grad_norm": 0.935748815536499, + "learning_rate": 1.6148268140424224e-07, + "loss": 0.0789, + "step": 8226 + }, + { + "epoch": 2.6659105638366816, + "grad_norm": 0.9177601933479309, + "learning_rate": 1.6117358665443922e-07, + "loss": 0.0818, + "step": 8227 + }, + { + "epoch": 2.6662346079066754, + "grad_norm": 1.0410524606704712, + "learning_rate": 1.6086477815434763e-07, + "loss": 0.0862, + "step": 8228 + }, + { + "epoch": 2.666558651976669, + "grad_norm": 0.927940845489502, + "learning_rate": 1.6055625594176254e-07, + "loss": 0.0824, + "step": 8229 + }, + { + "epoch": 2.6668826960466623, + "grad_norm": 0.9064210653305054, + "learning_rate": 1.602480200544443e-07, + "loss": 0.0824, + "step": 8230 + }, + { + "epoch": 2.6672067401166557, + "grad_norm": 0.9253581166267395, + "learning_rate": 1.5994007053011796e-07, + "loss": 0.08, + "step": 8231 + }, + { + "epoch": 2.667530784186649, + "grad_norm": 0.9229341149330139, + "learning_rate": 1.5963240740647285e-07, + "loss": 0.0813, + "step": 8232 + }, + { + "epoch": 2.667854828256643, + "grad_norm": 0.9699390530586243, + "learning_rate": 1.5932503072116524e-07, + "loss": 0.0793, + "step": 8233 + }, + { + "epoch": 2.6681788723266364, + "grad_norm": 0.8560695052146912, + "learning_rate": 1.5901794051181362e-07, + "loss": 0.0739, + "step": 8234 + }, + { + "epoch": 2.66850291639663, + "grad_norm": 0.924231767654419, + "learning_rate": 1.5871113681600464e-07, + "loss": 0.0779, + "step": 8235 + }, + { + "epoch": 2.6688269604666237, + "grad_norm": 0.9453070163726807, + "learning_rate": 1.5840461967128628e-07, + "loss": 0.0789, + "step": 8236 + }, + { + "epoch": 2.669151004536617, + "grad_norm": 0.9044185876846313, + "learning_rate": 1.5809838911517438e-07, + "loss": 0.0771, + "step": 8237 + }, + { + "epoch": 2.6694750486066106, + "grad_norm": 0.8933587670326233, + "learning_rate": 1.5779244518514813e-07, + "loss": 0.0749, + "step": 8238 + }, + { + "epoch": 2.669799092676604, + "grad_norm": 0.883310854434967, + "learning_rate": 1.574867879186523e-07, + "loss": 0.0769, + "step": 8239 + }, + { + "epoch": 2.6701231367465974, + "grad_norm": 0.8501051068305969, + "learning_rate": 1.5718141735309695e-07, + "loss": 0.0702, + "step": 8240 + }, + { + "epoch": 2.6704471808165913, + "grad_norm": 0.9186578392982483, + "learning_rate": 1.5687633352585467e-07, + "loss": 0.0821, + "step": 8241 + }, + { + "epoch": 2.6707712248865847, + "grad_norm": 0.8473071455955505, + "learning_rate": 1.5657153647426703e-07, + "loss": 0.0718, + "step": 8242 + }, + { + "epoch": 2.671095268956578, + "grad_norm": 0.9314771294593811, + "learning_rate": 1.5626702623563694e-07, + "loss": 0.0765, + "step": 8243 + }, + { + "epoch": 2.6714193130265715, + "grad_norm": 0.9406991600990295, + "learning_rate": 1.5596280284723348e-07, + "loss": 0.08, + "step": 8244 + }, + { + "epoch": 2.671743357096565, + "grad_norm": 0.8677089810371399, + "learning_rate": 1.5565886634629102e-07, + "loss": 0.0721, + "step": 8245 + }, + { + "epoch": 2.672067401166559, + "grad_norm": 0.8874663710594177, + "learning_rate": 1.5535521677000813e-07, + "loss": 0.0771, + "step": 8246 + }, + { + "epoch": 2.6723914452365523, + "grad_norm": 0.9458112120628357, + "learning_rate": 1.5505185415554903e-07, + "loss": 0.0728, + "step": 8247 + }, + { + "epoch": 2.6727154893065457, + "grad_norm": 0.8996087312698364, + "learning_rate": 1.54748778540042e-07, + "loss": 0.0798, + "step": 8248 + }, + { + "epoch": 2.673039533376539, + "grad_norm": 0.8373512625694275, + "learning_rate": 1.544459899605813e-07, + "loss": 0.0695, + "step": 8249 + }, + { + "epoch": 2.6733635774465325, + "grad_norm": 0.960312008857727, + "learning_rate": 1.5414348845422394e-07, + "loss": 0.0808, + "step": 8250 + }, + { + "epoch": 2.6736876215165264, + "grad_norm": 0.9382338523864746, + "learning_rate": 1.538412740579942e-07, + "loss": 0.0818, + "step": 8251 + }, + { + "epoch": 2.67401166558652, + "grad_norm": 0.9835246205329895, + "learning_rate": 1.5353934680888e-07, + "loss": 0.0764, + "step": 8252 + }, + { + "epoch": 2.6743357096565132, + "grad_norm": 0.8817523121833801, + "learning_rate": 1.5323770674383398e-07, + "loss": 0.0725, + "step": 8253 + }, + { + "epoch": 2.6746597537265067, + "grad_norm": 0.9373438358306885, + "learning_rate": 1.529363538997744e-07, + "loss": 0.0793, + "step": 8254 + }, + { + "epoch": 2.6749837977965, + "grad_norm": 0.9446477293968201, + "learning_rate": 1.526352883135837e-07, + "loss": 0.0751, + "step": 8255 + }, + { + "epoch": 2.675307841866494, + "grad_norm": 0.8968738317489624, + "learning_rate": 1.5233451002210964e-07, + "loss": 0.0786, + "step": 8256 + }, + { + "epoch": 2.6756318859364874, + "grad_norm": 0.8211001753807068, + "learning_rate": 1.520340190621647e-07, + "loss": 0.0704, + "step": 8257 + }, + { + "epoch": 2.675955930006481, + "grad_norm": 0.8999126553535461, + "learning_rate": 1.5173381547052528e-07, + "loss": 0.0739, + "step": 8258 + }, + { + "epoch": 2.6762799740764747, + "grad_norm": 0.8938018679618835, + "learning_rate": 1.5143389928393398e-07, + "loss": 0.0775, + "step": 8259 + }, + { + "epoch": 2.6766040181464676, + "grad_norm": 0.8835743069648743, + "learning_rate": 1.5113427053909725e-07, + "loss": 0.0747, + "step": 8260 + }, + { + "epoch": 2.6769280622164615, + "grad_norm": 0.8831462264060974, + "learning_rate": 1.508349292726874e-07, + "loss": 0.0682, + "step": 8261 + }, + { + "epoch": 2.677252106286455, + "grad_norm": 0.8573110103607178, + "learning_rate": 1.505358755213407e-07, + "loss": 0.0685, + "step": 8262 + }, + { + "epoch": 2.6775761503564484, + "grad_norm": 0.8490044474601746, + "learning_rate": 1.5023710932165758e-07, + "loss": 0.0758, + "step": 8263 + }, + { + "epoch": 2.6779001944264422, + "grad_norm": 0.9448397755622864, + "learning_rate": 1.4993863071020548e-07, + "loss": 0.0759, + "step": 8264 + }, + { + "epoch": 2.6782242384964356, + "grad_norm": 0.8347598314285278, + "learning_rate": 1.4964043972351377e-07, + "loss": 0.0688, + "step": 8265 + }, + { + "epoch": 2.678548282566429, + "grad_norm": 0.8894073367118835, + "learning_rate": 1.4934253639807994e-07, + "loss": 0.0748, + "step": 8266 + }, + { + "epoch": 2.6788723266364225, + "grad_norm": 0.8460086584091187, + "learning_rate": 1.4904492077036286e-07, + "loss": 0.0695, + "step": 8267 + }, + { + "epoch": 2.679196370706416, + "grad_norm": 0.8650538921356201, + "learning_rate": 1.4874759287678898e-07, + "loss": 0.0716, + "step": 8268 + }, + { + "epoch": 2.67952041477641, + "grad_norm": 1.0882456302642822, + "learning_rate": 1.484505527537475e-07, + "loss": 0.0779, + "step": 8269 + }, + { + "epoch": 2.679844458846403, + "grad_norm": 0.891619861125946, + "learning_rate": 1.4815380043759374e-07, + "loss": 0.0729, + "step": 8270 + }, + { + "epoch": 2.6801685029163966, + "grad_norm": 0.9411903619766235, + "learning_rate": 1.4785733596464736e-07, + "loss": 0.0789, + "step": 8271 + }, + { + "epoch": 2.68049254698639, + "grad_norm": 0.887125551700592, + "learning_rate": 1.4756115937119202e-07, + "loss": 0.0711, + "step": 8272 + }, + { + "epoch": 2.6808165910563835, + "grad_norm": 0.8783424496650696, + "learning_rate": 1.4726527069347796e-07, + "loss": 0.0773, + "step": 8273 + }, + { + "epoch": 2.6811406351263773, + "grad_norm": 0.8920451998710632, + "learning_rate": 1.4696966996771838e-07, + "loss": 0.0746, + "step": 8274 + }, + { + "epoch": 2.6814646791963708, + "grad_norm": 0.8676562905311584, + "learning_rate": 1.4667435723009187e-07, + "loss": 0.0767, + "step": 8275 + }, + { + "epoch": 2.681788723266364, + "grad_norm": 0.9486330151557922, + "learning_rate": 1.463793325167423e-07, + "loss": 0.0845, + "step": 8276 + }, + { + "epoch": 2.6821127673363576, + "grad_norm": 0.9392081499099731, + "learning_rate": 1.4608459586377743e-07, + "loss": 0.08, + "step": 8277 + }, + { + "epoch": 2.682436811406351, + "grad_norm": 0.880453884601593, + "learning_rate": 1.4579014730727037e-07, + "loss": 0.074, + "step": 8278 + }, + { + "epoch": 2.682760855476345, + "grad_norm": 0.9142937064170837, + "learning_rate": 1.4549598688325896e-07, + "loss": 0.0768, + "step": 8279 + }, + { + "epoch": 2.6830848995463383, + "grad_norm": 0.9541929364204407, + "learning_rate": 1.4520211462774548e-07, + "loss": 0.0793, + "step": 8280 + }, + { + "epoch": 2.6834089436163318, + "grad_norm": 0.8086893558502197, + "learning_rate": 1.4490853057669675e-07, + "loss": 0.07, + "step": 8281 + }, + { + "epoch": 2.683732987686325, + "grad_norm": 0.8273548483848572, + "learning_rate": 1.4461523476604482e-07, + "loss": 0.0775, + "step": 8282 + }, + { + "epoch": 2.6840570317563186, + "grad_norm": 0.9654455780982971, + "learning_rate": 1.4432222723168632e-07, + "loss": 0.0723, + "step": 8283 + }, + { + "epoch": 2.6843810758263125, + "grad_norm": 0.8216709494590759, + "learning_rate": 1.4402950800948223e-07, + "loss": 0.0723, + "step": 8284 + }, + { + "epoch": 2.684705119896306, + "grad_norm": 1.047164797782898, + "learning_rate": 1.437370771352589e-07, + "loss": 0.0794, + "step": 8285 + }, + { + "epoch": 2.6850291639662993, + "grad_norm": 0.8784307837486267, + "learning_rate": 1.4344493464480745e-07, + "loss": 0.0776, + "step": 8286 + }, + { + "epoch": 2.685353208036293, + "grad_norm": 0.9339602589607239, + "learning_rate": 1.4315308057388206e-07, + "loss": 0.0711, + "step": 8287 + }, + { + "epoch": 2.6856772521062866, + "grad_norm": 0.8273693323135376, + "learning_rate": 1.428615149582041e-07, + "loss": 0.0707, + "step": 8288 + }, + { + "epoch": 2.68600129617628, + "grad_norm": 0.8852351903915405, + "learning_rate": 1.425702378334573e-07, + "loss": 0.0792, + "step": 8289 + }, + { + "epoch": 2.6863253402462735, + "grad_norm": 0.970825731754303, + "learning_rate": 1.4227924923529228e-07, + "loss": 0.084, + "step": 8290 + }, + { + "epoch": 2.686649384316267, + "grad_norm": 0.9002171754837036, + "learning_rate": 1.4198854919932225e-07, + "loss": 0.0799, + "step": 8291 + }, + { + "epoch": 2.6869734283862607, + "grad_norm": 0.9297456741333008, + "learning_rate": 1.4169813776112652e-07, + "loss": 0.0801, + "step": 8292 + }, + { + "epoch": 2.687297472456254, + "grad_norm": 0.9804297685623169, + "learning_rate": 1.4140801495624913e-07, + "loss": 0.0758, + "step": 8293 + }, + { + "epoch": 2.6876215165262476, + "grad_norm": 0.8457171320915222, + "learning_rate": 1.4111818082019696e-07, + "loss": 0.0754, + "step": 8294 + }, + { + "epoch": 2.687945560596241, + "grad_norm": 0.9038613438606262, + "learning_rate": 1.4082863538844444e-07, + "loss": 0.0792, + "step": 8295 + }, + { + "epoch": 2.6882696046662344, + "grad_norm": 0.9129397869110107, + "learning_rate": 1.4053937869642737e-07, + "loss": 0.0764, + "step": 8296 + }, + { + "epoch": 2.6885936487362283, + "grad_norm": 0.9389241337776184, + "learning_rate": 1.402504107795502e-07, + "loss": 0.0786, + "step": 8297 + }, + { + "epoch": 2.6889176928062217, + "grad_norm": 0.8619992733001709, + "learning_rate": 1.39961731673178e-07, + "loss": 0.0741, + "step": 8298 + }, + { + "epoch": 2.689241736876215, + "grad_norm": 0.9079556465148926, + "learning_rate": 1.3967334141264277e-07, + "loss": 0.0786, + "step": 8299 + }, + { + "epoch": 2.6895657809462086, + "grad_norm": 0.8391004204750061, + "learning_rate": 1.39385240033241e-07, + "loss": 0.0763, + "step": 8300 + }, + { + "epoch": 2.689889825016202, + "grad_norm": 0.890021800994873, + "learning_rate": 1.3909742757023336e-07, + "loss": 0.0748, + "step": 8301 + }, + { + "epoch": 2.690213869086196, + "grad_norm": 0.9386758208274841, + "learning_rate": 1.3880990405884532e-07, + "loss": 0.0769, + "step": 8302 + }, + { + "epoch": 2.6905379131561893, + "grad_norm": 0.8605058789253235, + "learning_rate": 1.3852266953426674e-07, + "loss": 0.0778, + "step": 8303 + }, + { + "epoch": 2.6908619572261827, + "grad_norm": 0.8319392800331116, + "learning_rate": 1.3823572403165285e-07, + "loss": 0.0707, + "step": 8304 + }, + { + "epoch": 2.691186001296176, + "grad_norm": 0.9255814552307129, + "learning_rate": 1.3794906758612252e-07, + "loss": 0.0805, + "step": 8305 + }, + { + "epoch": 2.6915100453661696, + "grad_norm": 0.8639987111091614, + "learning_rate": 1.376627002327599e-07, + "loss": 0.0804, + "step": 8306 + }, + { + "epoch": 2.6918340894361634, + "grad_norm": 0.8433483839035034, + "learning_rate": 1.373766220066136e-07, + "loss": 0.0735, + "step": 8307 + }, + { + "epoch": 2.692158133506157, + "grad_norm": 0.9205812811851501, + "learning_rate": 1.3709083294269676e-07, + "loss": 0.0761, + "step": 8308 + }, + { + "epoch": 2.6924821775761503, + "grad_norm": 0.8764326572418213, + "learning_rate": 1.368053330759872e-07, + "loss": 0.0779, + "step": 8309 + }, + { + "epoch": 2.692806221646144, + "grad_norm": 0.8411714434623718, + "learning_rate": 1.3652012244142754e-07, + "loss": 0.0706, + "step": 8310 + }, + { + "epoch": 2.693130265716137, + "grad_norm": 0.8792476058006287, + "learning_rate": 1.362352010739243e-07, + "loss": 0.0788, + "step": 8311 + }, + { + "epoch": 2.693454309786131, + "grad_norm": 0.9278955459594727, + "learning_rate": 1.3595056900834986e-07, + "loss": 0.0807, + "step": 8312 + }, + { + "epoch": 2.6937783538561244, + "grad_norm": 0.9351165294647217, + "learning_rate": 1.3566622627953968e-07, + "loss": 0.0781, + "step": 8313 + }, + { + "epoch": 2.694102397926118, + "grad_norm": 0.9058972597122192, + "learning_rate": 1.3538217292229482e-07, + "loss": 0.0781, + "step": 8314 + }, + { + "epoch": 2.6944264419961117, + "grad_norm": 0.8710022568702698, + "learning_rate": 1.3509840897138083e-07, + "loss": 0.076, + "step": 8315 + }, + { + "epoch": 2.694750486066105, + "grad_norm": 0.9338528513908386, + "learning_rate": 1.3481493446152766e-07, + "loss": 0.0745, + "step": 8316 + }, + { + "epoch": 2.6950745301360985, + "grad_norm": 0.8894299864768982, + "learning_rate": 1.3453174942743008e-07, + "loss": 0.0784, + "step": 8317 + }, + { + "epoch": 2.695398574206092, + "grad_norm": 0.8650580048561096, + "learning_rate": 1.3424885390374593e-07, + "loss": 0.0744, + "step": 8318 + }, + { + "epoch": 2.6957226182760854, + "grad_norm": 1.0599915981292725, + "learning_rate": 1.3396624792510082e-07, + "loss": 0.0808, + "step": 8319 + }, + { + "epoch": 2.6960466623460793, + "grad_norm": 0.9517320990562439, + "learning_rate": 1.33683931526081e-07, + "loss": 0.0806, + "step": 8320 + }, + { + "epoch": 2.6963707064160727, + "grad_norm": 0.8994131684303284, + "learning_rate": 1.3340190474124104e-07, + "loss": 0.076, + "step": 8321 + }, + { + "epoch": 2.696694750486066, + "grad_norm": 0.8776226043701172, + "learning_rate": 1.3312016760509722e-07, + "loss": 0.0708, + "step": 8322 + }, + { + "epoch": 2.6970187945560595, + "grad_norm": 0.869220495223999, + "learning_rate": 1.3283872015213168e-07, + "loss": 0.0753, + "step": 8323 + }, + { + "epoch": 2.697342838626053, + "grad_norm": 0.9218916893005371, + "learning_rate": 1.3255756241679102e-07, + "loss": 0.0783, + "step": 8324 + }, + { + "epoch": 2.697666882696047, + "grad_norm": 0.9783483743667603, + "learning_rate": 1.3227669443348578e-07, + "loss": 0.082, + "step": 8325 + }, + { + "epoch": 2.6979909267660402, + "grad_norm": 0.9013862609863281, + "learning_rate": 1.3199611623659235e-07, + "loss": 0.08, + "step": 8326 + }, + { + "epoch": 2.6983149708360337, + "grad_norm": 0.9175854921340942, + "learning_rate": 1.3171582786044968e-07, + "loss": 0.0744, + "step": 8327 + }, + { + "epoch": 2.698639014906027, + "grad_norm": 0.8921010494232178, + "learning_rate": 1.3143582933936333e-07, + "loss": 0.0722, + "step": 8328 + }, + { + "epoch": 2.6989630589760205, + "grad_norm": 0.9263914227485657, + "learning_rate": 1.3115612070760174e-07, + "loss": 0.0738, + "step": 8329 + }, + { + "epoch": 2.6992871030460144, + "grad_norm": 0.8688660860061646, + "learning_rate": 1.3087670199939894e-07, + "loss": 0.0731, + "step": 8330 + }, + { + "epoch": 2.699611147116008, + "grad_norm": 0.9509294033050537, + "learning_rate": 1.3059757324895283e-07, + "loss": 0.0757, + "step": 8331 + }, + { + "epoch": 2.6999351911860012, + "grad_norm": 0.8982728123664856, + "learning_rate": 1.303187344904261e-07, + "loss": 0.0762, + "step": 8332 + }, + { + "epoch": 2.7002592352559946, + "grad_norm": 0.9074303507804871, + "learning_rate": 1.3004018575794586e-07, + "loss": 0.0788, + "step": 8333 + }, + { + "epoch": 2.700583279325988, + "grad_norm": 0.9729928374290466, + "learning_rate": 1.2976192708560432e-07, + "loss": 0.0804, + "step": 8334 + }, + { + "epoch": 2.700907323395982, + "grad_norm": 0.9070225358009338, + "learning_rate": 1.2948395850745726e-07, + "loss": 0.077, + "step": 8335 + }, + { + "epoch": 2.7012313674659754, + "grad_norm": 0.8616864085197449, + "learning_rate": 1.29206280057525e-07, + "loss": 0.0711, + "step": 8336 + }, + { + "epoch": 2.701555411535969, + "grad_norm": 0.8884169459342957, + "learning_rate": 1.2892889176979284e-07, + "loss": 0.075, + "step": 8337 + }, + { + "epoch": 2.7018794556059627, + "grad_norm": 0.9448602795600891, + "learning_rate": 1.2865179367821083e-07, + "loss": 0.0793, + "step": 8338 + }, + { + "epoch": 2.702203499675956, + "grad_norm": 0.8624501824378967, + "learning_rate": 1.283749858166927e-07, + "loss": 0.0771, + "step": 8339 + }, + { + "epoch": 2.7025275437459495, + "grad_norm": 0.9130458831787109, + "learning_rate": 1.280984682191172e-07, + "loss": 0.0735, + "step": 8340 + }, + { + "epoch": 2.702851587815943, + "grad_norm": 0.9405670762062073, + "learning_rate": 1.2782224091932775e-07, + "loss": 0.0797, + "step": 8341 + }, + { + "epoch": 2.7031756318859363, + "grad_norm": 0.8795937299728394, + "learning_rate": 1.2754630395113098e-07, + "loss": 0.0748, + "step": 8342 + }, + { + "epoch": 2.70349967595593, + "grad_norm": 0.9006626605987549, + "learning_rate": 1.2727065734830013e-07, + "loss": 0.0781, + "step": 8343 + }, + { + "epoch": 2.7038237200259236, + "grad_norm": 0.9724311232566833, + "learning_rate": 1.269953011445707e-07, + "loss": 0.078, + "step": 8344 + }, + { + "epoch": 2.704147764095917, + "grad_norm": 0.8542706370353699, + "learning_rate": 1.267202353736438e-07, + "loss": 0.0688, + "step": 8345 + }, + { + "epoch": 2.7044718081659105, + "grad_norm": 0.8407845497131348, + "learning_rate": 1.26445460069185e-07, + "loss": 0.0743, + "step": 8346 + }, + { + "epoch": 2.704795852235904, + "grad_norm": 0.8969268202781677, + "learning_rate": 1.2617097526482407e-07, + "loss": 0.0806, + "step": 8347 + }, + { + "epoch": 2.7051198963058978, + "grad_norm": 0.8536497354507446, + "learning_rate": 1.2589678099415582e-07, + "loss": 0.0747, + "step": 8348 + }, + { + "epoch": 2.705443940375891, + "grad_norm": 0.9546661376953125, + "learning_rate": 1.256228772907378e-07, + "loss": 0.0842, + "step": 8349 + }, + { + "epoch": 2.7057679844458846, + "grad_norm": 0.9164236187934875, + "learning_rate": 1.2534926418809433e-07, + "loss": 0.0775, + "step": 8350 + }, + { + "epoch": 2.706092028515878, + "grad_norm": 0.9462824463844299, + "learning_rate": 1.2507594171971198e-07, + "loss": 0.0798, + "step": 8351 + }, + { + "epoch": 2.7064160725858715, + "grad_norm": 0.8498296141624451, + "learning_rate": 1.2480290991904398e-07, + "loss": 0.0735, + "step": 8352 + }, + { + "epoch": 2.7067401166558653, + "grad_norm": 0.9814295172691345, + "learning_rate": 1.245301688195058e-07, + "loss": 0.0735, + "step": 8353 + }, + { + "epoch": 2.7070641607258588, + "grad_norm": 0.9598404765129089, + "learning_rate": 1.2425771845447853e-07, + "loss": 0.0789, + "step": 8354 + }, + { + "epoch": 2.707388204795852, + "grad_norm": 0.8988109827041626, + "learning_rate": 1.2398555885730774e-07, + "loss": 0.0806, + "step": 8355 + }, + { + "epoch": 2.7077122488658456, + "grad_norm": 0.8295194506645203, + "learning_rate": 1.2371369006130256e-07, + "loss": 0.0776, + "step": 8356 + }, + { + "epoch": 2.708036292935839, + "grad_norm": 0.9095755815505981, + "learning_rate": 1.2344211209973811e-07, + "loss": 0.0747, + "step": 8357 + }, + { + "epoch": 2.708360337005833, + "grad_norm": 0.9850380420684814, + "learning_rate": 1.2317082500585163e-07, + "loss": 0.0831, + "step": 8358 + }, + { + "epoch": 2.7086843810758263, + "grad_norm": 0.9201284050941467, + "learning_rate": 1.2289982881284718e-07, + "loss": 0.076, + "step": 8359 + }, + { + "epoch": 2.7090084251458197, + "grad_norm": 0.8760098814964294, + "learning_rate": 1.226291235538915e-07, + "loss": 0.0787, + "step": 8360 + }, + { + "epoch": 2.7093324692158136, + "grad_norm": 0.8688327074050903, + "learning_rate": 1.223587092621162e-07, + "loss": 0.0774, + "step": 8361 + }, + { + "epoch": 2.709656513285807, + "grad_norm": 0.8596488237380981, + "learning_rate": 1.2208858597061752e-07, + "loss": 0.0744, + "step": 8362 + }, + { + "epoch": 2.7099805573558005, + "grad_norm": 0.8696610331535339, + "learning_rate": 1.21818753712456e-07, + "loss": 0.0775, + "step": 8363 + }, + { + "epoch": 2.710304601425794, + "grad_norm": 0.8679714798927307, + "learning_rate": 1.2154921252065633e-07, + "loss": 0.0711, + "step": 8364 + }, + { + "epoch": 2.7106286454957873, + "grad_norm": 0.9504297375679016, + "learning_rate": 1.2127996242820822e-07, + "loss": 0.0822, + "step": 8365 + }, + { + "epoch": 2.710952689565781, + "grad_norm": 0.8958672285079956, + "learning_rate": 1.2101100346806478e-07, + "loss": 0.0749, + "step": 8366 + }, + { + "epoch": 2.7112767336357746, + "grad_norm": 0.8755009174346924, + "learning_rate": 1.2074233567314408e-07, + "loss": 0.0745, + "step": 8367 + }, + { + "epoch": 2.711600777705768, + "grad_norm": 0.9226860404014587, + "learning_rate": 1.2047395907632818e-07, + "loss": 0.0808, + "step": 8368 + }, + { + "epoch": 2.7119248217757614, + "grad_norm": 0.9615294933319092, + "learning_rate": 1.2020587371046445e-07, + "loss": 0.08, + "step": 8369 + }, + { + "epoch": 2.712248865845755, + "grad_norm": 0.956706702709198, + "learning_rate": 1.1993807960836322e-07, + "loss": 0.0772, + "step": 8370 + }, + { + "epoch": 2.7125729099157487, + "grad_norm": 0.896210253238678, + "learning_rate": 1.1967057680280058e-07, + "loss": 0.075, + "step": 8371 + }, + { + "epoch": 2.712896953985742, + "grad_norm": 0.9272021055221558, + "learning_rate": 1.1940336532651614e-07, + "loss": 0.0735, + "step": 8372 + }, + { + "epoch": 2.7132209980557356, + "grad_norm": 0.9543355703353882, + "learning_rate": 1.1913644521221345e-07, + "loss": 0.0809, + "step": 8373 + }, + { + "epoch": 2.713545042125729, + "grad_norm": 0.8474852442741394, + "learning_rate": 1.1886981649256169e-07, + "loss": 0.0731, + "step": 8374 + }, + { + "epoch": 2.7138690861957224, + "grad_norm": 0.9196550250053406, + "learning_rate": 1.1860347920019304e-07, + "loss": 0.0784, + "step": 8375 + }, + { + "epoch": 2.7141931302657163, + "grad_norm": 0.879184365272522, + "learning_rate": 1.1833743336770482e-07, + "loss": 0.0735, + "step": 8376 + }, + { + "epoch": 2.7145171743357097, + "grad_norm": 0.9110177755355835, + "learning_rate": 1.1807167902765843e-07, + "loss": 0.0745, + "step": 8377 + }, + { + "epoch": 2.714841218405703, + "grad_norm": 0.8994605541229248, + "learning_rate": 1.1780621621257953e-07, + "loss": 0.0757, + "step": 8378 + }, + { + "epoch": 2.7151652624756966, + "grad_norm": 0.8339632153511047, + "learning_rate": 1.1754104495495882e-07, + "loss": 0.0705, + "step": 8379 + }, + { + "epoch": 2.71548930654569, + "grad_norm": 0.8231309652328491, + "learning_rate": 1.1727616528724949e-07, + "loss": 0.0689, + "step": 8380 + }, + { + "epoch": 2.715813350615684, + "grad_norm": 0.9211947917938232, + "learning_rate": 1.1701157724187173e-07, + "loss": 0.0828, + "step": 8381 + }, + { + "epoch": 2.7161373946856773, + "grad_norm": 0.9162659645080566, + "learning_rate": 1.1674728085120713e-07, + "loss": 0.0784, + "step": 8382 + }, + { + "epoch": 2.7164614387556707, + "grad_norm": 0.9284588694572449, + "learning_rate": 1.1648327614760452e-07, + "loss": 0.0834, + "step": 8383 + }, + { + "epoch": 2.7167854828256646, + "grad_norm": 0.8979219198226929, + "learning_rate": 1.1621956316337391e-07, + "loss": 0.0778, + "step": 8384 + }, + { + "epoch": 2.7171095268956575, + "grad_norm": 0.7882916927337646, + "learning_rate": 1.1595614193079224e-07, + "loss": 0.067, + "step": 8385 + }, + { + "epoch": 2.7174335709656514, + "grad_norm": 0.8570539355278015, + "learning_rate": 1.1569301248209958e-07, + "loss": 0.0701, + "step": 8386 + }, + { + "epoch": 2.717757615035645, + "grad_norm": 0.9228929877281189, + "learning_rate": 1.1543017484950015e-07, + "loss": 0.0758, + "step": 8387 + }, + { + "epoch": 2.7180816591056383, + "grad_norm": 0.8897868990898132, + "learning_rate": 1.1516762906516322e-07, + "loss": 0.0745, + "step": 8388 + }, + { + "epoch": 2.718405703175632, + "grad_norm": 0.874020516872406, + "learning_rate": 1.1490537516122141e-07, + "loss": 0.078, + "step": 8389 + }, + { + "epoch": 2.7187297472456255, + "grad_norm": 0.8467276096343994, + "learning_rate": 1.1464341316977184e-07, + "loss": 0.0707, + "step": 8390 + }, + { + "epoch": 2.719053791315619, + "grad_norm": 0.9447891712188721, + "learning_rate": 1.1438174312287664e-07, + "loss": 0.0795, + "step": 8391 + }, + { + "epoch": 2.7193778353856124, + "grad_norm": 0.9125964641571045, + "learning_rate": 1.1412036505256158e-07, + "loss": 0.0781, + "step": 8392 + }, + { + "epoch": 2.719701879455606, + "grad_norm": 0.893666684627533, + "learning_rate": 1.1385927899081661e-07, + "loss": 0.0769, + "step": 8393 + }, + { + "epoch": 2.7200259235255997, + "grad_norm": 0.9132059812545776, + "learning_rate": 1.1359848496959618e-07, + "loss": 0.0725, + "step": 8394 + }, + { + "epoch": 2.720349967595593, + "grad_norm": 0.8375158905982971, + "learning_rate": 1.1333798302081922e-07, + "loss": 0.0701, + "step": 8395 + }, + { + "epoch": 2.7206740116655865, + "grad_norm": 0.9415892362594604, + "learning_rate": 1.1307777317636882e-07, + "loss": 0.0827, + "step": 8396 + }, + { + "epoch": 2.72099805573558, + "grad_norm": 0.8136382699012756, + "learning_rate": 1.1281785546809115e-07, + "loss": 0.0695, + "step": 8397 + }, + { + "epoch": 2.7213220998055734, + "grad_norm": 0.8608342409133911, + "learning_rate": 1.1255822992779858e-07, + "loss": 0.071, + "step": 8398 + }, + { + "epoch": 2.7216461438755672, + "grad_norm": 0.8565046787261963, + "learning_rate": 1.1229889658726623e-07, + "loss": 0.0757, + "step": 8399 + }, + { + "epoch": 2.7219701879455607, + "grad_norm": 0.875421404838562, + "learning_rate": 1.1203985547823427e-07, + "loss": 0.0747, + "step": 8400 + }, + { + "epoch": 2.722294232015554, + "grad_norm": 0.886735200881958, + "learning_rate": 1.1178110663240676e-07, + "loss": 0.0787, + "step": 8401 + }, + { + "epoch": 2.7226182760855475, + "grad_norm": 0.937952995300293, + "learning_rate": 1.1152265008145202e-07, + "loss": 0.0784, + "step": 8402 + }, + { + "epoch": 2.722942320155541, + "grad_norm": 0.9049589037895203, + "learning_rate": 1.1126448585700306e-07, + "loss": 0.0774, + "step": 8403 + }, + { + "epoch": 2.723266364225535, + "grad_norm": 0.9026086330413818, + "learning_rate": 1.110066139906557e-07, + "loss": 0.0744, + "step": 8404 + }, + { + "epoch": 2.7235904082955282, + "grad_norm": 0.9097543358802795, + "learning_rate": 1.1074903451397195e-07, + "loss": 0.0777, + "step": 8405 + }, + { + "epoch": 2.7239144523655217, + "grad_norm": 0.8582219481468201, + "learning_rate": 1.1049174745847657e-07, + "loss": 0.0689, + "step": 8406 + }, + { + "epoch": 2.724238496435515, + "grad_norm": 0.8198797106742859, + "learning_rate": 1.1023475285565882e-07, + "loss": 0.0707, + "step": 8407 + }, + { + "epoch": 2.7245625405055085, + "grad_norm": 0.9629983305931091, + "learning_rate": 1.099780507369727e-07, + "loss": 0.0816, + "step": 8408 + }, + { + "epoch": 2.7248865845755024, + "grad_norm": 0.9551554918289185, + "learning_rate": 1.0972164113383616e-07, + "loss": 0.0772, + "step": 8409 + }, + { + "epoch": 2.725210628645496, + "grad_norm": 0.8462727665901184, + "learning_rate": 1.09465524077631e-07, + "loss": 0.0693, + "step": 8410 + }, + { + "epoch": 2.725534672715489, + "grad_norm": 1.0670876502990723, + "learning_rate": 1.0920969959970301e-07, + "loss": 0.0793, + "step": 8411 + }, + { + "epoch": 2.725858716785483, + "grad_norm": 0.9187403321266174, + "learning_rate": 1.0895416773136408e-07, + "loss": 0.0821, + "step": 8412 + }, + { + "epoch": 2.7261827608554765, + "grad_norm": 0.939841091632843, + "learning_rate": 1.0869892850388697e-07, + "loss": 0.077, + "step": 8413 + }, + { + "epoch": 2.72650680492547, + "grad_norm": 0.9433113932609558, + "learning_rate": 1.0844398194851197e-07, + "loss": 0.0807, + "step": 8414 + }, + { + "epoch": 2.7268308489954634, + "grad_norm": 0.96625155210495, + "learning_rate": 1.0818932809644161e-07, + "loss": 0.0771, + "step": 8415 + }, + { + "epoch": 2.7271548930654568, + "grad_norm": 1.2603490352630615, + "learning_rate": 1.0793496697884265e-07, + "loss": 0.0754, + "step": 8416 + }, + { + "epoch": 2.7274789371354506, + "grad_norm": 0.8630130290985107, + "learning_rate": 1.0768089862684684e-07, + "loss": 0.0787, + "step": 8417 + }, + { + "epoch": 2.727802981205444, + "grad_norm": 0.9122354984283447, + "learning_rate": 1.0742712307154957e-07, + "loss": 0.0814, + "step": 8418 + }, + { + "epoch": 2.7281270252754375, + "grad_norm": 0.8623340129852295, + "learning_rate": 1.0717364034401073e-07, + "loss": 0.0741, + "step": 8419 + }, + { + "epoch": 2.728451069345431, + "grad_norm": 0.9016427993774414, + "learning_rate": 1.0692045047525384e-07, + "loss": 0.075, + "step": 8420 + }, + { + "epoch": 2.7287751134154243, + "grad_norm": 0.8372197151184082, + "learning_rate": 1.066675534962669e-07, + "loss": 0.0736, + "step": 8421 + }, + { + "epoch": 2.729099157485418, + "grad_norm": 0.9791613817214966, + "learning_rate": 1.0641494943800234e-07, + "loss": 0.0818, + "step": 8422 + }, + { + "epoch": 2.7294232015554116, + "grad_norm": 0.9533630609512329, + "learning_rate": 1.0616263833137602e-07, + "loss": 0.0758, + "step": 8423 + }, + { + "epoch": 2.729747245625405, + "grad_norm": 0.924326479434967, + "learning_rate": 1.0591062020726878e-07, + "loss": 0.0781, + "step": 8424 + }, + { + "epoch": 2.7300712896953985, + "grad_norm": 0.9117130041122437, + "learning_rate": 1.0565889509652483e-07, + "loss": 0.0769, + "step": 8425 + }, + { + "epoch": 2.730395333765392, + "grad_norm": 0.7807494401931763, + "learning_rate": 1.0540746302995341e-07, + "loss": 0.0665, + "step": 8426 + }, + { + "epoch": 2.7307193778353858, + "grad_norm": 0.8621374368667603, + "learning_rate": 1.0515632403832715e-07, + "loss": 0.073, + "step": 8427 + }, + { + "epoch": 2.731043421905379, + "grad_norm": 1.0108366012573242, + "learning_rate": 1.0490547815238228e-07, + "loss": 0.081, + "step": 8428 + }, + { + "epoch": 2.7313674659753726, + "grad_norm": 0.8846213817596436, + "learning_rate": 1.0465492540282146e-07, + "loss": 0.0788, + "step": 8429 + }, + { + "epoch": 2.731691510045366, + "grad_norm": 0.8477645516395569, + "learning_rate": 1.044046658203085e-07, + "loss": 0.0723, + "step": 8430 + }, + { + "epoch": 2.7320155541153595, + "grad_norm": 1.0279101133346558, + "learning_rate": 1.0415469943547335e-07, + "loss": 0.0787, + "step": 8431 + }, + { + "epoch": 2.7323395981853533, + "grad_norm": 0.8763721585273743, + "learning_rate": 1.0390502627890986e-07, + "loss": 0.0769, + "step": 8432 + }, + { + "epoch": 2.7326636422553467, + "grad_norm": 0.8510463833808899, + "learning_rate": 1.0365564638117442e-07, + "loss": 0.075, + "step": 8433 + }, + { + "epoch": 2.73298768632534, + "grad_norm": 0.891975462436676, + "learning_rate": 1.0340655977279012e-07, + "loss": 0.0804, + "step": 8434 + }, + { + "epoch": 2.733311730395334, + "grad_norm": 0.8500216007232666, + "learning_rate": 1.0315776648424119e-07, + "loss": 0.0746, + "step": 8435 + }, + { + "epoch": 2.733635774465327, + "grad_norm": 0.8794539570808411, + "learning_rate": 1.0290926654597938e-07, + "loss": 0.0736, + "step": 8436 + }, + { + "epoch": 2.733959818535321, + "grad_norm": 0.871335506439209, + "learning_rate": 1.0266105998841702e-07, + "loss": 0.072, + "step": 8437 + }, + { + "epoch": 2.7342838626053143, + "grad_norm": 0.9088922739028931, + "learning_rate": 1.0241314684193343e-07, + "loss": 0.0765, + "step": 8438 + }, + { + "epoch": 2.7346079066753077, + "grad_norm": 0.889601469039917, + "learning_rate": 1.0216552713686989e-07, + "loss": 0.0759, + "step": 8439 + }, + { + "epoch": 2.7349319507453016, + "grad_norm": 0.8953149914741516, + "learning_rate": 1.01918200903533e-07, + "loss": 0.0782, + "step": 8440 + }, + { + "epoch": 2.735255994815295, + "grad_norm": 0.9992877840995789, + "learning_rate": 1.0167116817219325e-07, + "loss": 0.0872, + "step": 8441 + }, + { + "epoch": 2.7355800388852884, + "grad_norm": 0.90580815076828, + "learning_rate": 1.0142442897308453e-07, + "loss": 0.0779, + "step": 8442 + }, + { + "epoch": 2.735904082955282, + "grad_norm": 1.0264875888824463, + "learning_rate": 1.0117798333640627e-07, + "loss": 0.082, + "step": 8443 + }, + { + "epoch": 2.7362281270252753, + "grad_norm": 0.864154577255249, + "learning_rate": 1.0093183129231993e-07, + "loss": 0.0738, + "step": 8444 + }, + { + "epoch": 2.736552171095269, + "grad_norm": 0.8238134384155273, + "learning_rate": 1.0068597287095305e-07, + "loss": 0.0756, + "step": 8445 + }, + { + "epoch": 2.7368762151652626, + "grad_norm": 0.8123138546943665, + "learning_rate": 1.0044040810239547e-07, + "loss": 0.0729, + "step": 8446 + }, + { + "epoch": 2.737200259235256, + "grad_norm": 0.8799177408218384, + "learning_rate": 1.0019513701670285e-07, + "loss": 0.0752, + "step": 8447 + }, + { + "epoch": 2.7375243033052494, + "grad_norm": 0.9700337648391724, + "learning_rate": 9.995015964389315e-08, + "loss": 0.0769, + "step": 8448 + }, + { + "epoch": 2.737848347375243, + "grad_norm": 0.9147710204124451, + "learning_rate": 9.970547601394986e-08, + "loss": 0.0766, + "step": 8449 + }, + { + "epoch": 2.7381723914452367, + "grad_norm": 0.9626017212867737, + "learning_rate": 9.94610861568196e-08, + "loss": 0.0794, + "step": 8450 + }, + { + "epoch": 2.73849643551523, + "grad_norm": 0.8789365291595459, + "learning_rate": 9.92169901024137e-08, + "loss": 0.0753, + "step": 8451 + }, + { + "epoch": 2.7388204795852236, + "grad_norm": 0.8948490023612976, + "learning_rate": 9.897318788060662e-08, + "loss": 0.0762, + "step": 8452 + }, + { + "epoch": 2.739144523655217, + "grad_norm": 0.867906928062439, + "learning_rate": 9.872967952123752e-08, + "loss": 0.0745, + "step": 8453 + }, + { + "epoch": 2.7394685677252104, + "grad_norm": 0.8962223529815674, + "learning_rate": 9.848646505410953e-08, + "loss": 0.0736, + "step": 8454 + }, + { + "epoch": 2.7397926117952043, + "grad_norm": 0.8685925602912903, + "learning_rate": 9.824354450898966e-08, + "loss": 0.0709, + "step": 8455 + }, + { + "epoch": 2.7401166558651977, + "grad_norm": 0.955605685710907, + "learning_rate": 9.800091791560939e-08, + "loss": 0.0813, + "step": 8456 + }, + { + "epoch": 2.740440699935191, + "grad_norm": 0.8607534170150757, + "learning_rate": 9.775858530366334e-08, + "loss": 0.0706, + "step": 8457 + }, + { + "epoch": 2.7407647440051845, + "grad_norm": 1.0163851976394653, + "learning_rate": 9.751654670281135e-08, + "loss": 0.0786, + "step": 8458 + }, + { + "epoch": 2.741088788075178, + "grad_norm": 0.9263463020324707, + "learning_rate": 9.727480214267559e-08, + "loss": 0.0811, + "step": 8459 + }, + { + "epoch": 2.741412832145172, + "grad_norm": 0.9529585838317871, + "learning_rate": 9.70333516528446e-08, + "loss": 0.0847, + "step": 8460 + }, + { + "epoch": 2.7417368762151653, + "grad_norm": 0.9520207643508911, + "learning_rate": 9.679219526286837e-08, + "loss": 0.0809, + "step": 8461 + }, + { + "epoch": 2.7420609202851587, + "grad_norm": 0.8074455261230469, + "learning_rate": 9.655133300226271e-08, + "loss": 0.0715, + "step": 8462 + }, + { + "epoch": 2.7423849643551526, + "grad_norm": 0.9240773916244507, + "learning_rate": 9.631076490050684e-08, + "loss": 0.0797, + "step": 8463 + }, + { + "epoch": 2.742709008425146, + "grad_norm": 0.9265851378440857, + "learning_rate": 9.60704909870433e-08, + "loss": 0.0794, + "step": 8464 + }, + { + "epoch": 2.7430330524951394, + "grad_norm": 0.865867018699646, + "learning_rate": 9.583051129128051e-08, + "loss": 0.0718, + "step": 8465 + }, + { + "epoch": 2.743357096565133, + "grad_norm": 0.8869981169700623, + "learning_rate": 9.559082584258833e-08, + "loss": 0.0818, + "step": 8466 + }, + { + "epoch": 2.7436811406351262, + "grad_norm": 0.9116880297660828, + "learning_rate": 9.535143467030327e-08, + "loss": 0.0793, + "step": 8467 + }, + { + "epoch": 2.74400518470512, + "grad_norm": 0.8475185632705688, + "learning_rate": 9.511233780372303e-08, + "loss": 0.0705, + "step": 8468 + }, + { + "epoch": 2.7443292287751135, + "grad_norm": 0.8898075819015503, + "learning_rate": 9.487353527211223e-08, + "loss": 0.0798, + "step": 8469 + }, + { + "epoch": 2.744653272845107, + "grad_norm": 0.9375563263893127, + "learning_rate": 9.463502710469697e-08, + "loss": 0.0784, + "step": 8470 + }, + { + "epoch": 2.7449773169151004, + "grad_norm": 0.8835859298706055, + "learning_rate": 9.439681333066858e-08, + "loss": 0.0717, + "step": 8471 + }, + { + "epoch": 2.745301360985094, + "grad_norm": 0.8493410348892212, + "learning_rate": 9.415889397918238e-08, + "loss": 0.0708, + "step": 8472 + }, + { + "epoch": 2.7456254050550877, + "grad_norm": 0.8702937960624695, + "learning_rate": 9.3921269079357e-08, + "loss": 0.075, + "step": 8473 + }, + { + "epoch": 2.745949449125081, + "grad_norm": 0.8226486444473267, + "learning_rate": 9.368393866027614e-08, + "loss": 0.0681, + "step": 8474 + }, + { + "epoch": 2.7462734931950745, + "grad_norm": 0.8785573244094849, + "learning_rate": 9.344690275098573e-08, + "loss": 0.0749, + "step": 8475 + }, + { + "epoch": 2.746597537265068, + "grad_norm": 0.8903120160102844, + "learning_rate": 9.321016138049727e-08, + "loss": 0.0722, + "step": 8476 + }, + { + "epoch": 2.7469215813350614, + "grad_norm": 0.9668460488319397, + "learning_rate": 9.297371457778565e-08, + "loss": 0.0811, + "step": 8477 + }, + { + "epoch": 2.7472456254050552, + "grad_norm": 0.9164747595787048, + "learning_rate": 9.273756237178938e-08, + "loss": 0.0802, + "step": 8478 + }, + { + "epoch": 2.7475696694750487, + "grad_norm": 0.9257664084434509, + "learning_rate": 9.250170479141146e-08, + "loss": 0.0764, + "step": 8479 + }, + { + "epoch": 2.747893713545042, + "grad_norm": 0.8608869314193726, + "learning_rate": 9.226614186551852e-08, + "loss": 0.0742, + "step": 8480 + }, + { + "epoch": 2.7482177576150355, + "grad_norm": 0.8738452196121216, + "learning_rate": 9.20308736229411e-08, + "loss": 0.0737, + "step": 8481 + }, + { + "epoch": 2.748541801685029, + "grad_norm": 0.9163529276847839, + "learning_rate": 9.179590009247397e-08, + "loss": 0.0807, + "step": 8482 + }, + { + "epoch": 2.748865845755023, + "grad_norm": 0.9178310632705688, + "learning_rate": 9.15612213028752e-08, + "loss": 0.074, + "step": 8483 + }, + { + "epoch": 2.749189889825016, + "grad_norm": 0.8706983923912048, + "learning_rate": 9.132683728286767e-08, + "loss": 0.0756, + "step": 8484 + }, + { + "epoch": 2.7495139338950096, + "grad_norm": 0.9542733430862427, + "learning_rate": 9.109274806113732e-08, + "loss": 0.0796, + "step": 8485 + }, + { + "epoch": 2.7498379779650035, + "grad_norm": 0.8895235061645508, + "learning_rate": 9.085895366633457e-08, + "loss": 0.0785, + "step": 8486 + }, + { + "epoch": 2.7501620220349965, + "grad_norm": 0.9087440967559814, + "learning_rate": 9.062545412707375e-08, + "loss": 0.0707, + "step": 8487 + }, + { + "epoch": 2.7504860661049904, + "grad_norm": 0.998375415802002, + "learning_rate": 9.039224947193254e-08, + "loss": 0.0817, + "step": 8488 + }, + { + "epoch": 2.750810110174984, + "grad_norm": 0.8863471746444702, + "learning_rate": 9.01593397294534e-08, + "loss": 0.0761, + "step": 8489 + }, + { + "epoch": 2.751134154244977, + "grad_norm": 0.9317454099655151, + "learning_rate": 8.992672492814158e-08, + "loss": 0.0776, + "step": 8490 + }, + { + "epoch": 2.751458198314971, + "grad_norm": 0.900424063205719, + "learning_rate": 8.969440509646821e-08, + "loss": 0.0793, + "step": 8491 + }, + { + "epoch": 2.7517822423849645, + "grad_norm": 0.8822380900382996, + "learning_rate": 8.946238026286552e-08, + "loss": 0.0736, + "step": 8492 + }, + { + "epoch": 2.752106286454958, + "grad_norm": 0.8611971139907837, + "learning_rate": 8.923065045573165e-08, + "loss": 0.076, + "step": 8493 + }, + { + "epoch": 2.7524303305249513, + "grad_norm": 0.8159151077270508, + "learning_rate": 8.899921570342807e-08, + "loss": 0.0683, + "step": 8494 + }, + { + "epoch": 2.7527543745949448, + "grad_norm": 0.7950197458267212, + "learning_rate": 8.876807603428017e-08, + "loss": 0.068, + "step": 8495 + }, + { + "epoch": 2.7530784186649386, + "grad_norm": 0.9587838649749756, + "learning_rate": 8.853723147657755e-08, + "loss": 0.0804, + "step": 8496 + }, + { + "epoch": 2.753402462734932, + "grad_norm": 0.9382262825965881, + "learning_rate": 8.830668205857263e-08, + "loss": 0.0795, + "step": 8497 + }, + { + "epoch": 2.7537265068049255, + "grad_norm": 0.9049416184425354, + "learning_rate": 8.807642780848335e-08, + "loss": 0.0774, + "step": 8498 + }, + { + "epoch": 2.754050550874919, + "grad_norm": 0.8489470481872559, + "learning_rate": 8.784646875448971e-08, + "loss": 0.0745, + "step": 8499 + }, + { + "epoch": 2.7543745949449123, + "grad_norm": 0.8656203150749207, + "learning_rate": 8.761680492473668e-08, + "loss": 0.0756, + "step": 8500 + }, + { + "epoch": 2.754698639014906, + "grad_norm": 0.824772298336029, + "learning_rate": 8.738743634733316e-08, + "loss": 0.0738, + "step": 8501 + }, + { + "epoch": 2.7550226830848996, + "grad_norm": 0.8814231753349304, + "learning_rate": 8.715836305035169e-08, + "loss": 0.0725, + "step": 8502 + }, + { + "epoch": 2.755346727154893, + "grad_norm": 0.85878586769104, + "learning_rate": 8.692958506182847e-08, + "loss": 0.0776, + "step": 8503 + }, + { + "epoch": 2.7556707712248865, + "grad_norm": 0.8849053978919983, + "learning_rate": 8.67011024097636e-08, + "loss": 0.0712, + "step": 8504 + }, + { + "epoch": 2.75599481529488, + "grad_norm": 0.9141733050346375, + "learning_rate": 8.647291512212136e-08, + "loss": 0.0816, + "step": 8505 + }, + { + "epoch": 2.7563188593648738, + "grad_norm": 0.8470417261123657, + "learning_rate": 8.624502322682942e-08, + "loss": 0.0681, + "step": 8506 + }, + { + "epoch": 2.756642903434867, + "grad_norm": 0.856762707233429, + "learning_rate": 8.601742675177993e-08, + "loss": 0.0709, + "step": 8507 + }, + { + "epoch": 2.7569669475048606, + "grad_norm": 0.8500531315803528, + "learning_rate": 8.57901257248278e-08, + "loss": 0.0693, + "step": 8508 + }, + { + "epoch": 2.757290991574854, + "grad_norm": 0.8620449304580688, + "learning_rate": 8.556312017379332e-08, + "loss": 0.0778, + "step": 8509 + }, + { + "epoch": 2.7576150356448474, + "grad_norm": 0.9037514925003052, + "learning_rate": 8.533641012645921e-08, + "loss": 0.0777, + "step": 8510 + }, + { + "epoch": 2.7579390797148413, + "grad_norm": 0.871510922908783, + "learning_rate": 8.510999561057276e-08, + "loss": 0.0746, + "step": 8511 + }, + { + "epoch": 2.7582631237848347, + "grad_norm": 0.8849137425422668, + "learning_rate": 8.488387665384457e-08, + "loss": 0.0758, + "step": 8512 + }, + { + "epoch": 2.758587167854828, + "grad_norm": 1.0339986085891724, + "learning_rate": 8.465805328395055e-08, + "loss": 0.0714, + "step": 8513 + }, + { + "epoch": 2.758911211924822, + "grad_norm": 0.8823992609977722, + "learning_rate": 8.443252552852776e-08, + "loss": 0.0776, + "step": 8514 + }, + { + "epoch": 2.7592352559948155, + "grad_norm": 0.9128614068031311, + "learning_rate": 8.42072934151797e-08, + "loss": 0.0755, + "step": 8515 + }, + { + "epoch": 2.759559300064809, + "grad_norm": 0.8525954484939575, + "learning_rate": 8.398235697147205e-08, + "loss": 0.068, + "step": 8516 + }, + { + "epoch": 2.7598833441348023, + "grad_norm": 0.9029262661933899, + "learning_rate": 8.375771622493506e-08, + "loss": 0.0798, + "step": 8517 + }, + { + "epoch": 2.7602073882047957, + "grad_norm": 0.9318966269493103, + "learning_rate": 8.353337120306282e-08, + "loss": 0.0786, + "step": 8518 + }, + { + "epoch": 2.7605314322747896, + "grad_norm": 0.9503961205482483, + "learning_rate": 8.330932193331226e-08, + "loss": 0.0827, + "step": 8519 + }, + { + "epoch": 2.760855476344783, + "grad_norm": 0.8519180417060852, + "learning_rate": 8.308556844310589e-08, + "loss": 0.0688, + "step": 8520 + }, + { + "epoch": 2.7611795204147764, + "grad_norm": 0.8932039141654968, + "learning_rate": 8.286211075982764e-08, + "loss": 0.078, + "step": 8521 + }, + { + "epoch": 2.76150356448477, + "grad_norm": 0.8670885562896729, + "learning_rate": 8.263894891082813e-08, + "loss": 0.0763, + "step": 8522 + }, + { + "epoch": 2.7618276085547633, + "grad_norm": 0.8525798320770264, + "learning_rate": 8.241608292341913e-08, + "loss": 0.074, + "step": 8523 + }, + { + "epoch": 2.762151652624757, + "grad_norm": 0.8913243412971497, + "learning_rate": 8.219351282487742e-08, + "loss": 0.0743, + "step": 8524 + }, + { + "epoch": 2.7624756966947506, + "grad_norm": 0.7880865931510925, + "learning_rate": 8.197123864244344e-08, + "loss": 0.0682, + "step": 8525 + }, + { + "epoch": 2.762799740764744, + "grad_norm": 0.8847666382789612, + "learning_rate": 8.174926040332182e-08, + "loss": 0.0758, + "step": 8526 + }, + { + "epoch": 2.7631237848347374, + "grad_norm": 0.8482584953308105, + "learning_rate": 8.152757813468027e-08, + "loss": 0.0698, + "step": 8527 + }, + { + "epoch": 2.763447828904731, + "grad_norm": 0.8862192630767822, + "learning_rate": 8.130619186365012e-08, + "loss": 0.0785, + "step": 8528 + }, + { + "epoch": 2.7637718729747247, + "grad_norm": 0.9533929228782654, + "learning_rate": 8.10851016173278e-08, + "loss": 0.0805, + "step": 8529 + }, + { + "epoch": 2.764095917044718, + "grad_norm": 0.9363769292831421, + "learning_rate": 8.086430742277191e-08, + "loss": 0.0752, + "step": 8530 + }, + { + "epoch": 2.7644199611147116, + "grad_norm": 0.8216090798377991, + "learning_rate": 8.064380930700556e-08, + "loss": 0.0719, + "step": 8531 + }, + { + "epoch": 2.764744005184705, + "grad_norm": 0.836811363697052, + "learning_rate": 8.042360729701604e-08, + "loss": 0.078, + "step": 8532 + }, + { + "epoch": 2.7650680492546984, + "grad_norm": 0.8772767186164856, + "learning_rate": 8.020370141975347e-08, + "loss": 0.0774, + "step": 8533 + }, + { + "epoch": 2.7653920933246923, + "grad_norm": 0.9011791944503784, + "learning_rate": 7.998409170213245e-08, + "loss": 0.0757, + "step": 8534 + }, + { + "epoch": 2.7657161373946857, + "grad_norm": 0.9040393233299255, + "learning_rate": 7.976477817103117e-08, + "loss": 0.0782, + "step": 8535 + }, + { + "epoch": 2.766040181464679, + "grad_norm": 0.877589762210846, + "learning_rate": 7.954576085329152e-08, + "loss": 0.0731, + "step": 8536 + }, + { + "epoch": 2.766364225534673, + "grad_norm": 0.8785209655761719, + "learning_rate": 7.93270397757187e-08, + "loss": 0.0741, + "step": 8537 + }, + { + "epoch": 2.766688269604666, + "grad_norm": 0.9807142019271851, + "learning_rate": 7.910861496508216e-08, + "loss": 0.0821, + "step": 8538 + }, + { + "epoch": 2.76701231367466, + "grad_norm": 0.8884080648422241, + "learning_rate": 7.88904864481152e-08, + "loss": 0.0752, + "step": 8539 + }, + { + "epoch": 2.7673363577446533, + "grad_norm": 0.9433414340019226, + "learning_rate": 7.867265425151454e-08, + "loss": 0.0815, + "step": 8540 + }, + { + "epoch": 2.7676604018146467, + "grad_norm": 0.9098288416862488, + "learning_rate": 7.845511840194081e-08, + "loss": 0.0772, + "step": 8541 + }, + { + "epoch": 2.7679844458846405, + "grad_norm": 0.976300835609436, + "learning_rate": 7.823787892601825e-08, + "loss": 0.0837, + "step": 8542 + }, + { + "epoch": 2.768308489954634, + "grad_norm": 0.9695088863372803, + "learning_rate": 7.802093585033449e-08, + "loss": 0.0816, + "step": 8543 + }, + { + "epoch": 2.7686325340246274, + "grad_norm": 0.8716114163398743, + "learning_rate": 7.780428920144217e-08, + "loss": 0.0727, + "step": 8544 + }, + { + "epoch": 2.768956578094621, + "grad_norm": 0.8254791498184204, + "learning_rate": 7.758793900585565e-08, + "loss": 0.0727, + "step": 8545 + }, + { + "epoch": 2.7692806221646142, + "grad_norm": 0.88580721616745, + "learning_rate": 7.737188529005484e-08, + "loss": 0.0765, + "step": 8546 + }, + { + "epoch": 2.769604666234608, + "grad_norm": 0.963691771030426, + "learning_rate": 7.715612808048251e-08, + "loss": 0.0782, + "step": 8547 + }, + { + "epoch": 2.7699287103046015, + "grad_norm": 0.894769549369812, + "learning_rate": 7.6940667403545e-08, + "loss": 0.0757, + "step": 8548 + }, + { + "epoch": 2.770252754374595, + "grad_norm": 0.8622497916221619, + "learning_rate": 7.672550328561318e-08, + "loss": 0.0733, + "step": 8549 + }, + { + "epoch": 2.7705767984445884, + "grad_norm": 0.9186211824417114, + "learning_rate": 7.651063575301986e-08, + "loss": 0.0802, + "step": 8550 + }, + { + "epoch": 2.770900842514582, + "grad_norm": 0.8643956780433655, + "learning_rate": 7.62960648320643e-08, + "loss": 0.0755, + "step": 8551 + }, + { + "epoch": 2.7712248865845757, + "grad_norm": 1.073041319847107, + "learning_rate": 7.608179054900634e-08, + "loss": 0.088, + "step": 8552 + }, + { + "epoch": 2.771548930654569, + "grad_norm": 0.878182590007782, + "learning_rate": 7.586781293007273e-08, + "loss": 0.0751, + "step": 8553 + }, + { + "epoch": 2.7718729747245625, + "grad_norm": 0.9077216982841492, + "learning_rate": 7.565413200145089e-08, + "loss": 0.078, + "step": 8554 + }, + { + "epoch": 2.772197018794556, + "grad_norm": 0.9496373534202576, + "learning_rate": 7.544074778929378e-08, + "loss": 0.0785, + "step": 8555 + }, + { + "epoch": 2.7725210628645494, + "grad_norm": 1.0797755718231201, + "learning_rate": 7.522766031971774e-08, + "loss": 0.0777, + "step": 8556 + }, + { + "epoch": 2.7728451069345432, + "grad_norm": 0.903744101524353, + "learning_rate": 7.501486961880245e-08, + "loss": 0.0752, + "step": 8557 + }, + { + "epoch": 2.7731691510045366, + "grad_norm": 0.8617210388183594, + "learning_rate": 7.480237571259153e-08, + "loss": 0.0697, + "step": 8558 + }, + { + "epoch": 2.77349319507453, + "grad_norm": 0.9265851378440857, + "learning_rate": 7.459017862709194e-08, + "loss": 0.0775, + "step": 8559 + }, + { + "epoch": 2.7738172391445235, + "grad_norm": 0.8909482359886169, + "learning_rate": 7.437827838827488e-08, + "loss": 0.0731, + "step": 8560 + }, + { + "epoch": 2.774141283214517, + "grad_norm": 0.866847038269043, + "learning_rate": 7.416667502207458e-08, + "loss": 0.0715, + "step": 8561 + }, + { + "epoch": 2.774465327284511, + "grad_norm": 0.8507097959518433, + "learning_rate": 7.395536855438923e-08, + "loss": 0.0721, + "step": 8562 + }, + { + "epoch": 2.774789371354504, + "grad_norm": 0.8779752850532532, + "learning_rate": 7.37443590110809e-08, + "loss": 0.0779, + "step": 8563 + }, + { + "epoch": 2.7751134154244976, + "grad_norm": 0.8747959733009338, + "learning_rate": 7.353364641797533e-08, + "loss": 0.0752, + "step": 8564 + }, + { + "epoch": 2.7754374594944915, + "grad_norm": 0.9139300584793091, + "learning_rate": 7.332323080086106e-08, + "loss": 0.0761, + "step": 8565 + }, + { + "epoch": 2.775761503564485, + "grad_norm": 0.9525585770606995, + "learning_rate": 7.311311218549166e-08, + "loss": 0.0815, + "step": 8566 + }, + { + "epoch": 2.7760855476344783, + "grad_norm": 0.9190864562988281, + "learning_rate": 7.290329059758294e-08, + "loss": 0.0834, + "step": 8567 + }, + { + "epoch": 2.7764095917044718, + "grad_norm": 0.8622193336486816, + "learning_rate": 7.269376606281547e-08, + "loss": 0.0754, + "step": 8568 + }, + { + "epoch": 2.776733635774465, + "grad_norm": 0.9313884973526001, + "learning_rate": 7.248453860683291e-08, + "loss": 0.0809, + "step": 8569 + }, + { + "epoch": 2.777057679844459, + "grad_norm": 0.941539466381073, + "learning_rate": 7.227560825524255e-08, + "loss": 0.0815, + "step": 8570 + }, + { + "epoch": 2.7773817239144525, + "grad_norm": 0.8436852097511292, + "learning_rate": 7.20669750336156e-08, + "loss": 0.0732, + "step": 8571 + }, + { + "epoch": 2.777705767984446, + "grad_norm": 0.9402943253517151, + "learning_rate": 7.185863896748662e-08, + "loss": 0.0792, + "step": 8572 + }, + { + "epoch": 2.7780298120544393, + "grad_norm": 0.917573094367981, + "learning_rate": 7.165060008235414e-08, + "loss": 0.0725, + "step": 8573 + }, + { + "epoch": 2.7783538561244328, + "grad_norm": 0.9383273124694824, + "learning_rate": 7.14428584036797e-08, + "loss": 0.0735, + "step": 8574 + }, + { + "epoch": 2.7786779001944266, + "grad_norm": 0.8470904231071472, + "learning_rate": 7.123541395688966e-08, + "loss": 0.0689, + "step": 8575 + }, + { + "epoch": 2.77900194426442, + "grad_norm": 0.8494551181793213, + "learning_rate": 7.102826676737202e-08, + "loss": 0.0759, + "step": 8576 + }, + { + "epoch": 2.7793259883344135, + "grad_norm": 0.9772122502326965, + "learning_rate": 7.082141686048066e-08, + "loss": 0.0809, + "step": 8577 + }, + { + "epoch": 2.779650032404407, + "grad_norm": 0.9359027743339539, + "learning_rate": 7.061486426153146e-08, + "loss": 0.0775, + "step": 8578 + }, + { + "epoch": 2.7799740764744003, + "grad_norm": 0.8508344292640686, + "learning_rate": 7.040860899580475e-08, + "loss": 0.073, + "step": 8579 + }, + { + "epoch": 2.780298120544394, + "grad_norm": 0.9112252593040466, + "learning_rate": 7.020265108854423e-08, + "loss": 0.078, + "step": 8580 + }, + { + "epoch": 2.7806221646143876, + "grad_norm": 0.8710126280784607, + "learning_rate": 6.99969905649564e-08, + "loss": 0.0665, + "step": 8581 + }, + { + "epoch": 2.780946208684381, + "grad_norm": 0.8951093554496765, + "learning_rate": 6.979162745021306e-08, + "loss": 0.0749, + "step": 8582 + }, + { + "epoch": 2.7812702527543745, + "grad_norm": 0.8741493821144104, + "learning_rate": 6.958656176944801e-08, + "loss": 0.0747, + "step": 8583 + }, + { + "epoch": 2.781594296824368, + "grad_norm": 0.9016504883766174, + "learning_rate": 6.938179354776003e-08, + "loss": 0.0765, + "step": 8584 + }, + { + "epoch": 2.7819183408943617, + "grad_norm": 0.8614570498466492, + "learning_rate": 6.917732281020995e-08, + "loss": 0.0734, + "step": 8585 + }, + { + "epoch": 2.782242384964355, + "grad_norm": 0.8869245052337646, + "learning_rate": 6.897314958182327e-08, + "loss": 0.076, + "step": 8586 + }, + { + "epoch": 2.7825664290343486, + "grad_norm": 0.877219557762146, + "learning_rate": 6.87692738875892e-08, + "loss": 0.0757, + "step": 8587 + }, + { + "epoch": 2.7828904731043425, + "grad_norm": 0.8669131994247437, + "learning_rate": 6.856569575245969e-08, + "loss": 0.077, + "step": 8588 + }, + { + "epoch": 2.7832145171743354, + "grad_norm": 0.9033114910125732, + "learning_rate": 6.836241520135123e-08, + "loss": 0.0776, + "step": 8589 + }, + { + "epoch": 2.7835385612443293, + "grad_norm": 1.0447516441345215, + "learning_rate": 6.815943225914278e-08, + "loss": 0.0828, + "step": 8590 + }, + { + "epoch": 2.7838626053143227, + "grad_norm": 0.8430342078208923, + "learning_rate": 6.795674695067783e-08, + "loss": 0.0679, + "step": 8591 + }, + { + "epoch": 2.784186649384316, + "grad_norm": 0.9234492182731628, + "learning_rate": 6.77543593007629e-08, + "loss": 0.0769, + "step": 8592 + }, + { + "epoch": 2.78451069345431, + "grad_norm": 0.9635306596755981, + "learning_rate": 6.755226933416876e-08, + "loss": 0.0751, + "step": 8593 + }, + { + "epoch": 2.7848347375243034, + "grad_norm": 0.9842798709869385, + "learning_rate": 6.735047707562863e-08, + "loss": 0.0839, + "step": 8594 + }, + { + "epoch": 2.785158781594297, + "grad_norm": 0.8382657766342163, + "learning_rate": 6.714898254984031e-08, + "loss": 0.0679, + "step": 8595 + }, + { + "epoch": 2.7854828256642903, + "grad_norm": 0.9571649432182312, + "learning_rate": 6.69477857814646e-08, + "loss": 0.0812, + "step": 8596 + }, + { + "epoch": 2.7858068697342837, + "grad_norm": 0.8997218608856201, + "learning_rate": 6.674688679512654e-08, + "loss": 0.0781, + "step": 8597 + }, + { + "epoch": 2.7861309138042776, + "grad_norm": 0.9701622128486633, + "learning_rate": 6.654628561541337e-08, + "loss": 0.0777, + "step": 8598 + }, + { + "epoch": 2.786454957874271, + "grad_norm": 1.0179133415222168, + "learning_rate": 6.634598226687772e-08, + "loss": 0.08, + "step": 8599 + }, + { + "epoch": 2.7867790019442644, + "grad_norm": 0.8601292371749878, + "learning_rate": 6.614597677403384e-08, + "loss": 0.0739, + "step": 8600 + }, + { + "epoch": 2.787103046014258, + "grad_norm": 0.8769491910934448, + "learning_rate": 6.594626916136077e-08, + "loss": 0.076, + "step": 8601 + }, + { + "epoch": 2.7874270900842513, + "grad_norm": 0.9738726019859314, + "learning_rate": 6.574685945330145e-08, + "loss": 0.0784, + "step": 8602 + }, + { + "epoch": 2.787751134154245, + "grad_norm": 0.8933708667755127, + "learning_rate": 6.554774767426026e-08, + "loss": 0.0796, + "step": 8603 + }, + { + "epoch": 2.7880751782242386, + "grad_norm": 0.9039463400840759, + "learning_rate": 6.534893384860824e-08, + "loss": 0.0783, + "step": 8604 + }, + { + "epoch": 2.788399222294232, + "grad_norm": 0.8605583906173706, + "learning_rate": 6.515041800067678e-08, + "loss": 0.0728, + "step": 8605 + }, + { + "epoch": 2.7887232663642254, + "grad_norm": 0.9187542200088501, + "learning_rate": 6.495220015476366e-08, + "loss": 0.0771, + "step": 8606 + }, + { + "epoch": 2.789047310434219, + "grad_norm": 0.8536416888237, + "learning_rate": 6.475428033512754e-08, + "loss": 0.0666, + "step": 8607 + }, + { + "epoch": 2.7893713545042127, + "grad_norm": 0.9643122553825378, + "learning_rate": 6.455665856599291e-08, + "loss": 0.0805, + "step": 8608 + }, + { + "epoch": 2.789695398574206, + "grad_norm": 0.945328950881958, + "learning_rate": 6.435933487154627e-08, + "loss": 0.078, + "step": 8609 + }, + { + "epoch": 2.7900194426441995, + "grad_norm": 0.8264806866645813, + "learning_rate": 6.416230927593803e-08, + "loss": 0.0706, + "step": 8610 + }, + { + "epoch": 2.790343486714193, + "grad_norm": 0.9553369879722595, + "learning_rate": 6.39655818032825e-08, + "loss": 0.0743, + "step": 8611 + }, + { + "epoch": 2.7906675307841864, + "grad_norm": 0.8676332235336304, + "learning_rate": 6.376915247765735e-08, + "loss": 0.0691, + "step": 8612 + }, + { + "epoch": 2.7909915748541803, + "grad_norm": 0.8517956733703613, + "learning_rate": 6.357302132310338e-08, + "loss": 0.0711, + "step": 8613 + }, + { + "epoch": 2.7913156189241737, + "grad_norm": 0.896984338760376, + "learning_rate": 6.337718836362473e-08, + "loss": 0.0777, + "step": 8614 + }, + { + "epoch": 2.791639662994167, + "grad_norm": 0.9082828760147095, + "learning_rate": 6.318165362319023e-08, + "loss": 0.0751, + "step": 8615 + }, + { + "epoch": 2.791963707064161, + "grad_norm": 0.840907871723175, + "learning_rate": 6.298641712573105e-08, + "loss": 0.0696, + "step": 8616 + }, + { + "epoch": 2.7922877511341544, + "grad_norm": 0.8931449055671692, + "learning_rate": 6.279147889514226e-08, + "loss": 0.0739, + "step": 8617 + }, + { + "epoch": 2.792611795204148, + "grad_norm": 0.9458181858062744, + "learning_rate": 6.259683895528251e-08, + "loss": 0.0794, + "step": 8618 + }, + { + "epoch": 2.7929358392741412, + "grad_norm": 0.9642060399055481, + "learning_rate": 6.24024973299736e-08, + "loss": 0.084, + "step": 8619 + }, + { + "epoch": 2.7932598833441347, + "grad_norm": 0.84830641746521, + "learning_rate": 6.220845404300124e-08, + "loss": 0.0746, + "step": 8620 + }, + { + "epoch": 2.7935839274141285, + "grad_norm": 0.8820691108703613, + "learning_rate": 6.201470911811474e-08, + "loss": 0.0767, + "step": 8621 + }, + { + "epoch": 2.793907971484122, + "grad_norm": 0.9251769185066223, + "learning_rate": 6.182126257902626e-08, + "loss": 0.0783, + "step": 8622 + }, + { + "epoch": 2.7942320155541154, + "grad_norm": 0.8165448307991028, + "learning_rate": 6.162811444941159e-08, + "loss": 0.0677, + "step": 8623 + }, + { + "epoch": 2.794556059624109, + "grad_norm": 0.9533488750457764, + "learning_rate": 6.143526475291067e-08, + "loss": 0.0765, + "step": 8624 + }, + { + "epoch": 2.7948801036941022, + "grad_norm": 0.9226420521736145, + "learning_rate": 6.124271351312605e-08, + "loss": 0.0778, + "step": 8625 + }, + { + "epoch": 2.795204147764096, + "grad_norm": 0.8921147584915161, + "learning_rate": 6.105046075362441e-08, + "loss": 0.0766, + "step": 8626 + }, + { + "epoch": 2.7955281918340895, + "grad_norm": 0.9103520512580872, + "learning_rate": 6.085850649793529e-08, + "loss": 0.0762, + "step": 8627 + }, + { + "epoch": 2.795852235904083, + "grad_norm": 0.9070742726325989, + "learning_rate": 6.066685076955264e-08, + "loss": 0.0747, + "step": 8628 + }, + { + "epoch": 2.7961762799740764, + "grad_norm": 0.8707572221755981, + "learning_rate": 6.047549359193245e-08, + "loss": 0.0719, + "step": 8629 + }, + { + "epoch": 2.79650032404407, + "grad_norm": 0.8317505717277527, + "learning_rate": 6.028443498849596e-08, + "loss": 0.0694, + "step": 8630 + }, + { + "epoch": 2.7968243681140637, + "grad_norm": 0.8567790389060974, + "learning_rate": 6.009367498262587e-08, + "loss": 0.0693, + "step": 8631 + }, + { + "epoch": 2.797148412184057, + "grad_norm": 0.926906406879425, + "learning_rate": 5.990321359767015e-08, + "loss": 0.0761, + "step": 8632 + }, + { + "epoch": 2.7974724562540505, + "grad_norm": 0.8667235970497131, + "learning_rate": 5.97130508569388e-08, + "loss": 0.0714, + "step": 8633 + }, + { + "epoch": 2.797796500324044, + "grad_norm": 0.9117993116378784, + "learning_rate": 5.95231867837065e-08, + "loss": 0.0782, + "step": 8634 + }, + { + "epoch": 2.7981205443940373, + "grad_norm": 0.8850106596946716, + "learning_rate": 5.933362140121052e-08, + "loss": 0.0771, + "step": 8635 + }, + { + "epoch": 2.798444588464031, + "grad_norm": 0.878083348274231, + "learning_rate": 5.9144354732651455e-08, + "loss": 0.0781, + "step": 8636 + }, + { + "epoch": 2.7987686325340246, + "grad_norm": 0.8513649106025696, + "learning_rate": 5.8955386801194394e-08, + "loss": 0.071, + "step": 8637 + }, + { + "epoch": 2.799092676604018, + "grad_norm": 0.8753370642662048, + "learning_rate": 5.8766717629966387e-08, + "loss": 0.0764, + "step": 8638 + }, + { + "epoch": 2.799416720674012, + "grad_norm": 0.8825068473815918, + "learning_rate": 5.857834724205979e-08, + "loss": 0.0749, + "step": 8639 + }, + { + "epoch": 2.7997407647440054, + "grad_norm": 0.8680891394615173, + "learning_rate": 5.839027566052841e-08, + "loss": 0.0749, + "step": 8640 + }, + { + "epoch": 2.8000648088139988, + "grad_norm": 0.9579867124557495, + "learning_rate": 5.820250290839047e-08, + "loss": 0.0761, + "step": 8641 + }, + { + "epoch": 2.800388852883992, + "grad_norm": 0.9145108461380005, + "learning_rate": 5.801502900862788e-08, + "loss": 0.0731, + "step": 8642 + }, + { + "epoch": 2.8007128969539856, + "grad_norm": 0.926467776298523, + "learning_rate": 5.782785398418561e-08, + "loss": 0.08, + "step": 8643 + }, + { + "epoch": 2.8010369410239795, + "grad_norm": 0.9767326712608337, + "learning_rate": 5.7640977857972016e-08, + "loss": 0.0781, + "step": 8644 + }, + { + "epoch": 2.801360985093973, + "grad_norm": 0.9075636267662048, + "learning_rate": 5.745440065285879e-08, + "loss": 0.0828, + "step": 8645 + }, + { + "epoch": 2.8016850291639663, + "grad_norm": 0.8939979076385498, + "learning_rate": 5.726812239168128e-08, + "loss": 0.078, + "step": 8646 + }, + { + "epoch": 2.8020090732339598, + "grad_norm": 0.8266425728797913, + "learning_rate": 5.708214309723792e-08, + "loss": 0.0732, + "step": 8647 + }, + { + "epoch": 2.802333117303953, + "grad_norm": 0.7988861799240112, + "learning_rate": 5.689646279229105e-08, + "loss": 0.0642, + "step": 8648 + }, + { + "epoch": 2.802657161373947, + "grad_norm": 0.9102234840393066, + "learning_rate": 5.671108149956611e-08, + "loss": 0.0711, + "step": 8649 + }, + { + "epoch": 2.8029812054439405, + "grad_norm": 0.983685314655304, + "learning_rate": 5.6525999241751894e-08, + "loss": 0.0815, + "step": 8650 + }, + { + "epoch": 2.803305249513934, + "grad_norm": 0.8759168386459351, + "learning_rate": 5.6341216041500555e-08, + "loss": 0.071, + "step": 8651 + }, + { + "epoch": 2.8036292935839273, + "grad_norm": 0.8909490704536438, + "learning_rate": 5.6156731921428455e-08, + "loss": 0.078, + "step": 8652 + }, + { + "epoch": 2.8039533376539207, + "grad_norm": 0.9161653518676758, + "learning_rate": 5.597254690411363e-08, + "loss": 0.0755, + "step": 8653 + }, + { + "epoch": 2.8042773817239146, + "grad_norm": 0.8215521574020386, + "learning_rate": 5.5788661012099176e-08, + "loss": 0.073, + "step": 8654 + }, + { + "epoch": 2.804601425793908, + "grad_norm": 0.8546934723854065, + "learning_rate": 5.560507426789069e-08, + "loss": 0.0679, + "step": 8655 + }, + { + "epoch": 2.8049254698639015, + "grad_norm": 0.9193974733352661, + "learning_rate": 5.5421786693957705e-08, + "loss": 0.0754, + "step": 8656 + }, + { + "epoch": 2.805249513933895, + "grad_norm": 0.9443433880805969, + "learning_rate": 5.523879831273282e-08, + "loss": 0.0774, + "step": 8657 + }, + { + "epoch": 2.8055735580038883, + "grad_norm": 1.0062669515609741, + "learning_rate": 5.505610914661147e-08, + "loss": 0.0806, + "step": 8658 + }, + { + "epoch": 2.805897602073882, + "grad_norm": 0.8628509044647217, + "learning_rate": 5.487371921795381e-08, + "loss": 0.0766, + "step": 8659 + }, + { + "epoch": 2.8062216461438756, + "grad_norm": 0.8530628085136414, + "learning_rate": 5.4691628549082e-08, + "loss": 0.0747, + "step": 8660 + }, + { + "epoch": 2.806545690213869, + "grad_norm": 0.8226529955863953, + "learning_rate": 5.450983716228292e-08, + "loss": 0.0721, + "step": 8661 + }, + { + "epoch": 2.806869734283863, + "grad_norm": 0.9194059371948242, + "learning_rate": 5.4328345079805164e-08, + "loss": 0.0806, + "step": 8662 + }, + { + "epoch": 2.807193778353856, + "grad_norm": 0.9187507629394531, + "learning_rate": 5.4147152323862085e-08, + "loss": 0.0722, + "step": 8663 + }, + { + "epoch": 2.8075178224238497, + "grad_norm": 0.9742519855499268, + "learning_rate": 5.3966258916629824e-08, + "loss": 0.08, + "step": 8664 + }, + { + "epoch": 2.807841866493843, + "grad_norm": 0.8697577714920044, + "learning_rate": 5.378566488024817e-08, + "loss": 0.0716, + "step": 8665 + }, + { + "epoch": 2.8081659105638366, + "grad_norm": 0.9710431694984436, + "learning_rate": 5.3605370236820276e-08, + "loss": 0.0849, + "step": 8666 + }, + { + "epoch": 2.8084899546338304, + "grad_norm": 0.8214179873466492, + "learning_rate": 5.3425375008411276e-08, + "loss": 0.0707, + "step": 8667 + }, + { + "epoch": 2.808813998703824, + "grad_norm": 0.9119242429733276, + "learning_rate": 5.3245679217052424e-08, + "loss": 0.077, + "step": 8668 + }, + { + "epoch": 2.8091380427738173, + "grad_norm": 0.929480791091919, + "learning_rate": 5.3066282884735863e-08, + "loss": 0.078, + "step": 8669 + }, + { + "epoch": 2.8094620868438107, + "grad_norm": 0.8982275128364563, + "learning_rate": 5.2887186033417914e-08, + "loss": 0.0771, + "step": 8670 + }, + { + "epoch": 2.809786130913804, + "grad_norm": 0.8926584124565125, + "learning_rate": 5.270838868501854e-08, + "loss": 0.0734, + "step": 8671 + }, + { + "epoch": 2.810110174983798, + "grad_norm": 0.9076064229011536, + "learning_rate": 5.252989086142107e-08, + "loss": 0.0772, + "step": 8672 + }, + { + "epoch": 2.8104342190537914, + "grad_norm": 0.9674218893051147, + "learning_rate": 5.235169258447137e-08, + "loss": 0.0856, + "step": 8673 + }, + { + "epoch": 2.810758263123785, + "grad_norm": 1.0249892473220825, + "learning_rate": 5.2173793875979204e-08, + "loss": 0.0851, + "step": 8674 + }, + { + "epoch": 2.8110823071937783, + "grad_norm": 0.8544679284095764, + "learning_rate": 5.199619475771856e-08, + "loss": 0.0683, + "step": 8675 + }, + { + "epoch": 2.8114063512637717, + "grad_norm": 0.9031746983528137, + "learning_rate": 5.181889525142453e-08, + "loss": 0.0784, + "step": 8676 + }, + { + "epoch": 2.8117303953337656, + "grad_norm": 0.8899961113929749, + "learning_rate": 5.164189537879782e-08, + "loss": 0.0747, + "step": 8677 + }, + { + "epoch": 2.812054439403759, + "grad_norm": 0.8602359294891357, + "learning_rate": 5.146519516150084e-08, + "loss": 0.0742, + "step": 8678 + }, + { + "epoch": 2.8123784834737524, + "grad_norm": 0.9854103326797485, + "learning_rate": 5.128879462116071e-08, + "loss": 0.0832, + "step": 8679 + }, + { + "epoch": 2.812702527543746, + "grad_norm": 0.8897486329078674, + "learning_rate": 5.111269377936656e-08, + "loss": 0.0766, + "step": 8680 + }, + { + "epoch": 2.8130265716137393, + "grad_norm": 0.8990198969841003, + "learning_rate": 5.093689265767143e-08, + "loss": 0.0767, + "step": 8681 + }, + { + "epoch": 2.813350615683733, + "grad_norm": 0.91489577293396, + "learning_rate": 5.0761391277591996e-08, + "loss": 0.0772, + "step": 8682 + }, + { + "epoch": 2.8136746597537265, + "grad_norm": 0.88707035779953, + "learning_rate": 5.05861896606083e-08, + "loss": 0.0782, + "step": 8683 + }, + { + "epoch": 2.81399870382372, + "grad_norm": 0.9026859402656555, + "learning_rate": 5.0411287828162346e-08, + "loss": 0.0708, + "step": 8684 + }, + { + "epoch": 2.8143227478937134, + "grad_norm": 1.0039856433868408, + "learning_rate": 5.023668580166091e-08, + "loss": 0.0772, + "step": 8685 + }, + { + "epoch": 2.814646791963707, + "grad_norm": 0.9419729113578796, + "learning_rate": 5.0062383602473566e-08, + "loss": 0.0777, + "step": 8686 + }, + { + "epoch": 2.8149708360337007, + "grad_norm": 0.8873862624168396, + "learning_rate": 4.9888381251933237e-08, + "loss": 0.0732, + "step": 8687 + }, + { + "epoch": 2.815294880103694, + "grad_norm": 0.9214391708374023, + "learning_rate": 4.971467877133651e-08, + "loss": 0.078, + "step": 8688 + }, + { + "epoch": 2.8156189241736875, + "grad_norm": 0.8661234378814697, + "learning_rate": 4.954127618194193e-08, + "loss": 0.0751, + "step": 8689 + }, + { + "epoch": 2.8159429682436814, + "grad_norm": 0.8473151922225952, + "learning_rate": 4.936817350497336e-08, + "loss": 0.0711, + "step": 8690 + }, + { + "epoch": 2.816267012313675, + "grad_norm": 0.867581307888031, + "learning_rate": 4.919537076161579e-08, + "loss": 0.0784, + "step": 8691 + }, + { + "epoch": 2.8165910563836682, + "grad_norm": 0.968896210193634, + "learning_rate": 4.90228679730198e-08, + "loss": 0.0714, + "step": 8692 + }, + { + "epoch": 2.8169151004536617, + "grad_norm": 0.8834143280982971, + "learning_rate": 4.8850665160297406e-08, + "loss": 0.0701, + "step": 8693 + }, + { + "epoch": 2.817239144523655, + "grad_norm": 0.8912068009376526, + "learning_rate": 4.867876234452423e-08, + "loss": 0.0774, + "step": 8694 + }, + { + "epoch": 2.817563188593649, + "grad_norm": 0.8768351078033447, + "learning_rate": 4.85071595467404e-08, + "loss": 0.0749, + "step": 8695 + }, + { + "epoch": 2.8178872326636424, + "grad_norm": 0.9168806672096252, + "learning_rate": 4.8335856787947447e-08, + "loss": 0.076, + "step": 8696 + }, + { + "epoch": 2.818211276733636, + "grad_norm": 0.8847835659980774, + "learning_rate": 4.81648540891122e-08, + "loss": 0.0708, + "step": 8697 + }, + { + "epoch": 2.8185353208036292, + "grad_norm": 0.9171305298805237, + "learning_rate": 4.799415147116265e-08, + "loss": 0.0824, + "step": 8698 + }, + { + "epoch": 2.8188593648736227, + "grad_norm": 0.8939773440361023, + "learning_rate": 4.782374895499236e-08, + "loss": 0.0752, + "step": 8699 + }, + { + "epoch": 2.8191834089436165, + "grad_norm": 0.9174849390983582, + "learning_rate": 4.7653646561455767e-08, + "loss": 0.0823, + "step": 8700 + }, + { + "epoch": 2.81950745301361, + "grad_norm": 0.87873774766922, + "learning_rate": 4.7483844311372594e-08, + "loss": 0.0725, + "step": 8701 + }, + { + "epoch": 2.8198314970836034, + "grad_norm": 0.8400615453720093, + "learning_rate": 4.731434222552456e-08, + "loss": 0.0749, + "step": 8702 + }, + { + "epoch": 2.820155541153597, + "grad_norm": 0.8082554340362549, + "learning_rate": 4.7145140324657e-08, + "loss": 0.0714, + "step": 8703 + }, + { + "epoch": 2.82047958522359, + "grad_norm": 0.8531041145324707, + "learning_rate": 4.697623862947892e-08, + "loss": 0.073, + "step": 8704 + }, + { + "epoch": 2.820803629293584, + "grad_norm": 0.9330157041549683, + "learning_rate": 4.680763716066239e-08, + "loss": 0.0787, + "step": 8705 + }, + { + "epoch": 2.8211276733635775, + "grad_norm": 0.9409183859825134, + "learning_rate": 4.663933593884229e-08, + "loss": 0.0773, + "step": 8706 + }, + { + "epoch": 2.821451717433571, + "grad_norm": 0.8692289590835571, + "learning_rate": 4.6471334984616866e-08, + "loss": 0.073, + "step": 8707 + }, + { + "epoch": 2.8217757615035644, + "grad_norm": 0.8960288166999817, + "learning_rate": 4.6303634318548006e-08, + "loss": 0.0765, + "step": 8708 + }, + { + "epoch": 2.8220998055735578, + "grad_norm": 0.9870824813842773, + "learning_rate": 4.613623396116068e-08, + "loss": 0.0898, + "step": 8709 + }, + { + "epoch": 2.8224238496435516, + "grad_norm": 0.8814342617988586, + "learning_rate": 4.596913393294322e-08, + "loss": 0.0755, + "step": 8710 + }, + { + "epoch": 2.822747893713545, + "grad_norm": 0.8680152297019958, + "learning_rate": 4.580233425434677e-08, + "loss": 0.0758, + "step": 8711 + }, + { + "epoch": 2.8230719377835385, + "grad_norm": 0.7859067320823669, + "learning_rate": 4.563583494578638e-08, + "loss": 0.0718, + "step": 8712 + }, + { + "epoch": 2.8233959818535324, + "grad_norm": 0.8998501896858215, + "learning_rate": 4.546963602763937e-08, + "loss": 0.077, + "step": 8713 + }, + { + "epoch": 2.8237200259235253, + "grad_norm": 0.8794705867767334, + "learning_rate": 4.530373752024753e-08, + "loss": 0.0736, + "step": 8714 + }, + { + "epoch": 2.824044069993519, + "grad_norm": 0.8842076063156128, + "learning_rate": 4.51381394439146e-08, + "loss": 0.0693, + "step": 8715 + }, + { + "epoch": 2.8243681140635126, + "grad_norm": 0.8637332916259766, + "learning_rate": 4.497284181890882e-08, + "loss": 0.0725, + "step": 8716 + }, + { + "epoch": 2.824692158133506, + "grad_norm": 0.9013912677764893, + "learning_rate": 4.480784466546068e-08, + "loss": 0.0785, + "step": 8717 + }, + { + "epoch": 2.8250162022035, + "grad_norm": 0.963401734828949, + "learning_rate": 4.4643148003764015e-08, + "loss": 0.0809, + "step": 8718 + }, + { + "epoch": 2.8253402462734933, + "grad_norm": 0.8618778586387634, + "learning_rate": 4.44787518539766e-08, + "loss": 0.0737, + "step": 8719 + }, + { + "epoch": 2.8256642903434868, + "grad_norm": 0.939586877822876, + "learning_rate": 4.4314656236218444e-08, + "loss": 0.0678, + "step": 8720 + }, + { + "epoch": 2.82598833441348, + "grad_norm": 0.851174533367157, + "learning_rate": 4.415086117057377e-08, + "loss": 0.0748, + "step": 8721 + }, + { + "epoch": 2.8263123784834736, + "grad_norm": 0.9462675452232361, + "learning_rate": 4.398736667708875e-08, + "loss": 0.0831, + "step": 8722 + }, + { + "epoch": 2.8266364225534675, + "grad_norm": 0.9516298174858093, + "learning_rate": 4.382417277577433e-08, + "loss": 0.0782, + "step": 8723 + }, + { + "epoch": 2.826960466623461, + "grad_norm": 0.9504342079162598, + "learning_rate": 4.3661279486603424e-08, + "loss": 0.0772, + "step": 8724 + }, + { + "epoch": 2.8272845106934543, + "grad_norm": 0.9191964864730835, + "learning_rate": 4.349868682951286e-08, + "loss": 0.0805, + "step": 8725 + }, + { + "epoch": 2.8276085547634477, + "grad_norm": 0.9080502390861511, + "learning_rate": 4.333639482440199e-08, + "loss": 0.0817, + "step": 8726 + }, + { + "epoch": 2.827932598833441, + "grad_norm": 0.812075674533844, + "learning_rate": 4.3174403491134385e-08, + "loss": 0.069, + "step": 8727 + }, + { + "epoch": 2.828256642903435, + "grad_norm": 0.8327502608299255, + "learning_rate": 4.301271284953584e-08, + "loss": 0.0694, + "step": 8728 + }, + { + "epoch": 2.8285806869734285, + "grad_norm": 0.9250109791755676, + "learning_rate": 4.285132291939526e-08, + "loss": 0.0786, + "step": 8729 + }, + { + "epoch": 2.828904731043422, + "grad_norm": 0.8713056445121765, + "learning_rate": 4.2690233720466265e-08, + "loss": 0.0728, + "step": 8730 + }, + { + "epoch": 2.8292287751134153, + "grad_norm": 0.9858421087265015, + "learning_rate": 4.2529445272463946e-08, + "loss": 0.0834, + "step": 8731 + }, + { + "epoch": 2.8295528191834087, + "grad_norm": 0.8879863619804382, + "learning_rate": 4.2368957595067264e-08, + "loss": 0.0778, + "step": 8732 + }, + { + "epoch": 2.8298768632534026, + "grad_norm": 0.9293619394302368, + "learning_rate": 4.220877070791857e-08, + "loss": 0.0757, + "step": 8733 + }, + { + "epoch": 2.830200907323396, + "grad_norm": 0.8157750964164734, + "learning_rate": 4.204888463062273e-08, + "loss": 0.0692, + "step": 8734 + }, + { + "epoch": 2.8305249513933894, + "grad_norm": 0.8670499324798584, + "learning_rate": 4.188929938274911e-08, + "loss": 0.0732, + "step": 8735 + }, + { + "epoch": 2.830848995463383, + "grad_norm": 0.9903301000595093, + "learning_rate": 4.1730014983828724e-08, + "loss": 0.084, + "step": 8736 + }, + { + "epoch": 2.8311730395333763, + "grad_norm": 0.9433955550193787, + "learning_rate": 4.157103145335628e-08, + "loss": 0.079, + "step": 8737 + }, + { + "epoch": 2.83149708360337, + "grad_norm": 1.0336216688156128, + "learning_rate": 4.141234881079065e-08, + "loss": 0.0803, + "step": 8738 + }, + { + "epoch": 2.8318211276733636, + "grad_norm": 0.9306154251098633, + "learning_rate": 4.125396707555213e-08, + "loss": 0.0789, + "step": 8739 + }, + { + "epoch": 2.832145171743357, + "grad_norm": 0.9157119393348694, + "learning_rate": 4.109588626702576e-08, + "loss": 0.0738, + "step": 8740 + }, + { + "epoch": 2.832469215813351, + "grad_norm": 0.8765474557876587, + "learning_rate": 4.0938106404558864e-08, + "loss": 0.0739, + "step": 8741 + }, + { + "epoch": 2.8327932598833443, + "grad_norm": 0.8910435438156128, + "learning_rate": 4.078062750746209e-08, + "loss": 0.0785, + "step": 8742 + }, + { + "epoch": 2.8331173039533377, + "grad_norm": 0.9361657500267029, + "learning_rate": 4.062344959500947e-08, + "loss": 0.0835, + "step": 8743 + }, + { + "epoch": 2.833441348023331, + "grad_norm": 0.9637318253517151, + "learning_rate": 4.0466572686437833e-08, + "loss": 0.079, + "step": 8744 + }, + { + "epoch": 2.8337653920933246, + "grad_norm": 0.8398253917694092, + "learning_rate": 4.0309996800947936e-08, + "loss": 0.0742, + "step": 8745 + }, + { + "epoch": 2.8340894361633184, + "grad_norm": 0.8814817070960999, + "learning_rate": 4.0153721957702504e-08, + "loss": 0.0756, + "step": 8746 + }, + { + "epoch": 2.834413480233312, + "grad_norm": 0.8335652351379395, + "learning_rate": 3.9997748175828467e-08, + "loss": 0.0705, + "step": 8747 + }, + { + "epoch": 2.8347375243033053, + "grad_norm": 0.9374161958694458, + "learning_rate": 3.9842075474415545e-08, + "loss": 0.0806, + "step": 8748 + }, + { + "epoch": 2.8350615683732987, + "grad_norm": 0.8186621069908142, + "learning_rate": 3.9686703872516e-08, + "loss": 0.0706, + "step": 8749 + }, + { + "epoch": 2.835385612443292, + "grad_norm": 0.9040276408195496, + "learning_rate": 3.953163338914656e-08, + "loss": 0.0762, + "step": 8750 + }, + { + "epoch": 2.835709656513286, + "grad_norm": 0.8741912245750427, + "learning_rate": 3.9376864043285943e-08, + "loss": 0.0759, + "step": 8751 + }, + { + "epoch": 2.8360337005832794, + "grad_norm": 0.8934181928634644, + "learning_rate": 3.922239585387649e-08, + "loss": 0.0789, + "step": 8752 + }, + { + "epoch": 2.836357744653273, + "grad_norm": 0.9757236242294312, + "learning_rate": 3.906822883982336e-08, + "loss": 0.0792, + "step": 8753 + }, + { + "epoch": 2.8366817887232663, + "grad_norm": 0.9433469772338867, + "learning_rate": 3.891436301999563e-08, + "loss": 0.0748, + "step": 8754 + }, + { + "epoch": 2.8370058327932597, + "grad_norm": 0.8428290486335754, + "learning_rate": 3.876079841322461e-08, + "loss": 0.0735, + "step": 8755 + }, + { + "epoch": 2.8373298768632536, + "grad_norm": 0.9037506580352783, + "learning_rate": 3.8607535038305276e-08, + "loss": 0.0731, + "step": 8756 + }, + { + "epoch": 2.837653920933247, + "grad_norm": 0.8666014075279236, + "learning_rate": 3.84545729139954e-08, + "loss": 0.0707, + "step": 8757 + }, + { + "epoch": 2.8379779650032404, + "grad_norm": 0.8894285559654236, + "learning_rate": 3.83019120590164e-08, + "loss": 0.0735, + "step": 8758 + }, + { + "epoch": 2.838302009073234, + "grad_norm": 0.9272712469100952, + "learning_rate": 3.814955249205221e-08, + "loss": 0.0781, + "step": 8759 + }, + { + "epoch": 2.8386260531432272, + "grad_norm": 0.9276229739189148, + "learning_rate": 3.7997494231750145e-08, + "loss": 0.0764, + "step": 8760 + }, + { + "epoch": 2.838950097213221, + "grad_norm": 0.9098535180091858, + "learning_rate": 3.784573729672086e-08, + "loss": 0.08, + "step": 8761 + }, + { + "epoch": 2.8392741412832145, + "grad_norm": 0.8889827728271484, + "learning_rate": 3.769428170553785e-08, + "loss": 0.0759, + "step": 8762 + }, + { + "epoch": 2.839598185353208, + "grad_norm": 0.8861127495765686, + "learning_rate": 3.754312747673766e-08, + "loss": 0.074, + "step": 8763 + }, + { + "epoch": 2.839922229423202, + "grad_norm": 0.9090628623962402, + "learning_rate": 3.739227462882022e-08, + "loss": 0.0791, + "step": 8764 + }, + { + "epoch": 2.840246273493195, + "grad_norm": 0.9150841236114502, + "learning_rate": 3.724172318024854e-08, + "loss": 0.0775, + "step": 8765 + }, + { + "epoch": 2.8405703175631887, + "grad_norm": 0.9157789349555969, + "learning_rate": 3.709147314944872e-08, + "loss": 0.0811, + "step": 8766 + }, + { + "epoch": 2.840894361633182, + "grad_norm": 0.9196391105651855, + "learning_rate": 3.6941524554809924e-08, + "loss": 0.0744, + "step": 8767 + }, + { + "epoch": 2.8412184057031755, + "grad_norm": 0.9282419085502625, + "learning_rate": 3.6791877414683594e-08, + "loss": 0.0788, + "step": 8768 + }, + { + "epoch": 2.8415424497731694, + "grad_norm": 0.858267068862915, + "learning_rate": 3.664253174738647e-08, + "loss": 0.0717, + "step": 8769 + }, + { + "epoch": 2.841866493843163, + "grad_norm": 0.9045819044113159, + "learning_rate": 3.649348757119614e-08, + "loss": 0.0741, + "step": 8770 + }, + { + "epoch": 2.8421905379131562, + "grad_norm": 0.926044762134552, + "learning_rate": 3.634474490435413e-08, + "loss": 0.0765, + "step": 8771 + }, + { + "epoch": 2.8425145819831497, + "grad_norm": 0.892652690410614, + "learning_rate": 3.6196303765065333e-08, + "loss": 0.0766, + "step": 8772 + }, + { + "epoch": 2.842838626053143, + "grad_norm": 1.0337949991226196, + "learning_rate": 3.60481641714977e-08, + "loss": 0.0813, + "step": 8773 + }, + { + "epoch": 2.843162670123137, + "grad_norm": 0.8592774271965027, + "learning_rate": 3.590032614178174e-08, + "loss": 0.0746, + "step": 8774 + }, + { + "epoch": 2.8434867141931304, + "grad_norm": 0.8674032092094421, + "learning_rate": 3.57527896940113e-08, + "loss": 0.0741, + "step": 8775 + }, + { + "epoch": 2.843810758263124, + "grad_norm": 0.9188392162322998, + "learning_rate": 3.560555484624417e-08, + "loss": 0.0733, + "step": 8776 + }, + { + "epoch": 2.844134802333117, + "grad_norm": 0.9348613023757935, + "learning_rate": 3.545862161649927e-08, + "loss": 0.0835, + "step": 8777 + }, + { + "epoch": 2.8444588464031106, + "grad_norm": 0.8782564997673035, + "learning_rate": 3.531199002276109e-08, + "loss": 0.0729, + "step": 8778 + }, + { + "epoch": 2.8447828904731045, + "grad_norm": 0.8679888844490051, + "learning_rate": 3.5165660082975006e-08, + "loss": 0.0746, + "step": 8779 + }, + { + "epoch": 2.845106934543098, + "grad_norm": 0.9066128134727478, + "learning_rate": 3.501963181505058e-08, + "loss": 0.0723, + "step": 8780 + }, + { + "epoch": 2.8454309786130914, + "grad_norm": 0.8842100501060486, + "learning_rate": 3.487390523686074e-08, + "loss": 0.0772, + "step": 8781 + }, + { + "epoch": 2.845755022683085, + "grad_norm": 0.9143519997596741, + "learning_rate": 3.472848036624038e-08, + "loss": 0.0754, + "step": 8782 + }, + { + "epoch": 2.846079066753078, + "grad_norm": 0.8060352802276611, + "learning_rate": 3.4583357220988326e-08, + "loss": 0.0651, + "step": 8783 + }, + { + "epoch": 2.846403110823072, + "grad_norm": 0.8758019804954529, + "learning_rate": 3.443853581886619e-08, + "loss": 0.0753, + "step": 8784 + }, + { + "epoch": 2.8467271548930655, + "grad_norm": 0.884460985660553, + "learning_rate": 3.4294016177598974e-08, + "loss": 0.0766, + "step": 8785 + }, + { + "epoch": 2.847051198963059, + "grad_norm": 0.8895008563995361, + "learning_rate": 3.4149798314874195e-08, + "loss": 0.0784, + "step": 8786 + }, + { + "epoch": 2.8473752430330523, + "grad_norm": 0.8915991187095642, + "learning_rate": 3.4005882248343e-08, + "loss": 0.0747, + "step": 8787 + }, + { + "epoch": 2.8476992871030458, + "grad_norm": 0.9081102013587952, + "learning_rate": 3.3862267995618817e-08, + "loss": 0.0789, + "step": 8788 + }, + { + "epoch": 2.8480233311730396, + "grad_norm": 0.8390566110610962, + "learning_rate": 3.3718955574279234e-08, + "loss": 0.0707, + "step": 8789 + }, + { + "epoch": 2.848347375243033, + "grad_norm": 0.8644533157348633, + "learning_rate": 3.357594500186384e-08, + "loss": 0.0693, + "step": 8790 + }, + { + "epoch": 2.8486714193130265, + "grad_norm": 0.8662945628166199, + "learning_rate": 3.3433236295876134e-08, + "loss": 0.0774, + "step": 8791 + }, + { + "epoch": 2.8489954633830203, + "grad_norm": 0.9425113797187805, + "learning_rate": 3.329082947378215e-08, + "loss": 0.0799, + "step": 8792 + }, + { + "epoch": 2.8493195074530138, + "grad_norm": 0.8594256043434143, + "learning_rate": 3.314872455301071e-08, + "loss": 0.0727, + "step": 8793 + }, + { + "epoch": 2.849643551523007, + "grad_norm": 0.8771162033081055, + "learning_rate": 3.300692155095458e-08, + "loss": 0.0759, + "step": 8794 + }, + { + "epoch": 2.8499675955930006, + "grad_norm": 0.8600313067436218, + "learning_rate": 3.286542048496904e-08, + "loss": 0.0733, + "step": 8795 + }, + { + "epoch": 2.850291639662994, + "grad_norm": 0.821404218673706, + "learning_rate": 3.272422137237219e-08, + "loss": 0.0691, + "step": 8796 + }, + { + "epoch": 2.850615683732988, + "grad_norm": 0.9205437302589417, + "learning_rate": 3.258332423044547e-08, + "loss": 0.0769, + "step": 8797 + }, + { + "epoch": 2.8509397278029813, + "grad_norm": 0.8754212260246277, + "learning_rate": 3.2442729076433697e-08, + "loss": 0.0762, + "step": 8798 + }, + { + "epoch": 2.8512637718729748, + "grad_norm": 0.9110721349716187, + "learning_rate": 3.230243592754368e-08, + "loss": 0.0787, + "step": 8799 + }, + { + "epoch": 2.851587815942968, + "grad_norm": 0.8736501932144165, + "learning_rate": 3.2162444800946655e-08, + "loss": 0.0663, + "step": 8800 + }, + { + "epoch": 2.8519118600129616, + "grad_norm": 0.8907871842384338, + "learning_rate": 3.202275571377589e-08, + "loss": 0.0794, + "step": 8801 + }, + { + "epoch": 2.8522359040829555, + "grad_norm": 0.9323838353157043, + "learning_rate": 3.188336868312769e-08, + "loss": 0.0746, + "step": 8802 + }, + { + "epoch": 2.852559948152949, + "grad_norm": 0.9358437657356262, + "learning_rate": 3.1744283726062306e-08, + "loss": 0.0787, + "step": 8803 + }, + { + "epoch": 2.8528839922229423, + "grad_norm": 0.870239794254303, + "learning_rate": 3.160550085960168e-08, + "loss": 0.0754, + "step": 8804 + }, + { + "epoch": 2.8532080362929357, + "grad_norm": 0.8820590376853943, + "learning_rate": 3.1467020100732215e-08, + "loss": 0.0759, + "step": 8805 + }, + { + "epoch": 2.853532080362929, + "grad_norm": 0.8466381430625916, + "learning_rate": 3.1328841466401746e-08, + "loss": 0.0722, + "step": 8806 + }, + { + "epoch": 2.853856124432923, + "grad_norm": 0.9091260433197021, + "learning_rate": 3.1190964973522865e-08, + "loss": 0.0795, + "step": 8807 + }, + { + "epoch": 2.8541801685029164, + "grad_norm": 0.9665597081184387, + "learning_rate": 3.105339063896956e-08, + "loss": 0.0816, + "step": 8808 + }, + { + "epoch": 2.85450421257291, + "grad_norm": 0.8894763588905334, + "learning_rate": 3.0916118479580593e-08, + "loss": 0.0751, + "step": 8809 + }, + { + "epoch": 2.8548282566429033, + "grad_norm": 0.8645616769790649, + "learning_rate": 3.077914851215585e-08, + "loss": 0.075, + "step": 8810 + }, + { + "epoch": 2.8551523007128967, + "grad_norm": 0.9180304408073425, + "learning_rate": 3.064248075345916e-08, + "loss": 0.0797, + "step": 8811 + }, + { + "epoch": 2.8554763447828906, + "grad_norm": 0.9625119566917419, + "learning_rate": 3.050611522021796e-08, + "loss": 0.0794, + "step": 8812 + }, + { + "epoch": 2.855800388852884, + "grad_norm": 0.9530915021896362, + "learning_rate": 3.0370051929121405e-08, + "loss": 0.0805, + "step": 8813 + }, + { + "epoch": 2.8561244329228774, + "grad_norm": 0.9082760810852051, + "learning_rate": 3.023429089682284e-08, + "loss": 0.0717, + "step": 8814 + }, + { + "epoch": 2.8564484769928713, + "grad_norm": 0.8581361174583435, + "learning_rate": 3.009883213993786e-08, + "loss": 0.0719, + "step": 8815 + }, + { + "epoch": 2.8567725210628643, + "grad_norm": 0.8786942362785339, + "learning_rate": 2.996367567504544e-08, + "loss": 0.0764, + "step": 8816 + }, + { + "epoch": 2.857096565132858, + "grad_norm": 0.8706663846969604, + "learning_rate": 2.9828821518687045e-08, + "loss": 0.075, + "step": 8817 + }, + { + "epoch": 2.8574206092028516, + "grad_norm": 0.9187700152397156, + "learning_rate": 2.9694269687367826e-08, + "loss": 0.077, + "step": 8818 + }, + { + "epoch": 2.857744653272845, + "grad_norm": 0.8860181570053101, + "learning_rate": 2.9560020197555716e-08, + "loss": 0.0768, + "step": 8819 + }, + { + "epoch": 2.858068697342839, + "grad_norm": 0.8907939791679382, + "learning_rate": 2.9426073065681183e-08, + "loss": 0.0751, + "step": 8820 + }, + { + "epoch": 2.8583927414128323, + "grad_norm": 0.8291096687316895, + "learning_rate": 2.929242830813861e-08, + "loss": 0.0725, + "step": 8821 + }, + { + "epoch": 2.8587167854828257, + "grad_norm": 0.9561754465103149, + "learning_rate": 2.915908594128436e-08, + "loss": 0.0772, + "step": 8822 + }, + { + "epoch": 2.859040829552819, + "grad_norm": 0.9024383425712585, + "learning_rate": 2.9026045981438434e-08, + "loss": 0.0768, + "step": 8823 + }, + { + "epoch": 2.8593648736228126, + "grad_norm": 0.8623805046081543, + "learning_rate": 2.889330844488364e-08, + "loss": 0.0742, + "step": 8824 + }, + { + "epoch": 2.8596889176928064, + "grad_norm": 0.8919248580932617, + "learning_rate": 2.8760873347865593e-08, + "loss": 0.0745, + "step": 8825 + }, + { + "epoch": 2.8600129617628, + "grad_norm": 0.9097840785980225, + "learning_rate": 2.862874070659327e-08, + "loss": 0.0785, + "step": 8826 + }, + { + "epoch": 2.8603370058327933, + "grad_norm": 0.8863422274589539, + "learning_rate": 2.8496910537238185e-08, + "loss": 0.0747, + "step": 8827 + }, + { + "epoch": 2.8606610499027867, + "grad_norm": 0.8750684261322021, + "learning_rate": 2.8365382855935487e-08, + "loss": 0.0758, + "step": 8828 + }, + { + "epoch": 2.86098509397278, + "grad_norm": 0.8420607447624207, + "learning_rate": 2.8234157678782846e-08, + "loss": 0.0684, + "step": 8829 + }, + { + "epoch": 2.861309138042774, + "grad_norm": 0.8596554398536682, + "learning_rate": 2.8103235021840204e-08, + "loss": 0.0742, + "step": 8830 + }, + { + "epoch": 2.8616331821127674, + "grad_norm": 0.908790647983551, + "learning_rate": 2.7972614901132235e-08, + "loss": 0.0815, + "step": 8831 + }, + { + "epoch": 2.861957226182761, + "grad_norm": 0.8810282945632935, + "learning_rate": 2.784229733264504e-08, + "loss": 0.0791, + "step": 8832 + }, + { + "epoch": 2.8622812702527543, + "grad_norm": 0.9018552899360657, + "learning_rate": 2.771228233232809e-08, + "loss": 0.0773, + "step": 8833 + }, + { + "epoch": 2.8626053143227477, + "grad_norm": 0.823049783706665, + "learning_rate": 2.7582569916094205e-08, + "loss": 0.0712, + "step": 8834 + }, + { + "epoch": 2.8629293583927415, + "grad_norm": 0.9358600974082947, + "learning_rate": 2.745316009981902e-08, + "loss": 0.0789, + "step": 8835 + }, + { + "epoch": 2.863253402462735, + "grad_norm": 0.9165016412734985, + "learning_rate": 2.732405289934098e-08, + "loss": 0.0759, + "step": 8836 + }, + { + "epoch": 2.8635774465327284, + "grad_norm": 0.8944207429885864, + "learning_rate": 2.719524833046133e-08, + "loss": 0.0779, + "step": 8837 + }, + { + "epoch": 2.863901490602722, + "grad_norm": 0.8423044681549072, + "learning_rate": 2.7066746408944968e-08, + "loss": 0.0691, + "step": 8838 + }, + { + "epoch": 2.8642255346727152, + "grad_norm": 0.7926920652389526, + "learning_rate": 2.6938547150518746e-08, + "loss": 0.0654, + "step": 8839 + }, + { + "epoch": 2.864549578742709, + "grad_norm": 0.9157641530036926, + "learning_rate": 2.6810650570873454e-08, + "loss": 0.0831, + "step": 8840 + }, + { + "epoch": 2.8648736228127025, + "grad_norm": 0.866848349571228, + "learning_rate": 2.6683056685662122e-08, + "loss": 0.0742, + "step": 8841 + }, + { + "epoch": 2.865197666882696, + "grad_norm": 0.8142363429069519, + "learning_rate": 2.6555765510500875e-08, + "loss": 0.0728, + "step": 8842 + }, + { + "epoch": 2.86552171095269, + "grad_norm": 0.8857007026672363, + "learning_rate": 2.6428777060969468e-08, + "loss": 0.0799, + "step": 8843 + }, + { + "epoch": 2.8658457550226832, + "grad_norm": 0.9255932569503784, + "learning_rate": 2.6302091352609637e-08, + "loss": 0.0773, + "step": 8844 + }, + { + "epoch": 2.8661697990926767, + "grad_norm": 0.9063679575920105, + "learning_rate": 2.617570840092648e-08, + "loss": 0.0747, + "step": 8845 + }, + { + "epoch": 2.86649384316267, + "grad_norm": 0.9214964509010315, + "learning_rate": 2.604962822138818e-08, + "loss": 0.0805, + "step": 8846 + }, + { + "epoch": 2.8668178872326635, + "grad_norm": 0.9403132796287537, + "learning_rate": 2.5923850829425723e-08, + "loss": 0.0808, + "step": 8847 + }, + { + "epoch": 2.8671419313026574, + "grad_norm": 0.8666374683380127, + "learning_rate": 2.579837624043291e-08, + "loss": 0.0748, + "step": 8848 + }, + { + "epoch": 2.867465975372651, + "grad_norm": 0.918161153793335, + "learning_rate": 2.5673204469766898e-08, + "loss": 0.0767, + "step": 8849 + }, + { + "epoch": 2.8677900194426442, + "grad_norm": 0.9361032843589783, + "learning_rate": 2.5548335532747105e-08, + "loss": 0.0775, + "step": 8850 + }, + { + "epoch": 2.8681140635126376, + "grad_norm": 0.8582929968833923, + "learning_rate": 2.5423769444656575e-08, + "loss": 0.0747, + "step": 8851 + }, + { + "epoch": 2.868438107582631, + "grad_norm": 0.8770160675048828, + "learning_rate": 2.52995062207409e-08, + "loss": 0.0758, + "step": 8852 + }, + { + "epoch": 2.868762151652625, + "grad_norm": 0.8568238615989685, + "learning_rate": 2.517554587620874e-08, + "loss": 0.0724, + "step": 8853 + }, + { + "epoch": 2.8690861957226184, + "grad_norm": 0.903739869594574, + "learning_rate": 2.5051888426231574e-08, + "loss": 0.0743, + "step": 8854 + }, + { + "epoch": 2.869410239792612, + "grad_norm": 0.8782305121421814, + "learning_rate": 2.492853388594396e-08, + "loss": 0.0777, + "step": 8855 + }, + { + "epoch": 2.869734283862605, + "grad_norm": 0.9001045823097229, + "learning_rate": 2.480548227044327e-08, + "loss": 0.079, + "step": 8856 + }, + { + "epoch": 2.8700583279325986, + "grad_norm": 0.9648503065109253, + "learning_rate": 2.4682733594789677e-08, + "loss": 0.0811, + "step": 8857 + }, + { + "epoch": 2.8703823720025925, + "grad_norm": 0.8991042971611023, + "learning_rate": 2.4560287874006716e-08, + "loss": 0.0732, + "step": 8858 + }, + { + "epoch": 2.870706416072586, + "grad_norm": 0.8516407012939453, + "learning_rate": 2.443814512308018e-08, + "loss": 0.0704, + "step": 8859 + }, + { + "epoch": 2.8710304601425793, + "grad_norm": 0.9465522766113281, + "learning_rate": 2.431630535695978e-08, + "loss": 0.0783, + "step": 8860 + }, + { + "epoch": 2.8713545042125728, + "grad_norm": 0.9010804295539856, + "learning_rate": 2.419476859055664e-08, + "loss": 0.0747, + "step": 8861 + }, + { + "epoch": 2.871678548282566, + "grad_norm": 0.9947733283042908, + "learning_rate": 2.4073534838746637e-08, + "loss": 0.0796, + "step": 8862 + }, + { + "epoch": 2.87200259235256, + "grad_norm": 0.8678649663925171, + "learning_rate": 2.3952604116366795e-08, + "loss": 0.0744, + "step": 8863 + }, + { + "epoch": 2.8723266364225535, + "grad_norm": 0.9155958294868469, + "learning_rate": 2.383197643821833e-08, + "loss": 0.0753, + "step": 8864 + }, + { + "epoch": 2.872650680492547, + "grad_norm": 0.8620344400405884, + "learning_rate": 2.3711651819064984e-08, + "loss": 0.0733, + "step": 8865 + }, + { + "epoch": 2.8729747245625408, + "grad_norm": 0.8952129483222961, + "learning_rate": 2.359163027363276e-08, + "loss": 0.0787, + "step": 8866 + }, + { + "epoch": 2.8732987686325338, + "grad_norm": 0.837651789188385, + "learning_rate": 2.3471911816611846e-08, + "loss": 0.0733, + "step": 8867 + }, + { + "epoch": 2.8736228127025276, + "grad_norm": 0.8996900916099548, + "learning_rate": 2.335249646265414e-08, + "loss": 0.0779, + "step": 8868 + }, + { + "epoch": 2.873946856772521, + "grad_norm": 1.0499439239501953, + "learning_rate": 2.3233384226375167e-08, + "loss": 0.0813, + "step": 8869 + }, + { + "epoch": 2.8742709008425145, + "grad_norm": 0.9339820742607117, + "learning_rate": 2.311457512235271e-08, + "loss": 0.0768, + "step": 8870 + }, + { + "epoch": 2.8745949449125083, + "grad_norm": 0.8691450357437134, + "learning_rate": 2.2996069165128198e-08, + "loss": 0.0759, + "step": 8871 + }, + { + "epoch": 2.8749189889825018, + "grad_norm": 0.8384815454483032, + "learning_rate": 2.2877866369205858e-08, + "loss": 0.0684, + "step": 8872 + }, + { + "epoch": 2.875243033052495, + "grad_norm": 0.9416871070861816, + "learning_rate": 2.2759966749051897e-08, + "loss": 0.0825, + "step": 8873 + }, + { + "epoch": 2.8755670771224886, + "grad_norm": 0.8636833429336548, + "learning_rate": 2.2642370319096718e-08, + "loss": 0.076, + "step": 8874 + }, + { + "epoch": 2.875891121192482, + "grad_norm": 0.9424716830253601, + "learning_rate": 2.2525077093732695e-08, + "loss": 0.0786, + "step": 8875 + }, + { + "epoch": 2.876215165262476, + "grad_norm": 0.8951495885848999, + "learning_rate": 2.2408087087315567e-08, + "loss": 0.0795, + "step": 8876 + }, + { + "epoch": 2.8765392093324693, + "grad_norm": 0.8318362832069397, + "learning_rate": 2.2291400314163325e-08, + "loss": 0.0735, + "step": 8877 + }, + { + "epoch": 2.8768632534024627, + "grad_norm": 0.829992413520813, + "learning_rate": 2.217501678855788e-08, + "loss": 0.0717, + "step": 8878 + }, + { + "epoch": 2.877187297472456, + "grad_norm": 0.8823353052139282, + "learning_rate": 2.2058936524742835e-08, + "loss": 0.0726, + "step": 8879 + }, + { + "epoch": 2.8775113415424496, + "grad_norm": 0.9791154861450195, + "learning_rate": 2.1943159536925994e-08, + "loss": 0.0774, + "step": 8880 + }, + { + "epoch": 2.8778353856124435, + "grad_norm": 0.9327330589294434, + "learning_rate": 2.1827685839276856e-08, + "loss": 0.0765, + "step": 8881 + }, + { + "epoch": 2.878159429682437, + "grad_norm": 1.139756202697754, + "learning_rate": 2.1712515445928285e-08, + "loss": 0.0779, + "step": 8882 + }, + { + "epoch": 2.8784834737524303, + "grad_norm": 0.9503368735313416, + "learning_rate": 2.159764837097622e-08, + "loss": 0.0787, + "step": 8883 + }, + { + "epoch": 2.8788075178224237, + "grad_norm": 0.8720259666442871, + "learning_rate": 2.1483084628479145e-08, + "loss": 0.0695, + "step": 8884 + }, + { + "epoch": 2.879131561892417, + "grad_norm": 0.8664884567260742, + "learning_rate": 2.1368824232458618e-08, + "loss": 0.0734, + "step": 8885 + }, + { + "epoch": 2.879455605962411, + "grad_norm": 0.9471036195755005, + "learning_rate": 2.125486719689929e-08, + "loss": 0.0789, + "step": 8886 + }, + { + "epoch": 2.8797796500324044, + "grad_norm": 0.9130653142929077, + "learning_rate": 2.1141213535747772e-08, + "loss": 0.0806, + "step": 8887 + }, + { + "epoch": 2.880103694102398, + "grad_norm": 0.8658857941627502, + "learning_rate": 2.1027863262914617e-08, + "loss": 0.071, + "step": 8888 + }, + { + "epoch": 2.8804277381723913, + "grad_norm": 0.9986435770988464, + "learning_rate": 2.0914816392272608e-08, + "loss": 0.0697, + "step": 8889 + }, + { + "epoch": 2.8807517822423847, + "grad_norm": 0.8567577600479126, + "learning_rate": 2.0802072937657624e-08, + "loss": 0.0725, + "step": 8890 + }, + { + "epoch": 2.8810758263123786, + "grad_norm": 0.9769272208213806, + "learning_rate": 2.068963291286863e-08, + "loss": 0.078, + "step": 8891 + }, + { + "epoch": 2.881399870382372, + "grad_norm": 0.9042466878890991, + "learning_rate": 2.0577496331666837e-08, + "loss": 0.0742, + "step": 8892 + }, + { + "epoch": 2.8817239144523654, + "grad_norm": 0.8782934546470642, + "learning_rate": 2.046566320777682e-08, + "loss": 0.0782, + "step": 8893 + }, + { + "epoch": 2.8820479585223593, + "grad_norm": 0.8273100256919861, + "learning_rate": 2.0354133554885967e-08, + "loss": 0.068, + "step": 8894 + }, + { + "epoch": 2.8823720025923527, + "grad_norm": 0.8472905158996582, + "learning_rate": 2.0242907386644195e-08, + "loss": 0.0714, + "step": 8895 + }, + { + "epoch": 2.882696046662346, + "grad_norm": 0.9096794128417969, + "learning_rate": 2.0131984716664776e-08, + "loss": 0.0804, + "step": 8896 + }, + { + "epoch": 2.8830200907323396, + "grad_norm": 0.9694199562072754, + "learning_rate": 2.002136555852352e-08, + "loss": 0.0741, + "step": 8897 + }, + { + "epoch": 2.883344134802333, + "grad_norm": 0.8909012675285339, + "learning_rate": 1.9911049925758765e-08, + "loss": 0.0724, + "step": 8898 + }, + { + "epoch": 2.883668178872327, + "grad_norm": 0.835223376750946, + "learning_rate": 1.9801037831872482e-08, + "loss": 0.0714, + "step": 8899 + }, + { + "epoch": 2.8839922229423203, + "grad_norm": 0.918133020401001, + "learning_rate": 1.9691329290329185e-08, + "loss": 0.0773, + "step": 8900 + }, + { + "epoch": 2.8843162670123137, + "grad_norm": 0.9661774039268494, + "learning_rate": 1.958192431455591e-08, + "loss": 0.0806, + "step": 8901 + }, + { + "epoch": 2.884640311082307, + "grad_norm": 0.9486656785011292, + "learning_rate": 1.9472822917942778e-08, + "loss": 0.075, + "step": 8902 + }, + { + "epoch": 2.8849643551523005, + "grad_norm": 0.8935655355453491, + "learning_rate": 1.9364025113842444e-08, + "loss": 0.0754, + "step": 8903 + }, + { + "epoch": 2.8852883992222944, + "grad_norm": 0.9863356947898865, + "learning_rate": 1.9255530915571197e-08, + "loss": 0.0809, + "step": 8904 + }, + { + "epoch": 2.885612443292288, + "grad_norm": 0.9524531364440918, + "learning_rate": 1.9147340336407584e-08, + "loss": 0.0837, + "step": 8905 + }, + { + "epoch": 2.8859364873622813, + "grad_norm": 0.9335321187973022, + "learning_rate": 1.9039453389592954e-08, + "loss": 0.0768, + "step": 8906 + }, + { + "epoch": 2.8862605314322747, + "grad_norm": 0.8417344093322754, + "learning_rate": 1.893187008833175e-08, + "loss": 0.0757, + "step": 8907 + }, + { + "epoch": 2.886584575502268, + "grad_norm": 0.9135453104972839, + "learning_rate": 1.8824590445790935e-08, + "loss": 0.0754, + "step": 8908 + }, + { + "epoch": 2.886908619572262, + "grad_norm": 0.9432932734489441, + "learning_rate": 1.871761447510084e-08, + "loss": 0.0763, + "step": 8909 + }, + { + "epoch": 2.8872326636422554, + "grad_norm": 0.8661041259765625, + "learning_rate": 1.8610942189353777e-08, + "loss": 0.0719, + "step": 8910 + }, + { + "epoch": 2.887556707712249, + "grad_norm": 0.8979371786117554, + "learning_rate": 1.850457360160568e-08, + "loss": 0.0748, + "step": 8911 + }, + { + "epoch": 2.8878807517822422, + "grad_norm": 0.8953410387039185, + "learning_rate": 1.839850872487503e-08, + "loss": 0.0753, + "step": 8912 + }, + { + "epoch": 2.8882047958522357, + "grad_norm": 0.8552532196044922, + "learning_rate": 1.829274757214339e-08, + "loss": 0.0735, + "step": 8913 + }, + { + "epoch": 2.8885288399222295, + "grad_norm": 0.9590696096420288, + "learning_rate": 1.8187290156354565e-08, + "loss": 0.0753, + "step": 8914 + }, + { + "epoch": 2.888852883992223, + "grad_norm": 0.9155333638191223, + "learning_rate": 1.808213649041546e-08, + "loss": 0.0748, + "step": 8915 + }, + { + "epoch": 2.8891769280622164, + "grad_norm": 0.9266859889030457, + "learning_rate": 1.7977286587196053e-08, + "loss": 0.0801, + "step": 8916 + }, + { + "epoch": 2.8895009721322102, + "grad_norm": 0.9092302918434143, + "learning_rate": 1.7872740459529135e-08, + "loss": 0.0774, + "step": 8917 + }, + { + "epoch": 2.8898250162022032, + "grad_norm": 0.8608276844024658, + "learning_rate": 1.7768498120209755e-08, + "loss": 0.0765, + "step": 8918 + }, + { + "epoch": 2.890149060272197, + "grad_norm": 0.9430409669876099, + "learning_rate": 1.766455958199631e-08, + "loss": 0.0771, + "step": 8919 + }, + { + "epoch": 2.8904731043421905, + "grad_norm": 0.8961855173110962, + "learning_rate": 1.7560924857610016e-08, + "loss": 0.0726, + "step": 8920 + }, + { + "epoch": 2.890797148412184, + "grad_norm": 0.8819207549095154, + "learning_rate": 1.745759395973462e-08, + "loss": 0.0788, + "step": 8921 + }, + { + "epoch": 2.891121192482178, + "grad_norm": 0.8897761106491089, + "learning_rate": 1.7354566901016944e-08, + "loss": 0.0744, + "step": 8922 + }, + { + "epoch": 2.8914452365521712, + "grad_norm": 0.8832787871360779, + "learning_rate": 1.7251843694066074e-08, + "loss": 0.0732, + "step": 8923 + }, + { + "epoch": 2.8917692806221647, + "grad_norm": 0.9169782996177673, + "learning_rate": 1.7149424351455003e-08, + "loss": 0.0779, + "step": 8924 + }, + { + "epoch": 2.892093324692158, + "grad_norm": 1.0113013982772827, + "learning_rate": 1.7047308885718427e-08, + "loss": 0.0836, + "step": 8925 + }, + { + "epoch": 2.8924173687621515, + "grad_norm": 0.928467869758606, + "learning_rate": 1.694549730935441e-08, + "loss": 0.0792, + "step": 8926 + }, + { + "epoch": 2.8927414128321454, + "grad_norm": 0.9160869121551514, + "learning_rate": 1.684398963482381e-08, + "loss": 0.0765, + "step": 8927 + }, + { + "epoch": 2.893065456902139, + "grad_norm": 0.9572937488555908, + "learning_rate": 1.674278587454975e-08, + "loss": 0.0787, + "step": 8928 + }, + { + "epoch": 2.893389500972132, + "grad_norm": 0.9312818646430969, + "learning_rate": 1.6641886040919263e-08, + "loss": 0.0751, + "step": 8929 + }, + { + "epoch": 2.8937135450421256, + "grad_norm": 0.8742788434028625, + "learning_rate": 1.654129014628081e-08, + "loss": 0.0771, + "step": 8930 + }, + { + "epoch": 2.894037589112119, + "grad_norm": 0.8427819609642029, + "learning_rate": 1.6440998202947034e-08, + "loss": 0.0747, + "step": 8931 + }, + { + "epoch": 2.894361633182113, + "grad_norm": 0.934770941734314, + "learning_rate": 1.634101022319229e-08, + "loss": 0.0737, + "step": 8932 + }, + { + "epoch": 2.8946856772521063, + "grad_norm": 0.9312441945075989, + "learning_rate": 1.6241326219254006e-08, + "loss": 0.0798, + "step": 8933 + }, + { + "epoch": 2.8950097213220998, + "grad_norm": 0.9229906797409058, + "learning_rate": 1.6141946203332703e-08, + "loss": 0.0798, + "step": 8934 + }, + { + "epoch": 2.895333765392093, + "grad_norm": 0.9197176098823547, + "learning_rate": 1.6042870187591985e-08, + "loss": 0.0769, + "step": 8935 + }, + { + "epoch": 2.8956578094620866, + "grad_norm": 0.8258293867111206, + "learning_rate": 1.5944098184156876e-08, + "loss": 0.0704, + "step": 8936 + }, + { + "epoch": 2.8959818535320805, + "grad_norm": 0.9437922239303589, + "learning_rate": 1.5845630205117147e-08, + "loss": 0.0793, + "step": 8937 + }, + { + "epoch": 2.896305897602074, + "grad_norm": 0.8437861800193787, + "learning_rate": 1.5747466262523438e-08, + "loss": 0.0678, + "step": 8938 + }, + { + "epoch": 2.8966299416720673, + "grad_norm": 0.8374882936477661, + "learning_rate": 1.5649606368390578e-08, + "loss": 0.07, + "step": 8939 + }, + { + "epoch": 2.8969539857420608, + "grad_norm": 0.8916781544685364, + "learning_rate": 1.5552050534695383e-08, + "loss": 0.076, + "step": 8940 + }, + { + "epoch": 2.897278029812054, + "grad_norm": 0.951287031173706, + "learning_rate": 1.5454798773378023e-08, + "loss": 0.0762, + "step": 8941 + }, + { + "epoch": 2.897602073882048, + "grad_norm": 0.9072433114051819, + "learning_rate": 1.5357851096340915e-08, + "loss": 0.0766, + "step": 8942 + }, + { + "epoch": 2.8979261179520415, + "grad_norm": 0.8875364065170288, + "learning_rate": 1.526120751544985e-08, + "loss": 0.0753, + "step": 8943 + }, + { + "epoch": 2.898250162022035, + "grad_norm": 0.9446718096733093, + "learning_rate": 1.5164868042532864e-08, + "loss": 0.0797, + "step": 8944 + }, + { + "epoch": 2.8985742060920288, + "grad_norm": 0.8039813041687012, + "learning_rate": 1.506883268938053e-08, + "loss": 0.0686, + "step": 8945 + }, + { + "epoch": 2.898898250162022, + "grad_norm": 0.9617775082588196, + "learning_rate": 1.4973101467747608e-08, + "loss": 0.0756, + "step": 8946 + }, + { + "epoch": 2.8992222942320156, + "grad_norm": 0.8695583343505859, + "learning_rate": 1.4877674389349728e-08, + "loss": 0.0708, + "step": 8947 + }, + { + "epoch": 2.899546338302009, + "grad_norm": 0.8433423042297363, + "learning_rate": 1.4782551465866713e-08, + "loss": 0.0711, + "step": 8948 + }, + { + "epoch": 2.8998703823720025, + "grad_norm": 0.844932496547699, + "learning_rate": 1.4687732708940916e-08, + "loss": 0.0705, + "step": 8949 + }, + { + "epoch": 2.9001944264419963, + "grad_norm": 0.8735789060592651, + "learning_rate": 1.4593218130176668e-08, + "loss": 0.0734, + "step": 8950 + }, + { + "epoch": 2.9005184705119897, + "grad_norm": 0.9091753363609314, + "learning_rate": 1.4499007741141934e-08, + "loss": 0.0773, + "step": 8951 + }, + { + "epoch": 2.900842514581983, + "grad_norm": 0.935634970664978, + "learning_rate": 1.4405101553367218e-08, + "loss": 0.0738, + "step": 8952 + }, + { + "epoch": 2.9011665586519766, + "grad_norm": 0.8336095213890076, + "learning_rate": 1.4311499578345821e-08, + "loss": 0.0711, + "step": 8953 + }, + { + "epoch": 2.90149060272197, + "grad_norm": 0.9978169798851013, + "learning_rate": 1.421820182753303e-08, + "loss": 0.0811, + "step": 8954 + }, + { + "epoch": 2.901814646791964, + "grad_norm": 0.9416351318359375, + "learning_rate": 1.4125208312348593e-08, + "loss": 0.0807, + "step": 8955 + }, + { + "epoch": 2.9021386908619573, + "grad_norm": 0.88267582654953, + "learning_rate": 1.403251904417341e-08, + "loss": 0.0718, + "step": 8956 + }, + { + "epoch": 2.9024627349319507, + "grad_norm": 0.9784106016159058, + "learning_rate": 1.3940134034351738e-08, + "loss": 0.0809, + "step": 8957 + }, + { + "epoch": 2.902786779001944, + "grad_norm": 0.8735864758491516, + "learning_rate": 1.3848053294190922e-08, + "loss": 0.0742, + "step": 8958 + }, + { + "epoch": 2.9031108230719376, + "grad_norm": 0.892750084400177, + "learning_rate": 1.3756276834960558e-08, + "loss": 0.0772, + "step": 8959 + }, + { + "epoch": 2.9034348671419314, + "grad_norm": 0.9043106436729431, + "learning_rate": 1.366480466789305e-08, + "loss": 0.0715, + "step": 8960 + }, + { + "epoch": 2.903758911211925, + "grad_norm": 0.826783299446106, + "learning_rate": 1.3573636804183887e-08, + "loss": 0.073, + "step": 8961 + }, + { + "epoch": 2.9040829552819183, + "grad_norm": 0.8980486392974854, + "learning_rate": 1.3482773254991365e-08, + "loss": 0.0747, + "step": 8962 + }, + { + "epoch": 2.9044069993519117, + "grad_norm": 0.9460495710372925, + "learning_rate": 1.3392214031435757e-08, + "loss": 0.0796, + "step": 8963 + }, + { + "epoch": 2.904731043421905, + "grad_norm": 0.8458935022354126, + "learning_rate": 1.3301959144600974e-08, + "loss": 0.0707, + "step": 8964 + }, + { + "epoch": 2.905055087491899, + "grad_norm": 1.1474437713623047, + "learning_rate": 1.3212008605533177e-08, + "loss": 0.0799, + "step": 8965 + }, + { + "epoch": 2.9053791315618924, + "grad_norm": 0.8916202783584595, + "learning_rate": 1.312236242524162e-08, + "loss": 0.0785, + "step": 8966 + }, + { + "epoch": 2.905703175631886, + "grad_norm": 0.9507387280464172, + "learning_rate": 1.3033020614698078e-08, + "loss": 0.08, + "step": 8967 + }, + { + "epoch": 2.9060272197018797, + "grad_norm": 0.9159635305404663, + "learning_rate": 1.2943983184837417e-08, + "loss": 0.0736, + "step": 8968 + }, + { + "epoch": 2.906351263771873, + "grad_norm": 0.8017541766166687, + "learning_rate": 1.2855250146556197e-08, + "loss": 0.0638, + "step": 8969 + }, + { + "epoch": 2.9066753078418666, + "grad_norm": 0.9174900054931641, + "learning_rate": 1.2766821510715177e-08, + "loss": 0.0783, + "step": 8970 + }, + { + "epoch": 2.90699935191186, + "grad_norm": 0.8530569076538086, + "learning_rate": 1.2678697288136809e-08, + "loss": 0.0733, + "step": 8971 + }, + { + "epoch": 2.9073233959818534, + "grad_norm": 0.8402195572853088, + "learning_rate": 1.2590877489606911e-08, + "loss": 0.0709, + "step": 8972 + }, + { + "epoch": 2.9076474400518473, + "grad_norm": 0.9232442378997803, + "learning_rate": 1.2503362125873552e-08, + "loss": 0.0761, + "step": 8973 + }, + { + "epoch": 2.9079714841218407, + "grad_norm": 0.8968020081520081, + "learning_rate": 1.241615120764761e-08, + "loss": 0.0748, + "step": 8974 + }, + { + "epoch": 2.908295528191834, + "grad_norm": 0.8251527547836304, + "learning_rate": 1.2329244745603596e-08, + "loss": 0.07, + "step": 8975 + }, + { + "epoch": 2.9086195722618275, + "grad_norm": 0.9001858830451965, + "learning_rate": 1.2242642750376899e-08, + "loss": 0.0788, + "step": 8976 + }, + { + "epoch": 2.908943616331821, + "grad_norm": 0.8136048316955566, + "learning_rate": 1.2156345232567923e-08, + "loss": 0.0658, + "step": 8977 + }, + { + "epoch": 2.909267660401815, + "grad_norm": 0.9805194139480591, + "learning_rate": 1.2070352202737668e-08, + "loss": 0.085, + "step": 8978 + }, + { + "epoch": 2.9095917044718083, + "grad_norm": 0.9120486378669739, + "learning_rate": 1.19846636714116e-08, + "loss": 0.0792, + "step": 8979 + }, + { + "epoch": 2.9099157485418017, + "grad_norm": 0.8749681711196899, + "learning_rate": 1.1899279649076612e-08, + "loss": 0.0758, + "step": 8980 + }, + { + "epoch": 2.910239792611795, + "grad_norm": 0.8955467343330383, + "learning_rate": 1.181420014618323e-08, + "loss": 0.0757, + "step": 8981 + }, + { + "epoch": 2.9105638366817885, + "grad_norm": 0.8159177303314209, + "learning_rate": 1.172942517314396e-08, + "loss": 0.069, + "step": 8982 + }, + { + "epoch": 2.9108878807517824, + "grad_norm": 0.8539730906486511, + "learning_rate": 1.1644954740334946e-08, + "loss": 0.0723, + "step": 8983 + }, + { + "epoch": 2.911211924821776, + "grad_norm": 0.8731797337532043, + "learning_rate": 1.1560788858094584e-08, + "loss": 0.0707, + "step": 8984 + }, + { + "epoch": 2.9115359688917692, + "grad_norm": 0.8900837898254395, + "learning_rate": 1.1476927536723248e-08, + "loss": 0.0783, + "step": 8985 + }, + { + "epoch": 2.9118600129617627, + "grad_norm": 0.9353973269462585, + "learning_rate": 1.1393370786485502e-08, + "loss": 0.0749, + "step": 8986 + }, + { + "epoch": 2.912184057031756, + "grad_norm": 1.0125935077667236, + "learning_rate": 1.1310118617607613e-08, + "loss": 0.0734, + "step": 8987 + }, + { + "epoch": 2.91250810110175, + "grad_norm": 0.8920942544937134, + "learning_rate": 1.122717104027865e-08, + "loss": 0.0751, + "step": 8988 + }, + { + "epoch": 2.9128321451717434, + "grad_norm": 0.8845387697219849, + "learning_rate": 1.1144528064650772e-08, + "loss": 0.0738, + "step": 8989 + }, + { + "epoch": 2.913156189241737, + "grad_norm": 0.8738760352134705, + "learning_rate": 1.1062189700838944e-08, + "loss": 0.0751, + "step": 8990 + }, + { + "epoch": 2.9134802333117307, + "grad_norm": 0.9137491583824158, + "learning_rate": 1.0980155958920103e-08, + "loss": 0.0756, + "step": 8991 + }, + { + "epoch": 2.9138042773817237, + "grad_norm": 0.9099265336990356, + "learning_rate": 1.089842684893455e-08, + "loss": 0.0775, + "step": 8992 + }, + { + "epoch": 2.9141283214517175, + "grad_norm": 0.9011944532394409, + "learning_rate": 1.0817002380885123e-08, + "loss": 0.0768, + "step": 8993 + }, + { + "epoch": 2.914452365521711, + "grad_norm": 0.8906083106994629, + "learning_rate": 1.0735882564737732e-08, + "loss": 0.0737, + "step": 8994 + }, + { + "epoch": 2.9147764095917044, + "grad_norm": 0.8653832674026489, + "learning_rate": 1.0655067410419994e-08, + "loss": 0.0744, + "step": 8995 + }, + { + "epoch": 2.9151004536616982, + "grad_norm": 0.9044657349586487, + "learning_rate": 1.057455692782372e-08, + "loss": 0.0776, + "step": 8996 + }, + { + "epoch": 2.9154244977316917, + "grad_norm": 0.8942205309867859, + "learning_rate": 1.049435112680186e-08, + "loss": 0.0729, + "step": 8997 + }, + { + "epoch": 2.915748541801685, + "grad_norm": 0.8953902721405029, + "learning_rate": 1.0414450017171007e-08, + "loss": 0.0782, + "step": 8998 + }, + { + "epoch": 2.9160725858716785, + "grad_norm": 0.923188328742981, + "learning_rate": 1.0334853608710838e-08, + "loss": 0.0765, + "step": 8999 + }, + { + "epoch": 2.916396629941672, + "grad_norm": 0.8497654795646667, + "learning_rate": 1.0255561911162449e-08, + "loss": 0.0725, + "step": 9000 + }, + { + "epoch": 2.916720674011666, + "grad_norm": 0.8314547538757324, + "learning_rate": 1.0176574934230854e-08, + "loss": 0.0688, + "step": 9001 + }, + { + "epoch": 2.917044718081659, + "grad_norm": 0.9178106188774109, + "learning_rate": 1.0097892687583044e-08, + "loss": 0.0754, + "step": 9002 + }, + { + "epoch": 2.9173687621516526, + "grad_norm": 0.8695831298828125, + "learning_rate": 1.0019515180849094e-08, + "loss": 0.0704, + "step": 9003 + }, + { + "epoch": 2.917692806221646, + "grad_norm": 0.884425163269043, + "learning_rate": 9.941442423621606e-09, + "loss": 0.0756, + "step": 9004 + }, + { + "epoch": 2.9180168502916395, + "grad_norm": 0.877781093120575, + "learning_rate": 9.863674425455716e-09, + "loss": 0.0717, + "step": 9005 + }, + { + "epoch": 2.9183408943616334, + "grad_norm": 0.9392322301864624, + "learning_rate": 9.78621119586992e-09, + "loss": 0.0812, + "step": 9006 + }, + { + "epoch": 2.9186649384316268, + "grad_norm": 0.9074358344078064, + "learning_rate": 9.709052744344694e-09, + "loss": 0.074, + "step": 9007 + }, + { + "epoch": 2.91898898250162, + "grad_norm": 0.8570203185081482, + "learning_rate": 9.63219908032359e-09, + "loss": 0.0724, + "step": 9008 + }, + { + "epoch": 2.9193130265716136, + "grad_norm": 0.8743963837623596, + "learning_rate": 9.55565021321242e-09, + "loss": 0.072, + "step": 9009 + }, + { + "epoch": 2.919637070641607, + "grad_norm": 0.9489037990570068, + "learning_rate": 9.479406152380632e-09, + "loss": 0.0769, + "step": 9010 + }, + { + "epoch": 2.919961114711601, + "grad_norm": 0.9726477861404419, + "learning_rate": 9.403466907159375e-09, + "loss": 0.0763, + "step": 9011 + }, + { + "epoch": 2.9202851587815943, + "grad_norm": 0.9409971833229065, + "learning_rate": 9.327832486842603e-09, + "loss": 0.0794, + "step": 9012 + }, + { + "epoch": 2.9206092028515878, + "grad_norm": 0.9196006059646606, + "learning_rate": 9.25250290068791e-09, + "loss": 0.0735, + "step": 9013 + }, + { + "epoch": 2.920933246921581, + "grad_norm": 0.8963923454284668, + "learning_rate": 9.17747815791431e-09, + "loss": 0.0765, + "step": 9014 + }, + { + "epoch": 2.9212572909915746, + "grad_norm": 0.9616303443908691, + "learning_rate": 9.102758267704736e-09, + "loss": 0.0803, + "step": 9015 + }, + { + "epoch": 2.9215813350615685, + "grad_norm": 0.9092485308647156, + "learning_rate": 9.02834323920354e-09, + "loss": 0.0712, + "step": 9016 + }, + { + "epoch": 2.921905379131562, + "grad_norm": 0.838982105255127, + "learning_rate": 8.954233081518438e-09, + "loss": 0.072, + "step": 9017 + }, + { + "epoch": 2.9222294232015553, + "grad_norm": 0.8191535472869873, + "learning_rate": 8.880427803720226e-09, + "loss": 0.0703, + "step": 9018 + }, + { + "epoch": 2.922553467271549, + "grad_norm": 0.8949090838432312, + "learning_rate": 8.806927414841959e-09, + "loss": 0.0784, + "step": 9019 + }, + { + "epoch": 2.9228775113415426, + "grad_norm": 0.9300609827041626, + "learning_rate": 8.73373192387894e-09, + "loss": 0.0806, + "step": 9020 + }, + { + "epoch": 2.923201555411536, + "grad_norm": 0.8566632270812988, + "learning_rate": 8.660841339789561e-09, + "loss": 0.0694, + "step": 9021 + }, + { + "epoch": 2.9235255994815295, + "grad_norm": 0.9081147313117981, + "learning_rate": 8.58825567149557e-09, + "loss": 0.0752, + "step": 9022 + }, + { + "epoch": 2.923849643551523, + "grad_norm": 0.876440167427063, + "learning_rate": 8.515974927880144e-09, + "loss": 0.0728, + "step": 9023 + }, + { + "epoch": 2.9241736876215167, + "grad_norm": 0.9899942278862, + "learning_rate": 8.443999117790091e-09, + "loss": 0.0767, + "step": 9024 + }, + { + "epoch": 2.92449773169151, + "grad_norm": 0.8901786208152771, + "learning_rate": 8.3723282500342e-09, + "loss": 0.0758, + "step": 9025 + }, + { + "epoch": 2.9248217757615036, + "grad_norm": 0.9070116281509399, + "learning_rate": 8.3009623333849e-09, + "loss": 0.0752, + "step": 9026 + }, + { + "epoch": 2.925145819831497, + "grad_norm": 0.8795725107192993, + "learning_rate": 8.229901376575755e-09, + "loss": 0.0714, + "step": 9027 + }, + { + "epoch": 2.9254698639014904, + "grad_norm": 0.8601377606391907, + "learning_rate": 8.15914538830509e-09, + "loss": 0.0753, + "step": 9028 + }, + { + "epoch": 2.9257939079714843, + "grad_norm": 0.9134733080863953, + "learning_rate": 8.088694377231532e-09, + "loss": 0.0781, + "step": 9029 + }, + { + "epoch": 2.9261179520414777, + "grad_norm": 0.9644197821617126, + "learning_rate": 8.018548351978738e-09, + "loss": 0.0774, + "step": 9030 + }, + { + "epoch": 2.926441996111471, + "grad_norm": 0.8478583693504333, + "learning_rate": 7.948707321130956e-09, + "loss": 0.0697, + "step": 9031 + }, + { + "epoch": 2.9267660401814646, + "grad_norm": 0.8698458671569824, + "learning_rate": 7.879171293236621e-09, + "loss": 0.073, + "step": 9032 + }, + { + "epoch": 2.927090084251458, + "grad_norm": 0.8994610905647278, + "learning_rate": 7.80994027680615e-09, + "loss": 0.074, + "step": 9033 + }, + { + "epoch": 2.927414128321452, + "grad_norm": 0.8590171337127686, + "learning_rate": 7.741014280312765e-09, + "loss": 0.0762, + "step": 9034 + }, + { + "epoch": 2.9277381723914453, + "grad_norm": 0.8662635684013367, + "learning_rate": 7.672393312192218e-09, + "loss": 0.0708, + "step": 9035 + }, + { + "epoch": 2.9280622164614387, + "grad_norm": 0.8634116649627686, + "learning_rate": 7.604077380843067e-09, + "loss": 0.0696, + "step": 9036 + }, + { + "epoch": 2.928386260531432, + "grad_norm": 0.9314274787902832, + "learning_rate": 7.536066494626681e-09, + "loss": 0.0795, + "step": 9037 + }, + { + "epoch": 2.9287103046014256, + "grad_norm": 0.9240031242370605, + "learning_rate": 7.468360661866957e-09, + "loss": 0.0763, + "step": 9038 + }, + { + "epoch": 2.9290343486714194, + "grad_norm": 0.9062063694000244, + "learning_rate": 7.400959890850046e-09, + "loss": 0.0756, + "step": 9039 + }, + { + "epoch": 2.929358392741413, + "grad_norm": 0.8826825022697449, + "learning_rate": 7.333864189825735e-09, + "loss": 0.0753, + "step": 9040 + }, + { + "epoch": 2.9296824368114063, + "grad_norm": 0.8756586909294128, + "learning_rate": 7.267073567005234e-09, + "loss": 0.075, + "step": 9041 + }, + { + "epoch": 2.9300064808814, + "grad_norm": 0.8725214004516602, + "learning_rate": 7.2005880305636714e-09, + "loss": 0.0749, + "step": 9042 + }, + { + "epoch": 2.930330524951393, + "grad_norm": 0.893915057182312, + "learning_rate": 7.134407588637871e-09, + "loss": 0.0748, + "step": 9043 + }, + { + "epoch": 2.930654569021387, + "grad_norm": 0.888831377029419, + "learning_rate": 7.068532249327742e-09, + "loss": 0.0767, + "step": 9044 + }, + { + "epoch": 2.9309786130913804, + "grad_norm": 0.9237712025642395, + "learning_rate": 7.002962020695725e-09, + "loss": 0.0777, + "step": 9045 + }, + { + "epoch": 2.931302657161374, + "grad_norm": 0.9445968866348267, + "learning_rate": 6.937696910767067e-09, + "loss": 0.0789, + "step": 9046 + }, + { + "epoch": 2.9316267012313677, + "grad_norm": 0.9167876243591309, + "learning_rate": 6.872736927529822e-09, + "loss": 0.0734, + "step": 9047 + }, + { + "epoch": 2.931950745301361, + "grad_norm": 0.8834000825881958, + "learning_rate": 6.8080820789340195e-09, + "loss": 0.0711, + "step": 9048 + }, + { + "epoch": 2.9322747893713546, + "grad_norm": 0.820752739906311, + "learning_rate": 6.743732372893053e-09, + "loss": 0.0645, + "step": 9049 + }, + { + "epoch": 2.932598833441348, + "grad_norm": 0.8337461948394775, + "learning_rate": 6.679687817282843e-09, + "loss": 0.0708, + "step": 9050 + }, + { + "epoch": 2.9329228775113414, + "grad_norm": 0.825693666934967, + "learning_rate": 6.615948419941565e-09, + "loss": 0.0689, + "step": 9051 + }, + { + "epoch": 2.9332469215813353, + "grad_norm": 0.9408130049705505, + "learning_rate": 6.5525141886702005e-09, + "loss": 0.0824, + "step": 9052 + }, + { + "epoch": 2.9335709656513287, + "grad_norm": 0.9178377389907837, + "learning_rate": 6.489385131232817e-09, + "loss": 0.0797, + "step": 9053 + }, + { + "epoch": 2.933895009721322, + "grad_norm": 0.8189285397529602, + "learning_rate": 6.426561255355457e-09, + "loss": 0.0652, + "step": 9054 + }, + { + "epoch": 2.9342190537913155, + "grad_norm": 0.9435027241706848, + "learning_rate": 6.364042568727524e-09, + "loss": 0.0835, + "step": 9055 + }, + { + "epoch": 2.934543097861309, + "grad_norm": 0.8830663561820984, + "learning_rate": 6.301829079000399e-09, + "loss": 0.0733, + "step": 9056 + }, + { + "epoch": 2.934867141931303, + "grad_norm": 0.8811532258987427, + "learning_rate": 6.239920793788546e-09, + "loss": 0.0783, + "step": 9057 + }, + { + "epoch": 2.9351911860012962, + "grad_norm": 0.8556127548217773, + "learning_rate": 6.178317720668958e-09, + "loss": 0.0755, + "step": 9058 + }, + { + "epoch": 2.9355152300712897, + "grad_norm": 0.9733546376228333, + "learning_rate": 6.117019867181162e-09, + "loss": 0.0823, + "step": 9059 + }, + { + "epoch": 2.935839274141283, + "grad_norm": 0.8635657429695129, + "learning_rate": 6.056027240827489e-09, + "loss": 0.0771, + "step": 9060 + }, + { + "epoch": 2.9361633182112765, + "grad_norm": 0.8800527453422546, + "learning_rate": 5.995339849073079e-09, + "loss": 0.0762, + "step": 9061 + }, + { + "epoch": 2.9364873622812704, + "grad_norm": 0.9648496508598328, + "learning_rate": 5.9349576993447675e-09, + "loss": 0.0749, + "step": 9062 + }, + { + "epoch": 2.936811406351264, + "grad_norm": 0.9104852080345154, + "learning_rate": 5.874880799033589e-09, + "loss": 0.077, + "step": 9063 + }, + { + "epoch": 2.9371354504212572, + "grad_norm": 1.2932277917861938, + "learning_rate": 5.815109155491716e-09, + "loss": 0.08, + "step": 9064 + }, + { + "epoch": 2.9374594944912507, + "grad_norm": 0.9217556715011597, + "learning_rate": 5.755642776035242e-09, + "loss": 0.0768, + "step": 9065 + }, + { + "epoch": 2.937783538561244, + "grad_norm": 0.9468752145767212, + "learning_rate": 5.696481667941678e-09, + "loss": 0.0814, + "step": 9066 + }, + { + "epoch": 2.938107582631238, + "grad_norm": 0.8518416881561279, + "learning_rate": 5.637625838452176e-09, + "loss": 0.072, + "step": 9067 + }, + { + "epoch": 2.9384316267012314, + "grad_norm": 0.884737491607666, + "learning_rate": 5.579075294769864e-09, + "loss": 0.0755, + "step": 9068 + }, + { + "epoch": 2.938755670771225, + "grad_norm": 0.8714679479598999, + "learning_rate": 5.520830044060677e-09, + "loss": 0.0683, + "step": 9069 + }, + { + "epoch": 2.9390797148412187, + "grad_norm": 0.8819865584373474, + "learning_rate": 5.46289009345391e-09, + "loss": 0.0747, + "step": 9070 + }, + { + "epoch": 2.939403758911212, + "grad_norm": 0.9256206750869751, + "learning_rate": 5.405255450040003e-09, + "loss": 0.0767, + "step": 9071 + }, + { + "epoch": 2.9397278029812055, + "grad_norm": 0.9435095191001892, + "learning_rate": 5.347926120873592e-09, + "loss": 0.0751, + "step": 9072 + }, + { + "epoch": 2.940051847051199, + "grad_norm": 0.9027978181838989, + "learning_rate": 5.290902112970731e-09, + "loss": 0.0725, + "step": 9073 + }, + { + "epoch": 2.9403758911211924, + "grad_norm": 0.9899285435676575, + "learning_rate": 5.234183433310835e-09, + "loss": 0.079, + "step": 9074 + }, + { + "epoch": 2.940699935191186, + "grad_norm": 0.9293980598449707, + "learning_rate": 5.177770088835854e-09, + "loss": 0.0799, + "step": 9075 + }, + { + "epoch": 2.9410239792611796, + "grad_norm": 0.873587429523468, + "learning_rate": 5.121662086449708e-09, + "loss": 0.0732, + "step": 9076 + }, + { + "epoch": 2.941348023331173, + "grad_norm": 0.8625730276107788, + "learning_rate": 5.065859433019959e-09, + "loss": 0.0709, + "step": 9077 + }, + { + "epoch": 2.9416720674011665, + "grad_norm": 0.9578263163566589, + "learning_rate": 5.010362135376423e-09, + "loss": 0.0792, + "step": 9078 + }, + { + "epoch": 2.94199611147116, + "grad_norm": 0.86197429895401, + "learning_rate": 4.955170200310888e-09, + "loss": 0.0748, + "step": 9079 + }, + { + "epoch": 2.942320155541154, + "grad_norm": 0.8910972476005554, + "learning_rate": 4.9002836345787845e-09, + "loss": 0.0769, + "step": 9080 + }, + { + "epoch": 2.942644199611147, + "grad_norm": 0.9671840071678162, + "learning_rate": 4.845702444897515e-09, + "loss": 0.0748, + "step": 9081 + }, + { + "epoch": 2.9429682436811406, + "grad_norm": 0.950499951839447, + "learning_rate": 4.791426637947294e-09, + "loss": 0.0832, + "step": 9082 + }, + { + "epoch": 2.943292287751134, + "grad_norm": 0.8624812364578247, + "learning_rate": 4.7374562203708615e-09, + "loss": 0.074, + "step": 9083 + }, + { + "epoch": 2.9436163318211275, + "grad_norm": 0.8232616782188416, + "learning_rate": 4.683791198773768e-09, + "loss": 0.0725, + "step": 9084 + }, + { + "epoch": 2.9439403758911213, + "grad_norm": 0.8955888748168945, + "learning_rate": 4.630431579724371e-09, + "loss": 0.0796, + "step": 9085 + }, + { + "epoch": 2.9442644199611148, + "grad_norm": 0.9281444549560547, + "learning_rate": 4.577377369752722e-09, + "loss": 0.0725, + "step": 9086 + }, + { + "epoch": 2.944588464031108, + "grad_norm": 0.9185742735862732, + "learning_rate": 4.524628575352796e-09, + "loss": 0.0808, + "step": 9087 + }, + { + "epoch": 2.9449125081011016, + "grad_norm": 0.8552374839782715, + "learning_rate": 4.472185202980261e-09, + "loss": 0.0713, + "step": 9088 + }, + { + "epoch": 2.945236552171095, + "grad_norm": 0.9765703678131104, + "learning_rate": 4.420047259053595e-09, + "loss": 0.0811, + "step": 9089 + }, + { + "epoch": 2.945560596241089, + "grad_norm": 0.8993111848831177, + "learning_rate": 4.36821474995408e-09, + "loss": 0.0735, + "step": 9090 + }, + { + "epoch": 2.9458846403110823, + "grad_norm": 0.8508222103118896, + "learning_rate": 4.316687682025256e-09, + "loss": 0.0722, + "step": 9091 + }, + { + "epoch": 2.9462086843810757, + "grad_norm": 0.8553758263587952, + "learning_rate": 4.26546606157402e-09, + "loss": 0.0702, + "step": 9092 + }, + { + "epoch": 2.9465327284510696, + "grad_norm": 0.8335778713226318, + "learning_rate": 4.2145498948692465e-09, + "loss": 0.0704, + "step": 9093 + }, + { + "epoch": 2.9468567725210626, + "grad_norm": 0.8840744495391846, + "learning_rate": 4.163939188142341e-09, + "loss": 0.0749, + "step": 9094 + }, + { + "epoch": 2.9471808165910565, + "grad_norm": 0.7947471737861633, + "learning_rate": 4.113633947587792e-09, + "loss": 0.0645, + "step": 9095 + }, + { + "epoch": 2.94750486066105, + "grad_norm": 0.9484292268753052, + "learning_rate": 4.063634179362341e-09, + "loss": 0.0805, + "step": 9096 + }, + { + "epoch": 2.9478289047310433, + "grad_norm": 0.9598378539085388, + "learning_rate": 4.013939889585538e-09, + "loss": 0.085, + "step": 9097 + }, + { + "epoch": 2.948152948801037, + "grad_norm": 0.8770298361778259, + "learning_rate": 3.964551084339463e-09, + "loss": 0.0748, + "step": 9098 + }, + { + "epoch": 2.9484769928710306, + "grad_norm": 0.8788001537322998, + "learning_rate": 3.915467769668724e-09, + "loss": 0.0736, + "step": 9099 + }, + { + "epoch": 2.948801036941024, + "grad_norm": 0.8909417390823364, + "learning_rate": 3.866689951580738e-09, + "loss": 0.0748, + "step": 9100 + }, + { + "epoch": 2.9491250810110174, + "grad_norm": 0.8614071011543274, + "learning_rate": 3.818217636045729e-09, + "loss": 0.0742, + "step": 9101 + }, + { + "epoch": 2.949449125081011, + "grad_norm": 0.8587439060211182, + "learning_rate": 3.770050828995897e-09, + "loss": 0.0741, + "step": 9102 + }, + { + "epoch": 2.9497731691510047, + "grad_norm": 0.9640584588050842, + "learning_rate": 3.7221895363262485e-09, + "loss": 0.0776, + "step": 9103 + }, + { + "epoch": 2.950097213220998, + "grad_norm": 0.893592119216919, + "learning_rate": 3.674633763894875e-09, + "loss": 0.0729, + "step": 9104 + }, + { + "epoch": 2.9504212572909916, + "grad_norm": 0.9449043869972229, + "learning_rate": 3.6273835175221204e-09, + "loss": 0.0827, + "step": 9105 + }, + { + "epoch": 2.950745301360985, + "grad_norm": 0.8701707124710083, + "learning_rate": 3.58043880299086e-09, + "loss": 0.0779, + "step": 9106 + }, + { + "epoch": 2.9510693454309784, + "grad_norm": 0.890649676322937, + "learning_rate": 3.533799626046497e-09, + "loss": 0.0773, + "step": 9107 + }, + { + "epoch": 2.9513933895009723, + "grad_norm": 0.9598814249038696, + "learning_rate": 3.487465992397521e-09, + "loss": 0.0737, + "step": 9108 + }, + { + "epoch": 2.9517174335709657, + "grad_norm": 0.8655813932418823, + "learning_rate": 3.4414379077146733e-09, + "loss": 0.0735, + "step": 9109 + }, + { + "epoch": 2.952041477640959, + "grad_norm": 0.8902621865272522, + "learning_rate": 3.3957153776312257e-09, + "loss": 0.0765, + "step": 9110 + }, + { + "epoch": 2.9523655217109526, + "grad_norm": 0.957643985748291, + "learning_rate": 3.3502984077429803e-09, + "loss": 0.077, + "step": 9111 + }, + { + "epoch": 2.952689565780946, + "grad_norm": 0.9296534657478333, + "learning_rate": 3.3051870036091004e-09, + "loss": 0.0731, + "step": 9112 + }, + { + "epoch": 2.95301360985094, + "grad_norm": 0.8171263933181763, + "learning_rate": 3.260381170750171e-09, + "loss": 0.069, + "step": 9113 + }, + { + "epoch": 2.9533376539209333, + "grad_norm": 0.918368935585022, + "learning_rate": 3.215880914650693e-09, + "loss": 0.0774, + "step": 9114 + }, + { + "epoch": 2.9536616979909267, + "grad_norm": 0.9412216544151306, + "learning_rate": 3.171686240756033e-09, + "loss": 0.0747, + "step": 9115 + }, + { + "epoch": 2.95398574206092, + "grad_norm": 0.8553744554519653, + "learning_rate": 3.1277971544763088e-09, + "loss": 0.0745, + "step": 9116 + }, + { + "epoch": 2.9543097861309136, + "grad_norm": 0.9307593703269958, + "learning_rate": 3.0842136611825004e-09, + "loss": 0.0809, + "step": 9117 + }, + { + "epoch": 2.9546338302009074, + "grad_norm": 0.8458048701286316, + "learning_rate": 3.0409357662086748e-09, + "loss": 0.0703, + "step": 9118 + }, + { + "epoch": 2.954957874270901, + "grad_norm": 0.8757748007774353, + "learning_rate": 2.997963474852261e-09, + "loss": 0.0726, + "step": 9119 + }, + { + "epoch": 2.9552819183408943, + "grad_norm": 0.8589341640472412, + "learning_rate": 2.9552967923721086e-09, + "loss": 0.0681, + "step": 9120 + }, + { + "epoch": 2.955605962410888, + "grad_norm": 0.8835259079933167, + "learning_rate": 2.9129357239901514e-09, + "loss": 0.0714, + "step": 9121 + }, + { + "epoch": 2.9559300064808816, + "grad_norm": 0.962489128112793, + "learning_rate": 2.8708802748914077e-09, + "loss": 0.0749, + "step": 9122 + }, + { + "epoch": 2.956254050550875, + "grad_norm": 0.9141943454742432, + "learning_rate": 2.829130450222872e-09, + "loss": 0.0805, + "step": 9123 + }, + { + "epoch": 2.9565780946208684, + "grad_norm": 0.8826735019683838, + "learning_rate": 2.7876862550940685e-09, + "loss": 0.0726, + "step": 9124 + }, + { + "epoch": 2.956902138690862, + "grad_norm": 0.9047395586967468, + "learning_rate": 2.7465476945778835e-09, + "loss": 0.0787, + "step": 9125 + }, + { + "epoch": 2.9572261827608557, + "grad_norm": 0.8255360126495361, + "learning_rate": 2.705714773708623e-09, + "loss": 0.0691, + "step": 9126 + }, + { + "epoch": 2.957550226830849, + "grad_norm": 0.8987961411476135, + "learning_rate": 2.6651874974845115e-09, + "loss": 0.0771, + "step": 9127 + }, + { + "epoch": 2.9578742709008425, + "grad_norm": 0.9574360847473145, + "learning_rate": 2.6249658708651928e-09, + "loss": 0.0798, + "step": 9128 + }, + { + "epoch": 2.958198314970836, + "grad_norm": 0.9309230446815491, + "learning_rate": 2.5850498987733952e-09, + "loss": 0.078, + "step": 9129 + }, + { + "epoch": 2.9585223590408294, + "grad_norm": 0.8887049555778503, + "learning_rate": 2.545439586094933e-09, + "loss": 0.0735, + "step": 9130 + }, + { + "epoch": 2.9588464031108233, + "grad_norm": 0.8663487434387207, + "learning_rate": 2.506134937677318e-09, + "loss": 0.0737, + "step": 9131 + }, + { + "epoch": 2.9591704471808167, + "grad_norm": 0.9009921550750732, + "learning_rate": 2.4671359583314237e-09, + "loss": 0.0763, + "step": 9132 + }, + { + "epoch": 2.95949449125081, + "grad_norm": 0.8658737540245056, + "learning_rate": 2.4284426528298212e-09, + "loss": 0.0745, + "step": 9133 + }, + { + "epoch": 2.9598185353208035, + "grad_norm": 0.8811216354370117, + "learning_rate": 2.3900550259084445e-09, + "loss": 0.0735, + "step": 9134 + }, + { + "epoch": 2.960142579390797, + "grad_norm": 0.9001662731170654, + "learning_rate": 2.351973082265757e-09, + "loss": 0.0747, + "step": 9135 + }, + { + "epoch": 2.960466623460791, + "grad_norm": 0.8873869776725769, + "learning_rate": 2.314196826562476e-09, + "loss": 0.0767, + "step": 9136 + }, + { + "epoch": 2.9607906675307842, + "grad_norm": 0.8550229072570801, + "learning_rate": 2.2767262634218466e-09, + "loss": 0.0712, + "step": 9137 + }, + { + "epoch": 2.9611147116007777, + "grad_norm": 0.8947347402572632, + "learning_rate": 2.239561397430201e-09, + "loss": 0.0723, + "step": 9138 + }, + { + "epoch": 2.961438755670771, + "grad_norm": 0.8800020217895508, + "learning_rate": 2.2027022331361226e-09, + "loss": 0.075, + "step": 9139 + }, + { + "epoch": 2.9617627997407645, + "grad_norm": 0.9262352585792542, + "learning_rate": 2.1661487750504473e-09, + "loss": 0.0776, + "step": 9140 + }, + { + "epoch": 2.9620868438107584, + "grad_norm": 0.8233925700187683, + "learning_rate": 2.129901027647652e-09, + "loss": 0.07, + "step": 9141 + }, + { + "epoch": 2.962410887880752, + "grad_norm": 1.0221971273422241, + "learning_rate": 2.0939589953633542e-09, + "loss": 0.0802, + "step": 9142 + }, + { + "epoch": 2.962734931950745, + "grad_norm": 0.9081821441650391, + "learning_rate": 2.0583226825970915e-09, + "loss": 0.0765, + "step": 9143 + }, + { + "epoch": 2.963058976020739, + "grad_norm": 0.8890132904052734, + "learning_rate": 2.022992093710097e-09, + "loss": 0.0722, + "step": 9144 + }, + { + "epoch": 2.963383020090732, + "grad_norm": 1.019970178604126, + "learning_rate": 1.9879672330266886e-09, + "loss": 0.0771, + "step": 9145 + }, + { + "epoch": 2.963707064160726, + "grad_norm": 0.9550405144691467, + "learning_rate": 1.9532481048334383e-09, + "loss": 0.0753, + "step": 9146 + }, + { + "epoch": 2.9640311082307194, + "grad_norm": 0.8712491989135742, + "learning_rate": 1.918834713379447e-09, + "loss": 0.0777, + "step": 9147 + }, + { + "epoch": 2.964355152300713, + "grad_norm": 0.9841358065605164, + "learning_rate": 1.884727062876901e-09, + "loss": 0.0801, + "step": 9148 + }, + { + "epoch": 2.9646791963707066, + "grad_norm": 0.8419924378395081, + "learning_rate": 1.8509251575002386e-09, + "loss": 0.0753, + "step": 9149 + }, + { + "epoch": 2.9650032404407, + "grad_norm": 0.8987525701522827, + "learning_rate": 1.8174290013864282e-09, + "loss": 0.0763, + "step": 9150 + }, + { + "epoch": 2.9653272845106935, + "grad_norm": 0.9673410654067993, + "learning_rate": 1.784238598634691e-09, + "loss": 0.0864, + "step": 9151 + }, + { + "epoch": 2.965651328580687, + "grad_norm": 0.8701542615890503, + "learning_rate": 1.7513539533078882e-09, + "loss": 0.0739, + "step": 9152 + }, + { + "epoch": 2.9659753726506803, + "grad_norm": 0.8537030816078186, + "learning_rate": 1.7187750694303007e-09, + "loss": 0.0737, + "step": 9153 + }, + { + "epoch": 2.966299416720674, + "grad_norm": 0.8703850507736206, + "learning_rate": 1.686501950989572e-09, + "loss": 0.0735, + "step": 9154 + }, + { + "epoch": 2.9666234607906676, + "grad_norm": 0.909617006778717, + "learning_rate": 1.6545346019350427e-09, + "loss": 0.0799, + "step": 9155 + }, + { + "epoch": 2.966947504860661, + "grad_norm": 0.8157674074172974, + "learning_rate": 1.6228730261799718e-09, + "loss": 0.0682, + "step": 9156 + }, + { + "epoch": 2.9672715489306545, + "grad_norm": 1.0222302675247192, + "learning_rate": 1.5915172275990375e-09, + "loss": 0.0821, + "step": 9157 + }, + { + "epoch": 2.967595593000648, + "grad_norm": 0.9033092260360718, + "learning_rate": 1.5604672100297258e-09, + "loss": 0.0754, + "step": 9158 + }, + { + "epoch": 2.9679196370706418, + "grad_norm": 0.9216779470443726, + "learning_rate": 1.5297229772726075e-09, + "loss": 0.0773, + "step": 9159 + }, + { + "epoch": 2.968243681140635, + "grad_norm": 0.8437354564666748, + "learning_rate": 1.499284533090506e-09, + "loss": 0.0717, + "step": 9160 + }, + { + "epoch": 2.9685677252106286, + "grad_norm": 0.9266118407249451, + "learning_rate": 1.469151881208497e-09, + "loss": 0.0744, + "step": 9161 + }, + { + "epoch": 2.968891769280622, + "grad_norm": 0.8228974342346191, + "learning_rate": 1.4393250253144642e-09, + "loss": 0.0707, + "step": 9162 + }, + { + "epoch": 2.9692158133506155, + "grad_norm": 0.9221566319465637, + "learning_rate": 1.4098039690593756e-09, + "loss": 0.0741, + "step": 9163 + }, + { + "epoch": 2.9695398574206093, + "grad_norm": 0.8759603500366211, + "learning_rate": 1.3805887160558973e-09, + "loss": 0.0726, + "step": 9164 + }, + { + "epoch": 2.9698639014906028, + "grad_norm": 0.8887774348258972, + "learning_rate": 1.3516792698797797e-09, + "loss": 0.077, + "step": 9165 + }, + { + "epoch": 2.970187945560596, + "grad_norm": 0.8909733891487122, + "learning_rate": 1.323075634069304e-09, + "loss": 0.076, + "step": 9166 + }, + { + "epoch": 2.9705119896305896, + "grad_norm": 0.8457849621772766, + "learning_rate": 1.2947778121255584e-09, + "loss": 0.0691, + "step": 9167 + }, + { + "epoch": 2.970836033700583, + "grad_norm": 0.9030209183692932, + "learning_rate": 1.2667858075113281e-09, + "loss": 0.0733, + "step": 9168 + }, + { + "epoch": 2.971160077770577, + "grad_norm": 0.8783456683158875, + "learning_rate": 1.239099623653317e-09, + "loss": 0.0711, + "step": 9169 + }, + { + "epoch": 2.9714841218405703, + "grad_norm": 0.9108428359031677, + "learning_rate": 1.2117192639393704e-09, + "loss": 0.0746, + "step": 9170 + }, + { + "epoch": 2.9718081659105637, + "grad_norm": 1.0172836780548096, + "learning_rate": 1.1846447317206967e-09, + "loss": 0.0755, + "step": 9171 + }, + { + "epoch": 2.9721322099805576, + "grad_norm": 0.849394679069519, + "learning_rate": 1.1578760303113113e-09, + "loss": 0.0711, + "step": 9172 + }, + { + "epoch": 2.972456254050551, + "grad_norm": 0.9604840874671936, + "learning_rate": 1.131413162987205e-09, + "loss": 0.0752, + "step": 9173 + }, + { + "epoch": 2.9727802981205445, + "grad_norm": 0.8981707096099854, + "learning_rate": 1.1052561329871757e-09, + "loss": 0.0768, + "step": 9174 + }, + { + "epoch": 2.973104342190538, + "grad_norm": 0.9010798931121826, + "learning_rate": 1.0794049435128296e-09, + "loss": 0.0777, + "step": 9175 + }, + { + "epoch": 2.9734283862605313, + "grad_norm": 0.9054232239723206, + "learning_rate": 1.0538595977277466e-09, + "loss": 0.0755, + "step": 9176 + }, + { + "epoch": 2.973752430330525, + "grad_norm": 0.9445099830627441, + "learning_rate": 1.028620098758315e-09, + "loss": 0.0802, + "step": 9177 + }, + { + "epoch": 2.9740764744005186, + "grad_norm": 0.8789932727813721, + "learning_rate": 1.0036864496942856e-09, + "loss": 0.0764, + "step": 9178 + }, + { + "epoch": 2.974400518470512, + "grad_norm": 0.8579067587852478, + "learning_rate": 9.79058653586551e-10, + "loss": 0.0727, + "step": 9179 + }, + { + "epoch": 2.9747245625405054, + "grad_norm": 0.7822898626327515, + "learning_rate": 9.54736713449922e-10, + "loss": 0.0699, + "step": 9180 + }, + { + "epoch": 2.975048606610499, + "grad_norm": 0.8219559788703918, + "learning_rate": 9.307206322606288e-10, + "loss": 0.0704, + "step": 9181 + }, + { + "epoch": 2.9753726506804927, + "grad_norm": 0.9843807816505432, + "learning_rate": 9.070104129582647e-10, + "loss": 0.0795, + "step": 9182 + }, + { + "epoch": 2.975696694750486, + "grad_norm": 0.8420466184616089, + "learning_rate": 8.836060584449524e-10, + "loss": 0.0702, + "step": 9183 + }, + { + "epoch": 2.9760207388204796, + "grad_norm": 0.8671947121620178, + "learning_rate": 8.6050757158479e-10, + "loss": 0.0741, + "step": 9184 + }, + { + "epoch": 2.976344782890473, + "grad_norm": 0.8479039072990417, + "learning_rate": 8.377149552049602e-10, + "loss": 0.0718, + "step": 9185 + }, + { + "epoch": 2.9766688269604664, + "grad_norm": 0.9086983799934387, + "learning_rate": 8.15228212095176e-10, + "loss": 0.0828, + "step": 9186 + }, + { + "epoch": 2.9769928710304603, + "grad_norm": 0.8832724690437317, + "learning_rate": 7.930473450074028e-10, + "loss": 0.0757, + "step": 9187 + }, + { + "epoch": 2.9773169151004537, + "grad_norm": 0.9127610325813293, + "learning_rate": 7.711723566564133e-10, + "loss": 0.0773, + "step": 9188 + }, + { + "epoch": 2.977640959170447, + "grad_norm": 0.8464711904525757, + "learning_rate": 7.496032497195105e-10, + "loss": 0.0687, + "step": 9189 + }, + { + "epoch": 2.9779650032404406, + "grad_norm": 0.8912038803100586, + "learning_rate": 7.283400268365271e-10, + "loss": 0.074, + "step": 9190 + }, + { + "epoch": 2.978289047310434, + "grad_norm": 0.8227494359016418, + "learning_rate": 7.073826906098258e-10, + "loss": 0.0739, + "step": 9191 + }, + { + "epoch": 2.978613091380428, + "grad_norm": 0.8756099939346313, + "learning_rate": 6.867312436045769e-10, + "loss": 0.0773, + "step": 9192 + }, + { + "epoch": 2.9789371354504213, + "grad_norm": 0.8831959962844849, + "learning_rate": 6.663856883482034e-10, + "loss": 0.0825, + "step": 9193 + }, + { + "epoch": 2.9792611795204147, + "grad_norm": 0.8920246958732605, + "learning_rate": 6.463460273306577e-10, + "loss": 0.0719, + "step": 9194 + }, + { + "epoch": 2.9795852235904086, + "grad_norm": 0.8495813012123108, + "learning_rate": 6.266122630049776e-10, + "loss": 0.072, + "step": 9195 + }, + { + "epoch": 2.9799092676604015, + "grad_norm": 0.9280455112457275, + "learning_rate": 6.071843977861758e-10, + "loss": 0.0774, + "step": 9196 + }, + { + "epoch": 2.9802333117303954, + "grad_norm": 0.8548804521560669, + "learning_rate": 5.880624340517948e-10, + "loss": 0.0727, + "step": 9197 + }, + { + "epoch": 2.980557355800389, + "grad_norm": 0.8589762449264526, + "learning_rate": 5.692463741424625e-10, + "loss": 0.0755, + "step": 9198 + }, + { + "epoch": 2.9808813998703823, + "grad_norm": 0.9303783178329468, + "learning_rate": 5.507362203607814e-10, + "loss": 0.0791, + "step": 9199 + }, + { + "epoch": 2.981205443940376, + "grad_norm": 0.9959030151367188, + "learning_rate": 5.325319749727165e-10, + "loss": 0.0807, + "step": 9200 + }, + { + "epoch": 2.9815294880103695, + "grad_norm": 0.8675582408905029, + "learning_rate": 5.146336402059304e-10, + "loss": 0.0704, + "step": 9201 + }, + { + "epoch": 2.981853532080363, + "grad_norm": 0.9374891519546509, + "learning_rate": 4.970412182511708e-10, + "loss": 0.0801, + "step": 9202 + }, + { + "epoch": 2.9821775761503564, + "grad_norm": 0.931382417678833, + "learning_rate": 4.797547112614376e-10, + "loss": 0.0776, + "step": 9203 + }, + { + "epoch": 2.98250162022035, + "grad_norm": 0.9373607635498047, + "learning_rate": 4.627741213525383e-10, + "loss": 0.0757, + "step": 9204 + }, + { + "epoch": 2.9828256642903437, + "grad_norm": 0.9142207503318787, + "learning_rate": 4.460994506028105e-10, + "loss": 0.0796, + "step": 9205 + }, + { + "epoch": 2.983149708360337, + "grad_norm": 0.9252228736877441, + "learning_rate": 4.2973070105256643e-10, + "loss": 0.0803, + "step": 9206 + }, + { + "epoch": 2.9834737524303305, + "grad_norm": 0.8329778909683228, + "learning_rate": 4.136678747060363e-10, + "loss": 0.0701, + "step": 9207 + }, + { + "epoch": 2.983797796500324, + "grad_norm": 0.8641289472579956, + "learning_rate": 3.9791097352831487e-10, + "loss": 0.0733, + "step": 9208 + }, + { + "epoch": 2.9841218405703174, + "grad_norm": 0.9849048852920532, + "learning_rate": 3.824599994484146e-10, + "loss": 0.0825, + "step": 9209 + }, + { + "epoch": 2.9844458846403112, + "grad_norm": 0.943912148475647, + "learning_rate": 3.673149543573229e-10, + "loss": 0.0816, + "step": 9210 + }, + { + "epoch": 2.9847699287103047, + "grad_norm": 0.8786362409591675, + "learning_rate": 3.5247584010827953e-10, + "loss": 0.0729, + "step": 9211 + }, + { + "epoch": 2.985093972780298, + "grad_norm": 0.8849793672561646, + "learning_rate": 3.3794265851816444e-10, + "loss": 0.0729, + "step": 9212 + }, + { + "epoch": 2.9854180168502915, + "grad_norm": 0.8750109672546387, + "learning_rate": 3.237154113649998e-10, + "loss": 0.0716, + "step": 9213 + }, + { + "epoch": 2.985742060920285, + "grad_norm": 0.9164808392524719, + "learning_rate": 3.0979410039017053e-10, + "loss": 0.079, + "step": 9214 + }, + { + "epoch": 2.986066104990279, + "grad_norm": 0.8990651965141296, + "learning_rate": 2.961787272978689e-10, + "loss": 0.0741, + "step": 9215 + }, + { + "epoch": 2.9863901490602722, + "grad_norm": 0.8636859059333801, + "learning_rate": 2.828692937542621e-10, + "loss": 0.0776, + "step": 9216 + }, + { + "epoch": 2.9867141931302656, + "grad_norm": 0.9257625937461853, + "learning_rate": 2.6986580138832487e-10, + "loss": 0.0764, + "step": 9217 + }, + { + "epoch": 2.987038237200259, + "grad_norm": 0.8682085275650024, + "learning_rate": 2.571682517915619e-10, + "loss": 0.0687, + "step": 9218 + }, + { + "epoch": 2.9873622812702525, + "grad_norm": 0.8867075443267822, + "learning_rate": 2.447766465180079e-10, + "loss": 0.0683, + "step": 9219 + }, + { + "epoch": 2.9876863253402464, + "grad_norm": 0.8165310025215149, + "learning_rate": 2.3269098708422754e-10, + "loss": 0.0705, + "step": 9220 + }, + { + "epoch": 2.98801036941024, + "grad_norm": 0.8468376398086548, + "learning_rate": 2.2091127496959298e-10, + "loss": 0.0734, + "step": 9221 + }, + { + "epoch": 2.988334413480233, + "grad_norm": 0.9028320908546448, + "learning_rate": 2.0943751161545122e-10, + "loss": 0.0772, + "step": 9222 + }, + { + "epoch": 2.988658457550227, + "grad_norm": 0.8585754632949829, + "learning_rate": 1.98269698426512e-10, + "loss": 0.0772, + "step": 9223 + }, + { + "epoch": 2.9889825016202205, + "grad_norm": 0.8857394456863403, + "learning_rate": 1.8740783676945984e-10, + "loss": 0.0774, + "step": 9224 + }, + { + "epoch": 2.989306545690214, + "grad_norm": 0.9060828685760498, + "learning_rate": 1.768519279732317e-10, + "loss": 0.0781, + "step": 9225 + }, + { + "epoch": 2.9896305897602073, + "grad_norm": 0.8916698694229126, + "learning_rate": 1.666019733306823e-10, + "loss": 0.0747, + "step": 9226 + }, + { + "epoch": 2.9899546338302008, + "grad_norm": 0.9118140935897827, + "learning_rate": 1.5665797409553097e-10, + "loss": 0.0743, + "step": 9227 + }, + { + "epoch": 2.9902786779001946, + "grad_norm": 0.8600789904594421, + "learning_rate": 1.4701993148485972e-10, + "loss": 0.0761, + "step": 9228 + }, + { + "epoch": 2.990602721970188, + "grad_norm": 0.8507561087608337, + "learning_rate": 1.3768784667883562e-10, + "loss": 0.0763, + "step": 9229 + }, + { + "epoch": 2.9909267660401815, + "grad_norm": 0.922105610370636, + "learning_rate": 1.2866172081904548e-10, + "loss": 0.0756, + "step": 9230 + }, + { + "epoch": 2.991250810110175, + "grad_norm": 0.8797658085823059, + "learning_rate": 1.1994155501071636e-10, + "loss": 0.0753, + "step": 9231 + }, + { + "epoch": 2.9915748541801683, + "grad_norm": 0.9012943506240845, + "learning_rate": 1.1152735032077255e-10, + "loss": 0.0758, + "step": 9232 + }, + { + "epoch": 2.991898898250162, + "grad_norm": 0.8567914962768555, + "learning_rate": 1.0341910777894593e-10, + "loss": 0.0726, + "step": 9233 + }, + { + "epoch": 2.9922229423201556, + "grad_norm": 0.8917068839073181, + "learning_rate": 9.561682837777586e-11, + "loss": 0.073, + "step": 9234 + }, + { + "epoch": 2.992546986390149, + "grad_norm": 0.9291151165962219, + "learning_rate": 8.812051307205416e-11, + "loss": 0.079, + "step": 9235 + }, + { + "epoch": 2.9928710304601425, + "grad_norm": 0.9489535689353943, + "learning_rate": 8.093016277938015e-11, + "loss": 0.0809, + "step": 9236 + }, + { + "epoch": 2.993195074530136, + "grad_norm": 0.8829405903816223, + "learning_rate": 7.404577837988313e-11, + "loss": 0.0781, + "step": 9237 + }, + { + "epoch": 2.9935191186001298, + "grad_norm": 0.905556321144104, + "learning_rate": 6.746736071594484e-11, + "loss": 0.0801, + "step": 9238 + }, + { + "epoch": 2.993843162670123, + "grad_norm": 0.8819782137870789, + "learning_rate": 6.119491059303206e-11, + "loss": 0.0757, + "step": 9239 + }, + { + "epoch": 2.9941672067401166, + "grad_norm": 0.8596265912055969, + "learning_rate": 5.522842877830892e-11, + "loss": 0.0718, + "step": 9240 + }, + { + "epoch": 2.99449125081011, + "grad_norm": 0.8243787288665771, + "learning_rate": 4.956791600230215e-11, + "loss": 0.0697, + "step": 9241 + }, + { + "epoch": 2.9948152948801035, + "grad_norm": 0.8741186857223511, + "learning_rate": 4.4213372957790935e-11, + "loss": 0.0742, + "step": 9242 + }, + { + "epoch": 2.9951393389500973, + "grad_norm": 0.9056291580200195, + "learning_rate": 3.9164800300084404e-11, + "loss": 0.0766, + "step": 9243 + }, + { + "epoch": 2.9954633830200907, + "grad_norm": 0.9046027660369873, + "learning_rate": 3.442219864729923e-11, + "loss": 0.0743, + "step": 9244 + }, + { + "epoch": 2.995787427090084, + "grad_norm": 0.9297699332237244, + "learning_rate": 2.998556857952695e-11, + "loss": 0.0782, + "step": 9245 + }, + { + "epoch": 2.996111471160078, + "grad_norm": 0.8244128823280334, + "learning_rate": 2.5854910639944165e-11, + "loss": 0.0684, + "step": 9246 + }, + { + "epoch": 2.9964355152300715, + "grad_norm": 0.8610013723373413, + "learning_rate": 2.203022533425747e-11, + "loss": 0.0778, + "step": 9247 + }, + { + "epoch": 2.996759559300065, + "grad_norm": 0.9346343278884888, + "learning_rate": 1.8511513130148317e-11, + "loss": 0.0835, + "step": 9248 + }, + { + "epoch": 2.9970836033700583, + "grad_norm": 0.8872416019439697, + "learning_rate": 1.529877445866079e-11, + "loss": 0.0726, + "step": 9249 + }, + { + "epoch": 2.9974076474400517, + "grad_norm": 0.9257415533065796, + "learning_rate": 1.2392009713091402e-11, + "loss": 0.0748, + "step": 9250 + }, + { + "epoch": 2.9977316915100456, + "grad_norm": 0.9216912984848022, + "learning_rate": 9.791219248711515e-12, + "loss": 0.075, + "step": 9251 + }, + { + "epoch": 2.998055735580039, + "grad_norm": 0.9601297378540039, + "learning_rate": 7.496403384155137e-12, + "loss": 0.0854, + "step": 9252 + }, + { + "epoch": 2.9983797796500324, + "grad_norm": 0.9359740614891052, + "learning_rate": 5.507562400308697e-12, + "loss": 0.0747, + "step": 9253 + }, + { + "epoch": 2.998703823720026, + "grad_norm": 0.8407431840896606, + "learning_rate": 3.8246965403110344e-12, + "loss": 0.067, + "step": 9254 + }, + { + "epoch": 2.9990278677900193, + "grad_norm": 0.8539666533470154, + "learning_rate": 2.4478060103860777e-12, + "loss": 0.0713, + "step": 9255 + }, + { + "epoch": 2.999351911860013, + "grad_norm": 0.881554365158081, + "learning_rate": 1.376890979287726e-12, + "loss": 0.0703, + "step": 9256 + }, + { + "epoch": 2.9996759559300066, + "grad_norm": 0.9617207646369934, + "learning_rate": 6.119515774671847e-13, + "loss": 0.0793, + "step": 9257 + }, + { + "epoch": 3.0, + "grad_norm": 0.8892857432365417, + "learning_rate": 1.529878990158551e-13, + "loss": 0.076, + "step": 9258 + } + ], + "logging_steps": 1.0, + "max_steps": 9258, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.876319338303901e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}