{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9400921658986174, "eval_steps": 500, "global_step": 108, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027649769585253458, "grad_norm": 6.218597135492551, "learning_rate": 0.0, "loss": 0.3225, "step": 1 }, { "epoch": 0.055299539170506916, "grad_norm": 6.047780940751843, "learning_rate": 9.090909090909091e-07, "loss": 0.3102, "step": 2 }, { "epoch": 0.08294930875576037, "grad_norm": 5.725628334215099, "learning_rate": 1.8181818181818183e-06, "loss": 0.3092, "step": 3 }, { "epoch": 0.11059907834101383, "grad_norm": 5.291280324265542, "learning_rate": 2.7272727272727272e-06, "loss": 0.3167, "step": 4 }, { "epoch": 0.1382488479262673, "grad_norm": 2.7599992875130943, "learning_rate": 3.6363636363636366e-06, "loss": 0.2646, "step": 5 }, { "epoch": 0.16589861751152074, "grad_norm": 3.4307446243159427, "learning_rate": 4.5454545454545455e-06, "loss": 0.2347, "step": 6 }, { "epoch": 0.1935483870967742, "grad_norm": 2.743173006209666, "learning_rate": 5.4545454545454545e-06, "loss": 0.2243, "step": 7 }, { "epoch": 0.22119815668202766, "grad_norm": 4.332341499527876, "learning_rate": 6.363636363636364e-06, "loss": 0.2741, "step": 8 }, { "epoch": 0.2488479262672811, "grad_norm": 2.150601224938458, "learning_rate": 7.272727272727273e-06, "loss": 0.2436, "step": 9 }, { "epoch": 0.2764976958525346, "grad_norm": 1.5762454338091747, "learning_rate": 8.181818181818183e-06, "loss": 0.2042, "step": 10 }, { "epoch": 0.30414746543778803, "grad_norm": 2.456237511748212, "learning_rate": 9.090909090909091e-06, "loss": 0.1996, "step": 11 }, { "epoch": 0.3317972350230415, "grad_norm": 1.9843238243820491, "learning_rate": 1e-05, "loss": 0.1839, "step": 12 }, { "epoch": 0.35944700460829493, "grad_norm": 2.7062318993272605, "learning_rate": 9.997377845227577e-06, "loss": 0.1578, "step": 13 }, { "epoch": 0.3870967741935484, "grad_norm": 1.3941400459380622, "learning_rate": 9.98951413118856e-06, "loss": 0.1593, "step": 14 }, { "epoch": 0.4147465437788018, "grad_norm": 1.1610695697921565, "learning_rate": 9.97641710583307e-06, "loss": 0.1675, "step": 15 }, { "epoch": 0.4423963133640553, "grad_norm": 1.0523290062211754, "learning_rate": 9.958100506132127e-06, "loss": 0.1726, "step": 16 }, { "epoch": 0.4700460829493088, "grad_norm": 1.0801667737571008, "learning_rate": 9.934583543669454e-06, "loss": 0.1829, "step": 17 }, { "epoch": 0.4976958525345622, "grad_norm": 1.3571254664283856, "learning_rate": 9.905890884491196e-06, "loss": 0.1527, "step": 18 }, { "epoch": 0.5253456221198156, "grad_norm": 1.4608505233360805, "learning_rate": 9.872052623234632e-06, "loss": 0.147, "step": 19 }, { "epoch": 0.5529953917050692, "grad_norm": 0.9738980854967625, "learning_rate": 9.833104251563058e-06, "loss": 0.1717, "step": 20 }, { "epoch": 0.5806451612903226, "grad_norm": 0.9511695004905902, "learning_rate": 9.789086620939936e-06, "loss": 0.154, "step": 21 }, { "epoch": 0.6082949308755761, "grad_norm": 0.9492770790673514, "learning_rate": 9.740045899781353e-06, "loss": 0.1486, "step": 22 }, { "epoch": 0.6359447004608295, "grad_norm": 1.662083098498058, "learning_rate": 9.68603352503172e-06, "loss": 0.1423, "step": 23 }, { "epoch": 0.663594470046083, "grad_norm": 1.0620441015348367, "learning_rate": 9.627106148213521e-06, "loss": 0.1414, "step": 24 }, { "epoch": 0.6912442396313364, "grad_norm": 1.5593131122473323, "learning_rate": 9.563325576007702e-06, "loss": 0.1274, "step": 25 }, { "epoch": 0.7188940092165899, "grad_norm": 0.913058028692136, "learning_rate": 9.494758705426978e-06, "loss": 0.1414, "step": 26 }, { "epoch": 0.7465437788018433, "grad_norm": 2.508172363862381, "learning_rate": 9.421477453650118e-06, "loss": 0.1629, "step": 27 }, { "epoch": 0.7741935483870968, "grad_norm": 2.7393725430348574, "learning_rate": 9.343558682590757e-06, "loss": 0.1356, "step": 28 }, { "epoch": 0.8018433179723502, "grad_norm": 6.864836486057846, "learning_rate": 9.261084118279846e-06, "loss": 0.1591, "step": 29 }, { "epoch": 0.8294930875576036, "grad_norm": 0.8774514371920219, "learning_rate": 9.174140265146356e-06, "loss": 0.1423, "step": 30 }, { "epoch": 0.8571428571428571, "grad_norm": 1.0409147093308013, "learning_rate": 9.082818315286054e-06, "loss": 0.1397, "step": 31 }, { "epoch": 0.8847926267281107, "grad_norm": 0.7615257724651999, "learning_rate": 8.987214052813605e-06, "loss": 0.157, "step": 32 }, { "epoch": 0.9124423963133641, "grad_norm": 0.8128642416450813, "learning_rate": 8.887427753398249e-06, "loss": 0.1489, "step": 33 }, { "epoch": 0.9400921658986175, "grad_norm": 0.6930096403019158, "learning_rate": 8.783564079088478e-06, "loss": 0.1432, "step": 34 }, { "epoch": 0.967741935483871, "grad_norm": 0.8773700569916818, "learning_rate": 8.675731968536004e-06, "loss": 0.1292, "step": 35 }, { "epoch": 0.9953917050691244, "grad_norm": 1.0254335401965728, "learning_rate": 8.564044522734147e-06, "loss": 0.1403, "step": 36 }, { "epoch": 1.0, "grad_norm": 1.0254335401965728, "learning_rate": 8.448618886390523e-06, "loss": 0.1451, "step": 37 }, { "epoch": 1.0276497695852536, "grad_norm": 1.8000941773216486, "learning_rate": 8.329576125058406e-06, "loss": 0.0861, "step": 38 }, { "epoch": 1.055299539170507, "grad_norm": 2.1523838043667936, "learning_rate": 8.207041098155701e-06, "loss": 0.093, "step": 39 }, { "epoch": 1.0829493087557605, "grad_norm": 0.8728738735253629, "learning_rate": 8.081142328004638e-06, "loss": 0.1009, "step": 40 }, { "epoch": 1.1105990783410138, "grad_norm": 0.8446208855675313, "learning_rate": 7.952011865029614e-06, "loss": 0.0962, "step": 41 }, { "epoch": 1.1382488479262673, "grad_norm": 0.583782311676644, "learning_rate": 7.819785149254534e-06, "loss": 0.0873, "step": 42 }, { "epoch": 1.1658986175115207, "grad_norm": 0.7773678602540318, "learning_rate": 7.68460086824492e-06, "loss": 0.0984, "step": 43 }, { "epoch": 1.1935483870967742, "grad_norm": 0.5920912387076327, "learning_rate": 7.546600811643816e-06, "loss": 0.0961, "step": 44 }, { "epoch": 1.2211981566820276, "grad_norm": 0.6278156435849434, "learning_rate": 7.405929722454026e-06, "loss": 0.0808, "step": 45 }, { "epoch": 1.2488479262672811, "grad_norm": 0.6735110117536915, "learning_rate": 7.262735145222696e-06, "loss": 0.09, "step": 46 }, { "epoch": 1.2764976958525347, "grad_norm": 0.6721245903010009, "learning_rate": 7.117167271287453e-06, "loss": 0.0901, "step": 47 }, { "epoch": 1.304147465437788, "grad_norm": 0.6581773002499689, "learning_rate": 6.969378781246436e-06, "loss": 0.0802, "step": 48 }, { "epoch": 1.3317972350230414, "grad_norm": 0.6154758737738647, "learning_rate": 6.819524684817439e-06, "loss": 0.0773, "step": 49 }, { "epoch": 1.359447004608295, "grad_norm": 0.6078121382769383, "learning_rate": 6.667762158254104e-06, "loss": 0.0796, "step": 50 }, { "epoch": 1.3870967741935485, "grad_norm": 0.5916042033246923, "learning_rate": 6.514250379489754e-06, "loss": 0.0797, "step": 51 }, { "epoch": 1.4147465437788018, "grad_norm": 0.6742233018841318, "learning_rate": 6.3591503611817155e-06, "loss": 0.0877, "step": 52 }, { "epoch": 1.4423963133640554, "grad_norm": 0.6254587000513546, "learning_rate": 6.202624781831269e-06, "loss": 0.0852, "step": 53 }, { "epoch": 1.4700460829493087, "grad_norm": 0.6461870128180857, "learning_rate": 6.044837815156377e-06, "loss": 0.0943, "step": 54 }, { "epoch": 1.4976958525345623, "grad_norm": 0.6189619126092755, "learning_rate": 5.885954957896115e-06, "loss": 0.0941, "step": 55 }, { "epoch": 1.5253456221198156, "grad_norm": 0.6656207681608218, "learning_rate": 5.726142856227453e-06, "loss": 0.0933, "step": 56 }, { "epoch": 1.5529953917050692, "grad_norm": 0.641201435235512, "learning_rate": 5.5655691309764225e-06, "loss": 0.0827, "step": 57 }, { "epoch": 1.5806451612903225, "grad_norm": 0.5520132793031652, "learning_rate": 5.404402201807022e-06, "loss": 0.0749, "step": 58 }, { "epoch": 1.608294930875576, "grad_norm": 0.5710686877367372, "learning_rate": 5.242811110572243e-06, "loss": 0.0778, "step": 59 }, { "epoch": 1.6359447004608296, "grad_norm": 0.6110053501356016, "learning_rate": 5.080965344012509e-06, "loss": 0.0844, "step": 60 }, { "epoch": 1.663594470046083, "grad_norm": 0.5836120939696657, "learning_rate": 4.919034655987493e-06, "loss": 0.0767, "step": 61 }, { "epoch": 1.6912442396313363, "grad_norm": 0.6258155158697727, "learning_rate": 4.757188889427761e-06, "loss": 0.0886, "step": 62 }, { "epoch": 1.7188940092165899, "grad_norm": 0.5949010863382941, "learning_rate": 4.59559779819298e-06, "loss": 0.0982, "step": 63 }, { "epoch": 1.7465437788018434, "grad_norm": 0.5665380658610257, "learning_rate": 4.434430869023579e-06, "loss": 0.0811, "step": 64 }, { "epoch": 1.7741935483870968, "grad_norm": 0.5999744212269682, "learning_rate": 4.27385714377255e-06, "loss": 0.0857, "step": 65 }, { "epoch": 1.80184331797235, "grad_norm": 0.5851633715161859, "learning_rate": 4.1140450421038865e-06, "loss": 0.07, "step": 66 }, { "epoch": 1.8294930875576036, "grad_norm": 0.5602466431740857, "learning_rate": 3.955162184843625e-06, "loss": 0.0868, "step": 67 }, { "epoch": 1.8571428571428572, "grad_norm": 0.6003026453668614, "learning_rate": 3.7973752181687336e-06, "loss": 0.0897, "step": 68 }, { "epoch": 1.8847926267281108, "grad_norm": 0.5828634522878124, "learning_rate": 3.6408496388182857e-06, "loss": 0.0811, "step": 69 }, { "epoch": 1.912442396313364, "grad_norm": 0.606858679369741, "learning_rate": 3.4857496205102475e-06, "loss": 0.085, "step": 70 }, { "epoch": 1.9400921658986174, "grad_norm": 0.6071679638417599, "learning_rate": 3.3322378417458985e-06, "loss": 0.089, "step": 71 }, { "epoch": 1.967741935483871, "grad_norm": 0.5798347707839837, "learning_rate": 3.180475315182563e-06, "loss": 0.069, "step": 72 }, { "epoch": 1.9953917050691246, "grad_norm": 0.485811481914557, "learning_rate": 3.0306212187535653e-06, "loss": 0.074, "step": 73 }, { "epoch": 2.0, "grad_norm": 0.485811481914557, "learning_rate": 2.882832728712551e-06, "loss": 0.0988, "step": 74 }, { "epoch": 2.0276497695852536, "grad_norm": 1.5333123289167132, "learning_rate": 2.7372648547773063e-06, "loss": 0.0533, "step": 75 }, { "epoch": 2.055299539170507, "grad_norm": 0.47653360582102766, "learning_rate": 2.594070277545975e-06, "loss": 0.0456, "step": 76 }, { "epoch": 2.0829493087557602, "grad_norm": 0.44653376480891205, "learning_rate": 2.4533991883561868e-06, "loss": 0.0433, "step": 77 }, { "epoch": 2.110599078341014, "grad_norm": 0.45184100467810834, "learning_rate": 2.315399131755081e-06, "loss": 0.0434, "step": 78 }, { "epoch": 2.1382488479262673, "grad_norm": 0.46406715803678933, "learning_rate": 2.1802148507454675e-06, "loss": 0.0475, "step": 79 }, { "epoch": 2.165898617511521, "grad_norm": 0.44470042182732816, "learning_rate": 2.0479881349703885e-06, "loss": 0.0485, "step": 80 }, { "epoch": 2.193548387096774, "grad_norm": 0.4541836919405326, "learning_rate": 1.9188576719953635e-06, "loss": 0.0453, "step": 81 }, { "epoch": 2.2211981566820276, "grad_norm": 0.41945408404533757, "learning_rate": 1.7929589018443016e-06, "loss": 0.0476, "step": 82 }, { "epoch": 2.248847926267281, "grad_norm": 0.44239519120852955, "learning_rate": 1.6704238749415958e-06, "loss": 0.0471, "step": 83 }, { "epoch": 2.2764976958525347, "grad_norm": 0.4072713113001334, "learning_rate": 1.5513811136094786e-06, "loss": 0.0441, "step": 84 }, { "epoch": 2.3041474654377883, "grad_norm": 0.40529452752820916, "learning_rate": 1.4359554772658551e-06, "loss": 0.048, "step": 85 }, { "epoch": 2.3317972350230414, "grad_norm": 0.3932405667255203, "learning_rate": 1.3242680314639995e-06, "loss": 0.0392, "step": 86 }, { "epoch": 2.359447004608295, "grad_norm": 0.4117890616435653, "learning_rate": 1.2164359209115235e-06, "loss": 0.0409, "step": 87 }, { "epoch": 2.3870967741935485, "grad_norm": 0.46974554587042566, "learning_rate": 1.1125722466017547e-06, "loss": 0.0393, "step": 88 }, { "epoch": 2.4147465437788016, "grad_norm": 0.4350219213576534, "learning_rate": 1.012785947186397e-06, "loss": 0.0475, "step": 89 }, { "epoch": 2.442396313364055, "grad_norm": 0.49090032041418447, "learning_rate": 9.171816847139447e-07, "loss": 0.0424, "step": 90 }, { "epoch": 2.4700460829493087, "grad_norm": 0.42408815275335165, "learning_rate": 8.258597348536452e-07, "loss": 0.0413, "step": 91 }, { "epoch": 2.4976958525345623, "grad_norm": 0.40399622782019035, "learning_rate": 7.389158817201541e-07, "loss": 0.0388, "step": 92 }, { "epoch": 2.525345622119816, "grad_norm": 0.38833269245723195, "learning_rate": 6.564413174092443e-07, "loss": 0.037, "step": 93 }, { "epoch": 2.5529953917050694, "grad_norm": 0.389374030767258, "learning_rate": 5.785225463498828e-07, "loss": 0.0404, "step": 94 }, { "epoch": 2.5806451612903225, "grad_norm": 0.42570497393280116, "learning_rate": 5.05241294573024e-07, "loss": 0.0398, "step": 95 }, { "epoch": 2.608294930875576, "grad_norm": 0.4547617441003299, "learning_rate": 4.3667442399229985e-07, "loss": 0.0364, "step": 96 }, { "epoch": 2.6359447004608296, "grad_norm": 0.4076559649282273, "learning_rate": 3.728938517864794e-07, "loss": 0.0448, "step": 97 }, { "epoch": 2.6635944700460827, "grad_norm": 0.46055342435769453, "learning_rate": 3.1396647496828245e-07, "loss": 0.0365, "step": 98 }, { "epoch": 2.6912442396313363, "grad_norm": 0.4056353196574725, "learning_rate": 2.599541002186479e-07, "loss": 0.0387, "step": 99 }, { "epoch": 2.71889400921659, "grad_norm": 0.3970285169859869, "learning_rate": 2.109133790600648e-07, "loss": 0.0453, "step": 100 }, { "epoch": 2.7465437788018434, "grad_norm": 0.39424163292147957, "learning_rate": 1.6689574843694433e-07, "loss": 0.0353, "step": 101 }, { "epoch": 2.774193548387097, "grad_norm": 0.4031654414089018, "learning_rate": 1.2794737676536993e-07, "loss": 0.0397, "step": 102 }, { "epoch": 2.80184331797235, "grad_norm": 0.40508729456084114, "learning_rate": 9.410911550880474e-08, "loss": 0.0391, "step": 103 }, { "epoch": 2.8294930875576036, "grad_norm": 0.3826393754876196, "learning_rate": 6.54164563305465e-08, "loss": 0.038, "step": 104 }, { "epoch": 2.857142857142857, "grad_norm": 0.4337313927895487, "learning_rate": 4.189949386787462e-08, "loss": 0.0418, "step": 105 }, { "epoch": 2.8847926267281108, "grad_norm": 0.37776947123435684, "learning_rate": 2.358289416693027e-08, "loss": 0.0355, "step": 106 }, { "epoch": 2.912442396313364, "grad_norm": 0.36823889024056433, "learning_rate": 1.0485868811441757e-08, "loss": 0.0384, "step": 107 }, { "epoch": 2.9400921658986174, "grad_norm": 0.4725582160373693, "learning_rate": 2.6221547724253337e-09, "loss": 0.0385, "step": 108 } ], "logging_steps": 1, "max_steps": 108, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2693023331503309e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }