| { | |
| "best_metric": 1.0107625722885132, | |
| "best_model_checkpoint": "/data/Andre/Ref-Finder-Mistral/checkpoint-3170", | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 3170, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.031545741324921134, | |
| "grad_norm": 0.5029881000518799, | |
| "learning_rate": 5e-05, | |
| "loss": 1.7154, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06309148264984227, | |
| "grad_norm": 0.3064497709274292, | |
| "learning_rate": 5e-05, | |
| "loss": 1.5963, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0946372239747634, | |
| "grad_norm": 0.3118360638618469, | |
| "learning_rate": 5e-05, | |
| "loss": 1.5074, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.12618296529968454, | |
| "grad_norm": 0.33069083094596863, | |
| "learning_rate": 5e-05, | |
| "loss": 1.4047, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.15772870662460567, | |
| "grad_norm": 0.2797032296657562, | |
| "learning_rate": 5e-05, | |
| "loss": 1.4167, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1892744479495268, | |
| "grad_norm": 0.3190701901912689, | |
| "learning_rate": 5e-05, | |
| "loss": 1.3361, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.22082018927444794, | |
| "grad_norm": 0.3070685863494873, | |
| "learning_rate": 5e-05, | |
| "loss": 1.2655, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.25236593059936907, | |
| "grad_norm": 0.3203960359096527, | |
| "learning_rate": 5e-05, | |
| "loss": 1.2295, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.28391167192429023, | |
| "grad_norm": 0.30132830142974854, | |
| "learning_rate": 5e-05, | |
| "loss": 1.2277, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.31545741324921134, | |
| "grad_norm": 0.3356678783893585, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1848, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3470031545741325, | |
| "grad_norm": 0.3275781273841858, | |
| "learning_rate": 5e-05, | |
| "loss": 1.183, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3785488958990536, | |
| "grad_norm": 0.30640777945518494, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1488, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.41009463722397477, | |
| "grad_norm": 0.5068441033363342, | |
| "learning_rate": 5e-05, | |
| "loss": 1.147, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4416403785488959, | |
| "grad_norm": 0.310285747051239, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1908, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.47318611987381703, | |
| "grad_norm": 0.38677722215652466, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1319, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.5047318611987381, | |
| "grad_norm": 0.3474641740322113, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1215, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5362776025236593, | |
| "grad_norm": 0.37211593985557556, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1503, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5678233438485805, | |
| "grad_norm": 0.3207016885280609, | |
| "learning_rate": 5e-05, | |
| "loss": 1.182, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5993690851735016, | |
| "grad_norm": 0.34209126234054565, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1298, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6309148264984227, | |
| "grad_norm": 0.3956719934940338, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1443, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6624605678233438, | |
| "grad_norm": 0.34193623065948486, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1148, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.694006309148265, | |
| "grad_norm": 0.3550577759742737, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1091, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7255520504731862, | |
| "grad_norm": 0.34275463223457336, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1185, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7570977917981072, | |
| "grad_norm": 0.36972326040267944, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1388, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7886435331230284, | |
| "grad_norm": 0.36260902881622314, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0982, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8201892744479495, | |
| "grad_norm": 0.35559672117233276, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1032, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8517350157728707, | |
| "grad_norm": 0.3544253706932068, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1239, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8832807570977917, | |
| "grad_norm": 0.3803843855857849, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1006, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.9148264984227129, | |
| "grad_norm": 0.3776736855506897, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1001, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9463722397476341, | |
| "grad_norm": 0.4238007068634033, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0968, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9779179810725552, | |
| "grad_norm": 0.4062643051147461, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1188, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.1006102561950684, | |
| "eval_runtime": 66.4381, | |
| "eval_samples_per_second": 4.786, | |
| "eval_steps_per_second": 0.602, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.0094637223974763, | |
| "grad_norm": 0.3485482633113861, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0843, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.0410094637223974, | |
| "grad_norm": 0.40558719635009766, | |
| "learning_rate": 5e-05, | |
| "loss": 1.097, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.0725552050473186, | |
| "grad_norm": 0.4074763059616089, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0697, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.1041009463722398, | |
| "grad_norm": 0.40961453318595886, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0635, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.135646687697161, | |
| "grad_norm": 0.3752257227897644, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0936, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.167192429022082, | |
| "grad_norm": 0.3867760896682739, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0782, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.1987381703470033, | |
| "grad_norm": 0.4072268307209015, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0574, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.2302839116719242, | |
| "grad_norm": 0.3942580819129944, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1081, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.2618296529968454, | |
| "grad_norm": 0.4262318015098572, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0821, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2933753943217665, | |
| "grad_norm": 0.39012083411216736, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0712, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.3249211356466877, | |
| "grad_norm": 0.4160712659358978, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1106, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.3564668769716088, | |
| "grad_norm": 0.3966641128063202, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0411, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.38801261829653, | |
| "grad_norm": 0.3720882833003998, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0815, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.4195583596214512, | |
| "grad_norm": 0.396207332611084, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0462, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.4511041009463723, | |
| "grad_norm": 0.38164132833480835, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0891, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.4826498422712935, | |
| "grad_norm": 0.38896164298057556, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0905, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.5141955835962144, | |
| "grad_norm": 0.4327830374240875, | |
| "learning_rate": 5e-05, | |
| "loss": 1.1205, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.5457413249211358, | |
| "grad_norm": 0.423364520072937, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0477, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.5772870662460567, | |
| "grad_norm": 0.4212876558303833, | |
| "learning_rate": 5e-05, | |
| "loss": 1.12, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.608832807570978, | |
| "grad_norm": 0.3814271092414856, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0695, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.640378548895899, | |
| "grad_norm": 0.3973582983016968, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0832, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.6719242902208202, | |
| "grad_norm": 0.4016555845737457, | |
| "learning_rate": 5e-05, | |
| "loss": 1.077, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.7034700315457414, | |
| "grad_norm": 0.4084228575229645, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0674, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.7350157728706623, | |
| "grad_norm": 0.4218040406703949, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0464, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.7665615141955837, | |
| "grad_norm": 0.3857240080833435, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0656, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.7981072555205047, | |
| "grad_norm": 0.3926863968372345, | |
| "learning_rate": 5e-05, | |
| "loss": 1.056, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.8296529968454258, | |
| "grad_norm": 0.4352160096168518, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0443, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.861198738170347, | |
| "grad_norm": 0.4079754650592804, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0502, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.8927444794952681, | |
| "grad_norm": 0.40210971236228943, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0613, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.9242902208201893, | |
| "grad_norm": 0.3993563950061798, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0341, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.9558359621451105, | |
| "grad_norm": 0.47853732109069824, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0071, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.9873817034700316, | |
| "grad_norm": 0.42926380038261414, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0383, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.0646495819091797, | |
| "eval_runtime": 66.4837, | |
| "eval_samples_per_second": 4.783, | |
| "eval_steps_per_second": 0.602, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 2.0189274447949526, | |
| "grad_norm": 0.3886430561542511, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0667, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.050473186119874, | |
| "grad_norm": 0.4253116846084595, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0323, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.082018927444795, | |
| "grad_norm": 0.40994375944137573, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0027, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.1135646687697163, | |
| "grad_norm": 0.3847936689853668, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0022, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.145110410094637, | |
| "grad_norm": 0.43215593695640564, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0564, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.176656151419558, | |
| "grad_norm": 0.4463648498058319, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0277, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.2082018927444795, | |
| "grad_norm": 0.42896410822868347, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0466, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.2397476340694005, | |
| "grad_norm": 0.4028797149658203, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0588, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.271293375394322, | |
| "grad_norm": 0.4177733361721039, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0519, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.302839116719243, | |
| "grad_norm": 0.42829203605651855, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0202, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.334384858044164, | |
| "grad_norm": 0.5054190158843994, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9972, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.365930599369085, | |
| "grad_norm": 0.4306070804595947, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0412, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.3974763406940065, | |
| "grad_norm": 0.443590372800827, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0424, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.4290220820189274, | |
| "grad_norm": 0.4287286400794983, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0331, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.4605678233438484, | |
| "grad_norm": 0.39775350689888, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0454, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.4921135646687698, | |
| "grad_norm": 0.4093973636627197, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0442, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.5236593059936907, | |
| "grad_norm": 0.45389777421951294, | |
| "learning_rate": 5e-05, | |
| "loss": 1.024, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.555205047318612, | |
| "grad_norm": 0.428648442029953, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0407, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.586750788643533, | |
| "grad_norm": 0.41237714886665344, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0159, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.6182965299684544, | |
| "grad_norm": 0.42067545652389526, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0347, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.6498422712933754, | |
| "grad_norm": 0.4184909462928772, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0337, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.6813880126182967, | |
| "grad_norm": 0.414995014667511, | |
| "learning_rate": 5e-05, | |
| "loss": 1.092, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.7129337539432177, | |
| "grad_norm": 0.4137355089187622, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0514, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.7444794952681386, | |
| "grad_norm": 0.45818576216697693, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0225, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.77602523659306, | |
| "grad_norm": 0.455785870552063, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0483, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.807570977917981, | |
| "grad_norm": 0.4084894061088562, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9846, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.8391167192429023, | |
| "grad_norm": 0.4103436768054962, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0217, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.8706624605678233, | |
| "grad_norm": 0.40420758724212646, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0399, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.9022082018927446, | |
| "grad_norm": 0.5487234592437744, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0394, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.9337539432176656, | |
| "grad_norm": 0.47695016860961914, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0445, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.965299684542587, | |
| "grad_norm": 0.41771531105041504, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0377, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.996845425867508, | |
| "grad_norm": 0.5724055767059326, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0193, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.0459696054458618, | |
| "eval_runtime": 66.4844, | |
| "eval_samples_per_second": 4.783, | |
| "eval_steps_per_second": 0.602, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 3.028391167192429, | |
| "grad_norm": 0.481629878282547, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9837, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 3.0599369085173502, | |
| "grad_norm": 0.42061686515808105, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0083, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 3.091482649842271, | |
| "grad_norm": 0.4234108030796051, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0249, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 3.1230283911671926, | |
| "grad_norm": 0.43123263120651245, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0319, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 3.1545741324921135, | |
| "grad_norm": 0.4268761873245239, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0067, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.186119873817035, | |
| "grad_norm": 0.41744470596313477, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0316, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 3.217665615141956, | |
| "grad_norm": 0.46088990569114685, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9993, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 3.249211356466877, | |
| "grad_norm": 0.43155333399772644, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0275, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 3.280757097791798, | |
| "grad_norm": 0.4405035972595215, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0014, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 3.312302839116719, | |
| "grad_norm": 0.466680645942688, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0066, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.3438485804416405, | |
| "grad_norm": 0.4462493360042572, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0081, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 3.3753943217665614, | |
| "grad_norm": 0.4766935706138611, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9957, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 3.406940063091483, | |
| "grad_norm": 0.4287005364894867, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0022, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 3.4384858044164037, | |
| "grad_norm": 0.43795284628868103, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0248, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 3.470031545741325, | |
| "grad_norm": 0.4681282937526703, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0241, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.501577287066246, | |
| "grad_norm": 0.44735008478164673, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0209, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 3.5331230283911674, | |
| "grad_norm": 0.4473140835762024, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9824, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 3.5646687697160884, | |
| "grad_norm": 0.44602036476135254, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0095, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 3.5962145110410093, | |
| "grad_norm": 0.455937922000885, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0045, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 3.6277602523659307, | |
| "grad_norm": 0.416535347700119, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0293, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.6593059936908516, | |
| "grad_norm": 0.454054057598114, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9761, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 3.690851735015773, | |
| "grad_norm": 0.4191015958786011, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0275, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 3.722397476340694, | |
| "grad_norm": 0.45472997426986694, | |
| "learning_rate": 5e-05, | |
| "loss": 0.975, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 3.753943217665615, | |
| "grad_norm": 0.429548442363739, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9638, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 3.7854889589905363, | |
| "grad_norm": 0.4479614198207855, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0034, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.8170347003154577, | |
| "grad_norm": 0.41878965497016907, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0102, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 3.8485804416403786, | |
| "grad_norm": 0.42527589201927185, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9746, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 3.8801261829652995, | |
| "grad_norm": 0.4646793007850647, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0139, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 3.911671924290221, | |
| "grad_norm": 0.41096052527427673, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0247, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 3.943217665615142, | |
| "grad_norm": 0.4595187306404114, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0149, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 3.9747634069400632, | |
| "grad_norm": 0.4228056073188782, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0199, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.032894492149353, | |
| "eval_runtime": 66.5042, | |
| "eval_samples_per_second": 4.782, | |
| "eval_steps_per_second": 0.601, | |
| "step": 1268 | |
| }, | |
| { | |
| "epoch": 4.006309148264984, | |
| "grad_norm": 0.4469398558139801, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9636, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 4.037854889589905, | |
| "grad_norm": 0.4484340250492096, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9827, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 4.069400630914826, | |
| "grad_norm": 0.4563854932785034, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9877, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 4.100946372239748, | |
| "grad_norm": 0.44243761897087097, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9872, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.132492113564669, | |
| "grad_norm": 0.448011189699173, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0118, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 4.16403785488959, | |
| "grad_norm": 0.4259743094444275, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0109, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 4.195583596214511, | |
| "grad_norm": 0.456064909696579, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9552, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 4.2271293375394325, | |
| "grad_norm": 0.49178850650787354, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9976, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 4.2586750788643535, | |
| "grad_norm": 0.4512215852737427, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9889, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.290220820189274, | |
| "grad_norm": 0.4504569172859192, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9675, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 4.321766561514195, | |
| "grad_norm": 0.4347565472126007, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9904, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 4.353312302839116, | |
| "grad_norm": 0.4649258852005005, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9832, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 4.384858044164038, | |
| "grad_norm": 0.4316873252391815, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9952, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 4.416403785488959, | |
| "grad_norm": 0.4411141872406006, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9743, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.44794952681388, | |
| "grad_norm": 0.46868711709976196, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9737, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 4.479495268138801, | |
| "grad_norm": 0.47713035345077515, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9646, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 4.511041009463723, | |
| "grad_norm": 0.4720157980918884, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9645, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 4.542586750788644, | |
| "grad_norm": 0.4508207440376282, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9669, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 4.574132492113565, | |
| "grad_norm": 0.4645206928253174, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9945, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.605678233438486, | |
| "grad_norm": 0.45657721161842346, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0193, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 4.6372239747634065, | |
| "grad_norm": 0.48605337738990784, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9796, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 4.668769716088328, | |
| "grad_norm": 0.4564870595932007, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0164, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 4.700315457413249, | |
| "grad_norm": 0.46090081334114075, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9854, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 4.73186119873817, | |
| "grad_norm": 0.4782868027687073, | |
| "learning_rate": 5e-05, | |
| "loss": 0.985, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 4.763406940063091, | |
| "grad_norm": 0.45532533526420593, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9555, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 4.794952681388013, | |
| "grad_norm": 0.4831511676311493, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9775, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 4.826498422712934, | |
| "grad_norm": 0.4660089612007141, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9805, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 4.858044164037855, | |
| "grad_norm": 0.47603532671928406, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0222, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 4.889589905362776, | |
| "grad_norm": 0.4162875711917877, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9867, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 4.921135646687697, | |
| "grad_norm": 0.4378200173377991, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9762, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 4.952681388012619, | |
| "grad_norm": 0.43556976318359375, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9557, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 4.9842271293375395, | |
| "grad_norm": 0.4165530204772949, | |
| "learning_rate": 5e-05, | |
| "loss": 0.998, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 1.02390456199646, | |
| "eval_runtime": 66.4974, | |
| "eval_samples_per_second": 4.782, | |
| "eval_steps_per_second": 0.602, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 5.0157728706624605, | |
| "grad_norm": 0.557310938835144, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9705, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 5.047318611987381, | |
| "grad_norm": 0.47156888246536255, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9629, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 5.078864353312303, | |
| "grad_norm": 0.51046222448349, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9429, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 5.110410094637224, | |
| "grad_norm": 0.48319852352142334, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9637, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 5.141955835962145, | |
| "grad_norm": 0.45673197507858276, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9233, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 5.173501577287066, | |
| "grad_norm": 0.5032113194465637, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9486, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 5.205047318611987, | |
| "grad_norm": 0.449439138174057, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9107, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 5.236593059936909, | |
| "grad_norm": 0.4683469831943512, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9608, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 5.26813880126183, | |
| "grad_norm": 0.48362118005752563, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9246, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 5.299684542586751, | |
| "grad_norm": 0.4709579050540924, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9958, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 5.331230283911672, | |
| "grad_norm": 0.4630713164806366, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9837, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 5.3627760252365935, | |
| "grad_norm": 0.475508451461792, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0084, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 5.394321766561514, | |
| "grad_norm": 0.5352875590324402, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9595, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 5.425867507886435, | |
| "grad_norm": 0.5087634325027466, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9697, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 5.457413249211356, | |
| "grad_norm": 0.4558835029602051, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9609, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 5.488958990536277, | |
| "grad_norm": 0.5090092420578003, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9732, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 5.520504731861199, | |
| "grad_norm": 0.48192793130874634, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9917, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 5.55205047318612, | |
| "grad_norm": 0.4428229033946991, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9607, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 5.583596214511041, | |
| "grad_norm": 0.4858005940914154, | |
| "learning_rate": 5e-05, | |
| "loss": 0.994, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 5.615141955835962, | |
| "grad_norm": 0.4797442555427551, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9554, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 5.646687697160884, | |
| "grad_norm": 0.4797378480434418, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9486, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 5.678233438485805, | |
| "grad_norm": 0.4509980082511902, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9693, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 5.709779179810726, | |
| "grad_norm": 0.45232152938842773, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9622, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 5.7413249211356465, | |
| "grad_norm": 0.49943023920059204, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0051, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 5.7728706624605675, | |
| "grad_norm": 0.4827818274497986, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9536, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 5.804416403785489, | |
| "grad_norm": 0.4689510464668274, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9706, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 5.83596214511041, | |
| "grad_norm": 0.47188493609428406, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9582, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 5.867507886435331, | |
| "grad_norm": 0.47195523977279663, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9688, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 5.899053627760252, | |
| "grad_norm": 0.4700336158275604, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9399, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 5.930599369085174, | |
| "grad_norm": 0.5036072731018066, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9726, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 5.962145110410095, | |
| "grad_norm": 0.5032414197921753, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9426, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 5.993690851735016, | |
| "grad_norm": 0.4505554139614105, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9911, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 1.0179320573806763, | |
| "eval_runtime": 66.4482, | |
| "eval_samples_per_second": 4.786, | |
| "eval_steps_per_second": 0.602, | |
| "step": 1902 | |
| }, | |
| { | |
| "epoch": 6.025236593059937, | |
| "grad_norm": 0.48737627267837524, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9504, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 6.056782334384858, | |
| "grad_norm": 0.520263135433197, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9411, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 6.0883280757097795, | |
| "grad_norm": 0.4799466133117676, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9448, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 6.1198738170347005, | |
| "grad_norm": 0.49849933385849, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9511, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 6.151419558359621, | |
| "grad_norm": 0.4995006322860718, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9315, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 6.182965299684542, | |
| "grad_norm": 0.5434730648994446, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9509, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 6.214511041009464, | |
| "grad_norm": 0.5055322647094727, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9449, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 6.246056782334385, | |
| "grad_norm": 0.4768029749393463, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9356, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 6.277602523659306, | |
| "grad_norm": 0.5039747357368469, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9478, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 6.309148264984227, | |
| "grad_norm": 0.5042532086372375, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8941, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 6.340694006309148, | |
| "grad_norm": 0.5117079615592957, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9081, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 6.37223974763407, | |
| "grad_norm": 0.5625054836273193, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9588, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 6.403785488958991, | |
| "grad_norm": 0.49397581815719604, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9405, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 6.435331230283912, | |
| "grad_norm": 0.5129591226577759, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9357, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 6.466876971608833, | |
| "grad_norm": 0.5299010276794434, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9425, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 6.498422712933754, | |
| "grad_norm": 0.512342095375061, | |
| "learning_rate": 5e-05, | |
| "loss": 0.936, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 6.529968454258675, | |
| "grad_norm": 0.5136451721191406, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9549, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 6.561514195583596, | |
| "grad_norm": 0.6025319695472717, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9705, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 6.593059936908517, | |
| "grad_norm": 0.48766204714775085, | |
| "learning_rate": 5e-05, | |
| "loss": 0.96, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 6.624605678233438, | |
| "grad_norm": 0.4721720516681671, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9457, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 6.65615141955836, | |
| "grad_norm": 0.48331397771835327, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9105, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 6.687697160883281, | |
| "grad_norm": 0.4890565872192383, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9859, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 6.719242902208202, | |
| "grad_norm": 0.5263992547988892, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9659, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 6.750788643533123, | |
| "grad_norm": 0.45187363028526306, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9319, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 6.782334384858045, | |
| "grad_norm": 0.4888645112514496, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9623, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 6.813880126182966, | |
| "grad_norm": 0.48433786630630493, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9601, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 6.8454258675078865, | |
| "grad_norm": 0.5414565205574036, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9381, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 6.8769716088328074, | |
| "grad_norm": 0.47471919655799866, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9443, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 6.908517350157728, | |
| "grad_norm": 0.4787106513977051, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9331, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 6.94006309148265, | |
| "grad_norm": 0.4515725076198578, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9461, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 6.971608832807571, | |
| "grad_norm": 0.4714019000530243, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9587, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 1.0130518674850464, | |
| "eval_runtime": 66.5051, | |
| "eval_samples_per_second": 4.782, | |
| "eval_steps_per_second": 0.601, | |
| "step": 2219 | |
| }, | |
| { | |
| "epoch": 7.003154574132492, | |
| "grad_norm": 0.44392409920692444, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9691, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 7.034700315457413, | |
| "grad_norm": 0.538865864276886, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9095, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 7.066246056782334, | |
| "grad_norm": 0.5173049569129944, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9281, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 7.097791798107256, | |
| "grad_norm": 0.4751831293106079, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9094, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 7.129337539432177, | |
| "grad_norm": 0.5221697092056274, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9148, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 7.160883280757098, | |
| "grad_norm": 0.5088801383972168, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9383, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 7.192429022082019, | |
| "grad_norm": 0.5191715359687805, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9187, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 7.2239747634069404, | |
| "grad_norm": 0.5438238382339478, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9192, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 7.255520504731861, | |
| "grad_norm": 0.5197346210479736, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9226, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 7.287066246056782, | |
| "grad_norm": 0.5286086797714233, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9009, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 7.318611987381703, | |
| "grad_norm": 0.4977555274963379, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9524, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 7.350157728706624, | |
| "grad_norm": 0.5014932751655579, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9356, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 7.381703470031546, | |
| "grad_norm": 0.5207954049110413, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9095, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 7.413249211356467, | |
| "grad_norm": 0.512366771697998, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9357, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 7.444794952681388, | |
| "grad_norm": 0.5742561221122742, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9188, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 7.476340694006309, | |
| "grad_norm": 0.5032497644424438, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9624, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 7.50788643533123, | |
| "grad_norm": 0.6190054416656494, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8954, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 7.539432176656152, | |
| "grad_norm": 0.5226176977157593, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9212, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 7.570977917981073, | |
| "grad_norm": 0.5045409202575684, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9459, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 7.6025236593059935, | |
| "grad_norm": 0.48342952132225037, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9306, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 7.634069400630915, | |
| "grad_norm": 0.48288047313690186, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9217, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 7.665615141955836, | |
| "grad_norm": 0.5119076371192932, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9594, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 7.697160883280757, | |
| "grad_norm": 0.5182865262031555, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9158, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 7.728706624605678, | |
| "grad_norm": 0.5085521340370178, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9249, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 7.760252365930599, | |
| "grad_norm": 0.49291595816612244, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9128, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 7.79179810725552, | |
| "grad_norm": 0.5067439675331116, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8993, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 7.823343848580442, | |
| "grad_norm": 0.49475356936454773, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9313, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 7.854889589905363, | |
| "grad_norm": 0.5028258562088013, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9459, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 7.886435331230284, | |
| "grad_norm": 0.482112854719162, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9243, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 7.917981072555205, | |
| "grad_norm": 0.5285838842391968, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9847, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 7.9495268138801265, | |
| "grad_norm": 0.5217479467391968, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9172, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 7.981072555205047, | |
| "grad_norm": 0.47651416063308716, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9003, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 1.0111174583435059, | |
| "eval_runtime": 66.5497, | |
| "eval_samples_per_second": 4.778, | |
| "eval_steps_per_second": 0.601, | |
| "step": 2536 | |
| }, | |
| { | |
| "epoch": 8.012618296529968, | |
| "grad_norm": 0.5173642635345459, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9408, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 8.04416403785489, | |
| "grad_norm": 0.5017074346542358, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9028, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 8.07570977917981, | |
| "grad_norm": 0.5437857508659363, | |
| "learning_rate": 5e-05, | |
| "loss": 0.916, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 8.107255520504731, | |
| "grad_norm": 0.485762357711792, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8698, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 8.138801261829652, | |
| "grad_norm": 0.5231262445449829, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9004, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 8.170347003154575, | |
| "grad_norm": 0.49633580446243286, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9159, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 8.201892744479496, | |
| "grad_norm": 0.5477921366691589, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9015, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 8.233438485804417, | |
| "grad_norm": 0.5651286840438843, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8925, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 8.264984227129338, | |
| "grad_norm": 0.5210168957710266, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9172, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 8.296529968454259, | |
| "grad_norm": 0.5071650743484497, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9172, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 8.32807570977918, | |
| "grad_norm": 0.5585223436355591, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9234, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 8.3596214511041, | |
| "grad_norm": 0.5303429961204529, | |
| "learning_rate": 5e-05, | |
| "loss": 0.896, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 8.391167192429021, | |
| "grad_norm": 0.5033040046691895, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9431, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 8.422712933753942, | |
| "grad_norm": 0.4882967472076416, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9103, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 8.454258675078865, | |
| "grad_norm": 0.5938067436218262, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9067, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 8.485804416403786, | |
| "grad_norm": 0.5606987476348877, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9177, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 8.517350157728707, | |
| "grad_norm": 0.5056515336036682, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8924, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 8.548895899053628, | |
| "grad_norm": 0.5208995938301086, | |
| "learning_rate": 5e-05, | |
| "loss": 0.917, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 8.580441640378549, | |
| "grad_norm": 0.575134813785553, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9132, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 8.61198738170347, | |
| "grad_norm": 0.5263710021972656, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9162, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 8.64353312302839, | |
| "grad_norm": 0.5916036367416382, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9147, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 8.675078864353312, | |
| "grad_norm": 0.5610800981521606, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9022, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 8.706624605678233, | |
| "grad_norm": 0.5309184193611145, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8736, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 8.738170347003155, | |
| "grad_norm": 0.5035881996154785, | |
| "learning_rate": 5e-05, | |
| "loss": 0.898, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 8.769716088328076, | |
| "grad_norm": 0.5445141196250916, | |
| "learning_rate": 5e-05, | |
| "loss": 0.903, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 8.801261829652997, | |
| "grad_norm": 0.5459301471710205, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9124, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 8.832807570977918, | |
| "grad_norm": 0.5099250078201294, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9132, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 8.864353312302839, | |
| "grad_norm": 0.5143303275108337, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9085, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 8.89589905362776, | |
| "grad_norm": 0.5371480584144592, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9463, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 8.927444794952681, | |
| "grad_norm": 0.517353892326355, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8911, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 8.958990536277602, | |
| "grad_norm": 0.5601980090141296, | |
| "learning_rate": 5e-05, | |
| "loss": 0.915, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 8.990536277602523, | |
| "grad_norm": 0.5473778247833252, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9254, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 1.0110243558883667, | |
| "eval_runtime": 66.5208, | |
| "eval_samples_per_second": 4.78, | |
| "eval_steps_per_second": 0.601, | |
| "step": 2853 | |
| }, | |
| { | |
| "epoch": 9.022082018927446, | |
| "grad_norm": 0.5774141550064087, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8731, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 9.053627760252366, | |
| "grad_norm": 0.5381526350975037, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9125, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 9.085173501577287, | |
| "grad_norm": 0.5414624810218811, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8838, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 9.116719242902208, | |
| "grad_norm": 0.526127815246582, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8709, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 9.14826498422713, | |
| "grad_norm": 0.5719351768493652, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8976, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 9.17981072555205, | |
| "grad_norm": 0.6119252443313599, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9006, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 9.211356466876971, | |
| "grad_norm": 0.5286473035812378, | |
| "learning_rate": 5e-05, | |
| "loss": 0.873, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 9.242902208201892, | |
| "grad_norm": 0.5602397918701172, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9113, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 9.274447949526813, | |
| "grad_norm": 0.5757038593292236, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8967, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 9.305993690851736, | |
| "grad_norm": 0.5797662138938904, | |
| "learning_rate": 5e-05, | |
| "loss": 0.921, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 9.337539432176657, | |
| "grad_norm": 0.5598446726799011, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9121, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 9.369085173501578, | |
| "grad_norm": 0.5119657516479492, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8748, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 9.400630914826499, | |
| "grad_norm": 0.5484170317649841, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8971, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 9.43217665615142, | |
| "grad_norm": 0.5351391434669495, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8466, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 9.46372239747634, | |
| "grad_norm": 0.5337589979171753, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8986, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 9.495268138801261, | |
| "grad_norm": 0.5773183703422546, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9001, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 9.526813880126182, | |
| "grad_norm": 0.6163984537124634, | |
| "learning_rate": 5e-05, | |
| "loss": 0.902, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 9.558359621451103, | |
| "grad_norm": 0.5879700183868408, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8855, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 9.589905362776026, | |
| "grad_norm": 0.5596455335617065, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9052, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 9.621451104100947, | |
| "grad_norm": 0.5862036943435669, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9092, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 9.652996845425868, | |
| "grad_norm": 0.5491965413093567, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8887, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 9.684542586750789, | |
| "grad_norm": 0.5651736259460449, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8873, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 9.71608832807571, | |
| "grad_norm": 0.5439329147338867, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8871, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 9.74763406940063, | |
| "grad_norm": 0.5257729887962341, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8711, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 9.779179810725552, | |
| "grad_norm": 0.5310476422309875, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9118, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 9.810725552050473, | |
| "grad_norm": 0.5593333840370178, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8647, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 9.842271293375394, | |
| "grad_norm": 0.5591513514518738, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8788, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 9.873817034700316, | |
| "grad_norm": 0.5862401723861694, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9113, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 9.905362776025237, | |
| "grad_norm": 0.5847012996673584, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9228, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 9.936908517350158, | |
| "grad_norm": 0.5507489442825317, | |
| "learning_rate": 5e-05, | |
| "loss": 0.9114, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 9.968454258675079, | |
| "grad_norm": 0.5988635420799255, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8753, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.6517265439033508, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8532, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 1.0107625722885132, | |
| "eval_runtime": 66.5594, | |
| "eval_samples_per_second": 4.778, | |
| "eval_steps_per_second": 0.601, | |
| "step": 3170 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6340, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1112958196396851e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |