{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2075, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.060350030175015085, "grad_norm": 0.2307848036289215, "learning_rate": 8.18181818181818e-05, "loss": 1.8669, "mean_token_accuracy": 0.6139279180765151, "num_tokens": 157037.0, "step": 25 }, { "epoch": 0.12070006035003017, "grad_norm": 0.315602570772171, "learning_rate": 0.00016704545454545452, "loss": 1.0736, "mean_token_accuracy": 0.7409161877632141, "num_tokens": 282266.0, "step": 50 }, { "epoch": 0.18105009052504525, "grad_norm": 0.2508843243122101, "learning_rate": 0.0002522727272727273, "loss": 0.67, "mean_token_accuracy": 0.819593402147293, "num_tokens": 439795.0, "step": 75 }, { "epoch": 0.24140012070006034, "grad_norm": 0.4052221179008484, "learning_rate": 0.0002999887132933212, "loss": 0.5476, "mean_token_accuracy": 0.8481135076284408, "num_tokens": 567017.0, "step": 100 }, { "epoch": 0.30175015087507545, "grad_norm": 0.28632256388664246, "learning_rate": 0.0002998791256978121, "loss": 0.4307, "mean_token_accuracy": 0.8766800117492676, "num_tokens": 723951.0, "step": 125 }, { "epoch": 0.3621001810500905, "grad_norm": 0.37199538946151733, "learning_rate": 0.0002996530399366737, "loss": 0.3577, "mean_token_accuracy": 0.8976909649372101, "num_tokens": 850124.0, "step": 150 }, { "epoch": 0.4224502112251056, "grad_norm": 0.24243904650211334, "learning_rate": 0.00029931063174202567, "loss": 0.2541, "mean_token_accuracy": 0.9251525086164475, "num_tokens": 1007442.0, "step": 175 }, { "epoch": 0.4828002414001207, "grad_norm": 0.395974338054657, "learning_rate": 0.00029885216726118104, "loss": 0.2702, "mean_token_accuracy": 0.923055305480957, "num_tokens": 1134123.0, "step": 200 }, { "epoch": 0.5431502715751357, "grad_norm": 0.2322138547897339, "learning_rate": 0.00029827800284977474, "loss": 0.1811, "mean_token_accuracy": 0.9476957750320435, "num_tokens": 1293162.0, "step": 225 }, { "epoch": 0.6035003017501509, "grad_norm": 0.33882594108581543, "learning_rate": 0.00029758858479477575, "loss": 0.1739, "mean_token_accuracy": 0.949301148056984, "num_tokens": 1420644.0, "step": 250 }, { "epoch": 0.663850331925166, "grad_norm": 0.1778780221939087, "learning_rate": 0.0002967844489675963, "loss": 0.1542, "mean_token_accuracy": 0.9562543380260468, "num_tokens": 1577978.0, "step": 275 }, { "epoch": 0.724200362100181, "grad_norm": 0.24160577356815338, "learning_rate": 0.00029586622040756957, "loss": 0.1321, "mean_token_accuracy": 0.9632052791118622, "num_tokens": 1704858.0, "step": 300 }, { "epoch": 0.7845503922751962, "grad_norm": 0.21109254658222198, "learning_rate": 0.0002948346128361186, "loss": 0.1185, "mean_token_accuracy": 0.9662619507312775, "num_tokens": 1862834.0, "step": 325 }, { "epoch": 0.8449004224502112, "grad_norm": 0.3435733914375305, "learning_rate": 0.00029369042810199416, "loss": 0.1058, "mean_token_accuracy": 0.9695043462514877, "num_tokens": 1988425.0, "step": 350 }, { "epoch": 0.9052504526252263, "grad_norm": 0.14936767518520355, "learning_rate": 0.0002924345555580135, "loss": 0.0951, "mean_token_accuracy": 0.9735645836591721, "num_tokens": 2145635.0, "step": 375 }, { "epoch": 0.9656004828002414, "grad_norm": 0.36445313692092896, "learning_rate": 0.000291067971369783, "loss": 0.0809, "mean_token_accuracy": 0.9770607089996338, "num_tokens": 2273525.0, "step": 400 }, { "epoch": 1.0, "eval_loss": 0.08547013998031616, "eval_mean_token_accuracy": 0.9758709730328741, "eval_num_tokens": 2354180.0, "eval_runtime": 62.7554, "eval_samples_per_second": 5.88, "eval_steps_per_second": 2.948, "step": 415 }, { "epoch": 1.024140012070006, "grad_norm": 0.17265604436397552, "learning_rate": 0.0002895917377569438, "loss": 0.0897, "mean_token_accuracy": 0.9755592708735122, "num_tokens": 2423409.0, "step": 425 }, { "epoch": 1.0844900422450212, "grad_norm": 0.17761775851249695, "learning_rate": 0.00028800700216752875, "loss": 0.0569, "mean_token_accuracy": 0.9834855335950852, "num_tokens": 2566528.0, "step": 450 }, { "epoch": 1.1448400724200363, "grad_norm": 0.16882313787937164, "learning_rate": 0.00028631499638607285, "loss": 0.0793, "mean_token_accuracy": 0.9781825703382492, "num_tokens": 2709916.0, "step": 475 }, { "epoch": 1.2051901025950513, "grad_norm": 0.12463518232107162, "learning_rate": 0.0002845170355761712, "loss": 0.0574, "mean_token_accuracy": 0.9840801757574081, "num_tokens": 2854287.0, "step": 500 }, { "epoch": 1.2655401327700664, "grad_norm": 0.12803447246551514, "learning_rate": 0.0002826145172582274, "loss": 0.0645, "mean_token_accuracy": 0.9821405208110809, "num_tokens": 2996401.0, "step": 525 }, { "epoch": 1.3258901629450814, "grad_norm": 0.10773979872465134, "learning_rate": 0.00028060892022318764, "loss": 0.054, "mean_token_accuracy": 0.9844699889421463, "num_tokens": 3138381.0, "step": 550 }, { "epoch": 1.3862401931200965, "grad_norm": 0.14240588247776031, "learning_rate": 0.0002785018033831051, "loss": 0.0744, "mean_token_accuracy": 0.9793990004062653, "num_tokens": 3279432.0, "step": 575 }, { "epoch": 1.4465902232951118, "grad_norm": 0.12011445313692093, "learning_rate": 0.0002762948045594276, "loss": 0.0508, "mean_token_accuracy": 0.985177715420723, "num_tokens": 3422945.0, "step": 600 }, { "epoch": 1.5069402534701268, "grad_norm": 0.07065067440271378, "learning_rate": 0.0002739896392099502, "loss": 0.0672, "mean_token_accuracy": 0.9815575838088989, "num_tokens": 3565901.0, "step": 625 }, { "epoch": 1.567290283645142, "grad_norm": 0.13502469658851624, "learning_rate": 0.00027158809909542307, "loss": 0.0451, "mean_token_accuracy": 0.98676638007164, "num_tokens": 3708558.0, "step": 650 }, { "epoch": 1.627640313820157, "grad_norm": 0.08584199845790863, "learning_rate": 0.00026909205088685, "loss": 0.062, "mean_token_accuracy": 0.9832429319620133, "num_tokens": 3849682.0, "step": 675 }, { "epoch": 1.687990343995172, "grad_norm": 0.10930341482162476, "learning_rate": 0.0002665034347145612, "loss": 0.0464, "mean_token_accuracy": 0.9868946731090545, "num_tokens": 3993080.0, "step": 700 }, { "epoch": 1.748340374170187, "grad_norm": 0.1063772663474083, "learning_rate": 0.000263824262660187, "loss": 0.0611, "mean_token_accuracy": 0.9831858760118485, "num_tokens": 4135566.0, "step": 725 }, { "epoch": 1.8086904043452021, "grad_norm": 0.11367379873991013, "learning_rate": 0.0002610566171927056, "loss": 0.0426, "mean_token_accuracy": 0.9874300253391266, "num_tokens": 4278312.0, "step": 750 }, { "epoch": 1.8690404345202172, "grad_norm": 0.09623493254184723, "learning_rate": 0.00025820264954977976, "loss": 0.0594, "mean_token_accuracy": 0.9828196185827255, "num_tokens": 4417338.0, "step": 775 }, { "epoch": 1.9293904646952322, "grad_norm": 0.1381269097328186, "learning_rate": 0.00025526457806564136, "loss": 0.0387, "mean_token_accuracy": 0.9886600559949875, "num_tokens": 4556744.0, "step": 800 }, { "epoch": 1.9897404948702473, "grad_norm": 0.10048148781061172, "learning_rate": 0.00025224468644682245, "loss": 0.0436, "mean_token_accuracy": 0.9878453743457795, "num_tokens": 4688626.0, "step": 825 }, { "epoch": 2.0, "eval_loss": 0.05272972211241722, "eval_mean_token_accuracy": 0.9853407392630706, "eval_num_tokens": 4708360.0, "eval_runtime": 62.7886, "eval_samples_per_second": 5.877, "eval_steps_per_second": 2.946, "step": 830 }, { "epoch": 2.048280024140012, "grad_norm": 0.14225316047668457, "learning_rate": 0.00024914532199707444, "loss": 0.0429, "mean_token_accuracy": 0.9869677971318825, "num_tokens": 4838704.0, "step": 850 }, { "epoch": 2.1086300543150274, "grad_norm": 0.16189508140087128, "learning_rate": 0.00024596889379285353, "loss": 0.0353, "mean_token_accuracy": 0.9899542284011841, "num_tokens": 4970478.0, "step": 875 }, { "epoch": 2.1689800844900424, "grad_norm": 0.08687274158000946, "learning_rate": 0.00024271787081079228, "loss": 0.0464, "mean_token_accuracy": 0.9863937604427337, "num_tokens": 5120208.0, "step": 900 }, { "epoch": 2.2293301146650575, "grad_norm": 0.10480177402496338, "learning_rate": 0.00023939478000861117, "loss": 0.0308, "mean_token_accuracy": 0.9905521881580353, "num_tokens": 5252759.0, "step": 925 }, { "epoch": 2.2896801448400725, "grad_norm": 0.07012847810983658, "learning_rate": 0.00023600220436096318, "loss": 0.0477, "mean_token_accuracy": 0.98607047021389, "num_tokens": 5405015.0, "step": 950 }, { "epoch": 2.3500301750150876, "grad_norm": 0.23715409636497498, "learning_rate": 0.00023254278085173684, "loss": 0.0308, "mean_token_accuracy": 0.9909250718355179, "num_tokens": 5538251.0, "step": 975 }, { "epoch": 2.4103802051901027, "grad_norm": 0.08109632134437561, "learning_rate": 0.00022901919842437972, "loss": 0.0474, "mean_token_accuracy": 0.9859592580795288, "num_tokens": 5691272.0, "step": 1000 }, { "epoch": 2.4707302353651177, "grad_norm": 0.09655272960662842, "learning_rate": 0.00022543419589183397, "loss": 0.0318, "mean_token_accuracy": 0.9903653126955032, "num_tokens": 5824972.0, "step": 1025 }, { "epoch": 2.5310802655401328, "grad_norm": 0.052605971693992615, "learning_rate": 0.00022179055980770993, "loss": 0.0412, "mean_token_accuracy": 0.9876793098449707, "num_tokens": 5975189.0, "step": 1050 }, { "epoch": 2.591430295715148, "grad_norm": 0.08622315526008606, "learning_rate": 0.0002180911223003513, "loss": 0.0295, "mean_token_accuracy": 0.9909289944171905, "num_tokens": 6108290.0, "step": 1075 }, { "epoch": 2.651780325890163, "grad_norm": 0.07412232458591461, "learning_rate": 0.00021433875887147627, "loss": 0.0468, "mean_token_accuracy": 0.986087337732315, "num_tokens": 6259582.0, "step": 1100 }, { "epoch": 2.712130356065178, "grad_norm": 0.09526045620441437, "learning_rate": 0.00021053638616110525, "loss": 0.0292, "mean_token_accuracy": 0.9911455798149109, "num_tokens": 6392200.0, "step": 1125 }, { "epoch": 2.772480386240193, "grad_norm": 0.07069120556116104, "learning_rate": 0.00020668695968051274, "loss": 0.044, "mean_token_accuracy": 0.9870589804649353, "num_tokens": 6543588.0, "step": 1150 }, { "epoch": 2.832830416415208, "grad_norm": 0.09000532329082489, "learning_rate": 0.00020279347151496482, "loss": 0.0296, "mean_token_accuracy": 0.9909429329633713, "num_tokens": 6676376.0, "step": 1175 }, { "epoch": 2.8931804465902236, "grad_norm": 0.05883787199854851, "learning_rate": 0.00019885894799802922, "loss": 0.0424, "mean_token_accuracy": 0.9874570268392563, "num_tokens": 6827932.0, "step": 1200 }, { "epoch": 2.9535304767652386, "grad_norm": 0.13316047191619873, "learning_rate": 0.00019488644735926396, "loss": 0.0295, "mean_token_accuracy": 0.991017809510231, "num_tokens": 6961443.0, "step": 1225 }, { "epoch": 3.0, "eval_loss": 0.044597771018743515, "eval_mean_token_accuracy": 0.9876517611580926, "eval_num_tokens": 7062540.0, "eval_runtime": 62.7187, "eval_samples_per_second": 5.883, "eval_steps_per_second": 2.95, "step": 1245 }, { "epoch": 3.012070006035003, "grad_norm": 0.15881508588790894, "learning_rate": 0.00019087905734711452, "loss": 0.0369, "mean_token_accuracy": 0.9889105128258774, "num_tokens": 7100107.0, "step": 1250 }, { "epoch": 3.0724200362100182, "grad_norm": 0.04447310045361519, "learning_rate": 0.00018683989282886613, "loss": 0.0279, "mean_token_accuracy": 0.9910174655914307, "num_tokens": 7250123.0, "step": 1275 }, { "epoch": 3.1327700663850333, "grad_norm": 0.05848820507526398, "learning_rate": 0.0001827720933695173, "loss": 0.0283, "mean_token_accuracy": 0.9908898574113846, "num_tokens": 7386980.0, "step": 1300 }, { "epoch": 3.1931200965600484, "grad_norm": 0.05700680986046791, "learning_rate": 0.00017867882079145627, "loss": 0.0268, "mean_token_accuracy": 0.9915226358175278, "num_tokens": 7534097.0, "step": 1325 }, { "epoch": 3.2534701267350634, "grad_norm": 0.06182008236646652, "learning_rate": 0.00017456325671683724, "loss": 0.0293, "mean_token_accuracy": 0.9909615200757981, "num_tokens": 7670112.0, "step": 1350 }, { "epoch": 3.3138201569100785, "grad_norm": 0.05402887985110283, "learning_rate": 0.00017042860009456638, "loss": 0.0269, "mean_token_accuracy": 0.9914444983005524, "num_tokens": 7819619.0, "step": 1375 }, { "epoch": 3.3741701870850935, "grad_norm": 0.07528018951416016, "learning_rate": 0.00016627806471382066, "loss": 0.0282, "mean_token_accuracy": 0.9909832864999771, "num_tokens": 7955967.0, "step": 1400 }, { "epoch": 3.4345202172601086, "grad_norm": 0.05704252049326897, "learning_rate": 0.00016211487670603078, "loss": 0.0259, "mean_token_accuracy": 0.9916054201126099, "num_tokens": 8102608.0, "step": 1425 }, { "epoch": 3.4948702474351236, "grad_norm": 0.08405063301324844, "learning_rate": 0.0001579422720372715, "loss": 0.0289, "mean_token_accuracy": 0.9906303453445434, "num_tokens": 8236645.0, "step": 1450 }, { "epoch": 3.5552202776101387, "grad_norm": 0.03225329518318176, "learning_rate": 0.00015376349399300745, "loss": 0.0261, "mean_token_accuracy": 0.99147840321064, "num_tokens": 8383287.0, "step": 1475 }, { "epoch": 3.6155703077851538, "grad_norm": 0.09672655165195465, "learning_rate": 0.0001495817906571492, "loss": 0.0304, "mean_token_accuracy": 0.9903279423713685, "num_tokens": 8517618.0, "step": 1500 }, { "epoch": 3.675920337960169, "grad_norm": 0.03548620641231537, "learning_rate": 0.00014540041238738055, "loss": 0.0256, "mean_token_accuracy": 0.9921301937103272, "num_tokens": 8665120.0, "step": 1525 }, { "epoch": 3.736270368135184, "grad_norm": 0.05516292154788971, "learning_rate": 0.00014122260928871734, "loss": 0.0282, "mean_token_accuracy": 0.9913522702455521, "num_tokens": 8800218.0, "step": 1550 }, { "epoch": 3.796620398310199, "grad_norm": 0.05250876769423485, "learning_rate": 0.00013705162868726396, "loss": 0.025, "mean_token_accuracy": 0.9920567196607589, "num_tokens": 8948083.0, "step": 1575 }, { "epoch": 3.856970428485214, "grad_norm": 0.06773823499679565, "learning_rate": 0.00013289071260612855, "loss": 0.0304, "mean_token_accuracy": 0.9904211789369584, "num_tokens": 9084691.0, "step": 1600 }, { "epoch": 3.9173204586602295, "grad_norm": 0.041708026081323624, "learning_rate": 0.00012874309524546083, "loss": 0.0252, "mean_token_accuracy": 0.9919029641151428, "num_tokens": 9235980.0, "step": 1625 }, { "epoch": 3.9776704888352445, "grad_norm": 0.04703540354967117, "learning_rate": 0.00012461200046857084, "loss": 0.0262, "mean_token_accuracy": 0.99182637155056, "num_tokens": 9368839.0, "step": 1650 }, { "epoch": 4.0, "eval_loss": 0.040296006947755814, "eval_mean_token_accuracy": 0.9890867027076515, "eval_num_tokens": 9416720.0, "eval_runtime": 62.7415, "eval_samples_per_second": 5.881, "eval_steps_per_second": 2.949, "step": 1660 }, { "epoch": 4.036210018105009, "grad_norm": 0.05349954590201378, "learning_rate": 0.00012050063929608123, "loss": 0.0245, "mean_token_accuracy": 0.9919631487315463, "num_tokens": 9516874.0, "step": 1675 }, { "epoch": 4.096560048280024, "grad_norm": 0.058036044239997864, "learning_rate": 0.0001164122074100633, "loss": 0.0204, "mean_token_accuracy": 0.9933006262779236, "num_tokens": 9654378.0, "step": 1700 }, { "epoch": 4.15691007845504, "grad_norm": 0.06282967329025269, "learning_rate": 0.00011234988267009415, "loss": 0.0228, "mean_token_accuracy": 0.9928084880113601, "num_tokens": 9798671.0, "step": 1725 }, { "epoch": 4.217260108630055, "grad_norm": 0.03295363485813141, "learning_rate": 0.00010831682264316787, "loss": 0.0199, "mean_token_accuracy": 0.9936361283063888, "num_tokens": 9936183.0, "step": 1750 }, { "epoch": 4.27761013880507, "grad_norm": 0.07894692569971085, "learning_rate": 0.00010431616214937911, "loss": 0.0248, "mean_token_accuracy": 0.9920186513662338, "num_tokens": 10083447.0, "step": 1775 }, { "epoch": 4.337960168980085, "grad_norm": 0.052625857293605804, "learning_rate": 0.00010035101082528777, "loss": 0.0198, "mean_token_accuracy": 0.9934204763174057, "num_tokens": 10222918.0, "step": 1800 }, { "epoch": 4.3983101991551, "grad_norm": 0.05687833949923515, "learning_rate": 9.642445070685809e-05, "loss": 0.0245, "mean_token_accuracy": 0.9919208544492721, "num_tokens": 10371876.0, "step": 1825 }, { "epoch": 4.458660229330115, "grad_norm": 0.036979444324970245, "learning_rate": 9.253953383385157e-05, "loss": 0.0187, "mean_token_accuracy": 0.9938018727302551, "num_tokens": 10509835.0, "step": 1850 }, { "epoch": 4.51901025950513, "grad_norm": 0.08300971984863281, "learning_rate": 8.869927987753459e-05, "loss": 0.0237, "mean_token_accuracy": 0.9923136639595032, "num_tokens": 10655174.0, "step": 1875 }, { "epoch": 4.579360289680145, "grad_norm": 0.034062255173921585, "learning_rate": 8.490667379354661e-05, "loss": 0.0192, "mean_token_accuracy": 0.9937297248840332, "num_tokens": 10792672.0, "step": 1900 }, { "epoch": 4.63971031985516, "grad_norm": 0.09020201861858368, "learning_rate": 8.116466350175079e-05, "loss": 0.0239, "mean_token_accuracy": 0.9922147291898727, "num_tokens": 10939329.0, "step": 1925 }, { "epoch": 4.700060350030175, "grad_norm": 0.055495839565992355, "learning_rate": 7.747615759487304e-05, "loss": 0.0185, "mean_token_accuracy": 0.9938122999668121, "num_tokens": 11077006.0, "step": 1950 }, { "epoch": 4.76041038020519, "grad_norm": 0.05608997866511345, "learning_rate": 7.38440230777085e-05, "loss": 0.0255, "mean_token_accuracy": 0.9921148890256881, "num_tokens": 11222702.0, "step": 1975 }, { "epoch": 4.820760410380205, "grad_norm": 0.05004817619919777, "learning_rate": 7.027108313865378e-05, "loss": 0.02, "mean_token_accuracy": 0.9935011225938797, "num_tokens": 11359725.0, "step": 2000 }, { "epoch": 4.88111044055522, "grad_norm": 0.054554738104343414, "learning_rate": 6.676011495529687e-05, "loss": 0.0231, "mean_token_accuracy": 0.9923803770542144, "num_tokens": 11506931.0, "step": 2025 }, { "epoch": 4.941460470730235, "grad_norm": 0.042225901037454605, "learning_rate": 6.331384753577056e-05, "loss": 0.0187, "mean_token_accuracy": 0.9937945604324341, "num_tokens": 11643259.0, "step": 2050 }, { "epoch": 5.0, "grad_norm": 0.13635069131851196, "learning_rate": 5.993495959754631e-05, "loss": 0.0206, "mean_token_accuracy": 0.9932199802595315, "num_tokens": 11770900.0, "step": 2075 }, { "epoch": 5.0, "eval_loss": 0.039575137197971344, "eval_mean_token_accuracy": 0.9895688775423411, "eval_num_tokens": 11770900.0, "eval_runtime": 62.7395, "eval_samples_per_second": 5.881, "eval_steps_per_second": 2.949, "step": 2075 } ], "logging_steps": 25, "max_steps": 2905, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.056986746477353e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }