{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987438399845395, "eval_steps": 1000, "global_step": 646, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 346.12055015563965, "epoch": 0.0015460430959512996, "grad_norm": 0.7521562252130102, "kl": 0.0, "learning_rate": 9.999940874631275e-07, "loss": -0.0, "reward": 0.155133934924379, "reward_std": 0.2433017324656248, "rewards/accuracy_reward": 0.09040178859140724, "rewards/format_reward": 0.06473214598372579, "step": 1 }, { "completion_length": 341.9776916503906, "epoch": 0.0030920861919025992, "grad_norm": 0.839973645517661, "kl": 0.00020295381546020508, "learning_rate": 9.999763499923432e-07, "loss": 0.0, "reward": 0.1629464365541935, "reward_std": 0.2753388248383999, "rewards/accuracy_reward": 0.09040179045405239, "rewards/format_reward": 0.072544647147879, "step": 2 }, { "completion_length": 367.03126525878906, "epoch": 0.004638129287853899, "grad_norm": 0.8337123284322902, "kl": 0.0004980564117431641, "learning_rate": 9.999467880071401e-07, "loss": 0.0, "reward": 0.16964286798611283, "reward_std": 0.25930263102054596, "rewards/accuracy_reward": 0.07924107520375401, "rewards/format_reward": 0.09040179022122175, "step": 3 }, { "completion_length": 360.6105089187622, "epoch": 0.0061841723838051985, "grad_norm": 0.7763153394552681, "kl": 0.0008927583694458008, "learning_rate": 9.99905402206664e-07, "loss": 0.0, "reward": 0.22879465203732252, "reward_std": 0.3536552395671606, "rewards/accuracy_reward": 0.10379464726429433, "rewards/format_reward": 0.12500000547152013, "step": 4 }, { "completion_length": 374.2500190734863, "epoch": 0.007730215479756498, "grad_norm": 0.8421744453738365, "kl": 0.0025339126586914062, "learning_rate": 9.998521935696952e-07, "loss": 0.0001, "reward": 0.2857142984867096, "reward_std": 0.3793561812490225, "rewards/accuracy_reward": 0.08370536053553224, "rewards/format_reward": 0.2020089365541935, "step": 5 }, { "completion_length": 378.86274337768555, "epoch": 0.009276258575707798, "grad_norm": 0.7163414080517491, "kl": 0.0053958892822265625, "learning_rate": 9.997871633546254e-07, "loss": 0.0002, "reward": 0.360491088591516, "reward_std": 0.4047303572297096, "rewards/accuracy_reward": 0.06696428812574595, "rewards/format_reward": 0.2935268022119999, "step": 6 }, { "completion_length": 336.7857265472412, "epoch": 0.010822301671659098, "grad_norm": 0.7260351267444399, "kl": 0.006798744201660156, "learning_rate": 9.997103130994294e-07, "loss": 0.0003, "reward": 0.41964287124574184, "reward_std": 0.4503227211534977, "rewards/accuracy_reward": 0.07142857543658465, "rewards/format_reward": 0.3482143012806773, "step": 7 }, { "completion_length": 379.40403270721436, "epoch": 0.012368344767610397, "grad_norm": 0.6444088440622997, "kl": 0.006015777587890625, "learning_rate": 9.996216446216266e-07, "loss": 0.0002, "reward": 0.40513394586741924, "reward_std": 0.4478660374879837, "rewards/accuracy_reward": 0.07700893201399595, "rewards/format_reward": 0.3281250149011612, "step": 8 }, { "completion_length": 304.55023860931396, "epoch": 0.013914387863561696, "grad_norm": 0.6802784628799866, "kl": 0.01288604736328125, "learning_rate": 9.995211600182396e-07, "loss": 0.0005, "reward": 0.5915178842842579, "reward_std": 0.45967594161629677, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.5334821660071611, "step": 9 }, { "completion_length": 266.4810380935669, "epoch": 0.015460430959512996, "grad_norm": 0.6374694714437484, "kl": 0.01629638671875, "learning_rate": 9.994088616657444e-07, "loss": 0.0007, "reward": 0.731026828289032, "reward_std": 0.39779933728277683, "rewards/accuracy_reward": 0.053571431315504014, "rewards/format_reward": 0.6774553880095482, "step": 10 }, { "completion_length": 279.8471088409424, "epoch": 0.017006474055464297, "grad_norm": 0.5307330145226442, "kl": 0.014434814453125, "learning_rate": 9.992847522200133e-07, "loss": 0.0006, "reward": 0.7477678954601288, "reward_std": 0.419852489605546, "rewards/accuracy_reward": 0.06473214563447982, "rewards/format_reward": 0.683035746216774, "step": 11 }, { "completion_length": 288.3872957229614, "epoch": 0.018552517151415596, "grad_norm": 0.5189934416948966, "kl": 0.015346527099609375, "learning_rate": 9.991488346162529e-07, "loss": 0.0006, "reward": 0.7600446753203869, "reward_std": 0.414556248113513, "rewards/accuracy_reward": 0.0714285762514919, "rewards/format_reward": 0.6886161081492901, "step": 12 }, { "completion_length": 277.90738010406494, "epoch": 0.020098560247366896, "grad_norm": 0.5515746367985946, "kl": 0.016185760498046875, "learning_rate": 9.99001112068935e-07, "loss": 0.0006, "reward": 0.745535746216774, "reward_std": 0.3840343914926052, "rewards/accuracy_reward": 0.042410716763697565, "rewards/format_reward": 0.7031250335276127, "step": 13 }, { "completion_length": 261.67412090301514, "epoch": 0.021644603343318195, "grad_norm": 0.5304390777883579, "kl": 0.017467498779296875, "learning_rate": 9.988415880717193e-07, "loss": 0.0007, "reward": 0.8191964663565159, "reward_std": 0.36659748293459415, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.7656250409781933, "step": 14 }, { "completion_length": 277.2522449493408, "epoch": 0.023190646439269495, "grad_norm": 0.46889954210262247, "kl": 0.0184326171875, "learning_rate": 9.986702663973722e-07, "loss": 0.0007, "reward": 0.828125037252903, "reward_std": 0.3403670620173216, "rewards/accuracy_reward": 0.061383931431919336, "rewards/format_reward": 0.7667411118745804, "step": 15 }, { "completion_length": 254.22880744934082, "epoch": 0.024736689535220794, "grad_norm": 0.4414349290049321, "kl": 0.021259307861328125, "learning_rate": 9.98487151097676e-07, "loss": 0.0009, "reward": 0.8683036081492901, "reward_std": 0.29708090890198946, "rewards/accuracy_reward": 0.045758930151350796, "rewards/format_reward": 0.8225446790456772, "step": 16 }, { "completion_length": 222.49554538726807, "epoch": 0.026282732631172093, "grad_norm": 0.45354691910214323, "kl": 0.019718170166015625, "learning_rate": 9.982922465033348e-07, "loss": 0.0008, "reward": 0.8828125447034836, "reward_std": 0.2733997143805027, "rewards/accuracy_reward": 0.033482144004665315, "rewards/format_reward": 0.8493303991854191, "step": 17 }, { "completion_length": 214.9665288925171, "epoch": 0.027828775727123393, "grad_norm": 0.30202100799531206, "kl": 0.02318572998046875, "learning_rate": 9.980855572238713e-07, "loss": 0.0009, "reward": 0.937500037252903, "reward_std": 0.206743401940912, "rewards/accuracy_reward": 0.02790178672876209, "rewards/format_reward": 0.9095982499420643, "step": 18 }, { "completion_length": 196.52791023254395, "epoch": 0.029374818823074692, "grad_norm": 0.40612403532533214, "kl": 0.02614593505859375, "learning_rate": 9.978670881475172e-07, "loss": 0.001, "reward": 0.9542411118745804, "reward_std": 0.210012833122164, "rewards/accuracy_reward": 0.03906250186264515, "rewards/format_reward": 0.9151786081492901, "step": 19 }, { "completion_length": 203.14844799041748, "epoch": 0.03092086191902599, "grad_norm": 0.3637824088513492, "kl": 0.0278778076171875, "learning_rate": 9.976368444410984e-07, "loss": 0.0011, "reward": 0.9575893245637417, "reward_std": 0.2013749754987657, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.9129464626312256, "step": 20 }, { "completion_length": 181.20983028411865, "epoch": 0.032466905014977294, "grad_norm": 0.33177163724731323, "kl": 0.030670166015625, "learning_rate": 9.973948315499126e-07, "loss": 0.0012, "reward": 0.9955357611179352, "reward_std": 0.16246340004727244, "rewards/accuracy_reward": 0.04575893108267337, "rewards/format_reward": 0.9497768133878708, "step": 21 }, { "completion_length": 162.81696891784668, "epoch": 0.034012948110928594, "grad_norm": 0.3084563376992779, "kl": 0.033294677734375, "learning_rate": 9.971410551976e-07, "loss": 0.0013, "reward": 1.0133928954601288, "reward_std": 0.1644470039755106, "rewards/accuracy_reward": 0.0479910735739395, "rewards/format_reward": 0.9654018096625805, "step": 22 }, { "completion_length": 166.3258991241455, "epoch": 0.03555899120687989, "grad_norm": 0.35522786928262584, "kl": 0.03772735595703125, "learning_rate": 9.968755213860092e-07, "loss": 0.0015, "reward": 1.0078125484287739, "reward_std": 0.16171623161062598, "rewards/accuracy_reward": 0.04799107334110886, "rewards/format_reward": 0.9598214514553547, "step": 23 }, { "completion_length": 162.18304443359375, "epoch": 0.03710503430283119, "grad_norm": 0.28768533090734194, "kl": 0.0369110107421875, "learning_rate": 9.96598236395054e-07, "loss": 0.0015, "reward": 1.0055803954601288, "reward_std": 0.1433702097274363, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.9698661044239998, "step": 24 }, { "completion_length": 146.10715055465698, "epoch": 0.03865107739878249, "grad_norm": 0.3854305427996945, "kl": 0.0389404296875, "learning_rate": 9.963092067825649e-07, "loss": 0.0016, "reward": 1.06026791036129, "reward_std": 0.15166151244193316, "rewards/accuracy_reward": 0.07477678940631449, "rewards/format_reward": 0.9854911118745804, "step": 25 }, { "completion_length": 172.50558757781982, "epoch": 0.04019712049473379, "grad_norm": 0.30230755818482496, "kl": 0.0357666015625, "learning_rate": 9.960084393841355e-07, "loss": 0.0014, "reward": 1.0234375447034836, "reward_std": 0.15331051405519247, "rewards/accuracy_reward": 0.05803571722935885, "rewards/format_reward": 0.9654018096625805, "step": 26 }, { "completion_length": 154.82032012939453, "epoch": 0.04174316359068509, "grad_norm": 0.25923125645215517, "kl": 0.035491943359375, "learning_rate": 9.956959413129584e-07, "loss": 0.0014, "reward": 1.0468750484287739, "reward_std": 0.11945596663281322, "rewards/accuracy_reward": 0.061383931897580624, "rewards/format_reward": 0.9854910895228386, "step": 27 }, { "completion_length": 155.0825958251953, "epoch": 0.04328920668663639, "grad_norm": 0.3636253057899995, "kl": 0.03501129150390625, "learning_rate": 9.953717199596596e-07, "loss": 0.0014, "reward": 1.070312563329935, "reward_std": 0.17273690272122622, "rewards/accuracy_reward": 0.08593750419095159, "rewards/format_reward": 0.9843750335276127, "step": 28 }, { "completion_length": 163.22210454940796, "epoch": 0.04483524978258769, "grad_norm": 0.319284871974592, "kl": 0.0344390869140625, "learning_rate": 9.950357829921218e-07, "loss": 0.0014, "reward": 1.05245541036129, "reward_std": 0.1389583395794034, "rewards/accuracy_reward": 0.0658482174621895, "rewards/format_reward": 0.9866071715950966, "step": 29 }, { "completion_length": 167.4676399230957, "epoch": 0.04638129287853899, "grad_norm": 0.28108539644169667, "kl": 0.03591156005859375, "learning_rate": 9.946881383553038e-07, "loss": 0.0014, "reward": 1.0446428954601288, "reward_std": 0.14452954661101103, "rewards/accuracy_reward": 0.06808036018628627, "rewards/format_reward": 0.9765625335276127, "step": 30 }, { "completion_length": 172.1216583251953, "epoch": 0.04792733597449029, "grad_norm": 0.3197758859345628, "kl": 0.0335693359375, "learning_rate": 9.943287942710527e-07, "loss": 0.0013, "reward": 1.0613839700818062, "reward_std": 0.16943255439400673, "rewards/accuracy_reward": 0.08147321711294353, "rewards/format_reward": 0.9799107424914837, "step": 31 }, { "completion_length": 180.24554443359375, "epoch": 0.04947337907044159, "grad_norm": 0.2639624666402686, "kl": 0.0308837890625, "learning_rate": 9.939577592379088e-07, "loss": 0.0012, "reward": 1.0569196827709675, "reward_std": 0.1737292120233178, "rewards/accuracy_reward": 0.08593750454019755, "rewards/format_reward": 0.9709821753203869, "step": 32 }, { "completion_length": 181.85715007781982, "epoch": 0.05101942216639289, "grad_norm": 0.30386107630031683, "kl": 0.02904510498046875, "learning_rate": 9.935750420309055e-07, "loss": 0.0012, "reward": 1.089285783469677, "reward_std": 0.20546742621809244, "rewards/accuracy_reward": 0.10937500558793545, "rewards/format_reward": 0.9799107387661934, "step": 33 }, { "completion_length": 183.6997833251953, "epoch": 0.05256546526234419, "grad_norm": 0.25208388975610957, "kl": 0.03362274169921875, "learning_rate": 9.931806517013612e-07, "loss": 0.0013, "reward": 1.0613839849829674, "reward_std": 0.16295406874269247, "rewards/accuracy_reward": 0.07812500325962901, "rewards/format_reward": 0.983258955180645, "step": 34 }, { "completion_length": 200.57813262939453, "epoch": 0.054111508358295486, "grad_norm": 0.2841693832437372, "kl": 0.027618408203125, "learning_rate": 9.927745975766652e-07, "loss": 0.0011, "reward": 1.095982201397419, "reward_std": 0.2079052086919546, "rewards/accuracy_reward": 0.1205357207218185, "rewards/format_reward": 0.9754464700818062, "step": 35 }, { "completion_length": 205.20982933044434, "epoch": 0.055657551454246786, "grad_norm": 0.26611828993375286, "kl": 0.0289459228515625, "learning_rate": 9.923568892600578e-07, "loss": 0.0012, "reward": 1.0569196827709675, "reward_std": 0.20081567112356424, "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.9654018059372902, "step": 36 }, { "completion_length": 204.43527698516846, "epoch": 0.057203594550198085, "grad_norm": 0.2673992877659924, "kl": 0.03064727783203125, "learning_rate": 9.919275366304019e-07, "loss": 0.0012, "reward": 1.1171875596046448, "reward_std": 0.21135189849883318, "rewards/accuracy_reward": 0.14955357904545963, "rewards/format_reward": 0.9676339663565159, "step": 37 }, { "completion_length": 212.5390739440918, "epoch": 0.058749637646149384, "grad_norm": 0.299026719893167, "kl": 0.02680206298828125, "learning_rate": 9.91486549841951e-07, "loss": 0.0011, "reward": 1.0859375484287739, "reward_std": 0.24461901560425758, "rewards/accuracy_reward": 0.12611607753206044, "rewards/format_reward": 0.9598214663565159, "step": 38 }, { "completion_length": 216.13282012939453, "epoch": 0.060295680742100684, "grad_norm": 0.21519379356103246, "kl": 0.02599334716796875, "learning_rate": 9.910339393241069e-07, "loss": 0.001, "reward": 1.06808041036129, "reward_std": 0.18474733643233776, "rewards/accuracy_reward": 0.10267857636790723, "rewards/format_reward": 0.9654018208384514, "step": 39 }, { "completion_length": 232.81920909881592, "epoch": 0.06184172383805198, "grad_norm": 0.23099593898433254, "kl": 0.0265655517578125, "learning_rate": 9.90569715781176e-07, "loss": 0.0011, "reward": 1.1171875521540642, "reward_std": 0.23065209202468395, "rewards/accuracy_reward": 0.16071429383009672, "rewards/format_reward": 0.9564732424914837, "step": 40 }, { "completion_length": 228.38505172729492, "epoch": 0.06338776693400329, "grad_norm": 0.28586782980287884, "kl": 0.02822113037109375, "learning_rate": 9.90093890192113e-07, "loss": 0.0011, "reward": 1.0959821827709675, "reward_std": 0.26491549238562584, "rewards/accuracy_reward": 0.14732143701985478, "rewards/format_reward": 0.9486607424914837, "step": 41 }, { "completion_length": 235.67858219146729, "epoch": 0.06493381002995459, "grad_norm": 0.23651509407326327, "kl": 0.0252685546875, "learning_rate": 9.896064738102633e-07, "loss": 0.001, "reward": 1.075892899185419, "reward_std": 0.26366487983614206, "rewards/accuracy_reward": 0.1339285756694153, "rewards/format_reward": 0.9419643208384514, "step": 42 }, { "completion_length": 239.19867038726807, "epoch": 0.06647985312590589, "grad_norm": 0.24818112085731328, "kl": 0.02642059326171875, "learning_rate": 9.891074781630964e-07, "loss": 0.0011, "reward": 1.0814732573926449, "reward_std": 0.2567263371311128, "rewards/accuracy_reward": 0.14397321979049593, "rewards/format_reward": 0.9375000298023224, "step": 43 }, { "completion_length": 228.66184043884277, "epoch": 0.06802589622185719, "grad_norm": 0.2293492263861206, "kl": 0.0278472900390625, "learning_rate": 9.88596915051933e-07, "loss": 0.0011, "reward": 1.0926339849829674, "reward_std": 0.22470801509916782, "rewards/accuracy_reward": 0.12611607578583062, "rewards/format_reward": 0.9665178954601288, "step": 44 }, { "completion_length": 232.77233123779297, "epoch": 0.06957193931780849, "grad_norm": 0.2532403431821216, "kl": 0.02866363525390625, "learning_rate": 9.880747965516657e-07, "loss": 0.0011, "reward": 1.104910772293806, "reward_std": 0.26462144777178764, "rewards/accuracy_reward": 0.14955357648432255, "rewards/format_reward": 0.9553571715950966, "step": 45 }, { "completion_length": 221.61608123779297, "epoch": 0.07111798241375979, "grad_norm": 0.3845311839753924, "kl": 0.034210205078125, "learning_rate": 9.875411350104743e-07, "loss": 0.0014, "reward": 1.1305804029107094, "reward_std": 0.28739399183541536, "rewards/accuracy_reward": 0.17745536530856043, "rewards/format_reward": 0.9531250298023224, "step": 46 }, { "completion_length": 246.44420719146729, "epoch": 0.07266402550971109, "grad_norm": 0.2942726981499674, "kl": 0.02703094482421875, "learning_rate": 9.869959430495328e-07, "loss": 0.0011, "reward": 1.0948661267757416, "reward_std": 0.3003988992422819, "rewards/accuracy_reward": 0.16406250861473382, "rewards/format_reward": 0.9308036081492901, "step": 47 }, { "completion_length": 243.01229095458984, "epoch": 0.07421006860566239, "grad_norm": 0.2317582256269794, "kl": 0.02848052978515625, "learning_rate": 9.864392335627116e-07, "loss": 0.0011, "reward": 1.123883981257677, "reward_std": 0.24491179548203945, "rewards/accuracy_reward": 0.1640625069849193, "rewards/format_reward": 0.959821455180645, "step": 48 }, { "completion_length": 264.1361713409424, "epoch": 0.07575611170161368, "grad_norm": 0.23510998236666028, "kl": 0.02974700927734375, "learning_rate": 9.85871019716272e-07, "loss": 0.0012, "reward": 1.0870536305010319, "reward_std": 0.23703958932310343, "rewards/accuracy_reward": 0.1361607217695564, "rewards/format_reward": 0.9508928880095482, "step": 49 }, { "completion_length": 271.8459949493408, "epoch": 0.07730215479756498, "grad_norm": 0.23043968547092533, "kl": 0.0260772705078125, "learning_rate": 9.852913149485555e-07, "loss": 0.001, "reward": 1.1015625596046448, "reward_std": 0.28392448648810387, "rewards/accuracy_reward": 0.17075893632136285, "rewards/format_reward": 0.9308036155998707, "step": 50 }, { "completion_length": 263.7109489440918, "epoch": 0.07884819789351628, "grad_norm": 0.23315726827404867, "kl": 0.02765655517578125, "learning_rate": 9.847001329696652e-07, "loss": 0.0011, "reward": 1.0982143394649029, "reward_std": 0.26613295171409845, "rewards/accuracy_reward": 0.15401786379516125, "rewards/format_reward": 0.944196455180645, "step": 51 }, { "completion_length": 253.96429920196533, "epoch": 0.08039424098946758, "grad_norm": 0.27612394660621375, "kl": 0.031524658203125, "learning_rate": 9.84097487761142e-07, "loss": 0.0013, "reward": 1.1104911044239998, "reward_std": 0.270201469771564, "rewards/accuracy_reward": 0.156250003259629, "rewards/format_reward": 0.9542411081492901, "step": 52 }, { "completion_length": 252.7968864440918, "epoch": 0.08194028408541888, "grad_norm": 0.23393779249573185, "kl": 0.03122711181640625, "learning_rate": 9.834833935756343e-07, "loss": 0.0012, "reward": 1.0993303954601288, "reward_std": 0.25536223128437996, "rewards/accuracy_reward": 0.15178572060540318, "rewards/format_reward": 0.9475446939468384, "step": 53 }, { "completion_length": 263.58037090301514, "epoch": 0.08348632718137018, "grad_norm": 0.24153503603235818, "kl": 0.031280517578125, "learning_rate": 9.8285786493656e-07, "loss": 0.0013, "reward": 1.116071481257677, "reward_std": 0.26946403086185455, "rewards/accuracy_reward": 0.17522322095464915, "rewards/format_reward": 0.9408482536673546, "step": 54 }, { "completion_length": 245.969877243042, "epoch": 0.08503237027732148, "grad_norm": 0.24577109797822985, "kl": 0.030364990234375, "learning_rate": 9.822209166377635e-07, "loss": 0.0012, "reward": 1.1595982685685158, "reward_std": 0.28058389481157064, "rewards/accuracy_reward": 0.21093751082662493, "rewards/format_reward": 0.9486607573926449, "step": 55 }, { "completion_length": 239.87166213989258, "epoch": 0.08657841337327278, "grad_norm": 0.25455760748669054, "kl": 0.02925872802734375, "learning_rate": 9.815725637431661e-07, "loss": 0.0012, "reward": 1.1395089849829674, "reward_std": 0.2828006064519286, "rewards/accuracy_reward": 0.1908482233993709, "rewards/format_reward": 0.948660746216774, "step": 56 }, { "completion_length": 234.641752243042, "epoch": 0.08812445646922408, "grad_norm": 0.22606697065173845, "kl": 0.0311126708984375, "learning_rate": 9.809128215864096e-07, "loss": 0.0012, "reward": 1.155133981257677, "reward_std": 0.2595007698982954, "rewards/accuracy_reward": 0.196428582072258, "rewards/format_reward": 0.9587053842842579, "step": 57 }, { "completion_length": 269.3772430419922, "epoch": 0.08967049956517538, "grad_norm": 0.226077968521564, "kl": 0.027496337890625, "learning_rate": 9.802417057704929e-07, "loss": 0.0011, "reward": 1.0803571827709675, "reward_std": 0.26716334372758865, "rewards/accuracy_reward": 0.129464291036129, "rewards/format_reward": 0.9508928954601288, "step": 58 }, { "completion_length": 244.23773193359375, "epoch": 0.09121654266112668, "grad_norm": 0.2233786534493805, "kl": 0.03432464599609375, "learning_rate": 9.795592321674045e-07, "loss": 0.0014, "reward": 1.1607143431901932, "reward_std": 0.25086580449715257, "rewards/accuracy_reward": 0.18750001036096364, "rewards/format_reward": 0.9732143133878708, "step": 59 }, { "completion_length": 231.01117324829102, "epoch": 0.09276258575707798, "grad_norm": 0.4580620856422284, "kl": 0.03543853759765625, "learning_rate": 9.788654169177453e-07, "loss": 0.0014, "reward": 1.1640625447034836, "reward_std": 0.2529484983533621, "rewards/accuracy_reward": 0.20758929615840316, "rewards/format_reward": 0.9564732424914837, "step": 60 }, { "completion_length": 244.56809043884277, "epoch": 0.09430862885302928, "grad_norm": 0.20128259932243345, "kl": 0.029449462890625, "learning_rate": 9.781602764303488e-07, "loss": 0.0012, "reward": 1.132812537252903, "reward_std": 0.2523211594671011, "rewards/accuracy_reward": 0.16964286321308464, "rewards/format_reward": 0.963169664144516, "step": 61 }, { "completion_length": 252.47211074829102, "epoch": 0.09585467194898058, "grad_norm": 0.24385801279851155, "kl": 0.0316009521484375, "learning_rate": 9.77443827381891e-07, "loss": 0.0013, "reward": 1.2008929029107094, "reward_std": 0.24801458697766066, "rewards/accuracy_reward": 0.23883929755538702, "rewards/format_reward": 0.9620536044239998, "step": 62 }, { "completion_length": 256.4062604904175, "epoch": 0.09740071504493188, "grad_norm": 0.24957114344615253, "kl": 0.03313446044921875, "learning_rate": 9.767160867164978e-07, "loss": 0.0013, "reward": 1.160714328289032, "reward_std": 0.2779285181313753, "rewards/accuracy_reward": 0.1908482233993709, "rewards/format_reward": 0.9698661044239998, "step": 63 }, { "completion_length": 235.95759963989258, "epoch": 0.09894675814088318, "grad_norm": 0.2794599497921743, "kl": 0.0349578857421875, "learning_rate": 9.759770716453434e-07, "loss": 0.0014, "reward": 1.2165178954601288, "reward_std": 0.29280879255384207, "rewards/accuracy_reward": 0.25111608440056443, "rewards/format_reward": 0.9654018133878708, "step": 64 }, { "completion_length": 261.4107246398926, "epoch": 0.10049280123683448, "grad_norm": 0.271406689955585, "kl": 0.0377349853515625, "learning_rate": 9.752267996462433e-07, "loss": 0.0015, "reward": 1.1618304178118706, "reward_std": 0.2749327663332224, "rewards/accuracy_reward": 0.20647322479635477, "rewards/format_reward": 0.9553571753203869, "step": 65 }, { "completion_length": 287.212064743042, "epoch": 0.10203884433278577, "grad_norm": 0.2590940929975125, "kl": 0.033905029296875, "learning_rate": 9.744652884632404e-07, "loss": 0.0014, "reward": 1.1540179140865803, "reward_std": 0.3410901091992855, "rewards/accuracy_reward": 0.21651786717120558, "rewards/format_reward": 0.937500037252903, "step": 66 }, { "completion_length": 265.23438262939453, "epoch": 0.10358488742873707, "grad_norm": 0.19807021326442692, "kl": 0.03308868408203125, "learning_rate": 9.73692556106187e-07, "loss": 0.0013, "reward": 1.2075893357396126, "reward_std": 0.28592246398329735, "rewards/accuracy_reward": 0.2533482253784314, "rewards/format_reward": 0.9542410969734192, "step": 67 }, { "completion_length": 244.30581378936768, "epoch": 0.10513093052468837, "grad_norm": 0.2936256841896629, "kl": 0.04285430908203125, "learning_rate": 9.729086208503173e-07, "loss": 0.0017, "reward": 1.1729911155998707, "reward_std": 0.29887373745441437, "rewards/accuracy_reward": 0.20312501140870154, "rewards/format_reward": 0.9698660969734192, "step": 68 }, { "completion_length": 257.85269355773926, "epoch": 0.10667697362063967, "grad_norm": 0.2643373000547416, "kl": 0.0377349853515625, "learning_rate": 9.721135012358154e-07, "loss": 0.0015, "reward": 1.1986607685685158, "reward_std": 0.30326394457370043, "rewards/accuracy_reward": 0.23995536949951202, "rewards/format_reward": 0.9587053991854191, "step": 69 }, { "completion_length": 250.25558948516846, "epoch": 0.10822301671659097, "grad_norm": 0.2677330904686442, "kl": 0.0399627685546875, "learning_rate": 9.713072160673777e-07, "loss": 0.0016, "reward": 1.2220982760190964, "reward_std": 0.3052195580676198, "rewards/accuracy_reward": 0.26116072945296764, "rewards/format_reward": 0.9609375298023224, "step": 70 }, { "completion_length": 262.5480012893677, "epoch": 0.10976905981254227, "grad_norm": 0.25981467582773177, "kl": 0.040008544921875, "learning_rate": 9.704897844137672e-07, "loss": 0.0016, "reward": 1.2354911267757416, "reward_std": 0.305774194188416, "rewards/accuracy_reward": 0.2712053726427257, "rewards/format_reward": 0.9642857387661934, "step": 71 }, { "completion_length": 252.06474113464355, "epoch": 0.11131510290849357, "grad_norm": 0.33495594743235946, "kl": 0.0447998046875, "learning_rate": 9.696612256073633e-07, "loss": 0.0018, "reward": 1.2745536267757416, "reward_std": 0.36414896231144667, "rewards/accuracy_reward": 0.31250001583248377, "rewards/format_reward": 0.9620536044239998, "step": 72 }, { "completion_length": 267.68750953674316, "epoch": 0.11286114600444487, "grad_norm": 0.2901814604151681, "kl": 0.04259490966796875, "learning_rate": 9.688215592437038e-07, "loss": 0.0017, "reward": 1.23995541036129, "reward_std": 0.34881133772432804, "rewards/accuracy_reward": 0.28571430034935474, "rewards/format_reward": 0.9542411118745804, "step": 73 }, { "completion_length": 260.6060400009155, "epoch": 0.11440718910039617, "grad_norm": 0.31179710315006265, "kl": 0.0467529296875, "learning_rate": 9.679708051810221e-07, "loss": 0.0019, "reward": 1.2901786267757416, "reward_std": 0.31386987678706646, "rewards/accuracy_reward": 0.32589287171140313, "rewards/format_reward": 0.9642857499420643, "step": 74 }, { "completion_length": 256.3872890472412, "epoch": 0.11595323219634747, "grad_norm": 0.2875833359787326, "kl": 0.050994873046875, "learning_rate": 9.67108983539777e-07, "loss": 0.002, "reward": 1.2566964775323868, "reward_std": 0.3209824347868562, "rewards/accuracy_reward": 0.3013393022119999, "rewards/format_reward": 0.9553571715950966, "step": 75 }, { "completion_length": 266.7890739440918, "epoch": 0.11749927529229877, "grad_norm": 0.2596145911541772, "kl": 0.0485382080078125, "learning_rate": 9.66236114702178e-07, "loss": 0.0019, "reward": 1.2276786267757416, "reward_std": 0.3368382640182972, "rewards/accuracy_reward": 0.27343751583248377, "rewards/format_reward": 0.9542411081492901, "step": 76 }, { "completion_length": 298.46094608306885, "epoch": 0.11904531838825007, "grad_norm": 0.24967101135804037, "kl": 0.04463958740234375, "learning_rate": 9.653522193117012e-07, "loss": 0.0018, "reward": 1.2075893431901932, "reward_std": 0.2983228312805295, "rewards/accuracy_reward": 0.2455357238650322, "rewards/format_reward": 0.9620535969734192, "step": 77 }, { "completion_length": 271.88059425354004, "epoch": 0.12059136148420137, "grad_norm": 0.30357666896531355, "kl": 0.05115509033203125, "learning_rate": 9.644573182726034e-07, "loss": 0.002, "reward": 1.2779018357396126, "reward_std": 0.3416177658364177, "rewards/accuracy_reward": 0.31473216135054827, "rewards/format_reward": 0.9631696715950966, "step": 78 }, { "completion_length": 289.7444324493408, "epoch": 0.12213740458015267, "grad_norm": 0.3426903455243879, "kl": 0.05393218994140625, "learning_rate": 9.63551432749426e-07, "loss": 0.0022, "reward": 1.2912947088479996, "reward_std": 0.3500705398619175, "rewards/accuracy_reward": 0.34821430034935474, "rewards/format_reward": 0.9430803991854191, "step": 79 }, { "completion_length": 321.33260345458984, "epoch": 0.12368344767610397, "grad_norm": 0.2390950823744736, "kl": 0.04193115234375, "learning_rate": 9.626345841664951e-07, "loss": 0.0017, "reward": 1.2488839887082577, "reward_std": 0.33890149369835854, "rewards/accuracy_reward": 0.3046875192085281, "rewards/format_reward": 0.9441964663565159, "step": 80 }, { "completion_length": 302.24443340301514, "epoch": 0.12522949077205528, "grad_norm": 0.24699065254774763, "kl": 0.0466766357421875, "learning_rate": 9.617067942074152e-07, "loss": 0.0019, "reward": 1.251116119325161, "reward_std": 0.3308594347909093, "rewards/accuracy_reward": 0.2901785826543346, "rewards/format_reward": 0.9609375409781933, "step": 81 }, { "completion_length": 306.3482303619385, "epoch": 0.12677553386800658, "grad_norm": 0.2693849117550411, "kl": 0.04518890380859375, "learning_rate": 9.607680848145556e-07, "loss": 0.0018, "reward": 1.2399554252624512, "reward_std": 0.33904541470110416, "rewards/accuracy_reward": 0.29687501583248377, "rewards/format_reward": 0.9430803917348385, "step": 82 }, { "completion_length": 310.2366189956665, "epoch": 0.12832157696395788, "grad_norm": 0.2489242581559146, "kl": 0.05149078369140625, "learning_rate": 9.598184781885318e-07, "loss": 0.0021, "reward": 1.2823661118745804, "reward_std": 0.329466306604445, "rewards/accuracy_reward": 0.3404017984867096, "rewards/format_reward": 0.941964328289032, "step": 83 }, { "completion_length": 312.7756824493408, "epoch": 0.12986762005990918, "grad_norm": 0.2908753733409944, "kl": 0.0536651611328125, "learning_rate": 9.588579967876804e-07, "loss": 0.0021, "reward": 1.3169643506407738, "reward_std": 0.35681935027241707, "rewards/accuracy_reward": 0.36607145331799984, "rewards/format_reward": 0.9508928917348385, "step": 84 }, { "completion_length": 327.04130840301514, "epoch": 0.13141366315586048, "grad_norm": 0.46834186524915894, "kl": 0.050262451171875, "learning_rate": 9.578866633275286e-07, "loss": 0.002, "reward": 1.2890625521540642, "reward_std": 0.3798081297427416, "rewards/accuracy_reward": 0.3604910923168063, "rewards/format_reward": 0.9285714589059353, "step": 85 }, { "completion_length": 303.27679920196533, "epoch": 0.13295970625181178, "grad_norm": 0.244377166061853, "kl": 0.05584716796875, "learning_rate": 9.569045007802557e-07, "loss": 0.0022, "reward": 1.322544701397419, "reward_std": 0.3325744904577732, "rewards/accuracy_reward": 0.3649553768336773, "rewards/format_reward": 0.9575893208384514, "step": 86 }, { "completion_length": 307.0279140472412, "epoch": 0.13450574934776308, "grad_norm": 0.617101059241209, "kl": 0.05120849609375, "learning_rate": 9.55911532374151e-07, "loss": 0.002, "reward": 1.3448661342263222, "reward_std": 0.3672496071085334, "rewards/accuracy_reward": 0.3883928759023547, "rewards/format_reward": 0.9564732424914837, "step": 87 }, { "completion_length": 336.92970085144043, "epoch": 0.13605179244371438, "grad_norm": 0.40241764589377704, "kl": 0.0427703857421875, "learning_rate": 9.549077815930636e-07, "loss": 0.0017, "reward": 1.2857143506407738, "reward_std": 0.30609494261443615, "rewards/accuracy_reward": 0.34598215762525797, "rewards/format_reward": 0.9397321753203869, "step": 88 }, { "completion_length": 348.71206855773926, "epoch": 0.13759783553966567, "grad_norm": 0.430342566504197, "kl": 0.0446624755859375, "learning_rate": 9.538932721758473e-07, "loss": 0.0018, "reward": 1.2444196939468384, "reward_std": 0.30883802752941847, "rewards/accuracy_reward": 0.3080357291037217, "rewards/format_reward": 0.9363839626312256, "step": 89 }, { "completion_length": 334.3705520629883, "epoch": 0.13914387863561697, "grad_norm": 0.2101177909480244, "kl": 0.04990386962890625, "learning_rate": 9.528680281157998e-07, "loss": 0.002, "reward": 1.3504464775323868, "reward_std": 0.31677883863449097, "rewards/accuracy_reward": 0.3906250186264515, "rewards/format_reward": 0.959821455180645, "step": 90 }, { "completion_length": 326.7120666503906, "epoch": 0.14068992173156827, "grad_norm": 667661.0500854992, "kl": 4.5834197998046875, "learning_rate": 9.518320736600942e-07, "loss": 0.1829, "reward": 1.379464365541935, "reward_std": 0.35738139785826206, "rewards/accuracy_reward": 0.4341518059372902, "rewards/format_reward": 0.9453125335276127, "step": 91 }, { "completion_length": 310.23885345458984, "epoch": 0.14223596482751957, "grad_norm": 0.9700590150648007, "kl": 0.0526580810546875, "learning_rate": 9.507854333092063e-07, "loss": 0.0021, "reward": 1.36495541036129, "reward_std": 0.34171361569315195, "rewards/accuracy_reward": 0.4229910932481289, "rewards/format_reward": 0.9419643245637417, "step": 92 }, { "completion_length": 367.085955619812, "epoch": 0.14378200792347087, "grad_norm": 0.21325265667213117, "kl": 0.038970947265625, "learning_rate": 9.497281318163346e-07, "loss": 0.0016, "reward": 1.3113840036094189, "reward_std": 0.3380258809775114, "rewards/accuracy_reward": 0.377232164144516, "rewards/format_reward": 0.9341518133878708, "step": 93 }, { "completion_length": 306.70314025878906, "epoch": 0.14532805101942217, "grad_norm": 0.24003577477946295, "kl": 0.047210693359375, "learning_rate": 9.486601941868153e-07, "loss": 0.0019, "reward": 1.3895089998841286, "reward_std": 0.345655282959342, "rewards/accuracy_reward": 0.4375000186264515, "rewards/format_reward": 0.9520089626312256, "step": 94 }, { "completion_length": 336.61497020721436, "epoch": 0.14687409411537347, "grad_norm": 0.19288701509697956, "kl": 0.0437774658203125, "learning_rate": 9.475816456775312e-07, "loss": 0.0018, "reward": 1.3091518431901932, "reward_std": 0.32058625761419535, "rewards/accuracy_reward": 0.34821430034935474, "rewards/format_reward": 0.9609375298023224, "step": 95 }, { "completion_length": 337.22099685668945, "epoch": 0.14842013721132477, "grad_norm": 0.2121676495980788, "kl": 0.0443878173828125, "learning_rate": 9.464925117963131e-07, "loss": 0.0018, "reward": 1.285714328289032, "reward_std": 0.32860798202455044, "rewards/accuracy_reward": 0.33482144493609667, "rewards/format_reward": 0.9508928917348385, "step": 96 }, { "completion_length": 347.4419822692871, "epoch": 0.14996618030727607, "grad_norm": 6.255161105372137, "kl": 0.0486907958984375, "learning_rate": 9.453928183013384e-07, "loss": 0.0019, "reward": 1.2979911267757416, "reward_std": 0.31181239150464535, "rewards/accuracy_reward": 0.34040180093143135, "rewards/format_reward": 0.9575893171131611, "step": 97 }, { "completion_length": 343.88841342926025, "epoch": 0.15151222340322737, "grad_norm": 0.20834458839191217, "kl": 0.04443359375, "learning_rate": 9.442825912005201e-07, "loss": 0.0018, "reward": 1.3660714849829674, "reward_std": 0.3290967810899019, "rewards/accuracy_reward": 0.41406252328306437, "rewards/format_reward": 0.9520089589059353, "step": 98 }, { "completion_length": 329.5837230682373, "epoch": 0.15305826649917867, "grad_norm": 0.22243086973092813, "kl": 0.0462188720703125, "learning_rate": 9.431618567508932e-07, "loss": 0.0018, "reward": 1.3169643506407738, "reward_std": 0.33615455124527216, "rewards/accuracy_reward": 0.3593750176951289, "rewards/format_reward": 0.9575893208384514, "step": 99 }, { "completion_length": 353.6886291503906, "epoch": 0.15460430959512997, "grad_norm": 0.2254671905769049, "kl": 0.04034423828125, "learning_rate": 9.420306414579924e-07, "loss": 0.0016, "reward": 1.312500074505806, "reward_std": 0.3170872125774622, "rewards/accuracy_reward": 0.35937502048909664, "rewards/format_reward": 0.9531250335276127, "step": 100 }, { "completion_length": 327.55023765563965, "epoch": 0.15615035269108127, "grad_norm": 0.22645019383559273, "kl": 0.04833221435546875, "learning_rate": 9.408889720752265e-07, "loss": 0.0019, "reward": 1.3325893208384514, "reward_std": 0.35009961016476154, "rewards/accuracy_reward": 0.37500001583248377, "rewards/format_reward": 0.9575893208384514, "step": 101 }, { "completion_length": 319.8013553619385, "epoch": 0.15769639578703257, "grad_norm": 136.94510050609546, "kl": 0.0529022216796875, "learning_rate": 9.397368756032444e-07, "loss": 0.0021, "reward": 1.3727679178118706, "reward_std": 0.3476991895586252, "rewards/accuracy_reward": 0.4241071632131934, "rewards/format_reward": 0.948660746216774, "step": 102 }, { "completion_length": 309.92970180511475, "epoch": 0.15924243888298387, "grad_norm": 0.23241872812832787, "kl": 0.04306793212890625, "learning_rate": 9.385743792892982e-07, "loss": 0.0017, "reward": 1.322544701397419, "reward_std": 0.31133833061903715, "rewards/accuracy_reward": 0.34821430407464504, "rewards/format_reward": 0.9743303880095482, "step": 103 }, { "completion_length": 329.3705539703369, "epoch": 0.16078848197893517, "grad_norm": 0.27889347034529993, "kl": 0.046905517578125, "learning_rate": 9.374015106265966e-07, "loss": 0.0019, "reward": 1.3214286342263222, "reward_std": 0.3270298717543483, "rewards/accuracy_reward": 0.3504464440047741, "rewards/format_reward": 0.9709821715950966, "step": 104 }, { "completion_length": 302.4553689956665, "epoch": 0.16233452507488647, "grad_norm": 0.21855190148548795, "kl": 0.0495147705078125, "learning_rate": 9.362182973536567e-07, "loss": 0.002, "reward": 1.3604911267757416, "reward_std": 0.34386332891881466, "rewards/accuracy_reward": 0.3939732266589999, "rewards/format_reward": 0.9665178842842579, "step": 105 }, { "completion_length": 310.50894355773926, "epoch": 0.16388056817083776, "grad_norm": 0.2183271358211183, "kl": 0.0424652099609375, "learning_rate": 9.35024767453647e-07, "loss": 0.0017, "reward": 1.331473283469677, "reward_std": 0.32679081335663795, "rewards/accuracy_reward": 0.3604910895228386, "rewards/format_reward": 0.9709821790456772, "step": 106 }, { "completion_length": 305.7823791503906, "epoch": 0.16542661126678906, "grad_norm": 0.22827434444187106, "kl": 0.04443359375, "learning_rate": 9.338209491537256e-07, "loss": 0.0018, "reward": 1.300223283469677, "reward_std": 0.3417002186179161, "rewards/accuracy_reward": 0.33482144493609667, "rewards/format_reward": 0.9654018208384514, "step": 107 }, { "completion_length": 288.45983123779297, "epoch": 0.16697265436274036, "grad_norm": 0.2849358250185683, "kl": 0.043609619140625, "learning_rate": 9.326068709243726e-07, "loss": 0.0017, "reward": 1.368303656578064, "reward_std": 0.329782934859395, "rewards/accuracy_reward": 0.3928571604192257, "rewards/format_reward": 0.9754464589059353, "step": 108 }, { "completion_length": 306.11720275878906, "epoch": 0.16851869745869166, "grad_norm": 0.5004635897376872, "kl": 0.0540313720703125, "learning_rate": 9.313825614787177e-07, "loss": 0.0022, "reward": 1.2924107685685158, "reward_std": 0.259003646671772, "rewards/accuracy_reward": 0.3314732313156128, "rewards/format_reward": 0.9609375409781933, "step": 109 }, { "completion_length": 312.5613965988159, "epoch": 0.17006474055464296, "grad_norm": 0.25696558135971803, "kl": 0.0430145263671875, "learning_rate": 9.301480497718592e-07, "loss": 0.0017, "reward": 1.2812500596046448, "reward_std": 0.35916319489479065, "rewards/accuracy_reward": 0.32924108766019344, "rewards/format_reward": 0.9520089589059353, "step": 110 }, { "completion_length": 301.4062614440918, "epoch": 0.17161078365059426, "grad_norm": 0.22067680937156378, "kl": 0.04510498046875, "learning_rate": 9.289033650001816e-07, "loss": 0.0018, "reward": 1.2901786267757416, "reward_std": 0.35871356166899204, "rewards/accuracy_reward": 0.32254465762525797, "rewards/format_reward": 0.9676339700818062, "step": 111 }, { "completion_length": 298.9821605682373, "epoch": 0.17315682674654556, "grad_norm": 0.2672028517519709, "kl": 0.04608154296875, "learning_rate": 9.276485366006633e-07, "loss": 0.0018, "reward": 1.2935268580913544, "reward_std": 0.351310508325696, "rewards/accuracy_reward": 0.34375001676380634, "rewards/format_reward": 0.9497768171131611, "step": 112 }, { "completion_length": 316.8113946914673, "epoch": 0.17470286984249686, "grad_norm": 0.2298310112709615, "kl": 0.0425872802734375, "learning_rate": 9.263835942501806e-07, "loss": 0.0017, "reward": 1.29464291036129, "reward_std": 0.3255584565922618, "rewards/accuracy_reward": 0.32477680034935474, "rewards/format_reward": 0.9698661081492901, "step": 113 }, { "completion_length": 279.808048248291, "epoch": 0.17624891293844816, "grad_norm": 0.2615209587928486, "kl": 0.047119140625, "learning_rate": 9.251085678648071e-07, "loss": 0.0019, "reward": 1.3571429252624512, "reward_std": 0.36835150606930256, "rewards/accuracy_reward": 0.39285716228187084, "rewards/format_reward": 0.964285746216774, "step": 114 }, { "completion_length": 311.8616189956665, "epoch": 0.17779495603439946, "grad_norm": 0.23780266368010114, "kl": 0.04186248779296875, "learning_rate": 9.238234875991045e-07, "loss": 0.0017, "reward": 1.2678572125732899, "reward_std": 0.346679387614131, "rewards/accuracy_reward": 0.3169643022119999, "rewards/format_reward": 0.9508928917348385, "step": 115 }, { "completion_length": 287.80693531036377, "epoch": 0.17934099913035076, "grad_norm": 0.21904152204670801, "kl": 0.04578399658203125, "learning_rate": 9.22528383845411e-07, "loss": 0.0018, "reward": 1.3191964849829674, "reward_std": 0.30637710727751255, "rewards/accuracy_reward": 0.3549107341095805, "rewards/format_reward": 0.964285746216774, "step": 116 }, { "completion_length": 296.1205472946167, "epoch": 0.18088704222630206, "grad_norm": 0.22047302101468638, "kl": 0.04093170166015625, "learning_rate": 9.212232872331209e-07, "loss": 0.0016, "reward": 1.3772322237491608, "reward_std": 0.3038821369409561, "rewards/accuracy_reward": 0.39508930407464504, "rewards/format_reward": 0.9821428880095482, "step": 117 }, { "completion_length": 305.0055913925171, "epoch": 0.18243308532225336, "grad_norm": 0.26059782794497754, "kl": 0.04022979736328125, "learning_rate": 9.19908228627962e-07, "loss": 0.0016, "reward": 1.2645089700818062, "reward_std": 0.3319691941142082, "rewards/accuracy_reward": 0.31361608672887087, "rewards/format_reward": 0.9508928842842579, "step": 118 }, { "completion_length": 308.14956855773926, "epoch": 0.18397912841820466, "grad_norm": 0.24452677272376233, "kl": 0.0396270751953125, "learning_rate": 9.185832391312642e-07, "loss": 0.0016, "reward": 1.28683041036129, "reward_std": 0.3054819880053401, "rewards/accuracy_reward": 0.3125000153668225, "rewards/format_reward": 0.9743303880095482, "step": 119 }, { "completion_length": 306.4776916503906, "epoch": 0.18552517151415596, "grad_norm": 0.2411703296892844, "kl": 0.04616546630859375, "learning_rate": 9.172483500792244e-07, "loss": 0.0018, "reward": 1.280133992433548, "reward_std": 0.3194332104176283, "rewards/accuracy_reward": 0.31250001303851604, "rewards/format_reward": 0.9676339589059353, "step": 120 }, { "completion_length": 306.87501335144043, "epoch": 0.18707121461010726, "grad_norm": 0.22384641511287115, "kl": 0.04351043701171875, "learning_rate": 9.159035930421657e-07, "loss": 0.0017, "reward": 1.3470982760190964, "reward_std": 0.31912598572671413, "rewards/accuracy_reward": 0.3738839477300644, "rewards/format_reward": 0.9732143245637417, "step": 121 }, { "completion_length": 307.32032585144043, "epoch": 0.18861725770605856, "grad_norm": 0.23599895690934058, "kl": 0.0441741943359375, "learning_rate": 9.145489998237901e-07, "loss": 0.0018, "reward": 1.2946429178118706, "reward_std": 0.3349267188459635, "rewards/accuracy_reward": 0.33035715762525797, "rewards/format_reward": 0.964285746216774, "step": 122 }, { "completion_length": 318.9486770629883, "epoch": 0.19016330080200985, "grad_norm": 0.21871873097406488, "kl": 0.0420379638671875, "learning_rate": 9.131846024604273e-07, "loss": 0.0017, "reward": 1.2745536342263222, "reward_std": 0.33481548074632883, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.9620536044239998, "step": 123 }, { "completion_length": 306.6886291503906, "epoch": 0.19170934389796115, "grad_norm": 0.21954267148575277, "kl": 0.0423126220703125, "learning_rate": 9.118104332202758e-07, "loss": 0.0017, "reward": 1.2991072088479996, "reward_std": 0.3247332517057657, "rewards/accuracy_reward": 0.3359375186264515, "rewards/format_reward": 0.9631696678698063, "step": 124 }, { "completion_length": 294.57032775878906, "epoch": 0.19325538699391245, "grad_norm": 82.84953301785417, "kl": 2.3736343383789062, "learning_rate": 9.104265246026414e-07, "loss": 0.0947, "reward": 1.3415179178118706, "reward_std": 0.34987304732203484, "rewards/accuracy_reward": 0.38058037497103214, "rewards/format_reward": 0.9609375335276127, "step": 125 }, { "completion_length": 316.3493432998657, "epoch": 0.19480143008986375, "grad_norm": 0.39760407529404335, "kl": 0.0465850830078125, "learning_rate": 9.090329093371665e-07, "loss": 0.0019, "reward": 1.2901786267757416, "reward_std": 0.33536100946366787, "rewards/accuracy_reward": 0.3303571594879031, "rewards/format_reward": 0.9598214700818062, "step": 126 }, { "completion_length": 289.7466630935669, "epoch": 0.19634747318581505, "grad_norm": 0.2671154015585535, "kl": 0.048309326171875, "learning_rate": 9.076296203830578e-07, "loss": 0.0019, "reward": 1.3091518431901932, "reward_std": 0.30847785621881485, "rewards/accuracy_reward": 0.3381696594879031, "rewards/format_reward": 0.9709821790456772, "step": 127 }, { "completion_length": 285.84376430511475, "epoch": 0.19789351628176635, "grad_norm": 0.2490903812086192, "kl": 0.05268096923828125, "learning_rate": 9.062166909283061e-07, "loss": 0.0021, "reward": 1.3091518506407738, "reward_std": 0.3371814787387848, "rewards/accuracy_reward": 0.35156251303851604, "rewards/format_reward": 0.9575893208384514, "step": 128 }, { "completion_length": 299.4966650009155, "epoch": 0.19943955937771765, "grad_norm": 0.21982925854905125, "kl": 0.04480743408203125, "learning_rate": 9.047941543889014e-07, "loss": 0.0018, "reward": 1.2845982611179352, "reward_std": 0.33614030480384827, "rewards/accuracy_reward": 0.32924108672887087, "rewards/format_reward": 0.9553571827709675, "step": 129 }, { "completion_length": 285.21095085144043, "epoch": 0.20098560247366895, "grad_norm": 0.28619510820382726, "kl": 0.04383087158203125, "learning_rate": 9.033620444080426e-07, "loss": 0.0018, "reward": 1.3203125782310963, "reward_std": 0.3875390188768506, "rewards/accuracy_reward": 0.3683035890571773, "rewards/format_reward": 0.9520089663565159, "step": 130 }, { "completion_length": 299.3113965988159, "epoch": 0.20253164556962025, "grad_norm": 1.7967557965625058, "kl": 0.0835723876953125, "learning_rate": 9.019203948553421e-07, "loss": 0.0033, "reward": 1.23995541036129, "reward_std": 0.30588393565267324, "rewards/accuracy_reward": 0.2879464407451451, "rewards/format_reward": 0.9520089626312256, "step": 131 }, { "completion_length": 270.7310380935669, "epoch": 0.20407768866557155, "grad_norm": 0.25402770603597896, "kl": 0.05242919921875, "learning_rate": 9.004692398260243e-07, "loss": 0.0021, "reward": 1.3314732611179352, "reward_std": 0.32832660991698503, "rewards/accuracy_reward": 0.3794643059372902, "rewards/format_reward": 0.9520089626312256, "step": 132 }, { "completion_length": 291.0993432998657, "epoch": 0.20562373176152285, "grad_norm": 0.26761306927880546, "kl": 0.0457763671875, "learning_rate": 8.990086136401198e-07, "loss": 0.0018, "reward": 1.3002232685685158, "reward_std": 0.36694241128861904, "rewards/accuracy_reward": 0.3415178768336773, "rewards/format_reward": 0.9587053917348385, "step": 133 }, { "completion_length": 285.50001525878906, "epoch": 0.20716977485747415, "grad_norm": 0.2824997997678576, "kl": 0.04901885986328125, "learning_rate": 8.975385508416531e-07, "loss": 0.002, "reward": 1.3370536267757416, "reward_std": 0.3776315450668335, "rewards/accuracy_reward": 0.38616073317825794, "rewards/format_reward": 0.9508928917348385, "step": 134 }, { "completion_length": 290.7187614440918, "epoch": 0.20871581795342545, "grad_norm": 0.21403046936952053, "kl": 0.04132080078125, "learning_rate": 8.960590861978265e-07, "loss": 0.0017, "reward": 1.2790179140865803, "reward_std": 0.34963255655020475, "rewards/accuracy_reward": 0.3337053759023547, "rewards/format_reward": 0.945312537252903, "step": 135 }, { "completion_length": 291.65402603149414, "epoch": 0.21026186104937675, "grad_norm": 0.2828234757926595, "kl": 0.0491485595703125, "learning_rate": 8.945702546981968e-07, "loss": 0.002, "reward": 1.2533482611179352, "reward_std": 0.37725854851305485, "rewards/accuracy_reward": 0.3314732303842902, "rewards/format_reward": 0.9218750409781933, "step": 136 }, { "completion_length": 276.05804538726807, "epoch": 0.21180790414532805, "grad_norm": 0.253470345026858, "kl": 0.0517730712890625, "learning_rate": 8.930720915538485e-07, "loss": 0.0021, "reward": 1.3482143580913544, "reward_std": 0.3660743460059166, "rewards/accuracy_reward": 0.4095982350409031, "rewards/format_reward": 0.9386161081492901, "step": 137 }, { "completion_length": 279.04130840301514, "epoch": 0.21335394724127935, "grad_norm": 0.3250567559879322, "kl": 0.04885101318359375, "learning_rate": 8.915646321965613e-07, "loss": 0.002, "reward": 1.2901786342263222, "reward_std": 0.33989580534398556, "rewards/accuracy_reward": 0.34151787031441927, "rewards/format_reward": 0.9486607499420643, "step": 138 }, { "completion_length": 270.4475574493408, "epoch": 0.21489999033723065, "grad_norm": 0.2786796377017866, "kl": 0.05115509033203125, "learning_rate": 8.900479122779711e-07, "loss": 0.002, "reward": 1.338169701397419, "reward_std": 0.38602107763290405, "rewards/accuracy_reward": 0.404017873108387, "rewards/format_reward": 0.9341518208384514, "step": 139 }, { "completion_length": 303.4107313156128, "epoch": 0.21644603343318194, "grad_norm": 0.23657258613784393, "kl": 0.0437469482421875, "learning_rate": 8.885219676687276e-07, "loss": 0.0017, "reward": 1.1897322051227093, "reward_std": 0.3734086202457547, "rewards/accuracy_reward": 0.27455358882434666, "rewards/format_reward": 0.9151786081492901, "step": 140 }, { "completion_length": 280.35269260406494, "epoch": 0.21799207652913324, "grad_norm": 0.25802001331868923, "kl": 0.04736328125, "learning_rate": 8.869868344576459e-07, "loss": 0.0019, "reward": 1.2533482611179352, "reward_std": 0.39182595163583755, "rewards/accuracy_reward": 0.32700894214212894, "rewards/format_reward": 0.9263393208384514, "step": 141 }, { "completion_length": 309.6863965988159, "epoch": 0.21953811962508454, "grad_norm": 0.2719873166245055, "kl": 0.0402679443359375, "learning_rate": 8.85442548950853e-07, "loss": 0.0016, "reward": 1.27120541036129, "reward_std": 0.38497710414230824, "rewards/accuracy_reward": 0.34040180360898376, "rewards/format_reward": 0.9308036081492901, "step": 142 }, { "completion_length": 302.0725612640381, "epoch": 0.22108416272103584, "grad_norm": 0.2672741859644189, "kl": 0.04813385009765625, "learning_rate": 8.838891476709287e-07, "loss": 0.0019, "reward": 1.3348214849829674, "reward_std": 0.37788633070886135, "rewards/accuracy_reward": 0.396205373108387, "rewards/format_reward": 0.9386161006987095, "step": 143 }, { "completion_length": 302.4475574493408, "epoch": 0.22263020581698714, "grad_norm": 0.2391000348272415, "kl": 0.05170440673828125, "learning_rate": 8.823266673560425e-07, "loss": 0.0021, "reward": 1.2935268431901932, "reward_std": 0.3642970584332943, "rewards/accuracy_reward": 0.36607144866138697, "rewards/format_reward": 0.9274553991854191, "step": 144 }, { "completion_length": 300.9509048461914, "epoch": 0.22417624891293844, "grad_norm": 0.3324468588576691, "kl": 0.04872894287109375, "learning_rate": 8.807551449590844e-07, "loss": 0.0019, "reward": 1.2868304215371609, "reward_std": 0.36673163436353207, "rewards/accuracy_reward": 0.34933037543669343, "rewards/format_reward": 0.9375000335276127, "step": 145 }, { "completion_length": 301.10939025878906, "epoch": 0.22572229200888974, "grad_norm": 0.22559667500894864, "kl": 0.0496978759765625, "learning_rate": 8.791746176467907e-07, "loss": 0.002, "reward": 1.3281250596046448, "reward_std": 0.3775293305516243, "rewards/accuracy_reward": 0.3861607303842902, "rewards/format_reward": 0.9419643208384514, "step": 146 }, { "completion_length": 271.50558948516846, "epoch": 0.22726833510484104, "grad_norm": 0.37117896261243266, "kl": 0.0575103759765625, "learning_rate": 8.775851227988655e-07, "loss": 0.0023, "reward": 1.3872768580913544, "reward_std": 0.3841982949525118, "rewards/accuracy_reward": 0.43638395331799984, "rewards/format_reward": 0.9508928805589676, "step": 147 }, { "completion_length": 313.32032775878906, "epoch": 0.22881437820079234, "grad_norm": 0.26352904718984504, "kl": 0.04969024658203125, "learning_rate": 8.759866980070962e-07, "loss": 0.002, "reward": 1.3125000521540642, "reward_std": 0.3357900194823742, "rewards/accuracy_reward": 0.36272323317825794, "rewards/format_reward": 0.9497768208384514, "step": 148 }, { "completion_length": 296.2634038925171, "epoch": 0.23036042129674364, "grad_norm": 0.22949773790306577, "kl": 0.0525360107421875, "learning_rate": 8.743793810744653e-07, "loss": 0.0021, "reward": 1.3504464998841286, "reward_std": 0.3298855032771826, "rewards/accuracy_reward": 0.4062500186264515, "rewards/format_reward": 0.9441964663565159, "step": 149 }, { "completion_length": 310.3928737640381, "epoch": 0.23190646439269494, "grad_norm": 0.23505298800397886, "kl": 0.0519866943359375, "learning_rate": 8.72763210014255e-07, "loss": 0.0021, "reward": 1.2154018431901932, "reward_std": 0.35720257088541985, "rewards/accuracy_reward": 0.28013394214212894, "rewards/format_reward": 0.9352678917348385, "step": 150 }, { "completion_length": 310.341534614563, "epoch": 0.23345250748864624, "grad_norm": 0.2401580417784417, "kl": 0.054443359375, "learning_rate": 8.711382230491492e-07, "loss": 0.0022, "reward": 1.332589328289032, "reward_std": 0.36648705787956715, "rewards/accuracy_reward": 0.39843751676380634, "rewards/format_reward": 0.9341518208384514, "step": 151 }, { "completion_length": 283.8426465988159, "epoch": 0.23499855058459754, "grad_norm": 0.21871944121477377, "kl": 0.0557861328125, "learning_rate": 8.695044586103295e-07, "loss": 0.0022, "reward": 1.3482143431901932, "reward_std": 0.3063310859724879, "rewards/accuracy_reward": 0.37834823224693537, "rewards/format_reward": 0.9698661044239998, "step": 152 }, { "completion_length": 287.9598331451416, "epoch": 0.23654459368054884, "grad_norm": 0.2369550779188827, "kl": 0.0503997802734375, "learning_rate": 8.678619553365658e-07, "loss": 0.002, "reward": 1.3426339849829674, "reward_std": 0.34441741928458214, "rewards/accuracy_reward": 0.39174109138548374, "rewards/format_reward": 0.9508928991854191, "step": 153 }, { "completion_length": 308.0346097946167, "epoch": 0.23809063677650014, "grad_norm": 0.20910853991418754, "kl": 0.04766845703125, "learning_rate": 8.662107520733027e-07, "loss": 0.0019, "reward": 1.299107201397419, "reward_std": 0.3140460727736354, "rewards/accuracy_reward": 0.3448660895228386, "rewards/format_reward": 0.9542411155998707, "step": 154 }, { "completion_length": 307.35492515563965, "epoch": 0.23963667987245144, "grad_norm": 0.25621552367538486, "kl": 0.0547943115234375, "learning_rate": 8.645508878717409e-07, "loss": 0.0022, "reward": 1.3236607685685158, "reward_std": 0.37864473834633827, "rewards/accuracy_reward": 0.37611609138548374, "rewards/format_reward": 0.9475446678698063, "step": 155 }, { "completion_length": 343.3951072692871, "epoch": 0.24118272296840274, "grad_norm": 0.20895578496663886, "kl": 0.046417236328125, "learning_rate": 8.628824019879136e-07, "loss": 0.0019, "reward": 1.362723283469677, "reward_std": 0.3741375422105193, "rewards/accuracy_reward": 0.40959823317825794, "rewards/format_reward": 0.9531250260770321, "step": 156 }, { "completion_length": 331.4330520629883, "epoch": 0.24272876606435403, "grad_norm": 0.24703564690112567, "kl": 0.0514984130859375, "learning_rate": 8.612053338817581e-07, "loss": 0.0021, "reward": 1.3325893431901932, "reward_std": 0.366704223677516, "rewards/accuracy_reward": 0.3861607313156128, "rewards/format_reward": 0.9464286006987095, "step": 157 }, { "completion_length": 297.2131824493408, "epoch": 0.24427480916030533, "grad_norm": 0.22660646725231054, "kl": 0.04689788818359375, "learning_rate": 8.595197232161824e-07, "loss": 0.0019, "reward": 1.3102679178118706, "reward_std": 0.32240139413625, "rewards/accuracy_reward": 0.3470982303842902, "rewards/format_reward": 0.9631696827709675, "step": 158 }, { "completion_length": 300.23327350616455, "epoch": 0.24582085225625663, "grad_norm": 0.285810510558559, "kl": 0.05445098876953125, "learning_rate": 8.578256098561274e-07, "loss": 0.0022, "reward": 1.3794643431901932, "reward_std": 0.3536876458674669, "rewards/accuracy_reward": 0.42410716600716114, "rewards/format_reward": 0.9553571753203869, "step": 159 }, { "completion_length": 306.9196586608887, "epoch": 0.24736689535220793, "grad_norm": 0.291865692435505, "kl": 0.0518341064453125, "learning_rate": 8.561230338676239e-07, "loss": 0.0021, "reward": 1.366071492433548, "reward_std": 0.37660182639956474, "rewards/accuracy_reward": 0.40625001955777407, "rewards/format_reward": 0.9598214589059353, "step": 160 }, { "completion_length": 296.22546195983887, "epoch": 0.24891293844815923, "grad_norm": 0.23651868227044384, "kl": 0.054168701171875, "learning_rate": 8.544120355168451e-07, "loss": 0.0022, "reward": 1.3191964849829674, "reward_std": 0.3569078929722309, "rewards/accuracy_reward": 0.3649553721770644, "rewards/format_reward": 0.9542411118745804, "step": 161 }, { "completion_length": 297.98550510406494, "epoch": 0.25045898154411056, "grad_norm": 0.21633428252020148, "kl": 0.05194091796875, "learning_rate": 8.526926552691544e-07, "loss": 0.0021, "reward": 1.3560268506407738, "reward_std": 0.334335183724761, "rewards/accuracy_reward": 0.3973214467987418, "rewards/format_reward": 0.9587053880095482, "step": 162 }, { "completion_length": 302.9553737640381, "epoch": 0.25200502464006186, "grad_norm": 0.22484379004037225, "kl": 0.0490570068359375, "learning_rate": 8.509649337881481e-07, "loss": 0.002, "reward": 1.3325893431901932, "reward_std": 0.404594786465168, "rewards/accuracy_reward": 0.38616073224693537, "rewards/format_reward": 0.9464286081492901, "step": 163 }, { "completion_length": 297.8761320114136, "epoch": 0.25355106773601316, "grad_norm": 0.23784600952703602, "kl": 0.05670166015625, "learning_rate": 8.492289119346943e-07, "loss": 0.0023, "reward": 1.3292411491274834, "reward_std": 0.30506941489875317, "rewards/accuracy_reward": 0.3560268059372902, "rewards/format_reward": 0.9732143133878708, "step": 164 }, { "completion_length": 296.7935380935669, "epoch": 0.25509711083196446, "grad_norm": 0.25997511912499566, "kl": 0.054290771484375, "learning_rate": 8.474846307659657e-07, "loss": 0.0022, "reward": 1.389508992433548, "reward_std": 0.3452272191643715, "rewards/accuracy_reward": 0.435267873108387, "rewards/format_reward": 0.9542411081492901, "step": 165 }, { "completion_length": 317.9497890472412, "epoch": 0.25664315392791576, "grad_norm": 0.28785545083472924, "kl": 0.05512237548828125, "learning_rate": 8.457321315344693e-07, "loss": 0.0022, "reward": 1.3448661342263222, "reward_std": 0.3536453824490309, "rewards/accuracy_reward": 0.39397323690354824, "rewards/format_reward": 0.9508928880095482, "step": 166 }, { "completion_length": 302.1384086608887, "epoch": 0.25818919702386706, "grad_norm": 1.5256316592741908, "kl": 0.0953826904296875, "learning_rate": 8.439714556870704e-07, "loss": 0.0038, "reward": 1.3604911267757416, "reward_std": 0.35309666208922863, "rewards/accuracy_reward": 0.3939732275903225, "rewards/format_reward": 0.9665178917348385, "step": 167 }, { "completion_length": 316.80358505249023, "epoch": 0.25973524011981836, "grad_norm": 0.20521526045006855, "kl": 0.0464324951171875, "learning_rate": 8.422026448640123e-07, "loss": 0.0019, "reward": 1.3604911416769028, "reward_std": 0.3539897557348013, "rewards/accuracy_reward": 0.40959823224693537, "rewards/format_reward": 0.9508928991854191, "step": 168 }, { "completion_length": 297.7388515472412, "epoch": 0.26128128321576966, "grad_norm": 0.25008712232243335, "kl": 0.054901123046875, "learning_rate": 8.40425740897932e-07, "loss": 0.0022, "reward": 1.3426339998841286, "reward_std": 0.36956945434212685, "rewards/accuracy_reward": 0.3872768022119999, "rewards/format_reward": 0.9553571790456772, "step": 169 }, { "completion_length": 306.79577255249023, "epoch": 0.26282732631172095, "grad_norm": 0.2064090879408936, "kl": 0.04718780517578125, "learning_rate": 8.386407858128706e-07, "loss": 0.0019, "reward": 1.3694197162985802, "reward_std": 0.35845059901475906, "rewards/accuracy_reward": 0.41517859045416117, "rewards/format_reward": 0.9542411118745804, "step": 170 }, { "completion_length": 337.8995666503906, "epoch": 0.26437336940767225, "grad_norm": 0.19507123288591427, "kl": 0.04398345947265625, "learning_rate": 8.368478218232787e-07, "loss": 0.0018, "reward": 1.2645089663565159, "reward_std": 0.36035226471722126, "rewards/accuracy_reward": 0.32924108451697975, "rewards/format_reward": 0.9352678954601288, "step": 171 }, { "completion_length": 301.38617515563965, "epoch": 0.26591941250362355, "grad_norm": 7073.492839638274, "kl": 0.609893798828125, "learning_rate": 8.35046891333019e-07, "loss": 0.0244, "reward": 1.3236607611179352, "reward_std": 0.37537476420402527, "rewards/accuracy_reward": 0.3794643022119999, "rewards/format_reward": 0.9441964589059353, "step": 172 }, { "completion_length": 296.46876335144043, "epoch": 0.26746545559957485, "grad_norm": 0.2700132613511486, "kl": 0.0523223876953125, "learning_rate": 8.332380369343639e-07, "loss": 0.0021, "reward": 1.3482143580913544, "reward_std": 0.4106770046055317, "rewards/accuracy_reward": 0.4029018059372902, "rewards/format_reward": 0.9453125335276127, "step": 173 }, { "completion_length": 313.3984537124634, "epoch": 0.26901149869552615, "grad_norm": 0.25494308720169756, "kl": 0.04637908935546875, "learning_rate": 8.31421301406986e-07, "loss": 0.0019, "reward": 1.3504464849829674, "reward_std": 0.395091837272048, "rewards/accuracy_reward": 0.4073660932481289, "rewards/format_reward": 0.9430803880095482, "step": 174 }, { "completion_length": 311.7667570114136, "epoch": 0.27055754179147745, "grad_norm": 0.3389490431502772, "kl": 0.05242919921875, "learning_rate": 8.295967277169489e-07, "loss": 0.0021, "reward": 1.2857143357396126, "reward_std": 0.36233864538371563, "rewards/accuracy_reward": 0.3526785895228386, "rewards/format_reward": 0.9330357499420643, "step": 175 }, { "completion_length": 322.0937662124634, "epoch": 0.27210358488742875, "grad_norm": 8.53291331029802, "kl": 0.047454833984375, "learning_rate": 8.277643590156893e-07, "loss": 0.0019, "reward": 1.3069196864962578, "reward_std": 0.42329204455018044, "rewards/accuracy_reward": 0.37276787497103214, "rewards/format_reward": 0.9341518208384514, "step": 176 }, { "completion_length": 320.30693340301514, "epoch": 0.27364962798338005, "grad_norm": 0.26658570310155455, "kl": 0.04750823974609375, "learning_rate": 8.259242386389973e-07, "loss": 0.0019, "reward": 1.309151828289032, "reward_std": 0.3963526748120785, "rewards/accuracy_reward": 0.37500002048909664, "rewards/format_reward": 0.9341518208384514, "step": 177 }, { "completion_length": 316.00671195983887, "epoch": 0.27519567107933135, "grad_norm": 0.20997781907156265, "kl": 0.0463409423828125, "learning_rate": 8.240764101059912e-07, "loss": 0.0019, "reward": 1.3325893506407738, "reward_std": 0.3722992278635502, "rewards/accuracy_reward": 0.3973214477300644, "rewards/format_reward": 0.9352678917348385, "step": 178 }, { "completion_length": 315.50224685668945, "epoch": 0.27674171417528265, "grad_norm": 0.344191416290609, "kl": 0.047943115234375, "learning_rate": 8.222209171180883e-07, "loss": 0.0019, "reward": 1.3258929252624512, "reward_std": 0.380522265098989, "rewards/accuracy_reward": 0.39062501955777407, "rewards/format_reward": 0.9352678880095482, "step": 179 }, { "completion_length": 335.69421100616455, "epoch": 0.27828775727123395, "grad_norm": 0.2619377351780565, "kl": 0.04296875, "learning_rate": 8.203578035579715e-07, "loss": 0.0017, "reward": 1.2031250558793545, "reward_std": 0.43243310414254665, "rewards/accuracy_reward": 0.30022323061712086, "rewards/format_reward": 0.902901828289032, "step": 180 }, { "completion_length": 326.4218912124634, "epoch": 0.27983380036718525, "grad_norm": 0.23810059999706668, "kl": 0.04290771484375, "learning_rate": 8.184871134885512e-07, "loss": 0.0017, "reward": 1.3058036342263222, "reward_std": 0.3916765283793211, "rewards/accuracy_reward": 0.368303588591516, "rewards/format_reward": 0.9375000335276127, "step": 181 }, { "completion_length": 324.5212240219116, "epoch": 0.28137984346313655, "grad_norm": 0.2501086769388846, "kl": 0.0466156005859375, "learning_rate": 8.166088911519234e-07, "loss": 0.0019, "reward": 1.2957589849829674, "reward_std": 0.4064408652484417, "rewards/accuracy_reward": 0.34933036658912897, "rewards/format_reward": 0.9464286118745804, "step": 182 }, { "completion_length": 313.58148765563965, "epoch": 0.28292588655908785, "grad_norm": 0.2391078846943778, "kl": 0.044525146484375, "learning_rate": 8.147231809683235e-07, "loss": 0.0018, "reward": 1.3013393506407738, "reward_std": 0.3778317775577307, "rewards/accuracy_reward": 0.364955373108387, "rewards/format_reward": 0.9363839663565159, "step": 183 }, { "completion_length": 320.04242515563965, "epoch": 0.28447192965503915, "grad_norm": 0.21805016114128314, "kl": 0.04589080810546875, "learning_rate": 8.128300275350754e-07, "loss": 0.0018, "reward": 1.3593750521540642, "reward_std": 0.39226437360048294, "rewards/accuracy_reward": 0.41294644959270954, "rewards/format_reward": 0.9464286081492901, "step": 184 }, { "completion_length": 341.6540355682373, "epoch": 0.28601797275099045, "grad_norm": 0.204923115732008, "kl": 0.0406341552734375, "learning_rate": 8.109294756255373e-07, "loss": 0.0016, "reward": 1.338169701397419, "reward_std": 0.37713748775422573, "rewards/accuracy_reward": 0.39397322945296764, "rewards/format_reward": 0.9441964589059353, "step": 185 }, { "completion_length": 349.47880840301514, "epoch": 0.28756401584694175, "grad_norm": 0.20220824041176932, "kl": 0.041290283203125, "learning_rate": 8.090215701880417e-07, "loss": 0.0017, "reward": 1.3046875670552254, "reward_std": 0.3621687162667513, "rewards/accuracy_reward": 0.37611608766019344, "rewards/format_reward": 0.9285714738070965, "step": 186 }, { "completion_length": 319.6953248977661, "epoch": 0.28911005894289304, "grad_norm": 0.20707150829018228, "kl": 0.0471038818359375, "learning_rate": 8.071063563448339e-07, "loss": 0.0019, "reward": 1.3136161342263222, "reward_std": 0.31285884976387024, "rewards/accuracy_reward": 0.36607144493609667, "rewards/format_reward": 0.9475446827709675, "step": 187 }, { "completion_length": 331.95425605773926, "epoch": 0.29065610203884434, "grad_norm": 0.21016529906023715, "kl": 0.0456085205078125, "learning_rate": 8.051838793910038e-07, "loss": 0.0018, "reward": 1.3426339775323868, "reward_std": 0.3465033732354641, "rewards/accuracy_reward": 0.40290180686861277, "rewards/format_reward": 0.9397321790456772, "step": 188 }, { "completion_length": 328.84599685668945, "epoch": 0.29220214513479564, "grad_norm": 0.20077333537512704, "kl": 0.044403076171875, "learning_rate": 8.032541847934144e-07, "loss": 0.0018, "reward": 1.3816964700818062, "reward_std": 0.3744387570768595, "rewards/accuracy_reward": 0.43973215762525797, "rewards/format_reward": 0.9419643171131611, "step": 189 }, { "completion_length": 327.7009057998657, "epoch": 0.29374818823074694, "grad_norm": 0.23556242504474467, "kl": 0.0465850830078125, "learning_rate": 8.013173181896282e-07, "loss": 0.0019, "reward": 1.3560268357396126, "reward_std": 0.3397673973813653, "rewards/accuracy_reward": 0.40848216228187084, "rewards/format_reward": 0.9475446790456772, "step": 190 }, { "completion_length": 309.74220275878906, "epoch": 0.29529423132669824, "grad_norm": 0.2234330999977759, "kl": 0.04779052734375, "learning_rate": 7.993733253868256e-07, "loss": 0.0019, "reward": 1.3738839849829674, "reward_std": 0.3462688233703375, "rewards/accuracy_reward": 0.4084821604192257, "rewards/format_reward": 0.9654018171131611, "step": 191 }, { "completion_length": 303.3694305419922, "epoch": 0.29684027442264954, "grad_norm": 0.24423120692713426, "kl": 0.0520782470703125, "learning_rate": 7.974222523607235e-07, "loss": 0.0021, "reward": 1.3482143431901932, "reward_std": 0.3952969200909138, "rewards/accuracy_reward": 0.3950893059372902, "rewards/format_reward": 0.9531250335276127, "step": 192 }, { "completion_length": 318.948673248291, "epoch": 0.29838631751860084, "grad_norm": 0.5151758416613436, "kl": 0.058929443359375, "learning_rate": 7.954641452544864e-07, "loss": 0.0024, "reward": 1.3560268431901932, "reward_std": 0.35282052308321, "rewards/accuracy_reward": 0.4129464505240321, "rewards/format_reward": 0.9430803991854191, "step": 193 }, { "completion_length": 296.42412281036377, "epoch": 0.29993236061455214, "grad_norm": 0.25343019219333807, "kl": 0.0525665283203125, "learning_rate": 7.934990503776362e-07, "loss": 0.0021, "reward": 1.3482143506407738, "reward_std": 0.3762247506529093, "rewards/accuracy_reward": 0.38504465762525797, "rewards/format_reward": 0.9631696715950966, "step": 194 }, { "completion_length": 318.5413064956665, "epoch": 0.30147840371050344, "grad_norm": 0.20817652863207461, "kl": 0.050567626953125, "learning_rate": 7.915270142049566e-07, "loss": 0.002, "reward": 1.343750074505806, "reward_std": 0.33051140513271093, "rewards/accuracy_reward": 0.39843751676380634, "rewards/format_reward": 0.9453125260770321, "step": 195 }, { "completion_length": 296.956485748291, "epoch": 0.30302444680645474, "grad_norm": 0.23346459830872315, "kl": 0.050018310546875, "learning_rate": 7.89548083375394e-07, "loss": 0.002, "reward": 1.416294701397419, "reward_std": 0.3332835305482149, "rewards/accuracy_reward": 0.4575893059372902, "rewards/format_reward": 0.9587053917348385, "step": 196 }, { "completion_length": 333.3538074493408, "epoch": 0.30457048990240604, "grad_norm": 0.23310326265340547, "kl": 0.04341888427734375, "learning_rate": 7.875623046909545e-07, "loss": 0.0017, "reward": 1.3504464849829674, "reward_std": 0.3901459872722626, "rewards/accuracy_reward": 0.40625001676380634, "rewards/format_reward": 0.9441964663565159, "step": 197 }, { "completion_length": 315.774564743042, "epoch": 0.30611653299835734, "grad_norm": 0.22866876931886404, "kl": 0.04827880859375, "learning_rate": 7.855697251155966e-07, "loss": 0.0019, "reward": 1.2968750447034836, "reward_std": 0.3361909659579396, "rewards/accuracy_reward": 0.34486608766019344, "rewards/format_reward": 0.9520089700818062, "step": 198 }, { "completion_length": 318.9252347946167, "epoch": 0.30766257609430864, "grad_norm": 0.24358506376292594, "kl": 0.04312896728515625, "learning_rate": 7.835703917741213e-07, "loss": 0.0017, "reward": 1.3627232685685158, "reward_std": 0.3370063826441765, "rewards/accuracy_reward": 0.40401787031441927, "rewards/format_reward": 0.9587053917348385, "step": 199 }, { "completion_length": 303.12054920196533, "epoch": 0.30920861919025994, "grad_norm": 0.25210253760493373, "kl": 0.05517578125, "learning_rate": 7.81564351951057e-07, "loss": 0.0022, "reward": 1.3493304252624512, "reward_std": 0.32193957082927227, "rewards/accuracy_reward": 0.39843751955777407, "rewards/format_reward": 0.9508928880095482, "step": 200 }, { "completion_length": 297.3326005935669, "epoch": 0.31075466228621124, "grad_norm": 0.22792906418818815, "kl": 0.05155181884765625, "learning_rate": 7.795516530895413e-07, "loss": 0.0021, "reward": 1.3627232685685158, "reward_std": 0.33014370780438185, "rewards/accuracy_reward": 0.40401787776499987, "rewards/format_reward": 0.9587053805589676, "step": 201 }, { "completion_length": 298.0156354904175, "epoch": 0.31230070538216254, "grad_norm": 0.27368309946767866, "kl": 0.05401611328125, "learning_rate": 7.775323427901992e-07, "loss": 0.0022, "reward": 1.3850447088479996, "reward_std": 0.40433269739151, "rewards/accuracy_reward": 0.4375000223517418, "rewards/format_reward": 0.9475446678698063, "step": 202 }, { "completion_length": 336.4888553619385, "epoch": 0.31384674847811384, "grad_norm": 0.20677780115455272, "kl": 0.04669952392578125, "learning_rate": 7.755064688100171e-07, "loss": 0.0019, "reward": 1.3604911342263222, "reward_std": 0.3240428753197193, "rewards/accuracy_reward": 0.4051339514553547, "rewards/format_reward": 0.9553571753203869, "step": 203 }, { "completion_length": 312.91407680511475, "epoch": 0.31539279157406513, "grad_norm": 0.21015451407177105, "kl": 0.0531768798828125, "learning_rate": 7.734740790612136e-07, "loss": 0.0021, "reward": 1.3995536267757416, "reward_std": 0.3245171643793583, "rewards/accuracy_reward": 0.43638394586741924, "rewards/format_reward": 0.9631696827709675, "step": 204 }, { "completion_length": 342.1462230682373, "epoch": 0.31693883467001643, "grad_norm": 0.221100949213707, "kl": 0.04254150390625, "learning_rate": 7.714352216101055e-07, "loss": 0.0017, "reward": 1.2979911416769028, "reward_std": 0.34670876432210207, "rewards/accuracy_reward": 0.34486608393490314, "rewards/format_reward": 0.9531250409781933, "step": 205 }, { "completion_length": 323.51675510406494, "epoch": 0.31848487776596773, "grad_norm": 0.25656690391060616, "kl": 0.0525360107421875, "learning_rate": 7.693899446759727e-07, "loss": 0.0021, "reward": 1.3694197088479996, "reward_std": 0.3590342905372381, "rewards/accuracy_reward": 0.4218750214204192, "rewards/format_reward": 0.9475446864962578, "step": 206 }, { "completion_length": 325.7589416503906, "epoch": 0.32003092086191903, "grad_norm": 0.22476457833839483, "kl": 0.0474853515625, "learning_rate": 7.673382966299162e-07, "loss": 0.0019, "reward": 1.3404018580913544, "reward_std": 0.38420001976192, "rewards/accuracy_reward": 0.3984375186264515, "rewards/format_reward": 0.941964328289032, "step": 207 }, { "completion_length": 308.33260440826416, "epoch": 0.32157696395787033, "grad_norm": 0.2047023859451623, "kl": 0.053131103515625, "learning_rate": 7.652803259937148e-07, "loss": 0.0021, "reward": 1.3906250670552254, "reward_std": 0.30635119369253516, "rewards/accuracy_reward": 0.4363839505240321, "rewards/format_reward": 0.9542411006987095, "step": 208 }, { "completion_length": 329.8047037124634, "epoch": 0.32312300705382163, "grad_norm": 0.24320562936282875, "kl": 0.050201416015625, "learning_rate": 7.632160814386779e-07, "loss": 0.002, "reward": 1.3247768431901932, "reward_std": 0.3916434682905674, "rewards/accuracy_reward": 0.37611608672887087, "rewards/format_reward": 0.9486607536673546, "step": 209 }, { "completion_length": 330.12166595458984, "epoch": 0.32466905014977293, "grad_norm": 0.20210258878192716, "kl": 0.048004150390625, "learning_rate": 7.611456117844933e-07, "loss": 0.0019, "reward": 1.3750000521540642, "reward_std": 0.34443160705268383, "rewards/accuracy_reward": 0.4229910857975483, "rewards/format_reward": 0.9520089626312256, "step": 210 }, { "completion_length": 313.05247020721436, "epoch": 0.32621509324572423, "grad_norm": 0.22975052296553117, "kl": 0.0504150390625, "learning_rate": 7.590689659980739e-07, "loss": 0.002, "reward": 1.3872768506407738, "reward_std": 0.3637647358700633, "rewards/accuracy_reward": 0.4296875186264515, "rewards/format_reward": 0.9575893208384514, "step": 211 }, { "completion_length": 353.1417589187622, "epoch": 0.32776113634167553, "grad_norm": 0.20558603815397122, "kl": 0.0449676513671875, "learning_rate": 7.569861931923988e-07, "loss": 0.0018, "reward": 1.3113839849829674, "reward_std": 0.3357184398919344, "rewards/accuracy_reward": 0.36830359045416117, "rewards/format_reward": 0.9430803917348385, "step": 212 }, { "completion_length": 345.39845275878906, "epoch": 0.32930717943762683, "grad_norm": 0.1951795385233815, "kl": 0.04770660400390625, "learning_rate": 7.54897342625352e-07, "loss": 0.0019, "reward": 1.3169643469154835, "reward_std": 0.32671058736741543, "rewards/accuracy_reward": 0.3727678768336773, "rewards/format_reward": 0.9441964663565159, "step": 213 }, { "completion_length": 325.20537185668945, "epoch": 0.33085322253357813, "grad_norm": 0.21378330566381895, "kl": 0.045562744140625, "learning_rate": 7.528024636985573e-07, "loss": 0.0018, "reward": 1.3671875596046448, "reward_std": 0.3422316499054432, "rewards/accuracy_reward": 0.39955359045416117, "rewards/format_reward": 0.9676339589059353, "step": 214 }, { "completion_length": 347.63059520721436, "epoch": 0.33239926562952943, "grad_norm": 0.20741028273004247, "kl": 0.047088623046875, "learning_rate": 7.507016059562107e-07, "loss": 0.0019, "reward": 1.3158482760190964, "reward_std": 0.35835533402860165, "rewards/accuracy_reward": 0.38281251676380634, "rewards/format_reward": 0.9330357499420643, "step": 215 }, { "completion_length": 314.60938930511475, "epoch": 0.3339453087254807, "grad_norm": 0.1922421024030388, "kl": 0.0474700927734375, "learning_rate": 7.485948190839076e-07, "loss": 0.0019, "reward": 1.36495541036129, "reward_std": 0.31153504364192486, "rewards/accuracy_reward": 0.39285716135054827, "rewards/format_reward": 0.9720982387661934, "step": 216 }, { "completion_length": 329.1506824493408, "epoch": 0.335491351821432, "grad_norm": 0.2267954436170303, "kl": 0.0539398193359375, "learning_rate": 7.464821529074677e-07, "loss": 0.0022, "reward": 1.3359375596046448, "reward_std": 0.3485208582133055, "rewards/accuracy_reward": 0.3861607313156128, "rewards/format_reward": 0.9497768245637417, "step": 217 }, { "completion_length": 322.1038074493408, "epoch": 0.3370373949173833, "grad_norm": 0.24509689043061816, "kl": 0.049896240234375, "learning_rate": 7.443636573917584e-07, "loss": 0.002, "reward": 1.3448661267757416, "reward_std": 0.34952237410470843, "rewards/accuracy_reward": 0.38839287776499987, "rewards/format_reward": 0.9564732499420643, "step": 218 }, { "completion_length": 342.8381805419922, "epoch": 0.3385834380133346, "grad_norm": 0.2264667198200957, "kl": 0.04724884033203125, "learning_rate": 7.422393826395107e-07, "loss": 0.0019, "reward": 1.313616119325161, "reward_std": 0.3785533607006073, "rewards/accuracy_reward": 0.3694196594879031, "rewards/format_reward": 0.9441964626312256, "step": 219 }, { "completion_length": 325.2745714187622, "epoch": 0.3401294811092859, "grad_norm": 0.2159953341315843, "kl": 0.0479278564453125, "learning_rate": 7.40109378890136e-07, "loss": 0.0019, "reward": 1.373883992433548, "reward_std": 0.3572205863893032, "rewards/accuracy_reward": 0.4095982313156128, "rewards/format_reward": 0.9642857499420643, "step": 220 }, { "completion_length": 334.70202255249023, "epoch": 0.3416755242052372, "grad_norm": 0.2082474059620772, "kl": 0.0513458251953125, "learning_rate": 7.379736965185368e-07, "loss": 0.0021, "reward": 1.35714291036129, "reward_std": 0.3737869169563055, "rewards/accuracy_reward": 0.41629466600716114, "rewards/format_reward": 0.9408482499420643, "step": 221 }, { "completion_length": 310.03014945983887, "epoch": 0.3432215673011885, "grad_norm": 0.4083597343495639, "kl": 0.0516510009765625, "learning_rate": 7.358323860339164e-07, "loss": 0.0021, "reward": 1.3638393506407738, "reward_std": 0.3820534199476242, "rewards/accuracy_reward": 0.4207589467987418, "rewards/format_reward": 0.9430803917348385, "step": 222 }, { "completion_length": 373.6841640472412, "epoch": 0.3447676103971398, "grad_norm": 0.20257015354330551, "kl": 0.043853759765625, "learning_rate": 7.336854980785838e-07, "loss": 0.0018, "reward": 1.2912947051227093, "reward_std": 0.36698511708527803, "rewards/accuracy_reward": 0.3549107280559838, "rewards/format_reward": 0.9363839738070965, "step": 223 }, { "completion_length": 315.5256805419922, "epoch": 0.3463136534930911, "grad_norm": 0.3415021464029844, "kl": 0.053680419921875, "learning_rate": 7.315330834267553e-07, "loss": 0.0021, "reward": 1.3247768431901932, "reward_std": 0.36957948096096516, "rewards/accuracy_reward": 0.3738839440047741, "rewards/format_reward": 0.9508928954601288, "step": 224 }, { "completion_length": 327.15738105773926, "epoch": 0.3478596965890424, "grad_norm": 0.24320152949138812, "kl": 0.0528564453125, "learning_rate": 7.293751929833552e-07, "loss": 0.0021, "reward": 1.3426339775323868, "reward_std": 0.3638022802770138, "rewards/accuracy_reward": 0.3872768050059676, "rewards/format_reward": 0.9553571715950966, "step": 225 }, { "completion_length": 341.14064025878906, "epoch": 0.3494057396849937, "grad_norm": 0.19910484216116026, "kl": 0.048614501953125, "learning_rate": 7.272118777828108e-07, "loss": 0.0019, "reward": 1.342633992433548, "reward_std": 0.34812563471496105, "rewards/accuracy_reward": 0.3872768022119999, "rewards/format_reward": 0.9553571715950966, "step": 226 }, { "completion_length": 322.4263525009155, "epoch": 0.350951782780945, "grad_norm": 0.21185631650363318, "kl": 0.048187255859375, "learning_rate": 7.250431889878454e-07, "loss": 0.0019, "reward": 1.3404018431901932, "reward_std": 0.35336307249963284, "rewards/accuracy_reward": 0.3906250186264515, "rewards/format_reward": 0.949776828289032, "step": 227 }, { "completion_length": 328.5881824493408, "epoch": 0.3524978258768963, "grad_norm": 0.2115408546253145, "kl": 0.0516204833984375, "learning_rate": 7.228691778882692e-07, "loss": 0.0021, "reward": 1.392857201397419, "reward_std": 0.33324115723371506, "rewards/accuracy_reward": 0.43973215855658054, "rewards/format_reward": 0.953125037252903, "step": 228 }, { "completion_length": 334.0067081451416, "epoch": 0.3540438689728476, "grad_norm": 0.18625534091461218, "kl": 0.0470733642578125, "learning_rate": 7.206898958997649e-07, "loss": 0.0019, "reward": 1.389508992433548, "reward_std": 0.28795688040554523, "rewards/accuracy_reward": 0.43080359138548374, "rewards/format_reward": 0.9587053991854191, "step": 229 }, { "completion_length": 307.5301523208618, "epoch": 0.3555899120687989, "grad_norm": 0.21506933566885564, "kl": 0.0531158447265625, "learning_rate": 7.185053945626734e-07, "loss": 0.0021, "reward": 1.400669701397419, "reward_std": 0.3471977934241295, "rewards/accuracy_reward": 0.44419645331799984, "rewards/format_reward": 0.9564732536673546, "step": 230 }, { "completion_length": 320.50447845458984, "epoch": 0.3571359551647502, "grad_norm": 0.23958805924532517, "kl": 0.0554046630859375, "learning_rate": 7.163157255407732e-07, "loss": 0.0022, "reward": 1.3906250596046448, "reward_std": 0.32960117142647505, "rewards/accuracy_reward": 0.4296875186264515, "rewards/format_reward": 0.960937537252903, "step": 231 }, { "completion_length": 332.6573820114136, "epoch": 0.3586819982607015, "grad_norm": 0.21510823910094756, "kl": 0.0512237548828125, "learning_rate": 7.141209406200598e-07, "loss": 0.002, "reward": 1.4118304178118706, "reward_std": 0.34133168309926987, "rewards/accuracy_reward": 0.4676339477300644, "rewards/format_reward": 0.9441964775323868, "step": 232 }, { "completion_length": 340.3538074493408, "epoch": 0.3602280413566528, "grad_norm": 0.20495494644657836, "kl": 0.047576904296875, "learning_rate": 7.1192109170752e-07, "loss": 0.0019, "reward": 1.3917411267757416, "reward_std": 0.348129129037261, "rewards/accuracy_reward": 0.4263393022119999, "rewards/format_reward": 0.9654018245637417, "step": 233 }, { "completion_length": 344.9453239440918, "epoch": 0.3617740844526041, "grad_norm": 0.23322451052736132, "kl": 0.05426788330078125, "learning_rate": 7.097162308299054e-07, "loss": 0.0022, "reward": 1.2712054029107094, "reward_std": 0.3697692723944783, "rewards/accuracy_reward": 0.33035715762525797, "rewards/format_reward": 0.940848246216774, "step": 234 }, { "completion_length": 324.349347114563, "epoch": 0.3633201275485554, "grad_norm": 0.2563466839483564, "kl": 0.057342529296875, "learning_rate": 7.075064101325009e-07, "loss": 0.0023, "reward": 1.3750000670552254, "reward_std": 0.36463904567062855, "rewards/accuracy_reward": 0.42968751583248377, "rewards/format_reward": 0.9453125298023224, "step": 235 }, { "completion_length": 337.6294775009155, "epoch": 0.3648661706445067, "grad_norm": 0.20431790693828114, "kl": 0.0513916015625, "learning_rate": 7.052916818778917e-07, "loss": 0.0021, "reward": 1.3448661416769028, "reward_std": 0.3747240537777543, "rewards/accuracy_reward": 0.39620537124574184, "rewards/format_reward": 0.9486607424914837, "step": 236 }, { "completion_length": 335.71206855773926, "epoch": 0.366412213740458, "grad_norm": 0.21403373248211396, "kl": 0.0511627197265625, "learning_rate": 7.030720984447278e-07, "loss": 0.002, "reward": 1.3671875670552254, "reward_std": 0.3616072107106447, "rewards/accuracy_reward": 0.42075895331799984, "rewards/format_reward": 0.9464286118745804, "step": 237 }, { "completion_length": 303.6473379135132, "epoch": 0.3679582568364093, "grad_norm": 0.2306103241961612, "kl": 0.0565032958984375, "learning_rate": 7.008477123264847e-07, "loss": 0.0023, "reward": 1.4654018580913544, "reward_std": 0.34533094987273216, "rewards/accuracy_reward": 0.5022321697324514, "rewards/format_reward": 0.9631696753203869, "step": 238 }, { "completion_length": 304.810284614563, "epoch": 0.3695042999323606, "grad_norm": 0.21116437507294802, "kl": 0.0561065673828125, "learning_rate": 6.986185761302223e-07, "loss": 0.0022, "reward": 1.4486607685685158, "reward_std": 0.3632229436188936, "rewards/accuracy_reward": 0.501116095110774, "rewards/format_reward": 0.9475446715950966, "step": 239 }, { "completion_length": 314.0959987640381, "epoch": 0.3710503430283119, "grad_norm": 0.21681560548784806, "kl": 0.0566558837890625, "learning_rate": 6.963847425753402e-07, "loss": 0.0023, "reward": 1.4274554178118706, "reward_std": 0.3650170462206006, "rewards/accuracy_reward": 0.46986608766019344, "rewards/format_reward": 0.9575893133878708, "step": 240 }, { "completion_length": 337.12836265563965, "epoch": 0.3725963861242632, "grad_norm": 0.2275633980181149, "kl": 0.0500030517578125, "learning_rate": 6.941462644923317e-07, "loss": 0.002, "reward": 1.4252232760190964, "reward_std": 0.3871311116963625, "rewards/accuracy_reward": 0.47879466228187084, "rewards/format_reward": 0.9464286006987095, "step": 241 }, { "completion_length": 362.3683214187622, "epoch": 0.3741424292202145, "grad_norm": 0.23006107639599696, "kl": 0.0533294677734375, "learning_rate": 6.919031948215334e-07, "loss": 0.0021, "reward": 1.3046875521540642, "reward_std": 0.3464266639202833, "rewards/accuracy_reward": 0.3683035848662257, "rewards/format_reward": 0.936383955180645, "step": 242 }, { "completion_length": 329.23550605773926, "epoch": 0.3756884723161658, "grad_norm": 0.23580989491853632, "kl": 0.056793212890625, "learning_rate": 6.896555866118739e-07, "loss": 0.0023, "reward": 1.3627232760190964, "reward_std": 0.3576676044613123, "rewards/accuracy_reward": 0.4107143059372902, "rewards/format_reward": 0.9520089626312256, "step": 243 }, { "completion_length": 353.104923248291, "epoch": 0.3772345154121171, "grad_norm": 0.1986563343418404, "kl": 0.04616546630859375, "learning_rate": 6.87403493019619e-07, "loss": 0.0018, "reward": 1.3359375670552254, "reward_std": 0.3682770887389779, "rewards/accuracy_reward": 0.39397323597222567, "rewards/format_reward": 0.9419643133878708, "step": 244 }, { "completion_length": 311.5178699493408, "epoch": 0.3787805585080684, "grad_norm": 0.23382014786950572, "kl": 0.059722900390625, "learning_rate": 6.851469673071142e-07, "loss": 0.0024, "reward": 1.3883929178118706, "reward_std": 0.34120051097124815, "rewards/accuracy_reward": 0.42633930779993534, "rewards/format_reward": 0.9620536044239998, "step": 245 }, { "completion_length": 362.9576053619385, "epoch": 0.3803266016040197, "grad_norm": 0.2069403882463959, "kl": 0.0514984130859375, "learning_rate": 6.828860628415253e-07, "loss": 0.0021, "reward": 1.3906250596046448, "reward_std": 0.3672402985394001, "rewards/accuracy_reward": 0.45535716228187084, "rewards/format_reward": 0.9352678880095482, "step": 246 }, { "completion_length": 341.81028175354004, "epoch": 0.381872644699971, "grad_norm": 0.21465378928010895, "kl": 0.0542449951171875, "learning_rate": 6.806208330935766e-07, "loss": 0.0022, "reward": 1.4196429178118706, "reward_std": 0.31576177291572094, "rewards/accuracy_reward": 0.47321431152522564, "rewards/format_reward": 0.9464286044239998, "step": 247 }, { "completion_length": 324.74220180511475, "epoch": 0.3834186877959223, "grad_norm": 0.2396604778918695, "kl": 0.0541534423828125, "learning_rate": 6.783513316362854e-07, "loss": 0.0022, "reward": 1.354910783469677, "reward_std": 0.3600319102406502, "rewards/accuracy_reward": 0.4051339440047741, "rewards/format_reward": 0.9497768208384514, "step": 248 }, { "completion_length": 367.3292579650879, "epoch": 0.3849647308918736, "grad_norm": 0.19746448284703352, "kl": 0.049041748046875, "learning_rate": 6.760776121436961e-07, "loss": 0.002, "reward": 1.3292411267757416, "reward_std": 0.3726581148803234, "rewards/accuracy_reward": 0.3883928768336773, "rewards/format_reward": 0.940848246216774, "step": 249 }, { "completion_length": 355.854923248291, "epoch": 0.3865107739878249, "grad_norm": 0.23842659778860828, "kl": 0.05511474609375, "learning_rate": 6.737997283896103e-07, "loss": 0.0022, "reward": 1.3671875447034836, "reward_std": 0.3957639951258898, "rewards/accuracy_reward": 0.43526787497103214, "rewards/format_reward": 0.9319196827709675, "step": 250 }, { "completion_length": 339.5614013671875, "epoch": 0.3880568170837762, "grad_norm": 0.8152836957259254, "kl": 0.061004638671875, "learning_rate": 6.715177342463144e-07, "loss": 0.0024, "reward": 1.3850447162985802, "reward_std": 0.3728841785341501, "rewards/accuracy_reward": 0.4531250223517418, "rewards/format_reward": 0.9319196827709675, "step": 251 }, { "completion_length": 332.06921100616455, "epoch": 0.3896028601797275, "grad_norm": 0.24071182467254898, "kl": 0.055206298828125, "learning_rate": 6.692316836833065e-07, "loss": 0.0022, "reward": 1.3627232760190964, "reward_std": 0.37982272170484066, "rewards/accuracy_reward": 0.42745537776499987, "rewards/format_reward": 0.9352678880095482, "step": 252 }, { "completion_length": 374.55358695983887, "epoch": 0.3911489032756788, "grad_norm": 0.1980457237472933, "kl": 0.04998779296875, "learning_rate": 6.669416307660198e-07, "loss": 0.002, "reward": 1.350446492433548, "reward_std": 0.38577892258763313, "rewards/accuracy_reward": 0.42857144959270954, "rewards/format_reward": 0.9218750409781933, "step": 253 }, { "completion_length": 335.0145263671875, "epoch": 0.3926949463716301, "grad_norm": 0.24937177373507396, "kl": 0.053955078125, "learning_rate": 6.646476296545434e-07, "loss": 0.0022, "reward": 1.4140625596046448, "reward_std": 0.3899702075868845, "rewards/accuracy_reward": 0.470982164144516, "rewards/format_reward": 0.9430803954601288, "step": 254 }, { "completion_length": 346.6518039703369, "epoch": 0.3942409894675814, "grad_norm": 0.22414081076509243, "kl": 0.0550079345703125, "learning_rate": 6.623497346023417e-07, "loss": 0.0022, "reward": 1.2823661267757416, "reward_std": 0.38889360055327415, "rewards/accuracy_reward": 0.35156252048909664, "rewards/format_reward": 0.9308036081492901, "step": 255 }, { "completion_length": 366.1607275009155, "epoch": 0.3957870325635327, "grad_norm": 0.19946550128064816, "kl": 0.05108642578125, "learning_rate": 6.600479999549719e-07, "loss": 0.002, "reward": 1.3984375596046448, "reward_std": 0.37514422088861465, "rewards/accuracy_reward": 0.4464285932481289, "rewards/format_reward": 0.9520089663565159, "step": 256 }, { "completion_length": 347.09376335144043, "epoch": 0.397333075659484, "grad_norm": 0.21906399966314688, "kl": 0.0565338134765625, "learning_rate": 6.57742480148798e-07, "loss": 0.0023, "reward": 1.3638393431901932, "reward_std": 0.3663050811737776, "rewards/accuracy_reward": 0.4140625186264515, "rewards/format_reward": 0.9497768320143223, "step": 257 }, { "completion_length": 351.5256881713867, "epoch": 0.3988791187554353, "grad_norm": 0.22094704228480078, "kl": 0.0525970458984375, "learning_rate": 6.55433229709703e-07, "loss": 0.0021, "reward": 1.3772321939468384, "reward_std": 0.3996714614331722, "rewards/accuracy_reward": 0.4419643022119999, "rewards/format_reward": 0.9352678917348385, "step": 258 }, { "completion_length": 342.3962173461914, "epoch": 0.4004251618513866, "grad_norm": 22052.12304429228, "kl": 27.426498413085938, "learning_rate": 6.531203032518009e-07, "loss": 1.0997, "reward": 1.4174107909202576, "reward_std": 0.3781021721661091, "rewards/accuracy_reward": 0.482142879627645, "rewards/format_reward": 0.9352678991854191, "step": 259 }, { "completion_length": 369.5156440734863, "epoch": 0.4019712049473379, "grad_norm": 0.20695594097063355, "kl": 0.0499420166015625, "learning_rate": 6.508037554761432e-07, "loss": 0.002, "reward": 1.3995536416769028, "reward_std": 0.37337792851030827, "rewards/accuracy_reward": 0.4620535932481289, "rewards/format_reward": 0.9375000335276127, "step": 260 }, { "completion_length": 372.4040336608887, "epoch": 0.4035172480432892, "grad_norm": 0.20873356371369967, "kl": 0.04929351806640625, "learning_rate": 6.484836411694266e-07, "loss": 0.002, "reward": 1.3671875670552254, "reward_std": 0.37900043837726116, "rewards/accuracy_reward": 0.42968751676380634, "rewards/format_reward": 0.9375000335276127, "step": 261 }, { "completion_length": 337.8549289703369, "epoch": 0.4050632911392405, "grad_norm": 0.20105425276230257, "kl": 0.0541534423828125, "learning_rate": 6.461600152026964e-07, "loss": 0.0022, "reward": 1.4352679252624512, "reward_std": 0.36104387138038874, "rewards/accuracy_reward": 0.48772324062883854, "rewards/format_reward": 0.9475446864962578, "step": 262 }, { "completion_length": 334.3582696914673, "epoch": 0.4066093342351918, "grad_norm": 0.19320250342212553, "kl": 0.0567169189453125, "learning_rate": 6.438329325300499e-07, "loss": 0.0023, "reward": 1.4375000670552254, "reward_std": 0.3043056344613433, "rewards/accuracy_reward": 0.4955357350409031, "rewards/format_reward": 0.9419643208384514, "step": 263 }, { "completion_length": 358.1127414703369, "epoch": 0.4081553773311431, "grad_norm": 2.2007342285044125, "kl": 0.0511474609375, "learning_rate": 6.415024481873351e-07, "loss": 0.002, "reward": 1.4263393580913544, "reward_std": 0.37869611009955406, "rewards/accuracy_reward": 0.48772323736920953, "rewards/format_reward": 0.9386161118745804, "step": 264 }, { "completion_length": 355.3114013671875, "epoch": 0.4097014204270944, "grad_norm": 0.19762735989219898, "kl": 0.0502777099609375, "learning_rate": 6.391686172908506e-07, "loss": 0.002, "reward": 1.3761161342263222, "reward_std": 0.381501785479486, "rewards/accuracy_reward": 0.4352678768336773, "rewards/format_reward": 0.940848246216774, "step": 265 }, { "completion_length": 369.74555015563965, "epoch": 0.4112474635230457, "grad_norm": 0.18934253709488488, "kl": 0.0516204833984375, "learning_rate": 6.368314950360415e-07, "loss": 0.0021, "reward": 1.31026791036129, "reward_std": 0.3365687932819128, "rewards/accuracy_reward": 0.37276787124574184, "rewards/format_reward": 0.9375000335276127, "step": 266 }, { "completion_length": 363.0089473724365, "epoch": 0.412793506618997, "grad_norm": 0.19812140956842297, "kl": 0.05255126953125, "learning_rate": 6.344911366961934e-07, "loss": 0.0021, "reward": 1.3537947088479996, "reward_std": 0.3299210872501135, "rewards/accuracy_reward": 0.4151785895228386, "rewards/format_reward": 0.9386161081492901, "step": 267 }, { "completion_length": 343.0524730682373, "epoch": 0.4143395497149483, "grad_norm": 0.21453363779584253, "kl": 0.05327606201171875, "learning_rate": 6.321475976211265e-07, "loss": 0.0021, "reward": 1.38058041036129, "reward_std": 0.33743087109178305, "rewards/accuracy_reward": 0.439732164144516, "rewards/format_reward": 0.9408482424914837, "step": 268 }, { "completion_length": 362.7176456451416, "epoch": 0.4158855928108996, "grad_norm": 0.20021102673197005, "kl": 0.050628662109375, "learning_rate": 6.298009332358855e-07, "loss": 0.002, "reward": 1.3303571939468384, "reward_std": 0.3521122168749571, "rewards/accuracy_reward": 0.3861607313156128, "rewards/format_reward": 0.9441964626312256, "step": 269 }, { "completion_length": 345.9888505935669, "epoch": 0.4174316359068509, "grad_norm": 0.23855625741606393, "kl": 0.0542144775390625, "learning_rate": 6.274511990394293e-07, "loss": 0.0022, "reward": 1.3995536267757416, "reward_std": 0.3866823147982359, "rewards/accuracy_reward": 0.4631696678698063, "rewards/format_reward": 0.9363839738070965, "step": 270 }, { "completion_length": 330.943097114563, "epoch": 0.4189776790028022, "grad_norm": 0.2098014229848241, "kl": 0.0583343505859375, "learning_rate": 6.250984506033182e-07, "loss": 0.0023, "reward": 1.4029018580913544, "reward_std": 0.33315955474972725, "rewards/accuracy_reward": 0.45312502421438694, "rewards/format_reward": 0.949776828289032, "step": 271 }, { "completion_length": 335.938627243042, "epoch": 0.4205237220987535, "grad_norm": 0.23521150528642473, "kl": 0.0562896728515625, "learning_rate": 6.227427435703995e-07, "loss": 0.0023, "reward": 1.4129464849829674, "reward_std": 0.35767329297959805, "rewards/accuracy_reward": 0.4709821604192257, "rewards/format_reward": 0.9419643245637417, "step": 272 }, { "completion_length": 353.98327255249023, "epoch": 0.4220697651947048, "grad_norm": 0.27563371691478444, "kl": 0.0485992431640625, "learning_rate": 6.203841336534923e-07, "loss": 0.0019, "reward": 1.3247768431901932, "reward_std": 0.3616346474736929, "rewards/accuracy_reward": 0.3772321557626128, "rewards/format_reward": 0.9475446753203869, "step": 273 }, { "completion_length": 335.8303737640381, "epoch": 0.4236158082906561, "grad_norm": 0.2108108633709451, "kl": 0.0552825927734375, "learning_rate": 6.180226766340687e-07, "loss": 0.0022, "reward": 1.3850447237491608, "reward_std": 0.3937307093292475, "rewards/accuracy_reward": 0.4520089477300644, "rewards/format_reward": 0.9330357611179352, "step": 274 }, { "completion_length": 335.79131507873535, "epoch": 0.4251618513866074, "grad_norm": 0.1857049150458351, "kl": 0.05731201171875, "learning_rate": 6.156584283609358e-07, "loss": 0.0023, "reward": 1.4620536491274834, "reward_std": 0.32276696152985096, "rewards/accuracy_reward": 0.5156250223517418, "rewards/format_reward": 0.9464286081492901, "step": 275 }, { "completion_length": 333.4732303619385, "epoch": 0.4267078944825587, "grad_norm": 0.20524894595537305, "kl": 0.0605926513671875, "learning_rate": 6.132914447489136e-07, "loss": 0.0024, "reward": 1.4073661416769028, "reward_std": 0.342067115008831, "rewards/accuracy_reward": 0.4609375223517418, "rewards/format_reward": 0.9464286118745804, "step": 276 }, { "completion_length": 348.6082706451416, "epoch": 0.42825393757851, "grad_norm": 0.18717590833378536, "kl": 0.055816650390625, "learning_rate": 6.109217817775139e-07, "loss": 0.0022, "reward": 1.3571429178118706, "reward_std": 0.35624142549932003, "rewards/accuracy_reward": 0.41852680407464504, "rewards/format_reward": 0.9386161044239998, "step": 277 }, { "completion_length": 361.92523193359375, "epoch": 0.4297999806744613, "grad_norm": 0.20122936821435888, "kl": 0.05249786376953125, "learning_rate": 6.085494954896156e-07, "loss": 0.0021, "reward": 1.3638393580913544, "reward_std": 0.35334627982228994, "rewards/accuracy_reward": 0.42299109511077404, "rewards/format_reward": 0.9408482499420643, "step": 278 }, { "completion_length": 334.3839416503906, "epoch": 0.4313460237704126, "grad_norm": 0.23005658363967826, "kl": 0.0572052001953125, "learning_rate": 6.061746419901388e-07, "loss": 0.0023, "reward": 1.3917411267757416, "reward_std": 0.383831930346787, "rewards/accuracy_reward": 0.4453125176951289, "rewards/format_reward": 0.9464286118745804, "step": 279 }, { "completion_length": 348.79242992401123, "epoch": 0.4328920668663639, "grad_norm": 0.1837703729537679, "kl": 0.051788330078125, "learning_rate": 6.037972774447193e-07, "loss": 0.0021, "reward": 1.4006697162985802, "reward_std": 0.3387620700523257, "rewards/accuracy_reward": 0.44308037497103214, "rewards/format_reward": 0.9575893245637417, "step": 280 }, { "completion_length": 338.7890787124634, "epoch": 0.4344381099623152, "grad_norm": 0.1992482900316753, "kl": 0.052825927734375, "learning_rate": 6.014174580783793e-07, "loss": 0.0021, "reward": 1.4218750521540642, "reward_std": 0.3148977216333151, "rewards/accuracy_reward": 0.4720982387661934, "rewards/format_reward": 0.9497768133878708, "step": 281 }, { "completion_length": 357.0993461608887, "epoch": 0.4359841530582665, "grad_norm": 0.20970600093094255, "kl": 0.0522918701171875, "learning_rate": 5.990352401741981e-07, "loss": 0.0021, "reward": 1.3482143506407738, "reward_std": 0.39415652491152287, "rewards/accuracy_reward": 0.41517859417945147, "rewards/format_reward": 0.9330357573926449, "step": 282 }, { "completion_length": 328.3169803619385, "epoch": 0.4375301961542178, "grad_norm": 0.22161943953259583, "kl": 0.0565338134765625, "learning_rate": 5.966506800719798e-07, "loss": 0.0023, "reward": 1.4341518506407738, "reward_std": 0.3839611355215311, "rewards/accuracy_reward": 0.4832589514553547, "rewards/format_reward": 0.9508928917348385, "step": 283 }, { "completion_length": 348.8850631713867, "epoch": 0.4390762392501691, "grad_norm": 0.21948282391904447, "kl": 0.0526885986328125, "learning_rate": 5.942638341669229e-07, "loss": 0.0021, "reward": 1.4118304029107094, "reward_std": 0.3552003651857376, "rewards/accuracy_reward": 0.46651787497103214, "rewards/format_reward": 0.9453125298023224, "step": 284 }, { "completion_length": 352.7857275009155, "epoch": 0.4406222823461204, "grad_norm": 0.20338678368129295, "kl": 0.0513153076171875, "learning_rate": 5.918747589082852e-07, "loss": 0.0021, "reward": 1.3683036342263222, "reward_std": 0.37548818066716194, "rewards/accuracy_reward": 0.4207589440047741, "rewards/format_reward": 0.9475446790456772, "step": 285 }, { "completion_length": 368.7968921661377, "epoch": 0.4421683254420717, "grad_norm": 0.2151472813103246, "kl": 0.04834747314453125, "learning_rate": 5.894835107980487e-07, "loss": 0.0019, "reward": 1.398437574505806, "reward_std": 0.3863678425550461, "rewards/accuracy_reward": 0.4564732341095805, "rewards/format_reward": 0.9419643208384514, "step": 286 }, { "completion_length": 346.3292541503906, "epoch": 0.443714368538023, "grad_norm": 0.21927561121050024, "kl": 0.0564727783203125, "learning_rate": 5.87090146389584e-07, "loss": 0.0023, "reward": 1.3281250596046448, "reward_std": 0.3817227054387331, "rewards/accuracy_reward": 0.39732145331799984, "rewards/format_reward": 0.9308036118745804, "step": 287 }, { "completion_length": 339.10604095458984, "epoch": 0.4452604116339743, "grad_norm": 0.21317588582044172, "kl": 0.05975341796875, "learning_rate": 5.846947222863122e-07, "loss": 0.0024, "reward": 1.4386161342263222, "reward_std": 0.3724204935133457, "rewards/accuracy_reward": 0.4921875260770321, "rewards/format_reward": 0.9464286081492901, "step": 288 }, { "completion_length": 328.2846145629883, "epoch": 0.4468064547299256, "grad_norm": 0.20862736637777038, "kl": 0.054656982421875, "learning_rate": 5.82297295140367e-07, "loss": 0.0022, "reward": 1.3995536342263222, "reward_std": 0.3720461130142212, "rewards/accuracy_reward": 0.45312502048909664, "rewards/format_reward": 0.9464286155998707, "step": 289 }, { "completion_length": 327.1428699493408, "epoch": 0.4483524978258769, "grad_norm": 0.22682957688138164, "kl": 0.0567779541015625, "learning_rate": 5.798979216512535e-07, "loss": 0.0023, "reward": 1.4107143506407738, "reward_std": 0.38500375114381313, "rewards/accuracy_reward": 0.47656252048909664, "rewards/format_reward": 0.9341518208384514, "step": 290 }, { "completion_length": 360.409610748291, "epoch": 0.4498985409218282, "grad_norm": 0.23114743352977846, "kl": 0.0521087646484375, "learning_rate": 5.774966585645092e-07, "loss": 0.0021, "reward": 1.3526786342263222, "reward_std": 0.381423257291317, "rewards/accuracy_reward": 0.4363839514553547, "rewards/format_reward": 0.9162946827709675, "step": 291 }, { "completion_length": 368.91072845458984, "epoch": 0.4514445840177795, "grad_norm": 0.20021386748518938, "kl": 0.0507965087890625, "learning_rate": 5.750935626703597e-07, "loss": 0.002, "reward": 1.3716518580913544, "reward_std": 0.3628371078521013, "rewards/accuracy_reward": 0.4274553805589676, "rewards/format_reward": 0.9441964626312256, "step": 292 }, { "completion_length": 351.4587211608887, "epoch": 0.4529906271137308, "grad_norm": 0.22335409949607113, "kl": 0.056610107421875, "learning_rate": 5.726886908023776e-07, "loss": 0.0023, "reward": 1.4207589775323868, "reward_std": 0.35267431661486626, "rewards/accuracy_reward": 0.4933035969734192, "rewards/format_reward": 0.9274553954601288, "step": 293 }, { "completion_length": 342.4899711608887, "epoch": 0.4545366702096821, "grad_norm": 0.19585591739510527, "kl": 0.051116943359375, "learning_rate": 5.702820998361373e-07, "loss": 0.002, "reward": 1.4107143506407738, "reward_std": 0.32488697580993176, "rewards/accuracy_reward": 0.45870537497103214, "rewards/format_reward": 0.9520089738070965, "step": 294 }, { "completion_length": 345.9464473724365, "epoch": 0.4560827133056334, "grad_norm": 0.20332054057792817, "kl": 0.0555572509765625, "learning_rate": 5.678738466878698e-07, "loss": 0.0022, "reward": 1.4062500596046448, "reward_std": 0.3608044274151325, "rewards/accuracy_reward": 0.45089288242161274, "rewards/format_reward": 0.9553571678698063, "step": 295 }, { "completion_length": 310.38506031036377, "epoch": 0.4576287564015847, "grad_norm": 0.21064709917539165, "kl": 0.0596466064453125, "learning_rate": 5.654639883131177e-07, "loss": 0.0024, "reward": 1.4553572162985802, "reward_std": 0.33162576518952847, "rewards/accuracy_reward": 0.49553573690354824, "rewards/format_reward": 0.959821455180645, "step": 296 }, { "completion_length": 357.4765796661377, "epoch": 0.459174799497536, "grad_norm": 0.20396155420521594, "kl": 0.05218505859375, "learning_rate": 5.630525817053867e-07, "loss": 0.0021, "reward": 1.3560268431901932, "reward_std": 0.39283425733447075, "rewards/accuracy_reward": 0.41294644959270954, "rewards/format_reward": 0.9430803991854191, "step": 297 }, { "completion_length": 340.61831760406494, "epoch": 0.4607208425934873, "grad_norm": 0.18677159641912996, "kl": 0.0545806884765625, "learning_rate": 5.606396838947988e-07, "loss": 0.0022, "reward": 1.4464286342263222, "reward_std": 0.3255245238542557, "rewards/accuracy_reward": 0.5033482424914837, "rewards/format_reward": 0.9430803842842579, "step": 298 }, { "completion_length": 339.89398765563965, "epoch": 0.4622668856894386, "grad_norm": 0.19854307537521665, "kl": 0.053924560546875, "learning_rate": 5.582253519467432e-07, "loss": 0.0022, "reward": 1.3671875596046448, "reward_std": 0.31279612332582474, "rewards/accuracy_reward": 0.4207589477300644, "rewards/format_reward": 0.9464286006987095, "step": 299 }, { "completion_length": 329.54688835144043, "epoch": 0.4638129287853899, "grad_norm": 0.2286459207625636, "kl": 0.0568695068359375, "learning_rate": 5.558096429605262e-07, "loss": 0.0023, "reward": 1.4464286491274834, "reward_std": 0.36663145385682583, "rewards/accuracy_reward": 0.48102681152522564, "rewards/format_reward": 0.9654018245637417, "step": 300 }, { "completion_length": 370.1551513671875, "epoch": 0.4653589718813412, "grad_norm": 0.21800031015632276, "kl": 0.0556640625, "learning_rate": 5.533926140680221e-07, "loss": 0.0022, "reward": 1.289062574505806, "reward_std": 0.36649923026561737, "rewards/accuracy_reward": 0.3627232275903225, "rewards/format_reward": 0.926339328289032, "step": 301 }, { "completion_length": 349.71988677978516, "epoch": 0.4669050149772925, "grad_norm": 0.24070972268476618, "kl": 0.053985595703125, "learning_rate": 5.509743224323202e-07, "loss": 0.0022, "reward": 1.4185268506407738, "reward_std": 0.3582413122057915, "rewards/accuracy_reward": 0.47209823690354824, "rewards/format_reward": 0.9464286155998707, "step": 302 }, { "completion_length": 340.2042579650879, "epoch": 0.4684510580732438, "grad_norm": 0.2342124790915144, "kl": 0.0576629638671875, "learning_rate": 5.485548252463748e-07, "loss": 0.0023, "reward": 1.4475447088479996, "reward_std": 0.3714308552443981, "rewards/accuracy_reward": 0.5122768096625805, "rewards/format_reward": 0.9352678880095482, "step": 303 }, { "completion_length": 334.57144355773926, "epoch": 0.4699971011691951, "grad_norm": 0.2359187676031675, "kl": 0.0548095703125, "learning_rate": 5.46134179731651e-07, "loss": 0.0022, "reward": 1.4185268506407738, "reward_std": 0.35776047594845295, "rewards/accuracy_reward": 0.4765625186264515, "rewards/format_reward": 0.9419643245637417, "step": 304 }, { "completion_length": 338.9263553619385, "epoch": 0.4715431442651464, "grad_norm": 1.4686504408388361, "kl": 0.0578460693359375, "learning_rate": 5.437124431367722e-07, "loss": 0.0023, "reward": 1.3816964998841286, "reward_std": 0.3385177608579397, "rewards/accuracy_reward": 0.4330357341095805, "rewards/format_reward": 0.948660746216774, "step": 305 }, { "completion_length": 340.94867515563965, "epoch": 0.4730891873610977, "grad_norm": 0.19122560253262197, "kl": 0.0532684326171875, "learning_rate": 5.412896727361662e-07, "loss": 0.0021, "reward": 1.3950893431901932, "reward_std": 0.345390141941607, "rewards/accuracy_reward": 0.45312502328306437, "rewards/format_reward": 0.9419643171131611, "step": 306 }, { "completion_length": 350.7611770629883, "epoch": 0.474635230457049, "grad_norm": 0.2227653732942952, "kl": 0.0518646240234375, "learning_rate": 5.388659258287101e-07, "loss": 0.0021, "reward": 1.3258929178118706, "reward_std": 0.3519100649282336, "rewards/accuracy_reward": 0.3783482341095805, "rewards/format_reward": 0.9475446864962578, "step": 307 }, { "completion_length": 375.2209987640381, "epoch": 0.4761812735530003, "grad_norm": 0.22593605284123627, "kl": 0.05084228515625, "learning_rate": 5.364412597363758e-07, "loss": 0.002, "reward": 1.3526786416769028, "reward_std": 0.38130610063672066, "rewards/accuracy_reward": 0.4151785923168063, "rewards/format_reward": 0.937500037252903, "step": 308 }, { "completion_length": 379.37055587768555, "epoch": 0.47772731664895157, "grad_norm": 6.334054786887839, "kl": 0.049407958984375, "learning_rate": 5.340157318028741e-07, "loss": 0.002, "reward": 1.3303572162985802, "reward_std": 0.3717002235352993, "rewards/accuracy_reward": 0.4040178768336773, "rewards/format_reward": 0.9263393245637417, "step": 309 }, { "completion_length": 345.722110748291, "epoch": 0.47927335974490287, "grad_norm": 0.23102885586164632, "kl": 0.05413818359375, "learning_rate": 5.315893993922985e-07, "loss": 0.0022, "reward": 1.456473283469677, "reward_std": 0.34388269297778606, "rewards/accuracy_reward": 0.5156250204890966, "rewards/format_reward": 0.9408482573926449, "step": 310 }, { "completion_length": 361.8370723724365, "epoch": 0.48081940284085417, "grad_norm": 0.23463405744380164, "kl": 0.0537872314453125, "learning_rate": 5.29162319887768e-07, "loss": 0.0022, "reward": 1.378348283469677, "reward_std": 0.4168297238647938, "rewards/accuracy_reward": 0.443080373108387, "rewards/format_reward": 0.9352679029107094, "step": 311 }, { "completion_length": 360.0725612640381, "epoch": 0.48236544593680547, "grad_norm": 0.20703278079071322, "kl": 0.055572509765625, "learning_rate": 5.26734550690071e-07, "loss": 0.0022, "reward": 1.3359375521540642, "reward_std": 0.3871735446155071, "rewards/accuracy_reward": 0.40066966600716114, "rewards/format_reward": 0.9352678954601288, "step": 312 }, { "completion_length": 346.43193435668945, "epoch": 0.48391148903275677, "grad_norm": 0.20559336398954173, "kl": 0.0543060302734375, "learning_rate": 5.243061492163072e-07, "loss": 0.0022, "reward": 1.3727679252624512, "reward_std": 0.33929229620844126, "rewards/accuracy_reward": 0.43415180779993534, "rewards/format_reward": 0.9386161044239998, "step": 313 }, { "completion_length": 350.4721145629883, "epoch": 0.48545753212870807, "grad_norm": 0.21180160621141955, "kl": 0.053924560546875, "learning_rate": 5.218771728985295e-07, "loss": 0.0022, "reward": 1.3671875670552254, "reward_std": 0.3767853993922472, "rewards/accuracy_reward": 0.4330357341095805, "rewards/format_reward": 0.9341518171131611, "step": 314 }, { "completion_length": 357.71206855773926, "epoch": 0.48700357522465937, "grad_norm": 0.2072818141611984, "kl": 0.052398681640625, "learning_rate": 5.194476791823862e-07, "loss": 0.0021, "reward": 1.3683036416769028, "reward_std": 0.36060420144349337, "rewards/accuracy_reward": 0.4352678768336773, "rewards/format_reward": 0.933035746216774, "step": 315 }, { "completion_length": 322.3203296661377, "epoch": 0.48854961832061067, "grad_norm": 0.21807009482594328, "kl": 0.057159423828125, "learning_rate": 5.170177255257617e-07, "loss": 0.0023, "reward": 1.421875074505806, "reward_std": 0.3710249485448003, "rewards/accuracy_reward": 0.47098216228187084, "rewards/format_reward": 0.9508928880095482, "step": 316 }, { "completion_length": 368.0569362640381, "epoch": 0.49009566141656197, "grad_norm": 0.17699401396423542, "kl": 0.051971435546875, "learning_rate": 5.145873693974188e-07, "loss": 0.0021, "reward": 1.3950893580913544, "reward_std": 0.37810577638447285, "rewards/accuracy_reward": 0.46316966600716114, "rewards/format_reward": 0.9319196753203869, "step": 317 }, { "completion_length": 361.1607303619385, "epoch": 0.49164170451251327, "grad_norm": 0.24115725223528564, "kl": 0.0494537353515625, "learning_rate": 5.12156668275638e-07, "loss": 0.002, "reward": 1.392857201397419, "reward_std": 0.3769874516874552, "rewards/accuracy_reward": 0.45758930779993534, "rewards/format_reward": 0.9352678917348385, "step": 318 }, { "completion_length": 338.47546005249023, "epoch": 0.49318774760846457, "grad_norm": 4.090615824579645, "kl": 0.06268310546875, "learning_rate": 5.097256796468597e-07, "loss": 0.0025, "reward": 1.468750074505806, "reward_std": 0.3408730737864971, "rewards/accuracy_reward": 0.520089304074645, "rewards/format_reward": 0.948660746216774, "step": 319 }, { "completion_length": 374.2611770629883, "epoch": 0.49473379070441587, "grad_norm": 0.1931627647900453, "kl": 0.0477752685546875, "learning_rate": 5.072944610043232e-07, "loss": 0.0019, "reward": 1.370535783469677, "reward_std": 0.3581877052783966, "rewards/accuracy_reward": 0.4107143012806773, "rewards/format_reward": 0.9598214626312256, "step": 320 }, { "completion_length": 329.26117515563965, "epoch": 0.49627983380036716, "grad_norm": 0.20289071232357106, "kl": 0.057708740234375, "learning_rate": 5.04863069846708e-07, "loss": 0.0023, "reward": 1.4196429178118706, "reward_std": 0.34264824353158474, "rewards/accuracy_reward": 0.46540180686861277, "rewards/format_reward": 0.9542411081492901, "step": 321 }, { "completion_length": 354.8761329650879, "epoch": 0.49782587689631846, "grad_norm": 0.22337990019023396, "kl": 0.060791015625, "learning_rate": 5.024315636767737e-07, "loss": 0.0024, "reward": 1.4263393580913544, "reward_std": 0.3727464023977518, "rewards/accuracy_reward": 0.48549109511077404, "rewards/format_reward": 0.9408482536673546, "step": 322 }, { "completion_length": 377.4933223724365, "epoch": 0.49937191999226976, "grad_norm": 0.20264878604319975, "kl": 0.049102783203125, "learning_rate": 5e-07, "loss": 0.002, "reward": 1.360491119325161, "reward_std": 0.3612466547638178, "rewards/accuracy_reward": 0.42633930407464504, "rewards/format_reward": 0.9341518245637417, "step": 323 }, { "completion_length": 371.3526954650879, "epoch": 0.5009179630882211, "grad_norm": 0.3623065474202221, "kl": 0.0565948486328125, "learning_rate": 4.975684363232263e-07, "loss": 0.0023, "reward": 1.3214286267757416, "reward_std": 0.36202103458344936, "rewards/accuracy_reward": 0.39620537124574184, "rewards/format_reward": 0.9252232499420643, "step": 324 }, { "completion_length": 334.12055015563965, "epoch": 0.5024640061841724, "grad_norm": 0.2182352315592751, "kl": 0.0637359619140625, "learning_rate": 4.951369301532918e-07, "loss": 0.0025, "reward": 1.3984375596046448, "reward_std": 0.3848156426101923, "rewards/accuracy_reward": 0.4497768022119999, "rewards/format_reward": 0.9486607499420643, "step": 325 }, { "completion_length": 377.83707427978516, "epoch": 0.5040100492801237, "grad_norm": 0.2467678489977953, "kl": 0.059326171875, "learning_rate": 4.927055389956768e-07, "loss": 0.0024, "reward": 1.2924107648432255, "reward_std": 0.34728911705315113, "rewards/accuracy_reward": 0.36495537497103214, "rewards/format_reward": 0.9274554029107094, "step": 326 }, { "completion_length": 333.2701053619385, "epoch": 0.505556092376075, "grad_norm": 0.2631903772142661, "kl": 0.059600830078125, "learning_rate": 4.902743203531404e-07, "loss": 0.0024, "reward": 1.4296875596046448, "reward_std": 0.34190649166703224, "rewards/accuracy_reward": 0.4676339542493224, "rewards/format_reward": 0.9620536081492901, "step": 327 }, { "completion_length": 362.92746925354004, "epoch": 0.5071021354720263, "grad_norm": 0.21590931705200603, "kl": 0.054718017578125, "learning_rate": 4.87843331724362e-07, "loss": 0.0022, "reward": 1.430803619325161, "reward_std": 0.38321356754750013, "rewards/accuracy_reward": 0.4888393022119999, "rewards/format_reward": 0.9419643320143223, "step": 328 }, { "completion_length": 361.91966247558594, "epoch": 0.5086481785679776, "grad_norm": 0.2185981172742161, "kl": 0.053192138671875, "learning_rate": 4.854126306025812e-07, "loss": 0.0021, "reward": 1.4140625521540642, "reward_std": 0.35025477409362793, "rewards/accuracy_reward": 0.4754464514553547, "rewards/format_reward": 0.9386161044239998, "step": 329 }, { "completion_length": 342.757830619812, "epoch": 0.5101942216639289, "grad_norm": 0.21126681161649533, "kl": 0.0544586181640625, "learning_rate": 4.829822744742382e-07, "loss": 0.0022, "reward": 1.390625074505806, "reward_std": 0.32597475312650204, "rewards/accuracy_reward": 0.44084823317825794, "rewards/format_reward": 0.9497768133878708, "step": 330 }, { "completion_length": 337.52456855773926, "epoch": 0.5117402647598802, "grad_norm": 0.2376845418352936, "kl": 0.053253173828125, "learning_rate": 4.805523208176138e-07, "loss": 0.0021, "reward": 1.4419643506407738, "reward_std": 0.3619873020797968, "rewards/accuracy_reward": 0.4955357387661934, "rewards/format_reward": 0.9464285969734192, "step": 331 }, { "completion_length": 348.8872938156128, "epoch": 0.5132863078558315, "grad_norm": 0.19743958954063837, "kl": 0.0562591552734375, "learning_rate": 4.781228271014703e-07, "loss": 0.0023, "reward": 1.4341518506407738, "reward_std": 0.3548443615436554, "rewards/accuracy_reward": 0.5066964514553547, "rewards/format_reward": 0.9274553880095482, "step": 332 }, { "completion_length": 369.0167579650879, "epoch": 0.5148323509517828, "grad_norm": 0.215959188396259, "kl": 0.052581787109375, "learning_rate": 4.7569385078369287e-07, "loss": 0.0021, "reward": 1.362723283469677, "reward_std": 0.3749706353992224, "rewards/accuracy_reward": 0.41629466880112886, "rewards/format_reward": 0.9464286081492901, "step": 333 }, { "completion_length": 350.8716697692871, "epoch": 0.5163783940477341, "grad_norm": 0.21148588517747882, "kl": 0.0518951416015625, "learning_rate": 4.73265449309929e-07, "loss": 0.0021, "reward": 1.338169701397419, "reward_std": 0.3761606551706791, "rewards/accuracy_reward": 0.3984375260770321, "rewards/format_reward": 0.9397321678698063, "step": 334 }, { "completion_length": 347.04354095458984, "epoch": 0.5179244371436854, "grad_norm": 0.2052518062570225, "kl": 0.0567626953125, "learning_rate": 4.708376801122321e-07, "loss": 0.0023, "reward": 1.4095982760190964, "reward_std": 0.4013300519436598, "rewards/accuracy_reward": 0.4665178842842579, "rewards/format_reward": 0.9430803917348385, "step": 335 }, { "completion_length": 359.7645263671875, "epoch": 0.5194704802396367, "grad_norm": 1.5090419123904983, "kl": 0.054718017578125, "learning_rate": 4.684106006077015e-07, "loss": 0.0022, "reward": 1.4029018506407738, "reward_std": 0.3834494426846504, "rewards/accuracy_reward": 0.47321431152522564, "rewards/format_reward": 0.929687537252903, "step": 336 }, { "completion_length": 337.0859537124634, "epoch": 0.521016523335588, "grad_norm": 0.21491186258399708, "kl": 0.05377197265625, "learning_rate": 4.659842681971257e-07, "loss": 0.0022, "reward": 1.4129464849829674, "reward_std": 0.3800941947847605, "rewards/accuracy_reward": 0.4732143133878708, "rewards/format_reward": 0.9397321790456772, "step": 337 }, { "completion_length": 352.2980070114136, "epoch": 0.5225625664315393, "grad_norm": 0.20611656666502773, "kl": 0.053955078125, "learning_rate": 4.6355874026362406e-07, "loss": 0.0022, "reward": 1.3660714887082577, "reward_std": 0.3588948119431734, "rewards/accuracy_reward": 0.4229910895228386, "rewards/format_reward": 0.9430803954601288, "step": 338 }, { "completion_length": 390.8917541503906, "epoch": 0.5241086095274906, "grad_norm": 0.18700097965431828, "kl": 0.0490264892578125, "learning_rate": 4.6113407417129003e-07, "loss": 0.002, "reward": 1.3515625521540642, "reward_std": 0.36890473030507565, "rewards/accuracy_reward": 0.42968751583248377, "rewards/format_reward": 0.9218750447034836, "step": 339 }, { "completion_length": 339.3013563156128, "epoch": 0.5256546526234419, "grad_norm": 0.17907668069318344, "kl": 0.0592041015625, "learning_rate": 4.5871032726383385e-07, "loss": 0.0024, "reward": 1.397321492433548, "reward_std": 0.31921094097197056, "rewards/accuracy_reward": 0.4520089477300644, "rewards/format_reward": 0.9453125335276127, "step": 340 }, { "completion_length": 363.39733600616455, "epoch": 0.5272006957193932, "grad_norm": 0.20534424746041066, "kl": 0.05263519287109375, "learning_rate": 4.562875568632278e-07, "loss": 0.0021, "reward": 1.4296875670552254, "reward_std": 0.396926898509264, "rewards/accuracy_reward": 0.48772324435412884, "rewards/format_reward": 0.9419643320143223, "step": 341 }, { "completion_length": 320.14845085144043, "epoch": 0.5287467388153445, "grad_norm": 0.20382313970845883, "kl": 0.0609893798828125, "learning_rate": 4.5386582026834904e-07, "loss": 0.0024, "reward": 1.4363839998841286, "reward_std": 0.33548835292458534, "rewards/accuracy_reward": 0.48995538242161274, "rewards/format_reward": 0.9464286081492901, "step": 342 }, { "completion_length": 323.927472114563, "epoch": 0.5302927819112958, "grad_norm": 0.22806038172333243, "kl": 0.0590667724609375, "learning_rate": 4.5144517475362506e-07, "loss": 0.0024, "reward": 1.3616072088479996, "reward_std": 0.35409387201070786, "rewards/accuracy_reward": 0.4252232341095805, "rewards/format_reward": 0.9363839626312256, "step": 343 }, { "completion_length": 343.2578296661377, "epoch": 0.5318388250072471, "grad_norm": 0.20089413996756553, "kl": 0.057281494140625, "learning_rate": 4.4902567756767966e-07, "loss": 0.0023, "reward": 1.4017857909202576, "reward_std": 0.343368505127728, "rewards/accuracy_reward": 0.46093752048909664, "rewards/format_reward": 0.9408482573926449, "step": 344 }, { "completion_length": 349.9118404388428, "epoch": 0.5333848681031984, "grad_norm": 0.21273708192811014, "kl": 0.0590972900390625, "learning_rate": 4.4660738593197806e-07, "loss": 0.0024, "reward": 1.4196429029107094, "reward_std": 0.3941789669916034, "rewards/accuracy_reward": 0.48102680779993534, "rewards/format_reward": 0.9386161081492901, "step": 345 }, { "completion_length": 365.0636320114136, "epoch": 0.5349309111991497, "grad_norm": 0.21815109737140953, "kl": 0.053009033203125, "learning_rate": 4.441903570394739e-07, "loss": 0.0021, "reward": 1.3805804178118706, "reward_std": 0.4088291320949793, "rewards/accuracy_reward": 0.44531252700835466, "rewards/format_reward": 0.9352678954601288, "step": 346 }, { "completion_length": 331.21988105773926, "epoch": 0.536476954295101, "grad_norm": 0.19737046113131365, "kl": 0.0580596923828125, "learning_rate": 4.417746480532569e-07, "loss": 0.0023, "reward": 1.4386161491274834, "reward_std": 0.3433207552880049, "rewards/accuracy_reward": 0.49553574435412884, "rewards/format_reward": 0.9430803954601288, "step": 347 }, { "completion_length": 347.1730098724365, "epoch": 0.5380229973910523, "grad_norm": 0.19945940720698732, "kl": 0.05621337890625, "learning_rate": 4.393603161052012e-07, "loss": 0.0022, "reward": 1.3325893506407738, "reward_std": 0.35241850558668375, "rewards/accuracy_reward": 0.40290180407464504, "rewards/format_reward": 0.9296875409781933, "step": 348 }, { "completion_length": 349.1864013671875, "epoch": 0.5395690404870036, "grad_norm": 0.19343421413058784, "kl": 0.056610107421875, "learning_rate": 4.369474182946133e-07, "loss": 0.0023, "reward": 1.3917411267757416, "reward_std": 0.35905371606349945, "rewards/accuracy_reward": 0.4508928768336773, "rewards/format_reward": 0.9408482499420643, "step": 349 }, { "completion_length": 325.79912185668945, "epoch": 0.5411150835829549, "grad_norm": 0.349839780025062, "kl": 0.0573272705078125, "learning_rate": 4.345360116868822e-07, "loss": 0.0023, "reward": 1.3906250670552254, "reward_std": 0.3201076574623585, "rewards/accuracy_reward": 0.43638395331799984, "rewards/format_reward": 0.9542411081492901, "step": 350 }, { "completion_length": 332.11608600616455, "epoch": 0.5426611266789062, "grad_norm": 0.22824864592621472, "kl": 0.062469482421875, "learning_rate": 4.321261533121303e-07, "loss": 0.0025, "reward": 1.4464286491274834, "reward_std": 0.3867763392627239, "rewards/accuracy_reward": 0.4977678805589676, "rewards/format_reward": 0.9486607499420643, "step": 351 }, { "completion_length": 348.09822845458984, "epoch": 0.5442071697748575, "grad_norm": 0.20797864176039044, "kl": 0.0547332763671875, "learning_rate": 4.2971790016386283e-07, "loss": 0.0022, "reward": 1.3950893506407738, "reward_std": 0.3817948196083307, "rewards/accuracy_reward": 0.46093752048909664, "rewards/format_reward": 0.9341518245637417, "step": 352 }, { "completion_length": 381.0100612640381, "epoch": 0.5457532128708088, "grad_norm": 0.18766967223055536, "kl": 0.0488739013671875, "learning_rate": 4.273113091976225e-07, "loss": 0.002, "reward": 1.3236607909202576, "reward_std": 0.37489909771829844, "rewards/accuracy_reward": 0.3772321604192257, "rewards/format_reward": 0.9464286044239998, "step": 353 }, { "completion_length": 329.3526954650879, "epoch": 0.5472992559667601, "grad_norm": 0.2301291372928604, "kl": 0.0626220703125, "learning_rate": 4.249064373296403e-07, "loss": 0.0025, "reward": 1.4598214998841286, "reward_std": 0.3439429299905896, "rewards/accuracy_reward": 0.5033482350409031, "rewards/format_reward": 0.9564732350409031, "step": 354 }, { "completion_length": 322.1607313156128, "epoch": 0.5488452990627114, "grad_norm": 0.18047400271888908, "kl": 0.0597991943359375, "learning_rate": 4.2250334143549085e-07, "loss": 0.0024, "reward": 1.4966518431901932, "reward_std": 0.3006080109626055, "rewards/accuracy_reward": 0.5390625242143869, "rewards/format_reward": 0.9575893171131611, "step": 355 }, { "completion_length": 325.1015787124634, "epoch": 0.5503913421586627, "grad_norm": 0.23062932002387332, "kl": 0.0581512451171875, "learning_rate": 4.201020783487464e-07, "loss": 0.0023, "reward": 1.3761161416769028, "reward_std": 0.33726252242922783, "rewards/accuracy_reward": 0.41852680034935474, "rewards/format_reward": 0.9575893245637417, "step": 356 }, { "completion_length": 353.04800605773926, "epoch": 0.551937385254614, "grad_norm": 0.22097596404575218, "kl": 0.0574188232421875, "learning_rate": 4.1770270485963294e-07, "loss": 0.0023, "reward": 1.3504464849829674, "reward_std": 0.38889890164136887, "rewards/accuracy_reward": 0.4207589505240321, "rewards/format_reward": 0.929687537252903, "step": 357 }, { "completion_length": 342.64510345458984, "epoch": 0.5534834283505653, "grad_norm": 0.24054840109104664, "kl": 0.0587615966796875, "learning_rate": 4.1530527771368783e-07, "loss": 0.0023, "reward": 1.4966518506407738, "reward_std": 0.3891422264277935, "rewards/accuracy_reward": 0.5479910969734192, "rewards/format_reward": 0.9486607499420643, "step": 358 }, { "completion_length": 329.88952255249023, "epoch": 0.5550294714465166, "grad_norm": 0.2088602183885269, "kl": 0.05877685546875, "learning_rate": 4.129098536104161e-07, "loss": 0.0023, "reward": 1.4441964775323868, "reward_std": 0.32884572073817253, "rewards/accuracy_reward": 0.49776788521558046, "rewards/format_reward": 0.9464286081492901, "step": 359 }, { "completion_length": 343.7667541503906, "epoch": 0.5565755145424679, "grad_norm": 0.19746935575797003, "kl": 0.05535888671875, "learning_rate": 4.1051648920195136e-07, "loss": 0.0022, "reward": 1.4095982611179352, "reward_std": 0.3500424511730671, "rewards/accuracy_reward": 0.4587053805589676, "rewards/format_reward": 0.9508928917348385, "step": 360 }, { "completion_length": 308.35157680511475, "epoch": 0.5581215576384192, "grad_norm": 0.2580451080932716, "kl": 0.0680084228515625, "learning_rate": 4.0812524109171475e-07, "loss": 0.0027, "reward": 1.4118304252624512, "reward_std": 0.3454737737774849, "rewards/accuracy_reward": 0.463169664144516, "rewards/format_reward": 0.948660746216774, "step": 361 }, { "completion_length": 355.0480041503906, "epoch": 0.5596676007343705, "grad_norm": 0.19255777905460122, "kl": 0.0562286376953125, "learning_rate": 4.05736165833077e-07, "loss": 0.0022, "reward": 1.3973214849829674, "reward_std": 0.3390871975570917, "rewards/accuracy_reward": 0.4564732341095805, "rewards/format_reward": 0.9408482536673546, "step": 362 }, { "completion_length": 342.4754638671875, "epoch": 0.5612136438303218, "grad_norm": 0.21328612136072123, "kl": 0.054290771484375, "learning_rate": 4.033493199280202e-07, "loss": 0.0022, "reward": 1.344866119325161, "reward_std": 0.36595124658197165, "rewards/accuracy_reward": 0.3939732362050563, "rewards/format_reward": 0.9508928917348385, "step": 363 }, { "completion_length": 338.3805961608887, "epoch": 0.5627596869262731, "grad_norm": 0.21075296165539636, "kl": 0.053741455078125, "learning_rate": 4.0096475982580214e-07, "loss": 0.0022, "reward": 1.3872768506407738, "reward_std": 0.3660036325454712, "rewards/accuracy_reward": 0.42745537869632244, "rewards/format_reward": 0.959821455180645, "step": 364 }, { "completion_length": 331.44867515563965, "epoch": 0.5643057300222244, "grad_norm": 0.22469966949598, "kl": 0.0565643310546875, "learning_rate": 3.985825419216207e-07, "loss": 0.0023, "reward": 1.3225446790456772, "reward_std": 0.37969554774463177, "rewards/accuracy_reward": 0.3772321587894112, "rewards/format_reward": 0.9453125335276127, "step": 365 }, { "completion_length": 342.2779150009155, "epoch": 0.5658517731181757, "grad_norm": 0.2126782041822118, "kl": 0.0570526123046875, "learning_rate": 3.9620272255528064e-07, "loss": 0.0023, "reward": 1.3616072088479996, "reward_std": 0.3656510002911091, "rewards/accuracy_reward": 0.42968752048909664, "rewards/format_reward": 0.9319196827709675, "step": 366 }, { "completion_length": 315.90849685668945, "epoch": 0.567397816214127, "grad_norm": 0.20774842045357322, "kl": 0.0587921142578125, "learning_rate": 3.938253580098613e-07, "loss": 0.0024, "reward": 1.4631697237491608, "reward_std": 0.3117134403437376, "rewards/accuracy_reward": 0.508928595110774, "rewards/format_reward": 0.9542410969734192, "step": 367 }, { "completion_length": 324.8560428619385, "epoch": 0.5689438593100783, "grad_norm": 0.23540382037431148, "kl": 0.064117431640625, "learning_rate": 3.9145050451038447e-07, "loss": 0.0026, "reward": 1.361607201397419, "reward_std": 0.38323674723505974, "rewards/accuracy_reward": 0.412946455180645, "rewards/format_reward": 0.9486607499420643, "step": 368 }, { "completion_length": 339.01117515563965, "epoch": 0.5704899024060296, "grad_norm": 0.22191539924685727, "kl": 0.064483642578125, "learning_rate": 3.89078218222486e-07, "loss": 0.0026, "reward": 1.3058036267757416, "reward_std": 0.3616667855530977, "rewards/accuracy_reward": 0.3683035848662257, "rewards/format_reward": 0.9375000335276127, "step": 369 }, { "completion_length": 338.25559520721436, "epoch": 0.5720359455019809, "grad_norm": 0.22109516076554414, "kl": 0.053863525390625, "learning_rate": 3.867085552510864e-07, "loss": 0.0022, "reward": 1.3638393431901932, "reward_std": 0.38690496422350407, "rewards/accuracy_reward": 0.4241071594879031, "rewards/format_reward": 0.9397321790456772, "step": 370 }, { "completion_length": 331.21653270721436, "epoch": 0.5735819885979322, "grad_norm": 0.22419181115754877, "kl": 0.06097412109375, "learning_rate": 3.8434157163906433e-07, "loss": 0.0024, "reward": 1.4162947088479996, "reward_std": 0.37135083600878716, "rewards/accuracy_reward": 0.4821428768336773, "rewards/format_reward": 0.9341518245637417, "step": 371 }, { "completion_length": 326.35157680511475, "epoch": 0.5751280316938835, "grad_norm": 0.2262558214607883, "kl": 0.060150146484375, "learning_rate": 3.819773233659314e-07, "loss": 0.0024, "reward": 1.4140625670552254, "reward_std": 0.38730306550860405, "rewards/accuracy_reward": 0.4732143059372902, "rewards/format_reward": 0.940848246216774, "step": 372 }, { "completion_length": 343.60715675354004, "epoch": 0.5766740747898348, "grad_norm": 0.2207885334506212, "kl": 0.058502197265625, "learning_rate": 3.7961586634650767e-07, "loss": 0.0023, "reward": 1.3761161416769028, "reward_std": 0.3860869398340583, "rewards/accuracy_reward": 0.45535716228187084, "rewards/format_reward": 0.9207589626312256, "step": 373 }, { "completion_length": 343.1529140472412, "epoch": 0.5782201178857861, "grad_norm": 0.2023752847507902, "kl": 0.0552825927734375, "learning_rate": 3.772572564296004e-07, "loss": 0.0022, "reward": 1.3258929327130318, "reward_std": 0.3573278747498989, "rewards/accuracy_reward": 0.3694196566939354, "rewards/format_reward": 0.9564732424914837, "step": 374 }, { "completion_length": 362.9442138671875, "epoch": 0.5797661609817374, "grad_norm": 0.20436634445338317, "kl": 0.054412841796875, "learning_rate": 3.7490154939668174e-07, "loss": 0.0022, "reward": 1.2812500484287739, "reward_std": 0.3703052271157503, "rewards/accuracy_reward": 0.34933037403970957, "rewards/format_reward": 0.9319196790456772, "step": 375 }, { "completion_length": 348.5524711608887, "epoch": 0.5813122040776887, "grad_norm": 0.22873274714077377, "kl": 0.056671142578125, "learning_rate": 3.7254880096057075e-07, "loss": 0.0023, "reward": 1.3225447088479996, "reward_std": 0.3950356412678957, "rewards/accuracy_reward": 0.3783482303842902, "rewards/format_reward": 0.9441964626312256, "step": 376 }, { "completion_length": 345.7143020629883, "epoch": 0.58285824717364, "grad_norm": 0.23145365095994766, "kl": 0.05558013916015625, "learning_rate": 3.701990667641144e-07, "loss": 0.0022, "reward": 1.3281250521540642, "reward_std": 0.38679394498467445, "rewards/accuracy_reward": 0.3917410932481289, "rewards/format_reward": 0.9363839589059353, "step": 377 }, { "completion_length": 358.2667541503906, "epoch": 0.5844042902695913, "grad_norm": 0.21166159948326047, "kl": 0.0553741455078125, "learning_rate": 3.678524023788735e-07, "loss": 0.0022, "reward": 1.3314732685685158, "reward_std": 0.363291478715837, "rewards/accuracy_reward": 0.4073660891735926, "rewards/format_reward": 0.9241071790456772, "step": 378 }, { "completion_length": 332.5513505935669, "epoch": 0.5859503333655426, "grad_norm": 34914.07077168525, "kl": 18.683639526367188, "learning_rate": 3.6550886330380663e-07, "loss": 0.7473, "reward": 1.3437500819563866, "reward_std": 0.36664837040007114, "rewards/accuracy_reward": 0.39620537497103214, "rewards/format_reward": 0.9475446790456772, "step": 379 }, { "completion_length": 346.3504581451416, "epoch": 0.5874963764614939, "grad_norm": 0.2552972509979795, "kl": 0.06121826171875, "learning_rate": 3.6316850496395855e-07, "loss": 0.0024, "reward": 1.3861607909202576, "reward_std": 0.384552039206028, "rewards/accuracy_reward": 0.4419643096625805, "rewards/format_reward": 0.9441964626312256, "step": 380 }, { "completion_length": 367.38060092926025, "epoch": 0.5890424195574452, "grad_norm": 0.23741240611397457, "kl": 0.05429840087890625, "learning_rate": 3.6083138270914924e-07, "loss": 0.0022, "reward": 1.354910783469677, "reward_std": 0.42051708698272705, "rewards/accuracy_reward": 0.41629466600716114, "rewards/format_reward": 0.9386161081492901, "step": 381 }, { "completion_length": 346.66184425354004, "epoch": 0.5905884626533965, "grad_norm": 0.21937974332773263, "kl": 0.0611724853515625, "learning_rate": 3.5849755181266474e-07, "loss": 0.0024, "reward": 1.3928572088479996, "reward_std": 0.3466625986620784, "rewards/accuracy_reward": 0.44419644959270954, "rewards/format_reward": 0.9486607499420643, "step": 382 }, { "completion_length": 331.49108505249023, "epoch": 0.5921345057493478, "grad_norm": 0.23769306643214383, "kl": 0.0550537109375, "learning_rate": 3.5616706746995023e-07, "loss": 0.0022, "reward": 1.385044701397419, "reward_std": 0.34990069828927517, "rewards/accuracy_reward": 0.4330357387661934, "rewards/format_reward": 0.9520089738070965, "step": 383 }, { "completion_length": 335.6238946914673, "epoch": 0.5936805488452991, "grad_norm": 0.2512338702938806, "kl": 0.0652923583984375, "learning_rate": 3.5383998479730353e-07, "loss": 0.0026, "reward": 1.369419701397419, "reward_std": 0.3753913315013051, "rewards/accuracy_reward": 0.4296875223517418, "rewards/format_reward": 0.9397321715950966, "step": 384 }, { "completion_length": 334.7779178619385, "epoch": 0.5952265919412504, "grad_norm": 0.19803698469248418, "kl": 0.0581817626953125, "learning_rate": 3.515163588305735e-07, "loss": 0.0023, "reward": 1.4095982760190964, "reward_std": 0.3489046413451433, "rewards/accuracy_reward": 0.4676339514553547, "rewards/format_reward": 0.9419643208384514, "step": 385 }, { "completion_length": 376.1763610839844, "epoch": 0.5967726350372017, "grad_norm": 0.19909844423690443, "kl": 0.05255126953125, "learning_rate": 3.4919624452385685e-07, "loss": 0.0021, "reward": 1.3325893506407738, "reward_std": 0.38019105792045593, "rewards/accuracy_reward": 0.3973214402794838, "rewards/format_reward": 0.9352678991854191, "step": 386 }, { "completion_length": 332.7276954650879, "epoch": 0.598318678133153, "grad_norm": 0.2055605624943385, "kl": 0.0621185302734375, "learning_rate": 3.468796967481991e-07, "loss": 0.0025, "reward": 1.415178619325161, "reward_std": 0.311306769028306, "rewards/accuracy_reward": 0.4598214505240321, "rewards/format_reward": 0.9553571753203869, "step": 387 }, { "completion_length": 335.1484537124634, "epoch": 0.5998647212291043, "grad_norm": 0.2400672387578078, "kl": 0.059539794921875, "learning_rate": 3.4456677029029683e-07, "loss": 0.0024, "reward": 1.3493304178118706, "reward_std": 0.35563754476606846, "rewards/accuracy_reward": 0.3984375186264515, "rewards/format_reward": 0.9508929029107094, "step": 388 }, { "completion_length": 308.65626525878906, "epoch": 0.6014107643250556, "grad_norm": 0.239092516112053, "kl": 0.0678558349609375, "learning_rate": 3.422575198512021e-07, "loss": 0.0027, "reward": 1.3973214998841286, "reward_std": 0.3569658827036619, "rewards/accuracy_reward": 0.45535716600716114, "rewards/format_reward": 0.9419643171131611, "step": 389 }, { "completion_length": 335.8169765472412, "epoch": 0.6029568074210069, "grad_norm": 0.199007365452711, "kl": 0.0563201904296875, "learning_rate": 3.399520000450281e-07, "loss": 0.0023, "reward": 1.4151786416769028, "reward_std": 0.33390526846051216, "rewards/accuracy_reward": 0.46316966973245144, "rewards/format_reward": 0.952008955180645, "step": 390 }, { "completion_length": 338.53796005249023, "epoch": 0.6045028505169582, "grad_norm": 0.23333624682694198, "kl": 0.059600830078125, "learning_rate": 3.3765026539765827e-07, "loss": 0.0024, "reward": 1.3883929178118706, "reward_std": 0.35989311151206493, "rewards/accuracy_reward": 0.43191966228187084, "rewards/format_reward": 0.9564732387661934, "step": 391 }, { "completion_length": 338.38171005249023, "epoch": 0.6060488936129095, "grad_norm": 0.20155936574890443, "kl": 0.0611724853515625, "learning_rate": 3.3535237034545674e-07, "loss": 0.0024, "reward": 1.370535783469677, "reward_std": 0.35608108527958393, "rewards/accuracy_reward": 0.43750002328306437, "rewards/format_reward": 0.933035746216774, "step": 392 }, { "completion_length": 308.5814847946167, "epoch": 0.6075949367088608, "grad_norm": 0.23267871587703512, "kl": 0.064056396484375, "learning_rate": 3.330583692339802e-07, "loss": 0.0026, "reward": 1.4631697088479996, "reward_std": 0.35295620560646057, "rewards/accuracy_reward": 0.5145089533179998, "rewards/format_reward": 0.948660746216774, "step": 393 }, { "completion_length": 340.65179920196533, "epoch": 0.6091409798048121, "grad_norm": 0.25112282178573264, "kl": 0.0608673095703125, "learning_rate": 3.307683163166933e-07, "loss": 0.0024, "reward": 1.3727679178118706, "reward_std": 0.3918771632015705, "rewards/accuracy_reward": 0.44531252048909664, "rewards/format_reward": 0.9274553880095482, "step": 394 }, { "completion_length": 352.4877414703369, "epoch": 0.6106870229007634, "grad_norm": 0.23466214226188015, "kl": 0.0563507080078125, "learning_rate": 3.2848226575368557e-07, "loss": 0.0023, "reward": 1.3839286416769028, "reward_std": 0.36221239902079105, "rewards/accuracy_reward": 0.4352678768336773, "rewards/format_reward": 0.9486607424914837, "step": 395 }, { "completion_length": 340.0580520629883, "epoch": 0.6122330659967147, "grad_norm": 0.20162890800309638, "kl": 0.0593414306640625, "learning_rate": 3.262002716103897e-07, "loss": 0.0024, "reward": 1.3794643506407738, "reward_std": 0.3633528742939234, "rewards/accuracy_reward": 0.43415180779993534, "rewards/format_reward": 0.9453125447034836, "step": 396 }, { "completion_length": 330.94421195983887, "epoch": 0.613779109092666, "grad_norm": 0.22551045263808805, "kl": 0.05859375, "learning_rate": 3.239223878563038e-07, "loss": 0.0023, "reward": 1.4107143431901932, "reward_std": 0.3625910487025976, "rewards/accuracy_reward": 0.4654018133878708, "rewards/format_reward": 0.945312537252903, "step": 397 }, { "completion_length": 341.0033655166626, "epoch": 0.6153251521886173, "grad_norm": 0.19268943889410037, "kl": 0.0610809326171875, "learning_rate": 3.216486683637146e-07, "loss": 0.0024, "reward": 1.3526786416769028, "reward_std": 0.32396455854177475, "rewards/accuracy_reward": 0.40513394586741924, "rewards/format_reward": 0.9475446753203869, "step": 398 }, { "completion_length": 336.17635440826416, "epoch": 0.6168711952845686, "grad_norm": 0.22731050677998238, "kl": 0.06024169921875, "learning_rate": 3.1937916690642355e-07, "loss": 0.0024, "reward": 1.390625074505806, "reward_std": 0.375935398042202, "rewards/accuracy_reward": 0.45424109138548374, "rewards/format_reward": 0.9363839626312256, "step": 399 }, { "completion_length": 321.92746925354004, "epoch": 0.6184172383805199, "grad_norm": 0.2015114467780782, "kl": 0.061248779296875, "learning_rate": 3.1711393715847473e-07, "loss": 0.0024, "reward": 1.4419643506407738, "reward_std": 0.34402121789753437, "rewards/accuracy_reward": 0.48549109511077404, "rewards/format_reward": 0.9564732499420643, "step": 400 }, { "completion_length": 336.6875123977661, "epoch": 0.6199632814764712, "grad_norm": 0.25162991680939695, "kl": 0.057220458984375, "learning_rate": 3.14853032692886e-07, "loss": 0.0023, "reward": 1.3549107760190964, "reward_std": 0.33911932446062565, "rewards/accuracy_reward": 0.4040178759023547, "rewards/format_reward": 0.9508928805589676, "step": 401 }, { "completion_length": 317.30805015563965, "epoch": 0.6215093245724225, "grad_norm": 0.22979020094646926, "kl": 0.0635528564453125, "learning_rate": 3.1259650698038104e-07, "loss": 0.0025, "reward": 1.3984375521540642, "reward_std": 0.3582300879061222, "rewards/accuracy_reward": 0.4642857303842902, "rewards/format_reward": 0.9341518245637417, "step": 402 }, { "completion_length": 333.7165365219116, "epoch": 0.6230553676683738, "grad_norm": 0.22491506128812805, "kl": 0.058837890625, "learning_rate": 3.1034441338812604e-07, "loss": 0.0024, "reward": 1.4241072162985802, "reward_std": 0.38738270103931427, "rewards/accuracy_reward": 0.47544644586741924, "rewards/format_reward": 0.9486607573926449, "step": 403 }, { "completion_length": 330.37389945983887, "epoch": 0.6246014107643251, "grad_norm": 0.230705618260739, "kl": 0.0647125244140625, "learning_rate": 3.080968051784666e-07, "loss": 0.0026, "reward": 1.4073661342263222, "reward_std": 0.3756348006427288, "rewards/accuracy_reward": 0.47209823317825794, "rewards/format_reward": 0.9352678917348385, "step": 404 }, { "completion_length": 341.8805923461914, "epoch": 0.6261474538602764, "grad_norm": 0.22944940264763958, "kl": 0.0549163818359375, "learning_rate": 3.0585373550766824e-07, "loss": 0.0022, "reward": 1.4520089849829674, "reward_std": 0.35296110063791275, "rewards/accuracy_reward": 0.5100446622818708, "rewards/format_reward": 0.9419643245637417, "step": 405 }, { "completion_length": 332.58595275878906, "epoch": 0.6276934969562277, "grad_norm": 0.2083545465723241, "kl": 0.058990478515625, "learning_rate": 3.036152574246597e-07, "loss": 0.0024, "reward": 1.4017857909202576, "reward_std": 0.38524961471557617, "rewards/accuracy_reward": 0.4497768059372902, "rewards/format_reward": 0.9520089514553547, "step": 406 }, { "completion_length": 319.2790279388428, "epoch": 0.629239540052179, "grad_norm": 0.22560034926255593, "kl": 0.0594635009765625, "learning_rate": 3.0138142386977784e-07, "loss": 0.0024, "reward": 1.4441964849829674, "reward_std": 0.375104202888906, "rewards/accuracy_reward": 0.49665180407464504, "rewards/format_reward": 0.9475446790456772, "step": 407 }, { "completion_length": 330.9899730682373, "epoch": 0.6307855831481303, "grad_norm": 0.20897678861689634, "kl": 0.0592041015625, "learning_rate": 2.9915228767351535e-07, "loss": 0.0024, "reward": 1.4162947088479996, "reward_std": 0.37247754633426666, "rewards/accuracy_reward": 0.485491088591516, "rewards/format_reward": 0.9308036044239998, "step": 408 }, { "completion_length": 315.9620666503906, "epoch": 0.6323316262440816, "grad_norm": 0.2333988021674684, "kl": 0.0640869140625, "learning_rate": 2.9692790155527225e-07, "loss": 0.0026, "reward": 1.420758992433548, "reward_std": 0.37792238406836987, "rewards/accuracy_reward": 0.4799107387661934, "rewards/format_reward": 0.940848246216774, "step": 409 }, { "completion_length": 300.8895206451416, "epoch": 0.6338776693400329, "grad_norm": 0.21700899950924218, "kl": 0.064483642578125, "learning_rate": 2.9470831812210833e-07, "loss": 0.0026, "reward": 1.433035783469677, "reward_std": 0.3629978122189641, "rewards/accuracy_reward": 0.47767859511077404, "rewards/format_reward": 0.9553571790456772, "step": 410 }, { "completion_length": 327.3404207229614, "epoch": 0.6354237124359842, "grad_norm": 0.22127273224295546, "kl": 0.0587005615234375, "learning_rate": 2.9249358986749916e-07, "loss": 0.0023, "reward": 1.3649554178118706, "reward_std": 0.35340968146920204, "rewards/accuracy_reward": 0.4207589505240321, "rewards/format_reward": 0.9441964589059353, "step": 411 }, { "completion_length": 308.04577350616455, "epoch": 0.6369697555319355, "grad_norm": 0.20112983572159332, "kl": 0.0633697509765625, "learning_rate": 2.902837691700945e-07, "loss": 0.0025, "reward": 1.4475447088479996, "reward_std": 0.3340654969215393, "rewards/accuracy_reward": 0.49665181152522564, "rewards/format_reward": 0.9508928991854191, "step": 412 }, { "completion_length": 326.9654150009155, "epoch": 0.6385157986278868, "grad_norm": 0.21921080725465228, "kl": 0.0608367919921875, "learning_rate": 2.8807890829247977e-07, "loss": 0.0024, "reward": 1.3582589849829674, "reward_std": 0.37634955160319805, "rewards/accuracy_reward": 0.42745538149029016, "rewards/format_reward": 0.9308036081492901, "step": 413 }, { "completion_length": 363.4096164703369, "epoch": 0.6400618417238381, "grad_norm": 0.2522930102548938, "kl": 0.06341552734375, "learning_rate": 2.8587905937994043e-07, "loss": 0.0025, "reward": 1.30245541036129, "reward_std": 0.4197652339935303, "rewards/accuracy_reward": 0.39285716600716114, "rewards/format_reward": 0.9095982499420643, "step": 414 }, { "completion_length": 347.08706760406494, "epoch": 0.6416078848197894, "grad_norm": 0.22001181796766897, "kl": 0.05938720703125, "learning_rate": 2.8368427445922696e-07, "loss": 0.0024, "reward": 1.385044701397419, "reward_std": 0.3598229829221964, "rewards/accuracy_reward": 0.4553571632131934, "rewards/format_reward": 0.9296875335276127, "step": 415 }, { "completion_length": 325.12166690826416, "epoch": 0.6431539279157407, "grad_norm": 0.22345038359032926, "kl": 0.0614471435546875, "learning_rate": 2.814946054373266e-07, "loss": 0.0025, "reward": 1.3683036267757416, "reward_std": 0.3691785577684641, "rewards/accuracy_reward": 0.43415181152522564, "rewards/format_reward": 0.9341518208384514, "step": 416 }, { "completion_length": 326.25559520721436, "epoch": 0.644699971011692, "grad_norm": 4.4014109586370465, "kl": 0.061004638671875, "learning_rate": 2.7931010410023516e-07, "loss": 0.0024, "reward": 1.4241072088479996, "reward_std": 0.38210925087332726, "rewards/accuracy_reward": 0.4821428805589676, "rewards/format_reward": 0.9419643208384514, "step": 417 }, { "completion_length": 331.2857275009155, "epoch": 0.6462460141076433, "grad_norm": 0.43402571276694113, "kl": 0.0591278076171875, "learning_rate": 2.771308221117309e-07, "loss": 0.0024, "reward": 1.39620541036129, "reward_std": 0.3790908604860306, "rewards/accuracy_reward": 0.45089288242161274, "rewards/format_reward": 0.9453125298023224, "step": 418 }, { "completion_length": 356.74667167663574, "epoch": 0.6477920572035946, "grad_norm": 0.20286763874941383, "kl": 0.055633544921875, "learning_rate": 2.7495681101215444e-07, "loss": 0.0022, "reward": 1.3560268580913544, "reward_std": 0.37070516869425774, "rewards/accuracy_reward": 0.4140625223517418, "rewards/format_reward": 0.9419643133878708, "step": 419 }, { "completion_length": 377.13729095458984, "epoch": 0.6493381002995459, "grad_norm": 0.21685449566289342, "kl": 0.0525665283203125, "learning_rate": 2.727881222171892e-07, "loss": 0.0021, "reward": 1.3750000819563866, "reward_std": 0.3881053999066353, "rewards/accuracy_reward": 0.4486607350409031, "rewards/format_reward": 0.9263393245637417, "step": 420 }, { "completion_length": 332.5413055419922, "epoch": 0.6508841433954972, "grad_norm": 0.22808520745759506, "kl": 0.0615997314453125, "learning_rate": 2.706248070166449e-07, "loss": 0.0025, "reward": 1.4017857760190964, "reward_std": 0.4060097951442003, "rewards/accuracy_reward": 0.46875002048909664, "rewards/format_reward": 0.9330357536673546, "step": 421 }, { "completion_length": 342.7366256713867, "epoch": 0.6524301864914485, "grad_norm": 0.22179800133448319, "kl": 0.0646514892578125, "learning_rate": 2.684669165732447e-07, "loss": 0.0026, "reward": 1.3917411491274834, "reward_std": 0.37289788760244846, "rewards/accuracy_reward": 0.45758931152522564, "rewards/format_reward": 0.9341518320143223, "step": 422 }, { "completion_length": 336.8917541503906, "epoch": 0.6539762295873998, "grad_norm": 0.2109840952668981, "kl": 0.06158447265625, "learning_rate": 2.6631450192141623e-07, "loss": 0.0025, "reward": 1.4174107685685158, "reward_std": 0.3288159314543009, "rewards/accuracy_reward": 0.4687500186264515, "rewards/format_reward": 0.9486607387661934, "step": 423 }, { "completion_length": 350.8872928619385, "epoch": 0.6555222726833511, "grad_norm": 0.21549180962049622, "kl": 0.0580291748046875, "learning_rate": 2.641676139660836e-07, "loss": 0.0023, "reward": 1.3761161416769028, "reward_std": 0.3518005656078458, "rewards/accuracy_reward": 0.4285714430734515, "rewards/format_reward": 0.9475446753203869, "step": 424 }, { "completion_length": 331.3761291503906, "epoch": 0.6570683157793024, "grad_norm": 0.22339568503840662, "kl": 0.064666748046875, "learning_rate": 2.620263034814632e-07, "loss": 0.0026, "reward": 1.407366119325161, "reward_std": 0.36160945519804955, "rewards/accuracy_reward": 0.46763394959270954, "rewards/format_reward": 0.9397321864962578, "step": 425 }, { "completion_length": 346.04577445983887, "epoch": 0.6586143588752537, "grad_norm": 0.266219468888077, "kl": 0.064697265625, "learning_rate": 2.5989062110986426e-07, "loss": 0.0026, "reward": 1.354910783469677, "reward_std": 0.39491881988942623, "rewards/accuracy_reward": 0.41406252048909664, "rewards/format_reward": 0.9408482499420643, "step": 426 }, { "completion_length": 312.9687662124634, "epoch": 0.660160401971205, "grad_norm": 0.23230438554448357, "kl": 0.0657196044921875, "learning_rate": 2.5776061736048935e-07, "loss": 0.0026, "reward": 1.4520089998841286, "reward_std": 0.3484889082610607, "rewards/accuracy_reward": 0.49553573690354824, "rewards/format_reward": 0.9564732424914837, "step": 427 }, { "completion_length": 326.6730089187622, "epoch": 0.6617064450671563, "grad_norm": 0.21360458070902597, "kl": 0.0623321533203125, "learning_rate": 2.5563634260824176e-07, "loss": 0.0025, "reward": 1.3705357760190964, "reward_std": 0.38584427163004875, "rewards/accuracy_reward": 0.42633930686861277, "rewards/format_reward": 0.9441964700818062, "step": 428 }, { "completion_length": 313.2823829650879, "epoch": 0.6632524881631076, "grad_norm": 0.20172477871541358, "kl": 0.06494140625, "learning_rate": 2.5351784709253224e-07, "loss": 0.0026, "reward": 1.4375000521540642, "reward_std": 0.3257559724152088, "rewards/accuracy_reward": 0.47991072945296764, "rewards/format_reward": 0.9575893096625805, "step": 429 }, { "completion_length": 330.51898765563965, "epoch": 0.6647985312590589, "grad_norm": 0.24217509904322418, "kl": 0.060577392578125, "learning_rate": 2.5140518091609253e-07, "loss": 0.0024, "reward": 1.3828125670552254, "reward_std": 0.3513931129127741, "rewards/accuracy_reward": 0.4363839514553547, "rewards/format_reward": 0.9464286044239998, "step": 430 }, { "completion_length": 349.8917570114136, "epoch": 0.6663445743550102, "grad_norm": 0.21042282023172046, "kl": 0.0574951171875, "learning_rate": 2.492983940437893e-07, "loss": 0.0023, "reward": 1.3772322088479996, "reward_std": 0.3687204774469137, "rewards/accuracy_reward": 0.44308037497103214, "rewards/format_reward": 0.9341518245637417, "step": 431 }, { "completion_length": 327.8906412124634, "epoch": 0.6678906174509615, "grad_norm": 0.22018847956340612, "kl": 0.06109619140625, "learning_rate": 2.471975363014428e-07, "loss": 0.0024, "reward": 1.4252232909202576, "reward_std": 0.36094419844448566, "rewards/accuracy_reward": 0.48325894959270954, "rewards/format_reward": 0.9419643208384514, "step": 432 }, { "completion_length": 350.3169813156128, "epoch": 0.6694366605469128, "grad_norm": 0.2155385744959634, "kl": 0.06280517578125, "learning_rate": 2.4510265737464817e-07, "loss": 0.0025, "reward": 1.3962054252624512, "reward_std": 0.3913747174665332, "rewards/accuracy_reward": 0.4575893133878708, "rewards/format_reward": 0.9386161081492901, "step": 433 }, { "completion_length": 350.8716697692871, "epoch": 0.670982703642864, "grad_norm": 0.2199188345176245, "kl": 0.0588836669921875, "learning_rate": 2.4301380680760123e-07, "loss": 0.0024, "reward": 1.373883992433548, "reward_std": 0.3660994740203023, "rewards/accuracy_reward": 0.43080358766019344, "rewards/format_reward": 0.9430803954601288, "step": 434 }, { "completion_length": 333.19532585144043, "epoch": 0.6725287467388154, "grad_norm": 0.23670600693458826, "kl": 0.065338134765625, "learning_rate": 2.409310340019262e-07, "loss": 0.0026, "reward": 1.415178619325161, "reward_std": 0.3463025698438287, "rewards/accuracy_reward": 0.46986609511077404, "rewards/format_reward": 0.945312537252903, "step": 435 }, { "completion_length": 324.17746925354004, "epoch": 0.6740747898347667, "grad_norm": 0.22987674890033602, "kl": 0.0626983642578125, "learning_rate": 2.388543882155067e-07, "loss": 0.0025, "reward": 1.3939732685685158, "reward_std": 0.3896239101886749, "rewards/accuracy_reward": 0.4631696715950966, "rewards/format_reward": 0.9308036006987095, "step": 436 }, { "completion_length": 340.3136291503906, "epoch": 0.675620832930718, "grad_norm": 0.2034381017906134, "kl": 0.0641632080078125, "learning_rate": 2.3678391856132202e-07, "loss": 0.0026, "reward": 1.4006697162985802, "reward_std": 0.3390182964503765, "rewards/accuracy_reward": 0.46540181152522564, "rewards/format_reward": 0.9352678917348385, "step": 437 }, { "completion_length": 323.45760440826416, "epoch": 0.6771668760266693, "grad_norm": 0.22149719864423398, "kl": 0.06109619140625, "learning_rate": 2.3471967400628513e-07, "loss": 0.0024, "reward": 1.4218750819563866, "reward_std": 0.346695807762444, "rewards/accuracy_reward": 0.4754464514553547, "rewards/format_reward": 0.9464286081492901, "step": 438 }, { "completion_length": 344.4821586608887, "epoch": 0.6787129191226206, "grad_norm": 0.22160547302317368, "kl": 0.0554962158203125, "learning_rate": 2.3266170337008394e-07, "loss": 0.0022, "reward": 1.3962054252624512, "reward_std": 0.3476774301379919, "rewards/accuracy_reward": 0.4453125149011612, "rewards/format_reward": 0.9508928917348385, "step": 439 }, { "completion_length": 336.24778270721436, "epoch": 0.6802589622185718, "grad_norm": 0.21973926316008546, "kl": 0.061492919921875, "learning_rate": 2.3061005532402738e-07, "loss": 0.0025, "reward": 1.4062500521540642, "reward_std": 0.3737519718706608, "rewards/accuracy_reward": 0.47098216600716114, "rewards/format_reward": 0.9352678917348385, "step": 440 }, { "completion_length": 333.14175605773926, "epoch": 0.6818050053145231, "grad_norm": 0.23647833326564105, "kl": 0.0604400634765625, "learning_rate": 2.2856477838989453e-07, "loss": 0.0024, "reward": 1.3660714998841286, "reward_std": 0.3632985595613718, "rewards/accuracy_reward": 0.4229910895228386, "rewards/format_reward": 0.9430803954601288, "step": 441 }, { "completion_length": 338.2689895629883, "epoch": 0.6833510484104744, "grad_norm": 0.20998739149017734, "kl": 0.0627288818359375, "learning_rate": 2.2652592093878665e-07, "loss": 0.0025, "reward": 1.4174107909202576, "reward_std": 0.3535559633746743, "rewards/accuracy_reward": 0.4810268096625805, "rewards/format_reward": 0.9363839626312256, "step": 442 }, { "completion_length": 314.41965770721436, "epoch": 0.6848970915064257, "grad_norm": 0.1967973097038675, "kl": 0.0655670166015625, "learning_rate": 2.2449353118998286e-07, "loss": 0.0026, "reward": 1.4542411267757416, "reward_std": 0.3141915211454034, "rewards/accuracy_reward": 0.5022321697324514, "rewards/format_reward": 0.9520089589059353, "step": 443 }, { "completion_length": 336.2689905166626, "epoch": 0.686443134602377, "grad_norm": 0.2197720251171726, "kl": 0.0631256103515625, "learning_rate": 2.224676572098007e-07, "loss": 0.0025, "reward": 1.459821492433548, "reward_std": 0.36245420202612877, "rewards/accuracy_reward": 0.510044664144516, "rewards/format_reward": 0.9497768171131611, "step": 444 }, { "completion_length": 338.90514755249023, "epoch": 0.6879891776983283, "grad_norm": 0.26069212988132007, "kl": 0.0626983642578125, "learning_rate": 2.2044834691045872e-07, "loss": 0.0025, "reward": 1.4084822088479996, "reward_std": 0.3623805809766054, "rewards/accuracy_reward": 0.4676339477300644, "rewards/format_reward": 0.9408482536673546, "step": 445 }, { "completion_length": 327.74108505249023, "epoch": 0.6895352207942796, "grad_norm": 0.229220153902361, "kl": 0.0648193359375, "learning_rate": 2.1843564804894316e-07, "loss": 0.0026, "reward": 1.3649554029107094, "reward_std": 0.3854550626128912, "rewards/accuracy_reward": 0.42522323317825794, "rewards/format_reward": 0.9397321827709675, "step": 446 }, { "completion_length": 325.99443435668945, "epoch": 0.691081263890231, "grad_norm": 11.937536426401158, "kl": 0.0774688720703125, "learning_rate": 2.1642960822587875e-07, "loss": 0.0031, "reward": 1.469866156578064, "reward_std": 0.3489577118307352, "rewards/accuracy_reward": 0.5212053749710321, "rewards/format_reward": 0.948660746216774, "step": 447 }, { "completion_length": 312.96095085144043, "epoch": 0.6926273069861822, "grad_norm": 0.22610443923744322, "kl": 0.0628204345703125, "learning_rate": 2.1443027488440336e-07, "loss": 0.0025, "reward": 1.4375000596046448, "reward_std": 0.3588519059121609, "rewards/accuracy_reward": 0.4955357387661934, "rewards/format_reward": 0.9419643245637417, "step": 448 }, { "completion_length": 345.2176513671875, "epoch": 0.6941733500821335, "grad_norm": 0.21001830620642964, "kl": 0.0613250732421875, "learning_rate": 2.124376953090456e-07, "loss": 0.0025, "reward": 1.4017857760190964, "reward_std": 0.3555400772020221, "rewards/accuracy_reward": 0.46093752793967724, "rewards/format_reward": 0.9408482424914837, "step": 449 }, { "completion_length": 339.1506814956665, "epoch": 0.6957193931780848, "grad_norm": 5.120664987084913, "kl": 0.0623016357421875, "learning_rate": 2.1045191662460588e-07, "loss": 0.0025, "reward": 1.3616072088479996, "reward_std": 0.37004053220152855, "rewards/accuracy_reward": 0.42633930686861277, "rewards/format_reward": 0.9352678954601288, "step": 450 }, { "completion_length": 356.78126525878906, "epoch": 0.6972654362740361, "grad_norm": 0.2499473894912619, "kl": 0.0590057373046875, "learning_rate": 2.084729857950434e-07, "loss": 0.0024, "reward": 1.4441965073347092, "reward_std": 0.4203994367271662, "rewards/accuracy_reward": 0.5089285988360643, "rewards/format_reward": 0.9352678880095482, "step": 451 }, { "completion_length": 351.8638553619385, "epoch": 0.6988114793699874, "grad_norm": 0.20913922402748847, "kl": 0.06280517578125, "learning_rate": 2.0650094962236381e-07, "loss": 0.0025, "reward": 1.3281250819563866, "reward_std": 0.3526475690305233, "rewards/accuracy_reward": 0.3816964477300644, "rewards/format_reward": 0.9464286044239998, "step": 452 }, { "completion_length": 338.4709949493408, "epoch": 0.7003575224659387, "grad_norm": 0.22249945692046136, "kl": 0.058685302734375, "learning_rate": 2.0453585474551377e-07, "loss": 0.0023, "reward": 1.3883929178118706, "reward_std": 0.37745984084904194, "rewards/accuracy_reward": 0.4441964514553547, "rewards/format_reward": 0.9441964589059353, "step": 453 }, { "completion_length": 355.48774242401123, "epoch": 0.70190356556189, "grad_norm": 0.20529294704998824, "kl": 0.0570220947265625, "learning_rate": 2.0257774763927653e-07, "loss": 0.0023, "reward": 1.3917411416769028, "reward_std": 0.37449742294847965, "rewards/accuracy_reward": 0.45424109417945147, "rewards/format_reward": 0.937500037252903, "step": 454 }, { "completion_length": 328.8259086608887, "epoch": 0.7034496086578413, "grad_norm": 0.22765333179584907, "kl": 0.05810546875, "learning_rate": 2.0062667461317424e-07, "loss": 0.0023, "reward": 1.4732143506407738, "reward_std": 0.3546763164922595, "rewards/accuracy_reward": 0.5200893171131611, "rewards/format_reward": 0.9531250298023224, "step": 455 }, { "completion_length": 337.4810438156128, "epoch": 0.7049956517537926, "grad_norm": 128.03878204922609, "kl": 0.09814453125, "learning_rate": 1.9868268181037184e-07, "loss": 0.0039, "reward": 1.4397321939468384, "reward_std": 0.4142738450318575, "rewards/accuracy_reward": 0.49218752421438694, "rewards/format_reward": 0.9475446864962578, "step": 456 }, { "completion_length": 336.35046005249023, "epoch": 0.7065416948497439, "grad_norm": 0.21410699913290485, "kl": 0.0583648681640625, "learning_rate": 1.9674581520658567e-07, "loss": 0.0023, "reward": 1.3950893357396126, "reward_std": 0.3674540910869837, "rewards/accuracy_reward": 0.4497768059372902, "rewards/format_reward": 0.9453125335276127, "step": 457 }, { "completion_length": 347.31697940826416, "epoch": 0.7080877379456952, "grad_norm": 0.22054005922688236, "kl": 0.0589752197265625, "learning_rate": 1.9481612060899644e-07, "loss": 0.0024, "reward": 1.4062500596046448, "reward_std": 0.3597471322864294, "rewards/accuracy_reward": 0.4665178759023547, "rewards/format_reward": 0.9397321790456772, "step": 458 }, { "completion_length": 317.10269260406494, "epoch": 0.7096337810416465, "grad_norm": 0.2168849117671482, "kl": 0.06243896484375, "learning_rate": 1.9289364365516607e-07, "loss": 0.0025, "reward": 1.3917411267757416, "reward_std": 0.3586621480062604, "rewards/accuracy_reward": 0.44419645331799984, "rewards/format_reward": 0.9475446790456772, "step": 459 }, { "completion_length": 329.6361770629883, "epoch": 0.7111798241375978, "grad_norm": 0.2364500947537291, "kl": 0.0625457763671875, "learning_rate": 1.9097842981195832e-07, "loss": 0.0025, "reward": 1.366071492433548, "reward_std": 0.3754944261163473, "rewards/accuracy_reward": 0.42410716135054827, "rewards/format_reward": 0.941964328289032, "step": 460 }, { "completion_length": 341.3381872177124, "epoch": 0.7127258672335491, "grad_norm": 0.2775707213090051, "kl": 0.06005859375, "learning_rate": 1.8907052437446268e-07, "loss": 0.0024, "reward": 1.430803619325161, "reward_std": 0.3560457229614258, "rewards/accuracy_reward": 0.4866071678698063, "rewards/format_reward": 0.9441964663565159, "step": 461 }, { "completion_length": 333.87724590301514, "epoch": 0.7142719103295004, "grad_norm": 0.23245445220212843, "kl": 0.06536865234375, "learning_rate": 1.8716997246492437e-07, "loss": 0.0026, "reward": 1.3437500670552254, "reward_std": 0.35076540894806385, "rewards/accuracy_reward": 0.4118303805589676, "rewards/format_reward": 0.9319196827709675, "step": 462 }, { "completion_length": 340.68193531036377, "epoch": 0.7158179534254517, "grad_norm": 0.2096735534572854, "kl": 0.05731201171875, "learning_rate": 1.8527681903167642e-07, "loss": 0.0023, "reward": 1.3526786267757416, "reward_std": 0.37010612431913614, "rewards/accuracy_reward": 0.4107143124565482, "rewards/format_reward": 0.9419643208384514, "step": 463 }, { "completion_length": 326.2265787124634, "epoch": 0.717363996521403, "grad_norm": 0.22026742558577686, "kl": 0.0650787353515625, "learning_rate": 1.8339110884807668e-07, "loss": 0.0026, "reward": 1.4140625596046448, "reward_std": 0.3656027801334858, "rewards/accuracy_reward": 0.4776785895228386, "rewards/format_reward": 0.9363839663565159, "step": 464 }, { "completion_length": 327.6763563156128, "epoch": 0.7189100396173543, "grad_norm": 0.21660504070347997, "kl": 0.063568115234375, "learning_rate": 1.8151288651144892e-07, "loss": 0.0025, "reward": 1.3861607760190964, "reward_std": 0.33890372700989246, "rewards/accuracy_reward": 0.4408482387661934, "rewards/format_reward": 0.9453125260770321, "step": 465 }, { "completion_length": 346.80470085144043, "epoch": 0.7204560827133056, "grad_norm": 0.22330429589857118, "kl": 0.05865478515625, "learning_rate": 1.796421964420285e-07, "loss": 0.0023, "reward": 1.4263393580913544, "reward_std": 0.4112767241895199, "rewards/accuracy_reward": 0.4888393059372902, "rewards/format_reward": 0.9375000409781933, "step": 466 }, { "completion_length": 329.34041595458984, "epoch": 0.7220021258092569, "grad_norm": 0.24673388290558007, "kl": 0.05804443359375, "learning_rate": 1.7777908288191173e-07, "loss": 0.0023, "reward": 1.3604911342263222, "reward_std": 0.3884708546102047, "rewards/accuracy_reward": 0.4017857387661934, "rewards/format_reward": 0.9587053954601288, "step": 467 }, { "completion_length": 324.1462211608887, "epoch": 0.7235481689052082, "grad_norm": 0.2112610120156611, "kl": 0.056793212890625, "learning_rate": 1.7592358989400878e-07, "loss": 0.0023, "reward": 1.4062500596046448, "reward_std": 0.3368871994316578, "rewards/accuracy_reward": 0.45200894493609667, "rewards/format_reward": 0.9542410969734192, "step": 468 }, { "completion_length": 328.3393020629883, "epoch": 0.7250942120011595, "grad_norm": 0.2267348444739024, "kl": 0.0577850341796875, "learning_rate": 1.7407576136100278e-07, "loss": 0.0023, "reward": 1.422991119325161, "reward_std": 0.3529864028096199, "rewards/accuracy_reward": 0.4698660895228386, "rewards/format_reward": 0.9531250447034836, "step": 469 }, { "completion_length": 320.5011320114136, "epoch": 0.7266402550971108, "grad_norm": 0.2441208524163371, "kl": 0.065216064453125, "learning_rate": 1.7223564098431065e-07, "loss": 0.0026, "reward": 1.41183041036129, "reward_std": 0.3658321872353554, "rewards/accuracy_reward": 0.4665178805589676, "rewards/format_reward": 0.945312537252903, "step": 470 }, { "completion_length": 320.1506881713867, "epoch": 0.7281862981930621, "grad_norm": 0.2104689257060257, "kl": 0.0606536865234375, "learning_rate": 1.7040327228305117e-07, "loss": 0.0024, "reward": 1.3995536342263222, "reward_std": 0.36297403182834387, "rewards/accuracy_reward": 0.45312501676380634, "rewards/format_reward": 0.9464286044239998, "step": 471 }, { "completion_length": 318.6071586608887, "epoch": 0.7297323412890134, "grad_norm": 0.24393006370276585, "kl": 0.0601959228515625, "learning_rate": 1.68578698593014e-07, "loss": 0.0024, "reward": 1.4508929252624512, "reward_std": 0.3404024913907051, "rewards/accuracy_reward": 0.4832589477300644, "rewards/format_reward": 0.9676339626312256, "step": 472 }, { "completion_length": 319.76452255249023, "epoch": 0.7312783843849647, "grad_norm": 0.21878431573638968, "kl": 0.0607452392578125, "learning_rate": 1.6676196306563613e-07, "loss": 0.0024, "reward": 1.4397322162985802, "reward_std": 0.3595498651266098, "rewards/accuracy_reward": 0.49107145331799984, "rewards/format_reward": 0.9486607424914837, "step": 473 }, { "completion_length": 315.08260345458984, "epoch": 0.732824427480916, "grad_norm": 0.1963700858102897, "kl": 0.0570068359375, "learning_rate": 1.6495310866698093e-07, "loss": 0.0023, "reward": 1.4631697237491608, "reward_std": 0.3246061597019434, "rewards/accuracy_reward": 0.5011160876601934, "rewards/format_reward": 0.9620536118745804, "step": 474 }, { "completion_length": 332.11273670196533, "epoch": 0.7343704705768673, "grad_norm": 0.1850995505748403, "kl": 0.0564727783203125, "learning_rate": 1.631521781767214e-07, "loss": 0.0023, "reward": 1.358258992433548, "reward_std": 0.33148794062435627, "rewards/accuracy_reward": 0.39062501583248377, "rewards/format_reward": 0.9676339663565159, "step": 475 }, { "completion_length": 335.560284614563, "epoch": 0.7359165136728186, "grad_norm": 0.22890372893762778, "kl": 0.056854248046875, "learning_rate": 1.6135921418712955e-07, "loss": 0.0023, "reward": 1.4029018431901932, "reward_std": 0.36751374416053295, "rewards/accuracy_reward": 0.45089287869632244, "rewards/format_reward": 0.9520089626312256, "step": 476 }, { "completion_length": 324.7076025009155, "epoch": 0.7374625567687699, "grad_norm": 0.22099771199260237, "kl": 0.06005859375, "learning_rate": 1.5957425910206785e-07, "loss": 0.0024, "reward": 1.4185268506407738, "reward_std": 0.3708518836647272, "rewards/accuracy_reward": 0.470982164144516, "rewards/format_reward": 0.9475446715950966, "step": 477 }, { "completion_length": 333.8951063156128, "epoch": 0.7390085998647212, "grad_norm": 0.20215487905231866, "kl": 0.0580596923828125, "learning_rate": 1.5779735513598768e-07, "loss": 0.0023, "reward": 1.386160783469677, "reward_std": 0.35158096067607403, "rewards/accuracy_reward": 0.4341518022119999, "rewards/format_reward": 0.9520089663565159, "step": 478 }, { "completion_length": 339.6886339187622, "epoch": 0.7405546429606725, "grad_norm": 0.2538122734737048, "kl": 0.064361572265625, "learning_rate": 1.560285443129296e-07, "loss": 0.0026, "reward": 1.350446492433548, "reward_std": 0.3589059868827462, "rewards/accuracy_reward": 0.41406251676380634, "rewards/format_reward": 0.9363839663565159, "step": 479 }, { "completion_length": 338.41854095458984, "epoch": 0.7421006860566238, "grad_norm": 0.21113221712545047, "kl": 0.05914306640625, "learning_rate": 1.542678684655306e-07, "loss": 0.0024, "reward": 1.401785783469677, "reward_std": 0.3246411820873618, "rewards/accuracy_reward": 0.45870537869632244, "rewards/format_reward": 0.9430803917348385, "step": 480 }, { "completion_length": 311.5826063156128, "epoch": 0.7436467291525751, "grad_norm": 0.23548089310837764, "kl": 0.0639495849609375, "learning_rate": 1.5251536923403425e-07, "loss": 0.0026, "reward": 1.3794643506407738, "reward_std": 0.3964069280773401, "rewards/accuracy_reward": 0.439732164144516, "rewards/format_reward": 0.9397321715950966, "step": 481 }, { "completion_length": 347.28461360931396, "epoch": 0.7451927722485264, "grad_norm": 0.21877979275508133, "kl": 0.0597991943359375, "learning_rate": 1.507710880653058e-07, "loss": 0.0024, "reward": 1.3627232760190964, "reward_std": 0.35287664737552404, "rewards/accuracy_reward": 0.41629466135054827, "rewards/format_reward": 0.9464286006987095, "step": 482 }, { "completion_length": 327.4497938156128, "epoch": 0.7467388153444777, "grad_norm": 0.2150779036955563, "kl": 0.062957763671875, "learning_rate": 1.490350662118519e-07, "loss": 0.0025, "reward": 1.4274554327130318, "reward_std": 0.36159966327250004, "rewards/accuracy_reward": 0.4799107350409031, "rewards/format_reward": 0.9475446790456772, "step": 483 }, { "completion_length": 297.0480070114136, "epoch": 0.748284858440429, "grad_norm": 0.2504120067042256, "kl": 0.0648193359375, "learning_rate": 1.4730734473084566e-07, "loss": 0.0026, "reward": 1.400669701397419, "reward_std": 0.36049083806574345, "rewards/accuracy_reward": 0.43638394866138697, "rewards/format_reward": 0.9642857499420643, "step": 484 }, { "completion_length": 325.8616199493408, "epoch": 0.7498309015363803, "grad_norm": 0.23767929362097848, "kl": 0.0616607666015625, "learning_rate": 1.4558796448315503e-07, "loss": 0.0025, "reward": 1.4430804252624512, "reward_std": 0.3703152500092983, "rewards/accuracy_reward": 0.482142873108387, "rewards/format_reward": 0.9609375409781933, "step": 485 }, { "completion_length": 334.0167531967163, "epoch": 0.7513769446323316, "grad_norm": 0.20425538006052946, "kl": 0.05950927734375, "learning_rate": 1.438769661323761e-07, "loss": 0.0024, "reward": 1.38058041036129, "reward_std": 0.338851572945714, "rewards/accuracy_reward": 0.43526787869632244, "rewards/format_reward": 0.945312537252903, "step": 486 }, { "completion_length": 326.63505458831787, "epoch": 0.7529229877282829, "grad_norm": 0.22649466641096158, "kl": 0.0626983642578125, "learning_rate": 1.4217439014387251e-07, "loss": 0.0025, "reward": 1.3671875596046448, "reward_std": 0.3531641513109207, "rewards/accuracy_reward": 0.4118303833529353, "rewards/format_reward": 0.9553571715950966, "step": 487 }, { "completion_length": 354.87502002716064, "epoch": 0.7544690308242342, "grad_norm": 0.22404648081307912, "kl": 0.05670166015625, "learning_rate": 1.4048027678381757e-07, "loss": 0.0023, "reward": 1.382812574505806, "reward_std": 0.3756707487627864, "rewards/accuracy_reward": 0.44196430407464504, "rewards/format_reward": 0.9408482499420643, "step": 488 }, { "completion_length": 309.0100574493408, "epoch": 0.7560150739201855, "grad_norm": 0.24952499697223784, "kl": 0.065582275390625, "learning_rate": 1.38794666118242e-07, "loss": 0.0026, "reward": 1.3772321939468384, "reward_std": 0.3453883556649089, "rewards/accuracy_reward": 0.42522322852164507, "rewards/format_reward": 0.9520089663565159, "step": 489 }, { "completion_length": 316.46094703674316, "epoch": 0.7575611170161368, "grad_norm": 0.21017506878189693, "kl": 0.0648040771484375, "learning_rate": 1.371175980120864e-07, "loss": 0.0026, "reward": 1.421875074505806, "reward_std": 0.3169353809207678, "rewards/accuracy_reward": 0.45535716973245144, "rewards/format_reward": 0.9665178917348385, "step": 490 }, { "completion_length": 331.6629581451416, "epoch": 0.7591071601120881, "grad_norm": 0.227076130190867, "kl": 0.061676025390625, "learning_rate": 1.3544911212825905e-07, "loss": 0.0025, "reward": 1.3727679252624512, "reward_std": 0.3857467658817768, "rewards/accuracy_reward": 0.4319196604192257, "rewards/format_reward": 0.9408482573926449, "step": 491 }, { "completion_length": 324.3482255935669, "epoch": 0.7606532032080394, "grad_norm": 0.22719791377634366, "kl": 0.0603790283203125, "learning_rate": 1.3378924792669738e-07, "loss": 0.0024, "reward": 1.3950893506407738, "reward_std": 0.3710507657378912, "rewards/accuracy_reward": 0.44866073317825794, "rewards/format_reward": 0.9464286044239998, "step": 492 }, { "completion_length": 338.9397449493408, "epoch": 0.7621992463039907, "grad_norm": 0.2213339747113585, "kl": 0.060333251953125, "learning_rate": 1.321380446634342e-07, "loss": 0.0024, "reward": 1.354910783469677, "reward_std": 0.3667628737166524, "rewards/accuracy_reward": 0.4207589514553547, "rewards/format_reward": 0.9341518208384514, "step": 493 }, { "completion_length": 335.1551513671875, "epoch": 0.763745289399942, "grad_norm": 12.041606128587777, "kl": 0.0609130859375, "learning_rate": 1.304955413896705e-07, "loss": 0.0024, "reward": 1.3950893580913544, "reward_std": 0.35031204484403133, "rewards/accuracy_reward": 0.4475446632131934, "rewards/format_reward": 0.9475446790456772, "step": 494 }, { "completion_length": 338.6417541503906, "epoch": 0.7652913324958933, "grad_norm": 0.2277204907587234, "kl": 0.060882568359375, "learning_rate": 1.2886177695085077e-07, "loss": 0.0024, "reward": 1.4107143580913544, "reward_std": 0.3518304005265236, "rewards/accuracy_reward": 0.46875001676380634, "rewards/format_reward": 0.9419643208384514, "step": 495 }, { "completion_length": 310.0881814956665, "epoch": 0.7668373755918446, "grad_norm": 0.2180097571841522, "kl": 0.0600128173828125, "learning_rate": 1.2723678998574512e-07, "loss": 0.0024, "reward": 1.4151786416769028, "reward_std": 0.36422134563326836, "rewards/accuracy_reward": 0.4542410932481289, "rewards/format_reward": 0.9609375260770321, "step": 496 }, { "completion_length": 310.66072845458984, "epoch": 0.7683834186877959, "grad_norm": 0.219485421165224, "kl": 0.0672760009765625, "learning_rate": 1.2562061892553472e-07, "loss": 0.0027, "reward": 1.3571429327130318, "reward_std": 0.29585556872189045, "rewards/accuracy_reward": 0.39174109511077404, "rewards/format_reward": 0.9654018171131611, "step": 497 }, { "completion_length": 318.6663074493408, "epoch": 0.7699294617837472, "grad_norm": 0.21626779491021786, "kl": 0.0617523193359375, "learning_rate": 1.2401330199290366e-07, "loss": 0.0025, "reward": 1.4419643431901932, "reward_std": 0.3701166473329067, "rewards/accuracy_reward": 0.4899553842842579, "rewards/format_reward": 0.9520089589059353, "step": 498 }, { "completion_length": 320.49108695983887, "epoch": 0.7714755048796985, "grad_norm": 0.2388613356999118, "kl": 0.0633087158203125, "learning_rate": 1.2241487720113457e-07, "loss": 0.0025, "reward": 1.3459822088479996, "reward_std": 0.36706400848925114, "rewards/accuracy_reward": 0.40401787776499987, "rewards/format_reward": 0.9419643133878708, "step": 499 }, { "completion_length": 327.2143020629883, "epoch": 0.7730215479756498, "grad_norm": 0.23159365854562924, "kl": 0.06500244140625, "learning_rate": 1.2082538235320928e-07, "loss": 0.0026, "reward": 1.405133992433548, "reward_std": 0.336949585005641, "rewards/accuracy_reward": 0.46093752328306437, "rewards/format_reward": 0.9441964738070965, "step": 500 }, { "completion_length": 314.1997928619385, "epoch": 0.7745675910716011, "grad_norm": 0.23064979965385457, "kl": 0.0659942626953125, "learning_rate": 1.1924485504091565e-07, "loss": 0.0026, "reward": 1.4285714998841286, "reward_std": 0.32474822271615267, "rewards/accuracy_reward": 0.4687500197906047, "rewards/format_reward": 0.9598214626312256, "step": 501 }, { "completion_length": 319.6406412124634, "epoch": 0.7761136341675524, "grad_norm": 0.235175624183043, "kl": 0.0660400390625, "learning_rate": 1.1767333264395735e-07, "loss": 0.0026, "reward": 1.4296875596046448, "reward_std": 0.3421413656324148, "rewards/accuracy_reward": 0.4810268022119999, "rewards/format_reward": 0.948660746216774, "step": 502 }, { "completion_length": 340.59153175354004, "epoch": 0.7776596772635037, "grad_norm": 0.21472914242814295, "kl": 0.0576019287109375, "learning_rate": 1.1611085232907131e-07, "loss": 0.0023, "reward": 1.3258929252624512, "reward_std": 0.3458497505635023, "rewards/accuracy_reward": 0.3895089477300644, "rewards/format_reward": 0.9363839589059353, "step": 503 }, { "completion_length": 348.3013553619385, "epoch": 0.779205720359455, "grad_norm": 0.20136863882150224, "kl": 0.0609893798828125, "learning_rate": 1.1455745104914699e-07, "loss": 0.0024, "reward": 1.4006697088479996, "reward_std": 0.3795268442481756, "rewards/accuracy_reward": 0.47098216228187084, "rewards/format_reward": 0.929687537252903, "step": 504 }, { "completion_length": 306.09822940826416, "epoch": 0.7807517634554063, "grad_norm": 0.22399316386349855, "kl": 0.0655975341796875, "learning_rate": 1.1301316554235396e-07, "loss": 0.0026, "reward": 1.4464286342263222, "reward_std": 0.32782209385186434, "rewards/accuracy_reward": 0.4955357350409031, "rewards/format_reward": 0.9508928880095482, "step": 505 }, { "completion_length": 321.50559520721436, "epoch": 0.7822978065513576, "grad_norm": 0.2430042891489543, "kl": 0.066162109375, "learning_rate": 1.114780323312724e-07, "loss": 0.0026, "reward": 1.3883929327130318, "reward_std": 0.35520241502672434, "rewards/accuracy_reward": 0.43973215855658054, "rewards/format_reward": 0.9486607387661934, "step": 506 }, { "completion_length": 343.3102836608887, "epoch": 0.7838438496473089, "grad_norm": 0.22935160094105175, "kl": 0.0642547607421875, "learning_rate": 1.0995208772202897e-07, "loss": 0.0026, "reward": 1.4241072088479996, "reward_std": 0.3318333653733134, "rewards/accuracy_reward": 0.4754464579746127, "rewards/format_reward": 0.948660746216774, "step": 507 }, { "completion_length": 353.7790355682373, "epoch": 0.7853898927432602, "grad_norm": 0.4687443855645298, "kl": 0.06085205078125, "learning_rate": 1.0843536780343865e-07, "loss": 0.0024, "reward": 1.4229911416769028, "reward_std": 0.38503093272447586, "rewards/accuracy_reward": 0.46428573317825794, "rewards/format_reward": 0.9587053880095482, "step": 508 }, { "completion_length": 366.6026945114136, "epoch": 0.7869359358392115, "grad_norm": 0.21468900535166055, "kl": 0.057373046875, "learning_rate": 1.069279084461513e-07, "loss": 0.0023, "reward": 1.385044701397419, "reward_std": 0.37473675794899464, "rewards/accuracy_reward": 0.45089288149029016, "rewards/format_reward": 0.9341518208384514, "step": 509 }, { "completion_length": 318.4821586608887, "epoch": 0.7884819789351628, "grad_norm": 0.2166033502286677, "kl": 0.061737060546875, "learning_rate": 1.0542974530180327e-07, "loss": 0.0025, "reward": 1.4497768506407738, "reward_std": 0.35504958778619766, "rewards/accuracy_reward": 0.49107144959270954, "rewards/format_reward": 0.9587053954601288, "step": 510 }, { "completion_length": 307.0201015472412, "epoch": 0.7900280220311141, "grad_norm": 0.2745434618591594, "kl": 0.0645599365234375, "learning_rate": 1.0394091380217352e-07, "loss": 0.0026, "reward": 1.382812574505806, "reward_std": 0.33633330650627613, "rewards/accuracy_reward": 0.42187501676380634, "rewards/format_reward": 0.960937537252903, "step": 511 }, { "completion_length": 333.2444381713867, "epoch": 0.7915740651270654, "grad_norm": 0.2482973770288421, "kl": 0.0616455078125, "learning_rate": 1.0246144915834681e-07, "loss": 0.0025, "reward": 1.4006697088479996, "reward_std": 0.32580017764121294, "rewards/accuracy_reward": 0.44419644959270954, "rewards/format_reward": 0.9564732499420643, "step": 512 }, { "completion_length": 336.71206855773926, "epoch": 0.7931201082230167, "grad_norm": 0.20351122985143397, "kl": 0.0617218017578125, "learning_rate": 1.0099138635988024e-07, "loss": 0.0025, "reward": 1.3761161416769028, "reward_std": 0.34965120255947113, "rewards/accuracy_reward": 0.420758955180645, "rewards/format_reward": 0.9553571753203869, "step": 513 }, { "completion_length": 338.8962211608887, "epoch": 0.794666151318968, "grad_norm": 0.2351626564083753, "kl": 0.060455322265625, "learning_rate": 9.953076017397577e-08, "loss": 0.0024, "reward": 1.3671875521540642, "reward_std": 0.36319392640143633, "rewards/accuracy_reward": 0.41852680407464504, "rewards/format_reward": 0.9486607536673546, "step": 514 }, { "completion_length": 326.27903270721436, "epoch": 0.7962121944149193, "grad_norm": 0.2122719770767931, "kl": 0.063690185546875, "learning_rate": 9.80796051446579e-08, "loss": 0.0025, "reward": 1.4386161416769028, "reward_std": 0.35855181235820055, "rewards/accuracy_reward": 0.498883955180645, "rewards/format_reward": 0.9397321790456772, "step": 515 }, { "completion_length": 321.3058166503906, "epoch": 0.7977582375108706, "grad_norm": 0.24096912642251633, "kl": 0.0627899169921875, "learning_rate": 9.66379555919573e-08, "loss": 0.0025, "reward": 1.4229911267757416, "reward_std": 0.3844280084595084, "rewards/accuracy_reward": 0.474330373108387, "rewards/format_reward": 0.948660746216774, "step": 516 }, { "completion_length": 329.70314025878906, "epoch": 0.7993042806068219, "grad_norm": 0.23211221109543498, "kl": 0.0670623779296875, "learning_rate": 9.520584561109862e-08, "loss": 0.0027, "reward": 1.3995536267757416, "reward_std": 0.3337116427719593, "rewards/accuracy_reward": 0.4386160923168063, "rewards/format_reward": 0.9609375223517418, "step": 517 }, { "completion_length": 339.4363965988159, "epoch": 0.8008503237027732, "grad_norm": 0.20272034082033277, "kl": 0.0621490478515625, "learning_rate": 9.378330907169384e-08, "loss": 0.0025, "reward": 1.39620541036129, "reward_std": 0.3723446447402239, "rewards/accuracy_reward": 0.45758930779993534, "rewards/format_reward": 0.9386161081492901, "step": 518 }, { "completion_length": 335.10157775878906, "epoch": 0.8023963667987245, "grad_norm": 0.2265692122759954, "kl": 0.06353759765625, "learning_rate": 9.237037961694222e-08, "loss": 0.0025, "reward": 1.425223283469677, "reward_std": 0.3709912374615669, "rewards/accuracy_reward": 0.49553573690354824, "rewards/format_reward": 0.9296875409781933, "step": 519 }, { "completion_length": 339.5290298461914, "epoch": 0.8039424098946758, "grad_norm": 0.21425224620123948, "kl": 0.0601654052734375, "learning_rate": 9.096709066283353e-08, "loss": 0.0024, "reward": 1.4241072088479996, "reward_std": 0.35456760972738266, "rewards/accuracy_reward": 0.4709821715950966, "rewards/format_reward": 0.953125037252903, "step": 520 }, { "completion_length": 342.2946557998657, "epoch": 0.8054884529906271, "grad_norm": 0.20294599901549243, "kl": 0.0614013671875, "learning_rate": 8.957347539735871e-08, "loss": 0.0025, "reward": 1.4397322088479996, "reward_std": 0.3474265616387129, "rewards/accuracy_reward": 0.48995538242161274, "rewards/format_reward": 0.9497768208384514, "step": 521 }, { "completion_length": 328.99220180511475, "epoch": 0.8070344960865784, "grad_norm": 0.20636751668034378, "kl": 0.0650177001953125, "learning_rate": 8.818956677972405e-08, "loss": 0.0026, "reward": 1.4654018506407738, "reward_std": 0.35080175939947367, "rewards/accuracy_reward": 0.5178571678698063, "rewards/format_reward": 0.9475446715950966, "step": 522 }, { "completion_length": 349.5669822692871, "epoch": 0.8085805391825297, "grad_norm": 0.20175259308516702, "kl": 0.061004638671875, "learning_rate": 8.681539753957268e-08, "loss": 0.0024, "reward": 1.4241072162985802, "reward_std": 0.3608737774193287, "rewards/accuracy_reward": 0.46763395331799984, "rewards/format_reward": 0.9564732573926449, "step": 523 }, { "completion_length": 369.97546195983887, "epoch": 0.810126582278481, "grad_norm": 0.2124922070114759, "kl": 0.0567779541015625, "learning_rate": 8.545100017620988e-08, "loss": 0.0023, "reward": 1.3415179178118706, "reward_std": 0.3750090319663286, "rewards/accuracy_reward": 0.39955359045416117, "rewards/format_reward": 0.9419643171131611, "step": 524 }, { "completion_length": 314.8236722946167, "epoch": 0.8116726253744323, "grad_norm": 0.2333511400013603, "kl": 0.0666656494140625, "learning_rate": 8.409640695783443e-08, "loss": 0.0027, "reward": 1.4765625819563866, "reward_std": 0.3514085989445448, "rewards/accuracy_reward": 0.5167410969734192, "rewards/format_reward": 0.9598214589059353, "step": 525 }, { "completion_length": 351.0279178619385, "epoch": 0.8132186684703836, "grad_norm": 0.2550531367556895, "kl": 0.0706024169921875, "learning_rate": 8.275164992077555e-08, "loss": 0.0028, "reward": 1.4352679327130318, "reward_std": 0.3786318674683571, "rewards/accuracy_reward": 0.48995538242161274, "rewards/format_reward": 0.945312537252903, "step": 526 }, { "completion_length": 321.7622871398926, "epoch": 0.8147647115663349, "grad_norm": 3.2662979394161766, "kl": 0.06756591796875, "learning_rate": 8.141676086873573e-08, "loss": 0.0027, "reward": 1.4040179178118706, "reward_std": 0.38065068796277046, "rewards/accuracy_reward": 0.4698660923168063, "rewards/format_reward": 0.9341518245637417, "step": 527 }, { "completion_length": 329.61273765563965, "epoch": 0.8163107546622862, "grad_norm": 0.2184433678485529, "kl": 0.0630340576171875, "learning_rate": 8.009177137203793e-08, "loss": 0.0025, "reward": 1.405133992433548, "reward_std": 0.3464660570025444, "rewards/accuracy_reward": 0.44866073317825794, "rewards/format_reward": 0.9564732499420643, "step": 528 }, { "completion_length": 326.04577445983887, "epoch": 0.8178567977582375, "grad_norm": 0.23285288035982227, "kl": 0.0644378662109375, "learning_rate": 7.877671276687897e-08, "loss": 0.0026, "reward": 1.4241072088479996, "reward_std": 0.34589446894824505, "rewards/accuracy_reward": 0.46763395331799984, "rewards/format_reward": 0.9564732499420643, "step": 529 }, { "completion_length": 316.9654150009155, "epoch": 0.8194028408541888, "grad_norm": 0.22132161830017189, "kl": 0.0642547607421875, "learning_rate": 7.747161615458902e-08, "loss": 0.0026, "reward": 1.4564732909202576, "reward_std": 0.3454069085419178, "rewards/accuracy_reward": 0.5111607369035482, "rewards/format_reward": 0.9453125260770321, "step": 530 }, { "completion_length": 351.35381412506104, "epoch": 0.8209488839501401, "grad_norm": 0.20580906193008414, "kl": 0.0598907470703125, "learning_rate": 7.617651240089545e-08, "loss": 0.0024, "reward": 1.38839291036129, "reward_std": 0.3622661381959915, "rewards/accuracy_reward": 0.44084823969751596, "rewards/format_reward": 0.9475446753203869, "step": 531 }, { "completion_length": 317.3727836608887, "epoch": 0.8224949270460914, "grad_norm": 0.217430832043762, "kl": 0.0657958984375, "learning_rate": 7.4891432135193e-08, "loss": 0.0026, "reward": 1.4229911491274834, "reward_std": 0.32607366470620036, "rewards/accuracy_reward": 0.46875002793967724, "rewards/format_reward": 0.9542411044239998, "step": 532 }, { "completion_length": 309.3605012893677, "epoch": 0.8240409701420427, "grad_norm": 0.24784157644184326, "kl": 0.06610107421875, "learning_rate": 7.361640574981936e-08, "loss": 0.0026, "reward": 1.4185268431901932, "reward_std": 0.3623524159193039, "rewards/accuracy_reward": 0.46763394959270954, "rewards/format_reward": 0.9508928805589676, "step": 533 }, { "completion_length": 313.56251335144043, "epoch": 0.825587013237994, "grad_norm": 0.23296153589435797, "kl": 0.0700531005859375, "learning_rate": 7.235146339933674e-08, "loss": 0.0028, "reward": 1.4419643431901932, "reward_std": 0.33336830884218216, "rewards/accuracy_reward": 0.4899553768336773, "rewards/format_reward": 0.9520089589059353, "step": 534 }, { "completion_length": 333.7064895629883, "epoch": 0.8271330563339453, "grad_norm": 0.27771369603546736, "kl": 0.06036376953125, "learning_rate": 7.109663499981833e-08, "loss": 0.0024, "reward": 1.4196429252624512, "reward_std": 0.3457352314144373, "rewards/accuracy_reward": 0.46763394959270954, "rewards/format_reward": 0.9520089626312256, "step": 535 }, { "completion_length": 359.35380840301514, "epoch": 0.8286790994298966, "grad_norm": 0.25991248433763625, "kl": 0.061065673828125, "learning_rate": 6.985195022814066e-08, "loss": 0.0024, "reward": 1.3537946864962578, "reward_std": 0.4023553729057312, "rewards/accuracy_reward": 0.4229910895228386, "rewards/format_reward": 0.9308036118745804, "step": 536 }, { "completion_length": 342.2477836608887, "epoch": 0.8302251425258479, "grad_norm": 0.22177826758593935, "kl": 0.059356689453125, "learning_rate": 6.861743852128233e-08, "loss": 0.0024, "reward": 1.4464286342263222, "reward_std": 0.3805669452995062, "rewards/accuracy_reward": 0.5011160932481289, "rewards/format_reward": 0.945312537252903, "step": 537 }, { "completion_length": 348.2444362640381, "epoch": 0.8317711856217992, "grad_norm": 0.23265891456292442, "kl": 0.0560760498046875, "learning_rate": 6.739312907562733e-08, "loss": 0.0022, "reward": 1.4107143506407738, "reward_std": 0.375796714797616, "rewards/accuracy_reward": 0.4564732350409031, "rewards/format_reward": 0.9542411044239998, "step": 538 }, { "completion_length": 321.8694372177124, "epoch": 0.8333172287177505, "grad_norm": 0.20183992459573682, "kl": 0.0609893798828125, "learning_rate": 6.61790508462745e-08, "loss": 0.0024, "reward": 1.431919701397419, "reward_std": 0.3457864671945572, "rewards/accuracy_reward": 0.47544644586741924, "rewards/format_reward": 0.9564732424914837, "step": 539 }, { "completion_length": 317.23104190826416, "epoch": 0.8348632718137018, "grad_norm": 0.22485406092991245, "kl": 0.0631103515625, "learning_rate": 6.497523254635296e-08, "loss": 0.0025, "reward": 1.470982201397419, "reward_std": 0.35945569910109043, "rewards/accuracy_reward": 0.5189732406288385, "rewards/format_reward": 0.9520089626312256, "step": 540 }, { "completion_length": 324.7779197692871, "epoch": 0.8364093149096531, "grad_norm": 0.22069453422055874, "kl": 0.062225341796875, "learning_rate": 6.37817026463432e-08, "loss": 0.0025, "reward": 1.4430804252624512, "reward_std": 0.39325317926704884, "rewards/accuracy_reward": 0.4933035969734192, "rewards/format_reward": 0.9497768133878708, "step": 541 }, { "completion_length": 348.0256824493408, "epoch": 0.8379553580056044, "grad_norm": 0.22150733714350324, "kl": 0.05645751953125, "learning_rate": 6.25984893734034e-08, "loss": 0.0023, "reward": 1.392857201397419, "reward_std": 0.3684656862169504, "rewards/accuracy_reward": 0.4375000176951289, "rewards/format_reward": 0.9553571790456772, "step": 542 }, { "completion_length": 313.62166690826416, "epoch": 0.8395014011015557, "grad_norm": 0.24127052372547175, "kl": 0.0675048828125, "learning_rate": 6.142562071070178e-08, "loss": 0.0027, "reward": 1.4241072088479996, "reward_std": 0.33673590794205666, "rewards/accuracy_reward": 0.46875001676380634, "rewards/format_reward": 0.9553571715950966, "step": 543 }, { "completion_length": 348.56809425354004, "epoch": 0.841047444197507, "grad_norm": 0.20343476821080062, "kl": 0.0620880126953125, "learning_rate": 6.026312439675551e-08, "loss": 0.0025, "reward": 1.3973214998841286, "reward_std": 0.3458593301475048, "rewards/accuracy_reward": 0.44531252048909664, "rewards/format_reward": 0.9520089663565159, "step": 544 }, { "completion_length": 342.77233695983887, "epoch": 0.8425934872934583, "grad_norm": 0.19723522941397176, "kl": 0.0587615966796875, "learning_rate": 5.911102792477357e-08, "loss": 0.0024, "reward": 1.428571492433548, "reward_std": 0.3602485693991184, "rewards/accuracy_reward": 0.49776788987219334, "rewards/format_reward": 0.9308036044239998, "step": 545 }, { "completion_length": 342.1808204650879, "epoch": 0.8441395303894096, "grad_norm": 0.21256020710610488, "kl": 0.0609893798828125, "learning_rate": 5.796935854200763e-08, "loss": 0.0024, "reward": 1.3816964775323868, "reward_std": 0.3704910837113857, "rewards/accuracy_reward": 0.43861609138548374, "rewards/format_reward": 0.9430803991854191, "step": 546 }, { "completion_length": 339.81474781036377, "epoch": 0.8456855734853609, "grad_norm": 0.21862640764566457, "kl": 0.06243896484375, "learning_rate": 5.683814324910685e-08, "loss": 0.0025, "reward": 1.4207590147852898, "reward_std": 0.3947806600481272, "rewards/accuracy_reward": 0.49330359511077404, "rewards/format_reward": 0.9274553917348385, "step": 547 }, { "completion_length": 350.4910898208618, "epoch": 0.8472316165813122, "grad_norm": 0.21565408793521262, "kl": 0.0621337890625, "learning_rate": 5.571740879947978e-08, "loss": 0.0025, "reward": 1.3705357983708382, "reward_std": 0.34798722341656685, "rewards/accuracy_reward": 0.42968752048909664, "rewards/format_reward": 0.9408482611179352, "step": 548 }, { "completion_length": 350.00671100616455, "epoch": 0.8487776596772635, "grad_norm": 0.2069195343455135, "kl": 0.06427001953125, "learning_rate": 5.460718169866163e-08, "loss": 0.0026, "reward": 1.3984375819563866, "reward_std": 0.38219055347144604, "rewards/accuracy_reward": 0.4620535969734192, "rewards/format_reward": 0.9363839626312256, "step": 549 }, { "completion_length": 358.1741237640381, "epoch": 0.8503237027732148, "grad_norm": 0.20777811728652298, "kl": 0.057403564453125, "learning_rate": 5.350748820368689e-08, "loss": 0.0023, "reward": 1.3459822162985802, "reward_std": 0.397937485948205, "rewards/accuracy_reward": 0.41741073317825794, "rewards/format_reward": 0.9285714626312256, "step": 550 }, { "completion_length": 367.12501525878906, "epoch": 0.8518697458691661, "grad_norm": 0.2570750167624703, "kl": 0.057464599609375, "learning_rate": 5.2418354322468884e-08, "loss": 0.0023, "reward": 1.3783482685685158, "reward_std": 0.40023924224078655, "rewards/accuracy_reward": 0.4419643059372902, "rewards/format_reward": 0.9363839626312256, "step": 551 }, { "completion_length": 336.97657585144043, "epoch": 0.8534157889651174, "grad_norm": 0.21294263526364726, "kl": 0.060394287109375, "learning_rate": 5.133980581318459e-08, "loss": 0.0024, "reward": 1.456473283469677, "reward_std": 0.34913449827581644, "rewards/accuracy_reward": 0.5022321660071611, "rewards/format_reward": 0.9542411044239998, "step": 552 }, { "completion_length": 341.5547046661377, "epoch": 0.8549618320610687, "grad_norm": 0.19375658074606578, "kl": 0.0623626708984375, "learning_rate": 5.027186818366541e-08, "loss": 0.0025, "reward": 1.433035783469677, "reward_std": 0.31409448850899935, "rewards/accuracy_reward": 0.4821428805589676, "rewards/format_reward": 0.9508928917348385, "step": 553 }, { "completion_length": 353.23438835144043, "epoch": 0.85650787515702, "grad_norm": 0.21786183717047078, "kl": 0.062591552734375, "learning_rate": 4.921456669079366e-08, "loss": 0.0025, "reward": 1.4062500670552254, "reward_std": 0.4082389697432518, "rewards/accuracy_reward": 0.4799107378348708, "rewards/format_reward": 0.926339328289032, "step": 554 }, { "completion_length": 338.0178737640381, "epoch": 0.8580539182529713, "grad_norm": 0.22662779492884194, "kl": 0.0643463134765625, "learning_rate": 4.816792633990569e-08, "loss": 0.0026, "reward": 1.4062500670552254, "reward_std": 0.3891046270728111, "rewards/accuracy_reward": 0.48214288242161274, "rewards/format_reward": 0.9241071790456772, "step": 555 }, { "completion_length": 338.95202255249023, "epoch": 0.8595999613489226, "grad_norm": 0.20766823380067395, "kl": 0.0644073486328125, "learning_rate": 4.713197188420026e-08, "loss": 0.0026, "reward": 1.3292411267757416, "reward_std": 0.3269513137638569, "rewards/accuracy_reward": 0.380580373108387, "rewards/format_reward": 0.948660746216774, "step": 556 }, { "completion_length": 358.7712211608887, "epoch": 0.8611460044448739, "grad_norm": 0.22567147404628415, "kl": 0.0595245361328125, "learning_rate": 4.610672782415276e-08, "loss": 0.0024, "reward": 1.3582589849829674, "reward_std": 0.41297781467437744, "rewards/accuracy_reward": 0.4296875214204192, "rewards/format_reward": 0.9285714626312256, "step": 557 }, { "completion_length": 349.96318435668945, "epoch": 0.8626920475408252, "grad_norm": 0.22336583319960307, "kl": 0.0650634765625, "learning_rate": 4.509221840693655e-08, "loss": 0.0026, "reward": 1.3493304252624512, "reward_std": 0.36440214049071074, "rewards/accuracy_reward": 0.4040178805589676, "rewards/format_reward": 0.9453125298023224, "step": 558 }, { "completion_length": 331.6305961608887, "epoch": 0.8642380906367765, "grad_norm": 0.23329341217610414, "kl": 0.0635833740234375, "learning_rate": 4.4088467625849005e-08, "loss": 0.0025, "reward": 1.4475447088479996, "reward_std": 0.37882011383771896, "rewards/accuracy_reward": 0.48660716600716114, "rewards/format_reward": 0.9609375298023224, "step": 559 }, { "completion_length": 332.90961265563965, "epoch": 0.8657841337327278, "grad_norm": 0.20805595020741174, "kl": 0.06195068359375, "learning_rate": 4.30954992197442e-08, "loss": 0.0025, "reward": 1.4408482760190964, "reward_std": 0.334908370859921, "rewards/accuracy_reward": 0.4933035969734192, "rewards/format_reward": 0.9475446753203869, "step": 560 }, { "completion_length": 368.63729095458984, "epoch": 0.8673301768286791, "grad_norm": 0.20110778317863137, "kl": 0.0584869384765625, "learning_rate": 4.2113336672471245e-08, "loss": 0.0023, "reward": 1.392857201397419, "reward_std": 0.3654589708894491, "rewards/accuracy_reward": 0.4575893087312579, "rewards/format_reward": 0.9352678917348385, "step": 561 }, { "completion_length": 349.5468921661377, "epoch": 0.8688762199246304, "grad_norm": 0.216877846201819, "kl": 0.0605010986328125, "learning_rate": 4.1142003212319366e-08, "loss": 0.0024, "reward": 1.358258992433548, "reward_std": 0.31400518119335175, "rewards/accuracy_reward": 0.4051339514553547, "rewards/format_reward": 0.9531250335276127, "step": 562 }, { "completion_length": 324.7165298461914, "epoch": 0.8704222630205817, "grad_norm": 0.2555757201604232, "kl": 0.0695648193359375, "learning_rate": 4.018152181146822e-08, "loss": 0.0028, "reward": 1.4263393357396126, "reward_std": 0.34038644656538963, "rewards/accuracy_reward": 0.4754464542493224, "rewards/format_reward": 0.9508928917348385, "step": 563 }, { "completion_length": 340.49778175354004, "epoch": 0.871968306116533, "grad_norm": 0.22268007229592587, "kl": 0.065185546875, "learning_rate": 3.923191518544433e-08, "loss": 0.0026, "reward": 1.335937574505806, "reward_std": 0.41089982632547617, "rewards/accuracy_reward": 0.40401787869632244, "rewards/format_reward": 0.9319196827709675, "step": 564 }, { "completion_length": 358.5435457229614, "epoch": 0.8735143492124843, "grad_norm": 0.252904003109766, "kl": 0.067779541015625, "learning_rate": 3.829320579258466e-08, "loss": 0.0027, "reward": 1.3816964849829674, "reward_std": 0.405029296875, "rewards/accuracy_reward": 0.455357164144516, "rewards/format_reward": 0.9263393171131611, "step": 565 }, { "completion_length": 319.22211170196533, "epoch": 0.8750603923084356, "grad_norm": 0.23032004095138106, "kl": 0.0680999755859375, "learning_rate": 3.736541583350472e-08, "loss": 0.0027, "reward": 1.45089291036129, "reward_std": 0.3747481666505337, "rewards/accuracy_reward": 0.49665181897580624, "rewards/format_reward": 0.9542411044239998, "step": 566 }, { "completion_length": 330.94644260406494, "epoch": 0.8766064354043869, "grad_norm": 0.19768941417748906, "kl": 0.066558837890625, "learning_rate": 3.6448567250574046e-08, "loss": 0.0027, "reward": 1.408482201397419, "reward_std": 0.35070943739265203, "rewards/accuracy_reward": 0.4631696632131934, "rewards/format_reward": 0.9453125298023224, "step": 567 }, { "completion_length": 359.8058166503906, "epoch": 0.8781524785003382, "grad_norm": 35715.98452655271, "kl": 1.8270721435546875, "learning_rate": 3.554268172739661e-08, "loss": 0.0731, "reward": 1.4274554252624512, "reward_std": 0.3697248511016369, "rewards/accuracy_reward": 0.4832589514553547, "rewards/format_reward": 0.9441964589059353, "step": 568 }, { "completion_length": 308.8069305419922, "epoch": 0.8796985215962895, "grad_norm": 0.22092284711454832, "kl": 0.0644989013671875, "learning_rate": 3.4647780688298824e-08, "loss": 0.0026, "reward": 1.467633992433548, "reward_std": 0.3485427414998412, "rewards/accuracy_reward": 0.5122768059372902, "rewards/format_reward": 0.9553571753203869, "step": 569 }, { "completion_length": 360.47434997558594, "epoch": 0.8812445646922408, "grad_norm": 0.20753243662755205, "kl": 0.0585479736328125, "learning_rate": 3.376388529782215e-08, "loss": 0.0023, "reward": 1.3917411342263222, "reward_std": 0.3818623647093773, "rewards/accuracy_reward": 0.45758930779993534, "rewards/format_reward": 0.9341518171131611, "step": 570 }, { "completion_length": 365.82367515563965, "epoch": 0.8827906077881921, "grad_norm": 0.22527150064779833, "kl": 0.0602874755859375, "learning_rate": 3.289101646022296e-08, "loss": 0.0024, "reward": 1.3627232760190964, "reward_std": 0.3611252438277006, "rewards/accuracy_reward": 0.42633930314332247, "rewards/format_reward": 0.9363839663565159, "step": 571 }, { "completion_length": 342.1997957229614, "epoch": 0.8843366508841434, "grad_norm": 0.2176214008353423, "kl": 0.06353759765625, "learning_rate": 3.2029194818977976e-08, "loss": 0.0025, "reward": 1.427455447614193, "reward_std": 0.38210483454167843, "rewards/accuracy_reward": 0.4910714440047741, "rewards/format_reward": 0.9363839626312256, "step": 572 }, { "completion_length": 347.3448820114136, "epoch": 0.8858826939800947, "grad_norm": 0.21055665498306722, "kl": 0.0576019287109375, "learning_rate": 3.117844075629616e-08, "loss": 0.0023, "reward": 1.37276791036129, "reward_std": 0.36157552525401115, "rewards/accuracy_reward": 0.42968751676380634, "rewards/format_reward": 0.9430803880095482, "step": 573 }, { "completion_length": 347.6540336608887, "epoch": 0.887428737076046, "grad_norm": 0.21339399517151403, "kl": 0.061767578125, "learning_rate": 3.0338774392636655e-08, "loss": 0.0025, "reward": 1.4397322088479996, "reward_std": 0.3693514745682478, "rewards/accuracy_reward": 0.49888395331799984, "rewards/format_reward": 0.940848246216774, "step": 574 }, { "completion_length": 369.58037757873535, "epoch": 0.8889747801719973, "grad_norm": 0.21424091223068842, "kl": 0.0558013916015625, "learning_rate": 2.9510215586232734e-08, "loss": 0.0022, "reward": 1.3426339775323868, "reward_std": 0.35286693274974823, "rewards/accuracy_reward": 0.40401787497103214, "rewards/format_reward": 0.9386161044239998, "step": 575 }, { "completion_length": 371.5078296661377, "epoch": 0.8905208232679486, "grad_norm": 0.21094625506571998, "kl": 0.0608367919921875, "learning_rate": 2.8692783932622256e-08, "loss": 0.0024, "reward": 1.38058041036129, "reward_std": 0.3424485847353935, "rewards/accuracy_reward": 0.44754466228187084, "rewards/format_reward": 0.9330357536673546, "step": 576 }, { "completion_length": 344.82925605773926, "epoch": 0.8920668663638999, "grad_norm": 0.2116483437587734, "kl": 0.062896728515625, "learning_rate": 2.7886498764184584e-08, "loss": 0.0025, "reward": 1.3470982909202576, "reward_std": 0.35091717913746834, "rewards/accuracy_reward": 0.4095982350409031, "rewards/format_reward": 0.9375000335276127, "step": 577 }, { "completion_length": 331.61273860931396, "epoch": 0.8936129094598512, "grad_norm": 0.21533380877291697, "kl": 0.068817138671875, "learning_rate": 2.7091379149682682e-08, "loss": 0.0028, "reward": 1.4207589775323868, "reward_std": 0.3413937850855291, "rewards/accuracy_reward": 0.483258955180645, "rewards/format_reward": 0.937500037252903, "step": 578 }, { "completion_length": 341.9553756713867, "epoch": 0.8951589525558025, "grad_norm": 0.20904722162309516, "kl": 0.062286376953125, "learning_rate": 2.630744389381284e-08, "loss": 0.0025, "reward": 1.3694197088479996, "reward_std": 0.35494863614439964, "rewards/accuracy_reward": 0.4319196604192257, "rewards/format_reward": 0.937500037252903, "step": 579 }, { "completion_length": 343.7243461608887, "epoch": 0.8967049956517538, "grad_norm": 0.21198794235734603, "kl": 0.0630645751953125, "learning_rate": 2.5534711536759403e-08, "loss": 0.0025, "reward": 1.4363839849829674, "reward_std": 0.3539962200447917, "rewards/accuracy_reward": 0.48437502421438694, "rewards/format_reward": 0.9520089626312256, "step": 580 }, { "completion_length": 362.74220275878906, "epoch": 0.8982510387477051, "grad_norm": 0.22076671428175926, "kl": 0.0599517822265625, "learning_rate": 2.4773200353756796e-08, "loss": 0.0024, "reward": 1.3895090073347092, "reward_std": 0.35299296490848064, "rewards/accuracy_reward": 0.43973216228187084, "rewards/format_reward": 0.9497768208384514, "step": 581 }, { "completion_length": 308.6384057998657, "epoch": 0.8997970818436564, "grad_norm": 0.35033945752718565, "kl": 0.0738067626953125, "learning_rate": 2.402292835465647e-08, "loss": 0.003, "reward": 1.4877232760190964, "reward_std": 0.3395460210740566, "rewards/accuracy_reward": 0.5312500260770321, "rewards/format_reward": 0.9564732499420643, "step": 582 }, { "completion_length": 338.3538074493408, "epoch": 0.9013431249396077, "grad_norm": 0.2091367323844448, "kl": 0.0652618408203125, "learning_rate": 2.3283913283502044e-08, "loss": 0.0026, "reward": 1.421875074505806, "reward_std": 0.3386496752500534, "rewards/accuracy_reward": 0.47321430407464504, "rewards/format_reward": 0.9486607536673546, "step": 583 }, { "completion_length": 341.56921768188477, "epoch": 0.902889168035559, "grad_norm": 0.24623144440943884, "kl": 0.0618133544921875, "learning_rate": 2.2556172618108994e-08, "loss": 0.0025, "reward": 1.3950893431901932, "reward_std": 0.4164838157594204, "rewards/accuracy_reward": 0.4598214440047741, "rewards/format_reward": 0.9352678991854191, "step": 584 }, { "completion_length": 353.75559425354004, "epoch": 0.9044352111315103, "grad_norm": 0.20915988296468868, "kl": 0.0631866455078125, "learning_rate": 2.1839723569651247e-08, "loss": 0.0025, "reward": 1.4062500819563866, "reward_std": 0.36449841782450676, "rewards/accuracy_reward": 0.4654018096625805, "rewards/format_reward": 0.9408482536673546, "step": 585 }, { "completion_length": 343.208722114563, "epoch": 0.9059812542274616, "grad_norm": 0.2369880766469856, "kl": 0.0661163330078125, "learning_rate": 2.1134583082254575e-08, "loss": 0.0026, "reward": 1.35714291036129, "reward_std": 0.37463077064603567, "rewards/accuracy_reward": 0.42075894214212894, "rewards/format_reward": 0.9363839589059353, "step": 586 }, { "completion_length": 335.3906354904175, "epoch": 0.9075272973234129, "grad_norm": 0.2031899770195318, "kl": 0.0590362548828125, "learning_rate": 2.0440767832595574e-08, "loss": 0.0024, "reward": 1.4084822088479996, "reward_std": 0.3360661007463932, "rewards/accuracy_reward": 0.46093752421438694, "rewards/format_reward": 0.9475446715950966, "step": 587 }, { "completion_length": 350.8660888671875, "epoch": 0.9090733404193642, "grad_norm": 0.28578646769082233, "kl": 0.0655670166015625, "learning_rate": 1.975829422950709e-08, "loss": 0.0026, "reward": 1.433035783469677, "reward_std": 0.36060876213014126, "rewards/accuracy_reward": 0.47767859138548374, "rewards/format_reward": 0.9553571753203869, "step": 588 }, { "completion_length": 325.1428699493408, "epoch": 0.9106193835153155, "grad_norm": 0.21159533234150835, "kl": 0.065338134765625, "learning_rate": 1.9087178413590476e-08, "loss": 0.0026, "reward": 1.3950893431901932, "reward_std": 0.35729281697422266, "rewards/accuracy_reward": 0.45758930779993534, "rewards/format_reward": 0.9375000298023224, "step": 589 }, { "completion_length": 332.2053689956665, "epoch": 0.9121654266112668, "grad_norm": 0.21420602315466866, "kl": 0.06451416015625, "learning_rate": 1.842743625683385e-08, "loss": 0.0026, "reward": 1.4162947162985802, "reward_std": 0.3347237091511488, "rewards/accuracy_reward": 0.46986609138548374, "rewards/format_reward": 0.9464286081492901, "step": 590 }, { "completion_length": 331.5290355682373, "epoch": 0.9137114697072181, "grad_norm": 0.20711446824169, "kl": 0.065399169921875, "learning_rate": 1.7779083362236548e-08, "loss": 0.0026, "reward": 1.4185268506407738, "reward_std": 0.336538883857429, "rewards/accuracy_reward": 0.4765625260770321, "rewards/format_reward": 0.9419643171131611, "step": 591 }, { "completion_length": 329.62947845458984, "epoch": 0.9152575128031694, "grad_norm": 0.2745628133330339, "kl": 0.073089599609375, "learning_rate": 1.7142135063440034e-08, "loss": 0.0029, "reward": 1.407366119325161, "reward_std": 0.3269703984260559, "rewards/accuracy_reward": 0.45535716973245144, "rewards/format_reward": 0.9520089663565159, "step": 592 }, { "completion_length": 333.0982303619385, "epoch": 0.9168035558991207, "grad_norm": 0.82938727049777, "kl": 0.063720703125, "learning_rate": 1.6516606424365642e-08, "loss": 0.0025, "reward": 1.5022322088479996, "reward_std": 0.344320897012949, "rewards/accuracy_reward": 0.5602678805589676, "rewards/format_reward": 0.9419643171131611, "step": 593 }, { "completion_length": 363.80581855773926, "epoch": 0.918349598995072, "grad_norm": 0.4368618914391871, "kl": 0.0596771240234375, "learning_rate": 1.590251223885786e-08, "loss": 0.0024, "reward": 1.3995536267757416, "reward_std": 0.39757465198636055, "rewards/accuracy_reward": 0.4620535932481289, "rewards/format_reward": 0.9375000409781933, "step": 594 }, { "completion_length": 357.9073791503906, "epoch": 0.9198956420910233, "grad_norm": 0.19672021146595647, "kl": 0.0584259033203125, "learning_rate": 1.5299867030334813e-08, "loss": 0.0023, "reward": 1.4263393580913544, "reward_std": 0.36204397678375244, "rewards/accuracy_reward": 0.4854911006987095, "rewards/format_reward": 0.9408482499420643, "step": 595 }, { "completion_length": 346.0413131713867, "epoch": 0.9214416851869746, "grad_norm": 0.20096018164675397, "kl": 0.0628509521484375, "learning_rate": 1.4708685051444513e-08, "loss": 0.0025, "reward": 1.4029018357396126, "reward_std": 0.338257041759789, "rewards/accuracy_reward": 0.4464285895228386, "rewards/format_reward": 0.9564732499420643, "step": 596 }, { "completion_length": 349.7031412124634, "epoch": 0.9229877282829259, "grad_norm": 0.22309936139894196, "kl": 0.0613250732421875, "learning_rate": 1.4128980283727942e-08, "loss": 0.0025, "reward": 1.3448661342263222, "reward_std": 0.36543772742152214, "rewards/accuracy_reward": 0.4140625223517418, "rewards/format_reward": 0.9308036081492901, "step": 597 }, { "completion_length": 329.290189743042, "epoch": 0.9245337713788772, "grad_norm": 0.21476563438216226, "kl": 0.065765380859375, "learning_rate": 1.3560766437288428e-08, "loss": 0.0026, "reward": 1.3671875670552254, "reward_std": 0.3433725470677018, "rewards/accuracy_reward": 0.422991088591516, "rewards/format_reward": 0.9441964626312256, "step": 598 }, { "completion_length": 328.30135440826416, "epoch": 0.9260798144748285, "grad_norm": 0.23143385102731523, "kl": 0.0660858154296875, "learning_rate": 1.3004056950467135e-08, "loss": 0.0026, "reward": 1.4118304252624512, "reward_std": 0.34602647833526134, "rewards/accuracy_reward": 0.46205359511077404, "rewards/format_reward": 0.9497768208384514, "step": 599 }, { "completion_length": 346.9966697692871, "epoch": 0.9276258575707798, "grad_norm": 0.22747235362999266, "kl": 0.0654296875, "learning_rate": 1.2458864989525697e-08, "loss": 0.0026, "reward": 1.3772322162985802, "reward_std": 0.3586107883602381, "rewards/accuracy_reward": 0.4397321632131934, "rewards/format_reward": 0.937500037252903, "step": 600 }, { "completion_length": 366.8805980682373, "epoch": 0.929171900666731, "grad_norm": 0.20256218369917847, "kl": 0.0597686767578125, "learning_rate": 1.1925203448334198e-08, "loss": 0.0024, "reward": 1.342633992433548, "reward_std": 0.3983724657446146, "rewards/accuracy_reward": 0.40736608300358057, "rewards/format_reward": 0.9352678880095482, "step": 601 }, { "completion_length": 350.8426513671875, "epoch": 0.9307179437626824, "grad_norm": 0.23424501999264413, "kl": 0.0612030029296875, "learning_rate": 1.140308494806702e-08, "loss": 0.0024, "reward": 1.4229911118745804, "reward_std": 0.3819661373272538, "rewards/accuracy_reward": 0.4877232424914837, "rewards/format_reward": 0.9352678917348385, "step": 602 }, { "completion_length": 345.0122947692871, "epoch": 0.9322639868586337, "grad_norm": 0.20629362275892382, "kl": 0.0615692138671875, "learning_rate": 1.0892521836903479e-08, "loss": 0.0025, "reward": 1.3281250596046448, "reward_std": 0.35612331703305244, "rewards/accuracy_reward": 0.3917410932481289, "rewards/format_reward": 0.9363839663565159, "step": 603 }, { "completion_length": 357.1741237640381, "epoch": 0.933810029954585, "grad_norm": 0.20836378194877903, "kl": 0.0603179931640625, "learning_rate": 1.0393526189736601e-08, "loss": 0.0024, "reward": 1.417410783469677, "reward_std": 0.3759672315791249, "rewards/accuracy_reward": 0.491071455180645, "rewards/format_reward": 0.9263393171131611, "step": 604 }, { "completion_length": 311.6506824493408, "epoch": 0.9353560730505363, "grad_norm": 0.2397302008317822, "kl": 0.072601318359375, "learning_rate": 9.906109807887031e-09, "loss": 0.0029, "reward": 1.4888393580913544, "reward_std": 0.3653652798384428, "rewards/accuracy_reward": 0.5424107424914837, "rewards/format_reward": 0.9464286006987095, "step": 605 }, { "completion_length": 342.2243461608887, "epoch": 0.9369021161464876, "grad_norm": 0.32993534197298524, "kl": 0.06243896484375, "learning_rate": 9.430284218824025e-09, "loss": 0.0025, "reward": 1.453125074505806, "reward_std": 0.38584640622138977, "rewards/accuracy_reward": 0.5055803786963224, "rewards/format_reward": 0.9475446715950966, "step": 606 }, { "completion_length": 352.2422046661377, "epoch": 0.9384481592424389, "grad_norm": 0.20404979445376703, "kl": 0.0626373291015625, "learning_rate": 8.96606067589295e-09, "loss": 0.0025, "reward": 1.381696492433548, "reward_std": 0.3863948667421937, "rewards/accuracy_reward": 0.4631696678698063, "rewards/format_reward": 0.918526828289032, "step": 607 }, { "completion_length": 338.5826072692871, "epoch": 0.9399942023383902, "grad_norm": 0.2324472897216118, "kl": 0.0653228759765625, "learning_rate": 8.513450158049106e-09, "loss": 0.0026, "reward": 1.4174107760190964, "reward_std": 0.38653728924691677, "rewards/accuracy_reward": 0.4787946594879031, "rewards/format_reward": 0.9386161044239998, "step": 608 }, { "completion_length": 366.70760917663574, "epoch": 0.9415402454343414, "grad_norm": 0.20954771598757532, "kl": 0.0615081787109375, "learning_rate": 8.072463369597992e-09, "loss": 0.0025, "reward": 1.354910783469677, "reward_std": 0.37842289917171, "rewards/accuracy_reward": 0.41071430686861277, "rewards/format_reward": 0.9441964626312256, "step": 609 }, { "completion_length": 352.7232303619385, "epoch": 0.9430862885302927, "grad_norm": 0.21104426656954492, "kl": 0.06365966796875, "learning_rate": 7.643110739942171e-09, "loss": 0.0025, "reward": 1.4308036416769028, "reward_std": 0.40414370223879814, "rewards/accuracy_reward": 0.5000000242143869, "rewards/format_reward": 0.9308036006987095, "step": 610 }, { "completion_length": 348.8225555419922, "epoch": 0.944632331626244, "grad_norm": 0.21781643663398986, "kl": 0.0615997314453125, "learning_rate": 7.2254024233346925e-09, "loss": 0.0025, "reward": 1.4575893506407738, "reward_std": 0.364959217607975, "rewards/accuracy_reward": 0.5156250279396772, "rewards/format_reward": 0.9419643208384514, "step": 611 }, { "completion_length": 338.57814025878906, "epoch": 0.9461783747221953, "grad_norm": 0.22697044311451553, "kl": 0.0617828369140625, "learning_rate": 6.819348298638839e-09, "loss": 0.0025, "reward": 1.3995536491274834, "reward_std": 0.3851529248058796, "rewards/accuracy_reward": 0.45870538242161274, "rewards/format_reward": 0.940848246216774, "step": 612 }, { "completion_length": 336.8158588409424, "epoch": 0.9477244178181466, "grad_norm": 0.22791414963689152, "kl": 0.0634918212890625, "learning_rate": 6.424957969094536e-09, "loss": 0.0025, "reward": 1.3415179327130318, "reward_std": 0.3581673167645931, "rewards/accuracy_reward": 0.39508930407464504, "rewards/format_reward": 0.9464286006987095, "step": 613 }, { "completion_length": 357.99109172821045, "epoch": 0.949270460914098, "grad_norm": 0.21420685581756602, "kl": 0.0608367919921875, "learning_rate": 6.0422407620911995e-09, "loss": 0.0024, "reward": 1.3750000596046448, "reward_std": 0.3797103241086006, "rewards/accuracy_reward": 0.427455373108387, "rewards/format_reward": 0.9475446790456772, "step": 614 }, { "completion_length": 327.56028175354004, "epoch": 0.9508165040100492, "grad_norm": 0.2402517006522075, "kl": 0.0629119873046875, "learning_rate": 5.671205728947304e-09, "loss": 0.0025, "reward": 1.3705357760190964, "reward_std": 0.3567105159163475, "rewards/accuracy_reward": 0.4330357313156128, "rewards/format_reward": 0.9375000409781933, "step": 615 }, { "completion_length": 344.6908645629883, "epoch": 0.9523625471060005, "grad_norm": 0.22473125537054897, "kl": 0.064239501953125, "learning_rate": 5.311861644696047e-09, "loss": 0.0026, "reward": 1.3593750670552254, "reward_std": 0.39192034862935543, "rewards/accuracy_reward": 0.424107157276012, "rewards/format_reward": 0.9352678917348385, "step": 616 }, { "completion_length": 333.55023860931396, "epoch": 0.9539085902019518, "grad_norm": 0.20269900122406823, "kl": 0.060028076171875, "learning_rate": 4.96421700787808e-09, "loss": 0.0024, "reward": 1.4084822162985802, "reward_std": 0.35080490820109844, "rewards/accuracy_reward": 0.46316967345774174, "rewards/format_reward": 0.9453125447034836, "step": 617 }, { "completion_length": 335.1942090988159, "epoch": 0.9554546332979031, "grad_norm": 0.2059591707186609, "kl": 0.0630950927734375, "learning_rate": 4.628280040340271e-09, "loss": 0.0025, "reward": 1.3805804178118706, "reward_std": 0.3422225024551153, "rewards/accuracy_reward": 0.4308035895228386, "rewards/format_reward": 0.9497768171131611, "step": 618 }, { "completion_length": 341.8259048461914, "epoch": 0.9570006763938544, "grad_norm": 0.20296125217374275, "kl": 0.0625, "learning_rate": 4.304058687041534e-09, "loss": 0.0025, "reward": 1.4386161267757416, "reward_std": 0.3455498209223151, "rewards/accuracy_reward": 0.5000000204890966, "rewards/format_reward": 0.9386161081492901, "step": 619 }, { "completion_length": 346.10269260406494, "epoch": 0.9585467194898057, "grad_norm": 0.20189952395455782, "kl": 0.061798095703125, "learning_rate": 3.991560615864586e-09, "loss": 0.0025, "reward": 1.3705357909202576, "reward_std": 0.35251387767493725, "rewards/accuracy_reward": 0.4174107415601611, "rewards/format_reward": 0.953125037252903, "step": 620 }, { "completion_length": 340.8013563156128, "epoch": 0.960092762585757, "grad_norm": 0.24222143354785627, "kl": 0.067230224609375, "learning_rate": 3.690793217434984e-09, "loss": 0.0027, "reward": 1.3671875670552254, "reward_std": 0.39040255546569824, "rewards/accuracy_reward": 0.43861609138548374, "rewards/format_reward": 0.9285714626312256, "step": 621 }, { "completion_length": 343.2165355682373, "epoch": 0.9616388056817083, "grad_norm": 0.24400263239316256, "kl": 0.0666046142578125, "learning_rate": 3.4017636049460396e-09, "loss": 0.0027, "reward": 1.4196429178118706, "reward_std": 0.36995262652635574, "rewards/accuracy_reward": 0.4787946529686451, "rewards/format_reward": 0.9408482536673546, "step": 622 }, { "completion_length": 351.10939025878906, "epoch": 0.9631848487776596, "grad_norm": 0.30592165770533664, "kl": 0.061553955078125, "learning_rate": 3.124478613990733e-09, "loss": 0.0025, "reward": 1.3537946939468384, "reward_std": 0.3501143045723438, "rewards/accuracy_reward": 0.41852680314332247, "rewards/format_reward": 0.9352678842842579, "step": 623 }, { "completion_length": 353.5881862640381, "epoch": 0.9647308918736109, "grad_norm": 0.21269524261691494, "kl": 0.064178466796875, "learning_rate": 2.8589448023998986e-09, "loss": 0.0026, "reward": 1.3895090073347092, "reward_std": 0.37983597069978714, "rewards/accuracy_reward": 0.45200894586741924, "rewards/format_reward": 0.9375000335276127, "step": 624 }, { "completion_length": 356.0558214187622, "epoch": 0.9662769349695622, "grad_norm": 0.21333134243935065, "kl": 0.0682373046875, "learning_rate": 2.6051684500875136e-09, "loss": 0.0027, "reward": 1.3136161416769028, "reward_std": 0.36315673403441906, "rewards/accuracy_reward": 0.3939732350409031, "rewards/format_reward": 0.9196428954601288, "step": 625 }, { "completion_length": 335.49443531036377, "epoch": 0.9678229780655135, "grad_norm": 0.19711031558716233, "kl": 0.06414794921875, "learning_rate": 2.3631555589015418e-09, "loss": 0.0026, "reward": 1.4107143431901932, "reward_std": 0.30800105817615986, "rewards/accuracy_reward": 0.4654018050059676, "rewards/format_reward": 0.9453125335276127, "step": 626 }, { "completion_length": 333.42635440826416, "epoch": 0.9693690211614648, "grad_norm": 0.2276460057344401, "kl": 0.0671234130859375, "learning_rate": 2.132911852482766e-09, "loss": 0.0027, "reward": 1.4241072162985802, "reward_std": 0.36022947542369366, "rewards/accuracy_reward": 0.4765625246800482, "rewards/format_reward": 0.9475446753203869, "step": 627 }, { "completion_length": 325.57590675354004, "epoch": 0.9709150642574161, "grad_norm": 0.2282989926891336, "kl": 0.0683441162109375, "learning_rate": 1.914442776128622e-09, "loss": 0.0027, "reward": 1.416294701397419, "reward_std": 0.3696814440190792, "rewards/accuracy_reward": 0.47321430779993534, "rewards/format_reward": 0.9430803917348385, "step": 628 }, { "completion_length": 344.60827255249023, "epoch": 0.9724611073533674, "grad_norm": 0.21040372791309447, "kl": 0.0662994384765625, "learning_rate": 1.7077534966650765e-09, "loss": 0.0027, "reward": 1.4084821939468384, "reward_std": 0.36698276922106743, "rewards/accuracy_reward": 0.4832589514553547, "rewards/format_reward": 0.9252232499420643, "step": 629 }, { "completion_length": 345.2064895629883, "epoch": 0.9740071504493187, "grad_norm": 0.21021484993190365, "kl": 0.0596923828125, "learning_rate": 1.5128489023240597e-09, "loss": 0.0024, "reward": 1.421875074505806, "reward_std": 0.35508696362376213, "rewards/accuracy_reward": 0.4765625223517418, "rewards/format_reward": 0.9453125298023224, "step": 630 }, { "completion_length": 330.6529178619385, "epoch": 0.97555319354527, "grad_norm": 0.21587239233182223, "kl": 0.060333251953125, "learning_rate": 1.3297336026280027e-09, "loss": 0.0024, "reward": 1.452008992433548, "reward_std": 0.35464514046907425, "rewards/accuracy_reward": 0.5011160913854837, "rewards/format_reward": 0.9508928917348385, "step": 631 }, { "completion_length": 357.94421005249023, "epoch": 0.9770992366412213, "grad_norm": 0.21540458445190594, "kl": 0.059051513671875, "learning_rate": 1.1584119282806448e-09, "loss": 0.0024, "reward": 1.4207589998841286, "reward_std": 0.3640688471496105, "rewards/accuracy_reward": 0.498883955180645, "rewards/format_reward": 0.921875037252903, "step": 632 }, { "completion_length": 351.29800510406494, "epoch": 0.9786452797371726, "grad_norm": 0.24472255988599098, "kl": 0.0637664794921875, "learning_rate": 9.988879310649511e-10, "loss": 0.0026, "reward": 1.3470982760190964, "reward_std": 0.3735042801126838, "rewards/accuracy_reward": 0.4151785895228386, "rewards/format_reward": 0.9319196827709675, "step": 633 }, { "completion_length": 316.32032680511475, "epoch": 0.9801913228331239, "grad_norm": 0.2205238674106168, "kl": 0.0670318603515625, "learning_rate": 8.511653837470212e-10, "loss": 0.0027, "reward": 1.4430804327130318, "reward_std": 0.3502283478155732, "rewards/accuracy_reward": 0.4910714467987418, "rewards/format_reward": 0.9520089626312256, "step": 634 }, { "completion_length": 360.31251525878906, "epoch": 0.9817373659290752, "grad_norm": 0.19643441124495642, "kl": 0.062225341796875, "learning_rate": 7.152477799867718e-10, "loss": 0.0025, "reward": 1.3984375670552254, "reward_std": 0.38549028243869543, "rewards/accuracy_reward": 0.46540180779993534, "rewards/format_reward": 0.9330357499420643, "step": 635 }, { "completion_length": 315.0268030166626, "epoch": 0.9832834090250265, "grad_norm": 0.23892811167869393, "kl": 0.0659027099609375, "learning_rate": 5.911383342556142e-10, "loss": 0.0026, "reward": 1.4252232760190964, "reward_std": 0.37230258993804455, "rewards/accuracy_reward": 0.4821428805589676, "rewards/format_reward": 0.9430803917348385, "step": 636 }, { "completion_length": 331.6305932998657, "epoch": 0.9848294521209778, "grad_norm": 0.22981886790865214, "kl": 0.06005859375, "learning_rate": 4.788399817602929e-10, "loss": 0.0024, "reward": 1.4084822311997414, "reward_std": 0.35956521704792976, "rewards/accuracy_reward": 0.4665178759023547, "rewards/format_reward": 0.9419643245637417, "step": 637 }, { "completion_length": 351.710955619812, "epoch": 0.9863754952169291, "grad_norm": 0.2007850967457557, "kl": 0.05914306640625, "learning_rate": 3.7835537837338507e-10, "loss": 0.0024, "reward": 1.391741145402193, "reward_std": 0.3527519702911377, "rewards/accuracy_reward": 0.45870537869632244, "rewards/format_reward": 0.9330357424914837, "step": 638 }, { "completion_length": 340.9654178619385, "epoch": 0.9879215383128804, "grad_norm": 0.21263217072010437, "kl": 0.06475830078125, "learning_rate": 2.8968690057051826e-10, "loss": 0.0026, "reward": 1.4218750819563866, "reward_std": 0.3453766810707748, "rewards/accuracy_reward": 0.4665178768336773, "rewards/format_reward": 0.9553571715950966, "step": 639 }, { "completion_length": 342.92301177978516, "epoch": 0.9894675814088317, "grad_norm": 0.2530458546621635, "kl": 0.0626373291015625, "learning_rate": 2.1283664537435908e-10, "loss": 0.0025, "reward": 1.428571492433548, "reward_std": 0.3492702078074217, "rewards/accuracy_reward": 0.47433038242161274, "rewards/format_reward": 0.9542411081492901, "step": 640 }, { "completion_length": 328.6038112640381, "epoch": 0.991013624504783, "grad_norm": 0.2321465878639846, "kl": 0.0667877197265625, "learning_rate": 1.4780643030476436e-10, "loss": 0.0027, "reward": 1.4676339998841286, "reward_std": 0.3603665977716446, "rewards/accuracy_reward": 0.5200893077999353, "rewards/format_reward": 0.9475446827709675, "step": 641 }, { "completion_length": 348.6651954650879, "epoch": 0.9925596676007343, "grad_norm": 0.23190234574827098, "kl": 0.0641326904296875, "learning_rate": 9.459779333587104e-11, "loss": 0.0026, "reward": 1.4229911267757416, "reward_std": 0.3860031384974718, "rewards/accuracy_reward": 0.47656252048909664, "rewards/format_reward": 0.9464286006987095, "step": 642 }, { "completion_length": 341.2634057998657, "epoch": 0.9941057106966856, "grad_norm": 1.7848059193206942, "kl": 0.0653533935546875, "learning_rate": 5.321199285979183e-11, "loss": 0.0026, "reward": 1.4229911342263222, "reward_std": 0.41164090298116207, "rewards/accuracy_reward": 0.48437502700835466, "rewards/format_reward": 0.9386160969734192, "step": 643 }, { "completion_length": 340.9575996398926, "epoch": 0.9956517537926369, "grad_norm": 1.5107352216644576, "kl": 0.063507080078125, "learning_rate": 2.3650007656805803e-11, "loss": 0.0025, "reward": 1.4207589775323868, "reward_std": 0.37431447952985764, "rewards/accuracy_reward": 0.4698660932481289, "rewards/format_reward": 0.9508928991854191, "step": 644 }, { "completion_length": 332.3571548461914, "epoch": 0.9971977968885882, "grad_norm": 0.2201351321001044, "kl": 0.064208984375, "learning_rate": 5.912536872321183e-12, "loss": 0.0026, "reward": 1.4241072088479996, "reward_std": 0.34813660569489, "rewards/accuracy_reward": 0.4687500223517418, "rewards/format_reward": 0.9553571790456772, "step": 645 }, { "completion_length": 330.2020206451416, "epoch": 0.9987438399845395, "grad_norm": 0.22458612877831166, "kl": 0.0673675537109375, "learning_rate": 0.0, "loss": 0.0027, "reward": 1.4229911342263222, "reward_std": 0.36787193082273006, "rewards/accuracy_reward": 0.467633955180645, "rewards/format_reward": 0.9553571753203869, "step": 646 }, { "epoch": 0.9987438399845395, "step": 646, "total_flos": 0.0, "train_loss": 0.004591864831248247, "train_runtime": 81346.3601, "train_samples_per_second": 0.891, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 646, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }