{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 5588.541748046875, "epoch": 0.002, "grad_norm": 0.3202221845849466, "learning_rate": 2e-08, "loss": -0.0, "num_tokens": 140485.0, "reward": 0.29351773485541344, "reward_std": 0.35374689288437366, "rewards/length_bonus_reward": 0.2924194373190403, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.16447007283568382, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 4415.291687011719, "epoch": 0.004, "grad_norm": 0.1215800055844518, "learning_rate": 4e-08, "loss": 0.0, "num_tokens": 251564.0, "reward": 0.3829944673925638, "reward_std": 0.10011515580117702, "rewards/length_bonus_reward": 0.14322916604578495, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.020469460636377335, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 4342.750030517578, "epoch": 0.006, "grad_norm": 0.6980023131667008, "learning_rate": 6e-08, "loss": -0.0, "num_tokens": 360272.0, "reward": 0.14834509417414665, "reward_std": 0.3750849589705467, "rewards/length_bonus_reward": 0.1474812850356102, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.16493905149400234, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 5527.875061035156, "epoch": 0.008, "grad_norm": 0.48136982897853514, "learning_rate": 8e-08, "loss": 0.0, "num_tokens": 498317.0, "reward": 0.251199284568429, "reward_std": 0.2565653258934617, "rewards/length_bonus_reward": 0.2747090682387352, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.21368623990565538, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 5147.416748046875, "epoch": 0.01, "grad_norm": 0.399624500907218, "learning_rate": 1e-07, "loss": 0.0, "num_tokens": 627387.0, "reward": 0.6584749203175306, "reward_std": 0.36860312707722187, "rewards/length_bonus_reward": 0.2136230506002903, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.1397036537528038, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1556.3334045410156, "epoch": 0.012, "grad_norm": 0.2296143263786007, "learning_rate": 1.2e-07, "loss": 0.0, "num_tokens": 672407.0, "reward": 0.3092709190095775, "reward_std": 0.13769486732780933, "rewards/length_bonus_reward": 0.03960164324962534, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.03933846578001976, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 4853.625091552734, "epoch": 0.014, "grad_norm": 0.12709001934655087, "learning_rate": 1.4e-07, "loss": 0.0, "num_tokens": 793142.0, "reward": 0.06208954192698002, "reward_std": 0.11933710426092148, "rewards/length_bonus_reward": 0.1937561109662056, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.26333311572670937, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 5062.416717529297, "epoch": 0.016, "grad_norm": 0.16061387989144157, "learning_rate": 1.6e-07, "loss": 0.0, "num_tokens": 919662.0, "reward": 0.06843259744346142, "reward_std": 0.1628182977437973, "rewards/length_bonus_reward": 0.2064615860581398, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.27605799213051796, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 5451.2501220703125, "epoch": 0.018, "grad_norm": 0.2575445138865162, "learning_rate": 1.8e-07, "loss": -0.0, "num_tokens": 1055874.0, "reward": 0.20521101425401866, "reward_std": 0.2829407528042793, "rewards/length_bonus_reward": 0.2875773049890995, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.2480659424327314, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 4822.083435058594, "epoch": 0.02, "grad_norm": 0.5197036919266123, "learning_rate": 2e-07, "loss": 0.0, "num_tokens": 1176230.0, "reward": 0.27374397963285446, "reward_std": 0.35803473368287086, "rewards/length_bonus_reward": 0.23099771700799465, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.16450752597302198, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 4321.125061035156, "epoch": 0.022, "grad_norm": 0.4652133370957579, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "num_tokens": 1283909.0, "reward": 0.9354557003825903, "reward_std": 0.2532319212332368, "rewards/length_bonus_reward": 0.19931030087172985, "rewards/simple_accuracy_reward": 0.5833333358168602, "rewards/simple_cosine_scaled_reward": 0.3056240640580654, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 3674.0834350585938, "epoch": 0.024, "grad_norm": 0.8759079918250702, "learning_rate": 2.4e-07, "loss": 0.0, "num_tokens": 1377223.0, "reward": 0.3548208177089691, "reward_std": 0.4739591218531132, "rewards/length_bonus_reward": 0.1669514998793602, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.0409280676394701, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 4784.75, "epoch": 0.026, "grad_norm": 0.6211180947861027, "learning_rate": 2.6e-07, "loss": 0.0, "num_tokens": 1496497.0, "reward": 0.7100962027907372, "reward_std": 0.573124598711729, "rewards/length_bonus_reward": 0.25386555737350136, "rewards/simple_accuracy_reward": 0.4166666679084301, "rewards/simple_cosine_scaled_reward": 0.07912788540124893, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 4814.958435058594, "epoch": 0.028, "grad_norm": 0.0830215618712904, "learning_rate": 2.8e-07, "loss": 0.0, "num_tokens": 1616628.0, "reward": 0.9544223546981812, "reward_std": 0.09986176248639822, "rewards/length_bonus_reward": 0.3004658967256546, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.3079128582030535, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 4385.708435058594, "epoch": 0.03, "grad_norm": 0.616838228138335, "learning_rate": 3e-07, "loss": -0.0, "num_tokens": 1728521.0, "reward": 0.5137810558080673, "reward_std": 0.35365669429302216, "rewards/length_bonus_reward": 0.2525634765625, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": 0.022435168735682964, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 6187.333435058594, "epoch": 0.032, "grad_norm": 0.2766844058885979, "learning_rate": 3.2e-07, "loss": 0.0, "num_tokens": 1881769.0, "reward": 0.6040653139352798, "reward_std": 0.27830212097615004, "rewards/length_bonus_reward": 0.2563985176384449, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.1120001757517457, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 4964.791748046875, "epoch": 0.034, "grad_norm": 0.19583653236313311, "learning_rate": 3.4000000000000003e-07, "loss": -0.0, "num_tokens": 2005610.0, "reward": 0.4976671002805233, "reward_std": 0.1736572328954935, "rewards/length_bonus_reward": 0.2369079664349556, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.021518301218748093, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 5303.416687011719, "epoch": 0.036, "grad_norm": 0.13189804140462394, "learning_rate": 3.6e-07, "loss": 0.0, "num_tokens": 2138268.0, "reward": 0.4741740021854639, "reward_std": 0.13297679275274277, "rewards/length_bonus_reward": 0.254486083984375, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.06062415987253189, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 6920.58349609375, "epoch": 0.038, "grad_norm": 0.22756051410460051, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "num_tokens": 2309102.0, "reward": 0.2352888509631157, "reward_std": 0.23895898647606373, "rewards/length_bonus_reward": 0.2874857559800148, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.1877271831035614, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 4124.708343505859, "epoch": 0.04, "grad_norm": 1.298799442808969, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 2419297.0, "reward": 0.3471077159047127, "reward_std": 0.418960427865386, "rewards/length_bonus_reward": 0.125, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": 0.027548756450414658, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 3457.4583740234375, "epoch": 0.042, "grad_norm": 0.9067583820630793, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "num_tokens": 2506290.0, "reward": 0.49354083091020584, "reward_std": 0.3912106528878212, "rewards/length_bonus_reward": 0.18235270178411156, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.03904287703335285, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 3006.6250610351562, "epoch": 0.044, "grad_norm": 1.0312658678140594, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "num_tokens": 2582823.0, "reward": 0.46499780751764774, "reward_std": 0.3772339904680848, "rewards/length_bonus_reward": 0.0899658203125, "rewards/simple_accuracy_reward": 0.3333333283662796, "rewards/simple_cosine_scaled_reward": 0.08339725807309151, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 3990.1250915527344, "epoch": 0.046, "grad_norm": 0.46573299246089017, "learning_rate": 4.6e-07, "loss": 0.0, "num_tokens": 2683050.0, "reward": 0.8001542575657368, "reward_std": 0.26861944794654846, "rewards/length_bonus_reward": 0.2375691682100296, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.20850340276956558, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 4061.791717529297, "epoch": 0.048, "grad_norm": 0.2507473582471048, "learning_rate": 4.8e-07, "loss": 0.0, "num_tokens": 2785657.0, "reward": 0.7496756352484226, "reward_std": 0.2644285839051008, "rewards/length_bonus_reward": 0.1930440254509449, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.19659653678536415, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 3038.6250915527344, "epoch": 0.05, "grad_norm": 0.6036115651152258, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 2863756.0, "reward": 0.8596733454614878, "reward_std": 0.46443068236112595, "rewards/length_bonus_reward": 0.15618896763771772, "rewards/simple_accuracy_reward": 0.5416666716337204, "rewards/simple_cosine_scaled_reward": 0.32363542169332504, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 4288.541687011719, "epoch": 0.052, "grad_norm": 0.16622449187642346, "learning_rate": 5.2e-07, "loss": 0.0, "num_tokens": 2973281.0, "reward": 0.3951551169157028, "reward_std": 0.09651378728449345, "rewards/length_bonus_reward": 0.13368733460083604, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.022935520857572556, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 4068.95849609375, "epoch": 0.054, "grad_norm": 0.43597163672925277, "learning_rate": 5.4e-07, "loss": 0.0, "num_tokens": 3076750.0, "reward": 0.3990556914359331, "reward_std": 0.3276849128305912, "rewards/length_bonus_reward": 0.27511596493422985, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.08545392379164696, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 5020.541839599609, "epoch": 0.056, "grad_norm": 0.3343528875251539, "learning_rate": 5.6e-07, "loss": 0.0, "num_tokens": 3202145.0, "reward": 0.49840663420036435, "reward_std": 0.3623024635016918, "rewards/length_bonus_reward": 0.24680582713335752, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": 0.0032016076147556305, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 3338.916778564453, "epoch": 0.058, "grad_norm": 0.89704517936123, "learning_rate": 5.8e-07, "loss": 0.0, "num_tokens": 3286767.0, "reward": 0.6234970477526076, "reward_std": 0.5619443953037262, "rewards/length_bonus_reward": 0.23243204690515995, "rewards/simple_accuracy_reward": 0.3333333320915699, "rewards/simple_cosine_scaled_reward": 0.11546330712735653, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 2918.7083740234375, "epoch": 0.06, "grad_norm": 0.43956250692629134, "learning_rate": 6e-07, "loss": 0.0, "num_tokens": 3362078.0, "reward": 0.029184922575950623, "reward_std": 0.22401700355112553, "rewards/length_bonus_reward": 0.08308919263072312, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.19114186358638108, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 4803.375061035156, "epoch": 0.062, "grad_norm": 0.3379252883720788, "learning_rate": 6.2e-07, "loss": 0.0, "num_tokens": 3482021.0, "reward": 0.762636348605156, "reward_std": 0.4080433174967766, "rewards/length_bonus_reward": 0.2821451826021075, "rewards/simple_accuracy_reward": 0.4166666567325592, "rewards/simple_cosine_scaled_reward": 0.1276489421725273, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 4216.541748046875, "epoch": 0.064, "grad_norm": 0.1702836291033523, "learning_rate": 6.4e-07, "loss": 0.0, "num_tokens": 3587712.0, "reward": 0.5852973945438862, "reward_std": 0.17066550068557262, "rewards/length_bonus_reward": 0.3117472380399704, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.0471002496778965, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 4591.875061035156, "epoch": 0.066, "grad_norm": 28.070010949720043, "learning_rate": 6.6e-07, "loss": -0.0, "num_tokens": 3702459.0, "reward": 0.459864541888237, "reward_std": 0.37870778888463974, "rewards/length_bonus_reward": 0.1885376013815403, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": 0.0426538847386837, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 4144.250152587891, "epoch": 0.068, "grad_norm": 0.823483328856458, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "num_tokens": 3806493.0, "reward": 0.7570685744285583, "reward_std": 0.6563308984041214, "rewards/length_bonus_reward": 0.17583210859447718, "rewards/simple_accuracy_reward": 0.4583333320915699, "rewards/simple_cosine_scaled_reward": 0.2458062544465065, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 4849.7083740234375, "epoch": 0.07, "grad_norm": 0.31513190683971126, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 3928844.0, "reward": 0.39534795843064785, "reward_std": 0.3156084343791008, "rewards/length_bonus_reward": 0.2510274285450578, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.044692326337099075, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 5534.4583740234375, "epoch": 0.072, "grad_norm": 0.36916017994308703, "learning_rate": 7.2e-07, "loss": 0.0, "num_tokens": 4066315.0, "reward": 0.6561870109289885, "reward_std": 0.3356907404959202, "rewards/length_bonus_reward": 0.2917277030646801, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.06225190684199333, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 3415.791717529297, "epoch": 0.074, "grad_norm": 0.3655802170633896, "learning_rate": 7.4e-07, "loss": -0.0, "num_tokens": 4154156.0, "reward": 0.48174334689974785, "reward_std": 0.30270471796393394, "rewards/length_bonus_reward": 0.16166178369894624, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.05682981386780739, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 4459.250152587891, "epoch": 0.076, "grad_norm": 0.4664455036382389, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "num_tokens": 4265618.0, "reward": 0.46177057549357414, "reward_std": 0.27027018927037716, "rewards/length_bonus_reward": 0.2496236190199852, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": 0.0076272450387477875, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 5765.416687011719, "epoch": 0.078, "grad_norm": 0.0888491960100995, "learning_rate": 7.799999999999999e-07, "loss": -0.0, "num_tokens": 4408500.0, "reward": 0.4560709074139595, "reward_std": 0.09919600374996662, "rewards/length_bonus_reward": 0.18803914386080578, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.036063555628061295, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 4043.4584350585938, "epoch": 0.08, "grad_norm": 0.5459761720045538, "learning_rate": 8e-07, "loss": 0.0, "num_tokens": 4513931.0, "reward": 0.5336933135986328, "reward_std": 0.3034077547490597, "rewards/length_bonus_reward": 0.22027588391210884, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.04350149631500244, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 3942.083465576172, "epoch": 0.082, "grad_norm": 0.8479510354978386, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "num_tokens": 4613209.0, "reward": 0.4463608041405678, "reward_std": 0.6248226426541805, "rewards/length_bonus_reward": 0.2534993514418602, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.030943790450692177, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 4622.0001220703125, "epoch": 0.084, "grad_norm": 0.7600499145185806, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "num_tokens": 4729351.0, "reward": 0.5906441658735275, "reward_std": 0.60792625695467, "rewards/length_bonus_reward": 0.2360738143324852, "rewards/simple_accuracy_reward": 0.3333333395421505, "rewards/simple_cosine_scaled_reward": 0.04247397556900978, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 4175.666839599609, "epoch": 0.086, "grad_norm": 0.3830292853401058, "learning_rate": 8.599999999999999e-07, "loss": -0.0, "num_tokens": 4834931.0, "reward": 1.1089705973863602, "reward_std": 0.49167972430586815, "rewards/length_bonus_reward": 0.2049662321805954, "rewards/simple_accuracy_reward": 0.7083333246409893, "rewards/simple_cosine_scaled_reward": 0.3913421407341957, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 5694.125061035156, "epoch": 0.088, "grad_norm": 0.4968505489592319, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "num_tokens": 4977434.0, "reward": 0.1648378185927868, "reward_std": 0.26036201044917107, "rewards/length_bonus_reward": 0.24143473617732525, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.23652716353535652, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 4532.166687011719, "epoch": 0.09, "grad_norm": 0.10612213052882749, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 5091312.0, "reward": 0.7931099850684404, "reward_std": 0.11585959792137146, "rewards/length_bonus_reward": 0.15336100198328495, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.279497891664505, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 5800.666809082031, "epoch": 0.092, "grad_norm": 0.2862497598381837, "learning_rate": 9.2e-07, "loss": 0.0, "num_tokens": 5235118.0, "reward": 0.4633982819505036, "reward_std": 0.28751326724886894, "rewards/length_bonus_reward": 0.2633361779153347, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.0165424682199955, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 6068.625, "epoch": 0.094, "grad_norm": 0.1475955814575245, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "num_tokens": 5389339.0, "reward": 0.05619503604248166, "reward_std": 0.12779691070318222, "rewards/length_bonus_reward": 0.2320047989487648, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.35161952674388885, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 2686.2083740234375, "epoch": 0.096, "grad_norm": 0.8166204112605538, "learning_rate": 9.6e-07, "loss": 0.0, "num_tokens": 5458374.0, "reward": 1.344577819108963, "reward_std": 0.44614117592573166, "rewards/length_bonus_reward": 0.18047078570816666, "rewards/simple_accuracy_reward": 0.8333333283662796, "rewards/simple_cosine_scaled_reward": 0.6615474000573158, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 5500.4583740234375, "epoch": 0.098, "grad_norm": 0.5098727276517729, "learning_rate": 9.8e-07, "loss": 0.0, "num_tokens": 5597255.0, "reward": 0.400611212477088, "reward_std": 0.48012464866042137, "rewards/length_bonus_reward": 0.2901916541159153, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.0291608739644289, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 4835.0833740234375, "epoch": 0.1, "grad_norm": 0.5928504683486656, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 5718331.0, "reward": 0.559112461283803, "reward_std": 0.4646828733384609, "rewards/length_bonus_reward": 0.25836181640625, "rewards/simple_accuracy_reward": 0.2916666716337204, "rewards/simple_cosine_scaled_reward": 0.01816796138882637, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 3631.1250610351562, "epoch": 0.102, "grad_norm": 0.4945446208256811, "learning_rate": 9.999878153526972e-07, "loss": 0.0, "num_tokens": 5811466.0, "reward": 0.39391457103192806, "reward_std": 0.3494595643132925, "rewards/length_bonus_reward": 0.15079752542078495, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": -0.0137659702450037, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 4793.0833740234375, "epoch": 0.104, "grad_norm": 0.2687669411200645, "learning_rate": 9.99951262004652e-07, "loss": -0.0, "num_tokens": 5931216.0, "reward": 0.4039255529642105, "reward_std": 0.28776494413614273, "rewards/length_bonus_reward": 0.2746175155043602, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.07471724227070808, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 4306.625183105469, "epoch": 0.106, "grad_norm": 0.3310927096293874, "learning_rate": 9.998903417374226e-07, "loss": 0.0, "num_tokens": 6039423.0, "reward": 0.7708465196192265, "reward_std": 0.3330190684646368, "rewards/length_bonus_reward": 0.2520955400541425, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.20416854321956635, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 6618.916748046875, "epoch": 0.108, "grad_norm": 0.08422734615404971, "learning_rate": 9.99805057520177e-07, "loss": 0.0, "num_tokens": 6203173.0, "reward": 0.14276332035660744, "reward_std": 0.11548568494617939, "rewards/length_bonus_reward": 0.2903849333524704, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2952432259917259, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 5356.875091552734, "epoch": 0.11, "grad_norm": 0.856459495103128, "learning_rate": 9.996954135095478e-07, "loss": 0.0, "num_tokens": 6338620.0, "reward": 0.3895111531019211, "reward_std": 0.4284693105146289, "rewards/length_bonus_reward": 0.24786376487463713, "rewards/simple_accuracy_reward": 0.1666666679084301, "rewards/simple_cosine_scaled_reward": -0.05003857612609863, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 3361.8750610351562, "epoch": 0.112, "grad_norm": 0.2532395826288174, "learning_rate": 9.99561415049429e-07, "loss": 0.0, "num_tokens": 6426307.0, "reward": 0.5413059238344431, "reward_std": 0.2339575458317995, "rewards/length_bonus_reward": 0.20903523545712233, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.08120803162455559, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 6044.791687011719, "epoch": 0.114, "grad_norm": 0.13705034227867308, "learning_rate": 9.99403068670717e-07, "loss": 0.0, "num_tokens": 6576674.0, "reward": 0.07965340954251587, "reward_std": 0.10402081301435828, "rewards/length_bonus_reward": 0.2257283516228199, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2921498902142048, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 4718.666748046875, "epoch": 0.116, "grad_norm": 0.6238342095733336, "learning_rate": 9.992203820909905e-07, "loss": 0.0, "num_tokens": 6696678.0, "reward": 0.8683147262781858, "reward_std": 0.7392125874757767, "rewards/length_bonus_reward": 0.2880350723862648, "rewards/simple_accuracy_reward": 0.4583333432674408, "rewards/simple_cosine_scaled_reward": 0.2438925188034773, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 4920.875183105469, "epoch": 0.118, "grad_norm": 0.49160133424936453, "learning_rate": 9.990133642141357e-07, "loss": 0.0, "num_tokens": 6820509.0, "reward": 0.9466349892318249, "reward_std": 0.4984824303537607, "rewards/length_bonus_reward": 0.2711588591337204, "rewards/simple_accuracy_reward": 0.5416666716337204, "rewards/simple_cosine_scaled_reward": 0.2676188573241234, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 3082.250030517578, "epoch": 0.12, "grad_norm": 0.302602753217671, "learning_rate": 9.98782025129912e-07, "loss": 0.0, "num_tokens": 6900171.0, "reward": 0.6318811029195786, "reward_std": 0.19815576821565628, "rewards/length_bonus_reward": 0.09113566111773252, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.16482413560152054, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 4154.666748046875, "epoch": 0.122, "grad_norm": 0.28315088109818054, "learning_rate": 9.9852637611346e-07, "loss": 0.0, "num_tokens": 7005085.0, "reward": 0.3897430533543229, "reward_std": 0.27731580659747124, "rewards/length_bonus_reward": 0.20429483894258738, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.04577025771141052, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 3415.6251220703125, "epoch": 0.124, "grad_norm": 0.7178202149192794, "learning_rate": 9.982464296247522e-07, "loss": 0.0, "num_tokens": 7092154.0, "reward": 0.20390005689114332, "reward_std": 0.3449210152029991, "rewards/length_bonus_reward": 0.1845296211540699, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.1279258094727993, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 5104.250091552734, "epoch": 0.126, "grad_norm": 0.32871189475706225, "learning_rate": 9.97942199307985e-07, "loss": 0.0, "num_tokens": 7221964.0, "reward": 0.16401904541999102, "reward_std": 0.28889548778533936, "rewards/length_bonus_reward": 0.218170166015625, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.19163558818399906, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 4438.166748046875, "epoch": 0.128, "grad_norm": 0.8041783673744046, "learning_rate": 9.976136999909155e-07, "loss": 0.0, "num_tokens": 7333016.0, "reward": 0.8902743831276894, "reward_std": 0.5371754430234432, "rewards/length_bonus_reward": 0.2582600899040699, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.26402850821614265, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 3387.7918090820312, "epoch": 0.13, "grad_norm": 0.7482108830651548, "learning_rate": 9.972609476841365e-07, "loss": -0.0, "num_tokens": 7419363.0, "reward": 1.0219443142414093, "reward_std": 0.6369538977742195, "rewards/length_bonus_reward": 0.2624104793649167, "rewards/simple_accuracy_reward": 0.5833333395421505, "rewards/simple_cosine_scaled_reward": 0.3524010172113776, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 5145.916687011719, "epoch": 0.132, "grad_norm": 0.18376919674909778, "learning_rate": 9.968839595802981e-07, "loss": 0.0, "num_tokens": 7547899.0, "reward": 0.15333662647753954, "reward_std": 0.14340407401323318, "rewards/length_bonus_reward": 0.24229939468204975, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.17792554199695587, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 4257.5001220703125, "epoch": 0.134, "grad_norm": 0.6088847714217344, "learning_rate": 9.964827540532684e-07, "loss": 0.0, "num_tokens": 7654663.0, "reward": 0.22840077243745327, "reward_std": 0.2920784130692482, "rewards/length_bonus_reward": 0.28432209976017475, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.19517601374536753, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 3798.5000610351562, "epoch": 0.136, "grad_norm": 0.89142693559825, "learning_rate": 9.960573506572389e-07, "loss": 0.0, "num_tokens": 7754155.0, "reward": 0.5322877466678619, "reward_std": 0.4247860945761204, "rewards/length_bonus_reward": 0.201019287109375, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.07920356653630733, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 5266.000183105469, "epoch": 0.138, "grad_norm": 0.7602755692292227, "learning_rate": 9.956077701257707e-07, "loss": 0.0, "num_tokens": 7889653.0, "reward": 0.2930946797132492, "reward_std": 0.42788615077733994, "rewards/length_bonus_reward": 0.2291056327521801, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.1220219200477004, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 3437.1251220703125, "epoch": 0.14, "grad_norm": 0.6809271474183596, "learning_rate": 9.95134034370785e-07, "loss": 0.0, "num_tokens": 7976224.0, "reward": 1.1635046601295471, "reward_std": 0.38380063883960247, "rewards/length_bonus_reward": 0.17161051696166396, "rewards/simple_accuracy_reward": 0.7499999962747097, "rewards/simple_cosine_scaled_reward": 0.4837882500141859, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 3288.5000610351562, "epoch": 0.142, "grad_norm": 0.8193360478315125, "learning_rate": 9.946361664814943e-07, "loss": -0.0, "num_tokens": 8059894.0, "reward": 0.6076652575284243, "reward_std": 0.3561842367053032, "rewards/length_bonus_reward": 0.14006551168859005, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.1851995326578617, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 4289.416748046875, "epoch": 0.144, "grad_norm": 0.2619047878124916, "learning_rate": 9.941141907232763e-07, "loss": 0.0, "num_tokens": 8168858.0, "reward": 0.7863654531538486, "reward_std": 0.2567846514284611, "rewards/length_bonus_reward": 0.22727457247674465, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.2015150673687458, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 5521.7501220703125, "epoch": 0.146, "grad_norm": 0.3814956185439772, "learning_rate": 9.93568132536494e-07, "loss": 0.0, "num_tokens": 8307320.0, "reward": 0.19567161239683628, "reward_std": 0.27455841191112995, "rewards/length_bonus_reward": 0.2844136580824852, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.26081743836402893, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 5533.750061035156, "epoch": 0.148, "grad_norm": 0.34561498627406423, "learning_rate": 9.929980185352525e-07, "loss": 0.0, "num_tokens": 8448284.0, "reward": 0.6510901898145676, "reward_std": 0.35219161957502365, "rewards/length_bonus_reward": 0.2397054061293602, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.07276954501867294, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 4376.458404541016, "epoch": 0.15, "grad_norm": 0.5007229528829928, "learning_rate": 9.92403876506104e-07, "loss": 0.0, "num_tokens": 8561245.0, "reward": 0.14082558825612068, "reward_std": 0.22746100835502148, "rewards/length_bonus_reward": 0.17633056826889515, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.15434329118579626, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 3361.3750610351562, "epoch": 0.152, "grad_norm": 0.17934635238708638, "learning_rate": 9.91785735406693e-07, "loss": -0.0, "num_tokens": 8645962.0, "reward": 0.4658235125243664, "reward_std": 0.14729403890669346, "rewards/length_bonus_reward": 0.21554565522819757, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.0005557332187891006, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 3899.291793823242, "epoch": 0.154, "grad_norm": 0.9094492875618744, "learning_rate": 9.911436253643443e-07, "loss": -0.0, "num_tokens": 8743871.0, "reward": 0.274464875459671, "reward_std": 0.31044365651905537, "rewards/length_bonus_reward": 0.2531534880399704, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.12404388375580311, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 4455.958343505859, "epoch": 0.156, "grad_norm": 0.6998080174419681, "learning_rate": 9.904775776745956e-07, "loss": 0.0, "num_tokens": 8856652.0, "reward": 0.4352457821369171, "reward_std": 0.4964373055845499, "rewards/length_bonus_reward": 0.2125142402946949, "rewards/simple_accuracy_reward": 0.2500000074505806, "rewards/simple_cosine_scaled_reward": -0.05453695962205529, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 3685.666748046875, "epoch": 0.158, "grad_norm": 0.47387247253653353, "learning_rate": 9.89787624799672e-07, "loss": 0.0, "num_tokens": 8949752.0, "reward": 1.17104122787714, "reward_std": 0.29421181976795197, "rewards/length_bonus_reward": 0.2209676094353199, "rewards/simple_accuracy_reward": 0.7083333283662796, "rewards/simple_cosine_scaled_reward": 0.48348046839237213, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 4200.9583740234375, "epoch": 0.16, "grad_norm": 0.6604217641885854, "learning_rate": 9.890738003669027e-07, "loss": 0.0, "num_tokens": 9055465.0, "reward": 0.4483666159212589, "reward_std": 0.49113888293504715, "rewards/length_bonus_reward": 0.1949971504509449, "rewards/simple_accuracy_reward": 0.2500000074505806, "rewards/simple_cosine_scaled_reward": 0.006738867610692978, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 4932.708435058594, "epoch": 0.162, "grad_norm": 0.549886395976272, "learning_rate": 9.883361391670839e-07, "loss": 0.0, "num_tokens": 9179538.0, "reward": 0.5095759741961956, "reward_std": 0.4180926335975528, "rewards/length_bonus_reward": 0.24840292148292065, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": 0.022346075624227524, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 4675.416748046875, "epoch": 0.164, "grad_norm": 0.8372666354353046, "learning_rate": 9.875746771527815e-07, "loss": 0.0, "num_tokens": 9296506.0, "reward": 0.6291187945753336, "reward_std": 0.28977332077920437, "rewards/length_bonus_reward": 0.19906616304069757, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.1101052537560463, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 3679.8751220703125, "epoch": 0.166, "grad_norm": 0.4056505864312191, "learning_rate": 9.8678945143658e-07, "loss": 0.0, "num_tokens": 9389509.0, "reward": 0.9382852017879486, "reward_std": 0.457189217209816, "rewards/length_bonus_reward": 0.2444661483168602, "rewards/simple_accuracy_reward": 0.5416666641831398, "rewards/simple_cosine_scaled_reward": 0.30430474504828453, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 2720.9583740234375, "epoch": 0.168, "grad_norm": 0.8855949565097775, "learning_rate": 9.859805002892731e-07, "loss": 0.0, "num_tokens": 9464466.0, "reward": 0.599948063492775, "reward_std": 0.33274371549487114, "rewards/length_bonus_reward": 0.1621907576918602, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.12551448866724968, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 4445.416748046875, "epoch": 0.17, "grad_norm": 0.3954805636333755, "learning_rate": 9.851478631379982e-07, "loss": 0.0, "num_tokens": 9577030.0, "reward": 0.6584330759942532, "reward_std": 0.3964180052280426, "rewards/length_bonus_reward": 0.2266642227768898, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.11353765055537224, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 4716.000061035156, "epoch": 0.172, "grad_norm": 0.76093955512458, "learning_rate": 9.842915805643156e-07, "loss": 0.0, "num_tokens": 9695314.0, "reward": 0.8495085909962654, "reward_std": 0.45127921318635345, "rewards/length_bonus_reward": 0.2745259602088481, "rewards/simple_accuracy_reward": 0.4583333358168602, "rewards/simple_cosine_scaled_reward": 0.23329849913716316, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 2532.2084045410156, "epoch": 0.174, "grad_norm": 0.3739177408960678, "learning_rate": 9.834116943022297e-07, "loss": 0.0, "num_tokens": 9760857.0, "reward": 0.2704481929540634, "reward_std": 0.2934786919504404, "rewards/length_bonus_reward": 0.1488647423684597, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.09016644209623337, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 7041.5833740234375, "epoch": 0.176, "grad_norm": 0.07365158085129385, "learning_rate": 9.825082472361556e-07, "loss": 0.0, "num_tokens": 9935501.0, "reward": 0.16041189059615135, "reward_std": 0.09687662962824106, "rewards/length_bonus_reward": 0.2604166641831398, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2000095695257187, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 4006.1250610351562, "epoch": 0.178, "grad_norm": 0.5050259110130607, "learning_rate": 9.81581283398829e-07, "loss": 0.0, "num_tokens": 10035914.0, "reward": 1.0287247598171234, "reward_std": 0.3183035869151354, "rewards/length_bonus_reward": 0.20811971020884812, "rewards/simple_accuracy_reward": 0.625, "rewards/simple_cosine_scaled_reward": 0.39121001586318016, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 5929.666748046875, "epoch": 0.18, "grad_norm": 0.4928298478095273, "learning_rate": 9.806308479691594e-07, "loss": 0.0, "num_tokens": 10183758.0, "reward": 0.16487563587725163, "reward_std": 0.23130928725004196, "rewards/length_bonus_reward": 0.21882120706140995, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.19122448563575745, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 5384.83349609375, "epoch": 0.182, "grad_norm": 0.1981296682668378, "learning_rate": 9.796569872700287e-07, "loss": -0.0, "num_tokens": 10317272.0, "reward": 0.5435238108038902, "reward_std": 0.2490558736026287, "rewards/length_bonus_reward": 0.3279012031853199, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": 0.014578569680452347, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 4165.666748046875, "epoch": 0.184, "grad_norm": 0.7212836540849693, "learning_rate": 9.786597487660335e-07, "loss": 0.0, "num_tokens": 10428174.0, "reward": 0.9141942551359534, "reward_std": 0.41061049699783325, "rewards/length_bonus_reward": 0.2358195036649704, "rewards/simple_accuracy_reward": 0.5416666641831398, "rewards/simple_cosine_scaled_reward": 0.2734161149710417, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 3848.666717529297, "epoch": 0.186, "grad_norm": 0.2860358198278295, "learning_rate": 9.776391810611718e-07, "loss": 0.0, "num_tokens": 10526374.0, "reward": 0.6867482624948025, "reward_std": 0.2923105526715517, "rewards/length_bonus_reward": 0.16359456349164248, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.21297401562333107, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 6168.791687011719, "epoch": 0.188, "grad_norm": 0.28616206414124223, "learning_rate": 9.765953338964734e-07, "loss": 0.0, "num_tokens": 10679549.0, "reward": 0.2722206059843302, "reward_std": 0.29753352887928486, "rewards/length_bonus_reward": 0.274383544921875, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.17099254950881004, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 5384.041748046875, "epoch": 0.19, "grad_norm": 0.6158057949324235, "learning_rate": 9.755282581475767e-07, "loss": 0.0, "num_tokens": 10815000.0, "reward": 0.39591358229517937, "reward_std": 0.35755309648811817, "rewards/length_bonus_reward": 0.3419698029756546, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.14211247116327286, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 5970.291687011719, "epoch": 0.192, "grad_norm": 0.10780693140096623, "learning_rate": 9.744380058222482e-07, "loss": 0.0, "num_tokens": 10962655.0, "reward": 0.4804303739219904, "reward_std": 0.11036383546888828, "rewards/length_bonus_reward": 0.21467081643640995, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.031519073992967606, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 4234.958435058594, "epoch": 0.194, "grad_norm": 0.4059350991311962, "learning_rate": 9.733246300578482e-07, "loss": 0.0, "num_tokens": 11068248.0, "reward": 0.3892146429279819, "reward_std": 0.4721146933734417, "rewards/length_bonus_reward": 0.1970316544175148, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.032300736755132675, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 3444.8751220703125, "epoch": 0.196, "grad_norm": 0.5808549904068343, "learning_rate": 9.721881851187405e-07, "loss": 0.0, "num_tokens": 11155803.0, "reward": 0.4734135754406452, "reward_std": 0.28453803434967995, "rewards/length_bonus_reward": 0.2900390625, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.04991764947772026, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 4886.416809082031, "epoch": 0.198, "grad_norm": 0.1588085307249986, "learning_rate": 9.710287263936483e-07, "loss": 0.0, "num_tokens": 11277325.0, "reward": 0.4809043873101473, "reward_std": 0.1388307847082615, "rewards/length_bonus_reward": 0.2511088065803051, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.04040886089205742, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 4216.8751220703125, "epoch": 0.2, "grad_norm": 0.4776322947117159, "learning_rate": 9.698463103929541e-07, "loss": 0.0, "num_tokens": 11382484.0, "reward": 0.6441160179674625, "reward_std": 0.5683612283319235, "rewards/length_bonus_reward": 0.2582499207928777, "rewards/simple_accuracy_reward": 0.3333333320915699, "rewards/simple_cosine_scaled_reward": 0.10506543144583702, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 6031.166748046875, "epoch": 0.202, "grad_norm": 0.22715705883022821, "learning_rate": 9.686409947459457e-07, "loss": 0.0, "num_tokens": 11531552.0, "reward": 0.4698410592973232, "reward_std": 0.25833618082106113, "rewards/length_bonus_reward": 0.2927958145737648, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.06257618963718414, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 3309.3334045410156, "epoch": 0.204, "grad_norm": 0.8446936248505523, "learning_rate": 9.674128381980071e-07, "loss": -0.0, "num_tokens": 11620744.0, "reward": 0.8768924307078123, "reward_std": 0.4192147757858038, "rewards/length_bonus_reward": 0.12841796781867743, "rewards/simple_accuracy_reward": 0.5833333283662796, "rewards/simple_cosine_scaled_reward": 0.3302822932600975, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 4452.375, "epoch": 0.206, "grad_norm": 0.8124892609464891, "learning_rate": 9.661619006077561e-07, "loss": -0.0, "num_tokens": 11738305.0, "reward": 0.23940843529999256, "reward_std": 0.260429909452796, "rewards/length_bonus_reward": 0.131103515625, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.11672349646687508, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 3105.2500610351562, "epoch": 0.208, "grad_norm": 0.7720030482630266, "learning_rate": 9.648882429441256e-07, "loss": 0.0, "num_tokens": 11817745.0, "reward": 0.6964684054255486, "reward_std": 0.6399029418826103, "rewards/length_bonus_reward": 0.20393880270421505, "rewards/simple_accuracy_reward": 0.416666679084301, "rewards/simple_cosine_scaled_reward": 0.1517258621752262, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 6074.6251220703125, "epoch": 0.21, "grad_norm": 0.5605516573647701, "learning_rate": 9.635919272833937e-07, "loss": 0.0, "num_tokens": 11972374.0, "reward": 0.3727143071591854, "reward_std": 0.4155346676707268, "rewards/length_bonus_reward": 0.2912699356675148, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.08711129677249119, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 4625.2083740234375, "epoch": 0.212, "grad_norm": 0.9060880703678975, "learning_rate": 9.622730168061567e-07, "loss": 0.0, "num_tokens": 12088599.0, "reward": 0.4089365005493164, "reward_std": 0.6783989574760199, "rewards/length_bonus_reward": 0.19434610567986965, "rewards/simple_accuracy_reward": 0.2500000037252903, "rewards/simple_cosine_scaled_reward": -0.07081922795623541, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 6526.250244140625, "epoch": 0.214, "grad_norm": 0.3106853537309348, "learning_rate": 9.609315757942502e-07, "loss": -0.0, "num_tokens": 12251103.0, "reward": 0.3939319849014282, "reward_std": 0.29753633588552475, "rewards/length_bonus_reward": 0.3060607835650444, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.07425758987665176, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 4824.750183105469, "epoch": 0.216, "grad_norm": 0.6548749987888509, "learning_rate": 9.595676696276171e-07, "loss": 0.0, "num_tokens": 12372897.0, "reward": 0.3799270521849394, "reward_std": 0.5487200655043125, "rewards/length_bonus_reward": 0.255645751953125, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.08477070182561874, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 6019.333435058594, "epoch": 0.218, "grad_norm": 0.16520710400167016, "learning_rate": 9.581813647811197e-07, "loss": 0.0, "num_tokens": 12521867.0, "reward": 0.13335432671010494, "reward_std": 0.11485922336578369, "rewards/length_bonus_reward": 0.2395833320915699, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.21245801448822021, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 4784.583435058594, "epoch": 0.22, "grad_norm": 0.6730079438707998, "learning_rate": 9.567727288213004e-07, "loss": 0.0, "num_tokens": 12642439.0, "reward": 0.9326558317989111, "reward_std": 0.580977788195014, "rewards/length_bonus_reward": 0.24114990420639515, "rewards/simple_accuracy_reward": 0.5416666567325592, "rewards/simple_cosine_scaled_reward": 0.2996784672141075, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 5480.58349609375, "epoch": 0.222, "grad_norm": 0.4575076928374687, "learning_rate": 9.553418304030885e-07, "loss": 0.0, "num_tokens": 12778803.0, "reward": 0.584640733897686, "reward_std": 0.4703083075582981, "rewards/length_bonus_reward": 0.3424784317612648, "rewards/simple_accuracy_reward": 0.2500000074505806, "rewards/simple_cosine_scaled_reward": -0.01567540504038334, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 4505.958343505859, "epoch": 0.224, "grad_norm": 0.43245393327449266, "learning_rate": 9.538887392664543e-07, "loss": 0.0, "num_tokens": 12891698.0, "reward": 0.09525683335959911, "reward_std": 0.22008414007723331, "rewards/length_bonus_reward": 0.1577555350959301, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.20833073183894157, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 3753.5833740234375, "epoch": 0.226, "grad_norm": 0.3899366100929538, "learning_rate": 9.524135262330098e-07, "loss": 0.0, "num_tokens": 12985942.0, "reward": 0.8407284505665302, "reward_std": 0.2636660076677799, "rewards/length_bonus_reward": 0.14105224516242743, "rewards/simple_accuracy_reward": 0.5416666679084301, "rewards/simple_cosine_scaled_reward": 0.31601901911199093, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 4942.958435058594, "epoch": 0.228, "grad_norm": 0.5820999662482855, "learning_rate": 9.509162632025569e-07, "loss": 0.0, "num_tokens": 13109925.0, "reward": 0.7017070166766644, "reward_std": 0.2792724221944809, "rewards/length_bonus_reward": 0.2607421912252903, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.13192963600158691, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 5470.9583740234375, "epoch": 0.23, "grad_norm": 0.3680228830856499, "learning_rate": 9.493970231495834e-07, "loss": 0.0, "num_tokens": 13245620.0, "reward": 1.0157615765929222, "reward_std": 0.43061237782239914, "rewards/length_bonus_reward": 0.2717386856675148, "rewards/simple_accuracy_reward": 0.5833333283662796, "rewards/simple_cosine_scaled_reward": 0.32137905806303024, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 5415.2083740234375, "epoch": 0.232, "grad_norm": 0.19302732900142083, "learning_rate": 9.478558801197064e-07, "loss": 0.0, "num_tokens": 13379911.0, "reward": 0.18993283435702324, "reward_std": 0.15281926840543747, "rewards/length_bonus_reward": 0.2957255095243454, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.21158537082374096, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 5243.916748046875, "epoch": 0.234, "grad_norm": 0.1522539935368296, "learning_rate": 9.462929092260628e-07, "loss": -0.0, "num_tokens": 13515443.0, "reward": 0.4543988760560751, "reward_std": 0.1504145972430706, "rewards/length_bonus_reward": 0.2195231094956398, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.030248429626226425, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 5124.625183105469, "epoch": 0.236, "grad_norm": 0.3592959674977673, "learning_rate": 9.447081866456487e-07, "loss": 0.0, "num_tokens": 13642652.0, "reward": 0.6238602623343468, "reward_std": 0.3313994463533163, "rewards/length_bonus_reward": 0.30047607421875, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.06343494821339846, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 5694.166748046875, "epoch": 0.238, "grad_norm": 0.3887380720700946, "learning_rate": 9.431017896156073e-07, "loss": 0.0, "num_tokens": 13783506.0, "reward": 0.4483037441968918, "reward_std": 0.28110867738723755, "rewards/length_bonus_reward": 0.3182881698012352, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.07330222800374031, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 6078.8751220703125, "epoch": 0.24, "grad_norm": 0.09066300268283836, "learning_rate": 9.414737964294634e-07, "loss": 0.0, "num_tokens": 13935705.0, "reward": 0.5095425006002188, "reward_std": 0.1109990980476141, "rewards/length_bonus_reward": 0.2462361641228199, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.026612676680088043, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 4817.083435058594, "epoch": 0.242, "grad_norm": 0.3472323189539251, "learning_rate": 9.398242864333083e-07, "loss": 0.0, "num_tokens": 14059781.0, "reward": 1.030790038406849, "reward_std": 0.33569352701306343, "rewards/length_bonus_reward": 0.26196289248764515, "rewards/simple_accuracy_reward": 0.625, "rewards/simple_cosine_scaled_reward": 0.28765415772795677, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 4648.625030517578, "epoch": 0.244, "grad_norm": 0.9066927266302552, "learning_rate": 9.381533400219317e-07, "loss": -0.0, "num_tokens": 14176292.0, "reward": 0.4164937399327755, "reward_std": 0.38858342729508877, "rewards/length_bonus_reward": 0.23763020522892475, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.0589396171271801, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 4243.45849609375, "epoch": 0.246, "grad_norm": 0.6426455885920049, "learning_rate": 9.364610386349047e-07, "loss": 0.0, "num_tokens": 14284081.0, "reward": 0.49601300433278084, "reward_std": 0.49030904471874237, "rewards/length_bonus_reward": 0.2486775666475296, "rewards/simple_accuracy_reward": 0.2500000074505806, "rewards/simple_cosine_scaled_reward": -0.005329171195626259, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 5784.625, "epoch": 0.248, "grad_norm": 0.12402571080719503, "learning_rate": 9.347474647526095e-07, "loss": 0.0, "num_tokens": 14428198.0, "reward": 0.15224174410104752, "reward_std": 0.13429583050310612, "rewards/length_bonus_reward": 0.2687581367790699, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.23303279653191566, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 5932.416748046875, "epoch": 0.25, "grad_norm": 0.3536998892446641, "learning_rate": 9.330127018922193e-07, "loss": 0.0, "num_tokens": 14575256.0, "reward": 0.4320463761687279, "reward_std": 0.34050702303647995, "rewards/length_bonus_reward": 0.3501383438706398, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.08618395403027534, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 5054.7501220703125, "epoch": 0.252, "grad_norm": 0.234500901268806, "learning_rate": 9.312568346036287e-07, "loss": 0.0, "num_tokens": 14701706.0, "reward": 0.16274882544530556, "reward_std": 0.2940198313444853, "rewards/length_bonus_reward": 0.23492431640625, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.22768433822784573, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 5276.9583740234375, "epoch": 0.254, "grad_norm": 0.7013113515836652, "learning_rate": 9.294799484653322e-07, "loss": 0.0, "num_tokens": 14835469.0, "reward": 0.5542928203940392, "reward_std": 0.4889195244759321, "rewards/length_bonus_reward": 0.2224629744887352, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.08032635506242514, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 3786.7918701171875, "epoch": 0.256, "grad_norm": 0.5275857926686957, "learning_rate": 9.276821300802533e-07, "loss": 0.0, "num_tokens": 14932220.0, "reward": 0.6301150470972061, "reward_std": 0.48804691061377525, "rewards/length_bonus_reward": 0.2512308768928051, "rewards/simple_accuracy_reward": 0.3333333432674408, "rewards/simple_cosine_scaled_reward": 0.09110165387392044, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 4480.750091552734, "epoch": 0.258, "grad_norm": 0.6963507029561012, "learning_rate": 9.258634670715237e-07, "loss": 0.0, "num_tokens": 15044348.0, "reward": 0.44254348427057266, "reward_std": 0.45445345900952816, "rewards/length_bonus_reward": 0.24745686585083604, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.026493461802601814, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 6279.541748046875, "epoch": 0.26, "grad_norm": 0.12253112218380627, "learning_rate": 9.240240480782129e-07, "loss": 0.0, "num_tokens": 15201177.0, "reward": 0.15380340814590454, "reward_std": 0.11449805460870266, "rewards/length_bonus_reward": 0.2700500562787056, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2324932962656021, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 5757.416809082031, "epoch": 0.262, "grad_norm": 0.1197665839350287, "learning_rate": 9.221639627510075e-07, "loss": 0.0, "num_tokens": 15343933.0, "reward": 0.5078718177974224, "reward_std": 0.1491607166826725, "rewards/length_bonus_reward": 0.2457682266831398, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.024207163602113724, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 4833.833404541016, "epoch": 0.264, "grad_norm": 0.3538466987276316, "learning_rate": 9.202833017478421e-07, "loss": 0.0, "num_tokens": 15464643.0, "reward": 0.2749212346971035, "reward_std": 0.23782910592854023, "rewards/length_bonus_reward": 0.22119140136055648, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.142540343105793, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 4128.2083740234375, "epoch": 0.266, "grad_norm": 0.2080935357744689, "learning_rate": 9.183821567294808e-07, "loss": 0.0, "num_tokens": 15568454.0, "reward": 0.07865859009325504, "reward_std": 0.1562840025871992, "rewards/length_bonus_reward": 0.1740722668619128, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.1908273585140705, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 4853.041748046875, "epoch": 0.268, "grad_norm": 0.38738874490714537, "learning_rate": 9.164606203550497e-07, "loss": 0.0, "num_tokens": 15689919.0, "reward": 0.9730488862842321, "reward_std": 0.4299510782584548, "rewards/length_bonus_reward": 0.2476908341050148, "rewards/simple_accuracy_reward": 0.5833333283662796, "rewards/simple_cosine_scaled_reward": 0.2840494066476822, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 6262.58349609375, "epoch": 0.27, "grad_norm": 0.09489529494528452, "learning_rate": 9.145187862775208e-07, "loss": 0.0, "num_tokens": 15846491.0, "reward": 0.19970553927123547, "reward_std": 0.12359648756682873, "rewards/length_bonus_reward": 0.3351237028837204, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.27083634212613106, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 5293.7083740234375, "epoch": 0.272, "grad_norm": 0.5234333065061119, "learning_rate": 9.125567491391475e-07, "loss": 0.0, "num_tokens": 15979672.0, "reward": 0.3327227085828781, "reward_std": 0.3268910013139248, "rewards/length_bonus_reward": 0.2928568534553051, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.17026829719543457, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 5859.33349609375, "epoch": 0.274, "grad_norm": 0.3309961196443718, "learning_rate": 9.10574604566852e-07, "loss": -0.0, "num_tokens": 16125642.0, "reward": 0.7439026981592178, "reward_std": 0.3001306429505348, "rewards/length_bonus_reward": 0.3605855330824852, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.09996771439909935, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 5107.7083740234375, "epoch": 0.276, "grad_norm": 0.5695923497050503, "learning_rate": 9.085724491675642e-07, "loss": 0.0, "num_tokens": 16253243.0, "reward": 0.32675594836473465, "reward_std": 0.33636693097651005, "rewards/length_bonus_reward": 0.2494913749396801, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.0954708568751812, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 5973.0833740234375, "epoch": 0.278, "grad_norm": 0.3281402000817213, "learning_rate": 9.065503805235137e-07, "loss": -0.0, "num_tokens": 16402921.0, "reward": 0.4385562129318714, "reward_std": 0.3288230858743191, "rewards/length_bonus_reward": 0.30902099609375, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.07426288723945618, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 3389.125030517578, "epoch": 0.28, "grad_norm": 0.4376765887084837, "learning_rate": 9.045084971874737e-07, "loss": -0.0, "num_tokens": 16489822.0, "reward": 0.04028365761041641, "reward_std": 0.2493639998137951, "rewards/length_bonus_reward": 0.1180623359978199, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.23889067955315113, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 5941.0, "epoch": 0.282, "grad_norm": 0.45313601953961397, "learning_rate": 9.02446898677957e-07, "loss": -0.0, "num_tokens": 16637512.0, "reward": 0.4604772999882698, "reward_std": 0.39700623974204063, "rewards/length_bonus_reward": 0.2891133651137352, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.07393878698348999, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 4761.08349609375, "epoch": 0.284, "grad_norm": 0.6822848650073692, "learning_rate": 9.003656854743666e-07, "loss": -0.0, "num_tokens": 16756536.0, "reward": 1.0011027157306671, "reward_std": 0.6296183913946152, "rewards/length_bonus_reward": 0.2329203262925148, "rewards/simple_accuracy_reward": 0.5833333395421505, "rewards/simple_cosine_scaled_reward": 0.3696981370449066, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 4268.166748046875, "epoch": 0.286, "grad_norm": 0.22234007919983648, "learning_rate": 8.982649590120981e-07, "loss": 0.0, "num_tokens": 16863322.0, "reward": 1.120678547769785, "reward_std": 0.24569211155176163, "rewards/length_bonus_reward": 0.19941203109920025, "rewards/simple_accuracy_reward": 0.7083333283662796, "rewards/simple_cosine_scaled_reward": 0.42586616426706314, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 6268.875, "epoch": 0.288, "grad_norm": 0.1618205482759648, "learning_rate": 8.961448216775953e-07, "loss": 0.0, "num_tokens": 17018773.0, "reward": 0.5277039650827646, "reward_std": 0.11753341183066368, "rewards/length_bonus_reward": 0.258514404296875, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.03837912157177925, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 4860.208435058594, "epoch": 0.29, "grad_norm": 0.5448095772208122, "learning_rate": 8.940053768033608e-07, "loss": 0.0, "num_tokens": 17140302.0, "reward": 0.7241115719079971, "reward_std": 0.4873878173530102, "rewards/length_bonus_reward": 0.2304687574505806, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.1539523066021502, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 4076.375244140625, "epoch": 0.292, "grad_norm": 0.4156779310313601, "learning_rate": 8.918467286629198e-07, "loss": -0.0, "num_tokens": 17244201.0, "reward": 1.1241516172885895, "reward_std": 0.3645579293370247, "rewards/length_bonus_reward": 0.26678466610610485, "rewards/simple_accuracy_reward": 0.6666666567325592, "rewards/simple_cosine_scaled_reward": 0.38140053302049637, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 4994.58349609375, "epoch": 0.294, "grad_norm": 0.4390807151203295, "learning_rate": 8.896689824657371e-07, "loss": 0.0, "num_tokens": 17369003.0, "reward": 0.7065666327252984, "reward_std": 0.4446803331375122, "rewards/length_bonus_reward": 0.2897847443819046, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.083563681691885, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 5338.0, "epoch": 0.296, "grad_norm": 0.07253167852784181, "learning_rate": 8.874722443520898e-07, "loss": 0.0, "num_tokens": 17501357.0, "reward": 0.4533107914030552, "reward_std": 0.0913370493799448, "rewards/length_bonus_reward": 0.22459919995162636, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.04257684946060181, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 5038.166687011719, "epoch": 0.298, "grad_norm": 0.14525400188724358, "learning_rate": 8.852566213878946e-07, "loss": 0.0, "num_tokens": 17626839.0, "reward": 0.47856458905152977, "reward_std": 0.14683254435658455, "rewards/length_bonus_reward": 0.2153727225959301, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.026383675634860992, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 6034.000061035156, "epoch": 0.3, "grad_norm": 0.4431185373657341, "learning_rate": 8.83022221559489e-07, "loss": 0.0, "num_tokens": 17776461.0, "reward": 0.3205497469753027, "reward_std": 0.3268894534558058, "rewards/length_bonus_reward": 0.2510782852768898, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.111057098954916, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 6448.7501220703125, "epoch": 0.302, "grad_norm": 0.5495129289233517, "learning_rate": 8.807691537683684e-07, "loss": 0.0, "num_tokens": 17936397.0, "reward": 0.5696404315531254, "reward_std": 0.49690710194408894, "rewards/length_bonus_reward": 0.301727294921875, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.035826217383146286, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 5254.500061035156, "epoch": 0.304, "grad_norm": 0.1760008644781453, "learning_rate": 8.784975278258782e-07, "loss": 0.0, "num_tokens": 18067575.0, "reward": 0.47693356312811375, "reward_std": 0.1501255203038454, "rewards/length_bonus_reward": 0.2382303886115551, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.022593729197978973, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 5446.8333740234375, "epoch": 0.306, "grad_norm": 0.2636663685572291, "learning_rate": 8.762074544478621e-07, "loss": 0.0, "num_tokens": 18207449.0, "reward": 0.9400034695863724, "reward_std": 0.29744905419647694, "rewards/length_bonus_reward": 0.2526346854865551, "rewards/simple_accuracy_reward": 0.5416666679084301, "rewards/simple_cosine_scaled_reward": 0.2914041765034199, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 6009.916748046875, "epoch": 0.308, "grad_norm": 0.0917146401049319, "learning_rate": 8.73899045249266e-07, "loss": 0.0, "num_tokens": 18356163.0, "reward": 0.15357576683163643, "reward_std": 0.10576971992850304, "rewards/length_bonus_reward": 0.2878214567899704, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.26849138364195824, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 3527.8333740234375, "epoch": 0.31, "grad_norm": 0.2412551814581796, "learning_rate": 8.71572412738697e-07, "loss": 0.0, "num_tokens": 18450353.0, "reward": 1.1639008074998856, "reward_std": 0.259334085509181, "rewards/length_bonus_reward": 0.222381591796875, "rewards/simple_accuracy_reward": 0.7083333283662796, "rewards/simple_cosine_scaled_reward": 0.46637170016765594, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 5893.4168701171875, "epoch": 0.312, "grad_norm": 0.6675777298230039, "learning_rate": 8.69227670312942e-07, "loss": 0.0, "num_tokens": 18597507.0, "reward": 0.8357449248433113, "reward_std": 0.7200102135539055, "rewards/length_bonus_reward": 0.3181355744600296, "rewards/simple_accuracy_reward": 0.416666679084301, "rewards/simple_cosine_scaled_reward": 0.20188527554273605, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 4261.916748046875, "epoch": 0.314, "grad_norm": 0.4567998373824013, "learning_rate": 8.668649322514381e-07, "loss": 0.0, "num_tokens": 18705103.0, "reward": 0.78871014341712, "reward_std": 0.45212841406464577, "rewards/length_bonus_reward": 0.227508544921875, "rewards/simple_accuracy_reward": 0.4583333395421505, "rewards/simple_cosine_scaled_reward": 0.20573647692799568, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 3567.916748046875, "epoch": 0.316, "grad_norm": 0.6875700705483454, "learning_rate": 8.644843137107057e-07, "loss": 0.0, "num_tokens": 18796535.0, "reward": 1.0377163849771023, "reward_std": 0.43031867034733295, "rewards/length_bonus_reward": 0.28204345889389515, "rewards/simple_accuracy_reward": 0.5833333283662796, "rewards/simple_cosine_scaled_reward": 0.3446791432797909, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 5411.375061035156, "epoch": 0.318, "grad_norm": 0.11777048888430258, "learning_rate": 8.620859307187338e-07, "loss": 0.0, "num_tokens": 18931274.0, "reward": 0.9691413752734661, "reward_std": 0.14118255116045475, "rewards/length_bonus_reward": 0.2877095490694046, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.36286352947354317, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 6413.20849609375, "epoch": 0.32, "grad_norm": 0.4558344199593882, "learning_rate": 8.596699001693255e-07, "loss": 0.0, "num_tokens": 19090039.0, "reward": 0.694373682141304, "reward_std": 0.4437053306028247, "rewards/length_bonus_reward": 0.340850830078125, "rewards/simple_accuracy_reward": 0.3333333283662796, "rewards/simple_cosine_scaled_reward": 0.04037899151444435, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 4362.291809082031, "epoch": 0.322, "grad_norm": 0.27630052421553536, "learning_rate": 8.572363398164016e-07, "loss": 0.0, "num_tokens": 19199000.0, "reward": 0.6946423957124352, "reward_std": 0.32446189038455486, "rewards/length_bonus_reward": 0.21559651754796505, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.12475842237472534, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 6762.541748046875, "epoch": 0.324, "grad_norm": 0.41737941809996987, "learning_rate": 8.547853682682604e-07, "loss": 0.0, "num_tokens": 19367085.0, "reward": 0.22934497147798538, "reward_std": 0.2527217324823141, "rewards/length_bonus_reward": 0.2606302946805954, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.22923735342919827, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 5256.70849609375, "epoch": 0.326, "grad_norm": 0.26665677332668397, "learning_rate": 8.523171049817973e-07, "loss": 0.0, "num_tokens": 19498466.0, "reward": 0.4902437776327133, "reward_std": 0.27884939312934875, "rewards/length_bonus_reward": 0.3262227401137352, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.08862460404634476, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 6215.9583740234375, "epoch": 0.328, "grad_norm": 0.16097179867332953, "learning_rate": 8.498316702566826e-07, "loss": 0.0, "num_tokens": 19653721.0, "reward": 0.19414933770895004, "reward_std": 0.15624351426959038, "rewards/length_bonus_reward": 0.3096313402056694, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2309640273451805, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 4691.625061035156, "epoch": 0.33, "grad_norm": 0.7234546761121142, "learning_rate": 8.473291852294986e-07, "loss": -0.0, "num_tokens": 19770580.0, "reward": 0.5927875675261021, "reward_std": 0.5797190964221954, "rewards/length_bonus_reward": 0.2896728478372097, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.02289609331637621, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 5950.541687011719, "epoch": 0.332, "grad_norm": 0.39847455919911545, "learning_rate": 8.448097718678348e-07, "loss": 0.0, "num_tokens": 19917893.0, "reward": 0.6466319859027863, "reward_std": 0.5369185488671064, "rewards/length_bonus_reward": 0.3276163712143898, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.05469784280285239, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 5245.250091552734, "epoch": 0.334, "grad_norm": 0.5910828753774726, "learning_rate": 8.422735529643443e-07, "loss": -0.0, "num_tokens": 20048165.0, "reward": 0.5825205482542515, "reward_std": 0.43347795121371746, "rewards/length_bonus_reward": 0.23745727259665728, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.10679326765239239, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 6006.7083740234375, "epoch": 0.336, "grad_norm": 0.10885314576849273, "learning_rate": 8.397206521307583e-07, "loss": 0.0, "num_tokens": 20197222.0, "reward": 0.48691485077142715, "reward_std": 0.10994361899793148, "rewards/length_bonus_reward": 0.2166544571518898, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.04052072577178478, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 5096.291687011719, "epoch": 0.338, "grad_norm": 0.15154930694740384, "learning_rate": 8.371511937918617e-07, "loss": -0.0, "num_tokens": 20324231.0, "reward": 0.4430125029757619, "reward_std": 0.13898874074220657, "rewards/length_bonus_reward": 0.1999308280646801, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.013836614787578583, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 5827.1668701171875, "epoch": 0.34, "grad_norm": 0.3197915038118802, "learning_rate": 8.34565303179429e-07, "loss": -0.0, "num_tokens": 20469285.0, "reward": 0.7119769714772701, "reward_std": 0.30184984020888805, "rewards/length_bonus_reward": 0.3593953400850296, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.12182994559407234, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 5861.2083740234375, "epoch": 0.342, "grad_norm": 0.6145692580138341, "learning_rate": 8.319631063261207e-07, "loss": 0.0, "num_tokens": 20614880.0, "reward": 0.7158762998878956, "reward_std": 0.6379320546984673, "rewards/length_bonus_reward": 0.3202718123793602, "rewards/simple_accuracy_reward": 0.3333333395421505, "rewards/simple_cosine_scaled_reward": 0.12454224564135075, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 5555.83349609375, "epoch": 0.344, "grad_norm": 0.12789038671305916, "learning_rate": 8.293447300593402e-07, "loss": 0.0, "num_tokens": 20752942.0, "reward": 0.5610262956470251, "reward_std": 0.13263710960745811, "rewards/length_bonus_reward": 0.3225911408662796, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.023129750043153763, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 6715.08349609375, "epoch": 0.346, "grad_norm": 0.09128838406329669, "learning_rate": 8.267103019950528e-07, "loss": 0.0, "num_tokens": 20918370.0, "reward": 0.18554409220814705, "reward_std": 0.12240323983132839, "rewards/length_bonus_reward": 0.3179829940199852, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2648778110742569, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 5350.250061035156, "epoch": 0.348, "grad_norm": 0.5664496264586205, "learning_rate": 8.240599505315654e-07, "loss": 0.0, "num_tokens": 21051690.0, "reward": 0.28755808994174004, "reward_std": 0.4428982753306627, "rewards/length_bonus_reward": 0.22755940817296505, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.13000264018774033, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 4496.250183105469, "epoch": 0.35, "grad_norm": 0.5321967208510762, "learning_rate": 8.213938048432696e-07, "loss": 0.0, "num_tokens": 21164592.0, "reward": 1.1986322179436684, "reward_std": 0.45296037942171097, "rewards/length_bonus_reward": 0.3193868026137352, "rewards/simple_accuracy_reward": 0.6666666679084301, "rewards/simple_cosine_scaled_reward": 0.4251574305817485, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 6153.375, "epoch": 0.352, "grad_norm": 0.46439002318521905, "learning_rate": 8.187119948743449e-07, "loss": 0.0, "num_tokens": 21316833.0, "reward": 0.20019185543060303, "reward_std": 0.2736876755952835, "rewards/length_bonus_reward": 0.2580057755112648, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.1989611778408289, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 4290.416809082031, "epoch": 0.354, "grad_norm": 0.4495518811309317, "learning_rate": 8.160146513324254e-07, "loss": 0.0, "num_tokens": 21424147.0, "reward": 0.6523579061031342, "reward_std": 0.28206631913781166, "rewards/length_bonus_reward": 0.3292032852768898, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.06297583691775799, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 4478.333435058594, "epoch": 0.356, "grad_norm": 0.48879402782513115, "learning_rate": 8.133019056822302e-07, "loss": 0.0, "num_tokens": 21535947.0, "reward": 0.8094215616583824, "reward_std": 0.5972420014441013, "rewards/length_bonus_reward": 0.2120361328125, "rewards/simple_accuracy_reward": 0.4583333358168602, "rewards/simple_cosine_scaled_reward": 0.27810413762927055, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 5767.5001220703125, "epoch": 0.358, "grad_norm": 0.504171052975225, "learning_rate": 8.105738901391551e-07, "loss": 0.0, "num_tokens": 21679875.0, "reward": 0.26053120754659176, "reward_std": 0.23924280889332294, "rewards/length_bonus_reward": 0.2830607108771801, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.21172569086775184, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 6667.9583740234375, "epoch": 0.36, "grad_norm": 0.24060050675571124, "learning_rate": 8.07830737662829e-07, "loss": 0.0, "num_tokens": 21845546.0, "reward": 0.48588528111577034, "reward_std": 0.2503850422799587, "rewards/length_bonus_reward": 0.2957967147231102, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.0364895798265934, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 5257.4168701171875, "epoch": 0.362, "grad_norm": 0.5757395494251908, "learning_rate": 8.050725819506339e-07, "loss": 0.0, "num_tokens": 21976902.0, "reward": 0.6023547612130642, "reward_std": 0.4803642872720957, "rewards/length_bonus_reward": 0.3282674141228199, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.04817462898790836, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 6231.75, "epoch": 0.364, "grad_norm": 0.31963400598341407, "learning_rate": 8.022995574311875e-07, "loss": 0.0, "num_tokens": 22131756.0, "reward": 0.6911131292581558, "reward_std": 0.3159391079097986, "rewards/length_bonus_reward": 0.2766214981675148, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.16231657937169075, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 6275.958435058594, "epoch": 0.366, "grad_norm": 0.21394605647219725, "learning_rate": 7.995117992577928e-07, "loss": -0.0, "num_tokens": 22287155.0, "reward": 0.21673648804426193, "reward_std": 0.25519090704619884, "rewards/length_bonus_reward": 0.288177490234375, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.22621533274650574, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 5603.916748046875, "epoch": 0.368, "grad_norm": 0.17296152250799715, "learning_rate": 7.967094433018508e-07, "loss": 0.0, "num_tokens": 22426245.0, "reward": 0.5235324800014496, "reward_std": 0.12968424521386623, "rewards/length_bonus_reward": 0.2736612968146801, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.00025769323110580444, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 6018.3333740234375, "epoch": 0.37, "grad_norm": 0.3870567901133952, "learning_rate": 7.938926261462365e-07, "loss": 0.0, "num_tokens": 22575215.0, "reward": 0.6659985780715942, "reward_std": 0.29820241779088974, "rewards/length_bonus_reward": 0.2940673828125, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.07719573751091957, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 4719.958435058594, "epoch": 0.372, "grad_norm": 0.6357949798633356, "learning_rate": 7.910614850786447e-07, "loss": 0.0, "num_tokens": 22693390.0, "reward": 0.5820906460285187, "reward_std": 0.4543531946837902, "rewards/length_bonus_reward": 0.2707417830824852, "rewards/simple_accuracy_reward": 0.2916666716337204, "rewards/simple_cosine_scaled_reward": 0.03936433978378773, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 5690.83349609375, "epoch": 0.374, "grad_norm": 0.3426836804988886, "learning_rate": 7.882161580848966e-07, "loss": 0.0, "num_tokens": 22835004.0, "reward": 0.5111007839441299, "reward_std": 0.26339464634656906, "rewards/length_bonus_reward": 0.3658548966050148, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.042841602116823196, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 5086.7083740234375, "epoch": 0.376, "grad_norm": 0.685300696545338, "learning_rate": 7.853567838422159e-07, "loss": 0.0, "num_tokens": 22961939.0, "reward": 0.44375865533947945, "reward_std": 0.4357121158391237, "rewards/length_bonus_reward": 0.2724609375, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.07407124992460012, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 5932.916687011719, "epoch": 0.378, "grad_norm": 0.30788544039110777, "learning_rate": 7.82483501712469e-07, "loss": 0.0, "num_tokens": 23111361.0, "reward": 0.6742663681507111, "reward_std": 0.2806823570281267, "rewards/length_bonus_reward": 0.3145345002412796, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.05279699061065912, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 5512.9168701171875, "epoch": 0.38, "grad_norm": 0.514237100779754, "learning_rate": 7.795964517353733e-07, "loss": 0.0, "num_tokens": 23248759.0, "reward": 0.5908548831939697, "reward_std": 0.5195170156657696, "rewards/length_bonus_reward": 0.398590087890625, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.032137109665200114, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 4732.583435058594, "epoch": 0.382, "grad_norm": 0.6885648738863401, "learning_rate": 7.76695774621672e-07, "loss": 0.0, "num_tokens": 23366667.0, "reward": 0.7006201185286045, "reward_std": 0.6177800856530666, "rewards/length_bonus_reward": 0.274078369140625, "rewards/simple_accuracy_reward": 0.3750000111758709, "rewards/simple_cosine_scaled_reward": 0.10308345593512058, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 5399.7501220703125, "epoch": 0.384, "grad_norm": 0.4384703400226804, "learning_rate": 7.737816117462751e-07, "loss": -0.0, "num_tokens": 23501157.0, "reward": 0.19508541002869606, "reward_std": 0.3080579899251461, "rewards/length_bonus_reward": 0.2828369140625, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.2588363289833069, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 5820.9168701171875, "epoch": 0.386, "grad_norm": 0.4420580643898115, "learning_rate": 7.7085410514137e-07, "loss": -0.0, "num_tokens": 23646835.0, "reward": 0.31540603935718536, "reward_std": 0.39639159105718136, "rewards/length_bonus_reward": 0.3123575896024704, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.16056976071558893, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 5582.666748046875, "epoch": 0.388, "grad_norm": 0.16992209486588547, "learning_rate": 7.679133974894982e-07, "loss": 0.0, "num_tokens": 23785367.0, "reward": 0.5157365961931646, "reward_std": 0.12315243063494563, "rewards/length_bonus_reward": 0.2987772673368454, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.0660814456641674, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 6931.541748046875, "epoch": 0.39, "grad_norm": 0.38336158710235635, "learning_rate": 7.649596321166024e-07, "loss": 0.0, "num_tokens": 23957118.0, "reward": 0.5662633236497641, "reward_std": 0.4414535705000162, "rewards/length_bonus_reward": 0.2793782576918602, "rewards/simple_accuracy_reward": 0.2916666716337204, "rewards/simple_cosine_scaled_reward": -0.00956324115395546, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 6237.4583740234375, "epoch": 0.392, "grad_norm": 0.08647637195332397, "learning_rate": 7.619929529850396e-07, "loss": 0.0, "num_tokens": 24111557.0, "reward": 0.13455674424767494, "reward_std": 0.10808053985238075, "rewards/length_bonus_reward": 0.2520243301987648, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2349351868033409, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 6364.0833740234375, "epoch": 0.394, "grad_norm": 0.24073380359845703, "learning_rate": 7.590135046865651e-07, "loss": -0.0, "num_tokens": 24270607.0, "reward": 0.5693695358932018, "reward_std": 0.2630258537828922, "rewards/length_bonus_reward": 0.282867431640625, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": -0.010329093784093857, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 6807.25, "epoch": 0.396, "grad_norm": 0.26686632919056336, "learning_rate": 7.560214324352858e-07, "loss": 0.0, "num_tokens": 24439315.0, "reward": 0.21294130198657513, "reward_std": 0.28032930940389633, "rewards/length_bonus_reward": 0.2626444473862648, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.18273964896798134, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 5138.708435058594, "epoch": 0.398, "grad_norm": 0.4643651466467595, "learning_rate": 7.530168820605818e-07, "loss": -0.0, "num_tokens": 24570456.0, "reward": 0.7427989952266216, "reward_std": 0.4500264450907707, "rewards/length_bonus_reward": 0.2869364395737648, "rewards/simple_accuracy_reward": 0.3750000037252903, "rewards/simple_cosine_scaled_reward": 0.16172509267926216, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 5998.08349609375, "epoch": 0.4, "grad_norm": 0.39271587631739674, "learning_rate": 7.5e-07, "loss": -0.0, "num_tokens": 24719126.0, "reward": 0.9221587106585503, "reward_std": 0.51909014955163, "rewards/length_bonus_reward": 0.3116251677274704, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.2210671305656433, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 6356.8333740234375, "epoch": 0.402, "grad_norm": 0.3302375875514004, "learning_rate": 7.469709332921154e-07, "loss": 0.0, "num_tokens": 24876112.0, "reward": 0.5024892129004002, "reward_std": 0.3764161616563797, "rewards/length_bonus_reward": 0.26025390625, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": -0.015529431402683258, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 4985.458435058594, "epoch": 0.404, "grad_norm": 0.39625488034000617, "learning_rate": 7.439298295693663e-07, "loss": 0.0, "num_tokens": 25001199.0, "reward": 0.7364761047065258, "reward_std": 0.3395494967699051, "rewards/length_bonus_reward": 0.26480102725327015, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.19335006177425385, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 5478.916809082031, "epoch": 0.406, "grad_norm": 0.4490988236290001, "learning_rate": 7.408768370508576e-07, "loss": 0.0, "num_tokens": 25137379.0, "reward": 1.0774662643671036, "reward_std": 0.4881181884557009, "rewards/length_bonus_reward": 0.3063252754509449, "rewards/simple_accuracy_reward": 0.5833333432674408, "rewards/simple_cosine_scaled_reward": 0.3756152391433716, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 4849.833435058594, "epoch": 0.408, "grad_norm": 0.4926697118228306, "learning_rate": 7.378121045351377e-07, "loss": 0.0, "num_tokens": 25261947.0, "reward": 0.6673022173345089, "reward_std": 0.3426666799932718, "rewards/length_bonus_reward": 0.2885640412569046, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.09080958552658558, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 5064.2501220703125, "epoch": 0.41, "grad_norm": 0.5146876729807107, "learning_rate": 7.347357813929454e-07, "loss": 0.0, "num_tokens": 25388391.0, "reward": 1.1500621438026428, "reward_std": 0.6760335564613342, "rewards/length_bonus_reward": 0.3511759415268898, "rewards/simple_accuracy_reward": 0.6249999925494194, "rewards/simple_cosine_scaled_reward": 0.3477722704410553, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 5602.333435058594, "epoch": 0.412, "grad_norm": 0.2705956926574883, "learning_rate": 7.316480175599308e-07, "loss": 0.0, "num_tokens": 25527341.0, "reward": 0.6998628675937653, "reward_std": 0.31950316205620766, "rewards/length_bonus_reward": 0.3140462264418602, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.10496655106544495, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 5066.750061035156, "epoch": 0.414, "grad_norm": 0.43805159750345013, "learning_rate": 7.285489635293471e-07, "loss": 0.0, "num_tokens": 25653197.0, "reward": 1.007905375212431, "reward_std": 0.43036267161369324, "rewards/length_bonus_reward": 0.2590840645134449, "rewards/simple_accuracy_reward": 0.5833333283662796, "rewards/simple_cosine_scaled_reward": 0.3309759125113487, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 5905.291687011719, "epoch": 0.416, "grad_norm": 0.3918914245265233, "learning_rate": 7.254387703447153e-07, "loss": 0.0, "num_tokens": 25799154.0, "reward": 0.9878321476280689, "reward_std": 0.45452022552490234, "rewards/length_bonus_reward": 0.290252685546875, "rewards/simple_accuracy_reward": 0.5416666641831398, "rewards/simple_cosine_scaled_reward": 0.3118254579603672, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 5084.6251220703125, "epoch": 0.418, "grad_norm": 0.6252950453339188, "learning_rate": 7.223175895924637e-07, "loss": 0.0, "num_tokens": 25925907.0, "reward": 0.6921605467796326, "reward_std": 0.4960187803953886, "rewards/length_bonus_reward": 0.3101806640625, "rewards/simple_accuracy_reward": 0.3333333283662796, "rewards/simple_cosine_scaled_reward": 0.09729304909706116, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 4870.291748046875, "epoch": 0.42, "grad_norm": 0.1782130561717364, "learning_rate": 7.191855733945386e-07, "loss": 0.0, "num_tokens": 26047420.0, "reward": 0.4822753146290779, "reward_std": 0.14442711509764194, "rewards/length_bonus_reward": 0.22433471493422985, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.015881139785051346, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 6521.125, "epoch": 0.422, "grad_norm": 0.13663959910747725, "learning_rate": 7.160428744009912e-07, "loss": -0.0, "num_tokens": 26208565.0, "reward": 0.557738333940506, "reward_std": 0.12792637012898922, "rewards/length_bonus_reward": 0.2866617813706398, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.04215311259031296, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 5762.6668701171875, "epoch": 0.424, "grad_norm": 0.44144505473842344, "learning_rate": 7.128896457825363e-07, "loss": 0.0, "num_tokens": 26350739.0, "reward": 0.9027397520840168, "reward_std": 0.4887677412480116, "rewards/length_bonus_reward": 0.3296712189912796, "rewards/simple_accuracy_reward": 0.4583333395421505, "rewards/simple_cosine_scaled_reward": 0.22947034053504467, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 6867.666748046875, "epoch": 0.426, "grad_norm": 0.0935585776780699, "learning_rate": 7.097260412230885e-07, "loss": 0.0, "num_tokens": 26520225.0, "reward": 0.19511063024401665, "reward_std": 0.13539960980415344, "rewards/length_bonus_reward": 0.3123067244887352, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2343921847641468, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 6893.2083740234375, "epoch": 0.428, "grad_norm": 0.36935925981287154, "learning_rate": 7.065522149122709e-07, "loss": 0.0, "num_tokens": 26690528.0, "reward": 0.3083217702805996, "reward_std": 0.29994075559079647, "rewards/length_bonus_reward": 0.2695617750287056, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.172480009496212, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 5943.541687011719, "epoch": 0.43, "grad_norm": 0.099839458597204, "learning_rate": 7.033683215379002e-07, "loss": 0.0, "num_tokens": 26838405.0, "reward": 0.09028925281018019, "reward_std": 0.11490976437926292, "rewards/length_bonus_reward": 0.2102050706744194, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.23983165621757507, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 6176.291748046875, "epoch": 0.432, "grad_norm": 0.34836534658860613, "learning_rate": 7.001745162784475e-07, "loss": -0.0, "num_tokens": 26991058.0, "reward": 0.3512500301003456, "reward_std": 0.31561783142387867, "rewards/length_bonus_reward": 0.3544514924287796, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.17306959442794323, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 6849.0833740234375, "epoch": 0.434, "grad_norm": 0.22107602401343449, "learning_rate": 6.969709547954755e-07, "loss": 0.0, "num_tokens": 27162936.0, "reward": 0.2311139479279518, "reward_std": 0.22150146216154099, "rewards/length_bonus_reward": 0.302093505859375, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.22529246471822262, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 6116.041809082031, "epoch": 0.436, "grad_norm": 0.308473772625637, "learning_rate": 6.937577932260514e-07, "loss": 0.0, "num_tokens": 27314569.0, "reward": 0.5592161864042282, "reward_std": 0.285840954631567, "rewards/length_bonus_reward": 0.2461446113884449, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.04280982166528702, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 5767.58349609375, "epoch": 0.438, "grad_norm": 0.46634047108568727, "learning_rate": 6.905351881751371e-07, "loss": -0.0, "num_tokens": 27459525.0, "reward": 0.45933352038264275, "reward_std": 0.45212368480861187, "rewards/length_bonus_reward": 0.3392537385225296, "rewards/simple_accuracy_reward": 0.1666666679084301, "rewards/simple_cosine_scaled_reward": -0.09317377582192421, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 6066.291748046875, "epoch": 0.44, "grad_norm": 0.40274133517008487, "learning_rate": 6.87303296707956e-07, "loss": 0.0, "num_tokens": 27609616.0, "reward": 0.4267122521996498, "reward_std": 0.442441213876009, "rewards/length_bonus_reward": 0.3011067733168602, "rewards/simple_accuracy_reward": 0.1666666679084301, "rewards/simple_cosine_scaled_reward": -0.08212240785360336, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 4864.958435058594, "epoch": 0.442, "grad_norm": 0.3980260918998189, "learning_rate": 6.840622763423391e-07, "loss": 0.0, "num_tokens": 27730377.0, "reward": 0.7636679038405418, "reward_std": 0.34112141840159893, "rewards/length_bonus_reward": 0.3556722030043602, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.14932471700012684, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 6312.7501220703125, "epoch": 0.444, "grad_norm": 0.4487997990473496, "learning_rate": 6.80812285041046e-07, "loss": 0.0, "num_tokens": 27886869.0, "reward": 0.6309241242706776, "reward_std": 0.4285028986632824, "rewards/length_bonus_reward": 0.2964782789349556, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.08555836975574493, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 6391.791748046875, "epoch": 0.446, "grad_norm": 0.084040746217533, "learning_rate": 6.775534812040686e-07, "loss": 0.0, "num_tokens": 28046092.0, "reward": 0.23539214581251144, "reward_std": 0.10702459514141083, "rewards/length_bonus_reward": 0.3339131698012352, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.1970420517027378, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 5908.625244140625, "epoch": 0.448, "grad_norm": 0.34158166144182245, "learning_rate": 6.742860236609076e-07, "loss": 0.0, "num_tokens": 28193677.0, "reward": 0.6640284210443497, "reward_std": 0.41652693785727024, "rewards/length_bonus_reward": 0.37371826171875, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": -0.0027130371890962124, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 6186.541748046875, "epoch": 0.45, "grad_norm": 0.24673599488632164, "learning_rate": 6.710100716628344e-07, "loss": -0.0, "num_tokens": 28347524.0, "reward": 0.7887777108699083, "reward_std": 0.28049985133111477, "rewards/length_bonus_reward": 0.2762552872300148, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.19171153008937836, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 6814.9583740234375, "epoch": 0.452, "grad_norm": 0.33808646305556433, "learning_rate": 6.677257848751276e-07, "loss": 0.0, "num_tokens": 28516795.0, "reward": 0.4205792974680662, "reward_std": 0.33256477676331997, "rewards/length_bonus_reward": 0.30633544921875, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.10484564304351807, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 4793.625061035156, "epoch": 0.454, "grad_norm": 0.3359560066479912, "learning_rate": 6.644333233692916e-07, "loss": 0.0, "num_tokens": 28643200.0, "reward": 0.675881564617157, "reward_std": 0.28366177156567574, "rewards/length_bonus_reward": 0.20240275282412767, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.11362426728010178, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 4973.5833740234375, "epoch": 0.456, "grad_norm": 0.37162327067369355, "learning_rate": 6.611328476152556e-07, "loss": 0.0, "num_tokens": 28767522.0, "reward": 0.6579143106937408, "reward_std": 0.2909209839999676, "rewards/length_bonus_reward": 0.3355814591050148, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.061332338489592075, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 5731.0833740234375, "epoch": 0.458, "grad_norm": 0.2719535802343545, "learning_rate": 6.578245184735512e-07, "loss": -0.0, "num_tokens": 28909694.0, "reward": 0.49097292870283127, "reward_std": 0.2251973357051611, "rewards/length_bonus_reward": 0.2810974083840847, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": 0.003084380179643631, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 5884.9168701171875, "epoch": 0.46, "grad_norm": 0.38352482241358243, "learning_rate": 6.545084971874736e-07, "loss": 0.0, "num_tokens": 29055024.0, "reward": 0.34275880828499794, "reward_std": 0.33315639570355415, "rewards/length_bonus_reward": 0.3484598770737648, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.17806881852447987, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 7102.9583740234375, "epoch": 0.462, "grad_norm": 0.29054851332792586, "learning_rate": 6.511849453752223e-07, "loss": 0.0, "num_tokens": 29231435.0, "reward": 0.20212747901678085, "reward_std": 0.2622851338237524, "rewards/length_bonus_reward": 0.26177978515625, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.20263796485960484, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 6421.8751220703125, "epoch": 0.464, "grad_norm": 0.2936720747082485, "learning_rate": 6.478540250220233e-07, "loss": 0.0, "num_tokens": 29389928.0, "reward": 0.2929835729300976, "reward_std": 0.3243982084095478, "rewards/length_bonus_reward": 0.2934366837143898, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.1675729244016111, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 6474.70849609375, "epoch": 0.466, "grad_norm": 0.5468067402319371, "learning_rate": 6.445158984722358e-07, "loss": 0.0, "num_tokens": 29551009.0, "reward": 0.49080006405711174, "reward_std": 0.4925340497866273, "rewards/length_bonus_reward": 0.2909138947725296, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.016894358675926924, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 5778.5, "epoch": 0.468, "grad_norm": 0.38202497508786476, "learning_rate": 6.411707284214383e-07, "loss": 0.0, "num_tokens": 29693875.0, "reward": 0.6461177580058575, "reward_std": 0.2976618520915508, "rewards/length_bonus_reward": 0.2710266076028347, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.08351560309529305, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 6209.5001220703125, "epoch": 0.47, "grad_norm": 0.36714178364476335, "learning_rate": 6.378186779084995e-07, "loss": 0.0, "num_tokens": 29849071.0, "reward": 0.48788801953196526, "reward_std": 0.41367568634450436, "rewards/length_bonus_reward": 0.2918396070599556, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.024569887667894363, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 5789.7918701171875, "epoch": 0.472, "grad_norm": 0.27010566927930046, "learning_rate": 6.344599103076328e-07, "loss": 0.0, "num_tokens": 29991878.0, "reward": 0.6113441474735737, "reward_std": 0.27947698533535004, "rewards/length_bonus_reward": 0.3051961287856102, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.028962641954421997, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 5859.166748046875, "epoch": 0.474, "grad_norm": 0.26230599822377354, "learning_rate": 6.310945893204324e-07, "loss": 0.0, "num_tokens": 30137046.0, "reward": 0.4976905323565006, "reward_std": 0.2611602023243904, "rewards/length_bonus_reward": 0.3704935684800148, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.07893943600356579, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 6919.9583740234375, "epoch": 0.476, "grad_norm": 0.09364623859423737, "learning_rate": 6.277228789678953e-07, "loss": 0.0, "num_tokens": 30308051.0, "reward": 0.1442133877426386, "reward_std": 0.12416775710880756, "rewards/length_bonus_reward": 0.2706502303481102, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2528736926615238, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 6493.6251220703125, "epoch": 0.478, "grad_norm": 0.4680094615389185, "learning_rate": 6.243449435824276e-07, "loss": 0.0, "num_tokens": 30469022.0, "reward": 0.3388434946537018, "reward_std": 0.4082567673176527, "rewards/length_bonus_reward": 0.2917683869600296, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.15584981068968773, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 5439.4583740234375, "epoch": 0.48, "grad_norm": 0.251875799480019, "learning_rate": 6.209609477998338e-07, "loss": -0.0, "num_tokens": 30604459.0, "reward": 0.5090580992400646, "reward_std": 0.26186372339725494, "rewards/length_bonus_reward": 0.33538818359375, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.06932682916522026, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 5979.583435058594, "epoch": 0.482, "grad_norm": 0.35080720114750596, "learning_rate": 6.17571056551295e-07, "loss": -0.0, "num_tokens": 30755697.0, "reward": 0.60244544968009, "reward_std": 0.4626775663346052, "rewards/length_bonus_reward": 0.2941385917365551, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.03328037774190307, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 5712.041748046875, "epoch": 0.484, "grad_norm": 0.2430228503499223, "learning_rate": 6.141754350553279e-07, "loss": 0.0, "num_tokens": 30898534.0, "reward": 0.6121168695390224, "reward_std": 0.2525808997452259, "rewards/length_bonus_reward": 0.2706197127699852, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.09966093767434359, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 5440.125183105469, "epoch": 0.486, "grad_norm": 0.3734435481337618, "learning_rate": 6.107742488097338e-07, "loss": 0.0, "num_tokens": 31042321.0, "reward": 1.002666674554348, "reward_std": 0.4391897153109312, "rewards/length_bonus_reward": 0.2821858711540699, "rewards/simple_accuracy_reward": 0.5833333283662796, "rewards/simple_cosine_scaled_reward": 0.2742948643863201, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 7066.375, "epoch": 0.488, "grad_norm": 0.3127260090685328, "learning_rate": 6.073676635835316e-07, "loss": 0.0, "num_tokens": 31217698.0, "reward": 0.34211307018995285, "reward_std": 0.2728648092597723, "rewards/length_bonus_reward": 0.2604166641831398, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.0866072028875351, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 5347.375, "epoch": 0.49, "grad_norm": 0.44654551837841444, "learning_rate": 6.039558454088795e-07, "loss": 0.0, "num_tokens": 31351159.0, "reward": 0.6874539256095886, "reward_std": 0.27767086401581764, "rewards/length_bonus_reward": 0.2631428986787796, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.09862199798226357, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 7057.5, "epoch": 0.492, "grad_norm": 0.08388843390155762, "learning_rate": 6.005389605729824e-07, "loss": 0.0, "num_tokens": 31527247.0, "reward": 0.13521139696240425, "reward_std": 0.11096891947090626, "rewards/length_bonus_reward": 0.2703247033059597, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2702266164124012, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 5908.166748046875, "epoch": 0.494, "grad_norm": 0.4719940668037122, "learning_rate": 5.97117175609986e-07, "loss": 0.0, "num_tokens": 31675751.0, "reward": 0.7461047023534775, "reward_std": 0.46879130229353905, "rewards/length_bonus_reward": 0.359344482421875, "rewards/simple_accuracy_reward": 0.3333333432674408, "rewards/simple_cosine_scaled_reward": 0.10685371980071068, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 6048.70849609375, "epoch": 0.496, "grad_norm": 0.34728338219194127, "learning_rate": 5.936906572928624e-07, "loss": 0.0, "num_tokens": 31828354.0, "reward": 0.8028898388147354, "reward_std": 0.30233910121023655, "rewards/length_bonus_reward": 0.339691162109375, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.17639729753136635, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 6092.2083740234375, "epoch": 0.498, "grad_norm": 0.36784028419938025, "learning_rate": 5.9025957262528e-07, "loss": 0.0, "num_tokens": 31979355.0, "reward": 0.44946896471083164, "reward_std": 0.2826632931828499, "rewards/length_bonus_reward": 0.3204142227768898, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.07522392645478249, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 5828.2501220703125, "epoch": 0.5, "grad_norm": 0.2272752053647502, "learning_rate": 5.868240888334652e-07, "loss": 0.0, "num_tokens": 32124747.0, "reward": 0.8542748428881168, "reward_std": 0.26021175272762775, "rewards/length_bonus_reward": 0.2898864708840847, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.2121100313961506, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 5716.166748046875, "epoch": 0.502, "grad_norm": 0.08995777868910086, "learning_rate": 5.833843733580512e-07, "loss": 0.0, "num_tokens": 32266951.0, "reward": 0.5449081733822823, "reward_std": 0.10964064672589302, "rewards/length_bonus_reward": 0.3155009001493454, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.041185494512319565, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 6286.375244140625, "epoch": 0.504, "grad_norm": 0.37716860107544853, "learning_rate": 5.799405938459174e-07, "loss": 0.0, "num_tokens": 32424418.0, "reward": 0.3631526976823807, "reward_std": 0.4273594953119755, "rewards/length_bonus_reward": 0.2847391739487648, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.09317299537360668, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 4373.95849609375, "epoch": 0.506, "grad_norm": 0.6399058923341275, "learning_rate": 5.764929181420191e-07, "loss": 0.0, "num_tokens": 32535597.0, "reward": 0.6569446139037609, "reward_std": 0.5255965404212475, "rewards/length_bonus_reward": 0.3041788712143898, "rewards/simple_accuracy_reward": 0.3333333432674408, "rewards/simple_cosine_scaled_reward": 0.03886473923921585, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 5639.666748046875, "epoch": 0.508, "grad_norm": 0.46518449138354206, "learning_rate": 5.730415142812058e-07, "loss": 0.0, "num_tokens": 32679799.0, "reward": 0.34347977861762047, "reward_std": 0.29888685792684555, "rewards/length_bonus_reward": 0.3486328125, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.17697273567318916, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 5969.916748046875, "epoch": 0.51, "grad_norm": 0.36444567424476293, "learning_rate": 5.695865504800327e-07, "loss": -0.0, "num_tokens": 32828411.0, "reward": 0.35172041691839695, "reward_std": 0.3834228776395321, "rewards/length_bonus_reward": 0.3067626878619194, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.1600845344364643, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 6693.166748046875, "epoch": 0.512, "grad_norm": 0.06883944427420127, "learning_rate": 5.661281951285612e-07, "loss": 0.0, "num_tokens": 32993691.0, "reward": 0.11990128085017204, "reward_std": 0.10477480664849281, "rewards/length_bonus_reward": 0.29119873046875, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.3425949029624462, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 5471.500183105469, "epoch": 0.514, "grad_norm": 0.4698799625654922, "learning_rate": 5.626666167821521e-07, "loss": 0.0, "num_tokens": 33129441.0, "reward": 0.7356847263872623, "reward_std": 0.5883271172642708, "rewards/length_bonus_reward": 0.2898864708840847, "rewards/simple_accuracy_reward": 0.3749999962747097, "rewards/simple_cosine_scaled_reward": 0.14159645326435566, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 5603.916748046875, "epoch": 0.516, "grad_norm": 0.6372408073427658, "learning_rate": 5.592019841532506e-07, "loss": 0.0, "num_tokens": 33268849.0, "reward": 0.48433225601911545, "reward_std": 0.5746349208056927, "rewards/length_bonus_reward": 0.278961181640625, "rewards/simple_accuracy_reward": 0.2500000037252903, "rewards/simple_cosine_scaled_reward": -0.08925788477063179, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 6788.5833740234375, "epoch": 0.518, "grad_norm": 0.3583173360279415, "learning_rate": 5.557344661031627e-07, "loss": 0.0, "num_tokens": 33439149.0, "reward": 0.6427120286971331, "reward_std": 0.4290166413411498, "rewards/length_bonus_reward": 0.3094787523150444, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.08313319459557533, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 5861.041748046875, "epoch": 0.52, "grad_norm": 0.5420124270791071, "learning_rate": 5.522642316338268e-07, "loss": -0.0, "num_tokens": 33583978.0, "reward": 1.0841117016971111, "reward_std": 0.6177413519471884, "rewards/length_bonus_reward": 0.3844502717256546, "rewards/simple_accuracy_reward": 0.5416666716337204, "rewards/simple_cosine_scaled_reward": 0.31598953530192375, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 5742.25, "epoch": 0.522, "grad_norm": 0.40550161638979687, "learning_rate": 5.487914498795747e-07, "loss": 0.0, "num_tokens": 33726526.0, "reward": 0.43814731016755104, "reward_std": 0.4421003982424736, "rewards/length_bonus_reward": 0.329986572265625, "rewards/simple_accuracy_reward": 0.1666666679084301, "rewards/simple_cosine_scaled_reward": -0.1170118860900402, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 5248.08349609375, "epoch": 0.524, "grad_norm": 0.15866334939214882, "learning_rate": 5.453162900988901e-07, "loss": 0.0, "num_tokens": 33856788.0, "reward": 0.9440031796693802, "reward_std": 0.14353657886385918, "rewards/length_bonus_reward": 0.3267822265625, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.2344418242573738, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 6261.416748046875, "epoch": 0.526, "grad_norm": 0.09627742130250258, "learning_rate": 5.418389216661578e-07, "loss": -0.0, "num_tokens": 34012168.0, "reward": 0.17722006887197495, "reward_std": 0.12740055657923222, "rewards/length_bonus_reward": 0.2857767716050148, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2171134240925312, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 5645.416809082031, "epoch": 0.528, "grad_norm": 0.18713339154683045, "learning_rate": 5.383595140634093e-07, "loss": 0.0, "num_tokens": 34155002.0, "reward": 0.2151702716946602, "reward_std": 0.12574042938649654, "rewards/length_bonus_reward": 0.3260599821805954, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.22177942469716072, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 6087.125061035156, "epoch": 0.53, "grad_norm": 0.07513523767747096, "learning_rate": 5.348782368720625e-07, "loss": 0.0, "num_tokens": 34305863.0, "reward": 0.5671689193695784, "reward_std": 0.10309114493429661, "rewards/length_bonus_reward": 0.3202718049287796, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.0062058232724666595, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 5676.7501220703125, "epoch": 0.532, "grad_norm": 0.10715786203850339, "learning_rate": 5.313952597646567e-07, "loss": 0.0, "num_tokens": 34446509.0, "reward": 0.6420413982123137, "reward_std": 0.1284679565578699, "rewards/length_bonus_reward": 0.3580627515912056, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.06795727461576462, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 5760.041748046875, "epoch": 0.534, "grad_norm": 0.40118294132394544, "learning_rate": 5.27910752496582e-07, "loss": -0.0, "num_tokens": 34589472.0, "reward": 0.7105108462274075, "reward_std": 0.4465402476489544, "rewards/length_bonus_reward": 0.3481343537569046, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.05808631330728531, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 5652.58349609375, "epoch": 0.536, "grad_norm": 0.23158256672410046, "learning_rate": 5.244248848978067e-07, "loss": 0.0, "num_tokens": 34729724.0, "reward": 1.2597385756671429, "reward_std": 0.2806292325258255, "rewards/length_bonus_reward": 0.3112589493393898, "rewards/simple_accuracy_reward": 0.7083333283662796, "rewards/simple_cosine_scaled_reward": 0.48029251024127007, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 4935.20849609375, "epoch": 0.538, "grad_norm": 0.2794276081448701, "learning_rate": 5.209378268645997e-07, "loss": 0.0, "num_tokens": 34856167.0, "reward": 1.2506853565573692, "reward_std": 0.3196232467889786, "rewards/length_bonus_reward": 0.3200378380715847, "rewards/simple_accuracy_reward": 0.7083333283662796, "rewards/simple_cosine_scaled_reward": 0.4446282461285591, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 5132.000061035156, "epoch": 0.54, "grad_norm": 0.5765910034156457, "learning_rate": 5.174497483512505e-07, "loss": 0.0, "num_tokens": 34983901.0, "reward": 0.5522802993655205, "reward_std": 0.5147455967962742, "rewards/length_bonus_reward": 0.3358052521944046, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.06704995594918728, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 6288.3333740234375, "epoch": 0.542, "grad_norm": 0.304600033316691, "learning_rate": 5.139608193617844e-07, "loss": 0.0, "num_tokens": 35139921.0, "reward": 0.6503848358988762, "reward_std": 0.25639556907117367, "rewards/length_bonus_reward": 0.3030497208237648, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.11133686639368534, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 4935.083435058594, "epoch": 0.544, "grad_norm": 0.3254158130275431, "learning_rate": 5.104712099416785e-07, "loss": 0.0, "num_tokens": 35263019.0, "reward": 0.9292025864124298, "reward_std": 0.2736637778580189, "rewards/length_bonus_reward": 0.3749593123793602, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.19181983917951584, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 4887.666748046875, "epoch": 0.546, "grad_norm": 0.3447492382728543, "learning_rate": 5.069810901695727e-07, "loss": -0.0, "num_tokens": 35385015.0, "reward": 0.4783024489879608, "reward_std": 0.27098657563328743, "rewards/length_bonus_reward": 0.2557983435690403, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": 0.02834159880876541, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 6736.5833740234375, "epoch": 0.548, "grad_norm": 0.38522324142610603, "learning_rate": 5.034906301489807e-07, "loss": 0.0, "num_tokens": 35550797.0, "reward": 0.367171972990036, "reward_std": 0.4238415826112032, "rewards/length_bonus_reward": 0.3110453262925148, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.13774673640727997, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 6744.45849609375, "epoch": 0.55, "grad_norm": 0.2791458943822961, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 35719414.0, "reward": 0.44616804271936417, "reward_std": 0.2931462060660124, "rewards/length_bonus_reward": 0.3021036759018898, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.04520463012158871, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 6220.541748046875, "epoch": 0.552, "grad_norm": 0.4216287737664356, "learning_rate": 4.965093698510192e-07, "loss": 0.0, "num_tokens": 35876219.0, "reward": 0.615056499838829, "reward_std": 0.47635061852633953, "rewards/length_bonus_reward": 0.3310546875, "rewards/simple_accuracy_reward": 0.2916666716337204, "rewards/simple_cosine_scaled_reward": -0.015329722315073013, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 5501.8333740234375, "epoch": 0.554, "grad_norm": 0.6185613904155661, "learning_rate": 4.930189098304274e-07, "loss": 0.0, "num_tokens": 36013003.0, "reward": 0.6840997934341431, "reward_std": 0.6881885379552841, "rewards/length_bonus_reward": 0.2931416779756546, "rewards/simple_accuracy_reward": 0.3333333432674408, "rewards/simple_cosine_scaled_reward": 0.1152495089918375, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 5165.6251220703125, "epoch": 0.556, "grad_norm": 0.5673313068321398, "learning_rate": 4.895287900583216e-07, "loss": 0.0, "num_tokens": 36143056.0, "reward": 0.629047304391861, "reward_std": 0.6504702866077423, "rewards/length_bonus_reward": 0.3357442244887352, "rewards/simple_accuracy_reward": 0.2916666753590107, "rewards/simple_cosine_scaled_reward": 0.003272791625931859, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 6263.2501220703125, "epoch": 0.558, "grad_norm": 0.23371435361976153, "learning_rate": 4.860391806382156e-07, "loss": 0.0, "num_tokens": 36299506.0, "reward": 0.224880775436759, "reward_std": 0.2405831404030323, "rewards/length_bonus_reward": 0.2944234162569046, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.2224186323583126, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 7080.2083740234375, "epoch": 0.56, "grad_norm": 0.3221563290076954, "learning_rate": 4.825502516487496e-07, "loss": 0.0, "num_tokens": 36473799.0, "reward": 0.35034242272377014, "reward_std": 0.2945863679051399, "rewards/length_bonus_reward": 0.2700602188706398, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.08943561464548111, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 6118.916748046875, "epoch": 0.562, "grad_norm": 0.2830535076801935, "learning_rate": 4.790621731354002e-07, "loss": -0.0, "num_tokens": 36628201.0, "reward": 0.47960783168673515, "reward_std": 0.27905278466641903, "rewards/length_bonus_reward": 0.323486328125, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.021090317517518997, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 7049.0833740234375, "epoch": 0.564, "grad_norm": 0.20695190947211628, "learning_rate": 4.7557511510219335e-07, "loss": 0.0, "num_tokens": 36804795.0, "reward": 0.19611439853906631, "reward_std": 0.24950358644127846, "rewards/length_bonus_reward": 0.2604166641831398, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.21193786337971687, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 5312.666748046875, "epoch": 0.566, "grad_norm": 0.15672363511273524, "learning_rate": 4.7208924750341805e-07, "loss": -0.0, "num_tokens": 36936667.0, "reward": 0.5677157491445541, "reward_std": 0.1654718667268753, "rewards/length_bonus_reward": 0.2886555977165699, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.05812034755945206, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 6589.83349609375, "epoch": 0.568, "grad_norm": 0.4039679954399733, "learning_rate": 4.686047402353433e-07, "loss": 0.0, "num_tokens": 37100667.0, "reward": 0.3722701594233513, "reward_std": 0.43717330135405064, "rewards/length_bonus_reward": 0.307281494140625, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.12002268992364407, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 5581.3751220703125, "epoch": 0.57, "grad_norm": 0.4353770688098492, "learning_rate": 4.6512176312793735e-07, "loss": 0.0, "num_tokens": 37239186.0, "reward": 1.121002770960331, "reward_std": 0.5534261725842953, "rewards/length_bonus_reward": 0.33612060546875, "rewards/simple_accuracy_reward": 0.6249999962747097, "rewards/simple_cosine_scaled_reward": 0.31976425647735596, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 6525.916748046875, "epoch": 0.572, "grad_norm": 0.30687365860417826, "learning_rate": 4.6164048593659065e-07, "loss": 0.0, "num_tokens": 37400158.0, "reward": 0.4112445302307606, "reward_std": 0.29628824442625046, "rewards/length_bonus_reward": 0.2881062850356102, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.08705686032772064, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 5348.041748046875, "epoch": 0.574, "grad_norm": 0.47134650808342615, "learning_rate": 4.5816107833384233e-07, "loss": 0.0, "num_tokens": 37533563.0, "reward": 0.8489188514649868, "reward_std": 0.5346241891384125, "rewards/length_bonus_reward": 0.293701171875, "rewards/simple_accuracy_reward": 0.4583333358168602, "rewards/simple_cosine_scaled_reward": 0.19376858510077, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 6135.4583740234375, "epoch": 0.576, "grad_norm": 0.4515224657048974, "learning_rate": 4.5468370990110997e-07, "loss": 0.0, "num_tokens": 37688440.0, "reward": 0.37657709047198296, "reward_std": 0.3289579637348652, "rewards/length_bonus_reward": 0.2308858223259449, "rewards/simple_accuracy_reward": 0.1666666679084301, "rewards/simple_cosine_scaled_reward": -0.04195086006075144, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 5801.791748046875, "epoch": 0.578, "grad_norm": 0.2674959471596714, "learning_rate": 4.512085501204253e-07, "loss": 0.0, "num_tokens": 37832657.0, "reward": 0.6024268716573715, "reward_std": 0.2548858467489481, "rewards/length_bonus_reward": 0.3032735213637352, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.01497337594628334, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 6544.4583740234375, "epoch": 0.58, "grad_norm": 0.3763376950125901, "learning_rate": 4.477357683661733e-07, "loss": 0.0, "num_tokens": 37994074.0, "reward": 0.7827924974262714, "reward_std": 0.42396412789821625, "rewards/length_bonus_reward": 0.3350016251206398, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.1455817073583603, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 5227.6251220703125, "epoch": 0.582, "grad_norm": 0.3752673642446373, "learning_rate": 4.442655338968373e-07, "loss": 0.0, "num_tokens": 38126053.0, "reward": 0.4260019361972809, "reward_std": 0.3601754065603018, "rewards/length_bonus_reward": 0.2876688651740551, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.05666723661124706, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 6770.45849609375, "epoch": 0.584, "grad_norm": 0.22342656177733586, "learning_rate": 4.407980158467495e-07, "loss": 0.0, "num_tokens": 38293554.0, "reward": 0.6190893054008484, "reward_std": 0.23930539935827255, "rewards/length_bonus_reward": 0.3110860213637352, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.032673229929059744, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 5403.7501220703125, "epoch": 0.586, "grad_norm": 0.5488607969341849, "learning_rate": 4.3733338321784777e-07, "loss": -0.0, "num_tokens": 38433000.0, "reward": 0.2839606013149023, "reward_std": 0.3648219183087349, "rewards/length_bonus_reward": 0.2805684395134449, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.15988235361874104, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 5770.041748046875, "epoch": 0.588, "grad_norm": 0.38923667095583686, "learning_rate": 4.338718048714387e-07, "loss": 0.0, "num_tokens": 38576773.0, "reward": 0.6404517814517021, "reward_std": 0.38283276930451393, "rewards/length_bonus_reward": 0.3367309644818306, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.02410823106765747, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 6091.7083740234375, "epoch": 0.59, "grad_norm": 0.13292722287166242, "learning_rate": 4.304134495199674e-07, "loss": 0.0, "num_tokens": 38728116.0, "reward": 0.17447657883167267, "reward_std": 0.13100742921233177, "rewards/length_bonus_reward": 0.2865702286362648, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.22418733686208725, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 6130.33349609375, "epoch": 0.592, "grad_norm": 0.4695354838333748, "learning_rate": 4.2695848571879424e-07, "loss": 0.0, "num_tokens": 38880170.0, "reward": 0.9023044854402542, "reward_std": 0.6443986110389233, "rewards/length_bonus_reward": 0.3282267302274704, "rewards/simple_accuracy_reward": 0.4583333358168602, "rewards/simple_cosine_scaled_reward": 0.23148886673152447, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 7108.3333740234375, "epoch": 0.594, "grad_norm": 0.24614355937234383, "learning_rate": 4.23507081857981e-07, "loss": 0.0, "num_tokens": 39056986.0, "reward": 0.3041830491274595, "reward_std": 0.27574355341494083, "rewards/length_bonus_reward": 0.2604370042681694, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.16250790283083916, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 5634.9583740234375, "epoch": 0.596, "grad_norm": 0.10506365421681307, "learning_rate": 4.200594061540826e-07, "loss": 0.0, "num_tokens": 39201135.0, "reward": 0.15383774042129517, "reward_std": 0.08960865018889308, "rewards/length_bonus_reward": 0.2815551720559597, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2554348949342966, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 6493.3333740234375, "epoch": 0.598, "grad_norm": 0.07621589330782302, "learning_rate": 4.166156266419489e-07, "loss": -0.0, "num_tokens": 39362141.0, "reward": 0.16579487174749374, "reward_std": 0.09776275791227818, "rewards/length_bonus_reward": 0.2844136580824852, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.23723757080733776, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 5670.4168701171875, "epoch": 0.6, "grad_norm": 0.4229220881208931, "learning_rate": 4.131759111665348e-07, "loss": 0.0, "num_tokens": 39503295.0, "reward": 0.42372070997953415, "reward_std": 0.42675819620490074, "rewards/length_bonus_reward": 0.3646036833524704, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.1317659355700016, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 5854.5833740234375, "epoch": 0.602, "grad_norm": 0.3232064663889703, "learning_rate": 4.0974042737472005e-07, "loss": 0.0, "num_tokens": 39648725.0, "reward": 0.37608762085437775, "reward_std": 0.3316739797592163, "rewards/length_bonus_reward": 0.2988077774643898, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.0954403318464756, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 6393.041748046875, "epoch": 0.604, "grad_norm": 0.39980062192905513, "learning_rate": 4.0630934270713755e-07, "loss": 0.0, "num_tokens": 39807774.0, "reward": 0.6470168046653271, "reward_std": 0.4458403792232275, "rewards/length_bonus_reward": 0.3199462890625, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.07080765813589096, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 5521.4583740234375, "epoch": 0.606, "grad_norm": 0.31526426215649156, "learning_rate": 4.028828243900141e-07, "loss": 0.0, "num_tokens": 39945623.0, "reward": 0.22051254659891129, "reward_std": 0.2831824868917465, "rewards/length_bonus_reward": 0.2773640938103199, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.1970364348962903, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 6198.2083740234375, "epoch": 0.608, "grad_norm": 0.2540221302455636, "learning_rate": 3.9946103942701775e-07, "loss": 0.0, "num_tokens": 40101208.0, "reward": 0.288693655282259, "reward_std": 0.24944246001541615, "rewards/length_bonus_reward": 0.3511454276740551, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.20823688991367817, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 5765.7918701171875, "epoch": 0.61, "grad_norm": 0.3559736554817662, "learning_rate": 3.960441545911204e-07, "loss": 0.0, "num_tokens": 40244921.0, "reward": 0.682052455842495, "reward_std": 0.34663386829197407, "rewards/length_bonus_reward": 0.3061319962143898, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.0851741973310709, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 6272.4169921875, "epoch": 0.612, "grad_norm": 0.4649780854539724, "learning_rate": 3.9263233641646836e-07, "loss": 0.0, "num_tokens": 40399839.0, "reward": 0.7973469570279121, "reward_std": 0.6144492533057928, "rewards/length_bonus_reward": 0.3437906950712204, "rewards/simple_accuracy_reward": 0.3749999962747097, "rewards/simple_cosine_scaled_reward": 0.15711251716129482, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 6654.791748046875, "epoch": 0.614, "grad_norm": 0.06269821687239233, "learning_rate": 3.8922575119026635e-07, "loss": 0.0, "num_tokens": 40569754.0, "reward": 0.5112118367105722, "reward_std": 0.08813247829675674, "rewards/length_bonus_reward": 0.2808736115694046, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.039323605597019196, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 5573.041748046875, "epoch": 0.616, "grad_norm": 0.14549541196422144, "learning_rate": 3.8582456494467206e-07, "loss": 0.0, "num_tokens": 40710665.0, "reward": 0.24084912985563278, "reward_std": 0.14995857141911983, "rewards/length_bonus_reward": 0.3784077912569046, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.27511734142899513, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 5739.33349609375, "epoch": 0.618, "grad_norm": 0.21677870459923693, "learning_rate": 3.8242894344870495e-07, "loss": 0.0, "num_tokens": 40853359.0, "reward": 0.6743404418230057, "reward_std": 0.2582657225430012, "rewards/length_bonus_reward": 0.3389078825712204, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.08753174263983965, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 6737.4583740234375, "epoch": 0.62, "grad_norm": 0.24357902515196508, "learning_rate": 3.790390522001662e-07, "loss": 0.0, "num_tokens": 41019810.0, "reward": 0.4439855366945267, "reward_std": 0.22343775629997253, "rewards/length_bonus_reward": 0.2640075609087944, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.056710757315158844, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 4911.1251220703125, "epoch": 0.622, "grad_norm": 0.5201709710286498, "learning_rate": 3.7565505641757266e-07, "loss": 0.0, "num_tokens": 41142423.0, "reward": 1.0866494812071323, "reward_std": 0.5298759117722511, "rewards/length_bonus_reward": 0.3530171662569046, "rewards/simple_accuracy_reward": 0.5833333432674408, "rewards/simple_cosine_scaled_reward": 0.3005978614091873, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 6121.7501220703125, "epoch": 0.624, "grad_norm": 0.5080190709244763, "learning_rate": 3.722771210321048e-07, "loss": 0.0, "num_tokens": 41297403.0, "reward": 0.2857213709503412, "reward_std": 0.40174809098243713, "rewards/length_bonus_reward": 0.2822876051068306, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.15979912504553795, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 6624.166748046875, "epoch": 0.626, "grad_norm": 0.10657844848539924, "learning_rate": 3.689054106795677e-07, "loss": 0.0, "num_tokens": 41462137.0, "reward": 0.1630537286400795, "reward_std": 0.1260687503963709, "rewards/length_bonus_reward": 0.28277587890625, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2394443117082119, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 5900.791748046875, "epoch": 0.628, "grad_norm": 0.40367233926739365, "learning_rate": 3.6554008969236715e-07, "loss": 0.0, "num_tokens": 41608916.0, "reward": 0.5714323278516531, "reward_std": 0.5085080396384001, "rewards/length_bonus_reward": 0.3215230256319046, "rewards/simple_accuracy_reward": 0.2500000074505806, "rewards/simple_cosine_scaled_reward": -0.0001814340939745307, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 5907.3751220703125, "epoch": 0.63, "grad_norm": 0.3859791364495553, "learning_rate": 3.621813220915004e-07, "loss": 0.0, "num_tokens": 41756207.0, "reward": 0.20652074925601482, "reward_std": 0.28501600585877895, "rewards/length_bonus_reward": 0.2872416228055954, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.24477509036660194, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 6121.9583740234375, "epoch": 0.632, "grad_norm": 0.43329426519518977, "learning_rate": 3.5882927157856167e-07, "loss": -0.0, "num_tokens": 41907430.0, "reward": 0.9316474907100201, "reward_std": 0.5440495144575834, "rewards/length_bonus_reward": 0.3115336075425148, "rewards/simple_accuracy_reward": 0.4999999925494194, "rewards/simple_cosine_scaled_reward": 0.24022779613733292, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 6373.3751220703125, "epoch": 0.634, "grad_norm": 0.4468607462219318, "learning_rate": 3.554841015277641e-07, "loss": 0.0, "num_tokens": 42067711.0, "reward": 0.7084978967905045, "reward_std": 0.4753319313749671, "rewards/length_bonus_reward": 0.3254598006606102, "rewards/simple_accuracy_reward": 0.3333333432674408, "rewards/simple_cosine_scaled_reward": 0.09940946847200394, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 6016.416809082031, "epoch": 0.636, "grad_norm": 0.2667581989838514, "learning_rate": 3.521459749779768e-07, "loss": -0.0, "num_tokens": 42217817.0, "reward": 0.2430511750280857, "reward_std": 0.2776245344430208, "rewards/length_bonus_reward": 0.2888285294175148, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.17488805297762156, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 6160.250061035156, "epoch": 0.638, "grad_norm": 0.3188153673275072, "learning_rate": 3.488150546247778e-07, "loss": 0.0, "num_tokens": 42379307.0, "reward": 0.49330712389200926, "reward_std": 0.3125795405358076, "rewards/length_bonus_reward": 0.2445170097053051, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": -0.0024197762832045555, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 6560.0, "epoch": 0.64, "grad_norm": 0.2582401505250187, "learning_rate": 3.454915028125263e-07, "loss": 0.0, "num_tokens": 42542297.0, "reward": 0.4161204379051924, "reward_std": 0.2118233572691679, "rewards/length_bonus_reward": 0.276214599609375, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.053521718829870224, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 5211.95849609375, "epoch": 0.642, "grad_norm": 0.5043786658967019, "learning_rate": 3.421754815264488e-07, "loss": 0.0, "num_tokens": 42672118.0, "reward": 0.6506116669625044, "reward_std": 0.4358144663274288, "rewards/length_bonus_reward": 0.323486328125, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.0709172785282135, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 7112.416748046875, "epoch": 0.644, "grad_norm": 0.24186327229447885, "learning_rate": 3.388671523847445e-07, "loss": 0.0, "num_tokens": 42846770.0, "reward": 0.16958734393119812, "reward_std": 0.26079648546874523, "rewards/length_bonus_reward": 0.26361083984375, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.27138033509254456, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 5950.875, "epoch": 0.646, "grad_norm": 0.2974597136939909, "learning_rate": 3.3556667663070835e-07, "loss": 0.0, "num_tokens": 42994355.0, "reward": 0.7867113538086414, "reward_std": 0.2916571293026209, "rewards/length_bonus_reward": 0.3100382462143898, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.12001282721757889, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 6311.8751220703125, "epoch": 0.648, "grad_norm": 0.3984212272615014, "learning_rate": 3.3227421512487255e-07, "loss": 0.0, "num_tokens": 43155326.0, "reward": 0.3340246109291911, "reward_std": 0.4295175392180681, "rewards/length_bonus_reward": 0.273956298828125, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.12986339814960957, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 5710.0418701171875, "epoch": 0.65, "grad_norm": 0.3916380834478734, "learning_rate": 3.2898992833716563e-07, "loss": -0.0, "num_tokens": 43298073.0, "reward": 0.6703433841466904, "reward_std": 0.4213653542101383, "rewards/length_bonus_reward": 0.355987548828125, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.04537835344672203, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 6333.2083740234375, "epoch": 0.652, "grad_norm": 0.09335677084714937, "learning_rate": 3.257139763390925e-07, "loss": 0.0, "num_tokens": 43457948.0, "reward": 0.5283689666539431, "reward_std": 0.1217026561498642, "rewards/length_bonus_reward": 0.304718017578125, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.05269811302423477, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 6495.25, "epoch": 0.654, "grad_norm": 0.36746740417036394, "learning_rate": 3.2244651879593156e-07, "loss": 0.0, "num_tokens": 43618172.0, "reward": 0.7330612987279892, "reward_std": 0.2945491261780262, "rewards/length_bonus_reward": 0.3168843537569046, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.0823538526892662, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 6842.5001220703125, "epoch": 0.656, "grad_norm": 0.07325414191437148, "learning_rate": 3.191877149589539e-07, "loss": 0.0, "num_tokens": 43787402.0, "reward": 0.17220256850123405, "reward_std": 0.11583153158426285, "rewards/length_bonus_reward": 0.289642333984375, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.23487954586744308, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 5618.0, "epoch": 0.658, "grad_norm": 0.2560549639189407, "learning_rate": 3.15937723657661e-07, "loss": 0.0, "num_tokens": 43926248.0, "reward": 0.8929425217211246, "reward_std": 0.288953959941864, "rewards/length_bonus_reward": 0.3337809219956398, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.20165639743208885, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 5803.6668701171875, "epoch": 0.66, "grad_norm": 0.3661643890973214, "learning_rate": 3.1269670329204393e-07, "loss": 0.0, "num_tokens": 44071164.0, "reward": 0.9733074363321066, "reward_std": 0.36961137503385544, "rewards/length_bonus_reward": 0.3408610001206398, "rewards/simple_accuracy_reward": 0.4999999962747097, "rewards/simple_cosine_scaled_reward": 0.26489284075796604, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 6329.8751220703125, "epoch": 0.662, "grad_norm": 0.265919688680182, "learning_rate": 3.0946481182486297e-07, "loss": 0.0, "num_tokens": 44228481.0, "reward": 0.522054348140955, "reward_std": 0.26174790412187576, "rewards/length_bonus_reward": 0.322845458984375, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.01824895665049553, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 6285.041748046875, "epoch": 0.664, "grad_norm": 0.24387478221491674, "learning_rate": 3.0624220677394854e-07, "loss": 0.0, "num_tokens": 44385226.0, "reward": 0.8571124039590359, "reward_std": 0.26404275745153427, "rewards/length_bonus_reward": 0.3247782364487648, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.14800162613391876, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 7036.541748046875, "epoch": 0.666, "grad_norm": 0.38421159152615564, "learning_rate": 3.0302904520452443e-07, "loss": -0.0, "num_tokens": 44558729.0, "reward": 0.453582089394331, "reward_std": 0.4581709336489439, "rewards/length_bonus_reward": 0.2708333358168602, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.05116913956589997, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 7026.791748046875, "epoch": 0.668, "grad_norm": 0.26011014919063213, "learning_rate": 2.9982548372155256e-07, "loss": 0.0, "num_tokens": 44731686.0, "reward": 0.6516045965254307, "reward_std": 0.25488439202308655, "rewards/length_bonus_reward": 0.2751057893037796, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.08633086504414678, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 6649.20849609375, "epoch": 0.67, "grad_norm": 0.0727254794080774, "learning_rate": 2.9663167846209996e-07, "loss": -0.0, "num_tokens": 44896169.0, "reward": 0.527042530477047, "reward_std": 0.10560564510524273, "rewards/length_bonus_reward": 0.2798258438706398, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.0055666230618953705, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 6466.1251220703125, "epoch": 0.672, "grad_norm": 0.4137511050654125, "learning_rate": 2.9344778508772914e-07, "loss": -0.0, "num_tokens": 45055988.0, "reward": 0.38149104081094265, "reward_std": 0.41136366315186024, "rewards/length_bonus_reward": 0.30841064453125, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.10383920185267925, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 5010.08349609375, "epoch": 0.674, "grad_norm": 0.3490347715573145, "learning_rate": 2.902739587769114e-07, "loss": 0.0, "num_tokens": 45181018.0, "reward": 1.0307516269385815, "reward_std": 0.3653286173939705, "rewards/length_bonus_reward": 0.23341878317296505, "rewards/simple_accuracy_reward": 0.625, "rewards/simple_cosine_scaled_reward": 0.34466563537716866, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 6557.625, "epoch": 0.676, "grad_norm": 0.0712074034090726, "learning_rate": 2.8711035421746363e-07, "loss": 0.0, "num_tokens": 45345475.0, "reward": 0.5628174655139446, "reward_std": 0.10147103853523731, "rewards/length_bonus_reward": 0.305816650390625, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.014001615345478058, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 6639.5, "epoch": 0.678, "grad_norm": 0.31863717234505806, "learning_rate": 2.8395712559900874e-07, "loss": 0.0, "num_tokens": 45510205.0, "reward": 0.5432556457817554, "reward_std": 0.3744794391095638, "rewards/length_bonus_reward": 0.2960408478975296, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": -0.005570471286773682, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 6033.95849609375, "epoch": 0.68, "grad_norm": 0.38147519364609256, "learning_rate": 2.808144266054612e-07, "loss": 0.0, "num_tokens": 45661752.0, "reward": 0.36187500320374966, "reward_std": 0.291139580309391, "rewards/length_bonus_reward": 0.3106994554400444, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.14764895103871822, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 6648.45849609375, "epoch": 0.682, "grad_norm": 0.3630321666303949, "learning_rate": 2.776824104075364e-07, "loss": 0.0, "num_tokens": 45831293.0, "reward": 0.4698590934276581, "reward_std": 0.3613135479390621, "rewards/length_bonus_reward": 0.2778218537569046, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.032592300325632095, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 6402.3751220703125, "epoch": 0.684, "grad_norm": 0.4032857244532214, "learning_rate": 2.745612296552847e-07, "loss": 0.0, "num_tokens": 45989132.0, "reward": 0.5345658985897899, "reward_std": 0.4071739763021469, "rewards/length_bonus_reward": 0.3365885466337204, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.02071199007332325, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 6715.3333740234375, "epoch": 0.686, "grad_norm": 0.08927060286244569, "learning_rate": 2.71451036470653e-07, "loss": 0.0, "num_tokens": 46154536.0, "reward": 0.18689699098467827, "reward_std": 0.11871294397860765, "rewards/length_bonus_reward": 0.29510498046875, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.21641597896814346, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 5333.666748046875, "epoch": 0.688, "grad_norm": 0.11693262345763111, "learning_rate": 2.683519824400692e-07, "loss": 0.0, "num_tokens": 46293380.0, "reward": 0.9153371304273605, "reward_std": 0.12452462315559387, "rewards/length_bonus_reward": 0.264495849609375, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.3016825318336487, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 5046.33349609375, "epoch": 0.69, "grad_norm": 0.34289159214112386, "learning_rate": 2.6526421860705473e-07, "loss": 0.0, "num_tokens": 46419058.0, "reward": 1.1333957072347403, "reward_std": 0.37177950888872147, "rewards/length_bonus_reward": 0.3320515975356102, "rewards/simple_accuracy_reward": 0.625, "rewards/simple_cosine_scaled_reward": 0.35268818959593773, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 5115.041687011719, "epoch": 0.692, "grad_norm": 0.15590607264107806, "learning_rate": 2.621878954648623e-07, "loss": 0.0, "num_tokens": 46548809.0, "reward": 0.4619153179228306, "reward_std": 0.15834440477192402, "rewards/length_bonus_reward": 0.20705159939825535, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.009727336466312408, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 6769.416748046875, "epoch": 0.694, "grad_norm": 0.30548014377670735, "learning_rate": 2.591231629491423e-07, "loss": 0.0, "num_tokens": 46717491.0, "reward": 0.3123680017888546, "reward_std": 0.28094547241926193, "rewards/length_bonus_reward": 0.3032328262925148, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.1483963578939438, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 4997.791809082031, "epoch": 0.696, "grad_norm": 0.5812280109763096, "learning_rate": 2.5607017043063353e-07, "loss": 0.0, "num_tokens": 46843792.0, "reward": 0.8728577271103859, "reward_std": 0.39729489013552666, "rewards/length_bonus_reward": 0.2967122383415699, "rewards/simple_accuracy_reward": 0.4583333395421505, "rewards/simple_cosine_scaled_reward": 0.23562423884868622, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 6103.45849609375, "epoch": 0.698, "grad_norm": 0.3428522955096518, "learning_rate": 2.530290667078846e-07, "loss": 0.0, "num_tokens": 46994637.0, "reward": 0.3260865584015846, "reward_std": 0.3347742184996605, "rewards/length_bonus_reward": 0.303924560546875, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.12234268337488174, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 5801.0001220703125, "epoch": 0.7, "grad_norm": 0.25670969987160286, "learning_rate": 2.500000000000001e-07, "loss": 0.0, "num_tokens": 47139165.0, "reward": 0.5152554214000702, "reward_std": 0.30346043035387993, "rewards/length_bonus_reward": 0.3307088240981102, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.04757357016205788, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 5517.45849609375, "epoch": 0.702, "grad_norm": 0.34934407839878256, "learning_rate": 2.469831179394182e-07, "loss": 0.0, "num_tokens": 47276510.0, "reward": 0.7593820914626122, "reward_std": 0.34713531471788883, "rewards/length_bonus_reward": 0.350921630859375, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.06692089140415192, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 7168.0, "epoch": 0.704, "grad_norm": 0.053063177016757106, "learning_rate": 2.439785675647143e-07, "loss": 0.0, "num_tokens": 47456690.0, "reward": 0.15201527439057827, "reward_std": 0.0739121912047267, "rewards/length_bonus_reward": 0.2500305101275444, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.19603049010038376, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 7111.5, "epoch": 0.706, "grad_norm": 0.24425998542612423, "learning_rate": 2.4098649531343494e-07, "loss": -0.0, "num_tokens": 47632730.0, "reward": 0.19427236542105675, "reward_std": 0.23929008096456528, "rewards/length_bonus_reward": 0.2604370042681694, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.2156626395881176, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 6783.625, "epoch": 0.708, "grad_norm": 0.1906391713487306, "learning_rate": 2.380070470149605e-07, "loss": 0.0, "num_tokens": 47800427.0, "reward": 0.2207618448883295, "reward_std": 0.19059299491345882, "rewards/length_bonus_reward": 0.2827860489487648, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.20738175883889198, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 6274.0001220703125, "epoch": 0.71, "grad_norm": 0.07803238726832958, "learning_rate": 2.350403678833976e-07, "loss": 0.0, "num_tokens": 47955389.0, "reward": 0.2098095640540123, "reward_std": 0.11201258283108473, "rewards/length_bonus_reward": 0.3187967911362648, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2179744839668274, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 5771.416748046875, "epoch": 0.712, "grad_norm": 0.10620730987196032, "learning_rate": 2.3208660251050156e-07, "loss": 0.0, "num_tokens": 48099027.0, "reward": 0.5389553420245647, "reward_std": 0.12794847786426544, "rewards/length_bonus_reward": 0.2937520369887352, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.009593449532985687, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 6050.5418701171875, "epoch": 0.714, "grad_norm": 0.2890589432470446, "learning_rate": 2.2914589485863012e-07, "loss": 0.0, "num_tokens": 48252190.0, "reward": 0.41459546610713005, "reward_std": 0.30699630081653595, "rewards/length_bonus_reward": 0.2859598733484745, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.07606217637658119, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 6712.875, "epoch": 0.716, "grad_norm": 0.34959821746847014, "learning_rate": 2.262183882537249e-07, "loss": 0.0, "num_tokens": 48417817.0, "reward": 0.28864990919828415, "reward_std": 0.341975387185812, "rewards/length_bonus_reward": 0.2871602326631546, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.16368731949478388, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 5532.0, "epoch": 0.718, "grad_norm": 0.22001293965202928, "learning_rate": 2.23304225378328e-07, "loss": 0.0, "num_tokens": 48557809.0, "reward": 0.5001139342784882, "reward_std": 0.2646260615438223, "rewards/length_bonus_reward": 0.3091532364487648, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.034745343029499054, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 5703.4583740234375, "epoch": 0.72, "grad_norm": 0.3951160315850499, "learning_rate": 2.2040354826462664e-07, "loss": 0.0, "num_tokens": 48700854.0, "reward": 0.3717580735683441, "reward_std": 0.41627471148967743, "rewards/length_bonus_reward": 0.3164876252412796, "rewards/simple_accuracy_reward": 0.1250000037252903, "rewards/simple_cosine_scaled_reward": -0.13945913966745138, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 6746.33349609375, "epoch": 0.722, "grad_norm": 0.11429108999221914, "learning_rate": 2.1751649828753106e-07, "loss": 0.0, "num_tokens": 48868466.0, "reward": 0.19057508558034897, "reward_std": 0.1401625107973814, "rewards/length_bonus_reward": 0.2994588166475296, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.21776747703552246, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 5461.416748046875, "epoch": 0.724, "grad_norm": 0.2413060664996543, "learning_rate": 2.146432161577842e-07, "loss": 0.0, "num_tokens": 49005648.0, "reward": 0.647798664867878, "reward_std": 0.26144311018288136, "rewards/length_bonus_reward": 0.3428751677274704, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.026513613760471344, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 6611.70849609375, "epoch": 0.726, "grad_norm": 0.2572501168046266, "learning_rate": 2.117838419151034e-07, "loss": 0.0, "num_tokens": 49171529.0, "reward": 0.24225858598947525, "reward_std": 0.2566937208175659, "rewards/length_bonus_reward": 0.3187662810087204, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.2363487333059311, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 6219.916748046875, "epoch": 0.728, "grad_norm": 0.07970713876566374, "learning_rate": 2.0893851492135532e-07, "loss": 0.0, "num_tokens": 49328031.0, "reward": 0.15589985251426697, "reward_std": 0.10338756814599037, "rewards/length_bonus_reward": 0.284210205078125, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.25662072375416756, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 5942.791687011719, "epoch": 0.73, "grad_norm": 0.36581111224511453, "learning_rate": 2.0610737385376348e-07, "loss": 0.0, "num_tokens": 49477072.0, "reward": 0.805458664894104, "reward_std": 0.3124531991779804, "rewards/length_bonus_reward": 0.2900492362678051, "rewards/simple_accuracy_reward": 0.4166666716337204, "rewards/simple_cosine_scaled_reward": 0.19748548790812492, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 5729.000061035156, "epoch": 0.732, "grad_norm": 0.11745443029853457, "learning_rate": 2.0329055669814933e-07, "loss": 0.0, "num_tokens": 49622470.0, "reward": 0.9558068066835403, "reward_std": 0.12889241613447666, "rewards/length_bonus_reward": 0.3078918531537056, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.29582984559237957, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 5111.2501220703125, "epoch": 0.734, "grad_norm": 0.2927107065217231, "learning_rate": 2.0048820074220711e-07, "loss": 0.0, "num_tokens": 49749610.0, "reward": 0.9561471417546272, "reward_std": 0.28061569668352604, "rewards/length_bonus_reward": 0.3545023575425148, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.2866228558123112, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 6022.2501220703125, "epoch": 0.736, "grad_norm": 0.3346963376624778, "learning_rate": 1.9770044256881258e-07, "loss": 0.0, "num_tokens": 49898860.0, "reward": 0.3339090719819069, "reward_std": 0.26547081768512726, "rewards/length_bonus_reward": 0.3425089493393898, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.18386645521968603, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 5988.583435058594, "epoch": 0.738, "grad_norm": 0.36283715922481086, "learning_rate": 1.9492741804936618e-07, "loss": 0.0, "num_tokens": 50047458.0, "reward": 0.6363462414592505, "reward_std": 0.41168148815631866, "rewards/length_bonus_reward": 0.22426350601017475, "rewards/simple_accuracy_reward": 0.3750000037252903, "rewards/simple_cosine_scaled_reward": 0.07416537776589394, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 4730.3333740234375, "epoch": 0.74, "grad_norm": 0.27360122170479334, "learning_rate": 1.9216926233717084e-07, "loss": 0.0, "num_tokens": 50166008.0, "reward": 0.8202562220394611, "reward_std": 0.2960926480591297, "rewards/length_bonus_reward": 0.24010213185101748, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.24364148080348969, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 6369.666748046875, "epoch": 0.742, "grad_norm": 0.44260807759388526, "learning_rate": 1.8942610986084484e-07, "loss": -0.0, "num_tokens": 50326488.0, "reward": 0.3547972999513149, "reward_std": 0.28970395121723413, "rewards/length_bonus_reward": 0.314422607421875, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.16925057768821716, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 6236.3333740234375, "epoch": 0.744, "grad_norm": 0.3201821582655666, "learning_rate": 1.8669809431776988e-07, "loss": -0.0, "num_tokens": 50483654.0, "reward": 0.37900192849338055, "reward_std": 0.2817553859204054, "rewards/length_bonus_reward": 0.3263041228055954, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.1446043699979782, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 6184.3751220703125, "epoch": 0.746, "grad_norm": 0.13007382595120842, "learning_rate": 1.8398534866757455e-07, "loss": 0.0, "num_tokens": 50637989.0, "reward": 0.5576141364872456, "reward_std": 0.1399791594594717, "rewards/length_bonus_reward": 0.2928568571805954, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.02951446920633316, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 6447.125, "epoch": 0.748, "grad_norm": 0.07950489585494867, "learning_rate": 1.812880051256551e-07, "loss": 0.0, "num_tokens": 50798102.0, "reward": 0.19794733077287674, "reward_std": 0.11804980784654617, "rewards/length_bonus_reward": 0.3195393830537796, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.24318412691354752, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 6335.3333740234375, "epoch": 0.75, "grad_norm": 0.10675228051638375, "learning_rate": 1.7860619515673032e-07, "loss": 0.0, "num_tokens": 50954368.0, "reward": 0.199926832690835, "reward_std": 0.12403971888124943, "rewards/length_bonus_reward": 0.3121846541762352, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2245156466960907, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 4578.541809082031, "epoch": 0.752, "grad_norm": 0.2922561732206295, "learning_rate": 1.7594004946843454e-07, "loss": -0.0, "num_tokens": 51071753.0, "reward": 0.6341448351740837, "reward_std": 0.3011226952075958, "rewards/length_bonus_reward": 0.3313700407743454, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.022216279059648514, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 6729.791748046875, "epoch": 0.754, "grad_norm": 0.4961292773155607, "learning_rate": 1.7328969800494726e-07, "loss": 0.0, "num_tokens": 51239502.0, "reward": 0.4299081340432167, "reward_std": 0.48168056458234787, "rewards/length_bonus_reward": 0.2905375212430954, "rewards/simple_accuracy_reward": 0.1666666679084301, "rewards/simple_cosine_scaled_reward": -0.05459211638662964, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 6605.6251220703125, "epoch": 0.756, "grad_norm": 0.28190219777416814, "learning_rate": 1.7065526994065972e-07, "loss": 0.0, "num_tokens": 51402117.0, "reward": 0.3130038268864155, "reward_std": 0.27758636698126793, "rewards/length_bonus_reward": 0.3144938126206398, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.16964666172862053, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 5515.583435058594, "epoch": 0.758, "grad_norm": 0.26200496632183784, "learning_rate": 1.6803689367387918e-07, "loss": 0.0, "num_tokens": 51539315.0, "reward": 0.5998614858835936, "reward_std": 0.2622463349252939, "rewards/length_bonus_reward": 0.2875467911362648, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.041296002455055714, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 5922.375244140625, "epoch": 0.76, "grad_norm": 0.23614741140423534, "learning_rate": 1.6543469682057104e-07, "loss": 0.0, "num_tokens": 51686186.0, "reward": 0.26071146316826344, "reward_std": 0.24300545640289783, "rewards/length_bonus_reward": 0.2941792830824852, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.15026896074414253, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 4937.6251220703125, "epoch": 0.762, "grad_norm": 0.5538245966803743, "learning_rate": 1.6284880620813846e-07, "loss": 0.0, "num_tokens": 51812561.0, "reward": 0.8250564001500607, "reward_std": 0.286323307082057, "rewards/length_bonus_reward": 0.2494099922478199, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.2346261478960514, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 6065.041748046875, "epoch": 0.764, "grad_norm": 0.11692914688534461, "learning_rate": 1.6027934786924185e-07, "loss": 0.0, "num_tokens": 51965838.0, "reward": 0.5828330814838409, "reward_std": 0.15099831484258175, "rewards/length_bonus_reward": 0.3545837476849556, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.04350139573216438, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 5288.666931152344, "epoch": 0.766, "grad_norm": 0.5100385347883108, "learning_rate": 1.5772644703565564e-07, "loss": 0.0, "num_tokens": 52098796.0, "reward": 0.7287304252386093, "reward_std": 0.5640304945409298, "rewards/length_bonus_reward": 0.35638427734375, "rewards/simple_accuracy_reward": 0.3333333320915699, "rewards/simple_cosine_scaled_reward": 0.07802562857978046, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 7168.0, "epoch": 0.768, "grad_norm": 0.07440035782746739, "learning_rate": 1.551902281321651e-07, "loss": 0.0, "num_tokens": 52276126.0, "reward": 0.09670547023415565, "reward_std": 0.0964379720389843, "rewards/length_bonus_reward": 0.25006103515625, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.3067111372947693, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 5987.5, "epoch": 0.77, "grad_norm": 0.21880621978829665, "learning_rate": 1.5267081477050131e-07, "loss": -0.0, "num_tokens": 52426558.0, "reward": 0.6113961488008499, "reward_std": 0.24381374940276146, "rewards/length_bonus_reward": 0.2559407576918602, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.12757749389857054, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 6554.3751220703125, "epoch": 0.772, "grad_norm": 0.4739495016801891, "learning_rate": 1.5016832974331723e-07, "loss": 0.0, "num_tokens": 52590253.0, "reward": 0.4862647783011198, "reward_std": 0.43354570865631104, "rewards/length_bonus_reward": 0.30377197265625, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.05168103915639222, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 6863.9583740234375, "epoch": 0.774, "grad_norm": 0.08455758073977702, "learning_rate": 1.4768289501820263e-07, "loss": 0.0, "num_tokens": 52759902.0, "reward": 0.178485207259655, "reward_std": 0.1239917203783989, "rewards/length_bonus_reward": 0.2951456755399704, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2333209477365017, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 6814.8333740234375, "epoch": 0.776, "grad_norm": 0.09313602121649582, "learning_rate": 1.4521463173173965e-07, "loss": 0.0, "num_tokens": 52927988.0, "reward": 0.17468434944748878, "reward_std": 0.13441512547433376, "rewards/length_bonus_reward": 0.2940165251493454, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.23866434022784233, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 5137.83349609375, "epoch": 0.778, "grad_norm": 0.4153029572422351, "learning_rate": 1.4276366018359842e-07, "loss": 0.0, "num_tokens": 53055916.0, "reward": 0.7136026918888092, "reward_std": 0.48491785302758217, "rewards/length_bonus_reward": 0.2679341658949852, "rewards/simple_accuracy_reward": 0.3750000037252903, "rewards/simple_cosine_scaled_reward": 0.14133698400110006, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 5861.3751220703125, "epoch": 0.78, "grad_norm": 0.2902912094658125, "learning_rate": 1.4033009983067452e-07, "loss": 0.0, "num_tokens": 53201995.0, "reward": 0.26602280512452126, "reward_std": 0.28672423399984837, "rewards/length_bonus_reward": 0.31658935546875, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.1844664658419788, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 6028.7083740234375, "epoch": 0.782, "grad_norm": 0.09584655997713525, "learning_rate": 1.3791406928126635e-07, "loss": -0.0, "num_tokens": 53351136.0, "reward": 0.956916868686676, "reward_std": 0.10663202032446861, "rewards/length_bonus_reward": 0.312042236328125, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.28974928334355354, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 6034.9583740234375, "epoch": 0.784, "grad_norm": 0.34870491228432726, "learning_rate": 1.3551568628929432e-07, "loss": 0.0, "num_tokens": 53501147.0, "reward": 0.7281591184437275, "reward_std": 0.33034917153418064, "rewards/length_bonus_reward": 0.29693603515625, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.11244607716798782, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 6351.375244140625, "epoch": 0.786, "grad_norm": 0.5109029402322164, "learning_rate": 1.3313506774856175e-07, "loss": 0.0, "num_tokens": 53657858.0, "reward": 0.5869603082537651, "reward_std": 0.536841593682766, "rewards/length_bonus_reward": 0.3242492601275444, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.02542204037308693, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 6317.0, "epoch": 0.788, "grad_norm": 0.09378818409301377, "learning_rate": 1.3077232968705805e-07, "loss": -0.0, "num_tokens": 53813852.0, "reward": 0.5469566136598587, "reward_std": 0.1107094269245863, "rewards/length_bonus_reward": 0.2705993689596653, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.0527145080268383, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 5354.2501220703125, "epoch": 0.79, "grad_norm": 0.2736955846840174, "learning_rate": 1.284275872613028e-07, "loss": -0.0, "num_tokens": 53946878.0, "reward": 0.7809486947953701, "reward_std": 0.23208532948046923, "rewards/length_bonus_reward": 0.23980712704360485, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.1656164899468422, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 5674.5, "epoch": 0.792, "grad_norm": 0.24928745738937433, "learning_rate": 1.2610095475073413e-07, "loss": 0.0, "num_tokens": 54087806.0, "reward": 0.5214796997606754, "reward_std": 0.27824391424655914, "rewards/length_bonus_reward": 0.3456827774643898, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.06507281959056854, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 6359.3751220703125, "epoch": 0.794, "grad_norm": 0.5423151314725128, "learning_rate": 1.2379254555213786e-07, "loss": 0.0, "num_tokens": 54246701.0, "reward": 0.643811009824276, "reward_std": 0.6100348737090826, "rewards/length_bonus_reward": 0.3095906600356102, "rewards/simple_accuracy_reward": 0.2916666753590107, "rewards/simple_cosine_scaled_reward": 0.08510737586766481, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 6428.5001220703125, "epoch": 0.796, "grad_norm": 0.12006305630096972, "learning_rate": 1.2150247217412185e-07, "loss": 0.0, "num_tokens": 54407681.0, "reward": 0.20640602335333824, "reward_std": 0.13360532745718956, "rewards/length_bonus_reward": 0.3039449080824852, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.1950777731835842, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 5648.5001220703125, "epoch": 0.798, "grad_norm": 0.31421466544723586, "learning_rate": 1.192308462316317e-07, "loss": 0.0, "num_tokens": 54553733.0, "reward": 0.5507405288517475, "reward_std": 0.2654908001422882, "rewards/length_bonus_reward": 0.2499593049287796, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.01822902075946331, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 5179.3333740234375, "epoch": 0.8, "grad_norm": 0.32926956957468223, "learning_rate": 1.1697777844051104e-07, "loss": 0.0, "num_tokens": 54686191.0, "reward": 0.5182418972253799, "reward_std": 0.246239323168993, "rewards/length_bonus_reward": 0.3153584823012352, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.010899841785430908, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 7113.2083740234375, "epoch": 0.802, "grad_norm": 0.05663880431050089, "learning_rate": 1.1474337861210543e-07, "loss": 0.0, "num_tokens": 54866352.0, "reward": 0.151049692183733, "reward_std": 0.08858164213597775, "rewards/length_bonus_reward": 0.2628682479262352, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.22363713197410107, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 6495.3333740234375, "epoch": 0.804, "grad_norm": 0.07328990174774194, "learning_rate": 1.1252775564791023e-07, "loss": 0.0, "num_tokens": 55028222.0, "reward": 0.5405621193349361, "reward_std": 0.09770700708031654, "rewards/length_bonus_reward": 0.2930094376206398, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.004894688725471497, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 5759.8751220703125, "epoch": 0.806, "grad_norm": 0.3373437759210431, "learning_rate": 1.1033101753426282e-07, "loss": 0.0, "num_tokens": 55171019.0, "reward": 0.6062191836535931, "reward_std": 0.3944154493510723, "rewards/length_bonus_reward": 0.2643636055290699, "rewards/simple_accuracy_reward": 0.2916666641831398, "rewards/simple_cosine_scaled_reward": 0.1003777738660574, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 6490.7083740234375, "epoch": 0.808, "grad_norm": 0.07417306739804319, "learning_rate": 1.0815327133708013e-07, "loss": 0.0, "num_tokens": 55331398.0, "reward": 0.16152993217110634, "reward_std": 0.10920374467968941, "rewards/length_bonus_reward": 0.2834370955824852, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.24381433799862862, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 7117.5, "epoch": 0.81, "grad_norm": 0.07927057323810276, "learning_rate": 1.0599462319663904e-07, "loss": 0.0, "num_tokens": 55507384.0, "reward": 0.11949224025011063, "reward_std": 0.1115232277661562, "rewards/length_bonus_reward": 0.2604878693819046, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.28199126943945885, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 6971.875244140625, "epoch": 0.812, "grad_norm": 0.05507141163703589, "learning_rate": 1.038551783224047e-07, "loss": 0.0, "num_tokens": 55679269.0, "reward": 0.1146170161664486, "reward_std": 0.08482633531093597, "rewards/length_bonus_reward": 0.2746887132525444, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.3201434314250946, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 6532.9583740234375, "epoch": 0.814, "grad_norm": 0.2934104401100327, "learning_rate": 1.0173504098790186e-07, "loss": 0.0, "num_tokens": 55840890.0, "reward": 0.2624971140176058, "reward_std": 0.2516635339707136, "rewards/length_bonus_reward": 0.32061767578125, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.19957446865737438, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 5604.0833740234375, "epoch": 0.816, "grad_norm": 0.6714735950914413, "learning_rate": 9.963431452563331e-08, "loss": 0.0, "num_tokens": 55980542.0, "reward": 0.4626415856182575, "reward_std": 0.42609792575240135, "rewards/length_bonus_reward": 0.2658284530043602, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.023040438536554575, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 6792.791748046875, "epoch": 0.818, "grad_norm": 0.284556474964489, "learning_rate": 9.755310132204297e-08, "loss": 0.0, "num_tokens": 56147727.0, "reward": 0.3970888499170542, "reward_std": 0.26235133968293667, "rewards/length_bonus_reward": 0.2892557755112648, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.11766722425818443, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 4443.500183105469, "epoch": 0.82, "grad_norm": 0.11850097544322995, "learning_rate": 9.549150281252632e-08, "loss": 0.0, "num_tokens": 56259543.0, "reward": 0.9336798340082169, "reward_std": 0.11582905054092407, "rewards/length_bonus_reward": 0.3058776883408427, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.25560417771339417, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 6430.5, "epoch": 0.822, "grad_norm": 0.22678370152153357, "learning_rate": 9.344961947648622e-08, "loss": -0.0, "num_tokens": 56417895.0, "reward": 0.2119182050228119, "reward_std": 0.26083753630518913, "rewards/length_bonus_reward": 0.2847391739487648, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.22897527180612087, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 5849.8333740234375, "epoch": 0.824, "grad_norm": 0.23428509844441386, "learning_rate": 9.142755083243575e-08, "loss": 0.0, "num_tokens": 56562923.0, "reward": 0.9517627339810133, "reward_std": 0.2709892652928829, "rewards/length_bonus_reward": 0.24837239645421505, "rewards/simple_accuracy_reward": 0.5416666679084301, "rewards/simple_cosine_scaled_reward": 0.3234473541378975, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 6100.9583740234375, "epoch": 0.826, "grad_norm": 0.13054030526326657, "learning_rate": 8.942539543314798e-08, "loss": 0.0, "num_tokens": 56714560.0, "reward": 0.4866980351507664, "reward_std": 0.13414510898292065, "rewards/length_bonus_reward": 0.22826130874454975, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.016873452812433243, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 6305.0833740234375, "epoch": 0.828, "grad_norm": 0.0759645283470696, "learning_rate": 8.744325086085247e-08, "loss": 0.0, "num_tokens": 56871012.0, "reward": 0.555214049294591, "reward_std": 0.09412568900734186, "rewards/length_bonus_reward": 0.3165079727768898, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.022587895393371582, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 6818.7918701171875, "epoch": 0.83, "grad_norm": 0.08141775646953497, "learning_rate": 8.548121372247919e-08, "loss": 0.0, "num_tokens": 57039379.0, "reward": 0.14836387522518635, "reward_std": 0.11993545852601528, "rewards/length_bonus_reward": 0.28289794921875, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.269068144261837, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 5720.291809082031, "epoch": 0.832, "grad_norm": 0.3047839085225318, "learning_rate": 8.353937964495028e-08, "loss": 0.0, "num_tokens": 57181280.0, "reward": 0.8903752099722624, "reward_std": 0.30855077877640724, "rewards/length_bonus_reward": 0.3343912735581398, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.1953011304140091, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 5499.666748046875, "epoch": 0.834, "grad_norm": 0.44099553926451673, "learning_rate": 8.161784327051919e-08, "loss": 0.0, "num_tokens": 57319014.0, "reward": 1.1624255999922752, "reward_std": 0.41847446747124195, "rewards/length_bonus_reward": 0.363067626953125, "rewards/simple_accuracy_reward": 0.625, "rewards/simple_cosine_scaled_reward": 0.3487159423530102, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 6389.041748046875, "epoch": 0.836, "grad_norm": 0.4407566766572917, "learning_rate": 7.971669825215787e-08, "loss": 0.0, "num_tokens": 57479617.0, "reward": 0.7180022671818733, "reward_std": 0.5980872884392738, "rewards/length_bonus_reward": 0.3133544847369194, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.05929549131542444, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 6558.3333740234375, "epoch": 0.838, "grad_norm": 0.07578693319581553, "learning_rate": 7.783603724899257e-08, "loss": -0.0, "num_tokens": 57642015.0, "reward": 0.5522406157106161, "reward_std": 0.09540114924311638, "rewards/length_bonus_reward": 0.2949015274643898, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.014678183943033218, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 5685.9583740234375, "epoch": 0.84, "grad_norm": 0.7713371214181205, "learning_rate": 7.597595192178702e-08, "loss": 0.0, "num_tokens": 57791342.0, "reward": 0.7764605581760406, "reward_std": 0.6175034996122122, "rewards/length_bonus_reward": 0.31915283203125, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.16461539268493652, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 6426.6668701171875, "epoch": 0.842, "grad_norm": 0.4380552412172544, "learning_rate": 7.413653292847616e-08, "loss": 0.0, "num_tokens": 57949992.0, "reward": 0.5025061778724194, "reward_std": 0.47345481999218464, "rewards/length_bonus_reward": 0.34710693359375, "rewards/simple_accuracy_reward": 0.2083333358168602, "rewards/simple_cosine_scaled_reward": -0.1058682193979621, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 5019.791748046875, "epoch": 0.844, "grad_norm": 0.4592149849830736, "learning_rate": 7.23178699197467e-08, "loss": 0.0, "num_tokens": 58075345.0, "reward": 0.8263261318206787, "reward_std": 0.32511134818196297, "rewards/length_bonus_reward": 0.3514506071805954, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.19975098595023155, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 6890.83349609375, "epoch": 0.846, "grad_norm": 0.34411304997414294, "learning_rate": 7.052005153466778e-08, "loss": 0.0, "num_tokens": 58245327.0, "reward": 0.29266736656427383, "reward_std": 0.3249049633741379, "rewards/length_bonus_reward": 0.2752787284553051, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.13188943825662136, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 7059.875, "epoch": 0.848, "grad_norm": 0.37328151781940866, "learning_rate": 6.874316539637126e-08, "loss": 0.0, "num_tokens": 58420326.0, "reward": 0.2623723540455103, "reward_std": 0.2735820487141609, "rewards/length_bonus_reward": 0.26043701171875, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.16279598139226437, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 6639.75, "epoch": 0.85, "grad_norm": 0.3089015069912826, "learning_rate": 6.698729810778064e-08, "loss": 0.0, "num_tokens": 58584060.0, "reward": 0.3056452311575413, "reward_std": 0.25151272118091583, "rewards/length_bonus_reward": 0.2862752303481102, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.12792666628956795, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 5189.7918701171875, "epoch": 0.852, "grad_norm": 0.2991668424512954, "learning_rate": 6.52525352473905e-08, "loss": 0.0, "num_tokens": 58714915.0, "reward": 0.3444834426045418, "reward_std": 0.2934001237154007, "rewards/length_bonus_reward": 0.3860677033662796, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.16650189517531544, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 6247.9168701171875, "epoch": 0.854, "grad_norm": 0.43005998990038224, "learning_rate": 6.353896136509524e-08, "loss": 0.0, "num_tokens": 58871729.0, "reward": 0.5485345609486103, "reward_std": 0.4097930630668998, "rewards/length_bonus_reward": 0.2964782789349556, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": 0.004112493246793747, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 5612.250061035156, "epoch": 0.856, "grad_norm": 0.6948535424074689, "learning_rate": 6.184665997806831e-08, "loss": 0.0, "num_tokens": 59011445.0, "reward": 0.8354339152574539, "reward_std": 0.585272453725338, "rewards/length_bonus_reward": 0.279510498046875, "rewards/simple_accuracy_reward": 0.4583333358168602, "rewards/simple_cosine_scaled_reward": 0.19518008967861533, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 6439.5418701171875, "epoch": 0.858, "grad_norm": 0.08831610574449544, "learning_rate": 6.017571356669182e-08, "loss": 0.0, "num_tokens": 59170296.0, "reward": 0.19197950512170792, "reward_std": 0.13150840252637863, "rewards/length_bonus_reward": 0.3359171524643898, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.28787532448768616, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 7168.0, "epoch": 0.86, "grad_norm": 0.059290487686179325, "learning_rate": 5.8526203570536504e-08, "loss": 0.0, "num_tokens": 59349168.0, "reward": 0.12618795037269592, "reward_std": 0.0869597103446722, "rewards/length_bonus_reward": 0.25, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.24762410670518875, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 5675.708435058594, "epoch": 0.862, "grad_norm": 0.31247362909774373, "learning_rate": 5.689821038439263e-08, "loss": 0.0, "num_tokens": 59489207.0, "reward": 0.42583927139639854, "reward_std": 0.3320373333990574, "rewards/length_bonus_reward": 0.288909912109375, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.0594746395945549, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 5350.416748046875, "epoch": 0.864, "grad_norm": 0.18802805143370502, "learning_rate": 5.529181335435124e-08, "loss": 0.0, "num_tokens": 59621745.0, "reward": 0.11751963198184967, "reward_std": 0.17388194613158703, "rewards/length_bonus_reward": 0.2422587051987648, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.24947816133499146, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 6217.95849609375, "epoch": 0.866, "grad_norm": 0.25953901588520245, "learning_rate": 5.37070907739372e-08, "loss": 0.0, "num_tokens": 59776778.0, "reward": 0.9247330017387867, "reward_std": 0.2663750611245632, "rewards/length_bonus_reward": 0.3447062149643898, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.2433868609368801, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 6650.666748046875, "epoch": 0.868, "grad_norm": 0.10735779207138771, "learning_rate": 5.2144119880293544e-08, "loss": 0.0, "num_tokens": 59941770.0, "reward": 0.17704936116933823, "reward_std": 0.1415756568312645, "rewards/length_bonus_reward": 0.3043924942612648, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.25468628481030464, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 7135.125, "epoch": 0.87, "grad_norm": 0.06418429544337628, "learning_rate": 5.060297685041659e-08, "loss": -0.0, "num_tokens": 60119139.0, "reward": 0.1372895110398531, "reward_std": 0.08284015581011772, "rewards/length_bonus_reward": 0.258056640625, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2415342628955841, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 5130.7501220703125, "epoch": 0.872, "grad_norm": 0.3567343305529879, "learning_rate": 4.908373679744315e-08, "loss": 0.0, "num_tokens": 60247155.0, "reward": 1.1271448247134686, "reward_std": 0.3249554168432951, "rewards/length_bonus_reward": 0.3203023262321949, "rewards/simple_accuracy_reward": 0.625, "rewards/simple_cosine_scaled_reward": 0.36368490383028984, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 7124.7083740234375, "epoch": 0.874, "grad_norm": 0.3643196914670573, "learning_rate": 4.758647376699032e-08, "loss": -0.0, "num_tokens": 60426284.0, "reward": 0.3523840680718422, "reward_std": 0.2953133136034012, "rewards/length_bonus_reward": 0.2604471892118454, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.06612623855471611, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 5649.833435058594, "epoch": 0.876, "grad_norm": 0.28809169244340016, "learning_rate": 4.611126073354571e-08, "loss": 0.0, "num_tokens": 60566818.0, "reward": 0.4818029012531042, "reward_std": 0.26373469829559326, "rewards/length_bonus_reward": 0.305999755859375, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.0650604572147131, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 6480.625, "epoch": 0.878, "grad_norm": 0.11089491567916712, "learning_rate": 4.465816959691149e-08, "loss": 0.0, "num_tokens": 60729673.0, "reward": 0.1823902726173401, "reward_std": 0.1205295491963625, "rewards/length_bonus_reward": 0.2869771346449852, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.20917373150587082, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 6157.6251220703125, "epoch": 0.88, "grad_norm": 0.27606328218434256, "learning_rate": 4.322727117869951e-08, "loss": 0.0, "num_tokens": 60882166.0, "reward": 0.45063526555895805, "reward_std": 0.28579695150256157, "rewards/length_bonus_reward": 0.3422139436006546, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.11649075523018837, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 6538.7501220703125, "epoch": 0.882, "grad_norm": 0.26889389827397486, "learning_rate": 4.181863521888018e-08, "loss": -0.0, "num_tokens": 61043458.0, "reward": 0.5994397923350334, "reward_std": 0.2529545109719038, "rewards/length_bonus_reward": 0.3143717497587204, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": -0.013197213411331177, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 4944.250244140625, "epoch": 0.884, "grad_norm": 0.5342685275207272, "learning_rate": 4.043233037238281e-08, "loss": 0.0, "num_tokens": 61166488.0, "reward": 0.5495770499110222, "reward_std": 0.6013855561614037, "rewards/length_bonus_reward": 0.3183492012321949, "rewards/simple_accuracy_reward": 0.2500000074505806, "rewards/simple_cosine_scaled_reward": -0.037544308812357485, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 7168.0, "epoch": 0.886, "grad_norm": 0.05938359663916768, "learning_rate": 3.9068424205749794e-08, "loss": 0.0, "num_tokens": 61348126.0, "reward": 0.12213847786188126, "reward_std": 0.08142664656043053, "rewards/length_bonus_reward": 0.2500508651137352, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2558247707784176, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 7040.416748046875, "epoch": 0.888, "grad_norm": 0.23119012863934235, "learning_rate": 3.7726983193843485e-08, "loss": 0.0, "num_tokens": 61523750.0, "reward": 0.21949231624603271, "reward_std": 0.2434244230389595, "rewards/length_bonus_reward": 0.269439697265625, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.18322810251265764, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 5824.541748046875, "epoch": 0.89, "grad_norm": 0.30160104634298973, "learning_rate": 3.6408072716606345e-08, "loss": 0.0, "num_tokens": 61669155.0, "reward": 0.6689499169588089, "reward_std": 0.2787558864802122, "rewards/length_bonus_reward": 0.3101908341050148, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.05085138976573944, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 6331.45849609375, "epoch": 0.892, "grad_norm": 0.7227492080851057, "learning_rate": 3.5111757055874326e-08, "loss": 0.0, "num_tokens": 61831988.0, "reward": 0.6479414589703083, "reward_std": 0.4678618125617504, "rewards/length_bonus_reward": 0.2667439728975296, "rewards/simple_accuracy_reward": 0.3333333283662796, "rewards/simple_cosine_scaled_reward": 0.09572824090719223, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 6654.9168701171875, "epoch": 0.894, "grad_norm": 0.33233380215548114, "learning_rate": 3.3838099392243915e-08, "loss": 0.0, "num_tokens": 61996818.0, "reward": 0.6311581917107105, "reward_std": 0.2972813993692398, "rewards/length_bonus_reward": 0.30126953125, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.07644391432404518, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 6141.750061035156, "epoch": 0.896, "grad_norm": 0.3785008071626161, "learning_rate": 3.258716180199278e-08, "loss": -0.0, "num_tokens": 62148882.0, "reward": 0.3477446511387825, "reward_std": 0.33324576169252396, "rewards/length_bonus_reward": 0.2976582795381546, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.149827241897583, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 6300.70849609375, "epoch": 0.898, "grad_norm": 0.31332395219871856, "learning_rate": 3.135900525405427e-08, "loss": 0.0, "num_tokens": 62304971.0, "reward": 0.6263754218816757, "reward_std": 0.28098783269524574, "rewards/length_bonus_reward": 0.3074544221162796, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.05450857989490032, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 6660.7501220703125, "epoch": 0.9, "grad_norm": 0.3315858682844166, "learning_rate": 3.015368960704584e-08, "loss": -0.0, "num_tokens": 62472605.0, "reward": 0.3248768709599972, "reward_std": 0.2892419043928385, "rewards/length_bonus_reward": 0.2894185371696949, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.17908332496881485, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 6238.7918701171875, "epoch": 0.902, "grad_norm": 0.5136344595724691, "learning_rate": 2.8971273606351655e-08, "loss": 0.0, "num_tokens": 62631828.0, "reward": 0.8788756132125854, "reward_std": 0.5879737827926874, "rewards/length_bonus_reward": 0.3221028670668602, "rewards/simple_accuracy_reward": 0.4583333358168602, "rewards/simple_cosine_scaled_reward": 0.19687870983034372, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 6913.7083740234375, "epoch": 0.904, "grad_norm": 0.33457259588009924, "learning_rate": 2.78118148812595e-08, "loss": -0.0, "num_tokens": 62805803.0, "reward": 0.29730749875307083, "reward_std": 0.2927531283348799, "rewards/length_bonus_reward": 0.2718302384018898, "rewards/simple_accuracy_reward": 0.0833333358168602, "rewards/simple_cosine_scaled_reward": -0.11571216210722923, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 5790.791687011719, "epoch": 0.906, "grad_norm": 0.10073120782547108, "learning_rate": 2.667536994215186e-08, "loss": 0.0, "num_tokens": 62949876.0, "reward": 0.9141623638570309, "reward_std": 0.11676057614386082, "rewards/length_bonus_reward": 0.26679483987390995, "rewards/simple_accuracy_reward": 0.5, "rewards/simple_cosine_scaled_reward": 0.2947349399328232, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 6258.166748046875, "epoch": 0.908, "grad_norm": 0.22949359632718438, "learning_rate": 2.5561994177751732e-08, "loss": 0.0, "num_tokens": 63104668.0, "reward": 0.9073836840689182, "reward_std": 0.2645928133279085, "rewards/length_bonus_reward": 0.3318990021944046, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.23430264741182327, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 6439.45849609375, "epoch": 0.91, "grad_norm": 0.14435380441060527, "learning_rate": 2.4471741852423233e-08, "loss": 0.0, "num_tokens": 63264639.0, "reward": 0.1585113424807787, "reward_std": 0.14603707380592823, "rewards/length_bonus_reward": 0.2903035506606102, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.263584416359663, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 5555.3751220703125, "epoch": 0.912, "grad_norm": 0.31721369527175974, "learning_rate": 2.3404666103526537e-08, "loss": 0.0, "num_tokens": 63410814.0, "reward": 1.0311752818524837, "reward_std": 0.34994690492749214, "rewards/length_bonus_reward": 0.2439371719956398, "rewards/simple_accuracy_reward": 0.625, "rewards/simple_cosine_scaled_reward": 0.3244762234389782, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 5754.3751220703125, "epoch": 0.914, "grad_norm": 0.3381342116527178, "learning_rate": 2.2360818938828187e-08, "loss": 0.0, "num_tokens": 63553611.0, "reward": 0.846661139279604, "reward_std": 0.3967811055481434, "rewards/length_bonus_reward": 0.2317708283662796, "rewards/simple_accuracy_reward": 0.4999999962747097, "rewards/simple_cosine_scaled_reward": 0.22978058457374573, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 6357.2918701171875, "epoch": 0.916, "grad_norm": 0.25336405874661966, "learning_rate": 2.1340251233966377e-08, "loss": 0.0, "num_tokens": 63714856.0, "reward": 0.710291987285018, "reward_std": 0.278077207505703, "rewards/length_bonus_reward": 0.2873230017721653, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.09593785181641579, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 7025.166748046875, "epoch": 0.918, "grad_norm": 0.37897019762974055, "learning_rate": 2.0343012729971243e-08, "loss": 0.0, "num_tokens": 63888680.0, "reward": 0.3651631660759449, "reward_std": 0.40126587450504303, "rewards/length_bonus_reward": 0.2670694962143898, "rewards/simple_accuracy_reward": 0.1666666679084301, "rewards/simple_cosine_scaled_reward": -0.1371460035443306, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 5597.70849609375, "epoch": 0.92, "grad_norm": 0.5369169828056517, "learning_rate": 1.936915203084055e-08, "loss": 0.0, "num_tokens": 64028053.0, "reward": 0.8564869575202465, "reward_std": 0.5742658376693726, "rewards/length_bonus_reward": 0.3766276091337204, "rewards/simple_accuracy_reward": 0.4166666679084301, "rewards/simple_cosine_scaled_reward": 0.1263852883130312, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 5707.083435058594, "epoch": 0.922, "grad_norm": 0.3466210311136669, "learning_rate": 1.8418716601170947e-08, "loss": 0.0, "num_tokens": 64168923.0, "reward": 1.0999571606516838, "reward_std": 0.33327653631567955, "rewards/length_bonus_reward": 0.33892822265625, "rewards/simple_accuracy_reward": 0.5833333358168602, "rewards/simple_cosine_scaled_reward": 0.35539117455482483, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 6293.291748046875, "epoch": 0.924, "grad_norm": 0.08221587553624132, "learning_rate": 1.7491752763844292e-08, "loss": -0.0, "num_tokens": 64324642.0, "reward": 0.538226380944252, "reward_std": 0.11780405975878239, "rewards/length_bonus_reward": 0.3062540665268898, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.03605534881353378, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 5280.875061035156, "epoch": 0.926, "grad_norm": 0.2698394098410639, "learning_rate": 1.658830569777031e-08, "loss": 0.0, "num_tokens": 64455973.0, "reward": 0.6702635665424168, "reward_std": 0.28551460802555084, "rewards/length_bonus_reward": 0.2756042508408427, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.1226519662886858, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 5608.125061035156, "epoch": 0.928, "grad_norm": 0.5444339327497161, "learning_rate": 1.570841943568446e-08, "loss": 0.0, "num_tokens": 64599892.0, "reward": 0.6004882510751486, "reward_std": 0.4996199943125248, "rewards/length_bonus_reward": 0.317962646484375, "rewards/simple_accuracy_reward": 0.2916666716337204, "rewards/simple_cosine_scaled_reward": -0.01828218623995781, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 6237.5833740234375, "epoch": 0.93, "grad_norm": 0.3214956115353503, "learning_rate": 1.4852136862001763e-08, "loss": 0.0, "num_tokens": 64754520.0, "reward": 0.5017019808292389, "reward_std": 0.4076361861079931, "rewards/length_bonus_reward": 0.3213907852768898, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.05604430101811886, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 5841.58349609375, "epoch": 0.932, "grad_norm": 0.45966528796597955, "learning_rate": 1.4019499710726911e-08, "loss": 0.0, "num_tokens": 64900550.0, "reward": 0.6386283859610558, "reward_std": 0.45821450278162956, "rewards/length_bonus_reward": 0.3439229279756546, "rewards/simple_accuracy_reward": 0.2500000074505806, "rewards/simple_cosine_scaled_reward": 0.08941087685525417, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 5269.625061035156, "epoch": 0.934, "grad_norm": 0.5643482463429849, "learning_rate": 1.3210548563419855e-08, "loss": 0.0, "num_tokens": 65036597.0, "reward": 0.5183953568339348, "reward_std": 0.3168577328324318, "rewards/length_bonus_reward": 0.3197530172765255, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.01938200369477272, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 5624.541748046875, "epoch": 0.936, "grad_norm": 0.6557308353846493, "learning_rate": 1.2425322847218367e-08, "loss": 0.0, "num_tokens": 65181930.0, "reward": 0.9513291604816914, "reward_std": 0.5370087698101997, "rewards/length_bonus_reward": 0.290679931640625, "rewards/simple_accuracy_reward": 0.5416666567325592, "rewards/simple_cosine_scaled_reward": 0.23796507343649864, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 5876.500244140625, "epoch": 0.938, "grad_norm": 0.463872146576031, "learning_rate": 1.166386083291604e-08, "loss": -0.0, "num_tokens": 65328726.0, "reward": 0.7566004432737827, "reward_std": 0.43503858521580696, "rewards/length_bonus_reward": 0.3195088729262352, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.12418316304683685, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 6570.1251220703125, "epoch": 0.94, "grad_norm": 0.4098982865852208, "learning_rate": 1.0926199633097154e-08, "loss": 0.0, "num_tokens": 65492871.0, "reward": 0.46568559017032385, "reward_std": 0.4340946804732084, "rewards/length_bonus_reward": 0.3097432479262352, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.10478200763463974, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 6592.4583740234375, "epoch": 0.942, "grad_norm": 0.26243020468153927, "learning_rate": 1.0212375200327972e-08, "loss": -0.0, "num_tokens": 65658812.0, "reward": 0.3384756036102772, "reward_std": 0.30310577526688576, "rewards/length_bonus_reward": 0.2905070036649704, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.15406280010938644, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 7021.0833740234375, "epoch": 0.944, "grad_norm": 0.40654128367062625, "learning_rate": 9.522422325404233e-09, "loss": 0.0, "num_tokens": 65834746.0, "reward": 0.3747340105473995, "reward_std": 0.4520654771476984, "rewards/length_bonus_reward": 0.2753499299287796, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.13456518575549126, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 5759.833435058594, "epoch": 0.946, "grad_norm": 0.12000800029274052, "learning_rate": 8.856374635655695e-09, "loss": -0.0, "num_tokens": 65976942.0, "reward": 0.5733238719403744, "reward_std": 0.13525211438536644, "rewards/length_bonus_reward": 0.3248189240694046, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.002990107983350754, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 6999.0833740234375, "epoch": 0.948, "grad_norm": 0.07819643330067047, "learning_rate": 8.214264593307096e-09, "loss": 0.0, "num_tokens": 66150998.0, "reward": 0.1597321778535843, "reward_std": 0.10074889473617077, "rewards/length_bonus_reward": 0.2834472730755806, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2474301978945732, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 5797.5001220703125, "epoch": 0.95, "grad_norm": 0.4015023792501555, "learning_rate": 7.59612349389599e-09, "loss": 0.0, "num_tokens": 66296264.0, "reward": 0.822616457939148, "reward_std": 0.4041143413633108, "rewards/length_bonus_reward": 0.3942769318819046, "rewards/simple_accuracy_reward": 0.3333333283662796, "rewards/simple_cosine_scaled_reward": 0.19001229899004102, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 5605.33349609375, "epoch": 0.952, "grad_norm": 0.2556693651150387, "learning_rate": 7.0019814647475636e-09, "loss": 0.0, "num_tokens": 66435766.0, "reward": 1.247697576880455, "reward_std": 0.30850939080119133, "rewards/length_bonus_reward": 0.327239990234375, "rewards/simple_accuracy_reward": 0.7083333283662796, "rewards/simple_cosine_scaled_reward": 0.4242483675479889, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 6773.08349609375, "epoch": 0.954, "grad_norm": 0.09413952106499143, "learning_rate": 6.431867463506046e-09, "loss": -0.0, "num_tokens": 66602880.0, "reward": 0.1985863521695137, "reward_std": 0.13596713915467262, "rewards/length_bonus_reward": 0.2930806428194046, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.18898859061300755, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 5466.375061035156, "epoch": 0.956, "grad_norm": 0.1471060735145419, "learning_rate": 5.8858092767236076e-09, "loss": 0.0, "num_tokens": 66738243.0, "reward": 0.5137970745563507, "reward_std": 0.15122080594301224, "rewards/length_bonus_reward": 0.2828267365694046, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.03805939853191376, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 6580.4168701171875, "epoch": 0.958, "grad_norm": 0.09225421214832066, "learning_rate": 5.3638335185058335e-09, "loss": 0.0, "num_tokens": 66900625.0, "reward": 0.5939861312508583, "reward_std": 0.12820147350430489, "rewards/length_bonus_reward": 0.3229166641831398, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.04213889315724373, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 5277.2501220703125, "epoch": 0.96, "grad_norm": 0.373758372715874, "learning_rate": 4.865965629214819e-09, "loss": 0.0, "num_tokens": 67033075.0, "reward": 0.8709317054599524, "reward_std": 0.3812190666794777, "rewards/length_bonus_reward": 0.2217203713953495, "rewards/simple_accuracy_reward": 0.4999999962747097, "rewards/simple_cosine_scaled_reward": 0.2984225987456739, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 7168.0, "epoch": 0.962, "grad_norm": 0.21375336598812353, "learning_rate": 4.3922298742291585e-09, "loss": 0.0, "num_tokens": 67209817.0, "reward": 0.15378118120133877, "reward_std": 0.22519694175571203, "rewards/length_bonus_reward": 0.2500101700425148, "rewards/simple_accuracy_reward": 0.0416666679084301, "rewards/simple_cosine_scaled_reward": -0.27579133957624435, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 6327.83349609375, "epoch": 0.964, "grad_norm": 0.22252137066899288, "learning_rate": 3.9426493427611175e-09, "loss": 0.0, "num_tokens": 67366635.0, "reward": 0.4461268186569214, "reward_std": 0.2308435570448637, "rewards/length_bonus_reward": 0.260009765625, "rewards/simple_accuracy_reward": 0.2083333283662796, "rewards/simple_cosine_scaled_reward": -0.04443260654807091, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 6498.70849609375, "epoch": 0.966, "grad_norm": 0.06844345680760755, "learning_rate": 3.5172459467315286e-09, "loss": 0.0, "num_tokens": 67526894.0, "reward": 0.5389311909675598, "reward_std": 0.10059661976993084, "rewards/length_bonus_reward": 0.3189697265625, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.06007714197039604, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 6885.83349609375, "epoch": 0.968, "grad_norm": 0.34645468652014133, "learning_rate": 3.116040419701815e-09, "loss": -0.0, "num_tokens": 67698238.0, "reward": 0.40158696472644806, "reward_std": 0.2968661803752184, "rewards/length_bonus_reward": 0.2873026505112648, "rewards/simple_accuracy_reward": 0.1666666716337204, "rewards/simple_cosine_scaled_reward": -0.10476469248533249, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 4988.0836181640625, "epoch": 0.97, "grad_norm": 0.22697569580765914, "learning_rate": 2.739052315863355e-09, "loss": -0.0, "num_tokens": 67822776.0, "reward": 0.9332360811531544, "reward_std": 0.2719055339694023, "rewards/length_bonus_reward": 0.3317362479865551, "rewards/simple_accuracy_reward": 0.4583333283662796, "rewards/simple_cosine_scaled_reward": 0.2863330151885748, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 5354.958435058594, "epoch": 0.972, "grad_norm": 0.13520913865413495, "learning_rate": 2.3863000090844076e-09, "loss": 0.0, "num_tokens": 67955219.0, "reward": 0.5870820246636868, "reward_std": 0.14384707808494568, "rewards/length_bonus_reward": 0.3193562850356102, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": 0.035451438277959824, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 6108.041748046875, "epoch": 0.974, "grad_norm": 0.10572257531034046, "learning_rate": 2.057800692014833e-09, "loss": -0.0, "num_tokens": 68106126.0, "reward": 0.46765612810850143, "reward_std": 0.11438697576522827, "rewards/length_bonus_reward": 0.2387695275247097, "rewards/simple_accuracy_reward": 0.25, "rewards/simple_cosine_scaled_reward": -0.042226798832416534, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 7168.0, "epoch": 0.976, "grad_norm": 0.06028405236800945, "learning_rate": 1.7535703752478147e-09, "loss": 0.0, "num_tokens": 68284770.0, "reward": 0.13624239340424538, "reward_std": 0.0861327089369297, "rewards/length_bonus_reward": 0.2500203475356102, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.22755591198801994, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 6811.2501220703125, "epoch": 0.978, "grad_norm": 0.29314919351411556, "learning_rate": 1.4736238865398765e-09, "loss": 0.0, "num_tokens": 68456082.0, "reward": 0.386551920324564, "reward_std": 0.3178923111408949, "rewards/length_bonus_reward": 0.30096435546875, "rewards/simple_accuracy_reward": 0.125, "rewards/simple_cosine_scaled_reward": -0.07882492616772652, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 6386.8333740234375, "epoch": 0.98, "grad_norm": 0.29551231578609893, "learning_rate": 1.217974870087901e-09, "loss": 0.0, "num_tokens": 68613860.0, "reward": 0.6768747419118881, "reward_std": 0.30196968652307987, "rewards/length_bonus_reward": 0.2780456505715847, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.1309914756566286, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 6074.000244140625, "epoch": 0.982, "grad_norm": 0.5741301401763038, "learning_rate": 9.866357858642205e-10, "loss": 0.0, "num_tokens": 68765456.0, "reward": 0.6753575876355171, "reward_std": 0.6447435468435287, "rewards/length_bonus_reward": 0.3604838028550148, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.04641416296362877, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 5453.125244140625, "epoch": 0.984, "grad_norm": 0.370871152670709, "learning_rate": 7.79617909009489e-10, "loss": 0.0, "num_tokens": 68900897.0, "reward": 0.599255308508873, "reward_std": 0.4469200521707535, "rewards/length_bonus_reward": 0.4123738631606102, "rewards/simple_accuracy_reward": 0.2083333395421505, "rewards/simple_cosine_scaled_reward": -0.04290385078638792, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 6605.7083740234375, "epoch": 0.986, "grad_norm": 0.41180171616646455, "learning_rate": 5.969313292830125e-10, "loss": -0.0, "num_tokens": 69068290.0, "reward": 0.7704456094652414, "reward_std": 0.4267530832439661, "rewards/length_bonus_reward": 0.3352457657456398, "rewards/simple_accuracy_reward": 0.375, "rewards/simple_cosine_scaled_reward": 0.12039967253804207, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 5488.375061035156, "epoch": 0.988, "grad_norm": 0.09411348962797281, "learning_rate": 4.3858495057080836e-10, "loss": 0.0, "num_tokens": 69205153.0, "reward": 0.14003321155905724, "reward_std": 0.11374452896416187, "rewards/length_bonus_reward": 0.2699483223259449, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.2598302438855171, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 6115.958435058594, "epoch": 0.99, "grad_norm": 0.23686457838572572, "learning_rate": 3.0458649045211894e-10, "loss": -0.0, "num_tokens": 69357120.0, "reward": 0.5697316247969866, "reward_std": 0.2670122776180506, "rewards/length_bonus_reward": 0.26483154296875, "rewards/simple_accuracy_reward": 0.2916666679084301, "rewards/simple_cosine_scaled_reward": 0.026466842740774155, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 4919.958557128906, "epoch": 0.992, "grad_norm": 0.34348739993572325, "learning_rate": 1.9494247982282387e-10, "loss": 0.0, "num_tokens": 69480155.0, "reward": 0.9497171714901924, "reward_std": 0.3975832127034664, "rewards/length_bonus_reward": 0.31110636703670025, "rewards/simple_accuracy_reward": 0.4999999962747097, "rewards/simple_cosine_scaled_reward": 0.2772214760771021, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 6697.8751220703125, "epoch": 0.994, "grad_norm": 0.25409798467307015, "learning_rate": 1.0965826257725019e-10, "loss": 0.0, "num_tokens": 69650450.0, "reward": 0.6519821397960186, "reward_std": 0.28294576331973076, "rewards/length_bonus_reward": 0.276947021484375, "rewards/simple_accuracy_reward": 0.3333333358168602, "rewards/simple_cosine_scaled_reward": 0.0834035612642765, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 5030.4168701171875, "epoch": 0.996, "grad_norm": 0.49270880076654927, "learning_rate": 4.873799534788059e-11, "loss": 0.0, "num_tokens": 69776808.0, "reward": 0.6053420826792717, "reward_std": 0.44794849678874016, "rewards/length_bonus_reward": 0.33514404296875, "rewards/simple_accuracy_reward": 0.2499999962747097, "rewards/simple_cosine_scaled_reward": 0.04039607383310795, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 7168.0, "epoch": 0.998, "grad_norm": 0.04722955562309797, "learning_rate": 1.2184647302626582e-11, "loss": 0.0, "num_tokens": 69954036.0, "reward": 0.1623464748263359, "reward_std": 0.06856032041832805, "rewards/length_bonus_reward": 0.25, "rewards/simple_accuracy_reward": 0.0, "rewards/simple_cosine_scaled_reward": -0.17530706524848938, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 5715.75, "epoch": 1.0, "grad_norm": 0.3654388357400278, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 70001430.0, "reward": 0.9678235724568367, "reward_std": 0.4678789768368006, "rewards/length_bonus_reward": 0.349029541015625, "rewards/simple_accuracy_reward": 0.5000000074505806, "rewards/simple_cosine_scaled_reward": 0.23758802097290754, "step": 500 }, { "epoch": 1.0, "step": 500, "total_flos": 0.0, "train_loss": 1.4838452613352704e-08, "train_runtime": 213401.8807, "train_samples_per_second": 0.009, "train_steps_per_second": 0.002 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }