{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 12140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032948929159802307, "grad_norm": 79.82798723206938, "learning_rate": 9.992586490939045e-07, "logits/chosen": 0.48542481660842896, "logits/rejected": 0.42302244901657104, "logps/chosen": -349.20001220703125, "logps/rejected": -313.8999938964844, "loss": 0.6832, "rewards/accuracies": 0.34375, "rewards/chosen": -0.01416626013815403, "rewards/margins": 0.0244903564453125, "rewards/rejected": -0.03866424411535263, "step": 10 }, { "epoch": 0.006589785831960461, "grad_norm": 69.58070353773478, "learning_rate": 9.984349258649094e-07, "logits/chosen": 0.583728015422821, "logits/rejected": 0.4895996153354645, "logps/chosen": -323.3999938964844, "logps/rejected": -298.75, "loss": 0.6413, "rewards/accuracies": 0.59375, "rewards/chosen": -0.16487732529640198, "rewards/margins": 0.167256161570549, "rewards/rejected": -0.33222657442092896, "step": 20 }, { "epoch": 0.009884678747940691, "grad_norm": 93.99072182203672, "learning_rate": 9.976112026359142e-07, "logits/chosen": 0.551220715045929, "logits/rejected": 0.502398669719696, "logps/chosen": -322.3500061035156, "logps/rejected": -300.79998779296875, "loss": 0.6437, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.3476425111293793, "rewards/margins": 0.21305541694164276, "rewards/rejected": -0.561383068561554, "step": 30 }, { "epoch": 0.013179571663920923, "grad_norm": 92.29478500516927, "learning_rate": 9.967874794069192e-07, "logits/chosen": 0.5774780511856079, "logits/rejected": 0.4931640625, "logps/chosen": -345.8999938964844, "logps/rejected": -300.5, "loss": 0.6621, "rewards/accuracies": 0.59375, "rewards/chosen": -0.289306640625, "rewards/margins": 0.21346434950828552, "rewards/rejected": -0.5038086175918579, "step": 40 }, { "epoch": 0.016474464579901153, "grad_norm": 88.02500905523506, "learning_rate": 9.959637561779242e-07, "logits/chosen": 0.642779529094696, "logits/rejected": 0.5755859613418579, "logps/chosen": -360.29998779296875, "logps/rejected": -307.20001220703125, "loss": 0.7407, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2960449159145355, "rewards/margins": 0.05170287936925888, "rewards/rejected": -0.3478027284145355, "step": 50 }, { "epoch": 0.019769357495881382, "grad_norm": 75.30210523660122, "learning_rate": 9.95140032948929e-07, "logits/chosen": 0.623730480670929, "logits/rejected": 0.618481457233429, "logps/chosen": -336.1499938964844, "logps/rejected": -328.29998779296875, "loss": 0.6178, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.04628906399011612, "rewards/margins": 0.29819947481155396, "rewards/rejected": -0.252105712890625, "step": 60 }, { "epoch": 0.023064250411861616, "grad_norm": 85.63330698381948, "learning_rate": 9.94316309719934e-07, "logits/chosen": 0.4390869140625, "logits/rejected": 0.4840331971645355, "logps/chosen": -296.1499938964844, "logps/rejected": -300.82501220703125, "loss": 0.6328, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.06740112602710724, "rewards/margins": 0.2802673280239105, "rewards/rejected": -0.3476623594760895, "step": 70 }, { "epoch": 0.026359143327841845, "grad_norm": 69.49699025572036, "learning_rate": 9.934925864909391e-07, "logits/chosen": 0.4911865293979645, "logits/rejected": 0.4721923768520355, "logps/chosen": -311.25, "logps/rejected": -311.1000061035156, "loss": 0.6051, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10686035454273224, "rewards/margins": 0.4120025634765625, "rewards/rejected": -0.519238293170929, "step": 80 }, { "epoch": 0.029654036243822075, "grad_norm": 81.50252907795367, "learning_rate": 9.92668863261944e-07, "logits/chosen": 0.5025879144668579, "logits/rejected": 0.4652298092842102, "logps/chosen": -363.04998779296875, "logps/rejected": -319.1000061035156, "loss": 0.586, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06522216647863388, "rewards/margins": 0.4526306092739105, "rewards/rejected": -0.5172363519668579, "step": 90 }, { "epoch": 0.032948929159802305, "grad_norm": 87.21430448898457, "learning_rate": 9.918451400329488e-07, "logits/chosen": 0.560839831829071, "logits/rejected": 0.43626707792282104, "logps/chosen": -319.8500061035156, "logps/rejected": -288.20001220703125, "loss": 0.591, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.08500365912914276, "rewards/margins": 0.4297027587890625, "rewards/rejected": -0.344818115234375, "step": 100 }, { "epoch": 0.036243822075782535, "grad_norm": 87.78386736962302, "learning_rate": 9.910214168039538e-07, "logits/chosen": 0.519787609577179, "logits/rejected": 0.43634337186813354, "logps/chosen": -337.3500061035156, "logps/rejected": -299.8500061035156, "loss": 0.5931, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3761230409145355, "rewards/margins": 0.528637707233429, "rewards/rejected": -0.15232543647289276, "step": 110 }, { "epoch": 0.039538714991762765, "grad_norm": 90.47645135088133, "learning_rate": 9.901976935749588e-07, "logits/chosen": 0.49652940034866333, "logits/rejected": 0.3860107362270355, "logps/chosen": -302.4750061035156, "logps/rejected": -294.6499938964844, "loss": 0.6685, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.36230772733688354, "rewards/margins": 0.3122924864292145, "rewards/rejected": 0.04908447340130806, "step": 120 }, { "epoch": 0.042833607907743, "grad_norm": 78.73871772544851, "learning_rate": 9.893739703459636e-07, "logits/chosen": 0.4654541015625, "logits/rejected": 0.3798461854457855, "logps/chosen": -325.82501220703125, "logps/rejected": -306.45001220703125, "loss": 0.6084, "rewards/accuracies": 0.625, "rewards/chosen": 0.6466064453125, "rewards/margins": 0.40382689237594604, "rewards/rejected": 0.2421875, "step": 130 }, { "epoch": 0.04612850082372323, "grad_norm": 111.72884974333387, "learning_rate": 9.885502471169687e-07, "logits/chosen": 0.3345092833042145, "logits/rejected": 0.35247802734375, "logps/chosen": -320.1000061035156, "logps/rejected": -310.1000061035156, "loss": 0.5695, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5425964593887329, "rewards/margins": 0.6009765863418579, "rewards/rejected": -0.05853881686925888, "step": 140 }, { "epoch": 0.04942339373970346, "grad_norm": 154.4420693106514, "learning_rate": 9.877265238879737e-07, "logits/chosen": 0.502490222454071, "logits/rejected": 0.3869384825229645, "logps/chosen": -350.70001220703125, "logps/rejected": -307.6499938964844, "loss": 0.591, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.569531261920929, "rewards/margins": 0.5262451171875, "rewards/rejected": 0.04312743991613388, "step": 150 }, { "epoch": 0.05271828665568369, "grad_norm": 85.71924576281258, "learning_rate": 9.869028006589785e-07, "logits/chosen": 0.4935546815395355, "logits/rejected": 0.4081176817417145, "logps/chosen": -340.2250061035156, "logps/rejected": -306.5, "loss": 0.6157, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4912658631801605, "rewards/margins": 0.47014158964157104, "rewards/rejected": 0.021148681640625, "step": 160 }, { "epoch": 0.05601317957166392, "grad_norm": 100.0485316477698, "learning_rate": 9.860790774299836e-07, "logits/chosen": 0.4760070741176605, "logits/rejected": 0.4016967713832855, "logps/chosen": -362.6499938964844, "logps/rejected": -314.25, "loss": 0.5822, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6664794683456421, "rewards/margins": 0.544964611530304, "rewards/rejected": 0.12069091945886612, "step": 170 }, { "epoch": 0.05930807248764415, "grad_norm": 67.52343671879255, "learning_rate": 9.852553542009884e-07, "logits/chosen": 0.59033203125, "logits/rejected": 0.40869140625, "logps/chosen": -291.75, "logps/rejected": -276.0, "loss": 0.5577, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.787158191204071, "rewards/margins": 0.650317370891571, "rewards/rejected": 0.13720703125, "step": 180 }, { "epoch": 0.06260296540362438, "grad_norm": 108.7549912759886, "learning_rate": 9.844316309719934e-07, "logits/chosen": 0.4718017578125, "logits/rejected": 0.42152100801467896, "logps/chosen": -310.75, "logps/rejected": -306.20001220703125, "loss": 0.6239, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3551879823207855, "rewards/margins": 0.4419189393520355, "rewards/rejected": -0.08673095703125, "step": 190 }, { "epoch": 0.06589785831960461, "grad_norm": 59.427928222835504, "learning_rate": 9.836079077429982e-07, "logits/chosen": 0.578906238079071, "logits/rejected": 0.4795593321323395, "logps/chosen": -359.1499938964844, "logps/rejected": -324.29998779296875, "loss": 0.5193, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.34440916776657104, "rewards/margins": 0.745361328125, "rewards/rejected": -0.4012084901332855, "step": 200 }, { "epoch": 0.06919275123558484, "grad_norm": 89.08407427003111, "learning_rate": 9.827841845140033e-07, "logits/chosen": 0.42144775390625, "logits/rejected": 0.33745115995407104, "logps/chosen": -346.8999938964844, "logps/rejected": -300.8500061035156, "loss": 0.5723, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.02063903771340847, "rewards/margins": 0.623333752155304, "rewards/rejected": -0.602526843547821, "step": 210 }, { "epoch": 0.07248764415156507, "grad_norm": 71.30010525972386, "learning_rate": 9.819604612850083e-07, "logits/chosen": 0.484619140625, "logits/rejected": 0.473388671875, "logps/chosen": -308.04998779296875, "logps/rejected": -308.29998779296875, "loss": 0.5608, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.08465576171875, "rewards/margins": 0.724414050579071, "rewards/rejected": -0.640087902545929, "step": 220 }, { "epoch": 0.0757825370675453, "grad_norm": 92.86466762460051, "learning_rate": 9.811367380560131e-07, "logits/chosen": 0.4952636659145355, "logits/rejected": 0.4126220643520355, "logps/chosen": -353.1000061035156, "logps/rejected": -339.54998779296875, "loss": 0.5532, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.2898925840854645, "rewards/margins": 0.6960815191268921, "rewards/rejected": -0.40607911348342896, "step": 230 }, { "epoch": 0.07907742998352553, "grad_norm": 70.52802613200953, "learning_rate": 9.803130148270181e-07, "logits/chosen": 0.42412108182907104, "logits/rejected": 0.4525390565395355, "logps/chosen": -308.20001220703125, "logps/rejected": -305.25, "loss": 0.6006, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.273284912109375, "rewards/margins": 0.658923327922821, "rewards/rejected": -0.38555908203125, "step": 240 }, { "epoch": 0.08237232289950576, "grad_norm": 73.38026889331708, "learning_rate": 9.794892915980232e-07, "logits/chosen": 0.3894287049770355, "logits/rejected": 0.25033265352249146, "logps/chosen": -302.25, "logps/rejected": -284.0, "loss": 0.5313, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06866455078125, "rewards/margins": 0.8505859375, "rewards/rejected": -0.781542956829071, "step": 250 }, { "epoch": 0.085667215815486, "grad_norm": 86.33791514208109, "learning_rate": 9.78665568369028e-07, "logits/chosen": 0.4732559323310852, "logits/rejected": 0.294677734375, "logps/chosen": -332.8999938964844, "logps/rejected": -319.95001220703125, "loss": 0.5862, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.335174560546875, "rewards/margins": 0.7300781011581421, "rewards/rejected": -0.3959289491176605, "step": 260 }, { "epoch": 0.08896210873146623, "grad_norm": 73.05049444737519, "learning_rate": 9.778418451400328e-07, "logits/chosen": 0.441558837890625, "logits/rejected": 0.26263427734375, "logps/chosen": -306.7250061035156, "logps/rejected": -309.79998779296875, "loss": 0.5349, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.35188180208206177, "rewards/margins": 0.9444335699081421, "rewards/rejected": -0.592517077922821, "step": 270 }, { "epoch": 0.09225700164744646, "grad_norm": 77.76388861153046, "learning_rate": 9.770181219110378e-07, "logits/chosen": 0.4569458067417145, "logits/rejected": 0.26864928007125854, "logps/chosen": -332.79998779296875, "logps/rejected": -345.1000061035156, "loss": 0.5185, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6717284917831421, "rewards/margins": 0.8722900152206421, "rewards/rejected": -0.20068359375, "step": 280 }, { "epoch": 0.09555189456342669, "grad_norm": 75.43925728502843, "learning_rate": 9.761943986820429e-07, "logits/chosen": 0.468994140625, "logits/rejected": 0.28055113554000854, "logps/chosen": -303.29998779296875, "logps/rejected": -290.75, "loss": 0.513, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.7313476800918579, "rewards/margins": 0.818310558795929, "rewards/rejected": -0.08632812649011612, "step": 290 }, { "epoch": 0.09884678747940692, "grad_norm": 126.91703716823127, "learning_rate": 9.753706754530477e-07, "logits/chosen": 0.365234375, "logits/rejected": 0.09965820610523224, "logps/chosen": -306.1499938964844, "logps/rejected": -303.1499938964844, "loss": 0.6272, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.31669312715530396, "rewards/margins": 0.6839599609375, "rewards/rejected": -0.36723631620407104, "step": 300 }, { "epoch": 0.10214168039538715, "grad_norm": 76.58264190273297, "learning_rate": 9.745469522240527e-07, "logits/chosen": 0.4480224549770355, "logits/rejected": 0.31113892793655396, "logps/chosen": -344.3999938964844, "logps/rejected": -297.04998779296875, "loss": 0.5121, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.8077934384346008, "rewards/margins": 0.8919677734375, "rewards/rejected": -0.08388672024011612, "step": 310 }, { "epoch": 0.10543657331136738, "grad_norm": 92.66993566553238, "learning_rate": 9.737232289950577e-07, "logits/chosen": 0.4165283143520355, "logits/rejected": 0.2827514708042145, "logps/chosen": -331.6499938964844, "logps/rejected": -310.25, "loss": 0.533, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.8047851324081421, "rewards/margins": 0.931323230266571, "rewards/rejected": -0.12606200575828552, "step": 320 }, { "epoch": 0.10873146622734761, "grad_norm": 65.65410942585409, "learning_rate": 9.728995057660626e-07, "logits/chosen": 0.26713865995407104, "logits/rejected": 0.17235717177391052, "logps/chosen": -404.75, "logps/rejected": -354.20001220703125, "loss": 0.5742, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4439453184604645, "rewards/margins": 0.7660156488418579, "rewards/rejected": -0.3230651915073395, "step": 330 }, { "epoch": 0.11202635914332784, "grad_norm": 94.48782923319743, "learning_rate": 9.720757825370674e-07, "logits/chosen": 0.28741455078125, "logits/rejected": 0.2512664794921875, "logps/chosen": -345.95001220703125, "logps/rejected": -298.70001220703125, "loss": 0.6041, "rewards/accuracies": 0.6875, "rewards/chosen": 0.454345703125, "rewards/margins": 0.8409973382949829, "rewards/rejected": -0.3866333067417145, "step": 340 }, { "epoch": 0.11532125205930807, "grad_norm": 82.86450917223961, "learning_rate": 9.712520593080724e-07, "logits/chosen": 0.3732971251010895, "logits/rejected": 0.34648436307907104, "logps/chosen": -329.375, "logps/rejected": -300.1499938964844, "loss": 0.5235, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3381385803222656, "rewards/margins": 0.883715808391571, "rewards/rejected": -0.5452026128768921, "step": 350 }, { "epoch": 0.1186161449752883, "grad_norm": 62.687033401461285, "learning_rate": 9.704283360790774e-07, "logits/chosen": 0.37182655930519104, "logits/rejected": 0.3161254823207855, "logps/chosen": -350.8999938964844, "logps/rejected": -285.70001220703125, "loss": 0.5471, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17962646484375, "rewards/margins": 0.7940673828125, "rewards/rejected": -0.613543689250946, "step": 360 }, { "epoch": 0.12191103789126853, "grad_norm": 97.64543411242967, "learning_rate": 9.696046128500823e-07, "logits/chosen": 0.28059083223342896, "logits/rejected": 0.2744247317314148, "logps/chosen": -297.8500061035156, "logps/rejected": -330.54998779296875, "loss": 0.6508, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.2039794921875, "rewards/margins": 0.714892566204071, "rewards/rejected": -0.511523425579071, "step": 370 }, { "epoch": 0.12520593080724876, "grad_norm": 64.47013065944766, "learning_rate": 9.687808896210873e-07, "logits/chosen": 0.3805786073207855, "logits/rejected": 0.17166748642921448, "logps/chosen": -340.6499938964844, "logps/rejected": -305.6499938964844, "loss": 0.4978, "rewards/accuracies": 0.78125, "rewards/chosen": 0.31431883573532104, "rewards/margins": 0.9708007574081421, "rewards/rejected": -0.6572021245956421, "step": 380 }, { "epoch": 0.128500823723229, "grad_norm": 79.158700583081, "learning_rate": 9.679571663920923e-07, "logits/chosen": 0.28583985567092896, "logits/rejected": 0.22131653130054474, "logps/chosen": -310.8999938964844, "logps/rejected": -295.3999938964844, "loss": 0.642, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.384857177734375, "rewards/margins": 0.6392822265625, "rewards/rejected": -0.2542480528354645, "step": 390 }, { "epoch": 0.13179571663920922, "grad_norm": 69.55282717038605, "learning_rate": 9.671334431630971e-07, "logits/chosen": 0.3690734803676605, "logits/rejected": 0.2624053955078125, "logps/chosen": -342.20001220703125, "logps/rejected": -309.3500061035156, "loss": 0.5704, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7783203125, "rewards/margins": 0.706787109375, "rewards/rejected": 0.07215575873851776, "step": 400 }, { "epoch": 0.13509060955518945, "grad_norm": 72.1858039886783, "learning_rate": 9.663097199341022e-07, "logits/chosen": 0.36674803495407104, "logits/rejected": 0.2531982362270355, "logps/chosen": -331.8999938964844, "logps/rejected": -300.1499938964844, "loss": 0.4728, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.7454589605331421, "rewards/margins": 1.0399901866912842, "rewards/rejected": -0.29533690214157104, "step": 410 }, { "epoch": 0.13838550247116968, "grad_norm": 41.9228131853377, "learning_rate": 9.65485996705107e-07, "logits/chosen": 0.30638426542282104, "logits/rejected": 0.15997619926929474, "logps/chosen": -360.8999938964844, "logps/rejected": -320.6499938964844, "loss": 0.4357, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.469970703125, "rewards/margins": 1.2327148914337158, "rewards/rejected": -0.763427734375, "step": 420 }, { "epoch": 0.1416803953871499, "grad_norm": 91.5557081917999, "learning_rate": 9.64662273476112e-07, "logits/chosen": 0.4604034423828125, "logits/rejected": 0.3447814881801605, "logps/chosen": -333.1499938964844, "logps/rejected": -347.79998779296875, "loss": 0.62, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.2174835205078125, "rewards/margins": 0.9893554449081421, "rewards/rejected": -0.7729247808456421, "step": 430 }, { "epoch": 0.14497528830313014, "grad_norm": 81.71493519917499, "learning_rate": 9.638385502471168e-07, "logits/chosen": 0.28846436738967896, "logits/rejected": 0.19417114555835724, "logps/chosen": -299.375, "logps/rejected": -278.04998779296875, "loss": 0.6222, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03385009616613388, "rewards/margins": 0.936230480670929, "rewards/rejected": -0.9022461175918579, "step": 440 }, { "epoch": 0.14827018121911037, "grad_norm": 80.60590510155299, "learning_rate": 9.630148270181219e-07, "logits/chosen": 0.37889403104782104, "logits/rejected": 0.2896728515625, "logps/chosen": -327.1000061035156, "logps/rejected": -333.95001220703125, "loss": 0.6206, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.28594666719436646, "rewards/margins": 0.7520996332168579, "rewards/rejected": -0.4659667909145355, "step": 450 }, { "epoch": 0.1515650741350906, "grad_norm": 88.53545252993416, "learning_rate": 9.62191103789127e-07, "logits/chosen": 0.4272216856479645, "logits/rejected": 0.3598876893520355, "logps/chosen": -308.0249938964844, "logps/rejected": -282.79998779296875, "loss": 0.5138, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4689575135707855, "rewards/margins": 0.910400390625, "rewards/rejected": -0.44035643339157104, "step": 460 }, { "epoch": 0.15485996705107083, "grad_norm": 107.31233111848069, "learning_rate": 9.613673805601317e-07, "logits/chosen": 0.45708006620407104, "logits/rejected": 0.3986450135707855, "logps/chosen": -319.57501220703125, "logps/rejected": -291.5, "loss": 0.4892, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.764514148235321, "rewards/margins": 1.0881226062774658, "rewards/rejected": -0.32279205322265625, "step": 470 }, { "epoch": 0.15815485996705106, "grad_norm": 69.28579425947922, "learning_rate": 9.605436573311368e-07, "logits/chosen": 0.49903565645217896, "logits/rejected": 0.4414428770542145, "logps/chosen": -302.3500061035156, "logps/rejected": -311.3500061035156, "loss": 0.5347, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.776538074016571, "rewards/margins": 1.067285180091858, "rewards/rejected": -0.2903076112270355, "step": 480 }, { "epoch": 0.1614497528830313, "grad_norm": 90.86045717505917, "learning_rate": 9.597199341021416e-07, "logits/chosen": 0.3804687559604645, "logits/rejected": 0.2865844666957855, "logps/chosen": -319.29998779296875, "logps/rejected": -311.95001220703125, "loss": 0.573, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.75830078125, "rewards/margins": 0.928875744342804, "rewards/rejected": -0.1707763671875, "step": 490 }, { "epoch": 0.16474464579901152, "grad_norm": 84.34851986017138, "learning_rate": 9.588962108731466e-07, "logits/chosen": 0.265869140625, "logits/rejected": 0.08668212592601776, "logps/chosen": -321.13751220703125, "logps/rejected": -315.0, "loss": 0.5159, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.5106567144393921, "rewards/margins": 1.2899291515350342, "rewards/rejected": -0.7787109613418579, "step": 500 }, { "epoch": 0.16803953871499178, "grad_norm": 66.7070596740813, "learning_rate": 9.580724876441514e-07, "logits/chosen": 0.235401913523674, "logits/rejected": 0.0606689453125, "logps/chosen": -316.1000061035156, "logps/rejected": -297.3999938964844, "loss": 0.5632, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.41120606660842896, "rewards/margins": 0.9374023675918579, "rewards/rejected": -0.5254455804824829, "step": 510 }, { "epoch": 0.171334431630972, "grad_norm": 62.524186039369646, "learning_rate": 9.572487644151565e-07, "logits/chosen": 0.34227293729782104, "logits/rejected": 0.304891973733902, "logps/chosen": -349.29998779296875, "logps/rejected": -309.8999938964844, "loss": 0.6216, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.9341796636581421, "rewards/margins": 0.777026355266571, "rewards/rejected": 0.15764160454273224, "step": 520 }, { "epoch": 0.17462932454695224, "grad_norm": 65.41402937735955, "learning_rate": 9.564250411861615e-07, "logits/chosen": 0.40996092557907104, "logits/rejected": 0.2829956114292145, "logps/chosen": -329.29998779296875, "logps/rejected": -312.3500061035156, "loss": 0.5823, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 1.1394531726837158, "rewards/margins": 0.8077148199081421, "rewards/rejected": 0.33184814453125, "step": 530 }, { "epoch": 0.17792421746293247, "grad_norm": 74.30441247384246, "learning_rate": 9.556013179571663e-07, "logits/chosen": 0.416015625, "logits/rejected": 0.38300782442092896, "logps/chosen": -295.3999938964844, "logps/rejected": -305.95001220703125, "loss": 0.5711, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 1.1608397960662842, "rewards/margins": 0.94384765625, "rewards/rejected": 0.21721191704273224, "step": 540 }, { "epoch": 0.1812191103789127, "grad_norm": 61.113211020860426, "learning_rate": 9.547775947281713e-07, "logits/chosen": 0.5682128667831421, "logits/rejected": 0.40113526582717896, "logps/chosen": -294.20001220703125, "logps/rejected": -293.8999938964844, "loss": 0.5582, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9608154296875, "rewards/margins": 1.0480468273162842, "rewards/rejected": -0.08632812649011612, "step": 550 }, { "epoch": 0.18451400329489293, "grad_norm": 95.15486788007685, "learning_rate": 9.539538714991764e-07, "logits/chosen": 0.4859375059604645, "logits/rejected": 0.35716551542282104, "logps/chosen": -346.70001220703125, "logps/rejected": -348.95001220703125, "loss": 0.4671, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.7884277105331421, "rewards/margins": 1.220458984375, "rewards/rejected": -0.4330810606479645, "step": 560 }, { "epoch": 0.18780889621087316, "grad_norm": 97.99443335237804, "learning_rate": 9.531301482701811e-07, "logits/chosen": 0.4699157774448395, "logits/rejected": 0.36726075410842896, "logps/chosen": -322.5249938964844, "logps/rejected": -337.0, "loss": 0.5509, "rewards/accuracies": 0.71875, "rewards/chosen": 0.40620118379592896, "rewards/margins": 1.0316894054412842, "rewards/rejected": -0.6258880496025085, "step": 570 }, { "epoch": 0.19110378912685339, "grad_norm": 86.35472007437161, "learning_rate": 9.523064250411861e-07, "logits/chosen": 0.4000244140625, "logits/rejected": 0.3962417542934418, "logps/chosen": -337.54998779296875, "logps/rejected": -317.5, "loss": 0.5651, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4275878965854645, "rewards/margins": 1.1104004383087158, "rewards/rejected": -0.682666003704071, "step": 580 }, { "epoch": 0.19439868204283361, "grad_norm": 39.135067813389504, "learning_rate": 9.51482701812191e-07, "logits/chosen": 0.3795715272426605, "logits/rejected": 0.3142639100551605, "logps/chosen": -337.5, "logps/rejected": -318.95001220703125, "loss": 0.5866, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.36307984590530396, "rewards/margins": 0.998095691204071, "rewards/rejected": -0.635498046875, "step": 590 }, { "epoch": 0.19769357495881384, "grad_norm": 69.86623082538497, "learning_rate": 9.50658978583196e-07, "logits/chosen": 0.4887451231479645, "logits/rejected": 0.45478516817092896, "logps/chosen": -317.125, "logps/rejected": -306.75, "loss": 0.4714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6434326171875, "rewards/margins": 1.092041015625, "rewards/rejected": -0.44953614473342896, "step": 600 }, { "epoch": 0.20098846787479407, "grad_norm": 75.35676665025922, "learning_rate": 9.49835255354201e-07, "logits/chosen": 0.49809569120407104, "logits/rejected": 0.3509582579135895, "logps/chosen": -341.95001220703125, "logps/rejected": -299.54998779296875, "loss": 0.489, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.793164074420929, "rewards/margins": 1.1057617664337158, "rewards/rejected": -0.31239014863967896, "step": 610 }, { "epoch": 0.2042833607907743, "grad_norm": 91.35385298930471, "learning_rate": 9.490115321252059e-07, "logits/chosen": 0.4243408143520355, "logits/rejected": 0.3503784239292145, "logps/chosen": -307.8500061035156, "logps/rejected": -339.42498779296875, "loss": 0.5499, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6379638910293579, "rewards/margins": 1.0324218273162842, "rewards/rejected": -0.39509278535842896, "step": 620 }, { "epoch": 0.20757825370675453, "grad_norm": 90.35227775744855, "learning_rate": 9.481878088962108e-07, "logits/chosen": 0.38749998807907104, "logits/rejected": 0.24461059272289276, "logps/chosen": -338.95001220703125, "logps/rejected": -302.54998779296875, "loss": 0.6485, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.44501954317092896, "rewards/margins": 0.816113293170929, "rewards/rejected": -0.37060546875, "step": 630 }, { "epoch": 0.21087314662273476, "grad_norm": 82.19950945982987, "learning_rate": 9.473640856672158e-07, "logits/chosen": 0.3841919004917145, "logits/rejected": 0.3309570252895355, "logps/chosen": -315.875, "logps/rejected": -315.1499938964844, "loss": 0.5826, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7072998285293579, "rewards/margins": 0.895556628704071, "rewards/rejected": -0.18946227431297302, "step": 640 }, { "epoch": 0.214168039538715, "grad_norm": 69.67693164073742, "learning_rate": 9.465403624382207e-07, "logits/chosen": 0.4371337890625, "logits/rejected": 0.3116210997104645, "logps/chosen": -292.45001220703125, "logps/rejected": -296.45001220703125, "loss": 0.5213, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.7756103277206421, "rewards/margins": 1.060205101966858, "rewards/rejected": -0.2842468321323395, "step": 650 }, { "epoch": 0.21746293245469522, "grad_norm": 63.430374307593624, "learning_rate": 9.457166392092256e-07, "logits/chosen": 0.39027708768844604, "logits/rejected": 0.30493468046188354, "logps/chosen": -304.3999938964844, "logps/rejected": -295.1000061035156, "loss": 0.5291, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.549511730670929, "rewards/margins": 0.8910156488418579, "rewards/rejected": -0.3421691954135895, "step": 660 }, { "epoch": 0.22075782537067545, "grad_norm": 68.30006831115556, "learning_rate": 9.448929159802305e-07, "logits/chosen": 0.33940428495407104, "logits/rejected": 0.32845765352249146, "logps/chosen": -318.54998779296875, "logps/rejected": -295.95001220703125, "loss": 0.5944, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.4856201112270355, "rewards/margins": 0.8628295660018921, "rewards/rejected": -0.3766235411167145, "step": 670 }, { "epoch": 0.22405271828665568, "grad_norm": 82.72210371306538, "learning_rate": 9.440691927512356e-07, "logits/chosen": 0.3480468690395355, "logits/rejected": 0.28638917207717896, "logps/chosen": -311.3999938964844, "logps/rejected": -317.6499938964844, "loss": 0.5137, "rewards/accuracies": 0.6875, "rewards/chosen": 0.67138671875, "rewards/margins": 1.055688500404358, "rewards/rejected": -0.3845275938510895, "step": 680 }, { "epoch": 0.2273476112026359, "grad_norm": 51.41481251183416, "learning_rate": 9.432454695222405e-07, "logits/chosen": 0.3772949278354645, "logits/rejected": 0.2061767578125, "logps/chosen": -281.3500061035156, "logps/rejected": -308.3500061035156, "loss": 0.5495, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.587506115436554, "rewards/margins": 1.1298828125, "rewards/rejected": -0.5418335199356079, "step": 690 }, { "epoch": 0.23064250411861614, "grad_norm": 62.48602563317984, "learning_rate": 9.424217462932454e-07, "logits/chosen": 0.32110595703125, "logits/rejected": 0.23623046278953552, "logps/chosen": -281.20001220703125, "logps/rejected": -310.5, "loss": 0.5615, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4318130612373352, "rewards/margins": 1.133203148841858, "rewards/rejected": -0.701977550983429, "step": 700 }, { "epoch": 0.23393739703459637, "grad_norm": 46.44378339521754, "learning_rate": 9.415980230642504e-07, "logits/chosen": 0.4119506776332855, "logits/rejected": 0.37965697050094604, "logps/chosen": -326.70001220703125, "logps/rejected": -291.17498779296875, "loss": 0.5718, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.552966296672821, "rewards/margins": 1.0740478038787842, "rewards/rejected": -0.5212036371231079, "step": 710 }, { "epoch": 0.2372322899505766, "grad_norm": 75.53113107570186, "learning_rate": 9.407742998352554e-07, "logits/chosen": 0.49515992403030396, "logits/rejected": 0.4088378846645355, "logps/chosen": -364.79998779296875, "logps/rejected": -313.1499938964844, "loss": 0.532, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6323608160018921, "rewards/margins": 1.0304687023162842, "rewards/rejected": -0.39808350801467896, "step": 720 }, { "epoch": 0.24052718286655683, "grad_norm": 69.34519416232322, "learning_rate": 9.399505766062602e-07, "logits/chosen": 0.3895507752895355, "logits/rejected": 0.37303465604782104, "logps/chosen": -317.25, "logps/rejected": -289.04998779296875, "loss": 0.6015, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.4892211854457855, "rewards/margins": 0.741748034954071, "rewards/rejected": -0.25249022245407104, "step": 730 }, { "epoch": 0.24382207578253706, "grad_norm": 71.96763041667525, "learning_rate": 9.391268533772651e-07, "logits/chosen": 0.44337767362594604, "logits/rejected": 0.3846069276332855, "logps/chosen": -316.8500061035156, "logps/rejected": -288.79998779296875, "loss": 0.5651, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.588305652141571, "rewards/margins": 0.8441162109375, "rewards/rejected": -0.2557128965854645, "step": 740 }, { "epoch": 0.2471169686985173, "grad_norm": 57.23190466002365, "learning_rate": 9.383031301482702e-07, "logits/chosen": 0.3619140684604645, "logits/rejected": 0.218505859375, "logps/chosen": -352.95001220703125, "logps/rejected": -332.25, "loss": 0.5051, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.4599609375, "rewards/margins": 0.9164062738418579, "rewards/rejected": -0.45672607421875, "step": 750 }, { "epoch": 0.2504118616144975, "grad_norm": 94.38388670247738, "learning_rate": 9.374794069192751e-07, "logits/chosen": 0.252288818359375, "logits/rejected": 0.12288818508386612, "logps/chosen": -298.1000061035156, "logps/rejected": -293.8500061035156, "loss": 0.5445, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.32982176542282104, "rewards/margins": 1.047607421875, "rewards/rejected": -0.7175995111465454, "step": 760 }, { "epoch": 0.25370675453047775, "grad_norm": 76.53426880899359, "learning_rate": 9.3665568369028e-07, "logits/chosen": 0.35832518339157104, "logits/rejected": 0.26861876249313354, "logps/chosen": -337.8999938964844, "logps/rejected": -313.25, "loss": 0.481, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.30763548612594604, "rewards/margins": 1.15625, "rewards/rejected": -0.848773181438446, "step": 770 }, { "epoch": 0.257001647446458, "grad_norm": 96.68399279943404, "learning_rate": 9.35831960461285e-07, "logits/chosen": 0.43328857421875, "logits/rejected": 0.22647705674171448, "logps/chosen": -370.6000061035156, "logps/rejected": -359.25, "loss": 0.5196, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.04587402194738388, "rewards/margins": 1.3884766101837158, "rewards/rejected": -1.341699242591858, "step": 780 }, { "epoch": 0.2602965403624382, "grad_norm": 51.18578708341709, "learning_rate": 9.3500823723229e-07, "logits/chosen": 0.306771844625473, "logits/rejected": 0.18918457627296448, "logps/chosen": -356.6499938964844, "logps/rejected": -318.29998779296875, "loss": 0.5056, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3742431700229645, "rewards/margins": 1.177734375, "rewards/rejected": -0.80419921875, "step": 790 }, { "epoch": 0.26359143327841844, "grad_norm": 67.29943210892671, "learning_rate": 9.341845140032949e-07, "logits/chosen": 0.3370605409145355, "logits/rejected": 0.28424835205078125, "logps/chosen": -315.04998779296875, "logps/rejected": -312.6000061035156, "loss": 0.5311, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4626098573207855, "rewards/margins": 0.94873046875, "rewards/rejected": -0.48540037870407104, "step": 800 }, { "epoch": 0.26688632619439867, "grad_norm": 75.76010774344776, "learning_rate": 9.333607907742997e-07, "logits/chosen": 0.2525634765625, "logits/rejected": 0.1981201171875, "logps/chosen": -345.5, "logps/rejected": -345.8999938964844, "loss": 0.4864, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.7721923589706421, "rewards/margins": 1.262304663658142, "rewards/rejected": -0.4901367127895355, "step": 810 }, { "epoch": 0.2701812191103789, "grad_norm": 96.78097235786448, "learning_rate": 9.325370675453047e-07, "logits/chosen": 0.327789306640625, "logits/rejected": 0.24588623642921448, "logps/chosen": -338.3999938964844, "logps/rejected": -334.3500061035156, "loss": 0.5478, "rewards/accuracies": 0.6875, "rewards/chosen": 0.628222644329071, "rewards/margins": 1.0828125476837158, "rewards/rejected": -0.4547973573207855, "step": 820 }, { "epoch": 0.27347611202635913, "grad_norm": 64.83908624103529, "learning_rate": 9.317133443163097e-07, "logits/chosen": 0.23132935166358948, "logits/rejected": 0.17145995795726776, "logps/chosen": -308.29998779296875, "logps/rejected": -320.0, "loss": 0.5241, "rewards/accuracies": 0.75, "rewards/chosen": 0.29450684785842896, "rewards/margins": 1.0076172351837158, "rewards/rejected": -0.713208019733429, "step": 830 }, { "epoch": 0.27677100494233936, "grad_norm": 85.03845229627136, "learning_rate": 9.308896210873146e-07, "logits/chosen": 0.2330641746520996, "logits/rejected": 0.14672470092773438, "logps/chosen": -325.8999938964844, "logps/rejected": -325.29998779296875, "loss": 0.5477, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3542846739292145, "rewards/margins": 1.1135132312774658, "rewards/rejected": -1.467626929283142, "step": 840 }, { "epoch": 0.2800658978583196, "grad_norm": 90.98285230959351, "learning_rate": 9.300658978583196e-07, "logits/chosen": 0.18002930283546448, "logits/rejected": 0.18264159560203552, "logps/chosen": -336.0, "logps/rejected": -309.3500061035156, "loss": 0.5119, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.48087769746780396, "rewards/margins": 1.297460913658142, "rewards/rejected": -1.779687523841858, "step": 850 }, { "epoch": 0.2833607907742998, "grad_norm": 95.51168628487703, "learning_rate": 9.292421746293245e-07, "logits/chosen": 0.18294677138328552, "logits/rejected": 0.0629173293709755, "logps/chosen": -364.70001220703125, "logps/rejected": -356.1499938964844, "loss": 0.47, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.10072632133960724, "rewards/margins": 1.333288550376892, "rewards/rejected": -1.434423804283142, "step": 860 }, { "epoch": 0.28665568369028005, "grad_norm": 86.31776129695466, "learning_rate": 9.284184514003295e-07, "logits/chosen": 0.279397577047348, "logits/rejected": 0.21329346299171448, "logps/chosen": -397.6499938964844, "logps/rejected": -371.04998779296875, "loss": 0.4808, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.21240845322608948, "rewards/margins": 1.2189452648162842, "rewards/rejected": -1.4304687976837158, "step": 870 }, { "epoch": 0.2899505766062603, "grad_norm": 73.6358283758832, "learning_rate": 9.275947281713344e-07, "logits/chosen": 0.367919921875, "logits/rejected": 0.15497437119483948, "logps/chosen": -315.6499938964844, "logps/rejected": -302.6499938964844, "loss": 0.5928, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.23782959580421448, "rewards/margins": 0.9922851324081421, "rewards/rejected": -1.231347680091858, "step": 880 }, { "epoch": 0.2932454695222405, "grad_norm": 94.44295896460889, "learning_rate": 9.267710049423393e-07, "logits/chosen": 0.2821716368198395, "logits/rejected": 0.1756591796875, "logps/chosen": -281.9750061035156, "logps/rejected": -270.6000061035156, "loss": 0.6382, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.17470702528953552, "rewards/margins": 1.0238769054412842, "rewards/rejected": -0.8487304449081421, "step": 890 }, { "epoch": 0.29654036243822074, "grad_norm": 75.58361688503504, "learning_rate": 9.259472817133442e-07, "logits/chosen": 0.2879653871059418, "logits/rejected": 0.01409759558737278, "logps/chosen": -393.45001220703125, "logps/rejected": -352.20001220703125, "loss": 0.455, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.6849365234375, "rewards/margins": 1.2383301258087158, "rewards/rejected": -0.5533447265625, "step": 900 }, { "epoch": 0.29983525535420097, "grad_norm": 75.31628283931363, "learning_rate": 9.251235584843492e-07, "logits/chosen": 0.4173583984375, "logits/rejected": 0.235687255859375, "logps/chosen": -353.8500061035156, "logps/rejected": -298.6000061035156, "loss": 0.5584, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3875488340854645, "rewards/margins": 1.0297973155975342, "rewards/rejected": -0.6412109136581421, "step": 910 }, { "epoch": 0.3031301482701812, "grad_norm": 56.82132490586957, "learning_rate": 9.242998352553542e-07, "logits/chosen": 0.3359008729457855, "logits/rejected": 0.15568847954273224, "logps/chosen": -328.6499938964844, "logps/rejected": -312.45001220703125, "loss": 0.4789, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.575451672077179, "rewards/margins": 1.260986328125, "rewards/rejected": -0.686450183391571, "step": 920 }, { "epoch": 0.30642504118616143, "grad_norm": 71.04218041391663, "learning_rate": 9.234761120263591e-07, "logits/chosen": 0.33085936307907104, "logits/rejected": 0.22769775986671448, "logps/chosen": -339.3999938964844, "logps/rejected": -345.20001220703125, "loss": 0.4971, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.634204089641571, "rewards/margins": 1.4111328125, "rewards/rejected": -0.776782214641571, "step": 930 }, { "epoch": 0.30971993410214166, "grad_norm": 136.51840257387485, "learning_rate": 9.22652388797364e-07, "logits/chosen": 0.2823287844657898, "logits/rejected": 0.236521914601326, "logps/chosen": -367.5, "logps/rejected": -349.1000061035156, "loss": 0.5365, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4603515565395355, "rewards/margins": 1.13330078125, "rewards/rejected": -0.672924816608429, "step": 940 }, { "epoch": 0.3130148270181219, "grad_norm": 49.36406664181084, "learning_rate": 9.21828665568369e-07, "logits/chosen": 0.33665770292282104, "logits/rejected": 0.21030274033546448, "logps/chosen": -344.125, "logps/rejected": -333.20001220703125, "loss": 0.6055, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.3988098204135895, "rewards/margins": 1.0692138671875, "rewards/rejected": -0.671093761920929, "step": 950 }, { "epoch": 0.3163097199341021, "grad_norm": 94.16981726565966, "learning_rate": 9.21004942339374e-07, "logits/chosen": 0.38066405057907104, "logits/rejected": 0.37468260526657104, "logps/chosen": -325.95001220703125, "logps/rejected": -307.75, "loss": 0.5367, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2846923768520355, "rewards/margins": 0.9864257574081421, "rewards/rejected": -0.7023559808731079, "step": 960 }, { "epoch": 0.31960461285008235, "grad_norm": 59.191149142642786, "learning_rate": 9.201812191103788e-07, "logits/chosen": 0.34730833768844604, "logits/rejected": 0.24004516005516052, "logps/chosen": -320.3500061035156, "logps/rejected": -306.8999938964844, "loss": 0.4815, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.28614503145217896, "rewards/margins": 1.196752905845642, "rewards/rejected": -0.9111572504043579, "step": 970 }, { "epoch": 0.3228995057660626, "grad_norm": 80.49101123352368, "learning_rate": 9.193574958813837e-07, "logits/chosen": 0.4377807676792145, "logits/rejected": 0.23113402724266052, "logps/chosen": -316.04998779296875, "logps/rejected": -308.29998779296875, "loss": 0.5126, "rewards/accuracies": 0.71875, "rewards/chosen": 0.590869128704071, "rewards/margins": 1.264306664466858, "rewards/rejected": -0.6736816167831421, "step": 980 }, { "epoch": 0.3261943986820428, "grad_norm": 62.085177218293126, "learning_rate": 9.185337726523888e-07, "logits/chosen": 0.41864013671875, "logits/rejected": 0.36046600341796875, "logps/chosen": -347.54998779296875, "logps/rejected": -335.3999938964844, "loss": 0.6288, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.17043456435203552, "rewards/margins": 0.9693359136581421, "rewards/rejected": -0.7982422113418579, "step": 990 }, { "epoch": 0.32948929159802304, "grad_norm": 50.10411643725113, "learning_rate": 9.177100494233937e-07, "logits/chosen": 0.4230712950229645, "logits/rejected": 0.2791198790073395, "logps/chosen": -316.17498779296875, "logps/rejected": -315.20001220703125, "loss": 0.5309, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.48920899629592896, "rewards/margins": 1.07421875, "rewards/rejected": -0.584643542766571, "step": 1000 }, { "epoch": 0.33278418451400327, "grad_norm": 38.69540595277244, "learning_rate": 9.168863261943986e-07, "logits/chosen": 0.34144288301467896, "logits/rejected": 0.3138671815395355, "logps/chosen": -357.04998779296875, "logps/rejected": -307.6000061035156, "loss": 0.5518, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.4128662049770355, "rewards/margins": 0.959179699420929, "rewards/rejected": -0.5463622808456421, "step": 1010 }, { "epoch": 0.33607907742998355, "grad_norm": 80.48513179665672, "learning_rate": 9.160626029654037e-07, "logits/chosen": 0.29560548067092896, "logits/rejected": 0.22395019233226776, "logps/chosen": -365.54998779296875, "logps/rejected": -336.95001220703125, "loss": 0.6024, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.25166624784469604, "rewards/margins": 0.8348144292831421, "rewards/rejected": -0.5827881097793579, "step": 1020 }, { "epoch": 0.3393739703459638, "grad_norm": 89.1347084894479, "learning_rate": 9.152388797364086e-07, "logits/chosen": 0.3163085877895355, "logits/rejected": 0.18386688828468323, "logps/chosen": -303.6499938964844, "logps/rejected": -277.3500061035156, "loss": 0.6469, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.24575194716453552, "rewards/margins": 0.713134765625, "rewards/rejected": -0.4675048887729645, "step": 1030 }, { "epoch": 0.342668863261944, "grad_norm": 75.45858245193969, "learning_rate": 9.144151565074135e-07, "logits/chosen": 0.2662506103515625, "logits/rejected": 0.05948181077837944, "logps/chosen": -334.79998779296875, "logps/rejected": -315.25, "loss": 0.4865, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.2732788026332855, "rewards/margins": 1.062890648841858, "rewards/rejected": -0.789355456829071, "step": 1040 }, { "epoch": 0.34596375617792424, "grad_norm": 57.542177516958, "learning_rate": 9.135914332784183e-07, "logits/chosen": 0.18818359076976776, "logits/rejected": 0.0186920166015625, "logps/chosen": -341.95001220703125, "logps/rejected": -359.25, "loss": 0.5949, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10939941555261612, "rewards/margins": 0.8434692621231079, "rewards/rejected": -0.951831042766571, "step": 1050 }, { "epoch": 0.34925864909390447, "grad_norm": 65.95162844957736, "learning_rate": 9.127677100494234e-07, "logits/chosen": 0.2563720643520355, "logits/rejected": 0.04230957105755806, "logps/chosen": -317.1000061035156, "logps/rejected": -310.3999938964844, "loss": 0.5598, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.22700805962085724, "rewards/margins": 1.155371069908142, "rewards/rejected": -1.380273461341858, "step": 1060 }, { "epoch": 0.3525535420098847, "grad_norm": 76.18048039726133, "learning_rate": 9.119439868204283e-07, "logits/chosen": 0.16095733642578125, "logits/rejected": 0.01203002966940403, "logps/chosen": -309.79998779296875, "logps/rejected": -334.3999938964844, "loss": 0.612, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.13756103813648224, "rewards/margins": 1.1559569835662842, "rewards/rejected": -1.292578101158142, "step": 1070 }, { "epoch": 0.35584843492586493, "grad_norm": 92.26643239553219, "learning_rate": 9.111202635914332e-07, "logits/chosen": 0.13077697157859802, "logits/rejected": -0.0025054931174963713, "logps/chosen": -311.95001220703125, "logps/rejected": -344.70001220703125, "loss": 0.4678, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1768798828125, "rewards/margins": 1.3147094249725342, "rewards/rejected": -1.137109398841858, "step": 1080 }, { "epoch": 0.35914332784184516, "grad_norm": 64.55271917956375, "learning_rate": 9.102965403624382e-07, "logits/chosen": 0.09535522758960724, "logits/rejected": 0.10384521633386612, "logps/chosen": -316.6499938964844, "logps/rejected": -326.1499938964844, "loss": 0.5039, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4090332090854645, "rewards/margins": 1.045654296875, "rewards/rejected": -0.636492908000946, "step": 1090 }, { "epoch": 0.3624382207578254, "grad_norm": 54.16977082562094, "learning_rate": 9.094728171334432e-07, "logits/chosen": 0.137553408741951, "logits/rejected": 0.04032592847943306, "logps/chosen": -323.8999938964844, "logps/rejected": -303.75, "loss": 0.5292, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.27916258573532104, "rewards/margins": 1.103491187095642, "rewards/rejected": -0.8242431879043579, "step": 1100 }, { "epoch": 0.3657331136738056, "grad_norm": 89.32370711755355, "learning_rate": 9.086490939044481e-07, "logits/chosen": 0.0727691650390625, "logits/rejected": 0.09251251071691513, "logps/chosen": -328.75, "logps/rejected": -319.3500061035156, "loss": 0.5113, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.2686401307582855, "rewards/margins": 1.327246069908142, "rewards/rejected": -1.058251976966858, "step": 1110 }, { "epoch": 0.36902800658978585, "grad_norm": 99.67547219959754, "learning_rate": 9.07825370675453e-07, "logits/chosen": 0.11506347358226776, "logits/rejected": 0.010516357608139515, "logps/chosen": -302.1499938964844, "logps/rejected": -278.79998779296875, "loss": 0.5957, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.01357421837747097, "rewards/margins": 0.9873291254043579, "rewards/rejected": -0.9739929437637329, "step": 1120 }, { "epoch": 0.3723228995057661, "grad_norm": 65.13845967685755, "learning_rate": 9.070016474464579e-07, "logits/chosen": 0.20585937798023224, "logits/rejected": -0.013916015625, "logps/chosen": -320.5, "logps/rejected": -296.1000061035156, "loss": 0.5561, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.23078612983226776, "rewards/margins": 0.974804699420929, "rewards/rejected": -0.742358386516571, "step": 1130 }, { "epoch": 0.3756177924217463, "grad_norm": 58.475167412515376, "learning_rate": 9.061779242174629e-07, "logits/chosen": 0.280996710062027, "logits/rejected": 0.1132354736328125, "logps/chosen": -328.7749938964844, "logps/rejected": -307.6000061035156, "loss": 0.4771, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.640380859375, "rewards/margins": 1.244531273841858, "rewards/rejected": -0.604077160358429, "step": 1140 }, { "epoch": 0.37891268533772654, "grad_norm": 46.49206846150426, "learning_rate": 9.053542009884678e-07, "logits/chosen": 0.23594971001148224, "logits/rejected": 0.17484131455421448, "logps/chosen": -341.3500061035156, "logps/rejected": -314.70001220703125, "loss": 0.5079, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.4418579041957855, "rewards/margins": 1.0879395008087158, "rewards/rejected": -0.645764172077179, "step": 1150 }, { "epoch": 0.38220757825370677, "grad_norm": 93.2688294837542, "learning_rate": 9.045304777594728e-07, "logits/chosen": 0.209259033203125, "logits/rejected": 0.17213591933250427, "logps/chosen": -338.8500061035156, "logps/rejected": -301.3999938964844, "loss": 0.5213, "rewards/accuracies": 0.71875, "rewards/chosen": 0.439605712890625, "rewards/margins": 1.201562523841858, "rewards/rejected": -0.760913074016571, "step": 1160 }, { "epoch": 0.385502471169687, "grad_norm": 85.80784601972852, "learning_rate": 9.037067545304777e-07, "logits/chosen": 0.315185546875, "logits/rejected": 0.19833068549633026, "logps/chosen": -346.79998779296875, "logps/rejected": -318.5, "loss": 0.6016, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.09201431274414062, "rewards/margins": 0.91943359375, "rewards/rejected": -0.827221691608429, "step": 1170 }, { "epoch": 0.38879736408566723, "grad_norm": 103.20527139939765, "learning_rate": 9.028830313014827e-07, "logits/chosen": 0.29752808809280396, "logits/rejected": 0.21656493842601776, "logps/chosen": -349.54998779296875, "logps/rejected": -317.3999938964844, "loss": 0.4938, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.24335937201976776, "rewards/margins": 1.3114745616912842, "rewards/rejected": -1.0693359375, "step": 1180 }, { "epoch": 0.39209225700164746, "grad_norm": 95.88724771637858, "learning_rate": 9.020593080724876e-07, "logits/chosen": 0.32829588651657104, "logits/rejected": 0.2605728209018707, "logps/chosen": -320.5, "logps/rejected": -308.5, "loss": 0.5091, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.14407959580421448, "rewards/margins": 1.1927978992462158, "rewards/rejected": -1.0486328601837158, "step": 1190 }, { "epoch": 0.3953871499176277, "grad_norm": 87.4651985168619, "learning_rate": 9.012355848434925e-07, "logits/chosen": 0.2701171934604645, "logits/rejected": 0.09358825534582138, "logps/chosen": -344.75, "logps/rejected": -366.20001220703125, "loss": 0.4639, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17343750596046448, "rewards/margins": 1.4482421875, "rewards/rejected": -1.273828148841858, "step": 1200 }, { "epoch": 0.3986820428336079, "grad_norm": 68.7915762721844, "learning_rate": 9.004118616144974e-07, "logits/chosen": 0.20061035454273224, "logits/rejected": 0.11295776069164276, "logps/chosen": -332.6499938964844, "logps/rejected": -348.1000061035156, "loss": 0.6321, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20812682807445526, "rewards/margins": 1.1362793445587158, "rewards/rejected": -1.344506859779358, "step": 1210 }, { "epoch": 0.40197693574958815, "grad_norm": 85.27296262530358, "learning_rate": 8.995881383855024e-07, "logits/chosen": 0.17213745415210724, "logits/rejected": 0.065185546875, "logps/chosen": -313.95001220703125, "logps/rejected": -313.3500061035156, "loss": 0.5255, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.13497619330883026, "rewards/margins": 1.08544921875, "rewards/rejected": -0.9515625238418579, "step": 1220 }, { "epoch": 0.4052718286655684, "grad_norm": 91.70000318440728, "learning_rate": 8.987644151565074e-07, "logits/chosen": 0.16020508110523224, "logits/rejected": 0.10495605319738388, "logps/chosen": -316.3500061035156, "logps/rejected": -306.54998779296875, "loss": 0.527, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.4781494140625, "rewards/margins": 0.944580078125, "rewards/rejected": -0.4659057557582855, "step": 1230 }, { "epoch": 0.4085667215815486, "grad_norm": 104.29055473668213, "learning_rate": 8.979406919275123e-07, "logits/chosen": 0.23106078803539276, "logits/rejected": 0.07950439304113388, "logps/chosen": -367.3999938964844, "logps/rejected": -333.79998779296875, "loss": 0.6256, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5311034917831421, "rewards/margins": 0.758593738079071, "rewards/rejected": -0.22700195014476776, "step": 1240 }, { "epoch": 0.41186161449752884, "grad_norm": 89.56390592767885, "learning_rate": 8.971169686985172e-07, "logits/chosen": 0.11252441257238388, "logits/rejected": 0.08608398586511612, "logps/chosen": -268.95001220703125, "logps/rejected": -279.20001220703125, "loss": 0.5313, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.4826293885707855, "rewards/margins": 1.097753882408142, "rewards/rejected": -0.6153320074081421, "step": 1250 }, { "epoch": 0.41515650741350907, "grad_norm": 63.491577169497944, "learning_rate": 8.962932454695223e-07, "logits/chosen": 0.16145935654640198, "logits/rejected": 0.03901367262005806, "logps/chosen": -319.1499938964844, "logps/rejected": -314.5, "loss": 0.4812, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.18287353217601776, "rewards/margins": 1.2314941883087158, "rewards/rejected": -1.0492675304412842, "step": 1260 }, { "epoch": 0.4184514003294893, "grad_norm": 69.29648828001298, "learning_rate": 8.954695222405272e-07, "logits/chosen": 0.07495727390050888, "logits/rejected": -0.00892028771340847, "logps/chosen": -309.125, "logps/rejected": -317.79998779296875, "loss": 0.4663, "rewards/accuracies": 0.71875, "rewards/chosen": 0.14810791611671448, "rewards/margins": 1.294335961341858, "rewards/rejected": -1.145898461341858, "step": 1270 }, { "epoch": 0.42174629324546953, "grad_norm": 110.19384374583002, "learning_rate": 8.94645799011532e-07, "logits/chosen": 0.12225951999425888, "logits/rejected": 0.01290741004049778, "logps/chosen": -315.75, "logps/rejected": -303.3500061035156, "loss": 0.502, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.17666015028953552, "rewards/margins": 1.2252929210662842, "rewards/rejected": -1.3996093273162842, "step": 1280 }, { "epoch": 0.42504118616144976, "grad_norm": 91.13482240584626, "learning_rate": 8.93822075782537e-07, "logits/chosen": 0.20921631157398224, "logits/rejected": 0.12027587741613388, "logps/chosen": -311.3500061035156, "logps/rejected": -331.8500061035156, "loss": 0.5143, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.46672362089157104, "rewards/margins": 1.2639892101287842, "rewards/rejected": -0.7975219488143921, "step": 1290 }, { "epoch": 0.42833607907743, "grad_norm": 52.79157627182335, "learning_rate": 8.92998352553542e-07, "logits/chosen": 0.260009765625, "logits/rejected": 0.10669555515050888, "logps/chosen": -320.29998779296875, "logps/rejected": -296.75, "loss": 0.5175, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.575244128704071, "rewards/margins": 1.2657959461212158, "rewards/rejected": -0.69085693359375, "step": 1300 }, { "epoch": 0.4316309719934102, "grad_norm": 88.26063799395227, "learning_rate": 8.921746293245469e-07, "logits/chosen": 0.2584899961948395, "logits/rejected": 0.2611450254917145, "logps/chosen": -342.25, "logps/rejected": -298.70001220703125, "loss": 0.5726, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3549560606479645, "rewards/margins": 1.0017578601837158, "rewards/rejected": -0.6468750238418579, "step": 1310 }, { "epoch": 0.43492586490939045, "grad_norm": 57.5176490071122, "learning_rate": 8.913509060955518e-07, "logits/chosen": 0.28084105253219604, "logits/rejected": 0.11066894233226776, "logps/chosen": -331.04998779296875, "logps/rejected": -321.1000061035156, "loss": 0.54, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11623535305261612, "rewards/margins": 1.21484375, "rewards/rejected": -1.0985839366912842, "step": 1320 }, { "epoch": 0.4382207578253707, "grad_norm": 82.34342612127253, "learning_rate": 8.905271828665569e-07, "logits/chosen": 0.2982131838798523, "logits/rejected": 0.18662109971046448, "logps/chosen": -348.6499938964844, "logps/rejected": -335.04998779296875, "loss": 0.6004, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.01936035230755806, "rewards/margins": 1.049658179283142, "rewards/rejected": -1.068945288658142, "step": 1330 }, { "epoch": 0.4415156507413509, "grad_norm": 73.9175117829384, "learning_rate": 8.897034596375618e-07, "logits/chosen": 0.33289796113967896, "logits/rejected": 0.16590270400047302, "logps/chosen": -346.1000061035156, "logps/rejected": -350.1000061035156, "loss": 0.5119, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3820556700229645, "rewards/margins": 1.40380859375, "rewards/rejected": -1.0234375, "step": 1340 }, { "epoch": 0.44481054365733114, "grad_norm": 81.47896739013851, "learning_rate": 8.888797364085667e-07, "logits/chosen": 0.21027831733226776, "logits/rejected": 0.2289886474609375, "logps/chosen": -316.8999938964844, "logps/rejected": -320.25, "loss": 0.5977, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5809326171875, "rewards/margins": 1.0066406726837158, "rewards/rejected": -0.42731934785842896, "step": 1350 }, { "epoch": 0.44810543657331137, "grad_norm": 88.42119109452554, "learning_rate": 8.880560131795715e-07, "logits/chosen": 0.29271239042282104, "logits/rejected": 0.21610717475414276, "logps/chosen": -360.5, "logps/rejected": -337.8500061035156, "loss": 0.4488, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.939257800579071, "rewards/margins": 1.330078125, "rewards/rejected": -0.39075928926467896, "step": 1360 }, { "epoch": 0.4514003294892916, "grad_norm": 69.0472347598778, "learning_rate": 8.872322899505766e-07, "logits/chosen": 0.1353759765625, "logits/rejected": 0.12762145698070526, "logps/chosen": -311.0, "logps/rejected": -325.8999938964844, "loss": 0.5552, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5824950933456421, "rewards/margins": 1.237402319908142, "rewards/rejected": -0.6546875238418579, "step": 1370 }, { "epoch": 0.4546952224052718, "grad_norm": 77.76064008663933, "learning_rate": 8.864085667215815e-07, "logits/chosen": 0.08379821479320526, "logits/rejected": -0.05556640774011612, "logps/chosen": -348.1000061035156, "logps/rejected": -345.6000061035156, "loss": 0.4942, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.42744141817092896, "rewards/margins": 1.3280761241912842, "rewards/rejected": -0.9011474847793579, "step": 1380 }, { "epoch": 0.45799011532125206, "grad_norm": 98.62354285598826, "learning_rate": 8.855848434925864e-07, "logits/chosen": 0.1969451904296875, "logits/rejected": -0.0049682618118822575, "logps/chosen": -328.0, "logps/rejected": -341.3999938964844, "loss": 0.5053, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.6458374261856079, "rewards/margins": 1.3444335460662842, "rewards/rejected": -0.699462890625, "step": 1390 }, { "epoch": 0.4612850082372323, "grad_norm": 60.48739025449471, "learning_rate": 8.847611202635914e-07, "logits/chosen": 0.27763596177101135, "logits/rejected": 0.13303223252296448, "logps/chosen": -348.8999938964844, "logps/rejected": -324.6000061035156, "loss": 0.5287, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.843066394329071, "rewards/margins": 1.2917969226837158, "rewards/rejected": -0.44841307401657104, "step": 1400 }, { "epoch": 0.4645799011532125, "grad_norm": 63.98222324960489, "learning_rate": 8.839373970345964e-07, "logits/chosen": 0.22194823622703552, "logits/rejected": 0.15836182236671448, "logps/chosen": -270.95001220703125, "logps/rejected": -287.92498779296875, "loss": 0.5518, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.9715331792831421, "rewards/margins": 1.0986328125, "rewards/rejected": -0.12736816704273224, "step": 1410 }, { "epoch": 0.46787479406919275, "grad_norm": 53.229208246329755, "learning_rate": 8.831136738056013e-07, "logits/chosen": 0.21991577744483948, "logits/rejected": 0.06973572075366974, "logps/chosen": -297.75, "logps/rejected": -290.6000061035156, "loss": 0.5888, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.895800769329071, "rewards/margins": 0.8135620355606079, "rewards/rejected": 0.08310546725988388, "step": 1420 }, { "epoch": 0.471169686985173, "grad_norm": 65.02860523416146, "learning_rate": 8.822899505766062e-07, "logits/chosen": 0.296234130859375, "logits/rejected": 0.18052978813648224, "logps/chosen": -305.5, "logps/rejected": -306.3999938964844, "loss": 0.6029, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.917236328125, "rewards/margins": 0.9626098871231079, "rewards/rejected": -0.04460449144244194, "step": 1430 }, { "epoch": 0.4744645799011532, "grad_norm": 65.7681960289503, "learning_rate": 8.814662273476111e-07, "logits/chosen": 0.24066467583179474, "logits/rejected": 0.17343750596046448, "logps/chosen": -307.04998779296875, "logps/rejected": -326.0, "loss": 0.5687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6743408441543579, "rewards/margins": 1.118798851966858, "rewards/rejected": -0.44355469942092896, "step": 1440 }, { "epoch": 0.47775947281713343, "grad_norm": 66.89688963751605, "learning_rate": 8.806425041186161e-07, "logits/chosen": 0.252197265625, "logits/rejected": 0.11285400390625, "logps/chosen": -311.3500061035156, "logps/rejected": -312.4750061035156, "loss": 0.4838, "rewards/accuracies": 0.75, "rewards/chosen": 0.7622162103652954, "rewards/margins": 1.307275414466858, "rewards/rejected": -0.5456787347793579, "step": 1450 }, { "epoch": 0.48105436573311366, "grad_norm": 35.61019684463692, "learning_rate": 8.79818780889621e-07, "logits/chosen": 0.35527342557907104, "logits/rejected": 0.29186707735061646, "logps/chosen": -354.8999938964844, "logps/rejected": -334.75, "loss": 0.5413, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.44512939453125, "rewards/margins": 1.2235107421875, "rewards/rejected": -0.779125988483429, "step": 1460 }, { "epoch": 0.4843492586490939, "grad_norm": 99.30977622664855, "learning_rate": 8.78995057660626e-07, "logits/chosen": 0.4095214903354645, "logits/rejected": 0.32426148653030396, "logps/chosen": -373.79998779296875, "logps/rejected": -328.04998779296875, "loss": 0.6236, "rewards/accuracies": 0.65625, "rewards/chosen": -0.044403076171875, "rewards/margins": 0.861328125, "rewards/rejected": -0.9043945074081421, "step": 1470 }, { "epoch": 0.4876441515650741, "grad_norm": 76.11827882713078, "learning_rate": 8.781713344316309e-07, "logits/chosen": 0.38579100370407104, "logits/rejected": 0.24263915419578552, "logps/chosen": -330.04998779296875, "logps/rejected": -322.6000061035156, "loss": 0.4983, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.3929687440395355, "rewards/margins": 1.252709984779358, "rewards/rejected": -0.858593761920929, "step": 1480 }, { "epoch": 0.49093904448105435, "grad_norm": 65.21331367814717, "learning_rate": 8.773476112026359e-07, "logits/chosen": 0.35002440214157104, "logits/rejected": 0.18306884169578552, "logps/chosen": -313.0, "logps/rejected": -282.6000061035156, "loss": 0.4909, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.3979736268520355, "rewards/margins": 1.170507788658142, "rewards/rejected": -0.772509753704071, "step": 1490 }, { "epoch": 0.4942339373970346, "grad_norm": 55.107181458687286, "learning_rate": 8.765238879736409e-07, "logits/chosen": 0.2374832183122635, "logits/rejected": 0.04262695461511612, "logps/chosen": -326.04998779296875, "logps/rejected": -294.29998779296875, "loss": 0.4678, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3893981873989105, "rewards/margins": 1.3320801258087158, "rewards/rejected": -0.9437500238418579, "step": 1500 }, { "epoch": 0.4975288303130148, "grad_norm": 80.41077913015522, "learning_rate": 8.757001647446458e-07, "logits/chosen": 0.20871582627296448, "logits/rejected": 0.09722900390625, "logps/chosen": -323.20001220703125, "logps/rejected": -329.20001220703125, "loss": 0.5059, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.41315919160842896, "rewards/margins": 1.2111327648162842, "rewards/rejected": -0.7985595464706421, "step": 1510 }, { "epoch": 0.500823723228995, "grad_norm": 62.747042432772936, "learning_rate": 8.748764415156506e-07, "logits/chosen": 0.29954224824905396, "logits/rejected": 0.20170898735523224, "logps/chosen": -357.54998779296875, "logps/rejected": -315.04998779296875, "loss": 0.5529, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.7132629156112671, "rewards/margins": 1.127685546875, "rewards/rejected": -0.4144043028354645, "step": 1520 }, { "epoch": 0.5041186161449753, "grad_norm": 96.84656825385183, "learning_rate": 8.740527182866556e-07, "logits/chosen": 0.30225831270217896, "logits/rejected": 0.21319580078125, "logps/chosen": -330.1499938964844, "logps/rejected": -307.3500061035156, "loss": 0.5508, "rewards/accuracies": 0.71875, "rewards/chosen": 0.705676257610321, "rewards/margins": 1.1947021484375, "rewards/rejected": -0.4894042909145355, "step": 1530 }, { "epoch": 0.5074135090609555, "grad_norm": 74.61755064050153, "learning_rate": 8.732289950576606e-07, "logits/chosen": 0.25541990995407104, "logits/rejected": 0.21402588486671448, "logps/chosen": -283.6000061035156, "logps/rejected": -296.1000061035156, "loss": 0.5471, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.36949461698532104, "rewards/margins": 1.1082031726837158, "rewards/rejected": -0.73907470703125, "step": 1540 }, { "epoch": 0.5107084019769358, "grad_norm": 79.20122112498385, "learning_rate": 8.724052718286655e-07, "logits/chosen": 0.3419189453125, "logits/rejected": 0.2639404237270355, "logps/chosen": -339.45001220703125, "logps/rejected": -305.29998779296875, "loss": 0.5728, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0031616210471838713, "rewards/margins": 1.259765625, "rewards/rejected": -1.2610352039337158, "step": 1550 }, { "epoch": 0.514003294892916, "grad_norm": 45.27979959905706, "learning_rate": 8.715815485996705e-07, "logits/chosen": 0.30729371309280396, "logits/rejected": 0.14318542182445526, "logps/chosen": -309.20001220703125, "logps/rejected": -327.3500061035156, "loss": 0.5542, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.22086182236671448, "rewards/margins": 1.23492431640625, "rewards/rejected": -1.013769507408142, "step": 1560 }, { "epoch": 0.5172981878088962, "grad_norm": 60.940536559453776, "learning_rate": 8.707578253706755e-07, "logits/chosen": 0.31135255098342896, "logits/rejected": 0.162181094288826, "logps/chosen": -304.29998779296875, "logps/rejected": -307.1499938964844, "loss": 0.4999, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.31383055448532104, "rewards/margins": 1.296972632408142, "rewards/rejected": -0.9837890863418579, "step": 1570 }, { "epoch": 0.5205930807248764, "grad_norm": 73.40289034746701, "learning_rate": 8.699341021416804e-07, "logits/chosen": 0.3877929747104645, "logits/rejected": 0.21665039658546448, "logps/chosen": -315.29998779296875, "logps/rejected": -302.79998779296875, "loss": 0.4592, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.6254638433456421, "rewards/margins": 1.33056640625, "rewards/rejected": -0.705004870891571, "step": 1580 }, { "epoch": 0.5238879736408567, "grad_norm": 55.846213330449494, "learning_rate": 8.691103789126853e-07, "logits/chosen": 0.37604981660842896, "logits/rejected": 0.12689514458179474, "logps/chosen": -340.0, "logps/rejected": -351.79998779296875, "loss": 0.5889, "rewards/accuracies": 0.6875, "rewards/chosen": 0.501049816608429, "rewards/margins": 1.091455101966858, "rewards/rejected": -0.589337170124054, "step": 1590 }, { "epoch": 0.5271828665568369, "grad_norm": 89.11045479363266, "learning_rate": 8.682866556836902e-07, "logits/chosen": 0.43110352754592896, "logits/rejected": 0.2722412049770355, "logps/chosen": -286.875, "logps/rejected": -296.92498779296875, "loss": 0.4773, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.695751965045929, "rewards/margins": 1.1914551258087158, "rewards/rejected": -0.495269775390625, "step": 1600 }, { "epoch": 0.5304777594728172, "grad_norm": 69.29900281520727, "learning_rate": 8.674629324546952e-07, "logits/chosen": 0.38823240995407104, "logits/rejected": 0.29738616943359375, "logps/chosen": -354.5, "logps/rejected": -307.29998779296875, "loss": 0.5848, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.592333972454071, "rewards/margins": 0.974536120891571, "rewards/rejected": -0.38206785917282104, "step": 1610 }, { "epoch": 0.5337726523887973, "grad_norm": 62.433567383027906, "learning_rate": 8.666392092257001e-07, "logits/chosen": 0.4869628846645355, "logits/rejected": 0.32954102754592896, "logps/chosen": -315.1000061035156, "logps/rejected": -306.5, "loss": 0.5496, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.8557373285293579, "rewards/margins": 1.2158691883087158, "rewards/rejected": -0.35986328125, "step": 1620 }, { "epoch": 0.5370675453047776, "grad_norm": 64.2557267582658, "learning_rate": 8.65815485996705e-07, "logits/chosen": 0.38605958223342896, "logits/rejected": 0.3140319883823395, "logps/chosen": -368.8999938964844, "logps/rejected": -328.5, "loss": 0.6172, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.7457031011581421, "rewards/margins": 0.995800793170929, "rewards/rejected": -0.2491455078125, "step": 1630 }, { "epoch": 0.5403624382207578, "grad_norm": 45.62052030985852, "learning_rate": 8.649917627677101e-07, "logits/chosen": 0.35588377714157104, "logits/rejected": 0.24706116318702698, "logps/chosen": -288.54998779296875, "logps/rejected": -302.8500061035156, "loss": 0.539, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6021484136581421, "rewards/margins": 1.073999047279358, "rewards/rejected": -0.4720092713832855, "step": 1640 }, { "epoch": 0.5436573311367381, "grad_norm": 70.56008570269572, "learning_rate": 8.64168039538715e-07, "logits/chosen": 0.2781127989292145, "logits/rejected": 0.31819456815719604, "logps/chosen": -336.54998779296875, "logps/rejected": -343.6000061035156, "loss": 0.5748, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.707836925983429, "rewards/margins": 1.1368286609649658, "rewards/rejected": -0.4298095703125, "step": 1650 }, { "epoch": 0.5469522240527183, "grad_norm": 75.8516218041996, "learning_rate": 8.633443163097199e-07, "logits/chosen": 0.3962646424770355, "logits/rejected": 0.2856079041957855, "logps/chosen": -345.95001220703125, "logps/rejected": -375.20001220703125, "loss": 0.5384, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.32819825410842896, "rewards/margins": 1.200097680091858, "rewards/rejected": -0.8719406127929688, "step": 1660 }, { "epoch": 0.5502471169686985, "grad_norm": 87.9601220121444, "learning_rate": 8.625205930807248e-07, "logits/chosen": 0.39438170194625854, "logits/rejected": 0.26573485136032104, "logps/chosen": -341.6000061035156, "logps/rejected": -334.79998779296875, "loss": 0.5375, "rewards/accuracies": 0.71875, "rewards/chosen": 0.37041014432907104, "rewards/margins": 1.4622802734375, "rewards/rejected": -1.0928466320037842, "step": 1670 }, { "epoch": 0.5535420098846787, "grad_norm": 86.41500578552409, "learning_rate": 8.616968698517298e-07, "logits/chosen": 0.4393981993198395, "logits/rejected": 0.3656249940395355, "logps/chosen": -306.0, "logps/rejected": -330.79998779296875, "loss": 0.5573, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.41099852323532104, "rewards/margins": 1.1882812976837158, "rewards/rejected": -0.7787109613418579, "step": 1680 }, { "epoch": 0.556836902800659, "grad_norm": 63.66608925844501, "learning_rate": 8.608731466227347e-07, "logits/chosen": 0.2635498046875, "logits/rejected": 0.206929013133049, "logps/chosen": -305.5, "logps/rejected": -287.79998779296875, "loss": 0.5023, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.10076904296875, "rewards/margins": 1.198583960533142, "rewards/rejected": -1.098046898841858, "step": 1690 }, { "epoch": 0.5601317957166392, "grad_norm": 55.68484671897867, "learning_rate": 8.600494233937396e-07, "logits/chosen": 0.35882568359375, "logits/rejected": 0.2573303282260895, "logps/chosen": -361.1499938964844, "logps/rejected": -326.1000061035156, "loss": 0.6123, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.18467101454734802, "rewards/margins": 0.963793933391571, "rewards/rejected": -0.779101550579071, "step": 1700 }, { "epoch": 0.5634266886326195, "grad_norm": 72.93631063216021, "learning_rate": 8.592257001647446e-07, "logits/chosen": 0.35996073484420776, "logits/rejected": 0.35966795682907104, "logps/chosen": -333.20001220703125, "logps/rejected": -314.45001220703125, "loss": 0.5034, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.7538086175918579, "rewards/margins": 1.272216796875, "rewards/rejected": -0.519238293170929, "step": 1710 }, { "epoch": 0.5667215815485996, "grad_norm": 82.64442348661065, "learning_rate": 8.584019769357496e-07, "logits/chosen": 0.29625242948532104, "logits/rejected": 0.22791747748851776, "logps/chosen": -346.3999938964844, "logps/rejected": -332.04998779296875, "loss": 0.5099, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.65283203125, "rewards/margins": 1.0999023914337158, "rewards/rejected": -0.4474121034145355, "step": 1720 }, { "epoch": 0.5700164744645799, "grad_norm": 46.13390723377239, "learning_rate": 8.575782537067545e-07, "logits/chosen": 0.43522948026657104, "logits/rejected": 0.2543701231479645, "logps/chosen": -346.25, "logps/rejected": -338.29998779296875, "loss": 0.4407, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.7994629144668579, "rewards/margins": 1.344970703125, "rewards/rejected": -0.545886218547821, "step": 1730 }, { "epoch": 0.5733113673805601, "grad_norm": 97.07468726138393, "learning_rate": 8.567545304777595e-07, "logits/chosen": 0.2568908631801605, "logits/rejected": 0.1741790771484375, "logps/chosen": -326.7250061035156, "logps/rejected": -303.45001220703125, "loss": 0.6716, "rewards/accuracies": 0.65625, "rewards/chosen": 0.40614014863967896, "rewards/margins": 0.774169921875, "rewards/rejected": -0.367431640625, "step": 1740 }, { "epoch": 0.5766062602965404, "grad_norm": 70.92145879831038, "learning_rate": 8.559308072487644e-07, "logits/chosen": 0.258627325296402, "logits/rejected": 0.21183165907859802, "logps/chosen": -364.5, "logps/rejected": -358.3500061035156, "loss": 0.5138, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.27618408203125, "rewards/margins": 1.3600585460662842, "rewards/rejected": -1.083398461341858, "step": 1750 }, { "epoch": 0.5799011532125206, "grad_norm": 67.20954699555803, "learning_rate": 8.551070840197693e-07, "logits/chosen": 0.350372314453125, "logits/rejected": 0.16538238525390625, "logps/chosen": -326.04998779296875, "logps/rejected": -342.25, "loss": 0.5502, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.14816895127296448, "rewards/margins": 1.207910180091858, "rewards/rejected": -1.0591644048690796, "step": 1760 }, { "epoch": 0.5831960461285008, "grad_norm": 105.83321456511753, "learning_rate": 8.542833607907742e-07, "logits/chosen": 0.2709602415561676, "logits/rejected": 0.14964599907398224, "logps/chosen": -337.0, "logps/rejected": -340.54998779296875, "loss": 0.5421, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0050292969681322575, "rewards/margins": 1.226904273033142, "rewards/rejected": -1.221472144126892, "step": 1770 }, { "epoch": 0.586490939044481, "grad_norm": 72.68470123299367, "learning_rate": 8.534596375617792e-07, "logits/chosen": 0.22978515923023224, "logits/rejected": 0.20607909560203552, "logps/chosen": -310.1499938964844, "logps/rejected": -291.25, "loss": 0.5798, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.15476074814796448, "rewards/margins": 0.9345703125, "rewards/rejected": -1.0889160633087158, "step": 1780 }, { "epoch": 0.5897858319604613, "grad_norm": 92.16971521631662, "learning_rate": 8.526359143327841e-07, "logits/chosen": 0.21014709770679474, "logits/rejected": 0.13288573920726776, "logps/chosen": -304.25, "logps/rejected": -337.0, "loss": 0.5264, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13460692763328552, "rewards/margins": 1.0286865234375, "rewards/rejected": -0.8935791254043579, "step": 1790 }, { "epoch": 0.5930807248764415, "grad_norm": 145.651868245826, "learning_rate": 8.518121911037891e-07, "logits/chosen": 0.24648436903953552, "logits/rejected": 0.23802490532398224, "logps/chosen": -299.0, "logps/rejected": -333.1499938964844, "loss": 0.6748, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07589111477136612, "rewards/margins": 0.8537842035293579, "rewards/rejected": -0.930419921875, "step": 1800 }, { "epoch": 0.5963756177924218, "grad_norm": 79.29842863777276, "learning_rate": 8.509884678747941e-07, "logits/chosen": 0.16072997450828552, "logits/rejected": 0.14475098252296448, "logps/chosen": -352.95001220703125, "logps/rejected": -325.6499938964844, "loss": 0.5003, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.17926025390625, "rewards/margins": 1.2705078125, "rewards/rejected": -1.0922362804412842, "step": 1810 }, { "epoch": 0.5996705107084019, "grad_norm": 73.65561887815569, "learning_rate": 8.50164744645799e-07, "logits/chosen": 0.22906494140625, "logits/rejected": 0.07870177924633026, "logps/chosen": -312.6000061035156, "logps/rejected": -311.79998779296875, "loss": 0.5521, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.22260132431983948, "rewards/margins": 1.268884301185608, "rewards/rejected": -1.0465819835662842, "step": 1820 }, { "epoch": 0.6029654036243822, "grad_norm": 103.69020036973694, "learning_rate": 8.493410214168038e-07, "logits/chosen": 0.20070800185203552, "logits/rejected": 0.15247802436351776, "logps/chosen": -326.125, "logps/rejected": -323.04998779296875, "loss": 0.566, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22048339247703552, "rewards/margins": 1.308496117591858, "rewards/rejected": -1.087499976158142, "step": 1830 }, { "epoch": 0.6062602965403624, "grad_norm": 57.90029705375129, "learning_rate": 8.485172981878088e-07, "logits/chosen": 0.43023681640625, "logits/rejected": 0.3002990782260895, "logps/chosen": -373.1000061035156, "logps/rejected": -347.8500061035156, "loss": 0.6016, "rewards/accuracies": 0.6875, "rewards/chosen": 0.496826171875, "rewards/margins": 1.02197265625, "rewards/rejected": -0.5245605707168579, "step": 1840 }, { "epoch": 0.6095551894563427, "grad_norm": 76.5167400715453, "learning_rate": 8.476935749588138e-07, "logits/chosen": 0.3379974365234375, "logits/rejected": 0.23365478217601776, "logps/chosen": -357.20001220703125, "logps/rejected": -324.20001220703125, "loss": 0.4591, "rewards/accuracies": 0.75, "rewards/chosen": 0.6111084222793579, "rewards/margins": 1.440673828125, "rewards/rejected": -0.830859363079071, "step": 1850 }, { "epoch": 0.6128500823723229, "grad_norm": 50.533366949457886, "learning_rate": 8.468698517298187e-07, "logits/chosen": 0.3613647520542145, "logits/rejected": 0.3226318359375, "logps/chosen": -297.1499938964844, "logps/rejected": -316.29998779296875, "loss": 0.541, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.492919921875, "rewards/margins": 1.1494140625, "rewards/rejected": -0.6573730707168579, "step": 1860 }, { "epoch": 0.6161449752883031, "grad_norm": 80.53425757052942, "learning_rate": 8.460461285008237e-07, "logits/chosen": 0.35759276151657104, "logits/rejected": 0.21473999321460724, "logps/chosen": -336.1000061035156, "logps/rejected": -322.1499938964844, "loss": 0.4885, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.670703113079071, "rewards/margins": 1.5442383289337158, "rewards/rejected": -0.872387707233429, "step": 1870 }, { "epoch": 0.6194398682042833, "grad_norm": 75.88285363939112, "learning_rate": 8.452224052718287e-07, "logits/chosen": 0.49055176973342896, "logits/rejected": 0.325765997171402, "logps/chosen": -331.54998779296875, "logps/rejected": -321.4750061035156, "loss": 0.5413, "rewards/accuracies": 0.71875, "rewards/chosen": 0.45074462890625, "rewards/margins": 1.118749976158142, "rewards/rejected": -0.6672607660293579, "step": 1880 }, { "epoch": 0.6227347611202636, "grad_norm": 48.37943304536438, "learning_rate": 8.443986820428336e-07, "logits/chosen": 0.43339234590530396, "logits/rejected": 0.2958618104457855, "logps/chosen": -327.8999938964844, "logps/rejected": -315.1000061035156, "loss": 0.4698, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.545849621295929, "rewards/margins": 1.312890648841858, "rewards/rejected": -0.7667480707168579, "step": 1890 }, { "epoch": 0.6260296540362438, "grad_norm": 53.898171045871905, "learning_rate": 8.435749588138385e-07, "logits/chosen": 0.45261842012405396, "logits/rejected": 0.30366212129592896, "logps/chosen": -368.82501220703125, "logps/rejected": -297.07501220703125, "loss": 0.478, "rewards/accuracies": 0.75, "rewards/chosen": 0.620135486125946, "rewards/margins": 1.2765624523162842, "rewards/rejected": -0.6561279296875, "step": 1900 }, { "epoch": 0.6293245469522241, "grad_norm": 61.25261715717486, "learning_rate": 8.427512355848434e-07, "logits/chosen": 0.3850769102573395, "logits/rejected": 0.39082640409469604, "logps/chosen": -345.70001220703125, "logps/rejected": -336.25, "loss": 0.5197, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.615063488483429, "rewards/margins": 1.2251465320587158, "rewards/rejected": -0.6099609136581421, "step": 1910 }, { "epoch": 0.6326194398682042, "grad_norm": 84.86739479021179, "learning_rate": 8.419275123558484e-07, "logits/chosen": 0.4069580137729645, "logits/rejected": 0.3340820372104645, "logps/chosen": -307.04998779296875, "logps/rejected": -326.75, "loss": 0.5794, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.41199952363967896, "rewards/margins": 0.94921875, "rewards/rejected": -0.537280261516571, "step": 1920 }, { "epoch": 0.6359143327841845, "grad_norm": 45.1491299989567, "learning_rate": 8.411037891268533e-07, "logits/chosen": 0.43316650390625, "logits/rejected": 0.3518920838832855, "logps/chosen": -332.54998779296875, "logps/rejected": -321.17498779296875, "loss": 0.541, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.68927001953125, "rewards/margins": 1.130859375, "rewards/rejected": -0.44157713651657104, "step": 1930 }, { "epoch": 0.6392092257001647, "grad_norm": 146.16779524908975, "learning_rate": 8.402800658978582e-07, "logits/chosen": 0.34920042753219604, "logits/rejected": 0.21175536513328552, "logps/chosen": -308.20001220703125, "logps/rejected": -311.95001220703125, "loss": 0.5701, "rewards/accuracies": 0.75, "rewards/chosen": 0.5166870355606079, "rewards/margins": 1.251953125, "rewards/rejected": -0.737597644329071, "step": 1940 }, { "epoch": 0.642504118616145, "grad_norm": 64.25045898657275, "learning_rate": 8.394563426688633e-07, "logits/chosen": 0.419677734375, "logits/rejected": 0.32099610567092896, "logps/chosen": -355.45001220703125, "logps/rejected": -355.95001220703125, "loss": 0.455, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.45023804903030396, "rewards/margins": 1.3701171875, "rewards/rejected": -0.9190429449081421, "step": 1950 }, { "epoch": 0.6457990115321252, "grad_norm": 94.26721543168692, "learning_rate": 8.386326194398682e-07, "logits/chosen": 0.44453126192092896, "logits/rejected": 0.32189637422561646, "logps/chosen": -316.6499938964844, "logps/rejected": -318.8500061035156, "loss": 0.4629, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.547778308391571, "rewards/margins": 1.3239257335662842, "rewards/rejected": -0.776470959186554, "step": 1960 }, { "epoch": 0.6490939044481054, "grad_norm": 95.60738374684502, "learning_rate": 8.378088962108731e-07, "logits/chosen": 0.482177734375, "logits/rejected": 0.3993774354457855, "logps/chosen": -345.1000061035156, "logps/rejected": -320.25, "loss": 0.5513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4328979551792145, "rewards/margins": 1.126708984375, "rewards/rejected": -0.6934814453125, "step": 1970 }, { "epoch": 0.6523887973640856, "grad_norm": 54.392701733923914, "learning_rate": 8.369851729818781e-07, "logits/chosen": 0.4969238340854645, "logits/rejected": 0.4039672911167145, "logps/chosen": -304.70001220703125, "logps/rejected": -307.29998779296875, "loss": 0.5926, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.5959228277206421, "rewards/margins": 1.1273193359375, "rewards/rejected": -0.531665027141571, "step": 1980 }, { "epoch": 0.6556836902800659, "grad_norm": 64.71461366244966, "learning_rate": 8.36161449752883e-07, "logits/chosen": 0.4797729551792145, "logits/rejected": 0.379608154296875, "logps/chosen": -301.6000061035156, "logps/rejected": -325.8999938964844, "loss": 0.5567, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7986084222793579, "rewards/margins": 1.1824219226837158, "rewards/rejected": -0.383401483297348, "step": 1990 }, { "epoch": 0.6589785831960461, "grad_norm": 40.55001209389521, "learning_rate": 8.353377265238879e-07, "logits/chosen": 0.41456907987594604, "logits/rejected": 0.19802245497703552, "logps/chosen": -354.8999938964844, "logps/rejected": -306.25, "loss": 0.5151, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.6188720464706421, "rewards/margins": 1.475976586341858, "rewards/rejected": -0.857421875, "step": 2000 }, { "epoch": 0.6622734761120264, "grad_norm": 55.1319161676338, "learning_rate": 8.345140032948928e-07, "logits/chosen": 0.3551879823207855, "logits/rejected": 0.20760878920555115, "logps/chosen": -303.70001220703125, "logps/rejected": -304.45001220703125, "loss": 0.4554, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.6817626953125, "rewards/margins": 1.3660156726837158, "rewards/rejected": -0.685986340045929, "step": 2010 }, { "epoch": 0.6655683690280065, "grad_norm": 73.20636613766564, "learning_rate": 8.336902800658978e-07, "logits/chosen": 0.31028443574905396, "logits/rejected": 0.21785888075828552, "logps/chosen": -311.1499938964844, "logps/rejected": -313.95001220703125, "loss": 0.4374, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6781708002090454, "rewards/margins": 1.5224609375, "rewards/rejected": -0.843994140625, "step": 2020 }, { "epoch": 0.6688632619439868, "grad_norm": 73.24725047653106, "learning_rate": 8.328665568369028e-07, "logits/chosen": 0.4008544981479645, "logits/rejected": 0.2931884825229645, "logps/chosen": -322.1499938964844, "logps/rejected": -316.54998779296875, "loss": 0.5904, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.44926756620407104, "rewards/margins": 1.261474609375, "rewards/rejected": -0.811816394329071, "step": 2030 }, { "epoch": 0.6721581548599671, "grad_norm": 56.908062308399366, "learning_rate": 8.320428336079077e-07, "logits/chosen": 0.39667969942092896, "logits/rejected": 0.25526732206344604, "logps/chosen": -303.7749938964844, "logps/rejected": -284.6499938964844, "loss": 0.5842, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3904785215854645, "rewards/margins": 1.2084228992462158, "rewards/rejected": -0.8184890747070312, "step": 2040 }, { "epoch": 0.6754530477759473, "grad_norm": 93.84621536829407, "learning_rate": 8.312191103789127e-07, "logits/chosen": 0.3386596739292145, "logits/rejected": 0.1236419677734375, "logps/chosen": -353.6499938964844, "logps/rejected": -342.8500061035156, "loss": 0.5224, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5411742925643921, "rewards/margins": 1.468847632408142, "rewards/rejected": -0.927014172077179, "step": 2050 }, { "epoch": 0.6787479406919276, "grad_norm": 65.76899444311675, "learning_rate": 8.303953871499177e-07, "logits/chosen": 0.37248533964157104, "logits/rejected": 0.28472900390625, "logps/chosen": -299.8500061035156, "logps/rejected": -291.6499938964844, "loss": 0.5578, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.570019543170929, "rewards/margins": 1.212744116783142, "rewards/rejected": -0.6428467035293579, "step": 2060 }, { "epoch": 0.6820428336079077, "grad_norm": 82.42871226128882, "learning_rate": 8.295716639209225e-07, "logits/chosen": 0.45207518339157104, "logits/rejected": 0.3311523497104645, "logps/chosen": -308.04998779296875, "logps/rejected": -330.20001220703125, "loss": 0.5615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.647448718547821, "rewards/margins": 1.163720726966858, "rewards/rejected": -0.515307605266571, "step": 2070 }, { "epoch": 0.685337726523888, "grad_norm": 76.23838381480583, "learning_rate": 8.287479406919274e-07, "logits/chosen": 0.42741698026657104, "logits/rejected": 0.2802490293979645, "logps/chosen": -282.8500061035156, "logps/rejected": -266.6499938964844, "loss": 0.542, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7264404296875, "rewards/margins": 1.15576171875, "rewards/rejected": -0.4303955137729645, "step": 2080 }, { "epoch": 0.6886326194398682, "grad_norm": 90.50926685818811, "learning_rate": 8.279242174629324e-07, "logits/chosen": 0.4603714048862457, "logits/rejected": 0.369384765625, "logps/chosen": -310.79998779296875, "logps/rejected": -278.3500061035156, "loss": 0.6234, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.745361328125, "rewards/margins": 0.924511730670929, "rewards/rejected": -0.17917481064796448, "step": 2090 }, { "epoch": 0.6919275123558485, "grad_norm": 96.35278344301167, "learning_rate": 8.271004942339374e-07, "logits/chosen": 0.40516358613967896, "logits/rejected": 0.310638427734375, "logps/chosen": -355.25, "logps/rejected": -344.25, "loss": 0.5442, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.8212646245956421, "rewards/margins": 1.1418945789337158, "rewards/rejected": -0.32054442167282104, "step": 2100 }, { "epoch": 0.6952224052718287, "grad_norm": 72.96772621025072, "learning_rate": 8.262767710049423e-07, "logits/chosen": 0.3533569276332855, "logits/rejected": 0.30207520723342896, "logps/chosen": -332.25, "logps/rejected": -331.8500061035156, "loss": 0.5959, "rewards/accuracies": 0.65625, "rewards/chosen": 1.006250023841858, "rewards/margins": 1.023461937904358, "rewards/rejected": -0.01713867112994194, "step": 2110 }, { "epoch": 0.6985172981878089, "grad_norm": 68.27839243701514, "learning_rate": 8.254530477759473e-07, "logits/chosen": 0.4202026426792145, "logits/rejected": 0.29291993379592896, "logps/chosen": -334.79998779296875, "logps/rejected": -293.95001220703125, "loss": 0.5256, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.934069812297821, "rewards/margins": 1.1249268054962158, "rewards/rejected": -0.19066771864891052, "step": 2120 }, { "epoch": 0.7018121911037891, "grad_norm": 71.21591681988752, "learning_rate": 8.246293245469522e-07, "logits/chosen": 0.27034300565719604, "logits/rejected": 0.25499266386032104, "logps/chosen": -308.6499938964844, "logps/rejected": -296.1000061035156, "loss": 0.5494, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.714599609375, "rewards/margins": 1.181372046470642, "rewards/rejected": -0.465301513671875, "step": 2130 }, { "epoch": 0.7051070840197694, "grad_norm": 55.421515726654704, "learning_rate": 8.238056013179572e-07, "logits/chosen": 0.42280274629592896, "logits/rejected": 0.3083252012729645, "logps/chosen": -359.5, "logps/rejected": -337.3500061035156, "loss": 0.6445, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.650195300579071, "rewards/margins": 1.0386230945587158, "rewards/rejected": -0.3882812559604645, "step": 2140 }, { "epoch": 0.7084019769357496, "grad_norm": 58.24024495187052, "learning_rate": 8.22981878088962e-07, "logits/chosen": 0.25927734375, "logits/rejected": 0.15028686821460724, "logps/chosen": -338.0, "logps/rejected": -325.8999938964844, "loss": 0.5039, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.4570068418979645, "rewards/margins": 1.221289038658142, "rewards/rejected": -0.764538586139679, "step": 2150 }, { "epoch": 0.7116968698517299, "grad_norm": 67.66227687691058, "learning_rate": 8.22158154859967e-07, "logits/chosen": 0.301840215921402, "logits/rejected": 0.28450316190719604, "logps/chosen": -326.5, "logps/rejected": -319.6000061035156, "loss": 0.5458, "rewards/accuracies": 0.71875, "rewards/chosen": 0.4855102598667145, "rewards/margins": 1.08984375, "rewards/rejected": -0.605194091796875, "step": 2160 }, { "epoch": 0.71499176276771, "grad_norm": 95.54219772690664, "learning_rate": 8.213344316309719e-07, "logits/chosen": 0.3382812440395355, "logits/rejected": 0.2081298828125, "logps/chosen": -332.79998779296875, "logps/rejected": -342.7250061035156, "loss": 0.511, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.46558839082717896, "rewards/margins": 1.284765601158142, "rewards/rejected": -0.819750964641571, "step": 2170 }, { "epoch": 0.7182866556836903, "grad_norm": 41.85180151366222, "learning_rate": 8.205107084019769e-07, "logits/chosen": 0.2747802734375, "logits/rejected": 0.21428222954273224, "logps/chosen": -281.1499938964844, "logps/rejected": -323.6499938964844, "loss": 0.5565, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3396972715854645, "rewards/margins": 1.372473120689392, "rewards/rejected": -1.0339019298553467, "step": 2180 }, { "epoch": 0.7215815485996705, "grad_norm": 52.839817898590745, "learning_rate": 8.196869851729819e-07, "logits/chosen": 0.3211608827114105, "logits/rejected": 0.2578125, "logps/chosen": -328.6499938964844, "logps/rejected": -329.3500061035156, "loss": 0.5598, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.41676026582717896, "rewards/margins": 1.1286132335662842, "rewards/rejected": -0.71185302734375, "step": 2190 }, { "epoch": 0.7248764415156508, "grad_norm": 95.64589956578492, "learning_rate": 8.188632619439868e-07, "logits/chosen": 0.2619384825229645, "logits/rejected": 0.21499022841453552, "logps/chosen": -338.70001220703125, "logps/rejected": -327.20001220703125, "loss": 0.5607, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.4099365174770355, "rewards/margins": 1.15283203125, "rewards/rejected": -0.7430419921875, "step": 2200 }, { "epoch": 0.728171334431631, "grad_norm": 62.317563245976764, "learning_rate": 8.180395387149917e-07, "logits/chosen": 0.29645997285842896, "logits/rejected": 0.17283324897289276, "logps/chosen": -327.5, "logps/rejected": -294.20001220703125, "loss": 0.5334, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.753222644329071, "rewards/margins": 1.0772216320037842, "rewards/rejected": -0.32354736328125, "step": 2210 }, { "epoch": 0.7314662273476112, "grad_norm": 76.9201271678833, "learning_rate": 8.172158154859968e-07, "logits/chosen": 0.3148742616176605, "logits/rejected": 0.21042481064796448, "logps/chosen": -311.20001220703125, "logps/rejected": -337.70001220703125, "loss": 0.6342, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5815185308456421, "rewards/margins": 0.9966796636581421, "rewards/rejected": -0.4139465391635895, "step": 2220 }, { "epoch": 0.7347611202635914, "grad_norm": 41.782706447892345, "learning_rate": 8.163920922570016e-07, "logits/chosen": 0.3514343202114105, "logits/rejected": 0.27692872285842896, "logps/chosen": -322.20001220703125, "logps/rejected": -310.3500061035156, "loss": 0.6203, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.37720948457717896, "rewards/margins": 0.8788818120956421, "rewards/rejected": -0.5011962652206421, "step": 2230 }, { "epoch": 0.7380560131795717, "grad_norm": 67.3913344125578, "learning_rate": 8.155683690280065e-07, "logits/chosen": 0.2921142578125, "logits/rejected": 0.21306762099266052, "logps/chosen": -276.75, "logps/rejected": -275.0, "loss": 0.5242, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.47382813692092896, "rewards/margins": 1.270410180091858, "rewards/rejected": -0.795458972454071, "step": 2240 }, { "epoch": 0.7413509060955519, "grad_norm": 28.969457946529904, "learning_rate": 8.147446457990114e-07, "logits/chosen": 0.31379395723342896, "logits/rejected": 0.28594970703125, "logps/chosen": -348.7749938964844, "logps/rejected": -356.1499938964844, "loss": 0.411, "rewards/accuracies": 0.78125, "rewards/chosen": 0.552172839641571, "rewards/margins": 1.56201171875, "rewards/rejected": -1.0104491710662842, "step": 2250 }, { "epoch": 0.7446457990115322, "grad_norm": 60.44886161912939, "learning_rate": 8.139209225700165e-07, "logits/chosen": 0.24783936142921448, "logits/rejected": 0.11160888522863388, "logps/chosen": -334.95001220703125, "logps/rejected": -333.20001220703125, "loss": 0.4995, "rewards/accuracies": 0.75, "rewards/chosen": 0.3149169981479645, "rewards/margins": 1.50250244140625, "rewards/rejected": -1.1878173351287842, "step": 2260 }, { "epoch": 0.7479406919275123, "grad_norm": 72.02838876490004, "learning_rate": 8.130971993410214e-07, "logits/chosen": 0.24656982719898224, "logits/rejected": 0.20645752549171448, "logps/chosen": -307.6499938964844, "logps/rejected": -292.25, "loss": 0.496, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.29340821504592896, "rewards/margins": 1.6213867664337158, "rewards/rejected": -1.326379418373108, "step": 2270 }, { "epoch": 0.7512355848434926, "grad_norm": 61.207888603650254, "learning_rate": 8.122734761120263e-07, "logits/chosen": 0.3216796815395355, "logits/rejected": 0.17801514267921448, "logps/chosen": -338.875, "logps/rejected": -314.54998779296875, "loss": 0.5711, "rewards/accuracies": 0.71875, "rewards/chosen": -0.02036590501666069, "rewards/margins": 1.291723608970642, "rewards/rejected": -1.312890648841858, "step": 2280 }, { "epoch": 0.7545304777594728, "grad_norm": 72.08306630209971, "learning_rate": 8.114497528830313e-07, "logits/chosen": 0.15659180283546448, "logits/rejected": 0.10572204738855362, "logps/chosen": -327.04998779296875, "logps/rejected": -309.25, "loss": 0.5727, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.26872557401657104, "rewards/margins": 1.107324242591858, "rewards/rejected": -0.8380492925643921, "step": 2290 }, { "epoch": 0.7578253706754531, "grad_norm": 74.04139737650543, "learning_rate": 8.106260296540363e-07, "logits/chosen": 0.31947022676467896, "logits/rejected": 0.2140243500471115, "logps/chosen": -310.79998779296875, "logps/rejected": -310.54998779296875, "loss": 0.5147, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.46870118379592896, "rewards/margins": 1.3502197265625, "rewards/rejected": -0.8823486566543579, "step": 2300 }, { "epoch": 0.7611202635914333, "grad_norm": 93.35235720960387, "learning_rate": 8.098023064250411e-07, "logits/chosen": 0.29799193143844604, "logits/rejected": 0.16516724228858948, "logps/chosen": -285.95001220703125, "logps/rejected": -290.0, "loss": 0.571, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.44671630859375, "rewards/margins": 1.0691406726837158, "rewards/rejected": -0.62353515625, "step": 2310 }, { "epoch": 0.7644151565074135, "grad_norm": 63.142249984134224, "learning_rate": 8.08978583196046e-07, "logits/chosen": 0.2718749940395355, "logits/rejected": 0.10973815619945526, "logps/chosen": -288.29998779296875, "logps/rejected": -309.04998779296875, "loss": 0.5007, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.521801769733429, "rewards/margins": 1.294189453125, "rewards/rejected": -0.772753894329071, "step": 2320 }, { "epoch": 0.7677100494233937, "grad_norm": 70.58119625147961, "learning_rate": 8.08154859967051e-07, "logits/chosen": 0.16443482041358948, "logits/rejected": 0.11476745456457138, "logps/chosen": -335.25, "logps/rejected": -300.1499938964844, "loss": 0.4683, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.275390625, "rewards/margins": 1.2751953601837158, "rewards/rejected": -1.00146484375, "step": 2330 }, { "epoch": 0.771004942339374, "grad_norm": 62.53837276788589, "learning_rate": 8.07331136738056e-07, "logits/chosen": 0.16733399033546448, "logits/rejected": 0.03753967210650444, "logps/chosen": -355.45001220703125, "logps/rejected": -292.6000061035156, "loss": 0.5252, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.28509521484375, "rewards/margins": 1.3644530773162842, "rewards/rejected": -1.07940673828125, "step": 2340 }, { "epoch": 0.7742998352553542, "grad_norm": 75.42821481643472, "learning_rate": 8.065074135090609e-07, "logits/chosen": 0.31037598848342896, "logits/rejected": 0.24193724989891052, "logps/chosen": -307.79998779296875, "logps/rejected": -279.95001220703125, "loss": 0.5399, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4538940489292145, "rewards/margins": 1.1731445789337158, "rewards/rejected": -0.7205566167831421, "step": 2350 }, { "epoch": 0.7775947281713345, "grad_norm": 74.3567330300486, "learning_rate": 8.056836902800659e-07, "logits/chosen": 0.23219910264015198, "logits/rejected": 0.1884002685546875, "logps/chosen": -312.1499938964844, "logps/rejected": -298.20001220703125, "loss": 0.5401, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.517822265625, "rewards/margins": 1.0588867664337158, "rewards/rejected": -0.5409179925918579, "step": 2360 }, { "epoch": 0.7808896210873146, "grad_norm": 54.41034349795546, "learning_rate": 8.048599670510709e-07, "logits/chosen": 0.20416259765625, "logits/rejected": 0.05150756984949112, "logps/chosen": -343.04998779296875, "logps/rejected": -322.8999938964844, "loss": 0.5426, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.47508543729782104, "rewards/margins": 1.353515625, "rewards/rejected": -0.8768310546875, "step": 2370 }, { "epoch": 0.7841845140032949, "grad_norm": 73.70249235432293, "learning_rate": 8.040362438220758e-07, "logits/chosen": 0.18041686713695526, "logits/rejected": 0.10263671725988388, "logps/chosen": -329.8500061035156, "logps/rejected": -339.20001220703125, "loss": 0.5677, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.31390380859375, "rewards/margins": 1.3507080078125, "rewards/rejected": -1.0362060070037842, "step": 2380 }, { "epoch": 0.7874794069192751, "grad_norm": 55.740839802632856, "learning_rate": 8.032125205930806e-07, "logits/chosen": 0.23272705078125, "logits/rejected": 0.21962890028953552, "logps/chosen": -321.0, "logps/rejected": -333.8999938964844, "loss": 0.5466, "rewards/accuracies": 0.71875, "rewards/chosen": 0.456787109375, "rewards/margins": 1.267480492591858, "rewards/rejected": -0.8108276128768921, "step": 2390 }, { "epoch": 0.7907742998352554, "grad_norm": 67.31253933145251, "learning_rate": 8.023887973640856e-07, "logits/chosen": 0.3057617247104645, "logits/rejected": 0.2066604644060135, "logps/chosen": -270.54998779296875, "logps/rejected": -287.79998779296875, "loss": 0.615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14560547471046448, "rewards/margins": 0.858349621295929, "rewards/rejected": -0.713427722454071, "step": 2400 }, { "epoch": 0.7940691927512356, "grad_norm": 69.80648230836974, "learning_rate": 8.015650741350906e-07, "logits/chosen": 0.24364623427391052, "logits/rejected": 0.19688721001148224, "logps/chosen": -319.17498779296875, "logps/rejected": -353.04998779296875, "loss": 0.4787, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.5258544683456421, "rewards/margins": 1.600958228111267, "rewards/rejected": -1.0771484375, "step": 2410 }, { "epoch": 0.7973640856672158, "grad_norm": 72.9003101534135, "learning_rate": 8.007413509060955e-07, "logits/chosen": 0.24959106743335724, "logits/rejected": 0.12028808891773224, "logps/chosen": -292.95001220703125, "logps/rejected": -350.70001220703125, "loss": 0.5661, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.23342284560203552, "rewards/margins": 1.433203101158142, "rewards/rejected": -1.200097680091858, "step": 2420 }, { "epoch": 0.800658978583196, "grad_norm": 52.81153895410322, "learning_rate": 7.999176276771005e-07, "logits/chosen": 0.27447509765625, "logits/rejected": 0.18231201171875, "logps/chosen": -348.6000061035156, "logps/rejected": -321.54998779296875, "loss": 0.5017, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3136230409145355, "rewards/margins": 1.2105712890625, "rewards/rejected": -0.8978027105331421, "step": 2430 }, { "epoch": 0.8039538714991763, "grad_norm": 50.55055936953733, "learning_rate": 7.990939044481054e-07, "logits/chosen": 0.18552246689796448, "logits/rejected": 0.18233947455883026, "logps/chosen": -320.3999938964844, "logps/rejected": -310.3999938964844, "loss": 0.571, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23148193955421448, "rewards/margins": 1.2768065929412842, "rewards/rejected": -1.046044945716858, "step": 2440 }, { "epoch": 0.8072487644151565, "grad_norm": 91.5521565712543, "learning_rate": 7.982701812191104e-07, "logits/chosen": 0.28229981660842896, "logits/rejected": 0.14729003608226776, "logps/chosen": -339.8500061035156, "logps/rejected": -330.1000061035156, "loss": 0.5282, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.20733642578125, "rewards/margins": 1.427734375, "rewards/rejected": -1.2227051258087158, "step": 2450 }, { "epoch": 0.8105436573311368, "grad_norm": 47.95780604184854, "learning_rate": 7.974464579901154e-07, "logits/chosen": 0.27395325899124146, "logits/rejected": 0.16347046196460724, "logps/chosen": -351.25, "logps/rejected": -340.29998779296875, "loss": 0.3966, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.517895519733429, "rewards/margins": 1.7786133289337158, "rewards/rejected": -1.261010766029358, "step": 2460 }, { "epoch": 0.8138385502471169, "grad_norm": 65.86207510409537, "learning_rate": 7.966227347611202e-07, "logits/chosen": 0.13554687798023224, "logits/rejected": 0.01431884802877903, "logps/chosen": -314.1000061035156, "logps/rejected": -336.45001220703125, "loss": 0.5709, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.12090148776769638, "rewards/margins": 1.222741723060608, "rewards/rejected": -1.102508544921875, "step": 2470 }, { "epoch": 0.8171334431630972, "grad_norm": 78.17675336171659, "learning_rate": 7.957990115321251e-07, "logits/chosen": 0.19962158799171448, "logits/rejected": 0.09346923977136612, "logps/chosen": -353.6499938964844, "logps/rejected": -344.6000061035156, "loss": 0.5401, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06403808295726776, "rewards/margins": 1.209570288658142, "rewards/rejected": -1.145263671875, "step": 2480 }, { "epoch": 0.8204283360790774, "grad_norm": 58.99184578846836, "learning_rate": 7.949752883031301e-07, "logits/chosen": 0.21348877251148224, "logits/rejected": 0.07079009711742401, "logps/chosen": -347.5, "logps/rejected": -323.75, "loss": 0.4268, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.24440917372703552, "rewards/margins": 1.4538085460662842, "rewards/rejected": -1.20947265625, "step": 2490 }, { "epoch": 0.8237232289950577, "grad_norm": 81.68298408525655, "learning_rate": 7.941515650741351e-07, "logits/chosen": 0.31340330839157104, "logits/rejected": 0.18778076767921448, "logps/chosen": -306.75, "logps/rejected": -306.25, "loss": 0.5048, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08677978813648224, "rewards/margins": 1.3525390625, "rewards/rejected": -1.2659180164337158, "step": 2500 }, { "epoch": 0.8270181219110379, "grad_norm": 65.67263594044785, "learning_rate": 7.9332784184514e-07, "logits/chosen": 0.392578125, "logits/rejected": 0.23325195908546448, "logps/chosen": -349.3999938964844, "logps/rejected": -340.54998779296875, "loss": 0.44, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.21888427436351776, "rewards/margins": 1.530419945716858, "rewards/rejected": -1.311279296875, "step": 2510 }, { "epoch": 0.8303130148270181, "grad_norm": 65.52987224230817, "learning_rate": 7.925041186161449e-07, "logits/chosen": 0.36347657442092896, "logits/rejected": 0.11872558295726776, "logps/chosen": -364.1000061035156, "logps/rejected": -328.8500061035156, "loss": 0.5377, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.0943603515625, "rewards/margins": 1.469751000404358, "rewards/rejected": -1.375585913658142, "step": 2520 }, { "epoch": 0.8336079077429983, "grad_norm": 62.450037985738, "learning_rate": 7.9168039538715e-07, "logits/chosen": 0.3382720947265625, "logits/rejected": 0.24943237006664276, "logps/chosen": -346.375, "logps/rejected": -357.54998779296875, "loss": 0.5184, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3341430723667145, "rewards/margins": 1.550634741783142, "rewards/rejected": -1.2158691883087158, "step": 2530 }, { "epoch": 0.8369028006589786, "grad_norm": 67.41476243600249, "learning_rate": 7.908566721581548e-07, "logits/chosen": 0.2376708984375, "logits/rejected": 0.24680785834789276, "logps/chosen": -319.54998779296875, "logps/rejected": -300.79998779296875, "loss": 0.4725, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22202149033546448, "rewards/margins": 1.25146484375, "rewards/rejected": -1.030029296875, "step": 2540 }, { "epoch": 0.8401976935749588, "grad_norm": 107.55530395769323, "learning_rate": 7.900329489291597e-07, "logits/chosen": 0.35466307401657104, "logits/rejected": 0.27456361055374146, "logps/chosen": -344.8500061035156, "logps/rejected": -321.8999938964844, "loss": 0.5215, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3862976133823395, "rewards/margins": 1.2377440929412842, "rewards/rejected": -0.8529297113418579, "step": 2550 }, { "epoch": 0.8434925864909391, "grad_norm": 29.932581549814497, "learning_rate": 7.892092257001646e-07, "logits/chosen": 0.35466307401657104, "logits/rejected": 0.20094604790210724, "logps/chosen": -315.3999938964844, "logps/rejected": -343.79998779296875, "loss": 0.4481, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.6367553472518921, "rewards/margins": 1.635009765625, "rewards/rejected": -0.998730480670929, "step": 2560 }, { "epoch": 0.8467874794069192, "grad_norm": 77.19462244980512, "learning_rate": 7.883855024711697e-07, "logits/chosen": 0.4398559629917145, "logits/rejected": 0.30804443359375, "logps/chosen": -334.75, "logps/rejected": -318.79998779296875, "loss": 0.544, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.6449218988418579, "rewards/margins": 1.1712646484375, "rewards/rejected": -0.525927722454071, "step": 2570 }, { "epoch": 0.8500823723228995, "grad_norm": 69.33870268701422, "learning_rate": 7.875617792421746e-07, "logits/chosen": 0.43269044160842896, "logits/rejected": 0.23188476264476776, "logps/chosen": -284.20001220703125, "logps/rejected": -287.6000061035156, "loss": 0.5448, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.49894410371780396, "rewards/margins": 1.130029320716858, "rewards/rejected": -0.631762683391571, "step": 2580 }, { "epoch": 0.8533772652388797, "grad_norm": 67.87226505459252, "learning_rate": 7.867380560131795e-07, "logits/chosen": 0.3031249940395355, "logits/rejected": 0.21192017197608948, "logps/chosen": -374.75, "logps/rejected": -317.6499938964844, "loss": 0.5266, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.390716552734375, "rewards/margins": 1.224267601966858, "rewards/rejected": -0.8331054449081421, "step": 2590 }, { "epoch": 0.85667215815486, "grad_norm": 56.529377748060334, "learning_rate": 7.859143327841846e-07, "logits/chosen": 0.42451173067092896, "logits/rejected": 0.31730347871780396, "logps/chosen": -296.25, "logps/rejected": -309.1499938964844, "loss": 0.5793, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.22348633408546448, "rewards/margins": 1.1222045421600342, "rewards/rejected": -0.898730456829071, "step": 2600 }, { "epoch": 0.8599670510708401, "grad_norm": 83.75514159242887, "learning_rate": 7.850906095551895e-07, "logits/chosen": 0.36279296875, "logits/rejected": 0.24924317002296448, "logps/chosen": -320.6499938964844, "logps/rejected": -330.20001220703125, "loss": 0.534, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.38734740018844604, "rewards/margins": 1.339746117591858, "rewards/rejected": -0.950390636920929, "step": 2610 }, { "epoch": 0.8632619439868204, "grad_norm": 55.93739565497534, "learning_rate": 7.842668863261943e-07, "logits/chosen": 0.3804931640625, "logits/rejected": 0.30375367403030396, "logps/chosen": -291.20001220703125, "logps/rejected": -333.79998779296875, "loss": 0.5261, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.29747313261032104, "rewards/margins": 1.2220947742462158, "rewards/rejected": -0.9267822504043579, "step": 2620 }, { "epoch": 0.8665568369028006, "grad_norm": 59.21167591118024, "learning_rate": 7.834431630971992e-07, "logits/chosen": 0.39335936307907104, "logits/rejected": 0.3005127012729645, "logps/chosen": -326.0, "logps/rejected": -337.1499938964844, "loss": 0.4274, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.30559998750686646, "rewards/margins": 1.499609351158142, "rewards/rejected": -1.1946289539337158, "step": 2630 }, { "epoch": 0.8698517298187809, "grad_norm": 98.53752695793736, "learning_rate": 7.826194398682043e-07, "logits/chosen": 0.31871336698532104, "logits/rejected": 0.23347778618335724, "logps/chosen": -323.1499938964844, "logps/rejected": -291.8999938964844, "loss": 0.6469, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.4493774473667145, "rewards/margins": 1.136621117591858, "rewards/rejected": -0.6866821050643921, "step": 2640 }, { "epoch": 0.8731466227347611, "grad_norm": 43.13327555989111, "learning_rate": 7.817957166392092e-07, "logits/chosen": 0.2554931640625, "logits/rejected": 0.24516601860523224, "logps/chosen": -302.6499938964844, "logps/rejected": -293.04998779296875, "loss": 0.4966, "rewards/accuracies": 0.75, "rewards/chosen": 0.4494567811489105, "rewards/margins": 1.40423583984375, "rewards/rejected": -0.9546874761581421, "step": 2650 }, { "epoch": 0.8764415156507414, "grad_norm": 64.2375568968546, "learning_rate": 7.809719934102141e-07, "logits/chosen": 0.2771057188510895, "logits/rejected": 0.2275390625, "logps/chosen": -332.4750061035156, "logps/rejected": -309.1000061035156, "loss": 0.5146, "rewards/accuracies": 0.75, "rewards/chosen": 0.507885754108429, "rewards/margins": 1.4408690929412842, "rewards/rejected": -0.93310546875, "step": 2660 }, { "epoch": 0.8797364085667215, "grad_norm": 55.26919896934824, "learning_rate": 7.801482701812191e-07, "logits/chosen": 0.29710692167282104, "logits/rejected": 0.24587401747703552, "logps/chosen": -316.1000061035156, "logps/rejected": -335.70001220703125, "loss": 0.5383, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.48115235567092896, "rewards/margins": 1.1620361804962158, "rewards/rejected": -0.67987060546875, "step": 2670 }, { "epoch": 0.8830313014827018, "grad_norm": 59.71307661039097, "learning_rate": 7.793245469522241e-07, "logits/chosen": 0.3031005859375, "logits/rejected": 0.22981663048267365, "logps/chosen": -309.1499938964844, "logps/rejected": -276.79998779296875, "loss": 0.4591, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22524413466453552, "rewards/margins": 1.4277832508087158, "rewards/rejected": -1.2031738758087158, "step": 2680 }, { "epoch": 0.886326194398682, "grad_norm": 34.26967729401619, "learning_rate": 7.78500823723229e-07, "logits/chosen": 0.263113409280777, "logits/rejected": 0.16273193061351776, "logps/chosen": -336.8999938964844, "logps/rejected": -294.0, "loss": 0.5589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13063964247703552, "rewards/margins": 1.3425781726837158, "rewards/rejected": -1.2126953601837158, "step": 2690 }, { "epoch": 0.8896210873146623, "grad_norm": 56.45034183173711, "learning_rate": 7.776771004942338e-07, "logits/chosen": 0.17807617783546448, "logits/rejected": 0.18472442030906677, "logps/chosen": -305.82501220703125, "logps/rejected": -271.29998779296875, "loss": 0.6119, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.25224608182907104, "rewards/margins": 1.1628906726837158, "rewards/rejected": -0.910876452922821, "step": 2700 }, { "epoch": 0.8929159802306426, "grad_norm": 53.86957598620759, "learning_rate": 7.768533772652388e-07, "logits/chosen": 0.16094970703125, "logits/rejected": 0.10933838039636612, "logps/chosen": -329.95001220703125, "logps/rejected": -319.04998779296875, "loss": 0.5742, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3914428651332855, "rewards/margins": 1.1833984851837158, "rewards/rejected": -0.7927490472793579, "step": 2710 }, { "epoch": 0.8962108731466227, "grad_norm": 52.83020515613402, "learning_rate": 7.760296540362438e-07, "logits/chosen": 0.34882813692092896, "logits/rejected": 0.21574707329273224, "logps/chosen": -312.1000061035156, "logps/rejected": -309.29998779296875, "loss": 0.4634, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.3932556211948395, "rewards/margins": 1.3518555164337158, "rewards/rejected": -0.9591064453125, "step": 2720 }, { "epoch": 0.899505766062603, "grad_norm": 84.88340135523475, "learning_rate": 7.752059308072487e-07, "logits/chosen": 0.12624511122703552, "logits/rejected": 0.19576415419578552, "logps/chosen": -291.25, "logps/rejected": -310.8999938964844, "loss": 0.5368, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.3185791075229645, "rewards/margins": 1.1396484375, "rewards/rejected": -0.8201659917831421, "step": 2730 }, { "epoch": 0.9028006589785832, "grad_norm": 73.78820744756288, "learning_rate": 7.743822075782537e-07, "logits/chosen": 0.33464354276657104, "logits/rejected": 0.12598419189453125, "logps/chosen": -270.125, "logps/rejected": -323.8999938964844, "loss": 0.4768, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3169006407260895, "rewards/margins": 1.2841796875, "rewards/rejected": -0.966601550579071, "step": 2740 }, { "epoch": 0.9060955518945635, "grad_norm": 82.0624514277793, "learning_rate": 7.735584843492586e-07, "logits/chosen": 0.20704345405101776, "logits/rejected": 0.16107177734375, "logps/chosen": -331.70001220703125, "logps/rejected": -321.5, "loss": 0.5085, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.45037841796875, "rewards/margins": 1.3949463367462158, "rewards/rejected": -0.9444335699081421, "step": 2750 }, { "epoch": 0.9093904448105437, "grad_norm": 65.68342690777291, "learning_rate": 7.727347611202636e-07, "logits/chosen": 0.234130859375, "logits/rejected": 0.15070800483226776, "logps/chosen": -298.57501220703125, "logps/rejected": -302.57501220703125, "loss": 0.5029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.33269041776657104, "rewards/margins": 1.260839819908142, "rewards/rejected": -0.9276672601699829, "step": 2760 }, { "epoch": 0.9126853377265239, "grad_norm": 70.17947924083097, "learning_rate": 7.719110378912686e-07, "logits/chosen": 0.26379090547561646, "logits/rejected": 0.12652587890625, "logps/chosen": -383.6000061035156, "logps/rejected": -314.8999938964844, "loss": 0.4866, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.12835082411766052, "rewards/margins": 1.394628882408142, "rewards/rejected": -1.2653319835662842, "step": 2770 }, { "epoch": 0.9159802306425041, "grad_norm": 99.71205322881134, "learning_rate": 7.710873146622734e-07, "logits/chosen": 0.280844122171402, "logits/rejected": 0.17344971001148224, "logps/chosen": -312.6499938964844, "logps/rejected": -321.45001220703125, "loss": 0.5311, "rewards/accuracies": 0.71875, "rewards/chosen": 0.18195191025733948, "rewards/margins": 1.404687523841858, "rewards/rejected": -1.223242163658142, "step": 2780 }, { "epoch": 0.9192751235584844, "grad_norm": 80.98747567121026, "learning_rate": 7.702635914332783e-07, "logits/chosen": 0.2507263123989105, "logits/rejected": 0.17046508193016052, "logps/chosen": -295.3500061035156, "logps/rejected": -292.1000061035156, "loss": 0.5255, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.30991822481155396, "rewards/margins": 1.334985375404358, "rewards/rejected": -1.0243408679962158, "step": 2790 }, { "epoch": 0.9225700164744646, "grad_norm": 57.430150033062596, "learning_rate": 7.694398682042833e-07, "logits/chosen": 0.20993652939796448, "logits/rejected": 0.21867676079273224, "logps/chosen": -301.45001220703125, "logps/rejected": -306.8500061035156, "loss": 0.5136, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.24102783203125, "rewards/margins": 1.198144555091858, "rewards/rejected": -0.95751953125, "step": 2800 }, { "epoch": 0.9258649093904449, "grad_norm": 61.183443337375955, "learning_rate": 7.686161449752883e-07, "logits/chosen": 0.2950195372104645, "logits/rejected": 0.19235840439796448, "logps/chosen": -321.45001220703125, "logps/rejected": -332.04998779296875, "loss": 0.553, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24407958984375, "rewards/margins": 1.28759765625, "rewards/rejected": -1.042822241783142, "step": 2810 }, { "epoch": 0.929159802306425, "grad_norm": 75.43872451832871, "learning_rate": 7.677924217462932e-07, "logits/chosen": 0.32658690214157104, "logits/rejected": 0.22481688857078552, "logps/chosen": -308.75, "logps/rejected": -322.29998779296875, "loss": 0.629, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.2742553651332855, "rewards/margins": 1.0199706554412842, "rewards/rejected": -0.746691882610321, "step": 2820 }, { "epoch": 0.9324546952224053, "grad_norm": 70.12206703655555, "learning_rate": 7.669686985172981e-07, "logits/chosen": 0.27745360136032104, "logits/rejected": 0.223876953125, "logps/chosen": -333.3999938964844, "logps/rejected": -339.95001220703125, "loss": 0.6307, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.33702391386032104, "rewards/margins": 1.087255835533142, "rewards/rejected": -0.7515624761581421, "step": 2830 }, { "epoch": 0.9357495881383855, "grad_norm": 66.67740152041246, "learning_rate": 7.661449752883032e-07, "logits/chosen": 0.22267456352710724, "logits/rejected": 0.15753173828125, "logps/chosen": -303.9750061035156, "logps/rejected": -345.3999938964844, "loss": 0.4566, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.878222644329071, "rewards/margins": 1.4899413585662842, "rewards/rejected": -0.612060546875, "step": 2840 }, { "epoch": 0.9390444810543658, "grad_norm": 84.04830410464126, "learning_rate": 7.653212520593081e-07, "logits/chosen": 0.18471069633960724, "logits/rejected": 0.1744842529296875, "logps/chosen": -365.54998779296875, "logps/rejected": -316.45001220703125, "loss": 0.5743, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.486978143453598, "rewards/margins": 0.9319823980331421, "rewards/rejected": -0.4449218809604645, "step": 2850 }, { "epoch": 0.942339373970346, "grad_norm": 83.34518444827897, "learning_rate": 7.644975288303129e-07, "logits/chosen": 0.20348510146141052, "logits/rejected": 0.02377624437212944, "logps/chosen": -315.75, "logps/rejected": -307.1000061035156, "loss": 0.5545, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.38896483182907104, "rewards/margins": 1.037451148033142, "rewards/rejected": -0.6487792730331421, "step": 2860 }, { "epoch": 0.9456342668863262, "grad_norm": 69.40071155991824, "learning_rate": 7.636738056013178e-07, "logits/chosen": 0.17537841200828552, "logits/rejected": 0.20211562514305115, "logps/chosen": -341.95001220703125, "logps/rejected": -338.6000061035156, "loss": 0.576, "rewards/accuracies": 0.71875, "rewards/chosen": 0.49476319551467896, "rewards/margins": 1.0639159679412842, "rewards/rejected": -0.5701538324356079, "step": 2870 }, { "epoch": 0.9489291598023064, "grad_norm": 60.10074800713279, "learning_rate": 7.628500823723229e-07, "logits/chosen": 0.21361084282398224, "logits/rejected": 0.13518066704273224, "logps/chosen": -354.6000061035156, "logps/rejected": -313.45001220703125, "loss": 0.6423, "rewards/accuracies": 0.71875, "rewards/chosen": 0.511181652545929, "rewards/margins": 0.9640136957168579, "rewards/rejected": -0.45380860567092896, "step": 2880 }, { "epoch": 0.9522240527182867, "grad_norm": 62.863706679561986, "learning_rate": 7.620263591433278e-07, "logits/chosen": 0.28654783964157104, "logits/rejected": 0.23370361328125, "logps/chosen": -321.3999938964844, "logps/rejected": -321.6000061035156, "loss": 0.5665, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.276763916015625, "rewards/margins": 1.2199828624725342, "rewards/rejected": -0.943066418170929, "step": 2890 }, { "epoch": 0.9555189456342669, "grad_norm": 75.85174819518592, "learning_rate": 7.612026359143327e-07, "logits/chosen": 0.25048828125, "logits/rejected": 0.170501708984375, "logps/chosen": -318.29998779296875, "logps/rejected": -300.5, "loss": 0.5158, "rewards/accuracies": 0.75, "rewards/chosen": 0.5542846918106079, "rewards/margins": 1.392187476158142, "rewards/rejected": -0.8365234136581421, "step": 2900 }, { "epoch": 0.9588138385502472, "grad_norm": 48.87314149595329, "learning_rate": 7.603789126853378e-07, "logits/chosen": 0.18199463188648224, "logits/rejected": 0.03056030347943306, "logps/chosen": -352.125, "logps/rejected": -324.8999938964844, "loss": 0.484, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.4823242127895355, "rewards/margins": 1.5027344226837158, "rewards/rejected": -1.019921898841858, "step": 2910 }, { "epoch": 0.9621087314662273, "grad_norm": 76.38354398383704, "learning_rate": 7.595551894563427e-07, "logits/chosen": 0.23739013075828552, "logits/rejected": 0.18264159560203552, "logps/chosen": -316.20001220703125, "logps/rejected": -300.75, "loss": 0.5522, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.42424315214157104, "rewards/margins": 1.313269019126892, "rewards/rejected": -0.8888915777206421, "step": 2920 }, { "epoch": 0.9654036243822076, "grad_norm": 65.66217581285945, "learning_rate": 7.587314662273476e-07, "logits/chosen": 0.18336181342601776, "logits/rejected": 0.09786377102136612, "logps/chosen": -322.70001220703125, "logps/rejected": -330.29998779296875, "loss": 0.4831, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.38055419921875, "rewards/margins": 1.301660180091858, "rewards/rejected": -0.923046886920929, "step": 2930 }, { "epoch": 0.9686985172981878, "grad_norm": 90.19711995389912, "learning_rate": 7.579077429983524e-07, "logits/chosen": 0.2946533262729645, "logits/rejected": 0.13812866806983948, "logps/chosen": -325.75, "logps/rejected": -283.79998779296875, "loss": 0.603, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.07813720405101776, "rewards/margins": 1.134179711341858, "rewards/rejected": -1.0585448741912842, "step": 2940 }, { "epoch": 0.9719934102141681, "grad_norm": 62.170707446509, "learning_rate": 7.570840197693575e-07, "logits/chosen": 0.2771667540073395, "logits/rejected": 0.08527527004480362, "logps/chosen": -311.54998779296875, "logps/rejected": -322.29998779296875, "loss": 0.5803, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.17315673828125, "rewards/margins": 1.104394555091858, "rewards/rejected": -0.931689441204071, "step": 2950 }, { "epoch": 0.9752883031301482, "grad_norm": 91.31587955222454, "learning_rate": 7.562602965403624e-07, "logits/chosen": 0.23511353135108948, "logits/rejected": 0.04322509840130806, "logps/chosen": -355.95001220703125, "logps/rejected": -329.8500061035156, "loss": 0.498, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.09868469089269638, "rewards/margins": 1.5612304210662842, "rewards/rejected": -1.463134765625, "step": 2960 }, { "epoch": 0.9785831960461285, "grad_norm": 89.41919481710254, "learning_rate": 7.554365733113673e-07, "logits/chosen": 0.22012940049171448, "logits/rejected": 0.12391357123851776, "logps/chosen": -333.29998779296875, "logps/rejected": -336.0, "loss": 0.581, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04587402194738388, "rewards/margins": 1.274072289466858, "rewards/rejected": -1.2297852039337158, "step": 2970 }, { "epoch": 0.9818780889621087, "grad_norm": 83.85764433981284, "learning_rate": 7.546128500823723e-07, "logits/chosen": 0.27946776151657104, "logits/rejected": 0.17839355766773224, "logps/chosen": -313.70001220703125, "logps/rejected": -351.20001220703125, "loss": 0.6075, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05972900241613388, "rewards/margins": 0.9143737554550171, "rewards/rejected": -0.973339855670929, "step": 2980 }, { "epoch": 0.985172981878089, "grad_norm": 88.02416634467195, "learning_rate": 7.537891268533773e-07, "logits/chosen": 0.25300294160842896, "logits/rejected": 0.09105224907398224, "logps/chosen": -317.79998779296875, "logps/rejected": -299.75, "loss": 0.578, "rewards/accuracies": 0.71875, "rewards/chosen": 0.224761962890625, "rewards/margins": 1.1178710460662842, "rewards/rejected": -0.8937011957168579, "step": 2990 }, { "epoch": 0.9884678747940692, "grad_norm": 89.58404341507998, "learning_rate": 7.529654036243822e-07, "logits/chosen": 0.36787110567092896, "logits/rejected": 0.21736907958984375, "logps/chosen": -357.75, "logps/rejected": -328.29998779296875, "loss": 0.5089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15277710556983948, "rewards/margins": 1.275292992591858, "rewards/rejected": -1.122802734375, "step": 3000 }, { "epoch": 0.9917627677100495, "grad_norm": 60.28326800368977, "learning_rate": 7.521416803953872e-07, "logits/chosen": 0.36302489042282104, "logits/rejected": 0.3271545469760895, "logps/chosen": -354.75, "logps/rejected": -330.1000061035156, "loss": 0.5553, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5450683832168579, "rewards/margins": 1.20556640625, "rewards/rejected": -0.6590820550918579, "step": 3010 }, { "epoch": 0.9950576606260296, "grad_norm": 78.11995508503067, "learning_rate": 7.51317957166392e-07, "logits/chosen": 0.32553330063819885, "logits/rejected": 0.2623046934604645, "logps/chosen": -329.1499938964844, "logps/rejected": -324.25, "loss": 0.4286, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33454591035842896, "rewards/margins": 1.547753930091858, "rewards/rejected": -1.2138671875, "step": 3020 }, { "epoch": 0.9983525535420099, "grad_norm": 83.23297762629093, "learning_rate": 7.50494233937397e-07, "logits/chosen": 0.38023680448532104, "logits/rejected": 0.23785552382469177, "logps/chosen": -353.54998779296875, "logps/rejected": -328.3999938964844, "loss": 0.4984, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.38398438692092896, "rewards/margins": 1.376562476158142, "rewards/rejected": -0.9917968511581421, "step": 3030 }, { "epoch": 1.00164744645799, "grad_norm": 11.19680975415995, "learning_rate": 7.496705107084019e-07, "logits/chosen": 0.382568359375, "logits/rejected": 0.32524412870407104, "logps/chosen": -306.3999938964844, "logps/rejected": -323.79998779296875, "loss": 0.3031, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.084814429283142, "rewards/margins": 2.637890577316284, "rewards/rejected": -1.553497314453125, "step": 3040 }, { "epoch": 1.0049423393739703, "grad_norm": 10.132211061269032, "learning_rate": 7.488467874794069e-07, "logits/chosen": 0.23908691108226776, "logits/rejected": 0.18700408935546875, "logps/chosen": -313.6499938964844, "logps/rejected": -320.54998779296875, "loss": 0.0878, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7986328601837158, "rewards/margins": 4.186718940734863, "rewards/rejected": -2.385937452316284, "step": 3050 }, { "epoch": 1.0082372322899507, "grad_norm": 22.991638634123237, "learning_rate": 7.480230642504118e-07, "logits/chosen": 0.2668212950229645, "logits/rejected": 0.18588562309741974, "logps/chosen": -311.375, "logps/rejected": -322.54998779296875, "loss": 0.0786, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.377832055091858, "rewards/margins": 4.063281059265137, "rewards/rejected": -2.6878905296325684, "step": 3060 }, { "epoch": 1.0115321252059308, "grad_norm": 30.87392013788486, "learning_rate": 7.471993410214168e-07, "logits/chosen": 0.162628173828125, "logits/rejected": 0.009289550594985485, "logps/chosen": -334.8500061035156, "logps/rejected": -349.8999938964844, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": 1.408935546875, "rewards/margins": 4.998437404632568, "rewards/rejected": -3.59375, "step": 3070 }, { "epoch": 1.014827018121911, "grad_norm": 59.131644794704755, "learning_rate": 7.463756177924218e-07, "logits/chosen": 0.0269775390625, "logits/rejected": -0.014956665225327015, "logps/chosen": -311.79998779296875, "logps/rejected": -337.79998779296875, "loss": 0.0928, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.716198742389679, "rewards/margins": 4.65625, "rewards/rejected": -3.9375, "step": 3080 }, { "epoch": 1.0181219110378912, "grad_norm": 9.441013266848827, "learning_rate": 7.455518945634267e-07, "logits/chosen": 0.01442871056497097, "logits/rejected": -0.06273193657398224, "logps/chosen": -359.1499938964844, "logps/rejected": -339.45001220703125, "loss": 0.06, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7784179449081421, "rewards/margins": 4.903906345367432, "rewards/rejected": -4.128125190734863, "step": 3090 }, { "epoch": 1.0214168039538716, "grad_norm": 14.70562781745289, "learning_rate": 7.447281713344315e-07, "logits/chosen": -0.010791015811264515, "logits/rejected": -0.05517578125, "logps/chosen": -337.3500061035156, "logps/rejected": -332.45001220703125, "loss": 0.0592, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.031042456626892, "rewards/margins": 5.010937690734863, "rewards/rejected": -3.9828124046325684, "step": 3100 }, { "epoch": 1.0247116968698518, "grad_norm": 46.56344469204253, "learning_rate": 7.439044481054365e-07, "logits/chosen": 0.008129882626235485, "logits/rejected": -0.05895080417394638, "logps/chosen": -306.70001220703125, "logps/rejected": -316.04998779296875, "loss": 0.1034, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.690966784954071, "rewards/margins": 4.349218845367432, "rewards/rejected": -3.6546874046325684, "step": 3110 }, { "epoch": 1.028006589785832, "grad_norm": 11.377568920156142, "learning_rate": 7.430807248764415e-07, "logits/chosen": 0.08388061821460724, "logits/rejected": -0.09454345703125, "logps/chosen": -323.70001220703125, "logps/rejected": -348.5, "loss": 0.0782, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6152588129043579, "rewards/margins": 4.733593940734863, "rewards/rejected": -4.118750095367432, "step": 3120 }, { "epoch": 1.031301482701812, "grad_norm": 8.870153083251028, "learning_rate": 7.422570016474464e-07, "logits/chosen": -0.070465087890625, "logits/rejected": -0.23527374863624573, "logps/chosen": -303.125, "logps/rejected": -332.29998779296875, "loss": 0.0683, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.49378663301467896, "rewards/margins": 4.79296875, "rewards/rejected": -4.30078125, "step": 3130 }, { "epoch": 1.0345963756177925, "grad_norm": 12.419385244917686, "learning_rate": 7.414332784184513e-07, "logits/chosen": -0.16442260146141052, "logits/rejected": -0.2728332579135895, "logps/chosen": -330.25, "logps/rejected": -390.3500061035156, "loss": 0.0538, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.31730955839157104, "rewards/margins": 5.5234375, "rewards/rejected": -5.202343940734863, "step": 3140 }, { "epoch": 1.0378912685337727, "grad_norm": 17.732245564537653, "learning_rate": 7.406095551894564e-07, "logits/chosen": -0.15170899033546448, "logits/rejected": -0.26023560762405396, "logps/chosen": -317.0, "logps/rejected": -381.8500061035156, "loss": 0.0515, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.02573242224752903, "rewards/margins": 5.799218654632568, "rewards/rejected": -5.827343940734863, "step": 3150 }, { "epoch": 1.0411861614497528, "grad_norm": 59.01806554735916, "learning_rate": 7.397858319604613e-07, "logits/chosen": -0.20616455376148224, "logits/rejected": -0.29736328125, "logps/chosen": -353.54998779296875, "logps/rejected": -370.8999938964844, "loss": 0.0909, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01210937462747097, "rewards/margins": 5.702343940734863, "rewards/rejected": -5.69140625, "step": 3160 }, { "epoch": 1.044481054365733, "grad_norm": 27.330557492786696, "learning_rate": 7.389621087314662e-07, "logits/chosen": -0.11037597805261612, "logits/rejected": -0.13364258408546448, "logps/chosen": -320.25, "logps/rejected": -350.20001220703125, "loss": 0.0622, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.5759521722793579, "rewards/margins": 5.617968559265137, "rewards/rejected": -5.037499904632568, "step": 3170 }, { "epoch": 1.0477759472817134, "grad_norm": 6.6621208389814575, "learning_rate": 7.38138385502471e-07, "logits/chosen": -0.09835205227136612, "logits/rejected": -0.15324707329273224, "logps/chosen": -324.54998779296875, "logps/rejected": -353.8999938964844, "loss": 0.0517, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.0003662109375, "rewards/margins": 5.451562404632568, "rewards/rejected": -4.449999809265137, "step": 3180 }, { "epoch": 1.0510708401976936, "grad_norm": 41.45954257353409, "learning_rate": 7.373146622734761e-07, "logits/chosen": -0.06831665337085724, "logits/rejected": -0.22577209770679474, "logps/chosen": -355.54998779296875, "logps/rejected": -401.1499938964844, "loss": 0.0919, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5484863519668579, "rewards/margins": 6.079687595367432, "rewards/rejected": -5.535937309265137, "step": 3190 }, { "epoch": 1.0543657331136738, "grad_norm": 11.241754695341083, "learning_rate": 7.36490939044481e-07, "logits/chosen": -0.1728668212890625, "logits/rejected": -0.28021240234375, "logps/chosen": -331.20001220703125, "logps/rejected": -359.1499938964844, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 0.714599609375, "rewards/margins": 5.756249904632568, "rewards/rejected": -5.037499904632568, "step": 3200 }, { "epoch": 1.057660626029654, "grad_norm": 24.24873893777661, "learning_rate": 7.356672158154859e-07, "logits/chosen": -0.0968017578125, "logits/rejected": -0.25459593534469604, "logps/chosen": -295.6499938964844, "logps/rejected": -314.3999938964844, "loss": 0.075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3728271424770355, "rewards/margins": 5.0234375, "rewards/rejected": -4.650781154632568, "step": 3210 }, { "epoch": 1.0609555189456343, "grad_norm": 8.618285453428832, "learning_rate": 7.34843492586491e-07, "logits/chosen": -0.18349608778953552, "logits/rejected": -0.3699707090854645, "logps/chosen": -357.75, "logps/rejected": -358.3500061035156, "loss": 0.0707, "rewards/accuracies": 0.96875, "rewards/chosen": 0.520904541015625, "rewards/margins": 5.765625, "rewards/rejected": -5.245312690734863, "step": 3220 }, { "epoch": 1.0642504118616145, "grad_norm": 7.1584620774351215, "learning_rate": 7.340197693574959e-07, "logits/chosen": -0.2515014708042145, "logits/rejected": -0.28291624784469604, "logps/chosen": -335.95001220703125, "logps/rejected": -355.29998779296875, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957031011581421, "rewards/margins": 5.753125190734863, "rewards/rejected": -4.751562595367432, "step": 3230 }, { "epoch": 1.0675453047775947, "grad_norm": 14.42312923928861, "learning_rate": 7.331960461285008e-07, "logits/chosen": -0.21013489365577698, "logits/rejected": -0.38520509004592896, "logps/chosen": -328.95001220703125, "logps/rejected": -357.5, "loss": 0.0461, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.518200695514679, "rewards/margins": 5.529687404632568, "rewards/rejected": -5.012499809265137, "step": 3240 }, { "epoch": 1.0708401976935749, "grad_norm": 5.431000666481532, "learning_rate": 7.323723228995057e-07, "logits/chosen": -0.10678710788488388, "logits/rejected": -0.3402099609375, "logps/chosen": -339.6000061035156, "logps/rejected": -398.3500061035156, "loss": 0.0383, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.47705078125, "rewards/margins": 5.744531154632568, "rewards/rejected": -5.268750190734863, "step": 3250 }, { "epoch": 1.0741350906095553, "grad_norm": 5.152326824085003, "learning_rate": 7.315485996705107e-07, "logits/chosen": -0.298806756734848, "logits/rejected": -0.38371580839157104, "logps/chosen": -338.8999938964844, "logps/rejected": -343.25, "loss": 0.06, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.37108153104782104, "rewards/margins": 5.53125, "rewards/rejected": -5.163281440734863, "step": 3260 }, { "epoch": 1.0774299835255354, "grad_norm": 13.11840023549415, "learning_rate": 7.307248764415156e-07, "logits/chosen": -0.14284057915210724, "logits/rejected": -0.33320313692092896, "logps/chosen": -341.3999938964844, "logps/rejected": -375.29998779296875, "loss": 0.0679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5404907464981079, "rewards/margins": 5.642187595367432, "rewards/rejected": -5.104687690734863, "step": 3270 }, { "epoch": 1.0807248764415156, "grad_norm": 16.632196302569263, "learning_rate": 7.299011532125205e-07, "logits/chosen": -0.24086913466453552, "logits/rejected": -0.508532702922821, "logps/chosen": -334.0249938964844, "logps/rejected": -359.95001220703125, "loss": 0.0436, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.562695324420929, "rewards/margins": 5.464062690734863, "rewards/rejected": -4.896874904632568, "step": 3280 }, { "epoch": 1.084019769357496, "grad_norm": 12.488300552564402, "learning_rate": 7.290774299835255e-07, "logits/chosen": -0.22083739936351776, "logits/rejected": -0.18668517470359802, "logps/chosen": -318.70001220703125, "logps/rejected": -347.1000061035156, "loss": 0.0547, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.52960205078125, "rewards/margins": 5.655468940734863, "rewards/rejected": -5.125, "step": 3290 }, { "epoch": 1.0873146622734762, "grad_norm": 26.011822652058484, "learning_rate": 7.282537067545305e-07, "logits/chosen": -0.21538086235523224, "logits/rejected": -0.22573241591453552, "logps/chosen": -306.5, "logps/rejected": -378.25, "loss": 0.0795, "rewards/accuracies": 0.96875, "rewards/chosen": 0.3234497010707855, "rewards/margins": 5.239062309265137, "rewards/rejected": -4.9140625, "step": 3300 }, { "epoch": 1.0906095551894563, "grad_norm": 16.348811946524638, "learning_rate": 7.274299835255354e-07, "logits/chosen": -0.18476562201976776, "logits/rejected": -0.39628297090530396, "logps/chosen": -326.29998779296875, "logps/rejected": -349.6499938964844, "loss": 0.0589, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2606201171875, "rewards/margins": 5.264843940734863, "rewards/rejected": -5.528124809265137, "step": 3310 }, { "epoch": 1.0939044481054365, "grad_norm": 46.64658809811542, "learning_rate": 7.266062602965404e-07, "logits/chosen": -0.22144165635108948, "logits/rejected": -0.3678039610385895, "logps/chosen": -336.45001220703125, "logps/rejected": -352.70001220703125, "loss": 0.0792, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.13089600205421448, "rewards/margins": 5.337500095367432, "rewards/rejected": -5.209374904632568, "step": 3320 }, { "epoch": 1.0971993410214167, "grad_norm": 20.57117456998853, "learning_rate": 7.257825370675452e-07, "logits/chosen": -0.42021483182907104, "logits/rejected": -0.37968748807907104, "logps/chosen": -302.5, "logps/rejected": -346.04998779296875, "loss": 0.0811, "rewards/accuracies": 0.96875, "rewards/chosen": 0.11017151176929474, "rewards/margins": 5.833593845367432, "rewards/rejected": -5.72265625, "step": 3330 }, { "epoch": 1.100494233937397, "grad_norm": 13.208432901709934, "learning_rate": 7.249588138385502e-07, "logits/chosen": -0.2612365782260895, "logits/rejected": -0.352731317281723, "logps/chosen": -358.3500061035156, "logps/rejected": -387.04998779296875, "loss": 0.05, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.27235716581344604, "rewards/margins": 5.6796875, "rewards/rejected": -5.40234375, "step": 3340 }, { "epoch": 1.1037891268533773, "grad_norm": 5.212147044942872, "learning_rate": 7.241350906095551e-07, "logits/chosen": -0.12652587890625, "logits/rejected": -0.23823852837085724, "logps/chosen": -330.29998779296875, "logps/rejected": -408.1000061035156, "loss": 0.0676, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.35441893339157104, "rewards/margins": 6.042187690734863, "rewards/rejected": -5.682812690734863, "step": 3350 }, { "epoch": 1.1070840197693574, "grad_norm": 16.818972303432307, "learning_rate": 7.233113673805601e-07, "logits/chosen": -0.18056640028953552, "logits/rejected": -0.365274041891098, "logps/chosen": -333.5249938964844, "logps/rejected": -327.29998779296875, "loss": 0.0775, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.5258423089981079, "rewards/margins": 5.313281059265137, "rewards/rejected": -4.785937309265137, "step": 3360 }, { "epoch": 1.1103789126853378, "grad_norm": 6.160566718997456, "learning_rate": 7.22487644151565e-07, "logits/chosen": -0.13991698622703552, "logits/rejected": -0.276824951171875, "logps/chosen": -319.8500061035156, "logps/rejected": -348.8999938964844, "loss": 0.0849, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4405517578125, "rewards/margins": 5.625, "rewards/rejected": -5.185156345367432, "step": 3370 }, { "epoch": 1.113673805601318, "grad_norm": 66.73861450355348, "learning_rate": 7.2166392092257e-07, "logits/chosen": -0.29865723848342896, "logits/rejected": -0.3695312440395355, "logps/chosen": -318.3999938964844, "logps/rejected": -347.8999938964844, "loss": 0.101, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.18603515625, "rewards/margins": 5.72265625, "rewards/rejected": -5.532812595367432, "step": 3380 }, { "epoch": 1.1169686985172982, "grad_norm": 22.70762649065318, "learning_rate": 7.20840197693575e-07, "logits/chosen": -0.21878662705421448, "logits/rejected": -0.29643553495407104, "logps/chosen": -302.79998779296875, "logps/rejected": -351.45001220703125, "loss": 0.0922, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2469482421875, "rewards/margins": 5.524218559265137, "rewards/rejected": -5.275781154632568, "step": 3390 }, { "epoch": 1.1202635914332784, "grad_norm": 8.899003730660318, "learning_rate": 7.200164744645799e-07, "logits/chosen": -0.1968994140625, "logits/rejected": -0.2518859803676605, "logps/chosen": -340.20001220703125, "logps/rejected": -386.70001220703125, "loss": 0.0548, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.47993165254592896, "rewards/margins": 5.492968559265137, "rewards/rejected": -5.015625, "step": 3400 }, { "epoch": 1.1235584843492585, "grad_norm": 15.370396809041074, "learning_rate": 7.191927512355847e-07, "logits/chosen": -0.11787720024585724, "logits/rejected": -0.305349737405777, "logps/chosen": -331.79998779296875, "logps/rejected": -363.79998779296875, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 0.3969970643520355, "rewards/margins": 5.453906059265137, "rewards/rejected": -5.053124904632568, "step": 3410 }, { "epoch": 1.126853377265239, "grad_norm": 48.97505296832362, "learning_rate": 7.183690280065897e-07, "logits/chosen": -0.028564453125, "logits/rejected": -0.28558653593063354, "logps/chosen": -302.3500061035156, "logps/rejected": -333.70001220703125, "loss": 0.1019, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0633544921875, "rewards/margins": 5.4296875, "rewards/rejected": -5.360937595367432, "step": 3420 }, { "epoch": 1.130148270181219, "grad_norm": 19.10070407485391, "learning_rate": 7.175453047775947e-07, "logits/chosen": -0.15157470107078552, "logits/rejected": -0.31853026151657104, "logps/chosen": -334.6499938964844, "logps/rejected": -369.3999938964844, "loss": 0.0615, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13630370795726776, "rewards/margins": 5.555468559265137, "rewards/rejected": -5.421875, "step": 3430 }, { "epoch": 1.1334431630971993, "grad_norm": 18.96977431777034, "learning_rate": 7.167215815485996e-07, "logits/chosen": -0.30205076932907104, "logits/rejected": -0.37575072050094604, "logps/chosen": -327.75, "logps/rejected": -364.3999938964844, "loss": 0.0561, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.28300172090530396, "rewards/margins": 5.823437690734863, "rewards/rejected": -6.110156059265137, "step": 3440 }, { "epoch": 1.1367380560131797, "grad_norm": 1.867764788259933, "learning_rate": 7.158978583196045e-07, "logits/chosen": -0.18898925185203552, "logits/rejected": -0.28760987520217896, "logps/chosen": -316.8999938964844, "logps/rejected": -351.3999938964844, "loss": 0.0898, "rewards/accuracies": 0.96875, "rewards/chosen": 0.1628265380859375, "rewards/margins": 5.732812404632568, "rewards/rejected": -5.565625190734863, "step": 3450 }, { "epoch": 1.1400329489291599, "grad_norm": 7.126289836466283, "learning_rate": 7.150741350906096e-07, "logits/chosen": -0.09157104790210724, "logits/rejected": -0.4722656309604645, "logps/chosen": -292.95001220703125, "logps/rejected": -357.5, "loss": 0.0485, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2821411192417145, "rewards/margins": 6.025000095367432, "rewards/rejected": -5.740624904632568, "step": 3460 }, { "epoch": 1.14332784184514, "grad_norm": 48.14537788945499, "learning_rate": 7.142504118616145e-07, "logits/chosen": -0.296630859375, "logits/rejected": -0.394775390625, "logps/chosen": -315.5, "logps/rejected": -411.20001220703125, "loss": 0.067, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.18513183295726776, "rewards/margins": 6.435937404632568, "rewards/rejected": -6.245312690734863, "step": 3470 }, { "epoch": 1.1466227347611202, "grad_norm": 11.38376349552352, "learning_rate": 7.134266886326194e-07, "logits/chosen": -0.25236815214157104, "logits/rejected": -0.3161254823207855, "logps/chosen": -325.0, "logps/rejected": -376.6000061035156, "loss": 0.1108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.19497069716453552, "rewards/margins": 5.733593940734863, "rewards/rejected": -5.541406154632568, "step": 3480 }, { "epoch": 1.1499176276771004, "grad_norm": 31.195580852403637, "learning_rate": 7.126029654036244e-07, "logits/chosen": -0.143341064453125, "logits/rejected": -0.18976441025733948, "logps/chosen": -296.7250061035156, "logps/rejected": -335.95001220703125, "loss": 0.087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.672778308391571, "rewards/margins": 5.637499809265137, "rewards/rejected": -4.964062690734863, "step": 3490 }, { "epoch": 1.1532125205930808, "grad_norm": 26.021650068188023, "learning_rate": 7.117792421746293e-07, "logits/chosen": -0.16868896782398224, "logits/rejected": -0.35948485136032104, "logps/chosen": -358.75, "logps/rejected": -379.45001220703125, "loss": 0.0567, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.6000000238418579, "rewards/margins": 6.298437595367432, "rewards/rejected": -5.698437690734863, "step": 3500 }, { "epoch": 1.156507413509061, "grad_norm": 16.415333676675985, "learning_rate": 7.109555189456342e-07, "logits/chosen": -0.27772217988967896, "logits/rejected": -0.39875489473342896, "logps/chosen": -340.1000061035156, "logps/rejected": -389.25, "loss": 0.0634, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.08135986328125, "rewards/margins": 5.852343559265137, "rewards/rejected": -5.772656440734863, "step": 3510 }, { "epoch": 1.1598023064250411, "grad_norm": 12.98487629169776, "learning_rate": 7.101317957166391e-07, "logits/chosen": -0.18986816704273224, "logits/rejected": -0.2652343809604645, "logps/chosen": -303.1499938964844, "logps/rejected": -367.0, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": -0.22139891982078552, "rewards/margins": 5.54296875, "rewards/rejected": -5.764062404632568, "step": 3520 }, { "epoch": 1.1630971993410215, "grad_norm": 29.414208873307192, "learning_rate": 7.093080724876442e-07, "logits/chosen": -0.09163513034582138, "logits/rejected": -0.25904542207717896, "logps/chosen": -334.3999938964844, "logps/rejected": -379.6000061035156, "loss": 0.1545, "rewards/accuracies": 0.96875, "rewards/chosen": 0.05522460862994194, "rewards/margins": 5.471875190734863, "rewards/rejected": -5.414843559265137, "step": 3530 }, { "epoch": 1.1663920922570017, "grad_norm": 36.58402816530656, "learning_rate": 7.084843492586491e-07, "logits/chosen": -0.17304687201976776, "logits/rejected": -0.2842163145542145, "logps/chosen": -334.8500061035156, "logps/rejected": -382.54998779296875, "loss": 0.0508, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4195312559604645, "rewards/margins": 5.982812404632568, "rewards/rejected": -5.5625, "step": 3540 }, { "epoch": 1.1696869851729819, "grad_norm": 2.637155213062344, "learning_rate": 7.07660626029654e-07, "logits/chosen": -0.17376098036766052, "logits/rejected": -0.3501830995082855, "logps/chosen": -334.45001220703125, "logps/rejected": -348.75, "loss": 0.054, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.18468627333641052, "rewards/margins": 5.723437309265137, "rewards/rejected": -5.538281440734863, "step": 3550 }, { "epoch": 1.172981878088962, "grad_norm": 36.48220185177535, "learning_rate": 7.06836902800659e-07, "logits/chosen": -0.21186523139476776, "logits/rejected": -0.3752075135707855, "logps/chosen": -316.8500061035156, "logps/rejected": -345.3999938964844, "loss": 0.0694, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.32740479707717896, "rewards/margins": 5.619531154632568, "rewards/rejected": -5.293749809265137, "step": 3560 }, { "epoch": 1.1762767710049424, "grad_norm": 24.551146477573592, "learning_rate": 7.060131795716639e-07, "logits/chosen": -0.21619872748851776, "logits/rejected": -0.2844604551792145, "logps/chosen": -315.8500061035156, "logps/rejected": -376.1000061035156, "loss": 0.0724, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.32196044921875, "rewards/margins": 5.592187404632568, "rewards/rejected": -5.268750190734863, "step": 3570 }, { "epoch": 1.1795716639209226, "grad_norm": 16.563542017188343, "learning_rate": 7.051894563426688e-07, "logits/chosen": -0.3635497987270355, "logits/rejected": -0.4938949644565582, "logps/chosen": -332.3500061035156, "logps/rejected": -345.79998779296875, "loss": 0.052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4625244140625, "rewards/margins": 6.157812595367432, "rewards/rejected": -5.6953125, "step": 3580 }, { "epoch": 1.1828665568369028, "grad_norm": 13.337926186888152, "learning_rate": 7.043657331136737e-07, "logits/chosen": -0.10344848781824112, "logits/rejected": -0.20332030951976776, "logps/chosen": -327.79998779296875, "logps/rejected": -374.0, "loss": 0.0695, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.557812511920929, "rewards/margins": 5.8671875, "rewards/rejected": -5.30859375, "step": 3590 }, { "epoch": 1.186161449752883, "grad_norm": 10.788599514700964, "learning_rate": 7.035420098846787e-07, "logits/chosen": -0.20097045600414276, "logits/rejected": -0.41022950410842896, "logps/chosen": -359.1000061035156, "logps/rejected": -382.0, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 0.625561535358429, "rewards/margins": 5.984375, "rewards/rejected": -5.36328125, "step": 3600 }, { "epoch": 1.1894563426688634, "grad_norm": 21.89751704182672, "learning_rate": 7.027182866556837e-07, "logits/chosen": -0.35384827852249146, "logits/rejected": -0.47001951932907104, "logps/chosen": -312.8999938964844, "logps/rejected": -357.6000061035156, "loss": 0.069, "rewards/accuracies": 0.96875, "rewards/chosen": 0.21202392876148224, "rewards/margins": 5.7265625, "rewards/rejected": -5.515625, "step": 3610 }, { "epoch": 1.1927512355848435, "grad_norm": 19.12870215465729, "learning_rate": 7.018945634266886e-07, "logits/chosen": -0.29582518339157104, "logits/rejected": -0.6346679925918579, "logps/chosen": -291.1000061035156, "logps/rejected": -331.95001220703125, "loss": 0.069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.24547119438648224, "rewards/margins": 5.453906059265137, "rewards/rejected": -5.692968845367432, "step": 3620 }, { "epoch": 1.1960461285008237, "grad_norm": 12.598496617066585, "learning_rate": 7.010708401976936e-07, "logits/chosen": -0.38170164823532104, "logits/rejected": -0.522216796875, "logps/chosen": -287.6499938964844, "logps/rejected": -337.8999938964844, "loss": 0.0833, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.10673828423023224, "rewards/margins": 5.346093654632568, "rewards/rejected": -5.243750095367432, "step": 3630 }, { "epoch": 1.1993410214168039, "grad_norm": 4.457157450046307, "learning_rate": 7.002471169686985e-07, "logits/chosen": -0.42274171113967896, "logits/rejected": -0.5426269769668579, "logps/chosen": -336.70001220703125, "logps/rejected": -381.0, "loss": 0.0691, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4240478575229645, "rewards/margins": 5.807812690734863, "rewards/rejected": -5.3828125, "step": 3640 }, { "epoch": 1.2026359143327843, "grad_norm": 3.613767589226146, "learning_rate": 6.994233937397034e-07, "logits/chosen": -0.3341613709926605, "logits/rejected": -0.507080078125, "logps/chosen": -337.95001220703125, "logps/rejected": -348.75, "loss": 0.0708, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.37503355741500854, "rewards/margins": 5.671875, "rewards/rejected": -5.294140815734863, "step": 3650 }, { "epoch": 1.2059308072487644, "grad_norm": 18.32840057447811, "learning_rate": 6.985996705107083e-07, "logits/chosen": -0.4156738221645355, "logits/rejected": -0.5690551996231079, "logps/chosen": -316.1000061035156, "logps/rejected": -335.95001220703125, "loss": 0.1039, "rewards/accuracies": 0.96875, "rewards/chosen": -0.09447021782398224, "rewards/margins": 5.46484375, "rewards/rejected": -5.564062595367432, "step": 3660 }, { "epoch": 1.2092257001647446, "grad_norm": 23.31131928752302, "learning_rate": 6.977759472817133e-07, "logits/chosen": -0.5107421875, "logits/rejected": -0.629199206829071, "logps/chosen": -370.54998779296875, "logps/rejected": -370.1499938964844, "loss": 0.0639, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.0035888671409338713, "rewards/margins": 5.651562690734863, "rewards/rejected": -5.661718845367432, "step": 3670 }, { "epoch": 1.2125205930807248, "grad_norm": 10.224827052649621, "learning_rate": 6.969522240527182e-07, "logits/chosen": -0.461181640625, "logits/rejected": -0.538684070110321, "logps/chosen": -288.29998779296875, "logps/rejected": -367.25, "loss": 0.0423, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.22756347060203552, "rewards/margins": 6.043749809265137, "rewards/rejected": -5.817187309265137, "step": 3680 }, { "epoch": 1.2158154859967052, "grad_norm": 2.5023246552875404, "learning_rate": 6.961285008237232e-07, "logits/chosen": -0.4109207093715668, "logits/rejected": -0.623730480670929, "logps/chosen": -330.29998779296875, "logps/rejected": -367.79998779296875, "loss": 0.058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13078613579273224, "rewards/margins": 5.946875095367432, "rewards/rejected": -5.8125, "step": 3690 }, { "epoch": 1.2191103789126854, "grad_norm": 21.10322552512054, "learning_rate": 6.953047775947282e-07, "logits/chosen": -0.5343254208564758, "logits/rejected": -0.621197521686554, "logps/chosen": -314.6499938964844, "logps/rejected": -338.6000061035156, "loss": 0.1144, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.09907837212085724, "rewards/margins": 5.428124904632568, "rewards/rejected": -5.328125, "step": 3700 }, { "epoch": 1.2224052718286655, "grad_norm": 53.51597205378802, "learning_rate": 6.944810543657331e-07, "logits/chosen": -0.30572509765625, "logits/rejected": -0.57958984375, "logps/chosen": -346.79998779296875, "logps/rejected": -352.20001220703125, "loss": 0.0794, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.09287109225988388, "rewards/margins": 5.885937690734863, "rewards/rejected": -5.979687690734863, "step": 3710 }, { "epoch": 1.2257001647446457, "grad_norm": 53.956176258413564, "learning_rate": 6.93657331136738e-07, "logits/chosen": -0.42011719942092896, "logits/rejected": -0.529370129108429, "logps/chosen": -339.45001220703125, "logps/rejected": -377.0, "loss": 0.0706, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.2508178651332855, "rewards/margins": 6.235937595367432, "rewards/rejected": -5.9921875, "step": 3720 }, { "epoch": 1.2289950576606261, "grad_norm": 4.627224961566724, "learning_rate": 6.92833607907743e-07, "logits/chosen": -0.3093933165073395, "logits/rejected": -0.541125476360321, "logps/chosen": -291.04998779296875, "logps/rejected": -386.25, "loss": 0.0619, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.10118408501148224, "rewards/margins": 5.646874904632568, "rewards/rejected": -5.543749809265137, "step": 3730 }, { "epoch": 1.2322899505766063, "grad_norm": 9.228370411556165, "learning_rate": 6.920098846787479e-07, "logits/chosen": -0.2991271913051605, "logits/rejected": -0.43287354707717896, "logps/chosen": -305.2250061035156, "logps/rejected": -348.45001220703125, "loss": 0.0844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.43408203125, "rewards/margins": 5.8125, "rewards/rejected": -5.37890625, "step": 3740 }, { "epoch": 1.2355848434925865, "grad_norm": 13.762701981149945, "learning_rate": 6.911861614497528e-07, "logits/chosen": -0.19949951767921448, "logits/rejected": -0.4571777284145355, "logps/chosen": -306.45001220703125, "logps/rejected": -364.6000061035156, "loss": 0.0746, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.23686523735523224, "rewards/margins": 5.7109375, "rewards/rejected": -5.473437309265137, "step": 3750 }, { "epoch": 1.2388797364085666, "grad_norm": 31.42018670661175, "learning_rate": 6.903624382207577e-07, "logits/chosen": -0.46330565214157104, "logits/rejected": -0.538256824016571, "logps/chosen": -317.45001220703125, "logps/rejected": -390.70001220703125, "loss": 0.0989, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3233398497104645, "rewards/margins": 6.315625190734863, "rewards/rejected": -5.9921875, "step": 3760 }, { "epoch": 1.242174629324547, "grad_norm": 4.828339030817575, "learning_rate": 6.895387149917628e-07, "logits/chosen": -0.339447021484375, "logits/rejected": -0.5807129144668579, "logps/chosen": -296.20001220703125, "logps/rejected": -336.3500061035156, "loss": 0.0404, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.22389526665210724, "rewards/margins": 5.860937595367432, "rewards/rejected": -5.634375095367432, "step": 3770 }, { "epoch": 1.2454695222405272, "grad_norm": 11.762344464720499, "learning_rate": 6.887149917627677e-07, "logits/chosen": -0.3319152891635895, "logits/rejected": -0.51806640625, "logps/chosen": -282.9750061035156, "logps/rejected": -342.20001220703125, "loss": 0.0944, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.16370239853858948, "rewards/margins": 5.489062309265137, "rewards/rejected": -5.328125, "step": 3780 }, { "epoch": 1.2487644151565074, "grad_norm": 23.109806412506792, "learning_rate": 6.878912685337726e-07, "logits/chosen": -0.34858399629592896, "logits/rejected": -0.43378907442092896, "logps/chosen": -351.0, "logps/rejected": -383.5, "loss": 0.0742, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4736328125, "rewards/margins": 6.032812595367432, "rewards/rejected": -5.55859375, "step": 3790 }, { "epoch": 1.2520593080724876, "grad_norm": 10.210541579656432, "learning_rate": 6.870675453047777e-07, "logits/chosen": -0.34370118379592896, "logits/rejected": -0.5303710699081421, "logps/chosen": -346.29998779296875, "logps/rejected": -415.0, "loss": 0.0632, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22423096001148224, "rewards/margins": 6.390625, "rewards/rejected": -6.161718845367432, "step": 3800 }, { "epoch": 1.255354200988468, "grad_norm": 25.69219030169174, "learning_rate": 6.862438220757825e-07, "logits/chosen": -0.26958006620407104, "logits/rejected": -0.43982237577438354, "logps/chosen": -344.45001220703125, "logps/rejected": -402.0, "loss": 0.0696, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.27885740995407104, "rewards/margins": 6.282812595367432, "rewards/rejected": -6.560156345367432, "step": 3810 }, { "epoch": 1.2586490939044481, "grad_norm": 12.039315407629799, "learning_rate": 6.854200988467874e-07, "logits/chosen": -0.16887207329273224, "logits/rejected": -0.43885499238967896, "logps/chosen": -329.3999938964844, "logps/rejected": -366.54998779296875, "loss": 0.0773, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.33906251192092896, "rewards/margins": 6.189062595367432, "rewards/rejected": -6.526562690734863, "step": 3820 }, { "epoch": 1.2619439868204283, "grad_norm": 14.3854358565093, "learning_rate": 6.845963756177923e-07, "logits/chosen": -0.24827881157398224, "logits/rejected": -0.31391602754592896, "logps/chosen": -349.1499938964844, "logps/rejected": -351.04998779296875, "loss": 0.0646, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.14462891221046448, "rewards/margins": 5.8828125, "rewards/rejected": -5.732812404632568, "step": 3830 }, { "epoch": 1.2652388797364087, "grad_norm": 57.66931773630049, "learning_rate": 6.837726523887974e-07, "logits/chosen": -0.21945953369140625, "logits/rejected": -0.23093871772289276, "logps/chosen": -317.3500061035156, "logps/rejected": -382.3999938964844, "loss": 0.0624, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4610351622104645, "rewards/margins": 6.168749809265137, "rewards/rejected": -5.707812309265137, "step": 3840 }, { "epoch": 1.2685337726523889, "grad_norm": 46.454552756282375, "learning_rate": 6.829489291598023e-07, "logits/chosen": -0.13417968153953552, "logits/rejected": -0.35932618379592896, "logps/chosen": -315.04998779296875, "logps/rejected": -351.6000061035156, "loss": 0.0536, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4401001036167145, "rewards/margins": 5.544531345367432, "rewards/rejected": -5.100781440734863, "step": 3850 }, { "epoch": 1.271828665568369, "grad_norm": 6.946626728607292, "learning_rate": 6.821252059308072e-07, "logits/chosen": -0.05968017503619194, "logits/rejected": -0.3468261659145355, "logps/chosen": -335.04998779296875, "logps/rejected": -345.0, "loss": 0.0532, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.5944274663925171, "rewards/margins": 5.5546875, "rewards/rejected": -4.954687595367432, "step": 3860 }, { "epoch": 1.2751235584843492, "grad_norm": 23.53268294330574, "learning_rate": 6.813014827018122e-07, "logits/chosen": -0.13907471299171448, "logits/rejected": -0.32426756620407104, "logps/chosen": -329.95001220703125, "logps/rejected": -331.25, "loss": 0.0822, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.5280822515487671, "rewards/margins": 5.296875, "rewards/rejected": -4.772656440734863, "step": 3870 }, { "epoch": 1.2784184514003294, "grad_norm": 19.033139514001128, "learning_rate": 6.804777594728171e-07, "logits/chosen": -0.10679931938648224, "logits/rejected": -0.33369141817092896, "logps/chosen": -326.1000061035156, "logps/rejected": -377.29998779296875, "loss": 0.0731, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01564941368997097, "rewards/margins": 5.825781345367432, "rewards/rejected": -5.806250095367432, "step": 3880 }, { "epoch": 1.2817133443163098, "grad_norm": 12.8198007690175, "learning_rate": 6.79654036243822e-07, "logits/chosen": -0.11619262397289276, "logits/rejected": -0.27443236112594604, "logps/chosen": -328.6000061035156, "logps/rejected": -386.8999938964844, "loss": 0.061, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.36342161893844604, "rewards/margins": 5.869531154632568, "rewards/rejected": -5.50390625, "step": 3890 }, { "epoch": 1.28500823723229, "grad_norm": 6.30556063609187, "learning_rate": 6.788303130148269e-07, "logits/chosen": -0.1175384521484375, "logits/rejected": -0.2975524961948395, "logps/chosen": -344.8500061035156, "logps/rejected": -367.45001220703125, "loss": 0.0587, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1768753081560135, "rewards/margins": 5.545312404632568, "rewards/rejected": -5.371874809265137, "step": 3900 }, { "epoch": 1.2883031301482701, "grad_norm": 14.03282442524985, "learning_rate": 6.780065897858319e-07, "logits/chosen": -0.12521973252296448, "logits/rejected": -0.32023924589157104, "logps/chosen": -352.3500061035156, "logps/rejected": -366.3999938964844, "loss": 0.0674, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02346191368997097, "rewards/margins": 5.80078125, "rewards/rejected": -5.772656440734863, "step": 3910 }, { "epoch": 1.2915980230642505, "grad_norm": 9.725499492307673, "learning_rate": 6.771828665568369e-07, "logits/chosen": -0.255615234375, "logits/rejected": -0.3710083067417145, "logps/chosen": -326.125, "logps/rejected": -377.79998779296875, "loss": 0.0519, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.11384277045726776, "rewards/margins": 6.020312309265137, "rewards/rejected": -6.135937690734863, "step": 3920 }, { "epoch": 1.2948929159802307, "grad_norm": 13.449192943519892, "learning_rate": 6.763591433278418e-07, "logits/chosen": -0.3351074159145355, "logits/rejected": -0.40986937284469604, "logps/chosen": -320.29998779296875, "logps/rejected": -349.0, "loss": 0.0682, "rewards/accuracies": 0.96875, "rewards/chosen": -0.086181640625, "rewards/margins": 6.450781345367432, "rewards/rejected": -6.534375190734863, "step": 3930 }, { "epoch": 1.2981878088962109, "grad_norm": 43.52316363443117, "learning_rate": 6.755354200988468e-07, "logits/chosen": -0.2573486268520355, "logits/rejected": -0.5364013910293579, "logps/chosen": -337.0, "logps/rejected": -362.29998779296875, "loss": 0.0867, "rewards/accuracies": 0.96875, "rewards/chosen": 0.02272949181497097, "rewards/margins": 6.348437309265137, "rewards/rejected": -6.321093559265137, "step": 3940 }, { "epoch": 1.301482701812191, "grad_norm": 39.72200998750585, "learning_rate": 6.747116968698517e-07, "logits/chosen": -0.24953003227710724, "logits/rejected": -0.2890258729457855, "logps/chosen": -322.29998779296875, "logps/rejected": -374.8500061035156, "loss": 0.0601, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.12836913764476776, "rewards/margins": 5.807812690734863, "rewards/rejected": -5.682812690734863, "step": 3950 }, { "epoch": 1.3047775947281712, "grad_norm": 42.24033807720835, "learning_rate": 6.738879736408566e-07, "logits/chosen": -0.26288145780563354, "logits/rejected": -0.35014647245407104, "logps/chosen": -369.95001220703125, "logps/rejected": -380.95001220703125, "loss": 0.0854, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3551025390625, "rewards/margins": 5.842187404632568, "rewards/rejected": -5.495312690734863, "step": 3960 }, { "epoch": 1.3080724876441516, "grad_norm": 21.084150013503276, "learning_rate": 6.730642504118616e-07, "logits/chosen": -0.2689575254917145, "logits/rejected": -0.4330383241176605, "logps/chosen": -342.0, "logps/rejected": -363.3500061035156, "loss": 0.0618, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5621093511581421, "rewards/margins": 6.112500190734863, "rewards/rejected": -5.551562309265137, "step": 3970 }, { "epoch": 1.3113673805601318, "grad_norm": 12.895588231484691, "learning_rate": 6.722405271828665e-07, "logits/chosen": -0.26811522245407104, "logits/rejected": -0.39561766386032104, "logps/chosen": -308.25, "logps/rejected": -336.25, "loss": 0.0749, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.13271483778953552, "rewards/margins": 5.559374809265137, "rewards/rejected": -5.423437595367432, "step": 3980 }, { "epoch": 1.314662273476112, "grad_norm": 44.87419395772032, "learning_rate": 6.714168039538714e-07, "logits/chosen": -0.36052244901657104, "logits/rejected": -0.521484375, "logps/chosen": -294.6000061035156, "logps/rejected": -366.1000061035156, "loss": 0.0881, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.025146484375, "rewards/margins": 5.446875095367432, "rewards/rejected": -5.421093940734863, "step": 3990 }, { "epoch": 1.3179571663920924, "grad_norm": 7.375865442957904, "learning_rate": 6.705930807248764e-07, "logits/chosen": -0.31752318143844604, "logits/rejected": -0.5267578363418579, "logps/chosen": -321.0, "logps/rejected": -350.8500061035156, "loss": 0.0457, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.35335081815719604, "rewards/margins": 6.037499904632568, "rewards/rejected": -5.680468559265137, "step": 4000 }, { "epoch": 1.3212520593080725, "grad_norm": 7.729923408697291, "learning_rate": 6.697693574958814e-07, "logits/chosen": -0.318490594625473, "logits/rejected": -0.4708007872104645, "logps/chosen": -327.6499938964844, "logps/rejected": -362.54998779296875, "loss": 0.0679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.37034910917282104, "rewards/margins": 5.8515625, "rewards/rejected": -5.477343559265137, "step": 4010 }, { "epoch": 1.3245469522240527, "grad_norm": 27.813264305973785, "learning_rate": 6.689456342668863e-07, "logits/chosen": -0.3848876953125, "logits/rejected": -0.5586913824081421, "logps/chosen": -299.04998779296875, "logps/rejected": -356.07501220703125, "loss": 0.0915, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.10322265326976776, "rewards/margins": 5.856249809265137, "rewards/rejected": -5.755468845367432, "step": 4020 }, { "epoch": 1.327841845140033, "grad_norm": 21.117043066385758, "learning_rate": 6.681219110378913e-07, "logits/chosen": -0.4801025390625, "logits/rejected": -0.6336425542831421, "logps/chosen": -341.04998779296875, "logps/rejected": -369.04998779296875, "loss": 0.0742, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4017578065395355, "rewards/margins": 6.227343559265137, "rewards/rejected": -5.827343940734863, "step": 4030 }, { "epoch": 1.331136738056013, "grad_norm": 19.938294267051006, "learning_rate": 6.672981878088962e-07, "logits/chosen": -0.48249512910842896, "logits/rejected": -0.6373046636581421, "logps/chosen": -318.20001220703125, "logps/rejected": -358.5, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 0.01938476599752903, "rewards/margins": 6.25, "rewards/rejected": -6.231249809265137, "step": 4040 }, { "epoch": 1.3344316309719935, "grad_norm": 68.01860144468101, "learning_rate": 6.664744645799011e-07, "logits/chosen": -0.5308593511581421, "logits/rejected": -0.6451171636581421, "logps/chosen": -344.54998779296875, "logps/rejected": -373.6499938964844, "loss": 0.0484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.36430662870407104, "rewards/margins": 6.259375095367432, "rewards/rejected": -6.625, "step": 4050 }, { "epoch": 1.3377265238879736, "grad_norm": 63.91341248893878, "learning_rate": 6.65650741350906e-07, "logits/chosen": -0.4595947265625, "logits/rejected": -0.665332019329071, "logps/chosen": -314.6000061035156, "logps/rejected": -383.8999938964844, "loss": 0.0948, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.6462646722793579, "rewards/margins": 6.046875, "rewards/rejected": -6.689062595367432, "step": 4060 }, { "epoch": 1.3410214168039538, "grad_norm": 16.271904514844792, "learning_rate": 6.64827018121911e-07, "logits/chosen": -0.5560058355331421, "logits/rejected": -0.6141601800918579, "logps/chosen": -340.79998779296875, "logps/rejected": -374.8999938964844, "loss": 0.1029, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.690380871295929, "rewards/margins": 5.9765625, "rewards/rejected": -6.668749809265137, "step": 4070 }, { "epoch": 1.3443163097199342, "grad_norm": 12.278539929451581, "learning_rate": 6.64003294892916e-07, "logits/chosen": -0.500732421875, "logits/rejected": -0.61865234375, "logps/chosen": -316.8999938964844, "logps/rejected": -330.25, "loss": 0.1643, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.22915038466453552, "rewards/margins": 5.833593845367432, "rewards/rejected": -6.065625190734863, "step": 4080 }, { "epoch": 1.3476112026359144, "grad_norm": 39.882284191260766, "learning_rate": 6.631795716639209e-07, "logits/chosen": -0.5091797113418579, "logits/rejected": -0.5869811773300171, "logps/chosen": -335.1499938964844, "logps/rejected": -365.20001220703125, "loss": 0.0744, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.09860839694738388, "rewards/margins": 5.878125190734863, "rewards/rejected": -5.778124809265137, "step": 4090 }, { "epoch": 1.3509060955518946, "grad_norm": 82.72416589931501, "learning_rate": 6.623558484349258e-07, "logits/chosen": -0.5512939691543579, "logits/rejected": -0.6400390863418579, "logps/chosen": -302.32501220703125, "logps/rejected": -357.70001220703125, "loss": 0.1101, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05372924730181694, "rewards/margins": 5.890625, "rewards/rejected": -5.836718559265137, "step": 4100 }, { "epoch": 1.3542009884678747, "grad_norm": 22.786309629219097, "learning_rate": 6.615321252059309e-07, "logits/chosen": -0.532763659954071, "logits/rejected": -0.7105468511581421, "logps/chosen": -349.45001220703125, "logps/rejected": -392.29998779296875, "loss": 0.0497, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.03449096530675888, "rewards/margins": 6.235156059265137, "rewards/rejected": -6.196875095367432, "step": 4110 }, { "epoch": 1.357495881383855, "grad_norm": 13.277777954140792, "learning_rate": 6.607084019769357e-07, "logits/chosen": -0.544677734375, "logits/rejected": -0.799853503704071, "logps/chosen": -350.3999938964844, "logps/rejected": -359.1000061035156, "loss": 0.0706, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.29622191190719604, "rewards/margins": 5.494531154632568, "rewards/rejected": -5.794531345367432, "step": 4120 }, { "epoch": 1.3607907742998353, "grad_norm": 22.123088233115954, "learning_rate": 6.598846787479406e-07, "logits/chosen": -0.70654296875, "logits/rejected": -0.9234374761581421, "logps/chosen": -363.8500061035156, "logps/rejected": -413.6000061035156, "loss": 0.0573, "rewards/accuracies": 0.96875, "rewards/chosen": -0.842968761920929, "rewards/margins": 6.606249809265137, "rewards/rejected": -7.448437690734863, "step": 4130 }, { "epoch": 1.3640856672158155, "grad_norm": 14.777110834234453, "learning_rate": 6.590609555189455e-07, "logits/chosen": -0.7340332269668579, "logits/rejected": -0.87353515625, "logps/chosen": -327.04998779296875, "logps/rejected": -396.1499938964844, "loss": 0.0988, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.705810546875, "rewards/margins": 6.3046875, "rewards/rejected": -7.0078125, "step": 4140 }, { "epoch": 1.3673805601317957, "grad_norm": 11.850461457248594, "learning_rate": 6.582372322899506e-07, "logits/chosen": -0.53076171875, "logits/rejected": -0.71875, "logps/chosen": -337.6499938964844, "logps/rejected": -352.3999938964844, "loss": 0.0906, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01848144456744194, "rewards/margins": 5.881249904632568, "rewards/rejected": -5.903124809265137, "step": 4150 }, { "epoch": 1.370675453047776, "grad_norm": 6.070155045766375, "learning_rate": 6.574135090609555e-07, "logits/chosen": -0.46953123807907104, "logits/rejected": -0.52581787109375, "logps/chosen": -315.8500061035156, "logps/rejected": -361.45001220703125, "loss": 0.0585, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.45500487089157104, "rewards/margins": 6.096875190734863, "rewards/rejected": -5.635156154632568, "step": 4160 }, { "epoch": 1.3739703459637562, "grad_norm": 9.633043514877071, "learning_rate": 6.565897858319604e-07, "logits/chosen": -0.33299559354782104, "logits/rejected": -0.618457019329071, "logps/chosen": -326.45001220703125, "logps/rejected": -377.29998779296875, "loss": 0.0626, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4098144471645355, "rewards/margins": 6.350781440734863, "rewards/rejected": -5.94921875, "step": 4170 }, { "epoch": 1.3772652388797364, "grad_norm": 30.713586457853136, "learning_rate": 6.557660626029654e-07, "logits/chosen": -0.4357666075229645, "logits/rejected": -0.5281127691268921, "logps/chosen": -328.1000061035156, "logps/rejected": -353.3500061035156, "loss": 0.0703, "rewards/accuracies": 0.96875, "rewards/chosen": -0.27814942598342896, "rewards/margins": 5.833593845367432, "rewards/rejected": -6.109375, "step": 4180 }, { "epoch": 1.3805601317957166, "grad_norm": 7.442967438595032, "learning_rate": 6.549423393739704e-07, "logits/chosen": -0.5233398675918579, "logits/rejected": -0.7594238519668579, "logps/chosen": -363.1000061035156, "logps/rejected": -383.3500061035156, "loss": 0.0292, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.17348632216453552, "rewards/margins": 6.860937595367432, "rewards/rejected": -7.035937309265137, "step": 4190 }, { "epoch": 1.3838550247116967, "grad_norm": 82.62864347195702, "learning_rate": 6.541186161449752e-07, "logits/chosen": -0.3759765625, "logits/rejected": -0.67041015625, "logps/chosen": -326.79998779296875, "logps/rejected": -383.25, "loss": 0.0909, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0574462413787842, "rewards/margins": 5.66796875, "rewards/rejected": -6.720312595367432, "step": 4200 }, { "epoch": 1.3871499176276771, "grad_norm": 17.30166767936165, "learning_rate": 6.532948929159802e-07, "logits/chosen": -0.53955078125, "logits/rejected": -0.66748046875, "logps/chosen": -336.6000061035156, "logps/rejected": -344.1000061035156, "loss": 0.0527, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4744873046875, "rewards/margins": 6.482812404632568, "rewards/rejected": -6.953125, "step": 4210 }, { "epoch": 1.3904448105436573, "grad_norm": 76.57583795646094, "learning_rate": 6.524711696869851e-07, "logits/chosen": -0.2519164979457855, "logits/rejected": -0.4950805604457855, "logps/chosen": -341.95001220703125, "logps/rejected": -392.8999938964844, "loss": 0.081, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7520507574081421, "rewards/margins": 6.456250190734863, "rewards/rejected": -7.214062690734863, "step": 4220 }, { "epoch": 1.3937397034596375, "grad_norm": 14.499800869224568, "learning_rate": 6.516474464579901e-07, "logits/chosen": -0.16679687798023224, "logits/rejected": -0.4837890565395355, "logps/chosen": -335.3999938964844, "logps/rejected": -380.3999938964844, "loss": 0.0675, "rewards/accuracies": 0.96875, "rewards/chosen": -0.816699206829071, "rewards/margins": 5.954687595367432, "rewards/rejected": -6.768750190734863, "step": 4230 }, { "epoch": 1.3970345963756179, "grad_norm": 41.77180296684911, "learning_rate": 6.50823723228995e-07, "logits/chosen": -0.10767821967601776, "logits/rejected": -0.39423829317092896, "logps/chosen": -319.6499938964844, "logps/rejected": -386.95001220703125, "loss": 0.0872, "rewards/accuracies": 0.96875, "rewards/chosen": -0.33165282011032104, "rewards/margins": 5.940625190734863, "rewards/rejected": -6.268750190734863, "step": 4240 }, { "epoch": 1.400329489291598, "grad_norm": 5.115944838156937, "learning_rate": 6.5e-07, "logits/chosen": -0.29753416776657104, "logits/rejected": -0.3951171934604645, "logps/chosen": -293.3500061035156, "logps/rejected": -329.79998779296875, "loss": 0.0722, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.37543946504592896, "rewards/margins": 5.871874809265137, "rewards/rejected": -6.246874809265137, "step": 4250 }, { "epoch": 1.4036243822075782, "grad_norm": 26.121929843592813, "learning_rate": 6.49176276771005e-07, "logits/chosen": -0.42365723848342896, "logits/rejected": -0.541125476360321, "logps/chosen": -331.6000061035156, "logps/rejected": -367.8999938964844, "loss": 0.1049, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.02089843712747097, "rewards/margins": 6.032031059265137, "rewards/rejected": -6.0546875, "step": 4260 }, { "epoch": 1.4069192751235584, "grad_norm": 10.491253908126302, "learning_rate": 6.483525535420099e-07, "logits/chosen": -0.356719970703125, "logits/rejected": -0.45612794160842896, "logps/chosen": -321.6499938964844, "logps/rejected": -378.3999938964844, "loss": 0.0636, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.28022462129592896, "rewards/margins": 5.6796875, "rewards/rejected": -5.956250190734863, "step": 4270 }, { "epoch": 1.4102141680395386, "grad_norm": 8.980151092008665, "learning_rate": 6.475288303130148e-07, "logits/chosen": -0.29693603515625, "logits/rejected": -0.30976563692092896, "logps/chosen": -286.20001220703125, "logps/rejected": -330.5, "loss": 0.1086, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.02741699293255806, "rewards/margins": 5.4609375, "rewards/rejected": -5.434374809265137, "step": 4280 }, { "epoch": 1.413509060955519, "grad_norm": 16.183237264933222, "learning_rate": 6.467051070840197e-07, "logits/chosen": -0.302398681640625, "logits/rejected": -0.32478028535842896, "logps/chosen": -302.6499938964844, "logps/rejected": -344.04998779296875, "loss": 0.0821, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18132324516773224, "rewards/margins": 5.665625095367432, "rewards/rejected": -5.489843845367432, "step": 4290 }, { "epoch": 1.4168039538714992, "grad_norm": 16.632981383157038, "learning_rate": 6.458813838550246e-07, "logits/chosen": -0.4195800721645355, "logits/rejected": -0.5165771245956421, "logps/chosen": -298.3999938964844, "logps/rejected": -388.45001220703125, "loss": 0.0949, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2415771484375, "rewards/margins": 5.96875, "rewards/rejected": -5.721093654632568, "step": 4300 }, { "epoch": 1.4200988467874793, "grad_norm": 40.822180729727535, "learning_rate": 6.450576606260296e-07, "logits/chosen": -0.29731446504592896, "logits/rejected": -0.46728515625, "logps/chosen": -350.6499938964844, "logps/rejected": -354.0, "loss": 0.0578, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.218963623046875, "rewards/margins": 6.078125, "rewards/rejected": -5.860937595367432, "step": 4310 }, { "epoch": 1.4233937397034597, "grad_norm": 29.712959003350317, "learning_rate": 6.442339373970346e-07, "logits/chosen": -0.332662969827652, "logits/rejected": -0.3715972900390625, "logps/chosen": -313.6000061035156, "logps/rejected": -358.20001220703125, "loss": 0.0594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4495605528354645, "rewards/margins": 6.203906059265137, "rewards/rejected": -5.751562595367432, "step": 4320 }, { "epoch": 1.42668863261944, "grad_norm": 9.923523024681893, "learning_rate": 6.434102141680395e-07, "logits/chosen": -0.32469481229782104, "logits/rejected": -0.39030760526657104, "logps/chosen": -317.54998779296875, "logps/rejected": -323.95001220703125, "loss": 0.0828, "rewards/accuracies": 0.96875, "rewards/chosen": 0.12734374403953552, "rewards/margins": 5.285937309265137, "rewards/rejected": -5.162499904632568, "step": 4330 }, { "epoch": 1.42998352553542, "grad_norm": 63.36990742887736, "learning_rate": 6.425864909390445e-07, "logits/chosen": -0.24439696967601776, "logits/rejected": -0.35417479276657104, "logps/chosen": -329.45001220703125, "logps/rejected": -370.8999938964844, "loss": 0.0659, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.10765381157398224, "rewards/margins": 5.8671875, "rewards/rejected": -5.759375095367432, "step": 4340 }, { "epoch": 1.4332784184514002, "grad_norm": 7.026629512654064, "learning_rate": 6.417627677100495e-07, "logits/chosen": -0.30499267578125, "logits/rejected": -0.4173339903354645, "logps/chosen": -317.04998779296875, "logps/rejected": -364.95001220703125, "loss": 0.0605, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.10957030951976776, "rewards/margins": 5.845312595367432, "rewards/rejected": -5.954687595367432, "step": 4350 }, { "epoch": 1.4365733113673804, "grad_norm": 9.073645442048097, "learning_rate": 6.409390444810543e-07, "logits/chosen": -0.3954834043979645, "logits/rejected": -0.4430175721645355, "logps/chosen": -328.54998779296875, "logps/rejected": -360.54998779296875, "loss": 0.1033, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.139892578125, "rewards/margins": 5.909375190734863, "rewards/rejected": -6.043749809265137, "step": 4360 }, { "epoch": 1.4398682042833608, "grad_norm": 9.136816712417836, "learning_rate": 6.401153212520592e-07, "logits/chosen": -0.367767333984375, "logits/rejected": -0.4554687440395355, "logps/chosen": -304.79998779296875, "logps/rejected": -358.20001220703125, "loss": 0.0947, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.527880847454071, "rewards/margins": 5.787499904632568, "rewards/rejected": -6.314062595367432, "step": 4370 }, { "epoch": 1.443163097199341, "grad_norm": 47.98285193203856, "learning_rate": 6.392915980230642e-07, "logits/chosen": -0.34248048067092896, "logits/rejected": -0.3666748106479645, "logps/chosen": -323.29998779296875, "logps/rejected": -381.6499938964844, "loss": 0.0744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.344970703125, "rewards/margins": 6.162499904632568, "rewards/rejected": -5.8203125, "step": 4380 }, { "epoch": 1.4464579901153214, "grad_norm": 34.91747282803155, "learning_rate": 6.384678747940692e-07, "logits/chosen": -0.24158325791358948, "logits/rejected": -0.3470214903354645, "logps/chosen": -366.45001220703125, "logps/rejected": -382.20001220703125, "loss": 0.0739, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.05809326097369194, "rewards/margins": 5.828906059265137, "rewards/rejected": -5.885156154632568, "step": 4390 }, { "epoch": 1.4497528830313016, "grad_norm": 17.362720146316924, "learning_rate": 6.376441515650741e-07, "logits/chosen": -0.22775879502296448, "logits/rejected": -0.37322694063186646, "logps/chosen": -359.25, "logps/rejected": -376.45001220703125, "loss": 0.0857, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.030303955078125, "rewards/margins": 5.385937690734863, "rewards/rejected": -5.421875, "step": 4400 }, { "epoch": 1.4530477759472817, "grad_norm": 7.341723647105968, "learning_rate": 6.36820428336079e-07, "logits/chosen": -0.16837158799171448, "logits/rejected": -0.21934814751148224, "logps/chosen": -318.8500061035156, "logps/rejected": -380.04998779296875, "loss": 0.0597, "rewards/accuracies": 0.96875, "rewards/chosen": 0.44099122285842896, "rewards/margins": 6.1171875, "rewards/rejected": -5.677343845367432, "step": 4410 }, { "epoch": 1.456342668863262, "grad_norm": 13.656630099371025, "learning_rate": 6.359967051070841e-07, "logits/chosen": -0.22449645400047302, "logits/rejected": -0.4166503846645355, "logps/chosen": -308.1499938964844, "logps/rejected": -338.25, "loss": 0.0741, "rewards/accuracies": 0.96875, "rewards/chosen": 0.03883056715130806, "rewards/margins": 5.470312595367432, "rewards/rejected": -5.435156345367432, "step": 4420 }, { "epoch": 1.459637561779242, "grad_norm": 7.359155125445693, "learning_rate": 6.35172981878089e-07, "logits/chosen": -0.21458740532398224, "logits/rejected": -0.42138671875, "logps/chosen": -331.04998779296875, "logps/rejected": -371.70001220703125, "loss": 0.0497, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.07473144680261612, "rewards/margins": 5.888281345367432, "rewards/rejected": -5.962500095367432, "step": 4430 }, { "epoch": 1.4629324546952225, "grad_norm": 5.318703485970226, "learning_rate": 6.343492586490938e-07, "logits/chosen": -0.23011168837547302, "logits/rejected": -0.41694337129592896, "logps/chosen": -340.04998779296875, "logps/rejected": -381.04998779296875, "loss": 0.0491, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04865112155675888, "rewards/margins": 6.185937404632568, "rewards/rejected": -6.134375095367432, "step": 4440 }, { "epoch": 1.4662273476112027, "grad_norm": 38.532390736740886, "learning_rate": 6.335255354200988e-07, "logits/chosen": -0.21198424696922302, "logits/rejected": -0.4187988340854645, "logps/chosen": -358.75, "logps/rejected": -383.75, "loss": 0.105, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.15105590224266052, "rewards/margins": 6.314843654632568, "rewards/rejected": -6.158593654632568, "step": 4450 }, { "epoch": 1.4695222405271828, "grad_norm": 15.167351582500423, "learning_rate": 6.327018121911038e-07, "logits/chosen": -0.3410095274448395, "logits/rejected": -0.442626953125, "logps/chosen": -372.1499938964844, "logps/rejected": -341.3500061035156, "loss": 0.1379, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07305908203125, "rewards/margins": 5.699999809265137, "rewards/rejected": -5.6328125, "step": 4460 }, { "epoch": 1.4728171334431632, "grad_norm": 32.07534162343863, "learning_rate": 6.318780889621087e-07, "logits/chosen": -0.18928222358226776, "logits/rejected": -0.352325439453125, "logps/chosen": -324.5, "logps/rejected": -336.0, "loss": 0.1016, "rewards/accuracies": 0.96875, "rewards/chosen": -0.03695068508386612, "rewards/margins": 5.778906345367432, "rewards/rejected": -5.818749904632568, "step": 4470 }, { "epoch": 1.4761120263591434, "grad_norm": 4.998824575056428, "learning_rate": 6.310543657331136e-07, "logits/chosen": -0.21260985732078552, "logits/rejected": -0.45549315214157104, "logps/chosen": -327.20001220703125, "logps/rejected": -349.45001220703125, "loss": 0.0397, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.40289306640625, "rewards/margins": 6.010937690734863, "rewards/rejected": -6.418749809265137, "step": 4480 }, { "epoch": 1.4794069192751236, "grad_norm": 21.97704196305448, "learning_rate": 6.302306425041186e-07, "logits/chosen": -0.3072189390659332, "logits/rejected": -0.564697265625, "logps/chosen": -374.75, "logps/rejected": -383.70001220703125, "loss": 0.0487, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.14860840141773224, "rewards/margins": 6.123437404632568, "rewards/rejected": -5.974999904632568, "step": 4490 }, { "epoch": 1.4827018121911038, "grad_norm": 26.37301085410988, "learning_rate": 6.294069192751236e-07, "logits/chosen": -0.3497070372104645, "logits/rejected": -0.44477540254592896, "logps/chosen": -325.1000061035156, "logps/rejected": -368.29998779296875, "loss": 0.0629, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17985840141773224, "rewards/margins": 5.882031440734863, "rewards/rejected": -5.70703125, "step": 4500 }, { "epoch": 1.485996705107084, "grad_norm": 4.047366202702708, "learning_rate": 6.285831960461285e-07, "logits/chosen": -0.41632080078125, "logits/rejected": -0.6356445550918579, "logps/chosen": -324.6000061035156, "logps/rejected": -386.54998779296875, "loss": 0.0957, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.140380859375, "rewards/margins": 5.893750190734863, "rewards/rejected": -5.748437404632568, "step": 4510 }, { "epoch": 1.4892915980230643, "grad_norm": 30.836942019215122, "learning_rate": 6.277594728171334e-07, "logits/chosen": -0.4251953065395355, "logits/rejected": -0.579394519329071, "logps/chosen": -316.29998779296875, "logps/rejected": -368.8999938964844, "loss": 0.0424, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.0240020751953125, "rewards/margins": 6.035937309265137, "rewards/rejected": -6.009375095367432, "step": 4520 }, { "epoch": 1.4925864909390445, "grad_norm": 60.04389302391044, "learning_rate": 6.269357495881383e-07, "logits/chosen": -0.43841552734375, "logits/rejected": -0.670214831829071, "logps/chosen": -357.70001220703125, "logps/rejected": -385.1000061035156, "loss": 0.0922, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.3675293028354645, "rewards/margins": 5.973437309265137, "rewards/rejected": -6.340624809265137, "step": 4530 }, { "epoch": 1.4958813838550247, "grad_norm": 20.66200988356282, "learning_rate": 6.261120263591433e-07, "logits/chosen": -0.6037353277206421, "logits/rejected": -0.645764172077179, "logps/chosen": -356.04998779296875, "logps/rejected": -377.79998779296875, "loss": 0.0368, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.3774475157260895, "rewards/margins": 6.7578125, "rewards/rejected": -7.137499809265137, "step": 4540 }, { "epoch": 1.499176276771005, "grad_norm": 6.3223147960444095, "learning_rate": 6.252883031301482e-07, "logits/chosen": -0.6019287109375, "logits/rejected": -0.76580810546875, "logps/chosen": -381.8500061035156, "logps/rejected": -392.79998779296875, "loss": 0.0549, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.661572277545929, "rewards/margins": 6.643750190734863, "rewards/rejected": -7.3046875, "step": 4550 }, { "epoch": 1.5024711696869852, "grad_norm": 16.35202515102648, "learning_rate": 6.244645799011532e-07, "logits/chosen": -0.662402331829071, "logits/rejected": -0.805468738079071, "logps/chosen": -323.8500061035156, "logps/rejected": -363.0, "loss": 0.0585, "rewards/accuracies": 0.96875, "rewards/chosen": -0.46782225370407104, "rewards/margins": 6.314062595367432, "rewards/rejected": -6.782812595367432, "step": 4560 }, { "epoch": 1.5057660626029654, "grad_norm": 39.63382596317326, "learning_rate": 6.236408566721582e-07, "logits/chosen": -0.625244140625, "logits/rejected": -0.8128906488418579, "logps/chosen": -315.8500061035156, "logps/rejected": -378.3500061035156, "loss": 0.0705, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.513293445110321, "rewards/margins": 6.199999809265137, "rewards/rejected": -6.717187404632568, "step": 4570 }, { "epoch": 1.5090609555189456, "grad_norm": 10.452947922995095, "learning_rate": 6.228171334431631e-07, "logits/chosen": -0.6988769769668579, "logits/rejected": -0.7865356206893921, "logps/chosen": -325.75, "logps/rejected": -400.70001220703125, "loss": 0.0492, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6620117425918579, "rewards/margins": 6.332812309265137, "rewards/rejected": -6.993750095367432, "step": 4580 }, { "epoch": 1.5123558484349258, "grad_norm": 2.564552149077161, "learning_rate": 6.21993410214168e-07, "logits/chosen": -0.58544921875, "logits/rejected": -0.8213866949081421, "logps/chosen": -344.04998779296875, "logps/rejected": -385.25, "loss": 0.0679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4264587461948395, "rewards/margins": 6.475781440734863, "rewards/rejected": -6.900000095367432, "step": 4590 }, { "epoch": 1.515650741350906, "grad_norm": 7.42432426713319, "learning_rate": 6.211696869851729e-07, "logits/chosen": -0.5819336175918579, "logits/rejected": -0.672802746295929, "logps/chosen": -335.70001220703125, "logps/rejected": -362.79998779296875, "loss": 0.0741, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.21682128310203552, "rewards/margins": 6.40625, "rewards/rejected": -6.626562595367432, "step": 4600 }, { "epoch": 1.5189456342668863, "grad_norm": 21.39934883168487, "learning_rate": 6.203459637561779e-07, "logits/chosen": -0.521191418170929, "logits/rejected": -0.63134765625, "logps/chosen": -331.5, "logps/rejected": -392.75, "loss": 0.0597, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.058349609375, "rewards/margins": 6.3828125, "rewards/rejected": -6.44921875, "step": 4610 }, { "epoch": 1.5222405271828665, "grad_norm": 6.476373423601012, "learning_rate": 6.195222405271828e-07, "logits/chosen": -0.536083996295929, "logits/rejected": -0.6341308355331421, "logps/chosen": -313.70001220703125, "logps/rejected": -367.8999938964844, "loss": 0.0456, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3888183534145355, "rewards/margins": 5.884375095367432, "rewards/rejected": -6.267187595367432, "step": 4620 }, { "epoch": 1.525535420098847, "grad_norm": 22.447006319231626, "learning_rate": 6.186985172981878e-07, "logits/chosen": -0.4220947325229645, "logits/rejected": -0.573486328125, "logps/chosen": -377.20001220703125, "logps/rejected": -383.3999938964844, "loss": 0.088, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4203124940395355, "rewards/margins": 6.5, "rewards/rejected": -6.920312404632568, "step": 4630 }, { "epoch": 1.528830313014827, "grad_norm": 55.06428226763268, "learning_rate": 6.178747940691927e-07, "logits/chosen": -0.48826903104782104, "logits/rejected": -0.758984386920929, "logps/chosen": -349.3500061035156, "logps/rejected": -402.1000061035156, "loss": 0.0909, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.1736328601837158, "rewards/margins": 6.205468654632568, "rewards/rejected": -7.378125190734863, "step": 4640 }, { "epoch": 1.5321252059308073, "grad_norm": 20.297980236837745, "learning_rate": 6.170510708401977e-07, "logits/chosen": -0.4474731385707855, "logits/rejected": -0.69580078125, "logps/chosen": -323.5, "logps/rejected": -377.8999938964844, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": -1.195898413658142, "rewards/margins": 6.444531440734863, "rewards/rejected": -7.640625, "step": 4650 }, { "epoch": 1.5354200988467874, "grad_norm": 29.148484307730524, "learning_rate": 6.162273476112027e-07, "logits/chosen": -0.3472656309604645, "logits/rejected": -0.6102539300918579, "logps/chosen": -342.6000061035156, "logps/rejected": -368.1000061035156, "loss": 0.0699, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0325195789337158, "rewards/margins": 6.291406154632568, "rewards/rejected": -7.317187309265137, "step": 4660 }, { "epoch": 1.5387149917627676, "grad_norm": 10.01932667902691, "learning_rate": 6.154036243822075e-07, "logits/chosen": -0.34797364473342896, "logits/rejected": -0.5126708745956421, "logps/chosen": -298.25, "logps/rejected": -394.6000061035156, "loss": 0.0926, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5654357671737671, "rewards/margins": 6.246874809265137, "rewards/rejected": -6.814062595367432, "step": 4670 }, { "epoch": 1.5420098846787478, "grad_norm": 61.849600838071254, "learning_rate": 6.145799011532124e-07, "logits/chosen": -0.3517822325229645, "logits/rejected": -0.4991210997104645, "logps/chosen": -329.32501220703125, "logps/rejected": -347.25, "loss": 0.112, "rewards/accuracies": 0.96875, "rewards/chosen": -0.24416503310203552, "rewards/margins": 5.792187690734863, "rewards/rejected": -6.034375190734863, "step": 4680 }, { "epoch": 1.5453047775947282, "grad_norm": 13.512033157599483, "learning_rate": 6.137561779242175e-07, "logits/chosen": -0.294677734375, "logits/rejected": -0.43474119901657104, "logps/chosen": -284.8500061035156, "logps/rejected": -357.6499938964844, "loss": 0.1219, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.18122558295726776, "rewards/margins": 5.602343559265137, "rewards/rejected": -5.784375190734863, "step": 4690 }, { "epoch": 1.5485996705107083, "grad_norm": 27.8088808883619, "learning_rate": 6.129324546952224e-07, "logits/chosen": -0.22525635361671448, "logits/rejected": -0.48515623807907104, "logps/chosen": -335.6000061035156, "logps/rejected": -385.8999938964844, "loss": 0.0561, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.04325561597943306, "rewards/margins": 6.581250190734863, "rewards/rejected": -6.625, "step": 4700 }, { "epoch": 1.5518945634266887, "grad_norm": 26.96016571998364, "learning_rate": 6.121087314662273e-07, "logits/chosen": -0.5072021484375, "logits/rejected": -0.5824218988418579, "logps/chosen": -307.04998779296875, "logps/rejected": -361.25, "loss": 0.0827, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.44654542207717896, "rewards/margins": 6.172656059265137, "rewards/rejected": -6.619531154632568, "step": 4710 }, { "epoch": 1.555189456342669, "grad_norm": 44.72333973245404, "learning_rate": 6.112850082372322e-07, "logits/chosen": -0.38023680448532104, "logits/rejected": -0.5352538824081421, "logps/chosen": -343.5, "logps/rejected": -412.8999938964844, "loss": 0.0857, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.4214233458042145, "rewards/margins": 6.709374904632568, "rewards/rejected": -7.129687309265137, "step": 4720 }, { "epoch": 1.558484349258649, "grad_norm": 21.493524778532983, "learning_rate": 6.104612850082373e-07, "logits/chosen": -0.4293212890625, "logits/rejected": -0.6334472894668579, "logps/chosen": -309.3500061035156, "logps/rejected": -321.3999938964844, "loss": 0.0619, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.399169921875, "rewards/margins": 5.822656154632568, "rewards/rejected": -6.217187404632568, "step": 4730 }, { "epoch": 1.5617792421746293, "grad_norm": 29.18977164766403, "learning_rate": 6.096375617792422e-07, "logits/chosen": -0.4281982481479645, "logits/rejected": -0.5966796875, "logps/chosen": -362.45001220703125, "logps/rejected": -359.0, "loss": 0.0555, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3436035215854645, "rewards/margins": 6.060937404632568, "rewards/rejected": -6.404687404632568, "step": 4740 }, { "epoch": 1.5650741350906094, "grad_norm": 55.82001535840692, "learning_rate": 6.08813838550247e-07, "logits/chosen": -0.4210571348667145, "logits/rejected": -0.537158191204071, "logps/chosen": -297.75, "logps/rejected": -382.3500061035156, "loss": 0.07, "rewards/accuracies": 0.96875, "rewards/chosen": -0.23908691108226776, "rewards/margins": 6.534375190734863, "rewards/rejected": -6.7734375, "step": 4750 }, { "epoch": 1.5683690280065898, "grad_norm": 15.655743223882995, "learning_rate": 6.07990115321252e-07, "logits/chosen": -0.3732543885707855, "logits/rejected": -0.4637451171875, "logps/chosen": -366.3999938964844, "logps/rejected": -381.79998779296875, "loss": 0.0509, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.37939453125, "rewards/margins": 6.229687690734863, "rewards/rejected": -6.609375, "step": 4760 }, { "epoch": 1.57166392092257, "grad_norm": 15.20208164424635, "learning_rate": 6.07166392092257e-07, "logits/chosen": -0.4430297911167145, "logits/rejected": -0.6727050542831421, "logps/chosen": -344.54998779296875, "logps/rejected": -369.1499938964844, "loss": 0.0675, "rewards/accuracies": 0.96875, "rewards/chosen": -0.24151916801929474, "rewards/margins": 6.464062690734863, "rewards/rejected": -6.707812309265137, "step": 4770 }, { "epoch": 1.5749588138385504, "grad_norm": 10.958242436728415, "learning_rate": 6.063426688632619e-07, "logits/chosen": -0.47993165254592896, "logits/rejected": -0.608593761920929, "logps/chosen": -333.75, "logps/rejected": -405.6499938964844, "loss": 0.0582, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.20504149794578552, "rewards/margins": 6.546093940734863, "rewards/rejected": -6.745312690734863, "step": 4780 }, { "epoch": 1.5782537067545306, "grad_norm": 73.13206451099477, "learning_rate": 6.055189456342668e-07, "logits/chosen": -0.25656431913375854, "logits/rejected": -0.42256468534469604, "logps/chosen": -328.45001220703125, "logps/rejected": -372.6499938964844, "loss": 0.0682, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.12763671576976776, "rewards/margins": 6.212500095367432, "rewards/rejected": -6.34375, "step": 4790 }, { "epoch": 1.5815485996705108, "grad_norm": 27.93313223831176, "learning_rate": 6.046952224052718e-07, "logits/chosen": -0.4566894471645355, "logits/rejected": -0.6222168207168579, "logps/chosen": -329.8500061035156, "logps/rejected": -362.25, "loss": 0.1046, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.43305665254592896, "rewards/margins": 6.153906345367432, "rewards/rejected": -6.587500095367432, "step": 4800 }, { "epoch": 1.584843492586491, "grad_norm": 9.294964727327509, "learning_rate": 6.038714991762768e-07, "logits/chosen": -0.3805175721645355, "logits/rejected": -0.506359875202179, "logps/chosen": -323.0, "logps/rejected": -368.6499938964844, "loss": 0.0688, "rewards/accuracies": 0.96875, "rewards/chosen": -0.32178956270217896, "rewards/margins": 6.098437309265137, "rewards/rejected": -6.426562309265137, "step": 4810 }, { "epoch": 1.588138385502471, "grad_norm": 34.84182772399106, "learning_rate": 6.030477759472817e-07, "logits/chosen": -0.515332043170929, "logits/rejected": -0.5965331792831421, "logps/chosen": -346.04998779296875, "logps/rejected": -352.95001220703125, "loss": 0.0848, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.824536144733429, "rewards/margins": 6.057031154632568, "rewards/rejected": -6.876562595367432, "step": 4820 }, { "epoch": 1.5914332784184513, "grad_norm": 2.1798628209813047, "learning_rate": 6.022240527182866e-07, "logits/chosen": -0.49079591035842896, "logits/rejected": -0.69921875, "logps/chosen": -336.20001220703125, "logps/rejected": -402.5, "loss": 0.0652, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0007445812225342, "rewards/margins": 6.598437309265137, "rewards/rejected": -7.598437309265137, "step": 4830 }, { "epoch": 1.5947281713344317, "grad_norm": 17.649666270548217, "learning_rate": 6.014003294892915e-07, "logits/chosen": -0.5047851800918579, "logits/rejected": -0.672070324420929, "logps/chosen": -405.29998779296875, "logps/rejected": -420.70001220703125, "loss": 0.0478, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.269677758216858, "rewards/margins": 7.426562309265137, "rewards/rejected": -8.699999809265137, "step": 4840 }, { "epoch": 1.5980230642504119, "grad_norm": 69.47031812151597, "learning_rate": 6.005766062602965e-07, "logits/chosen": -0.520092785358429, "logits/rejected": -0.7275390625, "logps/chosen": -327.5, "logps/rejected": -369.29998779296875, "loss": 0.1247, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1554687023162842, "rewards/margins": 6.092968940734863, "rewards/rejected": -7.249218940734863, "step": 4850 }, { "epoch": 1.6013179571663922, "grad_norm": 8.540017217229565, "learning_rate": 5.997528830313014e-07, "logits/chosen": -0.4622802734375, "logits/rejected": -0.623339831829071, "logps/chosen": -323.54998779296875, "logps/rejected": -337.6499938964844, "loss": 0.0593, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.818554699420929, "rewards/margins": 6.2421875, "rewards/rejected": -7.0625, "step": 4860 }, { "epoch": 1.6046128500823724, "grad_norm": 21.323235135219495, "learning_rate": 5.989291598023064e-07, "logits/chosen": -0.4110107421875, "logits/rejected": -0.6045898199081421, "logps/chosen": -350.3999938964844, "logps/rejected": -400.8999938964844, "loss": 0.0658, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.555859386920929, "rewards/margins": 6.676562309265137, "rewards/rejected": -7.235937595367432, "step": 4870 }, { "epoch": 1.6079077429983526, "grad_norm": 41.75907649156852, "learning_rate": 5.981054365733114e-07, "logits/chosen": -0.39659422636032104, "logits/rejected": -0.609667956829071, "logps/chosen": -345.70001220703125, "logps/rejected": -409.54998779296875, "loss": 0.0659, "rewards/accuracies": 0.96875, "rewards/chosen": -0.554003894329071, "rewards/margins": 6.329687595367432, "rewards/rejected": -6.887499809265137, "step": 4880 }, { "epoch": 1.6112026359143328, "grad_norm": 17.3765986498207, "learning_rate": 5.972817133443163e-07, "logits/chosen": -0.6353759765625, "logits/rejected": -0.761645495891571, "logps/chosen": -350.20001220703125, "logps/rejected": -398.20001220703125, "loss": 0.0563, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.871630847454071, "rewards/margins": 6.53515625, "rewards/rejected": -7.404687404632568, "step": 4890 }, { "epoch": 1.614497528830313, "grad_norm": 21.683523892244693, "learning_rate": 5.964579901153213e-07, "logits/chosen": -0.5724121332168579, "logits/rejected": -0.6957855224609375, "logps/chosen": -314.29998779296875, "logps/rejected": -330.75, "loss": 0.075, "rewards/accuracies": 0.96875, "rewards/chosen": -1.085790991783142, "rewards/margins": 5.762499809265137, "rewards/rejected": -6.848437309265137, "step": 4900 }, { "epoch": 1.6177924217462931, "grad_norm": 51.96566086549504, "learning_rate": 5.956342668863261e-07, "logits/chosen": -0.4626220762729645, "logits/rejected": -0.731152355670929, "logps/chosen": -338.70001220703125, "logps/rejected": -383.29998779296875, "loss": 0.0452, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.6124023199081421, "rewards/margins": 7.279687404632568, "rewards/rejected": -7.885937690734863, "step": 4910 }, { "epoch": 1.6210873146622735, "grad_norm": 16.58134089491916, "learning_rate": 5.94810543657331e-07, "logits/chosen": -0.6300293207168579, "logits/rejected": -0.729541003704071, "logps/chosen": -374.29998779296875, "logps/rejected": -395.54998779296875, "loss": 0.0427, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.726269543170929, "rewards/margins": 6.8671875, "rewards/rejected": -7.598437309265137, "step": 4920 }, { "epoch": 1.6243822075782537, "grad_norm": 93.71238908350055, "learning_rate": 5.939868204283361e-07, "logits/chosen": -0.547802746295929, "logits/rejected": -0.753125011920929, "logps/chosen": -342.20001220703125, "logps/rejected": -373.79998779296875, "loss": 0.0829, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.568646252155304, "rewards/margins": 6.432812690734863, "rewards/rejected": -7.004687309265137, "step": 4930 }, { "epoch": 1.627677100494234, "grad_norm": 8.603541092045898, "learning_rate": 5.93163097199341e-07, "logits/chosen": -0.534375011920929, "logits/rejected": -0.653027355670929, "logps/chosen": -334.29998779296875, "logps/rejected": -401.3999938964844, "loss": 0.0676, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3344360291957855, "rewards/margins": 6.122656345367432, "rewards/rejected": -6.462500095367432, "step": 4940 }, { "epoch": 1.6309719934102143, "grad_norm": 37.31616321287001, "learning_rate": 5.923393739703459e-07, "logits/chosen": -0.3642639219760895, "logits/rejected": -0.602294921875, "logps/chosen": -324.54998779296875, "logps/rejected": -329.54998779296875, "loss": 0.1047, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.22899779677391052, "rewards/margins": 5.7265625, "rewards/rejected": -5.95703125, "step": 4950 }, { "epoch": 1.6342668863261944, "grad_norm": 9.154380322185991, "learning_rate": 5.915156507413509e-07, "logits/chosen": -0.29761964082717896, "logits/rejected": -0.595410168170929, "logps/chosen": -312.3999938964844, "logps/rejected": -373.54998779296875, "loss": 0.064, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20786133408546448, "rewards/margins": 6.055468559265137, "rewards/rejected": -5.8515625, "step": 4960 }, { "epoch": 1.6375617792421746, "grad_norm": 7.877238897761258, "learning_rate": 5.906919275123559e-07, "logits/chosen": -0.4151245057582855, "logits/rejected": -0.526806652545929, "logps/chosen": -370.67498779296875, "logps/rejected": -356.8500061035156, "loss": 0.0651, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4642700254917145, "rewards/margins": 5.857812404632568, "rewards/rejected": -5.392187595367432, "step": 4970 }, { "epoch": 1.6408566721581548, "grad_norm": 13.669334212369813, "learning_rate": 5.898682042833608e-07, "logits/chosen": -0.48284912109375, "logits/rejected": -0.5834716558456421, "logps/chosen": -314.75, "logps/rejected": -348.1499938964844, "loss": 0.0614, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.35893553495407104, "rewards/margins": 6.176562309265137, "rewards/rejected": -5.817968845367432, "step": 4980 }, { "epoch": 1.644151565074135, "grad_norm": 12.002818523308658, "learning_rate": 5.890444810543656e-07, "logits/chosen": -0.33433228731155396, "logits/rejected": -0.49640196561813354, "logps/chosen": -370.54998779296875, "logps/rejected": -399.3999938964844, "loss": 0.0516, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.10422363132238388, "rewards/margins": 6.967187404632568, "rewards/rejected": -6.862500190734863, "step": 4990 }, { "epoch": 1.6474464579901154, "grad_norm": 1.9861420564083572, "learning_rate": 5.882207578253707e-07, "logits/chosen": -0.49144285917282104, "logits/rejected": -0.593457043170929, "logps/chosen": -330.6000061035156, "logps/rejected": -390.04998779296875, "loss": 0.0514, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.47442626953125, "rewards/margins": 6.155468940734863, "rewards/rejected": -6.6328125, "step": 5000 }, { "epoch": 1.6507413509060955, "grad_norm": 2.612631860805083, "learning_rate": 5.873970345963756e-07, "logits/chosen": -0.402069091796875, "logits/rejected": -0.5499267578125, "logps/chosen": -351.0, "logps/rejected": -392.1499938964844, "loss": 0.0659, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.17425537109375, "rewards/margins": 6.383593559265137, "rewards/rejected": -6.55859375, "step": 5010 }, { "epoch": 1.654036243822076, "grad_norm": 17.97523063068133, "learning_rate": 5.865733113673805e-07, "logits/chosen": -0.49836426973342896, "logits/rejected": -0.684619128704071, "logps/chosen": -332.0, "logps/rejected": -387.5, "loss": 0.0586, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.179931640625, "rewards/margins": 6.223437309265137, "rewards/rejected": -6.405468940734863, "step": 5020 }, { "epoch": 1.657331136738056, "grad_norm": 2.616126782285547, "learning_rate": 5.857495881383854e-07, "logits/chosen": -0.3437255918979645, "logits/rejected": -0.7083984613418579, "logps/chosen": -361.29998779296875, "logps/rejected": -403.75, "loss": 0.0395, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.662304699420929, "rewards/margins": 6.451562404632568, "rewards/rejected": -7.1171875, "step": 5030 }, { "epoch": 1.6606260296540363, "grad_norm": 42.80646341652587, "learning_rate": 5.849258649093905e-07, "logits/chosen": -0.623504638671875, "logits/rejected": -0.83154296875, "logps/chosen": -294.3500061035156, "logps/rejected": -348.6000061035156, "loss": 0.0835, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6485351324081421, "rewards/margins": 6.432812690734863, "rewards/rejected": -7.078125, "step": 5040 }, { "epoch": 1.6639209225700164, "grad_norm": 28.80539431956242, "learning_rate": 5.841021416803954e-07, "logits/chosen": -0.549267590045929, "logits/rejected": -0.6797851324081421, "logps/chosen": -320.57501220703125, "logps/rejected": -355.04998779296875, "loss": 0.0763, "rewards/accuracies": 0.96875, "rewards/chosen": -0.946533203125, "rewards/margins": 6.453125, "rewards/rejected": -7.403124809265137, "step": 5050 }, { "epoch": 1.6672158154859966, "grad_norm": 8.615044080216025, "learning_rate": 5.832784184514003e-07, "logits/chosen": -0.4178466796875, "logits/rejected": -0.6424316167831421, "logps/chosen": -317.6000061035156, "logps/rejected": -383.79998779296875, "loss": 0.0841, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.80889892578125, "rewards/margins": 5.801562309265137, "rewards/rejected": -6.614062309265137, "step": 5060 }, { "epoch": 1.6705107084019768, "grad_norm": 19.37442097456263, "learning_rate": 5.824546952224052e-07, "logits/chosen": -0.554394543170929, "logits/rejected": -0.5677124261856079, "logps/chosen": -323.5, "logps/rejected": -356.1499938964844, "loss": 0.064, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3059326112270355, "rewards/margins": 5.939062595367432, "rewards/rejected": -6.240624904632568, "step": 5070 }, { "epoch": 1.6738056013179572, "grad_norm": 55.58631802016465, "learning_rate": 5.816309719934102e-07, "logits/chosen": -0.4764770567417145, "logits/rejected": -0.607128918170929, "logps/chosen": -359.75, "logps/rejected": -364.3999938964844, "loss": 0.0755, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.36638182401657104, "rewards/margins": 6.206250190734863, "rewards/rejected": -6.573437690734863, "step": 5080 }, { "epoch": 1.6771004942339374, "grad_norm": 45.662716690877, "learning_rate": 5.808072487644151e-07, "logits/chosen": -0.523693859577179, "logits/rejected": -0.6766113042831421, "logps/chosen": -341.70001220703125, "logps/rejected": -366.75, "loss": 0.0549, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.329193115234375, "rewards/margins": 6.469531059265137, "rewards/rejected": -6.798437595367432, "step": 5090 }, { "epoch": 1.6803953871499178, "grad_norm": 28.49508414738359, "learning_rate": 5.7998352553542e-07, "logits/chosen": -0.5603271722793579, "logits/rejected": -0.706787109375, "logps/chosen": -323.70001220703125, "logps/rejected": -378.70001220703125, "loss": 0.0756, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7274414300918579, "rewards/margins": 5.80078125, "rewards/rejected": -6.528124809265137, "step": 5100 }, { "epoch": 1.683690280065898, "grad_norm": 1.3681399974278357, "learning_rate": 5.79159802306425e-07, "logits/chosen": -0.554675281047821, "logits/rejected": -0.645214855670929, "logps/chosen": -374.6000061035156, "logps/rejected": -397.3999938964844, "loss": 0.0766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7638183832168579, "rewards/margins": 6.021093845367432, "rewards/rejected": -6.790625095367432, "step": 5110 }, { "epoch": 1.6869851729818781, "grad_norm": 6.43699879571317, "learning_rate": 5.7833607907743e-07, "logits/chosen": -0.4800979495048523, "logits/rejected": -0.634960949420929, "logps/chosen": -363.45001220703125, "logps/rejected": -398.25, "loss": 0.0534, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.40581053495407104, "rewards/margins": 6.685937404632568, "rewards/rejected": -7.088281154632568, "step": 5120 }, { "epoch": 1.6902800658978583, "grad_norm": 29.027831880128677, "learning_rate": 5.775123558484349e-07, "logits/chosen": -0.408447265625, "logits/rejected": -0.6263672113418579, "logps/chosen": -374.79998779296875, "logps/rejected": -411.79998779296875, "loss": 0.0478, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.05742187425494194, "rewards/margins": 6.698437690734863, "rewards/rejected": -6.751562595367432, "step": 5130 }, { "epoch": 1.6935749588138385, "grad_norm": 7.552090654268546, "learning_rate": 5.766886326194399e-07, "logits/chosen": -0.518902599811554, "logits/rejected": -0.6717284917831421, "logps/chosen": -284.25, "logps/rejected": -364.25, "loss": 0.0579, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.50299072265625, "rewards/margins": 5.776562690734863, "rewards/rejected": -6.279687404632568, "step": 5140 }, { "epoch": 1.6968698517298186, "grad_norm": 7.755757429497563, "learning_rate": 5.758649093904447e-07, "logits/chosen": -0.4490112364292145, "logits/rejected": -0.6581786870956421, "logps/chosen": -362.70001220703125, "logps/rejected": -411.95001220703125, "loss": 0.0363, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.561663806438446, "rewards/margins": 6.859375, "rewards/rejected": -7.426562309265137, "step": 5150 }, { "epoch": 1.700164744645799, "grad_norm": 28.413680203835067, "learning_rate": 5.750411861614497e-07, "logits/chosen": -0.5292724370956421, "logits/rejected": -0.7967773675918579, "logps/chosen": -325.8999938964844, "logps/rejected": -398.0, "loss": 0.0652, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7249755859375, "rewards/margins": 6.410937309265137, "rewards/rejected": -7.134375095367432, "step": 5160 }, { "epoch": 1.7034596375617792, "grad_norm": 12.340376407058905, "learning_rate": 5.742174629324547e-07, "logits/chosen": -0.6396850347518921, "logits/rejected": -0.731231689453125, "logps/chosen": -351.3999938964844, "logps/rejected": -366.5, "loss": 0.0932, "rewards/accuracies": 0.96875, "rewards/chosen": -0.31504517793655396, "rewards/margins": 6.167187690734863, "rewards/rejected": -6.485937595367432, "step": 5170 }, { "epoch": 1.7067545304777596, "grad_norm": 53.42654306676929, "learning_rate": 5.733937397034596e-07, "logits/chosen": -0.5916992425918579, "logits/rejected": -0.712841808795929, "logps/chosen": -328.0, "logps/rejected": -376.29998779296875, "loss": 0.082, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.06085815280675888, "rewards/margins": 5.876562595367432, "rewards/rejected": -5.81640625, "step": 5180 }, { "epoch": 1.7100494233937398, "grad_norm": 31.86443695984681, "learning_rate": 5.725700164744646e-07, "logits/chosen": -0.3725830018520355, "logits/rejected": -0.6058593988418579, "logps/chosen": -326.79998779296875, "logps/rejected": -376.1000061035156, "loss": 0.0953, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.142822265625, "rewards/margins": 5.767187595367432, "rewards/rejected": -5.628125190734863, "step": 5190 }, { "epoch": 1.71334431630972, "grad_norm": 9.074351215706399, "learning_rate": 5.717462932454695e-07, "logits/chosen": -0.3567871153354645, "logits/rejected": -0.591552734375, "logps/chosen": -335.5, "logps/rejected": -366.8999938964844, "loss": 0.0389, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.44001466035842896, "rewards/margins": 6.464062690734863, "rewards/rejected": -6.025000095367432, "step": 5200 }, { "epoch": 1.7166392092257001, "grad_norm": 22.185068575975954, "learning_rate": 5.709225700164745e-07, "logits/chosen": -0.51025390625, "logits/rejected": -0.712451159954071, "logps/chosen": -308.6000061035156, "logps/rejected": -340.04998779296875, "loss": 0.0762, "rewards/accuracies": 0.96875, "rewards/chosen": 0.11900635063648224, "rewards/margins": 5.939843654632568, "rewards/rejected": -5.821875095367432, "step": 5210 }, { "epoch": 1.7199341021416803, "grad_norm": 13.25674881263795, "learning_rate": 5.700988467874793e-07, "logits/chosen": -0.5281311273574829, "logits/rejected": -0.653857409954071, "logps/chosen": -344.04998779296875, "logps/rejected": -396.0, "loss": 0.085, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.109375, "rewards/margins": 6.00390625, "rewards/rejected": -5.8984375, "step": 5220 }, { "epoch": 1.7232289950576605, "grad_norm": 15.775688397226272, "learning_rate": 5.692751235584843e-07, "logits/chosen": -0.490823358297348, "logits/rejected": -0.6161133050918579, "logps/chosen": -347.3999938964844, "logps/rejected": -371.8500061035156, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 0.4661621153354645, "rewards/margins": 5.987500190734863, "rewards/rejected": -5.51953125, "step": 5230 }, { "epoch": 1.7265238879736409, "grad_norm": 4.456943047481457, "learning_rate": 5.684514003294893e-07, "logits/chosen": -0.44892579317092896, "logits/rejected": -0.5118163824081421, "logps/chosen": -299.6000061035156, "logps/rejected": -381.75, "loss": 0.0334, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.23897704482078552, "rewards/margins": 6.114062309265137, "rewards/rejected": -5.870312690734863, "step": 5240 }, { "epoch": 1.729818780889621, "grad_norm": 17.90923361689156, "learning_rate": 5.676276771004942e-07, "logits/chosen": -0.4076294004917145, "logits/rejected": -0.551025390625, "logps/chosen": -361.8500061035156, "logps/rejected": -400.0, "loss": 0.0351, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.12408447265625, "rewards/margins": 6.504687309265137, "rewards/rejected": -6.37890625, "step": 5250 }, { "epoch": 1.7331136738056014, "grad_norm": 35.901931221309084, "learning_rate": 5.668039538714991e-07, "logits/chosen": -0.547375500202179, "logits/rejected": -0.667694091796875, "logps/chosen": -292.75, "logps/rejected": -341.3500061035156, "loss": 0.1142, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4837402403354645, "rewards/margins": 5.678906440734863, "rewards/rejected": -6.1640625, "step": 5260 }, { "epoch": 1.7364085667215816, "grad_norm": 7.905018071757282, "learning_rate": 5.659802306425041e-07, "logits/chosen": -0.510058581829071, "logits/rejected": -0.778613269329071, "logps/chosen": -307.1499938964844, "logps/rejected": -368.54998779296875, "loss": 0.0534, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.5916748046875, "rewards/margins": 6.418749809265137, "rewards/rejected": -7.004687309265137, "step": 5270 }, { "epoch": 1.7397034596375618, "grad_norm": 18.902020951058404, "learning_rate": 5.651565074135091e-07, "logits/chosen": -0.570996105670929, "logits/rejected": -0.723193347454071, "logps/chosen": -381.29998779296875, "logps/rejected": -422.0, "loss": 0.0503, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.30231934785842896, "rewards/margins": 7.145312309265137, "rewards/rejected": -7.449999809265137, "step": 5280 }, { "epoch": 1.742998352553542, "grad_norm": 5.516630262541101, "learning_rate": 5.64332784184514e-07, "logits/chosen": -0.47807615995407104, "logits/rejected": -0.7949279546737671, "logps/chosen": -366.29998779296875, "logps/rejected": -415.25, "loss": 0.0386, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.27672117948532104, "rewards/margins": 7.285937309265137, "rewards/rejected": -7.5625, "step": 5290 }, { "epoch": 1.7462932454695221, "grad_norm": 5.3384177050087525, "learning_rate": 5.635090609555188e-07, "logits/chosen": -0.612841784954071, "logits/rejected": -0.8243652582168579, "logps/chosen": -347.54998779296875, "logps/rejected": -354.95001220703125, "loss": 0.075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.24484863877296448, "rewards/margins": 6.252343654632568, "rewards/rejected": -6.495312690734863, "step": 5300 }, { "epoch": 1.7495881383855023, "grad_norm": 13.718724828499681, "learning_rate": 5.626853377265239e-07, "logits/chosen": -0.632373034954071, "logits/rejected": -0.685742199420929, "logps/chosen": -344.6000061035156, "logps/rejected": -407.25, "loss": 0.0421, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.2587646543979645, "rewards/margins": 6.451562404632568, "rewards/rejected": -6.707812309265137, "step": 5310 }, { "epoch": 1.7528830313014827, "grad_norm": 38.52466685323268, "learning_rate": 5.618616144975288e-07, "logits/chosen": -0.6832275390625, "logits/rejected": -0.9332519769668579, "logps/chosen": -340.8999938964844, "logps/rejected": -371.20001220703125, "loss": 0.0682, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5124267339706421, "rewards/margins": 6.698437690734863, "rewards/rejected": -7.209374904632568, "step": 5320 }, { "epoch": 1.7561779242174629, "grad_norm": 23.571292177185722, "learning_rate": 5.610378912685337e-07, "logits/chosen": -0.6499267816543579, "logits/rejected": -0.8257812261581421, "logps/chosen": -378.6000061035156, "logps/rejected": -450.8999938964844, "loss": 0.1058, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.639843761920929, "rewards/margins": 7.045312404632568, "rewards/rejected": -7.681250095367432, "step": 5330 }, { "epoch": 1.7594728171334433, "grad_norm": 9.352926402738055, "learning_rate": 5.602141680395386e-07, "logits/chosen": -0.704296886920929, "logits/rejected": -0.905078113079071, "logps/chosen": -320.875, "logps/rejected": -344.8500061035156, "loss": 0.0593, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.4468994140625, "rewards/margins": 6.599999904632568, "rewards/rejected": -7.048437595367432, "step": 5340 }, { "epoch": 1.7627677100494235, "grad_norm": 6.465915287309118, "learning_rate": 5.593904448105437e-07, "logits/chosen": -0.6473633050918579, "logits/rejected": -0.7611328363418579, "logps/chosen": -353.79998779296875, "logps/rejected": -376.04998779296875, "loss": 0.0307, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.43730467557907104, "rewards/margins": 6.245312690734863, "rewards/rejected": -6.685937404632568, "step": 5350 }, { "epoch": 1.7660626029654036, "grad_norm": 14.73659758246354, "learning_rate": 5.585667215815486e-07, "logits/chosen": -0.7113037109375, "logits/rejected": -0.898242175579071, "logps/chosen": -369.1000061035156, "logps/rejected": -390.20001220703125, "loss": 0.0872, "rewards/accuracies": 0.96875, "rewards/chosen": -1.137304663658142, "rewards/margins": 6.701562404632568, "rewards/rejected": -7.839062690734863, "step": 5360 }, { "epoch": 1.7693574958813838, "grad_norm": 42.220963212384106, "learning_rate": 5.577429983525535e-07, "logits/chosen": -0.8101562261581421, "logits/rejected": -0.9248046875, "logps/chosen": -342.04998779296875, "logps/rejected": -356.54998779296875, "loss": 0.0726, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.3429687023162842, "rewards/margins": 5.922656059265137, "rewards/rejected": -7.268750190734863, "step": 5370 }, { "epoch": 1.772652388797364, "grad_norm": 48.30072823603811, "learning_rate": 5.569192751235584e-07, "logits/chosen": -0.712695300579071, "logits/rejected": -0.8800293207168579, "logps/chosen": -338.6000061035156, "logps/rejected": -384.70001220703125, "loss": 0.1189, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.892333984375, "rewards/margins": 6.778906345367432, "rewards/rejected": -7.670312404632568, "step": 5380 }, { "epoch": 1.7759472817133442, "grad_norm": 55.897593159383405, "learning_rate": 5.560955518945634e-07, "logits/chosen": -0.699511706829071, "logits/rejected": -0.862109363079071, "logps/chosen": -298.0, "logps/rejected": -365.25, "loss": 0.0603, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4347290098667145, "rewards/margins": 6.315625190734863, "rewards/rejected": -6.753125190734863, "step": 5390 }, { "epoch": 1.7792421746293245, "grad_norm": 63.868117218426605, "learning_rate": 5.552718286655683e-07, "logits/chosen": -0.590527355670929, "logits/rejected": -0.8373047113418579, "logps/chosen": -390.3999938964844, "logps/rejected": -403.70001220703125, "loss": 0.072, "rewards/accuracies": 0.96875, "rewards/chosen": 0.0791015625, "rewards/margins": 6.465624809265137, "rewards/rejected": -6.387499809265137, "step": 5400 }, { "epoch": 1.782537067545305, "grad_norm": 31.738017015825417, "learning_rate": 5.544481054365733e-07, "logits/chosen": -0.636645495891571, "logits/rejected": -0.814746081829071, "logps/chosen": -360.54998779296875, "logps/rejected": -355.5, "loss": 0.0648, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.20754393935203552, "rewards/margins": 6.074999809265137, "rewards/rejected": -6.279687404632568, "step": 5410 }, { "epoch": 1.7858319604612851, "grad_norm": 54.14821887157504, "learning_rate": 5.536243822075783e-07, "logits/chosen": -0.6896728277206421, "logits/rejected": -0.8233398199081421, "logps/chosen": -341.54998779296875, "logps/rejected": -345.3500061035156, "loss": 0.0889, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.18305663764476776, "rewards/margins": 6.074999809265137, "rewards/rejected": -6.25, "step": 5420 }, { "epoch": 1.7891268533772653, "grad_norm": 11.54541241765876, "learning_rate": 5.528006589785832e-07, "logits/chosen": -0.7411133050918579, "logits/rejected": -0.956835925579071, "logps/chosen": -325.20001220703125, "logps/rejected": -373.04998779296875, "loss": 0.0544, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3269287049770355, "rewards/margins": 6.691796779632568, "rewards/rejected": -7.026562690734863, "step": 5430 }, { "epoch": 1.7924217462932455, "grad_norm": 35.83428502692937, "learning_rate": 5.519769357495881e-07, "logits/chosen": -0.6333984136581421, "logits/rejected": -0.8197265863418579, "logps/chosen": -359.20001220703125, "logps/rejected": -388.0, "loss": 0.0628, "rewards/accuracies": 0.96875, "rewards/chosen": -0.431640625, "rewards/margins": 6.53125, "rewards/rejected": -6.963281154632568, "step": 5440 }, { "epoch": 1.7957166392092256, "grad_norm": 53.79862359226748, "learning_rate": 5.511532125205931e-07, "logits/chosen": -0.5640624761581421, "logits/rejected": -0.8682616949081421, "logps/chosen": -337.1000061035156, "logps/rejected": -391.29998779296875, "loss": 0.0727, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.26030272245407104, "rewards/margins": 6.313281059265137, "rewards/rejected": -6.576562404632568, "step": 5450 }, { "epoch": 1.7990115321252058, "grad_norm": 11.326321143759419, "learning_rate": 5.50329489291598e-07, "logits/chosen": -0.4927978515625, "logits/rejected": -0.7603515386581421, "logps/chosen": -322.6000061035156, "logps/rejected": -358.0, "loss": 0.0462, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.01824951171875, "rewards/margins": 6.067187309265137, "rewards/rejected": -6.086718559265137, "step": 5460 }, { "epoch": 1.8023064250411862, "grad_norm": 21.40750021754933, "learning_rate": 5.495057660626029e-07, "logits/chosen": -0.540454089641571, "logits/rejected": -0.7240234613418579, "logps/chosen": -318.29998779296875, "logps/rejected": -400.95001220703125, "loss": 0.0614, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.21240234375, "rewards/margins": 6.728125095367432, "rewards/rejected": -6.512499809265137, "step": 5470 }, { "epoch": 1.8056013179571664, "grad_norm": 18.802246951605337, "learning_rate": 5.486820428336079e-07, "logits/chosen": -0.5360107421875, "logits/rejected": -0.684863269329071, "logps/chosen": -362.29998779296875, "logps/rejected": -402.0, "loss": 0.0472, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.17906494438648224, "rewards/margins": 6.2578125, "rewards/rejected": -6.443749904632568, "step": 5480 }, { "epoch": 1.8088962108731468, "grad_norm": 29.40434054625206, "learning_rate": 5.478583196046128e-07, "logits/chosen": -0.6345459222793579, "logits/rejected": -0.8656250238418579, "logps/chosen": -304.75, "logps/rejected": -336.25, "loss": 0.1145, "rewards/accuracies": 0.96875, "rewards/chosen": 0.04685058444738388, "rewards/margins": 5.619531154632568, "rewards/rejected": -5.56640625, "step": 5490 }, { "epoch": 1.812191103789127, "grad_norm": 8.540182535794557, "learning_rate": 5.470345963756178e-07, "logits/chosen": -0.47532883286476135, "logits/rejected": -0.6421874761581421, "logps/chosen": -334.79998779296875, "logps/rejected": -388.54998779296875, "loss": 0.0535, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.01206054724752903, "rewards/margins": 6.134375095367432, "rewards/rejected": -6.1484375, "step": 5500 }, { "epoch": 1.8154859967051071, "grad_norm": 9.510015277188351, "learning_rate": 5.462108731466227e-07, "logits/chosen": -0.5662841796875, "logits/rejected": -0.6996704339981079, "logps/chosen": -316.6000061035156, "logps/rejected": -378.6000061035156, "loss": 0.0658, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.336801141500473, "rewards/margins": 5.925000190734863, "rewards/rejected": -6.260937690734863, "step": 5510 }, { "epoch": 1.8187808896210873, "grad_norm": 27.69556630324395, "learning_rate": 5.453871499176277e-07, "logits/chosen": -0.532666027545929, "logits/rejected": -0.7310546636581421, "logps/chosen": -354.5, "logps/rejected": -411.75, "loss": 0.0458, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.08244629204273224, "rewards/margins": 6.467968940734863, "rewards/rejected": -6.37890625, "step": 5520 }, { "epoch": 1.8220757825370675, "grad_norm": 19.66937471875601, "learning_rate": 5.445634266886326e-07, "logits/chosen": -0.6282714605331421, "logits/rejected": -0.806347668170929, "logps/chosen": -323.6499938964844, "logps/rejected": -366.25, "loss": 0.0887, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4439453184604645, "rewards/margins": 6.057812690734863, "rewards/rejected": -6.5078125, "step": 5530 }, { "epoch": 1.8253706754530477, "grad_norm": 6.7981863757060355, "learning_rate": 5.437397034596375e-07, "logits/chosen": -0.5567382574081421, "logits/rejected": -0.7725585699081421, "logps/chosen": -335.5, "logps/rejected": -363.75, "loss": 0.0504, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.731274425983429, "rewards/margins": 5.991406440734863, "rewards/rejected": -6.724999904632568, "step": 5540 }, { "epoch": 1.828665568369028, "grad_norm": 3.011831347691321, "learning_rate": 5.429159802306425e-07, "logits/chosen": -0.6802734136581421, "logits/rejected": -0.794140636920929, "logps/chosen": -344.1499938964844, "logps/rejected": -387.04998779296875, "loss": 0.0439, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.644091784954071, "rewards/margins": 6.381249904632568, "rewards/rejected": -7.020312309265137, "step": 5550 }, { "epoch": 1.8319604612850082, "grad_norm": 4.492962910751861, "learning_rate": 5.420922570016474e-07, "logits/chosen": -0.7606445550918579, "logits/rejected": -0.8099365234375, "logps/chosen": -312.6000061035156, "logps/rejected": -366.6499938964844, "loss": 0.0625, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.266870141029358, "rewards/margins": 6.510937690734863, "rewards/rejected": -7.775000095367432, "step": 5560 }, { "epoch": 1.8352553542009886, "grad_norm": 11.708368099896704, "learning_rate": 5.412685337726523e-07, "logits/chosen": -0.585156261920929, "logits/rejected": -0.7665039300918579, "logps/chosen": -384.8500061035156, "logps/rejected": -421.70001220703125, "loss": 0.0665, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.886181652545929, "rewards/margins": 6.754687309265137, "rewards/rejected": -7.646874904632568, "step": 5570 }, { "epoch": 1.8385502471169688, "grad_norm": 12.340116442029537, "learning_rate": 5.404448105436573e-07, "logits/chosen": -0.527783215045929, "logits/rejected": -0.782519519329071, "logps/chosen": -357.45001220703125, "logps/rejected": -383.6000061035156, "loss": 0.0774, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8974609375, "rewards/margins": 6.785937309265137, "rewards/rejected": -7.681250095367432, "step": 5580 }, { "epoch": 1.841845140032949, "grad_norm": 5.826841019502732, "learning_rate": 5.396210873146623e-07, "logits/chosen": -0.6455078125, "logits/rejected": -0.825878918170929, "logps/chosen": -410.29998779296875, "logps/rejected": -416.79998779296875, "loss": 0.0627, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1421082019805908, "rewards/margins": 6.753125190734863, "rewards/rejected": -7.892187595367432, "step": 5590 }, { "epoch": 1.8451400329489291, "grad_norm": 4.546538069221616, "learning_rate": 5.387973640856672e-07, "logits/chosen": -0.6851562261581421, "logits/rejected": -0.999218761920929, "logps/chosen": -296.04998779296875, "logps/rejected": -346.20001220703125, "loss": 0.0647, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.690478503704071, "rewards/margins": 6.305468559265137, "rewards/rejected": -6.995312690734863, "step": 5600 }, { "epoch": 1.8484349258649093, "grad_norm": 19.91149623996379, "learning_rate": 5.379736408566721e-07, "logits/chosen": -0.6616150140762329, "logits/rejected": -0.818066418170929, "logps/chosen": -318.125, "logps/rejected": -371.70001220703125, "loss": 0.0745, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5234130620956421, "rewards/margins": 6.442187309265137, "rewards/rejected": -6.965624809265137, "step": 5610 }, { "epoch": 1.8517298187808895, "grad_norm": 46.89930742126466, "learning_rate": 5.371499176276771e-07, "logits/chosen": -0.6342529058456421, "logits/rejected": -0.721630871295929, "logps/chosen": -334.70001220703125, "logps/rejected": -391.3500061035156, "loss": 0.0988, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5440673828125, "rewards/margins": 6.051562309265137, "rewards/rejected": -6.590624809265137, "step": 5620 }, { "epoch": 1.8550247116968699, "grad_norm": 27.226004630247427, "learning_rate": 5.36326194398682e-07, "logits/chosen": -0.40583497285842896, "logits/rejected": -0.730712890625, "logps/chosen": -324.5, "logps/rejected": -352.79998779296875, "loss": 0.0412, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.05854492262005806, "rewards/margins": 6.378125190734863, "rewards/rejected": -6.435937404632568, "step": 5630 }, { "epoch": 1.85831960461285, "grad_norm": 37.53923871888112, "learning_rate": 5.355024711696869e-07, "logits/chosen": -0.4990234375, "logits/rejected": -0.6195312738418579, "logps/chosen": -312.79998779296875, "logps/rejected": -328.0, "loss": 0.0514, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.22163085639476776, "rewards/margins": 6.042187690734863, "rewards/rejected": -6.267968654632568, "step": 5640 }, { "epoch": 1.8616144975288305, "grad_norm": 6.15871533830703, "learning_rate": 5.34678747940692e-07, "logits/chosen": -0.44880372285842896, "logits/rejected": -0.7342773675918579, "logps/chosen": -402.79998779296875, "logps/rejected": -399.54998779296875, "loss": 0.0472, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3091064393520355, "rewards/margins": 6.693749904632568, "rewards/rejected": -7.001562595367432, "step": 5650 }, { "epoch": 1.8649093904448106, "grad_norm": 114.795296440561, "learning_rate": 5.338550247116969e-07, "logits/chosen": -0.5282226800918579, "logits/rejected": -0.6317383050918579, "logps/chosen": -355.3999938964844, "logps/rejected": -424.70001220703125, "loss": 0.0966, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.2844482362270355, "rewards/margins": 6.3828125, "rewards/rejected": -6.665625095367432, "step": 5660 }, { "epoch": 1.8682042833607908, "grad_norm": 41.93316718246877, "learning_rate": 5.330313014827018e-07, "logits/chosen": -0.39696043729782104, "logits/rejected": -0.622607409954071, "logps/chosen": -322.1000061035156, "logps/rejected": -351.29998779296875, "loss": 0.0774, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4013671875, "rewards/margins": 6.01171875, "rewards/rejected": -6.422656059265137, "step": 5670 }, { "epoch": 1.871499176276771, "grad_norm": 28.577125398813134, "learning_rate": 5.322075782537067e-07, "logits/chosen": -0.620434582233429, "logits/rejected": -0.7596679925918579, "logps/chosen": -330.1000061035156, "logps/rejected": -373.8999938964844, "loss": 0.0652, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.002355933189392, "rewards/margins": 6.245312690734863, "rewards/rejected": -7.245312690734863, "step": 5680 }, { "epoch": 1.8747940691927512, "grad_norm": 65.03694584613031, "learning_rate": 5.313838550247118e-07, "logits/chosen": -0.4549560546875, "logits/rejected": -0.775195300579071, "logps/chosen": -349.1000061035156, "logps/rejected": -385.8999938964844, "loss": 0.0744, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.599853515625, "rewards/margins": 6.574999809265137, "rewards/rejected": -7.1796875, "step": 5690 }, { "epoch": 1.8780889621087313, "grad_norm": 49.507888106313295, "learning_rate": 5.305601317957166e-07, "logits/chosen": -0.520263671875, "logits/rejected": -0.6092758178710938, "logps/chosen": -360.6000061035156, "logps/rejected": -405.04998779296875, "loss": 0.0689, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.25731199979782104, "rewards/margins": 6.568749904632568, "rewards/rejected": -6.823437690734863, "step": 5700 }, { "epoch": 1.8813838550247117, "grad_norm": 30.377487110638107, "learning_rate": 5.297364085667215e-07, "logits/chosen": -0.519024670124054, "logits/rejected": -0.750781238079071, "logps/chosen": -334.5, "logps/rejected": -374.04998779296875, "loss": 0.0908, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.42822265625, "rewards/margins": 6.0546875, "rewards/rejected": -6.4765625, "step": 5710 }, { "epoch": 1.884678747940692, "grad_norm": 3.056164867780282, "learning_rate": 5.289126853377265e-07, "logits/chosen": -0.525585949420929, "logits/rejected": -0.7222656011581421, "logps/chosen": -316.1000061035156, "logps/rejected": -356.8500061035156, "loss": 0.1179, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.000244140625, "rewards/margins": 6.014843940734863, "rewards/rejected": -6.018750190734863, "step": 5720 }, { "epoch": 1.8879736408566723, "grad_norm": 20.96632295579567, "learning_rate": 5.280889621087315e-07, "logits/chosen": -0.4003662168979645, "logits/rejected": -0.6983886957168579, "logps/chosen": -297.29998779296875, "logps/rejected": -361.0, "loss": 0.0511, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4209960997104645, "rewards/margins": 6.328125, "rewards/rejected": -5.912499904632568, "step": 5730 }, { "epoch": 1.8912685337726525, "grad_norm": 17.60464796333876, "learning_rate": 5.272652388797364e-07, "logits/chosen": -0.4031982421875, "logits/rejected": -0.57177734375, "logps/chosen": -350.95001220703125, "logps/rejected": -397.3500061035156, "loss": 0.036, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.4459594786167145, "rewards/margins": 6.4375, "rewards/rejected": -5.993750095367432, "step": 5740 }, { "epoch": 1.8945634266886326, "grad_norm": 27.913251523886327, "learning_rate": 5.264415156507413e-07, "logits/chosen": -0.4498535096645355, "logits/rejected": -0.5254150629043579, "logps/chosen": -329.8500061035156, "logps/rejected": -355.8999938964844, "loss": 0.0442, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.12431640923023224, "rewards/margins": 5.783593654632568, "rewards/rejected": -5.660937309265137, "step": 5750 }, { "epoch": 1.8978583196046128, "grad_norm": 19.14710852631642, "learning_rate": 5.256177924217463e-07, "logits/chosen": -0.416656494140625, "logits/rejected": -0.5989745855331421, "logps/chosen": -317.3500061035156, "logps/rejected": -354.79998779296875, "loss": 0.0502, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.19063416123390198, "rewards/margins": 6.008593559265137, "rewards/rejected": -6.1953125, "step": 5760 }, { "epoch": 1.901153212520593, "grad_norm": 14.375928269794919, "learning_rate": 5.247940691927513e-07, "logits/chosen": -0.5549255609512329, "logits/rejected": -0.6823974847793579, "logps/chosen": -291.7749938964844, "logps/rejected": -360.45001220703125, "loss": 0.0489, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3000732362270355, "rewards/margins": 6.115624904632568, "rewards/rejected": -6.412499904632568, "step": 5770 }, { "epoch": 1.9044481054365732, "grad_norm": 30.230818584725565, "learning_rate": 5.239703459637561e-07, "logits/chosen": -0.586621105670929, "logits/rejected": -0.7459472417831421, "logps/chosen": -322.625, "logps/rejected": -392.0, "loss": 0.0777, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4053710997104645, "rewards/margins": 6.275781154632568, "rewards/rejected": -6.685937404632568, "step": 5780 }, { "epoch": 1.9077429983525536, "grad_norm": 19.632181920750803, "learning_rate": 5.231466227347611e-07, "logits/chosen": -0.674365222454071, "logits/rejected": -0.80322265625, "logps/chosen": -299.0, "logps/rejected": -336.1000061035156, "loss": 0.069, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.37815552949905396, "rewards/margins": 5.65625, "rewards/rejected": -6.036718845367432, "step": 5790 }, { "epoch": 1.9110378912685337, "grad_norm": 36.65835180100976, "learning_rate": 5.22322899505766e-07, "logits/chosen": -0.621044933795929, "logits/rejected": -0.77587890625, "logps/chosen": -315.3999938964844, "logps/rejected": -375.29998779296875, "loss": 0.0825, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3651580810546875, "rewards/margins": 6.278124809265137, "rewards/rejected": -6.642187595367432, "step": 5800 }, { "epoch": 1.9143327841845141, "grad_norm": 14.586164004265171, "learning_rate": 5.21499176276771e-07, "logits/chosen": -0.609790027141571, "logits/rejected": -0.763964831829071, "logps/chosen": -301.79998779296875, "logps/rejected": -360.79998779296875, "loss": 0.0937, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.895190417766571, "rewards/margins": 6.072656154632568, "rewards/rejected": -6.967968940734863, "step": 5810 }, { "epoch": 1.9176276771004943, "grad_norm": 12.259230577621214, "learning_rate": 5.206754530477759e-07, "logits/chosen": -0.702014148235321, "logits/rejected": -0.8229004144668579, "logps/chosen": -338.70001220703125, "logps/rejected": -353.6499938964844, "loss": 0.0505, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.796459972858429, "rewards/margins": 6.373437404632568, "rewards/rejected": -7.167187690734863, "step": 5820 }, { "epoch": 1.9209225700164745, "grad_norm": 6.886345511103235, "learning_rate": 5.198517298187809e-07, "logits/chosen": -0.6190429925918579, "logits/rejected": -0.700024425983429, "logps/chosen": -352.1000061035156, "logps/rejected": -421.25, "loss": 0.0522, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.734619140625, "rewards/margins": 6.392968654632568, "rewards/rejected": -7.131249904632568, "step": 5830 }, { "epoch": 1.9242174629324547, "grad_norm": 23.058890307500878, "learning_rate": 5.190280065897858e-07, "logits/chosen": -0.575085461139679, "logits/rejected": -0.642382800579071, "logps/chosen": -342.3500061035156, "logps/rejected": -450.70001220703125, "loss": 0.1036, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5586303472518921, "rewards/margins": 6.4609375, "rewards/rejected": -7.015625, "step": 5840 }, { "epoch": 1.9275123558484348, "grad_norm": 8.934816241575158, "learning_rate": 5.182042833607908e-07, "logits/chosen": -0.5035461187362671, "logits/rejected": -0.671435534954071, "logps/chosen": -339.04998779296875, "logps/rejected": -379.45001220703125, "loss": 0.0581, "rewards/accuracies": 0.96875, "rewards/chosen": -0.39344483613967896, "rewards/margins": 5.940625190734863, "rewards/rejected": -6.331250190734863, "step": 5850 }, { "epoch": 1.930807248764415, "grad_norm": 21.424726468474127, "learning_rate": 5.173805601317957e-07, "logits/chosen": -0.586376965045929, "logits/rejected": -0.85009765625, "logps/chosen": -349.29998779296875, "logps/rejected": -388.45001220703125, "loss": 0.0518, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.41453856229782104, "rewards/margins": 6.309374809265137, "rewards/rejected": -6.7265625, "step": 5860 }, { "epoch": 1.9341021416803954, "grad_norm": 65.94499281535721, "learning_rate": 5.165568369028006e-07, "logits/chosen": -0.709716796875, "logits/rejected": -0.7801758050918579, "logps/chosen": -311.3500061035156, "logps/rejected": -348.0, "loss": 0.0751, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3819213807582855, "rewards/margins": 5.920312404632568, "rewards/rejected": -6.303124904632568, "step": 5870 }, { "epoch": 1.9373970345963756, "grad_norm": 20.307411702213113, "learning_rate": 5.157331136738055e-07, "logits/chosen": -0.5281738042831421, "logits/rejected": -0.7123047113418579, "logps/chosen": -376.20001220703125, "logps/rejected": -376.3999938964844, "loss": 0.0572, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17021484673023224, "rewards/margins": 6.370312690734863, "rewards/rejected": -6.196875095367432, "step": 5880 }, { "epoch": 1.940691927512356, "grad_norm": 6.671510449333576, "learning_rate": 5.149093904448106e-07, "logits/chosen": -0.6426757574081421, "logits/rejected": -0.706347644329071, "logps/chosen": -331.6000061035156, "logps/rejected": -362.0, "loss": 0.0847, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2802490293979645, "rewards/margins": 6.0625, "rewards/rejected": -6.345312595367432, "step": 5890 }, { "epoch": 1.9439868204283361, "grad_norm": 32.28354675494224, "learning_rate": 5.140856672158155e-07, "logits/chosen": -0.5477539300918579, "logits/rejected": -0.7972656488418579, "logps/chosen": -337.67498779296875, "logps/rejected": -418.79998779296875, "loss": 0.0812, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.22663573920726776, "rewards/margins": 6.498437404632568, "rewards/rejected": -6.271093845367432, "step": 5900 }, { "epoch": 1.9472817133443163, "grad_norm": 15.436170104223475, "learning_rate": 5.132619439868204e-07, "logits/chosen": -0.68896484375, "logits/rejected": -0.654101550579071, "logps/chosen": -326.5, "logps/rejected": -398.95001220703125, "loss": 0.0695, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.11549071967601776, "rewards/margins": 6.426562309265137, "rewards/rejected": -6.306250095367432, "step": 5910 }, { "epoch": 1.9505766062602965, "grad_norm": 8.855990242110469, "learning_rate": 5.124382207578253e-07, "logits/chosen": -0.620312511920929, "logits/rejected": -0.825878918170929, "logps/chosen": -322.3500061035156, "logps/rejected": -341.6000061035156, "loss": 0.0623, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10751952975988388, "rewards/margins": 6.157812595367432, "rewards/rejected": -6.048437595367432, "step": 5920 }, { "epoch": 1.9538714991762767, "grad_norm": 26.434864734738333, "learning_rate": 5.116144975288303e-07, "logits/chosen": -0.47435301542282104, "logits/rejected": -0.624951183795929, "logps/chosen": -341.70001220703125, "logps/rejected": -421.6000061035156, "loss": 0.034, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.0075744627974927425, "rewards/margins": 7.129687309265137, "rewards/rejected": -7.123437404632568, "step": 5930 }, { "epoch": 1.9571663920922568, "grad_norm": 4.0316529495427105, "learning_rate": 5.107907742998352e-07, "logits/chosen": -0.563494861125946, "logits/rejected": -0.65625, "logps/chosen": -341.3500061035156, "logps/rejected": -412.3500061035156, "loss": 0.061, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.801867663860321, "rewards/margins": 6.810937404632568, "rewards/rejected": -7.607812404632568, "step": 5940 }, { "epoch": 1.9604612850082372, "grad_norm": 7.3141885124947, "learning_rate": 5.099670510708401e-07, "logits/chosen": -0.676806628704071, "logits/rejected": -0.77099609375, "logps/chosen": -388.1499938964844, "logps/rejected": -411.5, "loss": 0.0572, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.7044922113418579, "rewards/margins": 6.5546875, "rewards/rejected": -7.256249904632568, "step": 5950 }, { "epoch": 1.9637561779242174, "grad_norm": 5.413387076227498, "learning_rate": 5.091433278418452e-07, "logits/chosen": -0.649609386920929, "logits/rejected": -0.7900390625, "logps/chosen": -347.20001220703125, "logps/rejected": -452.29998779296875, "loss": 0.0579, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3760986328125, "rewards/margins": 6.8671875, "rewards/rejected": -7.248437404632568, "step": 5960 }, { "epoch": 1.9670510708401978, "grad_norm": 68.15661784566046, "learning_rate": 5.083196046128501e-07, "logits/chosen": -0.620800793170929, "logits/rejected": -0.7403320074081421, "logps/chosen": -335.70001220703125, "logps/rejected": -370.54998779296875, "loss": 0.0561, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.409881591796875, "rewards/margins": 6.029687404632568, "rewards/rejected": -6.435937404632568, "step": 5970 }, { "epoch": 1.970345963756178, "grad_norm": 11.623709513223554, "learning_rate": 5.07495881383855e-07, "logits/chosen": -0.6048583984375, "logits/rejected": -0.79443359375, "logps/chosen": -338.95001220703125, "logps/rejected": -386.79998779296875, "loss": 0.0916, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.924243152141571, "rewards/margins": 6.21875, "rewards/rejected": -7.142187595367432, "step": 5980 }, { "epoch": 1.9736408566721582, "grad_norm": 17.247426494914198, "learning_rate": 5.066721581548599e-07, "logits/chosen": -0.677539050579071, "logits/rejected": -0.815966784954071, "logps/chosen": -339.0, "logps/rejected": -363.3999938964844, "loss": 0.082, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8752502202987671, "rewards/margins": 5.875, "rewards/rejected": -6.748437404632568, "step": 5990 }, { "epoch": 1.9769357495881383, "grad_norm": 4.85351398168737, "learning_rate": 5.05848434925865e-07, "logits/chosen": -0.664398193359375, "logits/rejected": -0.8999999761581421, "logps/chosen": -365.8999938964844, "logps/rejected": -345.25, "loss": 0.0642, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6678711175918579, "rewards/margins": 5.95703125, "rewards/rejected": -6.625, "step": 6000 }, { "epoch": 1.9802306425041185, "grad_norm": 17.534739110315286, "learning_rate": 5.050247116968698e-07, "logits/chosen": -0.62255859375, "logits/rejected": -0.7568359375, "logps/chosen": -331.25, "logps/rejected": -381.70001220703125, "loss": 0.0818, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.365234375, "rewards/margins": 5.928124904632568, "rewards/rejected": -6.293749809265137, "step": 6010 }, { "epoch": 1.9835255354200987, "grad_norm": 7.503334890502957, "learning_rate": 5.042009884678747e-07, "logits/chosen": -0.550122082233429, "logits/rejected": -0.72021484375, "logps/chosen": -353.07501220703125, "logps/rejected": -396.1499938964844, "loss": 0.1218, "rewards/accuracies": 0.9375, "rewards/chosen": -0.734082043170929, "rewards/margins": 6.329687595367432, "rewards/rejected": -7.060937404632568, "step": 6020 }, { "epoch": 1.986820428336079, "grad_norm": 7.204237536959709, "learning_rate": 5.033772652388797e-07, "logits/chosen": -0.4658203125, "logits/rejected": -0.721142590045929, "logps/chosen": -348.70001220703125, "logps/rejected": -362.5, "loss": 0.066, "rewards/accuracies": 0.96875, "rewards/chosen": -0.526141345500946, "rewards/margins": 6.173437595367432, "rewards/rejected": -6.701562404632568, "step": 6030 }, { "epoch": 1.9901153212520593, "grad_norm": 27.547454319148784, "learning_rate": 5.025535420098847e-07, "logits/chosen": -0.62353515625, "logits/rejected": -0.777539074420929, "logps/chosen": -278.3500061035156, "logps/rejected": -327.25, "loss": 0.0513, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.694287121295929, "rewards/margins": 6.064843654632568, "rewards/rejected": -6.762499809265137, "step": 6040 }, { "epoch": 1.9934102141680397, "grad_norm": 5.896608769585915, "learning_rate": 5.017298187808896e-07, "logits/chosen": -0.6250976324081421, "logits/rejected": -0.8263183832168579, "logps/chosen": -357.3999938964844, "logps/rejected": -371.1000061035156, "loss": 0.0855, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8399658203125, "rewards/margins": 6.411718845367432, "rewards/rejected": -7.259375095367432, "step": 6050 }, { "epoch": 1.9967051070840198, "grad_norm": 0.7816242412242255, "learning_rate": 5.009060955518945e-07, "logits/chosen": -0.684814453125, "logits/rejected": -0.8262695074081421, "logps/chosen": -297.3500061035156, "logps/rejected": -387.8500061035156, "loss": 0.0672, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0529906749725342, "rewards/margins": 6.762499809265137, "rewards/rejected": -7.815625190734863, "step": 6060 }, { "epoch": 2.0, "grad_norm": 16.241763291807587, "learning_rate": 5.000823723228995e-07, "logits/chosen": -0.766894519329071, "logits/rejected": -0.861572265625, "logps/chosen": -303.8500061035156, "logps/rejected": -355.3999938964844, "loss": 0.0794, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9824463129043579, "rewards/margins": 6.047656059265137, "rewards/rejected": -7.025000095367432, "step": 6070 }, { "epoch": 2.00329489291598, "grad_norm": 2.0835561363948703, "learning_rate": 4.992586490939045e-07, "logits/chosen": -0.5648437738418579, "logits/rejected": -0.8470703363418579, "logps/chosen": -381.8500061035156, "logps/rejected": -410.29998779296875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -0.24492187798023224, "rewards/margins": 7.854687690734863, "rewards/rejected": -8.103124618530273, "step": 6080 }, { "epoch": 2.0065897858319603, "grad_norm": 3.6025289078416196, "learning_rate": 4.984349258649094e-07, "logits/chosen": -0.7920898199081421, "logits/rejected": -0.8949218988418579, "logps/chosen": -338.04998779296875, "logps/rejected": -401.0, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.08220215141773224, "rewards/margins": 7.615624904632568, "rewards/rejected": -7.529687404632568, "step": 6090 }, { "epoch": 2.0098846787479405, "grad_norm": 0.7277384058479822, "learning_rate": 4.976112026359143e-07, "logits/chosen": -0.8667968511581421, "logits/rejected": -1.021875023841858, "logps/chosen": -305.25, "logps/rejected": -402.04998779296875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.3479370176792145, "rewards/margins": 9.0703125, "rewards/rejected": -9.412500381469727, "step": 6100 }, { "epoch": 2.013179571663921, "grad_norm": 1.6591278625903507, "learning_rate": 4.967874794069192e-07, "logits/chosen": -0.825439453125, "logits/rejected": -1.064062476158142, "logps/chosen": -326.9750061035156, "logps/rejected": -413.20001220703125, "loss": 0.0122, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2239990234375, "rewards/margins": 8.643750190734863, "rewards/rejected": -9.8671875, "step": 6110 }, { "epoch": 2.0164744645799013, "grad_norm": 3.1567315092247714, "learning_rate": 4.959637561779242e-07, "logits/chosen": -0.81787109375, "logits/rejected": -1.057226538658142, "logps/chosen": -343.54998779296875, "logps/rejected": -376.70001220703125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.1337401866912842, "rewards/margins": 8.443750381469727, "rewards/rejected": -9.581250190734863, "step": 6120 }, { "epoch": 2.0197693574958815, "grad_norm": 2.5185419713728923, "learning_rate": 4.951400329489292e-07, "logits/chosen": -0.8720703125, "logits/rejected": -1.120263695716858, "logps/chosen": -358.0, "logps/rejected": -426.1000061035156, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.7167724370956421, "rewards/margins": 8.934374809265137, "rewards/rejected": -9.646875381469727, "step": 6130 }, { "epoch": 2.0230642504118617, "grad_norm": 0.2863636620526102, "learning_rate": 4.943163097199341e-07, "logits/chosen": -0.969677746295929, "logits/rejected": -1.1560547351837158, "logps/chosen": -314.67498779296875, "logps/rejected": -425.29998779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.3440430164337158, "rewards/margins": 9.046875, "rewards/rejected": -10.393750190734863, "step": 6140 }, { "epoch": 2.026359143327842, "grad_norm": 1.4624519900134967, "learning_rate": 4.93492586490939e-07, "logits/chosen": -0.768139660358429, "logits/rejected": -0.9751952886581421, "logps/chosen": -359.04998779296875, "logps/rejected": -371.70001220703125, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -1.22564697265625, "rewards/margins": 8.381250381469727, "rewards/rejected": -9.612500190734863, "step": 6150 }, { "epoch": 2.029654036243822, "grad_norm": 0.5362335246073563, "learning_rate": 4.92668863261944e-07, "logits/chosen": -0.719189465045929, "logits/rejected": -1.119140625, "logps/chosen": -339.95001220703125, "logps/rejected": -424.79998779296875, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -1.4660155773162842, "rewards/margins": 8.899999618530273, "rewards/rejected": -10.368749618530273, "step": 6160 }, { "epoch": 2.032948929159802, "grad_norm": 0.7435270663343161, "learning_rate": 4.918451400329489e-07, "logits/chosen": -0.881054699420929, "logits/rejected": -1.06982421875, "logps/chosen": -313.25, "logps/rejected": -394.3500061035156, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.6943359375, "rewards/margins": 8.543749809265137, "rewards/rejected": -10.234375, "step": 6170 }, { "epoch": 2.0362438220757824, "grad_norm": 2.6901922255274133, "learning_rate": 4.910214168039538e-07, "logits/chosen": -0.82305908203125, "logits/rejected": -1.2109375, "logps/chosen": -377.1499938964844, "logps/rejected": -427.45001220703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.1284668445587158, "rewards/margins": 9.384374618530273, "rewards/rejected": -10.512499809265137, "step": 6180 }, { "epoch": 2.039538714991763, "grad_norm": 36.6255092358073, "learning_rate": 4.901976935749587e-07, "logits/chosen": -0.763989269733429, "logits/rejected": -1.1769530773162842, "logps/chosen": -330.45001220703125, "logps/rejected": -392.79998779296875, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -1.156591773033142, "rewards/margins": 9.301562309265137, "rewards/rejected": -10.454687118530273, "step": 6190 }, { "epoch": 2.042833607907743, "grad_norm": 9.921773460368957, "learning_rate": 4.893739703459638e-07, "logits/chosen": -0.733203113079071, "logits/rejected": -1.1291992664337158, "logps/chosen": -311.8500061035156, "logps/rejected": -360.5, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -1.4968993663787842, "rewards/margins": 8.537500381469727, "rewards/rejected": -10.035937309265137, "step": 6200 }, { "epoch": 2.0461285008237233, "grad_norm": 0.9735219203423237, "learning_rate": 4.885502471169687e-07, "logits/chosen": -0.8247314691543579, "logits/rejected": -1.075292944908142, "logps/chosen": -340.45001220703125, "logps/rejected": -378.95001220703125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.8247925043106079, "rewards/margins": 8.426562309265137, "rewards/rejected": -9.25, "step": 6210 }, { "epoch": 2.0494233937397035, "grad_norm": 4.598969392777621, "learning_rate": 4.877265238879736e-07, "logits/chosen": -0.8770507574081421, "logits/rejected": -1.003808617591858, "logps/chosen": -390.5, "logps/rejected": -446.1000061035156, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -1.0084960460662842, "rewards/margins": 8.818750381469727, "rewards/rejected": -9.824999809265137, "step": 6220 }, { "epoch": 2.0527182866556837, "grad_norm": 10.704023716315568, "learning_rate": 4.869028006589785e-07, "logits/chosen": -0.8968750238418579, "logits/rejected": -0.8753906488418579, "logps/chosen": -342.95001220703125, "logps/rejected": -414.54998779296875, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -1.175561547279358, "rewards/margins": 8.907812118530273, "rewards/rejected": -10.084375381469727, "step": 6230 }, { "epoch": 2.056013179571664, "grad_norm": 2.143431289983792, "learning_rate": 4.860790774299835e-07, "logits/chosen": -0.737640380859375, "logits/rejected": -1.0636718273162842, "logps/chosen": -331.45001220703125, "logps/rejected": -406.70001220703125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.2053711414337158, "rewards/margins": 9.149999618530273, "rewards/rejected": -10.356249809265137, "step": 6240 }, { "epoch": 2.059308072487644, "grad_norm": 0.9977464617427986, "learning_rate": 4.852553542009885e-07, "logits/chosen": -0.807543933391571, "logits/rejected": -1.0286133289337158, "logps/chosen": -324.95001220703125, "logps/rejected": -424.8999938964844, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.7351806163787842, "rewards/margins": 9.232812881469727, "rewards/rejected": -10.96875, "step": 6250 }, { "epoch": 2.062602965403624, "grad_norm": 2.1159435013993804, "learning_rate": 4.844316309719934e-07, "logits/chosen": -0.977343738079071, "logits/rejected": -1.312890648841858, "logps/chosen": -332.54998779296875, "logps/rejected": -434.1000061035156, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -2.341796875, "rewards/margins": 9.490625381469727, "rewards/rejected": -11.828125, "step": 6260 }, { "epoch": 2.065897858319605, "grad_norm": 2.8140883523058022, "learning_rate": 4.836079077429984e-07, "logits/chosen": -0.9248046875, "logits/rejected": -1.128271460533142, "logps/chosen": -351.79998779296875, "logps/rejected": -416.79998779296875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.552954077720642, "rewards/margins": 9.134374618530273, "rewards/rejected": -10.684374809265137, "step": 6270 }, { "epoch": 2.069192751235585, "grad_norm": 1.4458844490303755, "learning_rate": 4.827841845140033e-07, "logits/chosen": -0.86474609375, "logits/rejected": -1.2263672351837158, "logps/chosen": -334.6000061035156, "logps/rejected": -407.25, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -1.279296875, "rewards/margins": 9.465624809265137, "rewards/rejected": -10.737500190734863, "step": 6280 }, { "epoch": 2.072487644151565, "grad_norm": 2.5457110175796442, "learning_rate": 4.819604612850082e-07, "logits/chosen": -0.8366149663925171, "logits/rejected": -1.143652319908142, "logps/chosen": -324.70001220703125, "logps/rejected": -387.79998779296875, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -1.01220703125, "rewards/margins": 8.4921875, "rewards/rejected": -9.5, "step": 6290 }, { "epoch": 2.0757825370675453, "grad_norm": 2.0530258803841672, "learning_rate": 4.811367380560131e-07, "logits/chosen": -0.945910632610321, "logits/rejected": -1.13671875, "logps/chosen": -295.45001220703125, "logps/rejected": -356.3999938964844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -1.1477782726287842, "rewards/margins": 8.246874809265137, "rewards/rejected": -9.393750190734863, "step": 6300 }, { "epoch": 2.0790774299835255, "grad_norm": 0.44917410734918223, "learning_rate": 4.803130148270181e-07, "logits/chosen": -0.943774402141571, "logits/rejected": -1.2287108898162842, "logps/chosen": -334.1000061035156, "logps/rejected": -427.45001220703125, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -1.1630370616912842, "rewards/margins": 9.145312309265137, "rewards/rejected": -10.309374809265137, "step": 6310 }, { "epoch": 2.0823723228995057, "grad_norm": 1.8186996873993777, "learning_rate": 4.794892915980231e-07, "logits/chosen": -0.884033203125, "logits/rejected": -1.10986328125, "logps/chosen": -306.75, "logps/rejected": -405.04998779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -1.545556664466858, "rewards/margins": 9.446874618530273, "rewards/rejected": -10.990625381469727, "step": 6320 }, { "epoch": 2.085667215815486, "grad_norm": 12.43032390529674, "learning_rate": 4.78665568369028e-07, "logits/chosen": -0.8702148199081421, "logits/rejected": -0.9827514886856079, "logps/chosen": -342.75, "logps/rejected": -393.5, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.403222680091858, "rewards/margins": 8.649999618530273, "rewards/rejected": -10.0546875, "step": 6330 }, { "epoch": 2.088962108731466, "grad_norm": 3.1391841271431, "learning_rate": 4.778418451400329e-07, "logits/chosen": -0.830639660358429, "logits/rejected": -1.1423828601837158, "logps/chosen": -294.4750061035156, "logps/rejected": -390.25, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -1.6357910633087158, "rewards/margins": 9.181249618530273, "rewards/rejected": -10.809374809265137, "step": 6340 }, { "epoch": 2.0922570016474467, "grad_norm": 1.8961276448905793, "learning_rate": 4.770181219110379e-07, "logits/chosen": -0.848095715045929, "logits/rejected": -1.201074242591858, "logps/chosen": -342.3999938964844, "logps/rejected": -395.79998779296875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.577050805091858, "rewards/margins": 9.065625190734863, "rewards/rejected": -10.640625, "step": 6350 }, { "epoch": 2.095551894563427, "grad_norm": 5.333487792258499, "learning_rate": 4.761943986820428e-07, "logits/chosen": -1.0631835460662842, "logits/rejected": -1.177734375, "logps/chosen": -352.70001220703125, "logps/rejected": -449.20001220703125, "loss": 0.0142, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.49017333984375, "rewards/margins": 8.928125381469727, "rewards/rejected": -10.4296875, "step": 6360 }, { "epoch": 2.098846787479407, "grad_norm": 1.2423915580151612, "learning_rate": 4.7537067545304776e-07, "logits/chosen": -1.0119140148162842, "logits/rejected": -1.2118651866912842, "logps/chosen": -362.20001220703125, "logps/rejected": -442.29998779296875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.7743316888809204, "rewards/margins": 9.259374618530273, "rewards/rejected": -10.03125, "step": 6370 }, { "epoch": 2.102141680395387, "grad_norm": 2.6058614352749903, "learning_rate": 4.7454695222405274e-07, "logits/chosen": -1.195214867591858, "logits/rejected": -1.3533203601837158, "logps/chosen": -333.1499938964844, "logps/rejected": -405.70001220703125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -1.059912085533142, "rewards/margins": 9.529687881469727, "rewards/rejected": -10.587499618530273, "step": 6380 }, { "epoch": 2.1054365733113674, "grad_norm": 0.14224053998759426, "learning_rate": 4.737232289950576e-07, "logits/chosen": -1.0211060047149658, "logits/rejected": -1.2951171398162842, "logps/chosen": -362.3500061035156, "logps/rejected": -448.29998779296875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -1.3800780773162842, "rewards/margins": 9.345312118530273, "rewards/rejected": -10.725000381469727, "step": 6390 }, { "epoch": 2.1087314662273475, "grad_norm": 0.396463006502659, "learning_rate": 4.728995057660626e-07, "logits/chosen": -1.064062476158142, "logits/rejected": -1.368749976158142, "logps/chosen": -353.1000061035156, "logps/rejected": -409.29998779296875, "loss": 0.0183, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7763183116912842, "rewards/margins": 9.4140625, "rewards/rejected": -11.193750381469727, "step": 6400 }, { "epoch": 2.1120263591433277, "grad_norm": 5.98464662353215, "learning_rate": 4.720757825370675e-07, "logits/chosen": -0.971875011920929, "logits/rejected": -1.2140624523162842, "logps/chosen": -348.29998779296875, "logps/rejected": -409.6000061035156, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -1.834228515625, "rewards/margins": 9.440625190734863, "rewards/rejected": -11.278124809265137, "step": 6410 }, { "epoch": 2.115321252059308, "grad_norm": 3.73636583112232, "learning_rate": 4.712520593080725e-07, "logits/chosen": -0.9966796636581421, "logits/rejected": -1.047949194908142, "logps/chosen": -321.54998779296875, "logps/rejected": -404.79998779296875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -2.384960889816284, "rewards/margins": 8.600000381469727, "rewards/rejected": -10.978124618530273, "step": 6420 }, { "epoch": 2.1186161449752885, "grad_norm": 1.9131863295687788, "learning_rate": 4.704283360790774e-07, "logits/chosen": -0.8351806402206421, "logits/rejected": -1.14794921875, "logps/chosen": -347.54998779296875, "logps/rejected": -413.6000061035156, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -1.906835913658142, "rewards/margins": 8.826562881469727, "rewards/rejected": -10.731249809265137, "step": 6430 }, { "epoch": 2.1219110378912687, "grad_norm": 4.395106056755237, "learning_rate": 4.6960461285008234e-07, "logits/chosen": -0.7099243402481079, "logits/rejected": -1.0768554210662842, "logps/chosen": -368.75, "logps/rejected": -409.04998779296875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.44427490234375, "rewards/margins": 8.817187309265137, "rewards/rejected": -10.25, "step": 6440 }, { "epoch": 2.125205930807249, "grad_norm": 13.749832617948018, "learning_rate": 4.687808896210873e-07, "logits/chosen": -0.780346691608429, "logits/rejected": -0.945721447467804, "logps/chosen": -396.04998779296875, "logps/rejected": -462.20001220703125, "loss": 0.0132, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6622314453125, "rewards/margins": 9.359375, "rewards/rejected": -11.024999618530273, "step": 6450 }, { "epoch": 2.128500823723229, "grad_norm": 5.510690402934815, "learning_rate": 4.6795716639209225e-07, "logits/chosen": -0.83154296875, "logits/rejected": -1.068017601966858, "logps/chosen": -339.1499938964844, "logps/rejected": -388.6000061035156, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -2.098828077316284, "rewards/margins": 8.75, "rewards/rejected": -10.850000381469727, "step": 6460 }, { "epoch": 2.131795716639209, "grad_norm": 0.7502191368372189, "learning_rate": 4.6713344316309717e-07, "logits/chosen": -0.8584960699081421, "logits/rejected": -1.0940430164337158, "logps/chosen": -348.6000061035156, "logps/rejected": -395.79998779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.0689454078674316, "rewards/margins": 9.051562309265137, "rewards/rejected": -11.118749618530273, "step": 6470 }, { "epoch": 2.1350906095551894, "grad_norm": 0.4530205843793937, "learning_rate": 4.663097199341021e-07, "logits/chosen": -0.759521484375, "logits/rejected": -1.026757836341858, "logps/chosen": -351.70001220703125, "logps/rejected": -420.0, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.9236328601837158, "rewards/margins": 9.171875, "rewards/rejected": -11.090624809265137, "step": 6480 }, { "epoch": 2.1383855024711695, "grad_norm": 7.645565276352228, "learning_rate": 4.6548599670510707e-07, "logits/chosen": -0.8244384527206421, "logits/rejected": -0.963574230670929, "logps/chosen": -363.04998779296875, "logps/rejected": -407.3999938964844, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -1.4449462890625, "rewards/margins": 9.1484375, "rewards/rejected": -10.606249809265137, "step": 6490 }, { "epoch": 2.1416803953871497, "grad_norm": 1.921905027184312, "learning_rate": 4.6466227347611205e-07, "logits/chosen": -0.8377929925918579, "logits/rejected": -1.041601538658142, "logps/chosen": -293.04998779296875, "logps/rejected": -386.3999938964844, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.750878930091858, "rewards/margins": 8.953125, "rewards/rejected": -10.707812309265137, "step": 6500 }, { "epoch": 2.1449752883031303, "grad_norm": 1.8780504565855194, "learning_rate": 4.638385502471169e-07, "logits/chosen": -0.7935791015625, "logits/rejected": -0.957763671875, "logps/chosen": -349.1000061035156, "logps/rejected": -471.5, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -1.6452147960662842, "rewards/margins": 9.370312690734863, "rewards/rejected": -11.009374618530273, "step": 6510 }, { "epoch": 2.1482701812191105, "grad_norm": 1.5064121670311204, "learning_rate": 4.630148270181219e-07, "logits/chosen": -0.734234631061554, "logits/rejected": -1.0654296875, "logps/chosen": -340.70001220703125, "logps/rejected": -413.6499938964844, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.108007788658142, "rewards/margins": 9.246874809265137, "rewards/rejected": -10.362500190734863, "step": 6520 }, { "epoch": 2.1515650741350907, "grad_norm": 0.700638577979129, "learning_rate": 4.621911037891268e-07, "logits/chosen": -0.8741210699081421, "logits/rejected": -1.1116211414337158, "logps/chosen": -342.25, "logps/rejected": -433.20001220703125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.9407470226287842, "rewards/margins": 9.371874809265137, "rewards/rejected": -11.309374809265137, "step": 6530 }, { "epoch": 2.154859967051071, "grad_norm": 3.362655736054884, "learning_rate": 4.613673805601318e-07, "logits/chosen": -0.9693359136581421, "logits/rejected": -1.1902344226837158, "logps/chosen": -368.25, "logps/rejected": -428.1000061035156, "loss": 0.0139, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.453369140625, "rewards/margins": 9.535937309265137, "rewards/rejected": -11.990625381469727, "step": 6540 }, { "epoch": 2.158154859967051, "grad_norm": 2.268389854804406, "learning_rate": 4.6054365733113673e-07, "logits/chosen": -0.823925793170929, "logits/rejected": -1.1037108898162842, "logps/chosen": -355.75, "logps/rejected": -453.8500061035156, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.044384717941284, "rewards/margins": 9.185937881469727, "rewards/rejected": -11.228124618530273, "step": 6550 }, { "epoch": 2.161449752883031, "grad_norm": 1.942803747057546, "learning_rate": 4.5971993410214165e-07, "logits/chosen": -0.89892578125, "logits/rejected": -1.1826171875, "logps/chosen": -368.8999938964844, "logps/rejected": -437.70001220703125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.626953125, "rewards/margins": 9.381250381469727, "rewards/rejected": -12.009374618530273, "step": 6560 }, { "epoch": 2.1647446457990114, "grad_norm": 6.6395583059265055, "learning_rate": 4.5889621087314663e-07, "logits/chosen": -0.8602539300918579, "logits/rejected": -1.043066382408142, "logps/chosen": -342.95001220703125, "logps/rejected": -404.8500061035156, "loss": 0.0203, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9501953125, "rewards/margins": 9.479687690734863, "rewards/rejected": -11.428125381469727, "step": 6570 }, { "epoch": 2.168039538714992, "grad_norm": 11.087340525267217, "learning_rate": 4.580724876441515e-07, "logits/chosen": -0.9434570074081421, "logits/rejected": -1.033593773841858, "logps/chosen": -348.25, "logps/rejected": -447.0, "loss": 0.0196, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.057592749595642, "rewards/margins": 8.989062309265137, "rewards/rejected": -10.042187690734863, "step": 6580 }, { "epoch": 2.171334431630972, "grad_norm": 2.6093341924818443, "learning_rate": 4.572487644151565e-07, "logits/chosen": -0.9029296636581421, "logits/rejected": -1.2197265625, "logps/chosen": -330.29998779296875, "logps/rejected": -388.54998779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.1680176258087158, "rewards/margins": 9.176562309265137, "rewards/rejected": -10.34375, "step": 6590 }, { "epoch": 2.1746293245469523, "grad_norm": 0.7063441076984118, "learning_rate": 4.564250411861614e-07, "logits/chosen": -0.908398449420929, "logits/rejected": -1.1580078601837158, "logps/chosen": -358.75, "logps/rejected": -404.1499938964844, "loss": 0.0086, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5262939929962158, "rewards/margins": 9.346875190734863, "rewards/rejected": -10.868749618530273, "step": 6600 }, { "epoch": 2.1779242174629325, "grad_norm": 0.34342262496234344, "learning_rate": 4.556013179571664e-07, "logits/chosen": -0.7397216558456421, "logits/rejected": -1.0388672351837158, "logps/chosen": -335.79998779296875, "logps/rejected": -374.79998779296875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.280798316001892, "rewards/margins": 8.982812881469727, "rewards/rejected": -10.265625, "step": 6610 }, { "epoch": 2.1812191103789127, "grad_norm": 3.5032325667810404, "learning_rate": 4.547775947281713e-07, "logits/chosen": -0.893481433391571, "logits/rejected": -1.0935547351837158, "logps/chosen": -342.1000061035156, "logps/rejected": -420.54998779296875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.980566382408142, "rewards/margins": 9.303125381469727, "rewards/rejected": -11.28125, "step": 6620 }, { "epoch": 2.184514003294893, "grad_norm": 1.117969029832842, "learning_rate": 4.5395387149917623e-07, "logits/chosen": -0.862353503704071, "logits/rejected": -1.041601538658142, "logps/chosen": -340.8500061035156, "logps/rejected": -403.1499938964844, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -2.219433546066284, "rewards/margins": 9.073437690734863, "rewards/rejected": -11.300000190734863, "step": 6630 }, { "epoch": 2.187808896210873, "grad_norm": 5.338342090912333, "learning_rate": 4.531301482701812e-07, "logits/chosen": -0.84912109375, "logits/rejected": -1.0063965320587158, "logps/chosen": -348.8999938964844, "logps/rejected": -439.79998779296875, "loss": 0.0079, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0269532203674316, "rewards/margins": 10.162500381469727, "rewards/rejected": -12.184374809265137, "step": 6640 }, { "epoch": 2.191103789126853, "grad_norm": 2.1145257968509568, "learning_rate": 4.5230642504118614e-07, "logits/chosen": -0.79638671875, "logits/rejected": -0.873095691204071, "logps/chosen": -353.70001220703125, "logps/rejected": -452.8999938964844, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.494921922683716, "rewards/margins": 9.487500190734863, "rewards/rejected": -11.975000381469727, "step": 6650 }, { "epoch": 2.1943986820428334, "grad_norm": 0.49185997210292737, "learning_rate": 4.5148270181219106e-07, "logits/chosen": -0.7855285406112671, "logits/rejected": -0.997851550579071, "logps/chosen": -381.75, "logps/rejected": -438.3999938964844, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -2.79296875, "rewards/margins": 9.478124618530273, "rewards/rejected": -12.28125, "step": 6660 }, { "epoch": 2.197693574958814, "grad_norm": 0.4839811524115673, "learning_rate": 4.5065897858319604e-07, "logits/chosen": -0.989208996295929, "logits/rejected": -1.1354491710662842, "logps/chosen": -317.45001220703125, "logps/rejected": -422.8500061035156, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.5289063453674316, "rewards/margins": 10.243749618530273, "rewards/rejected": -12.762499809265137, "step": 6670 }, { "epoch": 2.200988467874794, "grad_norm": 0.7647123003185892, "learning_rate": 4.4983525535420096e-07, "logits/chosen": -0.825488269329071, "logits/rejected": -1.0246093273162842, "logps/chosen": -378.79998779296875, "logps/rejected": -429.20001220703125, "loss": 0.0119, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.930810570716858, "rewards/margins": 9.065625190734863, "rewards/rejected": -10.993749618530273, "step": 6680 }, { "epoch": 2.2042833607907744, "grad_norm": 0.4960982905856486, "learning_rate": 4.4901153212520594e-07, "logits/chosen": -0.890820324420929, "logits/rejected": -1.0078125, "logps/chosen": -346.04998779296875, "logps/rejected": -463.1000061035156, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.4108397960662842, "rewards/margins": 9.529687881469727, "rewards/rejected": -10.943750381469727, "step": 6690 }, { "epoch": 2.2075782537067545, "grad_norm": 0.2794242653042014, "learning_rate": 4.481878088962108e-07, "logits/chosen": -0.7930663824081421, "logits/rejected": -0.9635009765625, "logps/chosen": -360.6000061035156, "logps/rejected": -387.20001220703125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.9010741710662842, "rewards/margins": 8.903124809265137, "rewards/rejected": -10.806249618530273, "step": 6700 }, { "epoch": 2.2108731466227347, "grad_norm": 1.8029229724039848, "learning_rate": 4.473640856672158e-07, "logits/chosen": -0.7042266726493835, "logits/rejected": -1.051513671875, "logps/chosen": -334.29998779296875, "logps/rejected": -408.5, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.376708984375, "rewards/margins": 8.65625, "rewards/rejected": -10.032812118530273, "step": 6710 }, { "epoch": 2.214168039538715, "grad_norm": 0.9908178169547321, "learning_rate": 4.465403624382207e-07, "logits/chosen": -0.9173828363418579, "logits/rejected": -1.1765625476837158, "logps/chosen": -390.3999938964844, "logps/rejected": -440.29998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.020410180091858, "rewards/margins": 10.181249618530273, "rewards/rejected": -11.203125, "step": 6720 }, { "epoch": 2.217462932454695, "grad_norm": 0.7992127416084193, "learning_rate": 4.457166392092257e-07, "logits/chosen": -0.83917236328125, "logits/rejected": -1.1287109851837158, "logps/chosen": -357.79998779296875, "logps/rejected": -382.20001220703125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -1.407617211341858, "rewards/margins": 9.246874809265137, "rewards/rejected": -10.6640625, "step": 6730 }, { "epoch": 2.2207578253706757, "grad_norm": 1.2661242610482992, "learning_rate": 4.448929159802306e-07, "logits/chosen": -0.824169933795929, "logits/rejected": -1.040624976158142, "logps/chosen": -336.3500061035156, "logps/rejected": -394.3999938964844, "loss": 0.0123, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6290283203125, "rewards/margins": 9.185937881469727, "rewards/rejected": -10.815625190734863, "step": 6740 }, { "epoch": 2.224052718286656, "grad_norm": 24.60613209329485, "learning_rate": 4.4406919275123555e-07, "logits/chosen": -0.711962878704071, "logits/rejected": -1.027734398841858, "logps/chosen": -349.3500061035156, "logps/rejected": -422.6499938964844, "loss": 0.0142, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7321288585662842, "rewards/margins": 9.965624809265137, "rewards/rejected": -11.696874618530273, "step": 6750 }, { "epoch": 2.227347611202636, "grad_norm": 4.41116623905839, "learning_rate": 4.432454695222405e-07, "logits/chosen": -0.878662109375, "logits/rejected": -1.1701171398162842, "logps/chosen": -348.54998779296875, "logps/rejected": -455.79998779296875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -2.378100633621216, "rewards/margins": 9.801562309265137, "rewards/rejected": -12.184374809265137, "step": 6760 }, { "epoch": 2.230642504118616, "grad_norm": 4.154192581314821, "learning_rate": 4.4242174629324545e-07, "logits/chosen": -1.02099609375, "logits/rejected": -1.1179687976837158, "logps/chosen": -339.3500061035156, "logps/rejected": -420.79998779296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -2.260058641433716, "rewards/margins": 9.654687881469727, "rewards/rejected": -11.912500381469727, "step": 6770 }, { "epoch": 2.2339373970345964, "grad_norm": 2.0368251280941667, "learning_rate": 4.4159802306425037e-07, "logits/chosen": -0.6382080316543579, "logits/rejected": -1.053808569908142, "logps/chosen": -332.8999938964844, "logps/rejected": -404.20001220703125, "loss": 0.0302, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5774414539337158, "rewards/margins": 9.479687690734863, "rewards/rejected": -11.056249618530273, "step": 6780 }, { "epoch": 2.2372322899505765, "grad_norm": 1.0242252648651944, "learning_rate": 4.4077429983525535e-07, "logits/chosen": -0.860247790813446, "logits/rejected": -1.1011230945587158, "logps/chosen": -338.29998779296875, "logps/rejected": -419.29998779296875, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.6796875, "rewards/margins": 9.0390625, "rewards/rejected": -10.714062690734863, "step": 6790 }, { "epoch": 2.2405271828665567, "grad_norm": 0.47059421487862524, "learning_rate": 4.399505766062603e-07, "logits/chosen": -0.864550769329071, "logits/rejected": -1.0158202648162842, "logps/chosen": -356.54998779296875, "logps/rejected": -439.1000061035156, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.973242163658142, "rewards/margins": 9.296875, "rewards/rejected": -11.265625, "step": 6800 }, { "epoch": 2.243822075782537, "grad_norm": 34.22779169941975, "learning_rate": 4.3912685337726525e-07, "logits/chosen": -0.749554455280304, "logits/rejected": -1.0705077648162842, "logps/chosen": -354.29998779296875, "logps/rejected": -412.5, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5471680164337158, "rewards/margins": 9.578125, "rewards/rejected": -11.125, "step": 6810 }, { "epoch": 2.247116968698517, "grad_norm": 1.0689840299113667, "learning_rate": 4.383031301482701e-07, "logits/chosen": -0.9737304449081421, "logits/rejected": -1.0954101085662842, "logps/chosen": -346.3999938964844, "logps/rejected": -416.1000061035156, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -2.229736328125, "rewards/margins": 9.475000381469727, "rewards/rejected": -11.699999809265137, "step": 6820 }, { "epoch": 2.2504118616144977, "grad_norm": 5.57699197318237, "learning_rate": 4.374794069192751e-07, "logits/chosen": -0.724316418170929, "logits/rejected": -0.978320300579071, "logps/chosen": -368.3999938964844, "logps/rejected": -432.29998779296875, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.6910400390625, "rewards/margins": 10.237500190734863, "rewards/rejected": -11.928125381469727, "step": 6830 }, { "epoch": 2.253706754530478, "grad_norm": 1.563313379156851, "learning_rate": 4.3665568369028003e-07, "logits/chosen": -1.0724608898162842, "logits/rejected": -1.176367163658142, "logps/chosen": -350.79998779296875, "logps/rejected": -430.1000061035156, "loss": 0.0096, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0650391578674316, "rewards/margins": 9.798437118530273, "rewards/rejected": -11.868749618530273, "step": 6840 }, { "epoch": 2.257001647446458, "grad_norm": 43.94613138571453, "learning_rate": 4.35831960461285e-07, "logits/chosen": -1.0012695789337158, "logits/rejected": -1.1476562023162842, "logps/chosen": -361.04998779296875, "logps/rejected": -428.8999938964844, "loss": 0.0135, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.373291015625, "rewards/margins": 9.240625381469727, "rewards/rejected": -11.620312690734863, "step": 6850 }, { "epoch": 2.260296540362438, "grad_norm": 11.770126738996906, "learning_rate": 4.3500823723228993e-07, "logits/chosen": -0.896044909954071, "logits/rejected": -1.150390625, "logps/chosen": -334.04998779296875, "logps/rejected": -446.0, "loss": 0.0077, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.892797827720642, "rewards/margins": 9.878125190734863, "rewards/rejected": -11.771875381469727, "step": 6860 }, { "epoch": 2.2635914332784184, "grad_norm": 0.2009388839759446, "learning_rate": 4.3418451400329486e-07, "logits/chosen": -0.733935534954071, "logits/rejected": -0.96044921875, "logps/chosen": -348.6499938964844, "logps/rejected": -454.1499938964844, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -2.169384717941284, "rewards/margins": 9.4296875, "rewards/rejected": -11.600000381469727, "step": 6870 }, { "epoch": 2.2668863261943986, "grad_norm": 0.47699557347463145, "learning_rate": 4.3336079077429983e-07, "logits/chosen": -0.86962890625, "logits/rejected": -1.1365234851837158, "logps/chosen": -371.29998779296875, "logps/rejected": -444.79998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.8787109851837158, "rewards/margins": 9.792187690734863, "rewards/rejected": -11.668749809265137, "step": 6880 }, { "epoch": 2.2701812191103787, "grad_norm": 2.505360316441085, "learning_rate": 4.3253706754530476e-07, "logits/chosen": -0.760449230670929, "logits/rejected": -0.9263916015625, "logps/chosen": -301.45001220703125, "logps/rejected": -426.6000061035156, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.5383789539337158, "rewards/margins": 9.731249809265137, "rewards/rejected": -11.271875381469727, "step": 6890 }, { "epoch": 2.2734761120263594, "grad_norm": 0.40223980120796216, "learning_rate": 4.317133443163097e-07, "logits/chosen": -0.8015381097793579, "logits/rejected": -1.0509765148162842, "logps/chosen": -328.04998779296875, "logps/rejected": -422.79998779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.784033179283142, "rewards/margins": 9.5625, "rewards/rejected": -11.348437309265137, "step": 6900 }, { "epoch": 2.2767710049423395, "grad_norm": 8.437870262031062, "learning_rate": 4.3088962108731466e-07, "logits/chosen": -0.84716796875, "logits/rejected": -1.131005883216858, "logps/chosen": -328.1000061035156, "logps/rejected": -394.6000061035156, "loss": 0.0258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4539062976837158, "rewards/margins": 9.478124618530273, "rewards/rejected": -10.935937881469727, "step": 6910 }, { "epoch": 2.2800658978583197, "grad_norm": 0.781154372777336, "learning_rate": 4.300658978583196e-07, "logits/chosen": -0.894604504108429, "logits/rejected": -1.0265624523162842, "logps/chosen": -359.5, "logps/rejected": -414.3999938964844, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -1.3147461414337158, "rewards/margins": 9.265625, "rewards/rejected": -10.578125, "step": 6920 }, { "epoch": 2.2833607907743, "grad_norm": 2.442975926074644, "learning_rate": 4.2924217462932456e-07, "logits/chosen": -0.8897460699081421, "logits/rejected": -1.1923828125, "logps/chosen": -374.8999938964844, "logps/rejected": -430.70001220703125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -1.7156250476837158, "rewards/margins": 9.732812881469727, "rewards/rejected": -11.456250190734863, "step": 6930 }, { "epoch": 2.28665568369028, "grad_norm": 0.9944283418658051, "learning_rate": 4.2841845140032944e-07, "logits/chosen": -0.8727051019668579, "logits/rejected": -1.1448242664337158, "logps/chosen": -385.8999938964844, "logps/rejected": -456.1000061035156, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -2.1366209983825684, "rewards/margins": 9.854687690734863, "rewards/rejected": -11.990625381469727, "step": 6940 }, { "epoch": 2.2899505766062602, "grad_norm": 1.0866455494194764, "learning_rate": 4.275947281713344e-07, "logits/chosen": -1.159570336341858, "logits/rejected": -1.314453125, "logps/chosen": -345.54998779296875, "logps/rejected": -416.5, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.2689452171325684, "rewards/margins": 9.676562309265137, "rewards/rejected": -11.946874618530273, "step": 6950 }, { "epoch": 2.2932454695222404, "grad_norm": 2.2304185025207803, "learning_rate": 4.2677100494233934e-07, "logits/chosen": -0.893261730670929, "logits/rejected": -1.2746093273162842, "logps/chosen": -360.8999938964844, "logps/rejected": -430.8999938964844, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.1078124046325684, "rewards/margins": 9.418749809265137, "rewards/rejected": -11.53125, "step": 6960 }, { "epoch": 2.2965403624382206, "grad_norm": 71.7805030340757, "learning_rate": 4.259472817133443e-07, "logits/chosen": -0.974316418170929, "logits/rejected": -1.3175780773162842, "logps/chosen": -341.70001220703125, "logps/rejected": -439.1000061035156, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -2.427734375, "rewards/margins": 9.71875, "rewards/rejected": -12.149999618530273, "step": 6970 }, { "epoch": 2.2998352553542007, "grad_norm": 1.3872633066620177, "learning_rate": 4.2512355848434924e-07, "logits/chosen": -1.006933569908142, "logits/rejected": -1.2781250476837158, "logps/chosen": -401.6499938964844, "logps/rejected": -427.6000061035156, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -2.62109375, "rewards/margins": 9.5078125, "rewards/rejected": -12.134374618530273, "step": 6980 }, { "epoch": 2.3031301482701814, "grad_norm": 0.4185594121795738, "learning_rate": 4.2429983525535417e-07, "logits/chosen": -0.8586791753768921, "logits/rejected": -1.2810547351837158, "logps/chosen": -370.6000061035156, "logps/rejected": -420.6000061035156, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.113085985183716, "rewards/margins": 9.556249618530273, "rewards/rejected": -11.681249618530273, "step": 6990 }, { "epoch": 2.3064250411861615, "grad_norm": 0.7694507590457582, "learning_rate": 4.2347611202635914e-07, "logits/chosen": -0.900097668170929, "logits/rejected": -1.16259765625, "logps/chosen": -350.79998779296875, "logps/rejected": -402.70001220703125, "loss": 0.0327, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9926269054412842, "rewards/margins": 8.949999809265137, "rewards/rejected": -10.943750381469727, "step": 7000 }, { "epoch": 2.3097199341021417, "grad_norm": 0.8754928825115146, "learning_rate": 4.2265238879736407e-07, "logits/chosen": -0.98583984375, "logits/rejected": -1.113012671470642, "logps/chosen": -294.6499938964844, "logps/rejected": -393.1499938964844, "loss": 0.0214, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.70556640625, "rewards/margins": 9.178125381469727, "rewards/rejected": -10.878125190734863, "step": 7010 }, { "epoch": 2.313014827018122, "grad_norm": 4.108953879221097, "learning_rate": 4.21828665568369e-07, "logits/chosen": -0.736523449420929, "logits/rejected": -1.150781273841858, "logps/chosen": -311.95001220703125, "logps/rejected": -415.3999938964844, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -1.9755859375, "rewards/margins": 9.707812309265137, "rewards/rejected": -11.6875, "step": 7020 }, { "epoch": 2.316309719934102, "grad_norm": 9.087578741792885, "learning_rate": 4.2100494233937397e-07, "logits/chosen": -1.0138671398162842, "logits/rejected": -1.172265648841858, "logps/chosen": -376.75, "logps/rejected": -448.20001220703125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.7282226085662842, "rewards/margins": 9.75, "rewards/rejected": -11.475000381469727, "step": 7030 }, { "epoch": 2.3196046128500822, "grad_norm": 2.432727289389618, "learning_rate": 4.201812191103789e-07, "logits/chosen": -1.008203148841858, "logits/rejected": -1.228515625, "logps/chosen": -364.95001220703125, "logps/rejected": -411.5, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.5362060070037842, "rewards/margins": 9.665624618530273, "rewards/rejected": -11.206250190734863, "step": 7040 }, { "epoch": 2.3228995057660624, "grad_norm": 1.4273299322088286, "learning_rate": 4.193574958813839e-07, "logits/chosen": -1.0505859851837158, "logits/rejected": -1.277734398841858, "logps/chosen": -334.75, "logps/rejected": -385.5, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -1.4871094226837158, "rewards/margins": 9.350000381469727, "rewards/rejected": -10.837499618530273, "step": 7050 }, { "epoch": 2.326194398682043, "grad_norm": 8.248164946655226, "learning_rate": 4.1853377265238875e-07, "logits/chosen": -0.8516601324081421, "logits/rejected": -0.988525390625, "logps/chosen": -340.04998779296875, "logps/rejected": -430.79998779296875, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.703832983970642, "rewards/margins": 9.3984375, "rewards/rejected": -11.100000381469727, "step": 7060 }, { "epoch": 2.329489291598023, "grad_norm": 6.902619544538205, "learning_rate": 4.177100494233937e-07, "logits/chosen": -0.9541015625, "logits/rejected": -1.1492187976837158, "logps/chosen": -380.20001220703125, "logps/rejected": -442.20001220703125, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.528417944908142, "rewards/margins": 9.457812309265137, "rewards/rejected": -10.987500190734863, "step": 7070 }, { "epoch": 2.3327841845140034, "grad_norm": 3.5966389918547685, "learning_rate": 4.1688632619439865e-07, "logits/chosen": -0.845410168170929, "logits/rejected": -1.058984398841858, "logps/chosen": -371.75, "logps/rejected": -426.5, "loss": 0.0181, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2407348155975342, "rewards/margins": 9.185937881469727, "rewards/rejected": -10.421875, "step": 7080 }, { "epoch": 2.3360790774299836, "grad_norm": 5.986652890010289, "learning_rate": 4.1606260296540363e-07, "logits/chosen": -0.660839855670929, "logits/rejected": -1.097265601158142, "logps/chosen": -327.54998779296875, "logps/rejected": -425.1499938964844, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.321142554283142, "rewards/margins": 9.740625381469727, "rewards/rejected": -11.064062118530273, "step": 7090 }, { "epoch": 2.3393739703459637, "grad_norm": 2.534709887737591, "learning_rate": 4.1523887973640855e-07, "logits/chosen": -0.7828124761581421, "logits/rejected": -1.023193359375, "logps/chosen": -332.25, "logps/rejected": -408.5, "loss": 0.0088, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.511132836341858, "rewards/margins": 9.309374809265137, "rewards/rejected": -10.815625190734863, "step": 7100 }, { "epoch": 2.342668863261944, "grad_norm": 8.189294037488622, "learning_rate": 4.144151565074135e-07, "logits/chosen": -0.813403308391571, "logits/rejected": -0.9966796636581421, "logps/chosen": -333.1000061035156, "logps/rejected": -445.70001220703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.6198241710662842, "rewards/margins": 9.956250190734863, "rewards/rejected": -11.571874618530273, "step": 7110 }, { "epoch": 2.345963756177924, "grad_norm": 0.10246079712945592, "learning_rate": 4.1359143327841846e-07, "logits/chosen": -0.712145984172821, "logits/rejected": -1.170312523841858, "logps/chosen": -363.29998779296875, "logps/rejected": -418.3999938964844, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -1.931677222251892, "rewards/margins": 10.067187309265137, "rewards/rejected": -12.0, "step": 7120 }, { "epoch": 2.3492586490939047, "grad_norm": 0.40292993899391205, "learning_rate": 4.127677100494234e-07, "logits/chosen": -0.751904308795929, "logits/rejected": -1.0759766101837158, "logps/chosen": -346.6000061035156, "logps/rejected": -423.20001220703125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.461523413658142, "rewards/margins": 9.696874618530273, "rewards/rejected": -11.15625, "step": 7130 }, { "epoch": 2.352553542009885, "grad_norm": 6.172329208848716, "learning_rate": 4.119439868204283e-07, "logits/chosen": -0.9482421875, "logits/rejected": -1.166015625, "logps/chosen": -318.54998779296875, "logps/rejected": -399.29998779296875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -1.6896483898162842, "rewards/margins": 9.274999618530273, "rewards/rejected": -10.965624809265137, "step": 7140 }, { "epoch": 2.355848434925865, "grad_norm": 3.100588575528722, "learning_rate": 4.111202635914333e-07, "logits/chosen": -0.896411120891571, "logits/rejected": -1.2705078125, "logps/chosen": -382.8999938964844, "logps/rejected": -429.0, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.539453148841858, "rewards/margins": 10.237500190734863, "rewards/rejected": -11.78125, "step": 7150 }, { "epoch": 2.359143327841845, "grad_norm": 6.926176434108999, "learning_rate": 4.102965403624382e-07, "logits/chosen": -0.8773437738418579, "logits/rejected": -1.084375023841858, "logps/chosen": -335.5, "logps/rejected": -435.45001220703125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -1.977563500404358, "rewards/margins": 10.293749809265137, "rewards/rejected": -12.28125, "step": 7160 }, { "epoch": 2.3624382207578254, "grad_norm": 3.3368667605995306, "learning_rate": 4.094728171334432e-07, "logits/chosen": -0.812512218952179, "logits/rejected": -1.205664038658142, "logps/chosen": -342.6499938964844, "logps/rejected": -379.1499938964844, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -1.7685546875, "rewards/margins": 9.467187881469727, "rewards/rejected": -11.234375, "step": 7170 }, { "epoch": 2.3657331136738056, "grad_norm": 6.634899035420172, "learning_rate": 4.0864909390444806e-07, "logits/chosen": -0.83709716796875, "logits/rejected": -1.1350586414337158, "logps/chosen": -331.3500061035156, "logps/rejected": -412.29998779296875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.030566453933716, "rewards/margins": 10.185937881469727, "rewards/rejected": -12.215624809265137, "step": 7180 }, { "epoch": 2.3690280065897857, "grad_norm": 5.401420942035176, "learning_rate": 4.0782537067545304e-07, "logits/chosen": -0.9574218988418579, "logits/rejected": -1.147070288658142, "logps/chosen": -378.25, "logps/rejected": -442.25, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -2.0504150390625, "rewards/margins": 9.809374809265137, "rewards/rejected": -11.853124618530273, "step": 7190 }, { "epoch": 2.372322899505766, "grad_norm": 16.349066072691766, "learning_rate": 4.0700164744645796e-07, "logits/chosen": -0.9044433832168579, "logits/rejected": -1.1916015148162842, "logps/chosen": -352.8500061035156, "logps/rejected": -442.1000061035156, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.306640625, "rewards/margins": 9.751562118530273, "rewards/rejected": -12.053125381469727, "step": 7200 }, { "epoch": 2.375617792421746, "grad_norm": 2.5283741424966557, "learning_rate": 4.0617792421746294e-07, "logits/chosen": -0.8705078363418579, "logits/rejected": -1.122460961341858, "logps/chosen": -365.9750061035156, "logps/rejected": -429.1499938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.0869140625, "rewards/margins": 9.149999618530273, "rewards/rejected": -11.225000381469727, "step": 7210 }, { "epoch": 2.3789126853377267, "grad_norm": 0.5398884850246706, "learning_rate": 4.0535420098846786e-07, "logits/chosen": -0.71734619140625, "logits/rejected": -1.078125, "logps/chosen": -326.54998779296875, "logps/rejected": -396.6000061035156, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.592382788658142, "rewards/margins": 9.604687690734863, "rewards/rejected": -11.203125, "step": 7220 }, { "epoch": 2.382207578253707, "grad_norm": 3.9892753655935054, "learning_rate": 4.045304777594728e-07, "logits/chosen": -0.978515625, "logits/rejected": -1.099609375, "logps/chosen": -333.20001220703125, "logps/rejected": -428.95001220703125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.8380858898162842, "rewards/margins": 9.949999809265137, "rewards/rejected": -11.793749809265137, "step": 7230 }, { "epoch": 2.385502471169687, "grad_norm": 0.4568918742495934, "learning_rate": 4.0370675453047777e-07, "logits/chosen": -0.792773425579071, "logits/rejected": -0.8766113519668579, "logps/chosen": -341.5, "logps/rejected": -466.6000061035156, "loss": 0.0114, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.456091284751892, "rewards/margins": 9.0703125, "rewards/rejected": -10.524999618530273, "step": 7240 }, { "epoch": 2.3887973640856672, "grad_norm": 1.0739436445047297, "learning_rate": 4.0288303130148264e-07, "logits/chosen": -0.8387695550918579, "logits/rejected": -0.986621081829071, "logps/chosen": -332.95001220703125, "logps/rejected": -397.1000061035156, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.28857421875, "rewards/margins": 9.199999809265137, "rewards/rejected": -10.485937118530273, "step": 7250 }, { "epoch": 2.3920922570016474, "grad_norm": 2.0042661256262293, "learning_rate": 4.020593080724876e-07, "logits/chosen": -0.7127319574356079, "logits/rejected": -1.0543944835662842, "logps/chosen": -398.1000061035156, "logps/rejected": -416.1000061035156, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.0555908679962158, "rewards/margins": 9.725000381469727, "rewards/rejected": -10.798437118530273, "step": 7260 }, { "epoch": 2.3953871499176276, "grad_norm": 9.785512584747632, "learning_rate": 4.012355848434926e-07, "logits/chosen": -0.858593761920929, "logits/rejected": -1.100000023841858, "logps/chosen": -383.8999938964844, "logps/rejected": -428.79998779296875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -1.8923828601837158, "rewards/margins": 10.165624618530273, "rewards/rejected": -12.059374809265137, "step": 7270 }, { "epoch": 2.3986820428336078, "grad_norm": 6.2884931761034535, "learning_rate": 4.004118616144975e-07, "logits/chosen": -0.835736095905304, "logits/rejected": -1.0876953601837158, "logps/chosen": -354.3500061035156, "logps/rejected": -440.0, "loss": 0.0109, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.220507860183716, "rewards/margins": 9.326562881469727, "rewards/rejected": -11.550000190734863, "step": 7280 }, { "epoch": 2.4019769357495884, "grad_norm": 0.29938721776595867, "learning_rate": 3.9958813838550244e-07, "logits/chosen": -0.772705078125, "logits/rejected": -1.040917992591858, "logps/chosen": -338.54998779296875, "logps/rejected": -431.3500061035156, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -2.030468702316284, "rewards/margins": 9.8515625, "rewards/rejected": -11.887499809265137, "step": 7290 }, { "epoch": 2.4052718286655685, "grad_norm": 0.7022997824518632, "learning_rate": 3.9876441515650737e-07, "logits/chosen": -0.785107433795929, "logits/rejected": -1.099218726158142, "logps/chosen": -360.8500061035156, "logps/rejected": -435.6499938964844, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.182421922683716, "rewards/margins": 9.792187690734863, "rewards/rejected": -11.981249809265137, "step": 7300 }, { "epoch": 2.4085667215815487, "grad_norm": 0.20321788533736856, "learning_rate": 3.9794069192751235e-07, "logits/chosen": -0.609375, "logits/rejected": -1.094335913658142, "logps/chosen": -359.6499938964844, "logps/rejected": -430.8999938964844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.566601514816284, "rewards/margins": 9.873437881469727, "rewards/rejected": -12.4375, "step": 7310 }, { "epoch": 2.411861614497529, "grad_norm": 3.722650305720516, "learning_rate": 3.9711696869851727e-07, "logits/chosen": -0.7688964605331421, "logits/rejected": -0.964404284954071, "logps/chosen": -349.04998779296875, "logps/rejected": -437.8999938964844, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -2.371386766433716, "rewards/margins": 10.243749618530273, "rewards/rejected": -12.612500190734863, "step": 7320 }, { "epoch": 2.415156507413509, "grad_norm": 6.693076397619086, "learning_rate": 3.962932454695222e-07, "logits/chosen": -0.7305663824081421, "logits/rejected": -0.991406261920929, "logps/chosen": -405.8500061035156, "logps/rejected": -473.70001220703125, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.408398389816284, "rewards/margins": 9.6015625, "rewards/rejected": -12.006250381469727, "step": 7330 }, { "epoch": 2.4184514003294892, "grad_norm": 3.0336849664088033, "learning_rate": 3.954695222405272e-07, "logits/chosen": -0.76318359375, "logits/rejected": -0.921582043170929, "logps/chosen": -350.20001220703125, "logps/rejected": -421.8999938964844, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.6726562976837158, "rewards/margins": 9.610937118530273, "rewards/rejected": -11.290624618530273, "step": 7340 }, { "epoch": 2.4217462932454694, "grad_norm": 11.294795948813041, "learning_rate": 3.946457990115321e-07, "logits/chosen": -0.648510754108429, "logits/rejected": -0.9825195074081421, "logps/chosen": -375.79998779296875, "logps/rejected": -429.70001220703125, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.50830078125, "rewards/margins": 9.203125, "rewards/rejected": -10.701562881469727, "step": 7350 }, { "epoch": 2.4250411861614496, "grad_norm": 0.5174271290765481, "learning_rate": 3.938220757825371e-07, "logits/chosen": -0.781933605670929, "logits/rejected": -1.1116211414337158, "logps/chosen": -311.3999938964844, "logps/rejected": -372.3999938964844, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.950781226158142, "rewards/margins": 9.0234375, "rewards/rejected": -10.984375, "step": 7360 }, { "epoch": 2.4283360790774298, "grad_norm": 0.47207550125447223, "learning_rate": 3.9299835255354195e-07, "logits/chosen": -0.725878894329071, "logits/rejected": -0.972705066204071, "logps/chosen": -343.04998779296875, "logps/rejected": -379.1000061035156, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.1938965320587158, "rewards/margins": 9.028124809265137, "rewards/rejected": -10.21875, "step": 7370 }, { "epoch": 2.4316309719934104, "grad_norm": 4.261641567735967, "learning_rate": 3.9217462932454693e-07, "logits/chosen": -0.85107421875, "logits/rejected": -1.083593726158142, "logps/chosen": -350.20001220703125, "logps/rejected": -453.1000061035156, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -1.7541015148162842, "rewards/margins": 9.524999618530273, "rewards/rejected": -11.278124809265137, "step": 7380 }, { "epoch": 2.4349258649093906, "grad_norm": 1.2958903764853904, "learning_rate": 3.913509060955519e-07, "logits/chosen": -0.8697265386581421, "logits/rejected": -1.090917944908142, "logps/chosen": -367.5, "logps/rejected": -403.3999938964844, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.769628882408142, "rewards/margins": 9.254687309265137, "rewards/rejected": -11.028124809265137, "step": 7390 }, { "epoch": 2.4382207578253707, "grad_norm": 2.674603473382133, "learning_rate": 3.9052718286655683e-07, "logits/chosen": -0.8642578125, "logits/rejected": -1.086523413658142, "logps/chosen": -386.04998779296875, "logps/rejected": -435.5, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.354833960533142, "rewards/margins": 9.145312309265137, "rewards/rejected": -10.503125190734863, "step": 7400 }, { "epoch": 2.441515650741351, "grad_norm": 0.789019234036371, "learning_rate": 3.8970345963756176e-07, "logits/chosen": -0.8374481201171875, "logits/rejected": -1.1288573741912842, "logps/chosen": -326.8999938964844, "logps/rejected": -424.20001220703125, "loss": 0.0212, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1480469703674316, "rewards/margins": 9.629687309265137, "rewards/rejected": -11.768750190734863, "step": 7410 }, { "epoch": 2.444810543657331, "grad_norm": 3.448576573746464, "learning_rate": 3.888797364085667e-07, "logits/chosen": -0.827807605266571, "logits/rejected": -1.1416504383087158, "logps/chosen": -354.45001220703125, "logps/rejected": -432.6000061035156, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.2724609375, "rewards/margins": 10.467187881469727, "rewards/rejected": -12.734375, "step": 7420 }, { "epoch": 2.4481054365733113, "grad_norm": 1.6262599981599812, "learning_rate": 3.8805601317957166e-07, "logits/chosen": -0.828808605670929, "logits/rejected": -1.1935546398162842, "logps/chosen": -368.1000061035156, "logps/rejected": -438.75, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -1.843115210533142, "rewards/margins": 9.756250381469727, "rewards/rejected": -11.6015625, "step": 7430 }, { "epoch": 2.4514003294892914, "grad_norm": 0.3973059300756358, "learning_rate": 3.872322899505766e-07, "logits/chosen": -0.7310546636581421, "logits/rejected": -1.1594727039337158, "logps/chosen": -356.95001220703125, "logps/rejected": -413.6499938964844, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -1.5434081554412842, "rewards/margins": 9.557812690734863, "rewards/rejected": -11.096875190734863, "step": 7440 }, { "epoch": 2.454695222405272, "grad_norm": 0.3396320569264985, "learning_rate": 3.864085667215815e-07, "logits/chosen": -0.880444347858429, "logits/rejected": -1.153906226158142, "logps/chosen": -350.1499938964844, "logps/rejected": -404.1000061035156, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -1.470239281654358, "rewards/margins": 9.270312309265137, "rewards/rejected": -10.737500190734863, "step": 7450 }, { "epoch": 2.4579901153212522, "grad_norm": 2.1353966966659352, "learning_rate": 3.855848434925865e-07, "logits/chosen": -0.908007800579071, "logits/rejected": -1.1162109375, "logps/chosen": -344.45001220703125, "logps/rejected": -449.8999938964844, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -1.8944580554962158, "rewards/margins": 10.103124618530273, "rewards/rejected": -12.003125190734863, "step": 7460 }, { "epoch": 2.4612850082372324, "grad_norm": 5.331527411923286, "learning_rate": 3.847611202635914e-07, "logits/chosen": -0.8993164300918579, "logits/rejected": -1.05859375, "logps/chosen": -366.29998779296875, "logps/rejected": -470.20001220703125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -1.7100830078125, "rewards/margins": 9.753125190734863, "rewards/rejected": -11.462499618530273, "step": 7470 }, { "epoch": 2.4645799011532126, "grad_norm": 0.6869246850445511, "learning_rate": 3.839373970345964e-07, "logits/chosen": -0.898144543170929, "logits/rejected": -1.1941406726837158, "logps/chosen": -367.75, "logps/rejected": -447.0, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -1.859375, "rewards/margins": 9.625, "rewards/rejected": -11.493749618530273, "step": 7480 }, { "epoch": 2.4678747940691927, "grad_norm": 5.638803156845093, "learning_rate": 3.8311367380560126e-07, "logits/chosen": -0.993457019329071, "logits/rejected": -1.171484351158142, "logps/chosen": -341.1000061035156, "logps/rejected": -456.6000061035156, "loss": 0.0094, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.075671434402466, "rewards/margins": 9.9609375, "rewards/rejected": -12.056249618530273, "step": 7490 }, { "epoch": 2.471169686985173, "grad_norm": 5.252553056017753, "learning_rate": 3.8228995057660624e-07, "logits/chosen": -0.8139892816543579, "logits/rejected": -1.138281226158142, "logps/chosen": -340.75, "logps/rejected": -421.0, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -2.039294481277466, "rewards/margins": 9.982812881469727, "rewards/rejected": -12.024999618530273, "step": 7500 }, { "epoch": 2.474464579901153, "grad_norm": 2.8603805142213523, "learning_rate": 3.814662273476112e-07, "logits/chosen": -0.984570324420929, "logits/rejected": -1.1242187023162842, "logps/chosen": -341.75, "logps/rejected": -419.79998779296875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.171875, "rewards/margins": 9.6640625, "rewards/rejected": -11.828125, "step": 7510 }, { "epoch": 2.4777594728171333, "grad_norm": 1.0709422917054132, "learning_rate": 3.8064250411861614e-07, "logits/chosen": -0.804003894329071, "logits/rejected": -1.241796851158142, "logps/chosen": -315.57501220703125, "logps/rejected": -409.95001220703125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -2.023632764816284, "rewards/margins": 9.5703125, "rewards/rejected": -11.587499618530273, "step": 7520 }, { "epoch": 2.4810543657331134, "grad_norm": 1.6480799037792442, "learning_rate": 3.7981878088962107e-07, "logits/chosen": -0.8163818120956421, "logits/rejected": -1.292382836341858, "logps/chosen": -333.0, "logps/rejected": -400.54998779296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.7554931640625, "rewards/margins": 10.237500190734863, "rewards/rejected": -11.990625381469727, "step": 7530 }, { "epoch": 2.484349258649094, "grad_norm": 1.5437912366229785, "learning_rate": 3.78995057660626e-07, "logits/chosen": -0.9195312261581421, "logits/rejected": -1.2683594226837158, "logps/chosen": -350.1499938964844, "logps/rejected": -432.79998779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.756005883216858, "rewards/margins": 10.3359375, "rewards/rejected": -12.096875190734863, "step": 7540 }, { "epoch": 2.4876441515650742, "grad_norm": 1.702720076754409, "learning_rate": 3.7817133443163097e-07, "logits/chosen": -0.8446899652481079, "logits/rejected": -1.271484375, "logps/chosen": -328.20001220703125, "logps/rejected": -457.8999938964844, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.3646483421325684, "rewards/margins": 10.421875, "rewards/rejected": -12.78125, "step": 7550 }, { "epoch": 2.4909390444810544, "grad_norm": 7.7734342211831855, "learning_rate": 3.773476112026359e-07, "logits/chosen": -0.955737292766571, "logits/rejected": -1.148535132408142, "logps/chosen": -310.04998779296875, "logps/rejected": -430.29998779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.361523389816284, "rewards/margins": 10.328125, "rewards/rejected": -12.684374809265137, "step": 7560 }, { "epoch": 2.4942339373970346, "grad_norm": 0.6635673549916647, "learning_rate": 3.765238879736408e-07, "logits/chosen": -0.939404308795929, "logits/rejected": -1.260351538658142, "logps/chosen": -381.6499938964844, "logps/rejected": -445.0, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.5127930641174316, "rewards/margins": 9.985937118530273, "rewards/rejected": -12.509374618530273, "step": 7570 }, { "epoch": 2.4975288303130148, "grad_norm": 24.76087292815326, "learning_rate": 3.757001647446458e-07, "logits/chosen": -0.7447509765625, "logits/rejected": -1.141210913658142, "logps/chosen": -360.0, "logps/rejected": -452.0, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -2.3484864234924316, "rewards/margins": 10.134374618530273, "rewards/rejected": -12.493749618530273, "step": 7580 }, { "epoch": 2.500823723228995, "grad_norm": 1.3574305714137422, "learning_rate": 3.748764415156507e-07, "logits/chosen": -0.9439452886581421, "logits/rejected": -1.1047852039337158, "logps/chosen": -336.1000061035156, "logps/rejected": -410.1000061035156, "loss": 0.0192, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.888281226158142, "rewards/margins": 9.346875190734863, "rewards/rejected": -11.243749618530273, "step": 7590 }, { "epoch": 2.504118616144975, "grad_norm": 7.5156301003310295, "learning_rate": 3.740527182866557e-07, "logits/chosen": -0.86279296875, "logits/rejected": -1.280664086341858, "logps/chosen": -330.29998779296875, "logps/rejected": -397.8999938964844, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.56201171875, "rewards/margins": 9.798437118530273, "rewards/rejected": -11.365625381469727, "step": 7600 }, { "epoch": 2.5074135090609557, "grad_norm": 0.3926305357934441, "learning_rate": 3.7322899505766057e-07, "logits/chosen": -0.83984375, "logits/rejected": -1.142480492591858, "logps/chosen": -359.5, "logps/rejected": -415.6000061035156, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -1.621337890625, "rewards/margins": 10.0078125, "rewards/rejected": -11.625, "step": 7610 }, { "epoch": 2.510708401976936, "grad_norm": 9.880543689572827, "learning_rate": 3.7240527182866555e-07, "logits/chosen": -0.858593761920929, "logits/rejected": -1.0674560070037842, "logps/chosen": -373.07501220703125, "logps/rejected": -470.3999938964844, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.295117139816284, "rewards/margins": 10.454687118530273, "rewards/rejected": -12.753125190734863, "step": 7620 }, { "epoch": 2.514003294892916, "grad_norm": 1.1224546737982577, "learning_rate": 3.7158154859967053e-07, "logits/chosen": -0.68505859375, "logits/rejected": -0.98779296875, "logps/chosen": -373.8500061035156, "logps/rejected": -422.3999938964844, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -1.879003882408142, "rewards/margins": 9.557812690734863, "rewards/rejected": -11.4375, "step": 7630 }, { "epoch": 2.5172981878088962, "grad_norm": 0.4334401877985769, "learning_rate": 3.7075782537067545e-07, "logits/chosen": -0.912384033203125, "logits/rejected": -1.150781273841858, "logps/chosen": -375.1499938964844, "logps/rejected": -438.3999938964844, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.921484351158142, "rewards/margins": 10.404687881469727, "rewards/rejected": -12.318750381469727, "step": 7640 }, { "epoch": 2.5205930807248764, "grad_norm": 1.1194068692176082, "learning_rate": 3.699341021416804e-07, "logits/chosen": -0.9078124761581421, "logits/rejected": -1.259374976158142, "logps/chosen": -341.29998779296875, "logps/rejected": -460.70001220703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.6792969703674316, "rewards/margins": 10.948437690734863, "rewards/rejected": -13.634374618530273, "step": 7650 }, { "epoch": 2.5238879736408566, "grad_norm": 0.492818008173205, "learning_rate": 3.691103789126853e-07, "logits/chosen": -0.823925793170929, "logits/rejected": -1.2423827648162842, "logps/chosen": -352.1499938964844, "logps/rejected": -439.70001220703125, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -2.3077149391174316, "rewards/margins": 9.629687309265137, "rewards/rejected": -11.926562309265137, "step": 7660 }, { "epoch": 2.5271828665568368, "grad_norm": 2.205091482408319, "learning_rate": 3.682866556836903e-07, "logits/chosen": -0.933789074420929, "logits/rejected": -1.0431640148162842, "logps/chosen": -317.6499938964844, "logps/rejected": -445.0, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -1.773718237876892, "rewards/margins": 9.5390625, "rewards/rejected": -11.324999809265137, "step": 7670 }, { "epoch": 2.5304777594728174, "grad_norm": 9.987966205906213, "learning_rate": 3.674629324546952e-07, "logits/chosen": -1.022363305091858, "logits/rejected": -1.249609351158142, "logps/chosen": -346.1499938964844, "logps/rejected": -399.45001220703125, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -2.4498047828674316, "rewards/margins": 9.865625381469727, "rewards/rejected": -12.3125, "step": 7680 }, { "epoch": 2.533772652388797, "grad_norm": 0.33289521709402303, "learning_rate": 3.6663920922570013e-07, "logits/chosen": -0.9796508550643921, "logits/rejected": -1.219140648841858, "logps/chosen": -370.29998779296875, "logps/rejected": -456.29998779296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.1758790016174316, "rewards/margins": 10.443750381469727, "rewards/rejected": -12.612500190734863, "step": 7690 }, { "epoch": 2.5370675453047777, "grad_norm": 1.2036601195436307, "learning_rate": 3.658154859967051e-07, "logits/chosen": -0.8631347417831421, "logits/rejected": -1.1124999523162842, "logps/chosen": -388.6000061035156, "logps/rejected": -449.5, "loss": 0.0099, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.867334008216858, "rewards/margins": 10.331250190734863, "rewards/rejected": -12.199999809265137, "step": 7700 }, { "epoch": 2.540362438220758, "grad_norm": 2.8399342674577985, "learning_rate": 3.6499176276771003e-07, "logits/chosen": -0.9559570550918579, "logits/rejected": -1.276464819908142, "logps/chosen": -325.3500061035156, "logps/rejected": -410.6000061035156, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -2.407421827316284, "rewards/margins": 10.0625, "rewards/rejected": -12.482812881469727, "step": 7710 }, { "epoch": 2.543657331136738, "grad_norm": 1.5045933334140025, "learning_rate": 3.64168039538715e-07, "logits/chosen": -0.779223620891571, "logits/rejected": -1.111718773841858, "logps/chosen": -368.1000061035156, "logps/rejected": -440.3999938964844, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.028125047683716, "rewards/margins": 10.014062881469727, "rewards/rejected": -12.043749809265137, "step": 7720 }, { "epoch": 2.5469522240527183, "grad_norm": 1.2609080098613696, "learning_rate": 3.633443163097199e-07, "logits/chosen": -0.7906738519668579, "logits/rejected": -1.1985352039337158, "logps/chosen": -339.5, "logps/rejected": -404.79998779296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.8974120616912842, "rewards/margins": 10.209375381469727, "rewards/rejected": -12.107812881469727, "step": 7730 }, { "epoch": 2.5502471169686984, "grad_norm": 0.19592076664403021, "learning_rate": 3.6252059308072486e-07, "logits/chosen": -0.9394165277481079, "logits/rejected": -1.238671898841858, "logps/chosen": -357.70001220703125, "logps/rejected": -422.0, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -2.0091795921325684, "rewards/margins": 9.884374618530273, "rewards/rejected": -11.896875381469727, "step": 7740 }, { "epoch": 2.5535420098846786, "grad_norm": 2.303435795894461, "learning_rate": 3.6169686985172984e-07, "logits/chosen": -1.0460205078125, "logits/rejected": -1.2509765625, "logps/chosen": -317.3500061035156, "logps/rejected": -419.1000061035156, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.710205078125, "rewards/margins": 9.675000190734863, "rewards/rejected": -11.384374618530273, "step": 7750 }, { "epoch": 2.556836902800659, "grad_norm": 17.10534238153856, "learning_rate": 3.6087314662273476e-07, "logits/chosen": -1.180566430091858, "logits/rejected": -1.3214843273162842, "logps/chosen": -334.1000061035156, "logps/rejected": -396.79998779296875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -1.5352294445037842, "rewards/margins": 9.532812118530273, "rewards/rejected": -11.071874618530273, "step": 7760 }, { "epoch": 2.5601317957166394, "grad_norm": 1.9903002746601592, "learning_rate": 3.600494233937397e-07, "logits/chosen": -0.9002441167831421, "logits/rejected": -1.3171875476837158, "logps/chosen": -345.8999938964844, "logps/rejected": -435.20001220703125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -1.6720092296600342, "rewards/margins": 9.481249809265137, "rewards/rejected": -11.149999618530273, "step": 7770 }, { "epoch": 2.5634266886326196, "grad_norm": 0.48470924881409655, "learning_rate": 3.592257001647446e-07, "logits/chosen": -1.0830078125, "logits/rejected": -1.296289086341858, "logps/chosen": -352.25, "logps/rejected": -415.29998779296875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.2425780296325684, "rewards/margins": 10.024999618530273, "rewards/rejected": -12.268750190734863, "step": 7780 }, { "epoch": 2.5667215815485998, "grad_norm": 0.6116766312392135, "learning_rate": 3.584019769357496e-07, "logits/chosen": -0.9798339605331421, "logits/rejected": -1.115234375, "logps/chosen": -336.1499938964844, "logps/rejected": -438.0, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.358227491378784, "rewards/margins": 10.135937690734863, "rewards/rejected": -12.5, "step": 7790 }, { "epoch": 2.57001647446458, "grad_norm": 3.773669291342645, "learning_rate": 3.575782537067545e-07, "logits/chosen": -1.0674316883087158, "logits/rejected": -1.2292969226837158, "logps/chosen": -354.3500061035156, "logps/rejected": -425.1499938964844, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -2.0738282203674316, "rewards/margins": 9.709375381469727, "rewards/rejected": -11.778124809265137, "step": 7800 }, { "epoch": 2.57331136738056, "grad_norm": 5.0655858025745495, "learning_rate": 3.5675453047775944e-07, "logits/chosen": -0.821484386920929, "logits/rejected": -1.159765601158142, "logps/chosen": -355.70001220703125, "logps/rejected": -413.5, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -2.0860838890075684, "rewards/margins": 9.65625, "rewards/rejected": -11.740625381469727, "step": 7810 }, { "epoch": 2.5766062602965403, "grad_norm": 1.5726046399982692, "learning_rate": 3.559308072487644e-07, "logits/chosen": -0.9337402582168579, "logits/rejected": -1.1199219226837158, "logps/chosen": -341.32501220703125, "logps/rejected": -409.1000061035156, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3792724609375, "rewards/margins": 9.8125, "rewards/rejected": -11.193750381469727, "step": 7820 }, { "epoch": 2.5799011532125204, "grad_norm": 0.2461313491704167, "learning_rate": 3.5510708401976934e-07, "logits/chosen": -0.91943359375, "logits/rejected": -1.041015625, "logps/chosen": -367.20001220703125, "logps/rejected": -428.20001220703125, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.083581566810608, "rewards/margins": 9.732812881469727, "rewards/rejected": -10.815625190734863, "step": 7830 }, { "epoch": 2.583196046128501, "grad_norm": 1.222577118692114, "learning_rate": 3.542833607907743e-07, "logits/chosen": -0.883105456829071, "logits/rejected": -1.2179687023162842, "logps/chosen": -359.8999938964844, "logps/rejected": -420.5, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.552343726158142, "rewards/margins": 9.979687690734863, "rewards/rejected": -11.528124809265137, "step": 7840 }, { "epoch": 2.586490939044481, "grad_norm": 0.8164697264957274, "learning_rate": 3.534596375617792e-07, "logits/chosen": -0.8206542730331421, "logits/rejected": -1.0846679210662842, "logps/chosen": -340.29998779296875, "logps/rejected": -447.79998779296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.7264007329940796, "rewards/margins": 9.667187690734863, "rewards/rejected": -11.384374618530273, "step": 7850 }, { "epoch": 2.5897858319604614, "grad_norm": 1.9852252785421207, "learning_rate": 3.5263591433278417e-07, "logits/chosen": -0.854785144329071, "logits/rejected": -1.128515601158142, "logps/chosen": -282.04998779296875, "logps/rejected": -372.3500061035156, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -1.841406226158142, "rewards/margins": 9.15625, "rewards/rejected": -11.003125190734863, "step": 7860 }, { "epoch": 2.5930807248764416, "grad_norm": 9.97553910161928, "learning_rate": 3.5181219110378915e-07, "logits/chosen": -0.9078613519668579, "logits/rejected": -1.150390625, "logps/chosen": -357.29998779296875, "logps/rejected": -430.5, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.72998046875, "rewards/margins": 9.685937881469727, "rewards/rejected": -11.415624618530273, "step": 7870 }, { "epoch": 2.5963756177924218, "grad_norm": 0.45781342205492254, "learning_rate": 3.509884678747941e-07, "logits/chosen": -0.8609374761581421, "logits/rejected": -1.134765625, "logps/chosen": -331.8999938964844, "logps/rejected": -419.0, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -1.879113793373108, "rewards/margins": 10.109375, "rewards/rejected": -12.0, "step": 7880 }, { "epoch": 2.599670510708402, "grad_norm": 0.48738283763697754, "learning_rate": 3.50164744645799e-07, "logits/chosen": -0.817187488079071, "logits/rejected": -1.1818358898162842, "logps/chosen": -378.3999938964844, "logps/rejected": -408.75, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.28466796875, "rewards/margins": 9.857812881469727, "rewards/rejected": -12.149999618530273, "step": 7890 }, { "epoch": 2.602965403624382, "grad_norm": 0.7119071741474291, "learning_rate": 3.493410214168039e-07, "logits/chosen": -0.9129883050918579, "logits/rejected": -1.0499999523162842, "logps/chosen": -332.70001220703125, "logps/rejected": -416.8999938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.521655321121216, "rewards/margins": 9.8046875, "rewards/rejected": -12.332812309265137, "step": 7900 }, { "epoch": 2.6062602965403623, "grad_norm": 4.031577732169987, "learning_rate": 3.485172981878089e-07, "logits/chosen": -0.8953613042831421, "logits/rejected": -1.16033935546875, "logps/chosen": -335.8500061035156, "logps/rejected": -453.8999938964844, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -2.570507764816284, "rewards/margins": 10.184374809265137, "rewards/rejected": -12.753125190734863, "step": 7910 }, { "epoch": 2.6095551894563425, "grad_norm": 0.4993266610593257, "learning_rate": 3.476935749588138e-07, "logits/chosen": -0.918347179889679, "logits/rejected": -1.203515648841858, "logps/chosen": -370.75, "logps/rejected": -451.20001220703125, "loss": 0.0262, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0687499046325684, "rewards/margins": 9.675000190734863, "rewards/rejected": -12.731249809265137, "step": 7920 }, { "epoch": 2.612850082372323, "grad_norm": 6.119811250960395, "learning_rate": 3.4686985172981875e-07, "logits/chosen": -0.996777355670929, "logits/rejected": -1.2439453601837158, "logps/chosen": -365.8999938964844, "logps/rejected": -445.6499938964844, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.818359375, "rewards/margins": 10.046875, "rewards/rejected": -12.862500190734863, "step": 7930 }, { "epoch": 2.6161449752883033, "grad_norm": 0.935407082591678, "learning_rate": 3.4604612850082373e-07, "logits/chosen": -1.075585961341858, "logits/rejected": -1.284765601158142, "logps/chosen": -349.0, "logps/rejected": -416.3999938964844, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -2.406933546066284, "rewards/margins": 9.524999618530273, "rewards/rejected": -11.925000190734863, "step": 7940 }, { "epoch": 2.6194398682042834, "grad_norm": 0.7992274484439083, "learning_rate": 3.4522240527182865e-07, "logits/chosen": -0.9571288824081421, "logits/rejected": -1.143164038658142, "logps/chosen": -333.0, "logps/rejected": -433.6000061035156, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.097460985183716, "rewards/margins": 10.40625, "rewards/rejected": -12.506250381469727, "step": 7950 }, { "epoch": 2.6227347611202636, "grad_norm": 1.044988132312211, "learning_rate": 3.443986820428336e-07, "logits/chosen": -0.95068359375, "logits/rejected": -1.3283202648162842, "logps/chosen": -361.6000061035156, "logps/rejected": -445.79998779296875, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.809375047683716, "rewards/margins": 9.721875190734863, "rewards/rejected": -12.543749809265137, "step": 7960 }, { "epoch": 2.6260296540362438, "grad_norm": 0.4905697444595223, "learning_rate": 3.435749588138385e-07, "logits/chosen": -1.0490233898162842, "logits/rejected": -1.2208983898162842, "logps/chosen": -341.0, "logps/rejected": -404.3999938964844, "loss": 0.0134, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.397656202316284, "rewards/margins": 9.621874809265137, "rewards/rejected": -12.024999618530273, "step": 7970 }, { "epoch": 2.629324546952224, "grad_norm": 0.49852824910634624, "learning_rate": 3.427512355848435e-07, "logits/chosen": -0.950457751750946, "logits/rejected": -1.2060546875, "logps/chosen": -322.3500061035156, "logps/rejected": -426.20001220703125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.584765672683716, "rewards/margins": 9.846875190734863, "rewards/rejected": -12.434374809265137, "step": 7980 }, { "epoch": 2.632619439868204, "grad_norm": 0.3032004754281449, "learning_rate": 3.4192751235584846e-07, "logits/chosen": -0.841796875, "logits/rejected": -1.295312523841858, "logps/chosen": -344.75, "logps/rejected": -407.3999938964844, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.086621046066284, "rewards/margins": 9.912500381469727, "rewards/rejected": -12.0, "step": 7990 }, { "epoch": 2.6359143327841847, "grad_norm": 2.1633573251841254, "learning_rate": 3.4110378912685333e-07, "logits/chosen": -0.9161132574081421, "logits/rejected": -1.0797851085662842, "logps/chosen": -367.3500061035156, "logps/rejected": -464.29998779296875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -2.6412110328674316, "rewards/margins": 9.856249809265137, "rewards/rejected": -12.484375, "step": 8000 }, { "epoch": 2.6392092257001645, "grad_norm": 87.66861877871106, "learning_rate": 3.402800658978583e-07, "logits/chosen": -0.7848755121231079, "logits/rejected": -1.1213867664337158, "logps/chosen": -365.79998779296875, "logps/rejected": -428.70001220703125, "loss": 0.0215, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6009764671325684, "rewards/margins": 9.162500381469727, "rewards/rejected": -11.762499809265137, "step": 8010 }, { "epoch": 2.642504118616145, "grad_norm": 2.7458957635371006, "learning_rate": 3.3945634266886324e-07, "logits/chosen": -0.9678710699081421, "logits/rejected": -1.2361328601837158, "logps/chosen": -376.1499938964844, "logps/rejected": -426.70001220703125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -1.8799316883087158, "rewards/margins": 9.651562690734863, "rewards/rejected": -11.524999618530273, "step": 8020 }, { "epoch": 2.6457990115321253, "grad_norm": 0.3426609280230769, "learning_rate": 3.386326194398682e-07, "logits/chosen": -0.949755847454071, "logits/rejected": -1.2958984375, "logps/chosen": -319.3500061035156, "logps/rejected": -421.70001220703125, "loss": 0.0105, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8328857421875, "rewards/margins": 10.328125, "rewards/rejected": -12.15625, "step": 8030 }, { "epoch": 2.6490939044481054, "grad_norm": 2.6908969327261842, "learning_rate": 3.378088962108731e-07, "logits/chosen": -1.019140601158142, "logits/rejected": -1.29052734375, "logps/chosen": -346.1000061035156, "logps/rejected": -422.8500061035156, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -1.9044921398162842, "rewards/margins": 9.893750190734863, "rewards/rejected": -11.806249618530273, "step": 8040 }, { "epoch": 2.6523887973640856, "grad_norm": 2.087681287328796, "learning_rate": 3.3698517298187806e-07, "logits/chosen": -1.0098145008087158, "logits/rejected": -1.388085961341858, "logps/chosen": -361.8500061035156, "logps/rejected": -426.0, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -2.55908203125, "rewards/margins": 10.073437690734863, "rewards/rejected": -12.634374618530273, "step": 8050 }, { "epoch": 2.655683690280066, "grad_norm": 1.9763756787256597, "learning_rate": 3.3616144975288304e-07, "logits/chosen": -0.825634777545929, "logits/rejected": -1.2384765148162842, "logps/chosen": -391.54998779296875, "logps/rejected": -486.3999938964844, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -2.4623045921325684, "rewards/margins": 10.157812118530273, "rewards/rejected": -12.615625381469727, "step": 8060 }, { "epoch": 2.658978583196046, "grad_norm": 0.8171064187883096, "learning_rate": 3.3533772652388797e-07, "logits/chosen": -0.945507824420929, "logits/rejected": -1.307226538658142, "logps/chosen": -354.6000061035156, "logps/rejected": -429.70001220703125, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.429492235183716, "rewards/margins": 10.637499809265137, "rewards/rejected": -13.065625190734863, "step": 8070 }, { "epoch": 2.662273476112026, "grad_norm": 5.847597532084825, "learning_rate": 3.345140032948929e-07, "logits/chosen": -0.942626953125, "logits/rejected": -1.360937476158142, "logps/chosen": -341.29998779296875, "logps/rejected": -405.70001220703125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.619921922683716, "rewards/margins": 9.514062881469727, "rewards/rejected": -12.140625, "step": 8080 }, { "epoch": 2.6655683690280068, "grad_norm": 4.865391718696702, "learning_rate": 3.336902800658978e-07, "logits/chosen": -1.042633056640625, "logits/rejected": -1.2297852039337158, "logps/chosen": -349.6499938964844, "logps/rejected": -405.0, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.8107421398162842, "rewards/margins": 9.706250190734863, "rewards/rejected": -11.518750190734863, "step": 8090 }, { "epoch": 2.668863261943987, "grad_norm": 0.3762891906017383, "learning_rate": 3.328665568369028e-07, "logits/chosen": -1.063256859779358, "logits/rejected": -1.312109351158142, "logps/chosen": -365.79998779296875, "logps/rejected": -425.6499938964844, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.478997826576233, "rewards/margins": 10.026562690734863, "rewards/rejected": -11.496874809265137, "step": 8100 }, { "epoch": 2.672158154859967, "grad_norm": 9.038840893343872, "learning_rate": 3.3204283360790777e-07, "logits/chosen": -1.0623047351837158, "logits/rejected": -1.3583495616912842, "logps/chosen": -384.54998779296875, "logps/rejected": -413.0, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -1.4655029773712158, "rewards/margins": 9.582812309265137, "rewards/rejected": -11.050000190734863, "step": 8110 }, { "epoch": 2.6754530477759473, "grad_norm": 14.423560861952538, "learning_rate": 3.3121911037891264e-07, "logits/chosen": -1.072363257408142, "logits/rejected": -1.4470703601837158, "logps/chosen": -314.3500061035156, "logps/rejected": -388.6000061035156, "loss": 0.0193, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6869003772735596, "rewards/margins": 9.506250381469727, "rewards/rejected": -11.196874618530273, "step": 8120 }, { "epoch": 2.6787479406919275, "grad_norm": 2.7829503839079033, "learning_rate": 3.303953871499176e-07, "logits/chosen": -1.070703148841858, "logits/rejected": -1.392968773841858, "logps/chosen": -360.625, "logps/rejected": -451.5, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.97113037109375, "rewards/margins": 10.035937309265137, "rewards/rejected": -12.006250381469727, "step": 8130 }, { "epoch": 2.6820428336079076, "grad_norm": 1.4442330504060525, "learning_rate": 3.2957166392092255e-07, "logits/chosen": -1.155664086341858, "logits/rejected": -1.3142578601837158, "logps/chosen": -398.20001220703125, "logps/rejected": -431.79998779296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.3536255359649658, "rewards/margins": 9.845312118530273, "rewards/rejected": -11.209375381469727, "step": 8140 }, { "epoch": 2.685337726523888, "grad_norm": 5.026605270461858, "learning_rate": 3.287479406919275e-07, "logits/chosen": -1.122460961341858, "logits/rejected": -1.420312523841858, "logps/chosen": -350.45001220703125, "logps/rejected": -416.54998779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -2.0259766578674316, "rewards/margins": 9.234375, "rewards/rejected": -11.259374618530273, "step": 8150 }, { "epoch": 2.6886326194398684, "grad_norm": 0.5049372636216897, "learning_rate": 3.279242174629324e-07, "logits/chosen": -1.001611351966858, "logits/rejected": -1.3093750476837158, "logps/chosen": -360.04998779296875, "logps/rejected": -425.1000061035156, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.5755126476287842, "rewards/margins": 10.153124809265137, "rewards/rejected": -11.731249809265137, "step": 8160 }, { "epoch": 2.6919275123558486, "grad_norm": 0.25134061397740637, "learning_rate": 3.271004942339374e-07, "logits/chosen": -0.9178711175918579, "logits/rejected": -1.1789062023162842, "logps/chosen": -388.75, "logps/rejected": -460.79998779296875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -1.601293921470642, "rewards/margins": 10.206250190734863, "rewards/rejected": -11.803125381469727, "step": 8170 }, { "epoch": 2.6952224052718288, "grad_norm": 14.035234309263465, "learning_rate": 3.2627677100494235e-07, "logits/chosen": -0.9745117425918579, "logits/rejected": -1.179296851158142, "logps/chosen": -367.3500061035156, "logps/rejected": -485.6000061035156, "loss": 0.0128, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4266600608825684, "rewards/margins": 10.576562881469727, "rewards/rejected": -12.998437881469727, "step": 8180 }, { "epoch": 2.698517298187809, "grad_norm": 1.4180706622544106, "learning_rate": 3.254530477759473e-07, "logits/chosen": -0.953808605670929, "logits/rejected": -1.169531226158142, "logps/chosen": -340.29998779296875, "logps/rejected": -416.1000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.134765625, "rewards/margins": 10.604687690734863, "rewards/rejected": -12.740625381469727, "step": 8190 }, { "epoch": 2.701812191103789, "grad_norm": 0.7740298957323256, "learning_rate": 3.246293245469522e-07, "logits/chosen": -1.035742163658142, "logits/rejected": -1.214257836341858, "logps/chosen": -371.04998779296875, "logps/rejected": -419.8999938964844, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -2.3607420921325684, "rewards/margins": 9.996874809265137, "rewards/rejected": -12.365625381469727, "step": 8200 }, { "epoch": 2.7051070840197693, "grad_norm": 0.5175829871234757, "learning_rate": 3.238056013179571e-07, "logits/chosen": -1.038964867591858, "logits/rejected": -1.208398461341858, "logps/chosen": -343.375, "logps/rejected": -411.6499938964844, "loss": 0.0104, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.152435302734375, "rewards/margins": 10.162500381469727, "rewards/rejected": -12.318750381469727, "step": 8210 }, { "epoch": 2.7084019769357495, "grad_norm": 15.36687998858968, "learning_rate": 3.229818780889621e-07, "logits/chosen": -0.731433093547821, "logits/rejected": -1.093164086341858, "logps/chosen": -346.8999938964844, "logps/rejected": -446.1000061035156, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -2.080859422683716, "rewards/margins": 10.618749618530273, "rewards/rejected": -12.690625190734863, "step": 8220 }, { "epoch": 2.71169686985173, "grad_norm": 0.28180550257745746, "learning_rate": 3.221581548599671e-07, "logits/chosen": -0.86859130859375, "logits/rejected": -1.2194335460662842, "logps/chosen": -383.0, "logps/rejected": -460.20001220703125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.867578148841858, "rewards/margins": 10.490625381469727, "rewards/rejected": -12.353124618530273, "step": 8230 }, { "epoch": 2.71499176276771, "grad_norm": 5.5763653565524365, "learning_rate": 3.2133443163097195e-07, "logits/chosen": -0.82049560546875, "logits/rejected": -1.017187476158142, "logps/chosen": -308.25, "logps/rejected": -411.79998779296875, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.011523485183716, "rewards/margins": 10.225000381469727, "rewards/rejected": -12.234375, "step": 8240 }, { "epoch": 2.7182866556836904, "grad_norm": 0.68656295649491, "learning_rate": 3.2051070840197693e-07, "logits/chosen": -0.842578113079071, "logits/rejected": -1.1804687976837158, "logps/chosen": -352.8500061035156, "logps/rejected": -424.70001220703125, "loss": 0.0118, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.62841796875, "rewards/margins": 9.920312881469727, "rewards/rejected": -11.556249618530273, "step": 8250 }, { "epoch": 2.7215815485996706, "grad_norm": 0.7583825477893662, "learning_rate": 3.1968698517298186e-07, "logits/chosen": -0.9345703125, "logits/rejected": -1.3388671875, "logps/chosen": -346.1499938964844, "logps/rejected": -437.0, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -1.9708983898162842, "rewards/margins": 10.256250381469727, "rewards/rejected": -12.225000381469727, "step": 8260 }, { "epoch": 2.724876441515651, "grad_norm": 2.7523270383887084, "learning_rate": 3.1886326194398683e-07, "logits/chosen": -1.0066406726837158, "logits/rejected": -1.2462890148162842, "logps/chosen": -352.8500061035156, "logps/rejected": -397.6499938964844, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -1.9636719226837158, "rewards/margins": 9.074999809265137, "rewards/rejected": -11.040624618530273, "step": 8270 }, { "epoch": 2.728171334431631, "grad_norm": 0.13077662840610738, "learning_rate": 3.180395387149917e-07, "logits/chosen": -1.042871117591858, "logits/rejected": -1.280664086341858, "logps/chosen": -329.1000061035156, "logps/rejected": -419.5, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.0277342796325684, "rewards/margins": 10.228124618530273, "rewards/rejected": -12.256250381469727, "step": 8280 }, { "epoch": 2.731466227347611, "grad_norm": 2.069280944765123, "learning_rate": 3.172158154859967e-07, "logits/chosen": -1.0504882335662842, "logits/rejected": -1.2952148914337158, "logps/chosen": -359.70001220703125, "logps/rejected": -412.20001220703125, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9308593273162842, "rewards/margins": 9.771875381469727, "rewards/rejected": -11.709375381469727, "step": 8290 }, { "epoch": 2.7347611202635913, "grad_norm": 0.32396548179609436, "learning_rate": 3.1639209225700166e-07, "logits/chosen": -0.959912121295929, "logits/rejected": -1.292578101158142, "logps/chosen": -340.25, "logps/rejected": -401.0, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -2.143603563308716, "rewards/margins": 9.623437881469727, "rewards/rejected": -11.768750190734863, "step": 8300 }, { "epoch": 2.7380560131795715, "grad_norm": 10.141104748333474, "learning_rate": 3.155683690280066e-07, "logits/chosen": -1.16455078125, "logits/rejected": -1.3740234375, "logps/chosen": -351.20001220703125, "logps/rejected": -457.79998779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -2.364453077316284, "rewards/margins": 10.306249618530273, "rewards/rejected": -12.6875, "step": 8310 }, { "epoch": 2.741350906095552, "grad_norm": 2.907599850836408, "learning_rate": 3.147446457990115e-07, "logits/chosen": -1.1355469226837158, "logits/rejected": -1.4064452648162842, "logps/chosen": -406.04998779296875, "logps/rejected": -443.5, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.5123291015625, "rewards/margins": 10.106249809265137, "rewards/rejected": -12.628125190734863, "step": 8320 }, { "epoch": 2.7446457990115323, "grad_norm": 0.14951027131931094, "learning_rate": 3.1392092257001644e-07, "logits/chosen": -1.0244140625, "logits/rejected": -1.374609351158142, "logps/chosen": -357.70001220703125, "logps/rejected": -445.1000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.2422852516174316, "rewards/margins": 10.467187881469727, "rewards/rejected": -12.709375381469727, "step": 8330 }, { "epoch": 2.7479406919275124, "grad_norm": 1.684776210186522, "learning_rate": 3.130971993410214e-07, "logits/chosen": -1.1708984375, "logits/rejected": -1.446874976158142, "logps/chosen": -388.1499938964844, "logps/rejected": -462.8999938964844, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.679394483566284, "rewards/margins": 10.603124618530273, "rewards/rejected": -13.274999618530273, "step": 8340 }, { "epoch": 2.7512355848434926, "grad_norm": 13.745068502075119, "learning_rate": 3.122734761120264e-07, "logits/chosen": -0.950854480266571, "logits/rejected": -1.3527343273162842, "logps/chosen": -337.8999938964844, "logps/rejected": -414.29998779296875, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.630078077316284, "rewards/margins": 10.590624809265137, "rewards/rejected": -13.221875190734863, "step": 8350 }, { "epoch": 2.754530477759473, "grad_norm": 2.537462816929821, "learning_rate": 3.1144975288303127e-07, "logits/chosen": -1.1123046875, "logits/rejected": -1.4109375476837158, "logps/chosen": -362.20001220703125, "logps/rejected": -438.0, "loss": 0.0245, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7803955078125, "rewards/margins": 9.803125381469727, "rewards/rejected": -12.587499618530273, "step": 8360 }, { "epoch": 2.757825370675453, "grad_norm": 1.2343640394088962, "learning_rate": 3.1062602965403624e-07, "logits/chosen": -1.18359375, "logits/rejected": -1.327539086341858, "logps/chosen": -342.20001220703125, "logps/rejected": -414.29998779296875, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.189746141433716, "rewards/margins": 9.484375, "rewards/rejected": -11.675000190734863, "step": 8370 }, { "epoch": 2.761120263591433, "grad_norm": 13.473918507386154, "learning_rate": 3.0980230642504117e-07, "logits/chosen": -1.147851586341858, "logits/rejected": -1.3357422351837158, "logps/chosen": -334.3999938964844, "logps/rejected": -494.5, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -2.4017577171325684, "rewards/margins": 9.8125, "rewards/rejected": -12.209375381469727, "step": 8380 }, { "epoch": 2.7644151565074138, "grad_norm": 3.0938128933857416, "learning_rate": 3.0897858319604615e-07, "logits/chosen": -1.090063452720642, "logits/rejected": -1.515625, "logps/chosen": -367.6499938964844, "logps/rejected": -418.79998779296875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.8101685047149658, "rewards/margins": 10.182812690734863, "rewards/rejected": -11.996874809265137, "step": 8390 }, { "epoch": 2.7677100494233935, "grad_norm": 0.5629675055360978, "learning_rate": 3.08154859967051e-07, "logits/chosen": -1.1984374523162842, "logits/rejected": -1.491796851158142, "logps/chosen": -364.04998779296875, "logps/rejected": -457.20001220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.88623046875, "rewards/margins": 10.300000190734863, "rewards/rejected": -12.181249618530273, "step": 8400 }, { "epoch": 2.771004942339374, "grad_norm": 0.8894502938999788, "learning_rate": 3.07331136738056e-07, "logits/chosen": -1.1037108898162842, "logits/rejected": -1.4464843273162842, "logps/chosen": -353.95001220703125, "logps/rejected": -395.20001220703125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.0121092796325684, "rewards/margins": 9.276562690734863, "rewards/rejected": -11.287500381469727, "step": 8410 }, { "epoch": 2.7742998352553543, "grad_norm": 1.4311596457274245, "learning_rate": 3.06507413509061e-07, "logits/chosen": -0.98046875, "logits/rejected": -1.4011719226837158, "logps/chosen": -358.79998779296875, "logps/rejected": -464.5, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.2900390625, "rewards/margins": 10.4453125, "rewards/rejected": -12.743749618530273, "step": 8420 }, { "epoch": 2.7775947281713345, "grad_norm": 0.9308924671231164, "learning_rate": 3.056836902800659e-07, "logits/chosen": -1.0637695789337158, "logits/rejected": -1.401953101158142, "logps/chosen": -389.54998779296875, "logps/rejected": -430.1000061035156, "loss": 0.0085, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.737219214439392, "rewards/margins": 10.026562690734863, "rewards/rejected": -11.764062881469727, "step": 8430 }, { "epoch": 2.7808896210873146, "grad_norm": 0.5208771651126142, "learning_rate": 3.048599670510708e-07, "logits/chosen": -0.9466797113418579, "logits/rejected": -1.3527343273162842, "logps/chosen": -378.45001220703125, "logps/rejected": -451.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.4644532203674316, "rewards/margins": 10.6875, "rewards/rejected": -13.153124809265137, "step": 8440 }, { "epoch": 2.784184514003295, "grad_norm": 1.0525621904881406, "learning_rate": 3.0403624382207575e-07, "logits/chosen": -0.749584972858429, "logits/rejected": -1.2666015625, "logps/chosen": -355.04998779296875, "logps/rejected": -424.75, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.3441405296325684, "rewards/margins": 10.078125, "rewards/rejected": -12.418749809265137, "step": 8450 }, { "epoch": 2.787479406919275, "grad_norm": 4.359905186893516, "learning_rate": 3.032125205930807e-07, "logits/chosen": -0.9512695074081421, "logits/rejected": -1.2853515148162842, "logps/chosen": -361.75, "logps/rejected": -415.0, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.229199171066284, "rewards/margins": 10.221875190734863, "rewards/rejected": -12.453125, "step": 8460 }, { "epoch": 2.790774299835255, "grad_norm": 84.72122244749201, "learning_rate": 3.023887973640857e-07, "logits/chosen": -0.923291027545929, "logits/rejected": -1.278222680091858, "logps/chosen": -366.1000061035156, "logps/rejected": -423.20001220703125, "loss": 0.0136, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6861329078674316, "rewards/margins": 9.870312690734863, "rewards/rejected": -12.550000190734863, "step": 8470 }, { "epoch": 2.7940691927512358, "grad_norm": 4.640558342354766, "learning_rate": 3.015650741350906e-07, "logits/chosen": -1.051855444908142, "logits/rejected": -1.298437476158142, "logps/chosen": -330.8999938964844, "logps/rejected": -437.20001220703125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -2.33984375, "rewards/margins": 10.699999809265137, "rewards/rejected": -13.034375190734863, "step": 8480 }, { "epoch": 2.797364085667216, "grad_norm": 3.677731026327155, "learning_rate": 3.0074135090609555e-07, "logits/chosen": -1.0166015625, "logits/rejected": -1.2791016101837158, "logps/chosen": -310.70001220703125, "logps/rejected": -391.70001220703125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.912500023841858, "rewards/margins": 10.360937118530273, "rewards/rejected": -12.271875381469727, "step": 8490 }, { "epoch": 2.800658978583196, "grad_norm": 0.7879422153721867, "learning_rate": 2.999176276771005e-07, "logits/chosen": -1.0958983898162842, "logits/rejected": -1.407812476158142, "logps/chosen": -318.75, "logps/rejected": -390.79998779296875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.6514403820037842, "rewards/margins": 10.154687881469727, "rewards/rejected": -11.806249618530273, "step": 8500 }, { "epoch": 2.8039538714991763, "grad_norm": 1.8260432973297587, "learning_rate": 2.9909390444810546e-07, "logits/chosen": -1.0832030773162842, "logits/rejected": -1.335351586341858, "logps/chosen": -377.6499938964844, "logps/rejected": -434.8999938964844, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.211962938308716, "rewards/margins": 10.2734375, "rewards/rejected": -12.490625381469727, "step": 8510 }, { "epoch": 2.8072487644151565, "grad_norm": 0.39093299654989855, "learning_rate": 2.9827018121911033e-07, "logits/chosen": -1.173828125, "logits/rejected": -1.5183594226837158, "logps/chosen": -374.0, "logps/rejected": -431.95001220703125, "loss": 0.0075, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.197998046875, "rewards/margins": 10.128125190734863, "rewards/rejected": -12.324999809265137, "step": 8520 }, { "epoch": 2.8105436573311366, "grad_norm": 0.2503742212479378, "learning_rate": 2.974464579901153e-07, "logits/chosen": -1.36328125, "logits/rejected": -1.513671875, "logps/chosen": -379.6499938964844, "logps/rejected": -467.5, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -2.938281297683716, "rewards/margins": 10.642187118530273, "rewards/rejected": -13.578125, "step": 8530 }, { "epoch": 2.813838550247117, "grad_norm": 2.544163012486473, "learning_rate": 2.966227347611203e-07, "logits/chosen": -1.135498046875, "logits/rejected": -1.445703148841858, "logps/chosen": -321.29998779296875, "logps/rejected": -417.29998779296875, "loss": 0.0155, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0586915016174316, "rewards/margins": 10.098437309265137, "rewards/rejected": -12.15625, "step": 8540 }, { "epoch": 2.8171334431630974, "grad_norm": 17.56698519706449, "learning_rate": 2.957990115321252e-07, "logits/chosen": -1.3935546875, "logits/rejected": -1.622656226158142, "logps/chosen": -381.54998779296875, "logps/rejected": -438.79998779296875, "loss": 0.0074, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.745898485183716, "rewards/margins": 10.353124618530273, "rewards/rejected": -13.100000381469727, "step": 8550 }, { "epoch": 2.820428336079077, "grad_norm": 1.613500409068234, "learning_rate": 2.9497528830313013e-07, "logits/chosen": -1.0367920398712158, "logits/rejected": -1.365820288658142, "logps/chosen": -354.20001220703125, "logps/rejected": -431.6499938964844, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -2.6787109375, "rewards/margins": 10.290624618530273, "rewards/rejected": -12.973437309265137, "step": 8560 }, { "epoch": 2.823723228995058, "grad_norm": 0.1543027982015424, "learning_rate": 2.9415156507413506e-07, "logits/chosen": -1.122314453125, "logits/rejected": -1.4697265625, "logps/chosen": -328.3500061035156, "logps/rejected": -418.04998779296875, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.507617235183716, "rewards/margins": 9.348437309265137, "rewards/rejected": -11.865625381469727, "step": 8570 }, { "epoch": 2.827018121911038, "grad_norm": 2.3036239466616086, "learning_rate": 2.9332784184514004e-07, "logits/chosen": -1.2130858898162842, "logits/rejected": -1.520117163658142, "logps/chosen": -334.0, "logps/rejected": -420.54998779296875, "loss": 0.0078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1585450172424316, "rewards/margins": 9.759374618530273, "rewards/rejected": -11.921875, "step": 8580 }, { "epoch": 2.830313014827018, "grad_norm": 1.4303541140082976, "learning_rate": 2.925041186161449e-07, "logits/chosen": -1.103124976158142, "logits/rejected": -1.4255859851837158, "logps/chosen": -366.6499938964844, "logps/rejected": -425.6000061035156, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.928076148033142, "rewards/margins": 9.506250381469727, "rewards/rejected": -11.428125381469727, "step": 8590 }, { "epoch": 2.8336079077429983, "grad_norm": 7.338554572953372, "learning_rate": 2.916803953871499e-07, "logits/chosen": -1.0207030773162842, "logits/rejected": -1.344335913658142, "logps/chosen": -362.8500061035156, "logps/rejected": -461.8999938964844, "loss": 0.0106, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.729394555091858, "rewards/margins": 10.0234375, "rewards/rejected": -11.753125190734863, "step": 8600 }, { "epoch": 2.8369028006589785, "grad_norm": 0.3988511679498627, "learning_rate": 2.9085667215815486e-07, "logits/chosen": -1.1222655773162842, "logits/rejected": -1.3523437976837158, "logps/chosen": -339.6499938964844, "logps/rejected": -438.8999938964844, "loss": 0.0073, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.523614525794983, "rewards/margins": 9.792187690734863, "rewards/rejected": -11.3125, "step": 8610 }, { "epoch": 2.8401976935749587, "grad_norm": 0.9787537746995514, "learning_rate": 2.900329489291598e-07, "logits/chosen": -1.039941430091858, "logits/rejected": -1.4582030773162842, "logps/chosen": -362.5, "logps/rejected": -412.8500061035156, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -1.6853516101837158, "rewards/margins": 9.953125, "rewards/rejected": -11.643750190734863, "step": 8620 }, { "epoch": 2.843492586490939, "grad_norm": 4.227054160481482, "learning_rate": 2.892092257001647e-07, "logits/chosen": -1.167382836341858, "logits/rejected": -1.3527343273162842, "logps/chosen": -339.3999938964844, "logps/rejected": -389.3999938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -1.430322289466858, "rewards/margins": 9.243749618530273, "rewards/rejected": -10.675000190734863, "step": 8630 }, { "epoch": 2.8467874794069195, "grad_norm": 0.7232857349258454, "learning_rate": 2.8838550247116964e-07, "logits/chosen": -1.0500977039337158, "logits/rejected": -1.263085961341858, "logps/chosen": -325.0, "logps/rejected": -406.20001220703125, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2166016101837158, "rewards/margins": 9.767187118530273, "rewards/rejected": -10.981249809265137, "step": 8640 }, { "epoch": 2.8500823723228996, "grad_norm": 0.8481148033536061, "learning_rate": 2.875617792421746e-07, "logits/chosen": -1.110742211341858, "logits/rejected": -1.385156273841858, "logps/chosen": -329.1000061035156, "logps/rejected": -447.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.718652367591858, "rewards/margins": 9.839062690734863, "rewards/rejected": -11.556249618530273, "step": 8650 }, { "epoch": 2.85337726523888, "grad_norm": 0.3844150524701228, "learning_rate": 2.867380560131796e-07, "logits/chosen": -1.0372803211212158, "logits/rejected": -1.2683594226837158, "logps/chosen": -363.79998779296875, "logps/rejected": -463.0, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.517724633216858, "rewards/margins": 10.243749618530273, "rewards/rejected": -11.764062881469727, "step": 8660 }, { "epoch": 2.85667215815486, "grad_norm": 4.170276578092691, "learning_rate": 2.8591433278418447e-07, "logits/chosen": -1.0886719226837158, "logits/rejected": -1.2804687023162842, "logps/chosen": -322.5, "logps/rejected": -393.3999938964844, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -1.9308593273162842, "rewards/margins": 9.482812881469727, "rewards/rejected": -11.412500381469727, "step": 8670 }, { "epoch": 2.85996705107084, "grad_norm": 1.453673196950862, "learning_rate": 2.8509060955518945e-07, "logits/chosen": -1.017333984375, "logits/rejected": -1.257421851158142, "logps/chosen": -383.5, "logps/rejected": -466.5, "loss": 0.0089, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9285156726837158, "rewards/margins": 10.035937309265137, "rewards/rejected": -11.959375381469727, "step": 8680 }, { "epoch": 2.8632619439868203, "grad_norm": 1.1090750328886494, "learning_rate": 2.8426688632619437e-07, "logits/chosen": -0.7940948605537415, "logits/rejected": -1.2410156726837158, "logps/chosen": -324.0, "logps/rejected": -402.0, "loss": 0.0101, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7604491710662842, "rewards/margins": 10.204687118530273, "rewards/rejected": -11.96875, "step": 8690 }, { "epoch": 2.8665568369028005, "grad_norm": 5.3981548050612185, "learning_rate": 2.8344316309719935e-07, "logits/chosen": -1.018945336341858, "logits/rejected": -1.238671898841858, "logps/chosen": -318.1499938964844, "logps/rejected": -420.1499938964844, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -1.950781226158142, "rewards/margins": 10.184374809265137, "rewards/rejected": -12.137499809265137, "step": 8700 }, { "epoch": 2.869851729818781, "grad_norm": 0.7886002975319196, "learning_rate": 2.826194398682042e-07, "logits/chosen": -1.0615234375, "logits/rejected": -1.415624976158142, "logps/chosen": -399.5, "logps/rejected": -451.1000061035156, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -2.7945313453674316, "rewards/margins": 10.407812118530273, "rewards/rejected": -13.209375381469727, "step": 8710 }, { "epoch": 2.873146622734761, "grad_norm": 0.39934056690968794, "learning_rate": 2.817957166392092e-07, "logits/chosen": -1.0330078601837158, "logits/rejected": -1.198144555091858, "logps/chosen": -347.3500061035156, "logps/rejected": -411.79998779296875, "loss": 0.01, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2518553733825684, "rewards/margins": 9.415624618530273, "rewards/rejected": -11.654687881469727, "step": 8720 }, { "epoch": 2.8764415156507415, "grad_norm": 4.066849281352825, "learning_rate": 2.809719934102142e-07, "logits/chosen": -1.0006835460662842, "logits/rejected": -1.304296851158142, "logps/chosen": -341.8500061035156, "logps/rejected": -414.29998779296875, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -2.071093797683716, "rewards/margins": 9.232812881469727, "rewards/rejected": -11.309374809265137, "step": 8730 }, { "epoch": 2.8797364085667216, "grad_norm": 10.356652131770959, "learning_rate": 2.801482701812191e-07, "logits/chosen": -0.9876953363418579, "logits/rejected": -1.326171875, "logps/chosen": -355.1000061035156, "logps/rejected": -435.20001220703125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -2.173510789871216, "rewards/margins": 10.151562690734863, "rewards/rejected": -12.318750381469727, "step": 8740 }, { "epoch": 2.883031301482702, "grad_norm": 7.161980096908692, "learning_rate": 2.79324546952224e-07, "logits/chosen": -0.9261718988418579, "logits/rejected": -1.3250000476837158, "logps/chosen": -371.79998779296875, "logps/rejected": -439.6000061035156, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.3448243141174316, "rewards/margins": 9.584375381469727, "rewards/rejected": -11.931249618530273, "step": 8750 }, { "epoch": 2.886326194398682, "grad_norm": 4.609793670479286, "learning_rate": 2.7850082372322895e-07, "logits/chosen": -0.929003894329071, "logits/rejected": -1.202734351158142, "logps/chosen": -354.70001220703125, "logps/rejected": -456.8999938964844, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -2.3458008766174316, "rewards/margins": 10.621874809265137, "rewards/rejected": -12.981249809265137, "step": 8760 }, { "epoch": 2.889621087314662, "grad_norm": 10.106937372910867, "learning_rate": 2.7767710049423393e-07, "logits/chosen": -0.856640636920929, "logits/rejected": -1.2917969226837158, "logps/chosen": -317.70001220703125, "logps/rejected": -419.8999938964844, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -2.6131834983825684, "rewards/margins": 10.4296875, "rewards/rejected": -13.040624618530273, "step": 8770 }, { "epoch": 2.892915980230643, "grad_norm": 1.7899826700141404, "learning_rate": 2.768533772652389e-07, "logits/chosen": -0.809796154499054, "logits/rejected": -1.366796851158142, "logps/chosen": -415.1000061035156, "logps/rejected": -478.6000061035156, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -2.1021485328674316, "rewards/margins": 10.578125, "rewards/rejected": -12.675000190734863, "step": 8780 }, { "epoch": 2.8962108731466225, "grad_norm": 5.487344125021303, "learning_rate": 2.760296540362438e-07, "logits/chosen": -0.891308605670929, "logits/rejected": -1.3517577648162842, "logps/chosen": -339.75, "logps/rejected": -439.54998779296875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -1.903710961341858, "rewards/margins": 10.362500190734863, "rewards/rejected": -12.262499809265137, "step": 8790 }, { "epoch": 2.899505766062603, "grad_norm": 4.35271697847242, "learning_rate": 2.7520593080724876e-07, "logits/chosen": -1.0099608898162842, "logits/rejected": -1.288476586341858, "logps/chosen": -356.45001220703125, "logps/rejected": -428.8500061035156, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.4180665016174316, "rewards/margins": 9.4921875, "rewards/rejected": -11.915624618530273, "step": 8800 }, { "epoch": 2.9028006589785833, "grad_norm": 0.4000162632618573, "learning_rate": 2.743822075782537e-07, "logits/chosen": -1.089257836341858, "logits/rejected": -1.369531273841858, "logps/chosen": -348.25, "logps/rejected": -454.95001220703125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -2.5269532203674316, "rewards/margins": 10.260937690734863, "rewards/rejected": -12.771875381469727, "step": 8810 }, { "epoch": 2.9060955518945635, "grad_norm": 1.4632712565550263, "learning_rate": 2.7355848434925866e-07, "logits/chosen": -1.015234351158142, "logits/rejected": -1.159082055091858, "logps/chosen": -380.3999938964844, "logps/rejected": -463.79998779296875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.4666991233825684, "rewards/margins": 10.267187118530273, "rewards/rejected": -12.734375, "step": 8820 }, { "epoch": 2.9093904448105437, "grad_norm": 4.006903576239262, "learning_rate": 2.7273476112026353e-07, "logits/chosen": -1.0357666015625, "logits/rejected": -1.184960961341858, "logps/chosen": -371.04998779296875, "logps/rejected": -452.70001220703125, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -2.2552733421325684, "rewards/margins": 10.074999809265137, "rewards/rejected": -12.331250190734863, "step": 8830 }, { "epoch": 2.912685337726524, "grad_norm": 0.646429027680254, "learning_rate": 2.719110378912685e-07, "logits/chosen": -0.991992175579071, "logits/rejected": -1.255273461341858, "logps/chosen": -385.04998779296875, "logps/rejected": -449.95001220703125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.0923829078674316, "rewards/margins": 10.026562690734863, "rewards/rejected": -12.118749618530273, "step": 8840 }, { "epoch": 2.915980230642504, "grad_norm": 1.3948262225600192, "learning_rate": 2.710873146622735e-07, "logits/chosen": -1.1433594226837158, "logits/rejected": -1.2786133289337158, "logps/chosen": -356.0, "logps/rejected": -454.79998779296875, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8480437994003296, "rewards/margins": 9.925000190734863, "rewards/rejected": -11.768750190734863, "step": 8850 }, { "epoch": 2.919275123558484, "grad_norm": 1.4517707816045378, "learning_rate": 2.702635914332784e-07, "logits/chosen": -1.0481445789337158, "logits/rejected": -1.2882812023162842, "logps/chosen": -392.75, "logps/rejected": -474.1000061035156, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8041503429412842, "rewards/margins": 9.978124618530273, "rewards/rejected": -11.774999618530273, "step": 8860 }, { "epoch": 2.922570016474465, "grad_norm": 4.755513554644729, "learning_rate": 2.6943986820428334e-07, "logits/chosen": -1.1525390148162842, "logits/rejected": -1.3361327648162842, "logps/chosen": -307.70001220703125, "logps/rejected": -378.1499938964844, "loss": 0.018, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3617920875549316, "rewards/margins": 9.434374809265137, "rewards/rejected": -11.800000190734863, "step": 8870 }, { "epoch": 2.925864909390445, "grad_norm": 0.17394641675482356, "learning_rate": 2.6861614497528826e-07, "logits/chosen": -1.044921875, "logits/rejected": -1.2666015625, "logps/chosen": -374.04998779296875, "logps/rejected": -412.6000061035156, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -1.9619629383087158, "rewards/margins": 9.532812118530273, "rewards/rejected": -11.487500190734863, "step": 8880 }, { "epoch": 2.929159802306425, "grad_norm": 0.19392791517660393, "learning_rate": 2.6779242174629324e-07, "logits/chosen": -1.0740234851837158, "logits/rejected": -1.318750023841858, "logps/chosen": -346.04998779296875, "logps/rejected": -426.1499938964844, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.99609375, "rewards/margins": 9.762499809265137, "rewards/rejected": -11.756250381469727, "step": 8890 }, { "epoch": 2.9324546952224053, "grad_norm": 2.9985095073428787, "learning_rate": 2.669686985172982e-07, "logits/chosen": -1.059814453125, "logits/rejected": -1.396875023841858, "logps/chosen": -312.3999938964844, "logps/rejected": -412.8500061035156, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -1.804101586341858, "rewards/margins": 10.040624618530273, "rewards/rejected": -11.846875190734863, "step": 8900 }, { "epoch": 2.9357495881383855, "grad_norm": 0.5889932084310694, "learning_rate": 2.661449752883031e-07, "logits/chosen": -1.060937523841858, "logits/rejected": -1.480078101158142, "logps/chosen": -343.25, "logps/rejected": -431.1000061035156, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -2.034912109375, "rewards/margins": 9.596875190734863, "rewards/rejected": -11.634374618530273, "step": 8910 }, { "epoch": 2.9390444810543657, "grad_norm": 1.1900485193019517, "learning_rate": 2.6532125205930807e-07, "logits/chosen": -1.035058617591858, "logits/rejected": -1.3498046398162842, "logps/chosen": -323.54998779296875, "logps/rejected": -424.0, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.0833983421325684, "rewards/margins": 9.473437309265137, "rewards/rejected": -11.556249618530273, "step": 8920 }, { "epoch": 2.942339373970346, "grad_norm": 13.773994364300377, "learning_rate": 2.64497528830313e-07, "logits/chosen": -1.0857422351837158, "logits/rejected": -1.2888672351837158, "logps/chosen": -355.45001220703125, "logps/rejected": -449.0, "loss": 0.0138, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.635107398033142, "rewards/margins": 9.714062690734863, "rewards/rejected": -11.346875190734863, "step": 8930 }, { "epoch": 2.9456342668863265, "grad_norm": 45.75763208561998, "learning_rate": 2.6367380560131797e-07, "logits/chosen": -1.072167992591858, "logits/rejected": -1.391210913658142, "logps/chosen": -315.8500061035156, "logps/rejected": -419.79998779296875, "loss": 0.0234, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.71630859375, "rewards/margins": 9.800000190734863, "rewards/rejected": -11.518750190734863, "step": 8940 }, { "epoch": 2.948929159802306, "grad_norm": 3.6397051907417928, "learning_rate": 2.6285008237232284e-07, "logits/chosen": -1.004492163658142, "logits/rejected": -1.384374976158142, "logps/chosen": -329.95001220703125, "logps/rejected": -423.3500061035156, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -1.532434105873108, "rewards/margins": 10.268750190734863, "rewards/rejected": -11.787500381469727, "step": 8950 }, { "epoch": 2.952224052718287, "grad_norm": 67.3017834968059, "learning_rate": 2.620263591433278e-07, "logits/chosen": -1.0022461414337158, "logits/rejected": -1.3166992664337158, "logps/chosen": -365.3999938964844, "logps/rejected": -443.1499938964844, "loss": 0.0132, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.878820776939392, "rewards/margins": 9.418749809265137, "rewards/rejected": -11.290624618530273, "step": 8960 }, { "epoch": 2.955518945634267, "grad_norm": 0.7992145254004662, "learning_rate": 2.612026359143328e-07, "logits/chosen": -1.1328125, "logits/rejected": -1.381445288658142, "logps/chosen": -331.5, "logps/rejected": -405.54998779296875, "loss": 0.0068, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.470019578933716, "rewards/margins": 10.082812309265137, "rewards/rejected": -12.550000190734863, "step": 8970 }, { "epoch": 2.958813838550247, "grad_norm": 3.4480344890115844, "learning_rate": 2.603789126853377e-07, "logits/chosen": -1.026757836341858, "logits/rejected": -1.324804663658142, "logps/chosen": -355.3500061035156, "logps/rejected": -442.29998779296875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.822265625, "rewards/margins": 9.459375381469727, "rewards/rejected": -12.28125, "step": 8980 }, { "epoch": 2.9621087314662273, "grad_norm": 1.4636209386840948, "learning_rate": 2.5955518945634265e-07, "logits/chosen": -1.05859375, "logits/rejected": -1.147070288658142, "logps/chosen": -351.54998779296875, "logps/rejected": -440.0, "loss": 0.0091, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.835351586341858, "rewards/margins": 10.421875, "rewards/rejected": -12.262499809265137, "step": 8990 }, { "epoch": 2.9654036243822075, "grad_norm": 4.549947890049657, "learning_rate": 2.5873146622734757e-07, "logits/chosen": -1.06689453125, "logits/rejected": -1.225976586341858, "logps/chosen": -295.125, "logps/rejected": -385.3999938964844, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -1.9118163585662842, "rewards/margins": 9.526562690734863, "rewards/rejected": -11.443750381469727, "step": 9000 }, { "epoch": 2.9686985172981877, "grad_norm": 2.603336692086788, "learning_rate": 2.5790774299835255e-07, "logits/chosen": -1.0027344226837158, "logits/rejected": -1.195898413658142, "logps/chosen": -399.75, "logps/rejected": -457.20001220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.940332055091858, "rewards/margins": 10.606249809265137, "rewards/rejected": -12.551562309265137, "step": 9010 }, { "epoch": 2.971993410214168, "grad_norm": 1.050356646842966, "learning_rate": 2.5708401976935753e-07, "logits/chosen": -1.08984375, "logits/rejected": -1.334570288658142, "logps/chosen": -327.5, "logps/rejected": -431.45001220703125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -1.9373047351837158, "rewards/margins": 10.140625, "rewards/rejected": -12.084375381469727, "step": 9020 }, { "epoch": 2.9752883031301485, "grad_norm": 0.12547084382358434, "learning_rate": 2.562602965403624e-07, "logits/chosen": -0.957470715045929, "logits/rejected": -1.241601586341858, "logps/chosen": -355.5, "logps/rejected": -400.20001220703125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.095898389816284, "rewards/margins": 9.768750190734863, "rewards/rejected": -11.862500190734863, "step": 9030 }, { "epoch": 2.9785831960461286, "grad_norm": 2.408300620461717, "learning_rate": 2.554365733113674e-07, "logits/chosen": -1.12109375, "logits/rejected": -1.259179711341858, "logps/chosen": -371.1000061035156, "logps/rejected": -438.20001220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.505908250808716, "rewards/margins": 9.949999809265137, "rewards/rejected": -12.453125, "step": 9040 }, { "epoch": 2.981878088962109, "grad_norm": 1.490589888095339, "learning_rate": 2.546128500823723e-07, "logits/chosen": -1.0343749523162842, "logits/rejected": -1.269140601158142, "logps/chosen": -358.8500061035156, "logps/rejected": -467.5, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.475390672683716, "rewards/margins": 10.714062690734863, "rewards/rejected": -13.203125, "step": 9050 }, { "epoch": 2.985172981878089, "grad_norm": 1.0345503432993874, "learning_rate": 2.537891268533773e-07, "logits/chosen": -1.1681640148162842, "logits/rejected": -1.2578125, "logps/chosen": -370.1499938964844, "logps/rejected": -430.6000061035156, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -2.4930663108825684, "rewards/margins": 9.810937881469727, "rewards/rejected": -12.306249618530273, "step": 9060 }, { "epoch": 2.988467874794069, "grad_norm": 0.03820121993886754, "learning_rate": 2.5296540362438215e-07, "logits/chosen": -0.994433581829071, "logits/rejected": -1.330664038658142, "logps/chosen": -349.2250061035156, "logps/rejected": -465.3999938964844, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -2.783984422683716, "rewards/margins": 10.515625, "rewards/rejected": -13.303125381469727, "step": 9070 }, { "epoch": 2.9917627677100493, "grad_norm": 0.6435761458173139, "learning_rate": 2.5214168039538713e-07, "logits/chosen": -1.044921875, "logits/rejected": -1.3425781726837158, "logps/chosen": -394.20001220703125, "logps/rejected": -437.29998779296875, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -2.5443358421325684, "rewards/margins": 9.934374809265137, "rewards/rejected": -12.471875190734863, "step": 9080 }, { "epoch": 2.9950576606260295, "grad_norm": 3.7099796589816916, "learning_rate": 2.513179571663921e-07, "logits/chosen": -1.0574219226837158, "logits/rejected": -1.463281273841858, "logps/chosen": -374.1000061035156, "logps/rejected": -445.20001220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -2.503222703933716, "rewards/margins": 10.096875190734863, "rewards/rejected": -12.603124618530273, "step": 9090 }, { "epoch": 2.99835255354201, "grad_norm": 0.5009694155850635, "learning_rate": 2.5049423393739703e-07, "logits/chosen": -1.139257788658142, "logits/rejected": -1.3916015625, "logps/chosen": -330.29998779296875, "logps/rejected": -426.5, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.103466749191284, "rewards/margins": 10.800000190734863, "rewards/rejected": -12.899999618530273, "step": 9100 }, { "epoch": 3.0016474464579903, "grad_norm": 1.3870242462588713, "learning_rate": 2.4967051070840196e-07, "logits/chosen": -1.1482422351837158, "logits/rejected": -1.407324194908142, "logps/chosen": -318.1499938964844, "logps/rejected": -410.3999938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.561230421066284, "rewards/margins": 10.582812309265137, "rewards/rejected": -13.146875381469727, "step": 9110 }, { "epoch": 3.0049423393739705, "grad_norm": 0.11466819250109314, "learning_rate": 2.488467874794069e-07, "logits/chosen": -1.166955590248108, "logits/rejected": -1.4298827648162842, "logps/chosen": -338.54998779296875, "logps/rejected": -439.29998779296875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.2049317359924316, "rewards/margins": 11.496874809265137, "rewards/rejected": -13.709375381469727, "step": 9120 }, { "epoch": 3.0082372322899507, "grad_norm": 0.3459951578058407, "learning_rate": 2.4802306425041186e-07, "logits/chosen": -0.900500476360321, "logits/rejected": -1.3220703601837158, "logps/chosen": -357.6499938964844, "logps/rejected": -462.79998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.760937452316284, "rewards/margins": 11.215624809265137, "rewards/rejected": -13.971875190734863, "step": 9130 }, { "epoch": 3.011532125205931, "grad_norm": 0.021590166029751237, "learning_rate": 2.471993410214168e-07, "logits/chosen": -1.10791015625, "logits/rejected": -1.3806641101837158, "logps/chosen": -336.20001220703125, "logps/rejected": -421.45001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.28125, "rewards/margins": 11.337499618530273, "rewards/rejected": -13.628125190734863, "step": 9140 }, { "epoch": 3.014827018121911, "grad_norm": 0.529229692092649, "learning_rate": 2.4637561779242176e-07, "logits/chosen": -1.121679663658142, "logits/rejected": -1.2958984375, "logps/chosen": -352.75, "logps/rejected": -471.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.6419920921325684, "rewards/margins": 11.943750381469727, "rewards/rejected": -14.587499618530273, "step": 9150 }, { "epoch": 3.018121911037891, "grad_norm": 0.17332675423408583, "learning_rate": 2.455518945634267e-07, "logits/chosen": -1.124658226966858, "logits/rejected": -1.4539062976837158, "logps/chosen": -357.75, "logps/rejected": -451.70001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.90625, "rewards/margins": 11.565625190734863, "rewards/rejected": -14.471875190734863, "step": 9160 }, { "epoch": 3.0214168039538714, "grad_norm": 0.06822155817494854, "learning_rate": 2.447281713344316e-07, "logits/chosen": -1.061425805091858, "logits/rejected": -1.413671851158142, "logps/chosen": -381.5, "logps/rejected": -477.20001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.4830079078674316, "rewards/margins": 12.103124618530273, "rewards/rejected": -14.581250190734863, "step": 9170 }, { "epoch": 3.0247116968698515, "grad_norm": 0.17204185941667244, "learning_rate": 2.4390444810543654e-07, "logits/chosen": -1.05615234375, "logits/rejected": -1.323828101158142, "logps/chosen": -358.95001220703125, "logps/rejected": -489.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.09765625, "rewards/margins": 12.028124809265137, "rewards/rejected": -15.128125190734863, "step": 9180 }, { "epoch": 3.028006589785832, "grad_norm": 0.5265811303459712, "learning_rate": 2.430807248764415e-07, "logits/chosen": -1.261328101158142, "logits/rejected": -1.38427734375, "logps/chosen": -334.1000061035156, "logps/rejected": -424.8999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8656249046325684, "rewards/margins": 11.475000381469727, "rewards/rejected": -14.337499618530273, "step": 9190 }, { "epoch": 3.0313014827018123, "grad_norm": 0.4226036399414932, "learning_rate": 2.4225700164744644e-07, "logits/chosen": -1.083837866783142, "logits/rejected": -1.2625000476837158, "logps/chosen": -359.54998779296875, "logps/rejected": -435.3999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.843554735183716, "rewards/margins": 11.9375, "rewards/rejected": -14.78125, "step": 9200 }, { "epoch": 3.0345963756177925, "grad_norm": 0.3172318486171352, "learning_rate": 2.414332784184514e-07, "logits/chosen": -1.12158203125, "logits/rejected": -1.4080078601837158, "logps/chosen": -362.0, "logps/rejected": -470.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.6664061546325684, "rewards/margins": 12.096875190734863, "rewards/rejected": -14.762499809265137, "step": 9210 }, { "epoch": 3.0378912685337727, "grad_norm": 0.0703555196647552, "learning_rate": 2.4060955518945634e-07, "logits/chosen": -1.088623046875, "logits/rejected": -1.309179663658142, "logps/chosen": -346.20001220703125, "logps/rejected": -452.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.331835985183716, "rewards/margins": 11.634374618530273, "rewards/rejected": -13.965624809265137, "step": 9220 }, { "epoch": 3.041186161449753, "grad_norm": 0.06775939737382257, "learning_rate": 2.3978583196046127e-07, "logits/chosen": -1.0779297351837158, "logits/rejected": -1.319238305091858, "logps/chosen": -336.3500061035156, "logps/rejected": -444.20001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.446484327316284, "rewards/margins": 11.478124618530273, "rewards/rejected": -13.921875, "step": 9230 }, { "epoch": 3.044481054365733, "grad_norm": 0.7978807978738122, "learning_rate": 2.389621087314662e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.3605468273162842, "logps/chosen": -347.79998779296875, "logps/rejected": -424.1499938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7367186546325684, "rewards/margins": 11.421875, "rewards/rejected": -14.175000190734863, "step": 9240 }, { "epoch": 3.047775947281713, "grad_norm": 0.09541541910142154, "learning_rate": 2.3813838550247115e-07, "logits/chosen": -1.021728515625, "logits/rejected": -1.3712890148162842, "logps/chosen": -378.04998779296875, "logps/rejected": -452.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9849610328674316, "rewards/margins": 11.403124809265137, "rewards/rejected": -14.396875381469727, "step": 9250 }, { "epoch": 3.0510708401976934, "grad_norm": 0.7273801772939819, "learning_rate": 2.373146622734761e-07, "logits/chosen": -1.1466796398162842, "logits/rejected": -1.388671875, "logps/chosen": -373.95001220703125, "logps/rejected": -465.5, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.265625, "rewards/margins": 11.196874618530273, "rewards/rejected": -14.462499618530273, "step": 9260 }, { "epoch": 3.054365733113674, "grad_norm": 0.028286651890437987, "learning_rate": 2.3649093904448102e-07, "logits/chosen": -1.0184814929962158, "logits/rejected": -1.361328125, "logps/chosen": -365.20001220703125, "logps/rejected": -465.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.9769530296325684, "rewards/margins": 11.303125381469727, "rewards/rejected": -14.293749809265137, "step": 9270 }, { "epoch": 3.057660626029654, "grad_norm": 0.16837995481946835, "learning_rate": 2.3566721581548597e-07, "logits/chosen": -1.0654296875, "logits/rejected": -1.3445312976837158, "logps/chosen": -310.8500061035156, "logps/rejected": -420.6000061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.688281297683716, "rewards/margins": 11.534375190734863, "rewards/rejected": -14.231249809265137, "step": 9280 }, { "epoch": 3.0609555189456343, "grad_norm": 0.04116393697990081, "learning_rate": 2.3484349258649093e-07, "logits/chosen": -1.058007836341858, "logits/rejected": -1.412695288658142, "logps/chosen": -339.95001220703125, "logps/rejected": -446.3999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9195313453674316, "rewards/margins": 11.890625, "rewards/rejected": -14.818750381469727, "step": 9290 }, { "epoch": 3.0642504118616145, "grad_norm": 0.06761025988601944, "learning_rate": 2.3401976935749588e-07, "logits/chosen": -1.048925757408142, "logits/rejected": -1.3048827648162842, "logps/chosen": -327.6000061035156, "logps/rejected": -449.1000061035156, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.3062500953674316, "rewards/margins": 11.643750190734863, "rewards/rejected": -14.943750381469727, "step": 9300 }, { "epoch": 3.0675453047775947, "grad_norm": 1.3522267749424552, "learning_rate": 2.331960461285008e-07, "logits/chosen": -1.0549805164337158, "logits/rejected": -1.3017578125, "logps/chosen": -389.75, "logps/rejected": -463.5, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7896485328674316, "rewards/margins": 11.157812118530273, "rewards/rejected": -13.953125, "step": 9310 }, { "epoch": 3.070840197693575, "grad_norm": 0.1574666255000909, "learning_rate": 2.3237232289950575e-07, "logits/chosen": -1.085546851158142, "logits/rejected": -1.566796898841858, "logps/chosen": -379.20001220703125, "logps/rejected": -463.6000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.7586913108825684, "rewards/margins": 11.953125, "rewards/rejected": -14.709375381469727, "step": 9320 }, { "epoch": 3.074135090609555, "grad_norm": 0.017706374383599267, "learning_rate": 2.3154859967051068e-07, "logits/chosen": -1.129492163658142, "logits/rejected": -1.3935546875, "logps/chosen": -362.1499938964844, "logps/rejected": -459.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.5155272483825684, "rewards/margins": 11.628125190734863, "rewards/rejected": -14.131250381469727, "step": 9330 }, { "epoch": 3.077429983525535, "grad_norm": 0.35866739596239744, "learning_rate": 2.3072487644151563e-07, "logits/chosen": -1.0597655773162842, "logits/rejected": -1.373632788658142, "logps/chosen": -351.8500061035156, "logps/rejected": -483.79998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.3021483421325684, "rewards/margins": 12.012499809265137, "rewards/rejected": -14.3125, "step": 9340 }, { "epoch": 3.080724876441516, "grad_norm": 0.03313794033216632, "learning_rate": 2.2990115321252058e-07, "logits/chosen": -1.146875023841858, "logits/rejected": -1.466796875, "logps/chosen": -373.1000061035156, "logps/rejected": -482.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.005932569503784, "rewards/margins": 11.878125190734863, "rewards/rejected": -13.875, "step": 9350 }, { "epoch": 3.084019769357496, "grad_norm": 0.40493229190735835, "learning_rate": 2.2907742998352553e-07, "logits/chosen": -1.112402319908142, "logits/rejected": -1.2409179210662842, "logps/chosen": -350.20001220703125, "logps/rejected": -474.5, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8716797828674316, "rewards/margins": 11.809374809265137, "rewards/rejected": -14.681249618530273, "step": 9360 }, { "epoch": 3.087314662273476, "grad_norm": 0.15530077399886852, "learning_rate": 2.2825370675453046e-07, "logits/chosen": -1.09912109375, "logits/rejected": -1.4503905773162842, "logps/chosen": -404.54998779296875, "logps/rejected": -495.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.4169921875, "rewards/margins": 12.240625381469727, "rewards/rejected": -14.65625, "step": 9370 }, { "epoch": 3.0906095551894563, "grad_norm": 0.34602634211218525, "learning_rate": 2.274299835255354e-07, "logits/chosen": -1.1491210460662842, "logits/rejected": -1.316015601158142, "logps/chosen": -332.3500061035156, "logps/rejected": -458.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8824219703674316, "rewards/margins": 11.284375190734863, "rewards/rejected": -14.165624618530273, "step": 9380 }, { "epoch": 3.0939044481054365, "grad_norm": 0.06848124798325303, "learning_rate": 2.2660626029654033e-07, "logits/chosen": -1.1300780773162842, "logits/rejected": -1.4617187976837158, "logps/chosen": -367.54998779296875, "logps/rejected": -474.29998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.876953125, "rewards/margins": 12.221875190734863, "rewards/rejected": -15.096875190734863, "step": 9390 }, { "epoch": 3.0971993410214167, "grad_norm": 0.22851635465451847, "learning_rate": 2.2578253706754528e-07, "logits/chosen": -1.149267554283142, "logits/rejected": -1.548828125, "logps/chosen": -353.29998779296875, "logps/rejected": -440.8999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.904296875, "rewards/margins": 11.215624809265137, "rewards/rejected": -14.125, "step": 9400 }, { "epoch": 3.100494233937397, "grad_norm": 0.11625036374504218, "learning_rate": 2.2495881383855024e-07, "logits/chosen": -1.002050757408142, "logits/rejected": -1.3566405773162842, "logps/chosen": -372.70001220703125, "logps/rejected": -472.8999938964844, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.377148389816284, "rewards/margins": 12.171875, "rewards/rejected": -14.543749809265137, "step": 9410 }, { "epoch": 3.1037891268533775, "grad_norm": 0.08081544383272297, "learning_rate": 2.241350906095552e-07, "logits/chosen": -1.0408935546875, "logits/rejected": -1.2744140625, "logps/chosen": -336.3500061035156, "logps/rejected": -425.3999938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.0667967796325684, "rewards/margins": 11.943750381469727, "rewards/rejected": -14.012499809265137, "step": 9420 }, { "epoch": 3.1070840197693577, "grad_norm": 0.09434968270610145, "learning_rate": 2.233113673805601e-07, "logits/chosen": -1.0115234851837158, "logits/rejected": -1.336328148841858, "logps/chosen": -373.3999938964844, "logps/rejected": -436.1499938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.4820313453674316, "rewards/margins": 11.28125, "rewards/rejected": -13.768750190734863, "step": 9430 }, { "epoch": 3.110378912685338, "grad_norm": 0.24150215632358338, "learning_rate": 2.2248764415156506e-07, "logits/chosen": -0.9779297113418579, "logits/rejected": -1.4128906726837158, "logps/chosen": -386.5, "logps/rejected": -448.0, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.416796922683716, "rewards/margins": 11.418749809265137, "rewards/rejected": -13.84375, "step": 9440 }, { "epoch": 3.113673805601318, "grad_norm": 0.20344321212559616, "learning_rate": 2.2166392092257e-07, "logits/chosen": -1.278906226158142, "logits/rejected": -1.545312523841858, "logps/chosen": -336.29998779296875, "logps/rejected": -428.8500061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.570117235183716, "rewards/margins": 10.931249618530273, "rewards/rejected": -13.506250381469727, "step": 9450 }, { "epoch": 3.116968698517298, "grad_norm": 0.16022047751766338, "learning_rate": 2.2084019769357494e-07, "logits/chosen": -1.02490234375, "logits/rejected": -1.43359375, "logps/chosen": -349.75, "logps/rejected": -447.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.373242139816284, "rewards/margins": 11.290624618530273, "rewards/rejected": -13.659375190734863, "step": 9460 }, { "epoch": 3.1202635914332784, "grad_norm": 0.2043977198390295, "learning_rate": 2.200164744645799e-07, "logits/chosen": -1.1111328601837158, "logits/rejected": -1.3468749523162842, "logps/chosen": -381.3999938964844, "logps/rejected": -472.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.677734375, "rewards/margins": 11.856249809265137, "rewards/rejected": -14.534375190734863, "step": 9470 }, { "epoch": 3.1235584843492585, "grad_norm": 0.09517674711710103, "learning_rate": 2.1919275123558484e-07, "logits/chosen": -1.137304663658142, "logits/rejected": -1.287109375, "logps/chosen": -325.6499938964844, "logps/rejected": -425.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.5094666481018066, "rewards/margins": 11.243749618530273, "rewards/rejected": -13.756250381469727, "step": 9480 }, { "epoch": 3.1268533772652387, "grad_norm": 0.07547784717377964, "learning_rate": 2.1836902800658977e-07, "logits/chosen": -1.1184570789337158, "logits/rejected": -1.5402343273162842, "logps/chosen": -382.04998779296875, "logps/rejected": -515.7000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.7490234375, "rewards/margins": 12.399999618530273, "rewards/rejected": -15.15625, "step": 9490 }, { "epoch": 3.130148270181219, "grad_norm": 0.13243511259350907, "learning_rate": 2.1754530477759472e-07, "logits/chosen": -1.1707031726837158, "logits/rejected": -1.4289062023162842, "logps/chosen": -347.70001220703125, "logps/rejected": -457.8999938964844, "loss": 0.0069, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5064330101013184, "rewards/margins": 11.209375381469727, "rewards/rejected": -13.715624809265137, "step": 9500 }, { "epoch": 3.1334431630971995, "grad_norm": 1.164390186223862, "learning_rate": 2.1672158154859964e-07, "logits/chosen": -1.2263672351837158, "logits/rejected": -1.509374976158142, "logps/chosen": -342.8999938964844, "logps/rejected": -432.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.0433592796325684, "rewards/margins": 11.081250190734863, "rewards/rejected": -14.118749618530273, "step": 9510 }, { "epoch": 3.1367380560131797, "grad_norm": 0.19901917562483304, "learning_rate": 2.158978583196046e-07, "logits/chosen": -1.064111351966858, "logits/rejected": -1.4182617664337158, "logps/chosen": -339.29998779296875, "logps/rejected": -468.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.2569336891174316, "rewards/margins": 12.537500381469727, "rewards/rejected": -14.790624618530273, "step": 9520 }, { "epoch": 3.14003294892916, "grad_norm": 3.0865537895417363, "learning_rate": 2.1507413509060955e-07, "logits/chosen": -1.10400390625, "logits/rejected": -1.319738745689392, "logps/chosen": -300.79998779296875, "logps/rejected": -423.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.8538818359375, "rewards/margins": 11.324999809265137, "rewards/rejected": -13.171875, "step": 9530 }, { "epoch": 3.14332784184514, "grad_norm": 4.3819040106538045, "learning_rate": 2.142504118616145e-07, "logits/chosen": -1.01611328125, "logits/rejected": -1.4249999523162842, "logps/chosen": -352.0, "logps/rejected": -433.1000061035156, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.1039061546325684, "rewards/margins": 11.774999618530273, "rewards/rejected": -13.884374618530273, "step": 9540 }, { "epoch": 3.14662273476112, "grad_norm": 0.3060622033004641, "learning_rate": 2.1342668863261942e-07, "logits/chosen": -1.012304663658142, "logits/rejected": -1.28076171875, "logps/chosen": -314.29998779296875, "logps/rejected": -461.6000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.4715819358825684, "rewards/margins": 12.0, "rewards/rejected": -14.471875190734863, "step": 9550 }, { "epoch": 3.1499176276771004, "grad_norm": 0.08858671590126652, "learning_rate": 2.1260296540362437e-07, "logits/chosen": -1.192773461341858, "logits/rejected": -1.3982422351837158, "logps/chosen": -371.20001220703125, "logps/rejected": -448.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.621875047683716, "rewards/margins": 11.465624809265137, "rewards/rejected": -14.084375381469727, "step": 9560 }, { "epoch": 3.1532125205930805, "grad_norm": 0.2375872993686805, "learning_rate": 2.117792421746293e-07, "logits/chosen": -1.1005859375, "logits/rejected": -1.4738280773162842, "logps/chosen": -310.29998779296875, "logps/rejected": -420.20001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.922656297683716, "rewards/margins": 11.893750190734863, "rewards/rejected": -14.8125, "step": 9570 }, { "epoch": 3.156507413509061, "grad_norm": 3.1843436353068357, "learning_rate": 2.1095551894563425e-07, "logits/chosen": -1.0878417491912842, "logits/rejected": -1.4246094226837158, "logps/chosen": -366.95001220703125, "logps/rejected": -482.70001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.2743163108825684, "rewards/margins": 12.396875381469727, "rewards/rejected": -14.681249618530273, "step": 9580 }, { "epoch": 3.1598023064250413, "grad_norm": 0.07476219488596558, "learning_rate": 2.101317957166392e-07, "logits/chosen": -1.255957007408142, "logits/rejected": -1.453710913658142, "logps/chosen": -423.70001220703125, "logps/rejected": -461.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.97222900390625, "rewards/margins": 12.034375190734863, "rewards/rejected": -14.006250381469727, "step": 9590 }, { "epoch": 3.1630971993410215, "grad_norm": 0.08832815091151804, "learning_rate": 2.0930807248764415e-07, "logits/chosen": -1.1476562023162842, "logits/rejected": -1.439453125, "logps/chosen": -353.6499938964844, "logps/rejected": -480.54998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.212890625, "rewards/margins": 12.074999809265137, "rewards/rejected": -15.287500381469727, "step": 9600 }, { "epoch": 3.1663920922570017, "grad_norm": 0.15982079406838914, "learning_rate": 2.0848434925864908e-07, "logits/chosen": -1.199804663658142, "logits/rejected": -1.4923827648162842, "logps/chosen": -326.29998779296875, "logps/rejected": -445.5, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.646679639816284, "rewards/margins": 12.300000190734863, "rewards/rejected": -14.953125, "step": 9610 }, { "epoch": 3.169686985172982, "grad_norm": 0.30913089166033025, "learning_rate": 2.0766062602965403e-07, "logits/chosen": -1.229101538658142, "logits/rejected": -1.4181640148162842, "logps/chosen": -341.0, "logps/rejected": -466.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.8187499046325684, "rewards/margins": 11.887499809265137, "rewards/rejected": -14.721875190734863, "step": 9620 }, { "epoch": 3.172981878088962, "grad_norm": 0.054263691564662264, "learning_rate": 2.0683690280065896e-07, "logits/chosen": -1.126684546470642, "logits/rejected": -1.4521484375, "logps/chosen": -321.0, "logps/rejected": -442.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.58203125, "rewards/margins": 12.084375381469727, "rewards/rejected": -14.65625, "step": 9630 }, { "epoch": 3.176276771004942, "grad_norm": 0.08026827167476783, "learning_rate": 2.060131795716639e-07, "logits/chosen": -1.2087891101837158, "logits/rejected": -1.4201171398162842, "logps/chosen": -338.8500061035156, "logps/rejected": -440.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.6025390625, "rewards/margins": 11.584375381469727, "rewards/rejected": -14.196874618530273, "step": 9640 }, { "epoch": 3.1795716639209224, "grad_norm": 0.22279817700931082, "learning_rate": 2.0518945634266886e-07, "logits/chosen": -1.037011742591858, "logits/rejected": -1.468359351158142, "logps/chosen": -322.6000061035156, "logps/rejected": -454.20001220703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -3.0052733421325684, "rewards/margins": 12.112500190734863, "rewards/rejected": -15.109375, "step": 9650 }, { "epoch": 3.182866556836903, "grad_norm": 0.3867603594796655, "learning_rate": 2.043657331136738e-07, "logits/chosen": -1.105224609375, "logits/rejected": -1.405664086341858, "logps/chosen": -377.6499938964844, "logps/rejected": -471.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9496092796325684, "rewards/margins": 11.243749618530273, "rewards/rejected": -14.193750381469727, "step": 9660 }, { "epoch": 3.186161449752883, "grad_norm": 0.7684850429488923, "learning_rate": 2.0354200988467873e-07, "logits/chosen": -1.0099608898162842, "logits/rejected": -1.329687476158142, "logps/chosen": -367.25, "logps/rejected": -447.54998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.058154344558716, "rewards/margins": 11.396875381469727, "rewards/rejected": -13.459375381469727, "step": 9670 }, { "epoch": 3.1894563426688634, "grad_norm": 1.1325408649107953, "learning_rate": 2.0271828665568369e-07, "logits/chosen": -1.3015625476837158, "logits/rejected": -1.3650391101837158, "logps/chosen": -331.0, "logps/rejected": -468.70001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.7425780296325684, "rewards/margins": 11.462499618530273, "rewards/rejected": -14.199999809265137, "step": 9680 }, { "epoch": 3.1927512355848435, "grad_norm": 0.3084355032345166, "learning_rate": 2.018945634266886e-07, "logits/chosen": -1.2693359851837158, "logits/rejected": -1.5125000476837158, "logps/chosen": -357.45001220703125, "logps/rejected": -429.20001220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.8171629905700684, "rewards/margins": 11.790624618530273, "rewards/rejected": -14.606249809265137, "step": 9690 }, { "epoch": 3.1960461285008237, "grad_norm": 0.2032727322145945, "learning_rate": 2.010708401976936e-07, "logits/chosen": -1.1687500476837158, "logits/rejected": -1.465234398841858, "logps/chosen": -371.3500061035156, "logps/rejected": -455.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.5931639671325684, "rewards/margins": 11.496874809265137, "rewards/rejected": -14.084375381469727, "step": 9700 }, { "epoch": 3.199341021416804, "grad_norm": 0.1411818853319637, "learning_rate": 2.0024711696869851e-07, "logits/chosen": -1.084375023841858, "logits/rejected": -1.43359375, "logps/chosen": -407.8999938964844, "logps/rejected": -494.29998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.5772461891174316, "rewards/margins": 11.581250190734863, "rewards/rejected": -14.162500381469727, "step": 9710 }, { "epoch": 3.202635914332784, "grad_norm": 0.00800567370916942, "learning_rate": 1.9942339373970346e-07, "logits/chosen": -0.9151366949081421, "logits/rejected": -1.426367163658142, "logps/chosen": -353.54998779296875, "logps/rejected": -455.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.843994140625, "rewards/margins": 12.456250190734863, "rewards/rejected": -15.303125381469727, "step": 9720 }, { "epoch": 3.2059308072487642, "grad_norm": 0.6362402489937314, "learning_rate": 1.985996705107084e-07, "logits/chosen": -1.075109839439392, "logits/rejected": -1.2746093273162842, "logps/chosen": -368.70001220703125, "logps/rejected": -463.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.8866209983825684, "rewards/margins": 12.09375, "rewards/rejected": -14.971875190734863, "step": 9730 }, { "epoch": 3.209225700164745, "grad_norm": 0.10207022063108573, "learning_rate": 1.9777594728171334e-07, "logits/chosen": -1.114843726158142, "logits/rejected": -1.3621094226837158, "logps/chosen": -348.8500061035156, "logps/rejected": -463.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0707030296325684, "rewards/margins": 12.106249809265137, "rewards/rejected": -15.178125381469727, "step": 9740 }, { "epoch": 3.212520593080725, "grad_norm": 0.3453000873892003, "learning_rate": 1.9695222405271827e-07, "logits/chosen": -1.0275390148162842, "logits/rejected": -1.3253905773162842, "logps/chosen": -383.75, "logps/rejected": -444.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.591992139816284, "rewards/margins": 11.428125381469727, "rewards/rejected": -14.015625, "step": 9750 }, { "epoch": 3.215815485996705, "grad_norm": 2.8594779407997475, "learning_rate": 1.9612850082372324e-07, "logits/chosen": -1.041894555091858, "logits/rejected": -1.380761742591858, "logps/chosen": -351.1000061035156, "logps/rejected": -452.79998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.8186523914337158, "rewards/margins": 11.412500381469727, "rewards/rejected": -13.21875, "step": 9760 }, { "epoch": 3.2191103789126854, "grad_norm": 0.4726406050763253, "learning_rate": 1.9530477759472817e-07, "logits/chosen": -1.1487305164337158, "logits/rejected": -1.4812500476837158, "logps/chosen": -328.1499938964844, "logps/rejected": -437.3999938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.564404249191284, "rewards/margins": 11.800000190734863, "rewards/rejected": -14.356249809265137, "step": 9770 }, { "epoch": 3.2224052718286655, "grad_norm": 0.1178389195879682, "learning_rate": 1.9448105436573312e-07, "logits/chosen": -1.134667992591858, "logits/rejected": -1.4287109375, "logps/chosen": -340.8500061035156, "logps/rejected": -433.29998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.3949217796325684, "rewards/margins": 11.803125381469727, "rewards/rejected": -14.209375381469727, "step": 9780 }, { "epoch": 3.2257001647446457, "grad_norm": 0.0501303586479297, "learning_rate": 1.9365733113673805e-07, "logits/chosen": -1.0348632335662842, "logits/rejected": -1.502343773841858, "logps/chosen": -342.54998779296875, "logps/rejected": -438.29998779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.783935546875, "rewards/margins": 10.868749618530273, "rewards/rejected": -13.665624618530273, "step": 9790 }, { "epoch": 3.228995057660626, "grad_norm": 0.053668346105972466, "learning_rate": 1.92833607907743e-07, "logits/chosen": -1.177148461341858, "logits/rejected": -1.451171875, "logps/chosen": -356.3999938964844, "logps/rejected": -458.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.639843702316284, "rewards/margins": 11.331250190734863, "rewards/rejected": -13.965624809265137, "step": 9800 }, { "epoch": 3.232289950576606, "grad_norm": 0.7342447596236003, "learning_rate": 1.9200988467874792e-07, "logits/chosen": -1.025781273841858, "logits/rejected": -1.424218773841858, "logps/chosen": -381.75, "logps/rejected": -444.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.6341795921325684, "rewards/margins": 11.362500190734863, "rewards/rejected": -13.984375, "step": 9810 }, { "epoch": 3.2355848434925867, "grad_norm": 0.5184833048973686, "learning_rate": 1.911861614497529e-07, "logits/chosen": -1.130859375, "logits/rejected": -1.450781226158142, "logps/chosen": -324.70001220703125, "logps/rejected": -416.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.440869092941284, "rewards/margins": 11.971875190734863, "rewards/rejected": -14.418749809265137, "step": 9820 }, { "epoch": 3.238879736408567, "grad_norm": 0.4292218367692622, "learning_rate": 1.9036243822075782e-07, "logits/chosen": -1.208837866783142, "logits/rejected": -1.4488036632537842, "logps/chosen": -339.75, "logps/rejected": -442.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.555859327316284, "rewards/margins": 11.403124809265137, "rewards/rejected": -13.96875, "step": 9830 }, { "epoch": 3.242174629324547, "grad_norm": 1.4911455705619299, "learning_rate": 1.8953871499176278e-07, "logits/chosen": -1.068359375, "logits/rejected": -1.350000023841858, "logps/chosen": -328.04998779296875, "logps/rejected": -436.5, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.287890672683716, "rewards/margins": 11.903124809265137, "rewards/rejected": -15.178125381469727, "step": 9840 }, { "epoch": 3.245469522240527, "grad_norm": 0.15920520036155564, "learning_rate": 1.887149917627677e-07, "logits/chosen": -1.059667944908142, "logits/rejected": -1.257910132408142, "logps/chosen": -369.8500061035156, "logps/rejected": -463.20001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.566601514816284, "rewards/margins": 11.403124809265137, "rewards/rejected": -13.959375381469727, "step": 9850 }, { "epoch": 3.2487644151565074, "grad_norm": 0.125272033213726, "learning_rate": 1.8789126853377265e-07, "logits/chosen": -1.069970726966858, "logits/rejected": -1.480859398841858, "logps/chosen": -355.54998779296875, "logps/rejected": -469.3999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.721484422683716, "rewards/margins": 11.915624618530273, "rewards/rejected": -14.643750190734863, "step": 9860 }, { "epoch": 3.2520593080724876, "grad_norm": 0.15533964711161793, "learning_rate": 1.8706754530477758e-07, "logits/chosen": -0.9432617425918579, "logits/rejected": -1.1794922351837158, "logps/chosen": -368.5, "logps/rejected": -457.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.653857469558716, "rewards/margins": 12.053125381469727, "rewards/rejected": -14.703125, "step": 9870 }, { "epoch": 3.2553542009884677, "grad_norm": 0.039289773174011666, "learning_rate": 1.8624382207578255e-07, "logits/chosen": -1.21630859375, "logits/rejected": -1.422460913658142, "logps/chosen": -369.75, "logps/rejected": -468.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.2191405296325684, "rewards/margins": 12.015625, "rewards/rejected": -15.237500190734863, "step": 9880 }, { "epoch": 3.258649093904448, "grad_norm": 0.06492982755418676, "learning_rate": 1.8542009884678748e-07, "logits/chosen": -1.074609398841858, "logits/rejected": -1.3337891101837158, "logps/chosen": -358.8999938964844, "logps/rejected": -472.20001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.9637694358825684, "rewards/margins": 11.853124618530273, "rewards/rejected": -14.806249618530273, "step": 9890 }, { "epoch": 3.2619439868204285, "grad_norm": 0.0604410271233437, "learning_rate": 1.8459637561779243e-07, "logits/chosen": -1.005468726158142, "logits/rejected": -1.3916015625, "logps/chosen": -377.79998779296875, "logps/rejected": -471.29998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.907824754714966, "rewards/margins": 12.181249618530273, "rewards/rejected": -15.09375, "step": 9900 }, { "epoch": 3.2652388797364087, "grad_norm": 0.13693393775115567, "learning_rate": 1.8377265238879736e-07, "logits/chosen": -1.0601074695587158, "logits/rejected": -1.5046875476837158, "logps/chosen": -352.25, "logps/rejected": -420.70001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.719433546066284, "rewards/margins": 11.634374618530273, "rewards/rejected": -14.34375, "step": 9910 }, { "epoch": 3.268533772652389, "grad_norm": 0.179822602156792, "learning_rate": 1.8294892915980228e-07, "logits/chosen": -1.201171875, "logits/rejected": -1.5, "logps/chosen": -369.6499938964844, "logps/rejected": -468.20001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.044140577316284, "rewards/margins": 11.806249618530273, "rewards/rejected": -14.856249809265137, "step": 9920 }, { "epoch": 3.271828665568369, "grad_norm": 0.08825573081490642, "learning_rate": 1.8212520593080723e-07, "logits/chosen": -1.1261718273162842, "logits/rejected": -1.40234375, "logps/chosen": -385.8500061035156, "logps/rejected": -466.95001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.3794922828674316, "rewards/margins": 12.024999618530273, "rewards/rejected": -14.40625, "step": 9930 }, { "epoch": 3.275123558484349, "grad_norm": 0.1402732519271084, "learning_rate": 1.8130148270181216e-07, "logits/chosen": -1.0481445789337158, "logits/rejected": -1.37109375, "logps/chosen": -384.5, "logps/rejected": -469.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7582030296325684, "rewards/margins": 11.209375381469727, "rewards/rejected": -13.962499618530273, "step": 9940 }, { "epoch": 3.2784184514003294, "grad_norm": 1.2329045040738924, "learning_rate": 1.8047775947281714e-07, "logits/chosen": -0.9012695550918579, "logits/rejected": -1.1671874523162842, "logps/chosen": -376.20001220703125, "logps/rejected": -465.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.0679688453674316, "rewards/margins": 12.078125, "rewards/rejected": -14.143750190734863, "step": 9950 }, { "epoch": 3.2817133443163096, "grad_norm": 0.2766594465686544, "learning_rate": 1.7965403624382206e-07, "logits/chosen": -0.9654785394668579, "logits/rejected": -1.3427734375, "logps/chosen": -368.3999938964844, "logps/rejected": -466.8999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.018749952316284, "rewards/margins": 11.459375381469727, "rewards/rejected": -14.490625381469727, "step": 9960 }, { "epoch": 3.28500823723229, "grad_norm": 0.06852820622918905, "learning_rate": 1.78830313014827e-07, "logits/chosen": -1.183984398841858, "logits/rejected": -1.376953125, "logps/chosen": -362.20001220703125, "logps/rejected": -447.70001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.702929735183716, "rewards/margins": 11.381250381469727, "rewards/rejected": -14.087499618530273, "step": 9970 }, { "epoch": 3.2883031301482704, "grad_norm": 1.1933998405154325, "learning_rate": 1.7800658978583194e-07, "logits/chosen": -1.061132788658142, "logits/rejected": -1.488671898841858, "logps/chosen": -345.20001220703125, "logps/rejected": -436.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.405956983566284, "rewards/margins": 12.296875, "rewards/rejected": -14.6875, "step": 9980 }, { "epoch": 3.2915980230642505, "grad_norm": 0.1654398870102455, "learning_rate": 1.771828665568369e-07, "logits/chosen": -1.076562523841858, "logits/rejected": -1.4890625476837158, "logps/chosen": -337.04998779296875, "logps/rejected": -446.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.6180663108825684, "rewards/margins": 12.143750190734863, "rewards/rejected": -14.765625, "step": 9990 }, { "epoch": 3.2948929159802307, "grad_norm": 0.14410782421183443, "learning_rate": 1.763591433278418e-07, "logits/chosen": -1.1570312976837158, "logits/rejected": -1.3408203125, "logps/chosen": -329.3500061035156, "logps/rejected": -438.95001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.25506591796875, "rewards/margins": 11.359375, "rewards/rejected": -13.615625381469727, "step": 10000 }, { "epoch": 3.298187808896211, "grad_norm": 0.07962995609535488, "learning_rate": 1.755354200988468e-07, "logits/chosen": -1.1490967273712158, "logits/rejected": -1.4619140625, "logps/chosen": -341.5, "logps/rejected": -437.20001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.602343797683716, "rewards/margins": 11.59375, "rewards/rejected": -14.190625190734863, "step": 10010 }, { "epoch": 3.301482701812191, "grad_norm": 0.05197290672478411, "learning_rate": 1.7471169686985172e-07, "logits/chosen": -1.0613281726837158, "logits/rejected": -1.4050781726837158, "logps/chosen": -420.20001220703125, "logps/rejected": -516.7000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.165539503097534, "rewards/margins": 12.550000190734863, "rewards/rejected": -14.715624809265137, "step": 10020 }, { "epoch": 3.3047775947281712, "grad_norm": 0.6679709648660035, "learning_rate": 1.7388797364085667e-07, "logits/chosen": -1.100000023841858, "logits/rejected": -1.3904297351837158, "logps/chosen": -359.1000061035156, "logps/rejected": -484.8999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.9361329078674316, "rewards/margins": 11.828125, "rewards/rejected": -14.765625, "step": 10030 }, { "epoch": 3.3080724876441514, "grad_norm": 0.573874700264213, "learning_rate": 1.730642504118616e-07, "logits/chosen": -1.052099585533142, "logits/rejected": -1.4462890625, "logps/chosen": -397.0, "logps/rejected": -453.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.6451172828674316, "rewards/margins": 11.609375, "rewards/rejected": -14.262499809265137, "step": 10040 }, { "epoch": 3.3113673805601316, "grad_norm": 0.16785300403644818, "learning_rate": 1.7224052718286654e-07, "logits/chosen": -1.207763671875, "logits/rejected": -1.4402344226837158, "logps/chosen": -329.8500061035156, "logps/rejected": -447.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.136157274246216, "rewards/margins": 11.690625190734863, "rewards/rejected": -13.840624809265137, "step": 10050 }, { "epoch": 3.314662273476112, "grad_norm": 0.17844194400424346, "learning_rate": 1.7141680395387147e-07, "logits/chosen": -1.001684546470642, "logits/rejected": -1.406640648841858, "logps/chosen": -375.5, "logps/rejected": -418.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.798071265220642, "rewards/margins": 11.446874618530273, "rewards/rejected": -13.25, "step": 10060 }, { "epoch": 3.3179571663920924, "grad_norm": 0.00613817966111882, "learning_rate": 1.7059308072487645e-07, "logits/chosen": -1.100000023841858, "logits/rejected": -1.4621093273162842, "logps/chosen": -384.8500061035156, "logps/rejected": -456.70001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.526928663253784, "rewards/margins": 12.021875381469727, "rewards/rejected": -14.537500381469727, "step": 10070 }, { "epoch": 3.3212520593080725, "grad_norm": 3.023266874022267, "learning_rate": 1.6976935749588137e-07, "logits/chosen": -1.0583984851837158, "logits/rejected": -1.4587891101837158, "logps/chosen": -352.75, "logps/rejected": -439.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.649243116378784, "rewards/margins": 11.321874618530273, "rewards/rejected": -13.978124618530273, "step": 10080 }, { "epoch": 3.3245469522240527, "grad_norm": 0.21091667999727026, "learning_rate": 1.6894563426688632e-07, "logits/chosen": -1.04248046875, "logits/rejected": -1.361328125, "logps/chosen": -353.04998779296875, "logps/rejected": -436.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.369335889816284, "rewards/margins": 11.78125, "rewards/rejected": -14.15625, "step": 10090 }, { "epoch": 3.327841845140033, "grad_norm": 0.3157686946214414, "learning_rate": 1.6812191103789125e-07, "logits/chosen": -1.1876952648162842, "logits/rejected": -1.448828101158142, "logps/chosen": -351.25, "logps/rejected": -448.20001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7412109375, "rewards/margins": 11.574999809265137, "rewards/rejected": -14.3125, "step": 10100 }, { "epoch": 3.331136738056013, "grad_norm": 0.06277040457534684, "learning_rate": 1.672981878088962e-07, "logits/chosen": -1.1824219226837158, "logits/rejected": -1.471289038658142, "logps/chosen": -363.3500061035156, "logps/rejected": -463.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.5677733421325684, "rewards/margins": 11.912500381469727, "rewards/rejected": -14.46875, "step": 10110 }, { "epoch": 3.3344316309719932, "grad_norm": 0.08977723400284665, "learning_rate": 1.6647446457990112e-07, "logits/chosen": -1.1837890148162842, "logits/rejected": -1.547265648841858, "logps/chosen": -348.6000061035156, "logps/rejected": -432.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.736328125, "rewards/margins": 12.162500381469727, "rewards/rejected": -14.90625, "step": 10120 }, { "epoch": 3.337726523887974, "grad_norm": 0.11409845305238525, "learning_rate": 1.656507413509061e-07, "logits/chosen": -1.076208472251892, "logits/rejected": -1.3391602039337158, "logps/chosen": -398.0, "logps/rejected": -485.70001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.324511766433716, "rewards/margins": 11.565625190734863, "rewards/rejected": -13.875, "step": 10130 }, { "epoch": 3.341021416803954, "grad_norm": 0.4102448421633541, "learning_rate": 1.6482701812191103e-07, "logits/chosen": -1.17578125, "logits/rejected": -1.4796874523162842, "logps/chosen": -362.25, "logps/rejected": -465.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.877734422683716, "rewards/margins": 12.346875190734863, "rewards/rejected": -15.225000381469727, "step": 10140 }, { "epoch": 3.344316309719934, "grad_norm": 0.030371237630258223, "learning_rate": 1.6400329489291598e-07, "logits/chosen": -1.1018555164337158, "logits/rejected": -1.335546851158142, "logps/chosen": -373.04998779296875, "logps/rejected": -425.8999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.5943360328674316, "rewards/margins": 11.165624618530273, "rewards/rejected": -13.759374618530273, "step": 10150 }, { "epoch": 3.3476112026359144, "grad_norm": 0.12570319055527215, "learning_rate": 1.631795716639209e-07, "logits/chosen": -1.151953101158142, "logits/rejected": -1.5515625476837158, "logps/chosen": -358.04998779296875, "logps/rejected": -439.70001220703125, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.01171875, "rewards/margins": 11.518750190734863, "rewards/rejected": -14.524999618530273, "step": 10160 }, { "epoch": 3.3509060955518946, "grad_norm": 0.908702213668279, "learning_rate": 1.6235584843492585e-07, "logits/chosen": -1.306054711341858, "logits/rejected": -1.5750000476837158, "logps/chosen": -389.6000061035156, "logps/rejected": -483.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0458006858825684, "rewards/margins": 12.268750190734863, "rewards/rejected": -15.3125, "step": 10170 }, { "epoch": 3.3542009884678747, "grad_norm": 0.174286321368989, "learning_rate": 1.6153212520593078e-07, "logits/chosen": -1.180078148841858, "logits/rejected": -1.314062476158142, "logps/chosen": -358.75, "logps/rejected": -423.6000061035156, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.8824219703674316, "rewards/margins": 11.056249618530273, "rewards/rejected": -13.946874618530273, "step": 10180 }, { "epoch": 3.357495881383855, "grad_norm": 0.17461899972419404, "learning_rate": 1.6070840197693576e-07, "logits/chosen": -1.1853516101837158, "logits/rejected": -1.561914086341858, "logps/chosen": -358.75, "logps/rejected": -447.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0223145484924316, "rewards/margins": 11.946874618530273, "rewards/rejected": -14.971875190734863, "step": 10190 }, { "epoch": 3.360790774299835, "grad_norm": 0.209452554603807, "learning_rate": 1.5988467874794068e-07, "logits/chosen": -1.131250023841858, "logits/rejected": -1.436132788658142, "logps/chosen": -363.70001220703125, "logps/rejected": -464.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.9248046875, "rewards/margins": 11.759374618530273, "rewards/rejected": -14.681249618530273, "step": 10200 }, { "epoch": 3.3640856672158153, "grad_norm": 0.12427323235528709, "learning_rate": 1.5906095551894563e-07, "logits/chosen": -1.1896483898162842, "logits/rejected": -1.553125023841858, "logps/chosen": -360.0, "logps/rejected": -444.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.659960985183716, "rewards/margins": 11.831250190734863, "rewards/rejected": -14.481249809265137, "step": 10210 }, { "epoch": 3.367380560131796, "grad_norm": 0.07830452147589836, "learning_rate": 1.5823723228995056e-07, "logits/chosen": -1.1707031726837158, "logits/rejected": -1.543554663658142, "logps/chosen": -342.3999938964844, "logps/rejected": -441.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8482422828674316, "rewards/margins": 12.265625, "rewards/rejected": -15.109375, "step": 10220 }, { "epoch": 3.370675453047776, "grad_norm": 0.2533660742180293, "learning_rate": 1.574135090609555e-07, "logits/chosen": -1.11328125, "logits/rejected": -1.5988280773162842, "logps/chosen": -298.95001220703125, "logps/rejected": -388.1499938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.1166014671325684, "rewards/margins": 11.621874809265137, "rewards/rejected": -14.743749618530273, "step": 10230 }, { "epoch": 3.3739703459637562, "grad_norm": 0.385657903518016, "learning_rate": 1.5658978583196043e-07, "logits/chosen": -1.2121093273162842, "logits/rejected": -1.526757836341858, "logps/chosen": -328.6499938964844, "logps/rejected": -441.04998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.515625, "rewards/margins": 11.378125190734863, "rewards/rejected": -14.881250381469727, "step": 10240 }, { "epoch": 3.3772652388797364, "grad_norm": 0.11212979159007458, "learning_rate": 1.557660626029654e-07, "logits/chosen": -1.0689697265625, "logits/rejected": -1.4373047351837158, "logps/chosen": -337.3999938964844, "logps/rejected": -446.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0728516578674316, "rewards/margins": 11.740625381469727, "rewards/rejected": -14.796875, "step": 10250 }, { "epoch": 3.3805601317957166, "grad_norm": 0.05650187371644122, "learning_rate": 1.5494233937397034e-07, "logits/chosen": -1.1183593273162842, "logits/rejected": -1.457617163658142, "logps/chosen": -380.20001220703125, "logps/rejected": -475.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.020703077316284, "rewards/margins": 11.928125381469727, "rewards/rejected": -14.946874618530273, "step": 10260 }, { "epoch": 3.3838550247116967, "grad_norm": 0.23339625400483874, "learning_rate": 1.541186161449753e-07, "logits/chosen": -1.0715820789337158, "logits/rejected": -1.384765625, "logps/chosen": -361.45001220703125, "logps/rejected": -457.6000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.1097655296325684, "rewards/margins": 11.649999618530273, "rewards/rejected": -14.756250381469727, "step": 10270 }, { "epoch": 3.387149917627677, "grad_norm": 0.1383979528688858, "learning_rate": 1.5329489291598021e-07, "logits/chosen": -1.231054663658142, "logits/rejected": -1.4582030773162842, "logps/chosen": -418.95001220703125, "logps/rejected": -488.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0418457984924316, "rewards/margins": 12.065625190734863, "rewards/rejected": -15.109375, "step": 10280 }, { "epoch": 3.3904448105436575, "grad_norm": 0.41193700115149107, "learning_rate": 1.5247116968698517e-07, "logits/chosen": -1.1649169921875, "logits/rejected": -1.454687476158142, "logps/chosen": -390.8500061035156, "logps/rejected": -471.6000061035156, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.1773438453674316, "rewards/margins": 12.024999618530273, "rewards/rejected": -15.21875, "step": 10290 }, { "epoch": 3.3937397034596377, "grad_norm": 4.565839305056687, "learning_rate": 1.516474464579901e-07, "logits/chosen": -1.2158203125, "logits/rejected": -1.475976586341858, "logps/chosen": -342.8999938964844, "logps/rejected": -436.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.9222655296325684, "rewards/margins": 11.3125, "rewards/rejected": -14.234375, "step": 10300 }, { "epoch": 3.397034596375618, "grad_norm": 0.04881045239641464, "learning_rate": 1.5082372322899507e-07, "logits/chosen": -1.1096680164337158, "logits/rejected": -1.437890648841858, "logps/chosen": -378.79998779296875, "logps/rejected": -484.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.5107421875, "rewards/margins": 11.990625381469727, "rewards/rejected": -14.503125190734863, "step": 10310 }, { "epoch": 3.400329489291598, "grad_norm": 0.061643002312942556, "learning_rate": 1.5e-07, "logits/chosen": -1.2454102039337158, "logits/rejected": -1.494531273841858, "logps/chosen": -375.0, "logps/rejected": -473.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.129687547683716, "rewards/margins": 12.346875190734863, "rewards/rejected": -15.481249809265137, "step": 10320 }, { "epoch": 3.4036243822075782, "grad_norm": 0.29497941466629535, "learning_rate": 1.4917627677100494e-07, "logits/chosen": -1.1525146961212158, "logits/rejected": -1.3664062023162842, "logps/chosen": -355.20001220703125, "logps/rejected": -450.1499938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.022167921066284, "rewards/margins": 11.440625190734863, "rewards/rejected": -14.459375381469727, "step": 10330 }, { "epoch": 3.4069192751235584, "grad_norm": 0.06757690035499934, "learning_rate": 1.4835255354200987e-07, "logits/chosen": -1.1823241710662842, "logits/rejected": -1.339453101158142, "logps/chosen": -365.45001220703125, "logps/rejected": -464.1000061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.5087890625, "rewards/margins": 12.068750381469727, "rewards/rejected": -14.581250190734863, "step": 10340 }, { "epoch": 3.4102141680395386, "grad_norm": 0.05987189294793467, "learning_rate": 1.4752883031301482e-07, "logits/chosen": -1.360937476158142, "logits/rejected": -1.5578124523162842, "logps/chosen": -376.3500061035156, "logps/rejected": -501.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.2339844703674316, "rewards/margins": 12.234375, "rewards/rejected": -15.471875190734863, "step": 10350 }, { "epoch": 3.4135090609555188, "grad_norm": 0.19780956602629826, "learning_rate": 1.4670510708401975e-07, "logits/chosen": -1.1760742664337158, "logits/rejected": -1.509765625, "logps/chosen": -334.6000061035156, "logps/rejected": -429.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.4599609375, "rewards/margins": 11.550000190734863, "rewards/rejected": -14.015625, "step": 10360 }, { "epoch": 3.416803953871499, "grad_norm": 0.08243072762828618, "learning_rate": 1.4588138385502472e-07, "logits/chosen": -1.159326195716858, "logits/rejected": -1.564062476158142, "logps/chosen": -358.70001220703125, "logps/rejected": -431.5, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.569140672683716, "rewards/margins": 11.771875381469727, "rewards/rejected": -14.346875190734863, "step": 10370 }, { "epoch": 3.4200988467874796, "grad_norm": 0.27623160153301696, "learning_rate": 1.4505766062602965e-07, "logits/chosen": -1.135351538658142, "logits/rejected": -1.4580078125, "logps/chosen": -348.8500061035156, "logps/rejected": -463.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0748047828674316, "rewards/margins": 11.643750190734863, "rewards/rejected": -14.709375381469727, "step": 10380 }, { "epoch": 3.4233937397034597, "grad_norm": 0.23029273323424213, "learning_rate": 1.442339373970346e-07, "logits/chosen": -1.1921875476837158, "logits/rejected": -1.550878882408142, "logps/chosen": -378.54998779296875, "logps/rejected": -435.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.0718750953674316, "rewards/margins": 11.71875, "rewards/rejected": -13.78125, "step": 10390 }, { "epoch": 3.42668863261944, "grad_norm": 0.31900165925167207, "learning_rate": 1.4341021416803953e-07, "logits/chosen": -1.1015136241912842, "logits/rejected": -1.36181640625, "logps/chosen": -331.8999938964844, "logps/rejected": -404.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.10791015625, "rewards/margins": 11.399999618530273, "rewards/rejected": -13.509374618530273, "step": 10400 }, { "epoch": 3.42998352553542, "grad_norm": 0.17546014855592912, "learning_rate": 1.4258649093904448e-07, "logits/chosen": -1.128515601158142, "logits/rejected": -1.405859351158142, "logps/chosen": -352.54998779296875, "logps/rejected": -453.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.469531297683716, "rewards/margins": 11.331250190734863, "rewards/rejected": -13.803125381469727, "step": 10410 }, { "epoch": 3.4332784184514002, "grad_norm": 0.4987902414147298, "learning_rate": 1.417627677100494e-07, "logits/chosen": -1.171875, "logits/rejected": -1.4259765148162842, "logps/chosen": -396.5, "logps/rejected": -485.8999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.6626954078674316, "rewards/margins": 11.90625, "rewards/rejected": -14.568750381469727, "step": 10420 }, { "epoch": 3.4365733113673804, "grad_norm": 0.0965889384403122, "learning_rate": 1.4093904448105438e-07, "logits/chosen": -1.04443359375, "logits/rejected": -1.3878905773162842, "logps/chosen": -336.3999938964844, "logps/rejected": -441.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.4126954078674316, "rewards/margins": 10.653124809265137, "rewards/rejected": -13.0625, "step": 10430 }, { "epoch": 3.4398682042833606, "grad_norm": 0.1785397571125647, "learning_rate": 1.401153212520593e-07, "logits/chosen": -0.9726318120956421, "logits/rejected": -1.3552734851837158, "logps/chosen": -380.20001220703125, "logps/rejected": -460.79998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.5086913108825684, "rewards/margins": 11.515625, "rewards/rejected": -14.024999618530273, "step": 10440 }, { "epoch": 3.443163097199341, "grad_norm": 0.06983003050599461, "learning_rate": 1.3929159802306426e-07, "logits/chosen": -1.2013671398162842, "logits/rejected": -1.466210961341858, "logps/chosen": -412.45001220703125, "logps/rejected": -451.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.2919921875, "rewards/margins": 11.603124618530273, "rewards/rejected": -13.899999618530273, "step": 10450 }, { "epoch": 3.4464579901153214, "grad_norm": 0.3133121316491681, "learning_rate": 1.3846787479406918e-07, "logits/chosen": -1.036230444908142, "logits/rejected": -1.4113280773162842, "logps/chosen": -367.5, "logps/rejected": -482.20001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.6725096702575684, "rewards/margins": 12.028124809265137, "rewards/rejected": -14.703125, "step": 10460 }, { "epoch": 3.4497528830313016, "grad_norm": 0.12058814401095327, "learning_rate": 1.3764415156507413e-07, "logits/chosen": -1.2800781726837158, "logits/rejected": -1.473046898841858, "logps/chosen": -314.3999938964844, "logps/rejected": -458.6000061035156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.0738282203674316, "rewards/margins": 11.737500190734863, "rewards/rejected": -14.800000190734863, "step": 10470 }, { "epoch": 3.4530477759472817, "grad_norm": 0.23186912768506826, "learning_rate": 1.3682042833607906e-07, "logits/chosen": -1.213891625404358, "logits/rejected": -1.4328124523162842, "logps/chosen": -366.25, "logps/rejected": -453.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.444531202316284, "rewards/margins": 11.653124809265137, "rewards/rejected": -14.096875190734863, "step": 10480 }, { "epoch": 3.456342668863262, "grad_norm": 0.07561043425432182, "learning_rate": 1.3599670510708403e-07, "logits/chosen": -1.102148413658142, "logits/rejected": -1.2976562976837158, "logps/chosen": -342.8500061035156, "logps/rejected": -466.1000061035156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.8645262718200684, "rewards/margins": 11.574999809265137, "rewards/rejected": -14.428125381469727, "step": 10490 }, { "epoch": 3.459637561779242, "grad_norm": 9.707182320888927, "learning_rate": 1.3517298187808896e-07, "logits/chosen": -1.072265625, "logits/rejected": -1.4140625, "logps/chosen": -356.6499938964844, "logps/rejected": -455.5, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.37890625, "rewards/margins": 12.059374809265137, "rewards/rejected": -15.449999809265137, "step": 10500 }, { "epoch": 3.4629324546952223, "grad_norm": 0.3516772768073101, "learning_rate": 1.343492586490939e-07, "logits/chosen": -1.1150391101837158, "logits/rejected": -1.3634765148162842, "logps/chosen": -343.5, "logps/rejected": -440.8500061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.210156202316284, "rewards/margins": 11.903124809265137, "rewards/rejected": -15.115625381469727, "step": 10510 }, { "epoch": 3.466227347611203, "grad_norm": 0.7358578721599869, "learning_rate": 1.3352553542009884e-07, "logits/chosen": -1.0728271007537842, "logits/rejected": -1.3811523914337158, "logps/chosen": -372.04998779296875, "logps/rejected": -476.3999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.814697265625, "rewards/margins": 12.103124618530273, "rewards/rejected": -14.912500381469727, "step": 10520 }, { "epoch": 3.469522240527183, "grad_norm": 0.3818268691647434, "learning_rate": 1.327018121911038e-07, "logits/chosen": -1.137304663658142, "logits/rejected": -1.396691918373108, "logps/chosen": -360.1499938964844, "logps/rejected": -465.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.5755858421325684, "rewards/margins": 11.428125381469727, "rewards/rejected": -14.009374618530273, "step": 10530 }, { "epoch": 3.4728171334431632, "grad_norm": 0.49359299938891826, "learning_rate": 1.318780889621087e-07, "logits/chosen": -1.1103515625, "logits/rejected": -1.479882836341858, "logps/chosen": -346.6499938964844, "logps/rejected": -432.8999938964844, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.205078125, "rewards/margins": 11.559374809265137, "rewards/rejected": -14.768750190734863, "step": 10540 }, { "epoch": 3.4761120263591434, "grad_norm": 1.66866265183372, "learning_rate": 1.310543657331137e-07, "logits/chosen": -1.0947265625, "logits/rejected": -1.486718773841858, "logps/chosen": -312.1000061035156, "logps/rejected": -386.70001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.301953077316284, "rewards/margins": 11.628125190734863, "rewards/rejected": -14.915624618530273, "step": 10550 }, { "epoch": 3.4794069192751236, "grad_norm": 1.2837770103620465, "learning_rate": 1.3023064250411862e-07, "logits/chosen": -1.0036132335662842, "logits/rejected": -1.433007836341858, "logps/chosen": -418.8500061035156, "logps/rejected": -437.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.274609327316284, "rewards/margins": 11.675000190734863, "rewards/rejected": -14.956250190734863, "step": 10560 }, { "epoch": 3.4827018121911038, "grad_norm": 1.128812202614614, "learning_rate": 1.2940691927512357e-07, "logits/chosen": -0.9681152105331421, "logits/rejected": -1.266992211341858, "logps/chosen": -365.75, "logps/rejected": -447.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.725781202316284, "rewards/margins": 12.081250190734863, "rewards/rejected": -14.806249618530273, "step": 10570 }, { "epoch": 3.485996705107084, "grad_norm": 0.3367922362381448, "learning_rate": 1.285831960461285e-07, "logits/chosen": -1.274804711341858, "logits/rejected": -1.4386718273162842, "logps/chosen": -392.95001220703125, "logps/rejected": -467.8999938964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.5259766578674316, "rewards/margins": 12.324999809265137, "rewards/rejected": -14.84375, "step": 10580 }, { "epoch": 3.489291598023064, "grad_norm": 0.09182600376143278, "learning_rate": 1.2775947281713342e-07, "logits/chosen": -1.021093726158142, "logits/rejected": -1.235937476158142, "logps/chosen": -382.6000061035156, "logps/rejected": -465.79998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0646729469299316, "rewards/margins": 12.568750381469727, "rewards/rejected": -15.621874809265137, "step": 10590 }, { "epoch": 3.4925864909390443, "grad_norm": 0.46369633833904034, "learning_rate": 1.2693574958813837e-07, "logits/chosen": -0.98663330078125, "logits/rejected": -1.3254883289337158, "logps/chosen": -350.5, "logps/rejected": -440.79998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.1478271484375, "rewards/margins": 11.506250381469727, "rewards/rejected": -14.643750190734863, "step": 10600 }, { "epoch": 3.495881383855025, "grad_norm": 0.07752714439991601, "learning_rate": 1.2611202635914332e-07, "logits/chosen": -1.167578101158142, "logits/rejected": -1.269140601158142, "logps/chosen": -333.6499938964844, "logps/rejected": -405.70001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.305468797683716, "rewards/margins": 11.778124809265137, "rewards/rejected": -14.09375, "step": 10610 }, { "epoch": 3.499176276771005, "grad_norm": 0.1853654314986208, "learning_rate": 1.2528830313014827e-07, "logits/chosen": -0.95263671875, "logits/rejected": -1.3271484375, "logps/chosen": -405.25, "logps/rejected": -463.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.911328077316284, "rewards/margins": 11.340624809265137, "rewards/rejected": -14.25, "step": 10620 }, { "epoch": 3.5024711696869852, "grad_norm": 0.16635025181573893, "learning_rate": 1.244645799011532e-07, "logits/chosen": -1.128320336341858, "logits/rejected": -1.3466796875, "logps/chosen": -380.95001220703125, "logps/rejected": -451.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.3394532203674316, "rewards/margins": 11.675000190734863, "rewards/rejected": -14.003125190734863, "step": 10630 }, { "epoch": 3.5057660626029654, "grad_norm": 0.09537995751968269, "learning_rate": 1.2364085667215815e-07, "logits/chosen": -1.027124047279358, "logits/rejected": -1.4197266101837158, "logps/chosen": -335.70001220703125, "logps/rejected": -440.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.836230516433716, "rewards/margins": 11.815625190734863, "rewards/rejected": -14.640625, "step": 10640 }, { "epoch": 3.5090609555189456, "grad_norm": 0.0754715389452113, "learning_rate": 1.228171334431631e-07, "logits/chosen": -1.140625, "logits/rejected": -1.3693358898162842, "logps/chosen": -368.70001220703125, "logps/rejected": -447.6000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.214062452316284, "rewards/margins": 11.934374809265137, "rewards/rejected": -14.143750190734863, "step": 10650 }, { "epoch": 3.5123558484349258, "grad_norm": 0.046349127165010835, "learning_rate": 1.2199341021416802e-07, "logits/chosen": -1.016699194908142, "logits/rejected": -1.371679663658142, "logps/chosen": -334.79998779296875, "logps/rejected": -456.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.364062547683716, "rewards/margins": 12.65625, "rewards/rejected": -15.018750190734863, "step": 10660 }, { "epoch": 3.515650741350906, "grad_norm": 0.6822071790801998, "learning_rate": 1.2116968698517297e-07, "logits/chosen": -0.9355224370956421, "logits/rejected": -1.359375, "logps/chosen": -353.04998779296875, "logps/rejected": -445.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.6291871070861816, "rewards/margins": 12.268750190734863, "rewards/rejected": -14.887499809265137, "step": 10670 }, { "epoch": 3.5189456342668866, "grad_norm": 0.30492171710816, "learning_rate": 1.2034596375617793e-07, "logits/chosen": -0.992431640625, "logits/rejected": -1.207128882408142, "logps/chosen": -368.3999938964844, "logps/rejected": -461.6499938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.149462938308716, "rewards/margins": 11.893750190734863, "rewards/rejected": -14.043749809265137, "step": 10680 }, { "epoch": 3.5222405271828663, "grad_norm": 0.17790716262037673, "learning_rate": 1.1952224052718285e-07, "logits/chosen": -1.1515624523162842, "logits/rejected": -1.497460961341858, "logps/chosen": -354.3999938964844, "logps/rejected": -445.3500061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7203612327575684, "rewards/margins": 12.356249809265137, "rewards/rejected": -15.074999809265137, "step": 10690 }, { "epoch": 3.525535420098847, "grad_norm": 0.0388516878946852, "learning_rate": 1.186985172981878e-07, "logits/chosen": -1.130273461341858, "logits/rejected": -1.3896484375, "logps/chosen": -398.2250061035156, "logps/rejected": -490.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8128905296325684, "rewards/margins": 12.690625190734863, "rewards/rejected": -15.509374618530273, "step": 10700 }, { "epoch": 3.528830313014827, "grad_norm": 0.07039424031914461, "learning_rate": 1.1787479406919275e-07, "logits/chosen": -1.1067016124725342, "logits/rejected": -1.3582031726837158, "logps/chosen": -338.92498779296875, "logps/rejected": -448.70001220703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.006054639816284, "rewards/margins": 11.431249618530273, "rewards/rejected": -14.446874618530273, "step": 10710 }, { "epoch": 3.5321252059308073, "grad_norm": 0.08702758601777243, "learning_rate": 1.1705107084019769e-07, "logits/chosen": -1.0939452648162842, "logits/rejected": -1.352636694908142, "logps/chosen": -363.70001220703125, "logps/rejected": -470.1000061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.1175780296325684, "rewards/margins": 11.203125, "rewards/rejected": -14.324999809265137, "step": 10720 }, { "epoch": 3.5354200988467874, "grad_norm": 0.3337395798820264, "learning_rate": 1.1622734761120263e-07, "logits/chosen": -1.170507788658142, "logits/rejected": -1.4416015148162842, "logps/chosen": -352.8500061035156, "logps/rejected": -430.79998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.74755859375, "rewards/margins": 11.462499618530273, "rewards/rejected": -14.21875, "step": 10730 }, { "epoch": 3.5387149917627676, "grad_norm": 0.06957577047897417, "learning_rate": 1.1540362438220758e-07, "logits/chosen": -1.0730469226837158, "logits/rejected": -1.404296875, "logps/chosen": -369.3999938964844, "logps/rejected": -454.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.575091600418091, "rewards/margins": 11.8125, "rewards/rejected": -14.384374618530273, "step": 10740 }, { "epoch": 3.5420098846787478, "grad_norm": 3.985452195306721, "learning_rate": 1.1457990115321252e-07, "logits/chosen": -1.1876952648162842, "logits/rejected": -1.373632788658142, "logps/chosen": -362.95001220703125, "logps/rejected": -473.79998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.6455078125, "rewards/margins": 11.318750381469727, "rewards/rejected": -13.965624809265137, "step": 10750 }, { "epoch": 3.545304777594728, "grad_norm": 0.13232596499824267, "learning_rate": 1.1375617792421746e-07, "logits/chosen": -1.0867187976837158, "logits/rejected": -1.4025390148162842, "logps/chosen": -354.45001220703125, "logps/rejected": -463.70001220703125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.2308592796325684, "rewards/margins": 11.462499618530273, "rewards/rejected": -14.699999809265137, "step": 10760 }, { "epoch": 3.5485996705107086, "grad_norm": 0.3859282427642236, "learning_rate": 1.1293245469522241e-07, "logits/chosen": -1.097265601158142, "logits/rejected": -1.461328148841858, "logps/chosen": -379.45001220703125, "logps/rejected": -485.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0728516578674316, "rewards/margins": 12.212499618530273, "rewards/rejected": -15.278124809265137, "step": 10770 }, { "epoch": 3.5518945634266887, "grad_norm": 0.4254231719874009, "learning_rate": 1.1210873146622735e-07, "logits/chosen": -1.131445288658142, "logits/rejected": -1.455664038658142, "logps/chosen": -381.5, "logps/rejected": -434.70001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.375195264816284, "rewards/margins": 11.399999618530273, "rewards/rejected": -13.787500381469727, "step": 10780 }, { "epoch": 3.555189456342669, "grad_norm": 0.11578333105638816, "learning_rate": 1.1128500823723229e-07, "logits/chosen": -1.0232422351837158, "logits/rejected": -1.310449242591858, "logps/chosen": -341.8500061035156, "logps/rejected": -440.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.531445264816284, "rewards/margins": 11.265625, "rewards/rejected": -13.793749809265137, "step": 10790 }, { "epoch": 3.558484349258649, "grad_norm": 0.16849007298884008, "learning_rate": 1.1046128500823724e-07, "logits/chosen": -0.9985595941543579, "logits/rejected": -1.237890601158142, "logps/chosen": -336.29998779296875, "logps/rejected": -425.29998779296875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.6773438453674316, "rewards/margins": 11.584375381469727, "rewards/rejected": -14.259374618530273, "step": 10800 }, { "epoch": 3.5617792421746293, "grad_norm": 0.43644421931238025, "learning_rate": 1.0963756177924218e-07, "logits/chosen": -1.114160180091858, "logits/rejected": -1.478515625, "logps/chosen": -356.25, "logps/rejected": -446.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.34423828125, "rewards/margins": 11.759374618530273, "rewards/rejected": -14.096875190734863, "step": 10810 }, { "epoch": 3.5650741350906094, "grad_norm": 0.04868294630060617, "learning_rate": 1.0881383855024711e-07, "logits/chosen": -0.916015625, "logits/rejected": -1.2605469226837158, "logps/chosen": -328.75, "logps/rejected": -424.6000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.3919920921325684, "rewards/margins": 11.421875, "rewards/rejected": -13.821874618530273, "step": 10820 }, { "epoch": 3.5683690280065896, "grad_norm": 0.09402596088977125, "learning_rate": 1.0799011532125206e-07, "logits/chosen": -0.9640136957168579, "logits/rejected": -1.3230469226837158, "logps/chosen": -360.25, "logps/rejected": -440.3999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.6773438453674316, "rewards/margins": 11.828125, "rewards/rejected": -14.515625, "step": 10830 }, { "epoch": 3.5716639209225702, "grad_norm": 0.029883292050662046, "learning_rate": 1.07166392092257e-07, "logits/chosen": -0.86083984375, "logits/rejected": -1.2839844226837158, "logps/chosen": -346.5, "logps/rejected": -447.29998779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.8388671875, "rewards/margins": 11.559374809265137, "rewards/rejected": -14.409375190734863, "step": 10840 }, { "epoch": 3.5749588138385504, "grad_norm": 0.668340749608954, "learning_rate": 1.0634266886326194e-07, "logits/chosen": -0.843157947063446, "logits/rejected": -1.259179711341858, "logps/chosen": -358.0, "logps/rejected": -433.5, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.642578125, "rewards/margins": 11.903124809265137, "rewards/rejected": -14.537500381469727, "step": 10850 }, { "epoch": 3.5782537067545306, "grad_norm": 0.6993985451487514, "learning_rate": 1.0551894563426689e-07, "logits/chosen": -1.072021484375, "logits/rejected": -1.3800780773162842, "logps/chosen": -313.1499938964844, "logps/rejected": -421.8999938964844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.84912109375, "rewards/margins": 11.584375381469727, "rewards/rejected": -14.418749809265137, "step": 10860 }, { "epoch": 3.5815485996705108, "grad_norm": 0.9127675568472786, "learning_rate": 1.0469522240527183e-07, "logits/chosen": -1.1335937976837158, "logits/rejected": -1.2131836414337158, "logps/chosen": -372.70001220703125, "logps/rejected": -468.1000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.4569334983825684, "rewards/margins": 11.921875, "rewards/rejected": -14.387499809265137, "step": 10870 }, { "epoch": 3.584843492586491, "grad_norm": 0.386602424071016, "learning_rate": 1.0387149917627677e-07, "logits/chosen": -1.0420410633087158, "logits/rejected": -1.3815429210662842, "logps/chosen": -315.0, "logps/rejected": -446.6000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.897656202316284, "rewards/margins": 11.643750190734863, "rewards/rejected": -14.53125, "step": 10880 }, { "epoch": 3.588138385502471, "grad_norm": 0.09678771346229793, "learning_rate": 1.0304777594728172e-07, "logits/chosen": -1.237695336341858, "logits/rejected": -1.3712890148162842, "logps/chosen": -328.5, "logps/rejected": -461.3999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.0083985328674316, "rewards/margins": 11.840624809265137, "rewards/rejected": -14.850000381469727, "step": 10890 }, { "epoch": 3.5914332784184513, "grad_norm": 0.9604532929833038, "learning_rate": 1.0222405271828666e-07, "logits/chosen": -1.045312523841858, "logits/rejected": -1.55078125, "logps/chosen": -360.29998779296875, "logps/rejected": -434.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0901856422424316, "rewards/margins": 11.850000381469727, "rewards/rejected": -14.940625190734863, "step": 10900 }, { "epoch": 3.594728171334432, "grad_norm": 0.09282666164694071, "learning_rate": 1.014003294892916e-07, "logits/chosen": -0.901611328125, "logits/rejected": -1.2726562023162842, "logps/chosen": -359.6499938964844, "logps/rejected": -467.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8822264671325684, "rewards/margins": 12.459375381469727, "rewards/rejected": -15.356249809265137, "step": 10910 }, { "epoch": 3.5980230642504116, "grad_norm": 0.25016920810806764, "learning_rate": 1.0057660626029655e-07, "logits/chosen": -1.0830078125, "logits/rejected": -1.3357422351837158, "logps/chosen": -334.70001220703125, "logps/rejected": -416.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.962109327316284, "rewards/margins": 12.009374618530273, "rewards/rejected": -14.965624809265137, "step": 10920 }, { "epoch": 3.6013179571663922, "grad_norm": 0.08874485312251143, "learning_rate": 9.975288303130147e-08, "logits/chosen": -1.0504882335662842, "logits/rejected": -1.382421851158142, "logps/chosen": -378.70001220703125, "logps/rejected": -471.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.2152099609375, "rewards/margins": 12.040624618530273, "rewards/rejected": -15.25, "step": 10930 }, { "epoch": 3.6046128500823724, "grad_norm": 0.22606134806415218, "learning_rate": 9.892915980230641e-08, "logits/chosen": -1.0310547351837158, "logits/rejected": -1.4656250476837158, "logps/chosen": -395.45001220703125, "logps/rejected": -470.3500061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.5210938453674316, "rewards/margins": 11.568750381469727, "rewards/rejected": -14.074999809265137, "step": 10940 }, { "epoch": 3.6079077429983526, "grad_norm": 0.3070399438893611, "learning_rate": 9.810543657331136e-08, "logits/chosen": -1.0862305164337158, "logits/rejected": -1.318750023841858, "logps/chosen": -385.6000061035156, "logps/rejected": -473.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.94189453125, "rewards/margins": 11.690625190734863, "rewards/rejected": -14.615625381469727, "step": 10950 }, { "epoch": 3.6112026359143328, "grad_norm": 0.34339225954167535, "learning_rate": 9.72817133443163e-08, "logits/chosen": -0.989086925983429, "logits/rejected": -1.255273461341858, "logps/chosen": -364.6499938964844, "logps/rejected": -448.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.819140672683716, "rewards/margins": 11.518750190734863, "rewards/rejected": -14.331250190734863, "step": 10960 }, { "epoch": 3.614497528830313, "grad_norm": 0.23017706725926826, "learning_rate": 9.645799011532124e-08, "logits/chosen": -1.123437523841858, "logits/rejected": -1.5263671875, "logps/chosen": -345.20001220703125, "logps/rejected": -460.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.5289063453674316, "rewards/margins": 11.946874618530273, "rewards/rejected": -15.487500190734863, "step": 10970 }, { "epoch": 3.617792421746293, "grad_norm": 0.13759410141691025, "learning_rate": 9.563426688632619e-08, "logits/chosen": -1.0539062023162842, "logits/rejected": -1.213281273841858, "logps/chosen": -374.04998779296875, "logps/rejected": -483.20001220703125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.2190918922424316, "rewards/margins": 12.449999809265137, "rewards/rejected": -14.678125381469727, "step": 10980 }, { "epoch": 3.6210873146622733, "grad_norm": 0.03227307114327764, "learning_rate": 9.481054365733113e-08, "logits/chosen": -0.9888671636581421, "logits/rejected": -1.4660155773162842, "logps/chosen": -340.1499938964844, "logps/rejected": -429.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.490917921066284, "rewards/margins": 11.662500381469727, "rewards/rejected": -14.143750190734863, "step": 10990 }, { "epoch": 3.624382207578254, "grad_norm": 0.23973767658347597, "learning_rate": 9.398682042833607e-08, "logits/chosen": -1.0322265625, "logits/rejected": -1.328710913658142, "logps/chosen": -332.5, "logps/rejected": -440.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.447070360183716, "rewards/margins": 12.637499809265137, "rewards/rejected": -15.087499618530273, "step": 11000 }, { "epoch": 3.627677100494234, "grad_norm": 0.037574970340734364, "learning_rate": 9.316309719934102e-08, "logits/chosen": -1.16162109375, "logits/rejected": -1.345800757408142, "logps/chosen": -374.04998779296875, "logps/rejected": -464.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.789843797683716, "rewards/margins": 11.609375, "rewards/rejected": -14.396875381469727, "step": 11010 }, { "epoch": 3.6309719934102143, "grad_norm": 0.16727131403572515, "learning_rate": 9.233937397034596e-08, "logits/chosen": -1.203222632408142, "logits/rejected": -1.2351562976837158, "logps/chosen": -385.75, "logps/rejected": -480.70001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.5660157203674316, "rewards/margins": 12.153124809265137, "rewards/rejected": -14.71875, "step": 11020 }, { "epoch": 3.6342668863261944, "grad_norm": 0.1313795012151527, "learning_rate": 9.15156507413509e-08, "logits/chosen": -1.0632812976837158, "logits/rejected": -1.4416015148162842, "logps/chosen": -369.0, "logps/rejected": -441.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.63720703125, "rewards/margins": 11.931249618530273, "rewards/rejected": -14.553125381469727, "step": 11030 }, { "epoch": 3.6375617792421746, "grad_norm": 0.2279505071140006, "learning_rate": 9.069192751235585e-08, "logits/chosen": -1.065576195716858, "logits/rejected": -1.380859375, "logps/chosen": -445.54998779296875, "logps/rejected": -520.7999877929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.755078077316284, "rewards/margins": 11.746874809265137, "rewards/rejected": -14.496874809265137, "step": 11040 }, { "epoch": 3.640856672158155, "grad_norm": 0.04321385742935459, "learning_rate": 8.986820428336078e-08, "logits/chosen": -1.0675780773162842, "logits/rejected": -1.5048828125, "logps/chosen": -346.3999938964844, "logps/rejected": -441.79998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.7406249046325684, "rewards/margins": 11.78125, "rewards/rejected": -14.543749809265137, "step": 11050 }, { "epoch": 3.644151565074135, "grad_norm": 0.22872393474691224, "learning_rate": 8.904448105436572e-08, "logits/chosen": -1.0949218273162842, "logits/rejected": -1.390234351158142, "logps/chosen": -340.75, "logps/rejected": -413.8999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.603222608566284, "rewards/margins": 12.165624618530273, "rewards/rejected": -14.762499809265137, "step": 11060 }, { "epoch": 3.6474464579901156, "grad_norm": 0.5610095573765219, "learning_rate": 8.822075782537067e-08, "logits/chosen": -1.07373046875, "logits/rejected": -1.3796875476837158, "logps/chosen": -359.8999938964844, "logps/rejected": -475.3999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.7421875, "rewards/margins": 11.806249618530273, "rewards/rejected": -14.556249618530273, "step": 11070 }, { "epoch": 3.6507413509060953, "grad_norm": 0.177708342603855, "learning_rate": 8.739703459637561e-08, "logits/chosen": -1.149804711341858, "logits/rejected": -1.4074218273162842, "logps/chosen": -377.70001220703125, "logps/rejected": -442.45001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.819531202316284, "rewards/margins": 11.356249809265137, "rewards/rejected": -14.165624618530273, "step": 11080 }, { "epoch": 3.654036243822076, "grad_norm": 0.8097222272472574, "learning_rate": 8.657331136738055e-08, "logits/chosen": -1.1056396961212158, "logits/rejected": -1.400390625, "logps/chosen": -321.79998779296875, "logps/rejected": -458.70001220703125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.2228760719299316, "rewards/margins": 12.162500381469727, "rewards/rejected": -15.396875381469727, "step": 11090 }, { "epoch": 3.657331136738056, "grad_norm": 6.477274872822532, "learning_rate": 8.57495881383855e-08, "logits/chosen": -1.060156226158142, "logits/rejected": -1.3498046398162842, "logps/chosen": -320.0, "logps/rejected": -428.70001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.1854491233825684, "rewards/margins": 11.731249809265137, "rewards/rejected": -13.912500381469727, "step": 11100 }, { "epoch": 3.6606260296540363, "grad_norm": 0.05174515159663581, "learning_rate": 8.492586490939044e-08, "logits/chosen": -1.151953101158142, "logits/rejected": -1.319921851158142, "logps/chosen": -393.20001220703125, "logps/rejected": -475.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.165820360183716, "rewards/margins": 11.743749618530273, "rewards/rejected": -14.912500381469727, "step": 11110 }, { "epoch": 3.6639209225700164, "grad_norm": 0.2569885485793858, "learning_rate": 8.410214168039538e-08, "logits/chosen": -1.0714843273162842, "logits/rejected": -1.3942382335662842, "logps/chosen": -389.70001220703125, "logps/rejected": -479.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.838085889816284, "rewards/margins": 12.459375381469727, "rewards/rejected": -15.300000190734863, "step": 11120 }, { "epoch": 3.6672158154859966, "grad_norm": 0.18843710550796286, "learning_rate": 8.327841845140033e-08, "logits/chosen": -0.9576171636581421, "logits/rejected": -1.3015625476837158, "logps/chosen": -368.04998779296875, "logps/rejected": -454.6499938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.360546827316284, "rewards/margins": 12.253125190734863, "rewards/rejected": -14.625, "step": 11130 }, { "epoch": 3.670510708401977, "grad_norm": 13.688410213270982, "learning_rate": 8.245469522240527e-08, "logits/chosen": -0.870849609375, "logits/rejected": -1.3302733898162842, "logps/chosen": -337.04998779296875, "logps/rejected": -425.5, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.589648485183716, "rewards/margins": 12.071874618530273, "rewards/rejected": -14.659375190734863, "step": 11140 }, { "epoch": 3.673805601317957, "grad_norm": 0.48431265127049206, "learning_rate": 8.16309719934102e-08, "logits/chosen": -0.949902355670929, "logits/rejected": -1.3515625, "logps/chosen": -390.25, "logps/rejected": -480.1000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.8169922828674316, "rewards/margins": 12.478124618530273, "rewards/rejected": -15.296875, "step": 11150 }, { "epoch": 3.6771004942339376, "grad_norm": 0.30740199555942704, "learning_rate": 8.080724876441516e-08, "logits/chosen": -1.033056616783142, "logits/rejected": -1.3923828601837158, "logps/chosen": -388.04998779296875, "logps/rejected": -466.6000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.9170899391174316, "rewards/margins": 12.340624809265137, "rewards/rejected": -15.240625381469727, "step": 11160 }, { "epoch": 3.6803953871499178, "grad_norm": 0.1093492592975276, "learning_rate": 7.99835255354201e-08, "logits/chosen": -1.0314452648162842, "logits/rejected": -1.3357422351837158, "logps/chosen": -323.45001220703125, "logps/rejected": -434.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.192187547683716, "rewards/margins": 12.040624618530273, "rewards/rejected": -15.231249809265137, "step": 11170 }, { "epoch": 3.683690280065898, "grad_norm": 0.02258117002013833, "learning_rate": 7.915980230642503e-08, "logits/chosen": -1.1130859851837158, "logits/rejected": -1.433203101158142, "logps/chosen": -397.04998779296875, "logps/rejected": -484.8999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.5250000953674316, "rewards/margins": 12.46875, "rewards/rejected": -16.006250381469727, "step": 11180 }, { "epoch": 3.686985172981878, "grad_norm": 0.6596910161179975, "learning_rate": 7.833607907742998e-08, "logits/chosen": -0.8675781488418579, "logits/rejected": -1.3435547351837158, "logps/chosen": -373.79998779296875, "logps/rejected": -426.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.5511717796325684, "rewards/margins": 12.296875, "rewards/rejected": -14.856249809265137, "step": 11190 }, { "epoch": 3.6902800658978583, "grad_norm": 0.1512458364337547, "learning_rate": 7.751235584843492e-08, "logits/chosen": -0.9830077886581421, "logits/rejected": -1.342382788658142, "logps/chosen": -412.79998779296875, "logps/rejected": -456.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.502148389816284, "rewards/margins": 11.762499809265137, "rewards/rejected": -14.259374618530273, "step": 11200 }, { "epoch": 3.6935749588138385, "grad_norm": 1.8627363574674718, "learning_rate": 7.668863261943986e-08, "logits/chosen": -1.0636718273162842, "logits/rejected": -1.3679687976837158, "logps/chosen": -401.54998779296875, "logps/rejected": -478.6000061035156, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -2.91015625, "rewards/margins": 11.290624618530273, "rewards/rejected": -14.196874618530273, "step": 11210 }, { "epoch": 3.6968698517298186, "grad_norm": 0.5198343703221839, "learning_rate": 7.586490939044481e-08, "logits/chosen": -0.9058593511581421, "logits/rejected": -1.381250023841858, "logps/chosen": -350.98748779296875, "logps/rejected": -455.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.4325194358825684, "rewards/margins": 12.115625381469727, "rewards/rejected": -14.559374809265137, "step": 11220 }, { "epoch": 3.7001647446457993, "grad_norm": 0.1477750403496408, "learning_rate": 7.504118616144975e-08, "logits/chosen": -1.172265648841858, "logits/rejected": -1.409570336341858, "logps/chosen": -349.54998779296875, "logps/rejected": -454.20001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.101757764816284, "rewards/margins": 12.490625381469727, "rewards/rejected": -15.600000381469727, "step": 11230 }, { "epoch": 3.703459637561779, "grad_norm": 3.3267003328940086, "learning_rate": 7.421746293245469e-08, "logits/chosen": -1.0701172351837158, "logits/rejected": -1.406640648841858, "logps/chosen": -322.5, "logps/rejected": -429.3999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.744921922683716, "rewards/margins": 11.899999618530273, "rewards/rejected": -14.640625, "step": 11240 }, { "epoch": 3.7067545304777596, "grad_norm": 2.8650899155432, "learning_rate": 7.339373970345964e-08, "logits/chosen": -1.1018555164337158, "logits/rejected": -1.27197265625, "logps/chosen": -375.6499938964844, "logps/rejected": -501.3999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.7933592796325684, "rewards/margins": 11.862500190734863, "rewards/rejected": -14.665624618530273, "step": 11250 }, { "epoch": 3.7100494233937398, "grad_norm": 3.918605068724528, "learning_rate": 7.257001647446458e-08, "logits/chosen": -1.08544921875, "logits/rejected": -1.44140625, "logps/chosen": -354.20001220703125, "logps/rejected": -464.3999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.287304639816284, "rewards/margins": 11.912500381469727, "rewards/rejected": -15.215624809265137, "step": 11260 }, { "epoch": 3.71334431630972, "grad_norm": 0.3036106124342894, "learning_rate": 7.174629324546952e-08, "logits/chosen": -1.0397460460662842, "logits/rejected": -1.30859375, "logps/chosen": -390.20001220703125, "logps/rejected": -441.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.616015672683716, "rewards/margins": 11.756250381469727, "rewards/rejected": -14.375, "step": 11270 }, { "epoch": 3.7166392092257, "grad_norm": 1.0991545277431953, "learning_rate": 7.092257001647447e-08, "logits/chosen": -1.0764648914337158, "logits/rejected": -1.515625, "logps/chosen": -349.125, "logps/rejected": -440.6000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.2646484375, "rewards/margins": 11.737500190734863, "rewards/rejected": -14.012499809265137, "step": 11280 }, { "epoch": 3.7199341021416803, "grad_norm": 2.2039304111302895, "learning_rate": 7.00988467874794e-08, "logits/chosen": -0.9844726324081421, "logits/rejected": -1.3298828601837158, "logps/chosen": -378.3500061035156, "logps/rejected": -479.1000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.5252928733825684, "rewards/margins": 12.296875, "rewards/rejected": -14.818750381469727, "step": 11290 }, { "epoch": 3.7232289950576605, "grad_norm": 0.24367374091999805, "learning_rate": 6.927512355848434e-08, "logits/chosen": -1.12353515625, "logits/rejected": -1.3073241710662842, "logps/chosen": -356.04998779296875, "logps/rejected": -449.3999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.593701124191284, "rewards/margins": 11.809374809265137, "rewards/rejected": -14.393750190734863, "step": 11300 }, { "epoch": 3.7265238879736406, "grad_norm": 0.12778134033266628, "learning_rate": 6.84514003294893e-08, "logits/chosen": -1.0725586414337158, "logits/rejected": -1.415429711341858, "logps/chosen": -353.1499938964844, "logps/rejected": -412.29998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.445361375808716, "rewards/margins": 11.059374809265137, "rewards/rejected": -13.5, "step": 11310 }, { "epoch": 3.7298187808896213, "grad_norm": 0.022470639778854443, "learning_rate": 6.762767710049423e-08, "logits/chosen": -0.976269543170929, "logits/rejected": -1.4630858898162842, "logps/chosen": -350.20001220703125, "logps/rejected": -478.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.781445264816284, "rewards/margins": 12.287500381469727, "rewards/rejected": -15.068750381469727, "step": 11320 }, { "epoch": 3.7331136738056014, "grad_norm": 0.10753205453445909, "learning_rate": 6.680395387149917e-08, "logits/chosen": -1.036108374595642, "logits/rejected": -1.499414086341858, "logps/chosen": -316.45001220703125, "logps/rejected": -399.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.8042969703674316, "rewards/margins": 12.068750381469727, "rewards/rejected": -14.878125190734863, "step": 11330 }, { "epoch": 3.7364085667215816, "grad_norm": 0.22506631770037389, "learning_rate": 6.598023064250412e-08, "logits/chosen": -1.1160156726837158, "logits/rejected": -1.470703125, "logps/chosen": -331.6000061035156, "logps/rejected": -471.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.5298829078674316, "rewards/margins": 12.600000381469727, "rewards/rejected": -15.128125190734863, "step": 11340 }, { "epoch": 3.739703459637562, "grad_norm": 0.1745522888121762, "learning_rate": 6.515650741350906e-08, "logits/chosen": -1.01251220703125, "logits/rejected": -1.429101586341858, "logps/chosen": -338.45001220703125, "logps/rejected": -456.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.7759766578674316, "rewards/margins": 12.378125190734863, "rewards/rejected": -15.146875381469727, "step": 11350 }, { "epoch": 3.742998352553542, "grad_norm": 0.17158592053501764, "learning_rate": 6.4332784184514e-08, "logits/chosen": -1.075952172279358, "logits/rejected": -1.3224608898162842, "logps/chosen": -361.95001220703125, "logps/rejected": -457.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7294921875, "rewards/margins": 12.056249618530273, "rewards/rejected": -14.793749809265137, "step": 11360 }, { "epoch": 3.746293245469522, "grad_norm": 0.058463800817594086, "learning_rate": 6.350906095551895e-08, "logits/chosen": -1.097753882408142, "logits/rejected": -1.3585937023162842, "logps/chosen": -361.20001220703125, "logps/rejected": -462.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.291516065597534, "rewards/margins": 12.203125, "rewards/rejected": -14.503125190734863, "step": 11370 }, { "epoch": 3.7495881383855023, "grad_norm": 0.04585103545889218, "learning_rate": 6.268533772652389e-08, "logits/chosen": -1.1609375476837158, "logits/rejected": -1.408593773841858, "logps/chosen": -369.54998779296875, "logps/rejected": -453.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.398242235183716, "rewards/margins": 12.384374618530273, "rewards/rejected": -14.778124809265137, "step": 11380 }, { "epoch": 3.752883031301483, "grad_norm": 0.3007420944185624, "learning_rate": 6.186161449752883e-08, "logits/chosen": -1.0172851085662842, "logits/rejected": -1.402929663658142, "logps/chosen": -351.54998779296875, "logps/rejected": -465.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.749746799468994, "rewards/margins": 11.828125, "rewards/rejected": -14.568750381469727, "step": 11390 }, { "epoch": 3.7561779242174627, "grad_norm": 0.0815579474118848, "learning_rate": 6.103789126853378e-08, "logits/chosen": -0.919628918170929, "logits/rejected": -1.3107421398162842, "logps/chosen": -323.79998779296875, "logps/rejected": -436.29998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.6939454078674316, "rewards/margins": 11.653124809265137, "rewards/rejected": -14.353124618530273, "step": 11400 }, { "epoch": 3.7594728171334433, "grad_norm": 0.34451808054862104, "learning_rate": 6.021416803953872e-08, "logits/chosen": -0.98583984375, "logits/rejected": -1.255859375, "logps/chosen": -364.6499938964844, "logps/rejected": -482.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.728808641433716, "rewards/margins": 12.190625190734863, "rewards/rejected": -14.921875, "step": 11410 }, { "epoch": 3.7627677100494235, "grad_norm": 0.07177060080489217, "learning_rate": 5.9390444810543655e-08, "logits/chosen": -0.990283191204071, "logits/rejected": -1.3590819835662842, "logps/chosen": -351.79998779296875, "logps/rejected": -448.75, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.726757764816284, "rewards/margins": 11.962499618530273, "rewards/rejected": -14.684374809265137, "step": 11420 }, { "epoch": 3.7660626029654036, "grad_norm": 0.10412941007214409, "learning_rate": 5.85667215815486e-08, "logits/chosen": -1.130761742591858, "logits/rejected": -1.407812476158142, "logps/chosen": -393.3999938964844, "logps/rejected": -465.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.357421875, "rewards/margins": 11.981249809265137, "rewards/rejected": -15.328125, "step": 11430 }, { "epoch": 3.769357495881384, "grad_norm": 0.07389718042300845, "learning_rate": 5.774299835255354e-08, "logits/chosen": -1.0224609375, "logits/rejected": -1.36328125, "logps/chosen": -355.1000061035156, "logps/rejected": -441.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.661914110183716, "rewards/margins": 11.793749809265137, "rewards/rejected": -14.456250190734863, "step": 11440 }, { "epoch": 3.772652388797364, "grad_norm": 0.6038235108113545, "learning_rate": 5.6919275123558476e-08, "logits/chosen": -0.9952636957168579, "logits/rejected": -1.331445336341858, "logps/chosen": -369.0, "logps/rejected": -430.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.8851561546325684, "rewards/margins": 11.703125, "rewards/rejected": -14.578125, "step": 11450 }, { "epoch": 3.775947281713344, "grad_norm": 1.3672996970550215, "learning_rate": 5.609555189456342e-08, "logits/chosen": -1.112451195716858, "logits/rejected": -1.2869141101837158, "logps/chosen": -368.25, "logps/rejected": -512.5999755859375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.862499952316284, "rewards/margins": 12.528124809265137, "rewards/rejected": -15.375, "step": 11460 }, { "epoch": 3.7792421746293243, "grad_norm": 0.04385798502988559, "learning_rate": 5.5271828665568366e-08, "logits/chosen": -1.0802733898162842, "logits/rejected": -1.48828125, "logps/chosen": -400.8999938964844, "logps/rejected": -456.79998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0765624046325684, "rewards/margins": 12.168749809265137, "rewards/rejected": -15.25, "step": 11470 }, { "epoch": 3.782537067545305, "grad_norm": 0.08607384371771853, "learning_rate": 5.4448105436573304e-08, "logits/chosen": -1.139746069908142, "logits/rejected": -1.339453101158142, "logps/chosen": -425.0, "logps/rejected": -494.8999938964844, "loss": 0.0053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.064062595367432, "rewards/margins": 11.78125, "rewards/rejected": -15.84375, "step": 11480 }, { "epoch": 3.785831960461285, "grad_norm": 0.15008964609647973, "learning_rate": 5.362438220757825e-08, "logits/chosen": -0.945996105670929, "logits/rejected": -1.384179711341858, "logps/chosen": -405.3999938964844, "logps/rejected": -490.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.9332032203674316, "rewards/margins": 12.365625381469727, "rewards/rejected": -15.296875, "step": 11490 }, { "epoch": 3.7891268533772653, "grad_norm": 0.07574737663786098, "learning_rate": 5.2800658978583193e-08, "logits/chosen": -1.1145508289337158, "logits/rejected": -1.4912109375, "logps/chosen": -369.79998779296875, "logps/rejected": -448.79998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.134765625, "rewards/margins": 11.837499618530273, "rewards/rejected": -14.965624809265137, "step": 11500 }, { "epoch": 3.7924217462932455, "grad_norm": 0.018934553650424738, "learning_rate": 5.197693574958813e-08, "logits/chosen": -0.9842773675918579, "logits/rejected": -1.380468726158142, "logps/chosen": -334.1000061035156, "logps/rejected": -471.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.3304686546325684, "rewards/margins": 12.515625, "rewards/rejected": -15.856249809265137, "step": 11510 }, { "epoch": 3.7957166392092256, "grad_norm": 0.3786215377063364, "learning_rate": 5.1153212520593076e-08, "logits/chosen": -0.927539050579071, "logits/rejected": -1.3826172351837158, "logps/chosen": -351.54998779296875, "logps/rejected": -456.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.879101514816284, "rewards/margins": 11.631250381469727, "rewards/rejected": -14.509374618530273, "step": 11520 }, { "epoch": 3.799011532125206, "grad_norm": 0.04672967961951158, "learning_rate": 5.032948929159802e-08, "logits/chosen": -1.048608422279358, "logits/rejected": -1.423242211341858, "logps/chosen": -332.8999938964844, "logps/rejected": -467.6000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.081249952316284, "rewards/margins": 12.34375, "rewards/rejected": -15.431249618530273, "step": 11530 }, { "epoch": 3.802306425041186, "grad_norm": 0.09923248710198052, "learning_rate": 4.950576606260296e-08, "logits/chosen": -1.2021484375, "logits/rejected": -1.4328124523162842, "logps/chosen": -352.6000061035156, "logps/rejected": -480.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.327929735183716, "rewards/margins": 12.199999809265137, "rewards/rejected": -15.524999618530273, "step": 11540 }, { "epoch": 3.8056013179571666, "grad_norm": 0.05331045185881388, "learning_rate": 4.8682042833607904e-08, "logits/chosen": -1.0962402820587158, "logits/rejected": -1.408203125, "logps/chosen": -392.45001220703125, "logps/rejected": -515.2000122070312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.268359422683716, "rewards/margins": 12.137499809265137, "rewards/rejected": -15.396875381469727, "step": 11550 }, { "epoch": 3.808896210873147, "grad_norm": 0.37513928710149463, "learning_rate": 4.785831960461285e-08, "logits/chosen": -1.114160180091858, "logits/rejected": -1.2039062976837158, "logps/chosen": -346.1499938964844, "logps/rejected": -462.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.23681640625, "rewards/margins": 11.946874618530273, "rewards/rejected": -15.190625190734863, "step": 11560 }, { "epoch": 3.812191103789127, "grad_norm": 0.23813594021806783, "learning_rate": 4.703459637561779e-08, "logits/chosen": -1.086523413658142, "logits/rejected": -1.4402344226837158, "logps/chosen": -377.3999938964844, "logps/rejected": -466.8999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.8944334983825684, "rewards/margins": 12.712499618530273, "rewards/rejected": -15.609375, "step": 11570 }, { "epoch": 3.815485996705107, "grad_norm": 6.68303169262533, "learning_rate": 4.621087314662273e-08, "logits/chosen": -1.041113257408142, "logits/rejected": -1.41015625, "logps/chosen": -354.70001220703125, "logps/rejected": -483.70001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.0980467796325684, "rewards/margins": 12.734375, "rewards/rejected": -15.828125, "step": 11580 }, { "epoch": 3.8187808896210873, "grad_norm": 0.029038101735594766, "learning_rate": 4.5387149917627677e-08, "logits/chosen": -0.942065417766571, "logits/rejected": -1.337499976158142, "logps/chosen": -330.3999938964844, "logps/rejected": -447.1000061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.5035157203674316, "rewards/margins": 12.262499809265137, "rewards/rejected": -14.771875381469727, "step": 11590 }, { "epoch": 3.8220757825370675, "grad_norm": 0.8983508449065214, "learning_rate": 4.4563426688632615e-08, "logits/chosen": -1.161523461341858, "logits/rejected": -1.5285155773162842, "logps/chosen": -351.3500061035156, "logps/rejected": -462.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.012500047683716, "rewards/margins": 12.309374809265137, "rewards/rejected": -15.306249618530273, "step": 11600 }, { "epoch": 3.8253706754530477, "grad_norm": 0.9312673962226599, "learning_rate": 4.373970345963756e-08, "logits/chosen": -1.1044921875, "logits/rejected": -1.3225586414337158, "logps/chosen": -352.04998779296875, "logps/rejected": -457.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.1500000953674316, "rewards/margins": 12.125, "rewards/rejected": -15.262499809265137, "step": 11610 }, { "epoch": 3.8286655683690283, "grad_norm": 0.4001540880701042, "learning_rate": 4.2915980230642504e-08, "logits/chosen": -1.198828101158142, "logits/rejected": -1.4660155773162842, "logps/chosen": -356.3500061035156, "logps/rejected": -457.3999938964844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -3.0601563453674316, "rewards/margins": 12.012499809265137, "rewards/rejected": -15.074999809265137, "step": 11620 }, { "epoch": 3.831960461285008, "grad_norm": 0.18621593235568853, "learning_rate": 4.209225700164744e-08, "logits/chosen": -0.9969726800918579, "logits/rejected": -1.333398461341858, "logps/chosen": -395.79998779296875, "logps/rejected": -519.4000244140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0673828125, "rewards/margins": 11.850000381469727, "rewards/rejected": -14.915624618530273, "step": 11630 }, { "epoch": 3.8352553542009886, "grad_norm": 0.12506949153369826, "learning_rate": 4.126853377265239e-08, "logits/chosen": -1.032470703125, "logits/rejected": -1.468164086341858, "logps/chosen": -329.5, "logps/rejected": -419.1000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.683789014816284, "rewards/margins": 11.543749809265137, "rewards/rejected": -14.221875190734863, "step": 11640 }, { "epoch": 3.838550247116969, "grad_norm": 0.09755632688876154, "learning_rate": 4.044481054365733e-08, "logits/chosen": -1.062646508216858, "logits/rejected": -1.414648413658142, "logps/chosen": -372.6499938964844, "logps/rejected": -445.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.899609327316284, "rewards/margins": 11.859375, "rewards/rejected": -14.756250381469727, "step": 11650 }, { "epoch": 3.841845140032949, "grad_norm": 0.03964274952558809, "learning_rate": 3.962108731466227e-08, "logits/chosen": -0.9507812261581421, "logits/rejected": -1.321874976158142, "logps/chosen": -357.1000061035156, "logps/rejected": -464.6000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.729785203933716, "rewards/margins": 12.206250190734863, "rewards/rejected": -14.9375, "step": 11660 }, { "epoch": 3.845140032948929, "grad_norm": 0.1426736317075618, "learning_rate": 3.8797364085667215e-08, "logits/chosen": -1.0876953601837158, "logits/rejected": -1.4435546398162842, "logps/chosen": -385.0, "logps/rejected": -456.6000061035156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.715625047683716, "rewards/margins": 12.146875381469727, "rewards/rejected": -14.862500190734863, "step": 11670 }, { "epoch": 3.8484349258649093, "grad_norm": 0.03097025640244107, "learning_rate": 3.797364085667216e-08, "logits/chosen": -1.044531226158142, "logits/rejected": -1.3310546875, "logps/chosen": -342.3999938964844, "logps/rejected": -460.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.50634765625, "rewards/margins": 12.053125381469727, "rewards/rejected": -14.565625190734863, "step": 11680 }, { "epoch": 3.8517298187808895, "grad_norm": 0.01645882085144323, "learning_rate": 3.71499176276771e-08, "logits/chosen": -1.115332007408142, "logits/rejected": -1.4832031726837158, "logps/chosen": -385.54998779296875, "logps/rejected": -493.5, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.235156297683716, "rewards/margins": 11.931249618530273, "rewards/rejected": -15.165624618530273, "step": 11690 }, { "epoch": 3.8550247116968697, "grad_norm": 0.13187499089359073, "learning_rate": 3.632619439868204e-08, "logits/chosen": -1.075292944908142, "logits/rejected": -1.5105469226837158, "logps/chosen": -351.70001220703125, "logps/rejected": -501.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.32861328125, "rewards/margins": 12.146875381469727, "rewards/rejected": -15.465624809265137, "step": 11700 }, { "epoch": 3.8583196046128503, "grad_norm": 0.15195156005189298, "learning_rate": 3.550247116968699e-08, "logits/chosen": -1.082421898841858, "logits/rejected": -1.389062523841858, "logps/chosen": -363.1000061035156, "logps/rejected": -479.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.553906202316284, "rewards/margins": 12.03125, "rewards/rejected": -15.574999809265137, "step": 11710 }, { "epoch": 3.8616144975288305, "grad_norm": 0.2212420455990417, "learning_rate": 3.4678747940691926e-08, "logits/chosen": -0.993847668170929, "logits/rejected": -1.4578125476837158, "logps/chosen": -320.5, "logps/rejected": -459.3999938964844, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.4515624046325684, "rewards/margins": 11.696874618530273, "rewards/rejected": -15.146875381469727, "step": 11720 }, { "epoch": 3.8649093904448106, "grad_norm": 0.09396879870735328, "learning_rate": 3.385502471169687e-08, "logits/chosen": -1.106835961341858, "logits/rejected": -1.388671875, "logps/chosen": -346.04998779296875, "logps/rejected": -431.3500061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.842578172683716, "rewards/margins": 11.703125, "rewards/rejected": -14.550000190734863, "step": 11730 }, { "epoch": 3.868204283360791, "grad_norm": 0.11973318847766527, "learning_rate": 3.3031301482701815e-08, "logits/chosen": -1.101953148841858, "logits/rejected": -1.3564453125, "logps/chosen": -373.0, "logps/rejected": -479.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.9164061546325684, "rewards/margins": 12.393750190734863, "rewards/rejected": -15.318750381469727, "step": 11740 }, { "epoch": 3.871499176276771, "grad_norm": 9.099172727653801, "learning_rate": 3.2207578253706753e-08, "logits/chosen": -1.1623046398162842, "logits/rejected": -1.556249976158142, "logps/chosen": -330.0, "logps/rejected": -415.79998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.2914061546325684, "rewards/margins": 11.734375, "rewards/rejected": -15.028124809265137, "step": 11750 }, { "epoch": 3.874794069192751, "grad_norm": 0.07548664105692915, "learning_rate": 3.13838550247117e-08, "logits/chosen": -1.073828101158142, "logits/rejected": -1.4578125476837158, "logps/chosen": -350.29998779296875, "logps/rejected": -450.8999938964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.05078125, "rewards/margins": 12.456250190734863, "rewards/rejected": -15.506250381469727, "step": 11760 }, { "epoch": 3.8780889621087313, "grad_norm": 0.41731429820042737, "learning_rate": 3.0560131795716636e-08, "logits/chosen": -0.9384765625, "logits/rejected": -1.4177734851837158, "logps/chosen": -332.75, "logps/rejected": -436.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.020703077316284, "rewards/margins": 12.303125381469727, "rewards/rejected": -15.331250190734863, "step": 11770 }, { "epoch": 3.881383855024712, "grad_norm": 0.09890779746019654, "learning_rate": 2.973640856672158e-08, "logits/chosen": -0.991992175579071, "logits/rejected": -1.231054663658142, "logps/chosen": -317.375, "logps/rejected": -444.1000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.623828172683716, "rewards/margins": 12.381250381469727, "rewards/rejected": -15.009374618530273, "step": 11780 }, { "epoch": 3.8846787479406917, "grad_norm": 1.5660179840676145, "learning_rate": 2.8912685337726523e-08, "logits/chosen": -1.0217773914337158, "logits/rejected": -1.3654296398162842, "logps/chosen": -383.45001220703125, "logps/rejected": -491.8999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.9791016578674316, "rewards/margins": 12.131250381469727, "rewards/rejected": -15.112500190734863, "step": 11790 }, { "epoch": 3.8879736408566723, "grad_norm": 0.9879377950677273, "learning_rate": 2.8088962108731464e-08, "logits/chosen": -1.07861328125, "logits/rejected": -1.473046898841858, "logps/chosen": -317.25, "logps/rejected": -421.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.8623046875, "rewards/margins": 12.046875, "rewards/rejected": -14.915624618530273, "step": 11800 }, { "epoch": 3.8912685337726525, "grad_norm": 0.04755404188199403, "learning_rate": 2.726523887973641e-08, "logits/chosen": -1.1164062023162842, "logits/rejected": -1.365625023841858, "logps/chosen": -334.5, "logps/rejected": -421.1000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.059277296066284, "rewards/margins": 11.646875381469727, "rewards/rejected": -14.709375381469727, "step": 11810 }, { "epoch": 3.8945634266886326, "grad_norm": 0.05545035381311794, "learning_rate": 2.644151565074135e-08, "logits/chosen": -1.0908203125, "logits/rejected": -1.3875000476837158, "logps/chosen": -327.45001220703125, "logps/rejected": -449.8500061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.4175782203674316, "rewards/margins": 12.300000190734863, "rewards/rejected": -15.703125, "step": 11820 }, { "epoch": 3.897858319604613, "grad_norm": 0.15366928193267973, "learning_rate": 2.5617792421746292e-08, "logits/chosen": -1.046875, "logits/rejected": -1.275976538658142, "logps/chosen": -372.3999938964844, "logps/rejected": -476.8999938964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.701171875, "rewards/margins": 12.225000381469727, "rewards/rejected": -14.925000190734863, "step": 11830 }, { "epoch": 3.901153212520593, "grad_norm": 0.2714445284548862, "learning_rate": 2.4794069192751237e-08, "logits/chosen": -1.069433569908142, "logits/rejected": -1.3406250476837158, "logps/chosen": -361.25, "logps/rejected": -440.0, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7984375953674316, "rewards/margins": 11.734375, "rewards/rejected": -14.534375190734863, "step": 11840 }, { "epoch": 3.904448105436573, "grad_norm": 0.18364127271225228, "learning_rate": 2.3970345963756178e-08, "logits/chosen": -1.00634765625, "logits/rejected": -1.366601586341858, "logps/chosen": -346.29998779296875, "logps/rejected": -495.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.559375047683716, "rewards/margins": 12.625, "rewards/rejected": -16.1875, "step": 11850 }, { "epoch": 3.9077429983525533, "grad_norm": 0.03739956421339725, "learning_rate": 2.3146622734761116e-08, "logits/chosen": -1.169531226158142, "logits/rejected": -1.485742211341858, "logps/chosen": -333.8999938964844, "logps/rejected": -450.8999938964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.099609375, "rewards/margins": 12.221875190734863, "rewards/rejected": -15.315625190734863, "step": 11860 }, { "epoch": 3.911037891268534, "grad_norm": 0.338997750737495, "learning_rate": 2.232289950576606e-08, "logits/chosen": -1.158789038658142, "logits/rejected": -1.210839867591858, "logps/chosen": -323.0, "logps/rejected": -432.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.0609679222106934, "rewards/margins": 11.240625381469727, "rewards/rejected": -14.300000190734863, "step": 11870 }, { "epoch": 3.914332784184514, "grad_norm": 0.11055916678059347, "learning_rate": 2.1499176276771003e-08, "logits/chosen": -1.002294898033142, "logits/rejected": -1.4323241710662842, "logps/chosen": -346.3500061035156, "logps/rejected": -449.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.134570360183716, "rewards/margins": 12.300000190734863, "rewards/rejected": -15.431249618530273, "step": 11880 }, { "epoch": 3.9176276771004943, "grad_norm": 0.4107385466401725, "learning_rate": 2.0675453047775944e-08, "logits/chosen": -1.102148413658142, "logits/rejected": -1.5808594226837158, "logps/chosen": -348.45001220703125, "logps/rejected": -438.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.932421922683716, "rewards/margins": 12.334375381469727, "rewards/rejected": -15.271875381469727, "step": 11890 }, { "epoch": 3.9209225700164745, "grad_norm": 0.14772376738467466, "learning_rate": 1.985172981878089e-08, "logits/chosen": -0.9695800542831421, "logits/rejected": -1.365820288658142, "logps/chosen": -378.0, "logps/rejected": -487.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.1148438453674316, "rewards/margins": 12.084375381469727, "rewards/rejected": -15.196874618530273, "step": 11900 }, { "epoch": 3.9242174629324547, "grad_norm": 2.9778020957796616, "learning_rate": 1.902800658978583e-08, "logits/chosen": -0.869946300983429, "logits/rejected": -1.44970703125, "logps/chosen": -320.1000061035156, "logps/rejected": -451.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.073046922683716, "rewards/margins": 12.540624618530273, "rewards/rejected": -15.606249809265137, "step": 11910 }, { "epoch": 3.927512355848435, "grad_norm": 0.24088382688907914, "learning_rate": 1.8204283360790772e-08, "logits/chosen": -1.100683569908142, "logits/rejected": -1.3605468273162842, "logps/chosen": -367.25, "logps/rejected": -492.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.19921875, "rewards/margins": 12.403124809265137, "rewards/rejected": -15.590624809265137, "step": 11920 }, { "epoch": 3.930807248764415, "grad_norm": 0.5247582426720832, "learning_rate": 1.7380560131795717e-08, "logits/chosen": -0.9383789300918579, "logits/rejected": -1.2296874523162842, "logps/chosen": -380.75, "logps/rejected": -447.8999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.6019530296325684, "rewards/margins": 12.443750381469727, "rewards/rejected": -15.040624618530273, "step": 11930 }, { "epoch": 3.9341021416803956, "grad_norm": 0.13654765253862736, "learning_rate": 1.6556836902800658e-08, "logits/chosen": -0.927929699420929, "logits/rejected": -1.345703125, "logps/chosen": -403.3500061035156, "logps/rejected": -468.20001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.477343797683716, "rewards/margins": 11.365625381469727, "rewards/rejected": -14.850000381469727, "step": 11940 }, { "epoch": 3.9373970345963754, "grad_norm": 0.7922060717918293, "learning_rate": 1.57331136738056e-08, "logits/chosen": -1.061914086341858, "logits/rejected": -1.3748047351837158, "logps/chosen": -379.20001220703125, "logps/rejected": -465.1000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.0960936546325684, "rewards/margins": 12.0625, "rewards/rejected": -15.153124809265137, "step": 11950 }, { "epoch": 3.940691927512356, "grad_norm": 0.06365623084030082, "learning_rate": 1.4909390444810544e-08, "logits/chosen": -1.1005859375, "logits/rejected": -1.276953101158142, "logps/chosen": -368.79998779296875, "logps/rejected": -466.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.916332960128784, "rewards/margins": 12.175000190734863, "rewards/rejected": -15.081250190734863, "step": 11960 }, { "epoch": 3.943986820428336, "grad_norm": 0.565201099514534, "learning_rate": 1.4085667215815486e-08, "logits/chosen": -0.9607909917831421, "logits/rejected": -1.201269507408142, "logps/chosen": -316.3500061035156, "logps/rejected": -457.79998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.570385694503784, "rewards/margins": 12.324999809265137, "rewards/rejected": -14.896875381469727, "step": 11970 }, { "epoch": 3.9472817133443163, "grad_norm": 0.46235926946662387, "learning_rate": 1.3261943986820429e-08, "logits/chosen": -1.0168945789337158, "logits/rejected": -1.4611327648162842, "logps/chosen": -381.25, "logps/rejected": -494.29998779296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.753936767578125, "rewards/margins": 12.993749618530273, "rewards/rejected": -15.75, "step": 11980 }, { "epoch": 3.9505766062602965, "grad_norm": 0.044169561837062214, "learning_rate": 1.2438220757825369e-08, "logits/chosen": -1.1536133289337158, "logits/rejected": -1.5402343273162842, "logps/chosen": -344.1000061035156, "logps/rejected": -452.79998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1167969703674316, "rewards/margins": 12.946874618530273, "rewards/rejected": -16.068750381469727, "step": 11990 }, { "epoch": 3.9538714991762767, "grad_norm": 0.09753515464232107, "learning_rate": 1.1614497528830312e-08, "logits/chosen": -1.011132836341858, "logits/rejected": -1.2880859375, "logps/chosen": -351.0, "logps/rejected": -451.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.046191453933716, "rewards/margins": 12.340624809265137, "rewards/rejected": -15.384374618530273, "step": 12000 }, { "epoch": 3.957166392092257, "grad_norm": 0.0856957240939599, "learning_rate": 1.0790774299835255e-08, "logits/chosen": -1.038915991783142, "logits/rejected": -1.3738281726837158, "logps/chosen": -371.6499938964844, "logps/rejected": -467.20001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9886717796325684, "rewards/margins": 12.25, "rewards/rejected": -15.246874809265137, "step": 12010 }, { "epoch": 3.960461285008237, "grad_norm": 0.14863822517469558, "learning_rate": 9.967051070840197e-09, "logits/chosen": -1.068359375, "logits/rejected": -1.3816406726837158, "logps/chosen": -354.3999938964844, "logps/rejected": -446.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.600390672683716, "rewards/margins": 11.865625381469727, "rewards/rejected": -14.478124618530273, "step": 12020 }, { "epoch": 3.9637561779242176, "grad_norm": 0.1509099874551952, "learning_rate": 9.14332784184514e-09, "logits/chosen": -0.995898425579071, "logits/rejected": -1.3337891101837158, "logps/chosen": -380.79998779296875, "logps/rejected": -455.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.9794921875, "rewards/margins": 12.778124809265137, "rewards/rejected": -15.762499809265137, "step": 12030 }, { "epoch": 3.967051070840198, "grad_norm": 0.2711649795107698, "learning_rate": 8.319604612850083e-09, "logits/chosen": -1.109960913658142, "logits/rejected": -1.479882836341858, "logps/chosen": -389.45001220703125, "logps/rejected": -496.6000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.2152342796325684, "rewards/margins": 12.453125, "rewards/rejected": -15.671875, "step": 12040 }, { "epoch": 3.970345963756178, "grad_norm": 1.271897266311383, "learning_rate": 7.495881383855024e-09, "logits/chosen": -1.1220703125, "logits/rejected": -1.40655517578125, "logps/chosen": -302.45001220703125, "logps/rejected": -439.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7386717796325684, "rewards/margins": 11.600000381469727, "rewards/rejected": -14.350000381469727, "step": 12050 }, { "epoch": 3.973640856672158, "grad_norm": 0.8536517433137732, "learning_rate": 6.672158154859967e-09, "logits/chosen": -1.014379858970642, "logits/rejected": -1.510351538658142, "logps/chosen": -355.8999938964844, "logps/rejected": -463.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.34375, "rewards/margins": 12.237500190734863, "rewards/rejected": -15.578125, "step": 12060 }, { "epoch": 3.9769357495881383, "grad_norm": 0.026979697421122755, "learning_rate": 5.848434925864909e-09, "logits/chosen": -1.01025390625, "logits/rejected": -1.392968773841858, "logps/chosen": -369.5, "logps/rejected": -502.79998779296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4345703125, "rewards/margins": 12.696874618530273, "rewards/rejected": -16.128124237060547, "step": 12070 }, { "epoch": 3.9802306425041185, "grad_norm": 0.9472036958100655, "learning_rate": 5.024711696869852e-09, "logits/chosen": -0.890698254108429, "logits/rejected": -1.2751953601837158, "logps/chosen": -335.95001220703125, "logps/rejected": -431.5, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -2.553112745285034, "rewards/margins": 12.068750381469727, "rewards/rejected": -14.618749618530273, "step": 12080 }, { "epoch": 3.9835255354200987, "grad_norm": 0.053751743723942855, "learning_rate": 4.2009884678747935e-09, "logits/chosen": -1.0674316883087158, "logits/rejected": -1.3992187976837158, "logps/chosen": -379.25, "logps/rejected": -528.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.506640672683716, "rewards/margins": 12.331250190734863, "rewards/rejected": -15.840624809265137, "step": 12090 }, { "epoch": 3.9868204283360793, "grad_norm": 0.8643850291296545, "learning_rate": 3.3772652388797362e-09, "logits/chosen": -0.9620605707168579, "logits/rejected": -1.350195288658142, "logps/chosen": -338.75, "logps/rejected": -449.95001220703125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.9066405296325684, "rewards/margins": 12.175000190734863, "rewards/rejected": -15.078125, "step": 12100 }, { "epoch": 3.990115321252059, "grad_norm": 0.16328564911580545, "learning_rate": 2.5535420098846785e-09, "logits/chosen": -1.0193359851837158, "logits/rejected": -1.46484375, "logps/chosen": -363.8999938964844, "logps/rejected": -479.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.0274415016174316, "rewards/margins": 12.412500381469727, "rewards/rejected": -15.459375381469727, "step": 12110 }, { "epoch": 3.9934102141680397, "grad_norm": 0.36248668629933284, "learning_rate": 1.729818780889621e-09, "logits/chosen": -0.893994152545929, "logits/rejected": -1.2322266101837158, "logps/chosen": -385.29998779296875, "logps/rejected": -484.20001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.05572509765625, "rewards/margins": 11.956250190734863, "rewards/rejected": -15.015625, "step": 12120 }, { "epoch": 3.99670510708402, "grad_norm": 2.6327953635347168, "learning_rate": 9.060955518945634e-10, "logits/chosen": -1.1873047351837158, "logits/rejected": -1.2746093273162842, "logps/chosen": -370.79998779296875, "logps/rejected": -482.20001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.9476561546325684, "rewards/margins": 12.328125, "rewards/rejected": -15.284375190734863, "step": 12130 }, { "epoch": 4.0, "grad_norm": 0.2703576192855618, "learning_rate": 8.237232289950576e-11, "logits/chosen": -1.191796898841858, "logits/rejected": -1.3425781726837158, "logps/chosen": -385.5, "logps/rejected": -447.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2406249046325684, "rewards/margins": 11.649999618530273, "rewards/rejected": -14.884374618530273, "step": 12140 } ], "logging_steps": 10, "max_steps": 12140, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }