{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9939659901261657, "eval_steps": 500, "global_step": 302, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006582556226001097, "grad_norm": 2.453043920718015, "learning_rate": 3.2258064516129035e-07, "loss": 2.2482, "step": 1 }, { "epoch": 0.03291278113000549, "grad_norm": 2.0003115811727206, "learning_rate": 1.6129032258064516e-06, "loss": 2.0687, "step": 5 }, { "epoch": 0.06582556226001098, "grad_norm": 0.9576535879971833, "learning_rate": 3.225806451612903e-06, "loss": 1.6905, "step": 10 }, { "epoch": 0.09873834339001646, "grad_norm": 0.30116763306370925, "learning_rate": 4.838709677419355e-06, "loss": 1.0142, "step": 15 }, { "epoch": 0.13165112452002195, "grad_norm": 0.20223223062062798, "learning_rate": 6.451612903225806e-06, "loss": 0.7111, "step": 20 }, { "epoch": 0.16456390565002743, "grad_norm": 0.14657727394091244, "learning_rate": 8.064516129032258e-06, "loss": 0.6454, "step": 25 }, { "epoch": 0.1974766867800329, "grad_norm": 0.11208122922317201, "learning_rate": 9.67741935483871e-06, "loss": 0.5687, "step": 30 }, { "epoch": 0.2303894679100384, "grad_norm": 0.1073134953639841, "learning_rate": 9.99462543481167e-06, "loss": 0.5147, "step": 35 }, { "epoch": 0.2633022490400439, "grad_norm": 0.13688522912038725, "learning_rate": 9.9728110648995e-06, "loss": 0.4698, "step": 40 }, { "epoch": 0.29621503017004935, "grad_norm": 0.0918265619598427, "learning_rate": 9.934294192840518e-06, "loss": 0.4815, "step": 45 }, { "epoch": 0.32912781130005486, "grad_norm": 0.11050070883185271, "learning_rate": 9.879204187744036e-06, "loss": 0.453, "step": 50 }, { "epoch": 0.3620405924300603, "grad_norm": 0.07394441588683744, "learning_rate": 9.807726083973192e-06, "loss": 0.4228, "step": 55 }, { "epoch": 0.3949533735600658, "grad_norm": 0.09321757955302397, "learning_rate": 9.720099959658062e-06, "loss": 0.4392, "step": 60 }, { "epoch": 0.42786615469007133, "grad_norm": 0.09437542122654453, "learning_rate": 9.61662013032972e-06, "loss": 0.4304, "step": 65 }, { "epoch": 0.4607789358200768, "grad_norm": 0.07578178267682123, "learning_rate": 9.497634160383627e-06, "loss": 0.4391, "step": 70 }, { "epoch": 0.4936917169500823, "grad_norm": 0.07062146422446726, "learning_rate": 9.36354169569261e-06, "loss": 0.4149, "step": 75 }, { "epoch": 0.5266044980800878, "grad_norm": 0.07563993059966992, "learning_rate": 9.214793121290442e-06, "loss": 0.424, "step": 80 }, { "epoch": 0.5595172792100932, "grad_norm": 0.0714902674482327, "learning_rate": 9.051888048634471e-06, "loss": 0.4538, "step": 85 }, { "epoch": 0.5924300603400987, "grad_norm": 0.06635393353384471, "learning_rate": 8.875373637528336e-06, "loss": 0.4058, "step": 90 }, { "epoch": 0.6253428414701042, "grad_norm": 0.06813778711254845, "learning_rate": 8.685842758340912e-06, "loss": 0.3843, "step": 95 }, { "epoch": 0.6582556226001097, "grad_norm": 0.06038001684085916, "learning_rate": 8.483932000694295e-06, "loss": 0.3839, "step": 100 }, { "epoch": 0.6911684037301152, "grad_norm": 0.06387985587791511, "learning_rate": 8.270319535309035e-06, "loss": 0.3949, "step": 105 }, { "epoch": 0.7240811848601206, "grad_norm": 0.07413216886866929, "learning_rate": 8.04572283618829e-06, "loss": 0.3857, "step": 110 }, { "epoch": 0.7569939659901261, "grad_norm": 0.06197794577102131, "learning_rate": 7.810896270791484e-06, "loss": 0.3867, "step": 115 }, { "epoch": 0.7899067471201316, "grad_norm": 0.06775910453909832, "learning_rate": 7.566628566291537e-06, "loss": 0.4253, "step": 120 }, { "epoch": 0.8228195282501372, "grad_norm": 0.06335190737071104, "learning_rate": 7.313740160425887e-06, "loss": 0.3592, "step": 125 }, { "epoch": 0.8557323093801427, "grad_norm": 0.06053821259981546, "learning_rate": 7.053080445839211e-06, "loss": 0.3982, "step": 130 }, { "epoch": 0.8886450905101481, "grad_norm": 0.05817539529457328, "learning_rate": 6.7855249171734e-06, "loss": 0.3825, "step": 135 }, { "epoch": 0.9215578716401536, "grad_norm": 0.06281193554143008, "learning_rate": 6.511972230487091e-06, "loss": 0.3455, "step": 140 }, { "epoch": 0.9544706527701591, "grad_norm": 0.057663764408870714, "learning_rate": 6.2333411848814415e-06, "loss": 0.3971, "step": 145 }, { "epoch": 0.9873834339001646, "grad_norm": 0.06667224024941, "learning_rate": 5.95056763647016e-06, "loss": 0.3715, "step": 150 }, { "epoch": 0.9939659901261657, "eval_loss": 0.40147778391838074, "eval_runtime": 33.4304, "eval_samples_per_second": 20.61, "eval_steps_per_second": 5.175, "step": 151 }, { "epoch": 1.0263302249040045, "grad_norm": 0.06761665079835917, "learning_rate": 5.664601355059044e-06, "loss": 0.4201, "step": 155 }, { "epoch": 1.05924300603401, "grad_norm": 0.06719806083275706, "learning_rate": 5.376402834092721e-06, "loss": 0.3496, "step": 160 }, { "epoch": 1.0921557871640153, "grad_norm": 0.05895174295699646, "learning_rate": 5.086940064583179e-06, "loss": 0.3448, "step": 165 }, { "epoch": 1.125068568294021, "grad_norm": 0.05113621376287118, "learning_rate": 4.797185283855756e-06, "loss": 0.3397, "step": 170 }, { "epoch": 1.1579813494240263, "grad_norm": 0.0675921905296945, "learning_rate": 4.5081117100327594e-06, "loss": 0.3611, "step": 175 }, { "epoch": 1.1908941305540317, "grad_norm": 0.05801392979206245, "learning_rate": 4.220690273222802e-06, "loss": 0.3465, "step": 180 }, { "epoch": 1.2238069116840373, "grad_norm": 0.07011122057291769, "learning_rate": 3.935886354395057e-06, "loss": 0.3342, "step": 185 }, { "epoch": 1.2567196928140427, "grad_norm": 0.06127719545378434, "learning_rate": 3.6546565428917623e-06, "loss": 0.353, "step": 190 }, { "epoch": 1.2896324739440483, "grad_norm": 0.05597451127292148, "learning_rate": 3.377945423469727e-06, "loss": 0.3267, "step": 195 }, { "epoch": 1.3225452550740537, "grad_norm": 0.06363474070842762, "learning_rate": 3.1066824036624086e-06, "loss": 0.3667, "step": 200 }, { "epoch": 1.3554580362040594, "grad_norm": 0.06166154027576164, "learning_rate": 2.84177859211873e-06, "loss": 0.355, "step": 205 }, { "epoch": 1.3883708173340648, "grad_norm": 0.06396509045884582, "learning_rate": 2.584123738403527e-06, "loss": 0.3663, "step": 210 }, { "epoch": 1.4212835984640702, "grad_norm": 0.060133414345748225, "learning_rate": 2.3345832445381415e-06, "loss": 0.3389, "step": 215 }, { "epoch": 1.4541963795940758, "grad_norm": 0.05614627720977617, "learning_rate": 2.0939952583186806e-06, "loss": 0.3233, "step": 220 }, { "epoch": 1.4871091607240812, "grad_norm": 0.06341047469554213, "learning_rate": 1.8631678581748059e-06, "loss": 0.3828, "step": 225 }, { "epoch": 1.5200219418540866, "grad_norm": 0.0586104014108502, "learning_rate": 1.6428763390244462e-06, "loss": 0.3142, "step": 230 }, { "epoch": 1.5529347229840922, "grad_norm": 0.06293576847799762, "learning_rate": 1.4338606082406232e-06, "loss": 0.3617, "step": 235 }, { "epoch": 1.5858475041140978, "grad_norm": 0.05673087625262431, "learning_rate": 1.236822700476683e-06, "loss": 0.3574, "step": 240 }, { "epoch": 1.618760285244103, "grad_norm": 0.06411343693892131, "learning_rate": 1.0524244196971228e-06, "loss": 0.3708, "step": 245 }, { "epoch": 1.6516730663741086, "grad_norm": 0.05756395567753849, "learning_rate": 8.812851163337976e-07, "loss": 0.3201, "step": 250 }, { "epoch": 1.6845858475041142, "grad_norm": 0.06491241755900852, "learning_rate": 7.239796070335775e-07, "loss": 0.3551, "step": 255 }, { "epoch": 1.7174986286341196, "grad_norm": 0.06269841166969671, "learning_rate": 5.810362439844897e-07, "loss": 0.3198, "step": 260 }, { "epoch": 1.750411409764125, "grad_norm": 0.06872125923905117, "learning_rate": 4.529351403050586e-07, "loss": 0.3461, "step": 265 }, { "epoch": 1.7833241908941306, "grad_norm": 0.07301293362304442, "learning_rate": 3.401065574573187e-07, "loss": 0.389, "step": 270 }, { "epoch": 1.816236972024136, "grad_norm": 0.06735671030896039, "learning_rate": 2.4292946009982163e-07, "loss": 0.3653, "step": 275 }, { "epoch": 1.8491497531541414, "grad_norm": 0.05540343390769577, "learning_rate": 1.6173024323451747e-07, "loss": 0.341, "step": 280 }, { "epoch": 1.882062534284147, "grad_norm": 0.05948050374671173, "learning_rate": 9.678163592275858e-08, "loss": 0.3237, "step": 285 }, { "epoch": 1.9149753154141524, "grad_norm": 0.05494271383448326, "learning_rate": 4.830178525256079e-08, "loss": 0.3365, "step": 290 }, { "epoch": 1.9478880965441578, "grad_norm": 0.052912085010542946, "learning_rate": 1.6453523633868096e-08, "loss": 0.301, "step": 295 }, { "epoch": 1.9808008776741635, "grad_norm": 0.0646409275755533, "learning_rate": 1.3438218828076831e-09, "loss": 0.3628, "step": 300 }, { "epoch": 1.9939659901261657, "eval_loss": 0.39538663625717163, "eval_runtime": 33.2149, "eval_samples_per_second": 20.744, "eval_steps_per_second": 5.209, "step": 302 }, { "epoch": 1.9939659901261657, "step": 302, "total_flos": 6.708841195629445e+17, "train_loss": 0.4515860276111704, "train_runtime": 2865.436, "train_samples_per_second": 5.089, "train_steps_per_second": 0.105 } ], "logging_steps": 5, "max_steps": 302, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.708841195629445e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }