File size: 12,673 Bytes
a2fd03b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 27270,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.18335166850018336,
      "grad_norm": 3.5391387939453125,
      "learning_rate": 4.908324165749908e-05,
      "loss": 2.4276,
      "num_input_tokens_seen": 1750768,
      "step": 500
    },
    {
      "epoch": 0.3667033370003667,
      "grad_norm": 3.4214749336242676,
      "learning_rate": 4.816648331499817e-05,
      "loss": 2.2532,
      "num_input_tokens_seen": 3485632,
      "step": 1000
    },
    {
      "epoch": 0.5500550055005501,
      "grad_norm": 3.533691167831421,
      "learning_rate": 4.724972497249725e-05,
      "loss": 2.1894,
      "num_input_tokens_seen": 5230688,
      "step": 1500
    },
    {
      "epoch": 0.7334066740007334,
      "grad_norm": 3.7089884281158447,
      "learning_rate": 4.633296662999633e-05,
      "loss": 2.1511,
      "num_input_tokens_seen": 6971344,
      "step": 2000
    },
    {
      "epoch": 0.9167583425009168,
      "grad_norm": 4.088582515716553,
      "learning_rate": 4.541620828749542e-05,
      "loss": 2.1089,
      "num_input_tokens_seen": 8738136,
      "step": 2500
    },
    {
      "epoch": 1.1001100110011002,
      "grad_norm": 4.8249077796936035,
      "learning_rate": 4.449944994499451e-05,
      "loss": 2.0594,
      "num_input_tokens_seen": 10466350,
      "step": 3000
    },
    {
      "epoch": 1.2834616795012834,
      "grad_norm": 3.9551169872283936,
      "learning_rate": 4.358269160249359e-05,
      "loss": 2.0194,
      "num_input_tokens_seen": 12222070,
      "step": 3500
    },
    {
      "epoch": 1.466813348001467,
      "grad_norm": 3.0416815280914307,
      "learning_rate": 4.266593325999267e-05,
      "loss": 2.0019,
      "num_input_tokens_seen": 13976918,
      "step": 4000
    },
    {
      "epoch": 1.6501650165016502,
      "grad_norm": 3.295426607131958,
      "learning_rate": 4.174917491749175e-05,
      "loss": 2.0024,
      "num_input_tokens_seen": 15721702,
      "step": 4500
    },
    {
      "epoch": 1.8335166850018334,
      "grad_norm": 4.8525309562683105,
      "learning_rate": 4.0832416574990836e-05,
      "loss": 1.9935,
      "num_input_tokens_seen": 17458590,
      "step": 5000
    },
    {
      "epoch": 2.0168683535020167,
      "grad_norm": 4.256695747375488,
      "learning_rate": 3.991565823248992e-05,
      "loss": 1.9769,
      "num_input_tokens_seen": 19188010,
      "step": 5500
    },
    {
      "epoch": 2.2002200220022003,
      "grad_norm": 4.129441738128662,
      "learning_rate": 3.8998899889989e-05,
      "loss": 1.9108,
      "num_input_tokens_seen": 20932210,
      "step": 6000
    },
    {
      "epoch": 2.3835716905023836,
      "grad_norm": 2.544461250305176,
      "learning_rate": 3.808214154748808e-05,
      "loss": 1.9047,
      "num_input_tokens_seen": 22658578,
      "step": 6500
    },
    {
      "epoch": 2.566923359002567,
      "grad_norm": 4.752838611602783,
      "learning_rate": 3.716538320498717e-05,
      "loss": 1.9119,
      "num_input_tokens_seen": 24411482,
      "step": 7000
    },
    {
      "epoch": 2.7502750275027505,
      "grad_norm": 4.965038776397705,
      "learning_rate": 3.624862486248625e-05,
      "loss": 1.8986,
      "num_input_tokens_seen": 26157770,
      "step": 7500
    },
    {
      "epoch": 2.933626696002934,
      "grad_norm": 4.416258335113525,
      "learning_rate": 3.5331866519985334e-05,
      "loss": 1.9086,
      "num_input_tokens_seen": 27912394,
      "step": 8000
    },
    {
      "epoch": 3.116978364503117,
      "grad_norm": 3.501598596572876,
      "learning_rate": 3.4415108177484414e-05,
      "loss": 1.868,
      "num_input_tokens_seen": 29671328,
      "step": 8500
    },
    {
      "epoch": 3.3003300330033003,
      "grad_norm": 3.8959696292877197,
      "learning_rate": 3.34983498349835e-05,
      "loss": 1.8465,
      "num_input_tokens_seen": 31405544,
      "step": 9000
    },
    {
      "epoch": 3.4836817015034836,
      "grad_norm": 3.5625758171081543,
      "learning_rate": 3.258159149248258e-05,
      "loss": 1.8463,
      "num_input_tokens_seen": 33146784,
      "step": 9500
    },
    {
      "epoch": 3.667033370003667,
      "grad_norm": 3.303110122680664,
      "learning_rate": 3.166483314998166e-05,
      "loss": 1.8394,
      "num_input_tokens_seen": 34888072,
      "step": 10000
    },
    {
      "epoch": 3.8503850385038505,
      "grad_norm": 3.5172908306121826,
      "learning_rate": 3.074807480748075e-05,
      "loss": 1.8379,
      "num_input_tokens_seen": 36645960,
      "step": 10500
    },
    {
      "epoch": 4.033736707004033,
      "grad_norm": 4.386786460876465,
      "learning_rate": 2.983131646497983e-05,
      "loss": 1.8245,
      "num_input_tokens_seen": 38388631,
      "step": 11000
    },
    {
      "epoch": 4.2170883755042174,
      "grad_norm": 3.2586567401885986,
      "learning_rate": 2.891455812247892e-05,
      "loss": 1.8029,
      "num_input_tokens_seen": 40139079,
      "step": 11500
    },
    {
      "epoch": 4.400440044004401,
      "grad_norm": 3.6384007930755615,
      "learning_rate": 2.7997799779978003e-05,
      "loss": 1.7909,
      "num_input_tokens_seen": 41872751,
      "step": 12000
    },
    {
      "epoch": 4.583791712504584,
      "grad_norm": 4.475183486938477,
      "learning_rate": 2.7081041437477084e-05,
      "loss": 1.791,
      "num_input_tokens_seen": 43618911,
      "step": 12500
    },
    {
      "epoch": 4.767143381004767,
      "grad_norm": 4.72713041305542,
      "learning_rate": 2.6164283094976168e-05,
      "loss": 1.7745,
      "num_input_tokens_seen": 45373143,
      "step": 13000
    },
    {
      "epoch": 4.9504950495049505,
      "grad_norm": 3.3076839447021484,
      "learning_rate": 2.5247524752475248e-05,
      "loss": 1.7968,
      "num_input_tokens_seen": 47112151,
      "step": 13500
    },
    {
      "epoch": 5.133846718005134,
      "grad_norm": 4.046383857727051,
      "learning_rate": 2.4330766409974332e-05,
      "loss": 1.7611,
      "num_input_tokens_seen": 48852751,
      "step": 14000
    },
    {
      "epoch": 5.317198386505317,
      "grad_norm": 3.291144609451294,
      "learning_rate": 2.3414008067473413e-05,
      "loss": 1.7363,
      "num_input_tokens_seen": 50602567,
      "step": 14500
    },
    {
      "epoch": 5.5005500550055,
      "grad_norm": 4.23388671875,
      "learning_rate": 2.24972497249725e-05,
      "loss": 1.7814,
      "num_input_tokens_seen": 52369863,
      "step": 15000
    },
    {
      "epoch": 5.683901723505684,
      "grad_norm": 3.1835505962371826,
      "learning_rate": 2.158049138247158e-05,
      "loss": 1.751,
      "num_input_tokens_seen": 54115983,
      "step": 15500
    },
    {
      "epoch": 5.867253392005868,
      "grad_norm": 3.593493938446045,
      "learning_rate": 2.0663733039970665e-05,
      "loss": 1.7481,
      "num_input_tokens_seen": 55853919,
      "step": 16000
    },
    {
      "epoch": 6.050605060506051,
      "grad_norm": 4.3933258056640625,
      "learning_rate": 1.9746974697469746e-05,
      "loss": 1.7506,
      "num_input_tokens_seen": 57581239,
      "step": 16500
    },
    {
      "epoch": 6.233956729006234,
      "grad_norm": 3.6081910133361816,
      "learning_rate": 1.883021635496883e-05,
      "loss": 1.7294,
      "num_input_tokens_seen": 59313735,
      "step": 17000
    },
    {
      "epoch": 6.417308397506417,
      "grad_norm": 3.7784392833709717,
      "learning_rate": 1.7913458012467914e-05,
      "loss": 1.719,
      "num_input_tokens_seen": 61061911,
      "step": 17500
    },
    {
      "epoch": 6.600660066006601,
      "grad_norm": 3.5482571125030518,
      "learning_rate": 1.6996699669966998e-05,
      "loss": 1.7184,
      "num_input_tokens_seen": 62802279,
      "step": 18000
    },
    {
      "epoch": 6.784011734506784,
      "grad_norm": 3.797348737716675,
      "learning_rate": 1.6079941327466082e-05,
      "loss": 1.7101,
      "num_input_tokens_seen": 64536303,
      "step": 18500
    },
    {
      "epoch": 6.967363403006967,
      "grad_norm": 3.9275312423706055,
      "learning_rate": 1.5163182984965163e-05,
      "loss": 1.7153,
      "num_input_tokens_seen": 66282967,
      "step": 19000
    },
    {
      "epoch": 7.15071507150715,
      "grad_norm": 3.65077805519104,
      "learning_rate": 1.4246424642464248e-05,
      "loss": 1.7181,
      "num_input_tokens_seen": 68030296,
      "step": 19500
    },
    {
      "epoch": 7.334066740007334,
      "grad_norm": 4.696651458740234,
      "learning_rate": 1.3329666299963331e-05,
      "loss": 1.6992,
      "num_input_tokens_seen": 69767824,
      "step": 20000
    },
    {
      "epoch": 7.517418408507518,
      "grad_norm": 5.405508518218994,
      "learning_rate": 1.2412907957462413e-05,
      "loss": 1.6903,
      "num_input_tokens_seen": 71509128,
      "step": 20500
    },
    {
      "epoch": 7.700770077007701,
      "grad_norm": 3.7343809604644775,
      "learning_rate": 1.1496149614961496e-05,
      "loss": 1.7019,
      "num_input_tokens_seen": 73255224,
      "step": 21000
    },
    {
      "epoch": 7.884121745507884,
      "grad_norm": 4.133444786071777,
      "learning_rate": 1.057939127246058e-05,
      "loss": 1.6959,
      "num_input_tokens_seen": 75002496,
      "step": 21500
    },
    {
      "epoch": 8.067473414008067,
      "grad_norm": 4.398416996002197,
      "learning_rate": 9.662632929959662e-06,
      "loss": 1.7018,
      "num_input_tokens_seen": 76756073,
      "step": 22000
    },
    {
      "epoch": 8.250825082508252,
      "grad_norm": 4.565046310424805,
      "learning_rate": 8.745874587458746e-06,
      "loss": 1.6837,
      "num_input_tokens_seen": 78483465,
      "step": 22500
    },
    {
      "epoch": 8.434176751008435,
      "grad_norm": 3.950497627258301,
      "learning_rate": 7.829116244957828e-06,
      "loss": 1.6913,
      "num_input_tokens_seen": 80220865,
      "step": 23000
    },
    {
      "epoch": 8.617528419508618,
      "grad_norm": 3.9700405597686768,
      "learning_rate": 6.912357902456913e-06,
      "loss": 1.6814,
      "num_input_tokens_seen": 81964649,
      "step": 23500
    },
    {
      "epoch": 8.800880088008801,
      "grad_norm": 3.21114444732666,
      "learning_rate": 5.995599559955996e-06,
      "loss": 1.689,
      "num_input_tokens_seen": 83718889,
      "step": 24000
    },
    {
      "epoch": 8.984231756508985,
      "grad_norm": 3.5966849327087402,
      "learning_rate": 5.078841217455079e-06,
      "loss": 1.6734,
      "num_input_tokens_seen": 85471529,
      "step": 24500
    },
    {
      "epoch": 9.167583425009168,
      "grad_norm": 3.4596688747406006,
      "learning_rate": 4.162082874954162e-06,
      "loss": 1.6792,
      "num_input_tokens_seen": 87214771,
      "step": 25000
    },
    {
      "epoch": 9.350935093509351,
      "grad_norm": 3.9838054180145264,
      "learning_rate": 3.2453245324532458e-06,
      "loss": 1.6583,
      "num_input_tokens_seen": 88949475,
      "step": 25500
    },
    {
      "epoch": 9.534286762009534,
      "grad_norm": 3.389430522918701,
      "learning_rate": 2.3285661899523286e-06,
      "loss": 1.6836,
      "num_input_tokens_seen": 90694267,
      "step": 26000
    },
    {
      "epoch": 9.717638430509718,
      "grad_norm": 4.560466289520264,
      "learning_rate": 1.411807847451412e-06,
      "loss": 1.6804,
      "num_input_tokens_seen": 92441267,
      "step": 26500
    },
    {
      "epoch": 9.900990099009901,
      "grad_norm": 4.484193325042725,
      "learning_rate": 4.950495049504951e-07,
      "loss": 1.6876,
      "num_input_tokens_seen": 94186835,
      "step": 27000
    },
    {
      "epoch": 10.0,
      "num_input_tokens_seen": 95128823,
      "step": 27270,
      "total_flos": 3.4538173670639616e+16,
      "train_loss": 1.8293144167322006,
      "train_runtime": 2454.5506,
      "train_samples_per_second": 88.859,
      "train_steps_per_second": 11.11,
      "train_tokens_per_second": 38762.215
    }
  ],
  "logging_steps": 500,
  "max_steps": 27270,
  "num_input_tokens_seen": 95128823,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4538173670639616e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}