| {"vocab_size": 50304, "dim": 768, "n_layers": 12, "n_heads": 12, "n_kv_heads": 4, "multiple_of": 32, "dropout": 0.1, "context_len": 512, "batch_size": 16, "grad_accum": 4, "lr": 0.0006, "min_lr": 6e-05, "warmup_steps": 200, "max_steps": 3000, "device": "cuda"} |