```json
{
  "vocab_size": 32128,
  "hidden_dim": 4096,
  "num_layers": 24,
  "n_head": 64,
  "kv_heads": 64,
  "head_dim": 64,
  "ff_dim": 10240,
  "dropout": 0.1,
  "eps": 1e-6,
  "emb_weight_dtype": "bfloat16",
  "linear_weight_dtype": "bfloat16",
  "norm_weight_dtype": "bfloat16",
  "ffn_type": "GEGLU",
  "gelu_approx": "tanh",
  "attn_impl": "sdpa",
  "flex_kernel_options": {},
  "relative_attention_num_buckets": 32,
  "relative_attention_max_distance": 128,
  "scale_qk": false,
  "pad_token_id": 0,
  "decoder_start_token_id": 0,
  "label_ignore_index": -100,
  "pos_emb_per_layer": false,
  "elementwise_affine": true
}
```
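For reference, here is a minimal sketch of how a config file like this might be parsed into a typed object in Python. The `ModelConfig` dataclass and the `load_config` helper are illustrative assumptions, not part of any particular library; the field names and defaults mirror the JSON keys above, and the sanity checks encode the relationships implied by the values (e.g. `hidden_dim == n_head * head_dim`).

```python
# Hypothetical loader for the JSON config above; names are illustrative.
import json
from dataclasses import dataclass, field, fields


@dataclass
class ModelConfig:
    vocab_size: int = 32128
    hidden_dim: int = 4096
    num_layers: int = 24
    n_head: int = 64
    kv_heads: int = 64
    head_dim: int = 64
    ff_dim: int = 10240
    dropout: float = 0.1
    eps: float = 1e-6
    emb_weight_dtype: str = "bfloat16"
    linear_weight_dtype: str = "bfloat16"
    norm_weight_dtype: str = "bfloat16"
    ffn_type: str = "GEGLU"
    gelu_approx: str = "tanh"
    attn_impl: str = "sdpa"
    flex_kernel_options: dict = field(default_factory=dict)
    relative_attention_num_buckets: int = 32
    relative_attention_max_distance: int = 128
    scale_qk: bool = False
    pad_token_id: int = 0
    decoder_start_token_id: int = 0
    label_ignore_index: int = -100
    pos_emb_per_layer: bool = False
    elementwise_affine: bool = True

    def __post_init__(self):
        # The per-head width must multiply back to the model width.
        assert self.hidden_dim == self.n_head * self.head_dim
        # kv_heads == n_head means standard multi-head attention;
        # a smaller value would indicate grouped-query attention.
        assert self.n_head % self.kv_heads == 0


def load_config(path: str) -> ModelConfig:
    """Read the JSON file and keep only keys the dataclass declares."""
    with open(path) as f:
        raw = json.load(f)
    known = {f.name for f in fields(ModelConfig)}
    return ModelConfig(**{k: v for k, v in raw.items() if k in known})
```

Filtering unknown keys in `load_config` is a deliberate choice in this sketch: it lets the same loader read older or newer config files without raising on fields the dataclass does not know about.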