{
"alpha_init": 1.5,
"architectures": [
"MoAMetricLM"
],
"attn_drop": 0.1,
"attn_heads": 32,
"bos_token_id": null,
"conv_kernel": 5,
"conv_mult": 2,
"dim": 512,
"discrepancy_modulation": true,
"drop_path": 0.0,
"dtype": "float32",
"enable_feature_gates": true,
"enable_router_gates": true,
"energy_amplification": 3.1415,
"eos_token_id": 151643,
"ff_mult": 3,
"ffn_hidden": 1536,
"head_feature_heads": 32,
"layer_scale_init_value": 0.0001,
"learn_alpha": true,
"learn_radius": true,
"lr_rank": 32,
"maha_init": 1.0,
"max_position_embeddings": 2048,
"max_seq_len_cached": 8192,
"metric": "maha_diag",
"mixer_hidden": 1536,
"model_type": "moa_metric",
"mqa_q_heads": 64,
"n_branches": 3,
"n_token_router_heads": 4,
"num_hidden_layers": 8,
"num_layers": 8,
"origin_init_scale": 0.0,
"pad_token_id": 151643,
"proj_drop": 0.1,
"r_basis": 32,
"radius_init": 5.0,
"router_bias_heads": 4,
"router_dropout": 0.1,
"router_hidden": 1536,
"router_init_temperature": 1.5,
"router_temperature": 2.0,
"router_topk": 3,
"shared_kv_ratio": 0.55,
"theta_base": 100000.0,
"ti_reg_samples": 0,
"ti_reg_weight": 0.0,
"transformers_version": "4.56.1",
"use_balls": true,
"vocab_size": 151665
}