{
  "architectures": [
    "DeepseekV3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_deepseek.DeepseekV3Config",
    "AutoModel": "modeling_deepseek.DeepseekV3Model",
    "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
  },
  "aux_loss_alpha": 0.001,
  "bos_token_id": 163584,
  "eos_token_id": 163585,
  "first_k_dense_replace": 1,
  "hidden_act": "silu",
  "hidden_size": 7168,
  "initializer_range": 0.02,
  "intermediate_size": 18432,
  "kv_lora_rank": 512,
  "max_position_embeddings": 131072,
  "model_type": "kimi_k2",
  "moe_intermediate_size": 2048,
  "moe_layer_freq": 1,
  "n_group": 1,
  "n_routed_experts": 384,
  "n_shared_experts": 1,
  "norm_topk_prob": true,
  "num_attention_heads": 64,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 61,
  "num_key_value_heads": 64,
  "num_nextn_predict_layers": 0,
  "pretraining_tp": 1,
  "q_lora_rank": 1536,
  "qk_nope_head_dim": 128,
  "qk_rope_head_dim": 64,
  "quantization_config": {
    "activation_scheme": "dynamic",
    "fmt": "e4m3",
    "quant_method": "fp8",
    "weight_block_size": [
      128,
      128
    ]
  },
  "rms_norm_eps": 1e-06,
  "rope_theta": 50000.0,
  "routed_scaling_factor": 2.827,
  "rope_scaling": {
    "beta_fast": 1.0,
    "beta_slow": 1.0,
    "factor": 32.0,
    "mscale": 1.0,
    "mscale_all_dim": 1.0,
    "original_max_position_embeddings": 4096,
    "type": "yarn"
  },
  "scoring_func": "sigmoid",
  "seq_aux": true,
  "tie_word_embeddings": false,
  "topk_group": 1,
  "topk_method": "noaux_tc",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.3",
  "use_cache": true,
  "v_head_dim": 128,
  "vocab_size": 163840
}