@@ -38,3 +38,47 @@ rope_max_timescale: 10_000_000
 
 # General Model Settings
 enable_dropout : False
+
+# Vision Encoder Configuration
+# Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+image_size_for_vit : 768
+hidden_size_for_vit : 1152
+intermediate_size_for_vit : 4304
+num_attention_heads_for_vit : 16
+num_hidden_layers_for_vit : 27
+num_channels_for_vit : 3
+patch_size_for_vit : 16
+temporal_patch_size_for_vit : 2
+spatial_merge_size_for_vit : 2
+out_hidden_size_for_vit : 2048
+num_position_embeddings_for_vit : 2304
+deepstack_visual_indexes_for_vit : [8, 16, 24]
+
+use_multimodal : true
+use_audio : true
+# Audio Encoder Configuration (need to set use_audio=true to enable)
+# Based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+d_model_for_audio : 1280
+encoder_layers_for_audio : 32
+encoder_attention_heads_for_audio : 20
+encoder_ffn_dim_for_audio : 5120
+max_source_positions_for_audio : 1500
+num_mel_bins_for_audio : 128
+downsample_hidden_size_for_audio : 480
+output_dim_for_audio : 2048
+attention_dropout_for_audio : 0.0
+n_window_for_audio : 50
+n_window_infer_for_audio : 400
+conv_chunksize_for_audio : 500
+num_conv_layers_for_audio : 3
+max_timescale_for_audio : 10000.0
+max_sample_len_for_audio : 10000
+
+freeze_audio_encoder_params : false
+freeze_vision_encoder_params : false
+# MRoPE Settings (Multi-dimensional RoPE for multimodal)
+use_mrope : true
+mrope_section : [24, 20, 20]
+
+
+image_placeholder : " <|image|>"