|
106 | 106 | "clip_thresh": 0.1, |
107 | 107 | "initial_learning_rate": 5e-4, |
108 | 108 | }, |
109 | | - "deepvoice3_niklm": { |
110 | | - "n_speakers": 118, |
111 | | - "speaker_embed_dim": 16, |
112 | | - "downsample_step": 4, |
113 | | - "outputs_per_step": 1, |
114 | | - "embedding_weight_std": 0.1, |
115 | | - "speaker_embedding_weight_std": 0.05, |
116 | | - "dropout": 1 - 0.95, |
117 | | - "kernel_size": 3, |
118 | | - "text_embed_dim": 256, |
119 | | - "encoder_channels": 512, |
120 | | - "decoder_channels": 256, |
121 | | - "converter_channels": 256, |
122 | | - "use_guided_attention": True, |
123 | | - "guided_attention_sigma": 0.4, |
124 | | - "binary_divergence_weight": 0.1, |
125 | | - "use_decoder_state_for_postnet_input": True, |
126 | | - "max_positions": 1200, |
127 | | - "query_position_rate": 2.0, |
128 | | - "key_position_rate": 7.6, |
129 | | - "key_projection": True, |
130 | | - "value_projection": True, |
131 | | - "clip_thresh": 0.1, |
132 | | - "initial_learning_rate": 5e-4, |
133 | | - "batch_size": 8, |
134 | | - "text_embed_dim":256, |
| 109 | + "deepvoice3_niklm": { |
| 110 | + "n_speakers": 118, |
| 111 | + "speaker_embed_dim": 16, |
| 112 | + "downsample_step": 4, |
| 113 | + "outputs_per_step": 1, |
| 114 | + "embedding_weight_std": 0.1, |
| 115 | + "speaker_embedding_weight_std": 0.05, |
| 116 | + "dropout": 1 - 0.95, |
| 117 | + "kernel_size": 3, |
| 118 | + "text_embed_dim": 256, |
| 119 | + "encoder_channels": 512, |
| 120 | + "decoder_channels": 256, |
| 121 | + "converter_channels": 256, |
| 122 | + "use_guided_attention": True, |
| 123 | + "guided_attention_sigma": 0.4, |
| 124 | + "binary_divergence_weight": 0.1, |
| 125 | + "use_decoder_state_for_postnet_input": True, |
| 126 | + "max_positions": 1200, |
| 127 | + "query_position_rate": 2.0, |
| 128 | + "key_position_rate": 7.6, |
| 129 | + "key_projection": True, |
| 130 | + "value_projection": True, |
| 131 | + "clip_thresh": 0.1, |
| 132 | + "initial_learning_rate": 5e-4, |
| 133 | + "batch_size": 8, |
| 134 | + "text_embed_dim": 256, |
135 | 135 | }, |
136 | | - "deepvoice3_nikls": { |
137 | | - "n_speakers": 1, |
138 | | - "speaker_embed_dim": 16, |
139 | | - "downsample_step": 4, |
140 | | - "outputs_per_step": 1, |
141 | | - "embedding_weight_std": 0.1, |
142 | | - "speaker_embedding_weight_std": 0.05, |
143 | | - "dropout": 1 - 0.95, |
144 | | - "kernel_size": 3, |
145 | | - "text_embed_dim": 256, |
146 | | - "encoder_channels": 512, |
147 | | - "decoder_channels": 256, |
148 | | - "converter_channels": 256, |
149 | | - "use_guided_attention": True, |
150 | | - "guided_attention_sigma": 0.4, |
151 | | - "binary_divergence_weight": 0.1, |
152 | | - "use_decoder_state_for_postnet_input": True, |
153 | | - "max_positions": 512, |
154 | | - "query_position_rate": 2.0, |
155 | | - "key_position_rate": 7.6, |
156 | | - "key_projection": True, |
157 | | - "value_projection": True, |
158 | | - "clip_thresh": 0.1, |
159 | | - "initial_learning_rate": 5e-4, |
160 | | - "batch_size": 8, |
161 | | - "text_embed_dim":256, |
| 136 | + "deepvoice3_nikls": { |
| 137 | + "n_speakers": 1, |
| 138 | + "speaker_embed_dim": 16, |
| 139 | + "downsample_step": 4, |
| 140 | + "outputs_per_step": 1, |
| 141 | + "embedding_weight_std": 0.1, |
| 142 | + "speaker_embedding_weight_std": 0.05, |
| 143 | + "dropout": 1 - 0.95, |
| 144 | + "kernel_size": 3, |
| 145 | + "text_embed_dim": 256, |
| 146 | + "encoder_channels": 512, |
| 147 | + "decoder_channels": 256, |
| 148 | + "converter_channels": 256, |
| 149 | + "use_guided_attention": True, |
| 150 | + "guided_attention_sigma": 0.4, |
| 151 | + "binary_divergence_weight": 0.1, |
| 152 | + "use_decoder_state_for_postnet_input": True, |
| 153 | + "max_positions": 512, |
| 154 | + "query_position_rate": 2.0, |
| 155 | + "key_position_rate": 7.6, |
| 156 | + "key_projection": True, |
| 157 | + "value_projection": True, |
| 158 | + "clip_thresh": 0.1, |
| 159 | + "initial_learning_rate": 5e-4, |
| 160 | + "batch_size": 8, |
| 161 | + "text_embed_dim": 256, |
162 | 162 | }, |
163 | 163 | }, |
164 | 164 |
|
|
180 | 180 | # The mel-spectrogram is normalized to [0, 1] for each utterance, and clipping may |
181 | 181 | # happen depending on min_level_db and ref_level_db, causing clipping noise. |
182 | 182 | # If False, an assertion is added to ensure that no clipping happens. |
183 | | - allow_clipping_in_normalization=False, |
| 183 | + allow_clipping_in_normalization=True, |
184 | 184 |
|
185 | 185 | # Model: |
186 | 186 | downsample_step=4, # must be 4 when builder="nyanko" |
|
0 commit comments