# Pretrained diffusers model path.
pretrained_model_path: "./models/model_scope_diffusers/" #https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/tree/main

# The folder where your training outputs will be placed.
output_dir: "./outputs"

# You can train multiple datasets at once. They will be joined together for training.
# Simply remove the lines you don't need, or keep them all for mixed training.

# 'image': A folder of images and captions (.txt)
# 'folder': A folder of videos and captions (.txt) (see the example layout below)
# 'json': The JSON file created with automatic BLIP2 captions using https://github.com/ExponentialML/Video-BLIP2-Preprocessor
# 'single_video': A single video file (.mp4) and a text prompt
dataset_types:
  - 'image'
  - 'folder'
  - 'json'
  - 'single_video'
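
# For example (illustrative), to train on videos only, remove the other entries and keep:
# dataset_types:
#   - 'folder'
#
# A 'folder' dataset is assumed here to pair each video with a caption .txt;
# the file names below are hypothetical:
#   path/to/folder/of/videos/
#   ├── clip_001.mp4
#   ├── clip_001.txt
#   ├── clip_002.mp4
#   └── clip_002.txt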

# Adds offset noise to training. See https://www.crosslabs.org/blog/diffusion-with-offset-noise
offset_noise_strength: 0.1
use_offset_noise: False

# When True, this extends all items in all enabled datasets to the highest length.
# For example, if you have 200 videos and 10 images, the 10 images will be duplicated to match the length of 200.
extend_dataset: False

# Caches the latents (frames/images -> VAE -> latents) to an HDD or SSD.
# The latents will be saved under your training folder and loaded automatically for training.
# This saves memory, speeds up training, and takes very little disk space.
cache_latents: True

# If you have cache_latents set to `True` and have a directory of cached latents,
# you can skip the caching process and load previously saved ones.
cached_latent_dir: null #/path/to/cached_latents

# Train the text encoder. Leave at False to use LoRA only (recommended).
train_text_encoder: False

# https://github.com/cloneofsimo/lora
# Use LoRA to train extra layers while saving memory. It trains both a LoRA and the model itself.
# This works slightly differently from vanilla LoRA and DOES NOT save a separate file.
# It is simply used as a mechanism for saving memory by keeping layers frozen and training the residual.

# Use LoRA for the UNet model.
use_unet_lora: True

# Use LoRA for the Text Encoder.
use_text_lora: True

# Use a trained LoRA to continue training with. Only LoRA files trained with this repository will work.
# LoRA files are saved in the same directory as outputs, under 'lora'.
# To load them, they must have 'unet' and 'text_encoder' in their names.
lora_path: ''
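
# Illustrative example (the folder and file names are hypothetical; per the note above,
# the file names only need to contain 'unet' and 'text_encoder'):
# lora_path: './outputs/my_train_run/lora'
#   containing e.g. 'my_run_unet.pt' and 'my_run_text_encoder.pt'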

# The modules to use for LoRA. Different from 'trainable_modules'.
# This trains the entire UNet's linear and convolution layers.
# To potentially save memory, uncomment the other lines and comment out "UNet3DConditionModel" with a # (see the example below this list).
unet_lora_modules:
  # The entire UNet model.
  - "UNet3DConditionModel"

  # The attention layers for the spatial dimension (image data).
  #- "Transformer2D"

  # The convolution layers for the spatial dimension.
  #- "Transformer2DModel"

  # The convolution layers for the temporal dimension (frame data).
  #- "TemporalConvLayer"

  # The attention layers for the temporal dimension (frame data).
  #- "TransformerTemporalModel"
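
# Illustrative lower-VRAM selection (just an example of the uncommenting described above;
# the LoRA then adapts only these modules instead of the whole UNet):
# unet_lora_modules:
#   - "TemporalConvLayer"
#   - "TransformerTemporalModel"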

# The modules to use for LoRA. Different from `trainable_text_modules`.
text_encoder_lora_modules:
  - "CLIPEncoderLayer"

# The rank for LoRA training. With ModelScope, the maximum should be 1024.
# VRAM usage increases with higher rank and decreases with lower rank.
lora_rank: 32

# Training data parameters.
train_data:

  # The width and height to which your training data will be resized.
  width: 256
  height: 256

  # This will find the closest aspect ratio to your input width and height.
  # For example, a width and height of 512x512 with a video of resolution 1280x720 will be resized to 512x256.
  use_bucketing: True

  # The frame index at which your videos should start (leave this at 1 for 'json' and 'folder' based training).
  sample_start_idx: 1

  # Used for 'folder'. The rate at which your frames are sampled. Does nothing for the 'json' and 'single_video' datasets.
  fps: 24

  # For 'single_video' and 'json'. The number of frames to "step" over: with frame_step=2, frames (1,2,3,4, ...) are sampled as (1,3,5,7, ...).
  frame_step: 5

  # The number of frames to sample. The higher this number, the higher the VRAM usage (acts similarly to batch size).
  n_sample_frames: 8

  # 'single_video'
  single_video_path: "path/to/single/video.mp4"

  # The prompt when using a single video file.
  single_video_prompt: ""

  # Fallback prompt if a caption cannot be read. Used for 'image' and 'folder'.
  fallback_prompt: ''

  # 'folder'
  path: "path/to/folder/of/videos/"

  # 'json'
  json_path: 'path/to/train/json/'

  # 'image'
  image_dir: 'path/to/image/directory'

  # The prompt for all image files. Leave blank to use caption files (.txt).
  single_img_prompt: ""

# Validation data parameters.
validation_data:

  # A custom prompt that is different from your training dataset.
  prompt: ""

  # Whether or not to sample a preview during training (requires more VRAM).
  sample_preview: True

  # The number of frames to sample during validation.
  num_frames: 16

  # Height and width of the validation sample.
  width: 256
  height: 256

  # Number of inference steps when generating the video.
  num_inference_steps: 25

  # CFG (classifier-free guidance) scale.
  guidance_scale: 9

# Learning rate for AdamW.
learning_rate: 5e-6

# Weight decay. Higher = more regularization. Lower = closer to the dataset.
adam_weight_decay: 1e-2

# Optimizer parameters for the UNet. These override the base parameters above.
extra_unet_params: null
  #learning_rate: 1e-5
  #adam_weight_decay: 1e-4

# Optimizer parameters for the Text Encoder. These override the base parameters above.
extra_text_encoder_params: null
  #learning_rate: 5e-6
  #adam_weight_decay: 0.2
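
# To actually enable the overrides (illustrative values taken from the commented lines above),
# replace `null` with an indented block, e.g.:
# extra_unet_params:
#   learning_rate: 1e-5
#   adam_weight_decay: 1e-4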

# Training batch size. Not to be confused with the number of video frames.
train_batch_size: 1

# Maximum number of training steps. The model is saved after training.
max_train_steps: 10000

# Saves a model every nth step.
checkpointing_steps: 2500
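
# For example, with max_train_steps: 10000 and checkpointing_steps: 2500 above,
# checkpoints are written at steps 2500, 5000, 7500, and 10000.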

# Runs validation every nth step if sample_preview is enabled.
validation_steps: 100

# Seed for validation.
seed: 64

# Whether or not to use mixed precision with Accelerate.
mixed_precision: "fp16"

# This seems to be incompatible at the moment.
use_8bit_adam: False

# Trades training speed for VRAM savings. You lose roughly 20% of training speed but save a lot of VRAM.
# If you need to save more VRAM, it can also be enabled for the text encoder, but this roughly halves speed.
gradient_checkpointing: True
text_encoder_gradient_checkpointing: False

# xFormers must be installed for the best memory savings and performance on PyTorch < 2.0.
enable_xformers_memory_efficient_attention: False

# Use scaled dot-product attention (only available with PyTorch >= 2.0).
enable_torch_2_attn: True
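
# Launching training with this config: the command below is an assumption based on typical
# usage of this repository's training script; check the repository README for the exact invocation.
# python train.py --config ./configs/my_config.yaml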