This repository was archived by the owner on Dec 14, 2023. It is now read-only.

Commit 053b517

Merge pull request #48 from ExponentialML/feat/temporal-lora
Add Full Lora Training
2 parents 7d384d6 + 56af863 commit 053b517

4 files changed: +1614 -40 lines changed

README.md

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
  [output.webm](https://user-images.githubusercontent.com/59846140/230748413-fe91e90b-94b9-49ea-97ec-250469ee9472.webm)

  ### Updates
+ - **2023-4-8**: LoRA Training released! Checkout `configs/v2/lora_training_config.yaml` for instructions.
  - **2023-4-8**: Version 2 is released!
  - **2023-3-29**: Added gradient checkpointing support.
  - **2023-3-27**: Support for using Scaled Dot Product Attention for Torch 2.0 users.
configs/v2/lora_training_config.yaml

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
# Pretrained diffusers model path.
pretrained_model_path: "./models/model_scope_diffusers/" # https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/tree/main

# The folder where your training outputs will be placed.
output_dir: "./outputs"

# You can train multiple datasets at once. They will be joined together for training.
# Simply remove the lines you don't need, or keep them all for mixed training.

# 'image': A folder of images and captions (.txt)
# 'folder': A folder of videos and captions (.txt)
# 'json': A JSON file created with automatic BLIP2 captions using https://github.com/ExponentialML/Video-BLIP2-Preprocessor
# 'single_video': A single video file (.mp4) and a text prompt
dataset_types:
  - 'image'
  - 'folder'
  - 'json'
  - 'single_video'

# Adds offset noise to training. See https://www.crosslabs.org/blog/diffusion-with-offset-noise
offset_noise_strength: 0.1
use_offset_noise: False

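Offset noise is usually implemented by adding a small noise component that is constant across the frame, height, and width dimensions. A minimal PyTorch sketch of how `offset_noise_strength` and `use_offset_noise` could be applied (the function and variable names are illustrative, not taken from this repository):

```python
import torch

def sample_noise(latents: torch.Tensor, strength: float = 0.1, use_offset_noise: bool = False):
    """Standard Gaussian noise, optionally biased by per-channel offset noise."""
    noise = torch.randn_like(latents)  # latents: (batch, channels, frames, height, width)
    if use_offset_noise:
        b, c = latents.shape[:2]
        # One offset value per (batch, channel), broadcast over frames, height, and width.
        offset = torch.randn(b, c, 1, 1, 1, device=latents.device, dtype=latents.dtype)
        noise = noise + strength * offset
    return noise
```
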
# When True, this extends all items in all enabled datasets to the highest length.
# For example, if you have 200 videos and 10 images, the 10 images will be duplicated to the length of 200.
extend_dataset: False

# Caches the latents (Frames-Image -> VAE -> Latent) to an HDD or SSD.
# The latents will be saved under your training folder and loaded automatically for training.
# This both saves memory and speeds up training, and takes very little disk space.
cache_latents: True

# If you have cache_latents set to `True` and have a directory of cached latents,
# you can skip the caching process and load previously saved ones.
cached_latent_dir: null # /path/to/cached_latents

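Caching amounts to running each clip's frames through the frozen VAE once and saving the result to disk, so later steps skip the expensive encode pass. A rough sketch using the diffusers `AutoencoderKL` API (the helper name and save format are illustrative, not this repository's exact code):

```python
import torch

@torch.no_grad()
def cache_video_latents(vae, pixel_frames: torch.Tensor, save_path: str) -> torch.Tensor:
    # pixel_frames: (num_frames, channels, height, width), values in [-1, 1]
    latents = vae.encode(pixel_frames).latent_dist.sample() * vae.config.scaling_factor
    torch.save({"latents": latents.cpu()}, save_path)  # reloadable with torch.load
    return latents
```
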
# Train the text encoder. Leave at False to use LoRA only (recommended).
train_text_encoder: False

# https://github.com/cloneofsimo/lora
# Use LoRA to train extra layers whilst saving memory. It trains both a LoRA & the model itself.
# This works slightly differently from vanilla LoRA and DOES NOT save a separate file.
# It is simply used as a mechanism for saving memory by keeping layers frozen and training the residual.

# Use LoRA for the UNET model.
use_unet_lora: True

# Use LoRA for the Text Encoder.
use_text_lora: True

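"Training the residual" is the standard LoRA formulation: the original weight stays frozen and only a low-rank update is learned, so the effective weight is W + (alpha / rank) * B A. A minimal sketch of that idea (not this repository's implementation, which builds on cloneofsimo/lora):

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 32, alpha: float = 32.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)  # the original layer stays frozen
        self.down = nn.Linear(base.in_features, rank, bias=False)  # A
        self.up = nn.Linear(rank, base.out_features, bias=False)   # B
        nn.init.zeros_(self.up.weight)  # start as a no-op residual
        self.scale = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scale * self.up(self.down(x))
```
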
# Use a trained LoRA to continue training with. Only LoRA files trained with this repository will work.
# LoRA files are saved in the same directory as outputs, under 'lora'.
# To load them, they must have 'unet' and 'text_encoder' in their names.
lora_path: ''

# The modules to use for LoRA. Different from 'trainable_modules'.
# This trains the entire UNET's linear and convolution layers.
# To potentially save memory, uncomment the other lines and comment out "UNet3DConditionModel" with a #.
unet_lora_modules:
  # The entire UNET model
  - "UNet3DConditionModel"

  # The attention layers for the spatial dimension (Image data).
  #- "Transformer2D"

  # The convolution layers for the spatial dimension.
  #- "Transformer2DModel"

  # The convolution layers for the temporal dimension (Frame data).
  #- "TemporalConvLayer"

  # The attention layers for the temporal dimension (Frame data).
  #- "TransformerTemporalModel"

# The modules to use for LoRA. Different from `trainable_text_modules`.
text_encoder_lora_modules:
  - "CLIPEncoderLayer"

# The rank for LoRA training. With ModelScope, the maximum should be 1024.
# VRAM usage increases with a higher rank and decreases with a lower rank.
lora_rank: 32

# Training data parameters
train_data:

  # The width and height to which your training data will be resized.
  width: 256
  height: 256

  # This will find the closest aspect ratio to your input width and height.
  # For example, a 512x512 width and height with a video of resolution 1280x720 will be resized to 512x256
  # (see the sketch after this block).
  use_bucketing: True

  # The start frame index where your videos should start (leave this at 1 for 'json' and 'folder' based training).
  sample_start_idx: 1

  # Used for 'folder'. The rate at which your frames are sampled. Does nothing for the 'json' and 'single_video' datasets.
  fps: 24

  # For 'single_video' and 'json'. The number of frames to "step": with frame_step=2, (1,2,3,4,...) -> (1,3,5,7,...).
  frame_step: 5

  # The number of frames to sample. The higher this number, the higher the VRAM usage (acts similarly to batch size).
  n_sample_frames: 8

  # 'single_video'
  single_video_path: "path/to/single/video.mp4"

  # The prompt when using a single video file.
  single_video_prompt: ""

  # Fallback prompt if a caption cannot be read. Enabled for 'image' and 'folder'.
  fallback_prompt: ''

  # 'folder'
  path: "path/to/folder/of/videos/"

  # 'json'
  json_path: 'path/to/train/json/'

  # 'image'
  image_dir: 'path/to/image/directory'

  # The prompt for all image files. Leave blank to use caption files (.txt).
  single_img_prompt: ""

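For reference, here is a rough sketch of how `use_bucketing` and the frame-sampling settings above (`sample_start_idx`, `frame_step`, `n_sample_frames`) are typically interpreted. Both helpers are illustrative, consistent with the examples in the comments but not this repository's exact code:

```python
def bucketed_size(target_w: int, target_h: int, video_w: int, video_h: int, multiple: int = 64):
    """Keep the target width and pick the closest bucketed height for the video's
    aspect ratio (landscape case only, for brevity)."""
    new_h = int(target_w * video_h / video_w) // multiple * multiple
    return target_w, max(new_h, multiple)

def sample_frame_indices(start_idx: int, frame_step: int, n_sample_frames: int):
    """E.g. start_idx=1, frame_step=2, n_sample_frames=4 -> [1, 3, 5, 7]."""
    return [start_idx + i * frame_step for i in range(n_sample_frames)]

# Matches the comments above: a 512x512 target with a 1280x720 video -> (512, 256).
print(bucketed_size(512, 512, 1280, 720))
print(sample_frame_indices(1, 2, 4))
```
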
# Validation data parameters.
validation_data:

  # A custom prompt that is different from your training dataset.
  prompt: ""

  # Whether or not to sample a preview during training (requires more VRAM).
  sample_preview: True

  # The number of frames to sample during validation.
  num_frames: 16

  # Height and width of the validation sample.
  width: 256
  height: 256

  # Number of inference steps when generating the video.
  num_inference_steps: 25

  # CFG scale
  guidance_scale: 9

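These validation settings correspond fairly directly to a diffusers text-to-video inference call. An illustrative sketch (the repository's own validation loop may differ; the model path reuses `pretrained_model_path` from above):

```python
import torch
from diffusers import TextToVideoSDPipeline

pipe = TextToVideoSDPipeline.from_pretrained(
    "./models/model_scope_diffusers/", torch_dtype=torch.float16
).to("cuda")

result = pipe(
    prompt="",                                          # validation_data.prompt
    num_frames=16,                                      # num_frames
    height=256,                                         # height
    width=256,                                          # width
    num_inference_steps=25,                             # num_inference_steps
    guidance_scale=9,                                   # guidance_scale
    generator=torch.Generator("cuda").manual_seed(64),  # seed
)
frames = result.frames
```
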
# Learning rate for AdamW.
learning_rate: 5e-6

# Weight decay. Higher = more regularization. Lower = closer to dataset.
adam_weight_decay: 1e-2

# Optimizer parameters for the UNET. Overrides base learning rate parameters.
extra_unet_params: null
  #learning_rate: 1e-5
  #adam_weight_decay: 1e-4

# Optimizer parameters for the Text Encoder. Overrides base learning rate parameters.
extra_text_encoder_params: null
  #learning_rate: 5e-6
  #adam_weight_decay: 0.2

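One way per-module overrides like these can be expressed is as AdamW parameter groups, where each group's values take precedence over the optimizer's defaults. A self-contained sketch (`unet` and `text_encoder` are stand-in modules, not the real models):

```python
import torch
import torch.nn as nn

unet, text_encoder = nn.Linear(8, 8), nn.Linear(8, 8)  # placeholders for the real models

optimizer = torch.optim.AdamW(
    [
        {"params": unet.parameters(), "lr": 1e-5, "weight_decay": 1e-4},         # extra_unet_params
        {"params": text_encoder.parameters(), "lr": 5e-6, "weight_decay": 0.2},  # extra_text_encoder_params
    ],
    lr=5e-6,            # base learning_rate, used when a group does not set "lr"
    weight_decay=1e-2,  # base adam_weight_decay
)
```
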
# How many batches to train. Not to be confused with video frames.
train_batch_size: 1

# Maximum number of train steps. The model is saved after training.
max_train_steps: 10000

# Saves a model every nth step.
checkpointing_steps: 2500

# How many steps between validation runs if sample_preview is enabled.
validation_steps: 100

# Seed for validation.
seed: 64

# Whether or not we want to use mixed precision with accelerate.
mixed_precision: "fp16"

# This seems to be incompatible at the moment.
use_8bit_adam: False

# Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
# If you need to save more VRAM, it can also be enabled for the text encoder, but this reduces speed by about 2x.
gradient_checkpointing: True
text_encoder_gradient_checkpointing: False

# Xformers must be installed for best memory savings and performance (< PyTorch 2.0).
enable_xformers_memory_efficient_attention: False

# Use scaled dot product attention (only available with Torch >= 2.0).
enable_torch_2_attn: True
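
For reference, the memory-related flags above usually map onto standard diffusers/PyTorch calls; a short illustrative sketch (the exact wiring lives in the repository's training script, and the local model path is an assumption based on `pretrained_model_path`):

```python
import torch
import torch.nn.functional as F
from diffusers import UNet3DConditionModel

unet = UNet3DConditionModel.from_pretrained(
    "./models/model_scope_diffusers/", subfolder="unet"
)
unet.enable_gradient_checkpointing()                 # gradient_checkpointing: True
# unet.enable_xformers_memory_efficient_attention()  # only if xformers is installed (< PyTorch 2.0)

# With PyTorch >= 2.0, attention can use the built-in fused kernel instead:
q = k = v = torch.randn(1, 8, 16, 64)
out = F.scaled_dot_product_attention(q, k, v)        # what enable_torch_2_attn switches on
```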

0 commit comments
