update structure.md

This commit is contained in:
zhengzangw 2024-04-23 07:41:33 +00:00
parent 54a1161389
commit 01d14cb5e2
9 changed files with 288 additions and 514 deletions

View file

@ -122,7 +122,7 @@ More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sor
Other useful documents and links are listed below.
* Report: [1.0](docs/report_01.md), [1.1](docs/report_02.md), [acceleration.md](docs/acceleration.md)
* Report: [report 1.0](docs/report_01.md), [report 1.1](docs/report_02.md), [acceleration.md](docs/acceleration.md)
* Repo structure: [structure.md](docs/structure.md)
* Config file explanation: [config.md](docs/config.md)
* Useful commands: [commands.md](docs/commands.md)

View file

@ -1,55 +0,0 @@
# scripts/inference_long.py
num_frames = 16
fps = 24 // 3
image_size = (256, 256)
# Define model
model = dict(
type="STDiT-XL/2",
space_scale=0.5,
time_scale=1.0,
enable_flashattn=True,
enable_layernorm_kernel=True,
from_pretrained=None,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
micro_batch_size=4,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=120,
)
scheduler = dict(
type="iddpm",
# type="dpm-solver",
num_sampling_steps=100,
cfg_scale=7.0,
)
dtype = "bf16"
# Condition
prompt_path = None
prompt = [
"Drone view of waves crashing against the rugged cliffs along Big Surs garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliffs edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
"In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
]
loop = 10
condition_frame_length = 4
reference_path = [
"assets/images/condition/cliff.png",
"assets/images/condition/wave.png",
]
mask_strategy = [
"0,0,0,1,0",
"0,0,0,1,0",
] # valid when reference_path is not None
# (loop id, ref id, ref start, length, target start)
# Others
batch_size = 2
seed = 42
save_dir = "./samples/samples/"

View file

@ -1,51 +0,0 @@
# scripts/inference_long.py
num_frames = 16
fps = 24 // 3
image_size = (256, 256)
# Define model
model = dict(
type="STDiT-XL/2",
space_scale=0.5,
time_scale=1.0,
enable_flashattn=True,
enable_layernorm_kernel=True,
from_pretrained=None,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
micro_batch_size=4,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=120,
)
scheduler = dict(
type="iddpm",
num_sampling_steps=100,
cfg_scale=7.0,
)
dtype = "fp16"
# Condition
prompt_path = None
prompt = [
"A car driving on a road in the middle of a desert.",
]
loop = 1
condition_frame_length = 4
reference_path = [
"https://cdn.openai.com/tmp/s/interp/d0.mp4",
]
mask_strategy = [
"0,0,0,1,0,0",
] # valid when reference_path is not None
# (loop id, ref id, ref start, length, target start)
# Others
batch_size = len(prompt)
seed = 42
save_dir = "outputs/SDEdit"

View file

@ -1,50 +0,0 @@
# scripts/inference_long.py
num_frames = 16
fps = 24 // 3
image_size = (512, 512)
# Define model
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=1.0,
use_x_mask=True,
enable_flashattn=True,
enable_layernorm_kernel=True,
from_pretrained=None,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
micro_batch_size=4,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=120,
)
scheduler = dict(
type="iddpm",
# type="dpm-solver",
num_sampling_steps=100,
cfg_scale=7.0,
)
dtype = "bf16"
# Condition
prompt_path = None
prompt = [
"Drone view of waves crashing against the rugged cliffs along Big Surs garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliffs edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
"Pirate ship in a cosmic maelstrom nebula.",
]
loop = 1
# condition_frame_length = 4
reference_path = ["assets/images/condition/cliff.png", "assets/images/condition/ship.png"]
mask_strategy = ["0,0,0,1,0", "0,0,0,1,0"] # valid when reference_path is not None
# (loop id, ref id, ref start, length, target start)
# Others
batch_size = 2
seed = 42
save_dir = "./samples/samples/"

View file

@ -1,53 +0,0 @@
# scripts/inference_long.py
num_frames = 16
fps = 24 // 3
image_size = (512, 512)
# Define model
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=1.0,
use_x_mask=True,
enable_flashattn=True,
enable_layernorm_kernel=True,
from_pretrained=None,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
micro_batch_size=4,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=120,
)
scheduler = dict(
type="iddpm",
# type="dpm-solver",
num_sampling_steps=100,
cfg_scale=7.0,
)
dtype = "bf16"
# Condition
prompt_path = None
prompt = [
"Drone view of waves crashing against the rugged cliffs along Big Surs garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliffs edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
"A sad small cactus with in the Sahara desert becomes happy.",
]
loop = 1
condition_frame_length = 4
reference_path = [
"assets/images/condition/cliff.png",
"assets/images/condition/cactus-sad.png;assets/images/condition/cactus-happy.png",
]
mask_strategy = ["0,0,0,1,0;0,0,0,1,-1", "0,0,0,1,0;0,1,0,1,-1"] # valid when reference_path is not None
# (loop id, ref id, ref start, length, target start)
# Others
batch_size = 2
seed = 42
save_dir = "./samples/samples/"

View file

@ -1,50 +0,0 @@
# scripts/inference_long.py
num_frames = 16
fps = 24 // 3
image_size = (512, 512)
# Define model
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=1.0,
use_x_mask=True,
enable_flashattn=True,
enable_layernorm_kernel=True,
from_pretrained=None,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
micro_batch_size=4,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=120,
)
scheduler = dict(
type="iddpm",
# type="dpm-solver",
num_sampling_steps=100,
cfg_scale=7.0,
)
dtype = "bf16"
# Condition
prompt_path = None
prompt = [
"Drone view of waves crashing against the rugged cliffs along Big Surs garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliffs edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
"In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
]
loop = 10
condition_frame_length = 4
reference_path = ["assets/images/condition/cliff.png", "assets/images/condition/wave.png"]
mask_strategy = ["0,0,0,1,0", "0,0,0,1,0"] # valid when reference_path is not None
# (loop id, ref id, ref start, length, target start)
# Others
batch_size = 2
seed = 42
save_dir = "./samples/samples/"

View file

@ -0,0 +1,228 @@
## Inference config demos
To change the inference settings, you can directly modify the corresponding config file. Or you can pass arguments to overwrite the config file ([config_utils.py](/opensora/utils/config_utils.py)). To change sampling prompts, you should modify the `.txt` file passed to the `--prompt_path` argument.
```plaintext
--prompt_path ./assets/texts/t2v_samples.txt -> prompt_path
--ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"]
```
The explanation of each field is provided below.
```python
# Define sampling size
num_frames = 64 # number of frames
fps = 24 // 2 # frames per second (divided by 2 for frame_interval=2)
image_size = (512, 512) # image size (height, width)
# Define model
model = dict(
type="STDiT-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
space_scale=1.0, # (Optional) Space positional encoding scale (new height / old height)
time_scale=2 / 3, # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
enable_flashattn=True, # (Optional) Speed up training and inference with flash attention
enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model
no_temporal_pos_emb=True, # (Optional) Disable temporal positional encoding (for image)
)
vae = dict(
type="VideoAutoencoderKL", # Select VAE type
from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
micro_batch_size=128, # VAE with micro batch size to save memory
)
text_encoder = dict(
type="t5", # Select text encoder type (t5, clip)
from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
model_max_length=120, # Maximum length of input text
)
scheduler = dict(
type="iddpm", # Select scheduler type (iddpm, dpm-solver)
num_sampling_steps=100, # Number of sampling steps
cfg_scale=7.0, # hyper-parameter for classifier-free diffusion
cfg_channel=3, # how many channels to use for classifier-free diffusion, if None, use all channels
)
dtype = "fp16" # Computation type (fp16, fp32, bf16)
# Other settings
batch_size = 1 # batch size
seed = 42 # random seed
prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
save_dir = "./samples" # path to save samples
```
## Training config demos
```python
# Define sampling size
num_frames = 64
frame_interval = 2 # sample every 2 frames
image_size = (512, 512)
# Define dataset
root = None # root path to the dataset
data_path = "CSV_PATH" # path to the csv file
use_image_transform = False # True if training on images
num_workers = 4 # number of workers for dataloader
# Define acceleration
dtype = "bf16" # Computation type (fp16, bf16)
grad_checkpoint = True # Use gradient checkpointing
plugin = "zero2" # Plugin for distributed training (zero2, zero2-seq)
sp_size = 1 # Sequence parallelism size (1 for no sequence parallelism)
# Define model
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=2 / 3,
from_pretrained="YOUR_PRETRAINED_MODEL",
enable_flashattn=True, # Enable flash attention
enable_layernorm_kernel=True, # Enable layernorm kernel
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
micro_batch_size=128,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=120,
shardformer=True, # Enable shardformer for T5 acceleration
)
scheduler = dict(
type="iddpm",
timestep_respacing="", # Default 1000 timesteps
)
# Others
seed = 42
outputs = "outputs" # path to save checkpoints
wandb = False # Use wandb for logging
epochs = 1000 # number of epochs (just large enough, kill when satisfied)
log_every = 10
ckpt_every = 250
load = None # path to resume training
batch_size = 4
lr = 2e-5
grad_clip = 1.0 # gradient clipping
```
## Inference-long specific arguments
The [`inference-long.py`](/scripts/inference-long.py) script is used to generate long videos, and it also provides all functions of the [`inference.py`](/scripts/inference.py) script. The following arguments are specific to the `inference-long.py` script.
```python
loop = 10
condition_frame_length = 4
reference_path = ["one.png;two.mp4"]
mask_strategy = ["0,0,0,1,0;0,0,0,1,-1"]
```
To generate a long video of any length, our strategy is to first generate a video with a fixed length, and then use the last `condition_frame_length` frames as the condition for the next video generation. This loops for `loop` times. Thus, the total length of the video is `loop * (num_frames - condition_frame_length) + condition_frame_length`.
To condition the generation on images or videos, we introduce the `mask_strategy`. It is a list of 5-number tuples separated by `;`. Each tuple indicates an insertion of a condition image or video into the target generation. The meaning of each number is:
- First number: the loop index at which the condition image or video is applied. (0 means the first loop, 1 means the second loop, etc.)
- Second number: the index of the condition image or video in the `reference_path`. (0 means one.png, and 1 means two.mp4)
- Third number: the start frame of the condition image or video. (0 means the first frame, and images only have one frame)
- Fourth number: the number of frames to insert. (1 means insert one frame, and images only have one frame)
- Fifth number: the target frame location at which to insert. (0 means insert at the beginning, k means insert starting at target frame k, and -1 means insert at the end of the video)
Thus, "0,0,0,1,-1" means insert the first frame of one.png at the end of the video at the first loop.
## Bucket Configs
To enable dynamic training (for STDiT2), use the `VariableVideoText` dataset, and set the `bucket_config` in the config. An example is:
```python
bucket_config = {
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
"256": {1: (1.0, 256)},
"512": {1: (1.0, 80)},
"480p": {1: (1.0, 52), 16: (0.5, 4), 32: (0.0, None)},
"720p": {16: (1.0, 2), 32: (0.0, None)},
"1024": {1: (1.0, 20)},
"1080p": {1: (1.0, 8)},
}
```
This may look a bit difficult to understand at first glance. Let's go through this config step by step.
### Three-level bucket
We design a three-level bucket: `(resolution, num_frames, aspect_ratios)`. The resolutions and aspect ratios are predefined in [aspect.py](/opensora/datasets/aspect.py). Commonly used resolutions (e.g., 240p, 1080p) are supported, and the name represents the number of pixels (e.g., 240p is 240x426; however, we define 240p to represent any size with HxW of approximately 240x426=102240 pixels). The aspect ratios are defined for each resolution. You do not need to define the aspect ratios in the `bucket_config`.
The `num_frames` is the number of frames in each sample, with `num_frames=1` reserved for images. If `frame_interval` is not 1, a bucket with `num_frames=k` will contain videos with `k*frame_interval` frames, except for images. Only a video with at least `num_frames` frames and at least `resolution` pixels is likely to be put into the bucket.
The two numbers defined in the bucket config are `(keep_prob, batch_size)`. Since the memory footprint and processing speed of samples from different buckets may differ, we use `batch_size` to balance the processing speed. Since our computation is limited, we cannot process videos at their original resolution as stated in OpenAI's Sora report. Thus, we use `keep_prob` to control the number of samples in each bucket. The `keep_prob` is the probability of keeping a sample in the bucket. Let's take the following config as an example:
```python
bucket_config = {
"480p": {16: (1.0, 8),},
"720p": {16: (0.5, 4),},
"1080p": {16: (0.2, 2)},
"4K", {16: (0.1, 1)},
}
```
Given a 2K video with more than 16 frames, the program will first try to put it into bucket "1080p" since it has a larger resolution than 1080p but less than 4K. Since the `keep_prob` for 1080p is 20%, a random number is generated, and if it is less than 0.2, the video will be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "720p" bucket. Since the `keep_prob` for 720p is 50%, the video has a 50% chance to be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "480p" bucket directly as it is the smallest resolution.
### Examples
Let's see some simple examples to understand the bucket config. First, the aspect ratio bucket is compulsory; if you want to modify it, you need to add your own resolution definition in [aspect.py](/opensora/datasets/aspect.py). Then, to keep only the 256x256 resolution and 16 frames as in OpenSora 1.0, you can use the following config:
```python
bucket_config = {
"256": {16: (1.0, 8)},
}
```
If you want to train a model supporting different resolutions of images, you can use the following config:
```python
bucket_config = {
"256": {1: (1.0, 256)},
"512": {1: (1.0, 80)},
"480p": {1: (1.0, 52)},
"1024": {1: (1.0, 20)},
"1080p": {1: (1.0, 8)},
}
```
Or if you find the number of high-resolution images is too large, you can modify the `keep_prob` to reduce the number of samples in the bucket:
```python
bucket_config = {
"256": {1: (1.0, 256)},
"512": {1: (0.8, 80)},
"480p": {1: (0.5, 52)},
"1024": {1: (0.5, 20)},
"1080p": {1: (0.2, 8)},
}
```
And similarly for videos:
```python
bucket_config = {
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
"480p": {16: (1.0, 4)},
"720p": {16: (0.5, 2)},
}
```
Note that in the above case, videos with 480p resolution and more than 16 frames will all go into bucket `("480p", 16)`, since they all satisfy this bucket's requirement. But training long videos at 480p resolution may be slow, so you can modify the config as follows to force videos with more than 32 frames into the 240p bucket.
```python
bucket_config = {
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
"480p": {16: (1.0, 4), 32: (0.0, None)},
"720p": {16: (0.5, 2)},
}
```
Combining the above examples, we think you can now understand the bucket config provided at the beginning of this section and in the config files.

View file

@ -74,24 +74,25 @@ As we found in Open-Sora 1.0, the data number and quality are crucial for traini
We plan to use [panda-70M](https://snap-research.github.io/Panda-70M/) and other data to train the model, which is approximately 30M+ data. However, we find disk I/O a bottleneck for training and data processing at the same time. Thus, we could only prepare a 10M dataset and did not go through the entire processing pipeline that we built. Finally, we use a dataset with 9.7M videos + 2.6M images for pre-training, and 560k videos + 1.6M images for fine-tuning. The pretraining dataset statistics are shown below.
Image text tokens (by T5 tokenizer): ![image text tokens](/assets/readme/report_image_textlen.png)
Image text tokens (by T5 tokenizer):
![image text tokens](/assets/readme/report_image_textlen.png)
Video text tokens (by T5 tokenizer). We directly use panda's short caption for training, and caption other datasets by ourselves. The generated caption is usually less than 200 tokens.
![video text tokens](/assets/readme/report_video_textlen.png)
Video duration: ![video duration](/assets/readme/report_video_duration.png)
Video duration:
![video duration](/assets/readme/report_video_duration.png)
## Training Details
With limited computational resources, we have to carefully monitor the training process, and change the training strategy if we speculate the model is not learning well since there is no computation for ablation study. Thus, Open-Sora 1.1's training includes multiple changes, and as a result, ema is not applied.
1. First, we fine-tune 6k steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for 24k steps, which takes 4 days on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. The most videos are in 240p resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py).
3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained 40k steps for 2 days. The most videos are in 144p resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py).
4. **[Stage 1]** We find the model cannot learn well for long videos, and find a noised generation result as speculated to be half-precision problem found in Open-Sora 1.0 training. Thus, we adopt the QK-normalization to stabilize the training. We also switch iddpm-speed to iddpm. We trained for 17k steps for 14 hours. The most videos are in 144p resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with total step 81k.
5. **[Stage 2]** We switch to a higher resolution, where most videos are in 240p and 480p resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained 22k steps for one day on all pre-training data, and ?k with ? hours on high-quality data.
6. **[Stage 3]** We switch to a higher resolution, where most videos are in 480p and 720p resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained ?k with ? hours on high-quality data.
1. First, we fine-tune **6k** steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for **24k** steps, which takes **4 days** on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. The most videos are in **240p** resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py).
3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained **40k** steps for **2 days**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py).
4. **[Stage 1]** We find the model cannot learn well for long videos, and find a noised generation result as speculated to be half-precision problem found in Open-Sora 1.0 training. Thus, we adopt the QK-normalization to stabilize the training. We also switch iddpm-speed to iddpm. We trained for **17k** steps for **14 hours**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with total step **81k**.
5. **[Stage 2]** We switch to a higher resolution, where most videos are in **240p and 480p** resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained **22k** steps for **one day** on all pre-training data, and **?k** with **? hours** on high-quality data.
6. **[Stage 3]** We switch to a higher resolution, where most videos are in **480p and 720p** resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained **?k** with **? hours** on high-quality data.
## Results and Evaluation
@ -104,5 +105,5 @@ As we get one step closer to the replication of Sora, we find many limitations f
- **Bad human generation**: We find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation.
- **Low aesthetic score**: we find the model's aesthetic score is not high. The problem is due to the lack of aesthetic score filtering, which is not conducted due to IO bottleneck. We plan to filter the data by aesthetic score and finetuning the model to improve the aesthetic score.
> **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
> **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu
> - **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
> - **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu

View file

@ -1,31 +1,36 @@
# Repo & Config Structure
- [Repo \& Config Structure](#repo--config-structure)
- [Repo Structure](#repo-structure)
- [Configs](#configs)
- [Inference config demos](#inference-config-demos)
- [Training config demos](#training-config-demos)
- [Bucket Configs](#bucket-configs)
- [Why bucket?](#why-bucket)
- [Three-level bucket](#three-level-bucket)
- [Examples](#examples)
## Repo Structure
# Repo Structure
```plaintext
Open-Sora
├── README.md
├── assets
│ ├── images -> images used for image-conditioned generation
│ ├── texts -> prompts used for text-conditioned generation
│ └── readme -> images used in README
├── configs -> Configs for training & inference
├── docs
│ ├── acceleration.md -> Acceleration & Speed benchmark
│ ├── acceleration.md -> Report on acceleration & speed benchmark
│ ├── command.md -> Commands for training & inference
│ ├── datasets.md -> Datasets used in this project
| ├── data_pipeline.md -> Data pipeline documents
│ ├── structure.md -> This file
│ └── report_v1.md -> Report for Open-Sora v1
│ ├── config.md -> Configs meaning
│ ├── report_01.md -> Report for Open-Sora 1.1
│ ├── report_02.md -> Report for Open-Sora 1.0
│ └── zh_CN -> Chinese version of the above
├── eval -> Evaluation scripts
│ ├── README.md -> Evaluation documentation
| ├── sample.sh -> script for quickly launching inference on predefined prompts
| ├── launch.sh -> script for launching 8 cards sampling
| ├── vbench -> for VBench evaluation
│ └── vbench_i2v -> for VBench i2v evaluation
├── gradio -> Gradio demo related code
├── notebooks -> Jupyter notebooks for generating commands to run
├── scripts
│ ├── train.py -> diffusion training script
│ └── inference.py -> Report for Open-Sora v1
├── configs -> Configs for training & inference
│ ├── inference.py -> diffusion inference script
│ ├── inference-long.py -> inference script supporting more advanced features
│ └── misc -> misc scripts, including batch size search
├── opensora
│ ├── __init__.py
│ ├── registry.py -> Registry helper
@ -46,6 +51,7 @@ Open-Sora
│   │   ├── iddpm -> IDDPM for training and inference
│   │ └── dpms -> DPM-Solver for fast inference
│ └── utils
├── tests -> Tests for the project
└── tools -> Tools for data processing and more
```
@ -56,6 +62,17 @@ Our config files follows [MMEgine](https://github.com/open-mmlab/mmengine). MMEn
```plaintext
Open-Sora
└── configs -> Configs for training & inference
├── opensora-v1-1 -> STDiT2 related configs
│ ├── inference
│ │ ├── sample.py -> Sample videos and images
│ │ └── sample-ref.py -> Sample videos with image/video condition
│ └── train
│ ├── stage1.py -> Stage 1 training config
│ ├── stage2.py -> Stage 2 training config
│ ├── stage3.py -> Stage 3 training config
│ ├── image.py -> Illustration of image training config
│ ├── video.py -> Illustration of video training config
│ └── benchmark.py -> For batch size searching
├── opensora -> STDiT related configs
│ ├── inference
│ │ ├── 16x256x256.py -> Sample videos 16 frames 256x256
@ -77,230 +94,17 @@ Open-Sora
└── pixart -> PixArt related configs
```
## Inference config demos
To change the inference settings, you can directly modify the corresponding config file. Or you can pass arguments to overwrite the config file ([config_utils.py](/opensora/utils/config_utils.py)). To change sampling prompts, you should modify the `.txt` file passed to the `--prompt-path` argument.
## Tools
```plaintext
--prompt-path ./assets/texts/t2v_samples.txt -> prompt_path
--ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"]
```
The explanation of each field is provided below.
```python
# Define sampling size
num_frames = 64 # number of frames
fps = 24 // 2 # frames per second (divided by 2 for frame_interval=2)
image_size = (512, 512) # image size (height, width)
# Define model
model = dict(
type="STDiT-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
space_scale=1.0, # (Optional) Space positional encoding scale (new height / old height)
time_scale=2 / 3, # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
enable_flashattn=True, # (Optional) Speed up training and inference with flash attention
enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model
no_temporal_pos_emb=True, # (Optional) Disable temporal positional encoding (for image)
)
vae = dict(
type="VideoAutoencoderKL", # Select VAE type
from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
micro_batch_size=128, # VAE with micro batch size to save memory
)
text_encoder = dict(
type="t5", # Select text encoder type (t5, clip)
from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
model_max_length=120, # Maximum length of input text
)
scheduler = dict(
type="iddpm", # Select scheduler type (iddpm, dpm-solver)
num_sampling_steps=100, # Number of sampling steps
cfg_scale=7.0, # hyper-parameter for classifier-free diffusion
cfg_channel=3, # how many channels to use for classifier-free diffusion, if None, use all channels
)
dtype = "fp16" # Computation type (fp16, fp32, bf16)
# Other settings
batch_size = 1 # batch size
seed = 42 # random seed
prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
save_dir = "./samples" # path to save samples
```
## Training config demos
```python
# Define sampling size
num_frames = 64
frame_interval = 2 # sample every 2 frames
image_size = (512, 512)
# Define dataset
root = None # root path to the dataset
data_path = "CSV_PATH" # path to the csv file
use_image_transform = False # True if training on images
num_workers = 4 # number of workers for dataloader
# Define acceleration
dtype = "bf16" # Computation type (fp16, bf16)
grad_checkpoint = True # Use gradient checkpointing
plugin = "zero2" # Plugin for distributed training (zero2, zero2-seq)
sp_size = 1 # Sequence parallelism size (1 for no sequence parallelism)
# Define model
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=2 / 3,
from_pretrained="YOUR_PRETRAINED_MODEL",
enable_flashattn=True, # Enable flash attention
enable_layernorm_kernel=True, # Enable layernorm kernel
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
micro_batch_size=128,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=120,
shardformer=True, # Enable shardformer for T5 acceleration
)
scheduler = dict(
type="iddpm",
timestep_respacing="", # Default 1000 timesteps
)
# Others
seed = 42
outputs = "outputs" # path to save checkpoints
wandb = False # Use wandb for logging
epochs = 1000 # number of epochs (just large enough, kill when satisfied)
log_every = 10
ckpt_every = 250
load = None # path to resume training
batch_size = 4
lr = 2e-5
grad_clip = 1.0 # gradient clipping
```
## Inference-long specific arguments
The [`inference-long.py`](/scripts/inference-long.py) script is used to generate long videos, and it also provides all functions of the [`inference.py`](/scripts/inference.py) script. The following arguments are specific to the `inference-long.py` script.
```python
loop = 10
condition_frame_length = 4
reference_path = ["one.png;two.mp4"]
mask_strategy = ["0,0,0,1,0;0,0,0,1,-1"]
```
To generate a long video of any length, our strategy is to generate a video with a fixed length first, and then use the last `condition_frame_length` number of frames for the next video generation. This will loop for `loop` times. Thus, the total length of the video is `loop * (num_frames - condition_frame_length) + condition_frame_length`.
To condition the generation on images or videos, we introduce the `mask_strategy`. It is a list of 5-number tuples separated by `;`. Each tuple indicates an insertion of the condition image or video into the target generation. The meaning of each number is:
- First number: the index of the condition image or video in the `reference_path`. (0 means one.png, and 1 means two.mp4)
- Second number: the loop index of the condition image or video. (0 means the first loop, 1 means the second loop, etc.)
- Third number: the start frame of the condition image or video. (0 means the first frame, and images only have one frame)
- Fourth number: the number of frames to insert. (1 means insert one frame, and images only have one frame)
- Fifth number: the location to insert, i.e., the frame index in the generated video. (0 means insert at the beginning, 1 means insert starting at the second frame, and -1 means insert at the end of the video)
Thus, "0,0,0,1,-1" means insert the first frame of one.png at the end of the video at the first loop.
## Bucket Configs
To enable dynamic training (for STDiT2), use `VariableVideoText` dataset, and set the `bucket_config` in the config. An example is:
```python
bucket_config = {
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
"256": {1: (1.0, 256)},
"512": {1: (1.0, 80)},
"480p": {1: (1.0, 52), 16: (0.5, 4), 32: (0.0, None)},
"720p": {16: (1.0, 2), 32: (0.0, None)},
"1024": {1: (1.0, 20)},
"1080p": {1: (1.0, 8)},
}
```
This looks a bit difficult to understand at first glance. Let's understand this config step by step.
### Three-level bucket
We design a three-level bucket: `(resolution, num_frames, aspect_ratios)`. The resolution and aspect ratios are predefined in [aspect.py](/opensora/datasets/aspect.py). Commonly used resolutions (e.g., 240p, 1080p) are supported, and the name represents the number of pixels (e.g., 240p is 240x426; however, we define 240p to represent any size with HxW approximately 240x426=102240 pixels). The aspect ratios are defined for each resolution. You do not need to define the aspect ratios in the `bucket_config`.
The `num_frames` is the number of frames in each sample, with `num_frames=1` especially for images. If `frame_intervals` is not 1, a bucket with `num_frames=k` will contain videos with `k*frame_intervals` frames except for images. Only a video with more than `num_frames` and more than `resolution` pixels is likely to be put into the bucket.
The two numbers defined in the bucket config are `(keep_prob, batch_size)`. Since the memory and speed of samples from different buckets may be different, we use `batch_size` to balance the processing speed. Since our computation is limited, we cannot process videos with their original resolution as stated in OpenAI's Sora report. Thus, we give a `keep_prob` to control the number of samples in each bucket. The `keep_prob` is the probability to keep a sample in the bucket. Let's take the following config as an example:
```python
bucket_config = {
"480p": {16: (1.0, 8),},
"720p": {16: (0.5, 4),},
"1080p": {16: (0.2, 2)},
    "4K": {16: (0.1, 1)},
}
```
Given a 2K video with more than 16 frames, the program will first try to put it into bucket "1080p" since it has a larger resolution than 1080p but less than 4K. Since the `keep_prob` for 1080p is 20%, a random number is generated, and if it is less than 0.2, the video will be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "720p" bucket. Since the `keep_prob` for 720p is 50%, the video has a 50% chance to be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "480p" bucket directly as it is the smallest resolution.
### Examples
Let's see some simple examples to understand the bucket config. First, the aspect ratio bucket is compulsory, if you want to modify this you need to add your own resolution definition in [aspect.py](/opensora/datasets/aspect.py). Then, to keep only 256x256 resolution and 16 frames as OpenSora 1.0, you can use the following config:
```python
bucket_config = {
"256": {16: (1.0, 8)},
}
```
If you want to train a model supporting different resolutions of images, you can use the following config:
```python
bucket_config = {
"256": {1: (1.0, 256)},
"512": {1: (1.0, 80)},
"480p": {1: (1.0, 52)},
"1024": {1: (1.0, 20)},
"1080p": {1: (1.0, 8)},
}
```
Or if you find the number of high-resolution images is too large, you can modify the `keep_prob` to reduce the number of samples in the bucket:
```python
bucket_config = {
"256": {1: (1.0, 256)},
"512": {1: (0.8, 80)},
"480p": {1: (0.5, 52)},
"1024": {1: (0.5, 20)},
"1080p": {1: (0.2, 8)},
}
```
And similarly for videos:
```python
bucket_config = {
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
"480p": {16: (1.0, 4)},
"720p": {16: (0.5, 2)},
}
```
Note that in the above case, videos with 480p resolution and more than 16 frames will all go into bucket `("480p", 16)`, since they all satisfy this bucket's requirement. But training long videos with 480p resolution may be slow, so you can modify the config as follows to enforce the video with more than 32 frames to go into the 240p bucket.
```python
bucket_config = {
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
"480p": {16: (1.0, 4), 32: (0.0, None)},
"720p": {16: (0.5, 2)},
}
```
Combining the above examples, we think you can understand the bucket config provided at the beginning of this section and in the config files.
Open-Sora
└── tools
├── datasets -> dataset management related code
├── scene_cut -> scene cut related code
├── caption -> caption related code
├── scoring -> scoring related code
│ ├── aesthetic -> aesthetic scoring related code
│ ├── matching -> matching scoring related code
│ ├── ocr -> ocr scoring related code
│ └── optical_flow -> optical flow scoring related code
└── frame_interpolation -> frame interpolation related code