mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-10 21:01:26 +02:00
update structure.md
This commit is contained in:
parent
54a1161389
commit
01d14cb5e2
|
|
@ -122,7 +122,7 @@ More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sor
|
|||
|
||||
Other useful documents and links are listed below.
|
||||
|
||||
* Report: [1.0](docs/report_01.md), [1.1](docs/report_02.md), [acceleration.md](docs/acceleration.md)
|
||||
* Report: [report 1.0](docs/report_01.md), [report 1.1](docs/report_02.md), [acceleration.md](docs/acceleration.md)
|
||||
* Repo structure: [structure.md](docs/structure.md)
|
||||
* Config file explanation: [config.md](docs/config.md)
|
||||
* Useful commands: [commands.md](docs/commands.md)
|
||||
|
|
|
|||
|
|
@ -1,55 +0,0 @@
|
|||
# Config for scripts/inference_long.py: looped 16-frame generation at 256x256,
# conditioned on two reference images, computed in bf16.

# Sampling size
num_frames = 16
fps = 24 // 3  # effective playback fps after frame-interval subsampling
image_size = (256, 256)

# Model definition
model = {
    "type": "STDiT-XL/2",
    "space_scale": 0.5,
    "time_scale": 1.0,
    "enable_flashattn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": None,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    # "type": "dpm-solver",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
}
dtype = "bf16"

# Conditioning
prompt_path = None
prompt = [
    "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
]

loop = 10  # number of generation loops stitched into one long video
condition_frame_length = 4  # frames carried over from one loop to the next
reference_path = ["assets/images/condition/cliff.png", "assets/images/condition/wave.png"]
# One strategy string per reference; only used when reference_path is not None.
# Tuple fields: (loop id, ref id, ref start, length, target start)
mask_strategy = ["0,0,0,1,0", "0,0,0,1,0"]

# Misc
batch_size = 2
seed = 42
save_dir = "./samples/samples/"
|
@ -1,51 +0,0 @@
|
|||
# Config for scripts/inference_long.py: single-loop SDEdit-style generation
# at 256x256, conditioned on a reference video clip, computed in fp16.

# Sampling size
num_frames = 16
fps = 24 // 3  # effective playback fps after frame-interval subsampling
image_size = (256, 256)

# Model definition
model = {
    "type": "STDiT-XL/2",
    "space_scale": 0.5,
    "time_scale": 1.0,
    "enable_flashattn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": None,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
}
dtype = "fp16"

# Conditioning
prompt_path = None
prompt = ["A car driving on a road in the middle of a desert."]

loop = 1  # single pass, no long-video stitching
condition_frame_length = 4
reference_path = ["https://cdn.openai.com/tmp/s/interp/d0.mp4"]
# Only used when reference_path is not None.
# Tuple fields: (loop id, ref id, ref start, length, target start)
mask_strategy = ["0,0,0,1,0,0"]

# Misc
batch_size = len(prompt)  # one sample per prompt
seed = 42
save_dir = "outputs/SDEdit"
|
@ -1,50 +0,0 @@
|
|||
# Config for scripts/inference_long.py: single-loop generation at 512x512,
# conditioned on one reference image per prompt, computed in bf16.

# Sampling size
num_frames = 16
fps = 24 // 3  # effective playback fps after frame-interval subsampling
image_size = (512, 512)

# Model definition
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 1.0,
    "use_x_mask": True,
    "enable_flashattn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": None,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    # "type": "dpm-solver",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
}
dtype = "bf16"

# Conditioning
prompt_path = None
prompt = [
    "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
    "Pirate ship in a cosmic maelstrom nebula.",
]

loop = 1  # single pass, no long-video stitching
# condition_frame_length = 4
reference_path = ["assets/images/condition/cliff.png", "assets/images/condition/ship.png"]
# Only used when reference_path is not None.
# Tuple fields: (loop id, ref id, ref start, length, target start)
mask_strategy = ["0,0,0,1,0", "0,0,0,1,0"]

# Misc
batch_size = 2
seed = 42
save_dir = "./samples/samples/"
|
@ -1,53 +0,0 @@
|
|||
# Config for scripts/inference_long.py: single-loop generation at 512x512 with
# multiple references per prompt (';'-separated paths), computed in bf16.

# Sampling size
num_frames = 16
fps = 24 // 3  # effective playback fps after frame-interval subsampling
image_size = (512, 512)

# Model definition
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 1.0,
    "use_x_mask": True,
    "enable_flashattn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": None,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    # "type": "dpm-solver",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
}
dtype = "bf16"

# Conditioning
prompt_path = None
prompt = [
    "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
    "A sad small cactus with in the Sahara desert becomes happy.",
]

loop = 1  # single pass, no long-video stitching
condition_frame_length = 4
# The second prompt uses two references joined by ';' (start and end frames).
reference_path = [
    "assets/images/condition/cliff.png",
    "assets/images/condition/cactus-sad.png;assets/images/condition/cactus-happy.png",
]
# Only used when reference_path is not None.
# Tuple fields: (loop id, ref id, ref start, length, target start)
mask_strategy = ["0,0,0,1,0;0,0,0,1,-1", "0,0,0,1,0;0,1,0,1,-1"]

# Misc
batch_size = 2
seed = 42
save_dir = "./samples/samples/"
|
@ -1,50 +0,0 @@
|
|||
# Config for scripts/inference_long.py: looped 16-frame generation at 512x512,
# conditioned on two reference images, computed in bf16.

# Sampling size
num_frames = 16
fps = 24 // 3  # effective playback fps after frame-interval subsampling
image_size = (512, 512)

# Model definition
model = {
    "type": "STDiT-XL/2",
    "space_scale": 1.0,
    "time_scale": 1.0,
    "use_x_mask": True,
    "enable_flashattn": True,
    "enable_layernorm_kernel": True,
    "from_pretrained": None,
}
vae = {
    "type": "VideoAutoencoderKL",
    "from_pretrained": "stabilityai/sd-vae-ft-ema",
    "micro_batch_size": 4,
}
text_encoder = {
    "type": "t5",
    "from_pretrained": "DeepFloyd/t5-v1_1-xxl",
    "model_max_length": 120,
}
scheduler = {
    "type": "iddpm",
    # "type": "dpm-solver",
    "num_sampling_steps": 100,
    "cfg_scale": 7.0,
}
dtype = "bf16"

# Conditioning
prompt_path = None
prompt = [
    "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
]

loop = 10  # number of generation loops stitched into one long video
condition_frame_length = 4  # frames carried over from one loop to the next
reference_path = ["assets/images/condition/cliff.png", "assets/images/condition/wave.png"]
# Only used when reference_path is not None.
# Tuple fields: (loop id, ref id, ref start, length, target start)
mask_strategy = ["0,0,0,1,0", "0,0,0,1,0"]

# Misc
batch_size = 2
seed = 42
save_dir = "./samples/samples/"
228
docs/config.md
228
docs/config.md
|
|
@ -0,0 +1,228 @@
|
|||
|
||||
## Inference config demos
|
||||
|
||||
To change the inference settings, you can directly modify the corresponding config file. Or you can pass arguments to overwrite the config file ([config_utils.py](/opensora/utils/config_utils.py)). To change sampling prompts, you should modify the `.txt` file passed to the `--prompt_path` argument.
|
||||
|
||||
```plaintext
|
||||
--prompt_path ./assets/texts/t2v_samples.txt -> prompt_path
|
||||
--ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"]
|
||||
```
|
||||
|
||||
The explanation of each field is provided below.
|
||||
|
||||
```python
|
||||
# Define sampling size
|
||||
num_frames = 64 # number of frames
|
||||
fps = 24 // 2 # frames per second (divided by 2 for frame_interval=2)
|
||||
image_size = (512, 512) # image size (height, width)
|
||||
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
|
||||
space_scale=1.0, # (Optional) Space positional encoding scale (new height / old height)
|
||||
time_scale=2 / 3, # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
|
||||
enable_flashattn=True, # (Optional) Speed up training and inference with flash attention
|
||||
enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
|
||||
from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model
|
||||
no_temporal_pos_emb=True, # (Optional) Disable temporal positional encoding (for image)
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL", # Select VAE type
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
|
||||
micro_batch_size=128, # VAE with micro batch size to save memory
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5", # Select text encoder type (t5, clip)
|
||||
from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
|
||||
model_max_length=120, # Maximum length of input text
|
||||
)
|
||||
scheduler = dict(
|
||||
type="iddpm", # Select scheduler type (iddpm, dpm-solver)
|
||||
num_sampling_steps=100, # Number of sampling steps
|
||||
cfg_scale=7.0, # hyper-parameter for classifier-free diffusion
|
||||
cfg_channel=3, # how many channels to use for classifier-free diffusion, if None, use all channels
|
||||
)
|
||||
dtype = "fp16" # Computation type (fp16, fp32, bf16)
|
||||
|
||||
# Other settings
|
||||
batch_size = 1 # batch size
|
||||
seed = 42 # random seed
|
||||
prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
|
||||
save_dir = "./samples" # path to save samples
|
||||
```
|
||||
|
||||
## Training config demos
|
||||
|
||||
```python
|
||||
# Define sampling size
|
||||
num_frames = 64
|
||||
frame_interval = 2 # sample every 2 frames
|
||||
image_size = (512, 512)
|
||||
|
||||
# Define dataset
|
||||
root = None # root path to the dataset
|
||||
data_path = "CSV_PATH" # path to the csv file
|
||||
use_image_transform = False # True if training on images
|
||||
num_workers = 4 # number of workers for dataloader
|
||||
|
||||
# Define acceleration
|
||||
dtype = "bf16" # Computation type (fp16, bf16)
|
||||
grad_checkpoint = True # Use gradient checkpointing
|
||||
plugin = "zero2" # Plugin for distributed training (zero2, zero2-seq)
|
||||
sp_size = 1 # Sequence parallelism size (1 for no sequence parallelism)
|
||||
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=2 / 3,
|
||||
from_pretrained="YOUR_PRETRAINED_MODEL",
|
||||
enable_flashattn=True, # Enable flash attention
|
||||
enable_layernorm_kernel=True, # Enable layernorm kernel
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema",
|
||||
micro_batch_size=128,
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
from_pretrained="DeepFloyd/t5-v1_1-xxl",
|
||||
model_max_length=120,
|
||||
shardformer=True, # Enable shardformer for T5 acceleration
|
||||
)
|
||||
scheduler = dict(
|
||||
type="iddpm",
|
||||
timestep_respacing="", # Default 1000 timesteps
|
||||
)
|
||||
|
||||
# Others
|
||||
seed = 42
|
||||
outputs = "outputs" # path to save checkpoints
|
||||
wandb = False # Use wandb for logging
|
||||
|
||||
epochs = 1000 # number of epochs (just large enough, kill when satisfied)
|
||||
log_every = 10
|
||||
ckpt_every = 250
|
||||
load = None # path to resume training
|
||||
|
||||
batch_size = 4
|
||||
lr = 2e-5
|
||||
grad_clip = 1.0 # gradient clipping
|
||||
```
|
||||
|
||||
## Inference-long specific arguments
|
||||
|
||||
The [`inference-long.py`](/scripts/inference-long.py) script is used to generate long videos, and it also provides all functions of the [`inference.py`](/scripts/inference.py) script. The following arguments are specific to the `inference-long.py` script.
|
||||
|
||||
```python
|
||||
loop = 10
|
||||
condition_frame_length = 4
|
||||
reference_path = ["one.png;two.mp4"]
|
||||
mask_strategy = ["0,0,0,1,0;0,0,0,1,-1"]
|
||||
```
|
||||
|
||||
To generate a long video of any length, our strategy is to generate a video with a fixed length first, and then use the last `condition_frame_length` frames for the next video generation. This will loop for `loop` times. Thus, the total length of the video is `loop * (num_frames - condition_frame_length) + condition_frame_length`.
|
||||
|
||||
To condition the generation on images or videos, we introduce the `mask_strategy`. It consists of 5-number tuples separated by `;`. Each tuple indicates an insertion of the condition image or video into the target generation. The meaning of each number is:
|
||||
|
||||
- First number: the index of the condition image or video in the `reference_path`. (0 means one.png, and 1 means two.mp4)
|
||||
- Second number: the loop index of the condition image or video. (0 means the first loop, 1 means the second loop, etc.)
|
||||
- Third number: the start frame of the condition image or video. (0 means the first frame, and images only have one frame)
|
||||
- Fourth number: the number of frames to insert. (1 means insert one frame, and images only have one frame)
|
||||
- Fifth number: the location to insert. (0 means insert at the beginning of the video, and -1 means insert at the end of the video)
|
||||
|
||||
Thus, "0,0,0,1,-1" means insert the first frame of one.png at the end of the video at the first loop.
|
||||
|
||||
## Bucket Configs
|
||||
|
||||
To enable dynamic training (for STDiT2), use `VariableVideoText` dataset, and set the `bucket_config` in the config. An example is:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
|
||||
"256": {1: (1.0, 256)},
|
||||
"512": {1: (1.0, 80)},
|
||||
"480p": {1: (1.0, 52), 16: (0.5, 4), 32: (0.0, None)},
|
||||
"720p": {16: (1.0, 2), 32: (0.0, None)},
|
||||
"1024": {1: (1.0, 20)},
|
||||
"1080p": {1: (1.0, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
This looks a bit difficult to understand at first glance. Let's understand this config step by step.
|
||||
|
||||
### Three-level bucket
|
||||
|
||||
We design a three-level bucket: `(resolution, num_frames, aspect_ratios)`. The resolution and aspect ratios is predefined in [aspect.py](/opensora/datasets/aspect.py). Commonly used resolutions (e.g., 240p, 1080p) are supported, and the name represents the number of pixels (e.g., 240p is 240x426, however, we define 240p to represent any size with HxW approximately 240x426=102240 pixels). The aspect ratios are defined for each resolution. You do not need to define the aspect ratios in the `bucket_config`.
|
||||
|
||||
The `num_frames` is the number of frames in each sample, with `num_frames=1` reserved for images. If `frame_interval` is not 1, a bucket with `num_frames=k` will contain videos with `k*frame_interval` frames, except for images. Only a video with more than `num_frames` frames and more than `resolution` pixels is likely to be put into the bucket.
|
||||
|
||||
The two numbers defined in the bucket config are `(keep_prob, batch_size)`. Since the memory and speed of samples from different buckets may be different, we use `batch_size` to balance the processing speed. Since our computation is limited, we cannot process videos with their original resolution as stated in OpenAI's Sora report. Thus, we give a `keep_prob` to control the number of samples in each bucket. The `keep_prob` is the probability to keep a sample in the bucket. Let's take the following config as an example:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"480p": {16: (1.0, 8),},
|
||||
"720p": {16: (0.5, 4),},
|
||||
"1080p": {16: (0.2, 2)},
|
||||
    "4K": {16: (0.1, 1)},
|
||||
}
|
||||
```
|
||||
|
||||
Given a 2K video with more than 16 frames, the program will first try to put it into bucket "1080p" since it has a larger resolution than 1080p but less than 4K. Since the `keep_prob` for 1080p is 20%, a random number is generated, and if it is less than 0.2, the video will be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "720p" bucket. Since the `keep_prob` for 720p is 50%, the video has a 50% chance to be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "480p" bucket directly as it is the smallest resolution.
|
||||
|
||||
### Examples
|
||||
|
||||
Let's see some simple examples to understand the bucket config. First, the aspect ratio bucket is compulsory, if you want to modify this you need to add your own resolution definition in [aspect.py](/opensora/datasets/aspect.py). Then, to keep only 256x256 resolution and 16 frames as OpenSora 1.0, you can use the following config:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"256": {16: (1.0, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
If you want to train a model supporting different resolutions of images, you can use the following config:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"256": {1: (1.0, 256)},
|
||||
"512": {1: (1.0, 80)},
|
||||
"480p": {1: (1.0, 52)},
|
||||
"1024": {1: (1.0, 20)},
|
||||
"1080p": {1: (1.0, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
Or if you find the number of high-resolution images is too large, you can modify the `keep_prob` to reduce the number of samples in the bucket:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"256": {1: (1.0, 256)},
|
||||
"512": {1: (0.8, 80)},
|
||||
"480p": {1: (0.5, 52)},
|
||||
"1024": {1: (0.5, 20)},
|
||||
"1080p": {1: (0.2, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
And similarly for videos:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
|
||||
"480p": {16: (1.0, 4)},
|
||||
"720p": {16: (0.5, 2)},
|
||||
}
|
||||
```
|
||||
|
||||
Note that in the above case, a video with 480p resolution and more than 16 frames will all go into bucket `("480p", 16)`, since they all satisfy this bucket's requirement. But training long videos with 480p resolution may be slow, so you can modify the config as follows to enforce the video with more than 32 frames to go into the 240p bucket.
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
|
||||
"480p": {16: (1.0, 4), 32: (0.0, None)},
|
||||
"720p": {16: (0.5, 2)},
|
||||
}
|
||||
```
|
||||
|
||||
Combine the above examples together, we think you can understand the bucket config provided at the beginning of this section and in the config files.
|
||||
|
|
@ -74,24 +74,25 @@ As we found in Open-Sora 1.0, the data number and quality are crucial for traini
|
|||
|
||||
We plan to use [panda-70M](https://snap-research.github.io/Panda-70M/) and other data to train the model, which is approximately 30M+ data. However, we find disk IO a bottleneck for training and data processing at the same time. Thus, we can only prepare a 10M dataset and did not go through all processing pipeline that we built. Finally, we use a dataset with 9.7M videos + 2.6M images for pre-training, and 560k videos + 1.6M images for fine-tuning. The pretraining dataset statistics are shown below.
|
||||
|
||||
Image text tokens (by T5 tokenizer): 
|
||||
Image text tokens (by T5 tokenizer):
|
||||

|
||||
|
||||
Video text tokens (by T5 tokenizer). We directly use panda's short caption for training, and caption other datasets by ourselves. The generated caption is usually less than 200 tokens.
|
||||
|
||||

|
||||
|
||||
Video duration: 
|
||||
Video duration:
|
||||

|
||||
|
||||
## Training Details
|
||||
|
||||
With limited computational resources, we have to carefully monitor the training process, and change the training strategy if we speculate the model is not learning well since there is no computation for ablation study. Thus, Open-Sora 1.1's training includes multiple changes, and as a result, ema is not applied.
|
||||
|
||||
1. First, we fine-tune 6k steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
|
||||
2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for 24k steps, which takes 4 days on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. The most videos are in 240p resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py).
|
||||
3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained 40k steps for 2 days. The most videos are in 144p resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py).
|
||||
4. **[Stage 1]** We find the model cannot learn well for long videos, and find a noised generation result as speculated to be half-precision problem found in Open-Sora 1.0 training. Thus, we adopt the QK-normalization to stabilize the training. We also switch iddpm-speed to iddpm. We trained for 17k steps for 14 hours. The most videos are in 144p resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with total step 81k.
|
||||
5. **[Stage 2]** We switch to a higher resolution, where most videos are in 240p and 480p resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained 22k steps for one day on all pre-training data, and ?k with ? hours on high-quality data.
|
||||
6. **[Stage 3]** We switch to a higher resolution, where most videos are in 480p and 720p resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained ?k with ? hours on high-quality data.
|
||||
1. First, we fine-tune **6k** steps with images of different resolution from `Pixart-alpha-1024` checkpoints. We find the model easily adapts to generate images with different resolutions. We use [SpeeDiT](https://github.com/1zeryu/SpeeDiT) (iddpm-speed) to accelerate the diffusion training.
|
||||
2. **[Stage 1]** Then, we pretrain the model with gradient-checkpointing for **24k** steps, which takes **4 days** on 64 H800 GPUs. Although the number of samples seen by the model is the same, we find the model learns slowly compared to a smaller batch size. We speculate that at an early stage, the number of steps is more important for training. The most videos are in **240p** resolution, and the config is similar to [stage2.py](/configs/opensora-v1-1/train/stage2.py).
|
||||
3. **[Stage 1]** To increase the number of steps, we switch to a smaller batch size without gradient-checkpointing. We also add fps conditioning at this point. We trained **40k** steps for **2 days**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py).
|
||||
4. **[Stage 1]** We find the model cannot learn well for long videos, and find a noised generation result as speculated to be half-precision problem found in Open-Sora 1.0 training. Thus, we adopt the QK-normalization to stabilize the training. We also switch iddpm-speed to iddpm. We trained for **17k** steps for **14 hours**. The most videos are in **144p** resolution, and the config file is [stage1.py](/configs/opensora-v1-1/train/stage1.py). The stage 1 training lasts for approximately one week, with total step **81k**.
|
||||
5. **[Stage 2]** We switch to a higher resolution, where most videos are in **240p and 480p** resolution ([stage2.py](/configs/opensora-v1-1/train/stage2.py)). We trained **22k** steps for **one day** on all pre-training data, and **?k** with **? hours** on high-quality data.
|
||||
6. **[Stage 3]** We switch to a higher resolution, where most videos are in **480p and 720p** resolution ([stage3.py](/configs/opensora-v1-1/train/stage3.py)). We trained **?k** with **? hours** on high-quality data.
|
||||
|
||||
## Results and Evaluation
|
||||
|
||||
|
|
@ -104,5 +105,5 @@ As we get one step closer to the replication of Sora, we find many limitations f
|
|||
- **Bad human generation**: We find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation.
|
||||
- **Low aesthetic score**: we find the model's aesthetic score is not high. The problem is due to the lack of aesthetic score filtering, which is not conducted due to IO bottleneck. We plan to filter the data by aesthetic score and finetuning the model to improve the aesthetic score.
|
||||
|
||||
> **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
|
||||
> **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu
|
||||
> - **Algorithm & Acceleration**: Zangwei Zheng, Xiangyu Peng, Shenggui Li, Hongxing Liu, Yukun Zhou
|
||||
> - **Data Collection & Pipeline**: Xiangyu Peng, Zangwei Zheng, Chenhui Shen, Tom Young, Junjie Wang, Chenfeng Yu
|
||||
|
|
|
|||
|
|
@ -1,31 +1,36 @@
|
|||
# Repo & Config Structure
|
||||
|
||||
- [Repo \& Config Structure](#repo--config-structure)
|
||||
- [Repo Structure](#repo-structure)
|
||||
- [Configs](#configs)
|
||||
- [Inference config demos](#inference-config-demos)
|
||||
- [Training config demos](#training-config-demos)
|
||||
- [Bucket Configs](#bucket-configs)
|
||||
- [Why bucket?](#why-bucket)
|
||||
- [Three-level bucket](#three-level-bucket)
|
||||
- [Examples](#examples)
|
||||
|
||||
|
||||
## Repo Structure
|
||||
# Repo Structure
|
||||
|
||||
```plaintext
|
||||
Open-Sora
|
||||
├── README.md
|
||||
├── assets
|
||||
│ ├── images -> images used for image-conditioned generation
|
||||
│ ├── texts -> prompts used for text-conditioned generation
|
||||
│ └── readme -> images used in README
|
||||
├── configs -> Configs for training & inference
|
||||
├── docs
|
||||
│ ├── acceleration.md -> Acceleration & Speed benchmark
|
||||
│ ├── acceleration.md -> Report on acceleration & speed benchmark
|
||||
│ ├── command.md -> Commands for training & inference
|
||||
│ ├── datasets.md -> Datasets used in this project
|
||||
| ├── data_pipeline.md -> Data pipeline documents
|
||||
│ ├── structure.md -> This file
|
||||
│ └── report_v1.md -> Report for Open-Sora v1
|
||||
│ ├── config.md -> Configs meaning
|
||||
│ ├── report_01.md -> Report for Open-Sora 1.1
|
||||
│ ├── report_02.md -> Report for Open-Sora 1.0
|
||||
│ └── zh_CN -> Chinese version of the above
|
||||
├── eval -> Evaluation scripts
|
||||
│ ├── README.md -> Evaluation documentation
|
||||
| ├── sample.sh -> script for quickly launching inference on predefined prompts
|
||||
| ├── launch.sh -> script for launching 8 cards sampling
|
||||
| ├── vbench -> for VBench evaluation
|
||||
│ └── vbench_i2v -> for VBench i2v evaluation
|
||||
├── gradio -> Gradio demo related code
|
||||
├── notebooks -> Jupyter notebooks for generating commands to run
|
||||
├── scripts
|
||||
│ ├── train.py -> diffusion training script
|
||||
│ └── inference.py -> Report for Open-Sora v1
|
||||
├── configs -> Configs for training & inference
|
||||
│ ├── inference.py -> diffusion inference script
|
||||
│ ├── inference-long.py -> inference script supporting more advanced features
|
||||
│ └── misc -> misc scripts, including batch size search
|
||||
├── opensora
|
||||
│ ├── __init__.py
|
||||
│ ├── registry.py -> Registry helper
|
||||
|
|
@ -46,6 +51,7 @@ Open-Sora
|
|||
│ │ ├── iddpm -> IDDPM for training and inference
|
||||
│ │ └── dpms -> DPM-Solver for fast inference
|
||||
│ └── utils
|
||||
├── tests -> Tests for the project
|
||||
└── tools -> Tools for data processing and more
|
||||
```
|
||||
|
||||
|
|
@ -56,6 +62,17 @@ Our config files follows [MMEgine](https://github.com/open-mmlab/mmengine). MMEn
|
|||
```plaintext
|
||||
Open-Sora
|
||||
└── configs -> Configs for training & inference
|
||||
├── opensora-v1-1 -> STDiT2 related configs
|
||||
│ ├── inference
|
||||
│ │ ├── sample.py -> Sample videos and images
|
||||
│ │ └── sample-ref.py -> Sample videos with image/video condition
|
||||
│ └── train
|
||||
│ ├── stage1.py -> Stage 1 training config
|
||||
│ ├── stage2.py -> Stage 2 training config
|
||||
│ ├── stage3.py -> Stage 3 training config
|
||||
│ ├── image.py -> Illustration of image training config
|
||||
│ ├── video.py -> Illustration of video training config
|
||||
│ └── benchmark.py -> For batch size searching
|
||||
├── opensora -> STDiT related configs
|
||||
│ ├── inference
|
||||
│ │ ├── 16x256x256.py -> Sample videos 16 frames 256x256
|
||||
|
|
@ -77,230 +94,17 @@ Open-Sora
|
|||
└── pixart -> PixArt related configs
|
||||
```
|
||||
|
||||
## Inference config demos
|
||||
|
||||
To change the inference settings, you can directly modify the corresponding config file. Or you can pass arguments to overwrite the config file ([config_utils.py](/opensora/utils/config_utils.py)). To change sampling prompts, you should modify the `.txt` file passed to the `--prompt_path` argument.
|
||||
## Tools
|
||||
|
||||
```plaintext
|
||||
--prompt_path ./assets/texts/t2v_samples.txt -> prompt_path
|
||||
--ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"]
|
||||
```
|
||||
|
||||
The explanation of each field is provided below.
|
||||
|
||||
```python
|
||||
# Define sampling size
|
||||
num_frames = 64 # number of frames
|
||||
fps = 24 // 2 # frames per second (divided by 2 for frame_interval=2)
|
||||
image_size = (512, 512) # image size (height, width)
|
||||
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2", # Select model type (STDiT-XL/2, DiT-XL/2, etc.)
|
||||
space_scale=1.0, # (Optional) Space positional encoding scale (new height / old height)
|
||||
time_scale=2 / 3, # (Optional) Time positional encoding scale (new frame_interval / old frame_interval)
|
||||
enable_flashattn=True, # (Optional) Speed up training and inference with flash attention
|
||||
enable_layernorm_kernel=True, # (Optional) Speed up training and inference with fused kernel
|
||||
from_pretrained="PRETRAINED_MODEL", # (Optional) Load from pretrained model
|
||||
no_temporal_pos_emb=True, # (Optional) Disable temporal positional encoding (for image)
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL", # Select VAE type
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema", # Load from pretrained VAE
|
||||
micro_batch_size=128, # VAE with micro batch size to save memory
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5", # Select text encoder type (t5, clip)
|
||||
from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
|
||||
model_max_length=120, # Maximum length of input text
|
||||
)
|
||||
scheduler = dict(
|
||||
type="iddpm", # Select scheduler type (iddpm, dpm-solver)
|
||||
num_sampling_steps=100, # Number of sampling steps
|
||||
cfg_scale=7.0, # hyper-parameter for classifier-free diffusion
|
||||
cfg_channel=3, # how many channels to use for classifier-free diffusion, if None, use all channels
|
||||
)
|
||||
dtype = "fp16" # Computation type (fp16, fp32, bf16)
|
||||
|
||||
# Other settings
|
||||
batch_size = 1 # batch size
|
||||
seed = 42 # random seed
|
||||
prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file
|
||||
save_dir = "./samples" # path to save samples
|
||||
```
|
||||
|
||||
## Training config demos
|
||||
|
||||
```python
|
||||
# Define sampling size
|
||||
num_frames = 64
|
||||
frame_interval = 2 # sample every 2 frames
|
||||
image_size = (512, 512)
|
||||
|
||||
# Define dataset
|
||||
root = None # root path to the dataset
|
||||
data_path = "CSV_PATH" # path to the csv file
|
||||
use_image_transform = False # True if training on images
|
||||
num_workers = 4 # number of workers for dataloader
|
||||
|
||||
# Define acceleration
|
||||
dtype = "bf16" # Computation type (fp16, bf16)
|
||||
grad_checkpoint = True # Use gradient checkpointing
|
||||
plugin = "zero2" # Plugin for distributed training (zero2, zero2-seq)
|
||||
sp_size = 1 # Sequence parallelism size (1 for no sequence parallelism)
|
||||
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=2 / 3,
|
||||
from_pretrained="YOUR_PRETRAINED_MODEL",
|
||||
enable_flashattn=True, # Enable flash attention
|
||||
enable_layernorm_kernel=True, # Enable layernorm kernel
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema",
|
||||
micro_batch_size=128,
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
from_pretrained="DeepFloyd/t5-v1_1-xxl",
|
||||
model_max_length=120,
|
||||
shardformer=True, # Enable shardformer for T5 acceleration
|
||||
)
|
||||
scheduler = dict(
|
||||
type="iddpm",
|
||||
timestep_respacing="", # Default 1000 timesteps
|
||||
)
|
||||
|
||||
# Others
|
||||
seed = 42
|
||||
outputs = "outputs" # path to save checkpoints
|
||||
wandb = False # Use wandb for logging
|
||||
|
||||
epochs = 1000 # number of epochs (just large enough, kill when satisfied)
|
||||
log_every = 10
|
||||
ckpt_every = 250
|
||||
load = None # path to resume training
|
||||
|
||||
batch_size = 4
|
||||
lr = 2e-5
|
||||
grad_clip = 1.0 # gradient clipping
|
||||
```
|
||||
|
||||
## Inference-long specific arguments
|
||||
|
||||
The [`inference-long.py`](/scripts/inference-long.py) script is used to generate long videos, and it also provides all functions of the [`inference.py`](/scripts/inference.py) script. The following arguments are specific to the `inference-long.py` script.
|
||||
|
||||
```python
|
||||
loop = 10
|
||||
condition_frame_length = 4
|
||||
reference_path = ["one.png;two.mp4"]
|
||||
mask_strategy = ["0,0,0,1,0;0,0,0,1,-1"]
|
||||
```
|
||||
|
||||
To generate a long video of any length, our strategy is to generate a video with a fixed length first, and then use the last `condition_frame_length` number of frames for the next video generation. This will loop for `loop` times. Thus, the total length of the video is `loop * (num_frames - condition_frame_length) + condition_frame_length`.
|
||||
|
||||
To condition the generation on images or videos, we introduce the `mask_strategy`. It consists of 5-number tuples separated by `;`. Each tuple indicates an insertion of the condition image or video into the target generation. The meaning of each number is:
|
||||
|
||||
- First number: the index of the condition image or video in the `reference_path`. (0 means one.png, and 1 means two.mp4)
|
||||
- Second number: the loop index of the condition image or video. (0 means the first loop, 1 means the second loop, etc.)
|
||||
- Third number: the start frame of the condition image or video. (0 means the first frame, and images only have one frame)
|
||||
- Fourth number: the number of frames to insert. (1 means insert one frame, and images only have one frame)
|
||||
- Fifth number: the frame position at which to insert. (0 means insert at the beginning of the video, and -1 means insert at the end of the video)
|
||||
|
||||
Thus, "0,0,0,1,-1" means insert the first frame of one.png at the end of the video at the first loop.
|
||||
|
||||
## Bucket Configs
|
||||
|
||||
To enable dynamic training (for STDiT2), use `VariableVideoText` dataset, and set the `bucket_config` in the config. An example is:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
|
||||
"256": {1: (1.0, 256)},
|
||||
"512": {1: (1.0, 80)},
|
||||
"480p": {1: (1.0, 52), 16: (0.5, 4), 32: (0.0, None)},
|
||||
"720p": {16: (1.0, 2), 32: (0.0, None)},
|
||||
"1024": {1: (1.0, 20)},
|
||||
"1080p": {1: (1.0, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
This looks a bit difficult to understand at first glance. Let's understand this config step by step.
|
||||
|
||||
### Three-level bucket
|
||||
|
||||
We design a three-level bucket: `(resolution, num_frames, aspect_ratios)`. The resolutions and aspect ratios are predefined in [aspect.py](/opensora/datasets/aspect.py). Commonly used resolutions (e.g., 240p, 1080p) are supported, and each name represents a number of pixels (e.g., 240p is 240x426; however, we define 240p to represent any size with HxW of approximately 240x426=102240 pixels). The aspect ratios are defined for each resolution. You do not need to define the aspect ratios in the `bucket_config`.
|
||||
|
||||
The `num_frames` is the number of frames in each sample, with `num_frames=1` especially for images. If `frame_interval` is not 1, a bucket with `num_frames=k` will contain videos with `k*frame_interval` frames except for images. Only a video with more than `num_frames` frames and more than `resolution` pixels will be likely to be put into the bucket.
|
||||
|
||||
The two numbers defined in the bucket config are `(keep_prob, batch_size)`. Since the memory and speed of samples from different buckets may be different, we use `batch_size` to balance the processing speed. Since our computation is limited, we cannot process videos at their original resolution as stated in OpenAI's Sora report. Thus, we give a `keep_prob` to control the number of samples in each bucket. The `keep_prob` is the probability to keep a sample in the bucket. Let's take the following config as an example:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"480p": {16: (1.0, 8),},
|
||||
"720p": {16: (0.5, 4),},
|
||||
"1080p": {16: (0.2, 2)},
|
||||
    "4K": {16: (0.1, 1)},
|
||||
}
|
||||
```
|
||||
|
||||
Given a 2K video with more than 16 frames, the program will first try to put it into bucket "1080p" since it has a larger resolution than 1080p but less than 4K. Since the `keep_prob` for 1080p is 20%, a random number is generated, and if it is less than 0.2, the video will be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "720p" bucket. Since the `keep_prob` for 720p is 50%, the video has a 50% chance to be put into the bucket. If the video is not put into the bucket, the program will try to put it into the "480p" bucket directly as it is the smallest resolution.
|
||||
|
||||
### Examples
|
||||
|
||||
Let's see some simple examples to understand the bucket config. First, the aspect ratio bucket is compulsory, if you want to modify this you need to add your own resolution definition in [aspect.py](/opensora/datasets/aspect.py). Then, to keep only 256x256 resolution and 16 frames as OpenSora 1.0, you can use the following config:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"256": {16: (1.0, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
If you want to train a model supporting different resolutions of images, you can use the following config:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"256": {1: (1.0, 256)},
|
||||
"512": {1: (1.0, 80)},
|
||||
"480p": {1: (1.0, 52)},
|
||||
"1024": {1: (1.0, 20)},
|
||||
"1080p": {1: (1.0, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
Or if you find the number of high-resolution images is too large, you can modify the `keep_prob` to reduce the number of samples in the bucket:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"256": {1: (1.0, 256)},
|
||||
"512": {1: (0.8, 80)},
|
||||
"480p": {1: (0.5, 52)},
|
||||
"1024": {1: (0.5, 20)},
|
||||
"1080p": {1: (0.2, 8)},
|
||||
}
|
||||
```
|
||||
|
||||
And similarly for videos:
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
|
||||
"480p": {16: (1.0, 4)},
|
||||
"720p": {16: (0.5, 2)},
|
||||
}
|
||||
```
|
||||
|
||||
Note that in the above case, a video with 480p resolution and more than 16 frames will all go into bucket `("480p", 16)`, since they all satisfy this bucket's requirement. But training long videos with 480p resolution may be slow, so you can modify the config as follows to enforce the video with more than 32 frames to go into the 240p bucket.
|
||||
|
||||
```python
|
||||
bucket_config = {
|
||||
"240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
|
||||
"480p": {16: (1.0, 4), 32: (0.0, None)},
|
||||
"720p": {16: (0.5, 2)},
|
||||
}
|
||||
```
|
||||
|
||||
Combine the above examples together, we think you can understand the bucket config provided at the beginning of this section and in the config files.
|
||||
Open-Sora
|
||||
└── tools
|
||||
├── datasets -> dataset management related code
|
||||
├── scene_cut -> scene cut related code
|
||||
├── caption -> caption related code
|
||||
├── scoring -> scoring related code
|
||||
│ ├── aesthetic -> aesthetic scoring related code
|
||||
│ ├── matching -> matching scoring related code
|
||||
│ ├── ocr -> ocr scoring related code
|
||||
│ └── optical_flow -> optical flow scoring related code
|
||||
└── frame_interpolation -> frame interpolation related code
|
||||
|
|
|
|||
Loading…
Reference in a new issue