update cai to latest

This commit is contained in:
zhengzangw 2024-07-08 06:29:17 +00:00
commit 19617f6bc4
35 changed files with 1285 additions and 155 deletions

1
.gitignore vendored
View file

@ -179,6 +179,7 @@ pretrained_models
evaluation_results/
cache/
*.swp
debug/
# Secret files
hostfile

View file

@ -0,0 +1,40 @@
随机电影镜头
随机电影镜头
随机电影镜头
随机电影镜头
随机电影镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头

View file

@ -19,14 +19,12 @@ model = dict(
qk_norm=True,
enable_flash_attn=True,
enable_layernorm_kernel=True,
force_huggingface=True,
)
vae = dict(
type="OpenSoraVAE_V1_2",
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
micro_frame_size=17,
micro_batch_size=4,
force_huggingface=True,
)
text_encoder = dict(
type="t5",

View file

@ -0,0 +1,44 @@
resolution = "240p"
aspect_ratio = "9:16"
num_frames = 51
fps = 24
frame_interval = 1
save_fps = 24
save_dir = "./samples/samples/"
seed = 42
batch_size = 1
multi_resolution = "STDiT2"
dtype = "bf16"
condition_frame_length = 5
align = 5
model = dict(
type="STDiT3-XL/2",
from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
qk_norm=True,
enable_flash_attn=True,
enable_layernorm_kernel=True,
force_huggingface=True,
)
vae = dict(
type="OpenSoraVAE_V1_2",
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
micro_frame_size=17,
micro_batch_size=4,
force_huggingface=True,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=300,
)
scheduler = dict(
type="rflow",
use_timestep_transform=True,
num_sampling_steps=30,
cfg_scale=7.0,
)
aes = 6.5
flow = None

View file

@ -5,7 +5,7 @@ dataset = dict(
)
# webvid
bucket_config = {"360p": {102: (1.0, 5)}}
bucket_config = {"360p": {102: (1.0, 1)}}
grad_checkpoint = True
# Acceleration settings

View file

@ -60,19 +60,21 @@ scheduler = dict(
)
# Mask settings
# 25%
mask_ratios = {
"random": 0.05,
"intepolate": 0.005,
"quarter_random": 0.005,
"quarter_head": 0.005,
"quarter_tail": 0.005,
"quarter_head_tail": 0.005,
"image_random": 0.025,
"image_head": 0.05,
"image_tail": 0.025,
"image_head_tail": 0.025,
"random": 0.005,
"intepolate": 0.002,
"quarter_random": 0.007,
"quarter_head": 0.002,
"quarter_tail": 0.002,
"quarter_head_tail": 0.002,
"image_random": 0.0,
"image_head": 0.22,
"image_tail": 0.005,
"image_head_tail": 0.005,
}
# Log settings
seed = 42
outputs = "outputs"

View file

@ -1,6 +1,6 @@
num_frames = 1
fps = 1
image_size = (2560, 1536)
# image_size = (2560, 1536)
# image_size = (2048, 2048)
model = dict(
@ -16,6 +16,7 @@ vae = dict(
type="VideoAutoencoderKL",
from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
subfolder="vae",
scaling_factor=0.13025,
)
text_encoder = dict(
type="t5",

View file

@ -1,5 +1,6 @@
# Commands
- [Config](#Config)
- [Inference](#inference)
- [Inference with Open-Sora 1.2](#inference-with-open-sora-12)
- [Inference with Open-Sora 1.1](#inference-with-open-sora-11)
@ -12,6 +13,36 @@
- [Training Hyperparameters](#training-hyperparameters)
- [Search batch size for buckets](#search-batch-size-for-buckets)
## Config
Note that currently our model loading for the vae and the diffusion model supports two sources:
* load from a local file path
* load from huggingface
Our config loads from the huggingface online repository by default.
If you wish to load from a local path containing weights downloaded from huggingface, you need to set `force_huggingface=True`, for instance:
```python
# for vae
vae = dict(
type="OpenSoraVAE_V1_2",
from_pretrained="/root/commonData/OpenSora-VAE-v1.2",
micro_frame_size=17,
micro_batch_size=4,
force_huggingface=True, # NOTE: set here
)
# for diffusion model
model = dict(
type="STDiT3-XL/2",
from_pretrained="/root/commonData/OpenSora-STDiT-v3",
qk_norm=True,
enable_flash_attn=True,
enable_layernorm_kernel=True,
force_huggingface=True, # NOTE: set here
)
```
However, if you want to load a self-trained model, do not set `force_huggingface=True`, since your checkpoint won't be in huggingface format.
## Inference
You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).

View file

@ -7,7 +7,11 @@
- [Evaluation](#evaluation)
- [Sequence parallelism](#sequence-parallelism)
In Open-Sora 1.2 release, we train a 1.1B model on >30M data (about 80k hours), with a training cost of 35k H100 GPU hours, supporting 0s to 16s, 144p to 720p, various aspect ratios video generation. Our configuration is listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension.
| | image | 2s | 4s | 8s | 16s |
| ---- | ----- | --- | --- | --- | --- |

View file

@ -48,8 +48,14 @@ First, generate the relevant videos with the following commands:
```bash
# vbench task; to evaluate everything, set start_index to 0 and end_index to 2000
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index
# Alternatively, launch 8 jobs at once (you must read the script to understand the details)
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
```
After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples.
@ -89,6 +95,15 @@ python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/
```
Similar to VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine:
```bash
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
# if no flow control, use "None" instead
```
## VAE
Install the dependencies package following our [installation](../docs/installation.md)'s section of "Evaluation Dependencies". Then, run the following evaluation command:

View file

@ -3,8 +3,16 @@
CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py"
CKPT_PATH=$1
MODEL_NAME=$2
IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
IMG_PATH=$3
VID_PATH=$4
if [ -z $IMG_PATH ]; then
IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
fi
if [ -z $VID_PATH ]; then
VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
fi
if [[ $CKPT_PATH == *"ema"* ]]; then
parentdir=$(dirname $CKPT_PATH)

View file

@ -3,12 +3,44 @@
CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
TASK_TYPE=$4
VBENCH_START_INDEX=$5
VBENCH_END_INDEX=$6
VBENCH_RES=$7
VBENCH_ASP_RATIO=$8
NUM_SAMPLING_STEPS=$9
FLOW=${10}
LLM_REFINE=${11}
BASE_ASPECT_RATIO=360p
ASPECT_RATIOS=(144p 240p 360p 480p 720p 1080p)
# Loop through the list of aspect ratios
i=0
for r in "${ASPECT_RATIOS[@]}"; do
if [[ "$r" == "$BASE_ASPECT_RATIO" ]]; then
# get aspect ratio 1 level up
if [[ $((i+1)) -lt ${#ASPECT_RATIOS[@]} ]]; then
ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[$((i+1))]}
else
# If this is the highest ratio, return the highest ratio
ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[-1]}
fi
# get aspect ratio 2 levels up
if [[ $((i+2)) -lt ${#ASPECT_RATIOS[@]} ]]; then
ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[$((i+2))]}
else
# If this is the highest ratio, return the highest ratio
ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[-1]}
fi
fi
i=$((i+1))
done
echo "base aspect ratio: ${BASE_ASPECT_RATIO}"
echo "aspect ratio 1 level up: ${ASPECT_RATIO_INCR_1}"
echo "aspect ratio 2 levels up: ${ASPECT_RATIO_INCR_2}"
echo "Note that this aspect ratio level setting is used for videos only, not images"
echo "NUM_FRAMES=${NUM_FRAMES}"
if [ -z "${NUM_FRAMES}" ]; then
@ -39,7 +71,7 @@ DEFAULT_BS=1
# called inside run_video_b
function run_image() { # 14min
# 1.1 1024x1024
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect_ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect-ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
# 1.2 240x426
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 240p --aspect-ratio 9:16 --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS
@ -89,13 +121,13 @@ function run_video_a() { # ~ 30min ?
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name sample_2s_720p_9_16 --batch-size $DEFAULT_BS
# sample, 720p, 9:16, 2s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sample_4s_720p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sample_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
# sample, 480p, 9:16, 8s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sample_8s_480p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sample_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS
# sample, 240p, 9:16, 16s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sample_16s_360p --batch-size $DEFAULT_BS
# sample, 360p, 9:16, 16s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sample_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
}
function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
@ -112,10 +144,10 @@ function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 240p --aspect-ratio 9:16 --sample-name short_8s_240p_9_16 --batch-size $DEFAULT_BS
# short, 480p, 9:16, 8s: ~24min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name short_8s_480p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name short_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS
# short, 240p, 9:16, 16s: ~24min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name short_16s_360p --batch-size $DEFAULT_BS
# short, 360p, 9:16, 16s: ~24min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name short_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
}
@ -129,10 +161,10 @@ function run_video_c() {
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 240p --aspect-ratio 9:16 --sample-name sora_16s_240p_9_16 --batch-size $DEFAULT_BS
# short, 720p, 9:16, 2s: ~9min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name short_4s_720p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name short_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
# sora, 240p, 9:16, 16s: ~40min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sora_16s_360p --batch-size $DEFAULT_BS
# sora, 360p, 9:16, 16s: ~40min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sora_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
}
function run_video_d() {
@ -143,17 +175,17 @@ function run_video_d() {
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p_9_16 --batch-size $DEFAULT_BS --start-index 0 --end-index 16
# sora, 480p, 9:16, 8s, 1/3 # moved from run_video_e, 30min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 0 --end-index 16
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 0 --end-index 16
}
function run_video_e() { # 90min * 2/3 = 60min
# sora, 480p, 9:16, 8s, 2/3
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 16 --end-index 100
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 16 --end-index 100
}
function run_video_f() { # 60min
# sora, 720p, 9:16, 2s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sora_4s_720p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sora_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
}
# --resolution 720p --aspect-ratio [16:9, 9:16, ...]
@ -162,22 +194,22 @@ function run_video_g() { # 15min
# 720p, 2s multi-resolution
# 1:1
PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:1 --sample-name drone_cliff_prompt_720p_2s_1_1
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_1
# 16:9
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 16:9 --sample-name drone_cliff_prompt_720p_2s_16_9
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 16:9 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_16_9
# 9:16
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name drone_cliff_prompt_720p_2s_9_16
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_9_16
# 4:3
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 4:3 --sample-name drone_cliff_prompt_720p_2s_4_3
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 4:3 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_4_3
# 3:4
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 3:4 --sample-name drone_cliff_prompt_720p_2s_3_4
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 3:4 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_3_4
# 1:2
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:2 --sample-name drone_cliff_prompt_720p_2s_1_2
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:2 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_2
# 2:1
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 2:1 --sample-name drone_cliff_prompt_720p_2s_2_1
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 2:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_2_1
# add motion score
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name motion_2s_720p --prompt \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \
\"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. motion score: 0.0\" \
\"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \
\"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \
@ -188,7 +220,7 @@ function run_video_g() { # 15min
\"A stylish woman walking in the street of Tokyo. motion score: 100.0\"
# add aes score
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name aes_2s_720p --prompt \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \
\"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \
\"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \
\"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \
@ -202,24 +234,24 @@ function run_video_g() { # 15min
function run_video_h() { # 61min
# 3.1 image-conditioned long video generation
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_360p_9_16 \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_${BASE_ASPECT_RATIO}_9_16 \
--prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
--num-frames 2s --resolution 360p --aspect-ratio 9:16 \
--num-frames 2s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
--loop 5 --condition-frame-length 5 \
--reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
--mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_360p_9_16 \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_${BASE_ASPECT_RATIO}_9_16 \
--prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
--num-frames 16s --resolution 360p --aspect-ratio 9:16 \
--num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
--loop 5 --condition-frame-length 10 \
--reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
--mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
# 3.2
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_240p_9_16 \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_${BASE_ASPECT_RATIO}_9_16 \
--prompt-path assets/texts/t2v_ref.txt --start-index 3 --end-index 6 \
--num-frames 16s --resolution 360p --aspect-ratio 9:16 \
--num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
--loop 1 \
--reference-path assets/images/condition/cliff.png "assets/images/condition/cactus-sad.png\;assets/images/condition/cactus-happy.png" https://cdn.openai.com/tmp/s/interp/d0.mp4 \
--mask-strategy "0" "0\;0,1,0,-1,1" "0,0,0,0,${QUAD_FRAMES},0.5" --batch-size $DEFAULT_BS
@ -238,10 +270,38 @@ function run_vbench() {
--image-size $VBENCH_H $VBENCH_W \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
if [ -z ${NUM_SAMPLING_STEPS} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${FLOW} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${LLM_REFINE} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ "${FLOW}" = "None" ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
fi
fi
fi
fi
fi
}
@ -255,16 +315,41 @@ function run_vbench_i2v() {
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--image-size $VBENCH_I2V_H $VBENCH_I2V_W \
--start-index $1 --end-index $2 \
--num-frames $NUM_FRAMES --batch-size $VBENCH_BS
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--start-index $1 --end-index $2 \
--num-frames $NUM_FRAMES --batch-size $VBENCH_BS
if [ -z ${NUM_SAMPLING_STEPS} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${FLOW} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${LLM_REFINE} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ "${FLOW}" = "None" ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
fi
fi
fi
fi
fi
}
### Main

View file

@ -8,24 +8,30 @@ from vbench import VBench
full_info_path = "eval/vbench/VBench_full_info.json"
dimensions = [
# Quality Score
"subject_consistency",
"background_consistency",
"motion_smoothness",
"dynamic_degree",
"aesthetic_quality",
"imaging_quality",
"temporal_flickering",
# Semantic Score
"object_class",
"multiple_objects",
"color",
"spatial_relationship",
"scene",
"temporal_style",
"overall_consistency",
"human_action",
"appearance_style",
# a: 10min
"subject_consistency", # 4min
"imaging_quality", # 6min
# b: 12min
"background_consistency", # 2min
"motion_smoothness", # 5min
"overall_consistency", # 2min
"human_action", # 3min
# c: 14min
"multiple_objects", # 14min
# d: 14min
"spatial_relationship", # 14min
# e: 12min
"object_class", # 12min
# f: 12min
"color", # 12min
# g: 10.5min
"aesthetic_quality", # 2.5min
"appearance_style", # 6min
"temporal_flickering", # 2min
# h: 9min
"scene", # 3min
"temporal_style", # 2min
"dynamic_degree", # 4min
]

View file

@ -6,6 +6,10 @@ MODEL_NAME=$3
RES=$4
ASP_RATIO=$5
NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8
if [[ $CKPT == *"ema"* ]]; then
parentdir=$(dirname $CKPT)
CKPT_BASE=$(basename $parentdir)_ema
@ -20,11 +24,36 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
START_INDEX_LIST=(0 120 240 360 480 600 720 840)
END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
## Modify the following to run on multiple machines for faster results
## 720p will take quite long on a single machine
# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
# LOG_BASE=$(dirname $CKPT)/eval/last_60
# mkdir -p ${LOG_BASE}
# echo "Logging to $LOG_BASE"
for i in "${!GPUS[@]}"; do
if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ;
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
if [ -z ${NUM_SAMPLING_STEPS} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${FLOW} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${LLM_REFINE} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
fi
fi
fi
fi
done

View file

@ -7,11 +7,10 @@ mkdir -p $LOG_BASE
echo "Logging to $LOG_BASE"
GPUS=(0 1 2 3 4 5 6 7)
START_INDEX_LIST=(0 2 4 6 8 10 12 14)
END_INDEX_LIST=(2 4 6 8 10 12 14 16)
START_INDEX_LIST=(0 2 6 7 8 9 10 13)
END_INDEX_LIST=(2 6 7 8 9 10 13 16)
TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only
for i in "${!GPUS[@]}"; do
CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
done

View file

@ -6,6 +6,10 @@ MODEL_NAME=$3
RES=$4
ASP_RATIO=$5
NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8
if [[ $CKPT == *"ema"* ]]; then
parentdir=$(dirname $CKPT)
CKPT_BASE=$(basename $parentdir)_ema
@ -20,11 +24,27 @@ TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
START_INDEX_LIST=(0 140 280 420 560 700 840 980)
END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
for i in "${!GPUS[@]}"; do
if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ;
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
if [ -z ${NUM_SAMPLING_STEPS} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${FLOW} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${LLM_REFINE} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
fi
fi
fi
fi
done

View file

@ -465,7 +465,10 @@ def get_num_pixels(name):
def get_image_size(resolution, ar_ratio):
ar_key = ASPECT_RATIO_MAP[ar_ratio]
if ar_ratio in ASPECT_RATIO_MAP:
ar_key = ASPECT_RATIO_MAP[ar_ratio]
else:
ar_key = ar_ratio
rs_dict = ASPECT_RATIOS[resolution][1]
assert ar_key in rs_dict, f"Aspect ratio {ar_ratio} not found for resolution {resolution}"
return rs_dict[ar_key]

View file

@ -111,6 +111,9 @@ def prepare_dataloader(
def collate_fn_default(batch):
# filter out None
batch = [x for x in batch if x is not None]
# HACK: for loading text features
use_mask = False
if "mask" in batch[0] and isinstance(batch[0]["mask"], int):
@ -132,6 +135,9 @@ def collate_fn_batch(batch):
"""
Used only with BatchDistributedSampler
"""
# filter out None
batch = [x for x in batch if x is not None]
res = torch.utils.data.default_collate(batch)
# squeeze the first dimension, which is due to torch.stack() in default_collate()

View file

@ -190,7 +190,10 @@ class VariableVideoTextDataset(VideoTextDataset):
return ret
def __getitem__(self, index):
return self.getitem(index)
try:
return self.getitem(index)
except:
return None
@DATASETS.register_module()

View file

@ -163,6 +163,8 @@ class Attention(nn.Module):
if rope is not None:
self.rope = True
self.rotary_emb = rope
self.is_causal = False
def forward(self, x: torch.Tensor) -> torch.Tensor:
B, N, C = x.shape
@ -198,12 +200,17 @@ class Attention(nn.Module):
v,
dropout_p=self.attn_drop.p if self.training else 0.0,
softmax_scale=self.scale,
causal=self.is_causal,
)
else:
dtype = q.dtype
q = q * self.scale
attn = q @ k.transpose(-2, -1) # translate attn to float32
attn = attn.to(torch.float32)
if self.is_causal:
causal_mask = torch.tril(torch.ones_like(attn), diagonal=0)
causal_mask = torch.where(causal_mask.bool(), 0, float('-inf'))
attn += causal_mask
attn = attn.softmax(dim=-1)
attn = attn.to(dtype) # cast back attn to original dtype
attn = self.attn_drop(attn)

View file

@ -197,16 +197,18 @@ class PixArt(nn.Module):
if freeze == "text":
self.freeze_text()
def forward(self, x, timestep, y, mask=None):
def forward(self, x, timestep, y, mask=None, **kwargs):
"""
Forward pass of PixArt.
x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
t: (N,) tensor of diffusion timesteps
y: (N, 1, 120, C) tensor of class labels
"""
x = x.to(self.dtype)
timestep = timestep.to(self.dtype)
y = y.to(self.dtype)
dtype = self.x_embedder.proj.weight.dtype
B = x.size(0)
x = x.to(dtype)
timestep = timestep.to(dtype)
y = y.to(dtype)
# embedding
x = self.x_embedder(x) # (B, N, D)

View file

@ -4,6 +4,7 @@ import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from rotary_embedding_torch import RotaryEmbedding
from timm.models.layers import DropPath
@ -361,6 +362,24 @@ class STDiT3(PreTrainedModel):
# === get pos embed ===
_, _, Tx, Hx, Wx = x.size()
T, H, W = self.get_dynamic_size(x)
# adjust for sequence parallelism
# we need to ensure H * W is divisible by sequence parallel size
# for simplicity, we can adjust the height to make it divisible
if self.enable_sequence_parallelism:
sp_size = dist.get_world_size(get_sequence_parallel_group())
if H % sp_size != 0:
h_pad_size = sp_size - H % sp_size
else:
h_pad_size = 0
if h_pad_size > 0:
hx_pad_size = h_pad_size * self.patch_size[1]
# pad x along the H dimension
H += h_pad_size
x = F.pad(x, (0, 0, 0, hx_pad_size))
S = H * W
base_size = round(S**0.5)
resolution_sq = (height[0].item() * width[0].item()) ** 0.5

View file

@ -13,7 +13,13 @@ from opensora.utils.ckpt_utils import load_checkpoint
@MODELS.register_module()
class VideoAutoencoderKL(nn.Module):
def __init__(
self, from_pretrained=None, micro_batch_size=None, cache_dir=None, local_files_only=False, subfolder=None
self,
from_pretrained=None,
micro_batch_size=None,
cache_dir=None,
local_files_only=False,
subfolder=None,
scaling_factor=0.18215,
):
super().__init__()
self.module = AutoencoderKL.from_pretrained(
@ -25,6 +31,7 @@ class VideoAutoencoderKL(nn.Module):
self.out_channels = self.module.config.latent_channels
self.patch_size = (1, 8, 8)
self.micro_batch_size = micro_batch_size
self.scaling_factor = scaling_factor
def encode(self, x):
# x: (B, C, T, H, W)
@ -32,14 +39,14 @@ class VideoAutoencoderKL(nn.Module):
x = rearrange(x, "B C T H W -> (B T) C H W")
if self.micro_batch_size is None:
x = self.module.encode(x).latent_dist.sample().mul_(0.18215)
x = self.module.encode(x).latent_dist.sample().mul_(self.scaling_factor)
else:
# NOTE: cannot be used for training
bs = self.micro_batch_size
x_out = []
for i in range(0, x.shape[0], bs):
x_bs = x[i : i + bs]
x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(0.18215)
x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(self.scaling_factor)
x_out.append(x_bs)
x = torch.cat(x_out, dim=0)
x = rearrange(x, "(B T) C H W -> B C T H W", B=B)
@ -50,14 +57,14 @@ class VideoAutoencoderKL(nn.Module):
B = x.shape[0]
x = rearrange(x, "B C T H W -> (B T) C H W")
if self.micro_batch_size is None:
x = self.module.decode(x / 0.18215).sample
x = self.module.decode(x / self.scaling_factor).sample
else:
# NOTE: cannot be used for training
bs = self.micro_batch_size
x_out = []
for i in range(0, x.shape[0], bs):
x_bs = x[i : i + bs]
x_bs = self.module.decode(x_bs / 0.18215).sample
x_bs = self.module.decode(x_bs / self.scaling_factor).sample
x_out.append(x_bs)
x = torch.cat(x_out, dim=0)
x = rearrange(x, "(B T) C H W -> B C T H W", B=B)

View file

@ -0,0 +1,724 @@
"""
Adapted from SDXL VAE (https://huggingface.co/stabilityai/sdxl-vae/blob/main/config.json)
All default values of kwargs are the same as SDXL
"""
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.models.attention_processor import Attention
from einops import rearrange
def video_to_image(func):
    """Decorator that lets a per-frame (4D) operator accept video (5D) input.

    A 5D input (B, C, T, H, W) is flattened to ((B*T), C, H, W), passed through
    ``func`` — optionally in chunks of ``self.micro_batch_size`` frames to bound
    peak memory — and reshaped back to (B, C', T, H', W').  Input that is not
    5D is returned untouched without calling ``func`` (callers are expected to
    feed video tensors).
    """

    def wrapper(self, x, *args, **kwargs):
        if x.ndim == 5:
            B, C, T, H, W = x.shape
            # (B, C, T, H, W) -> ((B*T), C, H, W); batch-major frame grouping,
            # equivalent to einops' 'B C T H W -> (B T) C H W'.
            x = x.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)
            # FIX: use getattr so owners without a `micro_batch_size` attribute
            # fall back to a single full-batch call; the original
            # `hasattr(...) and ... is None` test routed them into the chunked
            # branch, which then raised AttributeError.
            bs = getattr(self, "micro_batch_size", None)
            if bs is None:
                x = func(self, x, *args, **kwargs)
            else:
                chunks = [func(self, x[i : i + bs], *args, **kwargs) for i in range(0, x.shape[0], bs)]
                x = torch.cat(chunks, dim=0)
            # ((B*T), C', H', W') -> (B, C', T, H', W'); use the post-func shape
            # since func may change channels / spatial size.
            x = x.reshape(B, T, *x.shape[1:]).permute(0, 2, 1, 3, 4)
        return x

    return wrapper
class VideoConv2d(nn.Conv2d):
    """A ``nn.Conv2d`` that also accepts 5D video tensors [B, C, T, H, W].

    The ``video_to_image`` decorator folds the temporal axis into the batch
    before the spatial convolution and restores it afterwards, optionally
    processing ``micro_batch_size`` frames at a time.
    """

    def __init__(self, *args, micro_batch_size=None, **kwargs):
        super().__init__(*args, **kwargs)
        # None => convolve all frames in one call; int => per-chunk frame count
        # used by the video_to_image wrapper.
        self.micro_batch_size = micro_batch_size

    @video_to_image
    def forward(self, x):
        return nn.Conv2d.forward(self, x)
class ResnetBlock2D(nn.Module):
    """Pre-activation 2D residual block: (GroupNorm -> SiLU -> Conv2d) twice.

    Accepts [B, C, T, H, W] (via the ``video_to_image`` wrapper, with optional
    ``micro_batch_size`` chunking) or [B, C, H, W].  When the channel count
    changes, a 1x1 convolution projects the skip connection.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
        micro_batch_size=None,
    ):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.micro_batch_size = micro_batch_size

        self.norm1 = nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.act = nn.SiLU()

        # 1x1 projection so the skip connection matches the residual's channels.
        self.use_in_shortcut = in_channels != out_channels
        self.conv_shortcut = (
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
            if self.use_in_shortcut
            else None
        )

    @video_to_image
    def forward(self, x):
        h = self.conv1(self.act(self.norm1(x)))
        h = self.conv2(self.act(self.norm2(h)))
        skip = x if self.conv_shortcut is None else self.conv_shortcut(x)
        return skip + h
class ResnetBlock3D(nn.Module):
"""
Use nn.Conv3d
Default activation is nn.SiLU()
Make sure input tensor is of shape [B, C, T, H, W]
"""
def __init__(
self,
in_channels: int,
out_channels: Optional[int] = None,
norm_groups: int = 32,
norm_eps: float = 1e-6,
):
super().__init__()
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
self.out_channels = out_channels
conv_cls = nn.Conv3d
self.norm1 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True)
self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.norm2 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True)
self.conv2 = conv_cls(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.act = nn.SiLU()
self.use_in_shortcut = self.in_channels != out_channels
self.conv_shortcut = None
if self.use_in_shortcut:
self.conv_shortcut = conv_cls(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
)
def forward(self, x):
res = self.norm1(x)
res = self.act(res)
res = self.conv1(res)
res = self.norm2(res)
res = self.act(res)
res = self.conv2(res)
if self.conv_shortcut is not None:
x = self.conv_shortcut(x)
out = x + res
return out
class SpatialDownsample2x(nn.Module):
    """Halve the spatial resolution of [B, C, T, H, W] input.

    With ``use_conv=True`` this is a stride-2 3x3 convolution preceded by
    SDXL's asymmetric (right/bottom) zero padding; otherwise a 2x2 average
    pool.  Supports ``micro_batch_size`` chunking via ``video_to_image``.
    """

    def __init__(
        self,
        channels: int,
        use_conv: bool = True,
        micro_batch_size=None,
    ):
        super().__init__()
        self.channels = channels
        self.use_conv = use_conv
        self.micro_batch_size = micro_batch_size
        if use_conv:
            # padding=0 here; the asymmetric pad is applied manually in forward.
            self.downsample = nn.Conv2d(self.channels, self.channels, kernel_size=3, stride=2, padding=0)
        else:
            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2)

    @video_to_image
    def forward(self, x):
        # SDXL pads one pixel on the right and bottom only, then downsamples.
        x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.downsample(x)
class SpatialUpsample2x(nn.Module):
    """Double the spatial resolution of [B, C, T, H, W] input.

    Nearest-neighbour interpolation followed by a 3x3 convolution; frames can
    be processed in chunks of ``micro_batch_size`` to bound peak memory.
    Only ``use_interpolate=True`` is implemented.
    """

    def __init__(
        self,
        channels: int,
        use_interpolate=True,
        micro_batch_size=None,
    ):
        super().__init__()
        self.channels = channels
        self.use_interpolate = use_interpolate
        self.micro_batch_size = micro_batch_size
        if not use_interpolate:
            raise NotImplementedError
        self.conv = nn.Conv2d(self.channels, self.channels, kernel_size=3, padding=1)
        # NOTE(review): this transposed conv is constructed but never used when
        # use_interpolate=True (the only supported mode); kept to preserve the
        # parameter/state_dict layout — consider removing upstream.
        self.upsample = nn.ConvTranspose2d(channels, self.channels, kernel_size=4, stride=2, padding=1)

    def forward(self, x):
        B = x.shape[0]
        x = rearrange(x, 'B C T H W -> (B T) C H W')
        bs = self.micro_batch_size
        if bs is None:
            x = self.forward_BCHW(x)
        else:
            # process micro-batches of frames to limit peak memory
            x = torch.cat([self.forward_BCHW(x[i : i + bs]) for i in range(0, x.shape[0], bs)], dim=0)
        x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x

    def forward_BCHW(self, x):
        if not self.use_interpolate:
            return self.upsample(x)
        # upsample_nearest_nhwc fails with large batch sizes; see
        # https://github.com/huggingface/diffusers/issues/984
        if x.shape[0] >= 64:
            x = x.contiguous()
        # interpolate on bfloat16 is fixed in pytorch 2.1; see
        # https://github.com/pytorch/pytorch/issues/86679
        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
        return self.conv(x)
class TemporalDownsample2x(nn.Module):
    """Halve the temporal length of [B, C, T, H, W] input.

    With ``use_conv=True`` this is a 3x3x3 convolution with temporal stride 2
    (spatial size preserved); otherwise a (3, 1, 1) average pool with temporal
    stride 2.
    """

    def __init__(
        self,
        channels: int,
        use_conv: bool = True,
    ):
        super().__init__()
        self.channels = channels
        self.use_conv = use_conv
        if use_conv:
            self.downsample = nn.Conv3d(
                self.channels, self.channels, kernel_size=(3, 3, 3), stride=(2, 1, 1), padding=(1, 1, 1)
            )
        else:
            self.downsample = nn.AvgPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1))

    def forward(self, x):
        return self.downsample(x)
class TemporalUpsample2x(nn.Module):
    """Double the temporal length of [B, C, T, H, W] input.

    Trilinear interpolation with scale factor (2, 1, 1), followed by a
    3x3x3 convolution.
    """

    def __init__(
        self,
        channels,
    ):
        super().__init__()
        self.channels = channels
        self.conv = nn.Conv3d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        # Large batches trip a contiguity issue in the upsampling kernels;
        # force a contiguous tensor first (see diffusers issue #984).
        if x.shape[0] >= 64:
            x = x.contiguous()
        x = F.interpolate(x, scale_factor=(2, 1, 1), mode="trilinear")
        return self.conv(x)
class UNetMidBlock2D(nn.Module):
    """Middle block: ResnetBlock2D + (spatial Attention + ResnetBlock2D) per layer.

    Accepts [B, C, T, H, W] (frames are folded into the batch for the 2D ops)
    or [B, C, H, W].  With ``add_attention=False`` the attention slots are kept
    as ``None`` placeholders so every residual block still runs.
    """

    def __init__(
        self,
        in_channels: int,
        num_layers: int = 1,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
        attn_groups: Optional[int] = None,
        add_attention: bool = True,
        attention_head_dim: int = 512,
    ):
        super().__init__()
        self.add_attention = add_attention
        if attn_groups is None:
            attn_groups = norm_groups
        if attention_head_dim is None:
            attention_head_dim = in_channels

        res_blocks = [
            ResnetBlock2D(
                in_channels=in_channels,
                out_channels=in_channels,
                norm_eps=norm_eps,
                norm_groups=norm_groups,
            )
        ]
        attn_blocks = []
        for _ in range(num_layers):
            if self.add_attention:
                attn_blocks.append(
                    Attention(
                        in_channels,
                        heads=in_channels // attention_head_dim,
                        dim_head=attention_head_dim,
                        # rescale_output_factor=output_scale_factor,
                        rescale_output_factor=1.0,
                        eps=norm_eps,
                        norm_num_groups=attn_groups,
                        # spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
                        spatial_norm_dim=None,
                        residual_connection=True,
                        bias=True,
                        upcast_softmax=True,
                        _from_deprecated_attn_block=True,
                    )
                )
            else:
                # FIX: keep a None placeholder so attn_blocks and
                # res_blocks[1:] stay the same length.  Without it, the zip()
                # in forward() was empty when add_attention=False and every
                # residual block after the first was silently skipped.
                attn_blocks.append(None)
            res_blocks.append(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    norm_eps=norm_eps,
                    norm_groups=norm_groups,
                )
            )
        self.attn_blocks = nn.ModuleList(attn_blocks)
        self.res_blocks = nn.ModuleList(res_blocks)

    def forward(self, x):
        has_T = x.ndim == 5
        if has_T:
            # fold frames into the batch so the 2D blocks can run per-frame
            B = x.shape[0]
            x = rearrange(x, 'B C T H W -> (B T) C H W')
        x = self.res_blocks[0](x)
        for attn, res_block in zip(self.attn_blocks, self.res_blocks[1:]):
            if attn is not None:
                x = attn(x)
            x = res_block(x)
        if has_T:
            x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x
class Encoder(nn.Module):
    """
    default arch is conv_in + blocks + mid_block + out_block
    Make sure input tensor is of shape [B, C, T, H, W]
    """

    def __init__(
        self,
        in_channels=3,  # channels of the input video (RGB)
        out_channels=4,  # latent channels
        norm_groups=32,
        norm_eps=1e-6,
        double_z=True,  # if True, emit 2*out_channels (mean + logvar for a VAE posterior)
        micro_batch_size=None,  # frame-chunk size for the per-frame (2D) stages
    ):
        super().__init__()
        in_channels_encoder = in_channels
        out_channels_encoder = out_channels
        block_out_channels = [128, 256, 512, 512]

        # conv_in: per-frame 3x3 stem convolution
        self.conv_in = VideoConv2d(
            in_channels_encoder,
            block_out_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
            micro_batch_size=micro_batch_size,
        )

        # blocks
        blocks = []
        # the first block: ResnetBlock2D (per-frame) + 2x spatial downsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[0]
        blocks.append(
            nn.Sequential(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                ResnetBlock2D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                SpatialDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                    micro_batch_size=micro_batch_size,
                ),
            )
        )
        # the second block: ResnetBlock2D + 2x spatial AND 2x temporal downsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[1]
        blocks.append(
            nn.Sequential(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                ResnetBlock2D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                SpatialDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                    micro_batch_size=micro_batch_size,
                ),
                TemporalDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                )
            )
        )
        # the third block: ResnetBlock3D (spatio-temporal) + 2x spatial AND 2x temporal downsample
        in_channels = block_out_channels[1]
        out_channels = block_out_channels[2]
        blocks.append(
            nn.Sequential(
                ResnetBlock3D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
                ResnetBlock3D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
                SpatialDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                ),
                TemporalDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                )
            )
        )
        # the fourth block: ResnetBlock3D only (no further downsampling)
        in_channels = block_out_channels[2]
        out_channels = block_out_channels[3]
        blocks.append(
            nn.Sequential(
                ResnetBlock3D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
                ResnetBlock3D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
            )
        )
        self.blocks = nn.ModuleList(blocks)

        # mid_block: resnet + spatial self-attention at the bottleneck
        in_channels = block_out_channels[-1]
        self.mid_block = UNetMidBlock2D(
            in_channels=in_channels,
            num_layers=1,
            norm_groups=norm_groups,
            norm_eps=norm_eps,
            add_attention=True,
            attention_head_dim=in_channels,
        )

        # out_block: GroupNorm -> SiLU -> 3D conv projecting to the latent channels
        in_channels = block_out_channels[-1]
        out_channels = 2 * out_channels_encoder if double_z else out_channels_encoder
        self.out_block = nn.Sequential(
            nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps),
            nn.SiLU(),
            nn.Conv3d(in_channels, out_channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        # x: [B, C, T, H, W] video; returns the (possibly doubled) latent tensor
        x = self.conv_in(x)
        for block in self.blocks:
            x = block(x)
        x = self.mid_block(x)
        x = self.out_block(x)
        return x
class Decoder(nn.Module):
    """
    default arch is conv_in + mid_block + blocks + out_block
    Make sure input tensor is of shape [B, C, T, H, W]
    """

    def __init__(
        self,
        in_channels=4,  # latent channels produced by the encoder
        out_channels=3,  # channels of the reconstructed video (RGB)
        norm_groups=32,
        norm_eps=1e-6,
    ):
        super().__init__()
        in_channels_decoder = in_channels
        out_channels_decoder = out_channels
        block_out_channels = [512, 512, 256, 128]

        # conv_in: 3D stem convolution from the latent space
        self.conv_in = nn.Conv3d(
            in_channels_decoder,
            block_out_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
        )

        # mid_block: resnet + spatial self-attention at the bottleneck
        in_channels = block_out_channels[0]
        self.mid_block = UNetMidBlock2D(
            in_channels=in_channels,
            num_layers=1,
            norm_groups=norm_groups,
            norm_eps=norm_eps,
            add_attention=True,
            attention_head_dim=in_channels,
        )

        # blocks
        blocks = []
        layer_per_block = 3
        # the first up block: ResnetBlock3D + 2x spatial AND 2x temporal upsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[0]
        seq = [
            ResnetBlock3D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ] + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
            TemporalUpsample2x(
                channels=out_channels,
            ),
        ]
        blocks.append(nn.Sequential(*seq))
        # the second up block: ResnetBlock3D + 2x spatial AND 2x temporal upsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[1]
        seq = [
            ResnetBlock3D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ] + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
            TemporalUpsample2x(
                channels=out_channels,
            ),
        ]
        blocks.append(nn.Sequential(*seq))
        # the third up block: ResnetBlock3D + 2x spatial upsample only
        in_channels = block_out_channels[1]
        out_channels = block_out_channels[2]
        seq = [
            ResnetBlock3D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ] + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
        ]
        blocks.append(nn.Sequential(*seq))
        # the fourth up block: ResnetBlock2D (per-frame), no upsampling
        in_channels = block_out_channels[2]
        out_channels = block_out_channels[3]
        seq = [
            ResnetBlock2D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ]
        blocks.append(nn.Sequential(*seq))
        self.blocks = nn.ModuleList(blocks)

        # out_block: GroupNorm -> SiLU -> per-frame 2D conv back to pixel channels
        in_channels = block_out_channels[-1]
        out_channels = out_channels_decoder
        self.out_block = nn.Sequential(
            nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps),
            nn.SiLU(),
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        # x: [B, C, T, H, W] latent; returns the reconstructed video.
        # FIX: removed leftover debug prints of torch.cuda.memory_allocated()
        # after every stage — they spammed stdout and assumed a CUDA runtime.
        x = self.conv_in(x)
        x = self.mid_block(x)
        for block in self.blocks:
            x = block(x)
        x = self.out_block(x)
        return x
if __name__ == '__main__':
    # Smoke test: build the encoder/decoder, report parameter counts, and run
    # one forward pass on a 51-frame clip.
    from opensora.utils.misc import count_params

    device = 'cuda'
    dtype = torch.bfloat16

    # FIX: dropped the redundant `.to(torch.bfloat16)` — `.to(device, dtype)`
    # already casts — and the leftover `breakpoint()` at the end.
    encoder = Encoder(
        in_channels=3,
        out_channels=4,
        double_z=False,
        micro_batch_size=4,
    ).to(device, dtype).eval()
    decoder = Decoder(
        in_channels=4,
        out_channels=3,
    ).to(device, dtype).eval()

    num_params_enc = count_params(encoder)
    num_params_dec = count_params(decoder)
    print(f'Encoder #params: {num_params_enc}')
    print(f'Decoder #params: {num_params_dec}')

    # inference
    x = torch.rand(1, 3, 51, 720, 1080).to(device, dtype)
    with torch.inference_mode():
        x_enc = encoder(x)
        x_dec = decoder(x_enc)
    # allocated GPU memory in GiB after the round trip
    print(torch.cuda.memory_allocated() / 1024 ** 3)

View file

@ -24,7 +24,8 @@ class DPM_SOLVER:
mask=None,
progress=True,
):
assert mask is None, "mask is not supported in dpm-solver"
if mask is not None:
print("[WARNING] mask is not supported in dpm-solver, it will be ignored")
n = len(prompts)
model_args = text_encoder.encode(prompts)
y = model_args.pop("y")

View file

@ -1419,7 +1419,7 @@ class DPM_Solver:
for step in progress_fn(range(order, steps + 1)):
t = timesteps[step]
# We only use lower order for steps < 10
if lower_order_final and steps < 10:
if lower_order_final: # recommended by Shuchen Xue
step_order = min(order, steps + 1 - step)
else:
step_order = order

View file

@ -1,3 +1,3 @@
torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121
torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu121
xformers==0.0.26.post1 --index-url https://download.pytorch.org/whl/cu121
torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121
torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121
xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121

View file

@ -1,4 +1,4 @@
colossalai==0.3.7
colossalai>=0.4.0
mmengine>=0.10.3
pandas>=2.0.3
timm==0.9.16
@ -7,6 +7,7 @@ ftfy>=6.2.0 # for t5
diffusers==0.27.2 # for vae
accelerate==0.29.2 # for t5
av>=12.0.0 # for video loading
numpy<2.0.0
# [gradio]
gradio>=4.26.0

View file

@ -260,6 +260,7 @@ def main():
)
# == sampling ==
torch.manual_seed(1024)
z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
masks = apply_mask_strategy(z, refs, ms, loop_i, align=align)
samples = scheduler.sample(

View file

@ -347,24 +347,27 @@ def main():
tb_writer.add_scalar("loss", loss.item(), global_step)
# wandb
if cfg.get("wandb", False):
wandb.log(
{
"iter": global_step,
"acc_step": acc_step,
"epoch": epoch,
"loss": loss.item(),
"avg_loss": avg_loss,
"lr": optimizer.param_groups[0]["lr"],
"debug/move_data_time": move_data_t.elapsed_time,
"debug/encode_time": encode_t.elapsed_time,
"debug/mask_time": mask_t.elapsed_time,
"debug/diffusion_time": loss_t.elapsed_time,
"debug/backward_time": backward_t.elapsed_time,
"debug/update_ema_time": ema_t.elapsed_time,
"debug/reduce_loss_time": reduce_loss_t.elapsed_time,
},
step=global_step,
)
wandb_dict = {
"iter": global_step,
"acc_step": acc_step,
"epoch": epoch,
"loss": loss.item(),
"avg_loss": avg_loss,
"lr": optimizer.param_groups[0]["lr"],
}
if record_time:
wandb_dict.update(
{
"debug/move_data_time": move_data_t.elapsed_time,
"debug/encode_time": encode_t.elapsed_time,
"debug/mask_time": mask_t.elapsed_time,
"debug/diffusion_time": loss_t.elapsed_time,
"debug/backward_time": backward_t.elapsed_time,
"debug/update_ema_time": ema_t.elapsed_time,
"debug/reduce_loss_time": reduce_loss_t.elapsed_time,
}
)
wandb.log(wandb_dict, step=global_step)
running_loss = 0.0
log_step = 0

View file

@ -9,7 +9,7 @@ from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config
def get_sample_data():
x = torch.rand([1, 4, 15, 20, 27], dtype=torch.bfloat16) # (B, C, T, H, W)
x = torch.rand([1, 4, 15, 20, 28], dtype=torch.bfloat16) # (B, C, T, H, W)
timestep = torch.Tensor([924.0]).to(torch.bfloat16)
y = torch.rand(1, 1, 300, 4096, dtype=torch.bfloat16)
mask = torch.ones([1, 300], dtype=torch.int32)
@ -66,6 +66,17 @@ def run_model(rank, world_size, port):
set_seed(1024)
dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True)
dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16)
# ensure model weights are equal
for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()):
assert torch.equal(p1, p2)
# ensure model weights are equal across all ranks
for p in dist_model.parameters():
p_list = [torch.zeros_like(p) for _ in range(world_size)]
dist.all_gather(p_list, p, group=dist.group.WORLD)
assert torch.equal(*p_list)
dist_out = dist_model(**data)
dist_out.mean().backward()
@ -84,9 +95,8 @@ def run_model(rank, world_size, port):
for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()):
assert n1 == n2
if p1.grad is not None and p2.grad is not None:
if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4):
if dist.get_rank() == 0:
print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0:
print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
else:
assert p1.grad is None and p2.grad is None

View file

@ -4,7 +4,7 @@ Human labeling of videos is expensive and time-consuming. We adopt powerful imag
## PLLaVA Captioning
To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video.
To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offload frame extraction to a separate process such that the GPU computations and frame extraction happen in parallel.
### Installation
Install the required dependencies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "PLLaVA Captioning" sections.

View file

@ -1,3 +1,17 @@
import sys
import os
import os
from pathlib import Path
current_file = Path(__file__) # Gets the path of the current file
fourth_level_parent = current_file.parents[3]
datasets_dir = os.path.join(fourth_level_parent, "opensora/datasets")
import sys
sys.path.append(datasets_dir)
from read_video import read_video_av
sys.path.remove(datasets_dir)
import itertools
import logging
import multiprocessing as mp
@ -95,21 +109,49 @@ def get_index(num_frames, num_segments):
return offsets
# def load_video(video_path, num_frames, return_msg=False, resolution=336):
# transforms = torchvision.transforms.Resize(size=resolution)
# vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
# total_num_frames = len(vr)
# frame_indices = get_index(total_num_frames, num_frames)
# images_group = list()
# for frame_index in frame_indices:
# img = Image.fromarray(vr[frame_index].asnumpy())
# images_group.append(transforms(img))
# if return_msg:
# fps = float(vr.get_avg_fps())
# sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
# # " " should be added in the start and end
# msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
# return images_group, msg
# else:
# return images_group
def load_video(video_path, num_frames, return_msg=False, resolution=336):
transforms = torchvision.transforms.Resize(size=resolution)
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
total_num_frames = len(vr)
# vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
vframes, aframes, info = read_video_av(
video_path,
pts_unit="sec",
output_format="THWC"
)
print(vframes.shape)
total_num_frames = len(vframes)
# print("Video path: ", video_path)
# print("Total number of frames: ", total_num_frames)
frame_indices = get_index(total_num_frames, num_frames)
images_group = list()
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
img = Image.fromarray(vframes[frame_index].numpy())
images_group.append(transforms(img))
if return_msg:
fps = float(vr.get_avg_fps())
sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
# " " should be added in the start and end
msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
return images_group, msg
# fps = float(vframes.get_avg_fps())
# sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
# # " " should be added in the start and end
# msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
# return images_group, msg
exit('return_msg not implemented yet')
else:
return images_group
@ -130,7 +172,10 @@ class CSVDataset(Dataset):
def __getitem__(self, idx):
if idx < 0 or idx >= len(self.data_list):
raise IndexError
video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
try:
video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
except:
return None
return video
def set_rank_and_world_size(self, rank, world_size):
@ -191,7 +236,7 @@ def parse_args():
"--error_message",
type=str,
required=False,
default=None,
default='error occured during captioning',
)
args = parser.parse_args()
return args
@ -233,8 +278,11 @@ def infer(
processor,
video_list,
conv_mode,
print_res=True,
print_res=False,
):
# check if any video in video_list is None, if so, raise an exception
if any([video is None for video in video_list]):
raise Exception("Video not loaded properly")
conv = conv_template.copy()
conv.user_query("Describe the video in details.", is_mm=True)
@ -308,7 +356,8 @@ def run(rank, args, world_size, output_queue):
)
except Exception as e:
logger.error(f"error in {batch}: {str(e)}")
preds = args.error_message
# preds = args.error_message duplicated for each video in the batch
preds = [args.error_message] * len(batch)
result_list.extend(preds)
output_queue.put((rank, result_list))
return result_list
@ -369,7 +418,7 @@ def main():
# write the dataframe to a new csv file called '*_pllava_13b_caption.csv'
new_csv_path = args.csv_path.replace(".csv", "_text.csv")
df.to_csv(new_csv_path, index=False)
print(f"Results saved to {new_csv_path}")
if __name__ == "__main__":
main()

View file

@ -6,7 +6,7 @@ import pandas as pd
from torchvision.datasets import ImageNet
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".m2ts")
def scan_recursively(root):

View file

@ -29,15 +29,20 @@ def process_single_row(row, args):
# check mp4 integrity
# if not is_intact_video(video_path, logger=logger):
# return False
if "timestamp" in row:
timestamp = row["timestamp"]
if not (timestamp.startswith("[") and timestamp.endswith("]")):
try:
if "timestamp" in row:
timestamp = row["timestamp"]
if not (timestamp.startswith("[") and timestamp.endswith("]")):
return False
scene_list = eval(timestamp)
scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list]
else:
scene_list = [None]
if args.drop_invalid_timestamps:
return True
except Exception as e:
if args.drop_invalid_timestamps:
return False
scene_list = eval(timestamp)
scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list]
else:
scene_list = [None]
if "relpath" in row:
save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"]))
@ -61,7 +66,7 @@ def process_single_row(row, args):
shorter_size=shorter_size,
logger=logger,
)
return True
def split_video(
video_path,
@ -108,7 +113,10 @@ def split_video(
fname_wo_ext = os.path.splitext(fname)[0]
# TODO: fname pattern
save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
if os.path.exists(save_path):
# print_log(f"File '{save_path}' already exists. Skip.", logger=logger)
continue
# ffmpeg cmd
cmd = [FFMPEG_PATH]
@ -134,7 +142,7 @@ def split_video(
# cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"]
cmd += ["-map", "0:v", save_path]
# print(cmd)
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout, stderr = proc.communicate()
# stdout = stdout.decode("utf-8")
@ -159,11 +167,11 @@ def parse_args():
)
parser.add_argument("--target_fps", type=int, default=None, help="target fps of clips")
parser.add_argument(
"--shorter_size", type=int, default=1080, help="resize the shorter size by keeping ratio; will not do upscale"
"--shorter_size", type=int, default=None, help="resize the shorter size by keeping ratio; will not do upscale"
)
parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
parser.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps")
args = parser.parse_args()
return args
@ -175,7 +183,7 @@ def main():
print(f"Meta file '{meta_path}' not found. Exit.")
exit()
# create logger
# create save_dir
os.makedirs(args.save_dir, exist_ok=True)
# initialize pandarallel
@ -189,10 +197,13 @@ def main():
# process
meta = pd.read_csv(args.meta_path)
if not args.disable_parallel:
meta.parallel_apply(process_single_row_partial, axis=1)
results = meta.parallel_apply(process_single_row_partial, axis=1)
else:
meta.apply(process_single_row_partial, axis=1)
results = meta.apply(process_single_row_partial, axis=1)
if args.drop_invalid_timestamps:
meta = meta[results]
assert args.meta_path.endswith("timestamp.csv"), "Only support *timestamp.csv"
meta.to_csv(args.meta_path.replace("timestamp.csv", "correct_timestamp.csv"), index=False)
print(f"Corrected timestamp file saved to '{args.meta_path.replace('timestamp.csv', 'correct_timestamp.csv')}'")
if __name__ == "__main__":
main()