diff --git a/.gitignore b/.gitignore index 57b6f55..04b419a 100644 --- a/.gitignore +++ b/.gitignore @@ -179,6 +179,7 @@ pretrained_models evaluation_results/ cache/ *.swp +debug/ # Secret files hostfile diff --git a/assets/texts/rand_types.txt b/assets/texts/rand_types.txt new file mode 100644 index 0000000..bd4b5d8 --- /dev/null +++ b/assets/texts/rand_types.txt @@ -0,0 +1,40 @@ +随机电影镜头 +随机电影镜头 +随机电影镜头 +随机电影镜头 +随机电影镜头 +随机任务镜头 +随机任务镜头 +随机任务镜头 +随机任务镜头 +随机任务镜头 +随机游戏镜头 +随机游戏镜头 +随机游戏镜头 +随机游戏镜头 +随机游戏镜头 +随机开车镜头 +随机开车镜头 +随机开车镜头 +随机开车镜头 +随机开车镜头 +随机动物镜头 +随机动物镜头 +随机动物镜头 +随机动物镜头 +随机动物镜头 +随机森林镜头 +随机森林镜头 +随机森林镜头 +随机森林镜头 +随机森林镜头 +随机动漫镜头 +随机动漫镜头 +随机动漫镜头 +随机动漫镜头 +随机动漫镜头 +随机舞蹈镜头 +随机舞蹈镜头 +随机舞蹈镜头 +随机舞蹈镜头 +随机舞蹈镜头 diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py index 0e84251..3e2c623 100644 --- a/configs/opensora-v1-2/inference/sample.py +++ b/configs/opensora-v1-2/inference/sample.py @@ -19,14 +19,12 @@ model = dict( qk_norm=True, enable_flash_attn=True, enable_layernorm_kernel=True, - force_huggingface=True, ) vae = dict( type="OpenSoraVAE_V1_2", from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", micro_frame_size=17, micro_batch_size=4, - force_huggingface=True, ) text_encoder = dict( type="t5", diff --git a/configs/opensora-v1-2/inference/sample_hf.py b/configs/opensora-v1-2/inference/sample_hf.py new file mode 100644 index 0000000..0e84251 --- /dev/null +++ b/configs/opensora-v1-2/inference/sample_hf.py @@ -0,0 +1,44 @@ +resolution = "240p" +aspect_ratio = "9:16" +num_frames = 51 +fps = 24 +frame_interval = 1 +save_fps = 24 + +save_dir = "./samples/samples/" +seed = 42 +batch_size = 1 +multi_resolution = "STDiT2" +dtype = "bf16" +condition_frame_length = 5 +align = 5 + +model = dict( + type="STDiT3-XL/2", + from_pretrained="hpcai-tech/OpenSora-STDiT-v3", + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + force_huggingface=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + 
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", + micro_frame_size=17, + micro_batch_size=4, + force_huggingface=True, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + num_sampling_steps=30, + cfg_scale=7.0, +) + +aes = 6.5 +flow = None diff --git a/configs/opensora-v1-2/train/demo_360p.py b/configs/opensora-v1-2/train/demo_360p.py index e27bd3c..f49a00e 100644 --- a/configs/opensora-v1-2/train/demo_360p.py +++ b/configs/opensora-v1-2/train/demo_360p.py @@ -5,7 +5,7 @@ dataset = dict( ) # webvid -bucket_config = {"360p": {102: (1.0, 5)}} +bucket_config = {"360p": {102: (1.0, 1)}} grad_checkpoint = True # Acceleration settings diff --git a/configs/opensora-v1-2/train/stage2.py b/configs/opensora-v1-2/train/stage2.py index 94e6975..8620066 100644 --- a/configs/opensora-v1-2/train/stage2.py +++ b/configs/opensora-v1-2/train/stage2.py @@ -60,19 +60,21 @@ scheduler = dict( ) # Mask settings +# 25% mask_ratios = { - "random": 0.05, - "intepolate": 0.005, - "quarter_random": 0.005, - "quarter_head": 0.005, - "quarter_tail": 0.005, - "quarter_head_tail": 0.005, - "image_random": 0.025, - "image_head": 0.05, - "image_tail": 0.025, - "image_head_tail": 0.025, + "random": 0.005, + "intepolate": 0.002, + "quarter_random": 0.007, + "quarter_head": 0.002, + "quarter_tail": 0.002, + "quarter_head_tail": 0.002, + "image_random": 0.0, + "image_head": 0.22, + "image_tail": 0.005, + "image_head_tail": 0.005, } + # Log settings seed = 42 outputs = "outputs" diff --git a/configs/pixart/inference/1x2048MS.py b/configs/pixart/inference/1x2048MS.py index a0daca4..23f26ff 100644 --- a/configs/pixart/inference/1x2048MS.py +++ b/configs/pixart/inference/1x2048MS.py @@ -1,6 +1,6 @@ num_frames = 1 fps = 1 -image_size = (2560, 1536) +# image_size = (2560, 1536) # image_size = (2048, 2048) model = dict( @@ -16,6 +16,7 @@ vae = dict( type="VideoAutoencoderKL", 
from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", subfolder="vae", + scaling_factor=0.13025, ) text_encoder = dict( type="t5", diff --git a/docs/commands.md b/docs/commands.md index 2c948de..92ff5e6 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -1,5 +1,6 @@ # Commands +- [Config](#config) - [Inference](#inference) - [Inference with Open-Sora 1.2](#inference-with-open-sora-12) - [Inference with Open-Sora 1.1](#inference-with-open-sora-11) @@ -12,6 +13,36 @@ - [Training Hyperparameters](#training-hyperparameters) - [Search batch size for buckets](#search-batch-size-for-buckets) +## Config +Note that currently our model loading for vae and diffusion model supports two types: + +* load from local file path +* load from huggingface + +Our configs load model weights from the Hugging Face Hub by default. +If you wish to load from a local path containing weights downloaded from the Hugging Face Hub, you need to set `force_huggingface=True`, for instance: + +```python +# for vae +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="/root/commonData/OpenSora-VAE-v1.2", + micro_frame_size=17, + micro_batch_size=4, + force_huggingface=True, # NOTE: set here +) +# for diffusion model +model = dict( + type="STDiT3-XL/2", + from_pretrained="/root/commonData/OpenSora-STDiT-v3", + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + force_huggingface=True, # NOTE: set here +) +``` +However, if you want to load a self-trained model, do not set `force_huggingface=True` since your checkpoint won't be in Hugging Face format. + ## Inference You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos). 
diff --git a/docs/report_03.md b/docs/report_03.md index 8b02c8b..af9af26 100644 --- a/docs/report_03.md +++ b/docs/report_03.md @@ -7,7 +7,7 @@ - [Evaluation](#evaluation) - [Sequence parallelism](#sequence-parallelism) -In Open-Sora 1.2 release, we train a 1.1B models on >30M data (\~80k hours), with training cost 35k H100 GPU hours, supporting 0s\~16s, 144p to 720p, various aspect ratios video generation. Our configurations is listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension. +In Open-Sora 1.2 release, we train a 1.1B models on >30M data (about 80k hours), with training cost 35k H100 GPU hours, supporting 0s to 16s, 144p to 720p, various aspect ratios video generation. Our configurations is listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension. | | image | 2s | 4s | 8s | 16s | | ---- | ----- | --- | --- | --- | --- | diff --git a/eval/README.md b/eval/README.md index 7ae6e32..261c21b 100644 --- a/eval/README.md +++ b/eval/README.md @@ -48,8 +48,14 @@ First, generate the relevant videos with the following commands: ```bash # vbench task, if evaluation all set start_index to 0, end_index to 2000 bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index + # Alternatively, launch 8 jobs at once (you must read the script to understand the details) bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name + +# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine +bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value +# for example +# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True ``` After generation, install the 
VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples. @@ -89,6 +95,15 @@ python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/ ``` +Similarly to VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine + +```bash +bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value +# for example +# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True +# if no flow control, use "None" instead +``` + ## VAE Install the dependencies package following our [installation](../docs/installation.md)'s s sections of "Evaluation Dependencies". Then, run the following evaluation command: diff --git a/eval/loss/launch.sh b/eval/loss/launch.sh index 5e19c7c..c70c52d 100644 --- a/eval/loss/launch.sh +++ b/eval/loss/launch.sh @@ -3,8 +3,16 @@ CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py" CKPT_PATH=$1 MODEL_NAME=$2 -IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" -VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" +IMG_PATH=$3 +VID_PATH=$4 + +if [ -z "$IMG_PATH" ]; then + IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" +fi + +if [ -z "$VID_PATH" ]; then + VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" +fi if [[ $CKPT_PATH == *"ema"* ]]; then parentdir=$(dirname $CKPT_PATH) diff --git a/eval/sample.sh b/eval/sample.sh index 0123309..61df175 100644 --- a/eval/sample.sh +++ b/eval/sample.sh @@ -3,12 +3,44 @@ CKPT=$1 NUM_FRAMES=$2 MODEL_NAME=$3 - +TASK_TYPE=$4 VBENCH_START_INDEX=$5 VBENCH_END_INDEX=$6 VBENCH_RES=$7 VBENCH_ASP_RATIO=$8 +NUM_SAMPLING_STEPS=$9 +FLOW=${10} +LLM_REFINE=${11} + +BASE_ASPECT_RATIO=360p +ASPECT_RATIOS=(144p 
240p 360p 480p 720p 1080p) +# Loop through the list of aspect ratios +i=0 +for r in "${ASPECT_RATIOS[@]}"; do + if [[ "$r" == "$BASE_ASPECT_RATIO" ]]; then + # get aspect ratio 1 level up + if [[ $((i+1)) -lt ${#ASPECT_RATIOS[@]} ]]; then + ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[$((i+1))]} + else + # If this is the highest ratio, return the highest ratio + ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[-1]} + fi + # get aspect ratio 2 levels up + if [[ $((i+2)) -lt ${#ASPECT_RATIOS[@]} ]]; then + ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[$((i+2))]} + else + # If this is the highest ratio, return the highest ratio + ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[-1]} + fi + fi + i=$((i+1)) +done +echo "base aspect ratio: ${BASE_ASPECT_RATIO}" +echo "aspect ratio 1 level up: ${ASPECT_RATIO_INCR_1}" +echo "aspect ratio 2 levels up: ${ASPECT_RATIO_INCR_2}" +echo "Note that this aspect ratio level setting is used for videos only, not images" + echo "NUM_FRAMES=${NUM_FRAMES}" if [ -z "${NUM_FRAMES}" ]; then @@ -39,7 +71,7 @@ DEFAULT_BS=1 # called inside run_video_b function run_image() { # 14min # 1.1 1024x1024 - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect_ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect-ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS # 1.2 240x426 eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 240p --aspect-ratio 9:16 --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS @@ -89,13 +121,13 @@ function run_video_a() { # ~ 30min ? 
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name sample_2s_720p_9_16 --batch-size $DEFAULT_BS # sample, 720p, 9:16, 2s - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sample_4s_720p --batch-size $DEFAULT_BS + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sample_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS # sample, 480p, 9:16, 8s - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sample_8s_480p --batch-size $DEFAULT_BS + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sample_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS - # sample, 240p, 9:16, 16s - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sample_16s_360p --batch-size $DEFAULT_BS + # sample, 360p, 9:16, 16s + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sample_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS } function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p @@ -112,10 +144,10 @@ function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 240p --aspect-ratio 9:16 --sample-name short_8s_240p_9_16 --batch-size $DEFAULT_BS # 
short, 480p, 9:16, 8s: ~24min - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name short_8s_480p --batch-size $DEFAULT_BS + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name short_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS - # short, 240p, 9:16, 16s: ~24min - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name short_16s_360p --batch-size $DEFAULT_BS + # short, 360p, 9:16, 16s: ~24min + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name short_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS } @@ -129,10 +161,10 @@ function run_video_c() { # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 240p --aspect-ratio 9:16 --sample-name sora_16s_240p_9_16 --batch-size $DEFAULT_BS # short, 720p, 9:16, 2s: ~9min - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name short_4s_720p --batch-size $DEFAULT_BS + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name short_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS - # sora, 240p, 9:16, 16s: ~40min - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sora_16s_360p --batch-size $DEFAULT_BS + # sora, 360p, 9:16, 16s: ~40min + eval $CMD --ckpt-path $CKPT --prompt-path 
assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sora_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS } function run_video_d() { @@ -143,17 +175,17 @@ function run_video_d() { # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p_9_16 --batch-size $DEFAULT_BS --start-index 0 --end-index 16 # sora, 480p, 9:16, 8s, 1/3 # moved from run_video_e, 30min - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 0 --end-index 16 + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 0 --end-index 16 } function run_video_e() { # 90min * 2/3 = 60min # sora, 480p, 9:16, 8s, 2/3 - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 16 --end-index 100 + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 16 --end-index 100 } function run_video_f() { # 60min # sora, 720p, 9:16, 2s - eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sora_4s_720p --batch-size $DEFAULT_BS + eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution 
${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sora_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS } # --resolution 720p --aspect-ratio [16:9, 9:16, ...] @@ -162,22 +194,22 @@ function run_video_g() { # 15min # 720p, 2s multi-resolution # 1:1 PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures." - eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:1 --sample-name drone_cliff_prompt_720p_2s_1_1 + eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_1 # 16:9 - eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 16:9 --sample-name drone_cliff_prompt_720p_2s_16_9 + eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 16:9 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_16_9 # 9:16 - eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name drone_cliff_prompt_720p_2s_9_16 + eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} 
--aspect-ratio 9:16 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_9_16 # 4:3 - eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 4:3 --sample-name drone_cliff_prompt_720p_2s_4_3 + eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 4:3 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_4_3 # 3:4 - eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 3:4 --sample-name drone_cliff_prompt_720p_2s_3_4 + eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 3:4 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_3_4 # 1:2 - eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:2 --sample-name drone_cliff_prompt_720p_2s_1_2 + eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:2 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_2 # 2:1 - eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 2:1 --sample-name drone_cliff_prompt_720p_2s_2_1 + eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 2:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_2_1 # add motion score - eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name motion_2s_720p --prompt \ + eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \ \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. 
motion score: 0.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \ \"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \ @@ -188,7 +220,7 @@ function run_video_g() { # 15min \"A stylish woman walking in the street of Tokyo. motion score: 100.0\" # add aes score - eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name aes_2s_720p --prompt \ + eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \ \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \ \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \ \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \ @@ -202,24 +234,24 @@ function run_video_g() { # 15min function run_video_h() { # 61min # 3.1 image-conditioned long video generation - eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_360p_9_16 \ + eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_${BASE_ASPECT_RATIO}_9_16 \ --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \ - --num-frames 2s --resolution 360p --aspect-ratio 9:16 \ + --num-frames 2s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \ --loop 5 --condition-frame-length 5 \ --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \ --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS - eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_360p_9_16 \ + eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_${BASE_ASPECT_RATIO}_9_16 \ --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \ - --num-frames 16s --resolution 360p --aspect-ratio 9:16 \ + --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \ 
--loop 5 --condition-frame-length 10 \ --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \ --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS # 3.2 - eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_240p_9_16 \ + eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_${BASE_ASPECT_RATIO}_9_16 \ --prompt-path assets/texts/t2v_ref.txt --start-index 3 --end-index 6 \ - --num-frames 16s --resolution 360p --aspect-ratio 9:16 \ + --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \ --loop 1 \ --reference-path assets/images/condition/cliff.png "assets/images/condition/cactus-sad.png\;assets/images/condition/cactus-happy.png" https://cdn.openai.com/tmp/s/interp/d0.mp4 \ --mask-strategy "0" "0\;0,1,0,-1,1" "0,0,0,0,${QUAD_FRAMES},0.5" --batch-size $DEFAULT_BS @@ -238,10 +270,38 @@ function run_vbench() { --image-size $VBENCH_H $VBENCH_W \ --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 else - eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ - --prompt-path assets/texts/VBench/all_dimension.txt \ - --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ - --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + if [ -z ${NUM_SAMPLING_STEPS} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${FLOW} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \ + --batch-size $VBENCH_BS 
--num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${LLM_REFINE} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ "${FLOW}" = "None" ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + fi + fi + fi + fi fi } @@ -255,16 +315,41 @@ function run_vbench_i2v() { eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ --prompt-path assets/texts/VBench/all_i2v.txt \ --image-size $VBENCH_I2V_H $VBENCH_I2V_W \ - --start-index $1 --end-index $2 \ - --num-frames $NUM_FRAMES --batch-size $VBENCH_BS + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 else - eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ - --prompt-path assets/texts/VBench/all_i2v.txt \ - --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ - --start-index $1 --end-index $2 \ - --num-frames $NUM_FRAMES --batch-size $VBENCH_BS + if [ -z ${NUM_SAMPLING_STEPS} 
]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${FLOW} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${LLM_REFINE} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ "${FLOW}" = "None" ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + fi + fi + fi + fi fi - } ### Main diff --git a/eval/vbench/calc_vbench.py b/eval/vbench/calc_vbench.py index b456f28..e5570a1 100644 --- 
a/eval/vbench/calc_vbench.py +++ b/eval/vbench/calc_vbench.py @@ -8,24 +8,30 @@ from vbench import VBench full_info_path = "eval/vbench/VBench_full_info.json" dimensions = [ - # Quality Score - "subject_consistency", - "background_consistency", - "motion_smoothness", - "dynamic_degree", - "aesthetic_quality", - "imaging_quality", - "temporal_flickering", - # Semantic Score - "object_class", - "multiple_objects", - "color", - "spatial_relationship", - "scene", - "temporal_style", - "overall_consistency", - "human_action", - "appearance_style", + # a: 10min + "subject_consistency", # 4min + "imaging_quality", # 6min + # b: 12min + "background_consistency", # 2min + "motion_smoothness", # 5min + "overall_consistency", # 2min + "human_action", # 3min + # c: 14min + "multiple_objects", # 14min + # d: 14min + "spatial_relationship", # 14min + # e: 12min + "object_class", # 12min + # f: 12min + "color", # 12min + # g: 10.5min + "aesthetic_quality", # 2.5min + "appearance_style", # 6min + "temporal_flickering", # 2min + # h: 9min + "scene", # 3min + "temporal_style", # 2min + "dynamic_degree", # 4min ] diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh index eedd9b3..df4d06d 100644 --- a/eval/vbench/launch.sh +++ b/eval/vbench/launch.sh @@ -6,6 +6,10 @@ MODEL_NAME=$3 RES=$4 ASP_RATIO=$5 +NUM_SAMPLING_STEPS=$6 +FLOW=$7 +LLM_REFINE=$8 + if [[ $CKPT == *"ema"* ]]; then parentdir=$(dirname $CKPT) CKPT_BASE=$(basename $parentdir)_ema @@ -20,11 +24,36 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only START_INDEX_LIST=(0 120 240 360 480 600 720 840) END_INDEX_LIST=(120 240 360 480 600 720 840 2000) +## Modify the following to run on multiple machines for faster results +## 720p will take quite long on a single machine +# START_INDEX_LIST=(60 180 300 420 540 660 780 900) +# END_INDEX_LIST=(120 240 360 480 600 720 840 2000) +# LOG_BASE=$(dirname $CKPT)/eval/last_60 +# mkdir -p ${LOG_BASE} +# echo "Logging to $LOG_BASE" + + + for i in "${!GPUS[@]}"; do if [ -z 
${RES} ] || [ -z ${ASP_RATIO} ] ; then - CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & else - CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + if [ -z ${NUM_SAMPLING_STEPS} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${FLOW} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${LLM_REFINE} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + fi + fi + fi fi done diff --git a/eval/vbench/launch_calc.sh b/eval/vbench/launch_calc.sh index 53114b9..9f14ce5 100644 --- a/eval/vbench/launch_calc.sh +++ b/eval/vbench/launch_calc.sh @@ -7,11 +7,10 @@ mkdir -p $LOG_BASE echo "Logging to $LOG_BASE" GPUS=(0 1 2 3 4 5 6 7) -START_INDEX_LIST=(0 2 4 6 8 10 12 14) -END_INDEX_LIST=(2 4 6 8 10 12 14 16) +START_INDEX_LIST=(0 2 6 7 8 9 
10 13) +END_INDEX_LIST=(2 6 7 8 9 10 13 16) TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only - for i in "${!GPUS[@]}"; do CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & done diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh index d8eea1d..193c581 100644 --- a/eval/vbench_i2v/launch.sh +++ b/eval/vbench_i2v/launch.sh @@ -6,6 +6,10 @@ MODEL_NAME=$3 RES=$4 ASP_RATIO=$5 +NUM_SAMPLING_STEPS=$6 +FLOW=$7 +LLM_REFINE=$8 + if [[ $CKPT == *"ema"* ]]; then parentdir=$(dirname $CKPT) CKPT_BASE=$(basename $parentdir)_ema @@ -20,11 +24,27 @@ TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only START_INDEX_LIST=(0 140 280 420 560 700 840 980) END_INDEX_LIST=(140 280 420 560 700 840 980 2000) + for i in "${!GPUS[@]}"; do if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; then - CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & else - CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + if [ -z ${NUM_SAMPLING_STEPS} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${FLOW} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} 
${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${LLM_REFINE} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + fi + fi + fi fi done diff --git a/opensora/datasets/aspect.py b/opensora/datasets/aspect.py index 011ad40..f6defb7 100644 --- a/opensora/datasets/aspect.py +++ b/opensora/datasets/aspect.py @@ -465,7 +465,10 @@ def get_num_pixels(name): def get_image_size(resolution, ar_ratio): - ar_key = ASPECT_RATIO_MAP[ar_ratio] + if ar_ratio in ASPECT_RATIO_MAP: + ar_key = ASPECT_RATIO_MAP[ar_ratio] + else: + ar_key = ar_ratio rs_dict = ASPECT_RATIOS[resolution][1] assert ar_key in rs_dict, f"Aspect ratio {ar_ratio} not found for resolution {resolution}" return rs_dict[ar_key] diff --git a/opensora/datasets/dataloader.py b/opensora/datasets/dataloader.py index 15058ac..60d0b24 100644 --- a/opensora/datasets/dataloader.py +++ b/opensora/datasets/dataloader.py @@ -111,6 +111,9 @@ def prepare_dataloader( def collate_fn_default(batch): + # filter out None + batch = [x for x in batch if x is not None] + # HACK: for loading text features use_mask = False if "mask" in batch[0] and isinstance(batch[0]["mask"], int): @@ -132,6 +135,9 @@ def collate_fn_batch(batch): """ Used only with BatchDistributedSampler """ + # filter out None + batch = [x for x in batch if x is not None] + res = torch.utils.data.default_collate(batch) # squeeze the first dimension, which is due to torch.stack() in default_collate() diff --git a/opensora/datasets/datasets.py b/opensora/datasets/datasets.py index 
8b5fdd6..b148268 100644 --- a/opensora/datasets/datasets.py +++ b/opensora/datasets/datasets.py @@ -190,7 +190,10 @@ class VariableVideoTextDataset(VideoTextDataset): return ret def __getitem__(self, index): - return self.getitem(index) + try: + return self.getitem(index) + except: + return None @DATASETS.register_module() diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py index 5e2c13d..40e6abb 100644 --- a/opensora/models/layers/blocks.py +++ b/opensora/models/layers/blocks.py @@ -163,6 +163,8 @@ class Attention(nn.Module): if rope is not None: self.rope = True self.rotary_emb = rope + + self.is_causal = False def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -198,12 +200,17 @@ class Attention(nn.Module): v, dropout_p=self.attn_drop.p if self.training else 0.0, softmax_scale=self.scale, + causal=self.is_causal, ) else: dtype = q.dtype q = q * self.scale attn = q @ k.transpose(-2, -1) # translate attn to float32 attn = attn.to(torch.float32) + if self.is_causal: + causal_mask = torch.tril(torch.ones_like(attn), diagonal=0) + causal_mask = torch.where(causal_mask.bool(), 0, float('-inf')) + attn += causal_mask attn = attn.softmax(dim=-1) attn = attn.to(dtype) # cast back attn to original dtype attn = self.attn_drop(attn) diff --git a/opensora/models/pixart/pixart.py b/opensora/models/pixart/pixart.py index 02f8b67..d99c572 100644 --- a/opensora/models/pixart/pixart.py +++ b/opensora/models/pixart/pixart.py @@ -197,16 +197,18 @@ class PixArt(nn.Module): if freeze == "text": self.freeze_text() - def forward(self, x, timestep, y, mask=None): + def forward(self, x, timestep, y, mask=None, **kwargs): """ Forward pass of PixArt. 
x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) t: (N,) tensor of diffusion timesteps y: (N, 1, 120, C) tensor of class labels """ - x = x.to(self.dtype) - timestep = timestep.to(self.dtype) - y = y.to(self.dtype) + dtype = self.x_embedder.proj.weight.dtype + B = x.size(0) + x = x.to(dtype) + timestep = timestep.to(dtype) + y = y.to(dtype) # embedding x = self.x_embedder(x) # (B, N, D) diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py index bd9672d..b0c046a 100644 --- a/opensora/models/stdit/stdit3.py +++ b/opensora/models/stdit/stdit3.py @@ -4,6 +4,7 @@ import numpy as np import torch import torch.distributed as dist import torch.nn as nn +import torch.nn.functional as F from einops import rearrange from rotary_embedding_torch import RotaryEmbedding from timm.models.layers import DropPath @@ -361,6 +362,24 @@ class STDiT3(PreTrainedModel): # === get pos embed === _, _, Tx, Hx, Wx = x.size() T, H, W = self.get_dynamic_size(x) + + # adjust for sequence parallelism + # we need to ensure H * W is divisible by sequence parallel size + # for simplicity, we can adjust the height to make it divisible + if self.enable_sequence_parallelism: + sp_size = dist.get_world_size(get_sequence_parallel_group()) + if H % sp_size != 0: + h_pad_size = sp_size - H % sp_size + else: + h_pad_size = 0 + + if h_pad_size > 0: + hx_pad_size = h_pad_size * self.patch_size[1] + + # pad x along the H dimension + H += h_pad_size + x = F.pad(x, (0, 0, 0, hx_pad_size)) + S = H * W base_size = round(S**0.5) resolution_sq = (height[0].item() * width[0].item()) ** 0.5 diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py index 9802b02..3e85bf5 100644 --- a/opensora/models/vae/vae.py +++ b/opensora/models/vae/vae.py @@ -13,7 +13,13 @@ from opensora.utils.ckpt_utils import load_checkpoint @MODELS.register_module() class VideoAutoencoderKL(nn.Module): def __init__( - self, from_pretrained=None, micro_batch_size=None, 
cache_dir=None, local_files_only=False, subfolder=None + self, + from_pretrained=None, + micro_batch_size=None, + cache_dir=None, + local_files_only=False, + subfolder=None, + scaling_factor=0.18215, ): super().__init__() self.module = AutoencoderKL.from_pretrained( @@ -25,6 +31,7 @@ class VideoAutoencoderKL(nn.Module): self.out_channels = self.module.config.latent_channels self.patch_size = (1, 8, 8) self.micro_batch_size = micro_batch_size + self.scaling_factor = scaling_factor def encode(self, x): # x: (B, C, T, H, W) @@ -32,14 +39,14 @@ class VideoAutoencoderKL(nn.Module): x = rearrange(x, "B C T H W -> (B T) C H W") if self.micro_batch_size is None: - x = self.module.encode(x).latent_dist.sample().mul_(0.18215) + x = self.module.encode(x).latent_dist.sample().mul_(self.scaling_factor) else: # NOTE: cannot be used for training bs = self.micro_batch_size x_out = [] for i in range(0, x.shape[0], bs): x_bs = x[i : i + bs] - x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(0.18215) + x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(self.scaling_factor) x_out.append(x_bs) x = torch.cat(x_out, dim=0) x = rearrange(x, "(B T) C H W -> B C T H W", B=B) @@ -50,14 +57,14 @@ class VideoAutoencoderKL(nn.Module): B = x.shape[0] x = rearrange(x, "B C T H W -> (B T) C H W") if self.micro_batch_size is None: - x = self.module.decode(x / 0.18215).sample + x = self.module.decode(x / self.scaling_factor).sample else: # NOTE: cannot be used for training bs = self.micro_batch_size x_out = [] for i in range(0, x.shape[0], bs): x_bs = x[i : i + bs] - x_bs = self.module.decode(x_bs / 0.18215).sample + x_bs = self.module.decode(x_bs / self.scaling_factor).sample x_out.append(x_bs) x = torch.cat(x_out, dim=0) x = rearrange(x, "(B T) C H W -> B C T H W", B=B) diff --git a/opensora/models/vae/video_sdxl/blocks.py b/opensora/models/vae/video_sdxl/blocks.py new file mode 100644 index 0000000..0ed4973 --- /dev/null +++ b/opensora/models/vae/video_sdxl/blocks.py @@ -0,0 
+1,724 @@ +""" +Adapted from SDXL VAE (https://huggingface.co/stabilityai/sdxl-vae/blob/main/config.json) +All default values of kwargs are the same as SDXL +""" + +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from diffusers.models.attention_processor import Attention +from einops import rearrange + + +def video_to_image(func): + def wrapper(self, x, *args, **kwargs): + if x.ndim == 5: + B = x.shape[0] + x = rearrange(x, 'B C T H W -> (B T) C H W') + + if hasattr(self, 'micro_batch_size') and self.micro_batch_size is None: + x = func(self, x, *args, **kwargs) + else: + bs = self.micro_batch_size + x_out = [] + for i in range(0, x.shape[0], bs): + x_i = func(self, x[i:i + bs], *args, **kwargs) + x_out.append(x_i) + x = torch.cat(x_out, dim=0) + + x = rearrange(x, '(B T) C H W -> B C T H W', B=B) + return x + return wrapper + + +class VideoConv2d(nn.Conv2d): + def __init__(self, *args, micro_batch_size=None, **kwargs): + super().__init__(*args, **kwargs) + self.micro_batch_size = micro_batch_size + + @video_to_image + def forward(self, x): + return super().forward(x) + + +class ResnetBlock2D(nn.Module): + """ + Use nn.Conv2d + Default activation is nn.SiLU() + Make sure input tensor is of shape [B, C, T, H, W] or [B, C, H, W] + Support micro_batch_size + """ + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + norm_groups: int = 32, + norm_eps: float = 1e-6, + micro_batch_size=None, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.micro_batch_size = micro_batch_size + + conv_cls = nn.Conv2d + self.norm1 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True) + self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.norm2 = torch.nn.GroupNorm(num_groups=norm_groups, 
num_channels=out_channels, eps=norm_eps, affine=True) + self.conv2 = conv_cls(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.act = nn.SiLU() + + self.use_in_shortcut = self.in_channels != out_channels + + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = conv_cls( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + ) + + @video_to_image + def forward(self, x): + res = self.norm1(x) + res = self.act(res) + res = self.conv1(res) + + res = self.norm2(res) + res = self.act(res) + res = self.conv2(res) + + if self.conv_shortcut is not None: + x = self.conv_shortcut(x) + + out = x + res + return out + + +class ResnetBlock3D(nn.Module): + """ + Use nn.Conv3d + Default activation is nn.SiLU() + Make sure input tensor is of shape [B, C, T, H, W] + """ + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + norm_groups: int = 32, + norm_eps: float = 1e-6, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + + conv_cls = nn.Conv3d + self.norm1 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True) + self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.norm2 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True) + self.conv2 = conv_cls(out_channels, out_channels, kernel_size=3, stride=1, padding=1) + + self.act = nn.SiLU() + + self.use_in_shortcut = self.in_channels != out_channels + + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = conv_cls( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + ) + + def forward(self, x): + res = self.norm1(x) + res = self.act(res) + res = self.conv1(res) + + res = self.norm2(res) + res = self.act(res) + res = self.conv2(res) + + if self.conv_shortcut is 
not None: + x = self.conv_shortcut(x) + + out = x + res + return out + + +class SpatialDownsample2x(nn.Module): + """ + Default downsample is Conv2d(stride=2) + Make sure input tensor is of shape [B, C, T, H, W] + Support micro_batch_size + """ + def __init__( + self, + channels: int, + use_conv: bool = True, + micro_batch_size=None, + ): + super().__init__() + self.channels = channels + self.use_conv = use_conv + self.micro_batch_size = micro_batch_size + + if use_conv: + self.downsample = nn.Conv2d( + self.channels, self.channels, kernel_size=3, stride=2, padding=0, + ) + else: + self.downsample = nn.AvgPool2d(kernel_size=2, stride=2) + + @video_to_image + def forward(self, x): + # implementation from SDXL + pad = (0, 1, 0, 1) + x = F.pad(x, pad, mode="constant", value=0) + + x = self.downsample(x) + return x + + +class SpatialUpsample2x(nn.Module): + """ + Default upsample is F.interpolate(scale_factor=2) + Conv2d(stride=1) + Make sure input tensor is of shape [B, C, T, H, W] + Support micro_batch_size + """ + def __init__( + self, + channels: int, + use_interpolate=True, + micro_batch_size=None, + ): + super().__init__() + self.channels = channels + self.use_interpolate = use_interpolate + self.micro_batch_size = micro_batch_size + + if use_interpolate: + self.conv = nn.Conv2d(self.channels, self.channels, kernel_size=3, padding=1) + else: + raise NotImplementedError + self.upsample = nn.ConvTranspose2d(channels, self.channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x): + B = x.shape[0] + x = rearrange(x, 'B C T H W -> (B T) C H W') + + if self.micro_batch_size is None: + x = self.forward_BCHW(x) + else: + bs = self.micro_batch_size + x_out = [] + for i in range(0, x.shape[0], bs): + x_i = self.forward_BCHW(x[i:i + bs]) + x_out.append(x_i) + x = torch.cat(x_out, dim=0) + + x = rearrange(x, '(B T) C H W -> B C T H W', B=B) + return x + + def forward_BCHW(self, x): + if self.use_interpolate: + # upsample_nearest_nhwc fails with large batch 
sizes. see https://github.com/huggingface/diffusers/issues/984 + if x.shape[0] >= 64: + x = x.contiguous() + + # interpolate tensor of bfloat16 is fixed in pytorch 2.1. see https://github.com/pytorch/pytorch/issues/86679 + x = F.interpolate(x, scale_factor=2.0, mode="nearest") + x = self.conv(x) + else: + x = self.upsample(x) + + return x + + +class TemporalDownsample2x(nn.Module): + """ + Default downsample is Conv3d(stride=(2, 1, 1)) + Make sure input tensor is of shape [B, C, T, H, W] + """ + def __init__( + self, + channels: int, + use_conv: bool = True, + ): + super().__init__() + self.channels = channels + self.use_conv = use_conv + + if use_conv: + self.downsample = nn.Conv3d( + self.channels, self.channels, kernel_size=(3, 3, 3), stride=(2, 1, 1), padding=(1, 1, 1), + ) + else: + self.downsample = nn.AvgPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1)) + + def forward(self, x): + x = self.downsample(x) + return x + + +class TemporalUpsample2x(nn.Module): + """ + Default upsample is F.interpolate(scale_factor=(2, 1, 1)) + Conv3d(stride=1) + Make sure input tensor is of shape [B, C, T, H, W] + Support micro_batch_size + """ + def __init__( + self, + channels, + ): + super().__init__() + self.channels = channels + self.conv = nn.Conv3d(channels, channels, kernel_size=3, padding=1) + + def forward(self, x): + if x.shape[0] >= 64: + x = x.contiguous() + x = F.interpolate(x, scale_factor=(2, 1, 1), mode="trilinear") + x = self.conv(x) + return x + + +class UNetMidBlock2D(nn.Module): + """ + default is ResnetBlock2D + Spatial Attention + ResnetBlock2D + Make sure input tensor is of shape [B, C, T, H, W] or [B, C, H, W] + """ + def __init__( + self, + in_channels: int, + num_layers: int = 1, + norm_groups: int = 32, + norm_eps: float = 1e-6, + attn_groups: Optional[int] = None, + add_attention: bool = True, + attention_head_dim: int = 512, + ): + super().__init__() + self.add_attention = add_attention + + if attn_groups is None: + attn_groups = norm_groups + + if 
attention_head_dim is None: + attention_head_dim = in_channels + + res_blocks = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + norm_eps=norm_eps, + norm_groups=norm_groups, + ) + ] + attn_blocks = [] + + for _ in range(num_layers): + if self.add_attention: + attn_blocks.append( + Attention( + in_channels, + heads=in_channels // attention_head_dim, + dim_head=attention_head_dim, + # rescale_output_factor=output_scale_factor, + rescale_output_factor=1.0, + eps=norm_eps, + norm_num_groups=attn_groups, + # spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, + spatial_norm_dim=None, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + + res_blocks.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + norm_eps=norm_eps, + norm_groups=norm_groups, + ) + ) + + self.attn_blocks = nn.ModuleList(attn_blocks) + self.res_blocks = nn.ModuleList(res_blocks) + + def forward(self, x): + has_T = x.ndim == 5 + if has_T: + B = x.shape[0] + x = rearrange(x, 'B C T H W -> (B T) C H W') + + x = self.res_blocks[0](x) + for attn, res_block in zip(self.attn_blocks, self.res_blocks[1:]): + if attn is not None: + x = attn(x) + x = res_block(x) + + if has_T: + x = rearrange(x, '(B T) C H W -> B C T H W', B=B) + return x + + +class Encoder(nn.Module): + """ + default arch is conv_in + blocks + mid_block + out_block + Make sure input tensor is of shape [B, C, T, H, W] + """ + def __init__( + self, + in_channels=3, + out_channels=4, + norm_groups=32, + norm_eps=1e-6, + double_z=True, + micro_batch_size=None, + ): + super().__init__() + in_channels_encoder = in_channels + out_channels_encoder = out_channels + block_out_channels = [128, 256, 512, 512] + + # conv_in + self.conv_in = VideoConv2d( + in_channels_encoder, + block_out_channels[0], + kernel_size=3, + stride=1, + padding=1, + micro_batch_size=micro_batch_size, + ) + + # blocks + blocks = [] + + # 
the first block: ResnetBlock2D + in_channels = block_out_channels[0] + out_channels = block_out_channels[0] + blocks.append( + nn.Sequential( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + micro_batch_size=micro_batch_size, + ), + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + micro_batch_size=micro_batch_size, + ), + SpatialDownsample2x( + channels=out_channels, + use_conv=True, + micro_batch_size=micro_batch_size, + ), + ) + ) + + # the second block: ResnetBlock2D + in_channels = block_out_channels[0] + out_channels = block_out_channels[1] + blocks.append( + nn.Sequential( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + micro_batch_size=micro_batch_size, + ), + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + micro_batch_size=micro_batch_size, + ), + SpatialDownsample2x( + channels=out_channels, + use_conv=True, + micro_batch_size=micro_batch_size, + ), + TemporalDownsample2x( + channels=out_channels, + use_conv=True, + ) + ) + ) + + # the third block: ResnetBlock3D + in_channels = block_out_channels[1] + out_channels = block_out_channels[2] + blocks.append( + nn.Sequential( + ResnetBlock3D( + in_channels=in_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ), + ResnetBlock3D( + in_channels=out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ), + SpatialDownsample2x( + channels=out_channels, + use_conv=True, + ), + TemporalDownsample2x( + channels=out_channels, + use_conv=True, + ) + ) + ) + + # the fourth block: ResnetBlock3D + in_channels = block_out_channels[2] + out_channels = block_out_channels[3] + blocks.append( + nn.Sequential( + ResnetBlock3D( + in_channels=in_channels, + 
out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ), + ResnetBlock3D( + in_channels=out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ), + ) + ) + + self.blocks = nn.ModuleList(blocks) + + + # mid_block + in_channels = block_out_channels[-1] + self.mid_block = UNetMidBlock2D( + in_channels=in_channels, + num_layers=1, + norm_groups=norm_groups, + norm_eps=norm_eps, + add_attention=True, + attention_head_dim=in_channels, + ) + + # out_block + in_channels = block_out_channels[-1] + out_channels = 2 * out_channels_encoder if double_z else out_channels_encoder + self.out_block = nn.Sequential( + nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps), + nn.SiLU(), + nn.Conv3d(in_channels, out_channels, kernel_size=3, padding=1), + ) + + def forward(self, x): + x = self.conv_in(x) + + for block in self.blocks: + x = block(x) + + x = self.mid_block(x) + + x = self.out_block(x) + return x + + +class Decoder(nn.Module): + """ + default arch is conv_in + mid_block + blocks + out_block + Make sure input tensor is of shape [B, C, T, H, W] + """ + def __init__( + self, + in_channels=4, + out_channels=3, + norm_groups=32, + norm_eps=1e-6, + ): + super().__init__() + in_channels_decoder = in_channels + out_channels_decoder = out_channels + block_out_channels = [512, 512, 256, 128] + + # conv_in + self.conv_in = nn.Conv3d( + in_channels_decoder, + block_out_channels[0], + kernel_size=3, + stride=1, + padding=1, + ) + + # mid_block + in_channels = block_out_channels[0] + self.mid_block = UNetMidBlock2D( + in_channels=in_channels, + num_layers=1, + norm_groups=norm_groups, + norm_eps=norm_eps, + add_attention=True, + attention_head_dim=in_channels, + ) + + # blocks + blocks = [] + layer_per_block = 3 + + # the first up block: ResnetBlock3D + in_channels = block_out_channels[0] + out_channels = block_out_channels[0] + seq = [ + ResnetBlock3D( + in_channels=in_channels if idx ==0 else 
out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ) + for idx in range(layer_per_block) + ] + [ + SpatialUpsample2x( + channels=out_channels, + use_interpolate=True, + ), + TemporalUpsample2x( + channels=out_channels, + ), + ] + blocks.append(nn.Sequential(*seq)) + + # the second up block: ResnetBlock3D + in_channels = block_out_channels[0] + out_channels = block_out_channels[1] + seq = [ + ResnetBlock3D( + in_channels=in_channels if idx ==0 else out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ) + for idx in range(layer_per_block) + ] + [ + SpatialUpsample2x( + channels=out_channels, + use_interpolate=True, + ), + TemporalUpsample2x( + channels=out_channels, + ), + ] + blocks.append(nn.Sequential(*seq)) + + # the third up block: ResnetBlock3D + in_channels = block_out_channels[1] + out_channels = block_out_channels[2] + seq = [ + ResnetBlock3D( + in_channels=in_channels if idx ==0 else out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ) + for idx in range(layer_per_block) + ] + [ + SpatialUpsample2x( + channels=out_channels, + use_interpolate=True, + ), + ] + blocks.append(nn.Sequential(*seq)) + + # the fourth up block: ResnetBlock2D + in_channels = block_out_channels[2] + out_channels = block_out_channels[3] + seq = [ + ResnetBlock2D( + in_channels=in_channels if idx ==0 else out_channels, + out_channels=out_channels, + norm_groups=norm_groups, + norm_eps=norm_eps, + ) + for idx in range(layer_per_block) + ] + blocks.append(nn.Sequential(*seq)) + + self.blocks = nn.ModuleList(blocks) + + # out_block + in_channels = block_out_channels[-1] + out_channels = out_channels_decoder + self.out_block = nn.Sequential( + nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps), + nn.SiLU(), + nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1), + ) + + def forward(self, x): + x = self.conv_in(x) + 
print(torch.cuda.memory_allocated() / 1024 ** 3) + + x = self.mid_block(x) + print(torch.cuda.memory_allocated() / 1024 ** 3) + + for block in self.blocks: + x = block(x) + print(torch.cuda.memory_allocated() / 1024 ** 3) + + x = self.out_block(x) + print(torch.cuda.memory_allocated() / 1024 ** 3) + return x + +if __name__ == '__main__': + from opensora.utils.misc import count_params + device = 'cuda' + dtype = torch.bfloat16 + + encoder = Encoder( + in_channels=3, + out_channels=4, + double_z=False, + micro_batch_size=4, + ).to(torch.bfloat16).to(device, dtype).eval() + + decoder = Decoder( + in_channels=4, + out_channels=3, + ).to(torch.bfloat16).to(device, dtype).eval() + num_params_enc = count_params(encoder) + num_params_dec = count_params(decoder) + print(f'Encoder #params: {num_params_enc}') + print(f'Decoder #params: {num_params_dec}') + + # inference + x = torch.rand(1, 3, 51, 720, 1080).to(device, dtype) + with torch.inference_mode(): + x_enc = encoder(x) + x_dec = decoder(x_enc) + print(torch.cuda.memory_allocated() / 1024 ** 3) + breakpoint() diff --git a/opensora/schedulers/dpms/__init__.py b/opensora/schedulers/dpms/__init__.py index df10477..111e97b 100644 --- a/opensora/schedulers/dpms/__init__.py +++ b/opensora/schedulers/dpms/__init__.py @@ -24,7 +24,8 @@ class DPM_SOLVER: mask=None, progress=True, ): - assert mask is None, "mask is not supported in dpm-solver" + if mask is not None: + print("[WARNING] mask is not supported in dpm-solver, it will be ignored") n = len(prompts) model_args = text_encoder.encode(prompts) y = model_args.pop("y") diff --git a/opensora/schedulers/dpms/dpm_solver.py b/opensora/schedulers/dpms/dpm_solver.py index d422a0a..2eddfbd 100644 --- a/opensora/schedulers/dpms/dpm_solver.py +++ b/opensora/schedulers/dpms/dpm_solver.py @@ -1419,7 +1419,7 @@ class DPM_Solver: for step in progress_fn(range(order, steps + 1)): t = timesteps[step] # We only use lower order for steps < 10 - if lower_order_final and steps < 10: + if 
lower_order_final: # recommended by Shuchen Xue step_order = min(order, steps + 1 - step) else: step_order = order diff --git a/requirements/requirements-cu121.txt b/requirements/requirements-cu121.txt index cc13920..362381d 100644 --- a/requirements/requirements-cu121.txt +++ b/requirements/requirements-cu121.txt @@ -1,3 +1,3 @@ -torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121 -torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu121 -xformers==0.0.26.post1 --index-url https://download.pytorch.org/whl/cu121 +torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121 +torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121 +xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 66ab7bf..92d9620 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,4 +1,4 @@ -colossalai==0.3.7 +colossalai>=0.4.0 mmengine>=0.10.3 pandas>=2.0.3 timm==0.9.16 @@ -7,6 +7,7 @@ ftfy>=6.2.0 # for t5 diffusers==0.27.2 # for vae accelerate==0.29.2 # for t5 av>=12.0.0 # for video loading +numpy<2.0.0 # [gradio] gradio>=4.26.0 diff --git a/scripts/inference.py b/scripts/inference.py index 5095fd2..c4578a7 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -260,6 +260,7 @@ def main(): ) # == sampling == + torch.manual_seed(1024) z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype) masks = apply_mask_strategy(z, refs, ms, loop_i, align=align) samples = scheduler.sample( diff --git a/scripts/train.py b/scripts/train.py index 1066977..110f2f8 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -347,24 +347,27 @@ def main(): tb_writer.add_scalar("loss", loss.item(), global_step) # wandb if cfg.get("wandb", False): - wandb.log( - { - "iter": global_step, - "acc_step": acc_step, - "epoch": epoch, - "loss": loss.item(), - "avg_loss": avg_loss, - "lr": 
optimizer.param_groups[0]["lr"], - "debug/move_data_time": move_data_t.elapsed_time, - "debug/encode_time": encode_t.elapsed_time, - "debug/mask_time": mask_t.elapsed_time, - "debug/diffusion_time": loss_t.elapsed_time, - "debug/backward_time": backward_t.elapsed_time, - "debug/update_ema_time": ema_t.elapsed_time, - "debug/reduce_loss_time": reduce_loss_t.elapsed_time, - }, - step=global_step, - ) + wandb_dict = { + "iter": global_step, + "acc_step": acc_step, + "epoch": epoch, + "loss": loss.item(), + "avg_loss": avg_loss, + "lr": optimizer.param_groups[0]["lr"], + } + if record_time: + wandb_dict.update( + { + "debug/move_data_time": move_data_t.elapsed_time, + "debug/encode_time": encode_t.elapsed_time, + "debug/mask_time": mask_t.elapsed_time, + "debug/diffusion_time": loss_t.elapsed_time, + "debug/backward_time": backward_t.elapsed_time, + "debug/update_ema_time": ema_t.elapsed_time, + "debug/reduce_loss_time": reduce_loss_t.elapsed_time, + } + ) + wandb.log(wandb_dict, step=global_step) running_loss = 0.0 log_step = 0 diff --git a/tests/test_stdit3_sequence_parallelism.py b/tests/test_stdit3_sequence_parallelism.py index ba715b7..70786f4 100644 --- a/tests/test_stdit3_sequence_parallelism.py +++ b/tests/test_stdit3_sequence_parallelism.py @@ -9,7 +9,7 @@ from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config def get_sample_data(): - x = torch.rand([1, 4, 15, 20, 27], dtype=torch.bfloat16) # (B, C, T, H, W) + x = torch.rand([1, 4, 15, 20, 28], dtype=torch.bfloat16) # (B, C, T, H, W) timestep = torch.Tensor([924.0]).to(torch.bfloat16) y = torch.rand(1, 1, 300, 4096, dtype=torch.bfloat16) mask = torch.ones([1, 300], dtype=torch.int32) @@ -66,6 +66,17 @@ def run_model(rank, world_size, port): set_seed(1024) dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True) dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16) + + # ensure model weights are equal + for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()): + assert 
torch.equal(p1, p2) + + # ensure model weights are equal across all ranks + for p in dist_model.parameters(): + p_list = [torch.zeros_like(p) for _ in range(world_size)] + dist.all_gather(p_list, p, group=dist.group.WORLD) + assert torch.equal(*p_list) + dist_out = dist_model(**data) dist_out.mean().backward() @@ -84,9 +95,8 @@ def run_model(rank, world_size, port): for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()): assert n1 == n2 if p1.grad is not None and p2.grad is not None: - if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4): - if dist.get_rank() == 0: - print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}") + if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0: + print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}") else: assert p1.grad is None and p2.grad is None diff --git a/tools/caption/README.md b/tools/caption/README.md index f6fe0c8..8f7dfed 100644 --- a/tools/caption/README.md +++ b/tools/caption/README.md @@ -4,7 +4,7 @@ Human labeling of videos is expensive and time-consuming. We adopt powerful imag ## PLLaVA Captioning -To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. +To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offload frame extraction to a separate process such that the GPU computations and frame extraction happen in parallel. ### Installation Install the required dependancies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "PLLaVA Captioning" sections. 
diff --git a/tools/caption/pllava_dir/caption_pllava.py b/tools/caption/pllava_dir/caption_pllava.py index 2213367..ceb0721 100644 --- a/tools/caption/pllava_dir/caption_pllava.py +++ b/tools/caption/pllava_dir/caption_pllava.py @@ -1,3 +1,17 @@ +import sys +import os +import os +from pathlib import Path + +current_file = Path(__file__) # Gets the path of the current file +fourth_level_parent = current_file.parents[3] + +datasets_dir = os.path.join(fourth_level_parent, "opensora/datasets") +import sys +sys.path.append(datasets_dir) +from read_video import read_video_av +sys.path.remove(datasets_dir) + import itertools import logging import multiprocessing as mp @@ -95,21 +109,49 @@ def get_index(num_frames, num_segments): return offsets +# def load_video(video_path, num_frames, return_msg=False, resolution=336): +# transforms = torchvision.transforms.Resize(size=resolution) +# vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) +# total_num_frames = len(vr) +# frame_indices = get_index(total_num_frames, num_frames) +# images_group = list() +# for frame_index in frame_indices: +# img = Image.fromarray(vr[frame_index].asnumpy()) +# images_group.append(transforms(img)) +# if return_msg: +# fps = float(vr.get_avg_fps()) +# sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices]) +# # " " should be added in the start and end +# msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds." 
+# return images_group, msg +# else: +# return images_group + + def load_video(video_path, num_frames, return_msg=False, resolution=336): transforms = torchvision.transforms.Resize(size=resolution) - vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) - total_num_frames = len(vr) + # vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + vframes, aframes, info = read_video_av( + video_path, + pts_unit="sec", + output_format="THWC" + ) + print(vframes.shape) + total_num_frames = len(vframes) + # print("Video path: ", video_path) + # print("Total number of frames: ", total_num_frames) frame_indices = get_index(total_num_frames, num_frames) images_group = list() for frame_index in frame_indices: - img = Image.fromarray(vr[frame_index].asnumpy()) + img = Image.fromarray(vframes[frame_index].numpy()) images_group.append(transforms(img)) if return_msg: - fps = float(vr.get_avg_fps()) - sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices]) - # " " should be added in the start and end - msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds." - return images_group, msg + # fps = float(vframes.get_avg_fps()) + # sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices]) + # # " " should be added in the start and end + # msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds." 
+ # return images_group, msg + exit('return_msg not implemented yet') else: return images_group @@ -130,7 +172,10 @@ class CSVDataset(Dataset): def __getitem__(self, idx): if idx < 0 or idx >= len(self.data_list): raise IndexError - video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION) + try: + video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION) + except: + return None return video def set_rank_and_world_size(self, rank, world_size): @@ -191,7 +236,7 @@ def parse_args(): "--error_message", type=str, required=False, - default=None, + default='error occurred during captioning', ) args = parser.parse_args() return args @@ -233,8 +278,11 @@ def infer( processor, video_list, conv_mode, - print_res=True, + print_res=False, ): + # check if any video in video_list is None, if so, raise an exception + if any([video is None for video in video_list]): + raise Exception("Video not loaded properly") conv = conv_template.copy() conv.user_query("Describe the video in details.", is_mm=True) @@ -308,7 +356,8 @@ def run(rank, args, world_size, output_queue): ) except Exception as e: logger.error(f"error in {batch}: {str(e)}") - preds = args.error_message + # preds = args.error_message duplicated for each video in the batch + preds = [args.error_message] * len(batch) result_list.extend(preds) output_queue.put((rank, result_list)) return result_list @@ -369,7 +418,7 @@ def main(): # write the dataframe to a new csv file called '*_pllava_13b_caption.csv' new_csv_path = args.csv_path.replace(".csv", "_text.csv") df.to_csv(new_csv_path, index=False) - + print(f"Results saved to {new_csv_path}") if __name__ == "__main__": main() diff --git a/tools/datasets/convert.py b/tools/datasets/convert.py index 6253e8e..fad128f 100644 --- a/tools/datasets/convert.py +++ b/tools/datasets/convert.py @@ -6,7 +6,7 @@ import pandas as pd from torchvision.datasets import ImageNet IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", 
".tiff", ".webp") -VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv") +VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".m2ts") def scan_recursively(root): diff --git a/tools/scene_cut/cut.py b/tools/scene_cut/cut.py index cf724a6..ab9bd80 100644 --- a/tools/scene_cut/cut.py +++ b/tools/scene_cut/cut.py @@ -29,15 +29,20 @@ def process_single_row(row, args): # check mp4 integrity # if not is_intact_video(video_path, logger=logger): # return False - - if "timestamp" in row: - timestamp = row["timestamp"] - if not (timestamp.startswith("[") and timestamp.endswith("]")): + try: + if "timestamp" in row: + timestamp = row["timestamp"] + if not (timestamp.startswith("[") and timestamp.endswith("]")): + return False + scene_list = eval(timestamp) + scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list] + else: + scene_list = [None] + if args.drop_invalid_timestamps: + return True + except Exception as e: + if args.drop_invalid_timestamps: return False - scene_list = eval(timestamp) - scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list] - else: - scene_list = [None] if "relpath" in row: save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"])) @@ -61,7 +66,7 @@ def process_single_row(row, args): shorter_size=shorter_size, logger=logger, ) - + return True def split_video( video_path, @@ -108,7 +113,10 @@ def split_video( fname_wo_ext = os.path.splitext(fname)[0] # TODO: fname pattern save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4") - + if os.path.exists(save_path): + # print_log(f"File '{save_path}' already exists. 
Skip.", logger=logger) + continue + # ffmpeg cmd cmd = [FFMPEG_PATH] @@ -134,7 +142,7 @@ def split_video( # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"] cmd += ["-map", "0:v", save_path] - + # print(cmd) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, stderr = proc.communicate() # stdout = stdout.decode("utf-8") @@ -159,11 +167,11 @@ def parse_args(): ) parser.add_argument("--target_fps", type=int, default=None, help="target fps of clips") parser.add_argument( - "--shorter_size", type=int, default=1080, help="resize the shorter size by keeping ratio; will not do upscale" + "--shorter_size", type=int, default=None, help="resize the shorter size by keeping ratio; will not do upscale" ) parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel") parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing") - + parser.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps") args = parser.parse_args() return args @@ -175,7 +183,7 @@ def main(): print(f"Meta file '{meta_path}' not found. 
Exit.") exit() - # create logger + # create save_dir os.makedirs(args.save_dir, exist_ok=True) # initialize pandarallel @@ -189,10 +197,13 @@ def main(): # process meta = pd.read_csv(args.meta_path) if not args.disable_parallel: - meta.parallel_apply(process_single_row_partial, axis=1) + results = meta.parallel_apply(process_single_row_partial, axis=1) else: - meta.apply(process_single_row_partial, axis=1) - - + results = meta.apply(process_single_row_partial, axis=1) + if args.drop_invalid_timestamps: + meta = meta[results] + assert args.meta_path.endswith("timestamp.csv"), "Only support *timestamp.csv" + meta.to_csv(args.meta_path.replace("timestamp.csv", "correct_timestamp.csv"), index=False) + print(f"Corrected timestamp file saved to '{args.meta_path.replace('timestamp.csv', 'correct_timestamp.csv')}'") if __name__ == "__main__": main()