diff --git a/docs/commands.md b/docs/commands.md
index 2c948de..d982db8 100644
--- a/docs/commands.md
+++ b/docs/commands.md
@@ -1,5 +1,6 @@
 # Commands
 
+- [Config](#Config)
 - [Inference](#inference)
   - [Inference with Open-Sora 1.2](#inference-with-open-sora-12)
   - [Inference with Open-Sora 1.1](#inference-with-open-sora-11)
@@ -12,6 +13,35 @@
   - [Training Hyperparameters](#training-hyperparameters)
 - [Search batch size for buckets](#search-batch-size-for-buckets)
 
+## Config
+Note that currently our model loading for vae and diffusion model supports two types:
+
+* load from local file path
+* load from huggingface
+
+Our config supports loading from huggingface by default.
+If you wish to load from a local path, you need to set `force_huggingface=True`, for instance:
+
+```python
+# for vae
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="/root/commonData/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+    force_huggingface=True, # NOTE: set here
+)
+# for diffusion model
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained="/root/commonData/OpenSora-STDiT-v3",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    force_huggingface=True, # NOTE: set here
+)
+```
+
 ## Inference
 
 You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).
diff --git a/eval/README.md b/eval/README.md
index 7ae6e32..261c21b 100644
--- a/eval/README.md
+++ b/eval/README.md
@@ -48,8 +48,14 @@ First, generate the relevant videos with the following commands:
 ```bash
 # vbench task, if evaluation all set start_index to 0, end_index to 2000
 bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log  -4 start_index end_index
+
 # Alternatively, launch 8 jobs at once (you must read the script to understand the details)
 bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
+
+# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
+bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
+# for example
+# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
 ```
 
 After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples.
@@ -89,6 +95,15 @@ python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/
 
 ```
 
+Similarly as VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
+
+```bash
+bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
+# for example
+# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
+# if no flow control, use "None" instead
+```
+
 ## VAE
 
 Install the dependencies package following our [installation](../docs/installation.md)'s s sections of "Evaluation Dependencies". Then, run the following evaluation command:
diff --git a/eval/sample.sh b/eval/sample.sh
index 241c229..83218bd 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -9,6 +9,10 @@ VBENCH_END_INDEX=$6
 VBENCH_RES=$7
 VBENCH_ASP_RATIO=$8
 
+NUM_SAMPLING_STEPS=$9
+FLOW=${10}
+LLM_REFINE=${11}
+
 echo "NUM_FRAMES=${NUM_FRAMES}"
 
 if [ -z "${NUM_FRAMES}" ]; then
@@ -238,10 +242,38 @@ function run_vbench() {
       --image-size $VBENCH_H $VBENCH_W \
       --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
   else
-    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
-      --prompt-path assets/texts/VBench/all_dimension.txt \
-      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
-      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    if [ -z ${NUM_SAMPLING_STEPS} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_dimension.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    else
+      if [ -z ${FLOW} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_dimension.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+      else
+        if [ -z ${LLM_REFINE} ]; then
+          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+          --prompt-path assets/texts/VBench/all_dimension.txt \
+          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
+          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+        else
+          if [ "${FLOW}" = "None" ]; then
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_dimension.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          else
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_dimension.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          fi
+        fi
+      fi
+    fi
   fi
 }
 
@@ -255,16 +287,41 @@ function run_vbench_i2v() {
     eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
       --prompt-path assets/texts/VBench/all_i2v.txt \
       --image-size $VBENCH_I2V_H $VBENCH_I2V_W \
-      --start-index $1 --end-index $2 \
-      --num-frames $NUM_FRAMES --batch-size $VBENCH_BS
+      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
   else
-    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
-      --prompt-path assets/texts/VBench/all_i2v.txt \
-      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
-      --start-index $1 --end-index $2 \
-      --num-frames $NUM_FRAMES --batch-size $VBENCH_BS
+    if [ -z ${NUM_SAMPLING_STEPS} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_i2v.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    else
+      if [ -z ${FLOW} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_i2v.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+      else
+        if [ -z ${LLM_REFINE} ]; then
+          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+          --prompt-path assets/texts/VBench/all_i2v.txt \
+          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
+          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+        else
+          if [ "${FLOW}" = "None" ]; then
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_i2v.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          else
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_i2v.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          fi
+        fi
+      fi
+    fi
   fi
-
 }
 
 ### Main
diff --git a/eval/vbench/calc_vbench.py b/eval/vbench/calc_vbench.py
index b456f28..e5570a1 100644
--- a/eval/vbench/calc_vbench.py
+++ b/eval/vbench/calc_vbench.py
@@ -8,24 +8,30 @@ from vbench import VBench
 
 full_info_path = "eval/vbench/VBench_full_info.json"
 dimensions = [
-    # Quality Score
-    "subject_consistency",
-    "background_consistency",
-    "motion_smoothness",
-    "dynamic_degree",
-    "aesthetic_quality",
-    "imaging_quality",
-    "temporal_flickering",
-    # Semantic Score
-    "object_class",
-    "multiple_objects",
-    "color",
-    "spatial_relationship",
-    "scene",
-    "temporal_style",
-    "overall_consistency",
-    "human_action",
-    "appearance_style",
+    # a: 10min
+    "subject_consistency",  # 4min
+    "imaging_quality",  # 6min
+    # b: 12min
+    "background_consistency",  # 2min
+    "motion_smoothness",  # 5min
+    "overall_consistency",  # 2min
+    "human_action",  # 3min
+    # c: 14min
+    "multiple_objects",  # 14min
+    # d: 14min
+    "spatial_relationship",  # 14min
+    # e: 12min
+    "object_class",  # 12min
+    # f: 12min
+    "color",  # 12min
+    # g: 10.5min
+    "aesthetic_quality",  # 2.5min
+    "appearance_style",  # 6min
+    "temporal_flickering",  # 2min
+    # h: 9min
+    "scene",  # 3min
+    "temporal_style",  # 2min
+    "dynamic_degree",  # 4min
 ]
 
 
diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh
index eedd9b3..e7c1165 100644
--- a/eval/vbench/launch.sh
+++ b/eval/vbench/launch.sh
@@ -6,6 +6,10 @@ MODEL_NAME=$3
 RES=$4
 ASP_RATIO=$5
 
+NUM_SAMPLING_STEPS=$6
+FLOW=$7
+LLM_REFINE=$8
+
 if [[ $CKPT == *"ema"* ]]; then
     parentdir=$(dirname $CKPT)
     CKPT_BASE=$(basename $parentdir)_ema
@@ -20,11 +24,36 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
 START_INDEX_LIST=(0 120 240 360 480 600 720 840)
 END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
 
+## Modify the following to run on multiple machines for faster results
+## 720p will take quite long on a single machine
+# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
+# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
+# LOG_BASE=$(dirname $CKPT)/eval/last_60
+# mkdir -p ${LOG_BASE}
+# echo "Logging to $LOG_BASE"
+
+
+
 for i in "${!GPUS[@]}"; do
     if [ -z ${RES} ] || [ -z ${ASP_RATIO} ]  ;
         then
             CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
         else
-            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+            if [ -z ${NUM_SAMPLING_STEPS} ];
+                then
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                else
+                    if [ -z ${FLOW} ];
+                    then
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    else
+                        if [ -z ${LLM_REFINE} ];
+                            then
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                            else
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        fi
+                    fi
+            fi
     fi
 done
diff --git a/eval/vbench/launch_calc.sh b/eval/vbench/launch_calc.sh
index 53114b9..9f14ce5 100644
--- a/eval/vbench/launch_calc.sh
+++ b/eval/vbench/launch_calc.sh
@@ -7,11 +7,10 @@ mkdir -p $LOG_BASE
 echo "Logging to $LOG_BASE"
 
 GPUS=(0 1 2 3 4 5 6 7)
-START_INDEX_LIST=(0 2 4 6 8 10 12 14)
-END_INDEX_LIST=(2 4 6 8 10 12 14 16)
+START_INDEX_LIST=(0 2 6 7 8 9 10 13)
+END_INDEX_LIST=(2 6 7 8 9 10 13 16)
 TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only
 
-
 for i in "${!GPUS[@]}"; do
     CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
 done
diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh
index d8eea1d..2b03309 100644
--- a/eval/vbench_i2v/launch.sh
+++ b/eval/vbench_i2v/launch.sh
@@ -6,6 +6,10 @@ MODEL_NAME=$3
 RES=$4
 ASP_RATIO=$5
 
+NUM_SAMPLING_STEPS=$6
+FLOW=$7
+LLM_REFINE=$8
+
 if [[ $CKPT == *"ema"* ]]; then
     parentdir=$(dirname $CKPT)
     CKPT_BASE=$(basename $parentdir)_ema
@@ -20,11 +24,27 @@ TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
 START_INDEX_LIST=(0 140 280 420 560 700 840 980)
 END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
 
+
 for i in "${!GPUS[@]}"; do
     if [ -z ${RES} ] || [ -z ${ASP_RATIO} ]  ;
         then
             CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
         else
-            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+            if [ -z ${NUM_SAMPLING_STEPS} ];
+                then
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                else
+                    if [ -z ${FLOW} ];
+                    then
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    else
+                        if [ -z ${LLM_REFINE} ];
+                            then
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                            else
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        fi
+                    fi
+            fi
     fi
 done
diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py
index bd9672d..bb71d04 100644
--- a/opensora/models/stdit/stdit3.py
+++ b/opensora/models/stdit/stdit3.py
@@ -4,6 +4,7 @@ import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+import torch.nn.functional as F
 from einops import rearrange
 from rotary_embedding_torch import RotaryEmbedding
 from timm.models.layers import DropPath
@@ -361,6 +362,19 @@ class STDiT3(PreTrainedModel):
         # === get pos embed ===
         _, _, Tx, Hx, Wx = x.size()
         T, H, W = self.get_dynamic_size(x)
+
+        # adjust for sequence parallelism
+        # we need to ensure H * W is divisible by sequence parallel size
+        # for simplicity, we can adjust the height to make it divisible
+        if self.enable_sequence_parallelism:
+            sp_size = dist.get_world_size(get_sequence_parallel_group())
+            h_pad_size = sp_size - H % sp_size
+            hx_pad_size = h_pad_size * self.patch_size[1]
+
+            # pad x along the H dimension
+            H += h_pad_size
+            x = F.pad(x, (0, 0, 0, hx_pad_size))
+
         S = H * W
         base_size = round(S**0.5)
         resolution_sq = (height[0].item() * width[0].item()) ** 0.5