diff --git a/configs/opensora-v1-1/inference/sample-ref.py b/configs/opensora-v1-1/inference/sample-ref.py
index ae07486..75bd377 100644
--- a/configs/opensora-v1-1/inference/sample-ref.py
+++ b/configs/opensora-v1-1/inference/sample-ref.py
@@ -32,6 +32,7 @@ model = dict(
     type="STDiT2-XL/2",
     from_pretrained=None,
     input_sq_size=512,
+    qk_norm=True,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
diff --git a/configs/opensora-v1-1/inference/sample.py b/configs/opensora-v1-1/inference/sample.py
index 6e20906..cec8073 100644
--- a/configs/opensora-v1-1/inference/sample.py
+++ b/configs/opensora-v1-1/inference/sample.py
@@ -9,7 +9,7 @@ model = dict(
     type="STDiT2-XL/2",
     from_pretrained=None,
     input_sq_size=512,
-    # qk_norm=True,
+    qk_norm=True,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
diff --git a/configs/opensora-v1-1/train/stage1-2.py b/configs/opensora-v1-1/train/stage1.py
similarity index 100%
rename from configs/opensora-v1-1/train/stage1-2.py
rename to configs/opensora-v1-1/train/stage1.py
diff --git a/configs/opensora-v1-1/train/stage1-1.py b/configs/opensora-v1-1/train/stage2.py
similarity index 78%
rename from configs/opensora-v1-1/train/stage1-1.py
rename to configs/opensora-v1-1/train/stage2.py
index f9b1b60..dc03935 100644
--- a/configs/opensora-v1-1/train/stage1-1.py
+++ b/configs/opensora-v1-1/train/stage2.py
@@ -17,15 +17,15 @@ bucket_config = {  # 6s/it
     "1080p": {1: (0.4, 8)},
 }
 mask_ratios = {
-    "mask_no": 0.9,
-    "mask_quarter_random": 0.01,
-    "mask_quarter_head": 0.01,
-    "mask_quarter_tail": 0.01,
-    "mask_quarter_head_tail": 0.02,
-    "mask_image_random": 0.01,
-    "mask_image_head": 0.01,
-    "mask_image_tail": 0.01,
-    "mask_image_head_tail": 0.02,
+    "mask_no": 0.75,
+    "mask_quarter_random": 0.025,
+    "mask_quarter_head": 0.025,
+    "mask_quarter_tail": 0.025,
+    "mask_quarter_head_tail": 0.05,
+    "mask_image_random": 0.025,
+    "mask_image_head": 0.025,
+    "mask_image_tail": 0.025,
+    "mask_image_head_tail": 0.05,
 }
 
 # Define acceleration
@@ -41,6 +41,7 @@ model = dict(
     type="STDiT2-XL/2",
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
@@ -48,15 +49,17 @@ vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="stabilityai/sd-vae-ft-ema",
     micro_batch_size=4,
+    local_files_only=True,
 )
 text_encoder = dict(
     type="t5",
     from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=200,
     shardformer=True,
+    local_files_only=True,
 )
 scheduler = dict(
-    type="iddpm-speed",
+    type="iddpm",
     timestep_respacing="",
 )
 
diff --git a/eval/launch.sh b/eval/launch.sh
index 8716d23..305ea76 100644
--- a/eval/launch.sh
+++ b/eval/launch.sh
@@ -14,23 +14,23 @@ LOG_BASE=logs/sample/$CKPT_BASE
 echo "Logging to $LOG_BASE"
 
 # == sample & human evaluation ==
-# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -1 >${LOG_BASE}_1.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -3 >${LOG_BASE}_3.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -2a >${LOG_BASE}_2a.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -2b >${LOG_BASE}_2b.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -2c >${LOG_BASE}_2c.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -2d >${LOG_BASE}_2d.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -2e >${LOG_BASE}_2e.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -2f >${LOG_BASE}_2f.log 2>&1 &
+CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -1 >${LOG_BASE}_1.log 2>&1 &
+CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -3 >${LOG_BASE}_3.log 2>&1 &
+CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -2a >${LOG_BASE}_2a.log 2>&1 &
+CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -2b >${LOG_BASE}_2b.log 2>&1 &
+CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -2c >${LOG_BASE}_2c.log 2>&1 &
+CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -2d >${LOG_BASE}_2d.log 2>&1 &
+CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -2e >${LOG_BASE}_2e.log 2>&1 &
+CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -2f >${LOG_BASE}_2f.log 2>&1 &
 
 # == vbench ==
-CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -4a >${LOG_BASE}_4a.log 2>&1 &
-CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -4b >${LOG_BASE}_4b.log 2>&1 &
-CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -4c >${LOG_BASE}_4c.log 2>&1 &
-CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -4d >${LOG_BASE}_4d.log 2>&1 &
-CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -4e >${LOG_BASE}_4e.log 2>&1 &
-CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -4f >${LOG_BASE}_4f.log 2>&1 &
-CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -4g >${LOG_BASE}_4g.log 2>&1 &
-CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -4h >${LOG_BASE}_4h.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -4a >${LOG_BASE}_4a.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -4b >${LOG_BASE}_4b.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -4c >${LOG_BASE}_4c.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -4d >${LOG_BASE}_4d.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -4e >${LOG_BASE}_4e.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -4f >${LOG_BASE}_4f.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -4g >${LOG_BASE}_4g.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -4h >${LOG_BASE}_4h.log 2>&1 &
 
 # kill all by: pkill -f "inference"
diff --git a/eval/sample.sh b/eval/sample.sh
index 396061f..1f0f34e 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-set -x
+# set -x
 set -e
 
 CKPT=$1
@@ -86,7 +86,7 @@ function run_video_c() { # 30min
     eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16 --image-size 240 426 --sample-name sora_16x240x426
 
     # 2.3.2 128x240x426
-    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 128 --image-size 240 426 --sample-name short_128x240x426
+    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 48 --image-size 256 256 --sample-name sora_48x256x256
 }
 
 function run_video_d() { # 30min
diff --git a/opensora/utils/ckpt_utils.py b/opensora/utils/ckpt_utils.py
index 086742f..746f42e 100644
--- a/opensora/utils/ckpt_utils.py
+++ b/opensora/utils/ckpt_utils.py
@@ -256,7 +256,7 @@ def create_logger(logging_dir):
     return logger
 
 
-def load_checkpoint(model, ckpt_path, save_as_pt=True):
+def load_checkpoint(model, ckpt_path, save_as_pt=False):
     if ckpt_path.endswith(".pt") or ckpt_path.endswith(".pth"):
         state_dict = find_model(ckpt_path, model=model)
         missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)