diff --git a/configs/opensora-v1-2/inference/sample-ref.py b/configs/opensora-v1-2/inference/sample-ref.py
new file mode 100644
index 0000000..e57e2ec
--- /dev/null
+++ b/configs/opensora-v1-2/inference/sample-ref.py
@@ -0,0 +1,56 @@
+image_size = (240, 426)
+num_frames = 34
+fps = 24
+frame_interval = 1
+
+prompt_path = None
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "STDiT2"
+dtype = "bf16"
+
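+# Condition: the reference image path and mask strategy are embedded as a JSON suffix inside each prompt string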
+prompt = [
+    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0,0,0,0;0,0,0,1;0,0,0,2;0,0,0,3;0,0,0,4"}',
+]
+loop = 1
+condition_frame_length = 4
+
+# Define model, VAE, text encoder, and scheduler
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderPipeline",
+    from_pretrained="pretrained_models/vae-v2",
+    micro_frame_size=17,
+    vae_2d=dict(
+        type="VideoAutoencoderKL",
+        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+        subfolder="vae",
+        micro_batch_size=4,
+        local_files_only=True,
+    ),
+    vae_temporal=dict(
+        type="VAE_Temporal_SD",
+        from_pretrained=None,
+    ),
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_discrete_timesteps=False,
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=4.5,
+)
diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py
index a4d1191..ae0b7ca 100644
--- a/configs/opensora-v1-2/inference/sample.py
+++ b/configs/opensora-v1-2/inference/sample.py
@@ -1,6 +1,6 @@
 image_size = (240, 426)
 num_frames = 34
-fps = 30
+fps = 24
 frame_interval = 1
 
 prompt_path = "./assets/texts/t2v_samples.txt"
diff --git a/configs/opensora-v1-2/train/stage1-gc.py b/configs/opensora-v1-2/train/stage1-gc.py
index eb73c2f..3b98f78 100644
--- a/configs/opensora-v1-2/train/stage1-gc.py
+++ b/configs/opensora-v1-2/train/stage1-gc.py
@@ -19,7 +19,7 @@ bucket_config = {  # 20s/it
     "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
     # ---
     "1080p": {1: (0.1, 10)},
-    # # ---
+    # ---
     "2048": {1: (0.1, 5)},
 }
 grad_checkpoint = True
@@ -41,8 +41,8 @@ model = dict(
 )
 vae = dict(
     type="VideoAutoencoderPipeline",
-    from_pretrained="pretrained_models/vae-v2",
-    micro_frame_size=16,
+    from_pretrained="pretrained_models/vae-v3",
+    micro_frame_size=17,
     vae_2d=dict(
         type="VideoAutoencoderKL",
         from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
@@ -71,16 +71,16 @@ scheduler = dict(
 
 # Mask settings
 mask_ratios = {
-    "mask_random": 0.4,
+    "mask_random": 0.2,
     "mask_intepolate": 0.01,
     "mask_quarter_random": 0.01,
     "mask_quarter_head": 0.01,
     "mask_quarter_tail": 0.01,
     "mask_quarter_head_tail": 0.01,
-    "mask_image_random": 0.01,
-    "mask_image_head": 0.01,
-    "mask_image_tail": 0.01,
-    "mask_image_head_tail": 0.01,
+    "mask_image_random": 0.05,
+    "mask_image_head": 0.1,
+    "mask_image_tail": 0.05,
+    "mask_image_head_tail": 0.05,
 }
 
 # Log settings
diff --git a/configs/opensora-v1-2/train/stage1.py b/configs/opensora-v1-2/train/stage1.py
index 605b4c7..2a05262 100644
--- a/configs/opensora-v1-2/train/stage1.py
+++ b/configs/opensora-v1-2/train/stage1.py
@@ -36,11 +36,10 @@ model = dict(
     qk_norm=True,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
-    only_train_temporal=True,
 )
 vae = dict(
     type="VideoAutoencoderPipeline",
-    from_pretrained="pretrained_models/vae-v2",
+    from_pretrained="pretrained_models/vae-v3",
     micro_frame_size=17,
     vae_2d=dict(
         type="VideoAutoencoderKL",
@@ -70,16 +69,16 @@ scheduler = dict(
 
 # Mask settings
 mask_ratios = {
-    "mask_random": 0.4,
+    "mask_random": 0.2,
     "mask_intepolate": 0.01,
     "mask_quarter_random": 0.01,
     "mask_quarter_head": 0.01,
     "mask_quarter_tail": 0.01,
     "mask_quarter_head_tail": 0.01,
-    "mask_image_random": 0.01,
-    "mask_image_head": 0.01,
-    "mask_image_tail": 0.01,
-    "mask_image_head_tail": 0.01,
+    "mask_image_random": 0.05,
+    "mask_image_head": 0.1,
+    "mask_image_tail": 0.05,
+    "mask_image_head_tail": 0.05,
 }
 
 # Log settings
diff --git a/scripts/inference-long.py b/scripts/inference-long.py
index 81b996d..e1b7d1d 100644
--- a/scripts/inference-long.py
+++ b/scripts/inference-long.py
@@ -291,7 +291,7 @@ def main():
                     additional_args=model_args,
                     mask=masks,  # scheduler must support mask
                 )
-                samples = vae.decode(samples.to(dtype))
+                samples = vae.decode(samples.to(dtype), num_frames=cfg.num_frames)
                 video_clips.append(samples)
 
                 # 4.7. save video