diff --git a/configs/opensora-v1-1/train/benchmark.py b/configs/opensora-v1-1/train/benchmark.py
index 18b1c4a..eb87531 100644
--- a/configs/opensora-v1-1/train/benchmark.py
+++ b/configs/opensora-v1-1/train/benchmark.py
@@ -18,6 +18,7 @@ dataset = dict(
 # 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
 
 bucket_config = {
+    # == manual search ==
     # "240p": {128: (1.0, 2)}, # 4.28s/it
     # "240p": {64: (1.0, 4)},
     # "240p": {32: (1.0, 8)},  # 4.6s/it
@@ -32,26 +33,26 @@ bucket_config = {
     # "1024": {1: (1.0, 20)}, # 4.3s/it
     # "1080p": {1: (1.0, 16)}, # 8.6s/it
     # "1080p": {1: (1.0, 8)},  # 4.4s/it
-    "240p": {
-        16: (1.0, (2, 32)),
-        32: (1.0, (2, 16)),
-        64: (1.0, (2, 8)),
-        128: (1.0, (2, 6)),
-    },
-    "256": {1: (1.0, (128, 300))},
-    "512": {1: (0.5, (64, 128))},
-    "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
-    "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
-    "1024": {1: (0.3, (8, 64))},
-    "1080p": {1: (0.3, (2, 32))},
+    # == stage 0 ==
+    # "240p": {
+    #     16: (1.0, (2, 32)),
+    #     32: (1.0, (2, 16)),
+    #     64: (1.0, (2, 8)),
+    #     128: (1.0, (2, 6)),
+    # },
+    # "256": {1: (1.0, (128, 300))},
+    # "512": {1: (0.5, (64, 128))},
+    # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
+    # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
+    # "1024": {1: (0.3, (8, 64))},
+    # "1080p": {1: (0.3, (2, 32))},
+    # == stage 2 == (no entries yet -- TODO: add buckets or drop this header)
 }
-# mask_ratios = {
-#     "mask_no": 0.0,
-#     "mask_random": 1.0,
-# }
+
 
 # Define acceleration
 num_workers = 4
+num_bucket_build_workers = 16
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
@@ -62,6 +63,7 @@ model = dict(
     type="STDiT2-XL/2",
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
@@ -69,12 +71,14 @@ vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="stabilityai/sd-vae-ft-ema",
     micro_batch_size=4,
+    local_files_only=True,
 )
 text_encoder = dict(
     type="t5",
     from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=200,
     shardformer=True,
+    local_files_only=True,
 )
 scheduler = dict(
     type="iddpm-speed",
@@ -91,6 +95,6 @@ log_every = 10
 ckpt_every = 1000
 load = None
 
-batch_size = 10  # only for logging
+batch_size = None
 lr = 2e-5
 grad_clip = 1.0
diff --git a/configs/opensora-v1-1/train/stage2.py b/configs/opensora-v1-1/train/stage2.py
index dc03935..bc5923a 100644
--- a/configs/opensora-v1-1/train/stage2.py
+++ b/configs/opensora-v1-1/train/stage2.py
@@ -7,12 +7,13 @@ dataset = dict(
     image_size=(None, None),
     transform_name="resize_crop",
 )
-bucket_config = {  # 6s/it
-    "256": {1: (1.0, 254)},
-    "240p": {16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 2)},
-    "512": {1: (0.5, 86)},
+bucket_config = {  # 7s/it
+    "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (0.8, 1)},
+    "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
+    "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
+    "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 3), 64: (0.2, 2), 128: (0.0, None)},
     "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
-    "720p": {16: (0.1, 2), 32: (0.0, None)},
+    "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
     "1024": {1: (0.3, 20)},
     "1080p": {1: (0.4, 8)},
 }
diff --git a/opensora/datasets/sampler.py b/opensora/datasets/sampler.py
index 535b60d..bdb1b76 100644
--- a/opensora/datasets/sampler.py
+++ b/opensora/datasets/sampler.py
@@ -216,10 +216,10 @@ class VariableVideoBatchSampler(DistributedSampler):
             print(f"Total training samples: {total_samples}, num buckets: {len(num_dict)}")
             print("Bucket samples:")
             pprint(num_dict)
-            print("Bucket samples by HxWxT:")
-            pprint(num_hwt_dict)
             print("Bucket samples by aspect ratio:")
             pprint(num_aspect_dict)
+            print("Bucket samples by HxWxT:")
+            pprint(num_hwt_dict)
             print(f"Number of batches: {num_batch}")
         self.approximate_num_batch = num_batch