Update training config (benchmark and stage2 bucket settings; sampler log order)

2026-04-11 05:13:31 +02:00 · 2024-04-22 12:05:59 +08:00 · 2024-04-22 12:05:59 +08:00 · 3944e93065
commit 3944e93065
parent 8931efccd7
3 changed files with 29 additions and 24 deletions
--- a/configs/opensora-v1-1/train/benchmark.py
+++ b/configs/opensora-v1-1/train/benchmark.py
@@ -18,6 +18,7 @@ dataset = dict(
 # 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used

 bucket_config = {
+    # == manual search ==
    # "240p": {128: (1.0, 2)}, # 4.28s/it
    # "240p": {64: (1.0, 4)},
    # "240p": {32: (1.0, 8)},  # 4.6s/it
@@ -32,26 +33,26 @@ bucket_config = {
    # "1024": {1: (1.0, 20)}, # 4.3s/it
    # "1080p": {1: (1.0, 16)}, # 8.6s/it
    # "1080p": {1: (1.0, 8)},  # 4.4s/it
-    "240p": {
-        16: (1.0, (2, 32)),
-        32: (1.0, (2, 16)),
-        64: (1.0, (2, 8)),
-        128: (1.0, (2, 6)),
-    },
-    "256": {1: (1.0, (128, 300))},
-    "512": {1: (0.5, (64, 128))},
-    "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
-    "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
-    "1024": {1: (0.3, (8, 64))},
-    "1080p": {1: (0.3, (2, 32))},
+    # == stage 0 ==
+    # "240p": {
+    #     16: (1.0, (2, 32)),
+    #     32: (1.0, (2, 16)),
+    #     64: (1.0, (2, 8)),
+    #     128: (1.0, (2, 6)),
+    # },
+    # "256": {1: (1.0, (128, 300))},
+    # "512": {1: (0.5, (64, 128))},
+    # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
+    # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
+    # "1024": {1: (0.3, (8, 64))},
+    # "1080p": {1: (0.3, (2, 32))},
+    # == stage 2 ==
 }
-# mask_ratios = {
-#     "mask_no": 0.0,
-#     "mask_random": 1.0,
-# }
+

 # Define acceleration
 num_workers = 4
+num_bucket_build_workers = 16
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
@@ -62,6 +63,7 @@ model = dict(
    type="STDiT2-XL/2",
    from_pretrained=None,
    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
 )
@@ -69,12 +71,14 @@ vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    micro_batch_size=4,
+    local_files_only=True,
 )
 text_encoder = dict(
    type="t5",
    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=200,
    shardformer=True,
+    local_files_only=True,
 )
 scheduler = dict(
    type="iddpm-speed",
@@ -91,6 +95,6 @@ log_every = 10
 ckpt_every = 1000
 load = None

-batch_size = 10  # only for logging
+batch_size = None
 lr = 2e-5
 grad_clip = 1.0
--- a/configs/opensora-v1-1/train/stage2.py
+++ b/configs/opensora-v1-1/train/stage2.py
@@ -7,12 +7,13 @@ dataset = dict(
    image_size=(None, None),
    transform_name="resize_crop",
 )
-bucket_config = {  # 6s/it
-    "256": {1: (1.0, 254)},
-    "240p": {16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 2)},
-    "512": {1: (0.5, 86)},
+bucket_config = {  # 7s/it
+    "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (0.8, 1)},
+    "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
+    "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
+    "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 3), 64: (0.2, 2), 128: (0.0, None)},
    "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
-    "720p": {16: (0.1, 2), 32: (0.0, None)},
+    "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
    "1024": {1: (0.3, 20)},
    "1080p": {1: (0.4, 8)},
 }
--- a/opensora/datasets/sampler.py
+++ b/opensora/datasets/sampler.py
@@ -216,10 +216,10 @@ class VariableVideoBatchSampler(DistributedSampler):
            print(f"Total training samples: {total_samples}, num buckets: {len(num_dict)}")
            print("Bucket samples:")
            pprint(num_dict)
-            print("Bucket samples by HxWxT:")
-            pprint(num_hwt_dict)
            print("Bucket samples by aspect ratio:")
            pprint(num_aspect_dict)
+            print("Bucket samples by HxWxT:")
+            pprint(num_hwt_dict)
            print(f"Number of batches: {num_batch}")
        self.approximate_num_batch = num_batch