diff --git a/configs/opensora-v1-1/train/benchmark.py b/configs/opensora-v1-1/train/benchmark.py
index 18b1c4a..eb87531 100644
--- a/configs/opensora-v1-1/train/benchmark.py
+++ b/configs/opensora-v1-1/train/benchmark.py
@@ -18,6 +18,7 @@ dataset = dict(
 # 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
 
 bucket_config = {
+    # == manual search ==
     # "240p": {128: (1.0, 2)}, # 4.28s/it
     # "240p": {64: (1.0, 4)},
     # "240p": {32: (1.0, 8)}, # 4.6s/it
@@ -32,26 +33,26 @@ bucket_config = {
     # "1024": {1: (1.0, 20)}, # 4.3s/it
     # "1080p": {1: (1.0, 16)}, # 8.6s/it
     # "1080p": {1: (1.0, 8)}, # 4.4s/it
-    "240p": {
-        16: (1.0, (2, 32)),
-        32: (1.0, (2, 16)),
-        64: (1.0, (2, 8)),
-        128: (1.0, (2, 6)),
-    },
-    "256": {1: (1.0, (128, 300))},
-    "512": {1: (0.5, (64, 128))},
-    "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
-    "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
-    "1024": {1: (0.3, (8, 64))},
-    "1080p": {1: (0.3, (2, 32))},
+    # == stage 0 ==
+    # "240p": {
+    #     16: (1.0, (2, 32)),
+    #     32: (1.0, (2, 16)),
+    #     64: (1.0, (2, 8)),
+    #     128: (1.0, (2, 6)),
+    # },
+    # "256": {1: (1.0, (128, 300))},
+    # "512": {1: (0.5, (64, 128))},
+    # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
+    # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
+    # "1024": {1: (0.3, (8, 64))},
+    # "1080p": {1: (0.3, (2, 32))},
+    # == stage 2 ==
 }
 
-# mask_ratios = {
-#     "mask_no": 0.0,
-#     "mask_random": 1.0,
-# }
+
 # Define acceleration
 num_workers = 4
+num_bucket_build_workers = 16
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
@@ -62,6 +63,7 @@ model = dict(
     type="STDiT2-XL/2",
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
@@ -69,12 +71,14 @@ vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="stabilityai/sd-vae-ft-ema",
     micro_batch_size=4,
+    local_files_only=True,
 )
 text_encoder = dict(
     type="t5",
     from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=200,
     shardformer=True,
+    local_files_only=True,
 )
 scheduler = dict(
     type="iddpm-speed",
@@ -91,6 +95,6 @@ log_every = 10
 ckpt_every = 1000
 load = None
 
-batch_size = 10  # only for logging
+batch_size = None
 lr = 2e-5
 grad_clip = 1.0
diff --git a/configs/opensora-v1-1/train/stage2.py b/configs/opensora-v1-1/train/stage2.py
index dc03935..bc5923a 100644
--- a/configs/opensora-v1-1/train/stage2.py
+++ b/configs/opensora-v1-1/train/stage2.py
@@ -7,12 +7,13 @@ dataset = dict(
     image_size=(None, None),
     transform_name="resize_crop",
 )
-bucket_config = {  # 6s/it
-    "256": {1: (1.0, 254)},
-    "240p": {16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 2)},
-    "512": {1: (0.5, 86)},
+bucket_config = {  # 7s/it
+    "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (0.8, 1)},
+    "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
+    "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
+    "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 3), 64: (0.2, 2), 128: (0.0, None)},
     "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
-    "720p": {16: (0.1, 2), 32: (0.0, None)},
+    "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
     "1024": {1: (0.3, 20)},
     "1080p": {1: (0.4, 8)},
 }
diff --git a/opensora/datasets/sampler.py b/opensora/datasets/sampler.py
index 535b60d..bdb1b76 100644
--- a/opensora/datasets/sampler.py
+++ b/opensora/datasets/sampler.py
@@ -216,10 +216,10 @@ class VariableVideoBatchSampler(DistributedSampler):
             print(f"Total training samples: {total_samples}, num buckets: {len(num_dict)}")
             print("Bucket samples:")
             pprint(num_dict)
-            print("Bucket samples by HxWxT:")
-            pprint(num_hwt_dict)
             print("Bucket samples by aspect ratio:")
             pprint(num_aspect_dict)
+            print("Bucket samples by HxWxT:")
+            pprint(num_hwt_dict)
             print(f"Number of batches: {num_batch}")
 
         self.approximate_num_batch = num_batch