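[exp] update config: add sample-ref inference config, set sample fps to 24, switch training VAE to vae-v3, rebalance mask ratios, pass num_frames to vae.decode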

2026-04-11 05:13:31 +02:00 · 2024-05-07 14:17:10 +08:00 · 2024-05-07 14:17:10 +08:00 · 36486a4bee
commit 36486a4bee
parent 10657621ef
5 changed files with 72 additions and 17 deletions
--- a/configs/opensora-v1-2/inference/sample-ref.py
+++ b/configs/opensora-v1-2/inference/sample-ref.py
@ -0,0 +1,56 @@
+image_size = (240, 426)
+num_frames = 34
+fps = 24
+frame_interval = 1
+
+prompt_path = None
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "STDiT2"
+dtype = "bf16"
+
+# Condition
+prompt = [
+    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0,0,0,0;0,0,0,1;0,0,0,2;0,0,0,3;0,0,0,4"}',
+]
+loop = 1
+condition_frame_length = 4
+
+# Define model
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderPipeline",
+    from_pretrained="pretrained_models/vae-v2",
+    micro_frame_size=17,
+    vae_2d=dict(
+        type="VideoAutoencoderKL",
+        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+        subfolder="vae",
+        micro_batch_size=4,
+        local_files_only=True,
+    ),
+    vae_temporal=dict(
+        type="VAE_Temporal_SD",
+        from_pretrained=None,
+    ),
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_discrete_timesteps=False,
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=4.5,
+)
--- a/configs/opensora-v1-2/inference/sample.py
+++ b/configs/opensora-v1-2/inference/sample.py
@ -1,6 +1,6 @@
 image_size = (240, 426)
 num_frames = 34
-fps = 30
+fps = 24
 frame_interval = 1

 prompt_path = "./assets/texts/t2v_samples.txt"
--- a/configs/opensora-v1-2/train/stage1-gc.py
+++ b/configs/opensora-v1-2/train/stage1-gc.py
@ -19,7 +19,7 @@ bucket_config = {  # 20s/it
    "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
    # ---
    "1080p": {1: (0.1, 10)},
-    # # ---
+    # ---
    "2048": {1: (0.1, 5)},
 }
 grad_checkpoint = True
@ -41,8 +41,8 @@ model = dict(
 )
 vae = dict(
    type="VideoAutoencoderPipeline",
-    from_pretrained="pretrained_models/vae-v2",
-    micro_frame_size=16,
+    from_pretrained="pretrained_models/vae-v3",
+    micro_frame_size=17,
    vae_2d=dict(
        type="VideoAutoencoderKL",
        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
@ -71,16 +71,16 @@ scheduler = dict(

 # Mask settings
 mask_ratios = {
-    "mask_random": 0.4,
+    "mask_random": 0.2,
    "mask_intepolate": 0.01,
    "mask_quarter_random": 0.01,
    "mask_quarter_head": 0.01,
    "mask_quarter_tail": 0.01,
    "mask_quarter_head_tail": 0.01,
-    "mask_image_random": 0.01,
-    "mask_image_head": 0.01,
-    "mask_image_tail": 0.01,
-    "mask_image_head_tail": 0.01,
+    "mask_image_random": 0.05,
+    "mask_image_head": 0.1,
+    "mask_image_tail": 0.05,
+    "mask_image_head_tail": 0.05,
 }

 # Log settings
--- a/configs/opensora-v1-2/train/stage1.py
+++ b/configs/opensora-v1-2/train/stage1.py
@ -36,11 +36,10 @@ model = dict(
    qk_norm=True,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
-    only_train_temporal=True,
 )
 vae = dict(
    type="VideoAutoencoderPipeline",
-    from_pretrained="pretrained_models/vae-v2",
+    from_pretrained="pretrained_models/vae-v3",
    micro_frame_size=17,
    vae_2d=dict(
        type="VideoAutoencoderKL",
@ -70,16 +69,16 @@ scheduler = dict(

 # Mask settings
 mask_ratios = {
-    "mask_random": 0.4,
+    "mask_random": 0.2,
    "mask_intepolate": 0.01,
    "mask_quarter_random": 0.01,
    "mask_quarter_head": 0.01,
    "mask_quarter_tail": 0.01,
    "mask_quarter_head_tail": 0.01,
-    "mask_image_random": 0.01,
-    "mask_image_head": 0.01,
-    "mask_image_tail": 0.01,
-    "mask_image_head_tail": 0.01,
+    "mask_image_random": 0.05,
+    "mask_image_head": 0.1,
+    "mask_image_tail": 0.05,
+    "mask_image_head_tail": 0.05,
 }

 # Log settings
--- a/scripts/inference-long.py
+++ b/scripts/inference-long.py
@ -291,7 +291,7 @@ def main():
                    additional_args=model_args,
                    mask=masks,  # scheduler must support mask
                )
-                samples = vae.decode(samples.to(dtype))
+                samples = vae.decode(samples.to(dtype), num_frames=cfg.num_frames)
                video_clips.append(samples)

                # 4.7. save video