diff --git a/configs/opensora-v1-2/inference/sample-ref.py b/configs/opensora-v1-2/inference/sample-ref.py new file mode 100644 index 0000000..e57e2ec --- /dev/null +++ b/configs/opensora-v1-2/inference/sample-ref.py @@ -0,0 +1,56 @@ +image_size = (240, 426) +num_frames = 34 +fps = 24 +frame_interval = 1 + +prompt_path = None +save_dir = "./samples/samples/" +seed = 42 +batch_size = 1 +multi_resolution = "STDiT2" +dtype = "bf16" + +# Condition +prompt = [ + 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0,0,0,0;0,0,0,1;0,0,0,2;0,0,0,3;0,0,0,4"}', +] +loop = 1 +condition_frame_length = 4 + +# Define model +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderPipeline", + from_pretrained="pretrained_models/vae-v2", + micro_frame_size=17, + vae_2d=dict( + type="VideoAutoencoderKL", + from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", + subfolder="vae", + micro_batch_size=4, + local_files_only=True, + ), + vae_temporal=dict( + type="VAE_Temporal_SD", + from_pretrained=None, + ), +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + local_files_only=True, +) +scheduler = dict( + type="rflow", + use_discrete_timesteps=False, + use_timestep_transform=True, + num_sampling_steps=30, + cfg_scale=4.5, +) diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py index a4d1191..ae0b7ca 100644 --- a/configs/opensora-v1-2/inference/sample.py +++ b/configs/opensora-v1-2/inference/sample.py @@ -1,6 +1,6 @@ image_size = (240, 426) num_frames = 34 -fps = 30 +fps = 24 frame_interval = 1 prompt_path = "./assets/texts/t2v_samples.txt" diff --git a/configs/opensora-v1-2/train/stage1-gc.py b/configs/opensora-v1-2/train/stage1-gc.py index eb73c2f..3b98f78 100644 --- a/configs/opensora-v1-2/train/stage1-gc.py +++ b/configs/opensora-v1-2/train/stage1-gc.py @@ -19,7 +19,7 @@ bucket_config = { # 20s/it "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)}, # --- "1080p": {1: (0.1, 10)}, - # # --- + # --- "2048": {1: (0.1, 5)}, } grad_checkpoint = True @@ -41,8 +41,8 @@ model = dict( ) vae = dict( type="VideoAutoencoderPipeline", - from_pretrained="pretrained_models/vae-v2", - micro_frame_size=16, + from_pretrained="pretrained_models/vae-v3", + micro_frame_size=17, vae_2d=dict( type="VideoAutoencoderKL", from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", @@ -71,16 +71,16 @@ scheduler = dict( # Mask settings mask_ratios = { - "mask_random": 0.4, + "mask_random": 0.2, "mask_intepolate": 0.01, "mask_quarter_random": 0.01, "mask_quarter_head": 0.01, "mask_quarter_tail": 0.01, "mask_quarter_head_tail": 0.01, - "mask_image_random": 0.01, - "mask_image_head": 0.01, - "mask_image_tail": 0.01, - "mask_image_head_tail": 0.01, + "mask_image_random": 0.05, + "mask_image_head": 0.1, + "mask_image_tail": 0.05, + "mask_image_head_tail": 0.05, } # Log settings diff --git a/configs/opensora-v1-2/train/stage1.py b/configs/opensora-v1-2/train/stage1.py index 605b4c7..2a05262 100644 --- a/configs/opensora-v1-2/train/stage1.py +++ b/configs/opensora-v1-2/train/stage1.py @@ -36,11 +36,10 @@ model = dict( qk_norm=True, enable_flashattn=True, enable_layernorm_kernel=True, - only_train_temporal=True, ) vae = dict( type="VideoAutoencoderPipeline", - from_pretrained="pretrained_models/vae-v2", + from_pretrained="pretrained_models/vae-v3", micro_frame_size=17, vae_2d=dict( type="VideoAutoencoderKL", @@ -70,16 +69,16 @@ scheduler = dict( # Mask settings mask_ratios = { - "mask_random": 0.4, + "mask_random": 0.2, "mask_intepolate": 0.01, "mask_quarter_random": 0.01, "mask_quarter_head": 0.01, "mask_quarter_tail": 0.01, "mask_quarter_head_tail": 0.01, - "mask_image_random": 0.01, - "mask_image_head": 0.01, - "mask_image_tail": 0.01, - "mask_image_head_tail": 0.01, + "mask_image_random": 0.05, + "mask_image_head": 0.1, + "mask_image_tail": 0.05, + "mask_image_head_tail": 0.05, } # Log settings diff --git a/scripts/inference-long.py b/scripts/inference-long.py index 81b996d..e1b7d1d 100644 --- a/scripts/inference-long.py +++ b/scripts/inference-long.py @@ -291,7 +291,7 @@ def main(): additional_args=model_args, mask=masks, # scheduler must support mask ) - samples = vae.decode(samples.to(dtype)) + samples = vae.decode(samples.to(dtype), num_frames=cfg.num_frames) video_clips.append(samples) # 4.7. save video