complete masked demo

2026-05-20 17:35:58 +02:00 · 2024-03-24 15:09:42 +08:00 · 2024-03-24 15:09:42 +08:00 · 7d478f5094
commit 7d478f5094
parent dcbacb03a6
5 changed files with 66 additions and 8 deletions
--- a/configs/opensora/inference-long/16x512x512-annimate.py
+++ b/configs/opensora/inference-long/16x512x512-annimate.py
@ -35,13 +35,13 @@ dtype = "fp16"
 prompt_path = None
 prompt = [
    "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
-    "A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.",
+    "Pirate ship in a cosmic maelstrom nebula.",
 ]

 loop = 1
 # condition_frame_length = 4
-reference_path = ["assets/images/condition/cliff.png"]
-mask_strategy = ["0,0,0,1,0"]  # valid when reference_path is not None
+reference_path = ["assets/images/condition/cliff.png", "assets/images/condition/ship.png"]
+mask_strategy = ["0,0,0,1,0", "0,0,0,1,0"]  # valid when reference_path is not None
 # (loop id, ref id, ref start, length, target start)

 # Others
--- a/configs/opensora/inference-long/16x512x512-connect.py
+++ b/configs/opensora/inference-long/16x512x512-connect.py
@ -0,0 +1,53 @@
+# scripts/inference_long.py
+num_frames = 16
+fps = 24 // 3
+image_size = (512, 512)
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    use_x_mask=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+    from_pretrained=None,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="iddpm",
+    # type="dpm-solver",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# Condition
+prompt_path = None
+prompt = [
+    "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
+    "A sad small cactus with in the Sahara desert becomes happy.",
+]
+
+loop = 1
+condition_frame_length = 4
+reference_path = [
+    "assets/images/condition/cliff.png",
+    "assets/images/condition/cactus-sad.png;assets/images/condition/cactus-happy.png",
+]
+mask_strategy = ["0,0,0,1,0;0,0,0,1,-1", "0,0,0,1,0;0,1,0,1,-1"]  # valid when reference_path is not None
+# (loop id, ref id, ref start, length, target start)
+
+# Others
+batch_size = 2
+seed = 42
+save_dir = "./outputs/samples/"
--- a/configs/opensora/inference-long/16x512x512-extend.py
+++ b/configs/opensora/inference-long/16x512x512-extend.py
@ -34,13 +34,14 @@ dtype = "fp16"
 # Condition
 prompt_path = None
 prompt = [
+    "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave."
 ]

-loop = 5
+loop = 10
 condition_frame_length = 4
-reference_path = ["assets/images/condition/wave.png"]
-mask_strategy = ["0,0,0,1,0"]  # valid when reference_path is not None
+reference_path = ["assets/images/condition/cliff.png", "assets/images/condition/wave.png"]
+mask_strategy = ["0,0,0,1,0", "0,0,0,1,0"]  # valid when reference_path is not None
 # (loop id, ref id, ref start, length, target start)

 # Others
--- a/configs/pixart/inference/1x512x512.py
+++ b/configs/pixart/inference/1x512x512.py
@ -16,7 +16,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
 )
 scheduler = dict(
@ -28,7 +28,9 @@ dtype = "fp16"

 # prompt_path = "./assets/texts/t2i_samples.txt"
 prompt = [
-    "A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about.",
+    "Pirate ship trapped in a cosmic maelstrom nebula.",
+    "A small cactus with a happy face in the Sahara desert.",
+    "A small cactus with a sad face in the Sahara desert.",
 ]

 # Others
--- a/opensora/models/pixart/pixart.py
+++ b/opensora/models/pixart/pixart.py
@ -132,8 +132,10 @@ class PixArt(nn.Module):
        time_scale=1.0,
        enable_flashattn=False,
        enable_layernorm_kernel=False,
+        enable_sequence_parallelism=False,
    ):
        super().__init__()
+        assert enable_sequence_parallelism is False, "Sequence parallelism is not supported in this version."
        self.pred_sigma = pred_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if pred_sigma else in_channels