From 7abee23c4ccbf10f25267c18bda7a7992daeacbf Mon Sep 17 00:00:00 2001
From: Shreyangshu <79297450+sbera7@users.noreply.github.com>
Date: Mon, 25 Mar 2024 09:15:56 +0530
Subject: [PATCH 1/4] install gradio using pip before running the demo.py file
 (#202)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5094c31..c00826a 100644
--- a/README.md
+++ b/README.md
@@ -162,6 +162,7 @@ on improving the quality and text alignment.
 We have provided a Gradio application in this repository, you can use the following the command to start an interactive web application to experience video generation with Open-Sora.
 
 ```bash
+pip install gradio
 python scripts/demo.py
 ```
 

From b454751f8f02f395e9710bb1c34774dcdf65bb20 Mon Sep 17 00:00:00 2001
From: "Zheng Zangwei (Alex Zheng)" <zangwei@comp.nus.edu.sg>
Date: Mon, 25 Mar 2024 13:19:11 +0800
Subject: [PATCH 2/4] Docs/fix (#211)

* fix #210

* fix #209
---
 configs/opensora/train/16x512x512.py    | 2 +-
 configs/opensora/train/360x512x512.py   | 2 +-
 configs/opensora/train/64x512x512-sp.py | 2 +-
 configs/opensora/train/64x512x512.py    | 2 +-
 configs/pixart/inference/16x256x256.py  | 2 +-
 configs/pixart/inference/1x1024MS.py    | 2 +-
 configs/pixart/inference/1x256x256.py   | 2 +-
 configs/pixart/inference/1x512x512.py   | 2 +-
 configs/pixart/train/16x256x256.py      | 2 +-
 configs/pixart/train/1x512x512.py       | 2 +-
 configs/pixart/train/64x512x512.py      | 2 +-
 docs/report_v1.md                       | 2 +-
 docs/structure.md                       | 4 ++--
 docs/zh_CN/structure.md                 | 4 ++--
 14 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/configs/opensora/train/16x512x512.py b/configs/opensora/train/16x512x512.py
index 885aad1..22031dc 100644
--- a/configs/opensora/train/16x512x512.py
+++ b/configs/opensora/train/16x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
diff --git a/configs/opensora/train/360x512x512.py b/configs/opensora/train/360x512x512.py
index 7a6f759..0f4e170 100644
--- a/configs/opensora/train/360x512x512.py
+++ b/configs/opensora/train/360x512x512.py
@@ -31,7 +31,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
diff --git a/configs/opensora/train/64x512x512-sp.py b/configs/opensora/train/64x512x512-sp.py
index b0b9062..f9b138e 100644
--- a/configs/opensora/train/64x512x512-sp.py
+++ b/configs/opensora/train/64x512x512-sp.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
diff --git a/configs/opensora/train/64x512x512.py b/configs/opensora/train/64x512x512.py
index dfcdcc0..81162de 100644
--- a/configs/opensora/train/64x512x512.py
+++ b/configs/opensora/train/64x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
diff --git a/configs/pixart/inference/16x256x256.py b/configs/pixart/inference/16x256x256.py
index 6fc8ee6..fed26f5 100644
--- a/configs/pixart/inference/16x256x256.py
+++ b/configs/pixart/inference/16x256x256.py
@@ -15,7 +15,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
 )
 scheduler = dict(
diff --git a/configs/pixart/inference/1x1024MS.py b/configs/pixart/inference/1x1024MS.py
index 41cc97a..3b34ebf 100644
--- a/configs/pixart/inference/1x1024MS.py
+++ b/configs/pixart/inference/1x1024MS.py
@@ -17,7 +17,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
 )
 scheduler = dict(
diff --git a/configs/pixart/inference/1x256x256.py b/configs/pixart/inference/1x256x256.py
index 11e06d7..44ce10b 100644
--- a/configs/pixart/inference/1x256x256.py
+++ b/configs/pixart/inference/1x256x256.py
@@ -16,7 +16,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
 )
 scheduler = dict(
diff --git a/configs/pixart/inference/1x512x512.py b/configs/pixart/inference/1x512x512.py
index 5674259..243af4a 100644
--- a/configs/pixart/inference/1x512x512.py
+++ b/configs/pixart/inference/1x512x512.py
@@ -16,7 +16,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
 )
 scheduler = dict(
diff --git a/configs/pixart/train/16x256x256.py b/configs/pixart/train/16x256x256.py
index b47731e..dc91937 100644
--- a/configs/pixart/train/16x256x256.py
+++ b/configs/pixart/train/16x256x256.py
@@ -29,7 +29,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
diff --git a/configs/pixart/train/1x512x512.py b/configs/pixart/train/1x512x512.py
index 619c9aa..ec73b8c 100644
--- a/configs/pixart/train/1x512x512.py
+++ b/configs/pixart/train/1x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
diff --git a/configs/pixart/train/64x512x512.py b/configs/pixart/train/64x512x512.py
index 628cf25..deebdf6 100644
--- a/configs/pixart/train/64x512x512.py
+++ b/configs/pixart/train/64x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
diff --git a/docs/report_v1.md b/docs/report_v1.md
index b3b8073..edb3644 100644
--- a/docs/report_v1.md
+++ b/docs/report_v1.md
@@ -14,7 +14,7 @@ As shown in the figure, we insert a temporal attention right after each spatial
 
 To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained high-quality image generation model with T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of inserted temporal attention with zero. This initialization preserves model's ability of image generation at beginning, while Latte's architecture cannot. The inserted attention increases the number of parameter from 580M to 724M.
 
-![Architecture](https://i0.imgs.ovh/2024/03/16/erC1d.png)
+![Architecture](https://image.jiqizhixin.com/uploads/editor/ff49eaba-6b19-43d7-b65d-ad2ecdb9d555/640.jpeg)
 
 Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on 366K pretraining datasets, and then 16x256x256, 16x512x512, and 64x512x512 on 20K datasets. With scaled position embedding, this strategy greatly reduces the computational cost.
 
diff --git a/docs/structure.md b/docs/structure.md
index 2d83620..3eb7c69 100644
--- a/docs/structure.md
+++ b/docs/structure.md
@@ -100,7 +100,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",                 # Select text encoder type (t5, clip)
-    from_pretrained="./pretrained_models/t5_ckpts", # Load from pretrained text encoder
+    from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
     model_max_length=120,      # Maximum length of input text
 )
 scheduler = dict(
@@ -153,7 +153,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,           # Enable shardformer for T5 acceleration
 )
diff --git a/docs/zh_CN/structure.md b/docs/zh_CN/structure.md
index 2d83620..3eb7c69 100644
--- a/docs/zh_CN/structure.md
+++ b/docs/zh_CN/structure.md
@@ -100,7 +100,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",                 # Select text encoder type (t5, clip)
-    from_pretrained="./pretrained_models/t5_ckpts", # Load from pretrained text encoder
+    from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
     model_max_length=120,      # Maximum length of input text
 )
 scheduler = dict(
@@ -153,7 +153,7 @@ vae = dict(
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,           # Enable shardformer for T5 acceleration
 )

From 1a913cd21b3002a1334f370cb18f166abb86a312 Mon Sep 17 00:00:00 2001
From: "Zheng Zangwei (Alex Zheng)" <zangwei@comp.nus.edu.sg>
Date: Mon, 25 Mar 2024 13:30:49 +0800
Subject: [PATCH 3/4] Docs/fix (#213)

* fix #210

* fix #209

* fix #188
---
 configs/opensora/inference/16x256x256.py | 1 +
 docs/structure.md                        | 1 +
 opensora/schedulers/iddpm/__init__.py    | 6 ++++--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/configs/opensora/inference/16x256x256.py b/configs/opensora/inference/16x256x256.py
index db6f2e4..7679e0b 100644
--- a/configs/opensora/inference/16x256x256.py
+++ b/configs/opensora/inference/16x256x256.py
@@ -25,6 +25,7 @@ scheduler = dict(
     type="iddpm",
     num_sampling_steps=100,
     cfg_scale=7.0,
+    cfg_channel=3,  # number of leading output channels guided by CFG; None = half of the model's output channels
 )
 dtype = "fp16"
 
diff --git a/docs/structure.md b/docs/structure.md
index 3eb7c69..9978c33 100644
--- a/docs/structure.md
+++ b/docs/structure.md
@@ -107,6 +107,7 @@ scheduler = dict(
     type="iddpm",              # Select scheduler type (iddpm, dpm-solver)
     num_sampling_steps=100,    # Number of sampling steps
     cfg_scale=7.0,             # hyper-parameter for classifier-free diffusion
+    cfg_channel=3,             # number of leading output channels that classifier-free guidance is applied to; if None, half of the model's output channels are used
 )
 dtype = "fp16"                 # Computation type (fp16, fp32, bf16)
 
diff --git a/opensora/schedulers/iddpm/__init__.py b/opensora/schedulers/iddpm/__init__.py
index b9806ad..cb2d19b 100644
--- a/opensora/schedulers/iddpm/__init__.py
+++ b/opensora/schedulers/iddpm/__init__.py
@@ -82,13 +82,15 @@ class IDDPM(SpacedDiffusion):
         return samples
 
 
-def forward_with_cfg(model, x, timestep, y, cfg_scale, **kwargs):
+def forward_with_cfg(model, x, timestep, y, cfg_scale, cfg_channel=None, **kwargs):
     # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
     half = x[: len(x) // 2]
     combined = torch.cat([half, half], dim=0)
     model_out = model.forward(combined, timestep, y, **kwargs)
     model_out = model_out["x"] if isinstance(model_out, dict) else model_out
-    eps, rest = model_out[:, :3], model_out[:, 3:]
+    if cfg_channel is None:
+        cfg_channel = model_out.shape[1] // 2
+    eps, rest = model_out[:, :cfg_channel], model_out[:, cfg_channel:]
     cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
     half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
     eps = torch.cat([half_eps, half_eps], dim=0)

From e826311de4b7ec8a667d11da3653170ec930d2f5 Mon Sep 17 00:00:00 2001
From: "Zheng Zangwei (Alex Zheng)" <zangwei@comp.nus.edu.sg>
Date: Mon, 25 Mar 2024 21:15:16 +0800
Subject: [PATCH 4/4] fix cfg_channel (#217)

---
 opensora/schedulers/iddpm/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/opensora/schedulers/iddpm/__init__.py b/opensora/schedulers/iddpm/__init__.py
index cb2d19b..2061dc3 100644
--- a/opensora/schedulers/iddpm/__init__.py
+++ b/opensora/schedulers/iddpm/__init__.py
@@ -22,6 +22,7 @@ class IDDPM(SpacedDiffusion):
         rescale_learned_sigmas=False,
         diffusion_steps=1000,
         cfg_scale=4.0,
+        cfg_channel=None,
     ):
         betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
         if use_kl:
@@ -49,6 +50,7 @@ class IDDPM(SpacedDiffusion):
         )
 
         self.cfg_scale = cfg_scale
+        self.cfg_channel = cfg_channel
 
     def sample(
         self,
@@ -68,7 +70,7 @@ class IDDPM(SpacedDiffusion):
         if additional_args is not None:
             model_args.update(additional_args)
 
-        forward = partial(forward_with_cfg, model, cfg_scale=self.cfg_scale)
+        forward = partial(forward_with_cfg, model, cfg_scale=self.cfg_scale, cfg_channel=self.cfg_channel)
         samples = self.p_sample_loop(
             forward,
             z.shape,