From 0b6348d8d259ea62b6dec9fe2de3bba51aee011a Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Wed, 24 Apr 2024 22:52:36 +0800 Subject: [PATCH 01/26] updated gradio model version (#299) --- gradio/app.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gradio/app.py b/gradio/app.py index 7f64896..80fc2e8 100644 --- a/gradio/app.py +++ b/gradio/app.py @@ -23,10 +23,12 @@ import gradio as gr MODEL_TYPES = ["v1.1"] CONFIG_MAP = { - "v1.1": "configs/opensora-v1-1/inference/sample-ref.py", + "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py", + "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py", } HF_STDIT_MAP = { - "v1.1": "hpcai-tech/OpenSora-STDiT-v2-stage2", + "v1.1-stage2": "hpcai-tech/OpenSora-STDiT-v2-stage2", + "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3", } RESOLUTION_MAP = { "360p": (360, 480), @@ -249,7 +251,7 @@ def parse_args(): parser = argparse.ArgumentParser() parser.add_argument( "--model-type", - default="v1.1", + default="v1.1-stage3", choices=MODEL_TYPES, help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}", ) From 6770f2ee670c57eacd148e2eb57817fafa44ced9 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Wed, 24 Apr 2024 23:03:22 +0800 Subject: [PATCH 02/26] updated version (#300) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 45049bb..78e25a9 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ def fetch_readme() -> str: setup( name="opensora", - version="1.0.0", + version="1.1.0", packages=find_packages( exclude=( "assets", From 11d1912b3cd02dc195b291ce1bc0e230a54e2f58 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Wed, 24 Apr 2024 23:27:03 +0800 Subject: [PATCH 03/26] updated gradio options (#301) --- gradio/app.py | 6 ++++-- gradio/requirements.txt | 2 +- requirements.txt | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/gradio/app.py b/gradio/app.py index 80fc2e8..905c587 100644 --- 
a/gradio/app.py +++ b/gradio/app.py @@ -31,6 +31,8 @@ HF_STDIT_MAP = { "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3", } RESOLUTION_MAP = { + "144p": (144, 256), + "240p": (240, 426), "360p": (360, 480), "480p": (480, 858), "720p": (720, 1280), @@ -452,8 +454,8 @@ def main(): lines=4, ) resolution = gr.Radio( - choices=["360p", "480p", "720p", "1080p"], - value="360p", + choices=["144p", "240p", "360p", "480p", "720p", "1080p"], + value="144p", label="Resolution", ) length = gr.Radio( diff --git a/gradio/requirements.txt b/gradio/requirements.txt index 8ed5596..f0c5b94 100644 --- a/gradio/requirements.txt +++ b/gradio/requirements.txt @@ -1,3 +1,3 @@ xformers -git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora transformers +git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora diff --git a/requirements.txt b/requirements.txt index d675337..e8031a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ tqdm transformers wandb rotary_embedding_torch +pandarallel From 42dc33aaf3b475ee59876889609a6e7b52621f89 Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Thu, 25 Apr 2024 00:46:33 +0800 Subject: [PATCH 04/26] update readme --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index ab00ee4..1333d78 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 📰 News -* **[2024.04.22]** 🔥 We release **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]]() [[report]](/docs/report_02.md) +* **[2024.04.25]** 🔥 We release **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. 
In addition, a full video processing pipeline is released. [[checkpoints]]() [[report]](/docs/report_02.md) * **[2024.03.18]** We release **Open-Sora 1.0**, a fully open-source project for video generation. Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with @@ -134,8 +134,6 @@ Other useful documents and links are listed below. ## Installation -TODO: discuss how to include data installation here. - ```bash # create a virtual env conda create -n opensora python=3.10 From 99b0ba1e9800b11f87531a34a1da5690d2f23297 Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Thu, 25 Apr 2024 11:27:09 +0800 Subject: [PATCH 05/26] update reference sample --- configs/opensora-v1-1/inference/sample-ref.py | 22 +++++++++++++------ docs/commands.md | 21 +++++++++++++++++- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/configs/opensora-v1-1/inference/sample-ref.py b/configs/opensora-v1-1/inference/sample-ref.py index eaf8865..735c01b 100644 --- a/configs/opensora-v1-1/inference/sample-ref.py +++ b/configs/opensora-v1-1/inference/sample-ref.py @@ -14,18 +14,26 @@ prompt = [ loop = 2 condition_frame_length = 4 -reference_path = [ - "https://cdn.openai.com/tmp/s/interp/d0.mp4", - None, - "assets/images/condition/wave.png", -] -# valid when reference_path is not None -# (loop id, ref id, ref start, length, target start) +# ( +# loop id, [the loop index of the condition image or video] +# reference id, [the index of the condition image or video in the reference_path] +# reference start, [the start frame of the condition image or video] +# target start, [the location to insert] +# length, [the number of frames to insert] +# edit_ratio [the edit rate of the condition image or video] +# ) +# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details +# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples mask_strategy = [ 
"0,0,0,0,8,0.3", None, "0", ] +reference_path = [ + "https://cdn.openai.com/tmp/s/interp/d0.mp4", + None, + "assets/images/condition/wave.png", +] # Define model model = dict( diff --git a/docs/commands.md b/docs/commands.md index 944fc88..2d7420f 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -51,11 +51,30 @@ You can adjust the `--num-frames` and `--image-size` to generate different resul `inference-long.py` is compatible with `inference.py` and supports advanced features. ```bash -# long video generation # image condition +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --sample-name image-cond \ + --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/wave.png","mask_strategy": "0"}' + # video extending +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --sample-name image-cond \ + --prompt 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,0,-8,8"}' + +# long video generation +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --loop 16 --condition-frame-length 8 --sample-name long \ + --prompt '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the 
universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16"}' + # video connecting +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --sample-name connect \ + --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}' + # video editing +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 480 853 --sample-name edit \ + --prompt 'A cyberpunk-style city at night.{"reference_path": "https://cdn.pixabay.com/video/2021/10/12/91744-636709154_large.mp4","mask_strategy": "0,0,0,0,32,0.4"}' ``` ### Inference with DiT pretrained on ImageNet From 42aea5470e173b8d91c996ed6afe698512ceb947 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 25 Apr 2024 12:50:55 +0800 Subject: [PATCH 06/26] Release/v1.1 update (#305) * Update structure.md * Update report_v1.md * Update sample-ref.py (#75) * Update interpolation.py * Dev/pxy (#77) * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scene_cut * update scene_cut * update scene_cut[A * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * m * m * m * m * m * m * m * m * m * m * m * m * m * m * update readme * update readme * extract frames using opencv everywhere * extract frames using opencv everywhere * extract frames using opencv everywhere * filter panda10m * filter panda10m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * ocr * add ocr * add main.sh * add ocr * add ocr * add ocr * add ocr * add ocr * add ocr * update scene_cut 
* update remove main.sh * update scoring * update scoring * update scoring * update README * update readme * update scene_cut * update readme * update scoring * update readme * update readme * update filter_panda10m * update readme * update readme * update launch.ipynb * update scene_cut * update scene_cut * update readme * update launch.ipynb * update readme * add 1.1 demo * update readme * add 1.1 demo * update readme * Update README.md --------- Co-authored-by: Yanjia0 <42895286+Yanjia0@users.noreply.github.com> Co-authored-by: YuKun Zhou <90625606+1zeryu@users.noreply.github.com> Co-authored-by: xyupeng <99191637+xyupeng@users.noreply.github.com> --- README.md | 11 +++++- configs/opensora-v1-1/inference/sample-ref.py | 2 +- docs/zh_CN/report_v1.md | 38 ++++++++++--------- docs/zh_CN/structure.md | 17 +++++---- tools/frame_interpolation/interpolation.py | 2 +- 5 files changed, 41 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 1333d78..53e11c3 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,16 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 🎥 Latest Demo -TBD + +| **2s 240×426** | **2s 240×426** | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | + +| **2s 426×240** | **2s 426×240** | **4s 
480×854** | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | +
OpenSora 1.0 Demo diff --git a/configs/opensora-v1-1/inference/sample-ref.py b/configs/opensora-v1-1/inference/sample-ref.py index eaf8865..557bb70 100644 --- a/configs/opensora-v1-1/inference/sample-ref.py +++ b/configs/opensora-v1-1/inference/sample-ref.py @@ -20,7 +20,7 @@ reference_path = [ "assets/images/condition/wave.png", ] # valid when reference_path is not None -# (loop id, ref id, ref start, length, target start) +# (loop id, ref id, ref start, target start, length, edit_ratio) mask_strategy = [ "0,0,0,0,8,0.3", None, diff --git a/docs/zh_CN/report_v1.md b/docs/zh_CN/report_v1.md index b3b8073..bf12131 100644 --- a/docs/zh_CN/report_v1.md +++ b/docs/zh_CN/report_v1.md @@ -1,47 +1,49 @@ -# Open-Sora v1 Report +# Open-Sora v1 技术报告 -OpenAI's Sora is amazing at generating one minutes high quality videos. However, it reveals almost no information about its details. To make AI more "open", we are dedicated to build an open-source version of Sora. This report describes our first attempt to train a transformer-based video diffusion model. +OpenAI的Sora在生成一分钟高质量视频方面非常出色。然而,它几乎没有透露任何关于其细节的信息。为了使人工智能更加“开放”,我们致力于构建一个开源版本的Sora。这份报告描述了我们第一次尝试训练一个基于Transformer的视频扩散模型。 -## Efficiency in choosing the architecture +## 选择高效的架构 -To lower the computational cost, we want to utilize existing VAE models. Sora uses spatial-temporal VAE to reduce the temporal dimensions. However, we found that there is no open-source high-quality spatial-temporal VAE model. [MAGVIT](https://github.com/google-research/magvit)'s 4x4x4 VAE is not open-sourced, while [VideoGPT](https://wilson1yan.github.io/videogpt/index.html)'s 2x4x4 VAE has a low quality in our experiments. Thus, we decided to use a 2D VAE (from [Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original)) in our first version. 
+为了降低计算成本,我们希望利用现有的VAE模型。Sora使用时空VAE来减少时间维度。然而,我们发现没有开源的高质量时空VAE模型。[MAGVIT](https://github.com/google-research/magvit)的4x4x4 VAE并未开源,而[VideoGPT](https://wilson1yan.github.io/videogpt/index.html)的2x4x4 VAE在我们的实验中质量较低。因此,我们决定在我们第一个版本中使用2D VAE(来自[Stability-AI](https://huggingface.co/stabilityai/sd-vae-ft-mse-original))。 -The video training involves a large amount of tokens. Considering 24fps 1min videos, we have 1440 frames. With VAE downsampling 4x and patch size downsampling 2x, we have 1440x1024≈1.5M tokens. Full attention on 1.5M tokens leads to a huge computational cost. Thus, we use spatial-temporal attention to reduce the cost following [Latte](https://github.com/Vchitect/Latte). +视频训练涉及大量的token。考虑到24fps的1分钟视频,我们有1440帧。通过VAE下采样4倍和patch大小下采样2倍,我们得到了1440x1024≈150万个token。在150万个token上进行全注意力计算将带来巨大的计算成本。因此,我们使用时空注意力来降低成本,这是遵循[Latte](https://github.com/Vchitect/Latte)的方法。 + +如图中所示,在STDiT(ST代表时空)中,我们在每个空间注意力之后立即插入一个时间注意力。这类似于Latte论文中的变种3。然而,我们并没有控制这些变体的相似数量的参数。虽然Latte的论文声称他们的变体比变种3更好,但我们在16x256x256视频上的实验表明,相同数量的迭代次数下,性能排名为:DiT(完整)> STDiT(顺序)> STDiT(并行)≈ Latte。因此,我们出于效率考虑选择了STDiT(顺序)。[这里](/docs/acceleration.md#efficient-stdit)提供了速度基准测试。 -As shown in the figure, we insert a temporal attention right after each spatial attention in STDiT (ST stands for spatial-temporal). This is similar to variant 3 in Latte's paper. However, we do not control a similar number of parameters for these variants. While Latte's paper claims their variant is better than variant 3, our experiments on 16x256x256 videos show that with same number of iterations, the performance ranks as: DiT (full) > STDiT (Sequential) > STDiT (Parallel) ≈ Latte. Thus, we choose STDiT (Sequential) out of efficiency. Speed benchmark is provided [here](/docs/acceleration.md#efficient-stdit). ![Architecture Comparison](https://i0.imgs.ovh/2024/03/15/eLk9D.png) -To focus on video generation, we hope to train the model based on a powerful image generation model. 
[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained high-quality image generation model with T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of inserted temporal attention with zero. This initialization preserves model's ability of image generation at beginning, while Latte's architecture cannot. The inserted attention increases the number of parameter from 580M to 724M. +为了专注于视频生成,我们希望基于一个强大的图像生成模型来训练我们的模型。PixArt-α是一个经过高效训练的高质量图像生成模型,具有T5条件化的DiT结构。我们使用[PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha)初始化我们的模型,并将插入的时间注意力的投影层初始化为零。这种初始化在开始时保留了模型的图像生成能力,而Latte的架构则不能。插入的注意力将参数数量从5.8亿增加到7.24亿。 ![Architecture](https://i0.imgs.ovh/2024/03/16/erC1d.png) -Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on 366K pretraining datasets, and then 16x256x256, 16x512x512, and 64x512x512 on 20K datasets. With scaled position embedding, this strategy greatly reduces the computational cost. +借鉴PixArt-α和Stable Video Diffusion的成功,我们还采用了渐进式训练策略:在366K预训练数据集上进行16x256x256的训练,然后在20K数据集上进行16x256x256、16x512x512和64x512x512的训练。通过扩展位置嵌入,这一策略极大地降低了计算成本。 -We also try to use a 3D patch embedder in DiT. However, with 2x downsampling on temporal dimension, the generated videos have a low quality. Thus, we leave the downsampling to temporal VAE in our next version. For now, we sample at every 3 frames with 16 frames training and every 2 frames with 64 frames training. +我们还尝试在DiT中使用3D patch嵌入器。然而,在时间维度上2倍下采样后,生成的视频质量较低。因此,我们将在下一版本中将下采样留给时间VAE。目前,我们在每3帧采样一次进行16帧训练,以及在每2帧采样一次进行64帧训练。 -## Data is the key to high quality -We find that the number and quality of data have a great impact on the quality of generated videos, even larger than the model architecture and training strategy. At this time, we only prepared the first split (366K video clips) from [HD-VG-130M](https://github.com/daooshee/HD-VG-130M). 
The quality of these videos varies greatly, and the captions are not that accurate. Thus, we further collect 20k relatively high quality videos from [Pexels](https://www.pexels.com/), which provides free license videos. We label the video with LLaVA, an image captioning model, with three frames and a designed prompt. With designed prompt, LLaVA can generate good quality of captions. +## 数据是训练高质量模型的核心 + +我们发现数据的数量和质量对生成视频的质量有很大的影响,甚至比模型架构和训练策略的影响还要大。目前,我们只从[HD-VG-130M](https://github.com/daooshee/HD-VG-130M)准备了第一批分割(366K个视频片段)。这些视频的质量参差不齐,而且字幕也不够准确。因此,我们进一步从提供免费许可视频的[Pexels](https://www.pexels.com/)收集了20k相对高质量的视频。我们使用LLaVA,一个图像字幕模型,通过三个帧和一个设计好的提示来标记视频。有了设计好的提示,LLaVA能够生成高质量的字幕。 ![Caption](https://i0.imgs.ovh/2024/03/16/eXdvC.png) -As we lay more emphasis on the quality of data, we prepare to collect more data and build a video preprocessing pipeline in our next version. +由于我们更加注重数据质量,我们准备收集更多数据,并在下一版本中构建一个视频预处理流程。 -## Training Details +## 训练细节 -With a limited training budgets, we made only a few exploration. We find learning rate 1e-4 is too large and scales down to 2e-5. When training with a large batch size, we find `fp16` less stable than `bf16` and may lead to generation failure. Thus, we switch to `bf16` for training on 64x512x512. For other hyper-parameters, we follow previous works. 
+在有限的训练预算下,我们只进行了一些探索。我们发现学习率1e-4过大,因此将其降低到2e-5。在进行大批量训练时,我们发现`fp16`比`bf16`不太稳定,可能会导致生成失败。因此,我们在64x512x512的训练中切换到`bf16`。对于其他超参数,我们遵循了之前的研究工作。 -## Loss curves +## 损失曲线 -16x256x256 Pretraining Loss Curve +16x256x256 预训练损失曲线 ![16x256x256 Pretraining Loss Curve](https://i0.imgs.ovh/2024/03/16/erXQj.png) -16x256x256 HQ Training Loss Curve +16x256x256 高质量训练损失曲线 ![16x256x256 HQ Training Loss Curve](https://i0.imgs.ovh/2024/03/16/ernXv.png) -16x512x512 HQ Training Loss Curve +16x512x512 高质量训练损失曲线 ![16x512x512 HQ Training Loss Curve](https://i0.imgs.ovh/2024/03/16/erHBe.png) diff --git a/docs/zh_CN/structure.md b/docs/zh_CN/structure.md index 3eb7c69..6e25d84 100644 --- a/docs/zh_CN/structure.md +++ b/docs/zh_CN/structure.md @@ -1,6 +1,6 @@ -# Repo & Config Structure +# 代码仓库和配置文件结构 -## Repo Structure +## 代码仓库结构 ```plaintext Open-Sora @@ -38,9 +38,10 @@ Open-Sora └── tools -> Tools for data processing and more ``` -## Configs +## 配置文件结构 -Our config files follows [MMEgine](https://github.com/open-mmlab/mmengine). MMEngine will reads the config file (a `.py` file) and parse it into a dictionary-like object. + +我们的配置文件遵循[MMEgine](https://github.com/open-mmlab/mmengine)。 MMEngine 将读取配置文件(“.py”文件)并将其解析为类似字典的对象。 ```plaintext Open-Sora @@ -66,16 +67,16 @@ Open-Sora └── pixart -> PixArt related configs ``` -## Inference config demos +## 推理配置演示 -To change the inference settings, you can directly modify the corresponding config file. Or you can pass arguments to overwrite the config file ([config_utils.py](/opensora/utils/config_utils.py)). To change sampling prompts, you should modify the `.txt` file passed to the `--prompt_path` argument. +要更改推理设置,可以直接修改相应的配置文件。或者您可以传递参数来覆盖配置文件([config_utils.py](/opensora/utils/config_utils.py))。要更改采样提示,您应该修改传递给“--prompt_path”参数的“.txt”文件。 ```plaintext --prompt_path ./assets/texts/t2v_samples.txt -> prompt_path --ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"] ``` -The explanation of each field is provided below. 
+下面提供了每个字段的解释。 ```python # Define sampling size @@ -117,7 +118,7 @@ prompt_path = "./assets/texts/t2v_samples.txt" # path to prompt file save_dir = "./samples" # path to save samples ``` -## Training config demos +## 训练配置演示 ```python # Define sampling size diff --git a/tools/frame_interpolation/interpolation.py b/tools/frame_interpolation/interpolation.py index 808bf8b..0cd822b 100644 --- a/tools/frame_interpolation/interpolation.py +++ b/tools/frame_interpolation/interpolation.py @@ -209,7 +209,7 @@ if __name__ == "__main__": if args.folder: for file in os.listdir(input_path): - if osp.splitext(input_path)[-1].lower() in VID_EXT: + if osp.splitext(file)[-1].lower() in VID_EXT: vid_path = os.path.join(input_path, file) process(model, vid_path, output_path, fps, iters) else: From ed23a8cdcf6ee0c075ab785a72a9ddadcfd70609 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 25 Apr 2024 12:57:36 +0800 Subject: [PATCH 07/26] updated weights link (#307) --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 53e11c3..3b63cae 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,11 @@ pip install -v . 
### Open-Sora 1.1 Model Weights -TBD +| Resolution | Data | #iterations | Batch Size | URL | +| ---------- | --------------------- | ----------- | ---------- | --------------------------------------------------------------------------------------------- | +| dynamic | 10M videos + 2M images | 100 | dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | +| dynamic | 20K HQ | 4k | dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | + ### Open-Sora 1.0 Model Weights From 4da80a477889abcfbf6a3b7e04c25a5ffdf4fc02 Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Thu, 25 Apr 2024 13:08:21 +0800 Subject: [PATCH 08/26] update docs --- README.md | 30 ++++++++++++++---------------- docs/report_02.md | 2 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 3b63cae..3ebf6fd 100644 --- a/README.md +++ b/README.md @@ -37,15 +37,16 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 🎥 Latest Demo +More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sora/). 
-| **2s 240×426** | **2s 240×426** | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | +| **2s 240×426** | **2s 240×426** | +| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | -| **2s 426×240** | **2s 426×240** | **4s 480×854** | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | +| **2s 426×240** | **2s 426×240** | **4s 480×854** | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) |
@@ -63,8 +64,6 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts.
-More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sora/). - ## 🔆 New Features/Updates * 📍 **Open-Sora 1.1** released. Model weights are available [here](). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](docs/report_02.md)** for more discussions. @@ -176,11 +175,10 @@ pip install -v . ### Open-Sora 1.1 Model Weights -| Resolution | Data | #iterations | Batch Size | URL | -| ---------- | --------------------- | ----------- | ---------- | --------------------------------------------------------------------------------------------- | -| dynamic | 10M videos + 2M images | 100 | dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | -| dynamic | 20K HQ | 4k | dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | - +| Resolution | Data | #iterations | Batch Size | URL | +| ------------------ | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- | +| mainly 144p & 240p | 10M videos + 2M images | 100k | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | +| 144p to 720p | 500K HQ videos + 1M images | 4k | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | ### Open-Sora 1.0 Model Weights @@ -223,12 +221,12 @@ This will launch a Gradio application on your localhost. If you want to know mor Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument. ```bash -# video sampling +# text to video python scripts/inference.py configs/opensora-v1-1/inference/sample.py \ --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 ``` -See [here](docs/commands.md#inference-with-open-sora-11) for more instructions. 
+See [here](docs/commands.md#inference-with-open-sora-11) for more instructions including text-to-image, image-to-video, video-to-video, and infinite time generation. ### Open-Sora 1.0 Command Line Inference diff --git a/docs/report_02.md b/docs/report_02.md index 9d2c1eb..ec54853 100644 --- a/docs/report_02.md +++ b/docs/report_02.md @@ -106,7 +106,7 @@ To summarize, the training of Open-Sora 1.1 requires approximately **9 days** on As we get one step closer to the replication of Sora, we find many limitations for the current model, and these limitations point to the future work. -- **Generation Failure**: we fine many cases (especially when the total token number is large or the content is complex), our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code. We are working hard to fix it. +- **Generation Failure**: we fine many cases (especially when the total token number is large or the content is complex), our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code. We are working hard to fix it. Besides, we will increase our model size and training data to improve the generation quality in the next version. - **Noisy generation and influency**: we find the generated model is sometimes noisy and not fluent, especially for long videos. We think the problem is due to not using a temporal VAE. As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we plan to develop a temporal VAE for the model in the next version. - **Lack of time consistency**: we find the model cannot generate videos with high time consistency. We think the problem is due to the lack of training FLOPs. We plan to collect more data and continue training the model to improve the time consistency. - **Bad human generation**: We find the model cannot generate high-quality human videos. 
We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation. From 74f0af7e699b37cb405ce51ee7b4b3c5ac45c59c Mon Sep 17 00:00:00 2001 From: "Zheng Zangwei (Alex Zheng)" Date: Thu, 25 Apr 2024 13:09:26 +0800 Subject: [PATCH 09/26] Docs/v1.1 zangwei (#308) * update reference sample * update docs --- README.md | 30 +++++++++---------- configs/opensora-v1-1/inference/sample-ref.py | 22 +++++++++----- docs/commands.md | 21 ++++++++++++- docs/report_02.md | 2 +- 4 files changed, 50 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 3b63cae..3ebf6fd 100644 --- a/README.md +++ b/README.md @@ -37,15 +37,16 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 🎥 Latest Demo +More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sora/). -| **2s 240×426** | **2s 240×426** | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | +| **2s 240×426** | **2s 240×426** | +| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | 
---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | -| **2s 426×240** | **2s 426×240** | **4s 480×854** | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | +| **2s 426×240** | **2s 426×240** | **4s 480×854** | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 
[](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) |
@@ -63,8 +64,6 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts.
-More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sora/). - ## 🔆 New Features/Updates * 📍 **Open-Sora 1.1** released. Model weights are available [here](). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](docs/report_02.md)** for more discussions. @@ -176,11 +175,10 @@ pip install -v . ### Open-Sora 1.1 Model Weights -| Resolution | Data | #iterations | Batch Size | URL | -| ---------- | --------------------- | ----------- | ---------- | --------------------------------------------------------------------------------------------- | -| dynamic | 10M videos + 2M images | 100 | dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | -| dynamic | 20K HQ | 4k | dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | - +| Resolution | Data | #iterations | Batch Size | URL | +| ------------------ | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- | +| mainly 144p & 240p | 10M videos + 2M images | 100k | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | +| 144p to 720p | 500K HQ videos + 1M images | 4k | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | ### Open-Sora 1.0 Model Weights @@ -223,12 +221,12 @@ This will launch a Gradio application on your localhost. If you want to know mor Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument. ```bash -# video sampling +# text to video python scripts/inference.py configs/opensora-v1-1/inference/sample.py \ --ckpt-path CKPT_PATH --prompt "A beautiful sunset over the city" --num-frames 32 --image-size 480 854 ``` -See [here](docs/commands.md#inference-with-open-sora-11) for more instructions. 
+See [here](docs/commands.md#inference-with-open-sora-11) for more instructions including text-to-image, image-to-video, video-to-video, and infinite time generation. ### Open-Sora 1.0 Command Line Inference diff --git a/configs/opensora-v1-1/inference/sample-ref.py b/configs/opensora-v1-1/inference/sample-ref.py index 557bb70..735c01b 100644 --- a/configs/opensora-v1-1/inference/sample-ref.py +++ b/configs/opensora-v1-1/inference/sample-ref.py @@ -14,18 +14,26 @@ prompt = [ loop = 2 condition_frame_length = 4 -reference_path = [ - "https://cdn.openai.com/tmp/s/interp/d0.mp4", - None, - "assets/images/condition/wave.png", -] -# valid when reference_path is not None -# (loop id, ref id, ref start, target start, length, edit_ratio) +# ( +# loop id, [the loop index of the condition image or video] +# reference id, [the index of the condition image or video in the reference_path] +# reference start, [the start frame of the condition image or video] +# target start, [the location to insert] +# length, [the number of frames to insert] +# edit_ratio [the edit rate of the condition image or video] +# ) +# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details +# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples mask_strategy = [ "0,0,0,0,8,0.3", None, "0", ] +reference_path = [ + "https://cdn.openai.com/tmp/s/interp/d0.mp4", + None, + "assets/images/condition/wave.png", +] # Define model model = dict( diff --git a/docs/commands.md b/docs/commands.md index 944fc88..2d7420f 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -51,11 +51,30 @@ You can adjust the `--num-frames` and `--image-size` to generate different resul `inference-long.py` is compatible with `inference.py` and supports advanced features. 
```bash -# long video generation # image condition +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --sample-name image-cond \ + --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/wave.png","mask_strategy": "0"}' + # video extending +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --sample-name image-cond \ + --prompt 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,0,-8,8"}' + +# long video generation +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --loop 16 --condition-frame-length 8 --sample-name long \ + --prompt '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. 
Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,16"}' + # video connecting +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 240 426 --sample-name connect \ + --prompt 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}' + # video editing +python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py --ckpt-path CKPT_PATH \ + --num-frames 32 --image-size 480 853 --sample-name edit \ + --prompt 'A cyberpunk-style city at night.{"reference_path": "https://cdn.pixabay.com/video/2021/10/12/91744-636709154_large.mp4","mask_strategy": "0,0,0,0,32,0.4"}' ``` ### Inference with DiT pretrained on ImageNet diff --git a/docs/report_02.md b/docs/report_02.md index 9d2c1eb..ec54853 100644 --- a/docs/report_02.md +++ b/docs/report_02.md @@ -106,7 +106,7 @@ To summarize, the training of Open-Sora 1.1 requires approximately **9 days** on As we get one step closer to the replication of Sora, we find many limitations for the current model, and these limitations point to the future work. -- **Generation Failure**: we fine many cases (especially when the total token number is large or the content is complex), our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code. We are working hard to fix it. +- **Generation Failure**: we fine many cases (especially when the total token number is large or the content is complex), our model fails to generate the scene. There may be a collapse in the temporal attention and we have identified a potential bug in our code. We are working hard to fix it. Besides, we will increase our model size and training data to improve the generation quality in the next version. 
- **Noisy generation and influency**: we find the generated model is sometimes noisy and not fluent, especially for long videos. We think the problem is due to not using a temporal VAE. As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we plan to develop a temporal VAE for the model in the next version. - **Lack of time consistency**: we find the model cannot generate videos with high time consistency. We think the problem is due to the lack of training FLOPs. We plan to collect more data and continue training the model to improve the time consistency. - **Bad human generation**: We find the model cannot generate high-quality human videos. We think the problem is due to the lack of human data. We plan to collect more human data and continue training the model to improve the human generation. From ef05427dd0d822cafaedcb7a391a76da9e0574f5 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 25 Apr 2024 13:15:29 +0800 Subject: [PATCH 10/26] added gradio to readme (#309) * added gradio to readme * polish --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ebf6fd..8815140 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ + ## Open-Sora: Democratizing Efficient Video Production for All @@ -25,6 +26,7 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 📰 News +* **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces. * **[2024.04.25]** 🔥 We release **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]]() [[report]](/docs/report_02.md) * **[2024.03.18]** We release **Open-Sora 1.0**, a fully open-source project for video generation. 
Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with @@ -37,8 +39,11 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 🎥 Latest Demo +🔥 You can experinece Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora) + More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sora/). + | **2s 240×426** | **2s 240×426** | | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | @@ -62,6 +67,7 @@ More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sor Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display, see [here](/assets/texts/t2v_samples.txt) for full prompts. +
## 🔆 New Features/Updates @@ -207,7 +213,9 @@ on improving the quality and text alignment. ### Gradio Demo -We have provided a [Gradio application](./gradio) in this repository, you can use the following the command to start an interactive web application to experience video generation with Open-Sora. +🔥 You can experinece Open-Sora on our [🤗 Gradio application](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face online. + +If you want to deploy gradio locally, we have also provided a [Gradio application](./gradio) in this repository, you can use the following the command to start an interactive web application to experience video generation with Open-Sora. ```bash pip install gradio spaces From 22e03e626578411f8e48e0725f3caa09f23d4024 Mon Sep 17 00:00:00 2001 From: xyupeng <99191637+xyupeng@users.noreply.github.com> Date: Thu, 25 Apr 2024 14:27:29 +0800 Subject: [PATCH 11/26] update 1.1 demo (#311) * add 1.1 demo * Update README.md * Update README.md --- README.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8815140..75f81df 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,8 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 📰 News * **[2024.04.25]** 🤗 We released the [Gradio demo for Open-Sora](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face Spaces. -* **[2024.04.25]** 🔥 We release **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]]() [[report]](/docs/report_02.md) -* **[2024.03.18]** We release **Open-Sora 1.0**, a fully open-source project for video generation. 
+* **[2024.04.25]** 🔥 We released **Open-Sora 1.1**, which supports **2s~15s, 144p to 720p, any aspect ratio** text-to-image, **text-to-video, image-to-video, video-to-video, infinite time** generation. In addition, a full video processing pipeline is released. [[checkpoints]]() [[report]](/docs/report_02.md) +* **[2024.03.18]** We released **Open-Sora 1.0**, a fully open-source project for video generation. Open-Sora 1.0 supports a full pipeline of video data preprocessing, training with acceleration, @@ -46,12 +46,16 @@ More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sor | **2s 240×426** | **2s 240×426** | | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c31ebc52-de39-4a4e-9b1e-9211d45e05b2) | | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/f7ce4aaa-528f-40a8-be7a-72e61eaacbbd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/5d58d71e-1fda-4d90-9ad3-5f2f7b75c6a9) | -| **2s 426×240** | **2s 426×240** | **4s 480×854** | -| ---------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | 
----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | +| **2s 426×240** | **4s 480×854** | +| ---------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/34ecb4a0-4eef-4286-ad4c-8e3a87e5a9fd) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/c1619333-25d7-42ba-a91c-18dbc1870b18) | + +| **16s 320×320** | **16s 224×448** | **2s 426×240** | +| ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) |
@@ -62,7 +66,7 @@ More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sor | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/de1963d3-b43b-4e68-a670-bb821ebb6f80) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/13f8338f-3d42-4b71-8142-d234fbd746cc) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/fa6a65a6-e32a-4d64-9a9e-eabb0ebb8c16) | | A serene night scene in a forested area. [...] The video is a time-lapse, capturing the transition from day to night, with the lake and forest serving as a constant backdrop. | A soaring drone footage captures the majestic beauty of a coastal cliff, [...] The water gently laps at the rock base and the greenery that clings to the top of the cliff. | The majestic beauty of a waterfall cascading down a cliff into a serene lake. [...] The camera angle provides a bird's eye view of the waterfall. | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/64232f84-1b36-4750-a6c0-3e610fa9aa94) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/983a1965-a374-41a7-a76b-c07941a6c1e9) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/ec10c879-9767-4c31-865f-2e8d6cf11e65) | -| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...] | +| A bustling city street at night, filled with the glow of car headlights and the ambient light of streetlights. [...] | The vibrant beauty of a sunflower field. The sunflowers are arranged in neat rows, creating a sense of order and symmetry. [...] | A serene underwater scene featuring a sea turtle swimming through a coral reef. The turtle, with its greenish-brown shell [...] 
| Videos are downsampled to `.gif` for display. Click for original videos. Prompts are trimmed for display, see [here](/assets/texts/t2v_samples.txt) for full prompts. From bf26cabca80936905920a674c6f48e7f68b93f02 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Thu, 25 Apr 2024 14:36:52 +0800 Subject: [PATCH 12/26] update gradio (#310) * update gradio * update gradio --- gradio/app.py | 215 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 160 insertions(+), 55 deletions(-) diff --git a/gradio/app.py b/gradio/app.py index 905c587..181c059 100644 --- a/gradio/app.py +++ b/gradio/app.py @@ -19,9 +19,12 @@ import spaces import torch import gradio as gr +from tempfile import NamedTemporaryFile +import datetime -MODEL_TYPES = ["v1.1"] + +MODEL_TYPES = ["v1.1-stage2", "v1.1-stage3"] CONFIG_MAP = { "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py", "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py", @@ -31,12 +34,41 @@ HF_STDIT_MAP = { "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3", } RESOLUTION_MAP = { - "144p": (144, 256), - "240p": (240, 426), - "360p": (360, 480), - "480p": (480, 858), - "720p": (720, 1280), - "1080p": (1080, 1920) + "144p": { + "16:9": (256, 144), + "9:16": (144, 256), + "4:3": (221, 165), + "3:4": (165, 221), + "1:1": (192, 192), + }, + "240p": { + "16:9": (426, 240), + "9:16": (240, 426), + "4:3": (370, 278), + "3:4": (278, 370), + "1:1": (320, 320), + }, + "360p": { + "16:9": (640, 360), + "9:16": (360, 640), + "4:3": (554, 416), + "3:4": (416, 554), + "1:1": (480, 480), + }, + "480p": { + "16:9": (854, 480), + "9:16": (480, 854), + "4:3": (740, 555), + "3:4": (555, 740), + "1:1": (640, 640), + }, + "720p": { + "16:9": (1280, 720), + "9:16": (720, 1280), + "4:3": (1108, 832), + "3:4": (832, 1110), + "1:1": (960, 960), + }, } @@ -302,37 +334,53 @@ device = torch.device("cuda") vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization) 
-@spaces.GPU(duration=200) -def run_inference(mode, prompt_text, resolution, length, reference_image): +def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale): + torch.manual_seed(seed) with torch.inference_mode(): # ====================== # 1. Preparation # ====================== # parse the inputs - resolution = RESOLUTION_MAP[resolution] - + resolution = RESOLUTION_MAP[resolution][aspect_ratio] + + # gather args from config + num_frames = config.num_frames + frame_interval = config.frame_interval + fps = config.fps + condition_frame_length = config.condition_frame_length + # compute number of loops - num_seconds = int(length.rstrip('s')) - total_number_of_frames = num_seconds * config.fps / config.frame_interval - num_loop = math.ceil(total_number_of_frames / config.num_frames) + if mode == "Text2Image": + num_frames = 1 + num_loop = 1 + else: + num_seconds = int(length.rstrip('s')) + if num_seconds <= 16: + num_frames = num_seconds * fps // frame_interval + num_loop = 1 + else: + config.num_frames = 16 + total_number_of_frames = num_seconds * fps / frame_interval + num_loop = math.ceil((total_number_of_frames - condition_frame_length) / (num_frames - condition_frame_length)) # prepare model args - model_args = dict() - height = torch.tensor([resolution[0]], device=device, dtype=dtype) - width = torch.tensor([resolution[1]], device=device, dtype=dtype) - num_frames = torch.tensor([config.num_frames], device=device, dtype=dtype) - ar = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype) if config.num_frames == 1: - config.fps = IMG_FPS - fps = torch.tensor([config.fps], device=device, dtype=dtype) - model_args["height"] = height - model_args["width"] = width - model_args["num_frames"] = num_frames - model_args["ar"] = ar - model_args["fps"] = fps + fps = IMG_FPS + + model_args = dict() + height_tensor = torch.tensor([resolution[0]], device=device, dtype=dtype) + width_tensor = 
torch.tensor([resolution[1]], device=device, dtype=dtype) + num_frames_tensor = torch.tensor([num_frames], device=device, dtype=dtype) + ar_tensor = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype) + fps_tensor = torch.tensor([fps], device=device, dtype=dtype) + model_args["height"] = height_tensor + model_args["width"] = width_tensor + model_args["num_frames"] = num_frames_tensor + model_args["ar"] = ar_tensor + model_args["fps"] = fps_tensor # compute latent size - input_size = (config.num_frames, *resolution) + input_size = (num_frames, *resolution) latent_size = vae.get_latent_size(input_size) # process prompt @@ -342,24 +390,33 @@ def run_inference(mode, prompt_text, resolution, length, reference_image): video_clips = [] # prepare mask strategy - if mode == "Text2Video": + if mode == "Text2Image": mask_strategy = [None] - elif mode == "Image2Video": - mask_strategy = ['0'] + elif mode == "Text2Video": + if reference_image is not None: + mask_strategy = ['0'] + else: + mask_strategy = [None] else: raise ValueError(f"Invalid mode: {mode}") # ========================= # 2. 
Load reference images # ========================= - if mode == "Text2Video": + if mode == "Text2Image": refs_x = collect_references_batch([None], vae, resolution) - elif mode == "Image2Video": - # save image to disk - from PIL import Image - im = Image.fromarray(reference_image) - im.save("test.jpg") - refs_x = collect_references_batch(["test.jpg"], vae, resolution) + elif mode == "Text2Video": + if reference_image is not None: + # save image to disk + from PIL import Image + im = Image.fromarray(reference_image) + idx = os.environ['CUDA_VISIBLE_DEVICES'] + + with NamedTemporaryFile(suffix=".jpg") as temp_file: + im.save(temp_file.name) + refs_x = collect_references_batch([temp_file.name], vae, resolution) + else: + refs_x = collect_references_batch([None], vae, resolution) else: raise ValueError(f"Invalid mode: {mode}") @@ -386,11 +443,20 @@ def run_inference(mode, prompt_text, resolution, length, reference_image): mask_strategy[j] += ";" mask_strategy[ j - ] += f"{loop_i},{len(refs)-1},-{config.condition_frame_length},0,{config.condition_frame_length}" + ] += f"{loop_i},{len(refs)-1},-{condition_frame_length},0,{condition_frame_length}" masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i) # 4.6. 
diffusion sampling + # hack to update num_sampling_steps and cfg_scale + scheduler_kwargs = config.scheduler.copy() + scheduler_kwargs.pop('type') + scheduler_kwargs['num_sampling_steps'] = sampling_steps + scheduler_kwargs['cfg_scale'] = cfg_scale + + scheduler.__init__( + **scheduler_kwargs + ) samples = scheduler.sample( stdit, text_encoder, @@ -410,10 +476,20 @@ def run_inference(mode, prompt_text, resolution, length, reference_image): for i in range(1, num_loop) ] video = torch.cat(video_clips_list, dim=1) - save_path = f"{args.output}/sample" - saved_path = save_sample(video, fps=config.fps // config.frame_interval, save_path=save_path, force_video=True) + current_datetime = datetime.datetime.now() + timestamp = current_datetime.timestamp() + save_path = os.path.join(args.output, f"output_{timestamp}") + saved_path = save_sample(video, save_path=save_path, fps=config.fps // config.frame_interval) return saved_path +@spaces.GPU(duration=200) +def run_image_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale): + return run_inference("Text2Image", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale) + +@spaces.GPU(duration=200) +def run_video_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale): + return run_inference("Text2Video", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale) + def main(): # create demo @@ -442,31 +518,54 @@ def main(): with gr.Row(): with gr.Column(): - mode = gr.Radio( - choices=["Text2Video", "Image2Video"], - value="Text2Video", - label="Usage", - info="Choose your usage scenario", - ) prompt_text = gr.Textbox( label="Prompt", placeholder="Describe your video here", lines=4, ) resolution = gr.Radio( - choices=["144p", "240p", "360p", "480p", "720p", "1080p"], - value="144p", + choices=["144p", "240p", "360p", "480p", "720p"], + value="240p", 
label="Resolution", ) + aspect_ratio = gr.Radio( + choices=["9:16", "16:9", "3:4", "4:3", "1:1"], + value="9:16", + label="Aspect Ratio (H:W)", + ) length = gr.Radio( - choices=["2s", "4s", "8s"], + choices=["2s", "4s", "8s", "16s"], value="2s", - label="Video Length", + label="Video Length (only effective for video generation)", info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time." ) + with gr.Row(): + seed = gr.Slider( + value=1024, + minimum=1, + maximum=2048, + step=1, + label="Seed" + ) + + sampling_steps = gr.Slider( + value=100, + minimum=1, + maximum=200, + step=1, + label="Sampling steps" + ) + cfg_scale = gr.Slider( + value=7.0, + minimum=0.0, + maximum=10.0, + step=0.1, + label="CFG Scale" + ) + reference_image = gr.Image( - label="Reference Image (only used for Image2Video)", + label="Reference Image (Optional)", ) with gr.Column(): @@ -476,12 +575,18 @@ def main(): ) with gr.Row(): - submit_button = gr.Button("Generate video") + image_gen_button = gr.Button("Generate image") + video_gen_button = gr.Button("Generate video") - submit_button.click( - fn=run_inference, - inputs=[mode, prompt_text, resolution, length, reference_image], + image_gen_button.click( + fn=run_image_inference, + inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale], + outputs=reference_image + ) + video_gen_button.click( + fn=run_video_inference, + inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale], outputs=output_video ) From 104aa1017ebd4fd20b1e39a6f2190ed42a059f9f Mon Sep 17 00:00:00 2001 From: Tommy in Tongji <36354458+TommyZihao@users.noreply.github.com> Date: Thu, 25 Apr 2024 18:14:44 +0800 Subject: [PATCH 13/26] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=BD=9E=E6=99=A8?= =?UTF-8?q?=E4=BA=91=E9=83=A8=E7=BD=B2Open-Sora=E8=A7=86=E9=A2=91=E6=95=99?= =?UTF-8?q?=E7=A8=8B=20(#297)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit * Update README.md * Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 75f81df..3dfa878 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the [[中文文档]](/docs/zh_CN/README.md) +[潞晨云部署Open-Sora保姆级视频教程](https://www.bilibili.com/video/BV141421R7Ag) +

Open-Sora is still at an early stage and under active development.

## 📰 News From d7f58b135f30dd22fd77ce1dd5dd3547f1200557 Mon Sep 17 00:00:00 2001 From: WANGJUNJIE <32323900+wanng-ide@users.noreply.github.com> Date: Thu, 25 Apr 2024 19:49:35 +0900 Subject: [PATCH 14/26] Update dataset.md (#314) * Update datasets.md * Update datasets.md --- docs/datasets.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/datasets.md b/docs/datasets.md index 2d5e995..bdf5471 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -22,3 +22,17 @@ The dataset is proposed for super-resolution tasks. We use the dataset for HQ fi [HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) comprises 130M text-video pairs. The caption is generated by BLIP-2. We find the scene and the text quality are relatively poor. For OpenSora 1.0, we only use ~350K samples from this dataset. + +## Midjourney-v5-1.7M +[Midjourney-v5-1.7M](https://huggingface.co/datasets/wanng/midjourney-v5-202304-clean) includes 1.7M image-text pairs. +In detail, this dataset introduces two subsets: original and upscale. +This dataset is proposed for exploring the relationship of prompts and high-quality images. + +## Midjourney-kaggle-clean +[Midjourney-kaggle-clean](https://huggingface.co/datasets/wanng/midjourney-kaggle-clean) is a reconstructed version of [Midjourney User Prompts & Generated Images (250k)](https://www.kaggle.com/datasets/succinctlyai/midjourney-texttoimage?select=general-01_2022_06_20.json%5D), which is cleaned by rules. +Moreover, this dataset is divided into two subsets: original and upscale. +This dataset is proposed for enabling research on text-to-image model prompting. + +## upsplash-lite +The [Unsplash-lite](https://github.com/unsplash/datasets) Dataset comprises 25k nature-themed Unsplash photos, 25k keywords, and 1M searches. +This dataset covers a vast range of uses and contexts. Its extensive scope in intent and semantics opens new avenues for research and learning. 
From 26d4ab6eb1256daaf25295724bb0ecc71a5187ac Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Thu, 25 Apr 2024 19:21:35 +0800 Subject: [PATCH 15/26] update readme --- README.md | 56 ++++++++++++++++++------------------------------ docs/datasets.md | 38 +++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 2f43cfc..3b9af88 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,7 @@ Open-Sora not only democratizes access to advanced video generation techniques, streamlined and user-friendly platform that simplifies the complexities of video production. With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the realm of content creation. -[[中文文档]](/docs/zh_CN/README.md) - -[潞晨云部署Open-Sora保姆级视频教程](https://www.bilibili.com/video/BV141421R7Ag) +[[中文文档]](/docs/zh_CN/README.md) [[潞晨云部署视频教程]](https://www.bilibili.com/video/BV141421R7Ag)

Open-Sora is still at an early stage and under active development.

@@ -41,9 +39,7 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 🎥 Latest Demo -🔥 You can experinece Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora) - -More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sora/). +🔥 You can experinece Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora). More samples are available in our [Gallery](https://hpcaitech.github.io/Open-Sora/). | **2s 240×426** | **2s 240×426** | | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -57,6 +53,10 @@ More samples are available in our [gallery](https://hpcaitech.github.io/Open-Sor | **16s 320×320** | **16s 224×448** | **2s 426×240** | | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | +<<<<<<< Updated upstream +======= + +>>>>>>> Stashed changes
OpenSora 1.0 Demo @@ -114,8 +114,8 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts. ### TODO list sorted by priority * [ ] Training Video-VAE and adapt our model to new VAE. **[WIP]** -* [ ] Incoporate a better scheduler, e.g., rectified flow in SD3. -* [ ] Scaling model parameters and dataset size. +* [ ] Scaling model parameters and dataset size. **[WIP]** +* [ ] Incoporate a better scheduler, e.g., rectified flow in SD3. **[WIP]**
View more @@ -184,21 +184,25 @@ pip install -v . ### Open-Sora 1.1 Model Weights -| Resolution | Data | #iterations | Batch Size | URL | -| ------------------ | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- | -| mainly 144p & 240p | 10M videos + 2M images | 100k | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | -| 144p to 720p | 500K HQ videos + 1M images | 4k | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | +| Resolution | Model Size | Data | #iterations | Batch Size | URL | +| ------------------ | ---------- | -------------------------- | ----------- | ------------------------------------------------- | -------------------------------------------------------------------- | +| mainly 144p & 240p | 700M | 10M videos + 2M images | 100k | [dynamic](/configs/opensora-v1-1/train/stage2.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage2) | +| 144p to 720p | 700M | 500K HQ videos + 1M images | 4k | [dynamic](/configs/opensora-v1-1/train/stage3.py) | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v2-stage3) | + +See our **[report 1.1](docs/report_02.md)** for more infomation. + +:warning: **LIMITATION**: This version contains known issues which we are going to fix in the next version (as we save computation resource for the next release). In addition, the video generation may fail for long duration, and high resolution will have noisy results due to this problem. ### Open-Sora 1.0 Model Weights
View more -| Resolution | Data | #iterations | Batch Size | GPU days (H800) | URL | -| ---------- | ------ | ----------- | ---------- | --------------- | --------------------------------------------------------------------------------------------- | -| 16×512×512 | 20K HQ | 20k | 2×64 | 35 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) | -| 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) | -| 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) | +| Resolution | Model Size | Data | #iterations | Batch Size | GPU days (H800) | URL | +| ---------- | ---------- | ------ | ----------- | ---------- | --------------- | +| 16×512×512 | 700M | 20K HQ | 20k | 2×64 | 35 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x512x512.pth) | +| 16×256×256 | 700M | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) | +| 16×256×256 | 700M | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) | Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ. @@ -328,24 +332,6 @@ following [all-contributors](https://github.com/all-contributors/all-contributor - - - - - - - - - - - - - - - - - -
zhengzangw
zhengzangw

💻 📖 🤔 📹 🚧
ver217
ver217

💻 🤔 📖 🐛
FrankLeeeee
FrankLeeeee

💻 🚇 🔧
xyupeng
xyupeng

💻 📖 🎨
Yanjia0
Yanjia0

📖
binmakeswell
binmakeswell

📖
eltociear
eltociear

📖
ganeshkrishnan1
ganeshkrishnan1

📖
fastalgo
fastalgo

📖
powerzbt
powerzbt

📖
diff --git a/docs/datasets.md b/docs/datasets.md index bdf5471..ca09635 100644 --- a/docs/datasets.md +++ b/docs/datasets.md @@ -3,36 +3,50 @@ For Open-Sora 1.1, we conduct mixed training with both images and videos. The main datasets we use are listed below. Please refer to [README](/README.md#data-processing) for data processing. -## Panda-70M +## Video + +### Panda-70M + [Panda-70M](https://github.com/snap-research/Panda-70M) is a large-scale dataset with 70M video-caption pairs. -We use the [training-10M subset](https://github.com/snap-research/Panda-70M/tree/main/dataset_dataloading) for training, +We use the [training-10M subset](https://github.com/snap-research/Panda-70M/tree/main/dataset_dataloading) for training, which contains ~10M videos of better quality. -## Pexels -[Pexels](https://www.pexels.com/) is a popular online platform that provides high-quality stock photos, videos, and music for free. +### Pexels + +[Pexels](https://www.pexels.com/) is a popular online platform that provides high-quality stock photos, videos, and music for free. Most videos from this website are of high quality. Thus, we use them for both pre-training and HQ fine-tuning. We really appreciate the great platform and the contributors! -## Inter4K +### Inter4K + [Inter4K](https://github.com/alexandrosstergiou/Inter4K) is a dataset containing 1K video clips with 4K resolution. The dataset is proposed for super-resolution tasks. We use the dataset for HQ fine-tuning. +### HD-VG-130M -## HD-VG-130M -[HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) comprises 130M text-video pairs. -The caption is generated by BLIP-2. +[HD-VG-130M](https://github.com/daooshee/HD-VG-130M?tab=readme-ov-file) comprises 130M text-video pairs. +The caption is generated by BLIP-2. We find the scene and the text quality are relatively poor. For OpenSora 1.0, we only use ~350K samples from this dataset. 
-## Midjourney-v5-1.7M +## Image + +### Midjourney-v5-1.7M + [Midjourney-v5-1.7M](https://huggingface.co/datasets/wanng/midjourney-v5-202304-clean) includes 1.7M image-text pairs. In detail, this dataset introduces two subsets: original and upscale. This dataset is proposed for exploring the relationship of prompts and high-quality images. -## Midjourney-kaggle-clean +### Midjourney-kaggle-clean + [Midjourney-kaggle-clean](https://huggingface.co/datasets/wanng/midjourney-kaggle-clean) is a reconstructed version of [Midjourney User Prompts & Generated Images (250k)](https://www.kaggle.com/datasets/succinctlyai/midjourney-texttoimage?select=general-01_2022_06_20.json%5D), which is cleaned by rules. Moreover, this dataset is divided into two subsets: original and upscale. This dataset is proposed for enabling research on text-to-image model prompting. -## upsplash-lite -The [Unsplash-lite](https://github.com/unsplash/datasets) Dataset comprises 25k nature-themed Unsplash photos, 25k keywords, and 1M searches. +### upsplash-lite + +The [Unsplash-lite](https://github.com/unsplash/datasets) Dataset comprises 25k nature-themed Unsplash photos, 25k keywords, and 1M searches. This dataset covers a vast range of uses and contexts. Its extensive scope in intent and semantics opens new avenues for research and learning. + +### LAION-AESTHETICS 6.5+ + +LAION aesthetic 6.5+ dataset is a subset of the LAION dataset, which contains 625K high-quality images with aesthetic scores > 6.5. However, as LAION is currently not publicly available, we use this 168k [subset](https://huggingface.co/datasets/bhargavsdesai/laion_improved_aesthetics_6.5plus_with_images). 
From 4f4ca56a5e6e160f68a29b0a2754f10ffd8a9029 Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Thu, 25 Apr 2024 19:47:11 +0800 Subject: [PATCH 16/26] update readme --- .all-contributorsrc | 4 ---- .gitignore | 5 +++++ README.md | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) delete mode 100644 .all-contributorsrc diff --git a/.all-contributorsrc b/.all-contributorsrc deleted file mode 100644 index f324586..0000000 --- a/.all-contributorsrc +++ /dev/null @@ -1,4 +0,0 @@ -{ - "projectName": "Open-Sora", - "projectOwner": "hpcaitech" -} diff --git a/.gitignore b/.gitignore index b9f8121..917be8b 100644 --- a/.gitignore +++ b/.gitignore @@ -181,3 +181,8 @@ cache/ hostfile gradio_cached_examples/ wandb/ + +# npm +node_modules/ +package-lock.json +package.json diff --git a/README.md b/README.md index 3b9af88..9c96bf1 100644 --- a/README.md +++ b/README.md @@ -332,6 +332,24 @@ following [all-contributors](https://github.com/all-contributors/all-contributor + + + + + + + + + + + + + + + + + +
zhengzangw
zhengzangw

💻 📖 🤔 📹 🚧
ver217
ver217

💻 🤔 📖 🐛
FrankLeeeee
FrankLeeeee

💻 🚇 🔧
xyupeng
xyupeng

💻 📖 🎨
Yanjia0
Yanjia0

📖
binmakeswell
binmakeswell

📖
eltociear
eltociear

📖
ganeshkrishnan1
ganeshkrishnan1

📖
fastalgo
fastalgo

📖
powerzbt
powerzbt

📖
From 7caa80823c82f68448cd15524a376bdc38733200 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Fri, 26 Apr 2024 15:48:27 +0800 Subject: [PATCH 17/26] added workflow for gallery (#326) * added workflow for gallery * polish --- .github/workflows/github_page.yaml | 42 ++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .github/workflows/github_page.yaml diff --git a/.github/workflows/github_page.yaml b/.github/workflows/github_page.yaml new file mode 100644 index 0000000..5a354fb --- /dev/null +++ b/.github/workflows/github_page.yaml @@ -0,0 +1,42 @@ +name: GitHub Pages + +on: + workflow_dispatch: + +jobs: + deploy: + runs-on: ubuntu-22.04 + permissions: + contents: write + concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + steps: + - uses: actions/checkout@v3 + ref: gallery + + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: 18.6 + + - name: Get yarn cache + id: yarn-cache + run: echo "YARN_CACHE_DIR=$(yarn cache dir)" >> "${GITHUB_OUTPUT}" + + - name: Cache dependencies + uses: actions/cache@v3 + with: + path: ${{ steps.yarn-cache.outputs.YARN_CACHE_DIR }} + key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }} + restore-keys: | + ${{ runner.os }}-yarn + + - run: yarn install + - run: yarn build + + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./build From 17e0ca4c2827732ec34115f0f0b0a77fd75eac39 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Fri, 26 Apr 2024 15:51:37 +0800 Subject: [PATCH 18/26] fixed workflow typo (#327) --- .github/workflows/github_page.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/github_page.yaml b/.github/workflows/github_page.yaml index 5a354fb..cc8f3a9 100644 --- a/.github/workflows/github_page.yaml +++ b/.github/workflows/github_page.yaml @@ -12,8 +12,8 @@ jobs: group: ${{ github.workflow }}-${{ github.ref }} steps: - uses: actions/checkout@v3 - 
ref: gallery - + with: + ref: gallery - name: Setup Node uses: actions/setup-node@v4 From cff6298f6f3639f0ecd6bdb1f27c6324517266ac Mon Sep 17 00:00:00 2001 From: "Zheng Zangwei (Alex Zheng)" Date: Fri, 26 Apr 2024 18:12:12 +0800 Subject: [PATCH 19/26] update caption readme (#331) --- tools/caption/README.md | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tools/caption/README.md b/tools/caption/README.md index b30856c..c3a3ad9 100644 --- a/tools/caption/README.md +++ b/tools/caption/README.md @@ -6,7 +6,7 @@ Human labeling of videos is expensive and time-consuming. We adopt powerful imag ## LLaVA Captioning -We extract three frames from the video for captioning. With batch inference, we can achieve 10 times speedup. With approximatly 720p resolution and 3 frames, the speed is 2~3 videos/s on 8 GPUs. If we resize the smaller side to 336, the speed can be 8 videos/s. +We extract three frames from the video for captioning. With batch inference, we can achieve 10 times speedup. With approximatly 720p resolution and 1 frames, the speed is 2~3 videos/s on 8 GPUs. If we resize the smaller side to 336, the speed can be 8 videos/s. In Open-Sora v1.1, to lower the cost, we use the 7B model. ### Requirement @@ -36,13 +36,18 @@ pip install flash-attn --no-build-isolation pip install colossalai decord ``` -Since only the 34B model's performance is comparable to GPT-4V, we only provide the usage of the 34B model. The 34B model is available [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b), or run our script and it will be downloaded automatically. - ### Usage -Prepare a csv file for processing. The csv file can be generated by `convert_dataset.py` according to its [documentation](/tools/datasets/README.md). Then, run the following command to generate captions for videos/images with LLaVA: +Prepare a csv file for processing. 
The csv file can be generated by `convert_dataset.py` according to its [documentation](/tools/datasets/README.md). Then, run the following command to generate captions for videos/images with Llava: ```bash +# caption with mistral-7B +torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video + +# caption with llava-34B +# NOTE: remember to enable flash attention for this model +torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 4 --tp-size 2 --model-path liuhaotian/llava-v1.6-34b --prompt image-3ex --flash-attention + # we run this on 8xH800 GPUs torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 4 --bs 16 @@ -51,14 +56,6 @@ torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv # can also caption images torchrun --nproc_per_node 2 --standalone -m tools.caption.caption_llava DATA.csv --tp-size 2 --dp-size 1 --bs 16 --prompt image-3ex - -# caption with llava-34B -# NOTE: remember to enable flash attention for this model -torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 4 --tp-size 2 --model-path liuhaotian/llava-v1.6-34b --prompt image-3ex --flash-attention - -# caption with mistral-7B -torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava DATA.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video -# bs can be 48 ``` Please note that you should add the `--flash-attention` flag when running with Llama-based Llava models as it provides speedup but do turn it off for mistral-based ones. Reasons can be found in [this issue](https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453). 
From 0b37f4bd636b7c602867da6d0fa60bd511a8ec43 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Sat, 27 Apr 2024 13:13:59 +0800 Subject: [PATCH 20/26] Hotfix/gallery build (#337) * fixed gallery build workflow * updated node version --- .github/workflows/github_page.yaml | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/.github/workflows/github_page.yaml b/.github/workflows/github_page.yaml index cc8f3a9..483c2ad 100644 --- a/.github/workflows/github_page.yaml +++ b/.github/workflows/github_page.yaml @@ -18,22 +18,10 @@ jobs: - name: Setup Node uses: actions/setup-node@v4 with: - node-version: 18.6 + node-version: 20 - - name: Get yarn cache - id: yarn-cache - run: echo "YARN_CACHE_DIR=$(yarn cache dir)" >> "${GITHUB_OUTPUT}" - - - name: Cache dependencies - uses: actions/cache@v3 - with: - path: ${{ steps.yarn-cache.outputs.YARN_CACHE_DIR }} - key: ${{ runner.os }}-yarn-${{ hashFiles('**/yarn.lock') }} - restore-keys: | - ${{ runner.os }}-yarn - - - run: yarn install - - run: yarn build + - run: npm install + - run: npm run build - name: Deploy uses: peaceiris/actions-gh-pages@v3 From 7f48fef81ef2ededf36d6f2e0ee659c8de60680f Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Sat, 27 Apr 2024 19:01:51 +0900 Subject: [PATCH 21/26] Update README.md (#334) experinece -> experience --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9c96bf1..0bddbc8 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the ## 🎥 Latest Demo -🔥 You can experinece Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora). More samples are available in our [Gallery](https://hpcaitech.github.io/Open-Sora/). +🔥 You can experience Open-Sora on our [🤗 Gradio application on Hugging Face](https://huggingface.co/spaces/hpcai-tech/open-sora). 
More samples are available in our [Gallery](https://hpcaitech.github.io/Open-Sora/). | **2s 240×426** | **2s 240×426** | | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -220,7 +220,7 @@ on improving the quality and text alignment. ### Gradio Demo -🔥 You can experinece Open-Sora on our [🤗 Gradio application](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face online. +🔥 You can experience Open-Sora on our [🤗 Gradio application](https://huggingface.co/spaces/hpcai-tech/open-sora) on Hugging Face online. If you want to deploy gradio locally, we have also provided a [Gradio application](./gradio) in this repository, you can use the following the command to start an interactive web application to experience video generation with Open-Sora. From ed04275d0c7bfdb2d6b93d50fa64e103e49542b1 Mon Sep 17 00:00:00 2001 From: fastalgo Date: Mon, 29 Apr 2024 21:30:24 +0800 Subject: [PATCH 22/26] Update README.md --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 0bddbc8..e278031 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,11 @@ ## Open-Sora: Democratizing Efficient Video Production for All -We present **Open-Sora**, an initiative dedicated to **efficiently** produce high-quality video and make the model, -tools and contents accessible to all. By embracing **open-source** principles, +We design and implement **Open-Sora**, an initiative dedicated to **efficiently** producing high-quality video. We hope to make the model, +tools and all details accessible to all. 
By embracing **open-source** principles, Open-Sora not only democratizes access to advanced video generation techniques, but also offers a -streamlined and user-friendly platform that simplifies the complexities of video production. -With Open-Sora, we aim to inspire innovation, creativity, and inclusivity in the realm of content creation. +streamlined and user-friendly platform that simplifies the complexities of video generation. +With Open-Sora, our goal is to foster innovation, creativity, and inclusivity within the field of content creation. [[中文文档]](/docs/zh_CN/README.md) [[潞晨云部署视频教程]](https://www.bilibili.com/video/BV141421R7Ag) @@ -77,7 +77,7 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts. * 📍 **Open-Sora 1.1** released. Model weights are available [here](). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](docs/report_02.md)** for more discussions. * 🔧 **Data processing pipeline v1.1** is released. An automatic [processing pipeline](#data-processing) from raw videos to (text, video clip) pairs is provided, including scene cutting $\rightarrow$ filtering(aesthetic, optical flow, OCR, etc.) $\rightarrow$ captioning $\rightarrow$ managing. With this tool, you can easily build your video dataset. -* ✅ Modified ST-DiT architecture includes rope positional encoding, qk norm, longer text length, etc. +* ✅ Improved ST-DiT architecture includes rope positional encoding, qk norm, longer text length, etc. * ✅ Support training with any resolution, aspect ratio, and duration (including images). * ✅ Support image and video conditioning and video editing, and thus support animating images, connecting videos, etc. * 📍 **Open-Sora 1.0** released. Model weights are available [here](#model-weights). With only 400K video clips and 200 H800 @@ -85,7 +85,7 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts. * ✅ Three-stage training from an image diffusion model to a video diffusion model. 
We provide the weights for each stage. * ✅ Support training acceleration including accelerated transformer, faster T5 and VAE, and sequence parallelism. - Open-Sora improve **55%** training speed when training on 64x512x512 videos. Details locates + Open-Sora improves **55%** training speed when training on 64x512x512 videos. Details locates at [acceleration.md](docs/acceleration.md). * 🔧 **Data preprocessing pipeline v1.0**, including [downloading](/tools/datasets/README.md), [video cutting](/tools/scenedetect/README.md), @@ -358,9 +358,6 @@ following [all-contributors](https://github.com/all-contributors/all-contributor If you wish to contribute to this project, you can refer to the [Contribution Guideline](./CONTRIBUTING.md). -[Zangwei Zheng](https://github.com/zhengzangw) and [Xiangyu Peng](https://github.com/xyupeng) equally contributed to -this work during their internship at [HPC-AI Tech](https://hpc-ai.com/). - ## Acknowledgement * [ColossalAI](https://github.com/hpcaitech/ColossalAI): A powerful large model parallel acceleration and optimization From 98388ee37a5661659745af7e75e14a8a9e11d3d6 Mon Sep 17 00:00:00 2001 From: Luo Yihang Date: Fri, 3 May 2024 13:48:28 +0800 Subject: [PATCH 23/26] [feature]: add dockerfile (#362) * add opensora dockerfile * update README --- README.md | 16 ++++++++++++++++ docker/Dockerfile | 25 +++++++++++++++++++++++++ docs/zh_CN/README.md | 15 +++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 docker/Dockerfile diff --git a/README.md b/README.md index e278031..b724310 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,8 @@ Other useful documents and links are listed below. ## Installation +### Install from Source + ```bash # create a virtual env conda create -n opensora python=3.10 @@ -180,6 +182,20 @@ cd Open-Sora pip install -v . ``` +### Use Docker + +Run the following command to build a docker image from Dockerfile provided. 
+ +```bash +docker build -t opensora ./docker +``` + +Run the following command to start the docker container in interactive mode. + +```bash +docker run -ti --gpus all -v {MOUNT_DIR}:/data opensora +``` + ## Model Weights ### Open-Sora 1.1 Model Weights diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..a361bfb --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,25 @@ +FROM hpcaitech/pytorch-cuda:2.1.0-12.1.0 + +# metainformation +LABEL org.opencontainers.image.source = "https://github.com/hpcaitech/Open-Sora" +LABEL org.opencontainers.image.licenses = "Apache License 2.0" +LABEL org.opencontainers.image.base.name = "docker.io/library/hpcaitech/pytorch-cuda:2.1.0-12.1.0" + +COPY . /workspace/Open-Sora + +# install library dependencies +RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y + +# install flash attention +RUN pip install flash-attn --no-build-isolation + +# install apex +RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git + +# install xformers +RUN pip install xformers --index-url https://download.pytorch.org/whl/cu121 + +# install this project +RUN git clone https://github.com/hpcaitech/Open-Sora && \ + cd Open-Sora && \ + pip install -v . \ No newline at end of file diff --git a/docs/zh_CN/README.md b/docs/zh_CN/README.md index 21f8c6d..d84d4d7 100644 --- a/docs/zh_CN/README.md +++ b/docs/zh_CN/README.md @@ -87,6 +87,7 @@ ## 安装 +### 从源码安装 ```bash # create a virtual env conda create -n opensora python=3.10 @@ -112,6 +113,20 @@ cd Open-Sora pip install -v . 
``` +### 使用Docker镜像 + +运行如下指令使用提供的Dockerfile构建镜像: + +```bash +docker build -t opensora ./docker +``` + +运行以下命令以启动交互模式下的 Docker 容器: + +```bash +docker run -ti --gpus all -v {MOUNT_DIR}:/data opensora +``` + 安装完成后,建议阅读[结构](structure.md),了解项目结构以及如何使用配置文件。 ## 模型权重 From 19911b99b8861b607221959b6d3778567dce32e7 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Mon, 6 May 2024 14:15:04 +0800 Subject: [PATCH 24/26] converted model to hf format (#366) --- configs/opensora-v1-1/inference/sample-ref.py | 4 +- configs/opensora-v1-1/inference/sample.py | 4 +- gradio/app.py | 5 +- opensora/models/layers/blocks.py | 20 +-- opensora/models/stdit/stdit2.py | 168 ++++++++++-------- requirements.txt | 2 +- scripts/inference-long.py | 1 - scripts/inference.py | 2 +- 8 files changed, 107 insertions(+), 99 deletions(-) diff --git a/configs/opensora-v1-1/inference/sample-ref.py b/configs/opensora-v1-1/inference/sample-ref.py index 735c01b..c896cae 100644 --- a/configs/opensora-v1-1/inference/sample-ref.py +++ b/configs/opensora-v1-1/inference/sample-ref.py @@ -38,10 +38,10 @@ reference_path = [ # Define model model = dict( type="STDiT2-XL/2", - from_pretrained=None, + from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", input_sq_size=512, qk_norm=True, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( diff --git a/configs/opensora-v1-1/inference/sample.py b/configs/opensora-v1-1/inference/sample.py index cec8073..29e63ca 100644 --- a/configs/opensora-v1-1/inference/sample.py +++ b/configs/opensora-v1-1/inference/sample.py @@ -7,10 +7,10 @@ multi_resolution = "STDiT2" # Define model model = dict( type="STDiT2-XL/2", - from_pretrained=None, + from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3", input_sq_size=512, qk_norm=True, - enable_flashattn=True, + enable_flash_attn=True, enable_layernorm_kernel=True, ) vae = dict( diff --git a/gradio/app.py b/gradio/app.py index 181c059..726b244 100644 --- a/gradio/app.py +++ b/gradio/app.py @@ -255,9 
+255,9 @@ def build_models(model_type, config, enable_optimization=False): # build stdit # we load model from HuggingFace directly so that we don't need to # handle model download logic in HuggingFace Space - from transformers import AutoModel + from opensora.models.stdit.stdit2 import STDiT2 - stdit = AutoModel.from_pretrained( + stdit = STDiT2.from_pretrained( HF_STDIT_MAP[model_type], enable_flash_attn=enable_optimization, trust_remote_code=True, @@ -410,7 +410,6 @@ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference # save image to disk from PIL import Image im = Image.fromarray(reference_image) - idx = os.environ['CUDA_VISIBLE_DEVICES'] with NamedTemporaryFile(suffix=".jpg") as temp_file: im.save(temp_file.name) diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py index 55d874a..c4dd5c3 100644 --- a/opensora/models/layers/blocks.py +++ b/opensora/models/layers/blocks.py @@ -139,7 +139,7 @@ class Attention(nn.Module): attn_drop: float = 0.0, proj_drop: float = 0.0, norm_layer: nn.Module = LlamaRMSNorm, - enable_flashattn: bool = False, + enable_flash_attn: bool = False, rope=None, ) -> None: super().__init__() @@ -148,7 +148,7 @@ class Attention(nn.Module): self.num_heads = num_heads self.head_dim = dim // num_heads self.scale = self.head_dim**-0.5 - self.enable_flashattn = enable_flashattn + self.enable_flash_attn = enable_flash_attn self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity() @@ -165,7 +165,7 @@ class Attention(nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape # flash attn is not memory efficient for small sequences, this is empirical - enable_flashattn = self.enable_flashattn and (N > B) + enable_flash_attn = self.enable_flash_attn and (N > B) qkv = self.qkv(x) qkv_shape = (B, N, 3, self.num_heads, self.head_dim) @@ -177,7 +177,7 @@ class Attention(nn.Module): k = self.rotary_emb(k) q, k = 
self.q_norm(q), self.k_norm(k) - if enable_flashattn: + if enable_flash_attn: from flash_attn import flash_attn_func # (B, #heads, N, #dim) -> (B, N, #heads, #dim) @@ -202,7 +202,7 @@ class Attention(nn.Module): x = attn @ v x_output_shape = (B, N, C) - if not enable_flashattn: + if not enable_flash_attn: x = x.transpose(1, 2) x = x.reshape(x_output_shape) x = self.proj(x) @@ -220,7 +220,7 @@ class SeqParallelAttention(Attention): attn_drop: float = 0.0, proj_drop: float = 0.0, norm_layer: nn.Module = LlamaRMSNorm, - enable_flashattn: bool = False, + enable_flash_attn: bool = False, rope=None, ) -> None: assert rope is None, "Rope is not supported in SeqParallelAttention" @@ -232,7 +232,7 @@ class SeqParallelAttention(Attention): attn_drop=attn_drop, proj_drop=proj_drop, norm_layer=norm_layer, - enable_flashattn=enable_flashattn, + enable_flash_attn=enable_flash_attn, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -248,7 +248,7 @@ class SeqParallelAttention(Attention): # [B, SUB_N, 3, NUM_HEAD, HEAD_DIM] -> [B, N, 3, NUM_HEAD_PER_DEVICE, HEAD_DIM] qkv = all_to_all(qkv, sp_group, scatter_dim=3, gather_dim=1) - if self.enable_flashattn: + if self.enable_flash_attn: qkv_permute_shape = ( 2, 0, @@ -269,7 +269,7 @@ class SeqParallelAttention(Attention): # ERROR: Should qk_norm first q, k, v = qkv.unbind(0) q, k = self.q_norm(q), self.k_norm(k) - if self.enable_flashattn: + if self.enable_flash_attn: from flash_attn import flash_attn_func x = flash_attn_func( @@ -289,7 +289,7 @@ class SeqParallelAttention(Attention): attn = self.attn_drop(attn) x = attn @ v - if not self.enable_flashattn: + if not self.enable_flash_attn: x = x.transpose(1, 2) # apply all to all to gather back attention heads and split sequence diff --git a/opensora/models/stdit/stdit2.py b/opensora/models/stdit/stdit2.py index 73fe276..afcc0c1 100644 --- a/opensora/models/stdit/stdit2.py +++ b/opensora/models/stdit/stdit2.py @@ -1,6 +1,5 @@ import numpy as np import torch -import 
torch.distributed as dist import torch.nn as nn from einops import rearrange from rotary_embedding_torch import RotaryEmbedding @@ -8,16 +7,12 @@ from timm.models.layers import DropPath from timm.models.vision_transformer import Mlp from opensora.acceleration.checkpoint import auto_grad_checkpoint -from opensora.acceleration.communications import gather_forward_split_backward, split_forward_gather_backward -from opensora.acceleration.parallel_states import get_sequence_parallel_group from opensora.models.layers.blocks import ( Attention, CaptionEmbedder, MultiHeadCrossAttention, PatchEmbed3D, PositionEmbedding2D, - SeqParallelAttention, - SeqParallelMultiHeadCrossAttention, SizeEmbedder, T2IFinalLayer, TimestepEmbedder, @@ -27,7 +22,7 @@ from opensora.models.layers.blocks import ( t2i_modulate, ) from opensora.registry import MODELS -from opensora.utils.ckpt_utils import load_checkpoint +from transformers import PretrainedConfig, PreTrainedModel class STDiT2Block(nn.Module): @@ -37,7 +32,7 @@ class STDiT2Block(nn.Module): num_heads, mlp_ratio=4.0, drop_path=0.0, - enable_flashattn=False, + enable_flash_attn=False, enable_layernorm_kernel=False, enable_sequence_parallelism=False, rope=None, @@ -45,30 +40,22 @@ class STDiT2Block(nn.Module): ): super().__init__() self.hidden_size = hidden_size - self.enable_flashattn = enable_flashattn + self.enable_flash_attn = enable_flash_attn self._enable_sequence_parallelism = enable_sequence_parallelism - assert not self._enable_sequence_parallelism, "Sequence parallelism is not supported." 
- if enable_sequence_parallelism: - self.attn_cls = SeqParallelAttention - self.mha_cls = SeqParallelMultiHeadCrossAttention - else: - self.attn_cls = Attention - self.mha_cls = MultiHeadCrossAttention - # spatial branch self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) - self.attn = self.attn_cls( + self.attn = Attention( hidden_size, num_heads=num_heads, qkv_bias=True, - enable_flashattn=enable_flashattn, + enable_flash_attn=enable_flash_attn, qk_norm=qk_norm, ) self.scale_shift_table = nn.Parameter(torch.randn(6, hidden_size) / hidden_size**0.5) # cross attn - self.cross_attn = self.mha_cls(hidden_size, num_heads) + self.cross_attn = MultiHeadCrossAttention(hidden_size, num_heads) # mlp branch self.norm2 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) @@ -79,11 +66,11 @@ class STDiT2Block(nn.Module): # temporal branch self.norm_temp = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) # new - self.attn_temp = self.attn_cls( + self.attn_temp = Attention( hidden_size, num_heads=num_heads, qkv_bias=True, - enable_flashattn=self.enable_flashattn, + enable_flash_attn=self.enable_flash_attn, rope=rope, qk_norm=qk_norm, ) @@ -174,8 +161,10 @@ class STDiT2Block(nn.Module): return x -@MODELS.register_module() -class STDiT2(nn.Module): +class STDiT2Config(PretrainedConfig): + + model_type = "STDiT2" + def __init__( self, input_size=(None, None, None), @@ -192,45 +181,73 @@ class STDiT2(nn.Module): no_temporal_pos_emb=False, caption_channels=4096, model_max_length=120, - dtype=torch.float32, freeze=None, qk_norm=False, - enable_flashattn=False, + enable_flash_attn=False, enable_layernorm_kernel=False, - enable_sequence_parallelism=False, + **kwargs, ): - super().__init__() - self.pred_sigma = pred_sigma - self.in_channels = in_channels - self.out_channels = in_channels * 2 if pred_sigma else in_channels - self.hidden_size = hidden_size - 
self.num_heads = num_heads - self.dtype = dtype - self.no_temporal_pos_emb = no_temporal_pos_emb - self.depth = depth - self.mlp_ratio = mlp_ratio - self.enable_flashattn = enable_flashattn - self.enable_layernorm_kernel = enable_layernorm_kernel - - # support dynamic input - self.patch_size = patch_size self.input_size = input_size self.input_sq_size = input_sq_size - self.pos_embed = PositionEmbedding2D(hidden_size) + self.in_channels = in_channels + self.patch_size = patch_size + self.hidden_size = hidden_size + self.depth = depth + self.num_heads = num_heads + self.mlp_ratio = mlp_ratio + self.class_dropout_prob = class_dropout_prob + self.pred_sigma = pred_sigma + self.drop_path = drop_path + self.no_temporal_pos_emb = no_temporal_pos_emb + self.caption_channels = caption_channels + self.model_max_length = model_max_length + self.freeze = freeze + self.qk_norm = qk_norm + self.enable_flash_attn = enable_flash_attn + self.enable_layernorm_kernel = enable_layernorm_kernel + super().__init__(**kwargs) - self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size) - self.t_embedder = TimestepEmbedder(hidden_size) - self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)) - self.t_block_temp = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 3 * hidden_size, bias=True)) # new + +@MODELS.register_module() +class STDiT2(PreTrainedModel): + + config_class = STDiT2Config + + def __init__( + self, + config + ): + super().__init__(config) + self.pred_sigma = config.pred_sigma + self.in_channels = config.in_channels + self.out_channels = config.in_channels * 2 if config.pred_sigma else config.in_channels + self.hidden_size = config.hidden_size + self.num_heads = config.num_heads + self.no_temporal_pos_emb = config.no_temporal_pos_emb + self.depth = config.depth + self.mlp_ratio = config.mlp_ratio + self.enable_flash_attn = config.enable_flash_attn + self.enable_layernorm_kernel = config.enable_layernorm_kernel + + # support 
dynamic input + self.patch_size = config.patch_size + self.input_size = config.input_size + self.input_sq_size = config.input_sq_size + self.pos_embed = PositionEmbedding2D(config.hidden_size) + + self.x_embedder = PatchEmbed3D(config.patch_size, config.in_channels, config.hidden_size) + self.t_embedder = TimestepEmbedder(config.hidden_size) + self.t_block = nn.Sequential(nn.SiLU(), nn.Linear(config.hidden_size, 6 * config.hidden_size, bias=True)) + self.t_block_temp = nn.Sequential(nn.SiLU(), nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=True)) # new self.y_embedder = CaptionEmbedder( - in_channels=caption_channels, - hidden_size=hidden_size, - uncond_prob=class_dropout_prob, + in_channels=config.caption_channels, + hidden_size=config.hidden_size, + uncond_prob=config.class_dropout_prob, act_layer=approx_gelu, - token_num=model_max_length, + token_num=config.model_max_length, ) - drop_path = [x.item() for x in torch.linspace(0, drop_path, depth)] + drop_path = [x.item() for x in torch.linspace(0, config.drop_path, config.depth)] self.rope = RotaryEmbedding(dim=self.hidden_size // self.num_heads) # new self.blocks = nn.ModuleList( [ @@ -239,16 +256,15 @@ class STDiT2(nn.Module): self.num_heads, mlp_ratio=self.mlp_ratio, drop_path=drop_path[i], - enable_flashattn=self.enable_flashattn, + enable_flash_attn=self.enable_flash_attn, enable_layernorm_kernel=self.enable_layernorm_kernel, - enable_sequence_parallelism=enable_sequence_parallelism, rope=self.rope.rotate_queries_or_keys, - qk_norm=qk_norm, + qk_norm=config.qk_norm, ) for i in range(self.depth) ] ) - self.final_layer = T2IFinalLayer(hidden_size, np.prod(self.patch_size), self.out_channels) + self.final_layer = T2IFinalLayer(config.hidden_size, np.prod(self.patch_size), self.out_channels) # multi_res assert self.hidden_size % 3 == 0, "hidden_size must be divisible by 3" @@ -260,20 +276,13 @@ class STDiT2(nn.Module): # init model self.initialize_weights() self.initialize_temporal() - if freeze is 
not None: - assert freeze in ["not_temporal", "text"] - if freeze == "not_temporal": + if config.freeze is not None: + assert config.freeze in ["not_temporal", "text"] + if config.freeze == "not_temporal": self.freeze_not_temporal() - elif freeze == "text": + elif config.freeze == "text": self.freeze_text() - # sequence parallel related configs - self.enable_sequence_parallelism = enable_sequence_parallelism - if enable_sequence_parallelism: - self.sp_rank = dist.get_rank(get_sequence_parallel_group()) - else: - self.sp_rank = None - def get_dynamic_size(self, x): _, _, T, H, W = x.size() if T % self.patch_size[0] != 0: @@ -302,9 +311,10 @@ class STDiT2(nn.Module): x (torch.Tensor): output latent representation; of shape [B, C, T, H, W] """ B = x.shape[0] - x = x.to(self.dtype) - timestep = timestep.to(self.dtype) - y = y.to(self.dtype) + dtype = self.x_embedder.proj.weight.dtype + x = x.to(dtype) + timestep = timestep.to(dtype) + y = y.to(dtype) # === process data info === # 1. get dynamic size @@ -337,10 +347,6 @@ class STDiT2(nn.Module): x = x + pos_emb x = rearrange(x, "B T S C -> B (T S) C") - # shard over the sequence dim if sp is enabled - if self.enable_sequence_parallelism: - x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down") - # prepare adaIN t = self.t_embedder(timestep, dtype=x.dtype) # [B, C] t_spc = t + data_info # [B, C] @@ -388,10 +394,7 @@ class STDiT2(nn.Module): T, S, ) - - if self.enable_sequence_parallelism: - x = gather_forward_split_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="up") - # x.shape: [B, N, C] + # x.shape: [B, N, C] # final process x = self.final_layer(x, t, x_mask, t0_spc, T, S) # [B, N, C=T_p * H_p * W_p * C_out] @@ -498,7 +501,14 @@ class STDiT2(nn.Module): @MODELS.register_module("STDiT2-XL/2") def STDiT2_XL_2(from_pretrained=None, **kwargs): - model = STDiT2(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) if from_pretrained is not None: - 
load_checkpoint(model, from_pretrained) + model = STDiT2.from_pretrained(from_pretrained, **kwargs) + else: + config = STDiT2Config( + depth=28, + hidden_size=1152, + patch_size=(1, 2, 2), + num_heads=16, **kwargs + ) + model = STDiT2(config) return model diff --git a/requirements.txt b/requirements.txt index e8031a8..f8d7948 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ mmengine pandas pre-commit pyarrow -pyav +av tensorboard timm tqdm diff --git a/scripts/inference-long.py b/scripts/inference-long.py index eec19cf..f77ebad 100644 --- a/scripts/inference-long.py +++ b/scripts/inference-long.py @@ -168,7 +168,6 @@ def main(): in_channels=vae.out_channels, caption_channels=text_encoder.output_dim, model_max_length=text_encoder.model_max_length, - dtype=dtype, enable_sequence_parallelism=enable_sequence_parallelism, ) text_encoder.y_embedder = model.y_embedder # hack for classifier-free guidance diff --git a/scripts/inference.py b/scripts/inference.py index 7a83ead..369379e 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -55,6 +55,7 @@ def main(): vae = build_module(cfg.vae, MODELS) latent_size = vae.get_latent_size(input_size) text_encoder = build_module(cfg.text_encoder, MODELS, device=device) # T5 must be fp32 + model = build_module( cfg.model, MODELS, @@ -62,7 +63,6 @@ def main(): in_channels=vae.out_channels, caption_channels=text_encoder.output_dim, model_max_length=text_encoder.model_max_length, - dtype=dtype, enable_sequence_parallelism=enable_sequence_parallelism, ) text_encoder.y_embedder = model.y_embedder # hack for classifier-free guidance From cb90d06142e90cab96b53518e84e996e29813267 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Mon, 6 May 2024 15:27:15 +0800 Subject: [PATCH 25/26] fixed readme (#368) --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index b724310..6116f8b 100644 --- a/README.md +++ b/README.md @@ -53,10 +53,6 @@ With Open-Sora, our goal is to foster 
innovation, creativity, and inclusivity wi | **16s 320×320** | **16s 224×448** | **2s 426×240** | | ------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/3cab536e-9b43-4b33-8da8-a0f9cf842ff2) | [](https://github.com/hpcaitech/Open-Sora/assets/99191637/9fb0b9e0-c6f4-4935-b29e-4cac10b373c4) | [](https://github.com/hpcaitech/Open-Sora-dev/assets/99191637/3e892ad2-9543-4049-b005-643a4c1bf3bf) | -<<<<<<< Updated upstream -======= - ->>>>>>> Stashed changes
OpenSora 1.0 Demo From d599fd75b32317121aa3acdd220dd955ef9095b0 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Tue, 7 May 2024 10:37:06 +0800 Subject: [PATCH 26/26] adapted pretrained model to training (#371) --- opensora/models/stdit/stdit2.py | 18 +++++++++++++++++- scripts/train.py | 3 +-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/opensora/models/stdit/stdit2.py b/opensora/models/stdit/stdit2.py index afcc0c1..5de1769 100644 --- a/opensora/models/stdit/stdit2.py +++ b/opensora/models/stdit/stdit2.py @@ -1,6 +1,7 @@ import numpy as np import torch import torch.nn as nn +import os from einops import rearrange from rotary_embedding_torch import RotaryEmbedding from timm.models.layers import DropPath @@ -23,6 +24,7 @@ from opensora.models.layers.blocks import ( ) from opensora.registry import MODELS from transformers import PretrainedConfig, PreTrainedModel +from opensora.utils.ckpt_utils import load_checkpoint class STDiT2Block(nn.Module): @@ -502,8 +504,22 @@ class STDiT2(PreTrainedModel): @MODELS.register_module("STDiT2-XL/2") def STDiT2_XL_2(from_pretrained=None, **kwargs): if from_pretrained is not None: - model = STDiT2.from_pretrained(from_pretrained, **kwargs) + if os.path.isdir(from_pretrained) or os.path.isfile(from_pretrained): + # if it is a directory or a file, we load the checkpoint manually + config = STDiT2Config( + depth=28, + hidden_size=1152, + patch_size=(1, 2, 2), + num_heads=16, **kwargs + ) + model = STDiT2(config) + load_checkpoint(model, from_pretrained) + return model + else: + # otherwise, we load the model from hugging face hub + return STDiT2.from_pretrained(from_pretrained) else: + # create a new model config = STDiT2Config( depth=28, hidden_size=1152, diff --git a/scripts/train.py b/scripts/train.py index aed7b45..bfb6d39 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -133,8 +133,7 @@ def main(): input_size=latent_size, in_channels=vae.out_channels, caption_channels=text_encoder.output_dim, - 
model_max_length=text_encoder.model_max_length, - dtype=dtype, + model_max_length=text_encoder.model_max_length ) model_numel, model_numel_trainable = get_model_numel(model) logger.info(