From 4d243955caf45a8e6d15128132e1438c0bdad27f Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Mon, 17 Jun 2024 14:59:33 +0800 Subject: [PATCH 1/4] [checkpoint] adapted vae to hf (#145) --- configs/opensora-v1-2/inference/sample.py | 2 +- opensora/models/vae/vae.py | 106 ++++++++++++++-------- opensora/utils/ckpt_utils.py | 10 +- 3 files changed, 72 insertions(+), 46 deletions(-) diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py index f07587c..5a93076 100644 --- a/configs/opensora-v1-2/inference/sample.py +++ b/configs/opensora-v1-2/inference/sample.py @@ -22,7 +22,7 @@ model = dict( ) vae = dict( type="OpenSoraVAE_V1_2", - from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline", + from_pretrained="hpcai-tech/OpenSora-VAE-v1.2", micro_frame_size=17, micro_batch_size=4, ) diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py index f5769a2..f9823d9 100644 --- a/opensora/models/vae/vae.py +++ b/opensora/models/vae/vae.py @@ -1,3 +1,4 @@ +import os import torch import torch.nn as nn from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder @@ -5,6 +6,7 @@ from einops import rearrange from opensora.registry import MODELS, build_module from opensora.utils.ckpt_utils import load_checkpoint +from transformers import PretrainedConfig, PreTrainedModel @MODELS.register_module() @@ -115,9 +117,9 @@ class VideoAutoencoderKLTemporalDecoder(nn.Module): def dtype(self): return next(self.parameters()).dtype - -@MODELS.register_module() -class VideoAutoencoderPipeline(nn.Module): +class VideoAutoencoderPipelineConfig(PretrainedConfig): + model_type = "VideoAutoencoderPipeline" + def __init__( self, vae_2d=None, @@ -128,25 +130,43 @@ class VideoAutoencoderPipeline(nn.Module): micro_frame_size=None, shift=0.0, scale=1.0, + **kwargs ): - super().__init__() - self.spatial_vae = build_module(vae_2d, MODELS) - self.temporal_vae = build_module(vae_temporal, MODELS) + self.vae_2d = vae_2d + self.vae_temporal = 
vae_temporal + self.from_pretrained = from_pretrained + self.freeze_vae_2d = freeze_vae_2d self.cal_loss = cal_loss self.micro_frame_size = micro_frame_size - self.micro_z_frame_size = self.temporal_vae.get_latent_size([micro_frame_size, None, None])[0] + self.shift = shift + self.scale = scale + super().__init__(**kwargs) - if from_pretrained is not None: - load_checkpoint(self, from_pretrained) - if freeze_vae_2d: + +@MODELS.register_module() +class VideoAutoencoderPipeline(PreTrainedModel): + config_class = VideoAutoencoderPipelineConfig + + def __init__( + self, + config: VideoAutoencoderPipelineConfig + ): + super().__init__(config=config) + self.spatial_vae = build_module(config.vae_2d, MODELS) + self.temporal_vae = build_module(config.vae_temporal, MODELS) + self.cal_loss = config.cal_loss + self.micro_frame_size = config.micro_frame_size + self.micro_z_frame_size = self.temporal_vae.get_latent_size([config.micro_frame_size, None, None])[0] + + if config.freeze_vae_2d: for param in self.spatial_vae.parameters(): param.requires_grad = False self.out_channels = self.temporal_vae.out_channels # normalization parameters - scale = torch.tensor(scale) - shift = torch.tensor(shift) + scale = torch.tensor(config.scale) + shift = torch.tensor(config.shift) if len(scale.shape) > 0: scale = scale[None, :, None, None, None] if len(shift.shape) > 0: @@ -225,38 +245,44 @@ class VideoAutoencoderPipeline(nn.Module): def dtype(self): return next(self.parameters()).dtype - @MODELS.register_module() -class OpenSoraVAE_V1_2(VideoAutoencoderPipeline): - def __init__( - self, - micro_batch_size=4, - micro_frame_size=17, - from_pretrained=None, - local_files_only=False, - freeze_vae_2d=False, - cal_loss=False, - ): - vae_2d = dict( +def OpenSoraVAE_V1_2( + micro_batch_size=4, + micro_frame_size=17, + from_pretrained=None, + local_files_only=False, + freeze_vae_2d=False, + cal_loss=False, +): + vae_2d = dict( type="VideoAutoencoderKL", 
from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers", subfolder="vae", micro_batch_size=micro_batch_size, local_files_only=local_files_only, ) - vae_temporal = dict( - type="VAE_Temporal_SD", - from_pretrained=None, - ) - shift = (-0.10, 0.34, 0.27, 0.98) - scale = (3.85, 2.32, 2.33, 3.06) - super().__init__( - vae_2d, - vae_temporal, - from_pretrained, - freeze_vae_2d=freeze_vae_2d, - cal_loss=cal_loss, - micro_frame_size=micro_frame_size, - shift=shift, - scale=scale, - ) + vae_temporal = dict( + type="VAE_Temporal_SD", + from_pretrained=None, + ) + shift = (-0.10, 0.34, 0.27, 0.98) + scale = (3.85, 2.32, 2.33, 3.06) + kwargs = dict( + vae_2d=vae_2d, + vae_temporal=vae_temporal, + freeze_vae_2d=freeze_vae_2d, + cal_loss=cal_loss, + micro_frame_size=micro_frame_size, + shift=shift, + scale=scale + ) + + if from_pretrained is not None and not os.path.isdir(from_pretrained): + model = VideoAutoencoderPipeline.from_pretrained(from_pretrained, **kwargs) + else: + config = VideoAutoencoderPipelineConfig(**kwargs) + model = VideoAutoencoderPipeline(config) + + if from_pretrained: + load_checkpoint(model, from_pretrained) + return model diff --git a/opensora/utils/ckpt_utils.py b/opensora/utils/ckpt_utils.py index 8ecd350..b2ac5e2 100644 --- a/opensora/utils/ckpt_utils.py +++ b/opensora/utils/ckpt_utils.py @@ -145,9 +145,9 @@ def download_model(model_name=None, local_path=None, url=None): return model -def load_from_sharded_state_dict(model, ckpt_path, model_name="model"): +def load_from_sharded_state_dict(model, ckpt_path, model_name="model", strict=False): ckpt_io = GeneralCheckpointIO() - ckpt_io.load_model(model, os.path.join(ckpt_path, model_name)) + ckpt_io.load_model(model, os.path.join(ckpt_path, model_name), strict=strict) def model_sharding(model: torch.nn.Module): @@ -187,14 +187,14 @@ def record_model_param_shape(model: torch.nn.Module) -> dict: return param_shape -def load_checkpoint(model, ckpt_path, save_as_pt=False, model_name="model"): +def 
load_checkpoint(model, ckpt_path, save_as_pt=False, model_name="model", strict=False): if ckpt_path.endswith(".pt") or ckpt_path.endswith(".pth"): state_dict = find_model(ckpt_path, model=model) - missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) + missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=strict) get_logger().info("Missing keys: %s", missing_keys) get_logger().info("Unexpected keys: %s", unexpected_keys) elif os.path.isdir(ckpt_path): - load_from_sharded_state_dict(model, ckpt_path, model_name) + load_from_sharded_state_dict(model, ckpt_path, model_name, strict=strict) get_logger().info("Model checkpoint loaded from %s", ckpt_path) if save_as_pt: save_path = os.path.join(ckpt_path, model_name + "_ckpt.pt") From 41f772d1f8c12497975fbc1ed05d878a72e5b7ba Mon Sep 17 00:00:00 2001 From: zhengzangw Date: Mon, 17 Jun 2024 07:01:27 +0000 Subject: [PATCH 2/4] [docs] update readme --- README.md | 46 ++++++++++++++++++++++++++++--- docs/commands.md | 12 ++++++++ opensora/utils/inference_utils.py | 2 +- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c010e2c..1bb48e5 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts. * 📍 **Open-Sora 1.2** released. Model weights are available [here](#model-weights). See our **[report 1.2](docs/report_03.md)** for more details. * ✅ Support rectified flow scheduling. +* ✅ Support more conditioning including fps, aesthetic score, motion strength and camera motion. * ✅ Trained our 3D-VAE for temporal dimension compression. * 📍 **Open-Sora 1.1** released. Model weights are available [here](#model-weights). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](/docs/report_02.md)** for more discussions. * 🔧 **Data processing pipeline v1.1** is released. 
An automatic [processing pipeline](#data-processing) from raw videos to (text, video clip) pairs is provided, including scene cutting $\rightarrow$ filtering(aesthetic, optical flow, OCR, etc.) $\rightarrow$ captioning $\rightarrow$ managing. With this tool, you can easily build your video dataset. @@ -206,9 +207,10 @@ docker run -ti --gpus all -v {MOUNT_DIR}:/data opensora ### Open-Sora 1.2 Model Weights -| Resolution | Model Size | Data | #iterations | Batch Size | URL | -| ---------- | ---------- | ---- | ----------- | ---------- | --- | -| TBD | +| Model | Model Size | Data | #iterations | Batch Size | URL | +| --------- | ---------- | ---- | ----------- | ---------- | ------------------------------------------------------------- | +| Diffusion | 1.1B | 30M | 70k | Dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) | +| VAE | 384M | | | | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) | See our **[report 1.2](docs/report_03.md)** for more infomation. @@ -304,6 +306,26 @@ For more advanced usage, you can refer to [Gradio README](./gradio/README.md#adv ### Open-Sora 1.2 Command Line Inference +The basic command line inference is as follows: + +```bash +# text to video +python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ + --num-frames 4s --resolution 720p \ + --prompt "a beautiful waterfall" +``` + +You can add more options to the command line to customize the generation. + +```bash +python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ + --num-frames 4s --resolution 720p \ + --num-sampling-steps 30 --flow 5 --aes 6.5 \ + --prompt "a beautiful waterfall" +``` + +For image to video generation and other functionalities, the API is compatible with Open-Sora 1.1. See [here](docs/commands.md) for more instructions. + ### GPT-4o Prompt Refinement We find that GPT-4o can refine the prompt and improve the quality of the generated video. 
With this feature, you can also use other language (e.g., Chinese) as the prompt. To enable this feature, you need prepare your openai api key in the environment: @@ -312,7 +334,12 @@ We find that GPT-4o can refine the prompt and improve the quality of the generat export OPENAI_API_KEY=YOUR_API_KEY ``` -Then you can inference with `--llm-refine True` to enable the GPT-4o prompt refinement. +Then you can run inference with `--llm-refine True` to enable the GPT-4o prompt refinement, or leave the prompt empty to get a random prompt generated by GPT-4o. + +```bash +python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ + --num-frames 4s --resolution 720p --llm-refine True +``` ### Open-Sora 1.1 Command Line Inference @@ -376,6 +403,17 @@ Also check out the [datasets](docs/datasets.md) we use. ### Open-Sora 1.2 Training +The training process is the same as Open-Sora 1.1. + +```bash +# one node +torchrun --standalone --nproc_per_node 8 scripts/train.py \ + configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT +# multiple nodes +colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \ + configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT +``` + ### Open-Sora 1.1 Training
diff --git a/docs/commands.md b/docs/commands.md index c8dabe6..536e1a7 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -15,6 +15,18 @@ You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos). +### Inference with Open-Sora 1.2 + +The inference API is compatible with Open-Sora 1.1. To ease users' experience, we add support to `--resolution` and `--aspect-ratio` options, which is a more user-friendly way to specify the image size. + +```bash +python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ + --resolution 480p --aspect-ratio 9:16 +# equivalent to +python scripts/inference.py configs/opensora-v1-2/inference/sample.py \ + --image-size 480 853 +``` + ### Inference with Open-Sora 1.1 Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument. diff --git a/opensora/utils/inference_utils.py b/opensora/utils/inference_utils.py index fcf24ca..b9536a7 100644 --- a/opensora/utils/inference_utils.py +++ b/opensora/utils/inference_utils.py @@ -45,7 +45,7 @@ def get_save_path_name( ): if sample_name is None: sample_name = "" if prompt_as_path else "sample" - sample_name_suffix = prompt if prompt_as_path else f"_{sample_idx}" + sample_name_suffix = prompt if prompt_as_path else f"_{sample_idx:04d}" save_path = os.path.join(save_dir, f"{sample_name}{sample_name_suffix}") if num_sample != 1: save_path = f"{save_path}-{k}" From a8225d7f44bf82a29969ba434b48b42f3fdc4d6f Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Mon, 17 Jun 2024 15:28:06 +0800 Subject: [PATCH 3/4] [gradio] updated gradio doc (#146) --- gradio/README.md | 38 +++++++++++++++++++++++--------------- gradio/app.py | 9 ++++++--- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/gradio/README.md b/gradio/README.md index a785cde..609bd7c 100644 --- a/gradio/README.md +++ b/gradio/README.md @@ -1,3 +1,20 @@ +--- +title: Open Sora +emoji: 🎥 
+colorFrom: red +colorTo: purple +sdk: gradio +sdk_version: 4.25.0 +app_file: app.py +pinned: false +license: apache-2.0 +preload_from_hub: + - hpcai-tech/OpenSora-STDiT-v3 + - hpcai-tech/OpenSora-VAE-v1.2 + - DeepFloyd/t5-v1_1-xxl +--- + + # 🕹 Gradio Demo We have provided a Gradio demo app for you to generate videos via a web interface. You can choose to run it locally or deploy it to Hugging Face by following the instructions given below. @@ -12,17 +29,14 @@ We assume that you have already installed `opensora` based on the instructions g pip install gradio spaces ``` -2. Afterwards, you can use the following command to launch different models. Remember to launch the command in the project root directory instead of the `gradio` folder. +2. Afterwards, you can use the following command to launch the application. Remember to launch the command in the project root directory instead of the `gradio` folder. ```bash -# run the default model v1-HQ-16x256x256 +# start the gradio app python gradio/app.py -# run the model with higher resolution -python gradio/app.py --model-type v1-HQ-16x512x512 - -# run with a different host and port -python gradio/app.py --port 8000 --host 0.0.0.0 +# run with a different port +python gradio/app.py --port 8000 # run with acceleration such as flash attention and fused norm python gradio/app.py --enable-optimization @@ -45,13 +59,7 @@ We have also tested this Gradio app on Hugging Face Spaces. You can follow the s ```text - configs - - opensora - - inference - - 16x256x256.py - - 16x512x512.py - - 64x512x512.py - ... - ... + - ... - app.py - requirements.txt - README.md @@ -63,7 +71,7 @@ We have also tested this Gradio app on Hugging Face Spaces. You can follow the s ## Advanced Usage -![Gradio Demo](/assets/readme/gradio_advanced.png) +![Gradio Demo](../assets/readme/gradio_advanced.png) For the "**FPS**" option, as now we fix the output video's FPS to 24, this option will not affect the output video's length. 
Thus, for a smaller FPS, the video is supposed to be longer but accelerated due to 24 FPS. Thus, the video will be less smooth but faster. For a larger FPS, the video will be smoother but slower. diff --git a/gradio/app.py b/gradio/app.py index 7d1a88c..3973a48 100644 --- a/gradio/app.py +++ b/gradio/app.py @@ -197,6 +197,10 @@ vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enab def run_inference(mode, prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale): + if prompt_text is None or prompt_text == "": + gr.Warning("Your prompt is empty, please enter a valid prompt") + return None + torch.manual_seed(seed) with torch.inference_mode(): # ====================== @@ -496,11 +500,10 @@ def main(): prompt_text = gr.Textbox( label="Prompt", placeholder="Describe your video here", - info="Empty prompt will mean random prompt from OpenAI.", - lines=4, + lines=4 ) refine_prompt = gr.Checkbox(value=True, label="Refine prompt with GPT4o") - random_prompt_btn = gr.Button("Random Prompt") + random_prompt_btn = gr.Button("Random Prompt By GPT4o") gr.Markdown("## Basic Settings") resolution = gr.Radio( From 82dbaf7caa5611d10ae2b74a25f96aa23117c6a8 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Mon, 17 Jun 2024 15:54:34 +0800 Subject: [PATCH 4/4] removed dummy file (#147) --- --llm-refine | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 --llm-refine diff --git a/--llm-refine b/--llm-refine deleted file mode 100644 index e69de29..0000000