Merge branch 'dev/v1.2' of https://github.com/hpcaitech/Open-Sora-dev into dev/v1.2

Tom Young 2024-06-17 09:06:33 +00:00
commit c4430fe744
9 changed files with 156 additions and 69 deletions

@@ -77,6 +77,7 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts.
* 📍 **Open-Sora 1.2** released. Model weights are available [here](#model-weights). See our **[report 1.2](docs/report_03.md)** for more details.
* ✅ Support rectified flow scheduling.
* ✅ Support more conditioning including fps, aesthetic score, motion strength and camera motion.
* ✅ Trained our 3D-VAE for temporal dimension compression.
* 📍 **Open-Sora 1.1** released. Model weights are available [here](#model-weights). It is trained on **0s~15s, 144p to 720p, various aspect ratios** videos. See our **[report 1.1](/docs/report_02.md)** for more discussions.
* 🔧 **Data processing pipeline v1.1** is released. An automatic [processing pipeline](#data-processing) from raw videos to (text, video clip) pairs is provided, including scene cutting $\rightarrow$ filtering (aesthetic, optical flow, OCR, etc.) $\rightarrow$ captioning $\rightarrow$ managing (see the sketch below). With this tool, you can easily build your video dataset.
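As a rough structural sketch of what such a pipeline does (the function names below are hypothetical stand-ins; the real pipeline ships as separate tools in the repository):
```python
from typing import Iterable, List, Tuple

def cut_scenes(video: str) -> List[str]:
    return [f"{video}#scene0"]        # stand-in for scene detection

def passes_filters(clip: str) -> bool:
    return True                       # stand-in for aesthetic / optical-flow / OCR filters

def caption(clip: str) -> str:
    return f"caption for {clip}"      # stand-in for automatic captioning

def build_dataset(videos: Iterable[str]) -> List[Tuple[str, str]]:
    # raw videos -> (text, video clip) pairs
    return [(caption(c), c) for v in videos for c in cut_scenes(v) if passes_filters(c)]

print(build_dataset(["raw.mp4"]))
```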
@@ -206,9 +207,10 @@ docker run -ti --gpus all -v {MOUNT_DIR}:/data opensora
### Open-Sora 1.2 Model Weights
| Resolution | Model Size | Data | #iterations | Batch Size | URL |
| ---------- | ---------- | ---- | ----------- | ---------- | --- |
| TBD |
| Model | Model Size | Data | #iterations | Batch Size | URL |
| --------- | ---------- | ---- | ----------- | ---------- | ------------------------------------------------------------- |
| Diffusion | 1.1B | 30M | 70k | Dynamic | [:link:](https://huggingface.co/hpcai-tech/OpenSora-STDiT-v3) |
| VAE | 384M | | | | [:link:](https://huggingface.co/hpcai-tech/OpenSora-VAE-v1.2) |
See our **[report 1.2](docs/report_03.md)** for more information.
@@ -304,6 +306,26 @@ For more advanced usage, you can refer to [Gradio README](./gradio/README.md#adv
### Open-Sora 1.2 Command Line Inference
The basic command line inference is as follows:
```bash
# text to video
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
--num-frames 4s --resolution 720p \
--prompt "a beautiful waterfall"
```
You can add more options to the command line to customize the generation.
```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
--num-frames 4s --resolution 720p \
--num-sampling-steps 30 --flow 5 --aes 6.5 \
--prompt "a beautiful waterfall"
```
For image-to-video generation and other functionalities, the API is compatible with Open-Sora 1.1. See [here](docs/commands.md) for more instructions.
### GPT-4o Prompt Refinement
We find that GPT-4o can refine the prompt and improve the quality of the generated video. With this feature, you can also use other languages (e.g., Chinese) as the prompt. To enable this feature, you need to prepare your OpenAI API key in the environment:
@@ -312,7 +334,12 @@ We find that GPT-4o can refine the prompt and improve the quality of the generat
export OPENAI_API_KEY=YOUR_API_KEY
```
Then you can run inference with `--llm-refine True` to enable the GPT-4o prompt refinement.
Then you can run inference with `--llm-refine True` to enable the GPT-4o prompt refinement, or leave the prompt empty to get a random prompt generated by GPT-4o.
```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
--num-frames 4s --resolution 720p --llm-refine True
```
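Conceptually, the refinement amounts to asking GPT-4o to rewrite the prompt. Below is only a hedged sketch using the official `openai` client; the repository's actual system prompt, call site, and parameters are not reproduced here:
```python
import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def refine_prompt(user_prompt: str) -> str:
    # Ask GPT-4o to expand a terse (possibly non-English) prompt into a
    # detailed English video description; the instruction text is hypothetical.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Rewrite the user's text as a detailed English prompt for video generation."},
            {"role": "user", "content": user_prompt},
        ],
    )
    return response.choices[0].message.content

print(refine_prompt("瀑布"))  # e.g. a Chinese prompt refined into English
```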
### Open-Sora 1.1 Command Line Inference
@@ -376,6 +403,17 @@ Also check out the [datasets](docs/datasets.md) we use.
### Open-Sora 1.2 Training
The training process is the same as Open-Sora 1.1.
```bash
# one node
torchrun --standalone --nproc_per_node 8 scripts/train.py \
configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
# multiple nodes
colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py \
configs/opensora-v1-2/train/stage1.py --data-path YOUR_CSV_PATH --ckpt-path YOUR_PRETRAINED_CKPT
```
### Open-Sora 1.1 Training
<details>

@@ -22,7 +22,7 @@ model = dict(
)
vae = dict(
    type="OpenSoraVAE_V1_2",
    from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
    micro_frame_size=17,
    micro_batch_size=4,
)

@@ -15,6 +15,18 @@
You can modify the corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).
### Inference with Open-Sora 1.2
The inference API is compatible with Open-Sora 1.1. To improve the user experience, we add support for the `--resolution` and `--aspect-ratio` options, which are a more user-friendly way to specify the image size.
```bash
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
--resolution 480p --aspect-ratio 9:16
# equivalent to
python scripts/inference.py configs/opensora-v1-2/inference/sample.py \
--image-size 480 853
```
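For intuition, the equivalence above suggests a mapping along the following lines. This is an illustrative sketch only (assuming the aspect ratio is given as height:width and the resolution names the shorter side), not the repository's actual lookup table:
```python
def image_size(resolution: str, aspect_ratio: str) -> tuple[int, int]:
    short = int(resolution.rstrip("p"))            # "480p" -> 480 for the shorter side
    h, w = (int(x) for x in aspect_ratio.split(":"))
    scale = short / min(h, w)                      # scale the ratio so the min side == short
    return round(h * scale), round(w * scale)      # (height, width)

print(image_size("480p", "9:16"))  # (480, 853) -- matches --image-size 480 853
```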
### Inference with Open-Sora 1.1
Since Open-Sora 1.1 supports inference with dynamic input size, you can pass the input size as an argument.

@@ -1,3 +1,20 @@
---
title: Open Sora
emoji: 🎥
colorFrom: red
colorTo: purple
sdk: gradio
sdk_version: 4.25.0
app_file: app.py
pinned: false
license: apache-2.0
preload_from_hub:
- hpcai-tech/OpenSora-STDiT-v3
- hpcai-tech/OpenSora-VAE-v1.2
- DeepFloyd/t5-v1_1-xxl
---
# 🕹 Gradio Demo
We have provided a Gradio demo app for you to generate videos via a web interface. You can choose to run it locally or deploy it to Hugging Face by following the instructions given below.
@@ -12,17 +29,14 @@ We assume that you have already installed `opensora` based on the instructions g
pip install gradio spaces
```
2. Afterwards, you can use the following command to launch different models. Remember to launch the command in the project root directory instead of the `gradio` folder.
2. Afterwards, you can use the following command to launch the application. Remember to launch the command in the project root directory instead of the `gradio` folder.
```bash
# run the default model v1-HQ-16x256x256
# start the gradio app
python gradio/app.py
# run the model with higher resolution
python gradio/app.py --model-type v1-HQ-16x512x512
# run with a different host and port
python gradio/app.py --port 8000 --host 0.0.0.0
# run with a different port
python gradio/app.py --port 8000
# run with acceleration such as flash attention and fused norm
python gradio/app.py --enable-optimization
@@ -45,13 +59,7 @@ We have also tested this Gradio app on Hugging Face Spaces. You can follow the s
```text
- configs
  - opensora
    - inference
      - 16x256x256.py
      - 16x512x512.py
      - 64x512x512.py
      ...
    ...
  - ...
- app.py
- requirements.txt
- README.md
@@ -63,7 +71,7 @@ We have also tested this Gradio app on Hugging Face Spaces. You can follow the s
## Advanced Usage
![Gradio Demo](/assets/readme/gradio_advanced.png)
![Gradio Demo](../assets/readme/gradio_advanced.png)
For the "**FPS**" option: since the output video's FPS is now fixed at 24, this option does not affect the output video's length. With a smaller FPS, the content would normally span a longer time, but it is still played back at 24 FPS, so the video appears accelerated: less smooth but faster. With a larger FPS, the video is smoother but slower.
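To make the arithmetic concrete, here is a tiny illustrative calculation under the fixed 24 FPS playback described above:
```python
OUTPUT_FPS = 24  # the app fixes playback at 24 FPS

def playback_speedup(requested_fps: float) -> float:
    # content generated for requested_fps but played back at 24 FPS
    return OUTPUT_FPS / requested_fps

print(playback_speedup(12))  # 2.0 -> plays twice as fast, choppier
print(playback_speedup(48))  # 0.5 -> plays at half speed, smoother
```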

@@ -197,6 +197,10 @@ vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enab
def run_inference(mode, prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale):
    if prompt_text is None or prompt_text == "":
        gr.Warning("Your prompt is empty, please enter a valid prompt")
        return None

    torch.manual_seed(seed)
    with torch.inference_mode():
        # ======================
@@ -496,11 +500,10 @@ def main():
prompt_text = gr.Textbox(
    label="Prompt",
    placeholder="Describe your video here",
    info="Empty prompt will mean random prompt from OpenAI.",
    lines=4,
    lines=4
)
refine_prompt = gr.Checkbox(value=True, label="Refine prompt with GPT4o")
random_prompt_btn = gr.Button("Random Prompt")
random_prompt_btn = gr.Button("Random Prompt By GPT4o")
gr.Markdown("## Basic Settings")
resolution = gr.Radio(

@@ -1,3 +1,4 @@
import os
import torch
import torch.nn as nn
from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
@@ -5,6 +6,7 @@ from einops import rearrange
from opensora.registry import MODELS, build_module
from opensora.utils.ckpt_utils import load_checkpoint
from transformers import PretrainedConfig, PreTrainedModel
@MODELS.register_module()
@@ -115,9 +117,9 @@ class VideoAutoencoderKLTemporalDecoder(nn.Module):
    def dtype(self):
        return next(self.parameters()).dtype


@MODELS.register_module()
class VideoAutoencoderPipeline(nn.Module):
class VideoAutoencoderPipelineConfig(PretrainedConfig):
    model_type = "VideoAutoencoderPipeline"

    def __init__(
        self,
        vae_2d=None,
@@ -128,25 +130,43 @@ class VideoAutoencoderPipeline(nn.Module):
        micro_frame_size=None,
        shift=0.0,
        scale=1.0,
        **kwargs
    ):
        super().__init__()
        self.spatial_vae = build_module(vae_2d, MODELS)
        self.temporal_vae = build_module(vae_temporal, MODELS)
        self.vae_2d = vae_2d
        self.vae_temporal = vae_temporal
        self.from_pretrained = from_pretrained
        self.freeze_vae_2d = freeze_vae_2d
        self.cal_loss = cal_loss
        self.micro_frame_size = micro_frame_size
        self.micro_z_frame_size = self.temporal_vae.get_latent_size([micro_frame_size, None, None])[0]
        self.shift = shift
        self.scale = scale
        super().__init__(**kwargs)
        if from_pretrained is not None:
            load_checkpoint(self, from_pretrained)
        if freeze_vae_2d:


@MODELS.register_module()
class VideoAutoencoderPipeline(PreTrainedModel):
    config_class = VideoAutoencoderPipelineConfig

    def __init__(
        self,
        config: VideoAutoencoderPipelineConfig
    ):
        super().__init__(config=config)
        self.spatial_vae = build_module(config.vae_2d, MODELS)
        self.temporal_vae = build_module(config.vae_temporal, MODELS)
        self.cal_loss = config.cal_loss
        self.micro_frame_size = config.micro_frame_size
        self.micro_z_frame_size = self.temporal_vae.get_latent_size([config.micro_frame_size, None, None])[0]
        if config.freeze_vae_2d:
            for param in self.spatial_vae.parameters():
                param.requires_grad = False

        self.out_channels = self.temporal_vae.out_channels

        # normalization parameters
        scale = torch.tensor(scale)
        shift = torch.tensor(shift)
        scale = torch.tensor(config.scale)
        shift = torch.tensor(config.shift)
        if len(scale.shape) > 0:
            scale = scale[None, :, None, None, None]
        if len(shift.shape) > 0:
@@ -225,38 +245,44 @@ class VideoAutoencoderPipeline(nn.Module):
    def dtype(self):
        return next(self.parameters()).dtype


@MODELS.register_module()
class OpenSoraVAE_V1_2(VideoAutoencoderPipeline):
    def __init__(
        self,
        micro_batch_size=4,
        micro_frame_size=17,
        from_pretrained=None,
        local_files_only=False,
        freeze_vae_2d=False,
        cal_loss=False,
    ):
        vae_2d = dict(
def OpenSoraVAE_V1_2(
    micro_batch_size=4,
    micro_frame_size=17,
    from_pretrained=None,
    local_files_only=False,
    freeze_vae_2d=False,
    cal_loss=False,
):
    vae_2d = dict(
        type="VideoAutoencoderKL",
        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
        subfolder="vae",
        micro_batch_size=micro_batch_size,
        local_files_only=local_files_only,
    )
        vae_temporal = dict(
            type="VAE_Temporal_SD",
            from_pretrained=None,
        )
        shift = (-0.10, 0.34, 0.27, 0.98)
        scale = (3.85, 2.32, 2.33, 3.06)
        super().__init__(
            vae_2d,
            vae_temporal,
            from_pretrained,
            freeze_vae_2d=freeze_vae_2d,
            cal_loss=cal_loss,
            micro_frame_size=micro_frame_size,
            shift=shift,
            scale=scale,
        )
    vae_temporal = dict(
        type="VAE_Temporal_SD",
        from_pretrained=None,
    )
    shift = (-0.10, 0.34, 0.27, 0.98)
    scale = (3.85, 2.32, 2.33, 3.06)
    kwargs = dict(
        vae_2d=vae_2d,
        vae_temporal=vae_temporal,
        freeze_vae_2d=freeze_vae_2d,
        cal_loss=cal_loss,
        micro_frame_size=micro_frame_size,
        shift=shift,
        scale=scale
    )
    if from_pretrained is not None and not os.path.isdir(from_pretrained):
        model = VideoAutoencoderPipeline.from_pretrained(from_pretrained, **kwargs)
    else:
        config = VideoAutoencoderPipelineConfig(**kwargs)
        model = VideoAutoencoderPipeline(config)
        if from_pretrained:
            load_checkpoint(model, from_pretrained)
    return model
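The refactor above moves the VAE onto the Hugging Face `PretrainedConfig`/`PreTrainedModel` pattern, which is what lets `from_pretrained` accept a Hub id such as `hpcai-tech/OpenSora-VAE-v1.2`. As a minimal standalone sketch of that pattern (toy names, not from this repository):
```python
import torch.nn as nn
from transformers import PretrainedConfig, PreTrainedModel


class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size=16, **kwargs):
        self.hidden_size = hidden_size
        super().__init__(**kwargs)


class ToyModel(PreTrainedModel):
    config_class = ToyConfig

    def __init__(self, config: ToyConfig):
        super().__init__(config)
        self.proj = nn.Linear(config.hidden_size, config.hidden_size)


# save_pretrained / from_pretrained now round-trip both weights and config
model = ToyModel(ToyConfig(hidden_size=32))
model.save_pretrained("toy-ckpt")
reloaded = ToyModel.from_pretrained("toy-ckpt")
```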

@@ -145,9 +145,9 @@ def download_model(model_name=None, local_path=None, url=None):
    return model


def load_from_sharded_state_dict(model, ckpt_path, model_name="model"):
def load_from_sharded_state_dict(model, ckpt_path, model_name="model", strict=False):
    ckpt_io = GeneralCheckpointIO()
    ckpt_io.load_model(model, os.path.join(ckpt_path, model_name))
    ckpt_io.load_model(model, os.path.join(ckpt_path, model_name), strict=strict)
def model_sharding(model: torch.nn.Module):
@@ -187,14 +187,14 @@ def record_model_param_shape(model: torch.nn.Module) -> dict:
    return param_shape


def load_checkpoint(model, ckpt_path, save_as_pt=False, model_name="model"):
def load_checkpoint(model, ckpt_path, save_as_pt=False, model_name="model", strict=False):
    if ckpt_path.endswith(".pt") or ckpt_path.endswith(".pth"):
        state_dict = find_model(ckpt_path, model=model)
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=strict)
        get_logger().info("Missing keys: %s", missing_keys)
        get_logger().info("Unexpected keys: %s", unexpected_keys)
    elif os.path.isdir(ckpt_path):
        load_from_sharded_state_dict(model, ckpt_path, model_name)
        load_from_sharded_state_dict(model, ckpt_path, model_name, strict=strict)
    get_logger().info("Model checkpoint loaded from %s", ckpt_path)
    if save_as_pt:
        save_path = os.path.join(ckpt_path, model_name + "_ckpt.pt")
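For context on the `strict` flag threaded through above: with `strict=False`, PyTorch's `load_state_dict` tolerates missing and unexpected keys and returns them instead of raising. A small self-contained illustration:
```python
import torch
import torch.nn as nn

model = nn.Linear(4, 4)
state = {"weight": torch.zeros(4, 4)}  # "bias" intentionally omitted

# strict=False reports incompatibilities instead of raising a RuntimeError
missing, unexpected = model.load_state_dict(state, strict=False)
print(missing)     # ['bias']
print(unexpected)  # []
```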

@@ -45,7 +45,7 @@ def get_save_path_name(
):
    if sample_name is None:
        sample_name = "" if prompt_as_path else "sample"
    sample_name_suffix = prompt if prompt_as_path else f"_{sample_idx}"
    sample_name_suffix = prompt if prompt_as_path else f"_{sample_idx:04d}"
    save_path = os.path.join(save_dir, f"{sample_name}{sample_name_suffix}")
    if num_sample != 1:
        save_path = f"{save_path}-{k}"
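The `:04d` change above zero-pads the sample index so generated filenames sort lexicographically in the same order as their numeric indices; for example:
```python
# zero-padded indices keep lexicographic order equal to numeric order
print(f"sample_{7:04d}.mp4")    # sample_0007.mp4
print(f"sample_{123:04d}.mp4")  # sample_0123.mp4
```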