Docs/readme (#73)

* update docs * update docs
2026-04-11 05:13:31 +02:00 · 2024-03-16 17:09:00 +08:00 · 2024-03-16 17:09:00 +08:00 · 5880d01ee3
commit 5880d01ee3
parent 88fbcd43ff
16 changed files with 113 additions and 132 deletions
--- a/.gitignore
+++ b/.gitignore
@ -169,6 +169,7 @@ dataset/
 runs/
 checkpoints/
 outputs/
+samples/

 # Secret files
 hostfile
--- a/README.md
+++ b/README.md
@ -86,9 +86,28 @@ Click for the original video.
 ## Installation

 ```bash
+# create a virtual env
+conda create -n opensora python=3.10
+
+# install torch
+# the command below is for CUDA 12.1; choose the install command from
+# https://pytorch.org/get-started/locally/ based on your own CUDA version
+pip3 install torch torchvision
+
+# install flash attention (optional)
+pip install packaging ninja
+pip install flash-attn --no-build-isolation
+
+# install apex (optional)
+pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
+
+# install xformers
+pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
+
+# install this project
 git clone https://github.com/hpcaitech/Open-Sora
 cd Open-Sora
-pip install xxx
+pip install -v -e .
 ```

 After installation, we suggest reading [structure.md](docs/structure.md) to learn the project structure and how to use the config files.
@ -101,10 +120,21 @@ After installation, we suggest reading [structure.md](docs/structure.md) to lear

 ## Inference

+To run inference with our provided weights, first prepare the pretrained weights including XXX. [WIP]
+
+Then run the following commands to generate samples. See [here](docs/structure.md#inference-config-demos) to customize the configuration.
+
 ```bash
-python scripts/inference.py configs/opensora/inference/16x256x256.py
+# Sample 16x256x256 (~2s)
+python scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path ./path/to/your/ckpt.pth
+# Sample 16x512x512 (~2s)
+python scripts/inference.py configs/opensora/inference/16x512x512.py
+# Sample 64x512x512 (~5s)
+python scripts/inference.py configs/opensora/inference/64x512x512.py
 ```

+For inference with other models, see [here](docs/commands.md) for more instructions.
+
 ## Data Processing

 ### Split video into clips
@ -124,8 +154,7 @@ We provide code to split a long video into separate clips efficiently using `mul
 * [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
 * [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
 * [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
-* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [LLaMA](https://github.com/meta-llama/llama) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
-* [PySceneDetect](https://github.com/Breakthrough/PySceneDetect): A powerful tool to split video into clips.
+* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B).

 We are grateful for their exceptional work and generous contribution to open source.

--- a/configs/opensora/inference/16x256x256.py
+++ b/configs/opensora/inference/16x256x256.py
@ -1,15 +1,13 @@
-# sample size
 num_frames = 16
 fps = 24 // 3
 image_size = (256, 256)

-# model config
+# Define model
 model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
-    from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt",
-    # from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
    type="VideoAutoencoderKL",
@ -21,15 +19,13 @@ text_encoder = dict(
    model_max_length=120,
 )
 scheduler = dict(
-    # type="iddpm",
-    # num_sampling_steps=250,
-    type="dpm-solver",
-    num_sampling_steps=20,
+    type="iddpm",
+    num_sampling_steps=100,
    cfg_scale=7.0,
 )
 dtype = "fp16"

-# prompts
+# Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
--- a/configs/opensora/inference/16x512x512.py
+++ b/configs/opensora/inference/16x512x512.py
@ -1,14 +1,13 @@
-# sample size
 num_frames = 16
 fps = 24 // 3
 image_size = (512, 512)

-# model config
+# Define model
 model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=1.0,
-    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
    type="VideoAutoencoderKL",
@ -23,13 +22,11 @@ text_encoder = dict(
 scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
-    # type = "dpm-solver",
-    # num_sampling_steps=20,
    cfg_scale=7.0,
 )
 dtype = "fp16"

-# prompts
+# Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
--- a/configs/opensora/inference/64x512x512-v2.py
+++ b/configs/opensora/inference/64x512x512-v2.py
@ -1,36 +0,0 @@
-# sample size
-num_frames = 64
-fps = 24 // 2
-image_size = (512, 512)
-
-# model config
-model = dict(
-    type="STDiT-XL/2",
-    space_scale=1.0,
-    time_scale=2 / 3,
-    from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/",
-)
-vae = dict(
-    type="VideoAutoencoderKL",
-    from_pretrained="stabilityai/sd-vae-ft-ema",
-    split=8,
-)
-text_encoder = dict(
-    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
-    model_max_length=120,
-)
-scheduler = dict(
-    type="iddpm",
-    num_sampling_steps=100,
-    # type = "dpm-solver",
-    # num_sampling_steps=20,
-    cfg_scale=7.0,
-)
-dtype = "fp16"
-
-# prompts
-batch_size = 1
-seed = 42
-prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"
--- a/configs/opensora/inference/64x512x512.py
+++ b/configs/opensora/inference/64x512x512.py
@ -1,18 +1,18 @@
-# sample size
 num_frames = 64
 fps = 24 // 2
 image_size = (512, 512)

-# model config
+# Define model
 model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
-    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/",
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=8,
 )
 text_encoder = dict(
    type="t5",
@ -22,13 +22,11 @@ text_encoder = dict(
 scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
-    # type = "dpm-solver",
-    # num_sampling_steps=20,
    cfg_scale=7.0,
 )
 dtype = "fp16"

-# prompts
+# Others
 batch_size = 1
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
--- a/configs/opensora/train/16x256x256.py
+++ b/configs/opensora/train/16x256x256.py
@ -1,22 +1,20 @@
-# sample size
 num_frames = 16
 frame_interval = 3
 image_size = (256, 256)

-# dataset
+# Define dataset
 root = None
-# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
-data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4

-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1

-# model config
+# Define model
 model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
@ -40,7 +38,7 @@ scheduler = dict(
    timestep_respacing="",
 )

-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False
--- a/configs/opensora/train/16x512x512.py
+++ b/configs/opensora/train/16x512x512.py
@ -1,26 +1,25 @@
-# sample size
 num_frames = 16
 frame_interval = 3
 image_size = (512, 512)

-# dataset
+# Define dataset
 root = None
-data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4

-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1

-# model config
+# Define model
 model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=1.0,
-    from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
+    from_pretrained=None,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
 )
@ -40,7 +39,7 @@ scheduler = dict(
    timestep_respacing="",
 )

-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False
--- a/configs/opensora/train/64x512x512-sp.py
+++ b/configs/opensora/train/64x512x512-sp.py
@ -1,33 +1,32 @@
-# sample size
 num_frames = 64
 frame_interval = 2
 image_size = (512, 512)

-# dataset
+# Define dataset
 root = None
-data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4

-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
+plugin = "zero2-seq"
+sp_size = 2

-# model config
+# Define model
 model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
-    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
+    from_pretrained=None,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
+    enable_sequence_parallelism=True,  # enable sq here
 )
 vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
-    split=8,
 )
 text_encoder = dict(
    type="t5",
@ -40,16 +39,16 @@ scheduler = dict(
    timestep_respacing="",
 )

-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False

 epochs = 1000
 log_every = 10
-ckpt_every = 250
+ckpt_every = 1000
 load = None

-batch_size = 4
+batch_size = 1
 lr = 2e-5
 grad_clip = 1.0
--- a/configs/opensora/train/64x512x512.py
+++ b/configs/opensora/train/64x512x512.py
@ -1,34 +1,32 @@
-# sample size
 num_frames = 64
 frame_interval = 2
 image_size = (512, 512)

-# dataset
+# Define dataset
 root = None
-# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
-data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4

-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
-plugin = "zero2-seq"
-sp_size = 2
+plugin = "zero2"
+sp_size = 1

-# model config
+# Define model
 model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
-    from_pretrained="PixArt-XL-2-512x512.pth",
+    from_pretrained=None,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
-    enable_sequence_parallelism=True,  # enable sq here
 )
 vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=8,    # split to lower memory usage
 )
 text_encoder = dict(
    type="t5",
@ -41,16 +39,16 @@ scheduler = dict(
    timestep_respacing="",
 )

-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False

 epochs = 1000
 log_every = 10
-ckpt_every = 1000
+ckpt_every = 250
 load = None

-batch_size = 1
+batch_size = 4
 lr = 2e-5
 grad_clip = 1.0
--- a/docs/command.md
+++ b/docs/command.md
--- a/docs/commands.md
+++ b/docs/commands.md
@ -0,0 +1,9 @@
+# Commands
+
+## Inference
+
+### Inference with DiT pretrained on ImageNet
+
+
+
+## Training
--- a/docs/structure.md
+++ b/docs/structure.md
@ -68,6 +68,15 @@ Open-Sora

 ## Inference config demos

+To change the inference settings, you can either modify the corresponding config file directly or pass command-line arguments to override it ([config_utils.py](/opensora/utils/config_utils.py)). To change the sampling prompts, modify the `.txt` file passed to the `--prompt_path` argument.
+
+```plaintext
+--prompt_path ./assets/texts/t2v_samples.txt  -> prompt_path
+--ckpt-path ./path/to/your/ckpt.pth           -> model["from_pretrained"]
+```
+
+The explanation of each field is provided below.
+
 ```python
 # Define sampling size
 num_frames = 64               # number of frames
--- a/opensora/utils/config_utils.py
+++ b/opensora/utils/config_utils.py
@ -32,13 +32,14 @@ def parse_args(training=False):
    else:
        parser.add_argument("--wandb", default=None, type=bool, help="enable wandb")
        parser.add_argument("--load", default=None, type=str, help="path to continue training")
+        parser.add_argument("--data-path", default=None, type=str, help="path to data csv")

    return parser.parse_args()


 def merge_args(cfg, args, training=False):
    if args.ckpt_path is not None:
-        cfg.model["from_pratrained"] = args.ckpt_path
+        cfg.model["from_pretrained"] = args.ckpt_path
        args.ckpt_path = None

    if not training:
--- a/requirements.txt
+++ b/requirements.txt
@ -1,28 +1,13 @@
-# should be installed by users
-torch>=1.13
 colossalai
-flash_attn # optional
-apex # optional
-
-# packages
 accelerate
 diffusers
-timm
-transformers
-xformers # necessary for PixArt
-
-# config & logging
-gdown
-pre-commit
-matplotlib
-mmengine
-tensorboard
-tqdm
-wandb
-
-# video
-pyav
-
-# text
-clip
 ftfy
+gdown
+mmengine
+pre-commit
+pyav
+tensorboard
+timm
+tqdm
+transformers
+wandb
--- a/setup.py
+++ b/setup.py
@ -30,12 +30,10 @@ def fetch_readme() -> str:

 setup(
    name="opensora",
-    version="0.1.0",
+    version="1.0.0",
    packages=find_packages(
        exclude=(
-            "videos",
            "tests",
-            "figure",
            "*.egg-info",
        )
    ),