mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-10 12:49:38 +02:00
parent
88fbcd43ff
commit
5880d01ee3
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -169,6 +169,7 @@ dataset/
|
|||
runs/
|
||||
checkpoints/
|
||||
outputs/
|
||||
samples/
|
||||
|
||||
# Secret files
|
||||
hostfile
|
||||
|
|
|
|||
37
README.md
37
README.md
|
|
@ -86,9 +86,28 @@ Click for the original video.
|
|||
## Installation
|
||||
|
||||
```bash
|
||||
# create a virtual env
|
||||
conda create -n opensora python=3.10
|
||||
|
||||
# install torch
|
||||
# the command below is for CUDA 12.1, choose install commands from
|
||||
# https://pytorch.org/get-started/locally/ based on your own CUDA version
|
||||
pip3 install torch torchvision
|
||||
|
||||
# install flash attention (optional)
|
||||
pip install packaging ninja
|
||||
pip install flash-attn --no-build-isolation
|
||||
|
||||
# install apex (optional)
|
||||
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
|
||||
|
||||
# install xformers
|
||||
pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
|
||||
|
||||
# install this project
|
||||
git clone https://github.com/hpcaitech/Open-Sora
|
||||
cd Open-Sora
|
||||
pip install xxx
|
||||
pip install -v -e .
|
||||
```
|
||||
|
||||
After installation, we suggest reading [structure.md](docs/structure.md) to learn the project structure and how to use the config files.
|
||||
|
|
@ -101,10 +120,21 @@ After installation, we suggest reading [structure.md](docs/structure.md) to lear
|
|||
|
||||
## Inference
|
||||
|
||||
To run inference with our provided weights, first prepare the pretrained weights including XXX. [WIP]
|
||||
|
||||
Then run the following commands to generate samples. See [here](docs/structure.md#inference-config-demos) to customize the configuration.
|
||||
|
||||
```bash
|
||||
python scripts/inference.py configs/opensora/inference/16x256x256.py
|
||||
# Sample 16x256x256 (~2s)
|
||||
python scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path ./path/to/your/ckpt.pth
|
||||
# Sample 16x512x512 (~2s)
|
||||
python scripts/inference.py configs/opensora/inference/16x512x512.py
|
||||
# Sample 64x512x512 (~5s)
|
||||
python scripts/inference.py configs/opensora/inference/64x512x512.py
|
||||
```
|
||||
|
||||
For inference with other models, see [here](docs/commands.md) for more instructions.
|
||||
|
||||
## Data Processing
|
||||
|
||||
### Split video into clips
|
||||
|
|
@ -124,8 +154,7 @@ We provide code to split a long video into separate clips efficiently using `mul
|
|||
* [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
|
||||
* [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
|
||||
* [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
|
||||
* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [LLaMA](https://github.com/meta-llama/llama) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
|
||||
* [PySceneDetect](https://github.com/Breakthrough/PySceneDetect): A powerful tool to split video into clips.
|
||||
* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
|
||||
|
||||
We are grateful for their exceptional work and generous contribution to open source.
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,13 @@
|
|||
# sample size
|
||||
num_frames = 16
|
||||
fps = 24 // 3
|
||||
image_size = (256, 256)
|
||||
|
||||
# model config
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=0.5,
|
||||
time_scale=1.0,
|
||||
from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt",
|
||||
# from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
|
||||
from_pretrained="PRETRAINED_MODEL",
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
|
|
@ -21,15 +19,13 @@ text_encoder = dict(
|
|||
model_max_length=120,
|
||||
)
|
||||
scheduler = dict(
|
||||
# type="iddpm",
|
||||
# num_sampling_steps=250,
|
||||
type="dpm-solver",
|
||||
num_sampling_steps=20,
|
||||
type="iddpm",
|
||||
num_sampling_steps=100,
|
||||
cfg_scale=7.0,
|
||||
)
|
||||
dtype = "fp16"
|
||||
|
||||
# prompts
|
||||
# Others
|
||||
batch_size = 2
|
||||
seed = 42
|
||||
prompt_path = "./assets/texts/t2v_samples.txt"
|
||||
|
|
|
|||
|
|
@ -1,14 +1,13 @@
|
|||
# sample size
|
||||
num_frames = 16
|
||||
fps = 24 // 3
|
||||
image_size = (512, 512)
|
||||
|
||||
# model config
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=1.0,
|
||||
from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
|
||||
from_pretrained="PRETRAINED_MODEL",
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
|
|
@ -23,13 +22,11 @@ text_encoder = dict(
|
|||
scheduler = dict(
|
||||
type="iddpm",
|
||||
num_sampling_steps=100,
|
||||
# type = "dpm-solver",
|
||||
# num_sampling_steps=20,
|
||||
cfg_scale=7.0,
|
||||
)
|
||||
dtype = "fp16"
|
||||
|
||||
# prompts
|
||||
# Others
|
||||
batch_size = 2
|
||||
seed = 42
|
||||
prompt_path = "./assets/texts/t2v_samples.txt"
|
||||
|
|
|
|||
|
|
@ -1,36 +0,0 @@
|
|||
# sample size
|
||||
num_frames = 64
|
||||
fps = 24 // 2
|
||||
image_size = (512, 512)
|
||||
|
||||
# model config
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=2 / 3,
|
||||
from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/",
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema",
|
||||
split=8,
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
from_pretrained="./pretrained_models/t5_ckpts",
|
||||
model_max_length=120,
|
||||
)
|
||||
scheduler = dict(
|
||||
type="iddpm",
|
||||
num_sampling_steps=100,
|
||||
# type = "dpm-solver",
|
||||
# num_sampling_steps=20,
|
||||
cfg_scale=7.0,
|
||||
)
|
||||
dtype = "fp16"
|
||||
|
||||
# prompts
|
||||
batch_size = 1
|
||||
seed = 42
|
||||
prompt_path = "./assets/texts/t2v_samples.txt"
|
||||
save_dir = "./outputs/samples/"
|
||||
|
|
@ -1,18 +1,18 @@
|
|||
# sample size
|
||||
num_frames = 64
|
||||
fps = 24 // 2
|
||||
image_size = (512, 512)
|
||||
|
||||
# model config
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=2 / 3,
|
||||
from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/",
|
||||
from_pretrained="PRETRAINED_MODEL",
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema",
|
||||
split=8,
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
|
|
@ -22,13 +22,11 @@ text_encoder = dict(
|
|||
scheduler = dict(
|
||||
type="iddpm",
|
||||
num_sampling_steps=100,
|
||||
# type = "dpm-solver",
|
||||
# num_sampling_steps=20,
|
||||
cfg_scale=7.0,
|
||||
)
|
||||
dtype = "fp16"
|
||||
|
||||
# prompts
|
||||
# Others
|
||||
batch_size = 1
|
||||
seed = 42
|
||||
prompt_path = "./assets/texts/t2v_samples.txt"
|
||||
|
|
|
|||
|
|
@ -1,22 +1,20 @@
|
|||
# sample size
|
||||
num_frames = 16
|
||||
frame_interval = 3
|
||||
image_size = (256, 256)
|
||||
|
||||
# dataset
|
||||
# Define dataset
|
||||
root = None
|
||||
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
|
||||
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
|
||||
data_path = "CSV_PATH"
|
||||
use_image_transform = False
|
||||
num_workers = 4
|
||||
|
||||
# acceleration
|
||||
dtype = "fp16"
|
||||
# Define acceleration
|
||||
dtype = "bf16"
|
||||
grad_checkpoint = True
|
||||
plugin = "zero2"
|
||||
sp_size = 1
|
||||
|
||||
# model config
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=0.5,
|
||||
|
|
@ -40,7 +38,7 @@ scheduler = dict(
|
|||
timestep_respacing="",
|
||||
)
|
||||
|
||||
# runtime
|
||||
# Others
|
||||
seed = 42
|
||||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
|
|
|||
|
|
@ -1,26 +1,25 @@
|
|||
# sample size
|
||||
num_frames = 16
|
||||
frame_interval = 3
|
||||
image_size = (512, 512)
|
||||
|
||||
# dataset
|
||||
# Define dataset
|
||||
root = None
|
||||
data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv"
|
||||
data_path = "CSV_PATH"
|
||||
use_image_transform = False
|
||||
num_workers = 4
|
||||
|
||||
# acceleration
|
||||
dtype = "fp16"
|
||||
# Define acceleration
|
||||
dtype = "bf16"
|
||||
grad_checkpoint = True
|
||||
plugin = "zero2"
|
||||
sp_size = 1
|
||||
|
||||
# model config
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=1.0,
|
||||
from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
|
||||
from_pretrained=None,
|
||||
enable_flashattn=True,
|
||||
enable_layernorm_kernel=True,
|
||||
)
|
||||
|
|
@ -40,7 +39,7 @@ scheduler = dict(
|
|||
timestep_respacing="",
|
||||
)
|
||||
|
||||
# runtime
|
||||
# Others
|
||||
seed = 42
|
||||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
|
|
|||
|
|
@ -1,33 +1,32 @@
|
|||
# sample size
|
||||
num_frames = 64
|
||||
frame_interval = 2
|
||||
image_size = (512, 512)
|
||||
|
||||
# dataset
|
||||
# Define dataset
|
||||
root = None
|
||||
data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv"
|
||||
data_path = "CSV_PATH"
|
||||
use_image_transform = False
|
||||
num_workers = 4
|
||||
|
||||
# acceleration
|
||||
dtype = "fp16"
|
||||
# Define acceleration
|
||||
dtype = "bf16"
|
||||
grad_checkpoint = True
|
||||
plugin = "zero2"
|
||||
sp_size = 1
|
||||
plugin = "zero2-seq"
|
||||
sp_size = 2
|
||||
|
||||
# model config
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=2 / 3,
|
||||
from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
|
||||
from_pretrained=None,
|
||||
enable_flashattn=True,
|
||||
enable_layernorm_kernel=True,
|
||||
enable_sequence_parallelism=True, # enable sequence parallelism here
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema",
|
||||
split=8,
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
|
|
@ -40,16 +39,16 @@ scheduler = dict(
|
|||
timestep_respacing="",
|
||||
)
|
||||
|
||||
# runtime
|
||||
# Others
|
||||
seed = 42
|
||||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 1000
|
||||
log_every = 10
|
||||
ckpt_every = 250
|
||||
ckpt_every = 1000
|
||||
load = None
|
||||
|
||||
batch_size = 4
|
||||
batch_size = 1
|
||||
lr = 2e-5
|
||||
grad_clip = 1.0
|
||||
|
|
@ -1,34 +1,32 @@
|
|||
# sample size
|
||||
num_frames = 64
|
||||
frame_interval = 2
|
||||
image_size = (512, 512)
|
||||
|
||||
# dataset
|
||||
# Define dataset
|
||||
root = None
|
||||
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
|
||||
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
|
||||
data_path = "CSV_PATH"
|
||||
use_image_transform = False
|
||||
num_workers = 4
|
||||
|
||||
# acceleration
|
||||
dtype = "fp16"
|
||||
# Define acceleration
|
||||
dtype = "bf16"
|
||||
grad_checkpoint = True
|
||||
plugin = "zero2-seq"
|
||||
sp_size = 2
|
||||
plugin = "zero2"
|
||||
sp_size = 1
|
||||
|
||||
# model config
|
||||
# Define model
|
||||
model = dict(
|
||||
type="STDiT-XL/2",
|
||||
space_scale=1.0,
|
||||
time_scale=2 / 3,
|
||||
from_pretrained="PixArt-XL-2-512x512.pth",
|
||||
from_pretrained=None,
|
||||
enable_flashattn=True,
|
||||
enable_layernorm_kernel=True,
|
||||
enable_sequence_parallelism=True, # enable sequence parallelism here
|
||||
)
|
||||
vae = dict(
|
||||
type="VideoAutoencoderKL",
|
||||
from_pretrained="stabilityai/sd-vae-ft-ema",
|
||||
split=8, # split to lower memory usage
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
|
|
@ -41,16 +39,16 @@ scheduler = dict(
|
|||
timestep_respacing="",
|
||||
)
|
||||
|
||||
# runtime
|
||||
# Others
|
||||
seed = 42
|
||||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 1000
|
||||
log_every = 10
|
||||
ckpt_every = 1000
|
||||
ckpt_every = 250
|
||||
load = None
|
||||
|
||||
batch_size = 1
|
||||
batch_size = 4
|
||||
lr = 2e-5
|
||||
grad_clip = 1.0
|
||||
|
|
|
|||
9
docs/commands.md
Normal file
9
docs/commands.md
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
# Commands
|
||||
|
||||
## Inference
|
||||
|
||||
### Inference with DiT pretrained on ImageNet
|
||||
|
||||
|
||||
|
||||
## Training
|
||||
|
|
@ -68,6 +68,15 @@ Open-Sora
|
|||
|
||||
## Inference config demos
|
||||
|
||||
To change the inference settings, you can directly modify the corresponding config file. Alternatively, you can pass command-line arguments to override values in the config file (see [config_utils.py](/opensora/utils/config_utils.py)). To change the sampling prompts, modify the `.txt` file passed to the `--prompt_path` argument.
|
||||
|
||||
```plaintext
|
||||
--prompt_path ./assets/texts/t2v_samples.txt -> prompt_path
|
||||
--ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"]
|
||||
```
|
||||
|
||||
The explanation of each field is provided below.
|
||||
|
||||
```python
|
||||
# Define sampling size
|
||||
num_frames = 64 # number of frames
|
||||
|
|
|
|||
|
|
@ -32,13 +32,14 @@ def parse_args(training=False):
|
|||
else:
|
||||
parser.add_argument("--wandb", default=None, type=bool, help="enable wandb")
|
||||
parser.add_argument("--load", default=None, type=str, help="path to continue training")
|
||||
parser.add_argument("--data-path", default=None, type=str, help="path to data csv")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def merge_args(cfg, args, training=False):
|
||||
if args.ckpt_path is not None:
|
||||
cfg.model["from_pratrained"] = args.ckpt_path
|
||||
cfg.model["from_pretrained"] = args.ckpt_path
|
||||
args.ckpt_path = None
|
||||
|
||||
if not training:
|
||||
|
|
|
|||
|
|
@ -1,28 +1,13 @@
|
|||
# should be installed by users
|
||||
torch>=1.13
|
||||
colossalai
|
||||
flash_attn # optional
|
||||
apex # optional
|
||||
|
||||
# packages
|
||||
accelerate
|
||||
diffusers
|
||||
timm
|
||||
transformers
|
||||
xformers # necessary for PixArt
|
||||
|
||||
# config & logging
|
||||
gdown
|
||||
pre-commit
|
||||
matplotlib
|
||||
mmengine
|
||||
tensorboard
|
||||
tqdm
|
||||
wandb
|
||||
|
||||
# video
|
||||
pyav
|
||||
|
||||
# text
|
||||
clip
|
||||
ftfy
|
||||
gdown
|
||||
mmengine
|
||||
pre-commit
|
||||
pyav
|
||||
tensorboard
|
||||
timm
|
||||
tqdm
|
||||
transformers
|
||||
wandb
|
||||
|
|
|
|||
Loading…
Reference in a new issue