diff --git a/.gitignore b/.gitignore index b1848ea..7b81785 100644 --- a/.gitignore +++ b/.gitignore @@ -169,6 +169,7 @@ dataset/ runs/ checkpoints/ outputs/ +samples/ # Secret files hostfile diff --git a/README.md b/README.md index b97f507..6422cb4 100644 --- a/README.md +++ b/README.md @@ -86,9 +86,28 @@ Click for the original video. ## Installation ```bash +# create a virtual env +conda create -n opensora python=3.10 + +# install torch +# the command below is for CUDA 12.1, choose install commands from +# https://pytorch.org/get-started/locally/ based on your own CUDA version +pip3 install torch torchvision + +# install flash attention (optional) +pip install packaging ninja +pip install flash-attn --no-build-isolation + +# install apex (optional) +pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git + +# install xformers +pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121 + +# install this project git clone https://github.com/hpcaitech/Open-Sora cd Open-Sora -pip install xxx +pip install -v -e . ``` After installation, we suggest reading [structure.md](docs/structure.md) to learn the project structure and how to use the config files. @@ -101,10 +120,21 @@ After installation, we suggest reading [structure.md](docs/structure.md) to lear ## Inference +To run inference with our provided weights, first prepare the pretrained weights including XXX. [WIP] + +Then run the following commands to generate samples. See [here](docs/structure.md#inference-config-demos) to customize the configuration. 
+ ```bash -python scripts/inference.py configs/opensora/inference/16x256x256.py +# Sample 16x256x256 (~2s) +python scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path ./path/to/your/ckpt.pth +# Sample 16x512x512 (~2s) +python scripts/inference.py configs/opensora/inference/16x512x512.py +# Sample 64x512x512 (~5s) +python scripts/inference.py configs/opensora/inference/64x512x512.py ``` +For inference with other models, see [here](docs/commands.md) for more instructions. + ## Data Processing ### Split video into clips @@ -124,8 +154,7 @@ We provide code to split a long video into separate clips efficiently using `mul * [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model. * [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model. * [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder. -* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [LLaMA](https://github.com/meta-llama/llama) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B). -* [PySceneDetect](https://github.com/Breakthrough/PySceneDetect): A powerful tool to split video into clips. +* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B). We are grateful for their exceptional work and generous contribution to open source. 
diff --git a/configs/opensora/inference/16x256x256.py b/configs/opensora/inference/16x256x256.py index 7b1fa99..4b2e4f2 100644 --- a/configs/opensora/inference/16x256x256.py +++ b/configs/opensora/inference/16x256x256.py @@ -1,15 +1,13 @@ -# sample size num_frames = 16 fps = 24 // 3 image_size = (256, 256) -# model config +# Define model model = dict( type="STDiT-XL/2", space_scale=0.5, time_scale=1.0, - from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt", - # from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt", + from_pretrained="PRETRAINED_MODEL", ) vae = dict( type="VideoAutoencoderKL", @@ -21,15 +19,13 @@ text_encoder = dict( model_max_length=120, ) scheduler = dict( - # type="iddpm", - # num_sampling_steps=250, - type="dpm-solver", - num_sampling_steps=20, + type="iddpm", + num_sampling_steps=100, cfg_scale=7.0, ) dtype = "fp16" -# prompts +# Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/t2v_samples.txt" diff --git a/configs/opensora/inference/16x512x512.py b/configs/opensora/inference/16x512x512.py index 6ca7cf6..2837275 100644 --- a/configs/opensora/inference/16x512x512.py +++ b/configs/opensora/inference/16x512x512.py @@ -1,14 +1,13 @@ -# sample size num_frames = 16 fps = 24 // 3 image_size = (512, 512) -# model config +# Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=1.0, - from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt", + from_pretrained="PRETRAINED_MODEL", ) vae = dict( type="VideoAutoencoderKL", @@ -23,13 +22,11 @@ text_encoder = dict( scheduler = dict( type="iddpm", num_sampling_steps=100, - # type = "dpm-solver", - # num_sampling_steps=20, cfg_scale=7.0, ) dtype = "fp16" -# prompts +# Others batch_size = 2 seed = 42 prompt_path = "./assets/texts/t2v_samples.txt" diff --git a/configs/opensora/inference/64x512x512-v2.py b/configs/opensora/inference/64x512x512-v2.py deleted file mode 100644 index dfb9358..0000000 
--- a/configs/opensora/inference/64x512x512-v2.py +++ /dev/null @@ -1,36 +0,0 @@ -# sample size -num_frames = 64 -fps = 24 // 2 -image_size = (512, 512) - -# model config -model = dict( - type="STDiT-XL/2", - space_scale=1.0, - time_scale=2 / 3, - from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/", -) -vae = dict( - type="VideoAutoencoderKL", - from_pretrained="stabilityai/sd-vae-ft-ema", - split=8, -) -text_encoder = dict( - type="t5", - from_pretrained="./pretrained_models/t5_ckpts", - model_max_length=120, -) -scheduler = dict( - type="iddpm", - num_sampling_steps=100, - # type = "dpm-solver", - # num_sampling_steps=20, - cfg_scale=7.0, -) -dtype = "fp16" - -# prompts -batch_size = 1 -seed = 42 -prompt_path = "./assets/texts/t2v_samples.txt" -save_dir = "./outputs/samples/" diff --git a/configs/opensora/inference/64x512x512.py b/configs/opensora/inference/64x512x512.py index 00b3e8f..25aad23 100644 --- a/configs/opensora/inference/64x512x512.py +++ b/configs/opensora/inference/64x512x512.py @@ -1,18 +1,18 @@ -# sample size num_frames = 64 fps = 24 // 2 image_size = (512, 512) -# model config +# Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, - from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/", + from_pretrained="PRETRAINED_MODEL", ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", + split=8, ) text_encoder = dict( type="t5", @@ -22,13 +22,11 @@ text_encoder = dict( scheduler = dict( type="iddpm", num_sampling_steps=100, - # type = "dpm-solver", - # num_sampling_steps=20, cfg_scale=7.0, ) dtype = "fp16" -# prompts +# Others batch_size = 1 seed = 42 prompt_path = "./assets/texts/t2v_samples.txt" diff --git a/configs/opensora/train/16x256x256.py b/configs/opensora/train/16x256x256.py index ad2a62e..a64a318 100644 --- a/configs/opensora/train/16x256x256.py +++ b/configs/opensora/train/16x256x256.py @@ -1,22 +1,20 @@ -# sample size num_frames = 16 
frame_interval = 3 image_size = (256, 256) -# dataset +# Define dataset root = None -# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv" -data_path = "/mnt/hdd/data/csv/ucf101_videos.csv" +data_path = "CSV_PATH" use_image_transform = False num_workers = 4 -# acceleration -dtype = "fp16" +# Define acceleration +dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 -# model config +# Define model model = dict( type="STDiT-XL/2", space_scale=0.5, @@ -40,7 +38,7 @@ scheduler = dict( timestep_respacing="", ) -# runtime +# Others seed = 42 outputs = "outputs" wandb = False diff --git a/configs/opensora/train/16x512x512.py b/configs/opensora/train/16x512x512.py index edcd92f..f827ce3 100644 --- a/configs/opensora/train/16x512x512.py +++ b/configs/opensora/train/16x512x512.py @@ -1,26 +1,25 @@ -# sample size num_frames = 16 frame_interval = 3 image_size = (512, 512) -# dataset +# Define dataset root = None -data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv" +data_path = "CSV_PATH" use_image_transform = False num_workers = 4 -# acceleration -dtype = "fp16" +# Define acceleration +dtype = "bf16" grad_checkpoint = True plugin = "zero2" sp_size = 1 -# model config +# Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=1.0, - from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt", + from_pretrained=None, enable_flashattn=True, enable_layernorm_kernel=True, ) @@ -40,7 +39,7 @@ scheduler = dict( timestep_respacing="", ) -# runtime +# Others seed = 42 outputs = "outputs" wandb = False diff --git a/configs/opensora/train/64x512x512-v2.py b/configs/opensora/train/64x512x512-sp.py similarity index 69% rename from configs/opensora/train/64x512x512-v2.py rename to configs/opensora/train/64x512x512-sp.py index f20a8f3..b0b9062 100644 --- a/configs/opensora/train/64x512x512-v2.py +++ b/configs/opensora/train/64x512x512-sp.py @@ -1,33 +1,32 @@ -# sample size num_frames = 64 
frame_interval = 2 image_size = (512, 512) -# dataset +# Define dataset root = None -data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv" +data_path = "CSV_PATH" use_image_transform = False num_workers = 4 -# acceleration -dtype = "fp16" +# Define acceleration +dtype = "bf16" grad_checkpoint = True -plugin = "zero2" -sp_size = 1 +plugin = "zero2-seq" +sp_size = 2 -# model config +# Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, - from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt", + from_pretrained=None, enable_flashattn=True, enable_layernorm_kernel=True, + enable_sequence_parallelism=True, # enable sequence parallelism here ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", - split=8, ) text_encoder = dict( type="t5", @@ -40,16 +39,16 @@ scheduler = dict( timestep_respacing="", ) -# runtime +# Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 -ckpt_every = 250 +ckpt_every = 1000 load = None -batch_size = 4 +batch_size = 1 lr = 2e-5 grad_clip = 1.0 diff --git a/configs/opensora/train/64x512x512.py b/configs/opensora/train/64x512x512.py index 642abe4..d902849 100644 --- a/configs/opensora/train/64x512x512.py +++ b/configs/opensora/train/64x512x512.py @@ -1,34 +1,32 @@ -# sample size num_frames = 64 frame_interval = 2 image_size = (512, 512) -# dataset +# Define dataset root = None -# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv" -data_path = "/mnt/hdd/data/csv/ucf101_videos.csv" +data_path = "CSV_PATH" use_image_transform = False num_workers = 4 -# acceleration -dtype = "fp16" +# Define acceleration +dtype = "bf16" grad_checkpoint = True -plugin = "zero2-seq" -sp_size = 2 +plugin = "zero2" +sp_size = 1 -# model config +# Define model model = dict( type="STDiT-XL/2", space_scale=1.0, time_scale=2 / 3, - from_pretrained="PixArt-XL-2-512x512.pth", + from_pretrained=None, enable_flashattn=True, enable_layernorm_kernel=True, 
- enable_sequence_parallelism=True, # enable sq here ) vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", + split=8, # split to lower memory usage ) text_encoder = dict( type="t5", @@ -41,16 +39,16 @@ scheduler = dict( timestep_respacing="", ) -# runtime +# Others seed = 42 outputs = "outputs" wandb = False epochs = 1000 log_every = 10 -ckpt_every = 1000 +ckpt_every = 250 load = None -batch_size = 1 +batch_size = 4 lr = 2e-5 grad_clip = 1.0 diff --git a/docs/command.md b/docs/command.md deleted file mode 100644 index e69de29..0000000 diff --git a/docs/commands.md b/docs/commands.md new file mode 100644 index 0000000..261e179 --- /dev/null +++ b/docs/commands.md @@ -0,0 +1,9 @@ +# Commands + +## Inference + +### Inference with DiT pretrained on ImageNet + + + +## Training diff --git a/docs/structure.md b/docs/structure.md index deb967b..fbbcc05 100644 --- a/docs/structure.md +++ b/docs/structure.md @@ -68,6 +68,15 @@ Open-Sora ## Inference config demos +To change the inference settings, you can either edit the corresponding config file directly or pass command-line arguments that override individual config values (see [config_utils.py](/opensora/utils/config_utils.py)). To change the sampling prompts, modify the `.txt` file passed to the `--prompt_path` argument. + +```plaintext +--prompt_path ./assets/texts/t2v_samples.txt -> prompt_path +--ckpt-path ./path/to/your/ckpt.pth -> model["from_pretrained"] +``` + +The explanation of each field is provided below. 
+ ```python # Define sampling size num_frames = 64 # number of frames diff --git a/opensora/utils/config_utils.py b/opensora/utils/config_utils.py index 48f2d9a..5ef8150 100644 --- a/opensora/utils/config_utils.py +++ b/opensora/utils/config_utils.py @@ -32,13 +32,14 @@ def parse_args(training=False): else: parser.add_argument("--wandb", default=None, type=bool, help="enable wandb") parser.add_argument("--load", default=None, type=str, help="path to continue training") + parser.add_argument("--data-path", default=None, type=str, help="path to data csv") return parser.parse_args() def merge_args(cfg, args, training=False): if args.ckpt_path is not None: - cfg.model["from_pratrained"] = args.ckpt_path + cfg.model["from_pretrained"] = args.ckpt_path args.ckpt_path = None if not training: diff --git a/requirements.txt b/requirements.txt index d2e02b8..0724a86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,13 @@ -# should be installed by users -torch>=1.13 colossalai -flash_attn # optional -apex # optional - -# packages accelerate diffusers -timm -transformers -xformers # necessary for PixArt - -# config & logging -gdown -pre-commit -matplotlib -mmengine -tensorboard -tqdm -wandb - -# video -pyav - -# text -clip ftfy +gdown +mmengine +pre-commit +pyav +tensorboard +timm +tqdm +transformers +wandb diff --git a/setup.py b/setup.py index 1ac64d2..b4b3f49 100644 --- a/setup.py +++ b/setup.py @@ -30,12 +30,10 @@ def fetch_readme() -> str: setup( name="opensora", - version="0.1.0", + version="1.0.0", packages=find_packages( exclude=( - "videos", "tests", - "figure", "*.egg-info", ) ),