diff --git a/docs/commands.md b/docs/commands.md index 2c948de..d982db8 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -1,5 +1,6 @@ # Commands +- [Config](#config) - [Inference](#inference) - [Inference with Open-Sora 1.2](#inference-with-open-sora-12) - [Inference with Open-Sora 1.1](#inference-with-open-sora-11) @@ -12,6 +13,35 @@ - [Training Hyperparameters](#training-hyperparameters) - [Search batch size for buckets](#search-batch-size-for-buckets) +## Config +Note that model loading for the VAE and the diffusion model currently supports two sources: + +* load from a local file path +* load from Hugging Face + +Our configs load from Hugging Face by default. +If you wish to load from a local path, you need to set `force_huggingface=True`, for instance: + +```python +# for vae +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="/root/commonData/OpenSora-VAE-v1.2", + micro_frame_size=17, + micro_batch_size=4, + force_huggingface=True, # NOTE: set here +) +# for diffusion model +model = dict( + type="STDiT3-XL/2", + from_pretrained="/root/commonData/OpenSora-STDiT-v3", + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + force_huggingface=True, # NOTE: set here +) +``` + ## Inference You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos). 
diff --git a/eval/README.md b/eval/README.md index 7ae6e32..261c21b 100644 --- a/eval/README.md +++ b/eval/README.md @@ -48,8 +48,14 @@ First, generate the relevant videos with the following commands: ```bash # vbench task, if evaluation all set start_index to 0, end_index to 2000 bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index + # Alternatively, launch 8 jobs at once (you must read the script to understand the details) bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name + +# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine +bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value +# for example +# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True ``` After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples. @@ -89,6 +95,15 @@ python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/ ``` +Similarly to VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine: + +```bash +bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value +# for example +# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True +# if you do not want flow control, pass "None" as flow_value +``` + ## VAE Install the dependencies package following our [installation](../docs/installation.md)'s s sections of "Evaluation Dependencies". 
Then, run the following evaluation command: diff --git a/eval/sample.sh b/eval/sample.sh index 241c229..83218bd 100644 --- a/eval/sample.sh +++ b/eval/sample.sh @@ -9,6 +9,10 @@ VBENCH_END_INDEX=$6 VBENCH_RES=$7 VBENCH_ASP_RATIO=$8 +NUM_SAMPLING_STEPS=$9 +FLOW=${10} +LLM_REFINE=${11} + echo "NUM_FRAMES=${NUM_FRAMES}" if [ -z "${NUM_FRAMES}" ]; then @@ -238,10 +242,38 @@ function run_vbench() { --image-size $VBENCH_H $VBENCH_W \ --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 else - eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ - --prompt-path assets/texts/VBench/all_dimension.txt \ - --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ - --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + if [ -z ${NUM_SAMPLING_STEPS} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${FLOW} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${LLM_REFINE} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ "${FLOW}" = "None" ]; then + eval $CMD --ckpt-path $CKPT --save-dir 
${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_dimension.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + fi + fi + fi + fi fi } @@ -255,16 +287,41 @@ function run_vbench_i2v() { eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ --prompt-path assets/texts/VBench/all_i2v.txt \ --image-size $VBENCH_I2V_H $VBENCH_I2V_W \ - --start-index $1 --end-index $2 \ - --num-frames $NUM_FRAMES --batch-size $VBENCH_BS + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 else - eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ - --prompt-path assets/texts/VBench/all_i2v.txt \ - --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ - --start-index $1 --end-index $2 \ - --num-frames $NUM_FRAMES --batch-size $VBENCH_BS + if [ -z ${NUM_SAMPLING_STEPS} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${FLOW} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio 
$VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ -z ${LLM_REFINE} ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + if [ "${FLOW}" = "None" ]; then + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + else + eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \ + --prompt-path assets/texts/VBench/all_i2v.txt \ + --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \ + --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2 + fi + fi + fi + fi fi - } ### Main diff --git a/eval/vbench/calc_vbench.py b/eval/vbench/calc_vbench.py index b456f28..e5570a1 100644 --- a/eval/vbench/calc_vbench.py +++ b/eval/vbench/calc_vbench.py @@ -8,24 +8,30 @@ from vbench import VBench full_info_path = "eval/vbench/VBench_full_info.json" dimensions = [ - # Quality Score - "subject_consistency", - "background_consistency", - "motion_smoothness", - "dynamic_degree", - "aesthetic_quality", - "imaging_quality", - "temporal_flickering", - # Semantic Score - "object_class", - "multiple_objects", - "color", - "spatial_relationship", - "scene", - "temporal_style", - "overall_consistency", - "human_action", - 
"appearance_style", + # a: 10min + "subject_consistency", # 4min + "imaging_quality", # 6min + # b: 12min + "background_consistency", # 2min + "motion_smoothness", # 5min + "overall_consistency", # 2min + "human_action", # 3min + # c: 14min + "multiple_objects", # 14min + # d: 14min + "spatial_relationship", # 14min + # e: 12min + "object_class", # 12min + # f: 12min + "color", # 12min + # g: 10.5min + "aesthetic_quality", # 2.5min + "appearance_style", # 6min + "temporal_flickering", # 2min + # h: 9min + "scene", # 3min + "temporal_style", # 2min + "dynamic_degree", # 4min ] diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh index eedd9b3..e7c1165 100644 --- a/eval/vbench/launch.sh +++ b/eval/vbench/launch.sh @@ -6,6 +6,10 @@ MODEL_NAME=$3 RES=$4 ASP_RATIO=$5 +NUM_SAMPLING_STEPS=$6 +FLOW=$7 +LLM_REFINE=$8 + if [[ $CKPT == *"ema"* ]]; then parentdir=$(dirname $CKPT) CKPT_BASE=$(basename $parentdir)_ema @@ -20,11 +24,36 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only START_INDEX_LIST=(0 120 240 360 480 600 720 840) END_INDEX_LIST=(120 240 360 480 600 720 840 2000) +## Modify the following to run on multiple machines for faster results +## 720p will take quite long on a single machine +# START_INDEX_LIST=(60 180 300 420 540 660 780 900) +# END_INDEX_LIST=(120 240 360 480 600 720 840 2000) +# LOG_BASE=$(dirname $CKPT)/eval/last_60 +# mkdir -p ${LOG_BASE} +# echo "Logging to $LOG_BASE" + + + for i in "${!GPUS[@]}"; do if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; then CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & else - CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + if [ -z ${NUM_SAMPLING_STEPS} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT 
${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${FLOW} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${LLM_REFINE} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + fi + fi + fi fi done diff --git a/eval/vbench/launch_calc.sh b/eval/vbench/launch_calc.sh index 53114b9..9f14ce5 100644 --- a/eval/vbench/launch_calc.sh +++ b/eval/vbench/launch_calc.sh @@ -7,11 +7,10 @@ mkdir -p $LOG_BASE echo "Logging to $LOG_BASE" GPUS=(0 1 2 3 4 5 6 7) -START_INDEX_LIST=(0 2 4 6 8 10 12 14) -END_INDEX_LIST=(2 4 6 8 10 12 14 16) +START_INDEX_LIST=(0 2 6 7 8 9 10 13) +END_INDEX_LIST=(2 6 7 8 9 10 13 16) TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only - for i in "${!GPUS[@]}"; do CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & done diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh index d8eea1d..2b03309 100644 --- a/eval/vbench_i2v/launch.sh +++ b/eval/vbench_i2v/launch.sh @@ -6,6 +6,10 @@ MODEL_NAME=$3 RES=$4 ASP_RATIO=$5 +NUM_SAMPLING_STEPS=$6 +FLOW=$7 +LLM_REFINE=$8 + if [[ $CKPT == *"ema"* ]]; 
then parentdir=$(dirname $CKPT) CKPT_BASE=$(basename $parentdir)_ema @@ -20,11 +24,27 @@ TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only START_INDEX_LIST=(0 140 280 420 560 700 840 980) END_INDEX_LIST=(140 280 420 560 700 840 980 2000) + for i in "${!GPUS[@]}"; do if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ; then CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & else - CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + if [ -z ${NUM_SAMPLING_STEPS} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${FLOW} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + if [ -z ${LLM_REFINE} ]; + then + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + else + CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 & + fi + fi + fi fi done diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py index bd9672d..bb71d04 100644 --- a/opensora/models/stdit/stdit3.py +++ b/opensora/models/stdit/stdit3.py @@ -4,6 +4,7 @@ import numpy as np import torch import torch.distributed as dist import 
torch.nn as nn +import torch.nn.functional as F from einops import rearrange from rotary_embedding_torch import RotaryEmbedding from timm.models.layers import DropPath @@ -361,6 +362,19 @@ class STDiT3(PreTrainedModel): # === get pos embed === _, _, Tx, Hx, Wx = x.size() T, H, W = self.get_dynamic_size(x) + + # adjust for sequence parallelism + # we need to ensure H * W is divisible by sequence parallel size + # for simplicity, we round H up to the next multiple of sp_size (the outer modulo keeps the pad at 0 when H is already a multiple) + if self.enable_sequence_parallelism: + sp_size = dist.get_world_size(get_sequence_parallel_group()) + h_pad_size = (sp_size - H % sp_size) % sp_size + hx_pad_size = h_pad_size * self.patch_size[1] + + # pad x along the H dimension + H += h_pad_size + x = F.pad(x, (0, 0, 0, hx_pad_size)) + S = H * W base_size = round(S**0.5) resolution_sq = (height[0].item() * width[0].item()) ** 0.5