mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-05-21 11:59:01 +02:00
Merge branch 'main' of https://github.com/hpcaitech/Open-Sora-dev into main
This commit is contained in:
commit
477c800db1
|
|
@ -19,14 +19,12 @@ model = dict(
|
|||
qk_norm=True,
|
||||
enable_flash_attn=True,
|
||||
enable_layernorm_kernel=True,
|
||||
force_huggingface=True,
|
||||
)
|
||||
vae = dict(
|
||||
type="OpenSoraVAE_V1_2",
|
||||
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
|
||||
micro_frame_size=17,
|
||||
micro_batch_size=4,
|
||||
force_huggingface=True,
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
|
|
|
|||
44
configs/opensora-v1-2/inference/sample_hf.py
Normal file
44
configs/opensora-v1-2/inference/sample_hf.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
resolution = "240p"
|
||||
aspect_ratio = "9:16"
|
||||
num_frames = 51
|
||||
fps = 24
|
||||
frame_interval = 1
|
||||
save_fps = 24
|
||||
|
||||
save_dir = "./samples/samples/"
|
||||
seed = 42
|
||||
batch_size = 1
|
||||
multi_resolution = "STDiT2"
|
||||
dtype = "bf16"
|
||||
condition_frame_length = 5
|
||||
align = 5
|
||||
|
||||
model = dict(
|
||||
type="STDiT3-XL/2",
|
||||
from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
|
||||
qk_norm=True,
|
||||
enable_flash_attn=True,
|
||||
enable_layernorm_kernel=True,
|
||||
force_huggingface=True,
|
||||
)
|
||||
vae = dict(
|
||||
type="OpenSoraVAE_V1_2",
|
||||
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
|
||||
micro_frame_size=17,
|
||||
micro_batch_size=4,
|
||||
force_huggingface=True,
|
||||
)
|
||||
text_encoder = dict(
|
||||
type="t5",
|
||||
from_pretrained="DeepFloyd/t5-v1_1-xxl",
|
||||
model_max_length=300,
|
||||
)
|
||||
scheduler = dict(
|
||||
type="rflow",
|
||||
use_timestep_transform=True,
|
||||
num_sampling_steps=30,
|
||||
cfg_scale=7.0,
|
||||
)
|
||||
|
||||
aes = 6.5
|
||||
flow = None
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
num_frames = 1
|
||||
fps = 1
|
||||
image_size = (2560, 1536)
|
||||
# image_size = (2560, 1536)
|
||||
# image_size = (2048, 2048)
|
||||
|
||||
model = dict(
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
# Commands
|
||||
|
||||
- [Config](#config)
|
||||
- [Inference](#inference)
|
||||
- [Inference with Open-Sora 1.2](#inference-with-open-sora-12)
|
||||
- [Inference with Open-Sora 1.1](#inference-with-open-sora-11)
|
||||
|
|
@ -12,6 +13,36 @@
|
|||
- [Training Hyperparameters](#training-hyperparameters)
|
||||
- [Search batch size for buckets](#search-batch-size-for-buckets)
|
||||
|
||||
## Config
|
||||
Note that model loading for the VAE and the diffusion model currently supports two sources:
|
||||
|
||||
* load from local file path
|
||||
* load from huggingface
|
||||
|
||||
By default, our configs load the pretrained weights online from the Hugging Face Hub.
|
||||
If you wish to load from a local path that contains a checkpoint downloaded from Hugging Face, you need to set `force_huggingface=True`, for instance:
|
||||
|
||||
```python
|
||||
# for vae
|
||||
vae = dict(
|
||||
type="OpenSoraVAE_V1_2",
|
||||
from_pretrained="/root/commonData/OpenSora-VAE-v1.2",
|
||||
micro_frame_size=17,
|
||||
micro_batch_size=4,
|
||||
force_huggingface=True, # NOTE: set here
|
||||
)
|
||||
# for diffusion model
|
||||
model = dict(
|
||||
type="STDiT3-XL/2",
|
||||
from_pretrained="/root/commonData/OpenSora-STDiT-v3",
|
||||
qk_norm=True,
|
||||
enable_flash_attn=True,
|
||||
enable_layernorm_kernel=True,
|
||||
force_huggingface=True, # NOTE: set here
|
||||
)
|
||||
```
|
||||
However, if you want to load a self-trained model, do not set `force_huggingface=True`, since your checkpoint won't be in Hugging Face format.
|
||||
|
||||
## Inference
|
||||
|
||||
You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).
|
||||
|
|
|
|||
|
|
@ -48,8 +48,14 @@ First, generate the relevant videos with the following commands:
|
|||
```bash
|
||||
# vbench task; to evaluate all prompts, set start_index to 0 and end_index to 2000
|
||||
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index
|
||||
|
||||
# Alternatively, launch 8 jobs at once (you must read the script to understand the details)
|
||||
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
|
||||
|
||||
# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
|
||||
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
|
||||
# for example
|
||||
# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
|
||||
```
|
||||
|
||||
After generation, install the VBench package following the "Evaluation Dependencies" section of our [installation](../docs/installation.md) guide. Then, run the following commands to evaluate the generated samples.
|
||||
|
|
@ -89,6 +95,15 @@ python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/
|
|||
|
||||
```
|
||||
|
||||
Similar to VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine:
|
||||
|
||||
```bash
|
||||
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
|
||||
# for example
|
||||
# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
|
||||
# if no flow control, use "None" instead
|
||||
```
|
||||
|
||||
## VAE
|
||||
|
||||
Install the dependency packages following the "Evaluation Dependencies" section of our [installation](../docs/installation.md) guide. Then, run the following evaluation command:
|
||||
|
|
|
|||
|
|
@ -3,8 +3,16 @@
|
|||
CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py"
|
||||
CKPT_PATH=$1
|
||||
MODEL_NAME=$2
|
||||
IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
|
||||
VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
|
||||
IMG_PATH=$3
|
||||
VID_PATH=$4
|
||||
|
||||
if [ -z $IMG_PATH ]; then
|
||||
IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
|
||||
fi
|
||||
|
||||
if [ -z $VID_PATH ]; then
|
||||
VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
|
||||
fi
|
||||
|
||||
if [[ $CKPT_PATH == *"ema"* ]]; then
|
||||
parentdir=$(dirname $CKPT_PATH)
|
||||
|
|
|
|||
|
|
@ -9,6 +9,10 @@ VBENCH_END_INDEX=$6
|
|||
VBENCH_RES=$7
|
||||
VBENCH_ASP_RATIO=$8
|
||||
|
||||
NUM_SAMPLING_STEPS=$9
|
||||
FLOW=${10}
|
||||
LLM_REFINE=${11}
|
||||
|
||||
echo "NUM_FRAMES=${NUM_FRAMES}"
|
||||
|
||||
if [ -z "${NUM_FRAMES}" ]; then
|
||||
|
|
@ -39,7 +43,7 @@ DEFAULT_BS=1
|
|||
# called inside run_video_b
|
||||
function run_image() { # 14min
|
||||
# 1.1 1024x1024
|
||||
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect_ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
|
||||
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect-ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
|
||||
|
||||
# 1.2 240x426
|
||||
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 240p --aspect-ratio 9:16 --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS
|
||||
|
|
@ -238,10 +242,38 @@ function run_vbench() {
|
|||
--image-size $VBENCH_H $VBENCH_W \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_dimension.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
if [ -z ${NUM_SAMPLING_STEPS} ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_dimension.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
if [ -z ${FLOW} ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_dimension.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
if [ -z ${LLM_REFINE} ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_dimension.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
if [ "${FLOW}" = "None" ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_dimension.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_dimension.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
|
|
@ -255,16 +287,41 @@ function run_vbench_i2v() {
|
|||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_i2v.txt \
|
||||
--image-size $VBENCH_I2V_H $VBENCH_I2V_W \
|
||||
--start-index $1 --end-index $2 \
|
||||
--num-frames $NUM_FRAMES --batch-size $VBENCH_BS
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_i2v.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
|
||||
--start-index $1 --end-index $2 \
|
||||
--num-frames $NUM_FRAMES --batch-size $VBENCH_BS
|
||||
if [ -z ${NUM_SAMPLING_STEPS} ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_i2v.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
if [ -z ${FLOW} ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_i2v.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
if [ -z ${LLM_REFINE} ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_i2v.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
if [ "${FLOW}" = "None" ]; then
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_i2v.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
else
|
||||
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
|
||||
--prompt-path assets/texts/VBench/all_i2v.txt \
|
||||
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
|
||||
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
}
|
||||
|
||||
### Main
|
||||
|
|
|
|||
|
|
@ -8,24 +8,30 @@ from vbench import VBench
|
|||
|
||||
full_info_path = "eval/vbench/VBench_full_info.json"
|
||||
dimensions = [
|
||||
# Quality Score
|
||||
"subject_consistency",
|
||||
"background_consistency",
|
||||
"motion_smoothness",
|
||||
"dynamic_degree",
|
||||
"aesthetic_quality",
|
||||
"imaging_quality",
|
||||
"temporal_flickering",
|
||||
# Semantic Score
|
||||
"object_class",
|
||||
"multiple_objects",
|
||||
"color",
|
||||
"spatial_relationship",
|
||||
"scene",
|
||||
"temporal_style",
|
||||
"overall_consistency",
|
||||
"human_action",
|
||||
"appearance_style",
|
||||
# a: 10min
|
||||
"subject_consistency", # 4min
|
||||
"imaging_quality", # 6min
|
||||
# b: 12min
|
||||
"background_consistency", # 2min
|
||||
"motion_smoothness", # 5min
|
||||
"overall_consistency", # 2min
|
||||
"human_action", # 3min
|
||||
# c: 14min
|
||||
"multiple_objects", # 14min
|
||||
# d: 14min
|
||||
"spatial_relationship", # 14min
|
||||
# e: 12min
|
||||
"object_class", # 12min
|
||||
# f: 12min
|
||||
"color", # 12min
|
||||
# g: 10.5min
|
||||
"aesthetic_quality", # 2.5min
|
||||
"appearance_style", # 6min
|
||||
"temporal_flickering", # 2min
|
||||
# h: 9min
|
||||
"scene", # 3min
|
||||
"temporal_style", # 2min
|
||||
"dynamic_degree", # 4min
|
||||
]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,10 @@ MODEL_NAME=$3
|
|||
RES=$4
|
||||
ASP_RATIO=$5
|
||||
|
||||
NUM_SAMPLING_STEPS=$6
|
||||
FLOW=$7
|
||||
LLM_REFINE=$8
|
||||
|
||||
if [[ $CKPT == *"ema"* ]]; then
|
||||
parentdir=$(dirname $CKPT)
|
||||
CKPT_BASE=$(basename $parentdir)_ema
|
||||
|
|
@ -20,11 +24,36 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
|
|||
START_INDEX_LIST=(0 120 240 360 480 600 720 840)
|
||||
END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
|
||||
|
||||
## Modify the following to run on multiple machines for faster results
|
||||
## 720p will take quite long on a single machine
|
||||
# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
|
||||
# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
|
||||
# LOG_BASE=$(dirname $CKPT)/eval/last_60
|
||||
# mkdir -p ${LOG_BASE}
|
||||
# echo "Logging to $LOG_BASE"
|
||||
|
||||
|
||||
|
||||
for i in "${!GPUS[@]}"; do
|
||||
if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ;
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
if [ -z ${NUM_SAMPLING_STEPS} ];
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
if [ -z ${FLOW} ];
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
if [ -z ${LLM_REFINE} ];
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
|
|
|||
|
|
@ -7,11 +7,10 @@ mkdir -p $LOG_BASE
|
|||
echo "Logging to $LOG_BASE"
|
||||
|
||||
GPUS=(0 1 2 3 4 5 6 7)
|
||||
START_INDEX_LIST=(0 2 4 6 8 10 12 14)
|
||||
END_INDEX_LIST=(2 4 6 8 10 12 14 16)
|
||||
START_INDEX_LIST=(0 2 6 7 8 9 10 13)
|
||||
END_INDEX_LIST=(2 6 7 8 9 10 13 16)
|
||||
TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only
|
||||
|
||||
|
||||
for i in "${!GPUS[@]}"; do
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
done
|
||||
|
|
|
|||
|
|
@ -6,6 +6,10 @@ MODEL_NAME=$3
|
|||
RES=$4
|
||||
ASP_RATIO=$5
|
||||
|
||||
NUM_SAMPLING_STEPS=$6
|
||||
FLOW=$7
|
||||
LLM_REFINE=$8
|
||||
|
||||
if [[ $CKPT == *"ema"* ]]; then
|
||||
parentdir=$(dirname $CKPT)
|
||||
CKPT_BASE=$(basename $parentdir)_ema
|
||||
|
|
@ -20,11 +24,27 @@ TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
|
|||
START_INDEX_LIST=(0 140 280 420 560 700 840 980)
|
||||
END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
|
||||
|
||||
|
||||
for i in "${!GPUS[@]}"; do
|
||||
if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ;
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
if [ -z ${NUM_SAMPLING_STEPS} ];
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
if [ -z ${FLOW} ];
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
if [ -z ${LLM_REFINE} ];
|
||||
then
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
else
|
||||
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
|
|
|||
|
|
@ -111,6 +111,9 @@ def prepare_dataloader(
|
|||
|
||||
|
||||
def collate_fn_default(batch):
|
||||
# filter out None
|
||||
batch = [x for x in batch if x is not None]
|
||||
|
||||
# HACK: for loading text features
|
||||
use_mask = False
|
||||
if "mask" in batch[0] and isinstance(batch[0]["mask"], int):
|
||||
|
|
@ -132,6 +135,9 @@ def collate_fn_batch(batch):
|
|||
"""
|
||||
Used only with BatchDistributedSampler
|
||||
"""
|
||||
# filter out None
|
||||
batch = [x for x in batch if x is not None]
|
||||
|
||||
res = torch.utils.data.default_collate(batch)
|
||||
|
||||
# squeeze the first dimension, which is due to torch.stack() in default_collate()
|
||||
|
|
|
|||
|
|
@ -190,7 +190,10 @@ class VariableVideoTextDataset(VideoTextDataset):
|
|||
return ret
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self.getitem(index)
|
||||
try:
|
||||
return self.getitem(index)
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
@DATASETS.register_module()
|
||||
|
|
|
|||
|
|
@ -499,7 +499,7 @@ class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention):
|
|||
|
||||
# shape:
|
||||
# q, k, v: [B, SUB_N, NUM_HEADS, HEAD_DIM]
|
||||
q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
|
||||
q = self.q_linear(x).view(B, -1, self.num_heads, self.head_dim)
|
||||
kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
|
||||
kv = split_forward_gather_backward(kv, get_sequence_parallel_group(), dim=3, grad_scale="down")
|
||||
k, v = kv.unbind(2)
|
||||
|
|
|
|||
|
|
@ -204,9 +204,11 @@ class PixArt(nn.Module):
|
|||
t: (N,) tensor of diffusion timesteps
|
||||
y: (N, 1, 120, C) tensor of class labels
|
||||
"""
|
||||
x = x.to(self.dtype)
|
||||
timestep = timestep.to(self.dtype)
|
||||
y = y.to(self.dtype)
|
||||
dtype = self.x_embedder.proj.weight.dtype
|
||||
B = x.size(0)
|
||||
x = x.to(dtype)
|
||||
timestep = timestep.to(dtype)
|
||||
y = y.to(dtype)
|
||||
|
||||
# embedding
|
||||
x = self.x_embedder(x) # (B, N, D)
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import numpy as np
|
|||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from einops import rearrange
|
||||
from rotary_embedding_torch import RotaryEmbedding
|
||||
from timm.models.layers import DropPath
|
||||
|
|
@ -361,6 +362,24 @@ class STDiT3(PreTrainedModel):
|
|||
# === get pos embed ===
|
||||
_, _, Tx, Hx, Wx = x.size()
|
||||
T, H, W = self.get_dynamic_size(x)
|
||||
|
||||
# adjust for sequence parallelism
|
||||
# we need to ensure H * W is divisible by sequence parallel size
|
||||
# for simplicity, we can adjust the height to make it divisible
|
||||
if self.enable_sequence_parallelism:
|
||||
sp_size = dist.get_world_size(get_sequence_parallel_group())
|
||||
if H % sp_size != 0:
|
||||
h_pad_size = sp_size - H % sp_size
|
||||
else:
|
||||
h_pad_size = 0
|
||||
|
||||
if h_pad_size > 0:
|
||||
hx_pad_size = h_pad_size * self.patch_size[1]
|
||||
|
||||
# pad x along the H dimension
|
||||
H += h_pad_size
|
||||
x = F.pad(x, (0, 0, 0, hx_pad_size))
|
||||
|
||||
S = H * W
|
||||
base_size = round(S**0.5)
|
||||
resolution_sq = (height[0].item() * width[0].item()) ** 0.5
|
||||
|
|
@ -448,7 +467,7 @@ class STDiT3(PreTrainedModel):
|
|||
@MODELS.register_module("STDiT3-XL/2")
|
||||
def STDiT3_XL_2(from_pretrained=None, **kwargs):
|
||||
force_huggingface = kwargs.pop("force_huggingface", False)
|
||||
if force_huggingface or from_pretrained is not None and not os.path.isdir(from_pretrained):
|
||||
if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained):
|
||||
model = STDiT3.from_pretrained(from_pretrained, **kwargs)
|
||||
else:
|
||||
config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
|
||||
|
|
@ -460,7 +479,8 @@ def STDiT3_XL_2(from_pretrained=None, **kwargs):
|
|||
|
||||
@MODELS.register_module("STDiT3-3B/2")
|
||||
def STDiT3_3B_2(from_pretrained=None, **kwargs):
|
||||
if from_pretrained is not None and not os.path.isdir(from_pretrained):
|
||||
force_huggingface = kwargs.pop("force_huggingface", False)
|
||||
if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained):
|
||||
model = STDiT3.from_pretrained(from_pretrained, **kwargs)
|
||||
else:
|
||||
config = STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs)
|
||||
|
|
|
|||
|
|
@ -277,7 +277,7 @@ def OpenSoraVAE_V1_2(
|
|||
scale=scale,
|
||||
)
|
||||
|
||||
if force_huggingface or (from_pretrained is not None and not os.path.isdir(from_pretrained)):
|
||||
if force_huggingface or (from_pretrained is not None and not os.path.exists(from_pretrained)):
|
||||
model = VideoAutoencoderPipeline.from_pretrained(from_pretrained, **kwargs)
|
||||
else:
|
||||
config = VideoAutoencoderPipelineConfig(**kwargs)
|
||||
|
|
|
|||
|
|
@ -24,7 +24,8 @@ class DPM_SOLVER:
|
|||
mask=None,
|
||||
progress=True,
|
||||
):
|
||||
assert mask is None, "mask is not supported in dpm-solver"
|
||||
if mask is not None:
|
||||
print("[WARNING] mask is not supported in dpm-solver, it will be ignored")
|
||||
n = len(prompts)
|
||||
model_args = text_encoder.encode(prompts)
|
||||
y = model_args.pop("y")
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config
|
|||
|
||||
|
||||
def get_sample_data():
|
||||
x = torch.rand([1, 4, 15, 20, 27], dtype=torch.bfloat16) # (B, C, T, H, W)
|
||||
x = torch.rand([1, 4, 15, 20, 28], dtype=torch.bfloat16) # (B, C, T, H, W)
|
||||
timestep = torch.Tensor([924.0]).to(torch.bfloat16)
|
||||
y = torch.rand(1, 1, 300, 4096, dtype=torch.bfloat16)
|
||||
mask = torch.ones([1, 300], dtype=torch.int32)
|
||||
|
|
@ -66,6 +66,17 @@ def run_model(rank, world_size, port):
|
|||
set_seed(1024)
|
||||
dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True)
|
||||
dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16)
|
||||
|
||||
# ensure model weights are equal
|
||||
for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()):
|
||||
assert torch.equal(p1, p2)
|
||||
|
||||
# ensure model weights are equal across all ranks
|
||||
for p in dist_model.parameters():
|
||||
p_list = [torch.zeros_like(p) for _ in range(world_size)]
|
||||
dist.all_gather(p_list, p, group=dist.group.WORLD)
|
||||
assert torch.equal(*p_list)
|
||||
|
||||
dist_out = dist_model(**data)
|
||||
dist_out.mean().backward()
|
||||
|
||||
|
|
@ -84,9 +95,8 @@ def run_model(rank, world_size, port):
|
|||
for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()):
|
||||
assert n1 == n2
|
||||
if p1.grad is not None and p2.grad is not None:
|
||||
if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4):
|
||||
if dist.get_rank() == 0:
|
||||
print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
|
||||
if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0:
|
||||
print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
|
||||
else:
|
||||
assert p1.grad is None and p2.grad is None
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue