update cai to latest

This commit is contained in:
zhengzangw 2024-07-08 06:29:17 +00:00
commit 19617f6bc4
35 changed files with 1285 additions and 155 deletions

1
.gitignore vendored
View file

@ -179,6 +179,7 @@ pretrained_models
evaluation_results/
cache/
*.swp
debug/
# Secret files
hostfile

View file

@ -0,0 +1,40 @@
随机电影镜头
随机电影镜头
随机电影镜头
随机电影镜头
随机电影镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机任务镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机游戏镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机开车镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机动物镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机森林镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机动漫镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头
随机舞蹈镜头

View file

@ -19,14 +19,12 @@ model = dict(
qk_norm=True,
enable_flash_attn=True,
enable_layernorm_kernel=True,
force_huggingface=True,
)
vae = dict(
type="OpenSoraVAE_V1_2",
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
micro_frame_size=17,
micro_batch_size=4,
force_huggingface=True,
)
text_encoder = dict(
type="t5",

View file

@ -0,0 +1,44 @@
resolution = "240p"
aspect_ratio = "9:16"
num_frames = 51
fps = 24
frame_interval = 1
save_fps = 24
save_dir = "./samples/samples/"
seed = 42
batch_size = 1
multi_resolution = "STDiT2"
dtype = "bf16"
condition_frame_length = 5
align = 5
model = dict(
type="STDiT3-XL/2",
from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
qk_norm=True,
enable_flash_attn=True,
enable_layernorm_kernel=True,
force_huggingface=True,
)
vae = dict(
type="OpenSoraVAE_V1_2",
from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
micro_frame_size=17,
micro_batch_size=4,
force_huggingface=True,
)
text_encoder = dict(
type="t5",
from_pretrained="DeepFloyd/t5-v1_1-xxl",
model_max_length=300,
)
scheduler = dict(
type="rflow",
use_timestep_transform=True,
num_sampling_steps=30,
cfg_scale=7.0,
)
aes = 6.5
flow = None

View file

@ -5,7 +5,7 @@ dataset = dict(
)
# webvid
bucket_config = {"360p": {102: (1.0, 5)}}
bucket_config = {"360p": {102: (1.0, 1)}}
grad_checkpoint = True
# Acceleration settings

View file

@ -60,19 +60,21 @@ scheduler = dict(
)
# Mask settings
# 25%
mask_ratios = {
"random": 0.05,
"intepolate": 0.005,
"quarter_random": 0.005,
"quarter_head": 0.005,
"quarter_tail": 0.005,
"quarter_head_tail": 0.005,
"image_random": 0.025,
"image_head": 0.05,
"image_tail": 0.025,
"image_head_tail": 0.025,
"random": 0.005,
"intepolate": 0.002,
"quarter_random": 0.007,
"quarter_head": 0.002,
"quarter_tail": 0.002,
"quarter_head_tail": 0.002,
"image_random": 0.0,
"image_head": 0.22,
"image_tail": 0.005,
"image_head_tail": 0.005,
}
# Log settings
seed = 42
outputs = "outputs"

View file

@ -1,6 +1,6 @@
num_frames = 1
fps = 1
image_size = (2560, 1536)
# image_size = (2560, 1536)
# image_size = (2048, 2048)
model = dict(
@ -16,6 +16,7 @@ vae = dict(
type="VideoAutoencoderKL",
from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
subfolder="vae",
scaling_factor=0.13025,
)
text_encoder = dict(
type="t5",

View file

@ -1,5 +1,6 @@
# Commands
- [Config](#Config)
- [Inference](#inference)
- [Inference with Open-Sora 1.2](#inference-with-open-sora-12)
- [Inference with Open-Sora 1.1](#inference-with-open-sora-11)
@ -12,6 +13,36 @@
- [Training Hyperparameters](#training-hyperparameters)
- [Search batch size for buckets](#search-batch-size-for-buckets)
## Config
Note that currently our model loading for the vae and the diffusion model supports two sources:
* load from a local file path
* load from huggingface
Our config loads from the huggingface online repository by default.
If you wish to load from a local path containing weights downloaded from huggingface, you need to set `force_huggingface=True`, for instance:
```python
# for vae
vae = dict(
type="OpenSoraVAE_V1_2",
from_pretrained="/root/commonData/OpenSora-VAE-v1.2",
micro_frame_size=17,
micro_batch_size=4,
force_huggingface=True, # NOTE: set here
)
# for diffusion model
model = dict(
type="STDiT3-XL/2",
from_pretrained="/root/commonData/OpenSora-STDiT-v3",
qk_norm=True,
enable_flash_attn=True,
enable_layernorm_kernel=True,
force_huggingface=True, # NOTE: set here
)
```
However, if you want to load a self-trained model, do not set `force_huggingface=True`, since your checkpoint won't be in huggingface format.
## Inference
You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).

View file

@ -7,7 +7,11 @@
- [Evaluation](#evaluation)
- [Sequence parallelism](#sequence-parallelism)
In Open-Sora 1.2 release, we train a 1.1B model on >30M data (about 80k hours), with a training cost of 35k H100 GPU hours, supporting 0s to 16s, 144p to 720p, various aspect ratios video generation. Our configuration is listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension.
| | image | 2s | 4s | 8s | 16s |
| ---- | ----- | --- | --- | --- | --- |

View file

@ -48,8 +48,14 @@ First, generate the relevant videos with the following commands:
```bash
# vbench task; to evaluate everything, set start_index to 0 and end_index to 2000
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4 start_index end_index
# Alternatively, launch 8 jobs at once (you must read the script to understand the details)
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
```
After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples.
@ -89,6 +95,15 @@ python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/
```
Similar to VBench, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine:
```bash
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
# for example
# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
# if no flow control, use "None" instead
```
## VAE
Install the dependencies package following our [installation](../docs/installation.md)'s section of "Evaluation Dependencies". Then, run the following evaluation command:

View file

@ -3,8 +3,16 @@
CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py"
CKPT_PATH=$1
MODEL_NAME=$2
IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
IMG_PATH=$3
VID_PATH=$4
if [ -z $IMG_PATH ]; then
IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
fi
if [ -z $VID_PATH ]; then
VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
fi
if [[ $CKPT_PATH == *"ema"* ]]; then
parentdir=$(dirname $CKPT_PATH)

View file

@ -3,12 +3,44 @@
CKPT=$1
NUM_FRAMES=$2
MODEL_NAME=$3
TASK_TYPE=$4
VBENCH_START_INDEX=$5
VBENCH_END_INDEX=$6
VBENCH_RES=$7
VBENCH_ASP_RATIO=$8
NUM_SAMPLING_STEPS=$9
FLOW=${10}
LLM_REFINE=${11}
BASE_ASPECT_RATIO=360p
ASPECT_RATIOS=(144p 240p 360p 480p 720p 1080p)
# Loop through the list of aspect ratios
i=0
for r in "${ASPECT_RATIOS[@]}"; do
if [[ "$r" == "$BASE_ASPECT_RATIO" ]]; then
# get aspect ratio 1 level up
if [[ $((i+1)) -lt ${#ASPECT_RATIOS[@]} ]]; then
ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[$((i+1))]}
else
# If this is the highest ratio, return the highest ratio
ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[-1]}
fi
# get aspect ratio 2 levels up
if [[ $((i+2)) -lt ${#ASPECT_RATIOS[@]} ]]; then
ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[$((i+2))]}
else
# If this is the highest ratio, return the highest ratio
ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[-1]}
fi
fi
i=$((i+1))
done
echo "base aspect ratio: ${BASE_ASPECT_RATIO}"
echo "aspect ratio 1 level up: ${ASPECT_RATIO_INCR_1}"
echo "aspect ratio 2 levels up: ${ASPECT_RATIO_INCR_2}"
echo "Note that this aspect ratio level setting is used for videos only, not images"
echo "NUM_FRAMES=${NUM_FRAMES}"
if [ -z "${NUM_FRAMES}" ]; then
@ -39,7 +71,7 @@ DEFAULT_BS=1
# called inside run_video_b
function run_image() { # 14min
# 1.1 1024x1024
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect_ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect-ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
# 1.2 240x426
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 240p --aspect-ratio 9:16 --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS
@ -89,13 +121,13 @@ function run_video_a() { # ~ 30min ?
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name sample_2s_720p_9_16 --batch-size $DEFAULT_BS
# sample, 720p, 9:16, 2s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sample_4s_720p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sample_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
# sample, 480p, 9:16, 8s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sample_8s_480p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sample_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS
# sample, 240p, 9:16, 16s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sample_16s_360p --batch-size $DEFAULT_BS
# sample, 360p, 9:16, 16s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sample_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
}
function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
@ -112,10 +144,10 @@ function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 240p --aspect-ratio 9:16 --sample-name short_8s_240p_9_16 --batch-size $DEFAULT_BS
# short, 480p, 9:16, 8s: ~24min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name short_8s_480p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name short_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS
# short, 240p, 9:16, 16s: ~24min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name short_16s_360p --batch-size $DEFAULT_BS
# short, 360p, 9:16, 16s: ~24min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name short_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
}
@ -129,10 +161,10 @@ function run_video_c() {
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 240p --aspect-ratio 9:16 --sample-name sora_16s_240p_9_16 --batch-size $DEFAULT_BS
# short, 720p, 9:16, 2s: ~9min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name short_4s_720p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name short_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
# sora, 240p, 9:16, 16s: ~40min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sora_16s_360p --batch-size $DEFAULT_BS
# sora, 360p, 9:16, 16s: ~40min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sora_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
}
function run_video_d() {
@ -143,17 +175,17 @@ function run_video_d() {
# eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p_9_16 --batch-size $DEFAULT_BS --start-index 0 --end-index 16
# sora, 480p, 9:16, 8s, 1/3 # moved from run_video_e, 30min
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 0 --end-index 16
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 0 --end-index 16
}
function run_video_e() { # 90min * 2/3 = 60min
# sora, 480p, 9:16, 8s, 2/3
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 16 --end-index 100
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 16 --end-index 100
}
function run_video_f() { # 60min
# sora, 720p, 9:16, 2s
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sora_4s_720p --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sora_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
}
# --resolution 720p --aspect-ratio [16:9, 9:16, ...]
@ -162,22 +194,22 @@ function run_video_g() { # 15min
# 720p, 2s multi-resolution
# 1:1
PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:1 --sample-name drone_cliff_prompt_720p_2s_1_1
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_1
# 16:9
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 16:9 --sample-name drone_cliff_prompt_720p_2s_16_9
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 16:9 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_16_9
# 9:16
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name drone_cliff_prompt_720p_2s_9_16
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_9_16
# 4:3
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 4:3 --sample-name drone_cliff_prompt_720p_2s_4_3
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 4:3 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_4_3
# 3:4
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 3:4 --sample-name drone_cliff_prompt_720p_2s_3_4
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 3:4 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_3_4
# 1:2
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:2 --sample-name drone_cliff_prompt_720p_2s_1_2
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:2 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_2
# 2:1
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 2:1 --sample-name drone_cliff_prompt_720p_2s_2_1
eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 2:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_2_1
# add motion score
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name motion_2s_720p --prompt \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \
\"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. motion score: 0.0\" \
\"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \
\"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \
@ -188,7 +220,7 @@ function run_video_g() { # 15min
\"A stylish woman walking in the street of Tokyo. motion score: 100.0\"
# add aes score
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name aes_2s_720p --prompt \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \
\"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \
\"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \
\"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \
@ -202,24 +234,24 @@ function run_video_g() { # 15min
function run_video_h() { # 61min
# 3.1 image-conditioned long video generation
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_360p_9_16 \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_${BASE_ASPECT_RATIO}_9_16 \
--prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
--num-frames 2s --resolution 360p --aspect-ratio 9:16 \
--num-frames 2s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
--loop 5 --condition-frame-length 5 \
--reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
--mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_360p_9_16 \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_${BASE_ASPECT_RATIO}_9_16 \
--prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
--num-frames 16s --resolution 360p --aspect-ratio 9:16 \
--num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
--loop 5 --condition-frame-length 10 \
--reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
--mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
# 3.2
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_240p_9_16 \
eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_${BASE_ASPECT_RATIO}_9_16 \
--prompt-path assets/texts/t2v_ref.txt --start-index 3 --end-index 6 \
--num-frames 16s --resolution 360p --aspect-ratio 9:16 \
--num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
--loop 1 \
--reference-path assets/images/condition/cliff.png "assets/images/condition/cactus-sad.png\;assets/images/condition/cactus-happy.png" https://cdn.openai.com/tmp/s/interp/d0.mp4 \
--mask-strategy "0" "0\;0,1,0,-1,1" "0,0,0,0,${QUAD_FRAMES},0.5" --batch-size $DEFAULT_BS
@ -238,10 +270,38 @@ function run_vbench() {
--image-size $VBENCH_H $VBENCH_W \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
if [ -z ${NUM_SAMPLING_STEPS} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${FLOW} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${LLM_REFINE} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ "${FLOW}" = "None" ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_dimension.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
fi
fi
fi
fi
fi
}
@ -255,16 +315,41 @@ function run_vbench_i2v() {
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--image-size $VBENCH_I2V_H $VBENCH_I2V_W \
--start-index $1 --end-index $2 \
--num-frames $NUM_FRAMES --batch-size $VBENCH_BS
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--start-index $1 --end-index $2 \
--num-frames $NUM_FRAMES --batch-size $VBENCH_BS
if [ -z ${NUM_SAMPLING_STEPS} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${FLOW} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ -z ${LLM_REFINE} ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
if [ "${FLOW}" = "None" ]; then
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
else
eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
--prompt-path assets/texts/VBench/all_i2v.txt \
--resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
--batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
fi
fi
fi
fi
fi
}
### Main

View file

@ -8,24 +8,30 @@ from vbench import VBench
full_info_path = "eval/vbench/VBench_full_info.json"
dimensions = [
# Quality Score
"subject_consistency",
"background_consistency",
"motion_smoothness",
"dynamic_degree",
"aesthetic_quality",
"imaging_quality",
"temporal_flickering",
# Semantic Score
"object_class",
"multiple_objects",
"color",
"spatial_relationship",
"scene",
"temporal_style",
"overall_consistency",
"human_action",
"appearance_style",
# a: 10min
"subject_consistency", # 4min
"imaging_quality", # 6min
# b: 12min
"background_consistency", # 2min
"motion_smoothness", # 5min
"overall_consistency", # 2min
"human_action", # 3min
# c: 14min
"multiple_objects", # 14min
# d: 14min
"spatial_relationship", # 14min
# e: 12min
"object_class", # 12min
# f: 12min
"color", # 12min
# g: 10.5min
"aesthetic_quality", # 2.5min
"appearance_style", # 6min
"temporal_flickering", # 2min
# h: 9min
"scene", # 3min
"temporal_style", # 2min
"dynamic_degree", # 4min
]

View file

@ -6,6 +6,10 @@ MODEL_NAME=$3
RES=$4
ASP_RATIO=$5
NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8
if [[ $CKPT == *"ema"* ]]; then
parentdir=$(dirname $CKPT)
CKPT_BASE=$(basename $parentdir)_ema
@ -20,11 +24,36 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
START_INDEX_LIST=(0 120 240 360 480 600 720 840)
END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
## Modify the following to run on multiple machines for faster results
## 720p will take quite long on a single machine
# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
# LOG_BASE=$(dirname $CKPT)/eval/last_60
# mkdir -p ${LOG_BASE}
# echo "Logging to $LOG_BASE"
for i in "${!GPUS[@]}"; do
if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ;
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
if [ -z ${NUM_SAMPLING_STEPS} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${FLOW} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${LLM_REFINE} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
fi
fi
fi
fi
done

View file

@ -7,11 +7,10 @@ mkdir -p $LOG_BASE
echo "Logging to $LOG_BASE"
GPUS=(0 1 2 3 4 5 6 7)
START_INDEX_LIST=(0 2 4 6 8 10 12 14)
END_INDEX_LIST=(2 4 6 8 10 12 14 16)
START_INDEX_LIST=(0 2 6 7 8 9 10 13)
END_INDEX_LIST=(2 6 7 8 9 10 13 16)
TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only
for i in "${!GPUS[@]}"; do
CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
done

View file

@ -6,6 +6,10 @@ MODEL_NAME=$3
RES=$4
ASP_RATIO=$5
NUM_SAMPLING_STEPS=$6
FLOW=$7
LLM_REFINE=$8
if [[ $CKPT == *"ema"* ]]; then
parentdir=$(dirname $CKPT)
CKPT_BASE=$(basename $parentdir)_ema
@ -20,11 +24,27 @@ TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
START_INDEX_LIST=(0 140 280 420 560 700 840 980)
END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
for i in "${!GPUS[@]}"; do
if [ -z ${RES} ] || [ -z ${ASP_RATIO} ] ;
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
if [ -z ${NUM_SAMPLING_STEPS} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${FLOW} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
if [ -z ${LLM_REFINE} ];
then
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
else
CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
fi
fi
fi
fi
done

View file

@ -465,7 +465,10 @@ def get_num_pixels(name):
def get_image_size(resolution, ar_ratio):
ar_key = ASPECT_RATIO_MAP[ar_ratio]
if ar_ratio in ASPECT_RATIO_MAP:
ar_key = ASPECT_RATIO_MAP[ar_ratio]
else:
ar_key = ar_ratio
rs_dict = ASPECT_RATIOS[resolution][1]
assert ar_key in rs_dict, f"Aspect ratio {ar_ratio} not found for resolution {resolution}"
return rs_dict[ar_key]

View file

@ -111,6 +111,9 @@ def prepare_dataloader(
def collate_fn_default(batch):
# filter out None
batch = [x for x in batch if x is not None]
# HACK: for loading text features
use_mask = False
if "mask" in batch[0] and isinstance(batch[0]["mask"], int):
@ -132,6 +135,9 @@ def collate_fn_batch(batch):
"""
Used only with BatchDistributedSampler
"""
# filter out None
batch = [x for x in batch if x is not None]
res = torch.utils.data.default_collate(batch)
# squeeze the first dimension, which is due to torch.stack() in default_collate()

View file

@ -190,7 +190,10 @@ class VariableVideoTextDataset(VideoTextDataset):
return ret
def __getitem__(self, index):
return self.getitem(index)
try:
return self.getitem(index)
except:
return None
@DATASETS.register_module()

View file

@ -163,6 +163,8 @@ class Attention(nn.Module):
if rope is not None:
self.rope = True
self.rotary_emb = rope
self.is_causal = False
def forward(self, x: torch.Tensor) -> torch.Tensor:
B, N, C = x.shape
@ -198,12 +200,17 @@ class Attention(nn.Module):
v,
dropout_p=self.attn_drop.p if self.training else 0.0,
softmax_scale=self.scale,
causal=self.is_causal,
)
else:
dtype = q.dtype
q = q * self.scale
attn = q @ k.transpose(-2, -1) # translate attn to float32
attn = attn.to(torch.float32)
if self.is_causal:
causal_mask = torch.tril(torch.ones_like(attn), diagonal=0)
causal_mask = torch.where(causal_mask.bool(), 0, float('-inf'))
attn += causal_mask
attn = attn.softmax(dim=-1)
attn = attn.to(dtype) # cast back attn to original dtype
attn = self.attn_drop(attn)

View file

@ -197,16 +197,18 @@ class PixArt(nn.Module):
if freeze == "text":
self.freeze_text()
def forward(self, x, timestep, y, mask=None):
def forward(self, x, timestep, y, mask=None, **kwargs):
"""
Forward pass of PixArt.
x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
t: (N,) tensor of diffusion timesteps
y: (N, 1, 120, C) tensor of class labels
"""
x = x.to(self.dtype)
timestep = timestep.to(self.dtype)
y = y.to(self.dtype)
dtype = self.x_embedder.proj.weight.dtype
B = x.size(0)
x = x.to(dtype)
timestep = timestep.to(dtype)
y = y.to(dtype)
# embedding
x = self.x_embedder(x) # (B, N, D)

View file

@ -4,6 +4,7 @@ import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from rotary_embedding_torch import RotaryEmbedding
from timm.models.layers import DropPath
@ -361,6 +362,24 @@ class STDiT3(PreTrainedModel):
# === get pos embed ===
_, _, Tx, Hx, Wx = x.size()
T, H, W = self.get_dynamic_size(x)
# adjust for sequence parallelism
# we need to ensure H * W is divisible by sequence parallel size
# for simplicity, we can adjust the height to make it divisible
if self.enable_sequence_parallelism:
sp_size = dist.get_world_size(get_sequence_parallel_group())
if H % sp_size != 0:
h_pad_size = sp_size - H % sp_size
else:
h_pad_size = 0
if h_pad_size > 0:
hx_pad_size = h_pad_size * self.patch_size[1]
# pad x along the H dimension
H += h_pad_size
x = F.pad(x, (0, 0, 0, hx_pad_size))
S = H * W
base_size = round(S**0.5)
resolution_sq = (height[0].item() * width[0].item()) ** 0.5

View file

@ -13,7 +13,13 @@ from opensora.utils.ckpt_utils import load_checkpoint
@MODELS.register_module()
class VideoAutoencoderKL(nn.Module):
def __init__(
self, from_pretrained=None, micro_batch_size=None, cache_dir=None, local_files_only=False, subfolder=None
self,
from_pretrained=None,
micro_batch_size=None,
cache_dir=None,
local_files_only=False,
subfolder=None,
scaling_factor=0.18215,
):
super().__init__()
self.module = AutoencoderKL.from_pretrained(
@ -25,6 +31,7 @@ class VideoAutoencoderKL(nn.Module):
self.out_channels = self.module.config.latent_channels
self.patch_size = (1, 8, 8)
self.micro_batch_size = micro_batch_size
self.scaling_factor = scaling_factor
def encode(self, x):
# x: (B, C, T, H, W)
@ -32,14 +39,14 @@ class VideoAutoencoderKL(nn.Module):
x = rearrange(x, "B C T H W -> (B T) C H W")
if self.micro_batch_size is None:
x = self.module.encode(x).latent_dist.sample().mul_(0.18215)
x = self.module.encode(x).latent_dist.sample().mul_(self.scaling_factor)
else:
# NOTE: cannot be used for training
bs = self.micro_batch_size
x_out = []
for i in range(0, x.shape[0], bs):
x_bs = x[i : i + bs]
x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(0.18215)
x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(self.scaling_factor)
x_out.append(x_bs)
x = torch.cat(x_out, dim=0)
x = rearrange(x, "(B T) C H W -> B C T H W", B=B)
@ -50,14 +57,14 @@ class VideoAutoencoderKL(nn.Module):
B = x.shape[0]
x = rearrange(x, "B C T H W -> (B T) C H W")
if self.micro_batch_size is None:
x = self.module.decode(x / 0.18215).sample
x = self.module.decode(x / self.scaling_factor).sample
else:
# NOTE: cannot be used for training
bs = self.micro_batch_size
x_out = []
for i in range(0, x.shape[0], bs):
x_bs = x[i : i + bs]
x_bs = self.module.decode(x_bs / 0.18215).sample
x_bs = self.module.decode(x_bs / self.scaling_factor).sample
x_out.append(x_bs)
x = torch.cat(x_out, dim=0)
x = rearrange(x, "(B T) C H W -> B C T H W", B=B)

View file

@ -0,0 +1,724 @@
"""
Adapted from SDXL VAE (https://huggingface.co/stabilityai/sdxl-vae/blob/main/config.json)
All default values of kwargs are the same as SDXL
"""
from typing import Optional, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from diffusers.models.attention_processor import Attention
from einops import rearrange
def video_to_image(func):
    """Decorator that lets a per-frame (4D) operator accept video (5D) input.

    A 5D input (B, C, T, H, W) is flattened to ((B*T), C, H, W), passed through
    ``func`` — optionally in chunks of ``self.micro_batch_size`` frames to bound
    peak memory — and reshaped back to (B, C', T, H', W').  Input that is not
    5D is returned untouched without calling ``func`` (callers are expected to
    feed video tensors).
    """

    def wrapper(self, x, *args, **kwargs):
        if x.ndim == 5:
            B, C, T, H, W = x.shape
            # (B, C, T, H, W) -> ((B*T), C, H, W); batch-major frame grouping,
            # equivalent to einops' 'B C T H W -> (B T) C H W'.
            x = x.permute(0, 2, 1, 3, 4).reshape(B * T, C, H, W)
            # FIX: use getattr so owners without a `micro_batch_size` attribute
            # fall back to a single full-batch call; the original
            # `hasattr(...) and ... is None` test routed them into the chunked
            # branch, which then raised AttributeError.
            bs = getattr(self, "micro_batch_size", None)
            if bs is None:
                x = func(self, x, *args, **kwargs)
            else:
                chunks = [func(self, x[i : i + bs], *args, **kwargs) for i in range(0, x.shape[0], bs)]
                x = torch.cat(chunks, dim=0)
            # ((B*T), C', H', W') -> (B, C', T, H', W'); use the post-func shape
            # since func may change channels / spatial size.
            x = x.reshape(B, T, *x.shape[1:]).permute(0, 2, 1, 3, 4)
        return x

    return wrapper
class VideoConv2d(nn.Conv2d):
    """A ``nn.Conv2d`` that also accepts 5D video tensors [B, C, T, H, W].

    The ``video_to_image`` decorator folds the temporal axis into the batch
    before the spatial convolution and restores it afterwards, optionally
    processing ``micro_batch_size`` frames at a time.
    """

    def __init__(self, *args, micro_batch_size=None, **kwargs):
        super().__init__(*args, **kwargs)
        # None => convolve all frames in one call; int => per-chunk frame count
        # used by the video_to_image wrapper.
        self.micro_batch_size = micro_batch_size

    @video_to_image
    def forward(self, x):
        return nn.Conv2d.forward(self, x)
class ResnetBlock2D(nn.Module):
    """Pre-activation 2D residual block: (GroupNorm -> SiLU -> Conv2d) twice.

    Accepts [B, C, T, H, W] (via the ``video_to_image`` wrapper, with optional
    ``micro_batch_size`` chunking) or [B, C, H, W].  When the channel count
    changes, a 1x1 convolution projects the skip connection.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
        micro_batch_size=None,
    ):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.micro_batch_size = micro_batch_size

        self.norm1 = nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.act = nn.SiLU()

        # 1x1 projection so the skip connection matches the residual's channels.
        self.use_in_shortcut = in_channels != out_channels
        self.conv_shortcut = (
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
            if self.use_in_shortcut
            else None
        )

    @video_to_image
    def forward(self, x):
        h = self.conv1(self.act(self.norm1(x)))
        h = self.conv2(self.act(self.norm2(h)))
        skip = x if self.conv_shortcut is None else self.conv_shortcut(x)
        return skip + h
class ResnetBlock3D(nn.Module):
"""
Use nn.Conv3d
Default activation is nn.SiLU()
Make sure input tensor is of shape [B, C, T, H, W]
"""
def __init__(
self,
in_channels: int,
out_channels: Optional[int] = None,
norm_groups: int = 32,
norm_eps: float = 1e-6,
):
super().__init__()
self.in_channels = in_channels
out_channels = in_channels if out_channels is None else out_channels
self.out_channels = out_channels
conv_cls = nn.Conv3d
self.norm1 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True)
self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.norm2 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True)
self.conv2 = conv_cls(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.act = nn.SiLU()
self.use_in_shortcut = self.in_channels != out_channels
self.conv_shortcut = None
if self.use_in_shortcut:
self.conv_shortcut = conv_cls(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
)
def forward(self, x):
res = self.norm1(x)
res = self.act(res)
res = self.conv1(res)
res = self.norm2(res)
res = self.act(res)
res = self.conv2(res)
if self.conv_shortcut is not None:
x = self.conv_shortcut(x)
out = x + res
return out
class SpatialDownsample2x(nn.Module):
    """Halve the spatial resolution of [B, C, T, H, W] input.

    With ``use_conv=True`` this is a stride-2 3x3 convolution preceded by
    SDXL's asymmetric (right/bottom) zero padding; otherwise a 2x2 average
    pool.  Supports ``micro_batch_size`` chunking via ``video_to_image``.
    """

    def __init__(
        self,
        channels: int,
        use_conv: bool = True,
        micro_batch_size=None,
    ):
        super().__init__()
        self.channels = channels
        self.use_conv = use_conv
        self.micro_batch_size = micro_batch_size
        if use_conv:
            # padding=0 here; the asymmetric pad is applied manually in forward.
            self.downsample = nn.Conv2d(self.channels, self.channels, kernel_size=3, stride=2, padding=0)
        else:
            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2)

    @video_to_image
    def forward(self, x):
        # SDXL pads one pixel on the right and bottom only, then downsamples.
        x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.downsample(x)
class SpatialUpsample2x(nn.Module):
    """Double the spatial resolution of [B, C, T, H, W] input.

    Nearest-neighbour interpolation followed by a 3x3 convolution; frames can
    be processed in chunks of ``micro_batch_size`` to bound peak memory.
    Only ``use_interpolate=True`` is implemented.
    """

    def __init__(
        self,
        channels: int,
        use_interpolate=True,
        micro_batch_size=None,
    ):
        super().__init__()
        self.channels = channels
        self.use_interpolate = use_interpolate
        self.micro_batch_size = micro_batch_size
        if not use_interpolate:
            raise NotImplementedError
        self.conv = nn.Conv2d(self.channels, self.channels, kernel_size=3, padding=1)
        # NOTE(review): this transposed conv is constructed but never used when
        # use_interpolate=True (the only supported mode); kept to preserve the
        # parameter/state_dict layout — consider removing upstream.
        self.upsample = nn.ConvTranspose2d(channels, self.channels, kernel_size=4, stride=2, padding=1)

    def forward(self, x):
        B = x.shape[0]
        x = rearrange(x, 'B C T H W -> (B T) C H W')
        bs = self.micro_batch_size
        if bs is None:
            x = self.forward_BCHW(x)
        else:
            # process micro-batches of frames to limit peak memory
            x = torch.cat([self.forward_BCHW(x[i : i + bs]) for i in range(0, x.shape[0], bs)], dim=0)
        x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x

    def forward_BCHW(self, x):
        if not self.use_interpolate:
            return self.upsample(x)
        # upsample_nearest_nhwc fails with large batch sizes; see
        # https://github.com/huggingface/diffusers/issues/984
        if x.shape[0] >= 64:
            x = x.contiguous()
        # interpolate on bfloat16 is fixed in pytorch 2.1; see
        # https://github.com/pytorch/pytorch/issues/86679
        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
        return self.conv(x)
class TemporalDownsample2x(nn.Module):
    """Halve the temporal length of [B, C, T, H, W] input.

    With ``use_conv=True`` this is a 3x3x3 convolution with temporal stride 2
    (spatial size preserved); otherwise a (3, 1, 1) average pool with temporal
    stride 2.
    """

    def __init__(
        self,
        channels: int,
        use_conv: bool = True,
    ):
        super().__init__()
        self.channels = channels
        self.use_conv = use_conv
        if use_conv:
            self.downsample = nn.Conv3d(
                self.channels, self.channels, kernel_size=(3, 3, 3), stride=(2, 1, 1), padding=(1, 1, 1)
            )
        else:
            self.downsample = nn.AvgPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1))

    def forward(self, x):
        return self.downsample(x)
class TemporalUpsample2x(nn.Module):
    """Double the temporal length of [B, C, T, H, W] input.

    Trilinear interpolation with scale factor (2, 1, 1), followed by a
    3x3x3 convolution.
    """

    def __init__(
        self,
        channels,
    ):
        super().__init__()
        self.channels = channels
        self.conv = nn.Conv3d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        # Large batches trip a contiguity issue in the upsampling kernels;
        # force a contiguous tensor first (see diffusers issue #984).
        if x.shape[0] >= 64:
            x = x.contiguous()
        x = F.interpolate(x, scale_factor=(2, 1, 1), mode="trilinear")
        return self.conv(x)
class UNetMidBlock2D(nn.Module):
    """Middle block: ResnetBlock2D + (spatial Attention + ResnetBlock2D) per layer.

    Accepts [B, C, T, H, W] (frames are folded into the batch for the 2D ops)
    or [B, C, H, W].  With ``add_attention=False`` the attention slots are kept
    as ``None`` placeholders so every residual block still runs.
    """

    def __init__(
        self,
        in_channels: int,
        num_layers: int = 1,
        norm_groups: int = 32,
        norm_eps: float = 1e-6,
        attn_groups: Optional[int] = None,
        add_attention: bool = True,
        attention_head_dim: int = 512,
    ):
        super().__init__()
        self.add_attention = add_attention
        if attn_groups is None:
            attn_groups = norm_groups
        if attention_head_dim is None:
            attention_head_dim = in_channels

        res_blocks = [
            ResnetBlock2D(
                in_channels=in_channels,
                out_channels=in_channels,
                norm_eps=norm_eps,
                norm_groups=norm_groups,
            )
        ]
        attn_blocks = []
        for _ in range(num_layers):
            if self.add_attention:
                attn_blocks.append(
                    Attention(
                        in_channels,
                        heads=in_channels // attention_head_dim,
                        dim_head=attention_head_dim,
                        # rescale_output_factor=output_scale_factor,
                        rescale_output_factor=1.0,
                        eps=norm_eps,
                        norm_num_groups=attn_groups,
                        # spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
                        spatial_norm_dim=None,
                        residual_connection=True,
                        bias=True,
                        upcast_softmax=True,
                        _from_deprecated_attn_block=True,
                    )
                )
            else:
                # FIX: keep a None placeholder so attn_blocks and
                # res_blocks[1:] stay the same length.  Without it, the zip()
                # in forward() was empty when add_attention=False and every
                # residual block after the first was silently skipped.
                attn_blocks.append(None)
            res_blocks.append(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    norm_eps=norm_eps,
                    norm_groups=norm_groups,
                )
            )
        self.attn_blocks = nn.ModuleList(attn_blocks)
        self.res_blocks = nn.ModuleList(res_blocks)

    def forward(self, x):
        has_T = x.ndim == 5
        if has_T:
            # fold frames into the batch so the 2D blocks can run per-frame
            B = x.shape[0]
            x = rearrange(x, 'B C T H W -> (B T) C H W')
        x = self.res_blocks[0](x)
        for attn, res_block in zip(self.attn_blocks, self.res_blocks[1:]):
            if attn is not None:
                x = attn(x)
            x = res_block(x)
        if has_T:
            x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
        return x
class Encoder(nn.Module):
    """
    default arch is conv_in + blocks + mid_block + out_block
    Make sure input tensor is of shape [B, C, T, H, W]
    """

    def __init__(
        self,
        in_channels=3,  # channels of the input video (RGB)
        out_channels=4,  # latent channels
        norm_groups=32,
        norm_eps=1e-6,
        double_z=True,  # if True, emit 2*out_channels (mean + logvar for a VAE posterior)
        micro_batch_size=None,  # frame-chunk size for the per-frame (2D) stages
    ):
        super().__init__()
        in_channels_encoder = in_channels
        out_channels_encoder = out_channels
        block_out_channels = [128, 256, 512, 512]

        # conv_in: per-frame 3x3 stem convolution
        self.conv_in = VideoConv2d(
            in_channels_encoder,
            block_out_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
            micro_batch_size=micro_batch_size,
        )

        # blocks
        blocks = []
        # the first block: ResnetBlock2D (per-frame) + 2x spatial downsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[0]
        blocks.append(
            nn.Sequential(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                ResnetBlock2D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                SpatialDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                    micro_batch_size=micro_batch_size,
                ),
            )
        )
        # the second block: ResnetBlock2D + 2x spatial AND 2x temporal downsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[1]
        blocks.append(
            nn.Sequential(
                ResnetBlock2D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                ResnetBlock2D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                    micro_batch_size=micro_batch_size,
                ),
                SpatialDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                    micro_batch_size=micro_batch_size,
                ),
                TemporalDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                )
            )
        )
        # the third block: ResnetBlock3D (spatio-temporal) + 2x spatial AND 2x temporal downsample
        in_channels = block_out_channels[1]
        out_channels = block_out_channels[2]
        blocks.append(
            nn.Sequential(
                ResnetBlock3D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
                ResnetBlock3D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
                SpatialDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                ),
                TemporalDownsample2x(
                    channels=out_channels,
                    use_conv=True,
                )
            )
        )
        # the fourth block: ResnetBlock3D only (no further downsampling)
        in_channels = block_out_channels[2]
        out_channels = block_out_channels[3]
        blocks.append(
            nn.Sequential(
                ResnetBlock3D(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
                ResnetBlock3D(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    norm_groups=norm_groups,
                    norm_eps=norm_eps,
                ),
            )
        )
        self.blocks = nn.ModuleList(blocks)

        # mid_block: resnet + spatial self-attention at the bottleneck
        in_channels = block_out_channels[-1]
        self.mid_block = UNetMidBlock2D(
            in_channels=in_channels,
            num_layers=1,
            norm_groups=norm_groups,
            norm_eps=norm_eps,
            add_attention=True,
            attention_head_dim=in_channels,
        )

        # out_block: GroupNorm -> SiLU -> 3D conv projecting to the latent channels
        in_channels = block_out_channels[-1]
        out_channels = 2 * out_channels_encoder if double_z else out_channels_encoder
        self.out_block = nn.Sequential(
            nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps),
            nn.SiLU(),
            nn.Conv3d(in_channels, out_channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        # x: [B, C, T, H, W] video; returns the (possibly doubled) latent tensor
        x = self.conv_in(x)
        for block in self.blocks:
            x = block(x)
        x = self.mid_block(x)
        x = self.out_block(x)
        return x
class Decoder(nn.Module):
    """
    default arch is conv_in + mid_block + blocks + out_block
    Make sure input tensor is of shape [B, C, T, H, W]
    """

    def __init__(
        self,
        in_channels=4,  # latent channels produced by the encoder
        out_channels=3,  # channels of the reconstructed video (RGB)
        norm_groups=32,
        norm_eps=1e-6,
    ):
        super().__init__()
        in_channels_decoder = in_channels
        out_channels_decoder = out_channels
        block_out_channels = [512, 512, 256, 128]

        # conv_in: 3D stem convolution from the latent space
        self.conv_in = nn.Conv3d(
            in_channels_decoder,
            block_out_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
        )

        # mid_block: resnet + spatial self-attention at the bottleneck
        in_channels = block_out_channels[0]
        self.mid_block = UNetMidBlock2D(
            in_channels=in_channels,
            num_layers=1,
            norm_groups=norm_groups,
            norm_eps=norm_eps,
            add_attention=True,
            attention_head_dim=in_channels,
        )

        # blocks
        blocks = []
        layer_per_block = 3
        # the first up block: ResnetBlock3D + 2x spatial AND 2x temporal upsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[0]
        seq = [
            ResnetBlock3D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ] + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
            TemporalUpsample2x(
                channels=out_channels,
            ),
        ]
        blocks.append(nn.Sequential(*seq))
        # the second up block: ResnetBlock3D + 2x spatial AND 2x temporal upsample
        in_channels = block_out_channels[0]
        out_channels = block_out_channels[1]
        seq = [
            ResnetBlock3D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ] + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
            TemporalUpsample2x(
                channels=out_channels,
            ),
        ]
        blocks.append(nn.Sequential(*seq))
        # the third up block: ResnetBlock3D + 2x spatial upsample only
        in_channels = block_out_channels[1]
        out_channels = block_out_channels[2]
        seq = [
            ResnetBlock3D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ] + [
            SpatialUpsample2x(
                channels=out_channels,
                use_interpolate=True,
            ),
        ]
        blocks.append(nn.Sequential(*seq))
        # the fourth up block: ResnetBlock2D (per-frame), no upsampling
        in_channels = block_out_channels[2]
        out_channels = block_out_channels[3]
        seq = [
            ResnetBlock2D(
                in_channels=in_channels if idx ==0 else out_channels,
                out_channels=out_channels,
                norm_groups=norm_groups,
                norm_eps=norm_eps,
            )
            for idx in range(layer_per_block)
        ]
        blocks.append(nn.Sequential(*seq))
        self.blocks = nn.ModuleList(blocks)

        # out_block: GroupNorm -> SiLU -> per-frame 2D conv back to pixel channels
        in_channels = block_out_channels[-1]
        out_channels = out_channels_decoder
        self.out_block = nn.Sequential(
            nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps),
            nn.SiLU(),
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        )

    def forward(self, x):
        # x: [B, C, T, H, W] latent; returns the reconstructed video.
        # FIX: removed leftover debug prints of torch.cuda.memory_allocated()
        # after every stage — they spammed stdout and assumed a CUDA runtime.
        x = self.conv_in(x)
        x = self.mid_block(x)
        for block in self.blocks:
            x = block(x)
        x = self.out_block(x)
        return x
if __name__ == '__main__':
    # Smoke test: build the encoder/decoder, report parameter counts, and run
    # one forward pass on a 51-frame clip.
    from opensora.utils.misc import count_params

    device = 'cuda'
    dtype = torch.bfloat16

    # FIX: dropped the redundant `.to(torch.bfloat16)` — `.to(device, dtype)`
    # already casts — and the leftover `breakpoint()` at the end.
    encoder = Encoder(
        in_channels=3,
        out_channels=4,
        double_z=False,
        micro_batch_size=4,
    ).to(device, dtype).eval()
    decoder = Decoder(
        in_channels=4,
        out_channels=3,
    ).to(device, dtype).eval()

    num_params_enc = count_params(encoder)
    num_params_dec = count_params(decoder)
    print(f'Encoder #params: {num_params_enc}')
    print(f'Decoder #params: {num_params_dec}')

    # inference
    x = torch.rand(1, 3, 51, 720, 1080).to(device, dtype)
    with torch.inference_mode():
        x_enc = encoder(x)
        x_dec = decoder(x_enc)
    # allocated GPU memory in GiB after the round trip
    print(torch.cuda.memory_allocated() / 1024 ** 3)

View file

@ -24,7 +24,8 @@ class DPM_SOLVER:
mask=None,
progress=True,
):
assert mask is None, "mask is not supported in dpm-solver"
if mask is not None:
print("[WARNING] mask is not supported in dpm-solver, it will be ignored")
n = len(prompts)
model_args = text_encoder.encode(prompts)
y = model_args.pop("y")

View file

@ -1419,7 +1419,7 @@ class DPM_Solver:
for step in progress_fn(range(order, steps + 1)):
t = timesteps[step]
# We only use lower order for steps < 10
if lower_order_final and steps < 10:
if lower_order_final: # recommended by Shuchen Xue
step_order = min(order, steps + 1 - step)
else:
step_order = order

View file

@ -1,3 +1,3 @@
torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121
torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu121
xformers==0.0.26.post1 --index-url https://download.pytorch.org/whl/cu121
torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121
torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121
xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121

View file

@ -1,4 +1,4 @@
colossalai==0.3.7
colossalai>=0.4.0
mmengine>=0.10.3
pandas>=2.0.3
timm==0.9.16
@ -7,6 +7,7 @@ ftfy>=6.2.0 # for t5
diffusers==0.27.2 # for vae
accelerate==0.29.2 # for t5
av>=12.0.0 # for video loading
numpy<2.0.0
# [gradio]
gradio>=4.26.0

View file

@ -260,6 +260,7 @@ def main():
)
# == sampling ==
torch.manual_seed(1024)
z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
masks = apply_mask_strategy(z, refs, ms, loop_i, align=align)
samples = scheduler.sample(

View file

@ -347,24 +347,27 @@ def main():
tb_writer.add_scalar("loss", loss.item(), global_step)
# wandb
if cfg.get("wandb", False):
wandb.log(
{
"iter": global_step,
"acc_step": acc_step,
"epoch": epoch,
"loss": loss.item(),
"avg_loss": avg_loss,
"lr": optimizer.param_groups[0]["lr"],
"debug/move_data_time": move_data_t.elapsed_time,
"debug/encode_time": encode_t.elapsed_time,
"debug/mask_time": mask_t.elapsed_time,
"debug/diffusion_time": loss_t.elapsed_time,
"debug/backward_time": backward_t.elapsed_time,
"debug/update_ema_time": ema_t.elapsed_time,
"debug/reduce_loss_time": reduce_loss_t.elapsed_time,
},
step=global_step,
)
wandb_dict = {
"iter": global_step,
"acc_step": acc_step,
"epoch": epoch,
"loss": loss.item(),
"avg_loss": avg_loss,
"lr": optimizer.param_groups[0]["lr"],
}
if record_time:
wandb_dict.update(
{
"debug/move_data_time": move_data_t.elapsed_time,
"debug/encode_time": encode_t.elapsed_time,
"debug/mask_time": mask_t.elapsed_time,
"debug/diffusion_time": loss_t.elapsed_time,
"debug/backward_time": backward_t.elapsed_time,
"debug/update_ema_time": ema_t.elapsed_time,
"debug/reduce_loss_time": reduce_loss_t.elapsed_time,
}
)
wandb.log(wandb_dict, step=global_step)
running_loss = 0.0
log_step = 0

View file

@ -9,7 +9,7 @@ from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config
def get_sample_data():
x = torch.rand([1, 4, 15, 20, 27], dtype=torch.bfloat16) # (B, C, T, H, W)
x = torch.rand([1, 4, 15, 20, 28], dtype=torch.bfloat16) # (B, C, T, H, W)
timestep = torch.Tensor([924.0]).to(torch.bfloat16)
y = torch.rand(1, 1, 300, 4096, dtype=torch.bfloat16)
mask = torch.ones([1, 300], dtype=torch.int32)
@ -66,6 +66,17 @@ def run_model(rank, world_size, port):
set_seed(1024)
dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True)
dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16)
# ensure model weights are equal
for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()):
assert torch.equal(p1, p2)
# ensure model weights are equal across all ranks
for p in dist_model.parameters():
p_list = [torch.zeros_like(p) for _ in range(world_size)]
dist.all_gather(p_list, p, group=dist.group.WORLD)
assert torch.equal(*p_list)
dist_out = dist_model(**data)
dist_out.mean().backward()
@ -84,9 +95,8 @@ def run_model(rank, world_size, port):
for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()):
assert n1 == n2
if p1.grad is not None and p2.grad is not None:
if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4):
if dist.get_rank() == 0:
print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0:
print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
else:
assert p1.grad is None and p2.grad is None

View file

@ -4,7 +4,7 @@ Human labeling of videos is expensive and time-consuming. We adopt powerful imag
## PLLaVA Captioning
To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video.
To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offload frame extraction to a separate process such that the GPU computations and frame extraction happen in parallel.
### Installation
Install the required dependencies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "PLLaVA Captioning" sections.

View file

@ -1,3 +1,17 @@
import sys
import os
import os
from pathlib import Path
current_file = Path(__file__) # Gets the path of the current file
fourth_level_parent = current_file.parents[3]
datasets_dir = os.path.join(fourth_level_parent, "opensora/datasets")
import sys
sys.path.append(datasets_dir)
from read_video import read_video_av
sys.path.remove(datasets_dir)
import itertools
import logging
import multiprocessing as mp
@ -95,21 +109,49 @@ def get_index(num_frames, num_segments):
return offsets
# def load_video(video_path, num_frames, return_msg=False, resolution=336):
# transforms = torchvision.transforms.Resize(size=resolution)
# vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
# total_num_frames = len(vr)
# frame_indices = get_index(total_num_frames, num_frames)
# images_group = list()
# for frame_index in frame_indices:
# img = Image.fromarray(vr[frame_index].asnumpy())
# images_group.append(transforms(img))
# if return_msg:
# fps = float(vr.get_avg_fps())
# sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
# # " " should be added in the start and end
# msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
# return images_group, msg
# else:
# return images_group
def load_video(video_path, num_frames, return_msg=False, resolution=336):
transforms = torchvision.transforms.Resize(size=resolution)
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
total_num_frames = len(vr)
# vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
vframes, aframes, info = read_video_av(
video_path,
pts_unit="sec",
output_format="THWC"
)
print(vframes.shape)
total_num_frames = len(vframes)
# print("Video path: ", video_path)
# print("Total number of frames: ", total_num_frames)
frame_indices = get_index(total_num_frames, num_frames)
images_group = list()
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
img = Image.fromarray(vframes[frame_index].numpy())
images_group.append(transforms(img))
if return_msg:
fps = float(vr.get_avg_fps())
sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
# " " should be added in the start and end
msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
return images_group, msg
# fps = float(vframes.get_avg_fps())
# sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
# # " " should be added in the start and end
# msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
# return images_group, msg
exit('return_msg not implemented yet')
else:
return images_group
@ -130,7 +172,10 @@ class CSVDataset(Dataset):
def __getitem__(self, idx):
if idx < 0 or idx >= len(self.data_list):
raise IndexError
video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
try:
video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
except:
return None
return video
def set_rank_and_world_size(self, rank, world_size):
@ -191,7 +236,7 @@ def parse_args():
"--error_message",
type=str,
required=False,
default=None,
default='error occured during captioning',
)
args = parser.parse_args()
return args
@ -233,8 +278,11 @@ def infer(
processor,
video_list,
conv_mode,
print_res=True,
print_res=False,
):
# check if any video in video_list is None, if so, raise an exception
if any([video is None for video in video_list]):
raise Exception("Video not loaded properly")
conv = conv_template.copy()
conv.user_query("Describe the video in details.", is_mm=True)
@ -308,7 +356,8 @@ def run(rank, args, world_size, output_queue):
)
except Exception as e:
logger.error(f"error in {batch}: {str(e)}")
preds = args.error_message
# preds = args.error_message duplicated for each video in the batch
preds = [args.error_message] * len(batch)
result_list.extend(preds)
output_queue.put((rank, result_list))
return result_list
@ -369,7 +418,7 @@ def main():
# write the dataframe to a new csv file called '*_pllava_13b_caption.csv'
new_csv_path = args.csv_path.replace(".csv", "_text.csv")
df.to_csv(new_csv_path, index=False)
print(f"Results saved to {new_csv_path}")
if __name__ == "__main__":
main()

View file

@ -6,7 +6,7 @@ import pandas as pd
from torchvision.datasets import ImageNet
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".m2ts")
def scan_recursively(root):

View file

@ -29,15 +29,20 @@ def process_single_row(row, args):
# check mp4 integrity
# if not is_intact_video(video_path, logger=logger):
# return False
if "timestamp" in row:
timestamp = row["timestamp"]
if not (timestamp.startswith("[") and timestamp.endswith("]")):
try:
if "timestamp" in row:
timestamp = row["timestamp"]
if not (timestamp.startswith("[") and timestamp.endswith("]")):
return False
scene_list = eval(timestamp)
scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list]
else:
scene_list = [None]
if args.drop_invalid_timestamps:
return True
except Exception as e:
if args.drop_invalid_timestamps:
return False
scene_list = eval(timestamp)
scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list]
else:
scene_list = [None]
if "relpath" in row:
save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"]))
@ -61,7 +66,7 @@ def process_single_row(row, args):
shorter_size=shorter_size,
logger=logger,
)
return True
def split_video(
video_path,
@ -108,7 +113,10 @@ def split_video(
fname_wo_ext = os.path.splitext(fname)[0]
# TODO: fname pattern
save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
if os.path.exists(save_path):
# print_log(f"File '{save_path}' already exists. Skip.", logger=logger)
continue
# ffmpeg cmd
cmd = [FFMPEG_PATH]
@ -134,7 +142,7 @@ def split_video(
# cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"]
cmd += ["-map", "0:v", save_path]
# print(cmd)
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout, stderr = proc.communicate()
# stdout = stdout.decode("utf-8")
@ -159,11 +167,11 @@ def parse_args():
)
parser.add_argument("--target_fps", type=int, default=None, help="target fps of clips")
parser.add_argument(
"--shorter_size", type=int, default=1080, help="resize the shorter size by keeping ratio; will not do upscale"
"--shorter_size", type=int, default=None, help="resize the shorter size by keeping ratio; will not do upscale"
)
parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
parser.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps")
args = parser.parse_args()
return args
@ -175,7 +183,7 @@ def main():
print(f"Meta file '{meta_path}' not found. Exit.")
exit()
# create logger
# create save_dir
os.makedirs(args.save_dir, exist_ok=True)
# initialize pandarallel
@ -189,10 +197,13 @@ def main():
# process
meta = pd.read_csv(args.meta_path)
if not args.disable_parallel:
meta.parallel_apply(process_single_row_partial, axis=1)
results = meta.parallel_apply(process_single_row_partial, axis=1)
else:
meta.apply(process_single_row_partial, axis=1)
results = meta.apply(process_single_row_partial, axis=1)
if args.drop_invalid_timestamps:
meta = meta[results]
assert args.meta_path.endswith("timestamp.csv"), "Only support *timestamp.csv"
meta.to_csv(args.meta_path.replace("timestamp.csv", "correct_timestamp.csv"), index=False)
print(f"Corrected timestamp file saved to '{args.meta_path.replace('timestamp.csv', 'correct_timestamp.csv')}'")
if __name__ == "__main__":
main()