Merge branch 'dev/v1.2' of https://github.com/hpcaitech/Open-Sora-dev into dev/v1.2

2026-04-19 01:15:33 +02:00 · 2024-05-10 21:25:31 +08:00 · 2024-05-10 21:25:31 +08:00 · be24571cbc
commit be24571cbc
parent cbc0d954b1 970302fb14
10 changed files with 260 additions and 81 deletions
--- a/configs/opensora-v1-2/inference/sample.py
+++ b/configs/opensora-v1-2/inference/sample.py
@ -1,5 +1,5 @@
 image_size = (240, 426)
-num_frames = 34
+num_frames = 51
 fps = 24
 frame_interval = 1

--- a/configs/opensora-v1-2/train/eval.py
+++ b/configs/opensora-v1-2/train/eval.py
@ -0,0 +1,88 @@
+# Dataset settings: variable-size video/text dataset using the "resize_crop" transform
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+    frame_interval=1,
+)
+
+bucket_config = {  # 20s/it
+    "1024": {1: (1.0, 1)},
+}
+
+grad_checkpoint = True
+batch_size = None
+
+# Acceleration settings (data-loading workers, compute dtype, parallel plugin)
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings: STDiT3 model, VAE pipeline (2D + temporal), T5 text encoder, rflow scheduler
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderPipeline",
+    from_pretrained="pretrained_models/vae-pipeline",
+    micro_frame_size=17,
+    shift=(-0.10, 0.34, 0.27, 0.98),
+    scale=(3.85, 2.32, 2.33, 3.06),
+    vae_2d=dict(
+        type="VideoAutoencoderKL",
+        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+        subfolder="vae",
+        micro_batch_size=4,
+        local_files_only=True,
+    ),
+    vae_temporal=dict(
+        type="VAE_Temporal_SD",
+        from_pretrained=None,
+    ),
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_discrete_timesteps=False,
+    use_timestep_transform=False,
+    # sample_method="logit-normal",
+)
+
+# Mask settings: mask_ratios is left commented out, so this config defines none
+# mask_ratios = {
+#     "random": 0.1,
+#     "intepolate": 0.01,
+#     "quarter_random": 0.01,
+#     "quarter_head": 0.01,
+#     "quarter_tail": 0.01,
+#     "quarter_head_tail": 0.01,
+#     "image_random": 0.05,
+#     "image_head": 0.1,
+#     "image_tail": 0.05,
+#     "image_head_tail": 0.05,
+# }
+
+# Run settings: seed, output directory, logging and checkpoint intervals
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+
+# Optimization settings
+load = None
+grad_clip = 1.0
+lr = 2e-4
+ema_decay = 0.99
+adam_eps = 1e-15
--- a/configs/opensora-v1-2/train/stage1-gc.py
+++ b/configs/opensora-v1-2/train/stage1-gc.py
@ -26,18 +26,18 @@ dataset = dict(
 # }
 # webvid
 bucket_config = {  # 20s/it
-    "144p": {1: (1.0, 100), 51: (1.0, 30), 102: (1.0, 20), 204: ((1.0, 0.2), 8), 408: ((1.0, 0.1), 4)},
+    "144p": {1: (1.0, 100), 51: (1.0, 30), 102: ((1.0, 0.33), 20), 204: ((1.0, 0.1), 8), 408: ((1.0, 0.1), 4)},
    # ---
-    "256": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: ((0.3, 0.2), 4), 408: ((0.3, 0.1), 2)},
-    "240p": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: ((0.3, 0.2), 4), 408: ((0.3, 0.1), 2)},
+    "256": {1: (0.4, 100), 51: (0.5, 24), 102: ((0.5, 0.33), 12), 204: ((0.5, 0.1), 4), 408: ((0.5, 0.1), 2)},
+    "240p": {1: (0.3, 100), 51: (0.4, 24), 102: ((0.4, 0.33), 12), 204: ((0.4, 0.1), 4), 408: ((0.4, 0.1), 2)},
    # ---
-    "360p": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: ((0.3, 0.2), 2), 408: ((0.3, 0.1), 1)},
-    "512": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
+    "360p": {1: (0.2, 60), 51: (0.15, 12), 102: ((0.15, 0.33), 6), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
+    "512": {1: (0.1, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
    # ---
-    "480p": {1: (0.5, 40), 51: (0.3, 6), 102: (0.3, 3), 204: (0.3, 1), 408: (0.0, None)},
+    "480p": {1: (0.1, 40), 51: (0.3, 6), 102: (0.3, 3), 204: (0.3, 1), 408: (0.0, None)},
    # ---
-    "720p": {1: (0.2, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
-    "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
+    "720p": {1: (0.05, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
+    "1024": {1: (0.05, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
    # ---
    "1080p": {1: (0.1, 10)},
    # ---
@ -63,7 +63,9 @@ model = dict(
 )
 vae = dict(
    type="VideoAutoencoderPipeline",
-    from_pretrained="pretrained_models/vae-v3",
+    from_pretrained="pretrained_models/vae-pipeline",
+    shift=(-0.10, 0.34, 0.27, 0.98),
+    scale=(3.85, 2.32, 2.33, 3.06),
    micro_frame_size=17,
    vae_2d=dict(
        type="VideoAutoencoderKL",
@ -116,6 +118,6 @@ ckpt_every = 500
 # optimization settings
 load = None
 grad_clip = 1.0
-lr = 1e-4
+lr = 2e-4
 ema_decay = 0.99
 adam_eps = 1e-15
--- a/eval/launch.sh
+++ b/eval/launch.sh
@ -4,26 +4,29 @@ set -x
 set -e

 CKPT=$1
+NUM_FRAMES=$2
+MODEL_NAME=$3
+
 if [[ $CKPT == *"ema"* ]]; then
    parentdir=$(dirname $CKPT)
    CKPT_BASE=$(basename $parentdir)_ema
 else
    CKPT_BASE=$(basename $CKPT)
 fi
-LOG_BASE=logs/sample/$CKPT_BASE
+LOG_BASE=logs/sample/${MODEL_NAME}_${CKPT_BASE}
 echo "Logging to $LOG_BASE"

 # == sample & human evaluation ==
-# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -1 >${LOG_BASE}_1.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT -2a >${LOG_BASE}_2a.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT -2b >${LOG_BASE}_2b.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT -2c >${LOG_BASE}_2c.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT -2d >${LOG_BASE}_2d.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT -2e >${LOG_BASE}_2e.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT -2f >${LOG_BASE}_2f.log 2>&1 &
-# CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT -2g >${LOG_BASE}_2g.log 2>&1 &
+CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT 1 $MODEL_NAME -1 >${LOG_BASE}_1.log 2>&1 &
+CUDA_VISIBLE_DEVICES=1 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2a >${LOG_BASE}_2a.log 2>&1 &
+CUDA_VISIBLE_DEVICES=2 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2b >${LOG_BASE}_2b.log 2>&1 &
+CUDA_VISIBLE_DEVICES=3 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2c >${LOG_BASE}_2c.log 2>&1 &
+CUDA_VISIBLE_DEVICES=4 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2d >${LOG_BASE}_2d.log 2>&1 &
+CUDA_VISIBLE_DEVICES=5 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2e >${LOG_BASE}_2e.log 2>&1 &
+CUDA_VISIBLE_DEVICES=6 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2f >${LOG_BASE}_2f.log 2>&1 &
+CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2g >${LOG_BASE}_2g.log 2>&1 &

-# CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -2h >${LOG_BASE}_2h.log 2>&1 &
+# CUDA_VISIBLE_DEVICES=7 bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2h >${LOG_BASE}_2h.log 2>&1 &

 # == vbench ==
 # CUDA_VISIBLE_DEVICES=0 bash eval/sample.sh $CKPT -4a >${LOG_BASE}_4a.log 2>&1 &
--- a/eval/launch_per_gpu.sh
+++ b/eval/launch_per_gpu.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+# Usage: launch_per_gpu.sh CKPT CUDA_NUM NUM_FRAMES MODEL_NAME  (runs the enabled tasks one after another on one GPU)
+set -x
+set -e
+
+CKPT=${1:?usage: $0 CKPT CUDA_NUM NUM_FRAMES MODEL_NAME}
+CUDA_NUM=${2:?missing CUDA_NUM (2nd argument)}
+NUM_FRAMES=${3:?missing NUM_FRAMES (3rd argument)}
+MODEL_NAME=${4:?missing MODEL_NAME (4th argument)}
+
+if [[ $CKPT == *"ema"* ]]; then
+    parentdir=$(dirname "$CKPT")
+    CKPT_BASE=$(basename "$parentdir")_ema
+else
+    CKPT_BASE=$(basename "$CKPT")
+fi
+
+LOG_BASE=logs/sample/${MODEL_NAME}_${CKPT_BASE}
+mkdir -p "$(dirname "$LOG_BASE")" # a missing log dir would make the first redirection fail and abort under `set -e`
+echo "Logging to $LOG_BASE"
+
+# == sample & human evaluation ==
+echo "running image task"
+CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh "$CKPT" 1 "$MODEL_NAME" -1 >"${LOG_BASE}_1.log" 2>&1
+echo "running task 2a"
+CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh "$CKPT" "$NUM_FRAMES" "$MODEL_NAME" -2a >"${LOG_BASE}_2a.log" 2>&1
+# echo "running task 2b"
+# CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2b >${LOG_BASE}_2b.log 2>&1
+# echo "running task 2c"
+# CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2c >${LOG_BASE}_2c.log 2>&1
+# echo "running task 2d"
+# CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2d >${LOG_BASE}_2d.log 2>&1
+# echo "running task 2e"
+# CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2e >${LOG_BASE}_2e.log 2>&1
+# echo "running task 2f"
+# CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2f >${LOG_BASE}_2f.log 2>&1
+# echo "running task 2g"
+# CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2g >${LOG_BASE}_2g.log 2>&1
+# echo "running task 2h"
+# CUDA_VISIBLE_DEVICES=$CUDA_NUM bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -2h >${LOG_BASE}_2h.log 2>&1
--- a/eval/sample.sh
+++ b/eval/sample.sh
@ -4,33 +4,39 @@
 set -e

 CKPT=$1
+NUM_FRAMES=${2:?NUM_FRAMES (2nd argument) is required}
+MODEL_NAME=$3
+# Use $((...)) rather than `let`: `let` returns status 1 when the result is 0, which aborts the script under `set -e`.
+DOUBLE_FRAMES=$((NUM_FRAMES * 2))
+QUAD_FRAMES=$((NUM_FRAMES * 4))
+OCT_FRAMES=$((NUM_FRAMES * 8))

 CMD="python scripts/inference.py configs/opensora-v1-2/inference/sample.py"
-CMD_REF="python scripts/inference-long.py configs/opensora-v1-1/inference/sample.py"
+CMD_REF="python scripts/inference-long.py configs/opensora-v1-2/inference/sample.py"
 if [[ $CKPT == *"ema"* ]]; then
  parentdir=$(dirname $CKPT)
  CKPT_BASE=$(basename $parentdir)_ema
 else
  CKPT_BASE=$(basename $CKPT)
 fi
-OUTPUT="./samples/samples_${CKPT_BASE}"
+OUTPUT="./samples/samples_${MODEL_NAME}_${CKPT_BASE}"
 start=$(date +%s)
-DEFAULT_BS=8
+DEFAULT_BS=1

 ### Functions

 function run_image() { # 10min
-  # 1.1 1024x1024
+  # # 1.1 1024x1024
  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 1024 1024 --sample-name 1024x1024 --batch-size $DEFAULT_BS

  # 1.2 240x426
  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 240 426 --sample-name 240x426 --end-index 3 --batch-size $DEFAULT_BS

  # 1.3 512x512
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name 512x512 --end-index 3 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name t2i_512x512 --end-index 3 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name t2v_512x512 --end-index 3 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name short_512x512 --end-index 3 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 1 --image-size 512 512 --sample-name sora_512x512 --end-index 3 --batch-size $DEFAULT_BS

  # 1.4 720p multi-resolution
  # 1:1
@ -52,80 +58,80 @@ function run_image() { # 10min

 function run_video_a() { # 30min, sample & multi-resolution
  # sample
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 144 256 --sample-name sample_16x144x256 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 240 426 --sample-name sample_16x240x426 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 32 --image-size 240 426 --sample-name sample_32x240x426 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 64 --image-size 240 426 --sample-name sample_64x240x426 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 480 854 --sample-name sample_16x480x854 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 32 --image-size 480 854 --sample-name sample_32x480x854 --batch-size $DEFAULT_BS
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16 --image-size 720 1280 --sample-name sample_16x720x1280 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 144 256 --sample-name sample_${NUM_FRAMES}x144x256 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 240 426 --sample-name sample_${NUM_FRAMES}x240x426 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames $DOUBLE_FRAMES --image-size 240 426 --sample-name sample_${DOUBLE_FRAMES}x240x426 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames $QUAD_FRAMES --image-size 240 426 --sample-name sample_${QUAD_FRAMES}x240x426 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 480 854 --sample-name sample_${NUM_FRAMES}x480x854 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames $DOUBLE_FRAMES --image-size 480 854 --sample-name sample_${DOUBLE_FRAMES}x480x854 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 720 1280 --sample-name sample_${NUM_FRAMES}x720x1280 --batch-size $DEFAULT_BS
 }

 function run_video_b() { # 30min, short 16x240p & 64x240p
  # 32x240p, short
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 32 --image-size 240 426 --sample-name short_32x240x426 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames $DOUBLE_FRAMES --image-size 240 426 --sample-name short_${DOUBLE_FRAMES}x240x426 --batch-size $DEFAULT_BS

  # 64x240p, short
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 64 --image-size 240 426 --sample-name short_64x240x426 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames $QUAD_FRAMES --image-size 240 426 --sample-name short_${QUAD_FRAMES}x240x426 --batch-size $DEFAULT_BS
 }

 function run_video_c() { # 30min, sora 16x240p & short 128x240p
  # 16x240p, sora
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16 --image-size 426 240 --sample-name sora_16x426x240 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 426 240 --sample-name sora_${NUM_FRAMES}x426x240 --batch-size $DEFAULT_BS

  # 16x240p, sora
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16 --image-size 240 426 --sample-name sora_16x240x426 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 240 426 --sample-name sora_${NUM_FRAMES}x240x426 --batch-size $DEFAULT_BS

  # 128x240p, sora
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 128 --image-size 240 426 --sample-name sora_128x240x426 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames $OCT_FRAMES  --image-size 240 426 --sample-name sora_${OCT_FRAMES}x240x426 --batch-size $DEFAULT_BS
 }

 function run_video_d() { # 30min, sora 32x480p
  # 32x480p, short
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 32 --image-size 480 854 --sample-name short_32x480x854 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames $DOUBLE_FRAMES --image-size 480 854 --sample-name short_${DOUBLE_FRAMES}x480x854 --batch-size $DEFAULT_BS
 }

 function run_video_e() { # 30min
  # 64x480p, sora
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 64 --image-size 480 854 --sample-name sora_64x480x854 --batch-size 4
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames $QUAD_FRAMES --image-size 480 854 --sample-name sora_${QUAD_FRAMES}x480x854 --batch-size $DEFAULT_BS
 }

 function run_video_f() { # 30min
  # 16x720p, sora
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16 --image-size 720 1280 --sample-name sora_16x720x1280 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 720 1280 --sample-name sora_${NUM_FRAMES}x720x1280 --batch-size $DEFAULT_BS
 }

 function run_video_g() {
  # 16x720p multi-resolution
  # 1:1
  PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 960 960 --sample-name 720p_1_1
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 960 960 --sample-name 720p_1_1
  # 16:9
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 720 1280 --sample-name 720p_16_9
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 720 1280 --sample-name 720p_16_9
  # 9:16
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 1280 720 --sample-name 720p_9_16
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 1280 720 --sample-name 720p_9_16
  # 4:3
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 832 1108 --sample-name 720p_4_3
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 832 1108 --sample-name 720p_4_3
  # 3:4
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 1108 832 --sample-name 720p_3_4
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 1108 832 --sample-name 720p_3_4
  # 1:2
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 1358 600 --sample-name 720p_1_2
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 1358 600 --sample-name 720p_1_2
  # 2:1
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 16 --image-size 600 1358 --sample-name 720p_2_1
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames $NUM_FRAMES --image-size 600 1358 --sample-name 720p_2_1
 }

 function run_video_h() { # 23min
  # 3.1 image-conditioned long video generation
  eval $CMD_REF --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L10C4_16x240x426 \
    --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
-    --num-frames 16 --image-size 240 426 \
+    --num-frames $NUM_FRAMES --image-size 240 426 \
    --loop 5 --condition-frame-length 4 \
    --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
    --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS

  eval $CMD_REF --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L10C4_64x240x426 \
    --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
-    --num-frames 64 --image-size 240 426 \
+    --num-frames $QUAD_FRAMES --image-size 240 426 \
    --loop 5 --condition-frame-length 16 \
    --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
    --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
@ -133,7 +139,7 @@ function run_video_h() { # 23min
  # 3.2
  eval $CMD_REF --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_128x240x426 \
    --prompt-path assets/texts/t2v_ref.txt --start-index 3 --end-index 6 \
-    --num-frames 128 --image-size 240 426 \
+    --num-frames $OCT_FRAMES --image-size 240 426 \
    --loop 1 \
    --reference-path assets/images/condition/cliff.png "assets/images/condition/cactus-sad.png\;assets/images/condition/cactus-happy.png" https://cdn.openai.com/tmp/s/interp/d0.mp4 \
    --mask-strategy "0\;0,0,0,-1,1" "0\;0,1,0,-1,1" "0,0,0,0,64,0.5" --batch-size $DEFAULT_BS
--- a/eval/test.sh
+++ b/eval/test.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+# Scratch check of the frame-count arithmetic: $((...)) is used because `let` returns 1 on a zero result and trips `set -e`.
+# set -x
+set -e
+
+CKPT=$1
+NUM_FRAMES=${2:?NUM_FRAMES (2nd argument) is required}
+MODEL_NAME=$3
+
+DOUBLE_FRAMES=$((NUM_FRAMES * 2))
+TRIPLE_FRAMES=$((NUM_FRAMES * 3))
+
+echo "$DOUBLE_FRAMES"
+echo "$TRIPLE_FRAMES"
--- a/opensora/models/vae/vae.py
+++ b/opensora/models/vae/vae.py
@ -126,7 +126,8 @@ class VideoAutoencoderPipeline(nn.Module):
        freeze_vae_2d=False,
        cal_loss=False,
        micro_frame_size=None,
-        scale=2.5,
+        shift=0.0,
+        scale=1.0,
    ):
        super().__init__()
        self.spatial_vae = build_module(vae_2d, MODELS)
@ -142,7 +143,12 @@ class VideoAutoencoderPipeline(nn.Module):
                param.requires_grad = False

        self.out_channels = self.temporal_vae.out_channels
-        self.scale = scale  # make std = 1.0
+        # Non-persistent buffers: they follow .to()/.cuda()/.cpu() with the module (no hard-coded CUDA) and stay out of state_dict.
+        scale, shift = torch.tensor(scale), torch.tensor(shift)
+        scale = scale[None, :, None, None, None] if scale.dim() > 0 else scale  # per-channel -> broadcast over (B, C, T, H, W)
+        shift = shift[None, :, None, None, None] if shift.dim() > 0 else shift
+        self.register_buffer("scale", scale, persistent=False)
+        self.register_buffer("shift", shift, persistent=False)

    def encode(self, x):
        x_z = self.spatial_vae.encode(x)
@ -161,11 +167,11 @@ class VideoAutoencoderPipeline(nn.Module):
        if self.cal_loss:
            return z, posterior, x_z
        else:
-            return z / self.scale
+            return (z - self.shift) / self.scale

    def decode(self, z, num_frames=None):
        if not self.cal_loss:
-            z = z * self.scale
+            z = z * self.scale + self.shift

        if self.micro_frame_size is None:
            x_z = self.temporal_vae.decode(z, num_frames=num_frames)
--- a/opensora/schedulers/rf/rectified_flow.py
+++ b/opensora/schedulers/rf/rectified_flow.py
@ -112,7 +112,7 @@ class RFlowScheduler:
        """
        compatible with diffusers add_noise()
        """
-        timepoints = timesteps.float() / self.num_timesteps  # [0, 999/1000]
+        timepoints = timesteps.float() / self.num_timesteps
        timepoints = 1 - timepoints  # [1,1/1000]

        # timepoint  (bsz) noise: (bsz, 4, frame, w ,h)
--- a/scripts/inference_vae.py
+++ b/scripts/inference_vae.py
@ -8,7 +8,7 @@ from mmengine.runner import set_random_seed
 from tqdm import tqdm

 from opensora.acceleration.parallel_states import get_data_parallel_group
-from opensora.datasets import prepare_dataloader, save_sample
+from opensora.datasets import prepare_dataloader
 from opensora.models.vae.losses import VAELoss
 from opensora.registry import DATASETS, MODELS, build_module
 from opensora.utils.config_utils import parse_configs
@ -24,11 +24,10 @@ def main():

    # init distributed
    if os.environ.get("WORLD_SIZE", None):
-        use_dist = True
        colossalai.launch_from_torch({})
        coordinator = DistCoordinator()
    else:
-        use_dist = False
+        pass

    # ======================================================
    # 2. runtime variables
@ -67,7 +66,7 @@ def main():
    # ======================================================
    # 5. inference
    # ======================================================
-    save_dir = cfg.save_dir
+    # save_dir = cfg.save_dir  # only needed by the sample-saving code, which is currently commented out

    # define loss function
    vae_loss_fn = VAELoss(
@ -90,8 +89,11 @@ def main():

    calc_std = cfg.get("calc_std", False)
    if calc_std:
-        running_std_sum = 0.0
-        num_samples = 0.0
+        running_sum = 0.0
+        running_sum_c = torch.zeros(model.out_channels, dtype=torch.float, device=device)
+        running_var = 0.0
+        running_var_c = torch.zeros(model.out_channels, dtype=torch.float, device=device)
+        num_samples = 0

    with tqdm(
        range(total_steps),
@ -110,13 +112,31 @@ def main():

            # calc std
            if calc_std:
-                num_samples += z.size(0)
-                running_std_sum += z.std(dim=(1, 2, 3, 4)).sum().item()
-                pbar.set_postfix({"z std": running_std_sum / num_samples, "std sum": running_std_sum})
+                num_samples += 1
+                running_sum += z.mean().item()
+                running_var += (z - running_sum / num_samples).pow(2).mean().item()
+
+                running_sum_c += z.mean(dim=(0, 2, 3, 4)).float()
+                running_var_c += (
+                    (z - running_sum_c[None, :, None, None, None] / num_samples).pow(2).mean(dim=(0, 2, 3, 4)).float()
+                )
+                pbar.set_postfix(
+                    {
+                        "mean": running_sum / num_samples,
+                        "std": (running_var / num_samples) ** 0.5,
+                    }
+                )
+                if num_samples % 100 == 0:
+                    print(
+                        " mean_c ",
+                        (running_sum_c / num_samples).cpu().tolist(),
+                        "std_c ",
+                        (running_var_c / num_samples).sqrt().cpu().tolist(),
+                    )

            assert list(z.shape[2:]) == latent_size, f"z shape: {z.shape}, latent_size: {latent_size}"
            x_rec, x_z_rec = model.decode(z, num_frames=x.size(2))
-            x_ref = model.spatial_vae.decode(x_z)
+            # x_ref = model.spatial_vae.decode(x_z)  # 2D-VAE reference decode; skipped because its result is only used when saving samples

            # loss calculation
            nll_loss, weighted_nll_loss, weighted_kl_loss = vae_loss_fn(x, x_rec, posterior)
@ -127,18 +147,18 @@ def main():
            running_nll = nll_loss.item() / loss_steps + running_nll * ((loss_steps - 1) / loss_steps)
            running_nll_z = nll_loss_z.item() / loss_steps + running_nll_z * ((loss_steps - 1) / loss_steps)

-            if not use_dist or coordinator.is_master():
-                ori_dir = f"{save_dir}_ori"
-                rec_dir = f"{save_dir}_rec"
-                ref_dir = f"{save_dir}_ref"
-                os.makedirs(ori_dir, exist_ok=True)
-                os.makedirs(rec_dir, exist_ok=True)
-                os.makedirs(ref_dir, exist_ok=True)
-                for idx, vid in enumerate(x):
-                    pos = step * cfg.batch_size + idx
-                    save_sample(vid, fps=cfg.fps, save_path=f"{ori_dir}/{pos:03d}")
-                    save_sample(x_rec[idx], fps=cfg.fps, save_path=f"{rec_dir}/{pos:03d}")
-                    save_sample(x_ref[idx], fps=cfg.fps, save_path=f"{ref_dir}/{pos:03d}")
+            # if not use_dist or coordinator.is_master():
+            #     ori_dir = f"{save_dir}_ori"
+            #     rec_dir = f"{save_dir}_rec"
+            #     ref_dir = f"{save_dir}_ref"
+            #     os.makedirs(ori_dir, exist_ok=True)
+            #     os.makedirs(rec_dir, exist_ok=True)
+            #     os.makedirs(ref_dir, exist_ok=True)
+            #     for idx, vid in enumerate(x):
+            #         pos = step * cfg.batch_size + idx
+            #         save_sample(vid, fps=cfg.fps, save_path=f"{ori_dir}/{pos:03d}")
+            #         save_sample(x_rec[idx], fps=cfg.fps, save_path=f"{rec_dir}/{pos:03d}")
+            #         save_sample(x_ref[idx], fps=cfg.fps, save_path=f"{ref_dir}/{pos:03d}")

    print("test vae loss:", running_loss)
    print("test nll loss:", running_nll)