From 03198a12a7c9a4d9c21beb559398f2bb49cd9ad5 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Mon, 17 Jun 2024 03:13:32 +0000
Subject: [PATCH 01/34] format

---
 eval/sample.sh        | 33 +++++++++++++++++++++++++++++----
 eval/vbench/launch.sh | 21 ++++++++++++++++++++-
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/eval/sample.sh b/eval/sample.sh
index 0123309..1c52506 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -9,6 +9,10 @@ VBENCH_END_INDEX=$6
 VBENCH_RES=$7
 VBENCH_ASP_RATIO=$8
 
+NUM_SAMPLING_STEPS=$9
+FLOW=${10}
+LLM_REFINE=${11}
+
 echo "NUM_FRAMES=${NUM_FRAMES}"
 
 if [ -z "${NUM_FRAMES}" ]; then
@@ -238,10 +242,31 @@ function run_vbench() {
       --image-size $VBENCH_H $VBENCH_W \
       --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
   else
-    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
-      --prompt-path assets/texts/VBench/all_dimension.txt \
-      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
-      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    if [ -z ${NUM_SAMPLING_STEPS} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_dimension.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    else
+      if [ -z ${FLOW} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_dimension.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+      else
+        if [ -z ${LLM_REFINE} ]; then
+          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+          --prompt-path assets/texts/VBench/all_dimension.txt \
+          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
+          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+        else
+          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+          --prompt-path assets/texts/VBench/all_dimension.txt \
+          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
+          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+        fi
+      fi
+    fi
   fi
 }
 
diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh
index eedd9b3..c37526d 100644
--- a/eval/vbench/launch.sh
+++ b/eval/vbench/launch.sh
@@ -6,6 +6,10 @@ MODEL_NAME=$3
 RES=$4
 ASP_RATIO=$5
 
+NUM_SAMPLING_STEPS=$6
+FLOW=$7
+LLM_REFINE=$8
+
 if [[ $CKPT == *"ema"* ]]; then
     parentdir=$(dirname $CKPT)
     CKPT_BASE=$(basename $parentdir)_ema
@@ -25,6 +29,21 @@ for i in "${!GPUS[@]}"; do
         then
             CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
         else
-            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+            if [ -z ${NUM_SAMPLING_STEPS} ];
+                then
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                else
+                    if [ -z ${FLOW} ];
+                    then
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    else
+                        if [ -z ${LLM_REFINE} ];
+                            then
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                            else
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        fi
+                    fi
+            fi
     fi
 done

From 1573dbbc01756fe9c05d389ef2859cd574405062 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Mon, 17 Jun 2024 03:50:04 +0000
Subject: [PATCH 02/34] format

---
 eval/sample.sh             | 15 ++++++---
 eval/vbench/calc_vbench.py | 65 ++++++++++++++++++++++----------------
 eval/vbench/launch_calc.sh |  5 ++-
 3 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/eval/sample.sh b/eval/sample.sh
index 1c52506..eff1368 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -260,10 +260,17 @@ function run_vbench() {
           --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
           --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
         else
-          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
-          --prompt-path assets/texts/VBench/all_dimension.txt \
-          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
-          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          if [ "${FLOW}" = "None" ]; then
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_dimension.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          else
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_dimension.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          fi
         fi
       fi
     fi
diff --git a/eval/vbench/calc_vbench.py b/eval/vbench/calc_vbench.py
index b2505bf..e5570a1 100644
--- a/eval/vbench/calc_vbench.py
+++ b/eval/vbench/calc_vbench.py
@@ -1,37 +1,46 @@
 import argparse
 import os
-from vbench import VBench
-import torch
 import time
 
+import torch
+
+from vbench import VBench
+
 full_info_path = "eval/vbench/VBench_full_info.json"
 dimensions = [
-    # Quality Score
-    "subject_consistency",
-    "background_consistency",
-    "motion_smoothness",
-    "dynamic_degree",
-    "aesthetic_quality",
-    "imaging_quality",
-    "temporal_flickering",
-    # Semantic Score
-    "object_class",
-    "multiple_objects",
-    "color",
-    "spatial_relationship",
-    "scene",
-    "temporal_style",
-    "overall_consistency",
-    "human_action",
-    "appearance_style",
+    # a: 10min
+    "subject_consistency",  # 4min
+    "imaging_quality",  # 6min
+    # b: 12min
+    "background_consistency",  # 2min
+    "motion_smoothness",  # 5min
+    "overall_consistency",  # 2min
+    "human_action",  # 3min
+    # c: 14min
+    "multiple_objects",  # 14min
+    # d: 14min
+    "spatial_relationship",  # 14min
+    # e: 12min
+    "object_class",  # 12min
+    # f: 12min
+    "color",  # 12min
+    # g: 10.5min
+    "aesthetic_quality",  # 2.5min
+    "appearance_style",  # 6min
+    "temporal_flickering",  # 2min
+    # h: 9min
+    "scene",  # 3min
+    "temporal_style",  # 2min
+    "dynamic_degree",  # 4min
 ]
 
+
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("video_folder", type=str)  # samples/samples..._vbench/eval
     parser.add_argument("model_ckpt", type=str)
-    parser.add_argument("--start", type=int, default=0) # start index of dimension to be evaluated
-    parser.add_argument("--end", type=int, default=-1) # start index of dimension to be evaluated
+    parser.add_argument("--start", type=int, default=0)  # start index of dimension to be evaluated
+    parser.add_argument("--end", type=int, default=-1)  # end index (exclusive) of dimension to be evaluated; -1 means up to the last dimension
 
     args = parser.parse_args()
     return args
@@ -44,23 +53,23 @@ if __name__ == "__main__":
     video_path = args.video_folder
 
     kwargs = {}
-    kwargs['imaging_quality_preprocessing_mode'] = 'longer' # use VBench/evaluate.py default
+    kwargs["imaging_quality_preprocessing_mode"] = "longer"  # use VBench/evaluate.py default
 
     start_time = time.time()
 
     # NOTE: important to use torch.device("cuda"), else will have issue with object_class third_party module
     my_VBench = VBench(torch.device("cuda"), full_info_path, output_dir)
-    if args.end == -1: # adjust end accordingly
+    if args.end == -1:  # adjust end accordingly
         args.end = len(dimensions)
-    for dim in dimensions[args.start:args.end]:
+    for dim in dimensions[args.start : args.end]:
         my_VBench.evaluate(
             videos_path=video_path,
             name=dim,
             local=False,
             read_frame=False,
             dimension_list=[dim],
-            mode='vbench_standard',
-            **kwargs
+            mode="vbench_standard",
+            **kwargs,
         )
 
-    print("Runtime: %s seconds " % (time.time() - start_time))
\ No newline at end of file
+    print("Runtime: %s seconds " % (time.time() - start_time))
diff --git a/eval/vbench/launch_calc.sh b/eval/vbench/launch_calc.sh
index 53114b9..9f14ce5 100644
--- a/eval/vbench/launch_calc.sh
+++ b/eval/vbench/launch_calc.sh
@@ -7,11 +7,10 @@ mkdir -p $LOG_BASE
 echo "Logging to $LOG_BASE"
 
 GPUS=(0 1 2 3 4 5 6 7)
-START_INDEX_LIST=(0 2 4 6 8 10 12 14)
-END_INDEX_LIST=(2 4 6 8 10 12 14 16)
+START_INDEX_LIST=(0 2 6 7 8 9 10 13)
+END_INDEX_LIST=(2 6 7 8 9 10 13 16)
 TASK_ID_LIST=(calc_vbench_a calc_vbench_b calc_vbench_c calc_vbench_d calc_vbench_e calc_vbench_f calc_vbench_g calc_vbench_h) # for log records only
 
-
 for i in "${!GPUS[@]}"; do
     CUDA_VISIBLE_DEVICES=${GPUS[i]} python eval/vbench/calc_vbench.py $VIDEO_DIR $CKPT_DIR --start ${START_INDEX_LIST[i]} --end ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
 done

From d4db36f7820b456b0e5a5e77c44799695b4abf0a Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Mon, 17 Jun 2024 06:35:59 +0000
Subject: [PATCH 03/34] save update

---
 eval/vbench/launch.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh
index c37526d..f3a5b62 100644
--- a/eval/vbench/launch.sh
+++ b/eval/vbench/launch.sh
@@ -24,6 +24,14 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
 START_INDEX_LIST=(0 120 240 360 480 600 720 840)
 END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
 
+# START_INDEX_LIST=(60 180 300 420 540 660 780 900)
+# END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
+# LOG_BASE=$(dirname $CKPT)/eval/last_60
+# mkdir -p ${LOG_BASE}
+# echo "Logging to $LOG_BASE"
+
+
+
 for i in "${!GPUS[@]}"; do
     if [ -z ${RES} ] || [ -z ${ASP_RATIO} ]  ;
         then

From 8fe96e4b886c53024a3edcfb46052ec0321903df Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Thu, 20 Jun 2024 04:03:13 +0000
Subject: [PATCH 04/34] more options for vbench

---
 eval/README.md        | 6 ++++++
 eval/vbench/launch.sh | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/eval/README.md b/eval/README.md
index 7ae6e32..652d3a3 100644
--- a/eval/README.md
+++ b/eval/README.md
@@ -48,8 +48,14 @@ First, generate the relevant videos with the following commands:
 ```bash
 # vbench task, if evaluation all set start_index to 0, end_index to 2000
 bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log  -4 start_index end_index
+
 # Alternatively, launch 8 jobs at once (you must read the script to understand the details)
 bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
+
+# in addition, you can specify resolution, aspect ratio, sampling steps, flow, and llm-refine
+bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name res_value aspect_ratio_value steps_value flow_value llm_refine_value
+# for example
+# bash eval/vbench/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
 ```
 
 After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples.
diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh
index f3a5b62..e7c1165 100644
--- a/eval/vbench/launch.sh
+++ b/eval/vbench/launch.sh
@@ -24,6 +24,8 @@ TASK_ID_LIST=(4a 4b 4c 4d 4e 4f 4g 4h) # for log records only
 START_INDEX_LIST=(0 120 240 360 480 600 720 840)
 END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
 
+## Modify the following to run on multiple machines for faster results
+## 720p will take quite long on a single machine
 # START_INDEX_LIST=(60 180 300 420 540 660 780 900)
 # END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
 # LOG_BASE=$(dirname $CKPT)/eval/last_60

From 8f239c87bf12564defc5e37cf611fa00c1976443 Mon Sep 17 00:00:00 2001
From: HangXu <hangxu0304@gmail.com>
Date: Thu, 20 Jun 2024 11:48:42 +0300
Subject: [PATCH 05/34]  Added causal mask in Attention forward pass

---
 opensora/models/layers/blocks.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py
index 8bc7e72..73a0162 100644
--- a/opensora/models/layers/blocks.py
+++ b/opensora/models/layers/blocks.py
@@ -163,6 +163,8 @@ class Attention(nn.Module):
         if rope is not None:
             self.rope = True
             self.rotary_emb = rope
+        
+        self.is_causal = False
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
@@ -198,12 +200,17 @@ class Attention(nn.Module):
                 v,
                 dropout_p=self.attn_drop.p if self.training else 0.0,
                 softmax_scale=self.scale,
+                causal=self.is_causal,
             )
         else:
             dtype = q.dtype
             q = q * self.scale
             attn = q @ k.transpose(-2, -1)  # translate attn to float32
             attn = attn.to(torch.float32)
+            if self.is_causal:
+                causal_mask = torch.tril(torch.ones_like(attn), diagonal=0)
+                causal_mask = torch.where(causal_mask.bool(), 0, float('-inf'))
+                attn += causal_mask
             attn = attn.softmax(dim=-1)
             attn = attn.to(dtype)  # cast back attn to original dtype
             attn = self.attn_drop(attn)

From 6b42f4aa95c4137e3b6dbf18862a4781da469bbd Mon Sep 17 00:00:00 2001
From: Tom Young <tomyoung903@gmail.com>
Date: Fri, 21 Jun 2024 01:29:21 +0000
Subject: [PATCH 06/34] small update on readme

---
 tools/caption/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/caption/README.md b/tools/caption/README.md
index f6fe0c8..8f7dfed 100644
--- a/tools/caption/README.md
+++ b/tools/caption/README.md
@@ -4,7 +4,7 @@ Human labeling of videos is expensive and time-consuming. We adopt powerful imag
 
 ## PLLaVA Captioning
 
-To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video.
+To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference by (1) batching and (2) offloading frame extraction to a separate process so that GPU computation and frame extraction happen in parallel.
 
 ### Installation
 Install the required dependancies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "PLLaVA Captioning" sections.

From 98958b8e56b64ab7450d6bbafd62b2074a25bd1f Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Fri, 21 Jun 2024 09:18:09 +0000
Subject: [PATCH 07/34] format

---
 eval/README.md            |  9 +++++++++
 eval/sample.sh            | 41 +++++++++++++++++++++++++++++++--------
 eval/vbench_i2v/launch.sh | 21 +++++++++++++++++++-
 3 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/eval/README.md b/eval/README.md
index 652d3a3..261c21b 100644
--- a/eval/README.md
+++ b/eval/README.md
@@ -95,6 +95,15 @@ python eval/vbench_i2v/tabulate_vbench_i2v_scores.py path/to/videos/folder path/
 
 ```
 
+Similar to VBench, you can specify the resolution, aspect ratio, sampling steps, flow, and llm-refine:
+
+```bash
+bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name_for_log res_value aspect_ratio_value steps_value flow_value llm_refine_value
+# for example
+# bash eval/vbench_i2v/launch.sh /mnt/jfs-hdd/sora/checkpoints/outputs/042-STDiT3-XL-2/epoch1-global_step16200_llm_refine/ema.pt 51 042-STDiT3-XL-2 240p 9:16 30 2 True
+# if no flow control, use "None" instead
+```
+
 ## VAE
 
 Install the dependencies package following our [installation](../docs/installation.md)'s s sections of "Evaluation Dependencies". Then, run the following evaluation command:
diff --git a/eval/sample.sh b/eval/sample.sh
index eff1368..0f28550 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -287,16 +287,41 @@ function run_vbench_i2v() {
     eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
       --prompt-path assets/texts/VBench/all_i2v.txt \
       --image-size $VBENCH_I2V_H $VBENCH_I2V_W \
-      --start-index $1 --end-index $2 \
-      --num-frames $NUM_FRAMES --batch-size $VBENCH_BS
+      --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
   else
-    eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
-      --prompt-path assets/texts/VBench/all_i2v.txt \
-      --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
-      --start-index $1 --end-index $2 \
-      --num-frames $NUM_FRAMES --batch-size $VBENCH_BS
+    if [ -z ${NUM_SAMPLING_STEPS} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_i2v.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+    else
+      if [ -z ${FLOW} ]; then
+        eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+        --prompt-path assets/texts/VBench/all_i2v.txt \
+        --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} \
+        --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+      else
+        if [ -z ${LLM_REFINE} ]; then
+          eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+          --prompt-path assets/texts/VBench/all_i2v.txt \
+          --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} \
+          --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+        else
+          if [ "${FLOW}" = "None" ]; then
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_i2v.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          else
+            eval $CMD --ckpt-path $CKPT --save-dir ${OUTPUT}_vbench_i2v --prompt-as-path --num-sample 5 \
+            --prompt-path assets/texts/VBench/all_i2v.txt \
+            --resolution $VBENCH_RES --aspect-ratio $VBENCH_ASP_RATIO --num-sampling-steps ${NUM_SAMPLING_STEPS} --flow ${FLOW} --llm-refine ${LLM_REFINE} \
+            --batch-size $VBENCH_BS --num-frames $NUM_FRAMES --start-index $1 --end-index $2
+          fi
+        fi
+      fi
+    fi
   fi
-
 }
 
 ### Main
diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh
index d8eea1d..b972acc 100644
--- a/eval/vbench_i2v/launch.sh
+++ b/eval/vbench_i2v/launch.sh
@@ -6,6 +6,10 @@ MODEL_NAME=$3
 RES=$4
 ASP_RATIO=$5
 
+NUM_SAMPLING_STEPS=$6
+FLOW=$7
+LLM_REFINE=$8
+
 if [[ $CKPT == *"ema"* ]]; then
     parentdir=$(dirname $CKPT)
     CKPT_BASE=$(basename $parentdir)_ema
@@ -25,6 +29,21 @@ for i in "${!GPUS[@]}"; do
         then
             CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
         else
-            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+            if [ -z ${NUM_SAMPLING_STEPS} ];
+                then
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                else
+                    if [ -z ${FLOW} ];
+                    then
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    else
+                        if [ -z ${LLM_REFINE} ];
+                            then
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                            else
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        fi
+                    fi
+            fi
     fi
 done

From e581319f37e8c851cc2f3676928dc2f5d60cdaf1 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Fri, 21 Jun 2024 09:44:38 +0000
Subject: [PATCH 08/34] add instruction for force huggingface

---
 docs/commands.md | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/docs/commands.md b/docs/commands.md
index 2c948de..d982db8 100644
--- a/docs/commands.md
+++ b/docs/commands.md
@@ -1,5 +1,6 @@
 # Commands
 
+- [Config](#config)
 - [Inference](#inference)
   - [Inference with Open-Sora 1.2](#inference-with-open-sora-12)
   - [Inference with Open-Sora 1.1](#inference-with-open-sora-11)
@@ -12,6 +13,35 @@
   - [Training Hyperparameters](#training-hyperparameters)
 - [Search batch size for buckets](#search-batch-size-for-buckets)
 
+## Config
+Note that currently our model loading for vae and diffusion model supports two types:
+
+* load from local file path
+* load from huggingface
+
+Our config supports loading from huggingface by default.
+If you wish to load from a local path, you need to set `force_huggingface=True`, for instance:
+
+```python
+# for vae
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="/root/commonData/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+    force_huggingface=True, # NOTE: set here
+)
+# for diffusion model
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained="/root/commonData/OpenSora-STDiT-v3",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    force_huggingface=True, # NOTE: set here
+)
+```
+
 ## Inference
 
 You can modify corresponding config files to change the inference settings. See more details [here](/docs/structure.md#inference-config-demos).

From 54f1da7cf1ba91843e112f37e0facc81b0a13ce7 Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Sat, 22 Jun 2024 09:24:05 +0000
Subject: [PATCH 09/34] update

---
 configs/opensora-v1-2/train/stage2.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/configs/opensora-v1-2/train/stage2.py b/configs/opensora-v1-2/train/stage2.py
index 94e6975..8620066 100644
--- a/configs/opensora-v1-2/train/stage2.py
+++ b/configs/opensora-v1-2/train/stage2.py
@@ -60,19 +60,21 @@ scheduler = dict(
 )
 
 # Mask settings
+# 25%
 mask_ratios = {
-    "random": 0.05,
-    "intepolate": 0.005,
-    "quarter_random": 0.005,
-    "quarter_head": 0.005,
-    "quarter_tail": 0.005,
-    "quarter_head_tail": 0.005,
-    "image_random": 0.025,
-    "image_head": 0.05,
-    "image_tail": 0.025,
-    "image_head_tail": 0.025,
+    "random": 0.005,
+    "intepolate": 0.002,
+    "quarter_random": 0.007,
+    "quarter_head": 0.002,
+    "quarter_tail": 0.002,
+    "quarter_head_tail": 0.002,
+    "image_random": 0.0,
+    "image_head": 0.22,
+    "image_tail": 0.005,
+    "image_head_tail": 0.005,
 }
 
+
 # Log settings
 seed = 42
 outputs = "outputs"

From c1d47153c0ec48e282ecf363931630521087dcea Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Sat, 22 Jun 2024 13:29:40 +0000
Subject: [PATCH 10/34] update gitignore

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index eb382c6..61be3e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,3 +185,7 @@ wandb/
 
 # vae weights
 eval/vae/flolpips/weights/
+
+outputs
+vbench
+vbench2_beta_i2v

From 711586431477701a765312008db697d0eea493b6 Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Sat, 22 Jun 2024 15:41:32 +0000
Subject: [PATCH 11/34] [fix] HF loading

---
 configs/opensora-v1-2/inference/sample.py    |  2 -
 configs/opensora-v1-2/inference/sample_hf.py | 44 ++++++++++++++++++++
 eval/sample.sh                               |  2 +-
 opensora/models/stdit/stdit3.py              |  5 ++-
 opensora/models/vae/vae.py                   |  2 +-
 5 files changed, 49 insertions(+), 6 deletions(-)
 create mode 100644 configs/opensora-v1-2/inference/sample_hf.py

diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py
index 0e84251..3e2c623 100644
--- a/configs/opensora-v1-2/inference/sample.py
+++ b/configs/opensora-v1-2/inference/sample.py
@@ -19,14 +19,12 @@ model = dict(
     qk_norm=True,
     enable_flash_attn=True,
     enable_layernorm_kernel=True,
-    force_huggingface=True,
 )
 vae = dict(
     type="OpenSoraVAE_V1_2",
     from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
     micro_frame_size=17,
     micro_batch_size=4,
-    force_huggingface=True,
 )
 text_encoder = dict(
     type="t5",
diff --git a/configs/opensora-v1-2/inference/sample_hf.py b/configs/opensora-v1-2/inference/sample_hf.py
new file mode 100644
index 0000000..0e84251
--- /dev/null
+++ b/configs/opensora-v1-2/inference/sample_hf.py
@@ -0,0 +1,44 @@
+resolution = "240p"
+aspect_ratio = "9:16"
+num_frames = 51
+fps = 24
+frame_interval = 1
+save_fps = 24
+
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "STDiT2"
+dtype = "bf16"
+condition_frame_length = 5
+align = 5
+
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    force_huggingface=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+    force_huggingface=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=7.0,
+)
+
+aes = 6.5
+flow = None
diff --git a/eval/sample.sh b/eval/sample.sh
index 0123309..241c229 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -39,7 +39,7 @@ DEFAULT_BS=1
 # called inside run_video_b
 function run_image() { # 14min
   # 1.1 1024x1024
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect_ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect-ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
 
   # 1.2 240x426
   eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 240p --aspect-ratio 9:16 --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS
diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py
index 8703b2d..bd9672d 100644
--- a/opensora/models/stdit/stdit3.py
+++ b/opensora/models/stdit/stdit3.py
@@ -448,7 +448,7 @@ class STDiT3(PreTrainedModel):
 @MODELS.register_module("STDiT3-XL/2")
 def STDiT3_XL_2(from_pretrained=None, **kwargs):
     force_huggingface = kwargs.pop("force_huggingface", False)
-    if force_huggingface or from_pretrained is not None and not os.path.isdir(from_pretrained):
+    if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained):
         model = STDiT3.from_pretrained(from_pretrained, **kwargs)
     else:
         config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
@@ -460,7 +460,8 @@ def STDiT3_XL_2(from_pretrained=None, **kwargs):
 
 @MODELS.register_module("STDiT3-3B/2")
 def STDiT3_3B_2(from_pretrained=None, **kwargs):
-    if from_pretrained is not None and not os.path.isdir(from_pretrained):
+    force_huggingface = kwargs.pop("force_huggingface", False)
+    if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained):
         model = STDiT3.from_pretrained(from_pretrained, **kwargs)
     else:
         config = STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs)
diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py
index bf50ec8..9802b02 100644
--- a/opensora/models/vae/vae.py
+++ b/opensora/models/vae/vae.py
@@ -277,7 +277,7 @@ def OpenSoraVAE_V1_2(
         scale=scale,
     )
 
-    if force_huggingface or (from_pretrained is not None and not os.path.isdir(from_pretrained)):
+    if force_huggingface or (from_pretrained is not None and not os.path.exists(from_pretrained)):
         model = VideoAutoencoderPipeline.from_pretrained(from_pretrained, **kwargs)
     else:
         config = VideoAutoencoderPipelineConfig(**kwargs)

From ee1c79a89870c71fdc40bef8982a8ddecd7ef97e Mon Sep 17 00:00:00 2001
From: Frank Lee <somerlee.9@gmail.com>
Date: Mon, 24 Jun 2024 13:59:29 +0800
Subject: [PATCH 12/34] [sp] added padding (#160)

---
 opensora/models/stdit/stdit3.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py
index bd9672d..bb71d04 100644
--- a/opensora/models/stdit/stdit3.py
+++ b/opensora/models/stdit/stdit3.py
@@ -4,6 +4,7 @@ import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+import torch.nn.functional as F
 from einops import rearrange
 from rotary_embedding_torch import RotaryEmbedding
 from timm.models.layers import DropPath
@@ -361,6 +362,19 @@ class STDiT3(PreTrainedModel):
         # === get pos embed ===
         _, _, Tx, Hx, Wx = x.size()
         T, H, W = self.get_dynamic_size(x)
+
+        # adjust for sequence parallelism
+        # we need to ensure H * W is divisible by sequence parallel size
+        # for simplicity, we can adjust the height to make it divisible
+        if self.enable_sequence_parallelism:
+            sp_size = dist.get_world_size(get_sequence_parallel_group())
+            h_pad_size = sp_size - H % sp_size
+            hx_pad_size = h_pad_size * self.patch_size[1]
+
+            # pad x along the H dimension
+            H += h_pad_size
+            x = F.pad(x, (0, 0, 0, hx_pad_size))
+
         S = H * W
         base_size = round(S**0.5)
         resolution_sq = (height[0].item() * width[0].item()) ** 0.5

From 70226e81e2e047cf7d31d83ea0b3a8355a3aa046 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Mon, 24 Jun 2024 06:39:57 +0000
Subject: [PATCH 13/34] update

---
 eval/vbench_i2v/launch.sh | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh
index b972acc..30ee9c3 100644
--- a/eval/vbench_i2v/launch.sh
+++ b/eval/vbench_i2v/launch.sh
@@ -19,10 +19,16 @@ fi
 LOG_BASE=$(dirname $CKPT)/eval
 echo "Logging to $LOG_BASE"
 
-GPUS=(0 1 2 3 4 5 6 7)
-TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
-START_INDEX_LIST=(0 140 280 420 560 700 840 980)
-END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
+# GPUS=(0 1 2 3 4 5 6 7)
+# TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
+# START_INDEX_LIST=(0 140 280 420 560 700 840 980)
+# END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
+
+GPUS=(4 5 6 7)
+TASK_ID_LIST=(5a 5b 5c 5d) # for log records only
+START_INDEX_LIST=(0 140 280 420)
+END_INDEX_LIST=(140 280 420 560)
+
 
 for i in "${!GPUS[@]}"; do
     if [ -z ${RES} ] || [ -z ${ASP_RATIO} ]  ;

From 1e6c44238b0c42d0010df5fd5fb63c6654f7d4d8 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Mon, 24 Jun 2024 06:48:08 +0000
Subject: [PATCH 14/34] vbench i2v passed test

---
 eval/vbench_i2v/launch.sh | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh
index 30ee9c3..2b03309 100644
--- a/eval/vbench_i2v/launch.sh
+++ b/eval/vbench_i2v/launch.sh
@@ -19,15 +19,10 @@ fi
 LOG_BASE=$(dirname $CKPT)/eval
 echo "Logging to $LOG_BASE"
 
-# GPUS=(0 1 2 3 4 5 6 7)
-# TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
-# START_INDEX_LIST=(0 140 280 420 560 700 840 980)
-# END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
-
-GPUS=(4 5 6 7)
-TASK_ID_LIST=(5a 5b 5c 5d) # for log records only
-START_INDEX_LIST=(0 140 280 420)
-END_INDEX_LIST=(140 280 420 560)
+GPUS=(0 1 2 3 4 5 6 7)
+TASK_ID_LIST=(5a 5b 5c 5d 5e 5f 5g 5h) # for log records only
+START_INDEX_LIST=(0 140 280 420 560 700 840 980)
+END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
 
 
 for i in "${!GPUS[@]}"; do

From 491403218d5f579c76ab0211c233edd4ba67cd41 Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Mon, 24 Jun 2024 07:04:08 +0000
Subject: [PATCH 15/34] update for pixart

---
 configs/pixart/inference/1x2048MS.py | 2 +-
 opensora/models/pixart/pixart.py     | 8 +++++---
 opensora/schedulers/dpms/__init__.py | 3 ++-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/configs/pixart/inference/1x2048MS.py b/configs/pixart/inference/1x2048MS.py
index a0daca4..0f48824 100644
--- a/configs/pixart/inference/1x2048MS.py
+++ b/configs/pixart/inference/1x2048MS.py
@@ -1,6 +1,6 @@
 num_frames = 1
 fps = 1
-image_size = (2560, 1536)
+# image_size = (2560, 1536)
 # image_size = (2048, 2048)
 
 model = dict(
diff --git a/opensora/models/pixart/pixart.py b/opensora/models/pixart/pixart.py
index 02f8b67..9544fcb 100644
--- a/opensora/models/pixart/pixart.py
+++ b/opensora/models/pixart/pixart.py
@@ -204,9 +204,11 @@ class PixArt(nn.Module):
         t: (N,) tensor of diffusion timesteps
         y: (N, 1, 120, C) tensor of class labels
         """
-        x = x.to(self.dtype)
-        timestep = timestep.to(self.dtype)
-        y = y.to(self.dtype)
+        dtype = self.x_embedder.proj.weight.dtype
+        B = x.size(0)
+        x = x.to(dtype)
+        timestep = timestep.to(dtype)
+        y = y.to(dtype)
 
         # embedding
         x = self.x_embedder(x)  # (B, N, D)
diff --git a/opensora/schedulers/dpms/__init__.py b/opensora/schedulers/dpms/__init__.py
index df10477..111e97b 100644
--- a/opensora/schedulers/dpms/__init__.py
+++ b/opensora/schedulers/dpms/__init__.py
@@ -24,7 +24,8 @@ class DPM_SOLVER:
         mask=None,
         progress=True,
     ):
-        assert mask is None, "mask is not supported in dpm-solver"
+        if mask is not None:
+            print("[WARNING] mask is not supported in dpm-solver, it will be ignored")
         n = len(prompts)
         model_args = text_encoder.encode(prompts)
         y = model_args.pop("y")

From 6a72b8910b29ab39199d8da173c2fcd9920d3173 Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Mon, 24 Jun 2024 08:50:35 +0000
Subject: [PATCH 16/34] [data] added error handling to dataset

---
 opensora/datasets/dataloader.py | 6 ++++++
 opensora/datasets/datasets.py   | 5 ++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/opensora/datasets/dataloader.py b/opensora/datasets/dataloader.py
index 15058ac..60d0b24 100644
--- a/opensora/datasets/dataloader.py
+++ b/opensora/datasets/dataloader.py
@@ -111,6 +111,9 @@ def prepare_dataloader(
 
 
 def collate_fn_default(batch):
+    # filter out None
+    batch = [x for x in batch if x is not None]
+
     # HACK: for loading text features
     use_mask = False
     if "mask" in batch[0] and isinstance(batch[0]["mask"], int):
@@ -132,6 +135,9 @@ def collate_fn_batch(batch):
     """
     Used only with BatchDistributedSampler
     """
+    # filter out None
+    batch = [x for x in batch if x is not None]
+    
     res = torch.utils.data.default_collate(batch)
 
     # squeeze the first dimension, which is due to torch.stack() in default_collate()
diff --git a/opensora/datasets/datasets.py b/opensora/datasets/datasets.py
index 8b5fdd6..b148268 100644
--- a/opensora/datasets/datasets.py
+++ b/opensora/datasets/datasets.py
@@ -190,7 +190,10 @@ class VariableVideoTextDataset(VideoTextDataset):
         return ret
 
     def __getitem__(self, index):
-        return self.getitem(index)
+        try:
+            return self.getitem(index)
+        except Exception:  # narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate
+            return None  # a failed sample becomes None; the collate functions filter None entries out
 
 
 @DATASETS.register_module()

From 7b30ede7f0c461475cb6dd0748985b75ccda142a Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Mon, 24 Jun 2024 09:17:32 +0000
Subject: [PATCH 17/34] update requirements

---
 requirements/requirements-cu121.txt | 6 +++---
 requirements/requirements.txt       | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/requirements/requirements-cu121.txt b/requirements/requirements-cu121.txt
index cc13920..362381d 100644
--- a/requirements/requirements-cu121.txt
+++ b/requirements/requirements-cu121.txt
@@ -1,3 +1,3 @@
-torch==2.3.0 --index-url https://download.pytorch.org/whl/cu121
-torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cu121
-xformers==0.0.26.post1 --index-url https://download.pytorch.org/whl/cu121
+torch==2.2.2 --index-url https://download.pytorch.org/whl/cu121
+torchvision==0.17.2 --index-url https://download.pytorch.org/whl/cu121
+xformers==0.0.25.post1 --index-url https://download.pytorch.org/whl/cu121
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 66ab7bf..504f381 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,4 +1,4 @@
-colossalai==0.3.7
+colossalai==0.3.9
 mmengine>=0.10.3
 pandas>=2.0.3
 timm==0.9.16
@@ -7,6 +7,7 @@ ftfy>=6.2.0 # for t5
 diffusers==0.27.2 # for vae
 accelerate==0.29.2 # for t5
 av>=12.0.0 # for video loading
+numpy<2.0.0
 
 # [gradio]
 gradio>=4.26.0

From 3552145f847b5171cd2c85982d116b04eb98c5bf Mon Sep 17 00:00:00 2001
From: FrankLeeeee <somerlee.9@gmail.com>
Date: Tue, 25 Jun 2024 06:17:24 +0000
Subject: [PATCH 18/34] [sp] updated precision test

---
 opensora/models/stdit/stdit3.py           | 15 ++++++++++-----
 tests/test_stdit3_sequence_parallelism.py | 18 ++++++++++++++----
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py
index bb71d04..b0c046a 100644
--- a/opensora/models/stdit/stdit3.py
+++ b/opensora/models/stdit/stdit3.py
@@ -368,12 +368,17 @@ class STDiT3(PreTrainedModel):
         # for simplicity, we can adjust the height to make it divisible
         if self.enable_sequence_parallelism:
             sp_size = dist.get_world_size(get_sequence_parallel_group())
-            h_pad_size = sp_size - H % sp_size
-            hx_pad_size = h_pad_size * self.patch_size[1]
+            if H % sp_size != 0:
+                h_pad_size = sp_size - H % sp_size
+            else:
+                h_pad_size = 0
 
-            # pad x along the H dimension
-            H += h_pad_size
-            x = F.pad(x, (0, 0, 0, hx_pad_size))
+            if h_pad_size > 0:
+                hx_pad_size = h_pad_size * self.patch_size[1]
+
+                # pad x along the H dimension
+                H += h_pad_size
+                x = F.pad(x, (0, 0, 0, hx_pad_size))
 
         S = H * W
         base_size = round(S**0.5)
diff --git a/tests/test_stdit3_sequence_parallelism.py b/tests/test_stdit3_sequence_parallelism.py
index ba715b7..70786f4 100644
--- a/tests/test_stdit3_sequence_parallelism.py
+++ b/tests/test_stdit3_sequence_parallelism.py
@@ -9,7 +9,7 @@ from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config
 
 
 def get_sample_data():
-    x = torch.rand([1, 4, 15, 20, 27], dtype=torch.bfloat16)  # (B, C, T, H, W)
+    x = torch.rand([1, 4, 15, 20, 28], dtype=torch.bfloat16)  # (B, C, T, H, W)
     timestep = torch.Tensor([924.0]).to(torch.bfloat16)
     y = torch.rand(1, 1, 300, 4096, dtype=torch.bfloat16)
     mask = torch.ones([1, 300], dtype=torch.int32)
@@ -66,6 +66,17 @@ def run_model(rank, world_size, port):
     set_seed(1024)
     dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True)
     dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16)
+
+    # ensure model weights are equal
+    for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()):
+        assert torch.equal(p1, p2)
+
+    # ensure model weights are equal across all ranks (torch.equal is pairwise, so compare every gathered copy to the first)
+    for p in dist_model.parameters():
+        p_list = [torch.zeros_like(p) for _ in range(world_size)]
+        dist.all_gather(p_list, p, group=dist.group.WORLD)
+        assert all(torch.equal(p_list[0], q) for q in p_list[1:])
+
     dist_out = dist_model(**data)
     dist_out.mean().backward()
 
@@ -84,9 +95,8 @@ def run_model(rank, world_size, port):
     for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()):
         assert n1 == n2
         if p1.grad is not None and p2.grad is not None:
-            if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4):
-                if dist.get_rank() == 0:
-                    print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
+            if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0:
+                print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}")
         else:
             assert p1.grad is None and p2.grad is None
 

From 1c64c82c05523c125d91c52af5d34791f532170a Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Tue, 25 Jun 2024 08:26:35 +0000
Subject: [PATCH 19/34] allow path spec for cloud machine eval

---
 eval/loss/launch.sh | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/eval/loss/launch.sh b/eval/loss/launch.sh
index 5e19c7c..c70c52d 100644
--- a/eval/loss/launch.sh
+++ b/eval/loss/launch.sh
@@ -3,8 +3,16 @@
 CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py"
 CKPT_PATH=$1
 MODEL_NAME=$2
-IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
-VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
+IMG_PATH=$3
+VID_PATH=$4
+
+if [ -z "$IMG_PATH" ]; then  # no 3rd argument given: fall back to the default image validation CSV
+    IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv"
+fi
+
+if [ -z "$VID_PATH" ]; then  # no 4th argument given: fall back to the default video validation CSV
+    VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv"
+fi
 
 if [[ $CKPT_PATH == *"ema"* ]]; then
     parentdir=$(dirname $CKPT_PATH)

From 31858eccebfad8998716716ade1dccb7f0d58746 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Wed, 26 Jun 2024 02:43:19 +0000
Subject: [PATCH 20/34] fix readme

---
 docs/commands.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/commands.md b/docs/commands.md
index d982db8..92ff5e6 100644
--- a/docs/commands.md
+++ b/docs/commands.md
@@ -19,8 +19,8 @@ Note that currently our model loading for vae and diffusion model supports two t
 * load from local file path
 * load from huggingface
 
-Our config supports loading from huggingface by default.
-If you wish to load from a local path, you need to set `force_huggingface=True`, for instance:
+By default, our config loads the models online from the Hugging Face Hub.
+If you wish to load from a local path that holds a copy downloaded from Hugging Face, you need to set `force_huggingface=True`, for instance:
 
 ```python
 # for vae
@@ -41,6 +41,7 @@ model = dict(
     force_huggingface=True, # NOTE: set here
 )
 ```
+However, if you want to load a self-trained model, do not set `force_huggingface=True`, since your checkpoint won't be in Hugging Face format.
 
 ## Inference
 

From b65126834f4da0975d544ed105c974e53523e8bf Mon Sep 17 00:00:00 2001
From: Tom Young <tomyoung903@gmail.com>
Date: Wed, 26 Jun 2024 03:56:41 +0000
Subject: [PATCH 21/34] add resume and drop invalid timestamps

---
 tools/scene_cut/cut.py | 45 ++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/tools/scene_cut/cut.py b/tools/scene_cut/cut.py
index cf724a6..d614d01 100644
--- a/tools/scene_cut/cut.py
+++ b/tools/scene_cut/cut.py
@@ -29,15 +29,20 @@ def process_single_row(row, args):
     # check mp4 integrity
     # if not is_intact_video(video_path, logger=logger):
     #     return False
-
-    if "timestamp" in row:
-        timestamp = row["timestamp"]
-        if not (timestamp.startswith("[") and timestamp.endswith("]")):
+    try:
+        if "timestamp" in row:
+            timestamp = row["timestamp"]
+            if not (timestamp.startswith("[") and timestamp.endswith("]")):
+                return False
+            scene_list = eval(timestamp)
+            scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list]
+        else:
+            scene_list = [None]
+        if args.drop_invalid_timestamps:
+            return True
+    except Exception as e:  # any failure while reading/parsing the row's timestamp lands here
+        if args.drop_invalid_timestamps:  # NOTE(review): with the flag off the error is swallowed and scene_list may be unbound below — confirm
             return False
-        scene_list = eval(timestamp)
-        scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list]
-    else:
-        scene_list = [None]
 
     if "relpath" in row:
         save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"]))
@@ -61,7 +66,7 @@ def process_single_row(row, args):
         shorter_size=shorter_size,
         logger=logger,
     )
-
+    return True
 
 def split_video(
     video_path,
@@ -108,7 +113,10 @@ def split_video(
         fname_wo_ext = os.path.splitext(fname)[0]
         # TODO: fname pattern
         save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
-
+        if os.path.exists(save_path):
+            # print_log(f"File '{save_path}' already exists. Skip.", logger=logger)
+            continue
+        
         # ffmpeg cmd
         cmd = [FFMPEG_PATH]
 
@@ -134,7 +142,7 @@ def split_video(
             # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"]
 
         cmd += ["-map", "0:v", save_path]
-
+        # print(cmd)
         proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
         stdout, stderr = proc.communicate()
         # stdout = stdout.decode("utf-8")
@@ -163,7 +171,7 @@ def parse_args():
     )
     parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
     parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")
-
+    parser.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps")
     args = parser.parse_args()
     return args
 
@@ -175,7 +183,7 @@ def main():
         print(f"Meta file '{meta_path}' not found. Exit.")
         exit()
 
-    # create logger
+    # create save_dir
     os.makedirs(args.save_dir, exist_ok=True)
 
     # initialize pandarallel
@@ -189,10 +197,13 @@ def main():
     # process
     meta = pd.read_csv(args.meta_path)
     if not args.disable_parallel:
-        meta.parallel_apply(process_single_row_partial, axis=1)
+        results = meta.parallel_apply(process_single_row_partial, axis=1)
     else:
-        meta.apply(process_single_row_partial, axis=1)
-
-
+        results = meta.apply(process_single_row_partial, axis=1)
+    if args.drop_invalid_timestamps:
+        meta = meta[results]
+        assert args.meta_path.endswith("timestamp.csv"), "Only support *timestamp.csv"
+        meta.to_csv(args.meta_path.replace("timestamp.csv", "correct_timestamp.csv"), index=False)
+        print(f"Corrected timestamp file saved to '{args.meta_path.replace('timestamp.csv', 'correct_timestamp.csv')}'")
 if __name__ == "__main__":
     main()

From 4b2b47b34d310309f6b328fb0004321bbacc0d25 Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Wed, 26 Jun 2024 07:00:24 +0000
Subject: [PATCH 22/34] [fix] pixart sampling

---
 configs/pixart/inference/1x2048MS.py   |  1 +
 opensora/datasets/aspect.py            |  5 ++++-
 opensora/models/pixart/pixart.py       |  2 +-
 opensora/models/vae/vae.py             | 17 ++++++++++++-----
 opensora/schedulers/dpms/dpm_solver.py |  2 +-
 scripts/inference.py                   |  1 +
 6 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/configs/pixart/inference/1x2048MS.py b/configs/pixart/inference/1x2048MS.py
index 0f48824..23f26ff 100644
--- a/configs/pixart/inference/1x2048MS.py
+++ b/configs/pixart/inference/1x2048MS.py
@@ -16,6 +16,7 @@ vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
     subfolder="vae",
+    scaling_factor=0.13025,
 )
 text_encoder = dict(
     type="t5",
diff --git a/opensora/datasets/aspect.py b/opensora/datasets/aspect.py
index 011ad40..f6defb7 100644
--- a/opensora/datasets/aspect.py
+++ b/opensora/datasets/aspect.py
@@ -465,7 +465,10 @@ def get_num_pixels(name):
 
 
 def get_image_size(resolution, ar_ratio):
-    ar_key = ASPECT_RATIO_MAP[ar_ratio]
+    if ar_ratio in ASPECT_RATIO_MAP:
+        ar_key = ASPECT_RATIO_MAP[ar_ratio]
+    else:
+        ar_key = ar_ratio
     rs_dict = ASPECT_RATIOS[resolution][1]
     assert ar_key in rs_dict, f"Aspect ratio {ar_ratio} not found for resolution {resolution}"
     return rs_dict[ar_key]
diff --git a/opensora/models/pixart/pixart.py b/opensora/models/pixart/pixart.py
index 9544fcb..d99c572 100644
--- a/opensora/models/pixart/pixart.py
+++ b/opensora/models/pixart/pixart.py
@@ -197,7 +197,7 @@ class PixArt(nn.Module):
             if freeze == "text":
                 self.freeze_text()
 
-    def forward(self, x, timestep, y, mask=None):
+    def forward(self, x, timestep, y, mask=None, **kwargs):
         """
         Forward pass of PixArt.
         x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py
index 9802b02..3e85bf5 100644
--- a/opensora/models/vae/vae.py
+++ b/opensora/models/vae/vae.py
@@ -13,7 +13,13 @@ from opensora.utils.ckpt_utils import load_checkpoint
 @MODELS.register_module()
 class VideoAutoencoderKL(nn.Module):
     def __init__(
-        self, from_pretrained=None, micro_batch_size=None, cache_dir=None, local_files_only=False, subfolder=None
+        self,
+        from_pretrained=None,
+        micro_batch_size=None,
+        cache_dir=None,
+        local_files_only=False,
+        subfolder=None,
+        scaling_factor=0.18215,
     ):
         super().__init__()
         self.module = AutoencoderKL.from_pretrained(
@@ -25,6 +31,7 @@ class VideoAutoencoderKL(nn.Module):
         self.out_channels = self.module.config.latent_channels
         self.patch_size = (1, 8, 8)
         self.micro_batch_size = micro_batch_size
+        self.scaling_factor = scaling_factor
 
     def encode(self, x):
         # x: (B, C, T, H, W)
@@ -32,14 +39,14 @@ class VideoAutoencoderKL(nn.Module):
         x = rearrange(x, "B C T H W -> (B T) C H W")
 
         if self.micro_batch_size is None:
-            x = self.module.encode(x).latent_dist.sample().mul_(0.18215)
+            x = self.module.encode(x).latent_dist.sample().mul_(self.scaling_factor)
         else:
             # NOTE: cannot be used for training
             bs = self.micro_batch_size
             x_out = []
             for i in range(0, x.shape[0], bs):
                 x_bs = x[i : i + bs]
-                x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(0.18215)
+                x_bs = self.module.encode(x_bs).latent_dist.sample().mul_(self.scaling_factor)
                 x_out.append(x_bs)
             x = torch.cat(x_out, dim=0)
         x = rearrange(x, "(B T) C H W -> B C T H W", B=B)
@@ -50,14 +57,14 @@ class VideoAutoencoderKL(nn.Module):
         B = x.shape[0]
         x = rearrange(x, "B C T H W -> (B T) C H W")
         if self.micro_batch_size is None:
-            x = self.module.decode(x / 0.18215).sample
+            x = self.module.decode(x / self.scaling_factor).sample
         else:
             # NOTE: cannot be used for training
             bs = self.micro_batch_size
             x_out = []
             for i in range(0, x.shape[0], bs):
                 x_bs = x[i : i + bs]
-                x_bs = self.module.decode(x_bs / 0.18215).sample
+                x_bs = self.module.decode(x_bs / self.scaling_factor).sample
                 x_out.append(x_bs)
             x = torch.cat(x_out, dim=0)
         x = rearrange(x, "(B T) C H W -> B C T H W", B=B)
diff --git a/opensora/schedulers/dpms/dpm_solver.py b/opensora/schedulers/dpms/dpm_solver.py
index d422a0a..2eddfbd 100644
--- a/opensora/schedulers/dpms/dpm_solver.py
+++ b/opensora/schedulers/dpms/dpm_solver.py
@@ -1419,7 +1419,7 @@ class DPM_Solver:
                 for step in progress_fn(range(order, steps + 1)):
                     t = timesteps[step]
                     # We only use lower order for steps < 10
-                    if lower_order_final and steps < 10:
+                    if lower_order_final:  # recommended by Shuchen Xue
                         step_order = min(order, steps + 1 - step)
                     else:
                         step_order = order
diff --git a/scripts/inference.py b/scripts/inference.py
index 5095fd2..c4578a7 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -260,6 +260,7 @@ def main():
                     )
 
                 # == sampling ==
+                torch.manual_seed(1024)
                 z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
                 masks = apply_mask_strategy(z, refs, ms, loop_i, align=align)
                 samples = scheduler.sample(

From ecebf888657cd1e7eb14acbf3d9fe41fa34964df Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Wed, 26 Jun 2024 07:45:15 +0000
Subject: [PATCH 23/34] update readme

---
 configs/opensora-v1-2/train/demo_360p.py | 2 +-
 docs/report_03.md                        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/opensora-v1-2/train/demo_360p.py b/configs/opensora-v1-2/train/demo_360p.py
index e27bd3c..f49a00e 100644
--- a/configs/opensora-v1-2/train/demo_360p.py
+++ b/configs/opensora-v1-2/train/demo_360p.py
@@ -5,7 +5,7 @@ dataset = dict(
 )
 
 # webvid
-bucket_config = {"360p": {102: (1.0, 5)}}
+bucket_config = {"360p": {102: (1.0, 1)}}
 grad_checkpoint = True
 
 # Acceleration settings
diff --git a/docs/report_03.md b/docs/report_03.md
index d6012e4..88baf3a 100644
--- a/docs/report_03.md
+++ b/docs/report_03.md
@@ -7,7 +7,7 @@
 - [Evaluation](#evaluation)
 - [Sequence parallelism](#sequence-parallelism)
 
-In Open-Sora 1.2 release, we train a 1.1B models on >30M data (~80k hours), with training cost 35k H100 GPU hours, supporting 0s~16s, 144p to 720p, various aspect ratios video generation. Our configurations is listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension.
+In the Open-Sora 1.2 release, we train a 1.1B model on >30M data (about 80k hours), with a training cost of 35k H100 GPU hours, supporting 0s to 16s, 144p to 720p video generation at various aspect ratios. Our configurations are listed below. Following our 1.1 version, Open-Sora 1.2 can also do image-to-video generation and video extension.
 
 |      | image | 2s  | 4s  | 8s  | 16s |
 | ---- | ----- | --- | --- | --- | --- |

From ab3a74797a30febda43a2a5263f6f8ceaecf755d Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Thu, 27 Jun 2024 07:08:02 +0000
Subject: [PATCH 24/34] formate

---
 assets/texts/rand_types.txt | 40 +++++++++++++++++++++++++++++++++++++
 eval/sample.sh              |  5 +++++
 2 files changed, 45 insertions(+)
 create mode 100644 assets/texts/rand_types.txt

diff --git a/assets/texts/rand_types.txt b/assets/texts/rand_types.txt
new file mode 100644
index 0000000..bd4b5d8
--- /dev/null
+++ b/assets/texts/rand_types.txt
@@ -0,0 +1,40 @@
+随机电影镜头
+随机电影镜头
+随机电影镜头
+随机电影镜头
+随机电影镜头
+随机任务镜头
+随机任务镜头
+随机任务镜头
+随机任务镜头
+随机任务镜头
+随机游戏镜头
+随机游戏镜头
+随机游戏镜头
+随机游戏镜头
+随机游戏镜头
+随机开车镜头
+随机开车镜头
+随机开车镜头
+随机开车镜头
+随机开车镜头
+随机动物镜头
+随机动物镜头
+随机动物镜头
+随机动物镜头
+随机动物镜头
+随机森林镜头
+随机森林镜头
+随机森林镜头
+随机森林镜头
+随机森林镜头
+随机动漫镜头
+随机动漫镜头
+随机动漫镜头
+随机动漫镜头
+随机动漫镜头
+随机舞蹈镜头
+随机舞蹈镜头
+随机舞蹈镜头
+随机舞蹈镜头
+随机舞蹈镜头
diff --git a/eval/sample.sh b/eval/sample.sh
index 83218bd..af59080 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -13,6 +13,11 @@ NUM_SAMPLING_STEPS=$9
 FLOW=${10}
 LLM_REFINE=${11}
 
+BASE_VID_RES=480p
+BASE_IMG_RES=240p
+
+
+
 echo "NUM_FRAMES=${NUM_FRAMES}"
 
 if [ -z "${NUM_FRAMES}" ]; then

From 671d936c969fc0aad3d0639ba4074564c14666b9 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Thu, 27 Jun 2024 09:23:11 +0000
Subject: [PATCH 25/34] fix argument passing and task type position

---
 eval/sample.sh            |  2 +-
 eval/vbench/launch.sh     | 10 +++++-----
 eval/vbench_i2v/launch.sh | 10 +++++-----
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/eval/sample.sh b/eval/sample.sh
index 83218bd..7df2a9f 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -3,7 +3,7 @@
 CKPT=$1
 NUM_FRAMES=$2
 MODEL_NAME=$3
-
+TASK_TYPE=$4
 VBENCH_START_INDEX=$5
 VBENCH_END_INDEX=$6
 VBENCH_RES=$7
diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh
index e7c1165..df4d06d 100644
--- a/eval/vbench/launch.sh
+++ b/eval/vbench/launch.sh
@@ -37,21 +37,21 @@ END_INDEX_LIST=(120 240 360 480 600 720 840 2000)
 for i in "${!GPUS[@]}"; do
     if [ -z ${RES} ] || [ -z ${ASP_RATIO} ]  ;
         then
-            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
         else
             if [ -z ${NUM_SAMPLING_STEPS} ];
                 then
-                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                 else
                     if [ -z ${FLOW} ];
                     then
-                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                     else
                         if [ -z ${LLM_REFINE} ];
                             then
-                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                             else
-                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -4 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                         fi
                     fi
             fi
diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh
index 2b03309..193c581 100644
--- a/eval/vbench_i2v/launch.sh
+++ b/eval/vbench_i2v/launch.sh
@@ -28,21 +28,21 @@ END_INDEX_LIST=(140 280 420 560 700 840 980 2000)
 for i in "${!GPUS[@]}"; do
     if [ -z ${RES} ] || [ -z ${ASP_RATIO} ]  ;
         then
-            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+            CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT $NUM_FRAMES $MODEL_NAME -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
         else
             if [ -z ${NUM_SAMPLING_STEPS} ];
                 then
-                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                    CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                 else
                     if [ -z ${FLOW} ];
                     then
-                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                        CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                     else
                         if [ -z ${LLM_REFINE} ];
                             then
-                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                             else
-                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE}>${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
+                                CUDA_VISIBLE_DEVICES=${GPUS[i]} bash eval/sample.sh $CKPT ${NUM_FRAMES} ${MODEL_NAME} -5 ${START_INDEX_LIST[i]} ${END_INDEX_LIST[i]} ${RES} ${ASP_RATIO} ${NUM_SAMPLING_STEPS} ${FLOW} ${LLM_REFINE} > ${LOG_BASE}/${TASK_ID_LIST[i]}.log 2>&1 &
                         fi
                     fi
             fi

From c4dcc975582516e2f1b0d6c5a77df6ebad9aa4d4 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Thu, 27 Jun 2024 09:58:45 +0000
Subject: [PATCH 26/34] enable res level for vid only

---
 eval/sample.sh | 85 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 31 deletions(-)

diff --git a/eval/sample.sh b/eval/sample.sh
index af59080..5c170c1 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -13,10 +13,33 @@ NUM_SAMPLING_STEPS=$9
 FLOW=${10}
 LLM_REFINE=${11}
 
-BASE_VID_RES=480p
-BASE_IMG_RES=240p
-
-
+BASE_ASPECT_RATIO=360p
+ASPECT_RATIOS=(144p 240p 360p 480p 720p 1080p)
+# Loop through the list of aspect ratios
+i=0
+for r in "${ASPECT_RATIOS[@]}"; do
+  if [[ "$r" == "$BASE_ASPECT_RATIO" ]]; then
+    # get aspect ratio 1 level up
+    if [[ $((i+1)) -lt ${#ASPECT_RATIOS[@]} ]]; then
+      ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[$((i+1))]}
+    else
+      # The base is already the last entry in the list, so stay at the highest level
+      ASPECT_RATIO_INCR_1=${ASPECT_RATIOS[-1]}
+    fi
+    # get aspect ratio 2 levels up
+    if [[ $((i+2)) -lt ${#ASPECT_RATIOS[@]} ]]; then
+      ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[$((i+2))]}
+    else
+      # Fewer than two levels exist above the base, so clamp to the highest level
+      ASPECT_RATIO_INCR_2=${ASPECT_RATIOS[-1]}
+    fi
+  fi
+  i=$((i+1))
+done
+echo "base aspect ratio: ${BASE_ASPECT_RATIO}"
+echo "aspect ratio 1 level up: ${ASPECT_RATIO_INCR_1}"
+echo "aspect ratio 2 levels up: ${ASPECT_RATIO_INCR_2}"
+echo "Note that this aspect ratio level setting is used for videos only, not images"
 
 echo "NUM_FRAMES=${NUM_FRAMES}"
 
@@ -98,13 +121,13 @@ function run_video_a() { # ~ 30min ?
   # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name sample_2s_720p_9_16 --batch-size $DEFAULT_BS
 
   # sample, 720p, 9:16, 2s
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sample_4s_720p --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sample_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
 
   # sample, 480p, 9:16, 8s
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sample_8s_480p --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sample_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS
 
-  # sample, 240p, 9:16, 16s
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sample_16s_360p --batch-size $DEFAULT_BS
+  # sample, 360p, 9:16, 16s
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_samples.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sample_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
 }
 
 function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
@@ -121,10 +144,10 @@ function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
   # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 240p --aspect-ratio 9:16 --sample-name short_8s_240p_9_16 --batch-size $DEFAULT_BS
 
   # short, 480p, 9:16, 8s: ~24min
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name short_8s_480p --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name short_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS
 
   # short, 240p, 9:16, 16s: ~24min
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name short_16s_360p --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name short_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
 
 }
 
@@ -138,10 +161,10 @@ function run_video_c() {
   # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 240p --aspect-ratio 9:16 --sample-name sora_16s_240p_9_16 --batch-size $DEFAULT_BS
 
   # short, 720p, 9:16, 2s: ~9min
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name short_4s_720p --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name short_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
 
-  # sora, 240p, 9:16, 16s: ~40min
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution 360p --aspect-ratio 9:16 --sample-name sora_16s_360p --batch-size $DEFAULT_BS
+  # sora, 360p, 9:16, 16s: ~40min
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name sora_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
 }
 
 function run_video_d() {
@@ -152,17 +175,17 @@ function run_video_d() {
   # eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p_9_16 --batch-size $DEFAULT_BS --start-index 0 --end-index 16
 
   # sora, 480p, 9:16, 8s, 1/3 # moved from run_video_e, 30min
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 0 --end-index 16
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 0 --end-index 16
 }
 
 function run_video_e() { # 90min * 2/3 = 60min
   # sora, 480p, 9:16, 8s, 2/3
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution 480p --aspect-ratio 9:16 --sample-name sora_8s_480p --batch-size $DEFAULT_BS --start-index 16 --end-index 100
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name sora_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS --start-index 16 --end-index 100
 }
 
 function run_video_f() { # 60min
   # sora, 720p, 9:16, 2s
-  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution 720p --aspect-ratio 9:16 --sample-name sora_4s_720p --batch-size $DEFAULT_BS
+  eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_sora.txt --save-dir $OUTPUT --num-frames 4s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name sora_4s_${ASPECT_RATIO_INCR_2} --batch-size $DEFAULT_BS
 }
 
 # --resolution 720p --aspect-ratio [16:9, 9:16, ...]
@@ -171,22 +194,22 @@ function run_video_g() { # 15min
   # 720p, 2s multi-resolution
   # 1:1
   PROMPT="A soaring drone footage captures the majestic beauty of a coastal cliff, its red and yellow stratified rock faces rich in color and against the vibrant turquoise of the sea. Seabirds can be seen taking flight around the cliff's precipices. As the drone slowly moves from different angles, the changing sunlight casts shifting shadows that highlight the rugged textures of the cliff and the surrounding calm sea. The water gently laps at the rock base and the greenery that clings to the top of the cliff, and the scene gives a sense of peaceful isolation at the fringes of the ocean. The video captures the essence of pristine natural beauty untouched by human structures."
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:1 --sample-name drone_cliff_prompt_720p_2s_1_1
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_1
   # 16:9
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 16:9 --sample-name drone_cliff_prompt_720p_2s_16_9
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 16:9 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_16_9
   # 9:16
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 9:16 --sample-name drone_cliff_prompt_720p_2s_9_16
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 9:16 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_9_16
   # 4:3
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 4:3 --sample-name drone_cliff_prompt_720p_2s_4_3
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 4:3 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_4_3
   # 3:4
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 3:4 --sample-name drone_cliff_prompt_720p_2s_3_4
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 3:4 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_3_4
   # 1:2
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 1:2 --sample-name drone_cliff_prompt_720p_2s_1_2
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 1:2 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_1_2
   # 2:1
-  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution 720p --aspect-ratio 2:1 --sample-name drone_cliff_prompt_720p_2s_2_1
+  eval $CMD --ckpt-path $CKPT --prompt \"$PROMPT\" --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --aspect-ratio 2:1 --sample-name drone_cliff_prompt_${ASPECT_RATIO_INCR_2}_2s_2_1
 
   # add motion score
-  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name motion_2s_720p --prompt \
+  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name motion_2s_${ASPECT_RATIO_INCR_2} --prompt \
     \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. motion score: 0.0\" \
     \"A stylish woman walking in the street of Tokyo. motion score: 2.0\" \
     \"A stylish woman walking in the street of Tokyo. motion score: 4.0\" \
@@ -197,7 +220,7 @@ function run_video_g() { # 15min
     \"A stylish woman walking in the street of Tokyo. motion score: 100.0\"
 
   # add aes score
-  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution 720p --sample-name aes_2s_720p --prompt \
+  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --num-frames 2s --resolution ${ASPECT_RATIO_INCR_2} --sample-name aes_2s_${ASPECT_RATIO_INCR_2} --prompt \
     \"A stylish woman walking in the street of Tokyo.\" \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.0\" \
     \"A stylish woman walking in the street of Tokyo. aesthetic score: 4.5\" \
     \"A stylish woman walking in the street of Tokyo. aesthetic score: 5.0\" \
@@ -211,24 +234,24 @@ function run_video_g() { # 15min
 
 function run_video_h() { # 61min
   # 3.1 image-conditioned long video generation
-  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_360p_9_16 \
+  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C5_2s_${BASE_ASPECT_RATIO}_9_16 \
     --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
-    --num-frames 2s --resolution 360p --aspect-ratio 9:16 \
+    --num-frames 2s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
     --loop 5 --condition-frame-length 5 \
     --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
     --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
 
-  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_360p_9_16 \
+  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L5C10_16s_${BASE_ASPECT_RATIO}_9_16 \
     --prompt-path assets/texts/t2v_ref.txt --start-index 0 --end-index 3 \
-    --num-frames 16s --resolution 360p --aspect-ratio 9:16 \
+    --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
     --loop 5 --condition-frame-length 10 \
     --reference-path assets/images/condition/cliff.png assets/images/condition/wave.png assets/images/condition/ship.png \
     --mask-strategy "0" "0" "0" --batch-size $DEFAULT_BS
 
   # 3.2
-  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_240p_9_16 \
+  eval $CMD --ckpt-path $CKPT --save-dir $OUTPUT --sample-name ref_L1_16s_${BASE_ASPECT_RATIO}_9_16 \
     --prompt-path assets/texts/t2v_ref.txt --start-index 3 --end-index 6 \
-    --num-frames 16s --resolution 360p --aspect-ratio 9:16 \
+    --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 \
     --loop 1 \
     --reference-path assets/images/condition/cliff.png "assets/images/condition/cactus-sad.png\;assets/images/condition/cactus-happy.png" https://cdn.openai.com/tmp/s/interp/d0.mp4 \
     --mask-strategy "0" "0\;0,1,0,-1,1" "0,0,0,0,${QUAD_FRAMES},0.5" --batch-size $DEFAULT_BS

From 20acdd811ed6f2aee299cde5b89c50ea548a463d Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Thu, 27 Jun 2024 10:01:39 +0000
Subject: [PATCH 27/34] fix typo

---
 eval/sample.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eval/sample.sh b/eval/sample.sh
index 5c170c1..a715189 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -146,7 +146,7 @@ function run_video_b() { # 18min + 14min = 32min, short 16x240p & 64x240p
   # short, 480p, 9:16, 8s: ~24min
   eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 8s --resolution ${ASPECT_RATIO_INCR_1} --aspect-ratio 9:16 --sample-name short_8s_${ASPECT_RATIO_INCR_1} --batch-size $DEFAULT_BS
 
-  # short, 240p, 9:16, 16s: ~24min
+  # short, 360p, 9:16, 16s: ~24min
   eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2v_short.txt --save-dir $OUTPUT --num-frames 16s --resolution ${BASE_ASPECT_RATIO} --aspect-ratio 9:16 --sample-name short_16s_${BASE_ASPECT_RATIO} --batch-size $DEFAULT_BS
 
 }

From fdcd22257c2d8fa64119642e89a113b02ad8a763 Mon Sep 17 00:00:00 2001
From: Tom Young <tomyoung903@gmail.com>
Date: Fri, 28 Jun 2024 08:30:29 +0000
Subject: [PATCH 28/34] small improvements

---
 tools/caption/pllava_dir/caption_pllava.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/caption/pllava_dir/caption_pllava.py b/tools/caption/pllava_dir/caption_pllava.py
index 2213367..34fd842 100644
--- a/tools/caption/pllava_dir/caption_pllava.py
+++ b/tools/caption/pllava_dir/caption_pllava.py
@@ -130,7 +130,10 @@ class CSVDataset(Dataset):
     def __getitem__(self, idx):
         if idx < 0 or idx >= len(self.data_list):
             raise IndexError
-        video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
+        try:
+            video = load_video(self.data_list[idx], self.num_frames, resolution=RESOLUTION)
+        except:
+            return None
         return video
 
     def set_rank_and_world_size(self, rank, world_size):
@@ -191,7 +194,7 @@ def parse_args():
         "--error_message",
         type=str,
         required=False,
-        default=None,
+        default='error occured during captioning',
     )
     args = parser.parse_args()
     return args
@@ -235,6 +238,9 @@ def infer(
     conv_mode,
     print_res=True,
 ):
+    # check if any video in video_list is None, if so, raise an exception
+    if any([video is None for video in video_list]):
+        raise Exception("Video not loaded properly")
     conv = conv_template.copy()
     conv.user_query("Describe the video in details.", is_mm=True)
 

From a2bc2fde96ff0ca4a9404a228553b087244b4993 Mon Sep 17 00:00:00 2001
From: Tom Young <tomyoung903@gmail.com>
Date: Sat, 29 Jun 2024 06:03:48 +0000
Subject: [PATCH 29/34] added notification

---
 tools/caption/pllava_dir/caption_pllava.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/caption/pllava_dir/caption_pllava.py b/tools/caption/pllava_dir/caption_pllava.py
index 34fd842..da99523 100644
--- a/tools/caption/pllava_dir/caption_pllava.py
+++ b/tools/caption/pllava_dir/caption_pllava.py
@@ -375,7 +375,7 @@ def main():
     # write the dataframe to a new csv file called '*_pllava_13b_caption.csv'
     new_csv_path = args.csv_path.replace(".csv", "_text.csv")
     df.to_csv(new_csv_path, index=False)
-
+    print(f"Results saved to {new_csv_path}")
 
 if __name__ == "__main__":
     main()

From 7ec46c47320cf6dea813ff9df09832079f815cf4 Mon Sep 17 00:00:00 2001
From: Tom Young <tomyoung903@gmail.com>
Date: Sun, 30 Jun 2024 03:01:59 +0000
Subject: [PATCH 30/34] small fix

---
 tools/caption/pllava_dir/caption_pllava.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/caption/pllava_dir/caption_pllava.py b/tools/caption/pllava_dir/caption_pllava.py
index da99523..f220f5a 100644
--- a/tools/caption/pllava_dir/caption_pllava.py
+++ b/tools/caption/pllava_dir/caption_pllava.py
@@ -236,7 +236,7 @@ def infer(
     processor,
     video_list,
     conv_mode,
-    print_res=True,
+    print_res=False,
 ):
     # check if any video in video_list is None, if so, raise an exception
     if any([video is None for video in video_list]):
@@ -314,7 +314,8 @@ def run(rank, args, world_size, output_queue):
             )
         except Exception as e:
             logger.error(f"error in {batch}: {str(e)}")
-            preds = args.error_message
+            # preds = args.error_message duplicated for each video in the batch
+            preds = [args.error_message] * len(batch)
         result_list.extend(preds)
     output_queue.put((rank, result_list))
     return result_list

From 08c5222cb932aedd03b968ae3e964e8546ea8042 Mon Sep 17 00:00:00 2001
From: Tom Young <tomyoung903@gmail.com>
Date: Sun, 30 Jun 2024 09:35:59 +0000
Subject: [PATCH 31/34] added .m2ts support

---
 tools/datasets/convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/datasets/convert.py b/tools/datasets/convert.py
index 6253e8e..fad128f 100644
--- a/tools/datasets/convert.py
+++ b/tools/datasets/convert.py
@@ -6,7 +6,7 @@ import pandas as pd
 from torchvision.datasets import ImageNet
 
 IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
-VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
+VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv", ".m2ts")
 
 
 def scan_recursively(root):

From 2ac4900c814a0f2fd09372af71e56ce8c31c7063 Mon Sep 17 00:00:00 2001
From: pxy <pexure@gmail.com>
Date: Thu, 4 Jul 2024 03:14:08 +0000
Subject: [PATCH 32/34] update default shorter_size

---
 .gitignore                               |   1 +
 opensora/models/vae/video_sdxl/blocks.py | 724 +++++++++++++++++++++++
 tools/scene_cut/cut.py                   |   2 +-
 3 files changed, 726 insertions(+), 1 deletion(-)
 create mode 100644 opensora/models/vae/video_sdxl/blocks.py

diff --git a/.gitignore b/.gitignore
index 57b6f55..04b419a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -179,6 +179,7 @@ pretrained_models
 evaluation_results/
 cache/
 *.swp
+debug/
 
 # Secret files
 hostfile
diff --git a/opensora/models/vae/video_sdxl/blocks.py b/opensora/models/vae/video_sdxl/blocks.py
new file mode 100644
index 0000000..0ed4973
--- /dev/null
+++ b/opensora/models/vae/video_sdxl/blocks.py
@@ -0,0 +1,724 @@
+"""
+Adapted from SDXL VAE (https://huggingface.co/stabilityai/sdxl-vae/blob/main/config.json)
+All default values of kwargs are the same as SDXL
+"""
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diffusers.models.attention_processor import Attention
+from einops import rearrange
+
+
+def video_to_image(func):
+    def wrapper(self, x, *args, **kwargs):
+        if x.ndim == 5:
+            B = x.shape[0]
+            x = rearrange(x, 'B C T H W -> (B T) C H W')
+
+            if hasattr(self, 'micro_batch_size') and self.micro_batch_size is None:
+                x = func(self, x, *args, **kwargs)
+            else:
+                bs = self.micro_batch_size
+                x_out = []
+                for i in range(0, x.shape[0], bs):
+                    x_i = func(self, x[i:i + bs], *args, **kwargs)
+                    x_out.append(x_i)
+                x = torch.cat(x_out, dim=0)
+
+            x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
+        return x
+    return wrapper
+
+
+class VideoConv2d(nn.Conv2d):
+    def __init__(self, *args, micro_batch_size=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.micro_batch_size = micro_batch_size
+
+    @video_to_image
+    def forward(self, x):
+        return super().forward(x)
+
+
+class ResnetBlock2D(nn.Module):
+    """
+        Use nn.Conv2d
+        Default activation is nn.SiLU()
+        Make sure input tensor is of shape [B, C, T, H, W] or [B, C, H, W]
+        Support micro_batch_size
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        norm_groups: int = 32,
+        norm_eps: float = 1e-6,
+        micro_batch_size=None,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.micro_batch_size = micro_batch_size
+
+        conv_cls = nn.Conv2d
+        self.norm1 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True)
+        self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+        self.norm2 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True)
+        self.conv2 = conv_cls(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+        self.act = nn.SiLU()
+
+        self.use_in_shortcut = self.in_channels != out_channels
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = conv_cls(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+
+    @video_to_image
+    def forward(self, x):
+        res = self.norm1(x)
+        res = self.act(res)
+        res = self.conv1(res)
+
+        res = self.norm2(res)
+        res = self.act(res)
+        res = self.conv2(res)
+
+        if self.conv_shortcut is not None:
+            x = self.conv_shortcut(x)
+
+        out = x + res
+        return out
+
+
+class ResnetBlock3D(nn.Module):
+    """
+        Use nn.Conv3d
+        Default activation is nn.SiLU()
+        Make sure input tensor is of shape [B, C, T, H, W]
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: Optional[int] = None,
+        norm_groups: int = 32,
+        norm_eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+
+        conv_cls = nn.Conv3d
+        self.norm1 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=in_channels, eps=norm_eps, affine=True)
+        self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+        self.norm2 = torch.nn.GroupNorm(num_groups=norm_groups, num_channels=out_channels, eps=norm_eps, affine=True)
+        self.conv2 = conv_cls(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+
+        self.act = nn.SiLU()
+
+        self.use_in_shortcut = self.in_channels != out_channels
+
+        self.conv_shortcut = None
+        if self.use_in_shortcut:
+            self.conv_shortcut = conv_cls(
+                in_channels,
+                out_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            )
+        
+    def forward(self, x):
+        res = self.norm1(x)
+        res = self.act(res)
+        res = self.conv1(res)
+
+        res = self.norm2(res)
+        res = self.act(res)
+        res = self.conv2(res)
+
+        if self.conv_shortcut is not None:
+            x = self.conv_shortcut(x)
+
+        out = x + res
+        return out
+
+
+class SpatialDownsample2x(nn.Module):
+    """
+        Default downsample is Conv2d(stride=2)
+        Make sure input tensor is of shape [B, C, T, H, W]
+        Support micro_batch_size
+    """
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = True,
+        micro_batch_size=None,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.use_conv = use_conv
+        self.micro_batch_size = micro_batch_size
+
+        if use_conv:
+            self.downsample = nn.Conv2d(
+                self.channels, self.channels, kernel_size=3, stride=2, padding=0,
+            )
+        else:
+            self.downsample = nn.AvgPool2d(kernel_size=2, stride=2)
+
+    @video_to_image
+    def forward(self, x):
+        # implementation from SDXL
+        pad = (0, 1, 0, 1)
+        x = F.pad(x, pad, mode="constant", value=0)
+
+        x = self.downsample(x)
+        return x
+
+
+class SpatialUpsample2x(nn.Module):
+    """
+        Default upsample is F.interpolate(scale_factor=2) + Conv2d(stride=1)
+        Make sure input tensor is of shape [B, C, T, H, W]
+        Support micro_batch_size
+    """
+    def __init__(
+        self,
+        channels: int,
+        use_interpolate=True,
+        micro_batch_size=None,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.use_interpolate = use_interpolate
+        self.micro_batch_size = micro_batch_size
+
+        if use_interpolate:
+            self.conv = nn.Conv2d(self.channels, self.channels, kernel_size=3, padding=1)
+        else:
+            raise NotImplementedError
+            self.upsample = nn.ConvTranspose2d(channels, self.channels, kernel_size=4, stride=2, padding=1)
+    
+    def forward(self, x):
+        B = x.shape[0]
+        x = rearrange(x, 'B C T H W -> (B T) C H W')
+
+        if self.micro_batch_size is None:
+            x = self.forward_BCHW(x)
+        else:
+            bs = self.micro_batch_size
+            x_out = []
+            for i in range(0, x.shape[0], bs):
+                x_i = self.forward_BCHW(x[i:i + bs])
+                x_out.append(x_i)
+            x = torch.cat(x_out, dim=0)
+
+        x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
+        return x
+
+    def forward_BCHW(self, x):
+        if self.use_interpolate:
+            # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+            if x.shape[0] >= 64:
+                x = x.contiguous()
+
+            # interpolate tensor of bfloat16 is fixed in pytorch 2.1. see https://github.com/pytorch/pytorch/issues/86679
+            x = F.interpolate(x, scale_factor=2.0, mode="nearest")
+            x = self.conv(x)
+        else:
+            x = self.upsample(x)
+
+        return x
+
+
+class TemporalDownsample2x(nn.Module):
+    """
+        Stride-2 downsample along T for a [B, C, T, H, W] tensor; the channel count is unchanged
+        use_conv=True: Conv3d(kernel 3, stride (2, 1, 1), padding 1); use_conv=False: AvgPool3d over T only
+    """
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = True,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.use_conv = use_conv
+
+        if use_conv:
+            self.downsample = nn.Conv3d(
+                self.channels, self.channels, kernel_size=(3, 3, 3), stride=(2, 1, 1), padding=(1, 1, 1),
+           )
+        else:
+            self.downsample = nn.AvgPool3d(kernel_size=(3, 1, 1), stride=(2, 1, 1))  # no padding, unlike the conv path
+
+    def forward(self, x):
+        x = self.downsample(x)  # x is passed as-is; no reshaping or micro-batching is done here
+        return x
+
+
+class TemporalUpsample2x(nn.Module):
+    """
+        Default upsample is F.interpolate(scale_factor=(2, 1, 1)) + Conv3d(stride=1)
+        Make sure input tensor is of shape [B, C, T, H, W]
+        No micro-batching: the whole tensor goes through interpolate and the conv in a single call
+    """
+    def __init__(
+        self,
+        channels,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.conv = nn.Conv3d(channels, channels, kernel_size=3, padding=1)  # channels in == channels out
+
+    def forward(self, x):
+        if x.shape[0] >= 64:  # same contiguity guard that SpatialUpsample2x.forward_BCHW applies
+            x = x.contiguous()
+        x = F.interpolate(x, scale_factor=(2, 1, 1), mode="trilinear")  # scale 2 on T, 1 on H and W
+        x = self.conv(x)
+        return x
+
+
+class UNetMidBlock2D(nn.Module):
+    """
+        ResnetBlock2D, then num_layers x ([Attention if add_attention] + ResnetBlock2D), all at in_channels
+        Input is [B, C, T, H, W] (T is folded into the batch axis for the blocks) or [B, C, H, W]
+    """
+    def __init__(
+        self,
+        in_channels: int,
+        num_layers: int = 1,
+        norm_groups: int = 32,
+        norm_eps: float = 1e-6,
+        attn_groups: Optional[int] = None,
+        add_attention: bool = True,
+        attention_head_dim: int = 512,
+    ):
+        super().__init__()
+        self.add_attention = add_attention
+
+        if attn_groups is None:
+            attn_groups = norm_groups
+
+        if attention_head_dim is None:
+            attention_head_dim = in_channels
+
+        res_blocks = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                norm_eps=norm_eps,
+                norm_groups=norm_groups,
+            )
+        ]
+        attn_blocks = []
+        # attn_blocks stays empty when add_attention is False; res_blocks always has num_layers + 1 entries.
+        for _ in range(num_layers):
+            if self.add_attention:
+                attn_blocks.append(
+                    Attention(
+                        in_channels,
+                        heads=in_channels // attention_head_dim,
+                        dim_head=attention_head_dim,
+                        # rescale_output_factor=output_scale_factor,
+                        rescale_output_factor=1.0,
+                        eps=norm_eps,
+                        norm_num_groups=attn_groups,
+                        # spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
+                        spatial_norm_dim=None,
+                        residual_connection=True,
+                        bias=True,
+                        upcast_softmax=True,
+                        _from_deprecated_attn_block=True,
+                    )
+                )
+
+            res_blocks.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    norm_eps=norm_eps,
+                    norm_groups=norm_groups,
+                )
+            )
+
+        self.attn_blocks = nn.ModuleList(attn_blocks)
+        self.res_blocks = nn.ModuleList(res_blocks)
+
+    def forward(self, x):
+        has_T = x.ndim == 5
+        if has_T:
+            B = x.shape[0]
+            x = rearrange(x, 'B C T H W -> (B T) C H W')
+        # Drive the loop from res_blocks: zipping with an empty attn_blocks would skip every trailing res block.
+        x = self.res_blocks[0](x)
+        for idx, res_block in enumerate(self.res_blocks[1:]):
+            if self.add_attention:
+                x = self.attn_blocks[idx](x)
+            x = res_block(x)
+
+        if has_T:
+            x = rearrange(x, '(B T) C H W -> B C T H W', B=B)
+        return x
+
+
+class Encoder(nn.Module):
+    """
+        conv_in + 4 blocks (2 ResNet layers each) + mid_block + out_block; input is [B, C, T, H, W]
+        Blocks 1-3 end with SpatialDownsample2x, blocks 2-3 also with TemporalDownsample2x
+    """
+    def __init__(
+        self,
+        in_channels=3,
+        out_channels=4,
+        norm_groups=32,
+        norm_eps=1e-6,
+        double_z=True,
+        micro_batch_size=None,
+    ):
+        super().__init__()
+        in_channels_encoder = in_channels  # saved because in_channels/out_channels are reused per stage below
+        out_channels_encoder = out_channels
+        block_out_channels = [128, 256, 512, 512]
+
+        # conv_in
+        self.conv_in = VideoConv2d(
+            in_channels_encoder,
+            block_out_channels[0],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            micro_batch_size=micro_batch_size,
+        )
+
+        # blocks
+        blocks = []
+
+        # the first block: ResnetBlock2D
+        in_channels = block_out_channels[0]
+        out_channels = block_out_channels[0]
+        blocks.append(
+            nn.Sequential(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                    micro_batch_size=micro_batch_size,
+                ),
+                ResnetBlock2D(
+                    in_channels=out_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                    micro_batch_size=micro_batch_size,
+                ),
+                SpatialDownsample2x(
+                    channels=out_channels,
+                    use_conv=True,
+                    micro_batch_size=micro_batch_size, 
+                ),
+            )
+        )
+
+        # the second block: ResnetBlock2D
+        in_channels = block_out_channels[0]
+        out_channels = block_out_channels[1]
+        blocks.append(
+            nn.Sequential(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                    micro_batch_size=micro_batch_size,
+                ),
+                ResnetBlock2D(
+                    in_channels=out_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                    micro_batch_size=micro_batch_size,
+                ),
+                SpatialDownsample2x(
+                    channels=out_channels,
+                    use_conv=True,
+                    micro_batch_size=micro_batch_size, 
+                ),
+                TemporalDownsample2x(
+                    channels=out_channels,
+                    use_conv=True,
+                )
+            )
+        )
+
+        # the third block: ResnetBlock3D
+        in_channels = block_out_channels[1]
+        out_channels = block_out_channels[2]
+        blocks.append(
+            nn.Sequential(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                ),
+                ResnetBlock3D(
+                    in_channels=out_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                ),
+                SpatialDownsample2x(
+                    channels=out_channels,
+                    use_conv=True,  # NOTE(review): no micro_batch_size passed here, unlike blocks 1-2 - confirm intended
+                ),
+                TemporalDownsample2x(
+                    channels=out_channels,
+                    use_conv=True,
+                )
+            )
+        )
+
+        # the fourth block: ResnetBlock3D
+        in_channels = block_out_channels[2]
+        out_channels = block_out_channels[3]
+        blocks.append(
+            nn.Sequential(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                ),
+                ResnetBlock3D(
+                    in_channels=out_channels,
+                    out_channels=out_channels,
+                    norm_groups=norm_groups,
+                    norm_eps=norm_eps,
+                ),
+            )
+        )
+
+        self.blocks = nn.ModuleList(blocks)
+
+
+        # mid_block
+        in_channels = block_out_channels[-1]
+        self.mid_block = UNetMidBlock2D(
+            in_channels=in_channels,
+            num_layers=1,
+            norm_groups=norm_groups,
+            norm_eps=norm_eps,
+            add_attention=True,
+            attention_head_dim=in_channels,  # head dim == channel count, so the mid block computes heads = 1
+        )
+
+        # out_block
+        in_channels = block_out_channels[-1]
+        out_channels = 2 * out_channels_encoder if double_z else out_channels_encoder  # double_z doubles the output channels
+        self.out_block = nn.Sequential(
+            nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps),
+            nn.SiLU(),
+            nn.Conv3d(in_channels, out_channels, kernel_size=3, padding=1),
+        )
+    
+    def forward(self, x):
+        x = self.conv_in(x)
+
+        for block in self.blocks:  # the four nn.Sequential stages, applied in construction order
+            x = block(x)
+
+        x = self.mid_block(x)
+
+        x = self.out_block(x)
+        return x
+
+
+class Decoder(nn.Module):
+    """
+        conv_in + mid_block + 4 up blocks (3 ResNet layers each) + out_block; input is [B, C, T, H, W]
+        Up blocks 1-3 end with SpatialUpsample2x, up blocks 1-2 also with TemporalUpsample2x
+    """
+    def __init__(
+        self,
+        in_channels=4,
+        out_channels=3,
+        norm_groups=32,
+        norm_eps=1e-6,
+    ):
+        super().__init__()
+        in_channels_decoder = in_channels  # saved because in_channels/out_channels are reused per stage below
+        out_channels_decoder = out_channels
+        block_out_channels = [512, 512, 256, 128]
+
+        # conv_in
+        self.conv_in = nn.Conv3d(
+            in_channels_decoder,
+            block_out_channels[0],
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+        # mid_block
+        in_channels = block_out_channels[0]
+        self.mid_block = UNetMidBlock2D(
+            in_channels=in_channels,
+            num_layers=1,
+            norm_groups=norm_groups,
+            norm_eps=norm_eps,
+            add_attention=True,
+            attention_head_dim=in_channels,
+        )
+
+        # blocks
+        blocks = []
+        layer_per_block = 3
+
+        # the first up block: ResnetBlock3D
+        in_channels = block_out_channels[0]
+        out_channels = block_out_channels[0]
+        seq = [
+            ResnetBlock3D(
+                in_channels=in_channels if idx == 0 else out_channels,
+                out_channels=out_channels,
+                norm_groups=norm_groups,
+                norm_eps=norm_eps,
+            )
+            for idx in range(layer_per_block)
+        ] + [
+            SpatialUpsample2x(
+                channels=out_channels,
+                use_interpolate=True,
+            ),
+            TemporalUpsample2x(
+                channels=out_channels,
+            ),
+        ]
+        blocks.append(nn.Sequential(*seq))
+
+        # the second up block: ResnetBlock3D
+        in_channels = block_out_channels[0]
+        out_channels = block_out_channels[1]
+        seq = [
+            ResnetBlock3D(
+                in_channels=in_channels if idx == 0 else out_channels,
+                out_channels=out_channels,
+                norm_groups=norm_groups,
+                norm_eps=norm_eps,
+            )
+            for idx in range(layer_per_block)
+        ] + [
+            SpatialUpsample2x(
+                channels=out_channels,
+                use_interpolate=True,
+            ),
+            TemporalUpsample2x(
+                channels=out_channels,
+            ),
+        ]
+        blocks.append(nn.Sequential(*seq))
+
+        # the third up block: ResnetBlock3D
+        in_channels = block_out_channels[1]
+        out_channels = block_out_channels[2]
+        seq = [
+            ResnetBlock3D(
+                in_channels=in_channels if idx == 0 else out_channels,
+                out_channels=out_channels,
+                norm_groups=norm_groups,
+                norm_eps=norm_eps,
+            )
+            for idx in range(layer_per_block)
+        ] + [
+            SpatialUpsample2x(
+                channels=out_channels,
+                use_interpolate=True,
+            ),
+        ]
+        blocks.append(nn.Sequential(*seq))
+
+        # the fourth up block: ResnetBlock2D
+        in_channels = block_out_channels[2]
+        out_channels = block_out_channels[3]
+        seq = [
+            ResnetBlock2D(
+                in_channels=in_channels if idx == 0 else out_channels,
+                out_channels=out_channels,
+                norm_groups=norm_groups,
+                norm_eps=norm_eps,
+            )
+            for idx in range(layer_per_block)
+        ]
+        blocks.append(nn.Sequential(*seq))
+
+        self.blocks = nn.ModuleList(blocks)
+
+        # out_block
+        in_channels = block_out_channels[-1]
+        out_channels = out_channels_decoder
+        self.out_block = nn.Sequential(
+            nn.GroupNorm(num_channels=in_channels, num_groups=norm_groups, eps=norm_eps),
+            nn.SiLU(),
+            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),  # NOTE(review): Conv2d rejects 5D input; confirm the last up block returns 4D
+        )
+
+    def forward(self, x):
+        """
+            Decode a latent of shape [B, C, T, H, W]: conv_in -> mid_block -> up blocks -> out_block
+            No logging or memory profiling is done here; measure from the caller if needed
+        """
+        x = self.conv_in(x)
+
+        x = self.mid_block(x)
+
+        for block in self.blocks:
+            x = block(x)
+
+        x = self.out_block(x)
+        return x
+
+if __name__ == '__main__':
+    # Smoke test: build both halves in bf16 on the GPU, report parameter counts, run one encode/decode pass.
+    from opensora.utils.misc import count_params
+    device = 'cuda'
+    dtype = torch.bfloat16
+    encoder = Encoder(
+        in_channels=3,
+        out_channels=4,
+        double_z=False,
+        micro_batch_size=4,
+    ).to(device, dtype).eval()
+
+    decoder = Decoder(
+        in_channels=4,
+        out_channels=3,
+    ).to(device, dtype).eval()
+    num_params_enc = count_params(encoder)
+    num_params_dec = count_params(decoder)
+    print(f'Encoder #params: {num_params_enc}')
+    print(f'Decoder #params: {num_params_dec}')
+
+    # inference
+    x = torch.rand(1, 3, 51, 720, 1080).to(device, dtype)
+    with torch.inference_mode():
+        x_enc = encoder(x)
+        x_dec = decoder(x_enc)
+    print(torch.cuda.memory_allocated() /  1024 ** 3)
+    print(f'latent: {tuple(x_enc.shape)}, reconstruction: {tuple(x_dec.shape)}')
diff --git a/tools/scene_cut/cut.py b/tools/scene_cut/cut.py
index cf724a6..d6ccb08 100644
--- a/tools/scene_cut/cut.py
+++ b/tools/scene_cut/cut.py
@@ -159,7 +159,7 @@ def parse_args():
     )
     parser.add_argument("--target_fps", type=int, default=None, help="target fps of clips")
     parser.add_argument(
-        "--shorter_size", type=int, default=1080, help="resize the shorter size by keeping ratio; will not do upscale"
+        "--shorter_size", type=int, default=None, help="resize the shorter size by keeping ratio; will not do upscale"
     )
     parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel")
     parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing")

From 32b06c62d00627b4527d0ac24b2effb8cc340d19 Mon Sep 17 00:00:00 2001
From: Tom Young <44153440+tomyoung903@users.noreply.github.com>
Date: Thu, 4 Jul 2024 11:17:05 +0800
Subject: [PATCH 33/34] align pllava video loader with the one in get video
 info (#167)

---
 tools/caption/pllava_dir/caption_pllava.py | 58 +++++++++++++++++++---
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/tools/caption/pllava_dir/caption_pllava.py b/tools/caption/pllava_dir/caption_pllava.py
index f220f5a..ceb0721 100644
--- a/tools/caption/pllava_dir/caption_pllava.py
+++ b/tools/caption/pllava_dir/caption_pllava.py
@@ -1,3 +1,17 @@
+import os
+import sys
+from pathlib import Path
+
+# Resolve first: a relative __file__ does not have enough parents for parents[3] and would raise IndexError.
+current_file = Path(__file__).resolve()  # absolute path of the current file
+fourth_level_parent = current_file.parents[3]
+
+datasets_dir = os.path.join(fourth_level_parent, "opensora/datasets")
+# Import read_video as a top-level module from its own directory, then undo the sys.path change.
+sys.path.append(datasets_dir)
+from read_video import read_video_av
+sys.path.remove(datasets_dir)
+
 import itertools
 import logging
 import multiprocessing as mp
@@ -95,21 +109,49 @@ def get_index(num_frames, num_segments):
     return offsets
 
 
+# def load_video(video_path, num_frames, return_msg=False, resolution=336):
+#     transforms = torchvision.transforms.Resize(size=resolution)
+#     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+#     total_num_frames = len(vr)
+#     frame_indices = get_index(total_num_frames, num_frames)
+#     images_group = list()
+#     for frame_index in frame_indices:
+#         img = Image.fromarray(vr[frame_index].asnumpy())
+#         images_group.append(transforms(img))
+#     if return_msg:
+#         fps = float(vr.get_avg_fps())
+#         sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
+#         # " " should be added in the start and end
+#         msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
+#         return images_group, msg
+#     else:
+#         return images_group
+
+
 def load_video(video_path, num_frames, return_msg=False, resolution=336):
+    """
+    Decode `video_path` with read_video_av and return the sampled frames as PIL images.
+
+    The frames to keep are chosen by get_index(total_num_frames, num_frames); each one is
+    passed through torchvision Resize(size=resolution) before being added to the result list.
+    return_msg=True is not supported by this loader and raises NotImplementedError.
+    """
     transforms = torchvision.transforms.Resize(size=resolution)
-    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
-    total_num_frames = len(vr)
+    # aframes and info are returned by read_video_av but are not used below.
+    vframes, aframes, info = read_video_av(
+        video_path,
+        pts_unit="sec",
+        output_format="THWC"
+    )
+    total_num_frames = len(vframes)
     frame_indices = get_index(total_num_frames, num_frames)
     images_group = list()
     for frame_index in frame_indices:
-        img = Image.fromarray(vr[frame_index].asnumpy())
+        img = Image.fromarray(vframes[frame_index].numpy())
         images_group.append(transforms(img))
     if return_msg:
-        fps = float(vr.get_avg_fps())
-        sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices])
-        # " " should be added in the start and end
-        msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds."
-        return images_group, msg
+        # The old message was built from the previous reader's average fps; that is not ported to this loader.
+        raise NotImplementedError("return_msg not implemented yet")
     else:
         return images_group
 

From 06f7eb93f0a7260b3c89e79b41bbe5909ff4edb5 Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Sun, 7 Jul 2024 05:34:08 +0000
Subject: [PATCH 34/34] [fix] wandb log only if record time

---
 scripts/train.py | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/scripts/train.py b/scripts/train.py
index 1066977..110f2f8 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -347,24 +347,27 @@ def main():
                     tb_writer.add_scalar("loss", loss.item(), global_step)
                     # wandb
                     if cfg.get("wandb", False):
-                        wandb.log(
-                            {
-                                "iter": global_step,
-                                "acc_step": acc_step,
-                                "epoch": epoch,
-                                "loss": loss.item(),
-                                "avg_loss": avg_loss,
-                                "lr": optimizer.param_groups[0]["lr"],
-                                "debug/move_data_time": move_data_t.elapsed_time,
-                                "debug/encode_time": encode_t.elapsed_time,
-                                "debug/mask_time": mask_t.elapsed_time,
-                                "debug/diffusion_time": loss_t.elapsed_time,
-                                "debug/backward_time": backward_t.elapsed_time,
-                                "debug/update_ema_time": ema_t.elapsed_time,
-                                "debug/reduce_loss_time": reduce_loss_t.elapsed_time,
-                            },
-                            step=global_step,
-                        )
+                        wandb_dict = {
+                            "iter": global_step,
+                            "acc_step": acc_step,
+                            "epoch": epoch,
+                            "loss": loss.item(),
+                            "avg_loss": avg_loss,
+                            "lr": optimizer.param_groups[0]["lr"],
+                        }
+                        if record_time:
+                            wandb_dict.update(
+                                {
+                                    "debug/move_data_time": move_data_t.elapsed_time,
+                                    "debug/encode_time": encode_t.elapsed_time,
+                                    "debug/mask_time": mask_t.elapsed_time,
+                                    "debug/diffusion_time": loss_t.elapsed_time,
+                                    "debug/backward_time": backward_t.elapsed_time,
+                                    "debug/update_ema_time": ema_t.elapsed_time,
+                                    "debug/reduce_loss_time": reduce_loss_t.elapsed_time,
+                                }
+                            )
+                        wandb.log(wandb_dict, step=global_step)
 
                     running_loss = 0.0
                     log_step = 0