From 8f239c87bf12564defc5e37cf611fa00c1976443 Mon Sep 17 00:00:00 2001 From: HangXu Date: Thu, 20 Jun 2024 11:48:42 +0300 Subject: [PATCH 1/8] Added causal mask in Attention forward pass --- opensora/models/layers/blocks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py index 8bc7e72..73a0162 100644 --- a/opensora/models/layers/blocks.py +++ b/opensora/models/layers/blocks.py @@ -163,6 +163,8 @@ class Attention(nn.Module): if rope is not None: self.rope = True self.rotary_emb = rope + + self.is_causal = False def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -198,12 +200,17 @@ class Attention(nn.Module): v, dropout_p=self.attn_drop.p if self.training else 0.0, softmax_scale=self.scale, + causal=self.is_causal, ) else: dtype = q.dtype q = q * self.scale attn = q @ k.transpose(-2, -1) # translate attn to float32 attn = attn.to(torch.float32) + if self.is_causal: + causal_mask = torch.tril(torch.ones_like(attn), diagonal=0) + causal_mask = torch.where(causal_mask.bool(), 0, float('-inf')) + attn += causal_mask attn = attn.softmax(dim=-1) attn = attn.to(dtype) # cast back attn to original dtype attn = self.attn_drop(attn) From 6b42f4aa95c4137e3b6dbf18862a4781da469bbd Mon Sep 17 00:00:00 2001 From: Tom Young Date: Fri, 21 Jun 2024 01:29:21 +0000 Subject: [PATCH 2/8] small update on readme --- tools/caption/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/caption/README.md b/tools/caption/README.md index f6fe0c8..8f7dfed 100644 --- a/tools/caption/README.md +++ b/tools/caption/README.md @@ -4,7 +4,7 @@ Human labeling of videos is expensive and time-consuming. We adopt powerful imag ## PLLaVA Captioning -To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. 
+To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offloading frame extraction to a separate process such that the GPU computations and frame extraction happen in parallel. ### Installation Install the required dependencies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "PLLaVA Captioning" sections. From 9a9a6c2f3e571136978818dcd561dac915ce7157 Mon Sep 17 00:00:00 2001 From: zhengzangw Date: Sat, 22 Jun 2024 15:54:27 +0000 Subject: [PATCH 3/8] [fix] better support local ckpt --- opensora/models/stdit/stdit3.py | 5 +++-- opensora/models/vae/vae.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py index 8703b2d..bd9672d 100644 --- a/opensora/models/stdit/stdit3.py +++ b/opensora/models/stdit/stdit3.py @@ -448,7 +448,7 @@ class STDiT3(PreTrainedModel): @MODELS.register_module("STDiT3-XL/2") def STDiT3_XL_2(from_pretrained=None, **kwargs): force_huggingface = kwargs.pop("force_huggingface", False) - if force_huggingface or from_pretrained is not None and not os.path.isdir(from_pretrained): + if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained): model = STDiT3.from_pretrained(from_pretrained, **kwargs) else: config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) @@ -460,7 +460,8 @@ def STDiT3_XL_2(from_pretrained=None, **kwargs): @MODELS.register_module("STDiT3-3B/2") def STDiT3_3B_2(from_pretrained=None, **kwargs): - if from_pretrained is not None and not os.path.isdir(from_pretrained): + force_huggingface = kwargs.pop("force_huggingface", False) + if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained): model = 
STDiT3.from_pretrained(from_pretrained, **kwargs) else: config = STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs) diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py index bf50ec8..9802b02 100644 --- a/opensora/models/vae/vae.py +++ b/opensora/models/vae/vae.py @@ -277,7 +277,7 @@ def OpenSoraVAE_V1_2( scale=scale, ) - if force_huggingface or (from_pretrained is not None and not os.path.isdir(from_pretrained)): + if force_huggingface or (from_pretrained is not None and not os.path.exists(from_pretrained)): model = VideoAutoencoderPipeline.from_pretrained(from_pretrained, **kwargs) else: config = VideoAutoencoderPipelineConfig(**kwargs) From 00fef1d1af0b431ffd4dadea684a2d59d5d880f2 Mon Sep 17 00:00:00 2001 From: Jiacheng Yang Date: Mon, 24 Jun 2024 05:07:49 -0400 Subject: [PATCH 4/8] fix SeqParallelMultiHeadCrossAttention for consistent results in distributed mode (#510) --- opensora/models/layers/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py index 8bc7e72..5e2c13d 100644 --- a/opensora/models/layers/blocks.py +++ b/opensora/models/layers/blocks.py @@ -499,7 +499,7 @@ class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention): # shape: # q, k, v: [B, SUB_N, NUM_HEADS, HEAD_DIM] - q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim) + q = self.q_linear(x).view(B, -1, self.num_heads, self.head_dim) kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim) kv = split_forward_gather_backward(kv, get_sequence_parallel_group(), dim=3, grad_scale="down") k, v = kv.unbind(2) From 3552145f847b5171cd2c85982d116b04eb98c5bf Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 25 Jun 2024 06:17:24 +0000 Subject: [PATCH 5/8] [sp] updated precision test --- opensora/models/stdit/stdit3.py | 15 ++++++++++----- tests/test_stdit3_sequence_parallelism.py | 18 ++++++++++++++---- 2 files changed, 24 
insertions(+), 9 deletions(-) diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py index bb71d04..b0c046a 100644 --- a/opensora/models/stdit/stdit3.py +++ b/opensora/models/stdit/stdit3.py @@ -368,12 +368,17 @@ class STDiT3(PreTrainedModel): # for simplicity, we can adjust the height to make it divisible if self.enable_sequence_parallelism: sp_size = dist.get_world_size(get_sequence_parallel_group()) - h_pad_size = sp_size - H % sp_size - hx_pad_size = h_pad_size * self.patch_size[1] + if H % sp_size != 0: + h_pad_size = sp_size - H % sp_size + else: + h_pad_size = 0 - # pad x along the H dimension - H += h_pad_size - x = F.pad(x, (0, 0, 0, hx_pad_size)) + if h_pad_size > 0: + hx_pad_size = h_pad_size * self.patch_size[1] + + # pad x along the H dimension + H += h_pad_size + x = F.pad(x, (0, 0, 0, hx_pad_size)) S = H * W base_size = round(S**0.5) diff --git a/tests/test_stdit3_sequence_parallelism.py b/tests/test_stdit3_sequence_parallelism.py index ba715b7..70786f4 100644 --- a/tests/test_stdit3_sequence_parallelism.py +++ b/tests/test_stdit3_sequence_parallelism.py @@ -9,7 +9,7 @@ from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config def get_sample_data(): - x = torch.rand([1, 4, 15, 20, 27], dtype=torch.bfloat16) # (B, C, T, H, W) + x = torch.rand([1, 4, 15, 20, 28], dtype=torch.bfloat16) # (B, C, T, H, W) timestep = torch.Tensor([924.0]).to(torch.bfloat16) y = torch.rand(1, 1, 300, 4096, dtype=torch.bfloat16) mask = torch.ones([1, 300], dtype=torch.int32) @@ -66,6 +66,17 @@ def run_model(rank, world_size, port): set_seed(1024) dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True) dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16) + + # ensure model weights are equal + for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()): + assert torch.equal(p1, p2) + + # ensure model weights are equal across all ranks + for p in dist_model.parameters(): + p_list = [torch.zeros_like(p) for _ in 
range(world_size)] + dist.all_gather(p_list, p, group=dist.group.WORLD) + assert torch.equal(*p_list) + dist_out = dist_model(**data) dist_out.mean().backward() @@ -84,9 +95,8 @@ def run_model(rank, world_size, port): for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()): assert n1 == n2 if p1.grad is not None and p2.grad is not None: - if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4): - if dist.get_rank() == 0: - print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}") + if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0: + print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}") else: assert p1.grad is None and p2.grad is None From 1c64c82c05523c125d91c52af5d34791f532170a Mon Sep 17 00:00:00 2001 From: Shen-Chenhui Date: Tue, 25 Jun 2024 08:26:35 +0000 Subject: [PATCH 6/8] allow path spec for cloud machine eval --- eval/loss/launch.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/eval/loss/launch.sh b/eval/loss/launch.sh index 5e19c7c..c70c52d 100644 --- a/eval/loss/launch.sh +++ b/eval/loss/launch.sh @@ -3,8 +3,16 @@ CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py" CKPT_PATH=$1 MODEL_NAME=$2 -IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" -VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" +IMG_PATH=$3 +VID_PATH=$4 + +if [ -z $IMG_PATH ]; then + IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" +fi + +if [ -z $VID_PATH ]; then + VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" +fi if [[ $CKPT_PATH == *"ema"* ]]; then parentdir=$(dirname $CKPT_PATH) From 31858eccebfad8998716716ade1dccb7f0d58746 Mon Sep 17 00:00:00 2001 From: Shen-Chenhui Date: Wed, 26 Jun 2024 02:43:19 +0000 Subject: [PATCH 7/8] fix readme --- docs/commands.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/commands.md b/docs/commands.md index 
d982db8..92ff5e6 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -19,8 +19,8 @@ Note that currently our model loading for vae and diffusion model supports two t * load from local file path * load from huggingface -Our config supports loading from huggingface by default. -If you wish to load from a local path, you need to set `force_huggingface=True`, for instance: +Our config supports loading from the huggingface online repository by default. +If you wish to load from a local path downloaded from a huggingface repository, you need to set `force_huggingface=True`, for instance: ```python # for vae @@ -41,6 +41,7 @@ model = dict( force_huggingface=True, # NOTE: set here ) ``` +However, if you want to load a self-trained model, do not set `force_huggingface=True` since your checkpoint won't be in huggingface format. ## Inference From b65126834f4da0975d544ed105c974e53523e8bf Mon Sep 17 00:00:00 2001 From: Tom Young Date: Wed, 26 Jun 2024 03:56:41 +0000 Subject: [PATCH 8/8] add resume and drop invalid timestamps --- tools/scene_cut/cut.py | 45 ++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/tools/scene_cut/cut.py b/tools/scene_cut/cut.py index cf724a6..d614d01 100644 --- a/tools/scene_cut/cut.py +++ b/tools/scene_cut/cut.py @@ -29,15 +29,20 @@ def process_single_row(row, args): # check mp4 integrity # if not is_intact_video(video_path, logger=logger): # return False - - if "timestamp" in row: - timestamp = row["timestamp"] - if not (timestamp.startswith("[") and timestamp.endswith("]")): + try: + if "timestamp" in row: + timestamp = row["timestamp"] + if not (timestamp.startswith("[") and timestamp.endswith("]")): + return False + scene_list = eval(timestamp) + scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list] + else: + scene_list = [None] + if args.drop_invalid_timestamps: + return True + except Exception as e: + if args.drop_invalid_timestamps: return False - scene_list = 
eval(timestamp) - scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list] - else: - scene_list = [None] if "relpath" in row: save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"])) @@ -61,7 +66,7 @@ def process_single_row(row, args): shorter_size=shorter_size, logger=logger, ) - + return True def split_video( video_path, @@ -108,7 +113,10 @@ def split_video( fname_wo_ext = os.path.splitext(fname)[0] # TODO: fname pattern save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4") - + if os.path.exists(save_path): + # print_log(f"File '{save_path}' already exists. Skip.", logger=logger) + continue + # ffmpeg cmd cmd = [FFMPEG_PATH] @@ -134,7 +142,7 @@ def split_video( # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"] cmd += ["-map", "0:v", save_path] - + # print(cmd) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, stderr = proc.communicate() # stdout = stdout.decode("utf-8") @@ -163,7 +171,7 @@ def parse_args(): ) parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel") parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing") - + parser.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps") args = parser.parse_args() return args @@ -175,7 +183,7 @@ def main(): print(f"Meta file '{meta_path}' not found. 
Exit.") exit() - # create logger + # create save_dir os.makedirs(args.save_dir, exist_ok=True) # initialize pandarallel @@ -189,10 +197,13 @@ def main(): # process meta = pd.read_csv(args.meta_path) if not args.disable_parallel: - meta.parallel_apply(process_single_row_partial, axis=1) + results = meta.parallel_apply(process_single_row_partial, axis=1) else: - meta.apply(process_single_row_partial, axis=1) - - + results = meta.apply(process_single_row_partial, axis=1) + if args.drop_invalid_timestamps: + meta = meta[results] + assert args.meta_path.endswith("timestamp.csv"), "Only support *timestamp.csv" + meta.to_csv(args.meta_path.replace("timestamp.csv", "correct_timestamp.csv"), index=False) + print(f"Corrected timestamp file saved to '{args.meta_path.replace('timestamp.csv', 'correct_timestamp.csv')}'") if __name__ == "__main__": main()