From 8f239c87bf12564defc5e37cf611fa00c1976443 Mon Sep 17 00:00:00 2001 From: HangXu Date: Thu, 20 Jun 2024 11:48:42 +0300 Subject: [PATCH 1/8] Added causal mask in Attention forward pass --- opensora/models/layers/blocks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py index 8bc7e72..73a0162 100644 --- a/opensora/models/layers/blocks.py +++ b/opensora/models/layers/blocks.py @@ -163,6 +163,8 @@ class Attention(nn.Module): if rope is not None: self.rope = True self.rotary_emb = rope + + self.is_causal = False def forward(self, x: torch.Tensor) -> torch.Tensor: B, N, C = x.shape @@ -198,12 +200,17 @@ class Attention(nn.Module): v, dropout_p=self.attn_drop.p if self.training else 0.0, softmax_scale=self.scale, + causal=self.is_causal, ) else: dtype = q.dtype q = q * self.scale attn = q @ k.transpose(-2, -1) # translate attn to float32 attn = attn.to(torch.float32) + if self.is_causal: + causal_mask = torch.tril(torch.ones_like(attn), diagonal=0) + causal_mask = torch.where(causal_mask.bool(), 0, float('-inf')) + attn += causal_mask attn = attn.softmax(dim=-1) attn = attn.to(dtype) # cast back attn to original dtype attn = self.attn_drop(attn) From 6b42f4aa95c4137e3b6dbf18862a4781da469bbd Mon Sep 17 00:00:00 2001 From: Tom Young Date: Fri, 21 Jun 2024 01:29:21 +0000 Subject: [PATCH 2/8] small update on readme --- tools/caption/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/caption/README.md b/tools/caption/README.md index f6fe0c8..8f7dfed 100644 --- a/tools/caption/README.md +++ b/tools/caption/README.md @@ -4,7 +4,7 @@ Human labeling of videos is expensive and time-consuming. We adopt powerful imag ## PLLaVA Captioning -To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. 
+To balance captioning speed and performance, we chose the 13B version of PLLaVA configured with 2*2 spatial pooling. We feed it with 4 frames evenly extracted from the video. We accelerate its inference via (1) batching and (2) offloading frame extraction to a separate process such that the GPU computations and frame extraction happen in parallel. ### Installation Install the required dependencies by following our [installation instructions](../../docs/installation.md)'s "Data Dependencies" and "PLLaVA Captioning" sections. From 9a9a6c2f3e571136978818dcd561dac915ce7157 Mon Sep 17 00:00:00 2001 From: zhengzangw Date: Sat, 22 Jun 2024 15:54:27 +0000 Subject: [PATCH 3/8] [fix] better support local ckpt --- opensora/models/stdit/stdit3.py | 5 +++-- opensora/models/vae/vae.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py index 8703b2d..bd9672d 100644 --- a/opensora/models/stdit/stdit3.py +++ b/opensora/models/stdit/stdit3.py @@ -448,7 +448,7 @@ class STDiT3(PreTrainedModel): @MODELS.register_module("STDiT3-XL/2") def STDiT3_XL_2(from_pretrained=None, **kwargs): force_huggingface = kwargs.pop("force_huggingface", False) - if force_huggingface or from_pretrained is not None and not os.path.isdir(from_pretrained): + if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained): model = STDiT3.from_pretrained(from_pretrained, **kwargs) else: config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) @@ -460,7 +460,8 @@ def STDiT3_XL_2(from_pretrained=None, **kwargs): @MODELS.register_module("STDiT3-3B/2") def STDiT3_3B_2(from_pretrained=None, **kwargs): - if from_pretrained is not None and not os.path.isdir(from_pretrained): + force_huggingface = kwargs.pop("force_huggingface", False) + if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained): model = 
STDiT3.from_pretrained(from_pretrained, **kwargs) else: config = STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs) diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py index bf50ec8..9802b02 100644 --- a/opensora/models/vae/vae.py +++ b/opensora/models/vae/vae.py @@ -277,7 +277,7 @@ def OpenSoraVAE_V1_2( scale=scale, ) - if force_huggingface or (from_pretrained is not None and not os.path.isdir(from_pretrained)): + if force_huggingface or (from_pretrained is not None and not os.path.exists(from_pretrained)): model = VideoAutoencoderPipeline.from_pretrained(from_pretrained, **kwargs) else: config = VideoAutoencoderPipelineConfig(**kwargs) From 00fef1d1af0b431ffd4dadea684a2d59d5d880f2 Mon Sep 17 00:00:00 2001 From: Jiacheng Yang Date: Mon, 24 Jun 2024 05:07:49 -0400 Subject: [PATCH 4/8] fix SeqParallelMultiHeadCrossAttention for consistent results in distributed mode (#510) --- opensora/models/layers/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py index 8bc7e72..5e2c13d 100644 --- a/opensora/models/layers/blocks.py +++ b/opensora/models/layers/blocks.py @@ -499,7 +499,7 @@ class SeqParallelMultiHeadCrossAttention(MultiHeadCrossAttention): # shape: # q, k, v: [B, SUB_N, NUM_HEADS, HEAD_DIM] - q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim) + q = self.q_linear(x).view(B, -1, self.num_heads, self.head_dim) kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim) kv = split_forward_gather_backward(kv, get_sequence_parallel_group(), dim=3, grad_scale="down") k, v = kv.unbind(2) From 3552145f847b5171cd2c85982d116b04eb98c5bf Mon Sep 17 00:00:00 2001 From: FrankLeeeee Date: Tue, 25 Jun 2024 06:17:24 +0000 Subject: [PATCH 5/8] [sp] updated precision test --- opensora/models/stdit/stdit3.py | 15 ++++++++++----- tests/test_stdit3_sequence_parallelism.py | 18 ++++++++++++++---- 2 files changed, 24 
insertions(+), 9 deletions(-) diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py index bb71d04..b0c046a 100644 --- a/opensora/models/stdit/stdit3.py +++ b/opensora/models/stdit/stdit3.py @@ -368,12 +368,17 @@ class STDiT3(PreTrainedModel): # for simplicity, we can adjust the height to make it divisible if self.enable_sequence_parallelism: sp_size = dist.get_world_size(get_sequence_parallel_group()) - h_pad_size = sp_size - H % sp_size - hx_pad_size = h_pad_size * self.patch_size[1] + if H % sp_size != 0: + h_pad_size = sp_size - H % sp_size + else: + h_pad_size = 0 - # pad x along the H dimension - H += h_pad_size - x = F.pad(x, (0, 0, 0, hx_pad_size)) + if h_pad_size > 0: + hx_pad_size = h_pad_size * self.patch_size[1] + + # pad x along the H dimension + H += h_pad_size + x = F.pad(x, (0, 0, 0, hx_pad_size)) S = H * W base_size = round(S**0.5) diff --git a/tests/test_stdit3_sequence_parallelism.py b/tests/test_stdit3_sequence_parallelism.py index ba715b7..70786f4 100644 --- a/tests/test_stdit3_sequence_parallelism.py +++ b/tests/test_stdit3_sequence_parallelism.py @@ -9,7 +9,7 @@ from opensora.models.stdit.stdit3 import STDiT3, STDiT3Config def get_sample_data(): - x = torch.rand([1, 4, 15, 20, 27], dtype=torch.bfloat16) # (B, C, T, H, W) + x = torch.rand([1, 4, 15, 20, 28], dtype=torch.bfloat16) # (B, C, T, H, W) timestep = torch.Tensor([924.0]).to(torch.bfloat16) y = torch.rand(1, 1, 300, 4096, dtype=torch.bfloat16) mask = torch.ones([1, 300], dtype=torch.int32) @@ -66,6 +66,17 @@ def run_model(rank, world_size, port): set_seed(1024) dist_model_cfg = get_stdit3_config(enable_sequence_parallelism=True) dist_model = STDiT3(dist_model_cfg).cuda().to(torch.bfloat16) + + # ensure model weights are equal + for p1, p2 in zip(non_dist_model.parameters(), dist_model.parameters()): + assert torch.equal(p1, p2) + + # ensure model weights are equal across all ranks + for p in dist_model.parameters(): + p_list = [torch.zeros_like(p) for _ in 
range(world_size)] + dist.all_gather(p_list, p, group=dist.group.WORLD) + assert torch.equal(*p_list) + dist_out = dist_model(**data) dist_out.mean().backward() @@ -84,9 +95,8 @@ def run_model(rank, world_size, port): for (n1, p1), (n2, p2) in zip(non_dist_model.named_parameters(), dist_model.named_parameters()): assert n1 == n2 if p1.grad is not None and p2.grad is not None: - if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4): - if dist.get_rank() == 0: - print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}") + if not torch.allclose(p1.grad, p2.grad, rtol=1e-2, atol=1e-4) and dist.get_rank() == 0: + print(f"gradient of {n1} is not equal, {p1.grad} vs {p2.grad}") else: assert p1.grad is None and p2.grad is None From 1c64c82c05523c125d91c52af5d34791f532170a Mon Sep 17 00:00:00 2001 From: Shen-Chenhui Date: Tue, 25 Jun 2024 08:26:35 +0000 Subject: [PATCH 6/8] allow path spec for cloud machine eval --- eval/loss/launch.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/eval/loss/launch.sh b/eval/loss/launch.sh index 5e19c7c..c70c52d 100644 --- a/eval/loss/launch.sh +++ b/eval/loss/launch.sh @@ -3,8 +3,16 @@ CMD="torchrun --standalone --nproc_per_node 1 eval/loss/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py" CKPT_PATH=$1 MODEL_NAME=$2 -IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" -VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" +IMG_PATH=$3 +VID_PATH=$4 + +if [ -z $IMG_PATH ]; then + IMG_PATH="/mnt/jfs-hdd/sora/meta/validation/img_1k.csv" +fi + +if [ -z $VID_PATH ]; then + VID_PATH="/mnt/jfs-hdd/sora/meta/validation/vid_100.csv" +fi if [[ $CKPT_PATH == *"ema"* ]]; then parentdir=$(dirname $CKPT_PATH) From 31858eccebfad8998716716ade1dccb7f0d58746 Mon Sep 17 00:00:00 2001 From: Shen-Chenhui Date: Wed, 26 Jun 2024 02:43:19 +0000 Subject: [PATCH 7/8] fix readme --- docs/commands.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/commands.md b/docs/commands.md index 
d982db8..92ff5e6 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -19,8 +19,8 @@ Note that currently our model loading for vae and diffusion model supports two t * load from local file path * load from huggingface -Our config supports loading from huggingface by default. -If you wish to load from a local path, you need to set `force_huggingface=True`, for instance: +Our config supports loading from the huggingface online repository by default. +If you wish to load from a local path downloaded from a huggingface repository, you need to set `force_huggingface=True`, for instance: ```python # for vae @@ -41,6 +41,7 @@ model = dict( force_huggingface=True, # NOTE: set here ) ``` +However, if you want to load a self-trained model, do not set `force_huggingface=True` since your checkpoint won't be in huggingface format. ## Inference From b65126834f4da0975d544ed105c974e53523e8bf Mon Sep 17 00:00:00 2001 From: Tom Young Date: Wed, 26 Jun 2024 03:56:41 +0000 Subject: [PATCH 8/8] add resume and drop invalid timestamps --- tools/scene_cut/cut.py | 45 ++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/tools/scene_cut/cut.py b/tools/scene_cut/cut.py index cf724a6..d614d01 100644 --- a/tools/scene_cut/cut.py +++ b/tools/scene_cut/cut.py @@ -29,15 +29,20 @@ def process_single_row(row, args): # check mp4 integrity # if not is_intact_video(video_path, logger=logger): # return False - - if "timestamp" in row: - timestamp = row["timestamp"] - if not (timestamp.startswith("[") and timestamp.endswith("]")): + try: + if "timestamp" in row: + timestamp = row["timestamp"] + if not (timestamp.startswith("[") and timestamp.endswith("]")): + return False + scene_list = eval(timestamp) + scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list] + else: + scene_list = [None] + if args.drop_invalid_timestamps: + return True + except Exception as e: + if args.drop_invalid_timestamps: return False - scene_list = 
eval(timestamp) - scene_list = [(FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) for s, t in scene_list] - else: - scene_list = [None] if "relpath" in row: save_dir = os.path.dirname(os.path.join(args.save_dir, row["relpath"])) @@ -61,7 +66,7 @@ def process_single_row(row, args): shorter_size=shorter_size, logger=logger, ) - + return True def split_video( video_path, @@ -108,7 +113,10 @@ def split_video( fname_wo_ext = os.path.splitext(fname)[0] # TODO: fname pattern save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4") - + if os.path.exists(save_path): + # print_log(f"File '{save_path}' already exists. Skip.", logger=logger) + continue + # ffmpeg cmd cmd = [FFMPEG_PATH] @@ -134,7 +142,7 @@ def split_video( # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"] cmd += ["-map", "0:v", save_path] - + # print(cmd) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, stderr = proc.communicate() # stdout = stdout.decode("utf-8") @@ -163,7 +171,7 @@ def parse_args(): ) parser.add_argument("--num_workers", type=int, default=None, help="#workers for pandarallel") parser.add_argument("--disable_parallel", action="store_true", help="disable parallel processing") - + parser.add_argument("--drop_invalid_timestamps", action="store_true", help="drop rows with invalid timestamps") args = parser.parse_args() return args @@ -175,7 +183,7 @@ def main(): print(f"Meta file '{meta_path}' not found. 
Exit.") exit() - # create logger + # create save_dir os.makedirs(args.save_dir, exist_ok=True) # initialize pandarallel @@ -189,10 +197,13 @@ def main(): # process meta = pd.read_csv(args.meta_path) if not args.disable_parallel: - meta.parallel_apply(process_single_row_partial, axis=1) + results = meta.parallel_apply(process_single_row_partial, axis=1) else: - meta.apply(process_single_row_partial, axis=1) - - + results = meta.apply(process_single_row_partial, axis=1) + if args.drop_invalid_timestamps: + meta = meta[results] + assert args.meta_path.endswith("timestamp.csv"), "Only support *timestamp.csv" + meta.to_csv(args.meta_path.replace("timestamp.csv", "correct_timestamp.csv"), index=False) + print(f"Corrected timestamp file saved to '{args.meta_path.replace('timestamp.csv', 'correct_timestamp.csv')}'") if __name__ == "__main__": main()