From f9f539f07e0773a2c11af69eef0d063da909bdb8 Mon Sep 17 00:00:00 2001
From: "Zheng Zangwei (Alex Zheng)" <zangwei@comp.nus.edu.sg>
Date: Sat, 30 Mar 2024 13:34:19 +0800
Subject: [PATCH] format and some fix (#8)

---
 CONTRIBUTING.md                               |  2 +-
 LICENSE                                       |  4 +-
 README.md                                     |  2 +-
 configs/opensora-v1-1/train/Vx360p.py         |  2 +-
 .../inference-long/16x512x512-extend.py       |  2 +-
 configs/opensora/inference/16x256x256.py      |  4 +-
 configs/opensora/inference/16x512x512.py      |  2 +-
 docs/zh_CN/README.md                          |  2 +-
 opensora/datasets/aspect.py                   |  1 -
 opensora/datasets/bucket.py                   | 16 ++----
 opensora/datasets/dataloader.py               | 33 -----------
 opensora/datasets/datasets.py                 | 11 +++-
 opensora/datasets/sampler.py                  | 39 ++++++-------
 opensora/datasets/utils.py                    |  6 +-
 opensora/models/layers/blocks.py              |  6 +-
 opensora/models/text_encoder/t5.py            |  2 -
 opensora/models/vae/vae.py                    |  8 ++-
 opensora/utils/ckpt_utils.py                  | 21 ++-----
 opensora/utils/config_utils.py                |  5 +-
 scripts/train.py                              | 57 +++++--------------
 tests/test_seq_parallel_attention.py          | 28 ++++++---
 tools/caption/README.md                       |  2 +-
 tools/datasets/README.md                      |  2 +-
 tools/datasets/convert_dataset.py             |  1 -
 tools/scenedetect/README.md                   |  2 +-
 25 files changed, 94 insertions(+), 166 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index b2ef579..d606846 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -4,7 +4,7 @@ The Open-Sora project welcomes any constructive contribution from the community
 
 ## Development Environment Setup
 
-To contribute to Open-Sora, we would like to first guide you to set up a proper development environment so that you can better implement your code. You can install this library from source with the `editable` flag (`-e`, for development mode) so that your change to the source code will be reflected in runtime without re-installation. 
+To contribute to Open-Sora, we would like to first guide you to set up a proper development environment so that you can better implement your code. You can install this library from source with the `editable` flag (`-e`, for development mode) so that your change to the source code will be reflected in runtime without re-installation.
 
 You can refer to the [Installation Section](./README.md#installation) and replace `pip install -v .` with `pip install -v -e .`.
 
diff --git a/LICENSE b/LICENSE
index 7327c12..553df3c 100644
--- a/LICENSE
+++ b/LICENSE
@@ -313,7 +313,7 @@
       such as asking that all changes be marked or described.
       Although not required by our licenses, you are encouraged to
       respect those requests where reasonable. More_considerations
-      for the public: 
+      for the public:
       wiki.creativecommons.org/Considerations_for_licensees
 
    =======================================================================
@@ -677,5 +677,3 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
-
-
diff --git a/README.md b/README.md
index 7aac775..8000852 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,7 @@ conda create -n opensora python=3.10
 conda activate opensora
 
 # install torch
-# the command below is for CUDA 12.1, choose install commands from 
+# the command below is for CUDA 12.1, choose install commands from
 # https://pytorch.org/get-started/locally/ based on your own CUDA version
 pip install torch torchvision
 
diff --git a/configs/opensora-v1-1/train/Vx360p.py b/configs/opensora-v1-1/train/Vx360p.py
index 81ff50a..72afafa 100644
--- a/configs/opensora-v1-1/train/Vx360p.py
+++ b/configs/opensora-v1-1/train/Vx360p.py
@@ -15,7 +15,7 @@ bucket_config = {
 }
 
 # Define acceleration
-num_workers = 0
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
diff --git a/configs/opensora/inference-long/16x512x512-extend.py b/configs/opensora/inference-long/16x512x512-extend.py
index cdd1cde..8feb1a4 100644
--- a/configs/opensora/inference-long/16x512x512-extend.py
+++ b/configs/opensora/inference-long/16x512x512-extend.py
@@ -35,7 +35,7 @@ dtype = "fp16"
 prompt_path = None
 prompt = [
     "Drone view of waves crashing against the rugged cliffs along Big Sur’s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff’s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff's edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.",
-    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave."
+    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
 ]
 
 loop = 10
diff --git a/configs/opensora/inference/16x256x256.py b/configs/opensora/inference/16x256x256.py
index 492543b..7508846 100644
--- a/configs/opensora/inference/16x256x256.py
+++ b/configs/opensora/inference/16x256x256.py
@@ -25,13 +25,13 @@ scheduler = dict(
     type="iddpm",
     num_sampling_steps=100,
     cfg_scale=7.0,
-    cfg_channel=3, # or None
+    cfg_channel=3,  # or None
 )
 dtype = "fp16"
 
 # Condition
 prompt_path = "./assets/texts/t2v_samples.txt"
-prompt = None # prompt has higher priority than prompt_path
+prompt = None  # prompt has higher priority than prompt_path
 
 # Others
 batch_size = 1
diff --git a/configs/opensora/inference/16x512x512.py b/configs/opensora/inference/16x512x512.py
index 2064074..af67602 100644
--- a/configs/opensora/inference/16x512x512.py
+++ b/configs/opensora/inference/16x512x512.py
@@ -9,7 +9,7 @@ model = dict(
     time_scale=1.0,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
-    from_pretrained="PRETRAINED_MODEL"
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
     type="VideoAutoencoderKL",
diff --git a/docs/zh_CN/README.md b/docs/zh_CN/README.md
index 8e52abf..265c3dc 100644
--- a/docs/zh_CN/README.md
+++ b/docs/zh_CN/README.md
@@ -91,7 +91,7 @@
 conda create -n opensora python=3.10
 
 # install torch
-# the command below is for CUDA 12.1, choose install commands from 
+# the command below is for CUDA 12.1, choose install commands from
 # https://pytorch.org/get-started/locally/ based on your own CUDA version
 pip3 install torch torchvision
 
diff --git a/opensora/datasets/aspect.py b/opensora/datasets/aspect.py
index 9191ef2..5a8ac05 100644
--- a/opensora/datasets/aspect.py
+++ b/opensora/datasets/aspect.py
@@ -1,6 +1,5 @@
 import math
 
-
 # Ours
 
 
diff --git a/opensora/datasets/bucket.py b/opensora/datasets/bucket.py
index e78baa9..574e5cb 100644
--- a/opensora/datasets/bucket.py
+++ b/opensora/datasets/bucket.py
@@ -33,19 +33,11 @@ class Bucket:
         # wrap config with OrderedDict
         bucket_probs = OrderedDict()
         bucket_bs = OrderedDict()
-        bucket_names = sorted(
-            bucket_config.keys(), key=lambda x: ASPECT_RATIOS[x][0], reverse=True
-        )
+        bucket_names = sorted(bucket_config.keys(), key=lambda x: ASPECT_RATIOS[x][0], reverse=True)
         for key in bucket_names:
-            bucket_time_names = sorted(
-                bucket_config[key].keys(), key=lambda x: x, reverse=True
-            )
-            bucket_probs[key] = OrderedDict(
-                {k: bucket_config[key][k][0] for k in bucket_time_names}
-            )
-            bucket_bs[key] = OrderedDict(
-                {k: bucket_config[key][k][1] for k in bucket_time_names}
-            )
+            bucket_time_names = sorted(bucket_config[key].keys(), key=lambda x: x, reverse=True)
+            bucket_probs[key] = OrderedDict({k: bucket_config[key][k][0] for k in bucket_time_names})
+            bucket_bs[key] = OrderedDict({k: bucket_config[key][k][1] for k in bucket_time_names})
 
         # first level: HW
         num_bucket = 0
diff --git a/opensora/datasets/dataloader.py b/opensora/datasets/dataloader.py
index ec65805..5077a07 100644
--- a/opensora/datasets/dataloader.py
+++ b/opensora/datasets/dataloader.py
@@ -8,7 +8,6 @@ from torch.distributed.distributed_c10d import _get_default_group
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.distributed import DistributedSampler
 
-from .bucket import Bucket
 from .sampler import DistributedVariableVideoSampler, VariableVideoBatchSampler
 
 
@@ -98,38 +97,6 @@ def prepare_dataloader(
     )
 
 
-class _VariableVideoBatchSampler(torch.utils.data.BatchSampler):
-    def __init__(self, sampler, batch_size, drop_last, dataset, buckect_config):
-        self.sampler = sampler
-        self.dataset = dataset
-        self.batch_size = batch_size
-        self.drop_last = drop_last
-        self.bucket = Bucket(buckect_config)
-        self.frame_interval = self.dataset.frame_interval
-        self.bucket.info_bucket(self.dataset, self.frame_interval)
-
-    def __iter__(self):
-        for idx in self.sampler:
-            T, H, W = self.dataset.get_data_info(idx)
-            bucket_id = self.bucket.get_bucket_id(T, H, W, self.frame_interval)
-            if bucket_id is None:
-                continue
-            rT, rH, rW = self.bucket.get_thw(bucket_id)
-            self.dataset.set_data_info(idx, rT, rH, rW)
-            buffer = self.bucket[bucket_id]
-            buffer.append(idx)
-            if len(buffer) >= self.bucket.get_batch_size(bucket_id):
-                yield buffer
-                self.bucket.set_empty(bucket_id)
-
-        for k1, v1 in self.bucket.bucket.items():
-            for k2, v2 in v1.items():
-                for k3, buffer in v2.items():
-                    if len(buffer) > 0 and not self.drop_last:
-                        yield buffer
-                        self.bucket.set_empty((k1, k2, k3))
-
-
 def prepare_variable_dataloader(
     dataset,
     batch_size,
diff --git a/opensora/datasets/datasets.py b/opensora/datasets/datasets.py
index e0cae5f..f9cbf83 100644
--- a/opensora/datasets/datasets.py
+++ b/opensora/datasets/datasets.py
@@ -39,6 +39,16 @@ class VideoTextDataset(torch.utils.data.Dataset):
             "video": get_transforms_video(transform_name, image_size),
         }
 
+    def _print_data_number(self):
+        num_videos = 0
+        num_images = 0
+        for path in self.data["path"]:
+            if self.get_type(path) == "video":
+                num_videos += 1
+            else:
+                num_images += 1
+        print(f"Dataset contains {num_videos} videos and {num_images} images.")
+
     def get_type(self, path):
         ext = os.path.splitext(path)[-1].lower()
         if ext.lower() in VID_EXTENSIONS:
@@ -148,7 +158,6 @@ class VariableVideoTextDataset(VideoTextDataset):
         return {"video": video, "text": text, "num_frames": num_frames, "height": height, "width": width, "ar": ar}
 
     def __getitem__(self, index):
-        return self.getitem(index)
         for _ in range(10):
             try:
                 return self.getitem(index)
diff --git a/opensora/datasets/sampler.py b/opensora/datasets/sampler.py
index 633bbc2..df7ccd7 100644
--- a/opensora/datasets/sampler.py
+++ b/opensora/datasets/sampler.py
@@ -1,6 +1,7 @@
 import math
 import warnings
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from pprint import pprint
 from typing import Iterator, List, Optional, Tuple
 
 import torch
@@ -43,9 +44,7 @@ class DistributedVariableVideoSampler(DistributedSampler):
         # group by bucket
         for i in range(len(self.dataset)):
             t, h, w = self.dataset.get_data_info(i)
-            bucket_id = self.bucket.get_bucket_id(
-                t, h, w, self.dataset.frame_interval, g
-            )
+            bucket_id = self.bucket.get_bucket_id(t, h, w, self.dataset.frame_interval, g)
             if bucket_id is None:
                 continue
             real_t, real_h, real_w = self.bucket.get_thw(bucket_id)
@@ -56,12 +55,8 @@ class DistributedVariableVideoSampler(DistributedSampler):
         # shuffle
         if self.shuffle:
             # sort buckets
-            bucket_indices = torch.randperm(
-                len(bucket_sample_dict), generator=g
-            ).tolist()
-            bucket_order = {
-                k: bucket_indices[i] for i, k in enumerate(bucket_sample_dict)
-            }
+            bucket_indices = torch.randperm(len(bucket_sample_dict), generator=g).tolist()
+            bucket_order = {k: bucket_indices[i] for i, k in enumerate(bucket_sample_dict)}
             # sort samples in each bucket
             for k, v in bucket_sample_dict.items():
                 sample_indices = torch.randperm(len(v), generator=g).tolist()
@@ -90,11 +85,7 @@ class DistributedVariableVideoSampler(DistributedSampler):
         if self.verbose:
             self._print_bucket_info(bucket_sample_dict)
         if self.shuffle:
-            bucket_sample_dict = OrderedDict(
-                sorted(
-                    bucket_sample_dict.items(), key=lambda item: bucket_order[item[0]]
-                )
-            )
+            bucket_sample_dict = OrderedDict(sorted(bucket_sample_dict.items(), key=lambda item: bucket_order[item[0]]))
         # iterate
         found_last_bucket = self.last_bucket_id is None
         for k, v in bucket_sample_dict.items():
@@ -126,13 +117,21 @@ class DistributedVariableVideoSampler(DistributedSampler):
     def _print_bucket_info(self, bucket_sample_dict: dict) -> None:
         total_samples = 0
         num_dict = {}
+        num_aspect_dict = defaultdict(int)
+        num_hwt_dict = defaultdict(int)
         for k, v in bucket_sample_dict.items():
             size = len(v) * self.num_replicas
             total_samples += size
             num_dict[k] = size
-        print(
-            f"Total training samples: {total_samples}, num buckets: {len(num_dict)}, bucket samples: {num_dict}"
-        )
+            num_aspect_dict[k[-1]] += size
+            num_hwt_dict[k[:-1]] += size
+        print(f"Total training samples: {total_samples}, num buckets: {len(num_dict)}")
+        print("Bucket samples:")
+        pprint(num_dict)
+        print("Bucket samples by HxWxT:")
+        pprint(num_hwt_dict)
+        print("Bucket samples by aspect ratio:")
+        pprint(num_aspect_dict)
 
     def state_dict(self) -> dict:
         # users must ensure bucket config is the same
@@ -175,9 +174,7 @@ class VariableVideoBatchSampler(Sampler[List[int]]):
                 cur_sample_indices = [sample_idx]
             else:
                 cur_sample_indices.append(sample_idx)
-        if len(cur_sample_indices) > 0 and (
-            not self.drop_last or len(cur_sample_indices) == cur_batch_size
-        ):
+        if len(cur_sample_indices) > 0 and (not self.drop_last or len(cur_sample_indices) == cur_batch_size):
             yield cur_sample_indices
 
     def state_dict(self) -> dict:
diff --git a/opensora/datasets/utils.py b/opensora/datasets/utils.py
index f5c71c9..85d1d18 100644
--- a/opensora/datasets/utils.py
+++ b/opensora/datasets/utils.py
@@ -1,5 +1,3 @@
-import numbers
-
 import numpy as np
 import torch
 import torchvision
@@ -146,8 +144,8 @@ def center_crop_arr(pil_image, image_size):
 
 
 def resize_crop_to_fill(pil_image, image_size):
-    w, h = pil_image.size # PIL is (W, H)
-    th, tw = image_size 
+    w, h = pil_image.size  # PIL is (W, H)
+    th, tw = image_size
     rh, rw = th / h, tw / w
     if rh > rw:
         sh, sw = th, int(w * rh)
diff --git a/opensora/models/layers/blocks.py b/opensora/models/layers/blocks.py
index 1f44f1f..f7c67b1 100644
--- a/opensora/models/layers/blocks.py
+++ b/opensora/models/layers/blocks.py
@@ -23,10 +23,7 @@ import xformers.ops
 from einops import rearrange
 from timm.models.vision_transformer import Mlp
 
-from opensora.acceleration.communications import (
-    all_to_all,
-    split_forward_gather_backward,
-)
+from opensora.acceleration.communications import all_to_all, split_forward_gather_backward
 from opensora.acceleration.parallel_states import get_sequence_parallel_group
 
 approx_gelu = lambda: nn.GELU(approximate="tanh")
@@ -568,7 +565,6 @@ class CaptionEmbedder(nn.Module):
         self.register_buffer(
             "y_embedding",
             torch.randn(token_num, in_channels) / in_channels**0.5,
-            persistent=False,
         )
         self.uncond_prob = uncond_prob
 
diff --git a/opensora/models/text_encoder/t5.py b/opensora/models/text_encoder/t5.py
index 7f55ef9..1bd535d 100644
--- a/opensora/models/text_encoder/t5.py
+++ b/opensora/models/text_encoder/t5.py
@@ -23,14 +23,12 @@
 
 
 import html
-import os
 import re
 import urllib.parse as ul
 
 import ftfy
 import torch
 from bs4 import BeautifulSoup
-from huggingface_hub import hf_hub_download
 from transformers import AutoTokenizer, T5EncoderModel
 
 from opensora.registry import MODELS
diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py
index 6f96dff..e6e1441 100644
--- a/opensora/models/vae/vae.py
+++ b/opensora/models/vae/vae.py
@@ -53,7 +53,9 @@ class VideoAutoencoderKL(nn.Module):
     def get_latent_size(self, input_size):
         latent_size = []
         for i in range(3):
-            assert input_size[i] is None or input_size[i] % self.patch_size[i] == 0, "Input size must be divisible by patch size"
+            assert (
+                input_size[i] is None or input_size[i] % self.patch_size[i] == 0
+            ), "Input size must be divisible by patch size"
             latent_size.append(input_size[i] // self.patch_size[i] if input_size[i] is not None else None)
         return latent_size
 
@@ -87,7 +89,9 @@ class VideoAutoencoderKLTemporalDecoder(nn.Module):
     def get_latent_size(self, input_size):
         latent_size = []
         for i in range(3):
-            assert input_size[i] is None or input_size[i] % self.patch_size[i] == 0, "Input size must be divisible by patch size"
+            assert (
+                input_size[i] is None or input_size[i] % self.patch_size[i] == 0
+            ), "Input size must be divisible by patch size"
             latent_size.append(input_size[i] // self.patch_size[i] if input_size[i] is not None else None)
         return latent_size
 
diff --git a/opensora/utils/ckpt_utils.py b/opensora/utils/ckpt_utils.py
index 871fc59..87c5ac1 100644
--- a/opensora/utils/ckpt_utils.py
+++ b/opensora/utils/ckpt_utils.py
@@ -5,7 +5,6 @@ import operator
 import os
 from typing import Tuple
 
-import colossalai
 import torch
 import torch.distributed as dist
 import torch.nn as nn
@@ -55,9 +54,7 @@ def find_model(model_name):
         model = reparameter(model, model_name)
         return model
     else:  # Load a custom DiT checkpoint:
-        assert os.path.isfile(
-            model_name
-        ), f"Could not find DiT checkpoint at {model_name}"
+        assert os.path.isfile(model_name), f"Could not find DiT checkpoint at {model_name}"
         checkpoint = torch.load(model_name, map_location=lambda storage, loc: storage)
         if "pos_embed_temporal" in checkpoint:
             del checkpoint["pos_embed_temporal"]
@@ -93,9 +90,7 @@ def model_sharding(model: torch.nn.Module):
     for _, param in model.named_parameters():
         padding_size = (world_size - param.numel() % world_size) % world_size
         if padding_size > 0:
-            padding_param = torch.nn.functional.pad(
-                param.data.view(-1), [0, padding_size]
-            )
+            padding_param = torch.nn.functional.pad(param.data.view(-1), [0, padding_size])
         else:
             padding_param = param.data.view(-1)
         splited_params = padding_param.split(padding_param.numel() // world_size)
@@ -125,9 +120,7 @@ def model_gathering(model: torch.nn.Module, model_shape_dict: dict):
         dist.all_gather(all_params, param.data, group=dist.group.WORLD)
         if int(global_rank) == 0:
             all_params = torch.cat(all_params)
-            param.data = remove_padding(all_params, model_shape_dict[name]).view(
-                model_shape_dict[name]
-            )
+            param.data = remove_padding(all_params, model_shape_dict[name]).view(model_shape_dict[name])
     dist.barrier()
 
 
@@ -164,9 +157,7 @@ def save(
         torch.save(ema.state_dict(), os.path.join(save_dir, "ema.pt"))
         model_sharding(ema)
 
-    booster.save_optimizer(
-        optimizer, os.path.join(save_dir, "optimizer"), shard=True, size_per_shard=4096
-    )
+    booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True, size_per_shard=4096)
     if lr_scheduler is not None:
         booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))
     running_states = {
@@ -194,9 +185,7 @@ def load(
     booster.load_model(model, os.path.join(load_dir, "model"))
     # ema is not boosted, so we don't use booster.load_model
     # ema.load_state_dict(torch.load(os.path.join(load_dir, "ema.pt")))
-    ema.load_state_dict(
-        torch.load(os.path.join(load_dir, "ema.pt"), map_location=torch.device("cpu"))
-    )
+    ema.load_state_dict(torch.load(os.path.join(load_dir, "ema.pt"), map_location=torch.device("cpu")))
     booster.load_optimizer(optimizer, os.path.join(load_dir, "optimizer"))
     if lr_scheduler is not None:
         booster.load_lr_scheduler(lr_scheduler, os.path.join(load_dir, "lr_scheduler"))
diff --git a/opensora/utils/config_utils.py b/opensora/utils/config_utils.py
index 2840605..d47c0a7 100644
--- a/opensora/utils/config_utils.py
+++ b/opensora/utils/config_utils.py
@@ -47,14 +47,13 @@ def merge_args(cfg, args, training=False):
     if args.ckpt_path is not None:
         cfg.model["from_pretrained"] = args.ckpt_path
         args.ckpt_path = None
-    
 
     for k, v in vars(args).items():
         if k in cfg and v is not None:
             cfg[k] = v
 
     if not training:
-    # Inference only
+        # Inference only
         if "reference_path" not in cfg:
             cfg["reference_path"] = None
         if "loop" not in cfg:
@@ -63,7 +62,7 @@ def merge_args(cfg, args, training=False):
             assert cfg["prompt_path"] is not None, "prompt or prompt_path must be provided"
             cfg["prompt"] = load_prompts(cfg["prompt_path"])
     else:
-    # Training only
+        # Training only
         if args.data_path is not None:
             cfg.dataset["data_path"] = args.data_path
             args.data_path = None
diff --git a/scripts/train.py b/scripts/train.py
index a8cac61..439b461 100644
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -1,4 +1,5 @@
 from copy import deepcopy
+from pprint import pprint
 
 import colossalai
 import torch
@@ -20,26 +21,14 @@ from opensora.acceleration.parallel_states import (
 from opensora.acceleration.plugin import ZeroSeqParallelPlugin
 from opensora.datasets import prepare_dataloader, prepare_variable_dataloader
 from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
-from opensora.utils.ckpt_utils import (
-    create_logger,
-    load,
-    model_sharding,
-    record_model_param_shape,
-    save,
-)
+from opensora.utils.ckpt_utils import create_logger, load, model_sharding, record_model_param_shape, save
 from opensora.utils.config_utils import (
     create_experiment_workspace,
     create_tensorboard_writer,
     parse_configs,
     save_training_config,
 )
-from opensora.utils.misc import (
-    all_reduce_mean,
-    format_numel_str,
-    get_model_numel,
-    requires_grad,
-    to_torch_dtype,
-)
+from opensora.utils.misc import all_reduce_mean, format_numel_str, get_model_numel, requires_grad, to_torch_dtype
 from opensora.utils.train_utils import MaskGenerator, update_ema
 
 
@@ -48,7 +37,8 @@ def main():
     # 1. args & cfg
     # ======================================================
     cfg = parse_configs(training=True)
-    print(cfg)
+    print("Training configuration:")
+    pprint(cfg._cfg_dict)
     exp_name, exp_dir = create_experiment_workspace(cfg)
     save_training_config(cfg._cfg_dict, exp_dir)
 
@@ -115,12 +105,10 @@ def main():
     if cfg.bucket_config is None:
         dataloader = prepare_dataloader(**dataloader_args)
     else:
-        dataloader = prepare_variable_dataloader(
-            bucket_config=cfg.bucket_config, **dataloader_args
-        )
-    total_batch_size = cfg.batch_size * dist.get_world_size() // cfg.sp_size
-    logger.info(f"Dataset contains {len(dataset):,} videos ({dataset.data_path})")
-    logger.info(f"Total batch size: {total_batch_size}")
+        dataloader = prepare_variable_dataloader(bucket_config=cfg.bucket_config, **dataloader_args)
+    if cfg.dataset.type == "VideoTextDataset":
+        total_batch_size = cfg.batch_size * dist.get_world_size() // cfg.sp_size
+        logger.info(f"Total batch size: {total_batch_size}")
 
     # ======================================================
     # 4. build model
@@ -193,11 +181,7 @@ def main():
     # =======================================================
     start_epoch = start_step = log_step = sampler_start_idx = 0
     running_loss = 0.0
-    sampler_to_io = (
-        dataloader.batch_sampler
-        if cfg.dataset.type == "VariableVideoTextDataset"
-        else None
-    )
+    sampler_to_io = dataloader.batch_sampler if cfg.dataset.type == "VariableVideoTextDataset" else None
     # 6.1. resume training
     if cfg.load is not None:
         logger.info("Loading checkpoint")
@@ -210,12 +194,8 @@ def main():
             cfg.load,
             sampler=sampler_to_io,
         )
-        logger.info(
-            f"Loaded checkpoint {cfg.load} at epoch {start_epoch} step {start_step}"
-        )
-    logger.info(
-        f"Training for {cfg.epochs} epochs with {num_steps_per_epoch} steps per epoch"
-    )
+        logger.info(f"Loaded checkpoint {cfg.load} at epoch {start_epoch} step {start_step}")
+    logger.info(f"Training for {cfg.epochs} epochs with {num_steps_per_epoch} steps per epoch")
 
     if cfg.dataset.type == "VideoTextDataset":
         dataloader.sampler.set_start_index(sampler_start_idx)
@@ -257,12 +237,8 @@ def main():
                     model_args[k] = v.to(device, dtype)
 
                 # Diffusion
-                t = torch.randint(
-                    0, scheduler.num_timesteps, (x.shape[0],), device=device
-                )
-                loss_dict = scheduler.training_losses(
-                    model, x, t, model_args, mask=mask
-                )
+                t = torch.randint(0, scheduler.num_timesteps, (x.shape[0],), device=device)
+                loss_dict = scheduler.training_losses(model, x, t, model_args, mask=mask)
 
                 # Backward & update
                 loss = loss_dict["loss"].mean()
@@ -282,9 +258,7 @@ def main():
                 # Log to tensorboard
                 if coordinator.is_master() and (global_step + 1) % cfg.log_every == 0:
                     avg_loss = running_loss / log_step
-                    pbar.set_postfix(
-                        {"loss": avg_loss, "step": step, "global_step": global_step}
-                    )
+                    pbar.set_postfix({"loss": avg_loss, "step": step, "global_step": global_step})
                     running_loss = 0
                     log_step = 0
                     writer.add_scalar("loss", loss.item(), global_step)
@@ -292,7 +266,6 @@ def main():
                         wandb.log(
                             {
                                 "iter": global_step,
-                                "num_samples": global_step * total_batch_size,
                                 "epoch": epoch,
                                 "loss": loss.item(),
                                 "avg_loss": avg_loss,
diff --git a/tests/test_seq_parallel_attention.py b/tests/test_seq_parallel_attention.py
index 00966ad..9cce2b2 100644
--- a/tests/test_seq_parallel_attention.py
+++ b/tests/test_seq_parallel_attention.py
@@ -69,16 +69,24 @@ def run_cross_attention(rank, world_size):
     # create model
     torch.manual_seed(1024)
     set_sequence_parallel_group(dist.group.WORLD)
-    seq_parallel_attention = SeqParallelMultiHeadCrossAttention(
-        d_model=256,
-        num_heads=4,
-    ).cuda().to(torch.bfloat16)
+    seq_parallel_attention = (
+        SeqParallelMultiHeadCrossAttention(
+            d_model=256,
+            num_heads=4,
+        )
+        .cuda()
+        .to(torch.bfloat16)
+    )
 
     torch.manual_seed(1024)
-    attention = MultiHeadCrossAttention(
-        d_model=256,
-        num_heads=4,
-    ).cuda().to(torch.bfloat16)
+    attention = (
+        MultiHeadCrossAttention(
+            d_model=256,
+            num_heads=4,
+        )
+        .cuda()
+        .to(torch.bfloat16)
+    )
 
     # make sure the weights are the same
     for p1, p2 in zip(seq_parallel_attention.parameters(), attention.parameters()):
@@ -128,7 +136,9 @@ def run_cross_attention(rank, world_size):
 
     # # check grad
     for p1, p2 in zip(seq_parallel_attention.named_parameters(), attention.named_parameters()):
-        assert torch.allclose(p1[1].grad, p2[1].grad, rtol=1e-3, atol=1e-4), f"\n{p1[0]}\nvs\n{p2[0]}:\n{p1[1].grad}\nvs\n{p2[1].grad}"
+        assert torch.allclose(
+            p1[1].grad, p2[1].grad, rtol=1e-3, atol=1e-4
+        ), f"\n{p1[0]}\nvs\n{p2[0]}:\n{p1[1].grad}\nvs\n{p2[1].grad}"
 
     # # check input grad
     assert torch.allclose(x.grad, seq_x.grad, atol=1e-7), f"{x.grad}\nvs\n{seq_x.grad}"
diff --git a/tools/caption/README.md b/tools/caption/README.md
index fc1ae31..3c4c056 100644
--- a/tools/caption/README.md
+++ b/tools/caption/README.md
@@ -39,7 +39,7 @@ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvi
 pip install flash-attn --no-build-isolation
 ```
 
-First, install LLaVA according to their [official instructions](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#install). We use the `liuhaotian/llava-v1.6-34b` model for captioning, which can be download [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b). 
+First, install LLaVA according to their [official instructions](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#install). We use the `liuhaotian/llava-v1.6-34b` model for captioning, which can be download [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b).
 
 ### Usage
 
diff --git a/tools/datasets/README.md b/tools/datasets/README.md
index 354a858..2ea5cea 100644
--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@@ -60,7 +60,7 @@ head -n 10 DATA1.csv
 wc -l DATA1.csv
 ```
 
-Additionally, Ww provide `csvutils.py` to manage the CSV files. 
+Additionally, Ww provide `csvutils.py` to manage the CSV files.
 
 ### Requirement
 
diff --git a/tools/datasets/convert_dataset.py b/tools/datasets/convert_dataset.py
index aefc661..f832eb7 100644
--- a/tools/datasets/convert_dataset.py
+++ b/tools/datasets/convert_dataset.py
@@ -4,7 +4,6 @@ import os
 import pandas as pd
 from torchvision.datasets import ImageNet
 
-
 IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
 VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
 
diff --git a/tools/scenedetect/README.md b/tools/scenedetect/README.md
index 8052733..5e7ac29 100644
--- a/tools/scenedetect/README.md
+++ b/tools/scenedetect/README.md
@@ -1,6 +1,6 @@
 # Scene Detection and Video Split
 
-Raw videos from the Internet may be too long for training. 
+Raw videos from the Internet may be too long for training.
 Thus, we detect scenes in raw videos and split them into short clips based on the scenes.
 First prepare the video processing packages.
 ```bash