Merge branch 'dev/v1.2' into feature/vbench_i2v

This commit is contained in:
Shen-Chenhui 2024-06-05 08:42:07 +00:00
commit ac97bc6baa
8 changed files with 140 additions and 52 deletions

View file

@ -8,20 +8,20 @@ dataset = dict(
bucket_config = { # 12s/it
"144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
# ---
"256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
"240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
"256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)},
"240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)},
# ---
"360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
"512": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
"360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)},
"512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)},
# ---
"480p": {1: (0.1, 89), 51: (0.1, 5), 102: (0.1, 2), 204: (0.1, 1)},
"480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)},
# ---
"720p": {1: (0.05, 36), 51: (0.1, 1)},
"1024": {1: (0.05, 36), 51: (0.1, 1)},
"720p": {1: (0.1, 36), 51: (0.03, 1)},
"1024": {1: (0.1, 36), 51: (0.02, 1)},
# ---
"1080p": {1: (0.1, 5)},
"1080p": {1: (0.01, 5)},
# ---
"2048": {1: (0.1, 5)},
"2048": {1: (0.01, 5)},
}
grad_checkpoint = True
@ -88,3 +88,5 @@ grad_clip = 1.0
lr = 1e-4
ema_decay = 0.99
adam_eps = 1e-15
warmup_steps = 1000

View file

@ -0,0 +1,22 @@
from torch.optim.lr_scheduler import _LRScheduler
class LinearWarmupLR(_LRScheduler):
    """Linearly warm up the learning rate, then hold it at the base LR.

    At 0-indexed step ``t`` during warmup the LR is
    ``(t + 1) / (warmup_steps + 1) * base_lr``; once ``t >= warmup_steps``
    the base learning rate is used unchanged (no decay is applied).

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        warmup_steps (int, optional): Number of warmup steps. Defaults to 0.
        last_epoch (int, optional): Index of the last step. Defaults to -1,
            which starts the schedule from the beginning with the initial lr.
    """

    def __init__(self, optimizer, warmup_steps: int = 0, last_epoch: int = -1):
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        if self.last_epoch < self.warmup_steps:
            # The +1 offsets avoid a zero LR at the very first step and a
            # division by zero when warmup_steps == 0.
            return [(self.last_epoch + 1) / (self.warmup_steps + 1) * lr for lr in self.base_lrs]
        # Warmup finished: constant base LR from here on.
        return self.base_lrs

View file

@ -20,6 +20,7 @@ lingua-language-detector==2.0.2
imageio>=2.34.1
# [aesthetic]
setuptools==68.2.2
clip @ git+https://github.com/openai/CLIP.git
# [ocr]

View file

@ -17,6 +17,7 @@ from opensora.acceleration.checkpoint import set_grad_checkpoint
from opensora.acceleration.parallel_states import get_data_parallel_group
from opensora.datasets.dataloader import prepare_dataloader
from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
from opensora.utils.lr_scheduler import LinearWarmupLR
from opensora.utils.ckpt_utils import load, model_gathering, model_sharding, record_model_param_shape, save
from opensora.utils.config_utils import define_experiment_workspace, parse_configs, save_training_config
from opensora.utils.misc import (
@ -169,7 +170,13 @@ def main():
weight_decay=cfg.get("weight_decay", 0),
eps=cfg.get("adam_eps", 1e-8),
)
lr_scheduler = None
warmup_steps = cfg.get("warmup_steps", None)
if warmup_steps is None:
lr_scheduler = None
else:
lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=cfg.get("warmup_steps"))
# == additional preparation ==
if cfg.get("grad_checkpoint", False):
@ -288,6 +295,10 @@ def main():
booster.backward(loss=loss, optimizer=optimizer)
optimizer.step()
optimizer.zero_grad()
# update learning rate
if lr_scheduler is not None:
lr_scheduler.step()
coordinator.block_all()
timer_list.append(backward_t)
@ -323,6 +334,7 @@ def main():
"loss": loss.item(),
"avg_loss": avg_loss,
"acc_step": acc_step,
"lr": optimizer.param_groups[0]["lr"],
"move_data_time": move_data_t.elapsed_time,
"encode_time": encode_t.elapsed_time,
"mask_time": mask_t.elapsed_time,

View file

@ -0,0 +1,30 @@
import torch
from torch.optim import Adam
from torchvision.models import resnet50
from opensora.utils.lr_scheduler import LinearWarmupLR
def test_lr_scheduler():
    """Smoke-test LinearWarmupLR: the LR rises monotonically during warmup
    and then equals the base LR on every subsequent step.

    Uses a tiny linear model so the test runs in milliseconds, and falls
    back to CPU when CUDA is unavailable instead of crashing.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    base_lr = 0.01
    warmup_steps = 10
    # The scheduler only inspects the optimizer; any small model suffices.
    model = torch.nn.Linear(8, 1).to(device)
    optimizer = Adam(model.parameters(), lr=base_lr)
    scheduler = LinearWarmupLR(optimizer, warmup_steps=warmup_steps)
    current_lr = optimizer.param_groups[0]["lr"]
    data = torch.rand(4, 8, device=device)
    for i in range(30):
        optimizer.zero_grad()
        model(data).mean().backward()
        optimizer.step()
        scheduler.step()
        # Read the effective LR from the optimizer rather than calling
        # get_lr() outside step(), which PyTorch warns against.
        lr = optimizer.param_groups[0]["lr"]
        if i >= warmup_steps:
            # Warmup finished: LR must equal the base LR (float tolerance).
            assert abs(lr - base_lr) < 1e-12, f"{lr} != {base_lr}"
        else:
            assert lr > current_lr, f"{lr} <= {current_lr}"
        current_lr = lr


if __name__ == "__main__":
    test_lr_scheduler()

View file

@ -1,6 +1,7 @@
import os
import cv2
import numpy as np
from PIL import Image
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")

View file

@ -1,5 +1,8 @@
# adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py
import cv2 # isort:skip
import argparse
import gc
import os
from datetime import timedelta
@ -11,7 +14,6 @@ import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from PIL import Image
from torch.utils.data import DataLoader, DistributedSampler
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm
@ -24,6 +26,7 @@ NUM_FRAMES_POINTS = {
3: (0.1, 0.5, 0.9),
}
def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
# reorder
indices_list = list(map(lambda x: x[0], gathered_list))
@ -41,32 +44,36 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
# filter duplicates
unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
# jun 3 quickfix
# lose indices in meta not in unique_indices
# drop indices in meta not in unique_indices
meta = meta.loc[unique_indices]
return meta
class VideoTextDataset(torch.utils.data.Dataset):
def __init__(self, csv_path, transform=None, num_frames=3):
self.csv_path = csv_path
self.meta = pd.read_csv(csv_path)
def __init__(self, meta_path, transform=None, num_frames=3):
self.meta_path = meta_path
self.meta = pd.read_csv(meta_path)
self.transform = transform
self.points = NUM_FRAMES_POINTS[num_frames]
def __getitem__(self, index):
sample = self.meta.iloc[index]
path = sample["path"]
# extract frames
if not is_video(path):
images = [pil_loader(path)]
else:
num_frames = None
if "num_frames" in sample:
num_frames = sample["num_frames"]
num_frames = sample["num_frames"] if "num_frames" in sample else None
images = extract_frames(sample["path"], points=self.points, backend="opencv", num_frames=num_frames)
# transform
images = [self.transform(img) for img in images]
# stack
images = torch.stack(images)
ret = dict(index=index, images=images)
return ret
@ -97,7 +104,6 @@ class AestheticScorer(nn.Module):
def __init__(self, input_size, device):
super().__init__()
self.mlp = MLP(input_size)
self.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth"))
self.clip, self.preprocess = clip.load("ViT-L/14", device=device)
self.eval()
@ -122,6 +128,7 @@ def main():
# build model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AestheticScorer(768, device)
model.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth", map_location=device))
preprocess = model.preprocess
# build dataset
@ -138,7 +145,7 @@ def main():
drop_last=False,
),
)
# compute aesthetic scores
indices_list = []
scores_list = []
@ -153,26 +160,28 @@ def main():
# compute score
with torch.no_grad():
scores = model(images)
scores = rearrange(scores, "(B N) 1 -> B N", B=B)
scores = scores.mean(dim=1)
scores_np = scores.to(torch.float32).cpu().numpy()
indices_list.extend(indices)
scores_list.extend(scores_np)
# jun 3 quickfix
meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column='aes')
out_path_local = out_path.replace('.csv', f'_part_{dist.get_rank()}.csv')
indices_list.extend(indices.tolist())
scores_list.extend(scores_np.tolist())
# save local results
meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="aes")
out_path_local = out_path.replace(".csv", f"_part_{dist.get_rank()}.csv")
meta_local.to_csv(out_path_local, index=False)
# wait for all ranks to finish data processing
dist.barrier()
dist.barrier()
torch.cuda.empty_cache()
gc.collect()
gathered_list = [None] * dist.get_world_size()
breakpoint()
dist.all_gather_object(gathered_list, (indices_list, scores_list))
if dist.get_rank() == 0:
meta_new = merge_scores(gathered_list, dataset.meta, column='aes')
meta_new = merge_scores(gathered_list, dataset.meta, column="aes")
meta_new.to_csv(out_path, index=False)
print(f"New meta with aesthetic scores saved to '{out_path}'.")
@ -182,11 +191,12 @@ def parse_args():
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=1024, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
parser.add_argument("--prefetch_factor", type=int, default=2, help="Prefetch factor")
parser.add_argument("--prefetch_factor", type=int, default=3, help="Prefetch factor")
parser.add_argument("--num_frames", type=int, default=3, help="Number of frames to extract")
args = parser.parse_args()
return args
if __name__ == "__main__":
main()

View file

@ -1,6 +1,7 @@
import cv2 # isort:skip
import argparse
import gc
import os
from datetime import timedelta
@ -38,8 +39,7 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
# jun 3 quickfix
# lose indices in meta not in unique_indices
# drop indices in meta not in unique_indices
meta = meta.loc[unique_indices]
return meta
@ -51,32 +51,30 @@ class VideoTextDataset(torch.utils.data.Dataset):
self.frame_inds = frame_inds
def __getitem__(self, index):
row = self.meta.iloc[index]
images = extract_frames(row["path"], frame_inds=self.frame_inds, backend="opencv")
sample = self.meta.iloc[index]
path = sample["path"]
# extract frames
images = extract_frames(path, frame_inds=self.frame_inds, backend="opencv")
# transform
images = torch.stack([pil_to_tensor(x) for x in images]) # shape: [N, C, H, W]; dtype: torch.uint8
images = torch.stack([pil_to_tensor(x) for x in images])
# stack
# shape: [N, C, H, W]; dtype: torch.uint8
images = images.float()
H, W = images.shape[-2:]
if H > W:
images = rearrange(images, "N C H W -> N C W H")
images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)
return images, index
ret = dict(index=index, images=images)
return ret
def __len__(self):
return len(self.meta)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=4, help="Batch size") # don't use too large bs for unimatch
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()
return args
def main():
args = parse_args()
@ -124,10 +122,11 @@ def main():
indices_list = []
scores_list = []
model.eval()
for images, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
images = images.to(device)
B = images.shape[0]
for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
indices = batch["index"]
images = batch["images"].to(device, non_blocking=True)
B = images.shape[0]
batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()
@ -148,10 +147,10 @@ def main():
flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
flow_scores = flow_scores.tolist()
indices_list.extend(indices)
indices_list.extend(indices.tolist())
scores_list.extend(flow_scores)
# jun 3 quickfix
# save local results
meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="flow")
out_path_local = out_path.replace(".csv", f"_part_{dist.get_rank()}.csv")
meta_local.to_csv(out_path_local, index=False)
@ -159,6 +158,8 @@ def main():
# wait for all ranks to finish data processing
dist.barrier()
torch.cuda.empty_cache()
gc.collect()
gathered_list = [None] * dist.get_world_size()
dist.all_gather_object(gathered_list, (indices_list, scores_list))
if dist.get_rank() == 0:
@ -167,5 +168,14 @@ def main():
print(f"New meta with optical flow scores saved to '{out_path}'.")
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=4, help="Batch size") # don't use too large bs for unimatch
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()
return args
if __name__ == "__main__":
main()