mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-12 13:54:53 +02:00
Merge branch 'dev/v1.2' into feature/vbench_i2v
This commit is contained in:
commit
ac97bc6baa
|
|
@ -8,20 +8,20 @@ dataset = dict(
|
|||
bucket_config = { # 12s/it
|
||||
"144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
|
||||
# ---
|
||||
"256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
|
||||
"240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
|
||||
"256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)},
|
||||
"240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)},
|
||||
# ---
|
||||
"360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
|
||||
"512": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
|
||||
"360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)},
|
||||
"512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)},
|
||||
# ---
|
||||
"480p": {1: (0.1, 89), 51: (0.1, 5), 102: (0.1, 2), 204: (0.1, 1)},
|
||||
"480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)},
|
||||
# ---
|
||||
"720p": {1: (0.05, 36), 51: (0.1, 1)},
|
||||
"1024": {1: (0.05, 36), 51: (0.1, 1)},
|
||||
"720p": {1: (0.1, 36), 51: (0.03, 1)},
|
||||
"1024": {1: (0.1, 36), 51: (0.02, 1)},
|
||||
# ---
|
||||
"1080p": {1: (0.1, 5)},
|
||||
"1080p": {1: (0.01, 5)},
|
||||
# ---
|
||||
"2048": {1: (0.1, 5)},
|
||||
"2048": {1: (0.01, 5)},
|
||||
}
|
||||
|
||||
grad_checkpoint = True
|
||||
|
|
@ -88,3 +88,5 @@ grad_clip = 1.0
|
|||
lr = 1e-4
|
||||
ema_decay = 0.99
|
||||
adam_eps = 1e-15
|
||||
warmup_steps = 1000
|
||||
|
||||
|
|
|
|||
22
opensora/utils/lr_scheduler.py
Normal file
22
opensora/utils/lr_scheduler.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
from torch.optim.lr_scheduler import _LRScheduler
|
||||
|
||||
|
||||
class LinearWarmupLR(_LRScheduler):
    """Linearly warm up the learning rate, then hold it constant.

    During the first ``warmup_steps`` steps the learning rate grows linearly
    from ``lr / (warmup_steps + 1)`` up to the base ``lr``; after warmup it
    stays fixed at the base ``lr`` (no decay is applied).

    Args:
        optimizer (:class:`torch.optim.Optimizer`): Wrapped optimizer.
        warmup_steps (int, optional): Number of warmup steps, defaults to 0.
        last_epoch (int, optional): Index of the last step, defaults to -1.
            With -1 the schedule starts from the beginning, i.e. the initial
            lr is the base lr scaled by the first warmup factor.
    """

    def __init__(self, optimizer, warmup_steps: int = 0, last_epoch: int = -1):
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        # Linear ramp: step t (0-based) scales each base lr by
        # (t + 1) / (warmup_steps + 1); once t >= warmup_steps the full
        # base lr is returned unchanged.
        if self.last_epoch < self.warmup_steps:
            return [(self.last_epoch + 1) / (self.warmup_steps + 1) * lr for lr in self.base_lrs]
        else:
            return self.base_lrs
|
||||
|
|
@ -20,6 +20,7 @@ lingua-language-detector==2.0.2
|
|||
imageio>=2.34.1
|
||||
|
||||
# [aesthetic]
|
||||
setuptools==68.2.2
|
||||
clip @ git+https://github.com/openai/CLIP.git
|
||||
|
||||
# [ocr]
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from opensora.acceleration.checkpoint import set_grad_checkpoint
|
|||
from opensora.acceleration.parallel_states import get_data_parallel_group
|
||||
from opensora.datasets.dataloader import prepare_dataloader
|
||||
from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
|
||||
from opensora.utils.lr_scheduler import LinearWarmupLR
|
||||
from opensora.utils.ckpt_utils import load, model_gathering, model_sharding, record_model_param_shape, save
|
||||
from opensora.utils.config_utils import define_experiment_workspace, parse_configs, save_training_config
|
||||
from opensora.utils.misc import (
|
||||
|
|
@ -169,7 +170,13 @@ def main():
|
|||
weight_decay=cfg.get("weight_decay", 0),
|
||||
eps=cfg.get("adam_eps", 1e-8),
|
||||
)
|
||||
lr_scheduler = None
|
||||
|
||||
warmup_steps = cfg.get("warmup_steps", None)
|
||||
|
||||
if warmup_steps is None:
|
||||
lr_scheduler = None
|
||||
else:
|
||||
lr_scheduler = LinearWarmupLR(optimizer, warmup_steps=cfg.get("warmup_steps"))
|
||||
|
||||
# == additional preparation ==
|
||||
if cfg.get("grad_checkpoint", False):
|
||||
|
|
@ -288,6 +295,10 @@ def main():
|
|||
booster.backward(loss=loss, optimizer=optimizer)
|
||||
optimizer.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# update learning rate
|
||||
if lr_scheduler is not None:
|
||||
lr_scheduler.step()
|
||||
coordinator.block_all()
|
||||
timer_list.append(backward_t)
|
||||
|
||||
|
|
@ -323,6 +334,7 @@ def main():
|
|||
"loss": loss.item(),
|
||||
"avg_loss": avg_loss,
|
||||
"acc_step": acc_step,
|
||||
"lr": optimizer.param_groups[0]["lr"],
|
||||
"move_data_time": move_data_t.elapsed_time,
|
||||
"encode_time": encode_t.elapsed_time,
|
||||
"mask_time": mask_t.elapsed_time,
|
||||
|
|
|
|||
30
tests/test_lr_scheduler.py
Normal file
30
tests/test_lr_scheduler.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
import torch
|
||||
from torch.optim import Adam
|
||||
from torchvision.models import resnet50
|
||||
|
||||
from opensora.utils.lr_scheduler import LinearWarmupLR
|
||||
|
||||
|
||||
def test_lr_scheduler():
    """Check LinearWarmupLR: the lr strictly increases for the first
    ``warmup_steps`` steps, then stays pinned at the base lr.

    Uses a tiny linear model so the test runs in milliseconds, and falls
    back to CPU so it does not hard-require a GPU.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = torch.nn.Linear(4, 2).to(device)
    optimizer = Adam(model.parameters(), lr=0.01)
    scheduler = LinearWarmupLR(optimizer, warmup_steps=10)
    current_lr = scheduler.get_lr()[0]
    data = torch.rand(8, 4, device=device)

    for i in range(100):
        optimizer.zero_grad()
        out = model(data)
        out.mean().backward()

        optimizer.step()
        scheduler.step()

        if i >= 10:
            # Warmup finished: lr must equal the base lr exactly.
            assert scheduler.get_lr()[0] == 0.01
        else:
            # Still warming up: lr must strictly increase every step.
            assert scheduler.get_lr()[0] > current_lr, f"{scheduler.get_lr()[0]} <= {current_lr}"
        current_lr = scheduler.get_lr()[0]


if __name__ == "__main__":
    test_lr_scheduler()
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
# adapted from https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py
|
||||
import cv2 # isort:skip
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import os
|
||||
from datetime import timedelta
|
||||
|
||||
|
|
@ -11,7 +14,6 @@ import torch.distributed as dist
|
|||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from einops import rearrange
|
||||
from PIL import Image
|
||||
from torch.utils.data import DataLoader, DistributedSampler
|
||||
from torchvision.datasets.folder import pil_loader
|
||||
from tqdm import tqdm
|
||||
|
|
@ -24,6 +26,7 @@ NUM_FRAMES_POINTS = {
|
|||
3: (0.1, 0.5, 0.9),
|
||||
}
|
||||
|
||||
|
||||
def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
|
||||
# reorder
|
||||
indices_list = list(map(lambda x: x[0], gathered_list))
|
||||
|
|
@ -41,32 +44,36 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
|
|||
# filter duplicates
|
||||
unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
|
||||
meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
|
||||
|
||||
# jun 3 quickfix
|
||||
# lose indices in meta not in unique_indices
|
||||
|
||||
# drop indices in meta not in unique_indices
|
||||
meta = meta.loc[unique_indices]
|
||||
return meta
|
||||
|
||||
|
||||
class VideoTextDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, csv_path, transform=None, num_frames=3):
|
||||
self.csv_path = csv_path
|
||||
self.meta = pd.read_csv(csv_path)
|
||||
def __init__(self, meta_path, transform=None, num_frames=3):
|
||||
self.meta_path = meta_path
|
||||
self.meta = pd.read_csv(meta_path)
|
||||
self.transform = transform
|
||||
self.points = NUM_FRAMES_POINTS[num_frames]
|
||||
|
||||
def __getitem__(self, index):
|
||||
sample = self.meta.iloc[index]
|
||||
path = sample["path"]
|
||||
|
||||
# extract frames
|
||||
if not is_video(path):
|
||||
images = [pil_loader(path)]
|
||||
else:
|
||||
num_frames = None
|
||||
if "num_frames" in sample:
|
||||
num_frames = sample["num_frames"]
|
||||
num_frames = sample["num_frames"] if "num_frames" in sample else None
|
||||
images = extract_frames(sample["path"], points=self.points, backend="opencv", num_frames=num_frames)
|
||||
|
||||
# transform
|
||||
images = [self.transform(img) for img in images]
|
||||
|
||||
# stack
|
||||
images = torch.stack(images)
|
||||
|
||||
ret = dict(index=index, images=images)
|
||||
return ret
|
||||
|
||||
|
|
@ -97,7 +104,6 @@ class AestheticScorer(nn.Module):
|
|||
def __init__(self, input_size, device):
|
||||
super().__init__()
|
||||
self.mlp = MLP(input_size)
|
||||
self.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth"))
|
||||
self.clip, self.preprocess = clip.load("ViT-L/14", device=device)
|
||||
|
||||
self.eval()
|
||||
|
|
@ -122,6 +128,7 @@ def main():
|
|||
# build model
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model = AestheticScorer(768, device)
|
||||
model.mlp.load_state_dict(torch.load("pretrained_models/aesthetic.pth", map_location=device))
|
||||
preprocess = model.preprocess
|
||||
|
||||
# build dataset
|
||||
|
|
@ -138,7 +145,7 @@ def main():
|
|||
drop_last=False,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
# compute aesthetic scores
|
||||
indices_list = []
|
||||
scores_list = []
|
||||
|
|
@ -153,26 +160,28 @@ def main():
|
|||
# compute score
|
||||
with torch.no_grad():
|
||||
scores = model(images)
|
||||
|
||||
scores = rearrange(scores, "(B N) 1 -> B N", B=B)
|
||||
scores = scores.mean(dim=1)
|
||||
scores_np = scores.to(torch.float32).cpu().numpy()
|
||||
|
||||
indices_list.extend(indices)
|
||||
scores_list.extend(scores_np)
|
||||
|
||||
# jun 3 quickfix
|
||||
meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column='aes')
|
||||
out_path_local = out_path.replace('.csv', f'_part_{dist.get_rank()}.csv')
|
||||
indices_list.extend(indices.tolist())
|
||||
scores_list.extend(scores_np.tolist())
|
||||
|
||||
# save local results
|
||||
meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="aes")
|
||||
out_path_local = out_path.replace(".csv", f"_part_{dist.get_rank()}.csv")
|
||||
meta_local.to_csv(out_path_local, index=False)
|
||||
|
||||
# wait for all ranks to finish data processing
|
||||
dist.barrier()
|
||||
dist.barrier()
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
gathered_list = [None] * dist.get_world_size()
|
||||
breakpoint()
|
||||
dist.all_gather_object(gathered_list, (indices_list, scores_list))
|
||||
if dist.get_rank() == 0:
|
||||
meta_new = merge_scores(gathered_list, dataset.meta, column='aes')
|
||||
meta_new = merge_scores(gathered_list, dataset.meta, column="aes")
|
||||
meta_new.to_csv(out_path, index=False)
|
||||
print(f"New meta with aesthetic scores saved to '{out_path}'.")
|
||||
|
||||
|
|
@ -182,11 +191,12 @@ def parse_args():
|
|||
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("--bs", type=int, default=1024, help="Batch size")
|
||||
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
|
||||
parser.add_argument("--prefetch_factor", type=int, default=2, help="Prefetch factor")
|
||||
parser.add_argument("--prefetch_factor", type=int, default=3, help="Prefetch factor")
|
||||
parser.add_argument("--num_frames", type=int, default=3, help="Number of frames to extract")
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import cv2 # isort:skip
|
||||
|
||||
import argparse
|
||||
import gc
|
||||
import os
|
||||
from datetime import timedelta
|
||||
|
||||
|
|
@ -38,8 +39,7 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
|
|||
unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
|
||||
meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
|
||||
|
||||
# jun 3 quickfix
|
||||
# lose indices in meta not in unique_indices
|
||||
# drop indices in meta not in unique_indices
|
||||
meta = meta.loc[unique_indices]
|
||||
return meta
|
||||
|
||||
|
|
@ -51,32 +51,30 @@ class VideoTextDataset(torch.utils.data.Dataset):
|
|||
self.frame_inds = frame_inds
|
||||
|
||||
def __getitem__(self, index):
|
||||
row = self.meta.iloc[index]
|
||||
images = extract_frames(row["path"], frame_inds=self.frame_inds, backend="opencv")
|
||||
sample = self.meta.iloc[index]
|
||||
path = sample["path"]
|
||||
|
||||
# extract frames
|
||||
images = extract_frames(path, frame_inds=self.frame_inds, backend="opencv")
|
||||
|
||||
# transform
|
||||
images = torch.stack([pil_to_tensor(x) for x in images]) # shape: [N, C, H, W]; dtype: torch.uint8
|
||||
images = torch.stack([pil_to_tensor(x) for x in images])
|
||||
|
||||
# stack
|
||||
# shape: [N, C, H, W]; dtype: torch.uint8
|
||||
images = images.float()
|
||||
H, W = images.shape[-2:]
|
||||
if H > W:
|
||||
images = rearrange(images, "N C H W -> N C W H")
|
||||
images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)
|
||||
|
||||
return images, index
|
||||
ret = dict(index=index, images=images)
|
||||
return ret
|
||||
|
||||
def __len__(self):
|
||||
return len(self.meta)
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line arguments for the optical-flow scoring script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
    # keep the default batch size small: unimatch is memory-hungry
    parser.add_argument("--bs", type=int, default=4, help="Batch size")
    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
|
|
@ -124,10 +122,11 @@ def main():
|
|||
indices_list = []
|
||||
scores_list = []
|
||||
model.eval()
|
||||
for images, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
|
||||
images = images.to(device)
|
||||
B = images.shape[0]
|
||||
for batch in tqdm(dataloader, disable=dist.get_rank() != 0):
|
||||
indices = batch["index"]
|
||||
images = batch["images"].to(device, non_blocking=True)
|
||||
|
||||
B = images.shape[0]
|
||||
batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
|
||||
batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()
|
||||
|
||||
|
|
@ -148,10 +147,10 @@ def main():
|
|||
flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
|
||||
flow_scores = flow_scores.tolist()
|
||||
|
||||
indices_list.extend(indices)
|
||||
indices_list.extend(indices.tolist())
|
||||
scores_list.extend(flow_scores)
|
||||
|
||||
# jun 3 quickfix
|
||||
# save local results
|
||||
meta_local = merge_scores([(indices_list, scores_list)], dataset.meta, column="flow")
|
||||
out_path_local = out_path.replace(".csv", f"_part_{dist.get_rank()}.csv")
|
||||
meta_local.to_csv(out_path_local, index=False)
|
||||
|
|
@ -159,6 +158,8 @@ def main():
|
|||
# wait for all ranks to finish data processing
|
||||
dist.barrier()
|
||||
|
||||
torch.cuda.empty_cache()
|
||||
gc.collect()
|
||||
gathered_list = [None] * dist.get_world_size()
|
||||
dist.all_gather_object(gathered_list, (indices_list, scores_list))
|
||||
if dist.get_rank() == 0:
|
||||
|
|
@ -167,5 +168,14 @@ def main():
|
|||
print(f"New meta with optical flow scores saved to '{out_path}'.")
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line arguments for the optical-flow scoring script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
    # don't use too large bs for unimatch
    parser.add_argument("--bs", type=int, default=4, help="Batch size")
    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Reference in a new issue