diff --git a/configs/opensora-v1-2/train/stage1.py b/configs/opensora-v1-2/train/stage1.py index faea0e3..57bb7d2 100644 --- a/configs/opensora-v1-2/train/stage1.py +++ b/configs/opensora-v1-2/train/stage1.py @@ -108,3 +108,4 @@ grad_clip = 1.0 lr = 1e-4 ema_decay = 0.99 adam_eps = 1e-15 +warmup_steps = 1000 diff --git a/configs/opensora-v1-2/train/stage3.py b/configs/opensora-v1-2/train/stage3.py new file mode 100644 index 0000000..2595d4d --- /dev/null +++ b/configs/opensora-v1-2/train/stage3.py @@ -0,0 +1,92 @@ +# Dataset settings +dataset = dict( + type="VariableVideoTextDataset", + transform_name="resize_crop", +) + +# webvid +bucket_config = { # 20s/it + "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)}, + # --- + "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)}, + "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)}, + # --- + "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)}, + "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)}, + # --- + "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)}, + # --- + "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)}, + "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)}, + # --- + "1080p": {1: (0.1, 5)}, + # --- + "2048": {1: (0.05, 5)}, +} + +grad_checkpoint = True + +# Acceleration settings +num_workers = 8 +num_bucket_build_workers = 16 +dtype = "bf16" +plugin = "zero2" + +# Model settings +model = dict( + type="STDiT3-XL/2", + from_pretrained=None, + qk_norm=True, + enable_flash_attn=True, + enable_layernorm_kernel=True, + freeze_y_embedder=True, +) +vae = dict( + type="OpenSoraVAE_V1_2", + from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline", + micro_frame_size=17, + micro_batch_size=4, +) +text_encoder = dict( + type="t5", + from_pretrained="DeepFloyd/t5-v1_1-xxl", + model_max_length=300, + shardformer=True, + local_files_only=True, +) +scheduler = dict( + type="rflow", + use_timestep_transform=True, + sample_method="logit-normal", +) + +# Mask settings +# 25% +mask_ratios = { + "random": 0.01, + "intepolate": 0.002, + "quarter_random": 0.002, + "quarter_head": 0.002, + "quarter_tail": 0.002, + "quarter_head_tail": 0.002, + "image_random": 0.0, + "image_head": 0.22, + "image_tail": 0.005, + "image_head_tail": 0.005, +} + +# Log settings +seed = 42 +outputs = "outputs" +wandb = False +epochs = 1000 +log_every = 10 +ckpt_every = 200 + +# optimization settings +load = None +grad_clip = 1.0 +lr = 1e-4 +ema_decay = 0.99 +adam_eps = 1e-15 +warmup_steps = 1000 diff --git a/tools/datasets/datautil.py b/tools/datasets/datautil.py index 4654f76..f972dfd 100644 --- a/tools/datasets/datautil.py +++ b/tools/datasets/datautil.py @@ -512,12 +512,6 @@ def main(args): data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner") print(f"Intersection number of samples: {len(data)}.") - # train columns - if args.train_column: - all_columns = data.columns - columns_to_drop = all_columns.difference(TRAIN_COLUMNS) - data = data.drop(columns=columns_to_drop) - # get output path output_path = get_output_path(args, input_name) @@ -593,6 +587,8 @@ def main(args): if args.append_text is not None: assert "text" in data.columns data["text"] = data["text"] + args.append_text + if args.score_to_text: + data["text"] = apply(data, score_to_text, axis=1) if args.clean_caption: assert "text" in data.columns data["text"] = apply( @@ -602,8 +598,6 @@ def main(args): if args.count_num_token is not None: assert "text" in data.columns data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"])) - if args.score_to_text: - data["text"] = apply(data, score_to_text, axis=1) if args.update_text is not None: data_new = pd.read_csv(args.update_text) num_updated = data.path.isin(data_new.path).sum() @@ -667,6 +661,12 @@ def main(args): if args.head is not None: data = data.head(args.head) + # train columns + if args.train_column: + all_columns = data.columns + columns_to_drop = all_columns.difference(TRAIN_COLUMNS) + data = data.drop(columns=columns_to_drop) + print(f"Filtered number of samples: {len(data)}.") # shard data