From ddbb130fe753ba679cb9bfb0fcfdd5eecda57758 Mon Sep 17 00:00:00 2001 From: Frank Lee Date: Fri, 15 Mar 2024 21:53:49 +0800 Subject: [PATCH] added configs and tests (#65) --- configs/dit/inference/16x256x256.py | 32 ++++++++++++ configs/dit/inference/1x256x256-class.py | 32 ++++++++++++ configs/dit/inference/1x256x256.py | 33 ++++++++++++ configs/dit/train/16x256x256.py | 51 +++++++++++++++++++ configs/dit/train/1x256x256.py | 51 +++++++++++++++++++ configs/latte/inference/16x256x256-class.py | 31 ++++++++++++ configs/latte/inference/16x256x256.py | 32 ++++++++++++ configs/latte/train/16x256x256.py | 50 ++++++++++++++++++ configs/opensora/inference/16x256x256.py | 36 +++++++++++++ configs/opensora/inference/16x512x512.py | 36 +++++++++++++ configs/opensora/inference/64x512x512-v2.py | 36 +++++++++++++ configs/opensora/inference/64x512x512.py | 35 +++++++++++++ configs/opensora/train/16x256x256.py | 55 ++++++++++++++++++++ configs/opensora/train/16x512x512.py | 55 ++++++++++++++++++++ configs/opensora/train/64x512x512-v2.py | 55 ++++++++++++++++++++ configs/opensora/train/64x512x512.py | 56 +++++++++++++++++++++ configs/pixart/inference/16x256x256.py | 33 ++++++++++++ configs/pixart/inference/1x1024MS.py | 35 +++++++++++++ configs/pixart/inference/1x256x256.py | 36 +++++++++++++ configs/pixart/inference/1x512x512.py | 34 +++++++++++++ configs/pixart/train/16x256x256.py | 55 ++++++++++++++++++++ configs/pixart/train/1x512x512.py | 55 ++++++++++++++++++++ 22 files changed, 924 insertions(+) create mode 100644 configs/dit/inference/16x256x256.py create mode 100644 configs/dit/inference/1x256x256-class.py create mode 100644 configs/dit/inference/1x256x256.py create mode 100644 configs/dit/train/16x256x256.py create mode 100644 configs/dit/train/1x256x256.py create mode 100644 configs/latte/inference/16x256x256-class.py create mode 100644 configs/latte/inference/16x256x256.py create mode 100644 configs/latte/train/16x256x256.py create mode 100644 
configs/opensora/inference/16x256x256.py create mode 100644 configs/opensora/inference/16x512x512.py create mode 100644 configs/opensora/inference/64x512x512-v2.py create mode 100644 configs/opensora/inference/64x512x512.py create mode 100644 configs/opensora/train/16x256x256.py create mode 100644 configs/opensora/train/16x512x512.py create mode 100644 configs/opensora/train/64x512x512-v2.py create mode 100644 configs/opensora/train/64x512x512.py create mode 100644 configs/pixart/inference/16x256x256.py create mode 100644 configs/pixart/inference/1x1024MS.py create mode 100644 configs/pixart/inference/1x256x256.py create mode 100644 configs/pixart/inference/1x512x512.py create mode 100644 configs/pixart/train/16x256x256.py create mode 100644 configs/pixart/train/1x512x512.py diff --git a/configs/dit/inference/16x256x256.py b/configs/dit/inference/16x256x256.py new file mode 100644 index 0000000..ca4c416 --- /dev/null +++ b/configs/dit/inference/16x256x256.py @@ -0,0 +1,32 @@ +# sample size +num_frames = 16 +fps = 8 +image_size = (256, 256) + +# model config +model = dict( + type="DiT-XL/2", + condition="text", + from_pretrained="YOUR_MODEL_PATH", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="clip", + from_pretrained="openai/clip-vit-base-patch32", + model_max_length=77, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=20, + cfg_scale=4.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/ucf101_labels.txt" +save_dir = "./samples/" diff --git a/configs/dit/inference/1x256x256-class.py b/configs/dit/inference/1x256x256-class.py new file mode 100644 index 0000000..a760922 --- /dev/null +++ b/configs/dit/inference/1x256x256-class.py @@ -0,0 +1,32 @@ +# sample size +num_frames = 1 +fps = 1 +image_size = (256, 256) + +# model config +model = dict( + type="DiT-XL/2", + no_temporal_pos_emb=True, + condition="label_1000", + 
from_pretrained="DiT-XL-2-256x256.pt", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="classes", + num_classes=1000, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=20, + cfg_scale=4.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/imagenet_id.txt" +save_dir = "./samples/" diff --git a/configs/dit/inference/1x256x256.py b/configs/dit/inference/1x256x256.py new file mode 100644 index 0000000..eab57d1 --- /dev/null +++ b/configs/dit/inference/1x256x256.py @@ -0,0 +1,33 @@ +# sample size +num_frames = 1 +fps = 1 +image_size = (256, 256) + +# model config +model = dict( + type="DiT-XL/2", + no_temporal_pos_emb=True, + condition="text", + from_pretrained="YOUR_MODEL_PATH", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="clip", + from_pretrained="openai/clip-vit-base-patch32", + model_max_length=77, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=20, + cfg_scale=4.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/imagenet_labels.txt" +save_dir = "./samples/" diff --git a/configs/dit/train/16x256x256.py b/configs/dit/train/16x256x256.py new file mode 100644 index 0000000..df809e5 --- /dev/null +++ b/configs/dit/train/16x256x256.py @@ -0,0 +1,51 @@ +# sample size +num_frames = 16 +frame_interval = 3 +image_size = (256, 256) + +# dataset +root = None +data_path = "/mnt/hdd/data/csv/ucf101_videos.csv" +use_image_transform = False +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="DiT-XL/2", + from_pretrained="DiT-XL-2-256x256.pt", + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + 
type="clip", + from_pretrained="openai/clip-vit-base-patch32", + model_max_length=77, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 8 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/dit/train/1x256x256.py b/configs/dit/train/1x256x256.py new file mode 100644 index 0000000..ee9dde0 --- /dev/null +++ b/configs/dit/train/1x256x256.py @@ -0,0 +1,51 @@ +# sample size +num_frames = 1 +frame_interval = 1 +image_size = (256, 256) + +# dataset +root = None +data_path = "/mnt/hdd/data/csv/imagenet_train.csv" +use_image_transform = True +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="DiT-XL/2", + no_temporal_pos_emb=True, + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="clip", + from_pretrained="openai/clip-vit-base-patch32", + model_max_length=77, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 128 +lr = 1e-4 # according to DiT repo +grad_clip = 1.0 diff --git a/configs/latte/inference/16x256x256-class.py b/configs/latte/inference/16x256x256-class.py new file mode 100644 index 0000000..db2bfbf --- /dev/null +++ b/configs/latte/inference/16x256x256-class.py @@ -0,0 +1,31 @@ +# sample size +num_frames = 16 +fps = 8 +image_size = (256, 256) + +# model config +model = dict( + type="Latte-XL/2", + condition="label_101", + from_pretrained="Latte-XL-2-256x256-ucf101.pt", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="classes", + num_classes=101, +) +scheduler = 
dict( + type="dpm-solver", + num_sampling_steps=20, + cfg_scale=4.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/ucf101_id.txt" +save_dir = "./samples/" diff --git a/configs/latte/inference/16x256x256.py b/configs/latte/inference/16x256x256.py new file mode 100644 index 0000000..a6d1986 --- /dev/null +++ b/configs/latte/inference/16x256x256.py @@ -0,0 +1,32 @@ +# sample size +num_frames = 16 +fps = 8 +image_size = (256, 256) + +# model config +model = dict( + type="Latte-XL/2", + condition="text", + from_pretrained="YOUR_MODEL_PATH", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="clip", + from_pretrained="openai/clip-vit-base-patch32", + model_max_length=77, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=20, + cfg_scale=4.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/ucf101_labels.txt" +save_dir = "./samples/" diff --git a/configs/latte/train/16x256x256.py b/configs/latte/train/16x256x256.py new file mode 100644 index 0000000..685996d --- /dev/null +++ b/configs/latte/train/16x256x256.py @@ -0,0 +1,50 @@ +# sample size +num_frames = 16 +frame_interval = 3 +image_size = (256, 256) + +# dataset +root = None +data_path = "/mnt/hdd/data/csv/ucf101_videos.csv" +use_image_transform = False +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="Latte-XL/2", + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="clip", + from_pretrained="openai/clip-vit-base-patch32", + model_max_length=77, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = 
None + +batch_size = 8 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora/inference/16x256x256.py b/configs/opensora/inference/16x256x256.py new file mode 100644 index 0000000..1970635 --- /dev/null +++ b/configs/opensora/inference/16x256x256.py @@ -0,0 +1,36 @@ +# sample size +num_frames = 16 +fps = 24 // 3 +image_size = (256, 256) + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=0.5, + time_scale=1.0, + from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt", + # from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + # type="iddpm", + # num_sampling_steps=250, + type = "dpm-solver", + num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/t2v_samples.txt" +save_dir = "./samples/" diff --git a/configs/opensora/inference/16x512x512.py b/configs/opensora/inference/16x512x512.py new file mode 100644 index 0000000..763d47a --- /dev/null +++ b/configs/opensora/inference/16x512x512.py @@ -0,0 +1,36 @@ +# sample size +num_frames = 16 +fps = 24 // 3 +image_size = (512, 512) + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=1.0, + from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + split=8, +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + type="iddpm", + num_sampling_steps=100, + # type = "dpm-solver", + # num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = 
"./assets/texts/t2v_samples.txt" +save_dir = "./samples/" diff --git a/configs/opensora/inference/64x512x512-v2.py b/configs/opensora/inference/64x512x512-v2.py new file mode 100644 index 0000000..fd851b7 --- /dev/null +++ b/configs/opensora/inference/64x512x512-v2.py @@ -0,0 +1,36 @@ +# sample size +num_frames = 64 +fps = 24 // 2 +image_size = (512, 512) + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=2 / 3, + from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + split=8, +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + type="iddpm", + num_sampling_steps=100, + # type = "dpm-solver", + # num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 1 +seed = 42 +prompt_path = "./assets/texts/t2v_samples.txt" +save_dir = "./samples" diff --git a/configs/opensora/inference/64x512x512.py b/configs/opensora/inference/64x512x512.py new file mode 100644 index 0000000..ccd83dc --- /dev/null +++ b/configs/opensora/inference/64x512x512.py @@ -0,0 +1,35 @@ +# sample size +num_frames = 64 +fps = 24 // 2 +image_size = (512, 512) + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=2 / 3, + from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + type="iddpm", + num_sampling_steps=100, + # type = "dpm-solver", + # num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 1 +seed = 42 +prompt_path = "./assets/texts/t2v_samples.txt" +save_dir = "./samples" diff --git a/configs/opensora/train/16x256x256.py 
b/configs/opensora/train/16x256x256.py new file mode 100644 index 0000000..ad2a62e --- /dev/null +++ b/configs/opensora/train/16x256x256.py @@ -0,0 +1,55 @@ +# sample size +num_frames = 16 +frame_interval = 3 +image_size = (256, 256) + +# dataset +root = None +# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv" +data_path = "/mnt/hdd/data/csv/ucf101_videos.csv" +use_image_transform = False +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=0.5, + time_scale=1.0, + from_pretrained="PixArt-XL-2-512x512.pth", + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 8 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora/train/16x512x512.py b/configs/opensora/train/16x512x512.py new file mode 100644 index 0000000..edcd92f --- /dev/null +++ b/configs/opensora/train/16x512x512.py @@ -0,0 +1,55 @@ +# sample size +num_frames = 16 +frame_interval = 3 +image_size = (512, 512) + +# dataset +root = None +data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv" +use_image_transform = False +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=1.0, + from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt", + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + 
from_pretrained="stabilityai/sd-vae-ft-ema", + split=4, +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 500 +load = None + +batch_size = 8 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora/train/64x512x512-v2.py b/configs/opensora/train/64x512x512-v2.py new file mode 100644 index 0000000..f20a8f3 --- /dev/null +++ b/configs/opensora/train/64x512x512-v2.py @@ -0,0 +1,55 @@ +# sample size +num_frames = 64 +frame_interval = 2 +image_size = (512, 512) + +# dataset +root = None +data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv" +use_image_transform = False +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=2 / 3, + from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt", + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + split=8, +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 250 +load = None + +batch_size = 4 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/opensora/train/64x512x512.py b/configs/opensora/train/64x512x512.py new file mode 100644 index 0000000..642abe4 --- /dev/null +++ b/configs/opensora/train/64x512x512.py @@ -0,0 +1,56 @@ +# sample size +num_frames = 64 +frame_interval = 2 +image_size = (512, 512) + +# dataset +root = None +# data_path = 
"/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv" +data_path = "/mnt/hdd/data/csv/ucf101_videos.csv" +use_image_transform = False +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2-seq" +sp_size = 2 + +# model config +model = dict( + type="STDiT-XL/2", + space_scale=1.0, + time_scale=2 / 3, + from_pretrained="PixArt-XL-2-512x512.pth", + enable_flashattn=True, + enable_layernorm_kernel=True, + enable_sequence_parallelism=True, # enable sequence parallelism here +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 1 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/pixart/inference/16x256x256.py b/configs/pixart/inference/16x256x256.py new file mode 100644 index 0000000..5fc0a98 --- /dev/null +++ b/configs/pixart/inference/16x256x256.py @@ -0,0 +1,33 @@ +# sample size +num_frames = 16 +fps = 8 +image_size = (256, 256) + +# model config +model = dict( + type="PixArt-XL/2", + space_scale=0.5, + time_scale=1.0, + from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/t2v_samples.txt" +save_dir = "./samples/" diff --git a/configs/pixart/inference/1x1024MS.py b/configs/pixart/inference/1x1024MS.py new file mode 100644 index 0000000..5795fbd --- /dev/null +++ 
b/configs/pixart/inference/1x1024MS.py @@ -0,0 +1,35 @@ +# sample size +num_frames = 1 +fps = 1 +image_size = (1920, 512) +multi_resolution = True + +# model config +model = dict( + type="PixArtMS-XL/2", + space_scale=2.0, + time_scale=1.0, + no_temporal_pos_emb=True, + from_pretrained="PixArt-XL-2-1024-MS.pth", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + type = "dpm-solver", + num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/t2i_samples.txt" +save_dir = "./samples/" diff --git a/configs/pixart/inference/1x256x256.py b/configs/pixart/inference/1x256x256.py new file mode 100644 index 0000000..fa89c56 --- /dev/null +++ b/configs/pixart/inference/1x256x256.py @@ -0,0 +1,36 @@ +# sample size +num_frames = 1 +fps = 1 +image_size = (256, 256) + +# model config +model = dict( + type="PixArt-XL/2", + space_scale=1.0, + time_scale=1.0, + no_temporal_pos_emb=True, + from_pretrained="PixArt-XL-2-256x256.pth", +) +vae = dict( + # type="VideoAutoencoderKL", + # from_pretrained="stabilityai/sd-vae-ft-ema", + type="VideoAutoencoderKLTemporalDecoder", + from_pretrained="pretrained_models/vae_temporal_decoder" +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + type="dpm-solver", + num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/t2i_samples.txt" +save_dir = "./samples/" diff --git a/configs/pixart/inference/1x512x512.py b/configs/pixart/inference/1x512x512.py new file mode 100644 index 0000000..6e4daa0 --- /dev/null +++ b/configs/pixart/inference/1x512x512.py @@ -0,0 +1,34 @@ +# sample size +num_frames = 1 +fps = 1 +image_size = (512, 512) + +# model config 
+model = dict( + type="PixArt-XL/2", + space_scale=1.0, + time_scale=1.0, + no_temporal_pos_emb=True, + from_pretrained="PixArt-XL-2-512x512.pth", +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, +) +scheduler = dict( + type = "dpm-solver", + num_sampling_steps=20, + cfg_scale=7.0, +) +dtype = "fp16" + +# prompts +batch_size = 2 +seed = 42 +prompt_path = "./assets/texts/t2i_samples.txt" +save_dir = "./samples/" diff --git a/configs/pixart/train/16x256x256.py b/configs/pixart/train/16x256x256.py new file mode 100644 index 0000000..4db54ff --- /dev/null +++ b/configs/pixart/train/16x256x256.py @@ -0,0 +1,55 @@ +# sample size +num_frames = 16 +frame_interval = 3 +image_size = (256, 256) + +# dataset +root = None +# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv" +data_path = "/mnt/hdd/data/csv/ucf101_videos.csv" +use_image_transform = False +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="PixArt-XL/2", + space_scale=0.5, + time_scale=1.0, + from_pretrained="PixArt-XL-2-512x512.pth", + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 8 +lr = 2e-5 +grad_clip = 1.0 diff --git a/configs/pixart/train/1x512x512.py b/configs/pixart/train/1x512x512.py new file mode 100644 index 0000000..56dc00a --- /dev/null +++ b/configs/pixart/train/1x512x512.py @@ -0,0 +1,55 @@ +# sample size 
+num_frames = 1 +frame_interval = 1 +image_size = (512, 512) + +# dataset +root = None +data_path = "/mnt/hdd/data/csv/imagenet_train.csv" +use_image_transform = True +num_workers = 4 + +# acceleration +dtype = "fp16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +# model config +model = dict( + type="PixArt-XL/2", + space_scale=1.0, + time_scale=1.0, + no_temporal_pos_emb=True, + from_pretrained="PixArt-XL-2-512x512.pth", + enable_flashattn=True, + enable_layernorm_kernel=True, +) +vae = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", +) +text_encoder = dict( + type="t5", + from_pretrained="./pretrained_models/t5_ckpts", + model_max_length=120, + shardformer=True, +) +scheduler = dict( + type="iddpm", + timestep_respacing="", +) + +# runtime +seed = 42 +outputs = "outputs" +wandb = False + +epochs = 1000 +log_every = 10 +ckpt_every = 1000 +load = None + +batch_size = 32 +lr = 2e-5 +grad_clip = 1.0