added configs and tests (#65)

2026-04-11 05:13:31 +02:00 · 2024-03-15 21:53:49 +08:00 · 2024-03-15 21:53:49 +08:00 · ddbb130fe7
commit ddbb130fe7
parent c4c5d64e49
22 changed files with 924 additions and 0 deletions
--- a/configs/dit/inference/16x256x256.py
+++ b/configs/dit/inference/16x256x256.py
@ -0,0 +1,32 @@
+# sample size
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="DiT-XL/2",
+    condition="text",
+    from_pretrained="YOUR_MODEL_PATH",  # placeholder: set this to your own model checkpoint path
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./samples/"
--- a/configs/dit/inference/1x256x256-class.py
+++ b/configs/dit/inference/1x256x256-class.py
@ -0,0 +1,32 @@
+# sample size
+num_frames = 1
+fps = 1
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    condition="label_1000",
+    from_pretrained="DiT-XL-2-256x256.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="classes",
+    num_classes=1000,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/imagenet_id.txt"
+save_dir = "./samples/"
--- a/configs/dit/inference/1x256x256.py
+++ b/configs/dit/inference/1x256x256.py
@ -0,0 +1,33 @@
+# sample size
+num_frames = 1
+fps = 1
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    condition="text",
+    from_pretrained="YOUR_MODEL_PATH",  # placeholder: set this to your own model checkpoint path
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/imagenet_labels.txt"
+save_dir = "./samples/"
--- a/configs/dit/train/16x256x256.py
+++ b/configs/dit/train/16x256x256.py
@ -0,0 +1,51 @@
+# sample size
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# dataset
+root = None
+data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+use_image_transform = False
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="DiT-XL/2",
+    from_pretrained="DiT-XL-2-256x256.pt",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
--- a/configs/dit/train/1x256x256.py
+++ b/configs/dit/train/1x256x256.py
@ -0,0 +1,51 @@
+# sample size
+num_frames = 1
+frame_interval = 1
+image_size = (256, 256)
+
+# dataset
+root = None
+data_path = "/mnt/hdd/data/csv/imagenet_train.csv"
+use_image_transform = True
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 128
+lr = 1e-4  # value taken from the DiT repo
+grad_clip = 1.0
--- a/configs/latte/inference/16x256x256-class.py
+++ b/configs/latte/inference/16x256x256-class.py
@ -0,0 +1,31 @@
+# sample size
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="Latte-XL/2",
+    condition="label_101",
+    from_pretrained="Latte-XL-2-256x256-ucf101.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="classes",
+    num_classes=101,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_id.txt"
+save_dir = "./samples/"
--- a/configs/latte/inference/16x256x256.py
+++ b/configs/latte/inference/16x256x256.py
@ -0,0 +1,32 @@
+# sample size
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="Latte-XL/2",
+    condition="text",
+    from_pretrained="YOUR_MODEL_PATH",  # placeholder: set this to your own model checkpoint path
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./samples/"
--- a/configs/latte/train/16x256x256.py
+++ b/configs/latte/train/16x256x256.py
@ -0,0 +1,50 @@
+# sample size
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# dataset
+root = None
+data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+use_image_transform = False
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="Latte-XL/2",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
--- a/configs/opensora/inference/16x256x256.py
+++ b/configs/opensora/inference/16x256x256.py
@ -0,0 +1,36 @@
+# sample size
+num_frames = 16
+fps = 24 // 3
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt",
+    # from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    # type="iddpm",
+    # num_sampling_steps=250,
+    type="dpm-solver",  # active sampler; the commented-out iddpm / 250-step settings above are the alternative
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples/"
--- a/configs/opensora/inference/16x512x512.py
+++ b/configs/opensora/inference/16x512x512.py
@ -0,0 +1,36 @@
+# sample size
+num_frames = 16
+fps = 24 // 3
+image_size = (512, 512)
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=8,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    # type = "dpm-solver",
+    # num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples/"
--- a/configs/opensora/inference/64x512x512-v2.py
+++ b/configs/opensora/inference/64x512x512-v2.py
@ -0,0 +1,36 @@
+# sample size
+num_frames = 64
+fps = 24 // 2
+image_size = (512, 512)
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=8,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    # type = "dpm-solver",
+    # num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 1
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples"  # NOTE(review): no trailing slash, unlike "./samples/" in the 16-frame opensora inference configs; confirm both forms are handled
--- a/configs/opensora/inference/64x512x512.py
+++ b/configs/opensora/inference/64x512x512.py
@ -0,0 +1,35 @@
+# sample size
+num_frames = 64
+fps = 24 // 2
+image_size = (512, 512)
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    # type = "dpm-solver",
+    # num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 1
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples"  # NOTE(review): no trailing slash, unlike "./samples/" in the 16-frame opensora inference configs; confirm both forms are handled
--- a/configs/opensora/train/16x256x256.py
+++ b/configs/opensora/train/16x256x256.py
@ -0,0 +1,55 @@
+# sample size
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# dataset
+root = None
+# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
+data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+use_image_transform = False
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
--- a/configs/opensora/train/16x512x512.py
+++ b/configs/opensora/train/16x512x512.py
@ -0,0 +1,55 @@
+# sample size
+num_frames = 16
+frame_interval = 3
+image_size = (512, 512)
+
+# dataset
+root = None
+data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv"  # NOTE(review): user-specific absolute path; point this at your own dataset CSV
+use_image_transform = False
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
--- a/configs/opensora/train/64x512x512-v2.py
+++ b/configs/opensora/train/64x512x512-v2.py
@ -0,0 +1,55 @@
+# sample size
+num_frames = 64
+frame_interval = 2
+image_size = (512, 512)
+
+# dataset
+root = None
+data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv"
+use_image_transform = False
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=8,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 250
+load = None
+
+batch_size = 4
+lr = 2e-5
+grad_clip = 1.0
--- a/configs/opensora/train/64x512x512.py
+++ b/configs/opensora/train/64x512x512.py
@ -0,0 +1,56 @@
+# sample size
+num_frames = 64
+frame_interval = 2
+image_size = (512, 512)
+
+# dataset
+root = None
+# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
+data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+use_image_transform = False
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2-seq"
+sp_size = 2
+
+# model config
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+    enable_sequence_parallelism=True,  # sequence parallelism on; see plugin = "zero2-seq" and sp_size = 2 above
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 1
+lr = 2e-5
+grad_clip = 1.0
--- a/configs/pixart/inference/16x256x256.py
+++ b/configs/pixart/inference/16x256x256.py
@ -0,0 +1,33 @@
+# sample size
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples/"
--- a/configs/pixart/inference/1x1024MS.py
+++ b/configs/pixart/inference/1x1024MS.py
@ -0,0 +1,35 @@
+# sample size
+num_frames = 1
+fps = 1
+image_size = (1920, 512)
+multi_resolution = True
+
+# model config
+model = dict(
+    type="PixArtMS-XL/2",
+    space_scale=2.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-1024-MS.pth",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./samples/"
--- a/configs/pixart/inference/1x256x256.py
+++ b/configs/pixart/inference/1x256x256.py
@ -0,0 +1,36 @@
+# sample size
+num_frames = 1
+fps = 1
+image_size = (256, 256)
+
+# model config
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-256x256.pth",
+)
+vae = dict(
+    # type="VideoAutoencoderKL",
+    # from_pretrained="stabilityai/sd-vae-ft-ema",
+    type="VideoAutoencoderKLTemporalDecoder",  # used here instead of the commented-out VideoAutoencoderKL settings above
+    from_pretrained="pretrained_models/vae_temporal_decoder",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./samples/"
--- a/configs/pixart/inference/1x512x512.py
+++ b/configs/pixart/inference/1x512x512.py
@ -0,0 +1,34 @@
+# sample size
+num_frames = 1
+fps = 1
+image_size = (512, 512)
+
+# model config
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# prompts
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./samples/"
--- a/configs/pixart/train/16x256x256.py
+++ b/configs/pixart/train/16x256x256.py
@ -0,0 +1,55 @@
+# sample size
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# dataset
+root = None
+# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
+data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+use_image_transform = False
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
--- a/configs/pixart/train/1x512x512.py
+++ b/configs/pixart/train/1x512x512.py
@ -0,0 +1,55 @@
+# sample size
+num_frames = 1
+frame_interval = 1
+image_size = (512, 512)
+
+# dataset
+root = None
+data_path = "/mnt/hdd/data/csv/imagenet_train.csv"
+use_image_transform = True
+num_workers = 4
+
+# acceleration
+dtype = "fp16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# model config
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="./pretrained_models/t5_ckpts",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# runtime
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 32
+lr = 2e-5
+grad_clip = 1.0