added configs and tests (#65)

This commit is contained in:
Frank Lee 2024-03-15 21:53:49 +08:00 committed by GitHub
parent c4c5d64e49
commit ddbb130fe7
22 changed files with 924 additions and 0 deletions

View file

@ -0,0 +1,32 @@
# Flat sampling config (plain module-level assignments; presumably consumed by an
# inference script through a Config loader — TODO confirm against the sample script).
# sample size
num_frames = 16  # frames per generated clip
fps = 8  # frame rate of the saved video
image_size = (256, 256)
# model config
model = dict(
    type="DiT-XL/2",
    condition="text",  # text-conditioned generation (see text_encoder below)
    from_pretrained="YOUR_MODEL_PATH",  # placeholder — must be replaced with a real checkpoint
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_labels.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,32 @@
# Flat sampling config for single-frame (image) class-conditional generation.
# sample size
num_frames = 1  # single frame: image generation
fps = 1
image_size = (256, 256)
# model config
model = dict(
    type="DiT-XL/2",
    no_temporal_pos_emb=True,  # temporal positional embedding disabled for image mode
    condition="label_1000",  # conditioned on one of 1000 class labels
    from_pretrained="DiT-XL-2-256x256.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="classes",  # class-id "encoder" instead of a text model
    num_classes=1000,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/imagenet_id.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,33 @@
# Flat sampling config for single-frame (image) text-conditional generation.
# sample size
num_frames = 1  # single frame: image generation
fps = 1
image_size = (256, 256)
# model config
model = dict(
    type="DiT-XL/2",
    no_temporal_pos_emb=True,  # temporal positional embedding disabled for image mode
    condition="text",
    from_pretrained="YOUR_MODEL_PATH",  # placeholder — must be replaced with a real checkpoint
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/imagenet_labels.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,51 @@
# Flat training config (plain module-level assignments; presumably consumed by the
# train script through a Config loader — TODO confirm).
# sample size
num_frames = 16
frame_interval = 3  # stride when sampling frames from source videos
image_size = (256, 256)
# dataset
root = None  # optional prefix for paths in the CSV — TODO confirm against the dataset class
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False  # video transforms, not image transforms
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True  # NOTE(review): presumably activation checkpointing — confirm
plugin = "zero2"  # NOTE(review): looks like a ZeRO stage-2 booster plugin name — confirm
sp_size = 1  # sequence-parallel group size (1 = disabled)
# model config
model = dict(
    type="DiT-XL/2",
    from_pretrained="DiT-XL-2-256x256.pt",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",  # empty string: no respacing
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000  # checkpoint interval, in steps — TODO confirm unit
load = None  # path to resume from, if any
batch_size = 8
lr = 2e-5
grad_clip = 1.0

View file

@ -0,0 +1,51 @@
# Flat training config for single-frame (image) training on ImageNet-style data.
# sample size
num_frames = 1  # single frame: image training
frame_interval = 1
image_size = (256, 256)
# dataset
root = None  # optional prefix for paths in the CSV — TODO confirm against the dataset class
data_path = "/mnt/hdd/data/csv/imagenet_train.csv"
use_image_transform = True  # image transforms for single-frame data
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True  # NOTE(review): presumably activation checkpointing — confirm
plugin = "zero2"  # NOTE(review): looks like a ZeRO stage-2 booster plugin name — confirm
sp_size = 1  # sequence-parallel group size (1 = disabled)
# model config
model = dict(
    type="DiT-XL/2",
    no_temporal_pos_emb=True,  # temporal positional embedding disabled for image mode
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",  # empty string: no respacing
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None  # path to resume from, if any
batch_size = 128
lr = 1e-4  # according to DiT repo
grad_clip = 1.0

View file

@ -0,0 +1,31 @@
# Flat sampling config for class-conditional video generation (101-class labels).
# sample size
num_frames = 16
fps = 8
image_size = (256, 256)
# model config
model = dict(
    type="Latte-XL/2",
    condition="label_101",  # conditioned on one of 101 class labels
    from_pretrained="Latte-XL-2-256x256-ucf101.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="classes",  # class-id "encoder" instead of a text model
    num_classes=101,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_id.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,32 @@
# Flat sampling config for text-conditional video generation.
# sample size
num_frames = 16
fps = 8
image_size = (256, 256)
# model config
model = dict(
    type="Latte-XL/2",
    condition="text",
    from_pretrained="YOUR_MODEL_PATH",  # placeholder — must be replaced with a real checkpoint
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_labels.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,50 @@
# Flat training config (plain module-level assignments; presumably consumed by the
# train script through a Config loader — TODO confirm).
# sample size
num_frames = 16
frame_interval = 3  # stride when sampling frames from source videos
image_size = (256, 256)
# dataset
root = None  # optional prefix for paths in the CSV — TODO confirm against the dataset class
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False  # video transforms, not image transforms
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True  # NOTE(review): presumably activation checkpointing — confirm
plugin = "zero2"  # NOTE(review): looks like a ZeRO stage-2 booster plugin name — confirm
sp_size = 1  # sequence-parallel group size (1 = disabled)
# model config
model = dict(
    type="Latte-XL/2",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",  # empty string: no respacing
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None  # path to resume from, if any
batch_size = 8
lr = 2e-5
grad_clip = 1.0

View file

@ -0,0 +1,36 @@
# Flat sampling config for 16-frame 256x256 text-to-video generation.
# Fix: `type = "dpm-solver"` used spaces around `=` inside a dict() call,
# violating PEP 8 keyword-argument style and diverging from every sibling config.
# sample size
num_frames = 16
fps = 24 // 3  # = 8; written as 24 // 3, presumably 24 fps source with frame interval 3
image_size = (256, 256)
# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt",
    # from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    # type="iddpm",
    # num_sampling_steps=250,
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,36 @@
# sample size
num_frames = 16
fps = 24 // 3
image_size = (512, 512)
# model config
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=1.0,
from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
split=8,
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
)
scheduler = dict(
type="iddpm",
num_sampling_steps=100,
# type = "dpm-solver",
# num_sampling_steps=20,
cfg_scale=7.0,
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,36 @@
# sample size
num_frames = 64
fps = 24 // 2
image_size = (512, 512)
# model config
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=2 / 3,
from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/",
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
split=8,
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
)
scheduler = dict(
type="iddpm",
num_sampling_steps=100,
# type = "dpm-solver",
# num_sampling_steps=20,
cfg_scale=7.0,
)
dtype = "fp16"
# prompts
batch_size = 1
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples"

View file

@ -0,0 +1,35 @@
# Flat sampling config for 64-frame 512x512 generation from an earlier checkpoint.
# sample size
num_frames = 64
fps = 24 // 2  # = 12; written as 24 // 2, presumably 24 fps source with frame interval 2
image_size = (512, 512)
# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,  # temporal scale adjusted for the denser 64-frame setting
    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
    # type = "dpm-solver",
    # num_sampling_steps=20,
    cfg_scale=7.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 1  # larger clips: smaller batch
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples"

View file

@ -0,0 +1,55 @@
# Flat training config for 16-frame 256x256 training, initialized from a PixArt
# image checkpoint (presumably read by the train script via a Config loader — TODO confirm).
# sample size
num_frames = 16
frame_interval = 3  # stride when sampling frames from source videos
image_size = (256, 256)
# dataset
root = None  # optional prefix for paths in the CSV — TODO confirm against the dataset class
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False  # video transforms, not image transforms
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True  # NOTE(review): presumably activation checkpointing — confirm
plugin = "zero2"  # NOTE(review): looks like a ZeRO stage-2 booster plugin name — confirm
sp_size = 1  # sequence-parallel group size (1 = disabled)
# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained="PixArt-XL-2-512x512.pth",  # initialize spatial weights from PixArt
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,  # NOTE(review): presumably shards the T5 across devices — confirm
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",  # empty string: no respacing
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None  # path to resume from, if any
batch_size = 8
lr = 2e-5
grad_clip = 1.0

View file

@ -0,0 +1,55 @@
# sample size
num_frames = 16
frame_interval = 3
image_size = (512, 512)
# dataset
root = None
data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv"
use_image_transform = False
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1
# model config
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=1.0,
from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
enable_flashattn=True,
enable_layernorm_kernel=True,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
split=4,
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
shardformer=True,
)
scheduler = dict(
type="iddpm",
timestep_respacing="",
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 500
load = None
batch_size = 8
lr = 2e-5
grad_clip = 1.0

View file

@ -0,0 +1,55 @@
# sample size
num_frames = 64
frame_interval = 2
image_size = (512, 512)
# dataset
root = None
data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv"
use_image_transform = False
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1
# model config
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=2 / 3,
from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
enable_flashattn=True,
enable_layernorm_kernel=True,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
split=8,
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
shardformer=True,
)
scheduler = dict(
type="iddpm",
timestep_respacing="",
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 250
load = None
batch_size = 4
lr = 2e-5
grad_clip = 1.0

View file

@ -0,0 +1,56 @@
# sample size
num_frames = 64
frame_interval = 2
image_size = (512, 512)
# dataset
root = None
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2-seq"
sp_size = 2
# model config
model = dict(
type="STDiT-XL/2",
space_scale=1.0,
time_scale=2 / 3,
from_pretrained="PixArt-XL-2-512x512.pth",
enable_flashattn=True,
enable_layernorm_kernel=True,
enable_sequence_parallelism=True, # enable sq here
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
shardformer=True,
)
scheduler = dict(
type="iddpm",
timestep_respacing="",
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None
batch_size = 1
lr = 2e-5
grad_clip = 1.0

View file

@ -0,0 +1,33 @@
# Flat sampling config for 16-frame 256x256 text-to-video generation.
# sample size
num_frames = 16
fps = 8
image_size = (256, 256)
# model config
model = dict(
    type="PixArt-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,  # classifier-free guidance scale
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,35 @@
# sample size
num_frames = 1
fps = 1
image_size = (1920, 512)
multi_resolution = True
# model config
model = dict(
type="PixArtMS-XL/2",
space_scale=2.0,
time_scale=1.0,
no_temporal_pos_emb=True,
from_pretrained="PixArt-XL-2-1024-MS.pth",
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
)
scheduler = dict(
type = "dpm-solver",
num_sampling_steps=20,
cfg_scale=7.0,
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,36 @@
# sample size
num_frames = 1
fps = 1
image_size = (256, 256)
# model config
model = dict(
type="PixArt-XL/2",
space_scale=1.0,
time_scale=1.0,
no_temporal_pos_emb=True,
from_pretrained="PixArt-XL-2-256x256.pth",
)
vae = dict(
# type="VideoAutoencoderKL",
# from_pretrained="stabilityai/sd-vae-ft-ema",
type="VideoAutoencoderKLTemporalDecoder",
from_pretrained="pretrained_models/vae_temporal_decoder"
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
)
scheduler = dict(
type="dpm-solver",
num_sampling_steps=20,
cfg_scale=7.0,
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,34 @@
# sample size
num_frames = 1
fps = 1
image_size = (512, 512)
# model config
model = dict(
type="PixArt-XL/2",
space_scale=1.0,
time_scale=1.0,
no_temporal_pos_emb=True,
from_pretrained="PixArt-XL-2-512x512.pth",
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
)
scheduler = dict(
type = "dpm-solver",
num_sampling_steps=20,
cfg_scale=7.0,
)
dtype = "fp16"
# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./samples/"

View file

@ -0,0 +1,55 @@
# sample size
num_frames = 16
frame_interval = 3
image_size = (256, 256)
# dataset
root = None
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1
# model config
model = dict(
type="PixArt-XL/2",
space_scale=0.5,
time_scale=1.0,
from_pretrained="PixArt-XL-2-512x512.pth",
enable_flashattn=True,
enable_layernorm_kernel=True,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
shardformer=True,
)
scheduler = dict(
type="iddpm",
timestep_respacing="",
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None
batch_size = 8
lr = 2e-5
grad_clip = 1.0

View file

@ -0,0 +1,55 @@
# sample size
num_frames = 1
frame_interval = 1
image_size = (512, 512)
# dataset
root = None
data_path = "/mnt/hdd/data/csv/imagenet_train.csv"
use_image_transform = True
num_workers = 4
# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1
# model config
model = dict(
type="PixArt-XL/2",
space_scale=1.0,
time_scale=1.0,
no_temporal_pos_emb=True,
from_pretrained="PixArt-XL-2-512x512.pth",
enable_flashattn=True,
enable_layernorm_kernel=True,
)
vae = dict(
type="VideoAutoencoderKL",
from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
type="t5",
from_pretrained="./pretrained_models/t5_ckpts",
model_max_length=120,
shardformer=True,
)
scheduler = dict(
type="iddpm",
timestep_respacing="",
)
# runtime
seed = 42
outputs = "outputs"
wandb = False
epochs = 1000
log_every = 10
ckpt_every = 1000
load = None
batch_size = 32
lr = 2e-5
grad_clip = 1.0