mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-11 05:13:31 +02:00
added configs and tests (#65)
This commit is contained in:
parent
c4c5d64e49
commit
ddbb130fe7
32
configs/dit/inference/16x256x256.py
Normal file
32
configs/dit/inference/16x256x256.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
# Inference config: DiT-XL/2, text-conditioned, 16 frames at 256x256.

# sample size
num_frames = 16
fps = 8
image_size = (256, 256)

# model config
model = dict(
    type="DiT-XL/2",
    condition="text",
    # Placeholder: replace with the path to a trained checkpoint before running.
    from_pretrained="YOUR_MODEL_PATH",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_labels.txt"
save_dir = "./samples/"
|
||||
32
configs/dit/inference/1x256x256-class.py
Normal file
32
configs/dit/inference/1x256x256-class.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
# Inference config: DiT-XL/2, class-conditioned (1000 labels), single 256x256 frame.

# sample size
num_frames = 1
fps = 1
image_size = (256, 256)

# model config
model = dict(
    type="DiT-XL/2",
    no_temporal_pos_emb=True,
    condition="label_1000",
    from_pretrained="DiT-XL-2-256x256.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
# Class-label conditioning; num_classes matches the "label_1000" condition above.
text_encoder = dict(
    type="classes",
    num_classes=1000,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/imagenet_id.txt"
save_dir = "./samples/"
|
||||
33
configs/dit/inference/1x256x256.py
Normal file
33
configs/dit/inference/1x256x256.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# Inference config: DiT-XL/2, text-conditioned, single 256x256 frame.

# sample size
num_frames = 1
fps = 1
image_size = (256, 256)

# model config
model = dict(
    type="DiT-XL/2",
    no_temporal_pos_emb=True,
    condition="text",
    # Placeholder: replace with the path to a trained checkpoint before running.
    from_pretrained="YOUR_MODEL_PATH",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/imagenet_labels.txt"
save_dir = "./samples/"
|
||||
51
configs/dit/train/16x256x256.py
Normal file
51
configs/dit/train/16x256x256.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
# Training config: DiT-XL/2 on 16-frame 256x256 clips with a CLIP text encoder.

# sample size
num_frames = 16
frame_interval = 3
image_size = (256, 256)

# dataset
root = None
# NOTE(review): machine-specific absolute path; adjust to the local dataset CSV.
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
model = dict(
    type="DiT-XL/2",
    from_pretrained="DiT-XL-2-256x256.pt",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 8
lr = 2e-5
grad_clip = 1.0
|
||||
51
configs/dit/train/1x256x256.py
Normal file
51
configs/dit/train/1x256x256.py
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
# Training config: DiT-XL/2 on single 256x256 images with a CLIP text encoder.

# sample size
num_frames = 1
frame_interval = 1
image_size = (256, 256)

# dataset
root = None
# NOTE(review): machine-specific absolute path; adjust to the local dataset CSV.
data_path = "/mnt/hdd/data/csv/imagenet_train.csv"
use_image_transform = True
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
# No from_pretrained entry is given for the model in this config.
model = dict(
    type="DiT-XL/2",
    no_temporal_pos_emb=True,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 128
lr = 1e-4  # according to DiT repo
grad_clip = 1.0
|
||||
31
configs/latte/inference/16x256x256-class.py
Normal file
31
configs/latte/inference/16x256x256-class.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
# Inference config: Latte-XL/2, class-conditioned (101 labels), 16 frames at 256x256.

# sample size
num_frames = 16
fps = 8
image_size = (256, 256)

# model config
model = dict(
    type="Latte-XL/2",
    condition="label_101",
    from_pretrained="Latte-XL-2-256x256-ucf101.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
# Class-label conditioning; num_classes matches the "label_101" condition above.
text_encoder = dict(
    type="classes",
    num_classes=101,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_id.txt"
save_dir = "./samples/"
|
||||
32
configs/latte/inference/16x256x256.py
Normal file
32
configs/latte/inference/16x256x256.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
# Inference config: Latte-XL/2, text-conditioned, 16 frames at 256x256.

# sample size
num_frames = 16
fps = 8
image_size = (256, 256)

# model config
model = dict(
    type="Latte-XL/2",
    condition="text",
    # Placeholder: replace with the path to a trained checkpoint before running.
    from_pretrained="YOUR_MODEL_PATH",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=4.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/ucf101_labels.txt"
save_dir = "./samples/"
|
||||
50
configs/latte/train/16x256x256.py
Normal file
50
configs/latte/train/16x256x256.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
# Training config: Latte-XL/2 on 16-frame 256x256 clips with a CLIP text encoder.

# sample size
num_frames = 16
frame_interval = 3
image_size = (256, 256)

# dataset
root = None
# NOTE(review): machine-specific absolute path; adjust to the local dataset CSV.
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
# No from_pretrained entry is given for the model in this config.
model = dict(
    type="Latte-XL/2",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="clip",
    from_pretrained="openai/clip-vit-base-patch32",
    model_max_length=77,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 8
lr = 2e-5
grad_clip = 1.0
|
||||
36
configs/opensora/inference/16x256x256.py
Normal file
36
configs/opensora/inference/16x256x256.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# Inference config: STDiT-XL/2 with a T5 text encoder, 16 frames at 256x256.

# sample size
num_frames = 16
fps = 24 // 3  # evaluates to 8
image_size = (256, 256)

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    # NOTE(review): points at a local training-run output; adjust to your checkpoint.
    from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt",
    # Alternative checkpoint (disabled):
    # from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    # Alternative sampler (disabled):
    # type="iddpm",
    # num_sampling_steps=250,
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/"
|
||||
36
configs/opensora/inference/16x512x512.py
Normal file
36
configs/opensora/inference/16x512x512.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# Inference config: STDiT-XL/2 with a T5 text encoder, 16 frames at 512x512.

# sample size
num_frames = 16
fps = 24 // 3  # evaluates to 8
image_size = (512, 512)

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    # NOTE(review): points at a local training-run output; adjust to your checkpoint.
    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    split=8,
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
    # Alternative sampler (disabled):
    # type="dpm-solver",
    # num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/"
|
||||
36
configs/opensora/inference/64x512x512-v2.py
Normal file
36
configs/opensora/inference/64x512x512-v2.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# Inference config: STDiT-XL/2 with a T5 text encoder, 64 frames at 512x512 (v2).

# sample size
num_frames = 64
fps = 24 // 2  # evaluates to 12
image_size = (512, 512)

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    # NOTE(review): points at a local training-run output directory (no file name);
    # adjust to your checkpoint.
    from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    split=8,
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
    # Alternative sampler (disabled):
    # type="dpm-solver",
    # num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 1
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
# NOTE(review): no trailing slash here, unlike "./samples/" in sibling configs;
# confirm the consumer joins paths rather than concatenating strings.
save_dir = "./samples"
|
||||
35
configs/opensora/inference/64x512x512.py
Normal file
35
configs/opensora/inference/64x512x512.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# Inference config: STDiT-XL/2 with a T5 text encoder, 64 frames at 512x512.

# sample size
num_frames = 64
fps = 24 // 2  # evaluates to 12
image_size = (512, 512)

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    # NOTE(review): points at a local training-run output directory (no file name);
    # adjust to your checkpoint.
    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
    # Alternative sampler (disabled):
    # type="dpm-solver",
    # num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 1
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
# NOTE(review): no trailing slash here, unlike "./samples/" in sibling configs;
# confirm the consumer joins paths rather than concatenating strings.
save_dir = "./samples"
|
||||
55
configs/opensora/train/16x256x256.py
Normal file
55
configs/opensora/train/16x256x256.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
# Training config: STDiT-XL/2 on 16-frame 256x256 clips with a T5 text encoder.

# sample size
num_frames = 16
frame_interval = 3
image_size = (256, 256)

# dataset
root = None
# NOTE(review): machine-specific absolute paths; adjust to the local dataset CSV.
# Alternative dataset (disabled):
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained="PixArt-XL-2-512x512.pth",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 8
lr = 2e-5
grad_clip = 1.0
|
||||
55
configs/opensora/train/16x512x512.py
Normal file
55
configs/opensora/train/16x512x512.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
# Training config: STDiT-XL/2 on 16-frame 512x512 clips with a T5 text encoder.

# sample size
num_frames = 16
frame_interval = 3
image_size = (512, 512)

# dataset
root = None
# NOTE(review): this path lives under one user's home directory and will not exist
# on other machines; point it at the local dataset CSV before training.
data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv"
use_image_transform = False
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    # NOTE(review): points at a local training-run output; adjust to your checkpoint.
    from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    split=4,
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 500
load = None

batch_size = 8
lr = 2e-5
grad_clip = 1.0
|
||||
55
configs/opensora/train/64x512x512-v2.py
Normal file
55
configs/opensora/train/64x512x512-v2.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
# Training config: STDiT-XL/2 on 64-frame 512x512 clips with a T5 text encoder (v2).

# sample size
num_frames = 64
frame_interval = 2
image_size = (512, 512)

# dataset
root = None
# NOTE(review): machine-specific absolute path; adjust to the local dataset CSV.
data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv"
use_image_transform = False
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    # NOTE(review): points at a local training-run output; adjust to your checkpoint.
    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    split=8,
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 250
load = None

batch_size = 4
lr = 2e-5
grad_clip = 1.0
|
||||
56
configs/opensora/train/64x512x512.py
Normal file
56
configs/opensora/train/64x512x512.py
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
# Training config: STDiT-XL/2 on 64-frame 512x512 clips with a T5 text encoder,
# using the "zero2-seq" plugin with sequence parallelism enabled on the model.

# sample size
num_frames = 64
frame_interval = 2
image_size = (512, 512)

# dataset
root = None
# NOTE(review): machine-specific absolute paths; adjust to the local dataset CSV.
# Alternative dataset (disabled):
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2-seq"
sp_size = 2

# model config
model = dict(
    type="STDiT-XL/2",
    space_scale=1.0,
    time_scale=2 / 3,
    from_pretrained="PixArt-XL-2-512x512.pth",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
    enable_sequence_parallelism=True,  # enable sequence parallelism here
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 1
lr = 2e-5
grad_clip = 1.0
|
||||
33
configs/pixart/inference/16x256x256.py
Normal file
33
configs/pixart/inference/16x256x256.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# Inference config: PixArt-XL/2 with a T5 text encoder, 16 frames at 256x256.

# sample size
num_frames = 16
fps = 8
image_size = (256, 256)

# model config
model = dict(
    type="PixArt-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    # NOTE(review): points at a local training-run output; adjust to your checkpoint.
    from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2v_samples.txt"
save_dir = "./samples/"
|
||||
35
configs/pixart/inference/1x1024MS.py
Normal file
35
configs/pixart/inference/1x1024MS.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# Inference config: PixArtMS-XL/2 with a T5 text encoder, single frame,
# multi-resolution enabled.

# sample size
num_frames = 1
fps = 1
# NOTE(review): non-square size; confirm the expected (height, width) order.
image_size = (1920, 512)
multi_resolution = True

# model config
model = dict(
    type="PixArtMS-XL/2",
    space_scale=2.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-1024-MS.pth",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./samples/"
|
||||
36
configs/pixart/inference/1x256x256.py
Normal file
36
configs/pixart/inference/1x256x256.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
# Inference config: PixArt-XL/2 with a T5 text encoder, single 256x256 frame,
# decoded with the temporal-decoder VAE variant.

# sample size
num_frames = 1
fps = 1
image_size = (256, 256)

# model config
model = dict(
    type="PixArt-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-256x256.pth",
)
vae = dict(
    # Alternative VAE (disabled):
    # type="VideoAutoencoderKL",
    # from_pretrained="stabilityai/sd-vae-ft-ema",
    type="VideoAutoencoderKLTemporalDecoder",
    from_pretrained="pretrained_models/vae_temporal_decoder",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./samples/"
|
||||
34
configs/pixart/inference/1x512x512.py
Normal file
34
configs/pixart/inference/1x512x512.py
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
# Inference config: PixArt-XL/2 with a T5 text encoder, single 512x512 frame.

# sample size
num_frames = 1
fps = 1
image_size = (512, 512)

# model config
model = dict(
    type="PixArt-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-512x512.pth",
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
)
scheduler = dict(
    type="dpm-solver",
    num_sampling_steps=20,
    cfg_scale=7.0,
)
dtype = "fp16"

# prompts
batch_size = 2
seed = 42
prompt_path = "./assets/texts/t2i_samples.txt"
save_dir = "./samples/"
|
||||
55
configs/pixart/train/16x256x256.py
Normal file
55
configs/pixart/train/16x256x256.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
# Training config: PixArt-XL/2 on 16-frame 256x256 clips with a T5 text encoder.

# sample size
num_frames = 16
frame_interval = 3
image_size = (256, 256)

# dataset
root = None
# NOTE(review): machine-specific absolute paths; adjust to the local dataset CSV.
# Alternative dataset (disabled):
# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
use_image_transform = False
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
model = dict(
    type="PixArt-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained="PixArt-XL-2-512x512.pth",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 8
lr = 2e-5
grad_clip = 1.0
|
||||
55
configs/pixart/train/1x512x512.py
Normal file
55
configs/pixart/train/1x512x512.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
# Training config: PixArt-XL/2 on single 512x512 images with a T5 text encoder.

# sample size
num_frames = 1
frame_interval = 1
image_size = (512, 512)

# dataset
root = None
# NOTE(review): machine-specific absolute path; adjust to the local dataset CSV.
data_path = "/mnt/hdd/data/csv/imagenet_train.csv"
use_image_transform = True
num_workers = 4

# acceleration
dtype = "fp16"
grad_checkpoint = True
plugin = "zero2"
sp_size = 1

# model config
model = dict(
    type="PixArt-XL/2",
    space_scale=1.0,
    time_scale=1.0,
    no_temporal_pos_emb=True,
    from_pretrained="PixArt-XL-2-512x512.pth",
    enable_flashattn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
)
text_encoder = dict(
    type="t5",
    from_pretrained="./pretrained_models/t5_ckpts",
    model_max_length=120,
    shardformer=True,
)
scheduler = dict(
    type="iddpm",
    timestep_respacing="",
)

# runtime
seed = 42
outputs = "outputs"
wandb = False

epochs = 1000
log_every = 10
ckpt_every = 1000
load = None

batch_size = 32
lr = 2e-5
grad_clip = 1.0
|
||||
Loading…
Reference in a new issue