From c347f79bdebdb1db556e6ac2d5ad11b60fc0aa58 Mon Sep 17 00:00:00 2001 From: Shen-Chenhui Date: Sat, 27 Apr 2024 15:14:01 +0800 Subject: [PATCH] new config --- .../17x128x128_pixabay_8_GPU_pipeline.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 configs/vae_magvit_v2/train/17x128x128_pixabay_8_GPU_pipeline.py diff --git a/configs/vae_magvit_v2/train/17x128x128_pixabay_8_GPU_pipeline.py b/configs/vae_magvit_v2/train/17x128x128_pixabay_8_GPU_pipeline.py new file mode 100644 index 0000000..4a68bd7 --- /dev/null +++ b/configs/vae_magvit_v2/train/17x128x128_pixabay_8_GPU_pipeline.py @@ -0,0 +1,93 @@ +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=17, + frame_interval=3, + image_size=(128, 128), + get_text=False, +) + +# Define acceleration +num_workers = 4 +dtype = "bf16" +grad_checkpoint = True +plugin = "zero2" +sp_size = 1 + +use_pipeline = True +video_contains_first_frame = True + +# Define model +vae_2d = dict( + type="VideoAutoencoderKL", + from_pretrained="stabilityai/sd-vae-ft-ema", + # SDXL +) +model = dict( + type="VAE_MAGVIT_V2", + in_out_channels=4, + latent_embed_dim=64, + filters=128, + num_res_blocks=4, + channel_multipliers=(1, 2, 2, 4), + temporal_downsample=(False, True, True), + num_groups=32, # for nn.GroupNorm + kl_embed_dim=4, + activation_fn="swish", + separate_first_frame_encoding=False, + disable_space=True, + encoder_double_z=True, + custom_conv_padding=None, +) + + +discriminator = dict( + type="DISCRIMINATOR_3D", + image_size=(16, 16), # NOTE: here image size is different + num_frames=16, + in_channels=4, + filters=128, + use_pretrained=True, # NOTE: set to False only if we want to disable load + # channel_multipliers = (2,4,4,4,4), # (2,4,4,4) for 64x64 resolution + channel_multipliers=(2, 4, 4), # since we operate on intermediate-layer dimensions of z +) + + +# loss weights +logvar_init = 0.0 +kl_loss_weight = 0.000001 +perceptual_loss_weight = 0.1 # used only when vgg is not None and the weight is greater than 0 
+
discriminator_factor = 1.0 # for discriminator adversarial loss +generator_factor = 0.1 # SCH: generator adversarial loss, MAGVIT v2 uses 0.1 +lecam_loss_weight = None # NOTE: MAGVIT v2 uses 0.001 +discriminator_loss_type = "non-saturating" +generator_loss_type = "non-saturating" +# discriminator_loss_type="hinge" +# generator_loss_type="hinge" +discriminator_start = 10000 # 8k data / (8*1) = 1000 steps per epoch +gradient_penalty_loss_weight = None # 10 # SCH: MAGVIT uses 10, opensora plan doesn't use +ema_decay = 0.999 # ema decay factor for generator + + +# Others +seed = 42 +outputs = "outputs" +wandb = False + +# Training +""" NOTE: +magvit uses about # samples (K) * epochs ~ 2-5 K, num_frames = 4, reso = 128 +==> ours num_frames = 16, reso = 256, so samples (K) * epochs ~ [500 - 1200], +3-6 epochs for pexel; from pexel observation it's correct +""" + +epochs = 20 +log_every = 1 + +ckpt_every = 1000 +load = None + +batch_size = 1 + +lr = 1e-4 +grad_clip = 1.0