From d2e710c97776330f4928283787ffe049fedc3ee8 Mon Sep 17 00:00:00 2001
From: Shen-Chenhui <shen_chenhui@u.nus.edu>
Date: Mon, 17 Jun 2024 09:13:12 +0000
Subject: [PATCH] format

---
 README.md                                     |  32 ++++-
 configs/vae/train/{image.py => stage1.py}     |   9 +-
 .../vae/train/{video_disc.py => stage2.py}    |  24 +---
 configs/vae/train/{video.py => stage3.py}     |   4 +-
 docs/installation.md                          | 135 ++++++++++--------
 docs/vae.md                                   |  63 ++++++++
 eval/vae/cal_flolpips.py                      |   6 +-
 opensora/models/vae/README.md                 |  79 ----------
 opensora/models/vae/lpips.py                  |   3 +-
 opensora/models/vae/utils.py                  |  19 +--
 requirements/requirements-vae.txt             |   5 +
 setup.py                                      |   1 +
 12 files changed, 193 insertions(+), 187 deletions(-)
 rename configs/vae/train/{image.py => stage1.py} (82%)
 rename configs/vae/train/{video_disc.py => stage2.py} (67%)
 rename configs/vae/train/{video.py => stage3.py} (91%)
 create mode 100644 docs/vae.md
 delete mode 100644 opensora/models/vae/README.md
 create mode 100644 requirements/requirements-vae.txt

diff --git a/README.md b/README.md
index c010e2c..32a61d7 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,7 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts.
 * [Gradio Demo](#gradio-demo)
 * [Inference](#inference)
 * [Data Processing](#data-processing)
+* [VAE](#vae)
 * [Training](#training)
 * [Evaluation](#evaluation)
 * [Contribution](#contribution)
@@ -157,7 +158,7 @@ Other useful documents and links are listed below.
 
 ### Install from Source
 
-For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation](docs/installation.md) for more instructions on different cuda version, and additional dependency for data preprocessing.
+For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation](docs/installation.md) for more instructions on different CUDA versions and additional dependencies for data preprocessing, VAE, and model evaluation.
 
 ```bash
 # create a virtual env and activate (conda as an example)
@@ -372,6 +373,35 @@ Also check out the [datasets](docs/datasets.md) we use.
 
 ![Data Processing Pipeline](assets/readme/report_data_pipeline.png)
 
+## VAE
+We train a VAE pipeline that consists of a spatial VAE followed by a temporal VAE.
+For more details, refer to our [VAE documentation](docs/vae.md).
+Before you run the following commands, follow our [Installation Documentation](docs/installation.md) to install the required dependencies for VAE and Evaluation.
+
+Once you prepare the data in a `csv` file, run the following commands to train the VAE.
+Note that you need to adjust the number of training epochs (`epochs`) in the config file according to the size of your own csv dataset.
+
+```bash
+# stage 1 training, 380k steps, 8 GPUs
+torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
+# stage 2 training, 260k steps, 8 GPUs
+torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
+# stage 3 training, 540k steps, 24 GPUs
+torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
+```
+To evaluate the VAE performance, you need to run VAE inference first to generate the videos, then calculate scores on the generated videos:
+
+```bash
+# video generation
+torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
+# the original videos will be saved to `YOUR_VIDEO_DIR_ori`
+# the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec`
+# the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial`
+
+# score calculation
+python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
+```
+
 ## Training
 
 ### Open-Sora 1.2 Training
diff --git a/configs/vae/train/image.py b/configs/vae/train/stage1.py
similarity index 82%
rename from configs/vae/train/image.py
rename to configs/vae/train/stage1.py
index 46621dc..a6899ec 100644
--- a/configs/vae/train/image.py
+++ b/configs/vae/train/stage1.py
@@ -1,4 +1,4 @@
-num_frames = 1
+num_frames = 17
 image_size = (256, 256)
 
 # Define dataset
@@ -35,10 +35,11 @@ model = dict(
 )
 
 # loss weights
-perceptual_loss_weight = 0.0  # use vgg is not None and more than 0
+perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
 kl_loss_weight = 1e-6
 
-mixed_image_ratio = 0.1
+mixed_strategy = "mixed_video_image"
+mixed_image_ratio = 0.2
 use_real_rec_loss = False
 use_z_rec_loss = True
 use_image_identity_loss = True
@@ -48,7 +49,7 @@ seed = 42
 outputs = "outputs"
 wandb = False
 
-epochs = 100
+epochs = 100  # NOTE: adjust accordingly w.r.t dataset size
 log_every = 1
 ckpt_every = 1000
 load = None
diff --git a/configs/vae/train/video_disc.py b/configs/vae/train/stage2.py
similarity index 67%
rename from configs/vae/train/video_disc.py
rename to configs/vae/train/stage2.py
index 7af989d..80748d4 100644
--- a/configs/vae/train/video_disc.py
+++ b/configs/vae/train/stage2.py
@@ -34,30 +34,14 @@ model = dict(
     ),
 )
 
-discriminator = dict(
-    type="NLayerDiscriminator",
-    from_pretrained="/home/shenchenhui/opensoraplan-v1.0.0-discriminator.pt",
-    input_nc=3,
-    n_layers=3,
-    use_actnorm=False,
-)
-
-# discriminator hyper-parames TODO
-discriminator_factor = 1
-discriminator_start = -1
-generator_factor = 0.5
-generator_loss_type = "hinge"
-discriminator_loss_type = "hinge"
-lecam_loss_weight = None
-gradient_penalty_loss_weight = None
-
 # loss weights
 perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
 kl_loss_weight = 1e-6
 
+mixed_strategy = "mixed_video_image"
 mixed_image_ratio = 0.2
-use_real_rec_loss = True
-use_z_rec_loss = False
+use_real_rec_loss = False
+use_z_rec_loss = True
 use_image_identity_loss = False
 
 # Others
@@ -65,7 +49,7 @@ seed = 42
 outputs = "outputs"
 wandb = False
 
-epochs = 100
+epochs = 100  # NOTE: adjust accordingly w.r.t dataset size
 log_every = 1
 ckpt_every = 1000
 load = None
diff --git a/configs/vae/train/video.py b/configs/vae/train/stage3.py
similarity index 91%
rename from configs/vae/train/video.py
rename to configs/vae/train/stage3.py
index 8dd96c2..2b6bc12 100644
--- a/configs/vae/train/video.py
+++ b/configs/vae/train/stage3.py
@@ -38,7 +38,7 @@ model = dict(
 perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
 kl_loss_weight = 1e-6
 
-mixed_image_ratio = 0.2
+mixed_strategy = "mixed_video_random"
 use_real_rec_loss = True
 use_z_rec_loss = False
 use_image_identity_loss = False
@@ -48,7 +48,7 @@ seed = 42
 outputs = "outputs"
 wandb = False
 
-epochs = 100
+epochs = 100  # NOTE: adjust accordingly w.r.t dataset size
 log_every = 1
 ckpt_every = 1000
 load = None
diff --git a/docs/installation.md b/docs/installation.md
index bf7c711..5e6cd29 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -7,7 +7,7 @@ Note that besides these packages, some packages needs to be mannually installed,
 
 You need to install `opensora` for training and inference. You can follow the steps below for installation. We also provide guideline for different CUDA versions for compatiblity.
 
-Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Proessing](#data-processing) and [Evaluation](#evaluation) respectively.
+Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Processing](#data-processing), [Evaluation](#evaluation), and [VAE](#vae) respectively.
 
 ### Step 1: Install PyTorch and xformers
 
@@ -58,63 +58,6 @@ pip install flash-attn --no-build-isolation
 pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
 ```
 
-## Evaluation
-
-### Step 1: Install Requirements
-
-To conduct evaluation, run the following command to install requirements:
-
-```bash
-pip install -v .[eval]
-# For development:`pip install -v -e .[eval]`
-```
-
-### Step 2: Install VBench
-
-<!-- You need to manually install [VBench](https://github.com/Vchitect/VBench):
-
-```bash
-pip install --no-deps vbench==0.1.1
-# If the installation shows a warning about the intalled vbench not in PATH, you need to add it by:
-export PATH="/path/to/vbench:$PATH"
-``` -->
-
-You need to install VBench mannually by:
-```bash
-# first clone their repo
-cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct
-git clone https://github.com/Vchitect/VBench.git
-cd VBench
-git checkout v0.1.2
-
-# next, fix their hard-coded path isse
-vim vbench2_beta_i2v/utils.py
-# find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder
-
-# last, create softlinks
-cd ../Open-Sora # or `cd ../Open-Sora-dev` for development
-ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path
-ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench_beta_i2v to your corresponding path
-# later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found
-```
-
-
-### Step 3: Install `cupy` for Potential VAE Errors
-
-You need to mannually install [cupy](https://docs.cupy.dev/en/stable/install.html).
-
-- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x`
-- For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x`
-
-Note that for VAE evaluation, you may run into error with `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`, in this case, you need to go to the corresponding file (`.../pytorchvideo/transforms/augmentations.py`) reporting this error, then change as following:
-
-```python
-# find the original line:
-import torchvision.transforms.functional_tensor as F_t
-# change to:
-import torchvision.transforms._functional_tensor as F_t
-```
-
 ## Data Processing
 
 ### Step 1: Install Requirements
@@ -193,3 +136,79 @@ and change the assert of `mmcv_version < digit_version(mmcv_maximum_version)` to
 
 If you are unsure of your path to the mmdet init file, simply run our [OCR command](../tools/scoring/README.md), wait for the mmdeet assertion error on mmcv versions.
 The error will contain the exact path to the mmdet init file.
+
+
+## Evaluation
+
+### Step 1: Install Requirements
+
+To conduct evaluation, run the following command to install requirements:
+
+```bash
+pip install -v .[eval]
+# For development:`pip install -v -e .[eval]`
+```
+
+### Step 2: Install VBench
+
+<!-- You need to manually install [VBench](https://github.com/Vchitect/VBench):
+
+```bash
+pip install --no-deps vbench==0.1.1
+# If the installation shows a warning about the installed vbench not in PATH, you need to add it by:
+export PATH="/path/to/vbench:$PATH"
+``` -->
+
+You need to install VBench manually by:
+```bash
+# first clone their repo
+cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct
+git clone https://github.com/Vchitect/VBench.git
+cd VBench
+git checkout v0.1.2
+
+# next, fix their hard-coded path issue
+vim vbench2_beta_i2v/utils.py
+# find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder
+
+# last, create softlinks
+cd ../Open-Sora # or `cd ../Open-Sora-dev` for development
+ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path
+ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench2_beta_i2v to your corresponding path
+# later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found
+```
+
+
+### Step 3: Install `cupy` for Potential VAE Errors
+
+You need to manually install [cupy](https://docs.cupy.dev/en/stable/install.html).
+
+- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x`
+- For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x`
+
+Note that for VAE evaluation, you may run into the error `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`. In this case, go to the file reporting this error (`.../pytorchvideo/transforms/augmentations.py`) and change it as follows:
+
+```python
+# find the original line:
+import torchvision.transforms.functional_tensor as F_t
+# change to:
+import torchvision.transforms._functional_tensor as F_t
+```
+
+
+
+
+## VAE
+
+### Step 1: Install Requirements
+
+To train and evaluate your own VAE, run the following command to install requirements:
+
+```bash
+pip install -v .[vae]
+# For development:`pip install -v -e .[vae]`
+```
+
+### Step 2: VAE Evaluation (`cupy` and Potential VAE Errors)
+
+Refer to [Evaluation's VAE section](#step-3-install-cupy-for-potential-vae-errors).
diff --git a/docs/vae.md b/docs/vae.md
new file mode 100644
index 0000000..114a7e5
--- /dev/null
+++ b/docs/vae.md
@@ -0,0 +1,63 @@
+# VAE Report
+
+As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we develop a temporal VAE for the diffusion model to adapt to.
+Specifically, our VAE consists of a pipeline of a [spatial VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers) followed by a temporal VAE.
+For the temporal VAE, we follow the implementation of [MAGVIT-v2](https://arxiv.org/abs/2310.05737), with the following modifications:
+* We remove the architecture specific to the codebook.
+* We do not use the discriminator, and use the VAE reconstruction loss, kl loss, and perceptual loss for training.
+* In the last linear layer of the encoder, we scale down to a diagonal Gaussian Distribution of 4 channels, following our previously trained STDiT that takes in 4 channels input.
+* Our decoder is symmetric to the encoder architecture.
+
+## Training
+
+We train the model in different stages.
+
+We first train the temporal VAE only by freezing the spatial VAE for 380k steps on a single machine (8 GPUs).
+We use an additional identity loss to make features from the 3D VAE similar to the features from the 2D VAE.
+We train the VAE using 20% images and 80% videos with 17 frames.
+```bash
+torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
+```
+Next, we remove the identity loss and train the 3D VAE pipeline to reconstruct the 2D-compressed videos for 260k steps.
+```bash
+torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
+```
+Finally, we remove the reconstruction loss for the 2D-compressed videos and train the VAE pipeline to reconstruct the 3D videos for 540k steps.
+We train our VAE on clips with a random number of frames (up to 34) to make it more robust to different video lengths.
+This stage is trained on 24 GPUs.
+```bash
+torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
+```
+
+Note that you need to adjust the `epochs` in the config file accordingly with respect to your own csv data size.
+
+
+## Inference
+
+To visually check the performance of the VAE, you may run the following inference.
+It saves the original video to your specified video directory with the `_ori` suffix (i.e. `YOUR_VIDEO_DIR_ori`), the reconstructed video from the full pipeline with the `_rec` suffix (i.e. `YOUR_VIDEO_DIR_rec`), and the reconstructed video from the 2D compression and decompression with the `_spatial` suffix (i.e. `YOUR_VIDEO_DIR_spatial`).
+```bash
+torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
+```
+
+## Evaluation
+We can then evaluate the VAE's performance on the SSIM, PSNR, LPIPS, and FloLPIPS metrics.
+
+* SSIM: structural similarity index measure, the higher the better
+* PSNR: peak signal-to-noise ratio, the higher the better
+* LPIPS: learned perceptual image patch similarity, the lower the better
+* [FloLPIPS](https://arxiv.org/pdf/2207.08119): LPIPS with video interpolation, the lower the better.
+
+```bash
+python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
+```
+
+
+## Acknowledgement
+We are grateful for the following work:
+* [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation
+* [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis
+* [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc)
+* [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)
+
+Special thanks go to the authors of [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) for their valuable advice and help.
diff --git a/eval/vae/cal_flolpips.py b/eval/vae/cal_flolpips.py
index 7d8f08c..a8824da 100644
--- a/eval/vae/cal_flolpips.py
+++ b/eval/vae/cal_flolpips.py
@@ -6,10 +6,10 @@ from tqdm import tqdm
 
 sys.path.append(".")
 
-# ERROR: cannot locate the model file
-from flolpips.pwcnet import Network as PWCNet
 from flolpips.flolpips import FloLPIPS
-loss_fn = FloLPIPS(net='alex', version='0.1').eval().requires_grad_(False)
+from flolpips.pwcnet import Network as PWCNet
+
+loss_fn = FloLPIPS(net="alex", version="0.1").eval().requires_grad_(False)
 flownet = PWCNet().eval().requires_grad_(False)
 
 
diff --git a/opensora/models/vae/README.md b/opensora/models/vae/README.md
deleted file mode 100644
index 0a6e6c6..0000000
--- a/opensora/models/vae/README.md
+++ /dev/null
@@ -1,79 +0,0 @@
-## Commands 
-
-## 0. References
-
-* https://github.com/google-research/magvit
-* https://github.com/CompVis/taming-transformers
-* https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc
-* https://github.com/PKU-YuanGroup/Open-Sora-Plan
-
-
-## 1. VAE 3D
-### 1.1 Train
-
-```yaml
-# train on pexel dataset
-WANDB_API_KEY=<wandb_api_key> CUDA_VISIBLE_DEVICES=<n> torchrun --master_port=<port_num> --nnodes=1 --nproc_per_node=1 scripts/train-vae.py configs/vae_3d/train/16x256x256.py --data-path /home/shenchenhui/data/pexels/train.csv --wandb True
-```
-
-### 1.2 Inference 
-
-```yaml
-CUDA_VISIBLE_DEVICES=6 torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference-vae.py configs/vae_3d/inference/16x256x256.py --ckpt-path /home/shenchenhui/Open-Sora-dev/outputs/train_pexel_028/epoch3-global_step20000/ --data-path /home/shenchenhui/data/pexels/debug.csv --save-dir outputs/pexel
-
-
-# resume training debug
-CUDA_VISIBLE_DEVICES=5 torchrun --master_port=29530 --nnodes=1 --nproc_per_node=1 scripts/train-vae.py configs/vae_3d/train/16x256x256.py --data-path /home/shenchenhui/data/pexels/debug.csv  --load /home/shenchenhui/Open-Sora-dev/outputs/006-F16S3-VAE_3D_B/epoch49-global_step50
-```
-
-version 2 pipeline
-```yaml
-# NOTE: first VAE is pretrained 2D, 16x128x128 --> 16x16x16
-# then we train our own temporal VAE, 16x16x16 --> 4x16x16
-# we use a 3 layer discriminator on the intermediate of 16x16x16
-WANDB_API_KEY=<wandb_api_key> CUDA_VISIBLE_DEVICES=7 torchrun --master_port=29580 --nnodes=1 --nproc_per_node=1 scripts/train-vae-v2.py configs/vae_magvit_v2/train/pipeline_16x128x128.py --data-path /home/shenchenhui/data/trial_data/train_short.csv --wandb True
-```
-
-
-## 2. MAGVIT-v2
-
-### 2.1 dependencies
-```
-'accelerate>=0.24.0',
-'beartype',
-'einops>=0.7.0',
-'ema-pytorch>=0.2.4',
-'pytorch-warmup',
-'gateloop-transformer>=0.2.2',
-'kornia',
-'opencv-python',
-'pillow',
-'pytorch-custom-utils>=0.0.9',
-'numpy',
-'vector-quantize-pytorch>=1.11.8',
-'taylor-series-linear-attention>=0.1.5',
-'torch',
-'torchvision',
-'x-transformers'
-```
-
-Note: 
-uses `hotfix/zero` branch of `https://github.com/ver217/ColossalAI.git`.
-clone the repo, go to the branch, then do `pip install .` 
-
-
-### 2.2 Train
-
-```yaml
-CUDA_VISIBLE_DEVICES7 torchrun --master_port=29510 --nnodes=1 --nproc_per_node=1 scripts/train-vae-v2.py configs/vae_magvit_v2/train/17x128x128.py --data-path /home/shenchenhui/data/pexels/train.csv
-```
-
-### 2.3 Inference
-
-
-### 2.4 Data
-
-full data combining the follwing: `/home/shenchenhui/data/pixabay+pexels.csv`
-
-* ~/data/pixabay: `/home/data/sora_data/pixabay/raw/data/split-0`
-* pexels: `/home/litianyi/data/pexels/processed/meta/pexels_caption_vinfo_ready_noempty_clean.csv`
\ No newline at end of file
diff --git a/opensora/models/vae/lpips.py b/opensora/models/vae/lpips.py
index 358cfe5..e643cba 100644
--- a/opensora/models/vae/lpips.py
+++ b/opensora/models/vae/lpips.py
@@ -50,7 +50,7 @@ class LPIPS(nn.Module):
         super().__init__()
         self.scaling_layer = ScalingLayer()
         self.chns = [64, 128, 256, 512, 512]  # vg16 features
-        self.net = vgg16(pretrained=True, requires_grad=False)  # NOTE: TODO: need in_channels = 4 to use
+        self.net = vgg16(pretrained=True, requires_grad=False)
         self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
         self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
         self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
@@ -90,7 +90,6 @@ class LPIPS(nn.Module):
         return val
 
 
-# SCH: TODO: this channel shift & scale may need to be changed
 class ScalingLayer(nn.Module):
     def __init__(self):
         super(ScalingLayer, self).__init__()
diff --git a/opensora/models/vae/utils.py b/opensora/models/vae/utils.py
index 6c0c07c..6b0ba6b 100644
--- a/opensora/models/vae/utils.py
+++ b/opensora/models/vae/utils.py
@@ -1,26 +1,9 @@
 import numpy as np
 import torch
 
-# from taming.modules.losses.lpips import LPIPS # need to pip install https://github.com/CompVis/taming-transformers
-# from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
-
 """Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
 
 
-## NOTE: not used since we only have 'GN'
-# def get_norm_layer(norm_type, dtype):
-#   if norm_type == 'LN':
-#     # supply a few args with partial function and pass the rest of the args when this norm_fn is called
-#     norm_fn = functools.partial(nn.LayerNorm, dtype=dtype)
-#   elif norm_type == 'GN': #
-#     norm_fn = functools.partial(nn.GroupNorm, dtype=dtype)
-#   elif norm_type is None:
-#     norm_fn = lambda: (lambda x: x)
-#   else:
-#     raise NotImplementedError(f'norm_type: {norm_type}')
-#   return norm_fn
-
-
 class DiagonalGaussianDistribution(object):
     def __init__(
         self,
@@ -57,7 +40,7 @@ class DiagonalGaussianDistribution(object):
                     dim=[1, 2, 3, 4],
                 )
 
-    def nll(self, sample, dims=[1, 2, 3, 4]):  # TODO: what does this do?
+    def nll(self, sample, dims=[1, 2, 3, 4]):
         if self.deterministic:
             return torch.Tensor([0.0])
         logtwopi = np.log(2.0 * np.pi)
diff --git a/requirements/requirements-vae.txt b/requirements/requirements-vae.txt
new file mode 100644
index 0000000..75530e4
--- /dev/null
+++ b/requirements/requirements-vae.txt
@@ -0,0 +1,5 @@
+beartype==0.18.5
+einops==0.8.0
+einops-exts==0.0.4
+opencv-python==4.9.0.80
+pillow==10.3.0
diff --git a/setup.py b/setup.py
index a90450e..f9b1d42 100644
--- a/setup.py
+++ b/setup.py
@@ -79,6 +79,7 @@ setup(
     extras_require={
         "data": fetch_requirements("requirements/requirements-data.txt"),
         "eval": fetch_requirements("requirements/requirements-eval.txt"),
+        "vae": fetch_requirements("requirements/requirements-vae.txt"),
         "full": fetch_requirements(
             [
                 "requirements/requirements-data.txt",