mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-11 05:13:31 +02:00
Merge pull request #148 from hpcaitech/feature/docs_v1.2
Feature/docs v1.2
This commit is contained in:
commit
f0c98dd186
32
README.md
32
README.md
|
|
@ -137,6 +137,7 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts.
|
|||
* [Gradio Demo](#gradio-demo)
|
||||
* [Inference](#inference)
|
||||
* [Data Processing](#data-processing)
|
||||
* [VAE](#vae)
|
||||
* [Training](#training)
|
||||
* [Evaluation](#evaluation)
|
||||
* [Contribution](#contribution)
|
||||
|
|
@ -158,7 +159,7 @@ Other useful documents and links are listed below.
|
|||
|
||||
### Install from Source
|
||||
|
||||
For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation](docs/installation.md) for more instructions on different cuda version, and additional dependency for data preprocessing.
|
||||
For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation](docs/installation.md) for more instructions on different CUDA versions, and additional dependencies for data preprocessing, VAE, and model evaluation.
|
||||
|
||||
```bash
|
||||
# create a virtual env and activate (conda as an example)
|
||||
|
|
@ -399,6 +400,35 @@ Also check out the [datasets](docs/datasets.md) we use.
|
|||
|
||||

|
||||
|
||||
## VAE
|
||||
We train a VAE pipeline that consists of a spatial VAE followed by a temporal VAE.
|
||||
For more details, refer to our [VAE documentation](docs/vae.md).
|
||||
Before you run the following commands, follow our [Installation Documentation](docs/installation.md) to install the required dependencies for VAE and Evaluation.
|
||||
|
||||
Once you prepare the data in a `csv` file, run the following commands to train the VAE.
|
||||
Note that you need to adjust the number of trained epochs (`epochs`) in the config file accordingly with respect to your own csv data size.
|
||||
|
||||
```bash
|
||||
# stage 1 training, 380k steps, 8 GPUs
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
|
||||
# stage 2 training, 260k steps, 8 GPUs
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage[1-3].py --data-path YOUR_CSV_PATH
|
||||
# stage 3 training, 540k steps, 24 GPUs
|
||||
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage[1-3].py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
To evaluate the VAE performance, you need to run VAE inference first to generate the videos, then calculate scores on the generated videos:
|
||||
|
||||
```bash
|
||||
# video generation
|
||||
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
|
||||
# the original videos will be saved to `YOUR_VIDEO_DIR_ori`
|
||||
# the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec`
|
||||
# the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial`
|
||||
|
||||
# score calculation
|
||||
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
|
||||
```
|
||||
|
||||
## Training
|
||||
|
||||
### Open-Sora 1.2 Training
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
num_frames = 1
|
||||
num_frames = 17
|
||||
image_size = (256, 256)
|
||||
|
||||
# Define dataset
|
||||
|
|
@ -35,10 +35,11 @@ model = dict(
|
|||
)
|
||||
|
||||
# loss weights
|
||||
perceptual_loss_weight = 0.0 # use vgg is not None and more than 0
|
||||
perceptual_loss_weight = 0.1 # use vgg is not None and more than 0
|
||||
kl_loss_weight = 1e-6
|
||||
|
||||
mixed_image_ratio = 0.1
|
||||
mixed_strategy = "mixed_video_image"
|
||||
mixed_image_ratio = 0.2
|
||||
use_real_rec_loss = False
|
||||
use_z_rec_loss = True
|
||||
use_image_identity_loss = True
|
||||
|
|
@ -48,7 +49,7 @@ seed = 42
|
|||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 100
|
||||
epochs = 100 # NOTE: adjust accordingly w.r.t dataset size
|
||||
log_every = 1
|
||||
ckpt_every = 1000
|
||||
load = None
|
||||
|
|
@ -34,30 +34,14 @@ model = dict(
|
|||
),
|
||||
)
|
||||
|
||||
discriminator = dict(
|
||||
type="NLayerDiscriminator",
|
||||
from_pretrained="/home/shenchenhui/opensoraplan-v1.0.0-discriminator.pt",
|
||||
input_nc=3,
|
||||
n_layers=3,
|
||||
use_actnorm=False,
|
||||
)
|
||||
|
||||
# discriminator hyper-parames TODO
|
||||
discriminator_factor = 1
|
||||
discriminator_start = -1
|
||||
generator_factor = 0.5
|
||||
generator_loss_type = "hinge"
|
||||
discriminator_loss_type = "hinge"
|
||||
lecam_loss_weight = None
|
||||
gradient_penalty_loss_weight = None
|
||||
|
||||
# loss weights
|
||||
perceptual_loss_weight = 0.1 # use vgg is not None and more than 0
|
||||
kl_loss_weight = 1e-6
|
||||
|
||||
mixed_strategy = "mixed_video_image"
|
||||
mixed_image_ratio = 0.2
|
||||
use_real_rec_loss = True
|
||||
use_z_rec_loss = False
|
||||
use_real_rec_loss = False
|
||||
use_z_rec_loss = True
|
||||
use_image_identity_loss = False
|
||||
|
||||
# Others
|
||||
|
|
@ -65,7 +49,7 @@ seed = 42
|
|||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 100
|
||||
epochs = 100 # NOTE: adjust accordingly w.r.t dataset size
|
||||
log_every = 1
|
||||
ckpt_every = 1000
|
||||
load = None
|
||||
|
|
@ -38,7 +38,7 @@ model = dict(
|
|||
perceptual_loss_weight = 0.1 # use vgg is not None and more than 0
|
||||
kl_loss_weight = 1e-6
|
||||
|
||||
mixed_image_ratio = 0.2
|
||||
mixed_strategy = "mixed_video_random"
|
||||
use_real_rec_loss = True
|
||||
use_z_rec_loss = False
|
||||
use_image_identity_loss = False
|
||||
|
|
@ -48,7 +48,7 @@ seed = 42
|
|||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 100
|
||||
epochs = 100 # NOTE: adjust accordingly w.r.t dataset size
|
||||
log_every = 1
|
||||
ckpt_every = 1000
|
||||
load = None
|
||||
|
|
@ -7,7 +7,7 @@ Note that besides these packages, some packages needs to be mannually installed,
|
|||
|
||||
You need to install `opensora` for training and inference. You can follow the steps below for installation. We also provide guideline for different CUDA versions for compatiblity.
|
||||
|
||||
Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Proessing](#data-processing) and [Evaluation](#evaluation) respectively.
|
||||
Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Processing](#data-processing), [Evaluation](#evaluation), and [VAE](#vae) respectively.
|
||||
|
||||
### Step 1: Install PyTorch and xformers
|
||||
|
||||
|
|
@ -58,63 +58,6 @@ pip install flash-attn --no-build-isolation
|
|||
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
||||
To conduct evaluation, run the following command to install requirements:
|
||||
|
||||
```bash
|
||||
pip install -v .[eval]
|
||||
# For development:`pip install -v -e .[eval]`
|
||||
```
|
||||
|
||||
### Step 2: Install VBench
|
||||
|
||||
<!-- You need to manually install [VBench](https://github.com/Vchitect/VBench):
|
||||
|
||||
```bash
|
||||
pip install --no-deps vbench==0.1.1
|
||||
# If the installation shows a warning about the installed vbench not in PATH, you need to add it by:
|
||||
export PATH="/path/to/vbench:$PATH"
|
||||
``` -->
|
||||
|
||||
You need to install VBench manually by:
|
||||
```bash
|
||||
# first clone their repo
|
||||
cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct
|
||||
git clone https://github.com/Vchitect/VBench.git
|
||||
cd VBench
|
||||
git checkout v0.1.2
|
||||
|
||||
# next, fix their hard-coded path issue
|
||||
vim vbench2_beta_i2v/utils.py
|
||||
# find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder
|
||||
|
||||
# last, create softlinks
|
||||
cd ../Open-Sora # or `cd ../Open-Sora-dev` for development
|
||||
ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path
|
||||
ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench_beta_i2v to your corresponding path
|
||||
# later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found
|
||||
```
|
||||
|
||||
|
||||
### Step 3: Install `cupy` for Potential VAE Errors
|
||||
|
||||
You need to manually install [cupy](https://docs.cupy.dev/en/stable/install.html).
|
||||
|
||||
- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x`
|
||||
- For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x`
|
||||
|
||||
Note that for VAE evaluation, you may run into error with `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`, in this case, you need to go to the corresponding file (`.../pytorchvideo/transforms/augmentations.py`) reporting this error, then change as following:
|
||||
|
||||
```python
|
||||
# find the original line:
|
||||
import torchvision.transforms.functional_tensor as F_t
|
||||
# change to:
|
||||
import torchvision.transforms._functional_tensor as F_t
|
||||
```
|
||||
|
||||
## Data Processing
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
|
@ -193,3 +136,79 @@ and change the assert of `mmcv_version < digit_version(mmcv_maximum_version)` to
|
|||
|
||||
If you are unsure of your path to the mmdet init file, simply run our [OCR command](../tools/scoring/README.md), wait for the mmdet assertion error on mmcv versions.
|
||||
The error will contain the exact path to the mmdet init file.
|
||||
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
||||
To conduct evaluation, run the following command to install requirements:
|
||||
|
||||
```bash
|
||||
pip install -v .[eval]
|
||||
# For development:`pip install -v -e .[eval]`
|
||||
```
|
||||
|
||||
### Step 2: Install VBench
|
||||
|
||||
<!-- You need to manually install [VBench](https://github.com/Vchitect/VBench):
|
||||
|
||||
```bash
|
||||
pip install --no-deps vbench==0.1.1
|
||||
# If the installation shows a warning about the installed vbench not in PATH, you need to add it by:
|
||||
export PATH="/path/to/vbench:$PATH"
|
||||
``` -->
|
||||
|
||||
You need to install VBench manually by:
|
||||
```bash
|
||||
# first clone their repo
|
||||
cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct
|
||||
git clone https://github.com/Vchitect/VBench.git
|
||||
cd VBench
|
||||
git checkout v0.1.2
|
||||
|
||||
# next, fix their hard-coded path issue
|
||||
vim vbench2_beta_i2v/utils.py
|
||||
# find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder
|
||||
|
||||
# last, create softlinks
|
||||
cd ../Open-Sora # or `cd ../Open-Sora-dev` for development
|
||||
ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path
|
||||
ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench_beta_i2v to your corresponding path
|
||||
# later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found
|
||||
```
|
||||
|
||||
|
||||
### Step 3: Install `cupy` for Potential VAE Errors
|
||||
|
||||
You need to manually install [cupy](https://docs.cupy.dev/en/stable/install.html).
|
||||
|
||||
- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x`
|
||||
- For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x`
|
||||
|
||||
Note that for VAE evaluation, you may run into error with `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`, in this case, you need to go to the corresponding file (`.../pytorchvideo/transforms/augmentations.py`) reporting this error, then change as following:
|
||||
|
||||
```python
|
||||
# find the original line:
|
||||
import torchvision.transforms.functional_tensor as F_t
|
||||
# change to:
|
||||
import torchvision.transforms._functional_tensor as F_t
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## VAE
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
||||
To train and evaluate your own VAE, run the following command to install requirements:
|
||||
|
||||
```bash
|
||||
pip install -v .[vae]
|
||||
# For development:`pip install -v -e .[vae]`
|
||||
```
|
||||
|
||||
### Step 2: VAE Evaluation (`cupy` and Potential VAE Errors)
|
||||
|
||||
Refer to [Evaluation's VAE section](#step-3-install-cupy-for-potential-vae-errors).
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ Our training involves three stages:
|
|||
2. For the next 260k steps, we remove the identity loss and just learn the 3D VAE.
|
||||
3. For the last 540k steps, since we find that only reconstructing the 2D VAE's features cannot lead to further improvement, we remove the loss and train the whole VAE to reconstruct the original videos. This stage is trained on 24 GPUs.
|
||||
|
||||
For the first half of training, we adopt 20% images and 80% videos. We find videos with length different from 17 frames will suffer from blurring. Thus, we use a random number within 34 frames to make our VAE more robust to different video lengths. Our [training](/scripts/train_vae.py) and [inference](/scripts/inference_vae.py) code is available in the Open-Sora 1.2 release.
|
||||
For both stage 1 and stage 2 training, we adopt 20% images and 80% videos. We find videos with length different from 17 frames will suffer from blurring. Thus, we use a random number within 34 frames to make our VAE more robust to different video lengths. Our [training](/scripts/train_vae.py) and [inference](/scripts/inference_vae.py) code is available in the Open-Sora 1.2 release.
|
||||
|
||||
When using the VAE for the diffusion model, our stacked VAE requires little memory as our VAE's input is already compressed. We also split the input videos into several 17-frame clips to make the inference more efficient. The performance of our VAE is on par with another open-sourced 3D VAE in [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/docs/Report-v1.1.0.md).
|
||||
|
||||
|
|
|
|||
63
docs/vae.md
Normal file
63
docs/vae.md
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# VAE Report
|
||||
|
||||
As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we develop a temporal VAE for the diffusion model to adapt to.
|
||||
Specifically, our VAE consists of a pipeline of a [spatial VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers) followed by a temporal VAE.
|
||||
For the temporal VAE, we follow the implementation of [MAGVIT-v2](https://arxiv.org/abs/2310.05737), with the following modifications:
|
||||
* We remove the architecture specific to the codebook.
|
||||
* We do not use the discriminator, and use the VAE reconstruction loss, kl loss, and perceptual loss for training.
|
||||
* In the last linear layer of the encoder, we scale down to a diagonal Gaussian Distribution of 4 channels, following our previously trained STDiT that takes in 4 channels input.
|
||||
* Our decoder is symmetric to the encoder architecture.
|
||||
|
||||
## Training
|
||||
|
||||
We train the model in different stages.
|
||||
|
||||
We first train the temporal VAE only by freezing the spatial VAE for 380k steps on a single machine (8 GPUs).
|
||||
We use an additional identity loss to make features from the 3D VAE similar to the features from the 2D VAE.
|
||||
We train the VAE using 20% images and 80% videos with 17 frames.
|
||||
```bash
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
Next, we remove the identity loss and train the 3D VAE pipeline to reconstruct the 2D-compressed videos for 260k steps.
|
||||
```bash
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
Finally, we remove the reconstruction loss for the 2D-compressed videos and train the VAE pipeline to reconstruct the original videos for 540k steps.
|
||||
We train our VAE with a random number within 34 frames to make it more robust to different video lengths.
|
||||
This stage is trained on 24 GPUs.
|
||||
```bash
|
||||
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
|
||||
Note that you need to adjust the `epochs` in the config file accordingly with respect to your own csv data size.
|
||||
|
||||
|
||||
## Inference
|
||||
|
||||
To visually check the performance of the VAE, you may run the following inference.
|
||||
It saves the original video to your specified video directory with `_ori` postfix (i.e. `"YOUR_VIDEO_DIR"_ori`), the reconstructed video from the full pipeline with the `_rec` postfix (i.e. `"YOUR_VIDEO_DIR"_rec`), and the reconstructed video from the 2D compression and decompression with the `_spatial` postfix (i.e. `"YOUR_VIDEO_DIR"_spatial`).
|
||||
```bash
|
||||
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
We can then calculate the scores of the VAE performances on metrics of SSIM, PSNR, LPIPS, and FLOLPIPS.
|
||||
|
||||
* SSIM: structural similarity index measure, the higher the better
|
||||
* PSNR: peak-signal-to-noise ratio, the higher the better
|
||||
* LPIPS: Learned Perceptual Image Patch Similarity, the lower the better
|
||||
* [FloLPIPS](https://arxiv.org/pdf/2207.08119): LPIPS with video interpolation, the lower the better.
|
||||
|
||||
```bash
|
||||
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
|
||||
```
|
||||
|
||||
|
||||
## Acknowledgement
|
||||
We are grateful for the following work:
|
||||
* [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation
|
||||
* [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis
|
||||
* [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc)
|
||||
* [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)
|
||||
|
||||
Special thanks go to the authors of [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) for their valuable advice and help.
|
||||
|
|
@ -6,10 +6,10 @@ from tqdm import tqdm
|
|||
|
||||
sys.path.append(".")
|
||||
|
||||
# ERROR: cannot locate the model file
|
||||
from flolpips.pwcnet import Network as PWCNet
|
||||
from flolpips.flolpips import FloLPIPS
|
||||
loss_fn = FloLPIPS(net='alex', version='0.1').eval().requires_grad_(False)
|
||||
from flolpips.pwcnet import Network as PWCNet
|
||||
|
||||
loss_fn = FloLPIPS(net="alex", version="0.1").eval().requires_grad_(False)
|
||||
flownet = PWCNet().eval().requires_grad_(False)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,79 +0,0 @@
|
|||
## Commands
|
||||
|
||||
## 0. References
|
||||
|
||||
* https://github.com/google-research/magvit
|
||||
* https://github.com/CompVis/taming-transformers
|
||||
* https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc
|
||||
* https://github.com/PKU-YuanGroup/Open-Sora-Plan
|
||||
|
||||
|
||||
## 1. VAE 3D
|
||||
### 1.1 Train
|
||||
|
||||
```yaml
|
||||
# train on pexel dataset
|
||||
WANDB_API_KEY=<wandb_api_key> CUDA_VISIBLE_DEVICES=<n> torchrun --master_port=<port_num> --nnodes=1 --nproc_per_node=1 scripts/train-vae.py configs/vae_3d/train/16x256x256.py --data-path /home/shenchenhui/data/pexels/train.csv --wandb True
|
||||
```
|
||||
|
||||
### 1.2 Inference
|
||||
|
||||
```yaml
|
||||
CUDA_VISIBLE_DEVICES=6 torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference-vae.py configs/vae_3d/inference/16x256x256.py --ckpt-path /home/shenchenhui/Open-Sora-dev/outputs/train_pexel_028/epoch3-global_step20000/ --data-path /home/shenchenhui/data/pexels/debug.csv --save-dir outputs/pexel
|
||||
|
||||
|
||||
# resume training debug
|
||||
CUDA_VISIBLE_DEVICES=5 torchrun --master_port=29530 --nnodes=1 --nproc_per_node=1 scripts/train-vae.py configs/vae_3d/train/16x256x256.py --data-path /home/shenchenhui/data/pexels/debug.csv --load /home/shenchenhui/Open-Sora-dev/outputs/006-F16S3-VAE_3D_B/epoch49-global_step50
|
||||
```
|
||||
|
||||
version 2 pipeline
|
||||
```yaml
|
||||
# NOTE: first VAE is pretrained 2D, 16x128x128 --> 16x16x16
|
||||
# then we train our own temporal VAE, 16x16x16 --> 4x16x16
|
||||
# we use a 3 layer discriminator on the intermediate of 16x16x16
|
||||
WANDB_API_KEY=<wandb_api_key> CUDA_VISIBLE_DEVICES=7 torchrun --master_port=29580 --nnodes=1 --nproc_per_node=1 scripts/train-vae-v2.py configs/vae_magvit_v2/train/pipeline_16x128x128.py --data-path /home/shenchenhui/data/trial_data/train_short.csv --wandb True
|
||||
```
|
||||
|
||||
|
||||
## 2. MAGVIT-v2
|
||||
|
||||
### 2.1 dependencies
|
||||
```
|
||||
'accelerate>=0.24.0',
|
||||
'beartype',
|
||||
'einops>=0.7.0',
|
||||
'ema-pytorch>=0.2.4',
|
||||
'pytorch-warmup',
|
||||
'gateloop-transformer>=0.2.2',
|
||||
'kornia',
|
||||
'opencv-python',
|
||||
'pillow',
|
||||
'pytorch-custom-utils>=0.0.9',
|
||||
'numpy',
|
||||
'vector-quantize-pytorch>=1.11.8',
|
||||
'taylor-series-linear-attention>=0.1.5',
|
||||
'torch',
|
||||
'torchvision',
|
||||
'x-transformers'
|
||||
```
|
||||
|
||||
Note:
|
||||
uses `hotfix/zero` branch of `https://github.com/ver217/ColossalAI.git`.
|
||||
clone the repo, go to the branch, then do `pip install .`
|
||||
|
||||
|
||||
### 2.2 Train
|
||||
|
||||
```yaml
|
||||
CUDA_VISIBLE_DEVICES7 torchrun --master_port=29510 --nnodes=1 --nproc_per_node=1 scripts/train-vae-v2.py configs/vae_magvit_v2/train/17x128x128.py --data-path /home/shenchenhui/data/pexels/train.csv
|
||||
```
|
||||
|
||||
### 2.3 Inference
|
||||
|
||||
|
||||
### 2.4 Data
|
||||
|
||||
full data combining the following: `/home/shenchenhui/data/pixabay+pexels.csv`
|
||||
|
||||
* ~/data/pixabay: `/home/data/sora_data/pixabay/raw/data/split-0`
|
||||
* pexels: `/home/litianyi/data/pexels/processed/meta/pexels_caption_vinfo_ready_noempty_clean.csv`
|
||||
|
|
@ -50,7 +50,7 @@ class LPIPS(nn.Module):
|
|||
super().__init__()
|
||||
self.scaling_layer = ScalingLayer()
|
||||
self.chns = [64, 128, 256, 512, 512] # vg16 features
|
||||
self.net = vgg16(pretrained=True, requires_grad=False) # NOTE: TODO: need in_channels = 4 to use
|
||||
self.net = vgg16(pretrained=True, requires_grad=False)
|
||||
self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
|
||||
self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
|
||||
self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
|
||||
|
|
@ -90,7 +90,6 @@ class LPIPS(nn.Module):
|
|||
return val
|
||||
|
||||
|
||||
# SCH: TODO: this channel shift & scale may need to be changed
|
||||
class ScalingLayer(nn.Module):
|
||||
def __init__(self):
|
||||
super(ScalingLayer, self).__init__()
|
||||
|
|
|
|||
|
|
@ -1,26 +1,9 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
# from taming.modules.losses.lpips import LPIPS # need to pip install https://github.com/CompVis/taming-transformers
|
||||
# from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
|
||||
|
||||
"""Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
|
||||
|
||||
|
||||
## NOTE: not used since we only have 'GN'
|
||||
# def get_norm_layer(norm_type, dtype):
|
||||
# if norm_type == 'LN':
|
||||
# # supply a few args with partial function and pass the rest of the args when this norm_fn is called
|
||||
# norm_fn = functools.partial(nn.LayerNorm, dtype=dtype)
|
||||
# elif norm_type == 'GN': #
|
||||
# norm_fn = functools.partial(nn.GroupNorm, dtype=dtype)
|
||||
# elif norm_type is None:
|
||||
# norm_fn = lambda: (lambda x: x)
|
||||
# else:
|
||||
# raise NotImplementedError(f'norm_type: {norm_type}')
|
||||
# return norm_fn
|
||||
|
||||
|
||||
class DiagonalGaussianDistribution(object):
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -57,7 +40,7 @@ class DiagonalGaussianDistribution(object):
|
|||
dim=[1, 2, 3, 4],
|
||||
)
|
||||
|
||||
def nll(self, sample, dims=[1, 2, 3, 4]): # TODO: what does this do?
|
||||
def nll(self, sample, dims=[1, 2, 3, 4]):
|
||||
if self.deterministic:
|
||||
return torch.Tensor([0.0])
|
||||
logtwopi = np.log(2.0 * np.pi)
|
||||
|
|
|
|||
5
requirements/requirements-vae.txt
Normal file
5
requirements/requirements-vae.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
beartype==0.18.5
|
||||
einops==0.8.0
|
||||
einops-exts==0.0.4
|
||||
opencv-python==4.9.0.80
|
||||
pillow==10.3.0
|
||||
1
setup.py
1
setup.py
|
|
@ -79,6 +79,7 @@ setup(
|
|||
extras_require={
|
||||
"data": fetch_requirements("requirements/requirements-data.txt"),
|
||||
"eval": fetch_requirements("requirements/requirements-eval.txt"),
|
||||
"vae": fetch_requirements("requirements/requirements-vae.txt"),
|
||||
"full": fetch_requirements(
|
||||
[
|
||||
"requirements/requirements-data.txt",
|
||||
|
|
|
|||
Loading…
Reference in a new issue