mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-15 20:36:58 +02:00
format
This commit is contained in:
parent
0fb6415007
commit
d2e710c977
32
README.md
32
README.md
|
|
@ -136,6 +136,7 @@ see [here](/assets/texts/t2v_samples.txt) for full prompts.
|
|||
* [Gradio Demo](#gradio-demo)
|
||||
* [Inference](#inference)
|
||||
* [Data Processing](#data-processing)
|
||||
* [VAE](#vae)
|
||||
* [Training](#training)
|
||||
* [Evaluation](#evaluation)
|
||||
* [Contribution](#contribution)
|
||||
|
|
@ -157,7 +158,7 @@ Other useful documents and links are listed below.
|
|||
|
||||
### Install from Source
|
||||
|
||||
For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation](docs/installation.md) for more instructions on different cuda version, and additional dependency for data preprocessing.
|
||||
For CUDA 12.1, you can install the dependencies with the following commands. Otherwise, please refer to [Installation](docs/installation.md) for more instructions on different cuda version, and additional dependency for data preprocessing, VAE, and model evaluation.
|
||||
|
||||
```bash
|
||||
# create a virtual env and activate (conda as an example)
|
||||
|
|
@ -372,6 +373,35 @@ Also check out the [datasets](docs/datasets.md) we use.
|
|||
|
||||

|
||||
|
||||
## VAE
|
||||
We train a VAE pipeline that consists of a spatial VAE followed by a temporal VAE.
|
||||
For more details, refer to our [VAE documentation](docs/vae.md).
|
||||
Before you run the following commands, follow our [Installation Documentation](docs/installation.md) to install the required dependencies for VAE and Evaluation.
|
||||
|
||||
Once you prepare the data in a `csv` file, run the following commands to train the VAE.
|
||||
Note that you need to adjust the number of trained epochs (`epochs`) in the config file accordingly with respect to your own csv data size.
|
||||
|
||||
```bash
|
||||
# stage 1 training, 380k steps, 8 GPUs
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
|
||||
# stage 2 training, 260k steps, 8 GPUs
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage[1-3].py --data-path YOUR_CSV_PATH
|
||||
# stage 3 training, 540k steps, 24 GPUs
|
||||
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage[1-3].py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
To evaluate the VAE performance, you need to run VAE inference first to generate the videos, then calculate scores on the generated videos:
|
||||
|
||||
```bash
|
||||
# video generation
|
||||
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
|
||||
# the original videos will be saved to `YOUR_VIDEO_DIR_ori`
|
||||
# the reconstructed videos through the pipeline will be saved to `YOUR_VIDEO_DIR_rec`
|
||||
# the reconstructed videos through the spatial VAE only will be saved to `YOUR_VIDEO_DIR_spatial`
|
||||
|
||||
# score calculation
|
||||
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
|
||||
```
|
||||
|
||||
## Training
|
||||
|
||||
### Open-Sora 1.2 Training
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
num_frames = 1
|
||||
num_frames = 17
|
||||
image_size = (256, 256)
|
||||
|
||||
# Define dataset
|
||||
|
|
@ -35,10 +35,11 @@ model = dict(
|
|||
)
|
||||
|
||||
# loss weights
|
||||
perceptual_loss_weight = 0.0 # use vgg is not None and more than 0
|
||||
perceptual_loss_weight = 0.1 # use vgg is not None and more than 0
|
||||
kl_loss_weight = 1e-6
|
||||
|
||||
mixed_image_ratio = 0.1
|
||||
mixed_strategy = "mixed_video_image"
|
||||
mixed_image_ratio = 0.2
|
||||
use_real_rec_loss = False
|
||||
use_z_rec_loss = True
|
||||
use_image_identity_loss = True
|
||||
|
|
@ -48,7 +49,7 @@ seed = 42
|
|||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 100
|
||||
epochs = 100 # NOTE: adjust accordingly w.r.t dataset size
|
||||
log_every = 1
|
||||
ckpt_every = 1000
|
||||
load = None
|
||||
|
|
@ -34,30 +34,14 @@ model = dict(
|
|||
),
|
||||
)
|
||||
|
||||
discriminator = dict(
|
||||
type="NLayerDiscriminator",
|
||||
from_pretrained="/home/shenchenhui/opensoraplan-v1.0.0-discriminator.pt",
|
||||
input_nc=3,
|
||||
n_layers=3,
|
||||
use_actnorm=False,
|
||||
)
|
||||
|
||||
# discriminator hyper-parames TODO
|
||||
discriminator_factor = 1
|
||||
discriminator_start = -1
|
||||
generator_factor = 0.5
|
||||
generator_loss_type = "hinge"
|
||||
discriminator_loss_type = "hinge"
|
||||
lecam_loss_weight = None
|
||||
gradient_penalty_loss_weight = None
|
||||
|
||||
# loss weights
|
||||
perceptual_loss_weight = 0.1 # use vgg is not None and more than 0
|
||||
kl_loss_weight = 1e-6
|
||||
|
||||
mixed_strategy = "mixed_video_image"
|
||||
mixed_image_ratio = 0.2
|
||||
use_real_rec_loss = True
|
||||
use_z_rec_loss = False
|
||||
use_real_rec_loss = False
|
||||
use_z_rec_loss = True
|
||||
use_image_identity_loss = False
|
||||
|
||||
# Others
|
||||
|
|
@ -65,7 +49,7 @@ seed = 42
|
|||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 100
|
||||
epochs = 100 # NOTE: adjust accordingly w.r.t dataset size
|
||||
log_every = 1
|
||||
ckpt_every = 1000
|
||||
load = None
|
||||
|
|
@ -38,7 +38,7 @@ model = dict(
|
|||
perceptual_loss_weight = 0.1 # use vgg is not None and more than 0
|
||||
kl_loss_weight = 1e-6
|
||||
|
||||
mixed_image_ratio = 0.2
|
||||
mixed_strategy = "mixed_video_random"
|
||||
use_real_rec_loss = True
|
||||
use_z_rec_loss = False
|
||||
use_image_identity_loss = False
|
||||
|
|
@ -48,7 +48,7 @@ seed = 42
|
|||
outputs = "outputs"
|
||||
wandb = False
|
||||
|
||||
epochs = 100
|
||||
epochs = 100 # NOTE: adjust accordingly w.r.t dataset size
|
||||
log_every = 1
|
||||
ckpt_every = 1000
|
||||
load = None
|
||||
|
|
@ -7,7 +7,7 @@ Note that besides these packages, some packages need to be manually installed,
|
|||
|
||||
You need to install `opensora` for training and inference. You can follow the steps below for installation. We also provide guidelines for different CUDA versions for compatibility.
|
||||
|
||||
Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Proessing](#data-processing) and [Evaluation](#evaluation) respectively.
|
||||
Please note that the default installation is for training and inference only. Other optional dependencies are detailed in the sections [Data Processing](#data-processing), [Evaluation](#evaluation), and [VAE](#vae) respectively.
|
||||
|
||||
### Step 1: Install PyTorch and xformers
|
||||
|
||||
|
|
@ -58,63 +58,6 @@ pip install flash-attn --no-build-isolation
|
|||
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
||||
To conduct evaluation, run the following command to install requirements:
|
||||
|
||||
```bash
|
||||
pip install -v .[eval]
|
||||
# For development:`pip install -v -e .[eval]`
|
||||
```
|
||||
|
||||
### Step 2: Install VBench
|
||||
|
||||
<!-- You need to manually install [VBench](https://github.com/Vchitect/VBench):
|
||||
|
||||
```bash
|
||||
pip install --no-deps vbench==0.1.1
|
||||
# If the installation shows a warning about the installed vbench not in PATH, you need to add it by:
|
||||
export PATH="/path/to/vbench:$PATH"
|
||||
``` -->
|
||||
|
||||
You need to install VBench manually by:
|
||||
```bash
|
||||
# first clone their repo
|
||||
cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct
|
||||
git clone https://github.com/Vchitect/VBench.git
|
||||
cd VBench
|
||||
git checkout v0.1.2
|
||||
|
||||
# next, fix their hard-coded path issue
|
||||
vim vbench2_beta_i2v/utils.py
|
||||
# find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder
|
||||
|
||||
# last, create softlinks
|
||||
cd ../Open-Sora # or `cd ../Open-Sora-dev` for development
|
||||
ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path
|
||||
ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench_beta_i2v to your corresponding path
|
||||
# later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found
|
||||
```
|
||||
|
||||
|
||||
### Step 3: Install `cupy` for Potential VAE Errors
|
||||
|
||||
You need to manually install [cupy](https://docs.cupy.dev/en/stable/install.html).
|
||||
|
||||
- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x`
|
||||
- For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x`
|
||||
|
||||
Note that for VAE evaluation, you may run into error with `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`, in this case, you need to go to the corresponding file (`.../pytorchvideo/transforms/augmentations.py`) reporting this error, then change as following:
|
||||
|
||||
```python
|
||||
# find the original line:
|
||||
import torchvision.transforms.functional_tensor as F_t
|
||||
# change to:
|
||||
import torchvision.transforms._functional_tensor as F_t
|
||||
```
|
||||
|
||||
## Data Processing
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
|
@ -193,3 +136,79 @@ and change the assert of `mmcv_version < digit_version(mmcv_maximum_version)` to
|
|||
|
||||
If you are unsure of your path to the mmdet init file, simply run our [OCR command](../tools/scoring/README.md), wait for the mmdet assertion error on mmcv versions.
|
||||
The error will contain the exact path to the mmdet init file.
|
||||
|
||||
|
||||
## Evaluation
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
||||
To conduct evaluation, run the following command to install requirements:
|
||||
|
||||
```bash
|
||||
pip install -v .[eval]
|
||||
# For development:`pip install -v -e .[eval]`
|
||||
```
|
||||
|
||||
### Step 2: Install VBench
|
||||
|
||||
<!-- You need to manually install [VBench](https://github.com/Vchitect/VBench):
|
||||
|
||||
```bash
|
||||
pip install --no-deps vbench==0.1.1
|
||||
# If the installation shows a warning about the installed vbench not in PATH, you need to add it by:
|
||||
export PATH="/path/to/vbench:$PATH"
|
||||
``` -->
|
||||
|
||||
You need to install VBench manually by:
|
||||
```bash
|
||||
# first clone their repo
|
||||
cd .. # assume you are in the Open-Sora root folder, you may install at other location but make sure the soft link paths later are correct
|
||||
git clone https://github.com/Vchitect/VBench.git
|
||||
cd VBench
|
||||
git checkout v0.1.2
|
||||
|
||||
# next, fix their hard-coded path issue
|
||||
vim vbench2_beta_i2v/utils.py
|
||||
# find `image_root` in the `load_i2v_dimension_info` function, change it to point to your appropriate image folder
|
||||
|
||||
# last, create softlinks
|
||||
cd ../Open-Sora # or `cd ../Open-Sora-dev` for development
|
||||
ln -s ../VBench/vbench vbench # you may need to change ../VBench/vbench to your corresponding path
|
||||
ln -s ../VBench/vbench2_beta_i2v vbench2_beta_i2v # you may need to change ../VBench/vbench_beta_i2v to your corresponding path
|
||||
# later you need to make sure to run evaluation from your Open-Sora folder, else vbench, vbench2_beta_i2v cannot be found
|
||||
```
|
||||
|
||||
|
||||
### Step 3: Install `cupy` for Potential VAE Errors
|
||||
|
||||
You need to manually install [cupy](https://docs.cupy.dev/en/stable/install.html).
|
||||
|
||||
- For CUDA v11.2~11.8 (x86_64 / aarch64), `pip install cupy-cuda11x`
|
||||
- For CUDA v12.x (x86_64 / aarch64), `pip install cupy-cuda12x`
|
||||
|
||||
Note that for VAE evaluation, you may run into error with `ModuleNotFoundError: No module named 'torchvision.transforms.functional_tensor'`, in this case, you need to go to the corresponding file (`.../pytorchvideo/transforms/augmentations.py`) reporting this error, then change as following:
|
||||
|
||||
```python
|
||||
# find the original line:
|
||||
import torchvision.transforms.functional_tensor as F_t
|
||||
# change to:
|
||||
import torchvision.transforms._functional_tensor as F_t
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## VAE
|
||||
|
||||
### Step 1: Install Requirements
|
||||
|
||||
To train and evaluate your own VAE, run the following command to install requirements:
|
||||
|
||||
```bash
|
||||
pip install -v .[vae]
|
||||
# For development:`pip install -v -e .[vae]`
|
||||
```
|
||||
|
||||
### Step 2: VAE Evaluation (`cupy` and Potential VAE Errors)
|
||||
|
||||
Refer to [Evaluation's VAE section](#step-3-install-cupy-for-potential-vae-errors).
|
||||
|
|
|
|||
63
docs/vae.md
Normal file
63
docs/vae.md
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# VAE Report
|
||||
|
||||
As [Pixart-Sigma](https://arxiv.org/abs/2403.04692) finds that adapting to a new VAE is simple, we develop a temporal VAE for the diffusion model to adapt to.
|
||||
Specifically, our VAE consists of a pipeline of a [spatial VAE](https://huggingface.co/PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers) followed by a temporal VAE.
|
||||
For the temporal VAE, we follow the implementation of [MAGVIT-v2](https://arxiv.org/abs/2310.05737), with the following modifications:
|
||||
* We remove the architecture specific to the codebook.
|
||||
* We do not use the discriminator, and use the VAE reconstruction loss, kl loss, and perceptual loss for training.
|
||||
* In the last linear layer of the encoder, we scale down to a diagonal Gaussian Distribution of 4 channels, following our previously trained STDiT that takes in 4 channels input.
|
||||
* Our decoder is symmetric to the encoder architecture.
|
||||
|
||||
## Training
|
||||
|
||||
We train the model in different stages.
|
||||
|
||||
We first train the temporal VAE only by freezing the spatial VAE for 380k steps on a single machine (8 GPUs).
|
||||
We use an additional identity loss to make features from the 3D VAE similar to the features from the 2D VAE.
|
||||
We train the VAE using 20% images and 80% videos with 17 frames.
|
||||
```bash
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage1.py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
Next, we remove the identity loss and train the 3D VAE pipeline to reconstruct the 2D-compressed videos for 260k steps.
|
||||
```bash
|
||||
torchrun --nnodes=1 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage2.py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
Finally, we remove the reconstruction loss for the 2D-compressed videos and train the VAE pipeline to construct the 3D videos for 540k steps.
|
||||
We train our VAE with a random number within 34 frames to make it more robust to different video lengths.
|
||||
This stage is trained on 24 GPUs.
|
||||
```bash
|
||||
torchrun --nnodes=3 --nproc_per_node=8 scripts/train_vae.py configs/vae/train/stage3.py --data-path YOUR_CSV_PATH
|
||||
```
|
||||
|
||||
Note that you need to adjust the `epochs` in the config file accordingly with respect to your own csv data size.
|
||||
|
||||
|
||||
## Inference
|
||||
|
||||
To visually check the performance of the VAE, you may run the following inference.
|
||||
It saves the original video to your specified video directory with `_ori` postfix (i.e. `"YOUR_VIDEO_DIR"_ori`), the reconstructed video from the full pipeline with the `_rec` postfix (i.e. `"YOUR_VIDEO_DIR"_rec`), and the reconstructed video from the 2D compression and decompression with the `_spatial` postfix (i.e. `"YOUR_VIDEO_DIR"_spatial`).
|
||||
```bash
|
||||
torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference_vae.py configs/vae/inference/video.py --ckpt-path YOUR_VAE_CKPT_PATH --data-path YOUR_CSV_PATH --save-dir YOUR_VIDEO_DIR
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
We can then calculate the scores of the VAE performances on metrics of SSIM, PSNR, LPIPS, and FLOLPIPS.
|
||||
|
||||
* SSIM: structural similarity index measure, the higher the better
|
||||
* PSNR: peak-signal-to-noise ratio, the higher the better
|
||||
* LPIPS: Learned Perceptual Image Patch Similarity, the lower the better
|
||||
* [FloLPIPS](https://arxiv.org/pdf/2207.08119): LPIPS with video interpolation, the lower the better.
|
||||
|
||||
```bash
|
||||
python eval/vae/eval_common_metric.py --batch_size 2 --real_video_dir YOUR_VIDEO_DIR_ori --generated_video_dir YOUR_VIDEO_DIR_rec --device cuda --sample_fps 24 --crop_size 256 --resolution 256 --num_frames 17 --sample_rate 1 --metric ssim psnr lpips flolpips
|
||||
```
|
||||
|
||||
|
||||
## Acknowledgement
|
||||
We are grateful for the following work:
|
||||
* [MAGVIT-v2](https://arxiv.org/abs/2310.05737): Language Model Beats Diffusion -- Tokenizer is Key to Visual Generation
|
||||
* [Taming Transformers](https://github.com/CompVis/taming-transformers): Taming Transformers for High-Resolution Image Synthesis
|
||||
* [3D blur pooling](https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc)
|
||||
* [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan)
|
||||
|
||||
Special thanks go to the authors of [Open-Sora-Plan](https://github.com/PKU-YuanGroup/Open-Sora-Plan) for their valuable advice and help.
|
||||
|
|
@ -6,10 +6,10 @@ from tqdm import tqdm
|
|||
|
||||
sys.path.append(".")
|
||||
|
||||
# ERROR: cannot locate the model file
|
||||
from flolpips.pwcnet import Network as PWCNet
|
||||
from flolpips.flolpips import FloLPIPS
|
||||
loss_fn = FloLPIPS(net='alex', version='0.1').eval().requires_grad_(False)
|
||||
from flolpips.pwcnet import Network as PWCNet
|
||||
|
||||
loss_fn = FloLPIPS(net="alex", version="0.1").eval().requires_grad_(False)
|
||||
flownet = PWCNet().eval().requires_grad_(False)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,79 +0,0 @@
|
|||
## Commands
|
||||
|
||||
## 0. References
|
||||
|
||||
* https://github.com/google-research/magvit
|
||||
* https://github.com/CompVis/taming-transformers
|
||||
* https://github.com/adobe/antialiased-cnns/pull/39/commits/3d6f02b6943c58b68c19c07bc26fad57492ff3bc
|
||||
* https://github.com/PKU-YuanGroup/Open-Sora-Plan
|
||||
|
||||
|
||||
## 1. VAE 3D
|
||||
### 1.1 Train
|
||||
|
||||
```yaml
|
||||
# train on pexel dataset
|
||||
WANDB_API_KEY=<wandb_api_key> CUDA_VISIBLE_DEVICES=<n> torchrun --master_port=<port_num> --nnodes=1 --nproc_per_node=1 scripts/train-vae.py configs/vae_3d/train/16x256x256.py --data-path /home/shenchenhui/data/pexels/train.csv --wandb True
|
||||
```
|
||||
|
||||
### 1.2 Inference
|
||||
|
||||
```yaml
|
||||
CUDA_VISIBLE_DEVICES=6 torchrun --standalone --nnodes=1 --nproc_per_node=1 scripts/inference-vae.py configs/vae_3d/inference/16x256x256.py --ckpt-path /home/shenchenhui/Open-Sora-dev/outputs/train_pexel_028/epoch3-global_step20000/ --data-path /home/shenchenhui/data/pexels/debug.csv --save-dir outputs/pexel
|
||||
|
||||
|
||||
# resume training debug
|
||||
CUDA_VISIBLE_DEVICES=5 torchrun --master_port=29530 --nnodes=1 --nproc_per_node=1 scripts/train-vae.py configs/vae_3d/train/16x256x256.py --data-path /home/shenchenhui/data/pexels/debug.csv --load /home/shenchenhui/Open-Sora-dev/outputs/006-F16S3-VAE_3D_B/epoch49-global_step50
|
||||
```
|
||||
|
||||
version 2 pipeline
|
||||
```yaml
|
||||
# NOTE: first VAE is pretrained 2D, 16x128x128 --> 16x16x16
|
||||
# then we train our own temporal VAE, 16x16x16 --> 4x16x16
|
||||
# we use a 3 layer discriminator on the intermediate of 16x16x16
|
||||
WANDB_API_KEY=<wandb_api_key> CUDA_VISIBLE_DEVICES=7 torchrun --master_port=29580 --nnodes=1 --nproc_per_node=1 scripts/train-vae-v2.py configs/vae_magvit_v2/train/pipeline_16x128x128.py --data-path /home/shenchenhui/data/trial_data/train_short.csv --wandb True
|
||||
```
|
||||
|
||||
|
||||
## 2. MAGVIT-v2
|
||||
|
||||
### 2.1 dependencies
|
||||
```
|
||||
'accelerate>=0.24.0',
|
||||
'beartype',
|
||||
'einops>=0.7.0',
|
||||
'ema-pytorch>=0.2.4',
|
||||
'pytorch-warmup',
|
||||
'gateloop-transformer>=0.2.2',
|
||||
'kornia',
|
||||
'opencv-python',
|
||||
'pillow',
|
||||
'pytorch-custom-utils>=0.0.9',
|
||||
'numpy',
|
||||
'vector-quantize-pytorch>=1.11.8',
|
||||
'taylor-series-linear-attention>=0.1.5',
|
||||
'torch',
|
||||
'torchvision',
|
||||
'x-transformers'
|
||||
```
|
||||
|
||||
Note:
|
||||
uses `hotfix/zero` branch of `https://github.com/ver217/ColossalAI.git`.
|
||||
clone the repo, go to the branch, then do `pip install .`
|
||||
|
||||
|
||||
### 2.2 Train
|
||||
|
||||
```yaml
|
||||
CUDA_VISIBLE_DEVICES7 torchrun --master_port=29510 --nnodes=1 --nproc_per_node=1 scripts/train-vae-v2.py configs/vae_magvit_v2/train/17x128x128.py --data-path /home/shenchenhui/data/pexels/train.csv
|
||||
```
|
||||
|
||||
### 2.3 Inference
|
||||
|
||||
|
||||
### 2.4 Data
|
||||
|
||||
full data combining the following: `/home/shenchenhui/data/pixabay+pexels.csv`
|
||||
|
||||
* ~/data/pixabay: `/home/data/sora_data/pixabay/raw/data/split-0`
|
||||
* pexels: `/home/litianyi/data/pexels/processed/meta/pexels_caption_vinfo_ready_noempty_clean.csv`
|
||||
|
|
@ -50,7 +50,7 @@ class LPIPS(nn.Module):
|
|||
super().__init__()
|
||||
self.scaling_layer = ScalingLayer()
|
||||
self.chns = [64, 128, 256, 512, 512] # vg16 features
|
||||
self.net = vgg16(pretrained=True, requires_grad=False) # NOTE: TODO: need in_channels = 4 to use
|
||||
self.net = vgg16(pretrained=True, requires_grad=False)
|
||||
self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
|
||||
self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
|
||||
self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
|
||||
|
|
@ -90,7 +90,6 @@ class LPIPS(nn.Module):
|
|||
return val
|
||||
|
||||
|
||||
# SCH: TODO: this channel shift & scale may need to be changed
|
||||
class ScalingLayer(nn.Module):
|
||||
def __init__(self):
|
||||
super(ScalingLayer, self).__init__()
|
||||
|
|
|
|||
|
|
@ -1,26 +1,9 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
|
||||
# from taming.modules.losses.lpips import LPIPS # need to pip install https://github.com/CompVis/taming-transformers
|
||||
# from taming.modules.discriminator.model import NLayerDiscriminator, weights_init
|
||||
|
||||
"""Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""
|
||||
|
||||
|
||||
## NOTE: not used since we only have 'GN'
|
||||
# def get_norm_layer(norm_type, dtype):
|
||||
# if norm_type == 'LN':
|
||||
# # supply a few args with partial function and pass the rest of the args when this norm_fn is called
|
||||
# norm_fn = functools.partial(nn.LayerNorm, dtype=dtype)
|
||||
# elif norm_type == 'GN': #
|
||||
# norm_fn = functools.partial(nn.GroupNorm, dtype=dtype)
|
||||
# elif norm_type is None:
|
||||
# norm_fn = lambda: (lambda x: x)
|
||||
# else:
|
||||
# raise NotImplementedError(f'norm_type: {norm_type}')
|
||||
# return norm_fn
|
||||
|
||||
|
||||
class DiagonalGaussianDistribution(object):
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -57,7 +40,7 @@ class DiagonalGaussianDistribution(object):
|
|||
dim=[1, 2, 3, 4],
|
||||
)
|
||||
|
||||
def nll(self, sample, dims=[1, 2, 3, 4]): # TODO: what does this do?
|
||||
def nll(self, sample, dims=[1, 2, 3, 4]):
|
||||
if self.deterministic:
|
||||
return torch.Tensor([0.0])
|
||||
logtwopi = np.log(2.0 * np.pi)
|
||||
|
|
|
|||
5
requirements/requirements-vae.txt
Normal file
5
requirements/requirements-vae.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
beartype==0.18.5
|
||||
einops==0.8.0
|
||||
einops-exts==0.0.4
|
||||
opencv-python==4.9.0.80
|
||||
pillow==10.3.0
|
||||
1
setup.py
1
setup.py
|
|
@ -79,6 +79,7 @@ setup(
|
|||
extras_require={
|
||||
"data": fetch_requirements("requirements/requirements-data.txt"),
|
||||
"eval": fetch_requirements("requirements/requirements-eval.txt"),
|
||||
"vae": fetch_requirements("requirements/requirements-vae.txt"),
|
||||
"full": fetch_requirements(
|
||||
[
|
||||
"requirements/requirements-data.txt",
|
||||
|
|
|
|||
Loading…
Reference in a new issue