From 05927230222852aa89f297ab8b5fcd5d16584c64 Mon Sep 17 00:00:00 2001 From: zhengzangw Date: Wed, 15 May 2024 13:05:53 +0000 Subject: [PATCH] [feat] update bs search and loss eval --- configs/opensora-v1-2/misc/bs.py | 55 +++++++++++++++++-------- configs/opensora-v1-2/misc/eval_loss.py | 18 ++++---- eval/README.md | 10 +++++ opensora/utils/config_utils.py | 2 +- scripts/misc/eval_loss.py | 5 ++- scripts/misc/search_bs.py | 7 +++- 6 files changed, 67 insertions(+), 30 deletions(-) diff --git a/configs/opensora-v1-2/misc/bs.py b/configs/opensora-v1-2/misc/bs.py index e6304b9..c1a60cd 100644 --- a/configs/opensora-v1-2/misc/bs.py +++ b/configs/opensora-v1-2/misc/bs.py @@ -10,25 +10,46 @@ grad_checkpoint = True base = ("512", "408") base_step_time = 12 bucket_config = { - # "144p": {1: (100, 50), 51: (30, 20), 102: (20, 10), 204: (8, 4), 408: (4, 4)}, - # # --- - # "240p": {1: (100, 20), 51: (24, 5), 102: (12, 4), 204: (4, 2), 408: (2, 1)}, - # --- - "512": { - # 1: (141, 0), - 51: (9, 4), - 102: (6, 2), - 204: (2, 1), - # 408: (1, 0), + "144p": { + 1: (100, 300), + 51: (30, 100), + 102: (20, 100), + 204: (8, 20), + 408: (4, 10), }, # --- - # "480p": {1: (40, 10), 51: (6, 2), 102: (3, 2), 204: (1, 1)}, - # # --- - # "1024": {1: (20, 10), 51: (2, 1), 102: (1, 1)}, - # # --- - # "1080p": {1: (10, 5)}, - # # --- - # "2048": {1: (5, 2)}, + "240p": { + 1: (100, 100), + 51: (24, 10), + 102: (12, 10), + 204: (4, 8), + 408: (2, 8), + }, + # --- + # "512": { + # 1: (141, 0), + # 51: (8, 0), + # 102: (4, 0), + # 204: (2, 0), + # 408: (1, 0), + # }, + # --- + "480p": { + 1: (50, 50), + 51: (6, 6), + 102: (3, 3), + 204: (1, 2), + }, + # --- + "1024": { + 1: (20, 20), + 51: (2, 2), + 102: (1, 1), + }, + # --- + "1080p": {1: (10, 10)}, + # --- + "2048": {1: (5, 5)}, } # Acceleration settings diff --git a/configs/opensora-v1-2/misc/eval_loss.py b/configs/opensora-v1-2/misc/eval_loss.py index 62d576c..f052ad3 100644 --- a/configs/opensora-v1-2/misc/eval_loss.py +++ b/configs/opensora-v1-2/misc/eval_loss.py @@ -9,21 +9,20 @@ dataset = dict( transform_name="resize_crop", ) -# just occupy the space.... actually in evaluation we will create dataset for different resolutions -bucket_config = { # 20s/it - "144p": {1: (1.0, 100), 51: (1.0, 30), 102: ((1.0, 0.33), 20), 204: ((1.0, 0.1), 8), 408: ((1.0, 0.1), 4)}, +bucket_config = { + "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)}, # --- - "240p": {1: (0.3, 100), 51: (0.4, 24), 102: ((0.4, 0.33), 12), 204: ((0.4, 0.1), 4), 408: ((0.4, 0.1), 2)}, + "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)}, # --- - "360p": {1: (0.2, 60), 51: (0.15, 12), 102: ((0.15, 0.33), 6), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)}, + "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)}, # --- - "480p": {1: (0.1, 40), 51: (0.3, 6), 102: (0.3, 3), 204: (0.3, 1), 408: (0.0, None)}, + "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)}, # --- - "720p": {1: (0.05, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)}, + "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)}, # --- - "1080p": {1: (0.1, 10)}, + "1080p": {1: (None, 10)}, # --- - "2048": {1: (0.1, 5)}, + "2048": {1: (None, 5)}, } # Model settings @@ -39,6 +38,7 @@ vae = dict( from_pretrained="pretrained_models/vae-pipeline", micro_frame_size=17, micro_batch_size=4, + local_files_only=True, ) text_encoder = dict( type="t5", diff --git a/eval/README.md b/eval/README.md index df1fe7c..16ac166 100644 --- a/eval/README.md +++ b/eval/README.md @@ -13,6 +13,16 @@ bash eval/sample.sh /path/to/ckpt -2a bash eval/launch.sh /path/to/ckpt ``` +## Rectified Flow Loss + +```bash +CUDA_VISIBLE_DEVICES=2 torchrun --standalone --nproc_per_node 1 scripts/misc/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /mnt/nfs-207/sora_data/meta/img_1k.csv --ckpt-path /home/lishenggui/projects/sora/Open-Sora-dev/outputs/207-STDiT3-XL-2/epoch0-global_step9000/ + +CUDA_VISIBLE_DEVICES=3 torchrun --standalone --nproc_per_node 1 scripts/misc/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /mnt/nfs-207/sora_data/meta/vid_100.csv --ckpt-path /home/lishenggui/projects/sora/Open-Sora-dev/outputs/207-STDiT3-XL-2/epoch0-global_step9000/ + +CUDA_VISIBLE_DEVICES=3 torchrun --standalone --nproc_per_node 1 scripts/misc/eval_loss.py configs/opensora-v1-2/misc/eval_loss.py --data-path /mnt/nfs-207/sora_data/meta/vid_100.csv --ckpt-path /home/lishenggui/projects/sora/Open-Sora-dev/outputs/207-STDiT3-XL-2/epoch0-global_step9000/ --resolution 720p +``` + ## VBench [VBench](https://github.com/Vchitect/VBench) is a benchmark for short text to video generation. We provide a script for easily generating samples required by VBench. diff --git a/opensora/utils/config_utils.py b/opensora/utils/config_utils.py index cc9e838..0fd558f 100644 --- a/opensora/utils/config_utils.py +++ b/opensora/utils/config_utils.py @@ -26,6 +26,7 @@ def parse_args(training=False): parser.add_argument("--outputs", default=None, type=str, help="the dir to save model weights") parser.add_argument("--flash-attn", default=None, type=str2bool, help="enable flash attention") parser.add_argument("--layernorm-kernel", default=None, type=str2bool, help="enable layernorm kernel") + parser.add_argument("--resolution", default=None, type=str, help="multi resolution") # ====================================================== # Inference @@ -50,7 +51,6 @@ def parse_args(training=False): parser.add_argument("--fps", default=None, type=int, help="fps") parser.add_argument("--image-size", default=None, type=int, nargs=2, help="image size") parser.add_argument("--frame-interval", default=None, type=int, help="frame interval") - parser.add_argument("--resolution", default=None, type=str, help="multi resolution") parser.add_argument("--aspect-ratio", default=None, type=float, help="aspect ratio") # hyperparameters diff --git a/scripts/misc/eval_loss.py b/scripts/misc/eval_loss.py index b839713..4df070c 100644 --- a/scripts/misc/eval_loss.py +++ b/scripts/misc/eval_loss.py @@ -84,7 +84,10 @@ def main(): # ====================================================== # start evaluation, prepare a dataset everytime in the loop bucket_config = cfg.bucket_config + if cfg.get("resolution", None) is not None: + bucket_config = {cfg.resolution: bucket_config[cfg.resolution]} assert bucket_config is not None, "bucket_config is required for evaluation" + logger.info("Evaluating bucket_config: %s", bucket_config) def build_dataset(resolution, num_frames, batch_size): bucket_config = {resolution: {num_frames: (1.0, batch_size)}} @@ -118,7 +121,7 @@ def main(): continue evaluation_t_losses = [] - for t in torch.linspace(0, scheduler.num_timesteps, cfg.get("num_eval_timesteps", 10)): + for t in torch.linspace(0, scheduler.num_timesteps, cfg.get("num_eval_timesteps", 10) + 2)[1:-1]: loss_t = 0.0 num_samples = 0 dataloader_iter = iter(dataloader) diff --git a/scripts/misc/search_bs.py b/scripts/misc/search_bs.py index 37b588e..3926422 100644 --- a/scripts/misc/search_bs.py +++ b/scripts/misc/search_bs.py @@ -282,10 +282,13 @@ def main(): return target_batch_size, target_step_time # == build bucket == - output_bucket_cfg = deepcopy(cfg.bucket_config) + bucket_config = cfg.bucket_config + output_bucket_cfg = deepcopy(bucket_config) + if cfg.get("resolution", None) is not None: + bucket_config = {cfg.resolution: bucket_config[cfg.resolution]} buckets = { (resolution, num_frames): (max(guess_bs - variance, 1), guess_bs + variance) - for resolution, t_bucket in cfg.bucket_config.items() + for resolution, t_bucket in bucket_config.items() for num_frames, (guess_bs, variance) in t_bucket.items() }