From b5414b36b844511a45855ed511896c3fef2f2343 Mon Sep 17 00:00:00 2001 From: "Zheng Zangwei (Alex Zheng)" Date: Tue, 2 Apr 2024 14:51:21 +0800 Subject: [PATCH] Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> --- README.md | 32 +++++---- tools/datasets/convert.py | 21 +++--- tools/datasets/csvutil.py | 24 +++---- tools/scoring/README.md | 96 +++++++++++++++---------- tools/scoring/aesthetic/README.md | 27 ------- tools/scoring/optical_flow/inference.py | 52 ++++++-------- 6 files changed, 125 insertions(+), 127 deletions(-) delete mode 100644 tools/scoring/aesthetic/README.md diff --git a/README.md b/README.md index 85635c3..977d019 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,8 @@ the config files. | 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) | | 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) | +Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ. + Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of parameters is 724M. More information about training can be found in our **[report](/docs/report_v1.md)**. More about the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality. @@ -203,33 +205,35 @@ Below is an example workflow to process data. However, we recommend you to read ```bash # Suppose files under ~/dataset/ # 1. 
Convert dataset to CSV
-# output: ~/dataset.csv
-python -m tools.dataset.convert video ~/dataset
+python -m tools.datasets.convert video ~/dataset --output ~/dataset/meta.csv
 # filter out broken videos (broken ones num_frames=0)
-python -m tools.dataset.csvutil ~/dataset.csv --video-info --fmin 2 --output ~/dataset.csv
+python -m tools.datasets.csvutil ~/dataset/meta.csv --info --fmin 1 --output ~/dataset/meta.csv
 
 # 2. Filter dataset by aesthetic scores
-# output: ~/dataset_aesthetic.csv
-python -m tools.aesthetic.inference ~/dataset.csv
+# output: ~/dataset/meta_aes.csv
+python -m tools.aesthetic.inference ~/dataset/meta.csv
 # sort and examine videos by aesthetic scores
-python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --sort-descending aesthetic_score
+# output: ~/dataset/meta_aes_sort.csv
+python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
 # bad videos (aesthetic_score < 5)
-tail ~/dataset_aesthetic.csv
+tail ~/dataset/meta_aes_sort.csv
 # filter videos by aesthetic scores
-# output: ~/dataset_aesthetic_aesmin_5.csv
-python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --aesmin 5
+# output: ~/dataset/meta_aes_aesmin5.csv
+python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --aesmin 5
 
 # 3. Caption dataset
-# output: ~/dataset_aesthetic_aesmin_5_caption.csv
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset_aesthetic_aesmin_5.csv --tp-size 2 --dp-size 4 --bs 16
+# output: ~/dataset/meta_aes_aesmin5_caption_part*.csv
+torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset/meta_aes_aesmin5.csv --tp-size 2 --dp-size 4 --bs 16
+# merge generated results
+python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
 # remove empty captions and process captions (may need to re-caption lost ones)
-python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --remove-caption-prefix --remove-empty-caption
+python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
 
 # 4. Sanity check & prepare for training
 # sanity check
-python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --ext --video-info --output ~/dataset_ready.csv
+python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --video-info --output ~/dataset/meta_ready.csv
 # filter out videos less than 48 frames
-# output: ~/dataset_ready_fmin_48.csv
-python -m tools.datasets.csvutil ~/dataset_ready.csv --fmin 48
+# output: ~/dataset/meta_ready_fmin48.csv
+python -m tools.datasets.csvutil ~/dataset/meta_ready.csv --fmin 48
 ```
diff --git a/tools/datasets/convert.py b/tools/datasets/convert.py
index f832eb7..ad80461 100644
--- a/tools/datasets/convert.py
+++ b/tools/datasets/convert.py
@@ -65,20 +65,24 @@ def process_vidprom(root, info):
     print(f"Saved {len(df)} samples to vidprom.csv.")
 
 
-def process_general_images(root):
+def process_general_images(root, output):
     root = os.path.expanduser(root)
    image_lists = get_filelist(root, IMG_EXTENSIONS)
     df = pd.DataFrame(dict(path=image_lists))
-    df.to_csv("images.csv", index=False)
-    print(f"Saved {len(df)} samples to images.csv.")
+    if output is None:
+        output = "images.csv"
+    
df.to_csv(output, index=False) + print(f"Saved {len(df)} samples to {output}.") -def process_general_videos(root): +def process_general_videos(root, output): root = os.path.expanduser(root) video_lists = get_filelist(root, VID_EXTENSIONS) df = pd.DataFrame(dict(path=video_lists)) - df.to_csv("videos.csv", index=False) - print(f"Saved {len(df)} samples to videos.csv.") + if output is None: + output = "videos.csv" + df.to_csv(output, index=False) + print(f"Saved {len(df)} samples to {output}.") if __name__ == "__main__": @@ -87,6 +91,7 @@ if __name__ == "__main__": parser.add_argument("root", type=str) parser.add_argument("--split", type=str, default="train") parser.add_argument("--info", type=str, default=None) + parser.add_argument("--output", type=str, default=None) args = parser.parse_args() if args.dataset == "imagenet": @@ -96,8 +101,8 @@ if __name__ == "__main__": elif args.dataset == "vidprom": process_vidprom(args.root, args.info) elif args.dataset == "image": - process_general_images(args.root) + process_general_images(args.root, args.output) elif args.dataset == "video": - process_general_videos(args.root) + process_general_videos(args.root, args.output) else: raise ValueError("Invalid dataset") diff --git a/tools/datasets/csvutil.py b/tools/datasets/csvutil.py index b1ac152..ba68d42 100644 --- a/tools/datasets/csvutil.py +++ b/tools/datasets/csvutil.py @@ -124,7 +124,7 @@ def parse_args(): parser.add_argument("--remove-caption-prefix", action="store_true") parser.add_argument("--unescape", action="store_true") # num_frames processing - parser.add_argument("--video-info", action="store_true") + parser.add_argument("--info", action="store_true") # num_frames filtering parser.add_argument("--fmin", type=int, default=None) parser.add_argument("--fmax", type=int, default=None) @@ -163,19 +163,19 @@ def get_output_path(args, input_name): if args.unescape: name += "_unescape" # num_frames processing - if args.video_info: - name += "_vinfo" + if args.info: + name 
+= "_info" # num_frames filtering if args.fmin is not None: - name += f"_fmin_{args.fmin}" + name += f"_fmin{args.fmin}" if args.fmax is not None: - name += f"_fmax_{args.fmax}" + name += f"_fmax{args.fmax}" # aesthetic filtering if args.aesmin is not None: - name += f"_aesmin_{args.aesmin}" + name += f"_aesmin{args.aesmin}" # clip score filtering if args.matchmin is not None: - name += f"_matchmin_{args.matchmin}" + name += f"_matchmin{args.matchmin}" # sort if args.sort_descending is not None: assert args.sort_ascending is None @@ -254,7 +254,7 @@ def main(args): if args.unescape: assert "text" in data.columns data["text"] = apply(data["text"], html.unescape) - if args.video_info: + if args.info: info = apply(data["path"], get_video_info) data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info) @@ -266,11 +266,11 @@ def main(args): assert "num_frames" in data.columns data = data[data["num_frames"] <= args.fmax] if args.aesmin is not None: - assert "aesthetic_score" in data.columns - data = data[data["aesthetic_score"] >= args.aesmin] + assert "aes" in data.columns + data = data[data["aes"] >= args.aesmin] if args.matchmin is not None: - assert "clip_score" in data.columns - data = data[data["clip_score"] >= args.matchmin] + assert "match" in data.columns + data = data[data["match"] >= args.matchmin] print(f"Filtered number of samples: {len(data)}.") # sort diff --git a/tools/scoring/README.md b/tools/scoring/README.md index 4852fec..d2e7326 100644 --- a/tools/scoring/README.md +++ b/tools/scoring/README.md @@ -1,37 +1,59 @@ -# Data Scoring and Filtering -Important!!! All scoring jobs require these columns in meta files: -- `path`: absolute path to a sample - -## Aesthetic Score -First prepare the environment and pretrained models. 
-```bash -# install clip -pip install git+https://github.com/openai/CLIP.git -pip install decord - -# get pretrained model -wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth -``` - -Then run: -```bash -python -m tools.scoring.aesthetic.inference /path/to/meta.csv -``` -The output should be `/path/to/meta_aes.csv` with column `aes`. Aesthetic scores range from 1 to 10, with 10 being the best quality. - -## Optical Flow Score -First get the pretrained model. -```bash -wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch -``` - -Then run: -``` -python tools/scoring/optical_flow/inference.py /path/to/meta.csv -``` -The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement. - -## Matching Score -Require column `text` in meta files, which is the caption of the sample. - -TODO. +# Data Scoring and Filtering + +- [Data Scoring and Filtering](#data-scoring-and-filtering) + - [Aesthetic Scoring](#aesthetic-scoring) + - [Requirement](#requirement) + - [Usage](#usage) + - [Optical Flow Score](#optical-flow-score) + - [Matching Score](#matching-score) + + +## Aesthetic Scoring + +To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs. + +The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher. + +For videos, we extract the first, last, and the middle frames for evaluation. 
The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process. + +### Requirement + +```bash +# install clip +pip install git+https://github.com/openai/CLIP.git +pip install decord + +# get pretrained model +wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth +``` + +### Usage + +With `meta.csv` containing the paths to the videos, run the following command: + +```bash +# output: DATA_aes.csv +python -m tools.aesthetic.inference meta.csv +``` + +## Optical Flow Score + +First get the pretrained model. + +```bash +wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch +``` + +With `meta.csv` containing the paths to the videos, run the following command: + +```bash +python -m tools.scoring.optical_flow.inference /path/to/meta.csv +``` + +The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement. + +## Matching Score + +Require column `text` in meta files, which is the caption of the sample. + +TODO. diff --git a/tools/scoring/aesthetic/README.md b/tools/scoring/aesthetic/README.md deleted file mode 100644 index 9cdf2bc..0000000 --- a/tools/scoring/aesthetic/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Aesthetic Scoring - -To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs. - -The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. 
Good text-to-image models can achieve a score of 7.0 or higher. - -For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process. - -## Requirement - -```bash -# install clip -pip install git+https://github.com/openai/CLIP.git -pip install decord - -# get pretrained model -wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth -``` - -## Usage - -With `DATA.csv` containing the paths to the videos, run the following command: - -```bash -# output: DATA_aes.csv -python -m tools.aesthetic.inference DATA.csv -``` diff --git a/tools/scoring/optical_flow/inference.py b/tools/scoring/optical_flow/inference.py index 62f887c..d48ecf7 100644 --- a/tools/scoring/optical_flow/inference.py +++ b/tools/scoring/optical_flow/inference.py @@ -1,22 +1,17 @@ -import os -# os.chdir('../..') -print(f'Current working directory: {os.getcwd()}') - import argparse +import os + import av import numpy as np import pandas as pd +import torch +import torch.nn.functional as F from einops import rearrange from tqdm import tqdm -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.utils.data import Dataset -from torchvision.transforms.functional import pil_to_tensor +from .unimatch import UniMatch -import decord -from unimatch import UniMatch +import decord # isort: skip def extract_frames_av(video_path, frame_inds=[0, 10, 20, 30]): @@ -57,11 +52,11 @@ class VideoTextDataset(torch.utils.data.Dataset): # transform images = torch.from_numpy(images).float() - images = rearrange(images, 'N H W C -> N C H W') + images = rearrange(images, "N H W C -> N C H W") H, W = images.shape[-2:] if H > W: - images = rearrange(images, 'N C H W -> N C W H') - images = F.interpolate(images, size=(320, 576), mode='bilinear', 
align_corners=True) + images = rearrange(images, "N C H W -> N C W H") + images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True) return images @@ -78,7 +73,7 @@ def main(): meta_path = args.meta_path wo_ext, ext = os.path.splitext(meta_path) - out_path = f'{wo_ext}_flow{ext}' + out_path = f"{wo_ext}_flow{ext}" # build model device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") @@ -90,12 +85,10 @@ def main(): ffn_dim_expansion=4, num_transformer_layers=6, reg_refine=True, - task='flow', + task="flow", ) - ckpt = torch.load( - './pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth' - ) - model.load_state_dict(ckpt['model']) + ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth") + model.load_state_dict(ckpt["model"]) model = model.to(device) # model = torch.nn.DataParallel(model) @@ -115,30 +108,31 @@ def main(): images = images.to(device) B = images.shape[0] - batch_0 = rearrange(images[:, :-1], 'B N C H W -> (B N) C H W').contiguous() - batch_1 = rearrange(images[:, 1:], 'B N C H W -> (B N) C H W').contiguous() + batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous() + batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous() with torch.no_grad(): res = model( - batch_0, batch_1, - attn_type='swin', + batch_0, + batch_1, + attn_type="swin", attn_splits_list=[2, 8], corr_radius_list=[-1, 4], prop_radius_list=[-1, 1], num_reg_refine=6, - task='flow', + task="flow", pred_bidir_flow=False, ) - flow_maps = res['flow_preds'][-1].cpu() # [B * (N-1), 2, H, W] - flow_maps = rearrange(flow_maps, '(B N) C H W -> B N H W C', B=B) + flow_maps = res["flow_preds"][-1].cpu() # [B * (N-1), 2, H, W] + flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B) flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4]) flow_scores_np = flow_scores.numpy() - dataset.meta.loc[index: index + B - 1, 
"flow"] = flow_scores_np + dataset.meta.loc[index : index + B - 1, "flow"] = flow_scores_np index += B dataset.meta.to_csv(out_path, index=False) - print(f"New meta with optical flow scores saved to \'{out_path}\'.") + print(f"New meta with optical flow scores saved to '{out_path}'.") if __name__ == "__main__":