From b5414b36b844511a45855ed511896c3fef2f2343 Mon Sep 17 00:00:00 2001 From: "Zheng Zangwei (Alex Zheng)" Date: Tue, 2 Apr 2024 14:51:21 +0800 Subject: [PATCH] Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> --- README.md | 32 +++++---- tools/datasets/convert.py | 21 +++--- tools/datasets/csvutil.py | 24 +++---- tools/scoring/README.md | 96 +++++++++++++++---------- tools/scoring/aesthetic/README.md | 27 ------- tools/scoring/optical_flow/inference.py | 52 ++++++-------- 6 files changed, 125 insertions(+), 127 deletions(-) delete mode 100644 tools/scoring/aesthetic/README.md diff --git a/README.md b/README.md index 85635c3..977d019 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,8 @@ the config files. | 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) | | 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) | +Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ. + Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of parameters is 724M. More information about training can be found in our **[report](/docs/report_v1.md)**. More about the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality. @@ -203,33 +205,35 @@ Below is an example workflow to process data. However, we recommend you to read ```bash # Suppose files under ~/dataset/ # 1. 
Convert dataset to CSV
-# output: ~/dataset.csv
-python -m tools.dataset.convert video ~/dataset
+python -m tools.datasets.convert video ~/dataset --output ~/dataset/meta.csv
 # filter out broken videos (broken ones num_frames=0)
-python -m tools.dataset.csvutil ~/dataset.csv --video-info --fmin 2 --output ~/dataset.csv
+python -m tools.datasets.csvutil ~/dataset/meta.csv --info --fmin 1 --output ~/dataset/meta.csv
 
 # 2. Filter dataset by aesthetic scores
-# output: ~/dataset_aesthetic.csv
-python -m tools.aesthetic.inference ~/dataset.csv
+# output: ~/dataset/meta_aes.csv
+python -m tools.aesthetic.inference ~/dataset/meta.csv
 # sort and examine videos by aesthetic scores
-python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --sort-descending aesthetic_score
+# output: ~/dataset/meta_aes_sort.csv
+python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
 # bad videos (aesthetic_score < 5)
-tail ~/dataset_aesthetic.csv
+tail ~/dataset/meta_aes_sort.csv
 # filter videos by aesthetic scores
-# output: ~/dataset_aesthetic_aesmin_5.csv
-python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --aesmin 5
+# output: ~/dataset/meta_aes_aesmin5.csv
+python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --aesmin 5
 
 # 3. Caption dataset
-# output: ~/dataset_aesthetic_aesmin_5_caption.csv
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset_aesthetic_aesmin_5.csv --tp-size 2 --dp-size 4 --bs 16
+# output: ~/dataset/meta_aes_aesmin5_caption_part*.csv
+torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset/meta_aes_aesmin5.csv --tp-size 2 --dp-size 4 --bs 16
+# merge generated results
+python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
 # remove empty captions and process captions (may need to re-caption lost ones)
-python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --remove-caption-prefix --remove-empty-caption
+python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
 
 # 4. Sanity check & prepare for training
 # sanity check
-python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --ext --video-info --output ~/dataset_ready.csv
+python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --video-info --output ~/dataset/meta_ready.csv
 # filter out videos less than 48 frames
-# output: ~/dataset_ready_fmin_48.csv
-python -m tools.datasets.csvutil ~/dataset_ready.csv --fmin 48
+# output: ~/dataset/meta_ready_fmin48.csv
+python -m tools.datasets.csvutil ~/dataset/meta_ready.csv --fmin 48
 ```
diff --git a/tools/datasets/convert.py b/tools/datasets/convert.py
index f832eb7..ad80461 100644
--- a/tools/datasets/convert.py
+++ b/tools/datasets/convert.py
@@ -65,20 +65,24 @@ def process_vidprom(root, info):
     print(f"Saved {len(df)} samples to vidprom.csv.")
 
 
-def process_general_images(root):
+def process_general_images(root, output):
     root = os.path.expanduser(root)
    image_lists = get_filelist(root, IMG_EXTENSIONS)
     df = pd.DataFrame(dict(path=image_lists))
-    df.to_csv("images.csv", index=False)
-    print(f"Saved {len(df)} samples to images.csv.")
+    if output is None:
+        output = "images.csv"
+    
df.to_csv(output, index=False) + print(f"Saved {len(df)} samples to {output}.") -def process_general_videos(root): +def process_general_videos(root, output): root = os.path.expanduser(root) video_lists = get_filelist(root, VID_EXTENSIONS) df = pd.DataFrame(dict(path=video_lists)) - df.to_csv("videos.csv", index=False) - print(f"Saved {len(df)} samples to videos.csv.") + if output is None: + output = "videos.csv" + df.to_csv(output, index=False) + print(f"Saved {len(df)} samples to {output}.") if __name__ == "__main__": @@ -87,6 +91,7 @@ if __name__ == "__main__": parser.add_argument("root", type=str) parser.add_argument("--split", type=str, default="train") parser.add_argument("--info", type=str, default=None) + parser.add_argument("--output", type=str, default=None) args = parser.parse_args() if args.dataset == "imagenet": @@ -96,8 +101,8 @@ if __name__ == "__main__": elif args.dataset == "vidprom": process_vidprom(args.root, args.info) elif args.dataset == "image": - process_general_images(args.root) + process_general_images(args.root, args.output) elif args.dataset == "video": - process_general_videos(args.root) + process_general_videos(args.root, args.output) else: raise ValueError("Invalid dataset") diff --git a/tools/datasets/csvutil.py b/tools/datasets/csvutil.py index b1ac152..ba68d42 100644 --- a/tools/datasets/csvutil.py +++ b/tools/datasets/csvutil.py @@ -124,7 +124,7 @@ def parse_args(): parser.add_argument("--remove-caption-prefix", action="store_true") parser.add_argument("--unescape", action="store_true") # num_frames processing - parser.add_argument("--video-info", action="store_true") + parser.add_argument("--info", action="store_true") # num_frames filtering parser.add_argument("--fmin", type=int, default=None) parser.add_argument("--fmax", type=int, default=None) @@ -163,19 +163,19 @@ def get_output_path(args, input_name): if args.unescape: name += "_unescape" # num_frames processing - if args.video_info: - name += "_vinfo" + if args.info: + name 
+= "_info" # num_frames filtering if args.fmin is not None: - name += f"_fmin_{args.fmin}" + name += f"_fmin{args.fmin}" if args.fmax is not None: - name += f"_fmax_{args.fmax}" + name += f"_fmax{args.fmax}" # aesthetic filtering if args.aesmin is not None: - name += f"_aesmin_{args.aesmin}" + name += f"_aesmin{args.aesmin}" # clip score filtering if args.matchmin is not None: - name += f"_matchmin_{args.matchmin}" + name += f"_matchmin{args.matchmin}" # sort if args.sort_descending is not None: assert args.sort_ascending is None @@ -254,7 +254,7 @@ def main(args): if args.unescape: assert "text" in data.columns data["text"] = apply(data["text"], html.unescape) - if args.video_info: + if args.info: info = apply(data["path"], get_video_info) data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info) @@ -266,11 +266,11 @@ def main(args): assert "num_frames" in data.columns data = data[data["num_frames"] <= args.fmax] if args.aesmin is not None: - assert "aesthetic_score" in data.columns - data = data[data["aesthetic_score"] >= args.aesmin] + assert "aes" in data.columns + data = data[data["aes"] >= args.aesmin] if args.matchmin is not None: - assert "clip_score" in data.columns - data = data[data["clip_score"] >= args.matchmin] + assert "match" in data.columns + data = data[data["match"] >= args.matchmin] print(f"Filtered number of samples: {len(data)}.") # sort diff --git a/tools/scoring/README.md b/tools/scoring/README.md index 4852fec..d2e7326 100644 --- a/tools/scoring/README.md +++ b/tools/scoring/README.md @@ -1,37 +1,59 @@ -# Data Scoring and Filtering -Important!!! All scoring jobs require these columns in meta files: -- `path`: absolute path to a sample - -## Aesthetic Score -First prepare the environment and pretrained models. 
-```bash -# install clip -pip install git+https://github.com/openai/CLIP.git -pip install decord - -# get pretrained model -wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth -``` - -Then run: -```bash -python -m tools.scoring.aesthetic.inference /path/to/meta.csv -``` -The output should be `/path/to/meta_aes.csv` with column `aes`. Aesthetic scores range from 1 to 10, with 10 being the best quality. - -## Optical Flow Score -First get the pretrained model. -```bash -wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch -``` - -Then run: -``` -python tools/scoring/optical_flow/inference.py /path/to/meta.csv -``` -The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement. - -## Matching Score -Require column `text` in meta files, which is the caption of the sample. - -TODO. +# Data Scoring and Filtering + +- [Data Scoring and Filtering](#data-scoring-and-filtering) + - [Aesthetic Scoring](#aesthetic-scoring) + - [Requirement](#requirement) + - [Usage](#usage) + - [Optical Flow Score](#optical-flow-score) + - [Matching Score](#matching-score) + + +## Aesthetic Scoring + +To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs. + +The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher. + +For videos, we extract the first, last, and the middle frames for evaluation. 
The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process. + +### Requirement + +```bash +# install clip +pip install git+https://github.com/openai/CLIP.git +pip install decord + +# get pretrained model +wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth +``` + +### Usage + +With `meta.csv` containing the paths to the videos, run the following command: + +```bash +# output: DATA_aes.csv +python -m tools.aesthetic.inference meta.csv +``` + +## Optical Flow Score + +First get the pretrained model. + +```bash +wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch +``` + +With `meta.csv` containing the paths to the videos, run the following command: + +```bash +python -m tools.scoring.optical_flow.inference /path/to/meta.csv +``` + +The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement. + +## Matching Score + +Require column `text` in meta files, which is the caption of the sample. + +TODO. diff --git a/tools/scoring/aesthetic/README.md b/tools/scoring/aesthetic/README.md deleted file mode 100644 index 9cdf2bc..0000000 --- a/tools/scoring/aesthetic/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# Aesthetic Scoring - -To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs. - -The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. 
Good text-to-image models can achieve a score of 7.0 or higher. - -For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process. - -## Requirement - -```bash -# install clip -pip install git+https://github.com/openai/CLIP.git -pip install decord - -# get pretrained model -wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth -``` - -## Usage - -With `DATA.csv` containing the paths to the videos, run the following command: - -```bash -# output: DATA_aes.csv -python -m tools.aesthetic.inference DATA.csv -``` diff --git a/tools/scoring/optical_flow/inference.py b/tools/scoring/optical_flow/inference.py index 62f887c..d48ecf7 100644 --- a/tools/scoring/optical_flow/inference.py +++ b/tools/scoring/optical_flow/inference.py @@ -1,22 +1,17 @@ -import os -# os.chdir('../..') -print(f'Current working directory: {os.getcwd()}') - import argparse +import os + import av import numpy as np import pandas as pd +import torch +import torch.nn.functional as F from einops import rearrange from tqdm import tqdm -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.utils.data import Dataset -from torchvision.transforms.functional import pil_to_tensor +from .unimatch import UniMatch -import decord -from unimatch import UniMatch +import decord # isort: skip def extract_frames_av(video_path, frame_inds=[0, 10, 20, 30]): @@ -57,11 +52,11 @@ class VideoTextDataset(torch.utils.data.Dataset): # transform images = torch.from_numpy(images).float() - images = rearrange(images, 'N H W C -> N C H W') + images = rearrange(images, "N H W C -> N C H W") H, W = images.shape[-2:] if H > W: - images = rearrange(images, 'N C H W -> N C W H') - images = F.interpolate(images, size=(320, 576), mode='bilinear', 
align_corners=True) + images = rearrange(images, "N C H W -> N C W H") + images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True) return images @@ -78,7 +73,7 @@ def main(): meta_path = args.meta_path wo_ext, ext = os.path.splitext(meta_path) - out_path = f'{wo_ext}_flow{ext}' + out_path = f"{wo_ext}_flow{ext}" # build model device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") @@ -90,12 +85,10 @@ def main(): ffn_dim_expansion=4, num_transformer_layers=6, reg_refine=True, - task='flow', + task="flow", ) - ckpt = torch.load( - './pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth' - ) - model.load_state_dict(ckpt['model']) + ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth") + model.load_state_dict(ckpt["model"]) model = model.to(device) # model = torch.nn.DataParallel(model) @@ -115,30 +108,31 @@ def main(): images = images.to(device) B = images.shape[0] - batch_0 = rearrange(images[:, :-1], 'B N C H W -> (B N) C H W').contiguous() - batch_1 = rearrange(images[:, 1:], 'B N C H W -> (B N) C H W').contiguous() + batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous() + batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous() with torch.no_grad(): res = model( - batch_0, batch_1, - attn_type='swin', + batch_0, + batch_1, + attn_type="swin", attn_splits_list=[2, 8], corr_radius_list=[-1, 4], prop_radius_list=[-1, 1], num_reg_refine=6, - task='flow', + task="flow", pred_bidir_flow=False, ) - flow_maps = res['flow_preds'][-1].cpu() # [B * (N-1), 2, H, W] - flow_maps = rearrange(flow_maps, '(B N) C H W -> B N H W C', B=B) + flow_maps = res["flow_preds"][-1].cpu() # [B * (N-1), 2, H, W] + flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B) flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4]) flow_scores_np = flow_scores.numpy() - dataset.meta.loc[index: index + B - 1, 
"flow"] = flow_scores_np + dataset.meta.loc[index : index + B - 1, "flow"] = flow_scores_np index += B dataset.meta.to_csv(out_path, index=False) - print(f"New meta with optical flow scores saved to \'{out_path}\'.") + print(f"New meta with optical flow scores saved to '{out_path}'.") if __name__ == "__main__":