Dev/datapipe (#21)

* fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com>
2026-04-10 21:01:26 +02:00 · 2024-04-02 14:51:21 +08:00 · 2024-04-02 14:51:21 +08:00 · b5414b36b8
commit b5414b36b8
parent 7612d22fc6
6 changed files with 125 additions and 127 deletions
--- a/README.md
+++ b/README.md
@ -149,6 +149,8 @@ the config files.
 | 16×256×256 | 20K HQ | 24k         | 8×64       | 45              | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) |
 | 16×256×256 | 366K   | 80k         | 8×64       | 117             | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth)    |

+Training order: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ.
+
 Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of
 parameters is 724M. More information about training can be found in our **[report](/docs/report_v1.md)**. More about
 the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality.
@ -203,33 +205,35 @@ Below is an example workflow to process data. However, we recommend you to read
 ```bash
 # Suppose files under ~/dataset/
 # 1. Convert dataset to CSV
-# output: ~/dataset.csv
-python -m tools.dataset.convert video ~/dataset
+python -m tools.datasets.convert video ~/dataset --output ~/dataset/meta.csv
 # filter out broken videos (broken ones num_frames=0)
-python -m tools.dataset.csvutil ~/dataset.csv --video-info --fmin 2 --output ~/dataset.csv
+python -m tools.datasets.csvutil ~/dataset/meta.csv --info --fmin 1 --output ~/dataset/meta.csv

 # 2. Filter dataset by aesthetic scores
-# output: ~/dataset_aesthetic.csv
-python -m tools.aesthetic.inference ~/dataset.csv
+# output: ~/dataset/meta_aes.csv
+python -m tools.scoring.aesthetic.inference ~/dataset/meta.csv
 # sort and examine videos by aesthetic scores
-python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --sort-descending aesthetic_score
+# output: ~/dataset/meta_aes_sort.csv
+python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
 # bad videos (aesthetic_score < 5)
-tail ~/dataset_aesthetic.csv
+tail ~/dataset/meta_aes_sort.csv
 # filter videos by aesthetic scores
-# output: ~/dataset_aesthetic_aesmin_5.csv
-python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --aesmin 5
+# output: ~/dataset/meta_aes_aesmin5.csv
+python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --aesmin 5

 # 3. Caption dataset
-# output: ~/dataset_aesthetic_aesmin_5_caption.csv
-torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset_aesthetic_aesmin_5.csv --tp-size 2 --dp-size 4 --bs 16
+# output: ~/dataset/meta_aes_aesmin5_caption_parti.csv
+torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset/meta_aes_aesmin5.csv --tp-size 2 --dp-size 4 --bs 16
+# merge generated results
+python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
 # remove empty captions and process captions (may need to re-caption lost ones)
-python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --remove-caption-prefix --remove-empty-caption
+python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv

 # 4. Sanity check & prepare for training
 # sanity check
-python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --ext --video-info --output ~/dataset_ready.csv
+python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --info --output ~/dataset/meta_ready.csv
 # filter out videos less than 48 frames
-# output: ~/dataset_ready_fmin_48.csv
+# output: ~/dataset/meta_ready_fmin48.csv
 python -m tools.datasets.csvutil ~/dataset/meta_ready.csv --fmin 48
 ```

--- a/tools/datasets/convert.py
+++ b/tools/datasets/convert.py
@ -65,20 +65,24 @@ def process_vidprom(root, info):
    print(f"Saved {len(df)} samples to vidprom.csv.")


-def process_general_images(root):
+def process_general_images(root, output):
    root = os.path.expanduser(root)
    image_lists = get_filelist(root, IMG_EXTENSIONS)
    df = pd.DataFrame(dict(path=image_lists))
-    df.to_csv("images.csv", index=False)
-    print(f"Saved {len(df)} samples to images.csv.")
+    if output is None:
+        output = "images.csv"
+    df.to_csv(output, index=False)
+    print(f"Saved {len(df)} samples to {output}.")


-def process_general_videos(root):
+def process_general_videos(root, output):
    root = os.path.expanduser(root)
    video_lists = get_filelist(root, VID_EXTENSIONS)
    df = pd.DataFrame(dict(path=video_lists))
-    df.to_csv("videos.csv", index=False)
-    print(f"Saved {len(df)} samples to videos.csv.")
+    if output is None:
+        output = "videos.csv"
+    df.to_csv(output, index=False)
+    print(f"Saved {len(df)} samples to {output}.")


 if __name__ == "__main__":
@ -87,6 +91,7 @@ if __name__ == "__main__":
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    parser.add_argument("--info", type=str, default=None)
+    parser.add_argument("--output", type=str, default=None)
    args = parser.parse_args()

    if args.dataset == "imagenet":
@ -96,8 +101,8 @@ if __name__ == "__main__":
    elif args.dataset == "vidprom":
        process_vidprom(args.root, args.info)
    elif args.dataset == "image":
-        process_general_images(args.root)
+        process_general_images(args.root, args.output)
    elif args.dataset == "video":
-        process_general_videos(args.root)
+        process_general_videos(args.root, args.output)
    else:
        raise ValueError("Invalid dataset")
--- a/tools/datasets/csvutil.py
+++ b/tools/datasets/csvutil.py
@ -124,7 +124,7 @@ def parse_args():
    parser.add_argument("--remove-caption-prefix", action="store_true")
    parser.add_argument("--unescape", action="store_true")
    # num_frames processing
-    parser.add_argument("--video-info", action="store_true")
+    parser.add_argument("--info", action="store_true")
    # num_frames filtering
    parser.add_argument("--fmin", type=int, default=None)
    parser.add_argument("--fmax", type=int, default=None)
@ -163,19 +163,19 @@ def get_output_path(args, input_name):
    if args.unescape:
        name += "_unescape"
    # num_frames processing
-    if args.video_info:
-        name += "_vinfo"
+    if args.info:
+        name += "_info"
    # num_frames filtering
    if args.fmin is not None:
-        name += f"_fmin_{args.fmin}"
+        name += f"_fmin{args.fmin}"
    if args.fmax is not None:
-        name += f"_fmax_{args.fmax}"
+        name += f"_fmax{args.fmax}"
    # aesthetic filtering
    if args.aesmin is not None:
-        name += f"_aesmin_{args.aesmin}"
+        name += f"_aesmin{args.aesmin}"
    # clip score filtering
    if args.matchmin is not None:
-        name += f"_matchmin_{args.matchmin}"
+        name += f"_matchmin{args.matchmin}"
    # sort
    if args.sort_descending is not None:
        assert args.sort_ascending is None
@ -254,7 +254,7 @@ def main(args):
    if args.unescape:
        assert "text" in data.columns
        data["text"] = apply(data["text"], html.unescape)
-    if args.video_info:
+    if args.info:
        info = apply(data["path"], get_video_info)
        data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info)

@ -266,11 +266,11 @@ def main(args):
        assert "num_frames" in data.columns
        data = data[data["num_frames"] <= args.fmax]
    if args.aesmin is not None:
-        assert "aesthetic_score" in data.columns
-        data = data[data["aesthetic_score"] >= args.aesmin]
+        assert "aes" in data.columns
+        data = data[data["aes"] >= args.aesmin]
    if args.matchmin is not None:
-        assert "clip_score" in data.columns
-        data = data[data["clip_score"] >= args.matchmin]
+        assert "match" in data.columns
+        data = data[data["match"] >= args.matchmin]
    print(f"Filtered number of samples: {len(data)}.")

    # sort
--- a/tools/scoring/README.md
+++ b/tools/scoring/README.md
@ -1,37 +1,59 @@
-# Data Scoring and Filtering
-Important!!! All scoring jobs require these columns in meta files:
- `path`: absolute path to a sample
-
-## Aesthetic Score
-First prepare the environment and pretrained models.
-```bash
-# install clip
-pip install git+https://github.com/openai/CLIP.git
-pip install decord
-
-# get pretrained model
-wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
-```
-
-Then run:
-```bash
-python -m tools.scoring.aesthetic.inference /path/to/meta.csv
-```
-The output should be `/path/to/meta_aes.csv` with column `aes`. Aesthetic scores range from 1 to 10, with 10 being the best quality.
-
-## Optical Flow Score
-First get the pretrained model.
-```bash
-wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch
-```
-
-Then run:
-```
-python tools/scoring/optical_flow/inference.py /path/to/meta.csv
-```
-The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement.
-
-## Matching Score
-Require column `text` in meta files, which is the caption of the sample.
-
-TODO.
+# Data Scoring and Filtering
+
+- [Data Scoring and Filtering](#data-scoring-and-filtering)
+  - [Aesthetic Scoring](#aesthetic-scoring)
+    - [Requirement](#requirement)
+    - [Usage](#usage)
+  - [Optical Flow Score](#optical-flow-score)
+  - [Matching Score](#matching-score)
+
+
+## Aesthetic Scoring
+
+To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
+
+The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
+
+For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process.
+
+### Requirement
+
+```bash
+# install clip
+pip install git+https://github.com/openai/CLIP.git
+pip install decord
+
+# get pretrained model
+wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
+```
+
+### Usage
+
+With `meta.csv` containing the paths to the videos, run the following command:
+
+```bash
+# output: meta_aes.csv
+python -m tools.scoring.aesthetic.inference meta.csv
+```
+
+## Optical Flow Score
+
+First get the pretrained model.
+
+```bash
+wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch
+```
+
+With `meta.csv` containing the paths to the videos, run the following command:
+
+```bash
+python -m tools.scoring.optical_flow.inference /path/to/meta.csv
+```
+
+The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement.
+
+## Matching Score
+
+Requires a `text` column in the meta files, which holds the caption of each sample.
+
+TODO.
--- a/tools/scoring/aesthetic/README.md
+++ b/tools/scoring/aesthetic/README.md
@ -1,27 +0,0 @@
-# Aesthetic Scoring
-
-To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
-
-The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
-
-For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process.
-
-## Requirement
-
-```bash
-# install clip
-pip install git+https://github.com/openai/CLIP.git
-pip install decord
-
-# get pretrained model
-wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
-```
-
-## Usage
-
-With `DATA.csv` containing the paths to the videos, run the following command:
-
-```bash
-# output: DATA_aes.csv
-python -m tools.aesthetic.inference DATA.csv
-```
--- a/tools/scoring/optical_flow/inference.py
+++ b/tools/scoring/optical_flow/inference.py
@ -1,22 +1,17 @@
-import os
-# os.chdir('../..')
-print(f'Current working directory: {os.getcwd()}')
-
 import argparse
+import os
+
 import av
 import numpy as np
 import pandas as pd
+import torch
+import torch.nn.functional as F
 from einops import rearrange
 from tqdm import tqdm

-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.utils.data import Dataset
-from torchvision.transforms.functional import pil_to_tensor
+from .unimatch import UniMatch

-import decord
-from unimatch import UniMatch
+import decord  # isort: skip


 def extract_frames_av(video_path, frame_inds=[0, 10, 20, 30]):
@ -57,11 +52,11 @@ class VideoTextDataset(torch.utils.data.Dataset):

        # transform
        images = torch.from_numpy(images).float()
-        images = rearrange(images, 'N H W C -> N C H W')
+        images = rearrange(images, "N H W C -> N C H W")
        H, W = images.shape[-2:]
        if H > W:
-            images = rearrange(images, 'N C H W -> N C W H')
-        images = F.interpolate(images, size=(320, 576), mode='bilinear', align_corners=True)
+            images = rearrange(images, "N C H W -> N C W H")
+        images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)

        return images

@ -78,7 +73,7 @@ def main():

    meta_path = args.meta_path
    wo_ext, ext = os.path.splitext(meta_path)
-    out_path = f'{wo_ext}_flow{ext}'
+    out_path = f"{wo_ext}_flow{ext}"

    # build model
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
@ -90,12 +85,10 @@ def main():
        ffn_dim_expansion=4,
        num_transformer_layers=6,
        reg_refine=True,
-        task='flow',
+        task="flow",
    )
-    ckpt = torch.load(
-        './pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth'
-    )
-    model.load_state_dict(ckpt['model'])
+    ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth")
+    model.load_state_dict(ckpt["model"])
    model = model.to(device)
    # model = torch.nn.DataParallel(model)

@ -115,30 +108,31 @@ def main():
        images = images.to(device)
        B = images.shape[0]

-        batch_0 = rearrange(images[:, :-1], 'B N C H W -> (B N) C H W').contiguous()
-        batch_1 = rearrange(images[:, 1:], 'B N C H W -> (B N) C H W').contiguous()
+        batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
+        batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()

        with torch.no_grad():
            res = model(
-                batch_0, batch_1,
-                attn_type='swin',
+                batch_0,
+                batch_1,
+                attn_type="swin",
                attn_splits_list=[2, 8],
                corr_radius_list=[-1, 4],
                prop_radius_list=[-1, 1],
                num_reg_refine=6,
-                task='flow',
+                task="flow",
                pred_bidir_flow=False,
            )
-            flow_maps = res['flow_preds'][-1].cpu()  # [B * (N-1), 2, H, W]
-            flow_maps = rearrange(flow_maps, '(B N) C H W -> B N H W C', B=B)
+            flow_maps = res["flow_preds"][-1].cpu()  # [B * (N-1), 2, H, W]
+            flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B)
            flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
            flow_scores_np = flow_scores.numpy()

-        dataset.meta.loc[index: index + B - 1, "flow"] = flow_scores_np
+        dataset.meta.loc[index : index + B - 1, "flow"] = flow_scores_np
        index += B

    dataset.meta.to_csv(out_path, index=False)
-    print(f"New meta with optical flow scores saved to \'{out_path}\'.")
+    print(f"New meta with optical flow scores saved to '{out_path}'.")


 if __name__ == "__main__":