Dev/datapipe (#21)

* fix #210

* fix #209

* fix #188

* [docs] add training order

* update data pipeline

---------

Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com>
This commit is contained in:
Zheng Zangwei (Alex Zheng) 2024-04-02 14:51:21 +08:00 committed by GitHub
parent 7612d22fc6
commit b5414b36b8
6 changed files with 125 additions and 127 deletions

View file

@@ -149,6 +149,8 @@ the config files.
| 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) |
| 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) |
Training orders: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ.
Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of
parameters is 724M. More information about training can be found in our **[report](/docs/report_v1.md)**. More about
the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality.
@@ -203,33 +205,35 @@ Below is an example workflow to process data. However, we recommend you to read
```bash
# Suppose files under ~/dataset/
# 1. Convert dataset to CSV
# output: ~/dataset.csv
python -m tools.dataset.convert video ~/dataset
python -m tools.datasets.convert video ~/dataset --output ~/dataset/meta.csv
# filter out broken videos (broken ones num_frames=0)
python -m tools.dataset.csvutil ~/dataset.csv --video-info --fmin 2 --output ~/dataset.csv
python -m tools.datasets.csvutil ~/dataset.csv --info --fmin 1 --output ~/dataset/meta.csv
# 2. Filter dataset by aesthetic scores
# output: ~/dataset_aesthetic.csv
python -m tools.aesthetic.inference ~/dataset.csv
# output: ~/dataset/meta_aes.csv
python -m tools.aesthetic.inference ~/dataset/meta.csv
# sort and examine videos by aesthetic scores
python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --sort-descending aesthetic_score
# output: ~/dataset/meta_aes_sort.csv
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
# bad videos (aesthetic_score < 5)
tail ~/dataset_aesthetic.csv
tail ~/dataset/meta_aes_sort.csv
# filter videos by aesthetic scores
# output: ~/dataset_aesthetic_aesmin_5.csv
python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --aesmin 5
# output: ~/dataset/meta_aes_aesmin5.csv
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --aesmin 5
# 3. Caption dataset
# output: ~/dataset_aesthetic_aesmin_5_caption.csv
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset_aesthetic_aesmin_5.csv --tp-size 2 --dp-size 4 --bs 16
# output: ~/dataset/meta_aes_aesmin5_caption_part*.csv
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset/meta_aes_aesmin5.csv --tp-size 2 --dp-size 4 --bs 16
# merge generated results
python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
# remove empty captions and process captions (may need to re-caption lost ones)
python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --remove-caption-prefix --remove-empty-caption
python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
# 4. Sanity check & prepare for training
# sanity check
python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --ext --video-info --output ~/dataset_ready.csv
python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --video-info --output ~/dataset/meta_ready.csv
# filter out videos less than 48 frames
# output: ~/dataset_ready_fmin_48.csv
# output: ~/dataset/meta_ready_fmin48.csv
python -m tools.datasets.csvutil ~/dataset/meta_ready.csv --fmin 48
```

View file

@@ -65,20 +65,24 @@ def process_vidprom(root, info):
print(f"Saved {len(df)} samples to vidprom.csv.")
def process_general_images(root):
def process_general_images(root, output):
root = os.path.expanduser(root)
image_lists = get_filelist(root, IMG_EXTENSIONS)
df = pd.DataFrame(dict(path=image_lists))
df.to_csv("images.csv", index=False)
print(f"Saved {len(df)} samples to images.csv.")
if output is None:
output = "images.csv"
df.to_csv(output, index=False)
print(f"Saved {len(df)} samples to {output}.")
def process_general_videos(root):
def process_general_videos(root, output):
root = os.path.expanduser(root)
video_lists = get_filelist(root, VID_EXTENSIONS)
df = pd.DataFrame(dict(path=video_lists))
df.to_csv("videos.csv", index=False)
print(f"Saved {len(df)} samples to videos.csv.")
if output is None:
output = "videos.csv"
df.to_csv(output, index=False)
print(f"Saved {len(df)} samples to {output}.")
if __name__ == "__main__":
@@ -87,6 +91,7 @@ if __name__ == "__main__":
parser.add_argument("root", type=str)
parser.add_argument("--split", type=str, default="train")
parser.add_argument("--info", type=str, default=None)
parser.add_argument("--output", type=str, default=None)
args = parser.parse_args()
if args.dataset == "imagenet":
@@ -96,8 +101,8 @@ if __name__ == "__main__":
elif args.dataset == "vidprom":
process_vidprom(args.root, args.info)
elif args.dataset == "image":
process_general_images(args.root)
process_general_images(args.root, args.output)
elif args.dataset == "video":
process_general_videos(args.root)
process_general_videos(args.root, args.output)
else:
raise ValueError("Invalid dataset")

View file

@@ -124,7 +124,7 @@ def parse_args():
parser.add_argument("--remove-caption-prefix", action="store_true")
parser.add_argument("--unescape", action="store_true")
# num_frames processing
parser.add_argument("--video-info", action="store_true")
parser.add_argument("--info", action="store_true")
# num_frames filtering
parser.add_argument("--fmin", type=int, default=None)
parser.add_argument("--fmax", type=int, default=None)
@@ -163,19 +163,19 @@ def get_output_path(args, input_name):
if args.unescape:
name += "_unescape"
# num_frames processing
if args.video_info:
name += "_vinfo"
if args.info:
name += "_info"
# num_frames filtering
if args.fmin is not None:
name += f"_fmin_{args.fmin}"
name += f"_fmin{args.fmin}"
if args.fmax is not None:
name += f"_fmax_{args.fmax}"
name += f"_fmax{args.fmax}"
# aesthetic filtering
if args.aesmin is not None:
name += f"_aesmin_{args.aesmin}"
name += f"_aesmin{args.aesmin}"
# clip score filtering
if args.matchmin is not None:
name += f"_matchmin_{args.matchmin}"
name += f"_matchmin{args.matchmin}"
# sort
if args.sort_descending is not None:
assert args.sort_ascending is None
@@ -254,7 +254,7 @@ def main(args):
if args.unescape:
assert "text" in data.columns
data["text"] = apply(data["text"], html.unescape)
if args.video_info:
if args.info:
info = apply(data["path"], get_video_info)
data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info)
@@ -266,11 +266,11 @@ def main(args):
assert "num_frames" in data.columns
data = data[data["num_frames"] <= args.fmax]
if args.aesmin is not None:
assert "aesthetic_score" in data.columns
data = data[data["aesthetic_score"] >= args.aesmin]
assert "aes" in data.columns
data = data[data["aes"] >= args.aesmin]
if args.matchmin is not None:
assert "clip_score" in data.columns
data = data[data["clip_score"] >= args.matchmin]
assert "match" in data.columns
data = data[data["match"] >= args.matchmin]
print(f"Filtered number of samples: {len(data)}.")
# sort

View file

@@ -1,37 +1,59 @@
# Data Scoring and Filtering
Important!!! All scoring jobs require these columns in meta files:
- `path`: absolute path to a sample
## Aesthetic Score
First prepare the environment and pretrained models.
```bash
# install clip
pip install git+https://github.com/openai/CLIP.git
pip install decord
# get pretrained model
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
```
Then run:
```bash
python -m tools.scoring.aesthetic.inference /path/to/meta.csv
```
The output should be `/path/to/meta_aes.csv` with column `aes`. Aesthetic scores range from 1 to 10, with 10 being the best quality.
## Optical Flow Score
First get the pretrained model.
```bash
wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch
```
Then run:
```
python tools/scoring/optical_flow/inference.py /path/to/meta.csv
```
The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement.
## Matching Score
Require column `text` in meta files, which is the caption of the sample.
TODO.
# Data Scoring and Filtering
- [Data Scoring and Filtering](#data-scoring-and-filtering)
- [Aesthetic Scoring](#aesthetic-scoring)
- [Requirement](#requirement)
- [Usage](#usage)
- [Optical Flow Score](#optical-flow-score)
- [Matching Score](#matching-score)
## Aesthetic Scoring
To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process.
### Requirement
```bash
# install clip
pip install git+https://github.com/openai/CLIP.git
pip install decord
# get pretrained model
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
```
### Usage
With `meta.csv` containing the paths to the videos, run the following command:
```bash
# output: meta_aes.csv
python -m tools.aesthetic.inference meta.csv
```
## Optical Flow Score
First get the pretrained model.
```bash
wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch
```
With `meta.csv` containing the paths to the videos, run the following command:
```bash
python -m tools.scoring.optical_flow.inference /path/to/meta.csv
```
The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement.
## Matching Score
Require column `text` in meta files, which is the caption of the sample.
TODO.

View file

@@ -1,27 +0,0 @@
# Aesthetic Scoring
To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process.
## Requirement
```bash
# install clip
pip install git+https://github.com/openai/CLIP.git
pip install decord
# get pretrained model
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
```
## Usage
With `DATA.csv` containing the paths to the videos, run the following command:
```bash
# output: DATA_aes.csv
python -m tools.aesthetic.inference DATA.csv
```

View file

@@ -1,22 +1,17 @@
import os
# os.chdir('../..')
print(f'Current working directory: {os.getcwd()}')
import argparse
import os
import av
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from einops import rearrange
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision.transforms.functional import pil_to_tensor
from .unimatch import UniMatch
import decord
from unimatch import UniMatch
import decord # isort: skip
def extract_frames_av(video_path, frame_inds=[0, 10, 20, 30]):
@@ -57,11 +52,11 @@ class VideoTextDataset(torch.utils.data.Dataset):
# transform
images = torch.from_numpy(images).float()
images = rearrange(images, 'N H W C -> N C H W')
images = rearrange(images, "N H W C -> N C H W")
H, W = images.shape[-2:]
if H > W:
images = rearrange(images, 'N C H W -> N C W H')
images = F.interpolate(images, size=(320, 576), mode='bilinear', align_corners=True)
images = rearrange(images, "N C H W -> N C W H")
images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)
return images
@@ -78,7 +73,7 @@ def main():
meta_path = args.meta_path
wo_ext, ext = os.path.splitext(meta_path)
out_path = f'{wo_ext}_flow{ext}'
out_path = f"{wo_ext}_flow{ext}"
# build model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
@@ -90,12 +85,10 @@ def main():
ffn_dim_expansion=4,
num_transformer_layers=6,
reg_refine=True,
task='flow',
task="flow",
)
ckpt = torch.load(
'./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth'
)
model.load_state_dict(ckpt['model'])
ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth")
model.load_state_dict(ckpt["model"])
model = model.to(device)
# model = torch.nn.DataParallel(model)
@@ -115,30 +108,31 @@ def main():
images = images.to(device)
B = images.shape[0]
batch_0 = rearrange(images[:, :-1], 'B N C H W -> (B N) C H W').contiguous()
batch_1 = rearrange(images[:, 1:], 'B N C H W -> (B N) C H W').contiguous()
batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()
with torch.no_grad():
res = model(
batch_0, batch_1,
attn_type='swin',
batch_0,
batch_1,
attn_type="swin",
attn_splits_list=[2, 8],
corr_radius_list=[-1, 4],
prop_radius_list=[-1, 1],
num_reg_refine=6,
task='flow',
task="flow",
pred_bidir_flow=False,
)
flow_maps = res['flow_preds'][-1].cpu() # [B * (N-1), 2, H, W]
flow_maps = rearrange(flow_maps, '(B N) C H W -> B N H W C', B=B)
flow_maps = res["flow_preds"][-1].cpu() # [B * (N-1), 2, H, W]
flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B)
flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
flow_scores_np = flow_scores.numpy()
dataset.meta.loc[index: index + B - 1, "flow"] = flow_scores_np
dataset.meta.loc[index : index + B - 1, "flow"] = flow_scores_np
index += B
dataset.meta.to_csv(out_path, index=False)
print(f"New meta with optical flow scores saved to \'{out_path}\'.")
print(f"New meta with optical flow scores saved to '{out_path}'.")
if __name__ == "__main__":