mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-10 21:01:26 +02:00
Dev/datapipe (#21)
* fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com>
This commit is contained in:
parent
7612d22fc6
commit
b5414b36b8
32
README.md
32
README.md
|
|
@ -149,6 +149,8 @@ the config files.
|
|||
| 16×256×256 | 20K HQ | 24k | 8×64 | 45 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-HQ-16x256x256.pth) |
|
||||
| 16×256×256 | 366K | 80k | 8×64 | 117 | [:link:](https://huggingface.co/hpcai-tech/Open-Sora/blob/main/OpenSora-v1-16x256x256.pth) |
|
||||
|
||||
Training order: 16x256x256 $\rightarrow$ 16x256x256 HQ $\rightarrow$ 16x512x512 HQ.
|
||||
|
||||
Our model's weight is partially initialized from [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha). The number of
|
||||
parameters is 724M. More information about training can be found in our **[report](/docs/report_v1.md)**. More about
|
||||
the dataset can be found in [datasets.md](/docs/datasets.md). HQ means high quality.
|
||||
|
|
@ -203,33 +205,35 @@ Below is an example workflow to process data. However, we recommend you to read
|
|||
```bash
|
||||
# Suppose files under ~/dataset/
|
||||
# 1. Convert dataset to CSV
|
||||
# output: ~/dataset/meta.csv
|
||||
python -m tools.dataset.convert video ~/dataset
|
||||
python -m tools.datasets.convert video ~/dataset --output ~/dataset/meta.csv
|
||||
# filter out broken videos (broken ones num_frames=0)
|
||||
python -m tools.dataset.csvutil ~/dataset.csv --video-info --fmin 2 --output ~/dataset.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta.csv --info --fmin 1 --output ~/dataset/meta.csv
|
||||
|
||||
# 2. Filter dataset by aesthetic scores
|
||||
# output: ~/dataset_aesthetic.csv
|
||||
python -m tools.aesthetic.inference ~/dataset.csv
|
||||
# output: ~/dataset/meta_aes.csv
|
||||
python -m tools.aesthetic.inference ~/dataset/meta.csv
|
||||
# sort and examine videos by aesthetic scores
|
||||
python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --sort-descending aesthetic_score
|
||||
# output: ~/dataset/meta_aes_sort.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
|
||||
# bad videos (aesthetic_score < 5)
|
||||
tail ~/dataset_aesthetic.csv
|
||||
tail ~/dataset/meta_aes_sort.csv
|
||||
# filter videos by aesthetic scores
|
||||
# output: ~/dataset_aesthetic_aesmin_5.csv
|
||||
python -m tools.datasets.csvutil ~/dataset_aesthetic.csv --aesmin 5
|
||||
# output: ~/dataset/meta_aes_aesmin5.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --aesmin 5
|
||||
|
||||
# 3. Caption dataset
|
||||
# output: ~/dataset_aesthetic_aesmin_5_caption.csv
|
||||
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset_aesthetic_aesmin_5.csv --tp-size 2 --dp-size 4 --bs 16
|
||||
# output: ~/dataset/meta_aes_aesmin5_caption_parti.csv
|
||||
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset/meta_aes_aesmin5.csv --tp-size 2 --dp-size 4 --bs 16
|
||||
# merge generated results
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
|
||||
# remove empty captions and process captions (may need to re-caption lost ones)
|
||||
python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --remove-caption-prefix --remove-empty-caption
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
|
||||
|
||||
# 4. Sanity check & prepare for training
|
||||
# sanity check
|
||||
python -m tools.datasets.csvutil ~/dataset_aesthetic_aesmin_5_caption.csv --ext --video-info --output ~/dataset_ready.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --video-info --output ~/dataset/meta_ready.csv
|
||||
# filter out videos less than 48 frames
|
||||
# output: ~/dataset_ready_fmin_48.csv
|
||||
# output: ~/dataset/meta_ready_fmin48.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_ready.csv --fmin 48
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@ -65,20 +65,24 @@ def process_vidprom(root, info):
|
|||
print(f"Saved {len(df)} samples to vidprom.csv.")
|
||||
|
||||
|
||||
def process_general_images(root):
|
||||
def process_general_images(root, output):
|
||||
root = os.path.expanduser(root)
|
||||
image_lists = get_filelist(root, IMG_EXTENSIONS)
|
||||
df = pd.DataFrame(dict(path=image_lists))
|
||||
df.to_csv("images.csv", index=False)
|
||||
print(f"Saved {len(df)} samples to images.csv.")
|
||||
if output is None:
|
||||
output = "images.csv"
|
||||
df.to_csv(output, index=False)
|
||||
print(f"Saved {len(df)} samples to {output}.")
|
||||
|
||||
|
||||
def process_general_videos(root):
|
||||
def process_general_videos(root, output):
|
||||
root = os.path.expanduser(root)
|
||||
video_lists = get_filelist(root, VID_EXTENSIONS)
|
||||
df = pd.DataFrame(dict(path=video_lists))
|
||||
df.to_csv("videos.csv", index=False)
|
||||
print(f"Saved {len(df)} samples to videos.csv.")
|
||||
if output is None:
|
||||
output = "videos.csv"
|
||||
df.to_csv(output, index=False)
|
||||
print(f"Saved {len(df)} samples to {output}.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
@ -87,6 +91,7 @@ if __name__ == "__main__":
|
|||
parser.add_argument("root", type=str)
|
||||
parser.add_argument("--split", type=str, default="train")
|
||||
parser.add_argument("--info", type=str, default=None)
|
||||
parser.add_argument("--output", type=str, default=None)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.dataset == "imagenet":
|
||||
|
|
@ -96,8 +101,8 @@ if __name__ == "__main__":
|
|||
elif args.dataset == "vidprom":
|
||||
process_vidprom(args.root, args.info)
|
||||
elif args.dataset == "image":
|
||||
process_general_images(args.root)
|
||||
process_general_images(args.root, args.output)
|
||||
elif args.dataset == "video":
|
||||
process_general_videos(args.root)
|
||||
process_general_videos(args.root, args.output)
|
||||
else:
|
||||
raise ValueError("Invalid dataset")
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ def parse_args():
|
|||
parser.add_argument("--remove-caption-prefix", action="store_true")
|
||||
parser.add_argument("--unescape", action="store_true")
|
||||
# num_frames processing
|
||||
parser.add_argument("--video-info", action="store_true")
|
||||
parser.add_argument("--info", action="store_true")
|
||||
# num_frames filtering
|
||||
parser.add_argument("--fmin", type=int, default=None)
|
||||
parser.add_argument("--fmax", type=int, default=None)
|
||||
|
|
@ -163,19 +163,19 @@ def get_output_path(args, input_name):
|
|||
if args.unescape:
|
||||
name += "_unescape"
|
||||
# num_frames processing
|
||||
if args.video_info:
|
||||
name += "_vinfo"
|
||||
if args.info:
|
||||
name += "_info"
|
||||
# num_frames filtering
|
||||
if args.fmin is not None:
|
||||
name += f"_fmin_{args.fmin}"
|
||||
name += f"_fmin{args.fmin}"
|
||||
if args.fmax is not None:
|
||||
name += f"_fmax_{args.fmax}"
|
||||
name += f"_fmax{args.fmax}"
|
||||
# aesthetic filtering
|
||||
if args.aesmin is not None:
|
||||
name += f"_aesmin_{args.aesmin}"
|
||||
name += f"_aesmin{args.aesmin}"
|
||||
# clip score filtering
|
||||
if args.matchmin is not None:
|
||||
name += f"_matchmin_{args.matchmin}"
|
||||
name += f"_matchmin{args.matchmin}"
|
||||
# sort
|
||||
if args.sort_descending is not None:
|
||||
assert args.sort_ascending is None
|
||||
|
|
@ -254,7 +254,7 @@ def main(args):
|
|||
if args.unescape:
|
||||
assert "text" in data.columns
|
||||
data["text"] = apply(data["text"], html.unescape)
|
||||
if args.video_info:
|
||||
if args.info:
|
||||
info = apply(data["path"], get_video_info)
|
||||
data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info)
|
||||
|
||||
|
|
@ -266,11 +266,11 @@ def main(args):
|
|||
assert "num_frames" in data.columns
|
||||
data = data[data["num_frames"] <= args.fmax]
|
||||
if args.aesmin is not None:
|
||||
assert "aesthetic_score" in data.columns
|
||||
data = data[data["aesthetic_score"] >= args.aesmin]
|
||||
assert "aes" in data.columns
|
||||
data = data[data["aes"] >= args.aesmin]
|
||||
if args.matchmin is not None:
|
||||
assert "clip_score" in data.columns
|
||||
data = data[data["clip_score"] >= args.matchmin]
|
||||
assert "match" in data.columns
|
||||
data = data[data["match"] >= args.matchmin]
|
||||
print(f"Filtered number of samples: {len(data)}.")
|
||||
|
||||
# sort
|
||||
|
|
|
|||
|
|
@ -1,37 +1,59 @@
|
|||
# Data Scoring and Filtering
|
||||
Important!!! All scoring jobs require these columns in meta files:
|
||||
- `path`: absolute path to a sample
|
||||
|
||||
## Aesthetic Score
|
||||
First prepare the environment and pretrained models.
|
||||
```bash
|
||||
# install clip
|
||||
pip install git+https://github.com/openai/CLIP.git
|
||||
pip install decord
|
||||
|
||||
# get pretrained model
|
||||
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
|
||||
```
|
||||
|
||||
Then run:
|
||||
```bash
|
||||
python -m tools.scoring.aesthetic.inference /path/to/meta.csv
|
||||
```
|
||||
The output should be `/path/to/meta_aes.csv` with column `aes`. Aesthetic scores range from 1 to 10, with 10 being the best quality.
|
||||
|
||||
## Optical Flow Score
|
||||
First get the pretrained model.
|
||||
```bash
|
||||
wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch
|
||||
```
|
||||
|
||||
Then run:
|
||||
```
|
||||
python tools/scoring/optical_flow/inference.py /path/to/meta.csv
|
||||
```
|
||||
The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement.
|
||||
|
||||
## Matching Score
|
||||
Require column `text` in meta files, which is the caption of the sample.
|
||||
|
||||
TODO.
|
||||
# Data Scoring and Filtering
|
||||
|
||||
- [Data Scoring and Filtering](#data-scoring-and-filtering)
|
||||
- [Aesthetic Scoring](#aesthetic-scoring)
|
||||
- [Requirement](#requirement)
|
||||
- [Usage](#usage)
|
||||
- [Optical Flow Score](#optical-flow-score)
|
||||
- [Matching Score](#matching-score)
|
||||
|
||||
|
||||
## Aesthetic Scoring
|
||||
|
||||
To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
|
||||
|
||||
The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
|
||||
|
||||
For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process.
|
||||
|
||||
### Requirement
|
||||
|
||||
```bash
|
||||
# install clip
|
||||
pip install git+https://github.com/openai/CLIP.git
|
||||
pip install decord
|
||||
|
||||
# get pretrained model
|
||||
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
|
||||
```
|
||||
|
||||
### Usage
|
||||
|
||||
With `meta.csv` containing the paths to the videos, run the following command:
|
||||
|
||||
```bash
|
||||
# output: meta_aes.csv
|
||||
python -m tools.aesthetic.inference meta.csv
|
||||
```
|
||||
|
||||
## Optical Flow Score
|
||||
|
||||
First get the pretrained model.
|
||||
|
||||
```bash
|
||||
wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth -P pretrained_models/unimatch
|
||||
```
|
||||
|
||||
With `meta.csv` containing the paths to the videos, run the following command:
|
||||
|
||||
```bash
|
||||
python -m tools.scoring.optical_flow.inference /path/to/meta.csv
|
||||
```
|
||||
|
||||
The output should be `/path/to/meta_flow.csv` with column `flow`. Higher optical flow scores indicate larger movement.
|
||||
|
||||
## Matching Score
|
||||
|
||||
Require column `text` in meta files, which is the caption of the sample.
|
||||
|
||||
TODO.
|
||||
|
|
|
|||
|
|
@ -1,27 +0,0 @@
|
|||
# Aesthetic Scoring
|
||||
|
||||
To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
|
||||
|
||||
The score is between 1 and 10, where 5.5 can be considered as the threshold for fair aesthetics, and 6.5 for good aesthetics. Good text-to-image models can achieve a score of 7.0 or higher.
|
||||
|
||||
For videos, we extract the first, last, and the middle frames for evaluation. The script also supports images. Our script enables 1k videos/s with one GPU. It also supports multiple GPUs to further accelerate the process.
|
||||
|
||||
## Requirement
|
||||
|
||||
```bash
|
||||
# install clip
|
||||
pip install git+https://github.com/openai/CLIP.git
|
||||
pip install decord
|
||||
|
||||
# get pretrained model
|
||||
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
With `DATA.csv` containing the paths to the videos, run the following command:
|
||||
|
||||
```bash
|
||||
# output: DATA_aes.csv
|
||||
python -m tools.aesthetic.inference DATA.csv
|
||||
```
|
||||
|
|
@ -1,22 +1,17 @@
|
|||
import os
|
||||
# os.chdir('../..')
|
||||
print(f'Current working directory: {os.getcwd()}')
|
||||
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import av
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from einops import rearrange
|
||||
from tqdm import tqdm
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.utils.data import Dataset
|
||||
from torchvision.transforms.functional import pil_to_tensor
|
||||
from .unimatch import UniMatch
|
||||
|
||||
import decord
|
||||
from unimatch import UniMatch
|
||||
import decord # isort: skip
|
||||
|
||||
|
||||
def extract_frames_av(video_path, frame_inds=[0, 10, 20, 30]):
|
||||
|
|
@ -57,11 +52,11 @@ class VideoTextDataset(torch.utils.data.Dataset):
|
|||
|
||||
# transform
|
||||
images = torch.from_numpy(images).float()
|
||||
images = rearrange(images, 'N H W C -> N C H W')
|
||||
images = rearrange(images, "N H W C -> N C H W")
|
||||
H, W = images.shape[-2:]
|
||||
if H > W:
|
||||
images = rearrange(images, 'N C H W -> N C W H')
|
||||
images = F.interpolate(images, size=(320, 576), mode='bilinear', align_corners=True)
|
||||
images = rearrange(images, "N C H W -> N C W H")
|
||||
images = F.interpolate(images, size=(320, 576), mode="bilinear", align_corners=True)
|
||||
|
||||
return images
|
||||
|
||||
|
|
@ -78,7 +73,7 @@ def main():
|
|||
|
||||
meta_path = args.meta_path
|
||||
wo_ext, ext = os.path.splitext(meta_path)
|
||||
out_path = f'{wo_ext}_flow{ext}'
|
||||
out_path = f"{wo_ext}_flow{ext}"
|
||||
|
||||
# build model
|
||||
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||
|
|
@ -90,12 +85,10 @@ def main():
|
|||
ffn_dim_expansion=4,
|
||||
num_transformer_layers=6,
|
||||
reg_refine=True,
|
||||
task='flow',
|
||||
task="flow",
|
||||
)
|
||||
ckpt = torch.load(
|
||||
'./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth'
|
||||
)
|
||||
model.load_state_dict(ckpt['model'])
|
||||
ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth")
|
||||
model.load_state_dict(ckpt["model"])
|
||||
model = model.to(device)
|
||||
# model = torch.nn.DataParallel(model)
|
||||
|
||||
|
|
@ -115,30 +108,31 @@ def main():
|
|||
images = images.to(device)
|
||||
B = images.shape[0]
|
||||
|
||||
batch_0 = rearrange(images[:, :-1], 'B N C H W -> (B N) C H W').contiguous()
|
||||
batch_1 = rearrange(images[:, 1:], 'B N C H W -> (B N) C H W').contiguous()
|
||||
batch_0 = rearrange(images[:, :-1], "B N C H W -> (B N) C H W").contiguous()
|
||||
batch_1 = rearrange(images[:, 1:], "B N C H W -> (B N) C H W").contiguous()
|
||||
|
||||
with torch.no_grad():
|
||||
res = model(
|
||||
batch_0, batch_1,
|
||||
attn_type='swin',
|
||||
batch_0,
|
||||
batch_1,
|
||||
attn_type="swin",
|
||||
attn_splits_list=[2, 8],
|
||||
corr_radius_list=[-1, 4],
|
||||
prop_radius_list=[-1, 1],
|
||||
num_reg_refine=6,
|
||||
task='flow',
|
||||
task="flow",
|
||||
pred_bidir_flow=False,
|
||||
)
|
||||
flow_maps = res['flow_preds'][-1].cpu() # [B * (N-1), 2, H, W]
|
||||
flow_maps = rearrange(flow_maps, '(B N) C H W -> B N H W C', B=B)
|
||||
flow_maps = res["flow_preds"][-1].cpu() # [B * (N-1), 2, H, W]
|
||||
flow_maps = rearrange(flow_maps, "(B N) C H W -> B N H W C", B=B)
|
||||
flow_scores = flow_maps.abs().mean(dim=[1, 2, 3, 4])
|
||||
flow_scores_np = flow_scores.numpy()
|
||||
|
||||
dataset.meta.loc[index: index + B - 1, "flow"] = flow_scores_np
|
||||
dataset.meta.loc[index : index + B - 1, "flow"] = flow_scores_np
|
||||
index += B
|
||||
|
||||
dataset.meta.to_csv(out_path, index=False)
|
||||
print(f"New meta with optical flow scores saved to \'{out_path}\'.")
|
||||
print(f"New meta with optical flow scores saved to '{out_path}'.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Reference in a new issue