Merge branch 'dev/v1.2' of github.com:hpcaitech/Open-Sora-dev into dev/v1.2

This commit is contained in:
zhengzangw 2024-05-14 05:40:23 +00:00
commit aec8c60036
5 changed files with 51 additions and 42 deletions

View file

@ -38,16 +38,13 @@ python -m tools.datasets.convert video ${ROOT_CLIPS} --output ${ROOT_META}/meta_
# 2.4 Get clips information and remove broken ones. This should output ${ROOT_META}/meta_clips_info_fmin1.csv
python -m tools.datasets.datautil ${ROOT_META}/meta_clips.csv --info --fmin 1
# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv
# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference \
${ROOT_META}/meta_clips_info_fmin1.csv \
--bs 1024 \
--num_workers 16
# 3.2 Merge files; This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv --output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
# 3.3 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv
# 3.2 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv
python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes.csv --aesmin 5
# 4.1 Generate caption. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv

View file

@ -94,20 +94,22 @@ def process_vidprom(root, info):
def process_general_images(root, output):
    """Write a CSV index of the image files found under ``root``.

    The file list comes from ``get_filelist(root, IMG_EXTENSIONS)``. Each row
    of the CSV has two columns:

    * ``id``   -- the file's base name without its extension
    * ``path`` -- the path exactly as returned by ``get_filelist``

    Args:
        root: Directory to index; a leading ``~`` is expanded.
        output: Destination CSV path. ``None`` falls back to ``"images.csv"``.
    """
    root = os.path.expanduser(root)
    if output is None:
        output = "images.csv"

    path_list = get_filelist(root, IMG_EXTENSIONS)
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    df = pd.DataFrame(dict(id=fname_list, path=path_list))

    # A bare filename (such as the default above) has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create real directories.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")
def process_general_videos(root, output):
    """Write a CSV index of the video files found under ``root``.

    The file list comes from ``get_filelist(root, VID_EXTENSIONS)``. Each row
    of the CSV has three columns:

    * ``path``    -- the path exactly as returned by ``get_filelist``
    * ``id``      -- the file's base name without its extension
    * ``relpath`` -- ``path`` expressed relative to the expanded ``root``

    Args:
        root: Directory to index; a leading ``~`` is expanded.
        output: Destination CSV path. ``None`` falls back to ``"videos.csv"``.
    """
    root = os.path.expanduser(root)
    if output is None:
        output = "videos.csv"

    path_list = get_filelist(root, VID_EXTENSIONS)
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    relpath_list = [os.path.relpath(x, root) for x in path_list]
    df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))

    # A bare filename (such as the default above) has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create real directories.
    out_dir = os.path.dirname(output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")
@ -119,7 +121,7 @@ if __name__ == "__main__":
parser.add_argument("root", type=str)
parser.add_argument("--split", type=str, default="train")
parser.add_argument("--info", type=str, default=None)
parser.add_argument("--output", type=str, default=None)
parser.add_argument("--output", type=str, default=None, required=True, help='Output path')
args = parser.parse_args()
if args.dataset == "imagenet":

View file

@ -54,7 +54,7 @@ def get_info(path):
if ext in IMG_EXTENSIONS:
im = cv2.imread(path)
if im is None:
return 0, 0, 0, np.nan, np.nan
return 0, 0, 0, np.nan, np.nan, np.nan
height, width = im.shape[:2]
num_frames, fps = 1, np.nan
else:
@ -415,8 +415,12 @@ def main(args):
data_new = pd.read_csv(args.intersection)
print(f"Intersection csv contains {len(data_new)} samples.")
cols_to_use = data_new.columns.difference(data.columns)
cols_to_use = cols_to_use.insert(0, "path")
data = pd.merge(data, data_new[cols_to_use], on="path", how="inner")
col_on = 'path'
# if 'id' in data.columns and 'id' in data_new.columns:
# col_on = 'id'
cols_to_use = cols_to_use.insert(0, col_on)
data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner")
print(f"Intersection number of samples: {len(data)}.")
# train columns
@ -484,6 +488,8 @@ def main(args):
data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath))
if args.abspath is not None:
data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x))
if args.path_to_id:
data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0])
if args.merge_cmotion:
data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1)
if args.refine_llm_caption:
@ -581,6 +587,7 @@ def parse_args():
# path processing
parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given")
parser.add_argument("--abspath", type=str, default=None, help="modify the path to absolute path by root given")
parser.add_argument("--path-to-id", action='store_true', help="add id based on path")
# caption filtering
parser.add_argument(

View file

@ -50,10 +50,10 @@ def split_video(
video_path,
scene_list,
save_dir,
min_seconds=2.0,
max_seconds=15.0,
min_seconds=2,
max_seconds=15,
target_fps=30,
shorter_size=720,
shorter_size=None,
verbose=False,
logger=None,
):
@ -134,8 +134,8 @@ def parse_args():
help='if not None, clip shorter than min_seconds is ignored')
parser.add_argument("--max_seconds", type=float, default=None,
help='if not None, clip longer than max_seconds is truncated')
parser.add_argument("--target_fps", type=int, default=30, help='target fps of clips')
parser.add_argument("--shorter_size", type=int, default=720, help='resize the shorter size by keeping ratio')
parser.add_argument("--target_fps", type=int, default=None, help='target fps of clips')
parser.add_argument("--shorter_size", type=int, default=None, help='resize the shorter size by keeping ratio')
parser.add_argument("--num_workers", type=int, default=None, help='#workers for pandarallel')
args = parser.parse_args()

View file

@ -1,7 +1,7 @@
import argparse
import os
from datetime import timedelta
import colossalai
import numpy as np
import pandas as pd
import torch
@ -13,25 +13,27 @@ from torchvision.transforms.functional import pil_to_tensor
from tqdm import tqdm
from tools.datasets.utils import extract_frames
from .unimatch import UniMatch
def merge_scores(gathered_list: list, meta: pd.DataFrame, column="flow"):
    """Write per-rank scores gathered from all processes into ``meta[column]``.

    Args:
        gathered_list: One ``(indices, scores)`` pair per rank, where
            ``indices`` are row labels of ``meta`` and ``scores`` are the
            values computed for those rows, in the same order.
        meta: DataFrame to update in place.
        column: Name of the column that receives the scores. Defaults to
            ``"flow"`` so that two-argument callers keep working.

    Returns:
        The same ``meta`` object, with ``column`` filled in for every index
        that appears in ``gathered_list``.
    """
    per_rank_indices = [rank_result[0] for rank_result in gathered_list]
    per_rank_scores = [rank_result[1] for rank_result in gathered_list]

    # Interleave the ranks round-robin (rank0[0], rank1[0], ..., rank0[1], ...).
    # Walking up to the longest list, instead of zip(*lists), keeps the trailing
    # results of ranks that processed more samples than the others.
    flat_indices = []
    flat_scores = []
    longest = max((len(rank_indices) for rank_indices in per_rank_indices), default=0)
    for pos in range(longest):
        for rank_indices, rank_scores in zip(per_rank_indices, per_rank_scores):
            if pos < len(rank_indices):
                flat_indices.append(rank_indices[pos])
                flat_scores.append(rank_scores[pos])

    if not flat_indices:
        # Nothing was gathered; leave meta untouched.
        return meta

    flat_indices = np.array(flat_indices)
    flat_scores = np.array(flat_scores)

    # An index may appear more than once; keep the first occurrence of each.
    unique_indices, first_pos = np.unique(flat_indices, return_index=True)
    meta.loc[unique_indices, column] = flat_scores[first_pos]
    return meta
class VideoTextDataset(torch.utils.data.Dataset):
@ -61,17 +63,19 @@ class VideoTextDataset(torch.utils.data.Dataset):
def parse_args():
    """Parse the command-line arguments of the optical-flow scoring script.

    Returns:
        argparse.Namespace with ``meta_path`` (input CSV path), ``bs``
        (batch size, default 4) and ``num_workers`` (default 16).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
    # "--bs" must be registered exactly once; a second add_argument for the
    # same option string makes argparse raise a conflict error.
    # don't use too large bs for unimatch
    parser.add_argument("--bs", type=int, default=4, help="Batch size")
    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    args = parser.parse_args()
    return args
def main():
args = parse_args()
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
colossalai.launch_from_torch({})
args = parse_args()
dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
meta_path = args.meta_path
wo_ext, ext = os.path.splitext(meta_path)
@ -88,11 +92,10 @@ def main():
num_transformer_layers=6,
reg_refine=True,
task="flow",
).eval()
)
ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth")
model.load_state_dict(ckpt["model"])
model = model.to(device)
# model = torch.nn.DataParallel(model)
# build dataset
dataset = VideoTextDataset(meta_path=meta_path, frame_inds=[0, 10, 20, 30])
@ -110,9 +113,9 @@ def main():
)
# compute optical flow scores
dataset.meta["flow"] = np.nan
indices_list = []
flow_scores_list = []
scores_list = []
model.eval()
for images, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
images = images.to(device)
B = images.shape[0]
@ -138,13 +141,13 @@ def main():
flow_scores = flow_scores.tolist()
indices_list.extend(indices)
flow_scores_list.extend(flow_scores)
scores_list.extend(flow_scores)
gathered_list = [None] * dist.get_world_size()
dist.all_gather_object(gathered_list, (indices_list, flow_scores_list))
dist.all_gather_object(gathered_list, (indices_list, scores_list))
if dist.get_rank() == 0:
merge_scores(gathered_list, dataset.meta)
dataset.meta.to_csv(out_path, index=False)
meta_new = merge_scores(gathered_list, dataset.meta, column='flow')
meta_new.to_csv(out_path, index=False)
print(f"New meta with optical flow scores saved to '{out_path}'.")