mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-11 13:14:44 +02:00
Merge branch 'dev/v1.2' of github.com:hpcaitech/Open-Sora-dev into dev/v1.2
This commit is contained in:
commit
aec8c60036
|
|
@ -38,16 +38,13 @@ python -m tools.datasets.convert video ${ROOT_CLIPS} --output ${ROOT_META}/meta_
|
|||
# 2.4 Get clips information and remove broken ones. This should output ${ROOT_META}/meta_clips_info_fmin1.csv
|
||||
python -m tools.datasets.datautil ${ROOT_META}/meta_clips.csv --info --fmin 1
|
||||
|
||||
# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv
|
||||
# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
|
||||
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference \
|
||||
${ROOT_META}/meta_clips_info_fmin1.csv \
|
||||
--bs 1024 \
|
||||
--num_workers 16
|
||||
|
||||
# 3.2 Merge files; This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
|
||||
python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv --output ${ROOT_META}/meta_clips_info_fmin1_aes.csv
|
||||
|
||||
# 3.3 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv
|
||||
# 3.2 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv
|
||||
python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes.csv --aesmin 5
|
||||
|
||||
# 4.1 Generate caption. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv
|
||||
|
|
|
|||
|
|
@ -94,20 +94,22 @@ def process_vidprom(root, info):
|
|||
|
||||
def process_general_images(root, output):
|
||||
root = os.path.expanduser(root)
|
||||
image_lists = get_filelist(root, IMG_EXTENSIONS)
|
||||
df = pd.DataFrame(dict(path=image_lists))
|
||||
if output is None:
|
||||
output = "images.csv"
|
||||
path_list = get_filelist(root, IMG_EXTENSIONS)
|
||||
fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
|
||||
df = pd.DataFrame(dict(id=fname_list, path=path_list))
|
||||
|
||||
os.makedirs(os.path.dirname(output), exist_ok=True)
|
||||
df.to_csv(output, index=False)
|
||||
print(f"Saved {len(df)} samples to {output}.")
|
||||
|
||||
|
||||
def process_general_videos(root, output):
|
||||
root = os.path.expanduser(root)
|
||||
video_lists = get_filelist(root, VID_EXTENSIONS)
|
||||
df = pd.DataFrame(dict(path=video_lists))
|
||||
if output is None:
|
||||
output = "videos.csv"
|
||||
path_list = get_filelist(root, VID_EXTENSIONS)
|
||||
fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
|
||||
relpath_list = [os.path.relpath(x, root) for x in path_list]
|
||||
df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))
|
||||
|
||||
os.makedirs(os.path.dirname(output), exist_ok=True)
|
||||
df.to_csv(output, index=False)
|
||||
print(f"Saved {len(df)} samples to {output}.")
|
||||
|
|
@ -119,7 +121,7 @@ if __name__ == "__main__":
|
|||
parser.add_argument("root", type=str)
|
||||
parser.add_argument("--split", type=str, default="train")
|
||||
parser.add_argument("--info", type=str, default=None)
|
||||
parser.add_argument("--output", type=str, default=None)
|
||||
parser.add_argument("--output", type=str, default=None, required=True, help='Output path')
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.dataset == "imagenet":
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ def get_info(path):
|
|||
if ext in IMG_EXTENSIONS:
|
||||
im = cv2.imread(path)
|
||||
if im is None:
|
||||
return 0, 0, 0, np.nan, np.nan
|
||||
return 0, 0, 0, np.nan, np.nan, np.nan
|
||||
height, width = im.shape[:2]
|
||||
num_frames, fps = 1, np.nan
|
||||
else:
|
||||
|
|
@ -415,8 +415,12 @@ def main(args):
|
|||
data_new = pd.read_csv(args.intersection)
|
||||
print(f"Intersection csv contains {len(data_new)} samples.")
|
||||
cols_to_use = data_new.columns.difference(data.columns)
|
||||
cols_to_use = cols_to_use.insert(0, "path")
|
||||
data = pd.merge(data, data_new[cols_to_use], on="path", how="inner")
|
||||
|
||||
col_on = 'path'
|
||||
# if 'id' in data.columns and 'id' in data_new.columns:
|
||||
# col_on = 'id'
|
||||
cols_to_use = cols_to_use.insert(0, col_on)
|
||||
data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner")
|
||||
print(f"Intersection number of samples: {len(data)}.")
|
||||
|
||||
# train columns
|
||||
|
|
@ -484,6 +488,8 @@ def main(args):
|
|||
data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath))
|
||||
if args.abspath is not None:
|
||||
data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x))
|
||||
if args.path_to_id:
|
||||
data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0])
|
||||
if args.merge_cmotion:
|
||||
data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1)
|
||||
if args.refine_llm_caption:
|
||||
|
|
@ -581,6 +587,7 @@ def parse_args():
|
|||
# path processing
|
||||
parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given")
|
||||
parser.add_argument("--abspath", type=str, default=None, help="modify the path to absolute path by root given")
|
||||
parser.add_argument("--path-to-id", action='store_true', help="add id based on path")
|
||||
|
||||
# caption filtering
|
||||
parser.add_argument(
|
||||
|
|
|
|||
|
|
@ -50,10 +50,10 @@ def split_video(
|
|||
video_path,
|
||||
scene_list,
|
||||
save_dir,
|
||||
min_seconds=2.0,
|
||||
max_seconds=15.0,
|
||||
min_seconds=2,
|
||||
max_seconds=15,
|
||||
target_fps=30,
|
||||
shorter_size=720,
|
||||
shorter_size=None,
|
||||
verbose=False,
|
||||
logger=None,
|
||||
):
|
||||
|
|
@ -134,8 +134,8 @@ def parse_args():
|
|||
help='if not None, clip shorter than min_seconds is ignored')
|
||||
parser.add_argument("--max_seconds", type=float, default=None,
|
||||
help='if not None, clip longer than max_seconds is truncated')
|
||||
parser.add_argument("--target_fps", type=int, default=30, help='target fps of clips')
|
||||
parser.add_argument("--shorter_size", type=int, default=720, help='resize the shorter size by keeping ratio')
|
||||
parser.add_argument("--target_fps", type=int, default=None, help='target fps of clips')
|
||||
parser.add_argument("--shorter_size", type=int, default=None, help='resize the shorter size by keeping ratio')
|
||||
parser.add_argument("--num_workers", type=int, default=None, help='#workers for pandarallel')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import argparse
|
||||
import os
|
||||
from datetime import timedelta
|
||||
|
||||
import colossalai
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
|
|
@ -13,25 +13,27 @@ from torchvision.transforms.functional import pil_to_tensor
|
|||
from tqdm import tqdm
|
||||
|
||||
from tools.datasets.utils import extract_frames
|
||||
|
||||
from .unimatch import UniMatch
|
||||
|
||||
|
||||
def merge_scores(gathered_list: list, meta: pd.DataFrame):
|
||||
def merge_scores(gathered_list: list, meta: pd.DataFrame, column):
|
||||
# reorder
|
||||
indices_list = list(map(lambda x: x[0], gathered_list))
|
||||
flow_scores_list = list(map(lambda x: x[1], gathered_list))
|
||||
scores_list = list(map(lambda x: x[1], gathered_list))
|
||||
|
||||
flat_indices = []
|
||||
for x in zip(*indices_list):
|
||||
flat_indices.extend(x)
|
||||
flat_flow_scores = []
|
||||
for x in zip(*flow_scores_list):
|
||||
flat_flow_scores.extend(x)
|
||||
flat_scores = []
|
||||
for x in zip(*scores_list):
|
||||
flat_scores.extend(x)
|
||||
flat_indices = np.array(flat_indices)
|
||||
flat_flow_scores = np.array(flat_flow_scores)
|
||||
flat_scores = np.array(flat_scores)
|
||||
|
||||
# filter duplicates
|
||||
unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True)
|
||||
meta.loc[unique_indices, "flow"] = flat_flow_scores[unique_indices_idx]
|
||||
meta.loc[unique_indices, column] = flat_scores[unique_indices_idx]
|
||||
return meta
|
||||
|
||||
|
||||
class VideoTextDataset(torch.utils.data.Dataset):
|
||||
|
|
@ -61,17 +63,19 @@ class VideoTextDataset(torch.utils.data.Dataset):
|
|||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("--bs", type=int, default=4, help="Batch size")
|
||||
parser.add_argument("--bs", type=int, default=4, help="Batch size") # don't use too large bs for unimatch
|
||||
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
colossalai.launch_from_torch({})
|
||||
args = parse_args()
|
||||
dist.init_process_group(backend="nccl", timeout=timedelta(hours=24))
|
||||
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
|
||||
|
||||
meta_path = args.meta_path
|
||||
wo_ext, ext = os.path.splitext(meta_path)
|
||||
|
|
@ -88,11 +92,10 @@ def main():
|
|||
num_transformer_layers=6,
|
||||
reg_refine=True,
|
||||
task="flow",
|
||||
).eval()
|
||||
)
|
||||
ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth")
|
||||
model.load_state_dict(ckpt["model"])
|
||||
model = model.to(device)
|
||||
# model = torch.nn.DataParallel(model)
|
||||
|
||||
# build dataset
|
||||
dataset = VideoTextDataset(meta_path=meta_path, frame_inds=[0, 10, 20, 30])
|
||||
|
|
@ -110,9 +113,9 @@ def main():
|
|||
)
|
||||
|
||||
# compute optical flow scores
|
||||
dataset.meta["flow"] = np.nan
|
||||
indices_list = []
|
||||
flow_scores_list = []
|
||||
scores_list = []
|
||||
model.eval()
|
||||
for images, indices in tqdm(dataloader, disable=dist.get_rank() != 0):
|
||||
images = images.to(device)
|
||||
B = images.shape[0]
|
||||
|
|
@ -138,13 +141,13 @@ def main():
|
|||
flow_scores = flow_scores.tolist()
|
||||
|
||||
indices_list.extend(indices)
|
||||
flow_scores_list.extend(flow_scores)
|
||||
scores_list.extend(flow_scores)
|
||||
|
||||
gathered_list = [None] * dist.get_world_size()
|
||||
dist.all_gather_object(gathered_list, (indices_list, flow_scores_list))
|
||||
dist.all_gather_object(gathered_list, (indices_list, scores_list))
|
||||
if dist.get_rank() == 0:
|
||||
merge_scores(gathered_list, dataset.meta)
|
||||
dataset.meta.to_csv(out_path, index=False)
|
||||
meta_new = merge_scores(gathered_list, dataset.meta, column='flow')
|
||||
meta_new.to_csv(out_path, index=False)
|
||||
print(f"New meta with optical flow scores saved to '{out_path}'.")
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue