diff --git a/docs/data_processing.md b/docs/data_processing.md index 0d1c195..e9cf362 100644 --- a/docs/data_processing.md +++ b/docs/data_processing.md @@ -38,16 +38,13 @@ python -m tools.datasets.convert video ${ROOT_CLIPS} --output ${ROOT_META}/meta_ # 2.4 Get clips information and remove broken ones. This should output ${ROOT_META}/meta_clips_info_fmin1.csv python -m tools.datasets.datautil ${ROOT_META}/meta_clips.csv --info --fmin 1 -# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv +# 3.1 Predict aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference \ ${ROOT_META}/meta_clips_info_fmin1.csv \ --bs 1024 \ --num_workers 16 -# 3.2 Merge files; This should output ${ROOT_META}/meta_clips_info_fmin1_aes.csv -python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes_part*.csv --output ${ROOT_META}/meta_clips_info_fmin1_aes.csv - -# 3.3 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv +# 3.2 Filter by aesthetic scores. This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5.csv python -m tools.datasets.datautil ${ROOT_META}/meta_clips_info_fmin1_aes.csv --aesmin 5 # 4.1 Generate caption. 
This should output ${ROOT_META}/meta_clips_info_fmin1_aes_aesmin5_caption_part*.csv diff --git a/tools/datasets/convert.py b/tools/datasets/convert.py index 2c0db28..49acf72 100644 --- a/tools/datasets/convert.py +++ b/tools/datasets/convert.py @@ -94,20 +94,22 @@ def process_vidprom(root, info): def process_general_images(root, output): root = os.path.expanduser(root) - image_lists = get_filelist(root, IMG_EXTENSIONS) - df = pd.DataFrame(dict(path=image_lists)) - if output is None: - output = "images.csv" + path_list = get_filelist(root, IMG_EXTENSIONS) + fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list] + df = pd.DataFrame(dict(id=fname_list, path=path_list)) + + os.makedirs(os.path.dirname(output) or ".", exist_ok=True) df.to_csv(output, index=False) print(f"Saved {len(df)} samples to {output}.") def process_general_videos(root, output): root = os.path.expanduser(root) - video_lists = get_filelist(root, VID_EXTENSIONS) - df = pd.DataFrame(dict(path=video_lists)) - if output is None: - output = "videos.csv" + path_list = get_filelist(root, VID_EXTENSIONS) + fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list] + relpath_list = [os.path.relpath(x, root) for x in path_list] + df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list)) + os.makedirs(os.path.dirname(output) or ".", exist_ok=True) df.to_csv(output, index=False) print(f"Saved {len(df)} samples to {output}.") @@ -119,7 +121,7 @@ if __name__ == "__main__": parser.add_argument("root", type=str) parser.add_argument("--split", type=str, default="train") parser.add_argument("--info", type=str, default=None) - parser.add_argument("--output", type=str, default=None) + parser.add_argument("--output", type=str, required=True, help='Output path') args = parser.parse_args() if args.dataset == "imagenet": diff --git a/tools/datasets/datautil.py b/tools/datasets/datautil.py index d06b22f..b7d0b5f 100644 --- a/tools/datasets/datautil.py +++ 
b/tools/datasets/datautil.py @@ -54,7 +54,7 @@ def get_info(path): if ext in IMG_EXTENSIONS: im = cv2.imread(path) if im is None: - return 0, 0, 0, np.nan, np.nan + return 0, 0, 0, np.nan, np.nan, np.nan height, width = im.shape[:2] num_frames, fps = 1, np.nan else: @@ -415,8 +415,12 @@ def main(args): data_new = pd.read_csv(args.intersection) print(f"Intersection csv contains {len(data_new)} samples.") cols_to_use = data_new.columns.difference(data.columns) - cols_to_use = cols_to_use.insert(0, "path") - data = pd.merge(data, data_new[cols_to_use], on="path", how="inner") + + col_on = 'path' + # if 'id' in data.columns and 'id' in data_new.columns: + # col_on = 'id' + cols_to_use = cols_to_use.insert(0, col_on) + data = pd.merge(data, data_new[cols_to_use], on=col_on, how="inner") print(f"Intersection number of samples: {len(data)}.") # train columns @@ -484,6 +488,8 @@ def main(args): data["path"] = apply(data["path"], lambda x: os.path.relpath(x, args.relpath)) if args.abspath is not None: data["path"] = apply(data["path"], lambda x: os.path.join(args.abspath, x)) + if args.path_to_id: + data["id"] = apply(data["path"], lambda x: os.path.splitext(os.path.basename(x))[0]) if args.merge_cmotion: data["text"] = apply(data, lambda x: merge_cmotion(x["text"], x["cmotion"]), axis=1) if args.refine_llm_caption: @@ -581,6 +587,7 @@ def parse_args(): # path processing parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given") parser.add_argument("--abspath", type=str, default=None, help="modify the path to absolute path by root given") + parser.add_argument("--path-to-id", action='store_true', help="add id based on path") # caption filtering parser.add_argument( diff --git a/tools/scene_cut/cut.py b/tools/scene_cut/cut.py index 0e02aed..0b9c9e6 100644 --- a/tools/scene_cut/cut.py +++ b/tools/scene_cut/cut.py @@ -50,10 +50,10 @@ def split_video( video_path, scene_list, save_dir, - min_seconds=2.0, - max_seconds=15.0, + 
min_seconds=2, + max_seconds=15, target_fps=30, - shorter_size=720, + shorter_size=None, verbose=False, logger=None, ): @@ -134,8 +134,8 @@ def parse_args(): help='if not None, clip shorter than min_seconds is ignored') parser.add_argument("--max_seconds", type=float, default=None, help='if not None, clip longer than max_seconds is truncated') - parser.add_argument("--target_fps", type=int, default=30, help='target fps of clips') - parser.add_argument("--shorter_size", type=int, default=720, help='resize the shorter size by keeping ratio') + parser.add_argument("--target_fps", type=int, default=None, help='target fps of clips') + parser.add_argument("--shorter_size", type=int, default=None, help='resize the shorter size by keeping ratio') parser.add_argument("--num_workers", type=int, default=None, help='#workers for pandarallel') args = parser.parse_args() diff --git a/tools/scoring/optical_flow/inference.py b/tools/scoring/optical_flow/inference.py index 170b076..499ff06 100644 --- a/tools/scoring/optical_flow/inference.py +++ b/tools/scoring/optical_flow/inference.py @@ -1,7 +1,7 @@ import argparse import os +from datetime import timedelta -import colossalai import numpy as np import pandas as pd import torch @@ -13,25 +13,27 @@ from torchvision.transforms.functional import pil_to_tensor from tqdm import tqdm from tools.datasets.utils import extract_frames - from .unimatch import UniMatch -def merge_scores(gathered_list: list, meta: pd.DataFrame): +def merge_scores(gathered_list: list, meta: pd.DataFrame, column): # reorder indices_list = list(map(lambda x: x[0], gathered_list)) - flow_scores_list = list(map(lambda x: x[1], gathered_list)) + scores_list = list(map(lambda x: x[1], gathered_list)) + flat_indices = [] for x in zip(*indices_list): flat_indices.extend(x) - flat_flow_scores = [] - for x in zip(*flow_scores_list): - flat_flow_scores.extend(x) + flat_scores = [] + for x in zip(*scores_list): + flat_scores.extend(x) flat_indices = np.array(flat_indices) 
- flat_flow_scores = np.array(flat_flow_scores) + flat_scores = np.array(flat_scores) + # filter duplicates unique_indices, unique_indices_idx = np.unique(flat_indices, return_index=True) - meta.loc[unique_indices, "flow"] = flat_flow_scores[unique_indices_idx] + meta.loc[unique_indices, column] = flat_scores[unique_indices_idx] + return meta class VideoTextDataset(torch.utils.data.Dataset): @@ -61,17 +63,19 @@ class VideoTextDataset(torch.utils.data.Dataset): def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("meta_path", type=str, help="Path to the input CSV file") - parser.add_argument("--bs", type=int, default=4, help="Batch size") + parser.add_argument("--bs", type=int, default=4, help="Batch size") # don't use too large bs for unimatch parser.add_argument("--num_workers", type=int, default=16, help="Number of workers") args = parser.parse_args() return args def main(): + args = parse_args() + torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - colossalai.launch_from_torch({}) - args = parse_args() + dist.init_process_group(backend="nccl", timeout=timedelta(hours=24)) + torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count()) meta_path = args.meta_path wo_ext, ext = os.path.splitext(meta_path) @@ -88,11 +92,10 @@ def main(): num_transformer_layers=6, reg_refine=True, task="flow", - ).eval() + ) ckpt = torch.load("./pretrained_models/unimatch/gmflow-scale2-regrefine6-mixdata-train320x576-4e7b215d.pth") model.load_state_dict(ckpt["model"]) model = model.to(device) - # model = torch.nn.DataParallel(model) # build dataset dataset = VideoTextDataset(meta_path=meta_path, frame_inds=[0, 10, 20, 30]) @@ -110,9 +113,9 @@ def main(): ) # compute optical flow scores - dataset.meta["flow"] = np.nan indices_list = [] - flow_scores_list = [] + scores_list = [] + model.eval() for images, indices in tqdm(dataloader, disable=dist.get_rank() != 0): images = images.to(device) B = images.shape[0] @@ -138,13 +141,13 @@ 
def main(): flow_scores = flow_scores.tolist() indices_list.extend(indices) - flow_scores_list.extend(flow_scores) + scores_list.extend(flow_scores) gathered_list = [None] * dist.get_world_size() - dist.all_gather_object(gathered_list, (indices_list, flow_scores_list)) + dist.all_gather_object(gathered_list, (indices_list, scores_list)) if dist.get_rank() == 0: - merge_scores(gathered_list, dataset.meta) - dataset.meta.to_csv(out_path, index=False) + meta_new = merge_scores(gathered_list, dataset.meta, column='flow') + meta_new.to_csv(out_path, index=False) print(f"New meta with optical flow scores saved to '{out_path}'.")