From 0dfea90a893ce7a73494269396fef2ba5f71bc19 Mon Sep 17 00:00:00 2001 From: xyupeng <99191637+xyupeng@users.noreply.github.com> Date: Sat, 6 Apr 2024 16:33:06 +0800 Subject: [PATCH] Dev/pxy (#36) update scoring & scene_cut --- tools/scene_cut/README.md | 26 ++ tools/{splitting => scene_cut}/__init__.py | 0 tools/scene_cut/main_cut_multi_thread.py | 168 +++++++++++++ tools/scene_cut/main_cut_pandarallel.py | 163 ++++++++++++ tools/scene_cut/process_meta.py | 266 ++++++++++++++++++++ tools/scene_cut/scene_detect.py | 89 +++++++ tools/scene_cut/utils_video.py | 97 +++++++ tools/scoring/matching/inference.py | 16 +- tools/splitting/README.md | 11 - tools/splitting/scenedetect/__init__.py | 0 tools/splitting/scenedetect/scene_detect.py | 138 ---------- tools/splitting/scenedetect/utils.py | 145 ----------- 12 files changed, 823 insertions(+), 296 deletions(-) create mode 100644 tools/scene_cut/README.md rename tools/{splitting => scene_cut}/__init__.py (100%) create mode 100644 tools/scene_cut/main_cut_multi_thread.py create mode 100644 tools/scene_cut/main_cut_pandarallel.py create mode 100644 tools/scene_cut/process_meta.py create mode 100644 tools/scene_cut/scene_detect.py create mode 100644 tools/scene_cut/utils_video.py delete mode 100644 tools/splitting/README.md delete mode 100644 tools/splitting/scenedetect/__init__.py delete mode 100644 tools/splitting/scenedetect/scene_detect.py delete mode 100644 tools/splitting/scenedetect/utils.py diff --git a/tools/scene_cut/README.md b/tools/scene_cut/README.md new file mode 100644 index 0000000..9376af4 --- /dev/null +++ b/tools/scene_cut/README.md @@ -0,0 +1,26 @@ +## Scene Detection and Video Splitting + +### Formatting +Input meta should be `{prefix}.csv` with column `'videoId'` +``` +python tools/scene_cut/process_meta.py --task append_format --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6.csv --split popular_6 +``` +Output is `{prefix}_format.csv` (with column `path`) and `{prefix}_intact.csv` (with column `intact` and `path`) + +### Scene Detection +Input meta should be `{prefix}_format.csv` +``` +python tools/scene_cut/scene_detect.py --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format.csv +``` +Output is `{prefix}_format_timestamp.csv` + +### Video Splitting +Input meta should be `{prefix}_timestamp.csv` +``` +python tools/scene_cut/main_cut_pandarallel.py \ + --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format_timestamp.csv \ + --out_dir /mnt/hdd/data/pexels_new/scene_cut/data/popular_6 +``` +Output is `{out_dir}/{wo_ext}_scene-{sid}.mp4` + +TODO: meta for video clips diff --git a/tools/splitting/__init__.py b/tools/scene_cut/__init__.py similarity index 100% rename from tools/splitting/__init__.py rename to tools/scene_cut/__init__.py diff --git a/tools/scene_cut/main_cut_multi_thread.py b/tools/scene_cut/main_cut_multi_thread.py new file mode 100644 index 0000000..45fd2aa --- /dev/null +++ b/tools/scene_cut/main_cut_multi_thread.py @@ -0,0 +1,168 @@ +import os +import argparse +import time +import subprocess +from tqdm import tqdm + +import pandas as pd +from scenedetect import FrameTimecode +from imageio_ffmpeg import get_ffmpeg_exe +from concurrent.futures import ThreadPoolExecutor, as_completed + +from mmengine.logging import MMLogger, print_log +from utils_video import is_intact_video, iterate_files, clone_folder_structure + + +def single_process(row, save_dir, logger=None): + # video_id = row['videoID'] + # video_path = os.path.join(root_src, f'{video_id}.mp4') + video_path = row['path'] + + # check mp4 integrity + # if not is_intact_video(video_path, logger=logger): + # return False + + timestamp = row['timestamp'] + if not (timestamp.startswith('[') and timestamp.endswith(']')): + return False + scene_list = eval(timestamp) + scene_list = [ + (FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) + for s, t in scene_list + ] + split_video(video_path, scene_list, save_dir=save_dir, logger=logger) + return True + + +def split_video( + video_path, + scene_list, + save_dir, + min_seconds=None, + max_seconds=None, + target_fps=30, + shorter_size=512, + verbose=False, + logger=None, +): + """ + scenes shorter than min_seconds will be ignored; + scenes longer than max_seconds will be cut to save the beginning max_seconds. + Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4 + + Args: + scene_list (List[Tuple[FrameTimecode, FrameTimecode]]): each element is (s, t): start and end of a scene. + min_seconds (float | None) + max_seconds (float | None) + target_fps (int | None) + shorter_size (int | None) + """ + FFMPEG_PATH = get_ffmpeg_exe() + + save_path_list = [] + for idx, scene in enumerate(scene_list): + s, t = scene # FrameTimecode + if min_seconds is not None: + if (t - s).get_seconds() < min_seconds: + continue + + duration = t - s + if max_seconds is not None: + fps = s.framerate + max_duration = FrameTimecode(timecode="00:00:00", fps=fps) + max_duration.frame_num = round(fps * max_seconds) + duration = min(max_duration, duration) + + # save path + fname = os.path.basename(video_path) + fname_wo_ext = os.path.splitext(fname)[0] + # TODO: fname pattern + save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4") + + # ffmpeg cmd + cmd = [FFMPEG_PATH] + + # Only show ffmpeg output for the first call, which will display any + # errors if it fails, and then break the loop. We only show error messages + # for the remaining calls. + # cmd += ['-v', 'error'] + + # input path + # cmd += ["-i", video_path] + + # clip to cut + cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())] + # cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-t", str(duration.get_seconds())] + + # target fps + # cmd += ['-vf', 'select=mod(n\,2)'] + if target_fps is not None: + cmd += ["-r", f"{target_fps}"] + + # aspect ratio + if shorter_size is not None: + cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"] + # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"] + + cmd += ["-map", "0", save_path] + + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout, stderr = proc.communicate() + if verbose: + stdout = stdout.decode("utf-8") + print_log(stdout, logger=logger) + + save_path_list.append(video_path) + print_log(f"Video clip saved to '{save_path}'", logger=logger) + + return save_path_list + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--root', default='F:/Panda-70M/') + parser.add_argument('--split', default='test') + parser.add_argument('--num_workers', default=5, type=int) + + args = parser.parse_args() + return args + + +def main(): + # args = parse_args() + # root = args.root + # split = args.split + + root = 'F:/Panda-70M/' + root, split = 'F:/pexels_new/', 'popular_2' + meta_path = os.path.join(root, f'raw/meta/{split}_format_timestamp.csv') + root_dst = os.path.join(root, f'scene_cut/data/{split}') + + folder_dst = root_dst + # folder_src = os.path.join(root_src, f'data/{split}') + # folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src)) + os.makedirs(folder_dst, exist_ok=True) + + meta = pd.read_csv(meta_path) + + # create logger + # folder_path_log = os.path.dirname(root_dst) + # log_name = os.path.basename(root_dst) + # timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time())) + # log_path = os.path.join(folder_path_log, f"{log_name}_{timestamp}.log") + # logger = MMLogger.get_instance(log_name, log_file=log_path) + logger = None + + tasks = [] + pool = ThreadPoolExecutor(max_workers=1) + for idx, row in meta.iterrows(): + task = pool.submit(single_process, row, folder_dst, logger) + tasks.append(task) + + for task in tqdm(as_completed(tasks), total=len(meta)): + task.result() + pool.shutdown() + + +if __name__ == '__main__': + main() diff --git a/tools/scene_cut/main_cut_pandarallel.py b/tools/scene_cut/main_cut_pandarallel.py new file mode 100644 index 0000000..e011254 --- /dev/null +++ b/tools/scene_cut/main_cut_pandarallel.py @@ -0,0 +1,163 @@ +import os +import argparse +import time +import subprocess +from tqdm import tqdm + +import pandas as pd +from scenedetect import FrameTimecode +from functools import partial +from pandarallel import pandarallel +from imageio_ffmpeg import get_ffmpeg_exe + +from mmengine.logging import MMLogger, print_log +from utils_video import is_intact_video, iterate_files, clone_folder_structure + + +def process_single_row(row, save_dir, log_name=None): + video_path = row['path'] + + logger = None + if log_name is not None: + logger = MMLogger.get_instance(log_name) + + # check mp4 integrity + # if not is_intact_video(video_path, logger=logger): + # return False + + timestamp = row['timestamp'] + if not (timestamp.startswith('[') and timestamp.endswith(']')): + return False + scene_list = eval(timestamp) + scene_list = [ + (FrameTimecode(s, fps=1), FrameTimecode(t, fps=1)) + for s, t in scene_list + ] + split_video(video_path, scene_list, save_dir=save_dir, + min_seconds=2, max_seconds=15, shorter_size=720, + logger=logger) + + +def split_video( + video_path, + scene_list, + save_dir, + min_seconds=None, + max_seconds=None, + target_fps=30, + shorter_size=512, + verbose=False, + logger=None, +): + """ + scenes shorter than min_seconds will be ignored; + scenes longer than max_seconds will be cut to save the beginning max_seconds. + Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4 + + Args: + scene_list (List[Tuple[FrameTimecode, FrameTimecode]]): each element is (s, t): start and end of a scene. + min_seconds (float | None) + max_seconds (float | None) + target_fps (int | None) + shorter_size (int | None) + """ + FFMPEG_PATH = get_ffmpeg_exe() + + save_path_list = [] + for idx, scene in enumerate(scene_list): + s, t = scene # FrameTimecode + if min_seconds is not None: + if (t - s).get_seconds() < min_seconds: + continue + + duration = t - s + if max_seconds is not None: + fps = s.framerate + max_duration = FrameTimecode(timecode="00:00:00", fps=fps) + max_duration.frame_num = round(fps * max_seconds) + duration = min(max_duration, duration) + + # save path + fname = os.path.basename(video_path) + fname_wo_ext = os.path.splitext(fname)[0] + # TODO: fname pattern + save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4") + + # ffmpeg cmd + cmd = [FFMPEG_PATH] + + # Only show ffmpeg output for the first call, which will display any + # errors if it fails, and then break the loop. We only show error messages + # for the remaining calls. + # cmd += ['-v', 'error'] + + # -ss after -i is very slow; put -ss before -i + # input path + # cmd += ["-i", video_path] + + # clip to cut + # cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-t", str(duration.get_seconds())] + + # clip to cut + cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-i", video_path, "-t", str(duration.get_seconds())] + + # target fps + # cmd += ['-vf', 'select=mod(n\,2)'] + if target_fps is not None: + cmd += ["-r", f"{target_fps}"] + + # aspect ratio + if shorter_size is not None: + cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"] + # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"] + + cmd += ["-map", "0", save_path] + + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + stdout, stderr = proc.communicate() + # stdout = stdout.decode("utf-8") + # print_log(stdout, logger=logger) + + save_path_list.append(video_path) + if verbose: + print_log(f"Video clip saved to '{save_path}'", logger=logger) + + return save_path_list + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--meta_path', default='./data/pexels_new/raw/meta/popular_5_format_timestamp.csv') + parser.add_argument('--out_dir', default='./data/pexels_new/scene_cut/data/popular_5') + parser.add_argument('--num_workers', default=5, type=int) + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + meta_path = args.meta_path + out_dir = args.out_dir + + assert os.path.basename(os.path.dirname(out_dir)) == 'data' + + os.makedirs(out_dir, exist_ok=True) + + meta = pd.read_csv(meta_path) + + # create logger + log_dir = os.path.dirname(out_dir) + log_name = os.path.basename(out_dir) + timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time())) + log_path = os.path.join(log_dir, f"{log_name}_{timestamp}.log") + logger = MMLogger.get_instance(log_name, log_file=log_path) + # logger = None + + pandarallel.initialize(progress_bar=True) + process_single_row_partial = partial(process_single_row, save_dir=out_dir, log_name=log_name) + meta.parallel_apply(process_single_row_partial, axis=1) + + +if __name__ == '__main__': + main() diff --git a/tools/scene_cut/process_meta.py b/tools/scene_cut/process_meta.py new file mode 100644 index 0000000..0feffe9 --- /dev/null +++ b/tools/scene_cut/process_meta.py @@ -0,0 +1,266 @@ +""" +1. format_raw_meta() + - only keep intact videos + - add 'path' column (abs path) +2. create_meta_for_folder() +""" + +import os +# os.chdir('../..') +print(f'Current working directory: {os.getcwd()}') + +import argparse +import json +import subprocess +import pandas as pd +from tqdm import tqdm +import pickle as pkl +from pandarallel import pandarallel +from functools import partial +import numpy as np + +from utils_video import is_intact_video + + +def has_downloaded_success(json_path): + if not os.path.exists(json_path): + return False + + try: + with open(json_path, 'r') as f: + data = json.load(f) + if 'success' not in data or isinstance(data['success'], bool) is False or data['success'] is False: + return False + except Exception as e: + return False + + return True + + +def split_meta_csv(chunk_size=60000): + """ + Split csv into multiple small csv in order + """ + root = './data/Panda-70M' + # meta_name = 'meta/panda70m_training_full.csv' + meta_name = 'meta/panda70m_training_10m.csv' + # meta_name = 'meta/training_10m/train_0.csv' + meta_path = os.path.join(root, meta_name) + + df = pd.read_csv(meta_path) + num_rows = len(df) + + # Split the DataFrame into smaller DataFrames + for idx, i in enumerate(range(0, num_rows, chunk_size)): + df_i = df.iloc[i:i + chunk_size] + out_path = os.path.join(root, f'meta/train_{idx}.csv') + df_i.to_csv(out_path, index=False) + + # If there are remaining rows + if num_rows > chunk_size and num_rows % chunk_size != 0: + df_last = df.iloc[-(num_rows % chunk_size):] + out_path = os.path.join(root, f'meta/train_{idx + 1}.csv') + df_last.to_csv(out_path, index=False) + + +def remove_index(): + df = pd.read_csv('your_file.csv', index_col=0) + df.to_csv('your_file_without_index.csv', index=False) + + +def append_format(meta_path, mode='.mp4'): + """ + Append _format to csv file: + - filter broken videos; only intact videos are kept + - add column 'path' + + input csv should satisfy: + - name should be: {split}.csv + - contain column 'videoID'/'videoId' + """ + # meta_path = os.path.join(root, f'raw/meta/{split}.csv') + meta_dirname = os.path.dirname(meta_path) + assert meta_dirname.endswith('raw/meta') + root_raw = os.path.dirname(meta_dirname) + + meta_fname = os.path.basename(meta_path) + split, ext = os.path.splitext(meta_fname) + + meta = pd.read_csv(meta_path) + + path_list = [] + new_meta = [] + for idx, row in tqdm(meta.iterrows(), total=len(meta)): + # video_id = row['videoID'] # panda + video_id = row['videoId'] # pexels_new + video_path = os.path.join(root_raw, f'data/{split}/{video_id}.mp4') + if mode == '.mp4': + if not is_intact_video(video_path): + continue + elif mode == '.json': + json_path = os.path.join(root_raw, f'data/{split}/{video_id}.json') + if not has_downloaded_success(json_path): + continue + else: + raise ValueError + + new_meta.append(row) + path_list.append(video_path) + + new_meta = pd.DataFrame(new_meta) + new_meta['path'] = path_list + + out_path = os.path.join(root_raw, f'meta/{split}_format.csv') + new_meta.to_csv(out_path, index=False) + print(f'New meta (shape={new_meta.shape}) saved to \'{out_path}\'') + + +def append_format_pandarallel(meta_path, split, mode='.mp4'): + """ + Append _format to csv file: + - filter broken videos; only intact videos are kept + - add column 'path' + + input csv should satisfy: + - name should be: {split}.csv + - contain column 'videoID'/'videoId' + """ + # meta_path = os.path.join(root, f'raw/meta/{split}.csv') + meta_dirname = os.path.dirname(meta_path) + assert meta_dirname.endswith('raw/meta') + root_raw = os.path.dirname(meta_dirname) + + meta_fname = os.path.basename(meta_path) + wo_ext, ext = os.path.splitext(meta_fname) + + meta = pd.read_csv(meta_path) + + def is_intact(row, mode='.json'): + video_id = row['videoId'] # pexels_new + video_path = os.path.join(root_raw, f'data/{split}/{video_id}.mp4') + row['path'] = video_path + if mode == '.mp4': + if is_intact_video(video_path): + return True, video_path + return False, video_path + elif mode == '.json': + json_path = os.path.join(root_raw, f'data/{split}/{video_id}.json') + if has_downloaded_success(json_path): + return True, video_path + return False, video_path + else: + raise ValueError + + pandarallel.initialize(progress_bar=True) + is_intact_partial = partial(is_intact, mode=mode) + ret = meta.parallel_apply(is_intact_partial, axis=1) + + intact, paths = list(zip(*ret)) + + meta['intact'] = intact + meta['path'] = paths + out_path = os.path.join(root_raw, f'meta/{wo_ext}_intact.csv') + meta.to_csv(out_path, index=False) + print(f'New meta (shape={meta.shape}) with intact info saved to \'{out_path}\'') + + # meta_format = meta[meta['intact']] + meta_format = meta[np.array(intact)] + meta_format.drop('intact', axis=1, inplace=True) + out_path = os.path.join(root_raw, f'meta/{wo_ext}_format.csv') + meta_format.to_csv(out_path, index=False) + print(f'New meta (shape={meta_format.shape}) with format info saved to \'{out_path}\'') + + +def create_subset(meta_path): + meta = pd.read_csv(meta_path) + meta_subset = meta.iloc[:100] + + wo_ext, ext = os.path.splitext(meta_path) + out_path = f'{wo_ext}_head-100{ext}' + meta_subset.to_csv(out_path, index=False) + print(f'New meta (shape={meta_subset.shape}) saved to \'{out_path}\'') + + +def append_cut(root='./data/Panda-70M'): + """ + Append _cut to csv file + input csv should satisfy: + - name_should be {split}_intact.csv + - contain column 'timestamp': list of timestamp + """ + split = 'test' + meta_path = os.path.join(root, f'processed/meta/{split}_intact.csv') + + wo_ext, ext = os.path.splitext(meta_path) + suffix = 'cut' + out_path = f'{wo_ext}_{suffix}{ext}' + + meta = pd.read_csv(meta_path) + + new_meta = [] + for idx, row in tqdm(meta.iterrows(), total=len(meta)): + video_id = row['videoID'] + timestamps = eval(row['timestamp']) + captions = eval(row['caption']) + scores = eval(row['matching_score']) + + num_clips = len(timestamps) + for idx_c in range(num_clips): + path_i = os.path.join(root, f'processed/{split}/{video_id}_scene-{idx_c}.mp4') + # if not is_intact_video(path_i): + # continue + + row_i = [f'{video_id}_scene-{idx_c}', path_i, timestamps[idx_c], captions[idx_c], scores[idx_c]] + + new_meta.append(row_i) + + columns = ['videoID', 'path', 'timestamp', 'text', 'match_official'] + new_meta = pd.DataFrame(new_meta, columns=columns) + + new_meta.to_csv(out_path, index=False) + print(f'New meta (shape={new_meta.shape}) saved to \'{out_path}\'') + + +def debug_meta_topk(): + meta_path = 'F:/Panda-70M/meta/test_intact_cut_flow.csv' + meta = pd.read_csv(meta_path) + + score_column = 'flow_score' + topk = meta.nlargest(10, columns=score_column) + topk_s = meta.nsmallest(200, columns=score_column) + + topk_list = [(row['path'], row['caption'], row[score_column]) for idx, row in topk.iterrows()] + topk_s_list = [(row['path'], row['caption'], row[score_column]) for idx, row in topk_s.iterrows()] + + x = 0 + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--task', default='append_format') + parser.add_argument('--meta_path', default='./data/pexels_new/raw/meta/popular_1.csv') + parser.add_argument('--split', default='popular_5') + parser.add_argument('--num_workers', default=5, type=int) + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + # split_meta_csv() + + args = parse_args() + meta_path = args.meta_path + task = args.task + + if task == 'append_format': + # append_format(meta_path=meta_path, mode='.mp4') + append_format_pandarallel(meta_path=meta_path, split=args.split, mode='.json') + elif task == 'create_subset': + create_subset(meta_path=meta_path) + else: + raise ValueError + + # append_cut(root=root) + # append_score(root=root) + # debug_meta_topk() diff --git a/tools/scene_cut/scene_detect.py b/tools/scene_cut/scene_detect.py new file mode 100644 index 0000000..7a952cb --- /dev/null +++ b/tools/scene_cut/scene_detect.py @@ -0,0 +1,89 @@ +import argparse +import os +import numpy as np +from tqdm import tqdm +import pandas as pd +from functools import partial +from pandarallel import pandarallel +from scenedetect import detect, ContentDetector, AdaptiveDetector, FrameTimecode + + +def process_single_row(row): + # windows + # from scenedetect import detect, ContentDetector, AdaptiveDetector + + video_path = row['path'] + + detector = AdaptiveDetector( + adaptive_threshold=3.0, + # luma_only=True, + ) + # detector = ContentDetector() + # TODO: catch error here + try: + scene_list = detect(video_path, detector, start_in_scene=True) + timestamp = [(s.get_timecode(), t.get_timecode()) for s, t in scene_list] + return True, str(timestamp) + except Exception as e: + print(f'Video \'{video_path}\' with error {e}') + return False, '' + + +def main(): + meta_path = 'F:/pexels_new/raw/meta/popular_1_format.csv' + meta = pd.read_csv(meta_path) + + timestamp_list = [] + for idx, row in tqdm(meta.iterrows()): + video_path = row['path'] + + detector = AdaptiveDetector( + adaptive_threshold=1.5, + luma_only=True, + ) + # detector = ContentDetector() + scene_list = detect(video_path, detector, start_in_scene=True) + + timestamp = [(s.get_timecode(), t.get_timecode()) for s, t in scene_list] + timestamp_list.append(timestamp) + + meta['timestamp'] = timestamp_list + + wo_ext, ext = os.path.splitext(meta_path) + out_path = f"{wo_ext}_timestamp{ext}" + meta.to_csv(out_path, index=False) + print(f"New meta with timestamp saved to '{out_path}'.") + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--meta_path', default='F:/pexels_new/raw/meta/popular_1_format.csv') + parser.add_argument('--num_workers', default=5, type=int) + + args = parser.parse_args() + return args + + +def main_pandarallel(): + args = parse_args() + meta_path = args.meta_path + + # meta_path = 'F:/pexels_new/raw/meta/popular_1_format.csv' + meta = pd.read_csv(meta_path) + + pandarallel.initialize(progress_bar=True) + ret = meta.parallel_apply(process_single_row, axis=1) + + succ, timestamps = list(zip(*ret)) + + meta['timestamp'] = timestamps + meta = meta[np.array(succ)] + + wo_ext, ext = os.path.splitext(meta_path) + out_path = f"{wo_ext}_timestamp{ext}" + meta.to_csv(out_path, index=False) + print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.") + + +if __name__ == '__main__': + main_pandarallel() diff --git a/tools/scene_cut/utils_video.py b/tools/scene_cut/utils_video.py new file mode 100644 index 0000000..2f02e8f --- /dev/null +++ b/tools/scene_cut/utils_video.py @@ -0,0 +1,97 @@ +import os +import cv2 +from mmengine.logging import print_log +from moviepy.editor import VideoFileClip + + +def iterate_files(folder_path): + for root, dirs, files in os.walk(folder_path): + # root contains the current directory path + # dirs contains the list of subdirectories in the current directory + # files contains the list of files in the current directory + + # Process files in the current directory + for file in files: + file_path = os.path.join(root, file) + # print("File:", file_path) + yield file_path + + # Process subdirectories and recursively call the function + for subdir in dirs: + subdir_path = os.path.join(root, subdir) + # print("Subdirectory:", subdir_path) + iterate_files(subdir_path) + + +def iterate_folders(folder_path): + for root, dirs, files in os.walk(folder_path): + for subdir in dirs: + subdir_path = os.path.join(root, subdir) + yield subdir_path + # print("Subdirectory:", subdir_path) + iterate_folders(subdir_path) + + +def clone_folder_structure(root_src, root_dst, verbose=False): + src_path_list = iterate_folders(root_src) + src_relpath_list = [os.path.relpath(x, root_src) for x in src_path_list] + + os.makedirs(root_dst, exist_ok=True) + dst_path_list = [os.path.join(root_dst, x) for x in src_relpath_list] + for folder_path in dst_path_list: + os.makedirs(folder_path, exist_ok=True) + if verbose: + print(f"Create folder: '{folder_path}'") + + +def is_intact_video(video_path, mode='moviepy', verbose=False, logger=None): + if not os.path.exists(video_path): + if verbose: + print_log(f"Could not find '{video_path}'", logger=logger) + return False + + if mode == 'moviepy': + try: + VideoFileClip(video_path) + if verbose: + print_log(f"The video file '{video_path}' is intact.", logger=logger) + return True + except Exception as e: + if verbose: + print_log(f"Error: {e}", logger=logger) + print_log(f"The video file '{video_path}' is not intact.", logger=logger) + return False + elif mode == 'cv2': + try: + cap = cv2.VideoCapture(video_path) + if cap.isOpened(): + if verbose: + print_log(f"The video file '{video_path}' is intact.", logger=logger) + return True + except Exception as e: + if verbose: + print_log(f"Error: {e}", logger=logger) + print_log(f"The video file '{video_path}' is not intact.", logger=logger) + return False + else: + raise ValueError + + +def count_frames(video_path, logger=None): + cap = cv2.VideoCapture(video_path) + + if not cap.isOpened(): + print_log(f"Error: Could not open video file '{video_path}'", logger=logger) + return + + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + print_log(f"Total frames in the video '{video_path}': {total_frames}", logger=logger) + + cap.release() + + +def count_files(root, suffix=".mp4"): + files_list = iterate_files(root) + cnt = len([x for x in files_list if x.endswith(suffix)]) + return cnt + diff --git a/tools/scoring/matching/inference.py b/tools/scoring/matching/inference.py index a332583..e7609cd 100644 --- a/tools/scoring/matching/inference.py +++ b/tools/scoring/matching/inference.py @@ -8,6 +8,10 @@ import pandas as pd import torch import torch.nn.functional as F from torchvision.datasets.folder import pil_loader +<<<<<<< HEAD + +======= +>>>>>>> dev/v1.0.1 from tqdm import tqdm IMG_EXTENSIONS = ( @@ -24,6 +28,15 @@ IMG_EXTENSIONS = ( VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv") +def is_video(filename): + ext = os.path.splitext(filename)[-1].lower() + return ext in VID_EXTENSIONS + + +IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp") +VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv") + + def is_video(filename): ext = os.path.splitext(filename)[-1].lower() return ext in VID_EXTENSIONS @@ -52,13 +65,12 @@ class VideoTextDataset(torch.utils.data.Dataset): def __getitem__(self, index): row = self.meta.iloc[index] - path = row["path"] + path = row['path'] if is_video(path): img = extract_frames(path, points=[0.5])[0] else: img = pil_loader(path) - img = self.transform(img) text = row["text"] diff --git a/tools/splitting/README.md b/tools/splitting/README.md deleted file mode 100644 index 160daa3..0000000 --- a/tools/splitting/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Scene Detection and Video Split - -Raw videos from the Internet may be too long for training. -Thus, we detect scenes in raw videos and split them into short clips based on the scenes. -First prepare the video processing packages. - -```bash -pip install scenedetect moviepy opencv-python -``` - -Then run `scene_detect.py`. We provide efficient processing using `multiprocessing`. Don't forget to specify your own dataset path. diff --git a/tools/splitting/scenedetect/__init__.py b/tools/splitting/scenedetect/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tools/splitting/scenedetect/scene_detect.py b/tools/splitting/scenedetect/scene_detect.py deleted file mode 100644 index c46e59d..0000000 --- a/tools/splitting/scenedetect/scene_detect.py +++ /dev/null @@ -1,138 +0,0 @@ -import os -from multiprocessing import Pool - -from mmengine.logging import MMLogger -from scenedetect import ContentDetector, detect -from tqdm import tqdm - -from opensora.utils.misc import get_timestamp - -from .utils import check_mp4_integrity, clone_folder_structure, iterate_files, split_video - -# config -target_fps = 30 # int -shorter_size = 512 # int -min_seconds = 1 # float -max_seconds = 5 # float -assert max_seconds > min_seconds -cfg = dict( - target_fps=target_fps, - min_seconds=min_seconds, - max_seconds=max_seconds, - shorter_size=shorter_size, -) - - -def process_folder(root_src, root_dst): - # create logger - folder_path_log = os.path.dirname(root_dst) - log_name = os.path.basename(root_dst) - timestamp = get_timestamp() - log_path = os.path.join(folder_path_log, f"{log_name}_{timestamp}.log") - logger = MMLogger.get_instance(log_name, log_file=log_path) - - # clone folder structure - clone_folder_structure(root_src, root_dst) - - # all source videos - mp4_list = [x for x in iterate_files(root_src) if x.endswith(".mp4")] - mp4_list = sorted(mp4_list) - - for idx, sample_path in tqdm(enumerate(mp4_list)): - folder_src = os.path.dirname(sample_path) - folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src)) - - # check src video integrity - if not check_mp4_integrity(sample_path, logger=logger): - continue - - # detect scenes - scene_list = detect(sample_path, ContentDetector(), start_in_scene=True) - - # split scenes - save_path_list = split_video(sample_path, scene_list, save_dir=folder_dst, **cfg, logger=logger) - - # check integrity of generated clips - for x in save_path_list: - check_mp4_integrity(x, logger=logger) - - -def scene_detect(): - """detect & cut scenes using a single process - Expected dataset structure: - data/ - your_dataset/ - raw_videos/ - xxx.mp4 - yyy.mp4 - - This function results in: - data/ - your_dataset/ - raw_videos/ - xxx.mp4 - yyy.mp4 - zzz.mp4 - clips/ - xxx_scene-0.mp4 - yyy_scene-0.mp4 - yyy_scene-1.mp4 - """ - # TODO: specify your dataset root - root_src = f"./data/your_dataset/raw_videos" - root_dst = f"./data/your_dataset/clips" - - process_folder(root_src, root_dst) - - -def scene_detect_mp(): - """detect & cut scenes using multiple processes - Expected dataset structure: - data/ - your_dataset/ - raw_videos/ - split_0/ - xxx.mp4 - yyy.mp4 - split_1/ - xxx.mp4 - yyy.mp4 - - This function results in: - data/ - your_dataset/ - raw_videos/ - split_0/ - xxx.mp4 - yyy.mp4 - split_1/ - xxx.mp4 - yyy.mp4 - clips/ - split_0/ - xxx_scene-0.mp4 - yyy_scene-0.mp4 - split_1/ - xxx_scene-0.mp4 - yyy_scene-0.mp4 - yyy_scene-1.mp4 - """ - # TODO: specify your dataset root - root_src = f"./data/your_dataset/raw_videos" - root_dst = f"./data/your_dataset/clips" - - # TODO: specify your splits - splits = ["split_0", "split_1"] - - # process folders - root_src_list = [os.path.join(root_src, x) for x in splits] - root_dst_list = [os.path.join(root_dst, x) for x in splits] - - with Pool(processes=len(splits)) as pool: - pool.starmap(process_folder, list(zip(root_src_list, root_dst_list))) - - -if __name__ == "__main__": - # TODO: choose single process or multiprocessing - scene_detect() - # scene_detect_mp() diff --git a/tools/splitting/scenedetect/utils.py b/tools/splitting/scenedetect/utils.py deleted file mode 100644 index 19eae31..0000000 --- a/tools/splitting/scenedetect/utils.py +++ /dev/null @@ -1,145 +0,0 @@ -import os -import subprocess - -import cv2 -from imageio_ffmpeg import get_ffmpeg_exe -from mmengine.logging import print_log -from moviepy.editor import VideoFileClip -from scenedetect import FrameTimecode - - -def iterate_files(folder_path): - for root, dirs, files in os.walk(folder_path): - # root contains the current directory path - # dirs contains the list of subdirectories in the current directory - # files contains the list of files in the current directory - - # Process files in the current directory - for file in files: - file_path = os.path.join(root, file) - # print("File:", file_path) - yield file_path - - # Process subdirectories and recursively call the function - for subdir in dirs: - subdir_path = os.path.join(root, subdir) - # print("Subdirectory:", subdir_path) - iterate_files(subdir_path) - - -def iterate_folders(folder_path): - for root, dirs, files in os.walk(folder_path): - for subdir in dirs: - subdir_path = os.path.join(root, subdir) - yield subdir_path - # print("Subdirectory:", subdir_path) - iterate_folders(subdir_path) - - -def clone_folder_structure(root_src, root_dst, verbose=False): - src_path_list = iterate_folders(root_src) - src_relpath_list = [os.path.relpath(x, root_src) for x in src_path_list] - - os.makedirs(root_dst, exist_ok=True) - dst_path_list = [os.path.join(root_dst, x) for x in src_relpath_list] - for folder_path in dst_path_list: - os.makedirs(folder_path, exist_ok=True) - if verbose: - print(f"Create folder: '{folder_path}'") - - -def count_files(root, suffix=".mp4"): - files_list = iterate_files(root) - cnt = len([x for x in files_list if x.endswith(suffix)]) - return cnt - - -def check_mp4_integrity(file_path, verbose=True, logger=None): - try: - VideoFileClip(file_path) - if verbose: - print_log(f"The MP4 file '{file_path}' is intact.", logger=logger) - return True - except Exception as e: - if verbose: - print_log(f"Error: {e}", logger=logger) - print_log(f"The MP4 file '{file_path}' is not intact.", logger=logger) - return False - - -def count_frames(video_path): - cap = cv2.VideoCapture(video_path) - - if not cap.isOpened(): - print(f"Error: Could not open video file '{video_path}'") - return - - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - print(f"Total frames in the video '{video_path}': {total_frames}") - - cap.release() - - -def split_video( - sample_path, - scene_list, - save_dir, - target_fps=30, - min_seconds=1, - max_seconds=10, - shorter_size=512, - verbose=False, - logger=None, -): - FFMPEG_PATH = get_ffmpeg_exe() - - save_path_list = [] - for idx, scene in enumerate(scene_list): - s, t = scene # FrameTimecode - fps = s.framerate - max_duration = FrameTimecode(timecode="00:00:00", fps=fps) - max_duration.frame_num = round(fps * max_seconds) - duration = min(max_duration, t - s) - if duration.get_frames() < round(min_seconds * fps): - continue - - # save path - fname = os.path.basename(sample_path) - fname_wo_ext = os.path.splitext(fname)[0] - # TODO: fname pattern - save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4") - - # ffmpeg cmd - cmd = [FFMPEG_PATH] - - # Only show ffmpeg output for the first call, which will display any - # errors if it fails, and then break the loop. We only show error messages - # for the remaining calls. - # cmd += ['-v', 'error'] - - # input path - cmd += ["-i", sample_path] - - # clip to cut - cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-t", str(duration.get_seconds())] - - # target fps - # cmd += ['-vf', 'select=mod(n\,2)'] - cmd += ["-r", f"{target_fps}"] - - # aspect ratio - cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"] - # cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"] - - cmd += ["-map", "0", save_path] - - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - stdout, stderr = proc.communicate() - if verbose: - stdout = stdout.decode("utf-8") - print_log(stdout, logger=logger) - - save_path_list.append(sample_path) - print_log(f"Video clip saved to '{save_path}'", logger=logger) - - return save_path_list