update scoring & scene_cut
This commit is contained in:
xyupeng 2024-04-06 16:33:06 +08:00 committed by GitHub
parent c9b81d8fd6
commit 0dfea90a89
12 changed files with 823 additions and 296 deletions

26
tools/scene_cut/README.md Normal file
View file

@ -0,0 +1,26 @@
## Scene Detection and Video Splitting
### Formatting
Input meta should be `{prefix}.csv` with column `'videoId'`
```
python tools/scene_cut/process_meta.py --task append_format --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6.csv --split popular_6
```
Output is `{prefix}_format.csv` (with column `path`) and `{prefix}_intact.csv` (with column `intact` and `path`)
### Scene Detection
Input meta should be `{prefix}_format.csv`
```
python tools/scene_cut/scene_detect.py --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format.csv
```
Output is `{prefix}_format_timestamp.csv`
### Video Splitting
Input meta should be `{prefix}_format_timestamp.csv` (output of the scene detection step)
```
python tools/scene_cut/main_cut_pandarallel.py \
--meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format_timestamp.csv \
--out_dir /mnt/hdd/data/pexels_new/scene_cut/data/popular_6
```
Output is `{out_dir}/{wo_ext}_scene-{sid}.mp4`
TODO: meta for video clips

View file

@ -0,0 +1,168 @@
import os
import argparse
import time
import subprocess
from tqdm import tqdm
import pandas as pd
from scenedetect import FrameTimecode
from imageio_ffmpeg import get_ffmpeg_exe
from concurrent.futures import ThreadPoolExecutor, as_completed
from mmengine.logging import MMLogger, print_log
from utils_video import is_intact_video, iterate_files, clone_folder_structure
def single_process(row, save_dir, logger=None):
    """Cut one meta row's video into per-scene clips.

    Args:
        row (pd.Series): meta row with columns 'path' and 'timestamp'.
        save_dir (str): output directory for the clips.
        logger (MMLogger | None): optional logger forwarded to split_video.

    Returns:
        bool: False if the timestamp cell is malformed, True otherwise.
    """
    import ast

    video_path = row['path']
    timestamp = row['timestamp']
    # A valid cell looks like "[('00:00:00.000', '00:00:05.000'), ...]".
    if not (timestamp.startswith('[') and timestamp.endswith(']')):
        return False
    # ast.literal_eval is safe on untrusted CSV content, unlike eval().
    scene_list = ast.literal_eval(timestamp)
    scene_list = [
        (FrameTimecode(s, fps=1), FrameTimecode(t, fps=1))
        for s, t in scene_list
    ]
    split_video(video_path, scene_list, save_dir=save_dir, logger=logger)
    return True
def split_video(
    video_path,
    scene_list,
    save_dir,
    min_seconds=None,
    max_seconds=None,
    target_fps=30,
    shorter_size=512,
    verbose=False,
    logger=None,
):
    """Cut a video into one clip per scene with ffmpeg.

    scenes shorter than min_seconds will be ignored;
    scenes longer than max_seconds will be cut to save the beginning max_seconds.
    Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4

    Args:
        video_path (str): source video.
        scene_list (List[Tuple[FrameTimecode, FrameTimecode]]): each element is (s, t): start and end of a scene.
        save_dir (str): output directory.
        min_seconds (float | None)
        max_seconds (float | None)
        target_fps (int | None): output frame rate; None keeps source fps.
        shorter_size (int | None): target size of the shorter edge; None keeps
            the source resolution.
        verbose (bool): print ffmpeg output.
        logger (MMLogger | None)

    Returns:
        List[str]: paths of the clips that were written.
    """
    FFMPEG_PATH = get_ffmpeg_exe()
    save_path_list = []
    fname_wo_ext = os.path.splitext(os.path.basename(video_path))[0]
    for idx, scene in enumerate(scene_list):
        s, t = scene  # FrameTimecode
        if min_seconds is not None and (t - s).get_seconds() < min_seconds:
            continue
        duration = t - s
        if max_seconds is not None:
            fps = s.framerate
            max_duration = FrameTimecode(timecode="00:00:00", fps=fps)
            max_duration.frame_num = round(fps * max_seconds)
            duration = min(max_duration, duration)
        # TODO: fname pattern
        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
        # -ss before -i enables fast keyframe seeking instead of decoding
        # from the start of the file.
        cmd = [FFMPEG_PATH, "-nostdin", "-y",
               "-ss", str(s.get_seconds()), "-i", video_path,
               "-t", str(duration.get_seconds())]
        if target_fps is not None:
            cmd += ["-r", f"{target_fps}"]
        if shorter_size is not None:
            # Scale the shorter edge to `shorter_size`; -2 keeps the aspect
            # ratio and forces even dimensions as most codecs require.
            cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]
        cmd += ["-map", "0", save_path]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = proc.communicate()
        if verbose:
            print_log(stdout.decode("utf-8"), logger=logger)
        # BUG FIX: record the generated clip path, not the source video path.
        save_path_list.append(save_path)
        print_log(f"Video clip saved to '{save_path}'", logger=logger)
    return save_path_list
def parse_args():
    """Build and parse the CLI options for the threaded scene-cut script."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--root', default='F:/Panda-70M/')
    cli.add_argument('--split', default='test')
    cli.add_argument('--num_workers', type=int, default=5)
    return cli.parse_args()
def main():
    """Cut every video listed in the meta CSV into scene clips (sequential)."""
    # NOTE(review): CLI parsing is bypassed and paths are hard-coded for a
    # local run; restore parse_args() for general use.
    root, split = 'F:/pexels_new/', 'popular_2'
    meta_path = os.path.join(root, f'raw/meta/{split}_format_timestamp.csv')
    folder_dst = os.path.join(root, f'scene_cut/data/{split}')
    os.makedirs(folder_dst, exist_ok=True)

    meta = pd.read_csv(meta_path)
    logger = None  # plug in MMLogger.get_instance(...) here for file logging

    # max_workers=1 keeps ffmpeg invocations sequential; raise it to
    # parallelize at the cost of disk/CPU contention. The context manager
    # guarantees the executor is shut down even on error.
    with ThreadPoolExecutor(max_workers=1) as pool:
        tasks = [pool.submit(single_process, row, folder_dst, logger)
                 for _, row in meta.iterrows()]
        for task in tqdm(as_completed(tasks), total=len(meta)):
            task.result()
# Script entry point: run the sequential scene-cut pipeline.
if __name__ == '__main__':
    main()

View file

@ -0,0 +1,163 @@
import os
import argparse
import time
import subprocess
from tqdm import tqdm
import pandas as pd
from scenedetect import FrameTimecode
from functools import partial
from pandarallel import pandarallel
from imageio_ffmpeg import get_ffmpeg_exe
from mmengine.logging import MMLogger, print_log
from utils_video import is_intact_video, iterate_files, clone_folder_structure
def process_single_row(row, save_dir, log_name=None):
    """Cut one meta row's video into scene clips (pandarallel worker).

    Args:
        row (pd.Series): meta row with columns 'path' and 'timestamp'.
        save_dir (str): output directory for the clips.
        log_name (str | None): name of an existing MMLogger to look up; only
            the name is passed because logger objects do not pickle across
            worker processes.

    Returns:
        bool: False if the timestamp cell is malformed, True otherwise.
    """
    import ast

    video_path = row['path']
    logger = MMLogger.get_instance(log_name) if log_name is not None else None
    timestamp = row['timestamp']
    # Expected cell format: "[('00:00:00.000', '00:00:05.000'), ...]"
    if not (timestamp.startswith('[') and timestamp.endswith(']')):
        return False
    # ast.literal_eval is safe on untrusted CSV content, unlike eval().
    scene_list = ast.literal_eval(timestamp)
    scene_list = [
        (FrameTimecode(s, fps=1), FrameTimecode(t, fps=1))
        for s, t in scene_list
    ]
    split_video(video_path, scene_list, save_dir=save_dir,
                min_seconds=2, max_seconds=15, shorter_size=720,
                logger=logger)
    # Return True on success for symmetry with the False failure path.
    return True
def split_video(
    video_path,
    scene_list,
    save_dir,
    min_seconds=None,
    max_seconds=None,
    target_fps=30,
    shorter_size=512,
    verbose=False,
    logger=None,
):
    """Cut a video into one clip per scene with ffmpeg.

    scenes shorter than min_seconds will be ignored;
    scenes longer than max_seconds will be cut to save the beginning max_seconds.
    Currently, the saved file name pattern is f'{fname}_scene-{idx}'.mp4

    Args:
        video_path (str): source video.
        scene_list (List[Tuple[FrameTimecode, FrameTimecode]]): each element is (s, t): start and end of a scene.
        save_dir (str): output directory.
        min_seconds (float | None)
        max_seconds (float | None)
        target_fps (int | None): output frame rate; None keeps source fps.
        shorter_size (int | None): target size of the shorter edge; None keeps
            the source resolution.
        verbose (bool): log a message per saved clip.
        logger (MMLogger | None)

    Returns:
        List[str]: paths of the clips that were written.
    """
    FFMPEG_PATH = get_ffmpeg_exe()
    save_path_list = []
    fname_wo_ext = os.path.splitext(os.path.basename(video_path))[0]
    for idx, scene in enumerate(scene_list):
        s, t = scene  # FrameTimecode
        if min_seconds is not None and (t - s).get_seconds() < min_seconds:
            continue
        duration = t - s
        if max_seconds is not None:
            fps = s.framerate
            max_duration = FrameTimecode(timecode="00:00:00", fps=fps)
            max_duration.frame_num = round(fps * max_seconds)
            duration = min(max_duration, duration)
        # TODO: fname pattern
        save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
        # -ss after -i is very slow; put -ss before -i for keyframe seeking.
        cmd = [FFMPEG_PATH, "-nostdin", "-y",
               "-ss", str(s.get_seconds()), "-i", video_path,
               "-t", str(duration.get_seconds())]
        if target_fps is not None:
            cmd += ["-r", f"{target_fps}"]
        if shorter_size is not None:
            # Scale the shorter edge to `shorter_size`; -2 keeps the aspect
            # ratio and forces even dimensions as most codecs require.
            cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]
        cmd += ["-map", "0", save_path]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        stdout, stderr = proc.communicate()
        # BUG FIX: record the generated clip path, not the source video path.
        save_path_list.append(save_path)
        if verbose:
            print_log(f"Video clip saved to '{save_path}'", logger=logger)
    return save_path_list
def parse_args():
    """Build and parse the CLI options for the pandarallel scene-cut script."""
    ap = argparse.ArgumentParser()
    for flag, kwargs in (
        ('--meta_path', dict(default='./data/pexels_new/raw/meta/popular_5_format_timestamp.csv')),
        ('--out_dir', dict(default='./data/pexels_new/scene_cut/data/popular_5')),
        ('--num_workers', dict(default=5, type=int)),
    ):
        ap.add_argument(flag, **kwargs)
    return ap.parse_args()
def main():
    """Run scene cutting over a meta CSV in parallel with pandarallel."""
    args = parse_args()
    meta_path = args.meta_path
    out_dir = args.out_dir
    # Clips must live under '.../data/<split>' so the log file can be written
    # next to the 'data' folder. Raise explicitly instead of using `assert`,
    # which is stripped under `python -O`.
    if os.path.basename(os.path.dirname(out_dir)) != 'data':
        raise ValueError(f"out_dir must be of the form '.../data/<split>', got '{out_dir}'")
    os.makedirs(out_dir, exist_ok=True)

    meta = pd.read_csv(meta_path)

    # create logger: one log file per run, named after the split + timestamp.
    # Workers re-fetch it by name via MMLogger.get_instance(log_name).
    log_dir = os.path.dirname(out_dir)
    log_name = os.path.basename(out_dir)
    timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))
    log_path = os.path.join(log_dir, f"{log_name}_{timestamp}.log")
    MMLogger.get_instance(log_name, log_file=log_path)

    pandarallel.initialize(progress_bar=True)
    worker = partial(process_single_row, save_dir=out_dir, log_name=log_name)
    meta.parallel_apply(worker, axis=1)
# Script entry point: run the pandarallel scene-cut pipeline.
if __name__ == '__main__':
    main()

View file

@ -0,0 +1,266 @@
"""
1. format_raw_meta()
- only keep intact videos
- add 'path' column (abs path)
2. create_meta_for_folder()
"""
import os
# os.chdir('../..')
print(f'Current working directory: {os.getcwd()}')
import argparse
import json
import subprocess
import pandas as pd
from tqdm import tqdm
import pickle as pkl
from pandarallel import pandarallel
from functools import partial
import numpy as np
from utils_video import is_intact_video
def has_downloaded_success(json_path):
    """Return True iff the sidecar JSON marks a video download as successful.

    The downloader writes '{video_id}.json' next to each video; a download is
    considered successful only when the file exists, parses as JSON, and has
    a boolean 'success' field equal to True.
    """
    if not os.path.exists(json_path):
        return False
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
        # `is True` also rejects truthy non-bool values such as 1 or "yes".
        return data.get('success') is True
    except Exception:
        # Malformed JSON / non-dict payloads count as failed downloads.
        return False
def split_meta_csv(chunk_size=60000, root='./data/Panda-70M',
                   meta_name='meta/panda70m_training_10m.csv'):
    """Split one large meta CSV into ordered chunks 'meta/train_{i}.csv'.

    Args:
        chunk_size (int): maximum number of rows per output file.
        root (str): dataset root containing the 'meta' folder (default keeps
            the previous hard-coded behavior).
        meta_name (str): meta CSV path relative to `root`.
    """
    meta_path = os.path.join(root, meta_name)
    df = pd.read_csv(meta_path)
    # iloc slicing clamps at the end of the frame, so the final iteration
    # already emits the partial remainder chunk. (The old trailing block
    # wrote that remainder a second time as train_{idx + 1}.)
    for idx, start in enumerate(range(0, len(df), chunk_size)):
        out_path = os.path.join(root, f'meta/train_{idx}.csv')
        df.iloc[start:start + chunk_size].to_csv(out_path, index=False)
def remove_index():
    """Re-save 'your_file.csv' without its stored index column."""
    frame = pd.read_csv('your_file.csv', index_col=0)
    frame.to_csv('your_file_without_index.csv', index=False)
def append_format(meta_path, mode='.mp4'):
    """Write '{split}_format.csv' keeping only rows whose video is usable.

    - filter broken videos; only intact videos are kept
    - add column 'path'
    input csv should satisfy:
    - name should be: {split}.csv
    - contain column 'videoID'/'videoId'
    """
    meta_dirname = os.path.dirname(meta_path)
    assert meta_dirname.endswith('raw/meta')
    root_raw = os.path.dirname(meta_dirname)
    split = os.path.splitext(os.path.basename(meta_path))[0]

    meta = pd.read_csv(meta_path)
    kept_rows, kept_paths = [], []
    for _, row in tqdm(meta.iterrows(), total=len(meta)):
        video_id = row['videoId']  # pexels_new ('videoID' for panda)
        video_path = os.path.join(root_raw, f'data/{split}/{video_id}.mp4')
        if mode == '.mp4':
            usable = is_intact_video(video_path)
        elif mode == '.json':
            json_path = os.path.join(root_raw, f'data/{split}/{video_id}.json')
            usable = has_downloaded_success(json_path)
        else:
            raise ValueError
        if not usable:
            continue
        kept_rows.append(row)
        kept_paths.append(video_path)

    new_meta = pd.DataFrame(kept_rows)
    new_meta['path'] = kept_paths
    out_path = os.path.join(root_raw, f'meta/{split}_format.csv')
    new_meta.to_csv(out_path, index=False)
    print(f'New meta (shape={new_meta.shape}) saved to \'{out_path}\'')
def append_format_pandarallel(meta_path, split, mode='.mp4'):
    """Parallel version of append_format: mark intact videos and save metas.

    Writes two files next to the input:
    - '{name}_intact.csv': all rows plus boolean 'intact' and 'path' columns.
    - '{name}_format.csv': only intact rows, with 'path' but without 'intact'.

    Args:
        meta_path (str): input CSV ('.../raw/meta/{split}.csv') containing a
            'videoId' column.
        split (str): data subfolder name under '{root}/data/'.
        mode (str): '.mp4' checks video integrity; '.json' checks the
            downloader's sidecar JSON.
    """
    meta_dirname = os.path.dirname(meta_path)
    assert meta_dirname.endswith('raw/meta')
    root_raw = os.path.dirname(meta_dirname)
    wo_ext = os.path.splitext(os.path.basename(meta_path))[0]
    meta = pd.read_csv(meta_path)

    def is_intact(row, mode='.json'):
        # Runs inside pandarallel worker processes; returns (intact, path).
        video_id = row['videoId']  # pexels_new
        video_path = os.path.join(root_raw, f'data/{split}/{video_id}.mp4')
        if mode == '.mp4':
            return is_intact_video(video_path), video_path
        elif mode == '.json':
            json_path = os.path.join(root_raw, f'data/{split}/{video_id}.json')
            return has_downloaded_success(json_path), video_path
        else:
            raise ValueError

    pandarallel.initialize(progress_bar=True)
    ret = meta.parallel_apply(partial(is_intact, mode=mode), axis=1)
    intact, paths = zip(*ret)
    meta['intact'] = intact
    meta['path'] = paths

    out_path = os.path.join(root_raw, f'meta/{wo_ext}_intact.csv')
    meta.to_csv(out_path, index=False)
    print(f'New meta (shape={meta.shape}) with intact info saved to \'{out_path}\'')

    # .copy() avoids pandas' SettingWithCopyWarning: dropping inplace on a
    # boolean-mask slice operates on a view and may be silently discarded.
    meta_format = meta[np.array(intact)].copy()
    meta_format.drop('intact', axis=1, inplace=True)
    out_path = os.path.join(root_raw, f'meta/{wo_ext}_format.csv')
    meta_format.to_csv(out_path, index=False)
    print(f'New meta (shape={meta_format.shape}) with format info saved to \'{out_path}\'')
def create_subset(meta_path, num_rows=100):
    """Save the first `num_rows` rows of a meta CSV as '{name}_head-{n}{ext}'.

    Args:
        meta_path (str): input CSV path.
        num_rows (int): number of leading rows to keep (default 100, matching
            the previous hard-coded behavior).
    """
    meta = pd.read_csv(meta_path)
    meta_subset = meta.iloc[:num_rows]
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f'{wo_ext}_head-{num_rows}{ext}'
    meta_subset.to_csv(out_path, index=False)
    print(f'New meta (shape={meta_subset.shape}) saved to \'{out_path}\'')
def append_cut(root='./data/Panda-70M'):
    """Expand per-video meta rows into per-clip rows ('{name}_cut.csv').

    input csv should satisfy:
    - name should be {split}_intact.csv
    - contain columns 'timestamp'/'caption'/'matching_score', each a
      stringified list with one entry per clip
    """
    import ast

    split = 'test'
    meta_path = os.path.join(root, f'processed/meta/{split}_intact.csv')
    wo_ext, ext = os.path.splitext(meta_path)
    out_path = f'{wo_ext}_cut{ext}'
    meta = pd.read_csv(meta_path)
    new_meta = []
    for _, row in tqdm(meta.iterrows(), total=len(meta)):
        video_id = row['videoID']
        # ast.literal_eval is a safe replacement for eval() on CSV cells.
        timestamps = ast.literal_eval(row['timestamp'])
        captions = ast.literal_eval(row['caption'])
        scores = ast.literal_eval(row['matching_score'])
        for idx_c in range(len(timestamps)):
            path_i = os.path.join(root, f'processed/{split}/{video_id}_scene-{idx_c}.mp4')
            new_meta.append([f'{video_id}_scene-{idx_c}', path_i,
                             timestamps[idx_c], captions[idx_c], scores[idx_c]])
    columns = ['videoID', 'path', 'timestamp', 'text', 'match_official']
    new_meta = pd.DataFrame(new_meta, columns=columns)
    new_meta.to_csv(out_path, index=False)
    print(f'New meta (shape={new_meta.shape}) saved to \'{out_path}\'')
def debug_meta_topk():
    """Inspect the highest/lowest-scored clips of a scored meta CSV.

    Debug helper meant to be run under a debugger: the top-k lists are held
    in locals for inspection; nothing is returned or saved.
    """
    meta_path = 'F:/Panda-70M/meta/test_intact_cut_flow.csv'
    meta = pd.read_csv(meta_path)
    score_column = 'flow_score'
    # 10 best and 200 worst rows by optical-flow score.
    topk = meta.nlargest(10, columns=score_column)
    topk_s = meta.nsmallest(200, columns=score_column)
    topk_list = [(row['path'], row['caption'], row[score_column]) for idx, row in topk.iterrows()]
    topk_s_list = [(row['path'], row['caption'], row[score_column]) for idx, row in topk_s.iterrows()]
    x = 0  # breakpoint anchor; inspect topk_list / topk_s_list here
def parse_args():
    """Build and parse the CLI options for the meta-processing script."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--task', default='append_format')
    cli.add_argument('--meta_path', default='./data/pexels_new/raw/meta/popular_1.csv')
    cli.add_argument('--split', default='popular_5')
    cli.add_argument('--num_workers', type=int, default=5)
    return cli.parse_args()
# Script entry point: dispatch on --task. Only 'append_format' and
# 'create_subset' are wired up; the other pipeline steps stay commented out.
if __name__ == '__main__':
    # split_meta_csv()
    args = parse_args()
    meta_path = args.meta_path
    task = args.task
    if task == 'append_format':
        # append_format(meta_path=meta_path, mode='.mp4')
        # '.json' mode trusts the downloader's sidecar JSON instead of
        # decoding each video, which is much faster.
        append_format_pandarallel(meta_path=meta_path, split=args.split, mode='.json')
    elif task == 'create_subset':
        create_subset(meta_path=meta_path)
    else:
        raise ValueError
    # append_cut(root=root)
    # append_score(root=root)
    # debug_meta_topk()

View file

@ -0,0 +1,89 @@
import argparse
import os
import numpy as np
from tqdm import tqdm
import pandas as pd
from functools import partial
from pandarallel import pandarallel
from scenedetect import detect, ContentDetector, AdaptiveDetector, FrameTimecode
def process_single_row(row):
    """Detect scenes for one meta row.

    Returns:
        (bool, str): (success, stringified list of (start, end) timecodes);
        on failure the string is empty.
    """
    video_path = row['path']
    # Adaptive detection compares frame scores to a rolling average, which
    # copes better with gradual lighting changes than a fixed threshold.
    detector = AdaptiveDetector(
        adaptive_threshold=3.0,
        # luma_only=True,
    )
    # detector = ContentDetector()
    try:
        scenes = detect(video_path, detector, start_in_scene=True)
        stamps = [(start.get_timecode(), end.get_timecode()) for start, end in scenes]
    except Exception as e:
        print(f'Video \'{video_path}\' with error {e}')
        return False, ''
    return True, str(stamps)
def main():
    """Sequential scene detection over a hard-coded meta CSV (debug path)."""
    meta_path = 'F:/pexels_new/raw/meta/popular_1_format.csv'
    meta = pd.read_csv(meta_path)
    timestamp_list = []
    for _, row in tqdm(meta.iterrows()):
        # A fresh detector per video: detectors keep per-run state.
        detector = AdaptiveDetector(
            adaptive_threshold=1.5,
            luma_only=True,
        )
        # detector = ContentDetector()
        scenes = detect(row['path'], detector, start_in_scene=True)
        timestamp_list.append(
            [(start.get_timecode(), end.get_timecode()) for start, end in scenes]
        )
    meta['timestamp'] = timestamp_list
    stem, ext = os.path.splitext(meta_path)
    out_path = f"{stem}_timestamp{ext}"
    meta.to_csv(out_path, index=False)
    print(f"New meta with timestamp saved to '{out_path}'.")
def parse_args():
    """Build and parse the CLI options for the scene-detection script."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--meta_path', default='F:/pexels_new/raw/meta/popular_1_format.csv')
    cli.add_argument('--num_workers', type=int, default=5)
    return cli.parse_args()
def main_pandarallel():
    """Detect scenes for every row of the meta CSV using pandarallel workers."""
    args = parse_args()
    meta_path = args.meta_path
    meta = pd.read_csv(meta_path)

    pandarallel.initialize(progress_bar=True)
    results = meta.parallel_apply(process_single_row, axis=1)
    succeeded, timestamps = zip(*results)

    # Attach timestamps first, then drop the rows whose detection failed.
    meta['timestamp'] = timestamps
    meta = meta[np.array(succeeded)]

    stem, ext = os.path.splitext(meta_path)
    out_path = f"{stem}_timestamp{ext}"
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with timestamp saved to '{out_path}'.")
# Script entry point: run the parallel scene-detection pipeline.
if __name__ == '__main__':
    main_pandarallel()

View file

@ -0,0 +1,97 @@
import os
import cv2
from mmengine.logging import print_log
from moviepy.editor import VideoFileClip
def iterate_files(folder_path):
    """Yield the path of every file under `folder_path`, recursively.

    os.walk already descends into subdirectories, so no explicit recursion
    is needed (the old recursive call discarded its generator and was a
    no-op).
    """
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            yield os.path.join(root, file)
def iterate_folders(folder_path):
    """Yield the path of every subdirectory under `folder_path`, recursively.

    os.walk already recurses; the old self-call discarded its generator and
    was a no-op, so it is removed.
    """
    for root, dirs, _files in os.walk(folder_path):
        for subdir in dirs:
            yield os.path.join(root, subdir)
def clone_folder_structure(root_src, root_dst, verbose=False):
    """Recreate the directory tree of `root_src` under `root_dst` (dirs only)."""
    os.makedirs(root_dst, exist_ok=True)
    for src_path in iterate_folders(root_src):
        dst_path = os.path.join(root_dst, os.path.relpath(src_path, root_src))
        os.makedirs(dst_path, exist_ok=True)
        if verbose:
            print(f"Create folder: '{dst_path}'")
def is_intact_video(video_path, mode='moviepy', verbose=False, logger=None):
    """Check whether a video file exists and can be opened.

    Args:
        video_path (str): video to check.
        mode (str): 'moviepy' tries to parse the file with VideoFileClip;
            'cv2' only checks that OpenCV can open it (faster, weaker check).
        verbose (bool): log the outcome via print_log.
        logger (MMLogger | None): optional logger.

    Returns:
        bool: True iff the file exists and opens cleanly.
    """
    if not os.path.exists(video_path):
        if verbose:
            print_log(f"Could not find '{video_path}'", logger=logger)
        return False
    if mode == 'moviepy':
        try:
            VideoFileClip(video_path)
            if verbose:
                print_log(f"The video file '{video_path}' is intact.", logger=logger)
            return True
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
    elif mode == 'cv2':
        cap = None
        try:
            cap = cv2.VideoCapture(video_path)
            if cap.isOpened():
                if verbose:
                    print_log(f"The video file '{video_path}' is intact.", logger=logger)
                return True
            # BUG FIX: an existing but unopenable file previously fell
            # through and implicitly returned None instead of False.
            if verbose:
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
        except Exception as e:
            if verbose:
                print_log(f"Error: {e}", logger=logger)
                print_log(f"The video file '{video_path}' is not intact.", logger=logger)
            return False
        finally:
            # Release the capture handle in every path.
            if cap is not None:
                cap.release()
    else:
        raise ValueError
def count_frames(video_path, logger=None):
    """Print the total frame count of `video_path` via OpenCV metadata."""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print_log(f"Error: Could not open video file '{video_path}'", logger=logger)
        return
    n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print_log(f"Total frames in the video '{video_path}': {n_frames}", logger=logger)
    cap.release()
def count_files(root, suffix=".mp4"):
    """Count files under `root` whose name ends with `suffix`."""
    return sum(1 for path in iterate_files(root) if path.endswith(suffix))

View file

@ -8,6 +8,10 @@ import pandas as pd
import torch
import torch.nn.functional as F
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm
IMG_EXTENSIONS = (
@ -24,6 +28,15 @@ IMG_EXTENSIONS = (
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
def is_video(filename):
ext = os.path.splitext(filename)[-1].lower()
return ext in VID_EXTENSIONS
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")
def is_video(filename):
ext = os.path.splitext(filename)[-1].lower()
return ext in VID_EXTENSIONS
@ -52,13 +65,12 @@ class VideoTextDataset(torch.utils.data.Dataset):
def __getitem__(self, index):
row = self.meta.iloc[index]
path = row["path"]
path = row['path']
if is_video(path):
img = extract_frames(path, points=[0.5])[0]
else:
img = pil_loader(path)
img = self.transform(img)
text = row["text"]

View file

@ -1,11 +0,0 @@
# Scene Detection and Video Split
Raw videos from the Internet may be too long for training.
Thus, we detect scenes in raw videos and split them into short clips based on the scenes.
First prepare the video processing packages.
```bash
pip install scenedetect moviepy opencv-python
```
Then run `scene_detect.py`. We provide efficient processing using `multiprocessing`. Don't forget to specify your own dataset path.

View file

@ -1,138 +0,0 @@
import os
from multiprocessing import Pool
from mmengine.logging import MMLogger
from scenedetect import ContentDetector, detect
from tqdm import tqdm
from opensora.utils.misc import get_timestamp
from .utils import check_mp4_integrity, clone_folder_structure, iterate_files, split_video
# config
target_fps = 30 # int
shorter_size = 512 # int
min_seconds = 1 # float
max_seconds = 5 # float
assert max_seconds > min_seconds
cfg = dict(
target_fps=target_fps,
min_seconds=min_seconds,
max_seconds=max_seconds,
shorter_size=shorter_size,
)
def process_folder(root_src, root_dst):
# create logger
folder_path_log = os.path.dirname(root_dst)
log_name = os.path.basename(root_dst)
timestamp = get_timestamp()
log_path = os.path.join(folder_path_log, f"{log_name}_{timestamp}.log")
logger = MMLogger.get_instance(log_name, log_file=log_path)
# clone folder structure
clone_folder_structure(root_src, root_dst)
# all source videos
mp4_list = [x for x in iterate_files(root_src) if x.endswith(".mp4")]
mp4_list = sorted(mp4_list)
for idx, sample_path in tqdm(enumerate(mp4_list)):
folder_src = os.path.dirname(sample_path)
folder_dst = os.path.join(root_dst, os.path.relpath(folder_src, root_src))
# check src video integrity
if not check_mp4_integrity(sample_path, logger=logger):
continue
# detect scenes
scene_list = detect(sample_path, ContentDetector(), start_in_scene=True)
# split scenes
save_path_list = split_video(sample_path, scene_list, save_dir=folder_dst, **cfg, logger=logger)
# check integrity of generated clips
for x in save_path_list:
check_mp4_integrity(x, logger=logger)
def scene_detect():
"""detect & cut scenes using a single process
Expected dataset structure:
data/
your_dataset/
raw_videos/
xxx.mp4
yyy.mp4
This function results in:
data/
your_dataset/
raw_videos/
xxx.mp4
yyy.mp4
zzz.mp4
clips/
xxx_scene-0.mp4
yyy_scene-0.mp4
yyy_scene-1.mp4
"""
# TODO: specify your dataset root
root_src = f"./data/your_dataset/raw_videos"
root_dst = f"./data/your_dataset/clips"
process_folder(root_src, root_dst)
def scene_detect_mp():
"""detect & cut scenes using multiple processes
Expected dataset structure:
data/
your_dataset/
raw_videos/
split_0/
xxx.mp4
yyy.mp4
split_1/
xxx.mp4
yyy.mp4
This function results in:
data/
your_dataset/
raw_videos/
split_0/
xxx.mp4
yyy.mp4
split_1/
xxx.mp4
yyy.mp4
clips/
split_0/
xxx_scene-0.mp4
yyy_scene-0.mp4
split_1/
xxx_scene-0.mp4
yyy_scene-0.mp4
yyy_scene-1.mp4
"""
# TODO: specify your dataset root
root_src = f"./data/your_dataset/raw_videos"
root_dst = f"./data/your_dataset/clips"
# TODO: specify your splits
splits = ["split_0", "split_1"]
# process folders
root_src_list = [os.path.join(root_src, x) for x in splits]
root_dst_list = [os.path.join(root_dst, x) for x in splits]
with Pool(processes=len(splits)) as pool:
pool.starmap(process_folder, list(zip(root_src_list, root_dst_list)))
if __name__ == "__main__":
# TODO: choose single process or multiprocessing
scene_detect()
# scene_detect_mp()

View file

@ -1,145 +0,0 @@
import os
import subprocess
import cv2
from imageio_ffmpeg import get_ffmpeg_exe
from mmengine.logging import print_log
from moviepy.editor import VideoFileClip
from scenedetect import FrameTimecode
def iterate_files(folder_path):
for root, dirs, files in os.walk(folder_path):
# root contains the current directory path
# dirs contains the list of subdirectories in the current directory
# files contains the list of files in the current directory
# Process files in the current directory
for file in files:
file_path = os.path.join(root, file)
# print("File:", file_path)
yield file_path
# Process subdirectories and recursively call the function
for subdir in dirs:
subdir_path = os.path.join(root, subdir)
# print("Subdirectory:", subdir_path)
iterate_files(subdir_path)
def iterate_folders(folder_path):
for root, dirs, files in os.walk(folder_path):
for subdir in dirs:
subdir_path = os.path.join(root, subdir)
yield subdir_path
# print("Subdirectory:", subdir_path)
iterate_folders(subdir_path)
def clone_folder_structure(root_src, root_dst, verbose=False):
src_path_list = iterate_folders(root_src)
src_relpath_list = [os.path.relpath(x, root_src) for x in src_path_list]
os.makedirs(root_dst, exist_ok=True)
dst_path_list = [os.path.join(root_dst, x) for x in src_relpath_list]
for folder_path in dst_path_list:
os.makedirs(folder_path, exist_ok=True)
if verbose:
print(f"Create folder: '{folder_path}'")
def count_files(root, suffix=".mp4"):
files_list = iterate_files(root)
cnt = len([x for x in files_list if x.endswith(suffix)])
return cnt
def check_mp4_integrity(file_path, verbose=True, logger=None):
try:
VideoFileClip(file_path)
if verbose:
print_log(f"The MP4 file '{file_path}' is intact.", logger=logger)
return True
except Exception as e:
if verbose:
print_log(f"Error: {e}", logger=logger)
print_log(f"The MP4 file '{file_path}' is not intact.", logger=logger)
return False
def count_frames(video_path):
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print(f"Error: Could not open video file '{video_path}'")
return
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Total frames in the video '{video_path}': {total_frames}")
cap.release()
def split_video(
sample_path,
scene_list,
save_dir,
target_fps=30,
min_seconds=1,
max_seconds=10,
shorter_size=512,
verbose=False,
logger=None,
):
FFMPEG_PATH = get_ffmpeg_exe()
save_path_list = []
for idx, scene in enumerate(scene_list):
s, t = scene # FrameTimecode
fps = s.framerate
max_duration = FrameTimecode(timecode="00:00:00", fps=fps)
max_duration.frame_num = round(fps * max_seconds)
duration = min(max_duration, t - s)
if duration.get_frames() < round(min_seconds * fps):
continue
# save path
fname = os.path.basename(sample_path)
fname_wo_ext = os.path.splitext(fname)[0]
# TODO: fname pattern
save_path = os.path.join(save_dir, f"{fname_wo_ext}_scene-{idx}.mp4")
# ffmpeg cmd
cmd = [FFMPEG_PATH]
# Only show ffmpeg output for the first call, which will display any
# errors if it fails, and then break the loop. We only show error messages
# for the remaining calls.
# cmd += ['-v', 'error']
# input path
cmd += ["-i", sample_path]
# clip to cut
cmd += ["-nostdin", "-y", "-ss", str(s.get_seconds()), "-t", str(duration.get_seconds())]
# target fps
# cmd += ['-vf', 'select=mod(n\,2)']
cmd += ["-r", f"{target_fps}"]
# aspect ratio
cmd += ["-vf", f"scale='if(gt(iw,ih),-2,{shorter_size})':'if(gt(iw,ih),{shorter_size},-2)'"]
# cmd += ['-vf', f"scale='if(gt(iw,ih),{shorter_size},trunc(ow/a/2)*2)':-2"]
cmd += ["-map", "0", save_path]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout, stderr = proc.communicate()
if verbose:
stdout = stdout.decode("utf-8")
print_log(stdout, logger=logger)
save_path_list.append(sample_path)
print_log(f"Video clip saved to '{save_path}'", logger=logger)
return save_path_list