Open-Sora/tools/scene_cut/convert_id_to_path.py
xyupeng 93ae382c2f update scoring (#82)
* update scoring/matching

* update scoring/matching

* update scoring/matching

* update scoring/matching

* update scoring/matching

* update scoring/matching

* update scoring/matching

* update scoring/matching

* update scoring/matching

* update scene_cut

* update scene_cut

* update scene_cut

* update scene_cut

* update scene_cut

* update scene_cut

* update scene_cut

* update scene_cut

* update scene_cut

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* update readme

* update readme

* extract frames using opencv everywhere

* extract frames using opencv everywhere

* extract frames using opencv everywhere

* filter panda10m

* filter panda10m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* m

* ocr

* add ocr

* add main.sh

* add ocr

* add ocr

* add ocr

* add ocr

* add ocr

* add ocr

* update scene_cut

* update remove main.sh

* update scoring

* update scoring

* update scoring

* update README

* update readme

* update scene_cut

* update readme

* update scoring

* update readme

* update readme

* update filter_panda10m

* update readme

* update readme

* update launch.ipynb

* update scene_cut

* update scene_cut

* update readme

* update launch.ipynb

* update readme

* add 1.1 demo

* update readme

* add 1.1 demo

* update readme

* Update README.md

* add num_workers for pandarallel

* update scene_cut

* update readme

* update datautil

* update scoring

* update scoring
2024-04-30 14:44:45 +08:00

133 lines
4.1 KiB
Python

import os
import argparse
import json
from functools import partial
import numpy as np
import pandas as pd
from pandarallel import pandarallel
import cv2
from mmengine.logging import print_log
from moviepy.editor import VideoFileClip
from tqdm import tqdm
# Module-level side effect executed at import time.
# NOTE(review): per tqdm's documentation this presumably registers the
# `progress_apply` family on pandas objects; nothing in the visible part of
# this file calls those methods (main() uses pandarallel's own progress bar),
# so confirm whether this call is still required.
tqdm.pandas()
def is_intact_video(video_path, mode="moviepy", verbose=False, logger=None):
    """Check that a video file exists and can be opened by a decoder backend.

    Args:
        video_path: Path of the video file to probe.
        mode: Backend used to open the file, ``"moviepy"`` or ``"cv2"``.
        verbose: When true, report the outcome through ``print_log``.
        logger: Forwarded unchanged to ``print_log``.

    Returns:
        ``True`` if the file exists and the backend opened it, ``False``
        otherwise (missing file, backend error, or capture not opened).

    Raises:
        ValueError: If the file exists but ``mode`` is not a known backend.
    """
    if not os.path.exists(video_path):
        if verbose:
            print_log(f"Could not find '{video_path}'", logger=logger)
        return False

    if mode not in ("moviepy", "cv2"):
        raise ValueError(f"Unsupported mode {mode!r}; expected 'moviepy' or 'cv2'")

    try:
        if mode == "moviepy":
            clip = VideoFileClip(video_path)
            # Close right away so the underlying reader does not stay alive
            # for every probed file.
            clip.close()
            opened = True
        else:
            cap = cv2.VideoCapture(video_path)
            try:
                opened = bool(cap.isOpened())
            finally:
                # Always free the capture handle, even if isOpened() raised.
                cap.release()
    except Exception as e:
        # Deliberately broad: any backend failure means "not intact", and one
        # bad file must not abort a whole batch.
        if verbose:
            print_log(f"Error: {e}", logger=logger)
            print_log(f"The video file '{video_path}' is not intact.", logger=logger)
        return False

    if not opened:
        # A capture that fails to open raises nothing; report it explicitly
        # instead of falling through and returning None.
        if verbose:
            print_log(f"The video file '{video_path}' is not intact.", logger=logger)
        return False

    if verbose:
        print_log(f"The video file '{video_path}' is intact.", logger=logger)
    return True
def has_downloaded_success(json_path):
    """Tell whether a JSON file records a successful download.

    Args:
        json_path: Path of the JSON file to inspect.

    Returns:
        ``True`` only if the file can be read, parses as a JSON object, and
        its ``"success"`` entry is the boolean ``true``. Every other case
        (missing or unreadable file, invalid JSON, non-object payload,
        missing key, non-boolean or false value) returns ``False``.
    """
    try:
        # JSON is read as UTF-8 explicitly so the result does not depend on
        # the locale's default encoding. A missing file surfaces as an
        # OSError here, so no separate existence check is needed.
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return False
    # `is True` rejects truthy non-booleans such as 1 or "true".
    return isinstance(data, dict) and data.get("success") is True
def parse_args():
    """Parse the command-line arguments of this script.

    Returns:
        argparse.Namespace with ``meta_path``, ``folder_path``, ``mode``
        (``None``, ``".mp4"`` or ``".json"``) and ``num_workers``
        (``None`` or an int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path", type=str, help="path of the metadata CSV (needs an 'id' column)")
    parser.add_argument(
        "--folder_path",
        type=str,
        required=True,
        help="folder holding the '<id>.mp4' (and '<id>.json') files",
    )
    # Restricting the choices rejects a mistyped mode at start-up instead of
    # failing later, row by row, inside the parallel workers.
    parser.add_argument(
        "--mode",
        type=str,
        default=None,
        choices=[".mp4", ".json"],
        help="how to decide intactness: '.mp4' opens the video, '.json' reads the "
        "download record; when omitted every row is treated as intact",
    )
    parser.add_argument("--num_workers", type=int, default=None, help='#workers for pandarallel')
    return parser.parse_args()
def main():
    """Attach a video path and an intactness flag to every metadata row.

    Reads the CSV at ``meta_path``, builds ``<folder_path>/<id>.mp4`` for each
    row, evaluates the row according to ``--mode`` and writes two CSV files
    next to the input:

    * ``<name>_path_intact.csv``: all rows plus ``intact`` and ``path`` columns.
    * ``<name>_path-filtered.csv``: only the intact rows, without ``intact``.
    """
    args = parse_args()
    meta_path = args.meta_path
    folder_path = args.folder_path
    mode = args.mode

    def is_intact(row, mode=None):
        """Return ``(intact, video_path)`` for one metadata row."""
        video_id = row["id"]
        video_path = os.path.join(folder_path, f"{video_id}.mp4")

        if mode == ".mp4":
            return bool(is_intact_video(video_path)), video_path
        if mode == ".json":
            json_path = os.path.join(folder_path, f"{video_id}.json")
            return bool(has_downloaded_success(json_path)), video_path
        if mode is None:
            # No check requested: every row counts as intact.
            return True, video_path
        raise ValueError(f"Unsupported mode {mode!r}; expected '.mp4', '.json' or None")

    meta_dirpath = os.path.dirname(meta_path)
    wo_ext, _ = os.path.splitext(os.path.basename(meta_path))

    if args.num_workers is not None:
        pandarallel.initialize(progress_bar=True, nb_workers=args.num_workers)
    else:
        pandarallel.initialize(progress_bar=True)

    meta = pd.read_csv(meta_path)
    if len(meta) == 0:
        # zip(*ret) cannot be unpacked into two names when there are no rows.
        intact, paths = (), ()
    else:
        ret = meta.parallel_apply(partial(is_intact, mode=mode), axis=1)
        intact, paths = zip(*ret)

    # An explicit boolean array keeps the mask valid even when it is empty.
    intact = np.asarray(intact, dtype=bool)
    meta["intact"] = intact
    meta["path"] = list(paths)

    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path_intact.csv")
    meta.to_csv(out_path, index=False)
    print(f"New meta (shape={meta.shape}) with intact info saved to '{out_path}'")

    # drop() returns a new frame, so no in-place edit of a filtered slice
    # (which triggers pandas' SettingWithCopy warning) is needed.
    meta_format = meta[intact].drop(columns="intact")
    out_path = os.path.join(meta_dirpath, f"{wo_ext}_path-filtered.csv")
    meta_format.to_csv(out_path, index=False)
    print(f"New meta (shape={meta_format.shape}) with format info saved to '{out_path}'")
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()