Open-Sora/tools/datasets/convert.py

import argparse
import os
import time

import pandas as pd
from torchvision.datasets import ImageNet

# Image file extensions (lowercase, leading dot) passed to get_filelist by
# process_general_images; get_filelist lowercases each file's extension
# before testing membership.
IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
# Video file extensions (lowercase, leading dot) passed to get_filelist by
# process_general_videos.
VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")


def scan_recursively(root):
    """Yield an ``os.DirEntry`` for every file found under ``root``.

    Directories are descended into as soon as they are encountered, so files
    are yielded in the order ``os.scandir`` reports them, depth-first.

    A progress line is printed for every 100th sub-directory seen *directly*
    inside the directory currently being scanned (the counter is local to
    each recursion level, not a global total).

    Args:
        root: Path of the directory to scan.

    Yields:
        os.DirEntry: One entry per file located anywhere below ``root``.
    """
    num = 0
    # Use scandir as a context manager so the directory handle is closed
    # deterministically, even if the consumer abandons the generator early.
    with os.scandir(root) as entries:
        for entry in entries:
            if entry.is_file():
                yield entry
            elif entry.is_dir():
                num += 1
                if num % 100 == 0:
                    print(f"Scanned {num} directories.")
                yield from scan_recursively(entry.path)


def get_filelist(file_path, exts=None):
    """Return the paths of all files below ``file_path``, optionally filtered.

    The tree is traversed with ``scan_recursively`` (scandir-based). The
    number of matching files and the elapsed wall-clock time are printed
    once the scan finishes.

    Args:
        file_path: Root directory to scan.
        exts: Container of extensions (with leading dot) to keep. Each file's
            extension is lowercased before the membership test. ``None``
            keeps every file.

    Returns:
        list: Paths of the matching files, in traversal order.
    """
    time_start = time.time()

    def _is_wanted(entry):
        # Only regular files are kept; the extension filter applies when given.
        if not entry.is_file():
            return False
        if exts is None:
            return True
        suffix = os.path.splitext(entry.name)[-1].lower()
        return suffix in exts

    filelist = [entry.path for entry in scan_recursively(file_path) if _is_wanted(entry)]

    time_end = time.time()
    print(f"Scanned {len(filelist)} files in {time_end - time_start:.2f} seconds.")
    return filelist


def split_by_capital(name):
    """Insert a space before every uppercase character except the first one.

    Example: ``"BoxingPunchingBag"`` -> ``"Boxing Punching Bag"``. Runs of
    capitals are split letter by letter (``"ABC"`` -> ``"A B C"``).

    Args:
        name: String to split.

    Returns:
        str: ``name`` with a single space inserted before each uppercase
        character that is not at index 0.
    """
    # Build the pieces lazily and join once instead of repeated `+=`.
    return "".join(
        f" {char}" if index and char.isupper() else char
        for index, char in enumerate(name)
    )


def process_imagenet(root, split):
    """Write ``imagenet_<split>.csv`` with one ``path,text`` row per sample.

    Args:
        root: ImageNet root directory (``~`` is expanded).
        split: Split name forwarded to ``ImageNet`` and used in the output
            file name.
    """
    dataset = ImageNet(os.path.expanduser(root), split=split)
    class_names = dataset.classes

    samples = []
    for image_path, class_index in dataset.samples:
        # The first element of the class entry is used as the caption
        # (presumably the primary class name -- verify against torchvision).
        samples.append((image_path, class_names[class_index][0]))

    output = f"imagenet_{split}.csv"
    table = pd.DataFrame(samples, columns=["path", "text"])
    table.to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")


def process_ucf101(root, split):
    """Write ``ucf101_<split>.csv`` with one ``path,text`` row per file.

    Every file found under ``<root>/<split>`` is listed. Its caption is the
    name of the directory that directly contains it, split on capital
    letters (e.g. ``BoxingPunchingBag`` -> ``Boxing Punching Bag``).

    Args:
        root: Dataset root directory (``~`` is expanded).
        split: Sub-directory of ``root`` to scan; also used in the output
            file name.
    """
    root = os.path.expanduser(root)
    video_lists = get_filelist(os.path.join(root, split))
    # Take the parent directory name via os.path instead of splitting on "/",
    # so this also works where the path separator is not a forward slash.
    classes = [
        split_by_capital(os.path.basename(os.path.dirname(video_path)))
        for video_path in video_lists
    ]
    samples = list(zip(video_lists, classes))
    output = f"ucf101_{split}.csv"

    df = pd.DataFrame(samples, columns=["path", "text"])
    df.to_csv(output, index=False)
    print(f"Saved {len(samples)} samples to {output}.")


def process_vidprom(root, info):
    """Write ``vidprom.csv`` pairing existing videos with their prompts.

    The info CSV must provide ``uuid`` and ``prompt`` columns. For each row
    the expected video path is ``<root>/pika-<uuid>.mp4``; only rows whose
    video was found by scanning ``root`` are written.

    Args:
        root: Directory containing the videos (``~`` is expanded).
        info: Path of the info CSV.

    Raises:
        ValueError: If ``info`` is ``None``.
    """
    # Fail fast with a clear message instead of scanning the whole tree and
    # then failing inside pd.read_csv(None).
    if info is None:
        raise ValueError("An info CSV path is required for the vidprom dataset (pass --info).")

    root = os.path.expanduser(root)
    video_lists = get_filelist(root)
    # Set membership keeps the per-row existence check O(1).
    video_set = set(video_lists)
    # read info csv
    infos = pd.read_csv(info)
    abs_path = infos["uuid"].apply(lambda x: os.path.join(root, f"pika-{x}.mp4"))
    is_exist = abs_path.apply(lambda x: x in video_set)
    df = pd.DataFrame(dict(path=abs_path[is_exist], text=infos["prompt"][is_exist]))
    df.to_csv("vidprom.csv", index=False)
    print(f"Saved {len(df)} samples to vidprom.csv.")


def process_general_images(root, output):
    """Write a CSV listing every image file found under ``root``.

    The CSV has the columns ``id`` (file name without extension) and
    ``path``. Files are selected by ``IMG_EXTENSIONS``.

    Args:
        root: Directory to scan (``~`` is expanded). If it does not exist the
            function returns silently without writing anything.
        output: Path of the CSV file to write. Its parent directory is
            created when missing.
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        # Deliberate best-effort: a missing input folder is skipped.
        return
    path_list = get_filelist(root, IMG_EXTENSIONS)
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    df = pd.DataFrame(dict(id=fname_list, path=path_list))

    # dirname is "" for a bare file name such as "out.csv"; os.makedirs("")
    # would raise FileNotFoundError, so only create a directory if one is named.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


def process_general_videos(root, output):
    """Write a CSV listing every video file found under ``root``.

    The CSV has the columns ``path``, ``id`` (file name without extension)
    and ``relpath`` (path relative to ``root``). Files are selected by
    ``VID_EXTENSIONS``.

    Args:
        root: Directory to scan (``~`` is expanded). If it does not exist the
            function returns silently without writing anything.
        output: Path of the CSV file to write. Its parent directory is
            created when missing.
    """
    root = os.path.expanduser(root)
    if not os.path.exists(root):
        # Deliberate best-effort: a missing input folder is skipped.
        return
    path_list = get_filelist(root, VID_EXTENSIONS)
    # Remove duplicates while keeping scan order; a plain set() would make
    # the row order of the CSV vary from run to run.
    path_list = list(dict.fromkeys(path_list))
    fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]
    relpath_list = [os.path.relpath(x, root) for x in path_list]
    df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))

    # dirname is "" for a bare file name such as "out.csv"; os.makedirs("")
    # would raise FileNotFoundError, so only create a directory if one is named.
    output_dir = os.path.dirname(output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output, index=False)
    print(f"Saved {len(df)} samples to {output}.")


if __name__ == "__main__":
    # Command-line entry point: pick a dataset converter and run it.
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101", "vidprom", "image", "video"])
    parser.add_argument("root", type=str)
    parser.add_argument("--split", type=str, default="train")
    parser.add_argument("--info", type=str, default=None)
    # NOTE: --output is only consumed by the "image" and "video" converters;
    # the other converters write to fixed file names in the working directory.
    parser.add_argument("--output", type=str, default=None, required=True, help="Output path")
    args = parser.parse_args()

    # vidprom cannot run without the info CSV; report it as a usage error
    # instead of failing later inside the converter.
    if args.dataset == "vidprom" and args.info is None:
        parser.error("--info is required when dataset is 'vidprom'")

    if args.dataset == "imagenet":
        process_imagenet(args.root, args.split)
    elif args.dataset == "ucf101":
        process_ucf101(args.root, args.split)
    elif args.dataset == "vidprom":
        process_vidprom(args.root, args.info)
    elif args.dataset == "image":
        process_general_images(args.root, args.output)
    elif args.dataset == "video":
        process_general_videos(args.root, args.output)
    else:
        # Defensive guard; argparse `choices` already rejects other values.
        raise ValueError("Invalid dataset")
add datasets doc 2024-03-17 13:09:58 +01:00			`import argparse`
			`import os`
[feat] faster get file list 2024-04-11 07:34:02 +02:00			`import time`
add datasets doc 2024-03-17 13:09:58 +01:00
update convert_dataset and launch exp 2024-03-24 15:03:31 +01:00			`import pandas as pd`
add datasets doc 2024-03-17 13:09:58 +01:00			`from torchvision.datasets import ImageNet`

[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")`
Update image process (#5) * [docs] update tool docs * update aes 2024-03-29 16:34:10 +01:00			`VID_EXTENSIONS = (".mp4", ".avi", ".mov", ".mkv")`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00

[feat] faster get file list 2024-04-11 07:34:02 +02:00			`def scan_recursively(root):`
			`num = 0`
			`for entry in os.scandir(root):`
			`if entry.is_file():`
			`yield entry`
			`elif entry.is_dir():`
			`num += 1`
			`if num % 100 == 0:`
			`print(f"Scanned {num} directories.")`
			`yield from scan_recursively(entry.path)`


[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`def get_filelist(file_path, exts=None):`
[feat] faster get file list 2024-04-11 07:34:02 +02:00			`filelist = []`
			`time_start = time.time()`

			`# == OS Walk ==`
			`# for home, dirs, files in os.walk(file_path):`
			`# for filename in files:`
			`# ext = os.path.splitext(filename)[-1].lower()`
			`# if exts is None or ext in exts:`
			`# filelist.append(os.path.join(home, filename))`

			`# == Scandir ==`
			`obj = scan_recursively(file_path)`
			`for entry in obj:`
			`if entry.is_file():`
			`ext = os.path.splitext(entry.name)[-1].lower()`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`if exts is None or ext in exts:`
[feat] faster get file list 2024-04-11 07:34:02 +02:00			`filelist.append(entry.path)`

			`time_end = time.time()`
			`print(f"Scanned {len(filelist)} files in {time_end - time_start:.2f} seconds.")`
			`return filelist`
add datasets doc 2024-03-17 13:09:58 +01:00

			`def split_by_capital(name):`
			`# BoxingPunchingBag -> Boxing Punching Bag`
			`new_name = ""`
			`for i in range(len(name)):`
			`if name[i].isupper() and i != 0:`
			`new_name += " "`
			`new_name += name[i]`
			`return new_name`


			`def process_imagenet(root, split):`
			`root = os.path.expanduser(root)`
			`data = ImageNet(root, split=split)`
			`samples = [(path, data.classes[label][0]) for path, label in data.samples]`
			`output = f"imagenet_{split}.csv"`

update convert_dataset and launch exp 2024-03-24 15:03:31 +01:00			`df = pd.DataFrame(samples, columns=["path", "text"])`
			`df.to_csv(output, index=False)`
add datasets doc 2024-03-17 13:09:58 +01:00			`print(f"Saved {len(samples)} samples to {output}.")`


			`def process_ucf101(root, split):`
			`root = os.path.expanduser(root)`
			`video_lists = get_filelist(os.path.join(root, split))`
			`classes = [x.split("/")[-2] for x in video_lists]`
			`classes = [split_by_capital(x) for x in classes]`
			`samples = list(zip(video_lists, classes))`
			`output = f"ucf101_{split}.csv"`

update convert_dataset and launch exp 2024-03-24 15:03:31 +01:00			`df = pd.DataFrame(samples, columns=["path", "text"])`
			`df.to_csv(output, index=False)`
add datasets doc 2024-03-17 13:09:58 +01:00			`print(f"Saved {len(samples)} samples to {output}.")`


update csvutil for vidprom 2024-03-23 09:02:26 +01:00			`def process_vidprom(root, info):`
			`root = os.path.expanduser(root)`
			`video_lists = get_filelist(root)`
			`video_set = set(video_lists)`
			`# read info csv`
update convert_dataset and launch exp 2024-03-24 15:03:31 +01:00			`infos = pd.read_csv(info)`
			`abs_path = infos["uuid"].apply(lambda x: os.path.join(root, f"pika-{x}.mp4"))`
			`is_exist = abs_path.apply(lambda x: x in video_set)`
			`df = pd.DataFrame(dict(path=abs_path[is_exist], text=infos["prompt"][is_exist]))`
			`df.to_csv("vidprom.csv", index=False)`
			`print(f"Saved {len(df)} samples to vidprom.csv.")`
update csvutil for vidprom 2024-03-23 09:02:26 +01:00

Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> 2024-04-02 08:51:21 +02:00			`def process_general_images(root, output):`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`root = os.path.expanduser(root)`
skip empty folder & meta 2024-06-10 07:30:59 +02:00			`if not os.path.exists(root):`
			`return`
Dev/pxy (#100) * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scene_cut * update scene_cut * update scene_cut[A * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * m * m * m * m * m * m * m * m * m * m * m * m * m * m * update readme * update readme * extract frames using opencv everywhere * extract frames using opencv everywhere * extract frames using opencv everywhere * filter panda10m * filter panda10m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * ocr * add ocr * add main.sh * add ocr * add ocr * add ocr * add ocr * add ocr * add ocr * update scene_cut * update remove main.sh * update scoring * update scoring * update scoring * update README * update readme * update scene_cut * update readme * update scoring * update readme * update readme * update filter_panda10m * update readme * update readme * update launch.ipynb * update scene_cut * update scene_cut * update readme * update launch.ipynb * update readme * add 1.1 demo * update readme * add 1.1 demo * update readme * Update README.md * add num_workers for pandarallel * update scene_cut * update readme * update datautil * update scoring * update scoring * update readme * update scoring * update scene_cut * update scene_cut * udpate datautil * update datautil 2024-05-14 05:21:14 +02:00			`path_list = get_filelist(root, IMG_EXTENSIONS)`
			`fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]`
			`df = pd.DataFrame(dict(id=fname_list, path=path_list))`

			`os.makedirs(os.path.dirname(output), exist_ok=True)`
Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> 2024-04-02 08:51:21 +02:00			`df.to_csv(output, index=False)`
			`print(f"Saved {len(df)} samples to {output}.")`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00

Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> 2024-04-02 08:51:21 +02:00			`def process_general_videos(root, output):`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`root = os.path.expanduser(root)`
skip empty folder & meta 2024-06-10 07:30:59 +02:00			`if not os.path.exists(root):`
			`return`
Dev/pxy (#100) * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scene_cut * update scene_cut * update scene_cut[A * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * m * m * m * m * m * m * m * m * m * m * m * m * m * m * update readme * update readme * extract frames using opencv everywhere * extract frames using opencv everywhere * extract frames using opencv everywhere * filter panda10m * filter panda10m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * ocr * add ocr * add main.sh * add ocr * add ocr * add ocr * add ocr * add ocr * add ocr * update scene_cut * update remove main.sh * update scoring * update scoring * update scoring * update README * update readme * update scene_cut * update readme * update scoring * update readme * update readme * update filter_panda10m * update readme * update readme * update launch.ipynb * update scene_cut * update scene_cut * update readme * update launch.ipynb * update readme * add 1.1 demo * update readme * add 1.1 demo * update readme * Update README.md * add num_workers for pandarallel * update scene_cut * update readme * update datautil * update scoring * update scoring * update readme * update scoring * update scene_cut * update scene_cut * udpate datautil * update datautil 2024-05-14 05:21:14 +02:00			`path_list = get_filelist(root, VID_EXTENSIONS)`
reformat and update docs 2024-06-17 17:37:23 +02:00			`path_list = list(set(path_list)) # remove duplicates`
Dev/pxy (#100) * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scene_cut * update scene_cut * update scene_cut[A * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * m * m * m * m * m * m * m * m * m * m * m * m * m * m * update readme * update readme * extract frames using opencv everywhere * extract frames using opencv everywhere * extract frames using opencv everywhere * filter panda10m * filter panda10m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * ocr * add ocr * add main.sh * add ocr * add ocr * add ocr * add ocr * add ocr * add ocr * update scene_cut * update remove main.sh * update scoring * update scoring * update scoring * update README * update readme * update scene_cut * update readme * update scoring * update readme * update readme * update filter_panda10m * update readme * update readme * update launch.ipynb * update scene_cut * update scene_cut * update readme * update launch.ipynb * update readme * add 1.1 demo * update readme * add 1.1 demo * update readme * Update README.md * add num_workers for pandarallel * update scene_cut * update readme * update datautil * update scoring * update scoring * update readme * update scoring * update scene_cut * update scene_cut * udpate datautil * update datautil 2024-05-14 05:21:14 +02:00			`fname_list = [os.path.splitext(os.path.basename(x))[0] for x in path_list]`
			`relpath_list = [os.path.relpath(x, root) for x in path_list]`
			`df = pd.DataFrame(dict(path=path_list, id=fname_list, relpath=relpath_list))`

update scoring (#82) * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scoring/matching * update scene_cut * update scene_cut * update scene_cut[A * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * update scene_cut * m * m * m * m * m * m * m * m * m * m * m * m * m * m * update readme * update readme * extract frames using opencv everywhere * extract frames using opencv everywhere * extract frames using opencv everywhere * filter panda10m * filter panda10m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * m * ocr * add ocr * add main.sh * add ocr * add ocr * add ocr * add ocr * add ocr * add ocr * update scene_cut * update remove main.sh * update scoring * update scoring * update scoring * update README * update readme * update scene_cut * update readme * update scoring * update readme * update readme * update filter_panda10m * update readme * update readme * update launch.ipynb * update scene_cut * update scene_cut * update readme * update launch.ipynb * update readme * add 1.1 demo * update readme * add 1.1 demo * update readme * Update README.md * add num_workers for pandarallel * update scene_cut * update readme * update datautil * update scoring * update scoring 2024-04-30 08:44:45 +02:00			`os.makedirs(os.path.dirname(output), exist_ok=True)`
Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> 2024-04-02 08:51:21 +02:00			`df.to_csv(output, index=False)`
			`print(f"Saved {len(df)} samples to {output}.")`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00

add datasets doc 2024-03-17 13:09:58 +01:00			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`parser.add_argument("dataset", type=str, choices=["imagenet", "ucf101", "vidprom", "image", "video"])`
add datasets doc 2024-03-17 13:09:58 +01:00			`parser.add_argument("root", type=str)`
			`parser.add_argument("--split", type=str, default="train")`
update csvutil for vidprom 2024-03-23 09:02:26 +01:00			`parser.add_argument("--info", type=str, default=None)`
reformat and update docs 2024-06-17 17:37:23 +02:00			`parser.add_argument("--output", type=str, default=None, required=True, help="Output path")`
add datasets doc 2024-03-17 13:09:58 +01:00			`args = parser.parse_args()`

			`if args.dataset == "imagenet":`
			`process_imagenet(args.root, args.split)`
			`elif args.dataset == "ucf101":`
			`process_ucf101(args.root, args.split)`
update csvutil for vidprom 2024-03-23 09:02:26 +01:00			`elif args.dataset == "vidprom":`
			`process_vidprom(args.root, args.info)`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`elif args.dataset == "image":`
Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> 2024-04-02 08:51:21 +02:00			`process_general_images(args.root, args.output)`
[wip] image wrong with flashattn 2024-03-28 15:04:43 +01:00			`elif args.dataset == "video":`
Dev/datapipe (#21) * fix #210 * fix #209 * fix #188 * [docs] add training order * update data pipeline --------- Co-authored-by: Sze-qq <68757353+Sze-qq@users.noreply.github.com> 2024-04-02 08:51:21 +02:00			`process_general_videos(args.root, args.output)`
add datasets doc 2024-03-17 13:09:58 +01:00			`else:`
			`raise ValueError("Invalid dataset")`