a bunch of update for data

This commit is contained in:
Zangwei Zheng 2024-04-13 15:44:24 +08:00
parent 4d338419a7
commit 3b85effe5a
12 changed files with 806 additions and 175 deletions

1
.gitignore vendored

@@ -170,6 +170,7 @@ runs/
checkpoints/
outputs/
samples/
logs/
pretrained_models/
*.swp


@@ -123,10 +123,12 @@ conda activate opensora
pip install torch torchvision
# install flash attention (optional)
# required if enable_flashattn=True
pip install packaging ninja
pip install flash-attn --no-build-isolation
# install apex (optional)
# required if enable_layernorm_kernel=True
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
# install xformers
@@ -201,40 +203,25 @@ the following steps:
3. Score and filter videos. [[docs](/tools/scoring/README.md)]
4. Generate video captions. [[docs](/tools/caption/README.md)]
Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data.
Below is an example workflow to process data. However, we recommend reading the detailed documentation for each tool and deciding which tools to use based on your needs. This pipeline applies to both image and video data. The full pipeline is available in [datasets.md](/tools/datasets/README.md#data-process-pipeline).
```bash
# Suppose files under ~/dataset/
# 1. Convert dataset to CSV
python -m tools.datasets.convert video ~/dataset --output ~/dataset/meta.csv
# filter out broken videos (broken videos have num_frames=0)
python -m tools.datasets.csvutil ~/dataset/meta.csv --info --fmin 1 --output ~/dataset/meta.csv
# 2. Filter dataset by aesthetic scores
# output: ~/dataset/meta_aes.csv
python -m tools.scoring.aesthetic.inference ~/dataset/meta.csv
# sort and examine videos by aesthetic scores
# output: ~/dataset/meta_aes_sort.csv
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
# bad videos (aesthetic_score < 5)
tail ~/dataset/meta_aes_sort.csv
# filter videos by aesthetic scores
# output: ~/dataset/meta_aes_aesmin5.csv
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --aesmin 5
# 3. Caption dataset
# output: ~/dataset/meta_aes_aesmin5_caption_parti.csv
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset/meta_aes_aesmin5.csv --tp-size 2 --dp-size 4 --bs 16
# merge generated results
python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
# remove empty captions and process captions (may need to re-caption lost ones)
python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
# 4. Sanity check & prepare for training
# sanity check
python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --info --output ~/dataset/meta_ready.csv
# filter out videos less than 48 frames
# output: ~/dataset/meta_ready_fmin48.csv
# Suppose videos and images under ~/dataset/
# 1. Convert dataset to CSV (meta.csv)
python -m tools.datasets.convert video ~/dataset --output meta.csv
# 2. Get video information (meta_info_fmin1.csv)
python -m tools.datasets.datautil meta.csv --info --fmin 1
# 3. Get caption information
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava meta_info_fmin1.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
# merge generated results (meta_caption.csv)
python -m tools.datasets.datautil meta_info_fmin1_caption_part*.csv --output meta_caption.csv
# clean caption (meta_caption_processed.csv)
python -m tools.datasets.datautil meta_caption.csv --clean-caption --refine-llm-caption --remove-empty-caption --output meta_caption_processed.csv
# 4. Scoring (meta_caption_processed_aes.csv)
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta_caption_processed.csv --bs 1024 --num_workers 16
# Filter videos by aesthetic scores (meta_aes_aesmin5.csv)
python -m tools.datasets.datautil meta_caption_processed_aes.csv --aesmin 5 --output meta_aes_aesmin5.csv
# 5. Additional filtering
python -m tools.datasets.datautil ~/dataset_ready.csv --fmin 48
```
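Each tool in the workflow above writes its output next to its input, appending one suffix per operation (`meta.csv` → `meta_info_fmin1.csv` → `meta_info_fmin1_caption_part*.csv`). A minimal sketch of that naming convention; the helper `chain_output_name` is hypothetical, not part of the repo:

```python
import os

def chain_output_name(input_csv, *suffixes):
    """Append one underscore-joined suffix per processing step to the
    file stem, mirroring how the pipeline's output files are named."""
    stem, ext = os.path.splitext(input_csv)
    for s in suffixes:
        stem += "_" + s
    return stem + ext

# e.g. the file produced by `datautil meta.csv --info --fmin 1`
print(chain_output_name("meta.csv", "info", "fmin1"))  # meta_info_fmin1.csv
```

Knowing the convention makes it easy to predict which file each later step expects as input.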

649
notebooks/data.ipynb Normal file

@@ -0,0 +1,649 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Process Pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, add the hosts you will manage to your ~/.ssh/config file."
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import paramiko\n",
"\n",
"HOSTS = [\"h800-80\", \"h800-81\", \"h800-82\", \"h800-83\", \"h800-84\", \"h800-85\", \"h800-86\", \"h800-170\", \"h800-171\"]\n",
"\n",
"# load from ~/.ssh/config\n",
"ssh_config = paramiko.SSHConfig()\n",
"user_config_file = os.path.expanduser(\"~/.ssh/config\")\n",
"if os.path.exists(user_config_file):\n",
" with open(user_config_file) as f:\n",
" ssh_config.parse(f)\n",
"\n",
"\n",
"def get_ssh_config(hostname):\n",
" # get the configuration for the host\n",
" user_config = ssh_config.lookup(hostname)\n",
" cfg = {\n",
" \"hostname\": user_config[\"hostname\"],\n",
" \"username\": user_config[\"user\"],\n",
" \"port\": int(user_config[\"port\"]),\n",
" \"key_filename\": user_config[\"identityfile\"],\n",
" }\n",
" return cfg\n",
"\n",
"\n",
"def connect(hostname):\n",
" cfg = get_ssh_config(hostname)\n",
" # connect\n",
" client = paramiko.SSHClient()\n",
" client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n",
" client.connect(**cfg)\n",
" return client\n",
"\n",
"\n",
"def run_command(command, hostname, get_pty=True, log=True, nohup=False, log_file=None):\n",
" client = connect(hostname)\n",
" command = f'bash -ic \"{command}\"' if get_pty else command\n",
" if log_file:\n",
" command = f'{command} > {log_file} 2>&1'\n",
" if nohup:\n",
" command = f'nohup {command} &'\n",
" stdin, stdout, stderr = client.exec_command(command, get_pty=get_pty)\n",
" stdout_str = stdout.read().decode()\n",
" stderr_str = stderr.read().decode()\n",
" if log:\n",
" print(\"HOST:\", hostname)\n",
" if stdout_str:\n",
" print(\"==== STDOUT ====\\n\", stdout_str)\n",
" if stderr_str:\n",
" print(\"==== STDERR ====\\n\", stderr_str)\n",
" client.close()\n",
"\n",
"\n",
"def run_command_all_hosts(command, hosts=HOSTS):\n",
" for hostname in hosts:\n",
" run_command(command, hostname)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"def nvidia_smi(host):\n",
" if host:\n",
" run_command(\"nvidia-smi\", host)\n",
" else:\n",
" run_command_all_hosts(\"nvidia-smi\")\n",
"\n",
"\n",
"def nvitop(host=None):\n",
" if host:\n",
"        run_command(\"/home/zhaowangbo/.local/bin/nvitop -1\", host)\n",
" else:\n",
" run_command_all_hosts(\"/home/zhaowangbo/.local/bin/nvitop -1\")\n",
"\n",
"\n",
"def ps(host=None, interest=\"python|sleep|torchrun|colossal\"):\n",
" if host:\n",
" if interest is None:\n",
" run_command(\"ps ux | cat\", host)\n",
" else:\n",
" run_command(f\"ps ux | cat | grep --color=never -E '{interest}'\", host)\n",
" else:\n",
" if interest is None:\n",
" run_command_all_hosts(\"ps ux | cat\")\n",
" else:\n",
" run_command_all_hosts(f\"ps ux | cat | grep --color=never -E '{interest}'\")"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"OPEN_SORA_HOME = \"/home/zhaowangbo/open-sora\"\n",
"def convert_dataset_cmd(input_dir, output_file, datatype=\"video\"):\n",
" commands = []\n",
" commands.append(f'cd {OPEN_SORA_HOME}')\n",
" # makedirs\n",
" output_dir = os.path.dirname(output_file)\n",
" commands.append(f'mkdir -p {output_dir}')\n",
"    commands.append(f'python -m tools.datasets.convert {datatype} {input_dir} --output {output_file}')\n",
" return \" && \".join(commands), output_file"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"host = \"h800-83\"\n",
"log_file = \"./logs/data-panda-16-split.log\"\n",
"cmd, output_file = convert_dataset_cmd(\"/mnt/disk1/data-panda/16\", \"/mnt/hdd/data/panda70m_by/raw/meta/split-16/meta.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"HOST: h800-83\n",
"==== STDOUT ====\n",
" \n"
]
}
],
"source": [
"run_command(cmd, host, nohup=True, log_file=log_file)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'cd /home/zhaowangbo/open-sora && mkdir -p /mnt/hdd/data/panda70m_by/raw/meta/split-16 && python -m tools.datasets.convert video /mnt/disk1/data-panda/16 --output /mnt/hdd/data/panda70m_by/raw/meta/split-16'"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cmd"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"HOST: h800-83\n",
"==== STDOUT ====\n",
" zhaowan+ 2928070 10.0 0.0 14216 5556 pts/13 Ss 15:25 0:00 bash -ic ps ux | cat | grep --color=never -E 'convert'\n",
"zhaowan+ 2929492 0.0 0.0 12116 664 pts/13 S+ 15:25 0:00 grep --color=auto --color=never -E convert\n",
"\n"
]
}
],
"source": [
"ps(host, interest=\"convert\")"
]
},
{
"cell_type": "code",
"execution_count": 1819,
"metadata": {},
"outputs": [],
"source": [
"RUN_CONVERT = False\n",
"RUN_ZERO_FRAME = False\n",
"RUN_CORRUPTED = True\n",
"RUN_ZERO_FRAME_CORRUPTED = False\n",
"RUN_COPY_OVER_CAPTION = False\n",
"\n",
"RUN_AESTHETICS = False\n",
"RUN_COLLATE_AESTHETICS = False\n",
"RUN_OPTICAL_FLOW = False\n",
"RUN_AES_FLOW = False\n",
"\n",
"captioning_input_file = 'meta_remove_corrupted_aes_flow.csv'; RUN_CAPTIONING = False\n",
"RUN_COLLATE_CAPTIONING = False\n",
"RUN_COLLATE_CAPTIONING_INTERRUPTED = False\n",
"\n",
"RUN_INTERSECT_INFO = False\n",
"\n",
"RUN_CLEAN_REMOVE_LLAVA = False\n",
"RUN_CLEAN_REMOVE = False\n",
"clean_remove_input_path = 'meta_remove_corrupted_aes_flow_caption.csv'\n",
"clean_remove_output_path = 'meta_remove_corrupted_aes_flow_caption_cleaned_and_removed.csv'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert to meta and collate"
]
},
{
"cell_type": "code",
"execution_count": 1821,
"metadata": {},
"outputs": [],
"source": [
"if RUN_CONVERT:\n",
" nohup_filename = \"outs/\" + NAME + \"_convert_video.out\"\n",
"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/python convert_videos.py ' + \\\n",
" ' ' + DATA_PATH + ' ' + \\\n",
" ' ' + PATH + ' > ' + nohup_filename + ' 2>&1 &'\n",
" print(command)\n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Filter out zero-frame and corrupted"
]
},
{
"cell_type": "code",
"execution_count": 1822,
"metadata": {},
"outputs": [],
"source": [
"if RUN_CORRUPTED:\n",
" nohup_filename = \"outs/\" + NAME + \"_filter_corrupted.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta.csv ' + \\\n",
" '--remove-corrupted' + ' > ' + nohup_filename + ' 2>&1 &'\n",
" output = client.exec_command(command)\n"
]
},
{
"cell_type": "code",
"execution_count": 1823,
"metadata": {},
"outputs": [],
"source": [
"if RUN_ZERO_FRAME:\n",
" nohup_filename = \"outs/\" + NAME + \"_filter_zero_frame.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
" '--info --fmin 1 --output ' + \\\n",
" PATH + 'meta_remove_corrupted.csv' + ' > ' + nohup_filename + ' 2>&1 &'\n",
" output = client.exec_command(command)\n"
]
},
{
"cell_type": "code",
"execution_count": 1824,
"metadata": {},
"outputs": [],
"source": [
"if RUN_ZERO_FRAME_CORRUPTED:\n",
" out_filename = \"outs/\" + NAME + \"_filter_zero_frame_corrupted.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta.csv ' + \\\n",
" '--info --fmin 1 --output ' + \\\n",
" PATH + 'meta.csv ' + '> ' + out_filename + ' 2>&1 && ' + \\\n",
" 'echo \"[DONE]: filter zero frame\" >> ' + out_filename + ' && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta.csv ' + \\\n",
" '--remove-corrupted' + ' >> ' + out_filename + ' 2>&1 && ' + \\\n",
" 'echo \"[DONE]: filter corrupted\" >> ' + out_filename\n",
" print(command)\n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run copy over caption"
]
},
{
"cell_type": "code",
"execution_count": 1825,
"metadata": {},
"outputs": [],
"source": [
"if RUN_COPY_OVER_CAPTION:\n",
" nohup_filename = \"outs/\" + NAME + \"_copy_over_caption.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python copy_over_caption.py' + \\\n",
" ' ' + PATH + 'meta.csv > ' + nohup_filename + ' 2>&1 &'\n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run aesthetics and collate"
]
},
{
"cell_type": "code",
"execution_count": 1826,
"metadata": {},
"outputs": [],
"source": [
"if RUN_AESTHETICS:\n",
" nohup_filename = \"outs/\" + NAME + \"_aes.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference ' + \\\n",
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
" '--bs 1024 --num_workers 16 > ' + \\\n",
" nohup_filename + ' 2>&1 &'\n",
" output = client.exec_command(command)\n"
]
},
{
"cell_type": "code",
"execution_count": 1827,
"metadata": {},
"outputs": [],
"source": [
"if RUN_COLLATE_AESTHETICS:\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_part*.csv ' + \\\n",
" '--output ' + \\\n",
" PATH + 'meta_remove_corrupted_aes.csv '\n",
" output = client.exec_command(command)\n",
" # this takes priority! delete all meta_remove_corrupted_aes_part*.csv\n",
" # output = client.exec_command(\"rm \" + PATH + \"meta_remove_corrupted_aes_part*.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run optical flow\n"
]
},
{
"cell_type": "code",
"execution_count": 1828,
"metadata": {},
"outputs": [],
"source": [
"if RUN_OPTICAL_FLOW:\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" 'nohup /home/zhaowangbo/.conda/envs/llava2/bin/torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel ' + \\\n",
" PATH + 'meta_remove_corrupted_aes.csv > ' + \\\n",
" \"outs/\" + NAME + \"_flow.out 2>&1 &\"\n",
" print(command)\n",
" output = client.exec_command(command)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run aesthetics and optical flow\n"
]
},
{
"cell_type": "code",
"execution_count": 1829,
"metadata": {},
"outputs": [],
"source": [
"if RUN_AES_FLOW:\n",
" out_filename = \"outs/\" + NAME + \"_aes_flow.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference ' + \\\n",
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
" '--bs 1024 --num_workers 16 > ' + \\\n",
" out_filename + ' 2>&1 && ' + \\\n",
" 'echo \"[DONE]: aesthetic\" >> ' + out_filename + ' && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_part*.csv ' + \\\n",
" '--output ' + \\\n",
" PATH + 'meta_remove_corrupted_aes.csv ' + ' >> ' + out_filename + ' 2>&1 && ' + \\\n",
" 'echo \"[DONE]: collate aesthetic\" >> ' + out_filename + ' && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/llava2/bin/torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel ' + \\\n",
" PATH + 'meta_remove_corrupted_aes.csv >> ' + \\\n",
" out_filename + \" 2>&1 && \" + \\\n",
" 'echo \"[DONE]: flow\" >> ' + out_filename\n",
" # CONTINUE \n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run captioning and collate\n"
]
},
{
"cell_type": "code",
"execution_count": 1830,
"metadata": {},
"outputs": [],
"source": [
"if RUN_CAPTIONING:\n",
" out_filename = \"outs/\" + NAME + \"_caption.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/llava2/bin/torchrun --nproc_per_node 8 ' + \\\n",
" '--standalone ' + \\\n",
" '-m tools.caption.caption_llava ' + \\\n",
" PATH + captioning_input_file + ' ' + \\\n",
" '--tp-size 2 ' + \\\n",
" '--dp-size 4 ' + \\\n",
" '--model-path liuhaotian/llava-v1.6-mistral-7b ' + \\\n",
" '--bs 16 ' + \\\n",
" '--prompt video > ' + \\\n",
" out_filename + \" 2>&1 && \" + \\\n",
" '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_flow_caption_part*.csv ' + \\\n",
" '--output ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_flow_caption.csv ' + ' >> ' + out_filename + ' 2>&1 '\n",
" print(command)\n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "code",
"execution_count": 1831,
"metadata": {},
"outputs": [],
"source": [
"# cd /home/tom/Open-Sora-dev/ && /home/zhaowangbo/.conda/envs/llava2/bin/torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow.csv --tp-size 2 --dp-size 4 --model-path liuhaotian/llava-v1.6-mistral-7b --bs 16 --prompt video > outs/split-18_caption.out 2>&1 && /home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow_caption_part*.csv --output /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow_caption.csv >> outs/split-18_caption.out 2>&1 && "
]
},
{
"cell_type": "code",
"execution_count": 1832,
"metadata": {},
"outputs": [],
"source": [
"if RUN_COLLATE_CAPTIONING_INTERRUPTED:\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_flow_remaining_caption_part*.csv ' + \\\n",
" '--output ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_flow_remaining_caption.csv '\n",
" output = client.exec_command(command)\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_flow_remaining_caption.csv ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_flow_caption_partial.csv ' + \\\n",
" '--output ' + \\\n",
" PATH + 'meta_remove_corrupted_aes_flow_caption.csv '\n",
" output = client.exec_command(command)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Clean and remove"
]
},
{
"cell_type": "code",
"execution_count": 1833,
"metadata": {},
"outputs": [],
"source": [
"if RUN_CLEAN_REMOVE_LLAVA:\n",
" out_filename = \"outs/\" + NAME + \"_clean_remove_llava.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + clean_remove_input_path + ' ' + \\\n",
" '--clean-caption --remove-caption-prefix --remove-empty-caption ' + \\\n",
" '--output ' + \\\n",
" PATH + clean_remove_output_path + ' > ' + out_filename + ' 2>&1 && ' + \\\n",
" 'echo \"[DONE]: RUN_CLEAN_REMOVE_LLAVA\" >> ' + out_filename\n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "code",
"execution_count": 1834,
"metadata": {},
"outputs": [],
"source": [
"if RUN_CLEAN_REMOVE:\n",
" out_filename = \"outs/\" + NAME + \"_clean_remove.out\"\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + clean_remove_input_path + ' ' + \\\n",
" '--clean-caption --remove-caption-prefix --remove-empty-caption ' + \\\n",
" '--output ' + \\\n",
" PATH + clean_remove_output_path + ' > ' + out_filename + ' 2>&1 && ' + \\\n",
" 'echo \"[DONE]: collate RUN_CLEAN_REMOVE\" >> ' + out_filename\n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Intersect"
]
},
{
"cell_type": "code",
"execution_count": 1835,
"metadata": {},
"outputs": [],
"source": [
"# python -m tools.datasets.csvutil ~/dataset/HD-VG-130M/meta_remove_corrupted_aes.csv --intersect ~/dataset/HD-VG-130M/meta_remove_corrupted_flow.csv --output ~/dataset/HD-VG-130M/meta_remove_corrupted_aes_flow.csv\n",
"if RUN_INTERSECT_INFO:\n",
" command = \\\n",
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
" '--intersect ' + PATH + 'meta_remove_corrupted_aes_flow_caption.csv ' + \\\n",
" '--output ' + PATH + 'meta_remove_corrupted_aes_flow_caption.csv '\n",
" output = client.exec_command(command)"
]
},
{
"cell_type": "code",
"execution_count": 1836,
"metadata": {},
"outputs": [],
"source": [
"# # # remove empty captions and process captions (may need to re-caption lost ones)\n",
"\n",
"# # --remove-caption-prefix: llava has a prefix, remove it\n",
"\n",
"# # --clean-caption makes it T5 friendly\n",
"\n",
"# python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv\n",
"\n",
"# python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv\n",
"\n",
"# # # 4. Sanity check & prepare for training\n",
"# # # sanity check\n",
"# # python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --info --output ~/dataset/meta_ready.csv\n"
]
},
{
"cell_type": "code",
"execution_count": 1837,
"metadata": {},
"outputs": [],
"source": [
"# command1 = 'conda activate /home/zhaowangbo/.conda/envs/llava2'\n",
"# command2 = ' cd ~/Open-Sora-dev'\n",
"# command3 = 'nohup torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /mnt/hdd/data/unsplash-full/resize_4k/meta/meta_machine_1.csv --tp-size 2 --dp-size 4 --bs 8 > llava_unsplash-full_machine_1.out &'\n",
"# stdin, stdout, stderr = client.exec_command(command1)\n",
" # stdin, stdout, stderr = client.exec_command(command2)\n",
" # Execute the command\n",
" # stdin, stdout, stderr = client.exec_command(command3)\n",
" # output = stdout.read().decode()\n",
" # print(output)\n",
" # error = stderr.read().decode()\n",
" # print(error)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
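The notebook above wraps every remote command in `bash -ic`, optional log redirection, and `nohup` before handing it to paramiko. The string-assembly part of `run_command` can be sketched in isolation, with no SSH involved:

```python
def build_command(command, get_pty=True, nohup=False, log_file=None):
    """Mirror the wrapping done by run_command in the notebook:
    1. wrap in an interactive bash shell when a PTY is requested,
    2. redirect stdout/stderr to a log file,
    3. background the whole thing with nohup for long jobs."""
    if get_pty:
        command = f'bash -ic "{command}"'
    if log_file:
        command = f"{command} > {log_file} 2>&1"
    if nohup:
        command = f"nohup {command} &"
    return command

# prints: nohup bash -ic "nvidia-smi" > logs/smi.log 2>&1 &
print(build_command("nvidia-smi", nohup=True, log_file="logs/smi.log"))
```

The ordering matters: redirection must be attached before `nohup … &` so the log file captures the backgrounded process's output.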


@@ -153,6 +153,3 @@ class VariableVideoTextDataset(VideoTextDataset):
# TCHW -> CTHW
video = video.permute(1, 0, 2, 3)
return {"video": video, "text": text, "num_frames": num_frames, "height": height, "width": width, "ar": ar}
def __getitem__(self, index):
return self.getitem(index)


@@ -66,7 +66,7 @@ Please note that you should add the `--flash-attention` flag when running with L
After running the script with `dp-size=N`, you will get `N` partial csv files. Run the following command to merge them:
```bash
python -m tools.datasets.csvutil DATA_caption_part*.csv --output DATA_caption.csv
python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
```
### Resume
@@ -75,10 +75,10 @@ Sometimes the process may be interrupted. We can resume the process by running t
```bash
# merge generated results
python -m tools.datasets.csvutil DATA_caption_part*.csv --output DATA_caption.csv
python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
# get the remaining videos
python -m tools.datasets.csvutil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv
python -m tools.datasets.datautil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv
```
Then use the output csv file to resume the process.
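The `--difference` step above amounts to a set subtraction on the `path` column: anything already captioned is dropped from the work list. A plausible sketch of that behavior (the `difference` helper is hypothetical; the real tool operates on csv files via pandas):

```python
# Stand-in for `datautil DATA.csv --difference DATA_caption.csv`:
# keep rows of the full dataset whose path has no caption yet.
def difference(rows, done_rows):
    done_paths = {r["path"] for r in done_rows}
    return [r for r in rows if r["path"] not in done_paths]

data = [{"path": "a.mp4"}, {"path": "b.mp4"}, {"path": "c.mp4"}]
done = [{"path": "a.mp4", "text": "a video of ..."}]
remaining = difference(data, done)
print([r["path"] for r in remaining])  # the videos still left to caption
```

Feeding `remaining` back into the captioning script resumes exactly where the interrupted run stopped.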


@@ -119,6 +119,7 @@ def main(args):
data = pd.read_csv(args.input)
data["cmotion"] = apply(data["path"], process)
data.to_csv(output_file, index=False)
print(f"Output saved to {output_file}")
if __name__ == "__main__":


@@ -72,11 +72,20 @@ def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
return frames_pil, total_frames
def read_file(input_path):
if input_path.endswith(".csv"):
return pd.read_csv(input_path)
elif input_path.endswith(".parquet"):
return pd.read_parquet(input_path)
else:
raise NotImplementedError(f"Unsupported file format: {input_path}")
class VideoTextDataset(torch.utils.data.Dataset):
def __init__(self, csv_path, transform=None, num_frames=3, get_text_input_ids=None, resize=None):
self.csv_path = csv_path
self.transform = transform
self.data = pd.read_csv(csv_path)
self.data = read_file(csv_path)
self.points = NUM_FRAMES_POINTS[num_frames]
self.get_text_input_ids = get_text_input_ids
self.use_text = False


@@ -13,6 +13,7 @@
- [Frame extraction](#frame-extraction)
- [Crop Midjourney 4 grid](#crop-midjourney-4-grid)
- [Analyze datasets](#analyze-datasets)
- [Data Process Pipeline](#data-process-pipeline)
After preparing the raw dataset according to the [instructions](/docs/datasets.md), you can use the following commands to manage the dataset.
@@ -74,7 +75,7 @@ python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_
## Manage datasets
Use `csvutil` to manage the dataset.
Use `datautil` to manage the dataset.
### Requirement
@@ -88,6 +89,10 @@ To get image and video information, you need to install [opencv-python](https://
```bash
pip install opencv-python
# If your videos use the AV1 codec instead of H.264, you need to:
# - install ffmpeg first
# - install opencv via conda to get AV1 support
conda install -c conda-forge opencv
```
Or to get video information, you can install ffmpeg and ffmpeg-python:
@@ -107,30 +112,27 @@ pip install lingua-language-detector
You can use the following commands to process the `csv` or `parquet` files. The output file will be saved in the same directory as the input, with a suffix indicating the operation applied.
```bash
# csvutil takes multiple CSV files as input and merge them into one CSV file
# datautil takes multiple CSV files as input and merges them into one CSV file
# output: DATA1+DATA2.csv
python -m tools.datasets.csvutil DATA1.csv DATA2.csv
python -m tools.datasets.datautil DATA1.csv DATA2.csv
# shard CSV files into multiple CSV files
# output: DATA1_0.csv, DATA1_1.csv, ...
python -m tools.datasets.csvutil DATA1.csv --shard 10
python -m tools.datasets.datautil DATA1.csv --shard 10
# filter to captioned rows with between 128 and 256 frames
# output: DATA_fmin_128_fmax_256.csv
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256
python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256
# Disable parallel processing
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256 --disable-parallel
# Remove corrupted video from the csv
python -m tools.datasets.csvutil DATA.csv --remove-corrupted
python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256 --disable-parallel
# Compute num_frames, height, width, fps, aspect_ratio for videos or images
# output: IMG_DATA+VID_DATA_vinfo.csv
python -m tools.datasets.csvutil IMG_DATA.csv VID_DATA.csv --video-info
python -m tools.datasets.datautil IMG_DATA.csv VID_DATA.csv --video-info
# You can run multiple operations at the same time.
python -m tools.datasets.csvutil DATA.csv --video-info --remove-empty-caption --remove-url --lang en
python -m tools.datasets.datautil DATA.csv --video-info --remove-empty-caption --remove-url --lang en
```
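The `--shard 10` operation above splits one CSV into numbered parts (`DATA1_0.csv`, `DATA1_1.csv`, ...) so that work can be spread across machines. One plausible sketch of the chunking, on plain row lists; the real tool's strategy may differ:

```python
def shard_rows(rows, num_shards):
    """Split rows into num_shards contiguous, near-equal chunks,
    distributing the remainder over the first shards."""
    k, r = divmod(len(rows), num_shards)
    shards, start = [], 0
    for i in range(num_shards):
        end = start + k + (1 if i < r else 0)
        shards.append(rows[start:end])
        start = end
    return shards

print([len(s) for s in shard_rows(list(range(10)), 3)])  # [4, 3, 3]
```

Every input row lands in exactly one shard, so merging the shards back (as `datautil DATA1_*.csv` does) reconstructs the original dataset.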
### Score filtering
@@ -140,7 +142,7 @@ To examine and filter the quality of the dataset by aesthetic score and clip sco
```bash
# sort the dataset by aesthetic score
# output: DATA_sort.csv
python -m tools.datasets.csvutil DATA.csv --sort aesthetic_score
python -m tools.datasets.datautil DATA.csv --sort aesthetic_score
# View examples of high aesthetic score
head -n 10 DATA_sort.csv
# View examples of low aesthetic score
@@ -148,19 +150,19 @@ tail -n 10 DATA_sort.csv
# sort the dataset by clip score
# output: DATA_sort.csv
python -m tools.datasets.csvutil DATA.csv --sort clip_score
python -m tools.datasets.datautil DATA.csv --sort clip_score
# filter the dataset by aesthetic score
# output: DATA_aesmin_0.5.csv
python -m tools.datasets.csvutil DATA.csv --aesmin 0.5
python -m tools.datasets.datautil DATA.csv --aesmin 0.5
# filter the dataset by clip score
# output: DATA_matchmin_0.5.csv
python -m tools.datasets.csvutil DATA.csv --matchmin 0.5
python -m tools.datasets.datautil DATA.csv --matchmin 0.5
```
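The score filters above are simple threshold cuts on one column. A minimal sketch of what `--aesmin 0.5` does, assuming the score lives in a column named `aes` (the actual column name in your CSV may differ):

```python
# Sketch of `datautil DATA.csv --aesmin 0.5`: keep rows whose aesthetic
# score meets the threshold; output would be DATA_aesmin_0.5.csv.
def filter_aesmin(rows, aesmin, col="aes"):
    return [r for r in rows if float(r[col]) >= aesmin]

rows = [{"path": "a.mp4", "aes": "0.7"}, {"path": "b.mp4", "aes": "0.3"}]
kept = filter_aesmin(rows, 0.5)
print([r["path"] for r in kept])  # rows surviving the cut
```

`--matchmin` works the same way on the clip score column.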
### Documentation
You can also use `python -m tools.datasets.csvutil --help` to see usage.
You can also use `python -m tools.datasets.datautil --help` to see usage.
| Args | File suffix | Description |
| --------------------------- | -------------- | ------------------------------------------------------------- |
@@ -174,9 +176,7 @@ You can also use `python -m tools.datasets.csvutil --help` to see usage.
| `--difference DATA.csv` | | Remove the paths in DATA.csv from the dataset |
| `--intersection DATA.csv` | | Keep the paths in DATA.csv from the dataset and merge columns |
| `--info` | `_info` | Get the basic information of each video and image (cv2) |
| `--video-info` | `_vinfo` | Get the basic information of each video (ffmpeg) |
| `--ext` | `_ext` | Remove rows if the file does not exist |
| `--remove-corrupted` | `_nocorrupted` | Remove the corrupted video and image |
| `--relpath` | `_relpath` | Modify the path to relative path by root given |
| `--abspath` | `_abspath` | Modify the path to absolute path by root given |
| `--remove-empty-caption` | `_noempty` | Remove rows with empty caption |
@@ -245,3 +245,37 @@ For the dataset provided in a `.csv` or `.parquet` file, you can easily analyze
```bash
python -m tools.datasets.analyze DATA_info.csv
```
## Data Process Pipeline
```bash
# Suppose videos and images under ~/dataset/
# 1. Convert dataset to CSV
python -m tools.datasets.convert video ~/dataset --output meta.csv
# 2. Get video information
python -m tools.datasets.datautil meta.csv --info --fmin 1
# 3. Get caption
# 3.1. generate caption
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava meta_info_fmin1.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
# merge generated results
python -m tools.datasets.datautil meta_info_fmin1_caption_part*.csv --output meta_caption.csv
# merge caption and info
python -m tools.datasets.datautil meta_info_fmin1.csv --intersection meta_caption.csv --output meta_caption_info.csv
# clean caption
python -m tools.datasets.datautil meta_caption_info.csv --clean-caption --refine-llm-caption --remove-empty-caption --output meta_caption_processed.csv
# 3.2. extract caption
python -m tools.datasets.datautil meta_info_fmin1.csv --load-caption json --remove-empty-caption --clean-caption
# 4. Scoring
# aesthetic scoring
torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta_caption_processed.csv
python -m tools.datasets.datautil meta_caption_processed_part*.csv --output meta_caption_processed_aes.csv
# optical flow scoring
torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel meta_caption_processed.csv
# matching scoring
torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference_parallel meta_caption_processed.csv
# camera motion
python -m tools.caption.camera_motion_detect meta_caption_processed.csv
```
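Several steps above merge the per-rank outputs written by `torchrun` (the `*_part*.csv` files). The merge amounts to concatenating the shards; a self-contained sketch of that step (an approximation — `datautil` may additionally deduplicate or re-sort):

```python
import glob
import pandas as pd

def merge_parts(pattern, output_csv):
    # Concatenate per-rank CSV shards (e.g. "meta_*_part*.csv") into one file.
    parts = sorted(glob.glob(pattern))
    df = pd.concat([pd.read_csv(p) for p in parts], ignore_index=True)
    df.to_csv(output_csv, index=False)
    return len(df)
```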


@ -7,6 +7,7 @@ import re
from functools import partial
from glob import glob
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
@ -47,8 +48,6 @@ def apply(df, func, **kwargs):
def get_info(path):
import cv2
try:
ext = os.path.splitext(path)[1].lower()
if ext in IMG_EXTENSIONS:
@ -72,42 +71,6 @@ def get_info(path):
return 0, 0, 0, np.nan, np.nan, np.nan
# ======================================================
# --video-info
# ======================================================
def get_video_info(path):
import ffmpeg
try:
info = ffmpeg.probe(path)["streams"][0]
height = int(info["height"])
width = int(info["width"])
num_frames = int(info["nb_frames"])
aspect_ratio = height / width
hw = height * width
fps = np.nan
return num_frames, height, width, aspect_ratio, fps, hw
except:
return 0, 0, 0, np.nan, np.nan, np.nan
# ======================================================
# --remove-corrupted
# ======================================================
def is_video_valid(path):
import decord
try:
decord.VideoReader(path, num_threads=1)
return True
except:
return False
# ======================================================
# --refine-llm-caption
# ======================================================
@ -357,12 +320,15 @@ def text_preprocessing(text, use_text_preprocessing: bool = True):
def load_caption(path, ext):
try:
assert ext in ["json"]
json_path = path.split(".")[0] + ".json"
with open(json_path, "r") as f:
data = json.load(f)
caption = data["caption"]
return caption
except:
return ""
# ======================================================
@ -396,6 +362,7 @@ def read_data(input_paths):
input_list.extend(glob(input_path))
print("Input files:", input_list)
for i, input_path in enumerate(input_list):
assert os.path.exists(input_path)
data.append(read_file(input_path))
input_name += os.path.basename(input_path).split(".")[0]
if i != len(input_list) - 1:
@ -444,10 +411,25 @@ def main(args):
tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")
# filtering
# IO-related
if args.load_caption is not None:
assert "path" in data.columns
data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
if args.info:
info = apply(data["path"], get_info)
(
data["num_frames"],
data["height"],
data["width"],
data["aspect_ratio"],
data["fps"],
data["resolution"],
) = zip(*info)
if args.ext:
assert "path" in data.columns
data = data[apply(data["path"], os.path.exists)]
# filtering
if args.remove_url:
assert "text" in data.columns
data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
@ -461,9 +443,6 @@ def main(args):
if args.remove_path_duplication:
assert "path" in data.columns
data = data.drop_duplicates(subset=["path"])
if args.remove_corrupted:
assert "path" in data.columns
data = data[apply(data["path"], is_video_valid)]
# processing
if args.relpath is not None:
@ -487,30 +466,6 @@ def main(args):
if args.count_num_token is not None:
assert "text" in data.columns
data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
if args.video_info:
assert "path" in data.columns
info = apply(data["path"], get_video_info)
(
data["num_frames"],
data["height"],
data["width"],
data["aspect_ratio"],
data["fps"],
data["resolution"],
) = zip(*info)
# sort
if args.sort is not None:
@ -577,9 +532,10 @@ def parse_args():
# IO-related
parser.add_argument("--info", action="store_true", help="get the basic information of each video and image")
parser.add_argument("--video-info", action="store_true", help="get the basic information of each video")
parser.add_argument("--ext", action="store_true", help="check if the file exists")
parser.add_argument("--remove-corrupted", action="store_true", help="remove the corrupted video and image")
parser.add_argument(
"--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt"
)
# path processing
parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given")
@ -606,9 +562,6 @@ def parse_args():
parser.add_argument(
"--count-num-token", type=str, choices=["t5"], default=None, help="Count the number of tokens in the caption"
)
# score filtering
parser.add_argument("--fmin", type=int, default=None, help="filter the dataset by minimum number of frames")
@ -636,14 +589,13 @@ def get_output_path(args, input_name):
name += "_sort"
# IO-related
# for IO-related, the function must be wrapped in try-except
if args.info:
name += "_info"
if args.video_info:
name += "_vinfo"
if args.ext:
name += "_ext"
if args.remove_corrupted:
name += "_nocorrupted"
if args.load_caption:
name += f"_load{args.load_caption}"
# path processing
if args.relpath is not None:
@ -674,8 +626,6 @@ def get_output_path(args, input_name):
name += "_cmcaption"
if args.count_num_token:
name += "_ntoken"
# score filtering
if args.fmin is not None:


@ -1,26 +1,35 @@
## Scene Detection and Video Splitting
### Formatting
Input meta should be `{prefix}.csv` with a `videoId` column
```bash
python tools/scene_cut/process_meta.py --task append_format --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6.csv --split popular_6
```
Output is `{prefix}_format.csv` (with a `path` column) and `{prefix}_intact.csv` (with columns `intact` and `path`)
### Scene Detection
Input meta should be `{prefix}_format.csv`
```bash
python tools/scene_cut/scene_detect.py --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format.csv
```
Output is `{prefix}_format_timestamp.csv`
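Content-aware scene detection conceptually declares a cut wherever the frame-to-frame difference spikes above a threshold. A toy illustration of that idea (not the actual detector used by `scene_detect.py`; `frame_scores[i]` stands in for the content difference between consecutive frames):

```python
def to_scenes(frame_scores, num_frames, threshold=27.0):
    # A cut is declared at every frame whose score exceeds the threshold;
    # scenes are the [start, end) frame ranges between consecutive cuts.
    cuts = [i for i, s in enumerate(frame_scores) if s > threshold]
    bounds = [0] + cuts + [num_frames]
    return list(zip(bounds[:-1], bounds[1:]))
```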
### Video Splitting
Input meta should be `{prefix}_timestamp.csv`
```bash
python tools/scene_cut/main_cut_pandarallel.py \
--meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format_timestamp.csv \
--out_dir /mnt/hdd/data/pexels_new/scene_cut/data/popular_6
```
Output is `{out_dir}/{wo_ext}_scene-{sid}.mp4`
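Each clip can be cut with an ffmpeg stream copy between scene timestamps. A hedged sketch of how the per-scene command and output name might be built (the actual script may re-encode or use different flags; `cut_cmd` is a hypothetical helper):

```python
import os
import subprocess

def cut_cmd(src, out_dir, sid, start, end):
    # Build the ffmpeg command for one scene: stream-copy (no re-encode)
    # between two timestamps, writing "{wo_ext}_scene-{sid}.mp4".
    wo_ext = os.path.splitext(os.path.basename(src))[0]
    out_path = os.path.join(out_dir, f"{wo_ext}_scene-{sid}.mp4")
    return ["ffmpeg", "-y", "-ss", start, "-to", end, "-i", src, "-c", "copy", out_path]

# subprocess.run(cut_cmd(...), check=True) would perform one cut.
```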
TODO: meta for video clips


@ -166,7 +166,7 @@ def main(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("input", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=1024, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
parser.add_argument("--accumulate", type=int, default=1, help="batch to accumulate")
parser.add_argument("--prefetch_factor", type=int, default=2, help="Prefetch factor")


@ -8,10 +8,6 @@ import pandas as pd
import torch
import torch.nn.functional as F
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm
IMG_EXTENSIONS = (
@ -48,9 +44,7 @@ def extract_frames(video_path, points=[0.5]):
frames = []
for point in points:
target_frame = total_frames * point
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
container.seek(target_timestamp)
frame = next(container.decode(video=0)).to_image()
frames.append(frame)
@ -65,7 +59,7 @@ class VideoTextDataset(torch.utils.data.Dataset):
def __getitem__(self, index):
row = self.meta.iloc[index]
path = row["path"]
if is_video(path):
img = extract_frames(path, points=[0.5])[0]