mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-10 21:01:26 +02:00
a bunch of update for data
This commit is contained in:
parent
4d338419a7
commit
3b85effe5a
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -170,6 +170,7 @@ runs/
|
|||
checkpoints/
|
||||
outputs/
|
||||
samples/
|
||||
logs/
|
||||
pretrained_models/
|
||||
*.swp
|
||||
|
||||
|
|
|
|||
51
README.md
51
README.md
|
|
@ -123,10 +123,12 @@ conda activate opensora
|
|||
pip install torch torchvision
|
||||
|
||||
# install flash attention (optional)
|
||||
# required if enable_flashattn=True
|
||||
pip install packaging ninja
|
||||
pip install flash-attn --no-build-isolation
|
||||
|
||||
# install apex (optional)
|
||||
# required if enable_layernorm_kernel=True
|
||||
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
|
||||
|
||||
# install xformers
|
||||
|
|
@ -201,40 +203,25 @@ the following steps:
|
|||
3. Score and filter videos. [[docs](/tools/scoring/README.md)]
|
||||
4. Generate video captions. [[docs](/tools/caption/README.md)]
|
||||
|
||||
Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data.
|
||||
Below is an example workflow for processing data. However, we recommend that you read the detailed documentation for each tool and decide which tools to use based on your needs. This pipeline applies to both image and video data. The full pipeline is available in [datasets.md](/tools/datasets/README.md#data-process-pipeline).
|
||||
|
||||
```bash
|
||||
# Suppose files under ~/dataset/
|
||||
# 1. Convert dataset to CSV
|
||||
python -m tools.datasets.convert video ~/dataset --output ~/dataset/meta.csv
|
||||
# filter out broken videos (broken ones num_frames=0)
|
||||
python -m tools.datasets.csvutil ~/dataset.csv --info --fmin 1 --output ~/dataset/meta.csv
|
||||
|
||||
# 2. Filter dataset by aesthetic scores
|
||||
# output: ~/dataset/meta_aes.csv
|
||||
python -m tools.scoring.aesthetic.inference ~/dataset/meta.csv
|
||||
# sort and examine videos by aesthetic scores
|
||||
# output: ~/dataset/meta_aes_sort.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
|
||||
# bad videos (aesthetic_score < 5)
|
||||
tail ~/dataset/meta_aes_sort.csv
|
||||
# filter videos by aesthetic scores
|
||||
# output: ~/dataset/meta_aes_aesmin5.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --aesmin 5
|
||||
|
||||
# 3. Caption dataset
|
||||
# output: ~/dataset/meta_aes_aesmin5_caption_parti.csv
|
||||
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/dataset/meta_aes_aesmin5.csv --tp-size 2 --dp-size 4 --bs 16
|
||||
# merge generated results
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
|
||||
# remove empty captions and process captions (may need to re-caption lost ones)
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
|
||||
|
||||
# 4. Sanity check & prepare for training
|
||||
# sanity check
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --info --output ~/dataset/meta_ready.csv
|
||||
# filter out videos less than 48 frames
|
||||
# output: ~/dataset/meta_ready_fmin48.csv
|
||||
# Suppose videos and images under ~/dataset/
|
||||
# 1. Convert dataset to CSV (meta.csv)
|
||||
python -m tools.datasets.convert video ~/dataset --output meta.csv
|
||||
# 2. Get video information (meta_info_fmin1.csv)
|
||||
python -m tools.datasets.datautil meta.csv --info --fmin 1
|
||||
# 3. Get caption information
|
||||
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava meta_info_fmin1.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
|
||||
# merge generated results (meta_caption.csv)
|
||||
python -m tools.datasets.datautil meta_info_fmin1_caption_part*.csv --output meta_caption.csv
|
||||
# clean caption (meta_caption_processed.csv)
|
||||
python -m tools.datasets.datautil meta_caption.csv --clean-caption --refine-llm-caption --remove-empty-caption --output meta_caption_processed.csv
|
||||
# 4. Scoring (meta_caption_processed_aes.csv)
|
||||
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta_caption_processed.csv --bs 1024 --num_workers 16
|
||||
# Filter videos by aesthetic scores (meta_aes_aesmin5.csv)
|
||||
python -m tools.datasets.datautil meta_caption_processed_aes.csv --aesmin 5 --output meta_aes_aesmin5.csv
|
||||
# 5. Additional filtering
|
||||
python -m tools.datasets.datautil ~/dataset_ready.csv --fmin 48
|
||||
```
|
||||
|
||||
|
|
|
|||
649
notebooks/data.ipynb
Normal file
649
notebooks/data.ipynb
Normal file
|
|
@ -0,0 +1,649 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Data Process Pipeline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, you should add hosts in your ~/.ssh/config file"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 105,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import paramiko\n",
|
||||
"\n",
|
||||
"HOSTS = [\"h800-80\", \"h800-81\", \"h800-82\", \"h800-83\", \"h800-84\", \"h800-85\", \"h800-86\", \"h800-170\", \"h800-171\"]\n",
|
||||
"\n",
|
||||
"# load from ~/.ssh/config\n",
|
||||
"ssh_config = paramiko.SSHConfig()\n",
|
||||
"user_config_file = os.path.expanduser(\"~/.ssh/config\")\n",
|
||||
"if os.path.exists(user_config_file):\n",
|
||||
" with open(user_config_file) as f:\n",
|
||||
" ssh_config.parse(f)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_ssh_config(hostname):\n",
|
||||
" # get the configuration for the host\n",
|
||||
" user_config = ssh_config.lookup(hostname)\n",
|
||||
" user_config\n",
|
||||
" cfg = {\n",
|
||||
" \"hostname\": user_config[\"hostname\"],\n",
|
||||
" \"username\": user_config[\"user\"],\n",
|
||||
" \"port\": int(user_config[\"port\"]),\n",
|
||||
" \"key_filename\": user_config[\"identityfile\"],\n",
|
||||
" }\n",
|
||||
" return cfg\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def connect(hostname):\n",
|
||||
" cfg = get_ssh_config(hostname)\n",
|
||||
" # connect\n",
|
||||
" client = paramiko.SSHClient()\n",
|
||||
" client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n",
|
||||
" client.connect(**cfg)\n",
|
||||
" return client\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def run_command(command, hostname, get_pty=True, log=True, nohup=False, log_file=None):\n",
|
||||
" client = connect(hostname)\n",
|
||||
" command = f'bash -ic \"{command}\"' if get_pty else command\n",
|
||||
" if log_file:\n",
|
||||
" command = f'{command} > {log_file} 2>&1'\n",
|
||||
" if nohup:\n",
|
||||
" command = f'nohup {command} &'\n",
|
||||
" stdin, stdout, stderr = client.exec_command(command, get_pty=get_pty)\n",
|
||||
" stdout_str = stdout.read().decode()\n",
|
||||
" stderr_str = stderr.read().decode()\n",
|
||||
" if log:\n",
|
||||
" print(\"HOST:\", hostname)\n",
|
||||
" if stdout_str:\n",
|
||||
" print(\"==== STDOUT ====\\n\", stdout_str)\n",
|
||||
" if stderr_str:\n",
|
||||
" print(\"==== STDERR ====\\n\", stderr_str)\n",
|
||||
" client.close()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def run_command_all_hosts(command, hosts=HOSTS):\n",
|
||||
" for hostname in hosts:\n",
|
||||
" run_command(command, hostname)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 96,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def nvidia_smi(host):\n",
|
||||
" if host:\n",
|
||||
" run_command(\"nvidia-smi\", host)\n",
|
||||
" else:\n",
|
||||
" run_command_all_hosts(\"nvidia-smi\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def nvitop(host=None):\n",
|
||||
" if host:\n",
|
||||
" run_command(f\"/home/zhaowangbo/.local/bin/nvitop -1\", host)\n",
|
||||
" else:\n",
|
||||
" run_command_all_hosts(\"/home/zhaowangbo/.local/bin/nvitop -1\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def ps(host=None, interest=\"python|sleep|torchrun|colossal\"):\n",
|
||||
" if host:\n",
|
||||
" if interest is None:\n",
|
||||
" run_command(\"ps ux | cat\", host)\n",
|
||||
" else:\n",
|
||||
" run_command(f\"ps ux | cat | grep --color=never -E '{interest}'\", host)\n",
|
||||
" else:\n",
|
||||
" if interest is None:\n",
|
||||
" run_command_all_hosts(\"ps ux | cat\")\n",
|
||||
" else:\n",
|
||||
" run_command_all_hosts(f\"ps ux | cat | grep --color=never -E '{interest}'\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 106,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"OPEN_SORA_HOME = \"/home/zhaowangbo/open-sora\"\n",
|
||||
"def convert_dataset_cmd(input_dir, output_file, datatype=\"video\"):\n",
|
||||
" commands = []\n",
|
||||
" commands.append(f'cd {OPEN_SORA_HOME}')\n",
|
||||
" # makedirs\n",
|
||||
" output_dir = os.path.dirname(output_file)\n",
|
||||
" commands.append(f'mkdir -p {output_dir}')\n",
|
||||
" commands.append(f'python -m tools.datasets.convert {datatype} {input_dir} --output {output_dir}')\n",
|
||||
" return \" && \".join(commands), output_file"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 107,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"host = \"h800-83\"\n",
|
||||
"log_file = \"./logs/data-panda-16-split.log\"\n",
|
||||
"cmd, output_file = convert_dataset_cmd(\"/mnt/disk1/data-panda/16\", \"/mnt/hdd/data/panda70m_by/raw/meta/split-16/meta.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 109,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"HOST: h800-83\n",
|
||||
"==== STDOUT ====\n",
|
||||
" \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"run_command(cmd, host, nohup=True, log_file=log_file)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 108,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'cd /home/zhaowangbo/open-sora && mkdir -p /mnt/hdd/data/panda70m_by/raw/meta/split-16 && python -m tools.datasets.convert video /mnt/disk1/data-panda/16 --output /mnt/hdd/data/panda70m_by/raw/meta/split-16'"
|
||||
]
|
||||
},
|
||||
"execution_count": 108,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cmd"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 102,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"HOST: h800-83\n",
|
||||
"==== STDOUT ====\n",
|
||||
" zhaowan+ 2928070 10.0 0.0 14216 5556 pts/13 Ss 15:25 0:00 bash -ic ps ux | cat | grep --color=never -E 'convert'\n",
|
||||
"zhaowan+ 2929492 0.0 0.0 12116 664 pts/13 S+ 15:25 0:00 grep --color=auto --color=never -E convert\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ps(host, interest=\"convert\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1819,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RUN_CONVERT = False\n",
|
||||
"RUN_ZERO_FRAME = False\n",
|
||||
"RUN_CORRUPTED = True\n",
|
||||
"RUN_ZERO_FRAME_CORRUPTED = False\n",
|
||||
"RUN_COPY_OVER_CAPTION = False\n",
|
||||
"\n",
|
||||
"RUN_AESTHETICS = False\n",
|
||||
"RUN_COLLATE_AESTHETICS = False\n",
|
||||
"RUN_OPTICAL_FLOW = False\n",
|
||||
"RUN_AES_FLOW = False\n",
|
||||
"\n",
|
||||
"captioning_input_file = 'meta_remove_corrupted_aes_flow.csv'; RUN_CAPTIONING = False\n",
|
||||
"RUN_COLLATE_CAPTIONING = False; \n",
|
||||
"RUN_COLLATE_CAPTIONING_INTERRUPTED = False\n",
|
||||
"\n",
|
||||
"RUN_INTERSECT_INFO = False\n",
|
||||
"\n",
|
||||
"RUN_CLEAN_REMOVE_LLAVA = False\n",
|
||||
"RUN_CLEAN_REMOVE = False\n",
|
||||
"clean_remove_input_path = 'meta_remove_corrupted_aes_flow_caption.csv'\n",
|
||||
"clean_remove_output_path = 'meta_remove_corrupted_aes_flow_caption_cleaned_and_removed.csv'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Convert to meta and collate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1821,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_CONVERT:\n",
|
||||
" nohup_filename = \"outs/\" + NAME + \"_convert_video.out\"\n",
|
||||
"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/python convert_videos.py ' + \\\n",
|
||||
" ' ' + DATA_PATH + ' ' + \\\n",
|
||||
" ' ' + PATH + ' > ' + nohup_filename + ' 2>&1 &'\n",
|
||||
" print(command)\n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Filter out zero-frame and corrupted"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1822,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_CORRUPTED:\n",
|
||||
" nohup_filename = \"outs/\" + NAME + \"_filter_corrupted.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta.csv ' + \\\n",
|
||||
" '--remove-corrupted' + ' > ' + nohup_filename + ' 2>&1 &'\n",
|
||||
" output = client.exec_command(command)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1823,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_ZERO_FRAME:\n",
|
||||
" nohup_filename = \"outs/\" + NAME + \"_filter_zero_frame.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
|
||||
" '--info --fmin 1 --output ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted.csv' + ' > ' + nohup_filename + ' 2>&1 &'\n",
|
||||
" output = client.exec_command(command)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1824,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_ZERO_FRAME_CORRUPTED:\n",
|
||||
" out_filename = \"outs/\" + NAME + \"_filter_zero_frame_corrupted.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta.csv ' + \\\n",
|
||||
" '--info --fmin 1 --output ' + \\\n",
|
||||
" PATH + 'meta.csv ' + '> ' + out_filename + ' 2>&1 && ' + \\\n",
|
||||
" 'echo \"[DONE]: filter zero frame\" >> ' + out_filename + ' && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta.csv ' + \\\n",
|
||||
" '--remove-corrupted' + ' >> ' + out_filename + ' 2>&1 && ' + \\\n",
|
||||
" 'echo \"[DONE]: filter corrupted\" >> ' + out_filename\n",
|
||||
" print(command)\n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run copy over caption"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1825,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_COPY_OVER_CAPTION:\n",
|
||||
" nohup_filename = \"outs/\" + NAME + \"_copy_over_caption.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python copy_over_caption.py' + \\\n",
|
||||
" ' ' + PATH + 'meta.csv > ' + nohup_filename + ' 2>&1 &'\n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run aesthetics and collate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1826,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_AESTHETICS:\n",
|
||||
" nohup_filename = \"outs/\" + NAME + \"_aes.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
|
||||
" '--bs 1024 --num_workers 16 > ' + \\\n",
|
||||
" nohup_filename + ' 2>&1 &'\n",
|
||||
" output = client.exec_command(command)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1827,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_COLLATE_AESTHETICS:\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" 'nohup /home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_part*.csv ' + \\\n",
|
||||
" '--output ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes.csv '\n",
|
||||
" output = client.exec_command(command)\n",
|
||||
" # this takes priority! delete all meta_remove_corrupted_aes_part*.csv\n",
|
||||
" # output = client.exec_command(\"rm \" + PATH + \"meta_remove_corrupted_aes_part*.csv\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run optical flow\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1828,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_OPTICAL_FLOW:\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" 'nohup /home/zhaowangbo/.conda/envs/llava2/bin/torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes.csv > ' + \\\n",
|
||||
" \"outs/\" + NAME + \"_flow.out 2>&1 &\"\n",
|
||||
" print(command)\n",
|
||||
" output = client.exec_command(command)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run aesthetics and optical flow\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1829,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_AES_FLOW:\n",
|
||||
" out_filename = \"outs/\" + NAME + \"_aes_flow.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
|
||||
" '--bs 1024 --num_workers 16 > ' + \\\n",
|
||||
" out_filename + ' 2>&1 && ' + \\\n",
|
||||
" 'echo \"[DONE]: aesthetic\" >> ' + out_filename + ' && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_part*.csv ' + \\\n",
|
||||
" '--output ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes.csv ' + ' >> ' + out_filename + ' 2>&1 && ' + \\\n",
|
||||
" 'echo \"[DONE]: collate aesthetic\" >> ' + out_filename + ' && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/llava2/bin/torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes.csv >> ' + \\\n",
|
||||
" out_filename + \" 2>&1 && \" + \\\n",
|
||||
" 'echo \"[DONE]: flow\" >> ' + out_filename\n",
|
||||
" # CONTINUE \n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Run captioning and collate\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1830,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_CAPTIONING:\n",
|
||||
" out_filename = \"outs/\" + NAME + \"_caption.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/llava2/bin/torchrun --nproc_per_node 8 ' + \\\n",
|
||||
" '--standalone ' + \\\n",
|
||||
" '-m tools.caption.caption_llava ' + \\\n",
|
||||
" PATH + captioning_input_file + ' ' + \\\n",
|
||||
" '--tp-size 2 ' + \\\n",
|
||||
" '--dp-size 4 ' + \\\n",
|
||||
" '--model-path liuhaotian/llava-v1.6-mistral-7b ' + \\\n",
|
||||
" '--bs 16 ' + \\\n",
|
||||
" '--prompt video > ' + \\\n",
|
||||
" out_filename + \" 2>&1 && \" + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_flow_caption_part*.csv ' + \\\n",
|
||||
" '--output ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_flow_caption.csv ' + ' >> ' + out_filename + ' 2>&1 '\n",
|
||||
" print(command)\n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1831,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# cd /home/tom/Open-Sora-dev/ && /home/zhaowangbo/.conda/envs/llava2/bin/torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow.csv --tp-size 2 --dp-size 4 --model-path liuhaotian/llava-v1.6-mistral-7b --bs 16 --prompt video > outs/split-18_caption.out 2>&1 && /home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow_caption_part*.csv --output /mnt/hdd/data/v2text/raw/meta/split-18/meta_remove_corrupted_aes_flow_caption.csv >> outs/split-18_caption.out 2>&1 && "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1832,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_COLLATE_CAPTIONING_INTERRUPTED:\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_flow_remaining_caption_part*.csv ' + \\\n",
|
||||
" '--output ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_flow_remaining_caption.csv '\n",
|
||||
" output = client.exec_command(command)\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/llava2/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_flow_remaining_caption.csv ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_flow_caption_partial.csv ' + \\\n",
|
||||
" '--output ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted_aes_flow_caption.csv '\n",
|
||||
" output = client.exec_command(command)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Clean and remove"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1833,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_CLEAN_REMOVE_LLAVA:\n",
|
||||
" out_filename = \"outs/\" + NAME + \"_clean_remove_llava.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + clean_remove_input_path + ' ' + \\\n",
|
||||
" '--clean-caption --remove-caption-prefix --remove-empty-caption ' + \\\n",
|
||||
" '--output ' + \\\n",
|
||||
" PATH + clean_remove_output_path + ' > ' + out_filename + ' 2>&1 && ' + \\\n",
|
||||
" 'echo \"[DONE]: RUN_CLEAN_REMOVE_LLAVA\" >> ' + out_filename\n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1834,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if RUN_CLEAN_REMOVE:\n",
|
||||
" out_filename = \"outs/\" + NAME + \"_clean_remove.out\"\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + clean_remove_input_path + ' ' + \\\n",
|
||||
" '--clean-caption --remove-caption-prefix --remove-empty-caption ' + \\\n",
|
||||
" '--output ' + \\\n",
|
||||
" PATH + clean_remove_output_path + ' > ' + out_filename + ' 2>&1 && ' + \\\n",
|
||||
" 'echo \"[DONE]: collate RUN_CLEAN_REMOVE\" >> ' + out_filename\n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Intersect"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1835,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# python -m tools.datasets.csvutil ~/dataset/HD-VG-130M/meta_remove_corrupted_aes.csv --intersect ~/dataset/HD-VG-130M/meta_remove_corrupted_flow.csv --output ~/dataset/HD-VG-130M/meta_remove_corrupted_aes_flow.csv\n",
|
||||
"if RUN_INTERSECT_INFO:\n",
|
||||
" command = \\\n",
|
||||
" 'cd /home/tom/Open-Sora-dev/ && ' + \\\n",
|
||||
" '/home/zhaowangbo/.conda/envs/opensora/bin/python -m tools.datasets.csvutil ' + \\\n",
|
||||
" PATH + 'meta_remove_corrupted.csv ' + \\\n",
|
||||
" '--intersect ' + PATH + 'meta_remove_corrupted_aes_flow_caption.csv ' + \\\n",
|
||||
" '--output ' + PATH + 'meta_remove_corrupted_aes_flow_caption.csv '\n",
|
||||
" output = client.exec_command(command)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1836,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# # # remove empty captions and process captions (may need to re-caption lost ones)\n",
|
||||
"\n",
|
||||
"# # --remove-caption-prefix: llava has a prefix, remove it\n",
|
||||
"\n",
|
||||
"# # --clean-caption makes it T5 friendly\n",
|
||||
"\n",
|
||||
"# python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv\n",
|
||||
"\n",
|
||||
"# python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv\n",
|
||||
"\n",
|
||||
"# # # 4. Sanity check & prepare for training\n",
|
||||
"# # # sanity check\n",
|
||||
"# # python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --info --output ~/dataset/meta_ready.csv\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1837,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# command1 = 'conda activate /home/zhaowangbo/.conda/envs/llava2'\n",
|
||||
"# command2 = ' cd ~/Open-Sora-dev'\n",
|
||||
"# command3 = 'nohup torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava /mnt/hdd/data/unsplash-full/resize_4k/meta/meta_machine_1.csv --tp-size 2 --dp-size 4 --bs 8 > llava_unsplash-full_machine_1.out &'\n",
|
||||
"# stdin, stdout, stderr = client.exec_command(command1)\n",
|
||||
" # stdin, stdout, stderr = client.exec_command(command2)\n",
|
||||
" # Execute the command\n",
|
||||
" # stdin, stdout, stderr = client.exec_command(command3)\n",
|
||||
" # output = stdout.read().decode()\n",
|
||||
" # print(output)\n",
|
||||
" # error = stderr.read().decode()\n",
|
||||
" # print(error)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
|
@ -153,6 +153,3 @@ class VariableVideoTextDataset(VideoTextDataset):
|
|||
# TCHW -> CTHW
|
||||
video = video.permute(1, 0, 2, 3)
|
||||
return {"video": video, "text": text, "num_frames": num_frames, "height": height, "width": width, "ar": ar}
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self.getitem(index)
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ Please note that you should add the `--flash-attention` flag when running with L
|
|||
After running the script with `dp-size=N`, you will get `N` partial CSV files. Run the following command to merge them:
|
||||
|
||||
```bash
|
||||
python -m tools.datasets.csvutil DATA_caption_part*.csv --output DATA_caption.csv
|
||||
python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
|
||||
```
|
||||
|
||||
### Resume
|
||||
|
|
@ -75,10 +75,10 @@ Sometimes the process may be interrupted. We can resume the process by running t
|
|||
|
||||
```bash
|
||||
# merge generated results
|
||||
python -m tools.datasets.csvutil DATA_caption_part*.csv --output DATA_caption.csv
|
||||
python -m tools.datasets.datautil DATA_caption_part*.csv --output DATA_caption.csv
|
||||
|
||||
# get the remaining videos
|
||||
python -m tools.datasets.csvutil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv
|
||||
python -m tools.datasets.datautil DATA.csv --difference DATA_caption.csv --output DATA_remaining.csv
|
||||
```
|
||||
|
||||
Then use the output csv file to resume the process.
|
||||
|
|
|
|||
|
|
@ -119,6 +119,7 @@ def main(args):
|
|||
data = pd.read_csv(args.input)
|
||||
data["cmotion"] = apply(data["path"], process)
|
||||
data.to_csv(output_file, index=False)
|
||||
print(f"Output saved to {output_file}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -72,11 +72,20 @@ def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
|
|||
return frames_pil, total_frames
|
||||
|
||||
|
||||
def read_file(input_path):
    """Load a metadata table from a CSV or Parquet file.

    The format is chosen from the file extension. The extension is compared
    case-insensitively, so ``META.CSV`` is treated like ``meta.csv``, and
    ``os.PathLike`` objects are accepted in addition to plain strings.

    Args:
        input_path: Path of the file to read (``str`` or ``os.PathLike``).

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv`` or
        ``pd.read_parquet``.

    Raises:
        NotImplementedError: If the extension is neither ``.csv`` nor
            ``.parquet``.
    """
    # Local import: the rest of the file is not visible here, so do not rely
    # on a module-level ``import os``.
    import os

    # Normalise once so the suffix checks work for Path objects and for
    # upper/mixed-case extensions; the original value is still what gets read.
    lowered = os.fspath(input_path).lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(input_path)
    if lowered.endswith(".parquet"):
        return pd.read_parquet(input_path)
    raise NotImplementedError(f"Unsupported file format: {input_path}")
|
||||
|
||||
|
||||
class VideoTextDataset(torch.utils.data.Dataset):
|
||||
def __init__(self, csv_path, transform=None, num_frames=3, get_text_input_ids=None, resize=None):
|
||||
self.csv_path = csv_path
|
||||
self.transform = transform
|
||||
self.data = pd.read_csv(csv_path)
|
||||
self.data = read_file(csv_path)
|
||||
self.points = NUM_FRAMES_POINTS[num_frames]
|
||||
self.get_text_input_ids = get_text_input_ids
|
||||
self.use_text = False
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
- [Frame extraction](#frame-extraction)
|
||||
- [Crop Midjourney 4 grid](#crop-midjourney-4-grid)
|
||||
- [Analyze datasets](#analyze-datasets)
|
||||
- [Data Process Pipeline](#data-process-pipeline)
|
||||
|
||||
After preparing the raw dataset according to the [instructions](/docs/datasets.md), you can use the following commands to manage the dataset.
|
||||
|
||||
|
|
@ -74,7 +75,7 @@ python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_
|
|||
|
||||
## Manage datasets
|
||||
|
||||
Use `csvutil` to manage the dataset.
|
||||
Use `datautil` to manage the dataset.
|
||||
|
||||
### Requirement
|
||||
|
||||
|
|
@ -88,6 +89,10 @@ To get image and video information, you need to install [opencv-python](https://
|
|||
|
||||
```bash
|
||||
pip install opencv-python
|
||||
# If your videos are in av1 codec instead of h264, you need to
|
||||
# - install ffmpeg first
|
||||
# - install via conda to support av1 codec
|
||||
conda install -c conda-forge opencv
|
||||
```
|
||||
|
||||
Or to get video information, you can install ffmpeg and ffmpeg-python:
|
||||
|
|
@ -107,30 +112,27 @@ pip install lingua-language-detector
|
|||
You can use the following commands to process the `csv` or `parquet` files. The output file will be saved in the same directory as the input, with a suffix indicating each operation that was applied.
|
||||
|
||||
```bash
|
||||
# csvutil takes multiple CSV files as input and merge them into one CSV file
|
||||
# datautil takes multiple CSV files as input and merges them into one CSV file
|
||||
# output: DATA1+DATA2.csv
|
||||
python -m tools.datasets.csvutil DATA1.csv DATA2.csv
|
||||
python -m tools.datasets.datautil DATA1.csv DATA2.csv
|
||||
|
||||
# shard CSV files into multiple CSV files
|
||||
# output: DATA1_0.csv, DATA1_1.csv, ...
|
||||
python -m tools.datasets.csvutil DATA1.csv --shard 10
|
||||
python -m tools.datasets.datautil DATA1.csv --shard 10
|
||||
|
||||
# filter frames between 128 and 256, with captions
|
||||
# output: DATA_fmin_128_fmax_256.csv
|
||||
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256
|
||||
python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256
|
||||
|
||||
# Disable parallel processing
|
||||
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256 --disable-parallel
|
||||
|
||||
# Remove corrupted video from the csv
|
||||
python -m tools.datasets.csvutil DATA.csv --remove-corrupted
|
||||
python -m tools.datasets.datautil DATA.csv --fmin 128 --fmax 256 --disable-parallel
|
||||
|
||||
# Compute num_frames, height, width, fps, aspect_ratio for videos or images
|
||||
# output: IMG_DATA+VID_DATA_info.csv
|
||||
python -m tools.datasets.csvutil IMG_DATA.csv VID_DATA.csv --video-info
|
||||
python -m tools.datasets.datautil IMG_DATA.csv VID_DATA.csv --info
|
||||
|
||||
# You can run multiple operations at the same time.
|
||||
python -m tools.datasets.csvutil DATA.csv --video-info --remove-empty-caption --remove-url --lang en
|
||||
python -m tools.datasets.datautil DATA.csv --info --remove-empty-caption --remove-url --lang en
|
||||
```
|
||||
|
||||
### Score filtering
|
||||
|
|
@ -140,7 +142,7 @@ To examine and filter the quality of the dataset by aesthetic score and clip sco
|
|||
```bash
|
||||
# sort the dataset by aesthetic score
|
||||
# output: DATA_sort.csv
|
||||
python -m tools.datasets.csvutil DATA.csv --sort aesthetic_score
|
||||
python -m tools.datasets.datautil DATA.csv --sort aesthetic_score
|
||||
# View examples of high aesthetic score
|
||||
head -n 10 DATA_sort.csv
|
||||
# View examples of low aesthetic score
|
||||
|
|
@ -148,19 +150,19 @@ tail -n 10 DATA_sort.csv
|
|||
|
||||
# sort the dataset by clip score
|
||||
# output: DATA_sort.csv
|
||||
python -m tools.datasets.csvutil DATA.csv --sort clip_score
|
||||
python -m tools.datasets.datautil DATA.csv --sort clip_score
|
||||
|
||||
# filter the dataset by aesthetic score
|
||||
# output: DATA_aesmin_0.5.csv
|
||||
python -m tools.datasets.csvutil DATA.csv --aesmin 0.5
|
||||
python -m tools.datasets.datautil DATA.csv --aesmin 0.5
|
||||
# filter the dataset by clip score
|
||||
# output: DATA_matchmin_0.5.csv
|
||||
python -m tools.datasets.csvutil DATA.csv --matchmin 0.5
|
||||
python -m tools.datasets.datautil DATA.csv --matchmin 0.5
|
||||
```
|
||||
|
||||
### Documentation
|
||||
|
||||
You can also use `python -m tools.datasets.csvutil --help` to see usage.
|
||||
You can also use `python -m tools.datasets.datautil --help` to see usage.
|
||||
|
||||
| Args | File suffix | Description |
|
||||
| --------------------------- | -------------- | ------------------------------------------------------------- |
|
||||
|
|
@ -174,9 +176,7 @@ You can also use `python -m tools.datasets.csvutil --help` to see usage.
|
|||
| `--difference DATA.csv` | | Remove the paths in DATA.csv from the dataset |
|
||||
| `--intersection DATA.csv` | | Keep the paths in DATA.csv from the dataset and merge columns |
|
||||
| `--info` | `_info` | Get the basic information of each video and image (cv2) |
|
||||
| `--video-info` | `_vinfo` | Get the basic information of each video (ffmpeg) |
|
||||
| `--ext` | `_ext` | Remove rows if the file does not exist |
|
||||
| `--remove-corrupted` | `_nocorrupted` | Remove the corrupted video and image |
|
||||
| `--relpath` | `_relpath` | Modify the path to relative path by root given |
|
||||
| `--abspath` | `_abspath` | Modify the path to absolute path by root given |
|
||||
| `--remove-empty-caption` | `_noempty` | Remove rows with empty caption |
|
||||
|
|
@ -245,3 +245,37 @@ For the dataset provided in a `.csv` or `.parquet` file, you can easily analyze
|
|||
```python
|
||||
python -m tools.datasets.analyze DATA_info.csv
|
||||
```
|
||||
|
||||
## Data Process Pipeline
|
||||
|
||||
```bash
|
||||
# Suppose videos and images under ~/dataset/
|
||||
# 1. Convert dataset to CSV
|
||||
python -m tools.datasets.convert video ~/dataset --output meta.csv
|
||||
|
||||
# 2. Get video information
|
||||
python -m tools.datasets.datautil meta.csv --info --fmin 1
|
||||
|
||||
# 3. Get caption
|
||||
# 3.1. generate caption
|
||||
torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava meta_info_fmin1.csv --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video
|
||||
# merge generated results
|
||||
python -m tools.datasets.datautil meta_info_fmin1_caption_part*.csv --output meta_caption.csv
|
||||
# merge caption and info
|
||||
python -m tools.datasets.datautil meta_info_fmin1.csv --intersection meta_caption.csv --output meta_caption_info.csv
|
||||
# clean caption
|
||||
python -m tools.datasets.datautil meta_caption_info.csv --clean-caption --refine-llm-caption --remove-empty-caption --output meta_caption_processed.csv
|
||||
# 3.2. extract caption
|
||||
python -m tools.datasets.datautil meta_info_fmin1.csv --load-caption json --remove-empty-caption --clean-caption
|
||||
|
||||
# 4. Scoring
|
||||
# aesthetic scoring
|
||||
torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta_caption_processed.csv
|
||||
python -m tools.datasets.datautil meta_caption_processed_part*.csv --output meta_caption_processed_aes.csv
|
||||
# optical flow scoring
|
||||
torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference_parallel meta_caption_processed.csv
|
||||
# matching scoring
|
||||
torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference_parallel meta_caption_processed.csv
|
||||
# camera motion
|
||||
python -m tools.caption.camera_motion_detect meta_caption_processed.csv
|
||||
```
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import re
|
|||
from functools import partial
|
||||
from glob import glob
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
|
@ -47,8 +48,6 @@ def apply(df, func, **kwargs):
|
|||
|
||||
|
||||
def get_info(path):
|
||||
import cv2
|
||||
|
||||
try:
|
||||
ext = os.path.splitext(path)[1].lower()
|
||||
if ext in IMG_EXTENSIONS:
|
||||
|
|
@ -72,42 +71,6 @@ def get_info(path):
|
|||
return 0, 0, 0, np.nan, np.nan, np.nan
|
||||
|
||||
|
||||
# ======================================================
|
||||
# --video-info
|
||||
# ======================================================
|
||||
|
||||
|
||||
def get_video_info(path):
|
||||
import ffmpeg
|
||||
|
||||
try:
|
||||
info = ffmpeg.probe(path)["streams"][0]
|
||||
height = int(info["height"])
|
||||
width = int(info["width"])
|
||||
num_frames = int(info["nb_frames"])
|
||||
aspect_ratio = height / width
|
||||
hw = height * width
|
||||
fps = np.nan
|
||||
return num_frames, height, width, aspect_ratio, fps, hw
|
||||
except:
|
||||
return 0, 0, 0, np.nan, np.nan, np.nan
|
||||
|
||||
|
||||
# ======================================================
|
||||
# --remove-corrupted
|
||||
# ======================================================
|
||||
|
||||
|
||||
def is_video_valid(path):
|
||||
import decord
|
||||
|
||||
try:
|
||||
decord.VideoReader(path, num_threads=1)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
|
||||
# ======================================================
|
||||
# --refine-llm-caption
|
||||
# ======================================================
|
||||
|
|
@ -357,12 +320,15 @@ def text_preprocessing(text, use_text_preprocessing: bool = True):
|
|||
|
||||
|
||||
def load_caption(path, ext):
|
||||
assert ext in ["json"]
|
||||
json_path = path.split(".")[0] + ".json"
|
||||
with open(json_path, "r") as f:
|
||||
data = json.load(f)
|
||||
caption = data["caption"]
|
||||
return caption
|
||||
try:
|
||||
assert ext in ["json"]
|
||||
json_path = path.split(".")[0] + ".json"
|
||||
with open(json_path, "r") as f:
|
||||
data = json.load(f)
|
||||
caption = data["caption"]
|
||||
return caption
|
||||
except:
|
||||
return ""
|
||||
|
||||
|
||||
# ======================================================
|
||||
|
|
@ -396,6 +362,7 @@ def read_data(input_paths):
|
|||
input_list.extend(glob(input_path))
|
||||
print("Input files:", input_list)
|
||||
for i, input_path in enumerate(input_list):
|
||||
assert os.path.exists(input_path)
|
||||
data.append(read_file(input_path))
|
||||
input_name += os.path.basename(input_path).split(".")[0]
|
||||
if i != len(input_list) - 1:
|
||||
|
|
@ -444,10 +411,25 @@ def main(args):
|
|||
|
||||
tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")
|
||||
|
||||
# filtering
|
||||
# IO-related
|
||||
if args.load_caption is not None:
|
||||
assert "path" in data.columns
|
||||
data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
|
||||
if args.info:
|
||||
info = apply(data["path"], get_info)
|
||||
(
|
||||
data["num_frames"],
|
||||
data["height"],
|
||||
data["width"],
|
||||
data["aspect_ratio"],
|
||||
data["fps"],
|
||||
data["resolution"],
|
||||
) = zip(*info)
|
||||
if args.ext:
|
||||
assert "path" in data.columns
|
||||
data = data[apply(data["path"], os.path.exists)]
|
||||
|
||||
# filtering
|
||||
if args.remove_url:
|
||||
assert "text" in data.columns
|
||||
data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
|
||||
|
|
@ -461,9 +443,6 @@ def main(args):
|
|||
if args.remove_path_duplication:
|
||||
assert "path" in data.columns
|
||||
data = data.drop_duplicates(subset=["path"])
|
||||
if args.remove_corrupted:
|
||||
assert "path" in data.columns
|
||||
data = data[apply(data["path"], is_video_valid)]
|
||||
|
||||
# processing
|
||||
if args.relpath is not None:
|
||||
|
|
@ -487,30 +466,6 @@ def main(args):
|
|||
if args.count_num_token is not None:
|
||||
assert "text" in data.columns
|
||||
data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
|
||||
if args.info:
|
||||
info = apply(data["path"], get_info)
|
||||
(
|
||||
data["num_frames"],
|
||||
data["height"],
|
||||
data["width"],
|
||||
data["aspect_ratio"],
|
||||
data["fps"],
|
||||
data["resolution"],
|
||||
) = zip(*info)
|
||||
if args.video_info:
|
||||
assert "path" in data.columns
|
||||
info = apply(data["path"], get_video_info)
|
||||
(
|
||||
data["num_frames"],
|
||||
data["height"],
|
||||
data["width"],
|
||||
data["aspect_ratio"],
|
||||
data["fps"],
|
||||
data["resolution"],
|
||||
) = zip(*info)
|
||||
if args.load_caption is not None:
|
||||
assert "path" in data.columns
|
||||
data["text"] = apply(data["path"], load_caption, ext=args.load_caption)
|
||||
|
||||
# sort
|
||||
if args.sort is not None:
|
||||
|
|
@ -577,9 +532,10 @@ def parse_args():
|
|||
|
||||
# IO-related
|
||||
parser.add_argument("--info", action="store_true", help="get the basic information of each video and image")
|
||||
parser.add_argument("--video-info", action="store_true", help="get the basic information of each video")
|
||||
parser.add_argument("--ext", action="store_true", help="check if the file exists")
|
||||
parser.add_argument("--remove-corrupted", action="store_true", help="remove the corrupted video and image")
|
||||
parser.add_argument(
|
||||
"--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt"
|
||||
)
|
||||
|
||||
# path processing
|
||||
parser.add_argument("--relpath", type=str, default=None, help="modify the path to relative path by root given")
|
||||
|
|
@ -606,9 +562,6 @@ def parse_args():
|
|||
parser.add_argument(
|
||||
"--count-num-token", type=str, choices=["t5"], default=None, help="Count the number of tokens in the caption"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load-caption", type=str, default=None, choices=["json", "txt"], help="load the caption from json or txt"
|
||||
)
|
||||
|
||||
# score filtering
|
||||
parser.add_argument("--fmin", type=int, default=None, help="filter the dataset by minimum number of frames")
|
||||
|
|
@ -636,14 +589,13 @@ def get_output_path(args, input_name):
|
|||
name += "_sort"
|
||||
|
||||
# IO-related
|
||||
# for IO-related, the function must be wrapped in try-except
|
||||
if args.info:
|
||||
name += "_info"
|
||||
if args.video_info:
|
||||
name += "_vinfo"
|
||||
if args.ext:
|
||||
name += "_ext"
|
||||
if args.remove_corrupted:
|
||||
name += "_nocorrupted"
|
||||
if args.load_caption:
|
||||
name += f"_load{args.load_caption}"
|
||||
|
||||
# path processing
|
||||
if args.relpath is not None:
|
||||
|
|
@ -674,8 +626,6 @@ def get_output_path(args, input_name):
|
|||
name += "_cmcaption"
|
||||
if args.count_num_token:
|
||||
name += "_ntoken"
|
||||
if args.load_caption:
|
||||
name += f"_load{args.load_caption}"
|
||||
|
||||
# score filtering
|
||||
if args.fmin is not None:
|
||||
|
|
@ -1,26 +1,35 @@
|
|||
## Scene Detection and Video Splitting
|
||||
|
||||
### Formatting
|
||||
Input meta should be `{prefix}.csv` with column `'videoId'`
|
||||
```
|
||||
python tools/scene_cut/process_meta.py --task append_format --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6.csv --split popular_6
|
||||
```
|
||||
Output is `{prefix}_format.csv` (with column `path`) and `{prefix}_intact.csv` (with column `intact` and `path`)
|
||||
|
||||
### Scene Detection
|
||||
Input meta should be `{prefix}_format.csv`
|
||||
```
|
||||
python tools/scene_cut/scene_detect.py --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format.csv
|
||||
```
|
||||
Output is `{prefix}_format_timestamp.csv`
|
||||
|
||||
### Video Splitting
|
||||
Input meta should be `{prefix}_timestamp.csv`
|
||||
```
|
||||
python tools/scene_cut/main_cut_pandarallel.py \
|
||||
--meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format_timestamp.csv \
|
||||
--out_dir /mnt/hdd/data/pexels_new/scene_cut/data/popular_6
|
||||
```
|
||||
Output is `{out_dir}/{wo_ext}_scene-{sid}.mp4`
|
||||
|
||||
TODO: meta for video clips
|
||||
## Scene Detection and Video Splitting
|
||||
|
||||
### Formatting
|
||||
|
||||
Input meta should be `{prefix}.csv` with column `'videoId'`
|
||||
|
||||
```bash
|
||||
python tools/scene_cut/process_meta.py --task append_format --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6.csv --split popular_6
|
||||
```
|
||||
|
||||
Output is `{prefix}_format.csv` (with column `path`) and `{prefix}_intact.csv` (with column `intact` and `path`)
|
||||
|
||||
### Scene Detection
|
||||
|
||||
Input meta should be `{prefix}_format.csv`
|
||||
|
||||
```bash
|
||||
python tools/scene_cut/scene_detect.py --meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format.csv
|
||||
```
|
||||
|
||||
Output is `{prefix}_format_timestamp.csv`
|
||||
|
||||
### Video Splitting
|
||||
|
||||
Input meta should be `{prefix}_format_timestamp.csv`
|
||||
|
||||
```bash
|
||||
python tools/scene_cut/main_cut_pandarallel.py \
|
||||
--meta_path /mnt/hdd/data/pexels_new/raw/meta/popular_6_format_timestamp.csv \
|
||||
--out_dir /mnt/hdd/data/pexels_new/scene_cut/data/popular_6
|
||||
```
|
||||
|
||||
Output is `{out_dir}/{wo_ext}_scene-{sid}.mp4`
|
||||
|
||||
TODO: meta for video clips
|
||||
|
|
|
|||
|
|
@ -166,7 +166,7 @@ def main(args):
|
|||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("input", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("--bs", type=int, default=512, help="Batch size")
|
||||
parser.add_argument("--bs", type=int, default=1024, help="Batch size")
|
||||
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
|
||||
parser.add_argument("--accumulate", type=int, default=1, help="batch to accumulate")
|
||||
parser.add_argument("--prefetch_factor", type=int, default=2, help="Prefetch factor")
|
||||
|
|
|
|||
|
|
@ -8,10 +8,6 @@ import pandas as pd
|
|||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torchvision.datasets.folder import pil_loader
|
||||
<<<<<<< HEAD
|
||||
|
||||
=======
|
||||
>>>>>>> dev/v1.0.1
|
||||
from tqdm import tqdm
|
||||
|
||||
IMG_EXTENSIONS = (
|
||||
|
|
@ -48,9 +44,7 @@ def extract_frames(video_path, points=[0.5]):
|
|||
frames = []
|
||||
for point in points:
|
||||
target_frame = total_frames * point
|
||||
target_timestamp = int(
|
||||
(target_frame * av.time_base) / container.streams.video[0].average_rate
|
||||
)
|
||||
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
|
||||
container.seek(target_timestamp)
|
||||
frame = next(container.decode(video=0)).to_image()
|
||||
frames.append(frame)
|
||||
|
|
@ -65,7 +59,7 @@ class VideoTextDataset(torch.utils.data.Dataset):
|
|||
|
||||
def __getitem__(self, index):
|
||||
row = self.meta.iloc[index]
|
||||
path = row['path']
|
||||
path = row["path"]
|
||||
|
||||
if is_video(path):
|
||||
img = extract_frames(path, points=[0.5])[0]
|
||||
|
|
|
|||
Loading…
Reference in a new issue