Open-Sora/notebooks/launch.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Process Pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data Process Commands"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# TODO: change to your own project path!!!\n",
"OPEN_SORA_HOME = \"/path/to/Open-Sora/\"\n",
"\n",
"\n",
"def convert_dataset_cmd(input_dir, output_file, datatype=\"video\"):\n",
" commands = []\n",
" commands.append(f'echo \"Converting {input_dir} to {output_file}\"')\n",
" output_dir = os.path.dirname(output_file)\n",
"\n",
" commands.append(f\"mkdir -p {output_dir}\")\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(f\"python -m tools.datasets.convert {datatype} {input_dir} --output {output_file}\")\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_video_info(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_info{ext}\"\n",
" output_format = ext[1:]\n",
"\n",
" commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(\n",
" f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --info --fmin 1\"\n",
" )\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_video_info_torchvision(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_info{ext}\"\n",
" output_format = ext[1:]\n",
"\n",
" commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(\n",
" f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --video-info --fmin 1\"\n",
" )\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_caption_llava7b_video(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_caption{ext}\"\n",
" output_format = ext[1:]\n",
"\n",
" commands.append(f'echo \"Getting info of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(f\"conda activate llava2\")\n",
" commands.append(\n",
" f\"torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava {input_file} --dp-size 8 --tp-size 1 --model-path liuhaotian/llava-v1.6-mistral-7b --prompt video\"\n",
" )\n",
" commands.append(f\"conda activate opensora\")\n",
" commands.append(\n",
" f\"python -m tools.datasets.datautil {base}_caption_part*{ext} --output {output_file} --format {output_format} --intersection {input_file} --clean-caption --refine-llm-caption --remove-empty-caption\"\n",
" )\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_caption_load(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_caption{ext}\"\n",
" output_format = ext[1:]\n",
"\n",
" commands.append(f'echo \"Getting caption of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(\n",
" f\"python -m tools.datasets.datautil {input_file} --output {output_file} --format {output_format} --load-caption json --remove-empty-caption --clean-caption\"\n",
" )\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_aesthetic_score(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_aes{ext}\"\n",
" output_format = ext[1:]\n",
"\n",
" commands.append(f'echo \"Getting aesthetic score of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.aesthetic.inference {input_file}\")\n",
" commands.append(\n",
" f\"python -m tools.datasets.datautil {base}_aes_part*{ext} --output {output_file} --format {output_format} --sort aes\"\n",
" )\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_flow_score(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_flow{ext}\"\n",
"\n",
" commands.append(f'echo \"Getting flow score of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.optical_flow.inference {input_file}\")\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_ocr(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_match{ext}\"\n",
"\n",
" commands.append(f'echo \"Getting match score of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.ocr.inference {input_file}\")\n",
" return \" && \".join(commands), output_file\n",
"\n",
" \n",
"def get_match_score(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_match{ext}\"\n",
"\n",
" commands.append(f'echo \"Getting match score of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(f\"torchrun --standalone --nproc_per_node 8 -m tools.scoring.matching.inference {input_file}\")\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_cmotion_score(input_file):\n",
" commands = []\n",
" base, ext = os.path.splitext(input_file)\n",
" output_file = f\"{base}_cmotion{ext}\"\n",
"\n",
" commands.append(f'echo \"Getting cmotion score of {input_file} to {output_file}\"')\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append(f\"python -m tools.caption.camera_motion_detect {input_file}\")\n",
" return \" && \".join(commands), output_file\n",
"\n",
"\n",
"def get_commands(job_list):\n",
" commands = []\n",
" output_file = None\n",
" for job in job_list:\n",
" cmd = job.pop(\"cmd\")\n",
" if output_file is None:\n",
" command, output_file = cmd(**job)\n",
" commands.append(command)\n",
" else:\n",
" job[\"input_file\"] = output_file\n",
" command, output_file = cmd(**job)\n",
" commands.append(command)\n",
" commands.append(f'echo \"All Done!\"')\n",
" return \" && \".join(commands), output_file"
]
},
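{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of chaining these helpers with `get_commands`. The paths and the particular job mix below are placeholders; each later step consumes the previous step's output file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical pipeline: build a meta CSV, attach basic video info, then aesthetic scores\n",
"job_list = [\n",
"    {\"cmd\": convert_dataset_cmd, \"input_dir\": \"/path/to/videos\", \"output_file\": \"/path/to/meta/meta.csv\"},\n",
"    {\"cmd\": get_video_info},  # input_file is filled in from the previous step\n",
"    {\"cmd\": get_aesthetic_score},\n",
"]\n",
"cmd, output_file = get_commands(job_list)\n",
"print(cmd)"
]
},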
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Remote Launch via Paramiko\n",
"\n",
"First, add hosts to `~/.ssh/config`"
]
},
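{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"Host host-0\n",
"    HostName 10.0.0.10\n",
"    User user\n",
"    Port 22\n",
"    IdentityFile ~/.ssh/id_rsa\n",
"```\n",
"\n",
"The address, user, and key path above are placeholders. `get_ssh_config` below reads the `HostName`, `User`, `Port`, and `IdentityFile` fields, so each entry should set all four."
]
},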
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import paramiko\n",
"\n",
"HOSTS = [\"host-0\", \"host-1\", \"host-2\", \"host-3\", \"host-4\", \"host-5\", \"host-6\", \"host-7\"]\n",
"\n",
"# load from ~/.ssh/config\n",
"ssh_config = paramiko.SSHConfig()\n",
"user_config_file = os.path.expanduser(\"~/.ssh/config\")\n",
"if os.path.exists(user_config_file):\n",
" with open(user_config_file) as f:\n",
" ssh_config.parse(f)\n",
"\n",
"\n",
"def get_ssh_config(hostname):\n",
" # get the configuration for the host\n",
" user_config = ssh_config.lookup(hostname)\n",
" cfg = {\n",
" \"hostname\": user_config[\"hostname\"],\n",
" \"username\": user_config[\"user\"],\n",
" \"port\": int(user_config[\"port\"]),\n",
" \"key_filename\": user_config[\"identityfile\"],\n",
" }\n",
" return cfg\n",
"\n",
"\n",
"def connect(hostname):\n",
" cfg = get_ssh_config(hostname)\n",
" # connect\n",
" client = paramiko.SSHClient()\n",
" client.set_missing_host_key_policy(paramiko.AutoAddPolicy())\n",
" client.connect(**cfg)\n",
" return client\n",
"\n",
"\n",
"def run_command(command, hostname, nohup=False, log_file=None, sleep=None):\n",
" client = connect(hostname)\n",
" print(\"HOST:\", hostname)\n",
" if sleep:\n",
" command = f\"sleep {sleep}; {command}\"\n",
" command = f\"bash -ic '{command}'\"\n",
" if log_file:\n",
" command = f\"{command} >> {log_file} 2>&1\"\n",
" if nohup:\n",
" command = f\"nohup {command} &\"\n",
" print(\"COMMAND:\", command)\n",
" stdin, stdout, stderr = client.exec_command(command, get_pty=False)\n",
"\n",
" stdout_str = stdout.read().decode()\n",
" stderr_str = stderr.read().decode()\n",
" if stdout_str:\n",
" print(\"==== STDOUT ====\\n\", stdout_str)\n",
" if stderr_str:\n",
" print(\"==== STDERR ====\\n\", stderr_str)\n",
"\n",
" client.close()\n",
"\n",
"\n",
"def run_command_all_hosts(command, hosts=HOSTS):\n",
" for hostname in hosts:\n",
" run_command(command, hostname)"
]
},
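{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick smoke test of the helpers above (assuming `host-0` resolves through your SSH config):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_command(\"hostname && nvidia-smi -L\", \"host-0\")"
]
},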
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here are tools to examine machine's status."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def nvidia_smi(host):\n",
" if host:\n",
" run_command(\"nvidia-smi\", host)\n",
" else:\n",
" run_command_all_hosts(\"nvidia-smi\")\n",
"\n",
"\n",
"def nvitop(host=None):\n",
" if host:\n",
" run_command(f\"/home/user/.local/bin/nvitop -1\", host)\n",
" else:\n",
" run_command_all_hosts(\"/home/user/.local/bin/nvitop -1\")\n",
"\n",
"\n",
"def ps(host=None, interest=\"python|sleep|torchrun|colossal\", all=True):\n",
" cmd = \"ps aux\" if all else \"ps ux\"\n",
" if host:\n",
" if interest is None:\n",
" run_command(f\"{cmd} | cat\", host)\n",
" else:\n",
" run_command(f'{cmd} | cat | grep --color=never -E \"{interest}\"', host)\n",
" else:\n",
" if interest is None:\n",
" run_command_all_hosts(f\"{cmd} | cat\")\n",
" else:\n",
" run_command_all_hosts(f'{cmd} | cat | grep --color=never -E \"{interest}\"')\n",
"\n",
"\n",
"def kill(pid, host):\n",
" run_command(f\"kill -KILL {pid}\", host)\n",
"\n",
"\n",
"def pkill(interest, host):\n",
" run_command(f'pkill -9 -f \"{interest}\"', host)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example\n",
"\n",
"Remote launch via paramiko."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sleep = None\n",
"run_command(cmd, host, log_file=log_file, nohup=True, sleep=sleep)\n",
"ps(host)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Using following commands to monitor the status of the jobs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ps()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nvitop(host)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"kill(, host)"
]
},
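{
"cell_type": "markdown",
"metadata": {},
"source": [
"When a job spawns many worker processes, killing by pattern with `pkill` is often easier than killing PIDs one by one. The pattern below is a placeholder; match it against the command lines shown by `ps`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pkill(\"tools.scoring\", host)"
]
},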
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def colossal_run(data_path, load_path=None):\n",
" commands = []\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" command = f\"colossalai run --nproc_per_node 8 --hostfile hostfile scripts/train.py configs/opensora-v1-1/train/video.py --wandb True --data-path {data_path}\"\n",
" if load_path:\n",
" command = f\"{command} --load-path {load_path}\"\n",
" commands.append(command)\n",
" cmd = \" && \".join(commands)\n",
" return cmd\n",
"\n",
"\n",
"def kill_all():\n",
" commands = []\n",
" commands.append(f\"cd {OPEN_SORA_HOME}\")\n",
" commands.append('cat hostfile | xargs -I \"{}\" ssh \"{}\" pkill -9 python')\n",
" cmd = \" && \".join(commands)\n",
" return cmd"
]
},
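{
"cell_type": "markdown",
"metadata": {},
"source": [
"Both `colossal_run` and `kill_all` expect a `hostfile` in `OPEN_SORA_HOME`: a plain text file listing one reachable hostname per line, e.g.\n",
"\n",
"```\n",
"host-0\n",
"host-1\n",
"```"
]
},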
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"host = \"host-0\"\n",
"log_file = os.path.join(OPEN_SORA_HOME, \"logs/train.log\")\n",
"data_path = \"/path/to/meta.csv\"\n",
"cmd = colossal_run(data_path)\n",
"print(cmd)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_command(cmd, host, log_file=log_file, nohup=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cmd = kill_all()\n",
"run_command(cmd, host)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}