From 5ade5e5984e375ea93ebfbea5bc6f63cb929053a Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Mon, 1 Apr 2024 16:45:23 +0800 Subject: [PATCH] [fix] use decord for tools --- README.md | 2 +- tools/aesthetic/README.md | 1 + tools/aesthetic/inference.py | 26 ++++++++++++-------------- tools/caption/README.md | 4 ++-- tools/caption/utils.py | 21 ++++++++++----------- tools/datasets/README.md | 12 ++++++++++++ 6 files changed, 38 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index fd1fa7e..85635c3 100644 --- a/README.md +++ b/README.md @@ -198,7 +198,7 @@ the following steps: 2. Split videos into clips. [[docs](/tools/scenedetect/README.md)] 3. Generate video captions. [[docs](/tools/caption/README.md)] -Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. +Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data. ```bash # Suppose files under ~/dataset/ diff --git a/tools/aesthetic/README.md b/tools/aesthetic/README.md index db7179c..9cdf2bc 100644 --- a/tools/aesthetic/README.md +++ b/tools/aesthetic/README.md @@ -11,6 +11,7 @@ For videos, we extract the first, last, and the middle frames for evaluation. 
Th ```bash # install clip pip install git+https://github.com/openai/CLIP.git +pip install decord # get pretrained model wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth diff --git a/tools/aesthetic/inference.py b/tools/aesthetic/inference.py index 3d0b73b..40ff78a 100644 --- a/tools/aesthetic/inference.py +++ b/tools/aesthetic/inference.py @@ -2,14 +2,15 @@ import argparse import os -import av import clip +import decord import numpy as np import pandas as pd import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange +from PIL import Image from torchvision.datasets.folder import pil_loader from tqdm import tqdm @@ -23,16 +24,13 @@ def is_video(filename): def extract_frames(video_path, points=(0.1, 0.5, 0.9)): - container = av.open(video_path) - total_frames = container.streams.video[0].frames - frames = [] - for point in points: - target_frame = total_frames * point - target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate) - container.seek(target_timestamp) - frame = next(container.decode(video=0)).to_image() - frames.append(frame) - return frames + container = decord.VideoReader(video_path, num_threads=1) + total_frames = len(container) + frame_inds = (np.array(points) * total_frames).astype(np.int32) + frame_inds[frame_inds >= total_frames] = total_frames - 1 + frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C] + frames_pil = [Image.fromarray(frame) for frame in frames] + return frames_pil class VideoTextDataset(torch.utils.data.Dataset): @@ -107,14 +105,14 @@ def main(args): model = torch.nn.DataParallel(model) # build dataset - dataset = VideoTextDataset(args.input, transform=preprocess, points=(0.5,)) + dataset = VideoTextDataset(args.input, transform=preprocess) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.bs, shuffle=False, - 
num_workers=args.num_workers, + num_workers=0, pin_memory=True, - prefetch_factor=args.prefetch_factor, + # prefetch_factor=args.prefetch_factor, ) # compute aesthetic scores diff --git a/tools/caption/README.md b/tools/caption/README.md index fd53303..df8932f 100644 --- a/tools/caption/README.md +++ b/tools/caption/README.md @@ -32,8 +32,8 @@ pip install -e . # install flash attention pip install flash-attn --no-build-isolation -# install colossalai and pyav -pip install colossalai pyav +# install colossalai and decord +pip install colossalai decord ``` Since only the 34B model's performance is comparable to GPT-4V, we only provide the usage of the 34B model. The 34B model is available [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b), or run our script and it will be downloaded automatically. diff --git a/tools/caption/utils.py b/tools/caption/utils.py index 44226d5..36b0053 100644 --- a/tools/caption/utils.py +++ b/tools/caption/utils.py @@ -1,10 +1,12 @@ import os import time -import av +import decord +import numpy as np import pandas as pd import torch import torchvision.transforms as transforms +from PIL import Image from torchvision.datasets.folder import pil_loader PROMPTS = { @@ -31,16 +33,13 @@ def is_video(filename): def extract_frames(video_path, points=(0.1, 0.5, 0.9)): - container = av.open(video_path) - total_frames = container.streams.video[0].frames - frames = [] - for point in points: - target_frame = total_frames * point - target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate) - container.seek(target_timestamp) - frame = next(container.decode(video=0)).to_image() - frames.append(frame) - return frames, total_frames + container = decord.VideoReader(video_path, num_threads=1) + total_frames = len(container) + frame_inds = (np.array(points) * total_frames).astype(np.int32) + frame_inds[frame_inds >= total_frames] = total_frames - 1 + frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, 
C] + frames_pil = [Image.fromarray(frame) for frame in frames] + return frames_pil, total_frames class VideoTextDataset(torch.utils.data.Dataset): diff --git a/tools/datasets/README.md b/tools/datasets/README.md index e0fc1dc..4c11c6b 100644 --- a/tools/datasets/README.md +++ b/tools/datasets/README.md @@ -160,3 +160,15 @@ python -m tools.datasets.csvutil DATA.csv --aesmin 0.5 # output: DATA_matchmin_0.5.csv python -m tools.datasets.csvutil DATA.csv --matchmin 0.5 ``` + +## Frame extraction speed + +We use three libraries to extract frames from videos: `opencv`, `pyav` and `decord`. Our benchmark results of loading 256 videos' middle frames are as follows: + +| Library | Time (s) | + | ------- | -------- | +| opencv | 33 | +| decord | 28 | +| pyav | 10 | + +Although `pyav` is the fastest, it can only extract key frames rather than frames at arbitrary time points. Therefore, we use `decord` as the default library for frame extraction. For dataset management, where loading speed is not a bottleneck, we choose `opencv` as the default library for video information extraction.