[fix] use decord for tools

This commit is contained in:
Zangwei Zheng 2024-04-01 16:45:23 +08:00
parent ff15a0acfb
commit 5ade5e5984
6 changed files with 38 additions and 28 deletions

View file

@ -198,7 +198,7 @@ the following steps:
2. Split videos into clips. [[docs](/tools/scenedetect/README.md)]
3. Generate video captions. [[docs](/tools/caption/README.md)]
Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs.
Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data.
```bash
# Suppose files under ~/dataset/

View file

@ -11,6 +11,7 @@ For videos, we extract the first, last, and the middle frames for evaluation. Th
```bash
# install clip
pip install git+https://github.com/openai/CLIP.git
pip install decord
# get pretrained model
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth

View file

@ -2,14 +2,15 @@
import argparse
import os
import av
import clip
import decord
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from PIL import Image
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm
@ -23,16 +24,13 @@ def is_video(filename):
def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
container = av.open(video_path)
total_frames = container.streams.video[0].frames
frames = []
for point in points:
target_frame = total_frames * point
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
container.seek(target_timestamp)
frame = next(container.decode(video=0)).to_image()
frames.append(frame)
return frames
container = decord.VideoReader(video_path, num_threads=1)
total_frames = len(container)
frame_inds = (np.array(points) * total_frames).astype(np.int32)
frame_inds[frame_inds >= total_frames] = total_frames - 1
frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C]
frames_pil = [Image.fromarray(frame) for frame in frames]
return frames_pil
class VideoTextDataset(torch.utils.data.Dataset):
@ -107,14 +105,14 @@ def main(args):
model = torch.nn.DataParallel(model)
# build dataset
dataset = VideoTextDataset(args.input, transform=preprocess, points=(0.5,))
dataset = VideoTextDataset(args.input, transform=preprocess)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=args.bs,
shuffle=False,
num_workers=args.num_workers,
num_workers=0,
pin_memory=True,
prefetch_factor=args.prefetch_factor,
# prefetch_factor=args.prefetch_factor,
)
# compute aesthetic scores

View file

@ -32,8 +32,8 @@ pip install -e .
# install flash attention
pip install flash-attn --no-build-isolation
# install colossalai and pyav
pip install colossalai pyav
# install colossalai and decord
pip install colossalai decord
```
Since only the 34B model's performance is comparable to GPT-4V, we only provide the usage of the 34B model. The 34B model is available [here](https://huggingface.co/liuhaotian/llava-v1.6-34b), or run our script and it will be downloaded automatically.

View file

@ -1,10 +1,12 @@
import os
import time
import av
import decord
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from PIL import Image
from torchvision.datasets.folder import pil_loader
PROMPTS = {
@ -31,16 +33,13 @@ def is_video(filename):
def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
container = av.open(video_path)
total_frames = container.streams.video[0].frames
frames = []
for point in points:
target_frame = total_frames * point
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
container.seek(target_timestamp)
frame = next(container.decode(video=0)).to_image()
frames.append(frame)
return frames, total_frames
container = decord.VideoReader(video_path, num_threads=1)
total_frames = len(container)
frame_inds = (np.array(points) * total_frames).astype(np.int32)
frame_inds[frame_inds >= total_frames] = total_frames - 1
frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C]
frames_pil = [Image.fromarray(frame) for frame in frames]
return frames_pil, total_frames
class VideoTextDataset(torch.utils.data.Dataset):

View file

@ -160,3 +160,15 @@ python -m tools.datasets.csvutil DATA.csv --aesmin 0.5
# output: DATA_matchmin_0.5.csv
python -m tools.datasets.csvutil DATA.csv --matchmin 0.5
```
## Frame extraction speed
We use three libraries to extract frames from videos: `opencv`, `pyav` and `decord`. Our benchmark results for loading the middle frame of each of 256 videos are as follows:
| Library | Time (s) |
| ------- | -------- |
| opencv | 33 |
| decord | 28 |
| pyav | 10 |
Although `pyav` is the fastest, it can only extract key frames rather than frames at arbitrary timestamps. Therefore, we use `decord` as the default library for frame extraction. For dataset management, where loading speed is not a bottleneck, we choose `opencv` as the default library for video information extraction.