[fix] use decord for tools

This commit is contained in:
Zangwei Zheng 2024-04-01 16:45:23 +08:00
parent ff15a0acfb
commit 5ade5e5984
6 changed files with 38 additions and 28 deletions

View file

@ -198,7 +198,7 @@ the following steps:
2. Split videos into clips. [[docs](/tools/scenedetect/README.md)]
3. Generate video captions. [[docs](/tools/caption/README.md)]
Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs.
Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data.
```bash
# Suppose files under ~/dataset/

View file

@ -11,6 +11,7 @@ For videos, we extract the first, last, and the middle frames for evaluation. Th
```bash
# install clip
pip install git+https://github.com/openai/CLIP.git
pip install decord
# get pretrained model
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth

View file

@ -2,14 +2,15 @@
import argparse
import os
import av
import clip
import decord
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from PIL import Image
from torchvision.datasets.folder import pil_loader
from tqdm import tqdm
@ -23,16 +24,13 @@ def is_video(filename):
def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
container = av.open(video_path)
total_frames = container.streams.video[0].frames
frames = []
for point in points:
target_frame = total_frames * point
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
container.seek(target_timestamp)
frame = next(container.decode(video=0)).to_image()
frames.append(frame)
return frames
container = decord.VideoReader(video_path, num_threads=1)
total_frames = len(container)
frame_inds = (np.array(points) * total_frames).astype(np.int32)
frame_inds[frame_inds >= total_frames] = total_frames - 1
frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C]
frames_pil = [Image.fromarray(frame) for frame in frames]
return frames_pil
class VideoTextDataset(torch.utils.data.Dataset):
@ -107,14 +105,14 @@ def main(args):
model = torch.nn.DataParallel(model)
# build dataset
dataset = VideoTextDataset(args.input, transform=preprocess, points=(0.5,))
dataset = VideoTextDataset(args.input, transform=preprocess)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_size=args.bs,
shuffle=False,
num_workers=args.num_workers,
num_workers=0,
pin_memory=True,
prefetch_factor=args.prefetch_factor,
# prefetch_factor=args.prefetch_factor,
)
# compute aesthetic scores

View file

@ -32,8 +32,8 @@ pip install -e .
# install flash attention
pip install flash-attn --no-build-isolation
# install colossalai and pyav
pip install colossalai pyav
# install colossalai and decord
pip install colossalai decord
```
Since only the 34B model's performance is comparable to GPT-4V, we only provide the usage of the 34B model. The 34B model is available [here](https://huggingface.co/liuhaotian/llava-v1.6-34b), or run our script and it will be downloaded automatically.

View file

@ -1,10 +1,12 @@
import os
import time
import av
import decord
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from PIL import Image
from torchvision.datasets.folder import pil_loader
PROMPTS = {
@ -31,16 +33,13 @@ def is_video(filename):
def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
container = av.open(video_path)
total_frames = container.streams.video[0].frames
frames = []
for point in points:
target_frame = total_frames * point
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
container.seek(target_timestamp)
frame = next(container.decode(video=0)).to_image()
frames.append(frame)
return frames, total_frames
container = decord.VideoReader(video_path, num_threads=1)
total_frames = len(container)
frame_inds = (np.array(points) * total_frames).astype(np.int32)
frame_inds[frame_inds >= total_frames] = total_frames - 1
frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C]
frames_pil = [Image.fromarray(frame) for frame in frames]
return frames_pil, total_frames
class VideoTextDataset(torch.utils.data.Dataset):

View file

@ -160,3 +160,15 @@ python -m tools.datasets.csvutil DATA.csv --aesmin 0.5
# output: DATA_matchmin_0.5.csv
python -m tools.datasets.csvutil DATA.csv --matchmin 0.5
```
## Frame extraction speed
We use three libraries to extract frames from videos: `opencv`, `pyav` and `decord`. Our benchmark results for loading the middle frame of each of 256 videos are as follows:
| Library | Time (s) |
| ------- | -------- |
| opencv | 33 |
| decord | 28 |
| pyav | 10 |
Although `pyav` is the fastest, it can only extract key frames rather than frames at arbitrary timestamps. Therefore, we use `decord` as the default library for frame extraction. For dataset management, where loading speed is not a bottleneck, we choose `opencv` as the default library for video information extraction.