From 5ade5e5984e375ea93ebfbea5bc6f63cb929053a Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Mon, 1 Apr 2024 16:45:23 +0800 Subject: [PATCH] [fix] use decord for tools --- README.md | 2 +- tools/aesthetic/README.md | 1 + tools/aesthetic/inference.py | 26 ++++++++++++-------------- tools/caption/README.md | 4 ++-- tools/caption/utils.py | 21 ++++++++++----------- tools/datasets/README.md | 12 ++++++++++++ 6 files changed, 38 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index fd1fa7e..85635c3 100644 --- a/README.md +++ b/README.md @@ -198,7 +198,7 @@ the following steps: 2. Split videos into clips. [[docs](/tools/scenedetect/README.md)] 3. Generate video captions. [[docs](/tools/caption/README.md)] -Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. +Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data. ```bash # Suppose files under ~/dataset/ diff --git a/tools/aesthetic/README.md b/tools/aesthetic/README.md index db7179c..9cdf2bc 100644 --- a/tools/aesthetic/README.md +++ b/tools/aesthetic/README.md @@ -11,6 +11,7 @@ For videos, we extract the first, last, and the middle frames for evaluation. 
Th ```bash # install clip pip install git+https://github.com/openai/CLIP.git +pip install decord # get pretrained model wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth diff --git a/tools/aesthetic/inference.py b/tools/aesthetic/inference.py index 3d0b73b..40ff78a 100644 --- a/tools/aesthetic/inference.py +++ b/tools/aesthetic/inference.py @@ -2,14 +2,15 @@ import argparse import os -import av import clip +import decord import numpy as np import pandas as pd import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange +from PIL import Image from torchvision.datasets.folder import pil_loader from tqdm import tqdm @@ -23,16 +24,13 @@ def is_video(filename): def extract_frames(video_path, points=(0.1, 0.5, 0.9)): - container = av.open(video_path) - total_frames = container.streams.video[0].frames - frames = [] - for point in points: - target_frame = total_frames * point - target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate) - container.seek(target_timestamp) - frame = next(container.decode(video=0)).to_image() - frames.append(frame) - return frames + container = decord.VideoReader(video_path, num_threads=1) + total_frames = len(container) + frame_inds = (np.array(points) * total_frames).astype(np.int32) + frame_inds[frame_inds >= total_frames] = total_frames - 1 + frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C] + frames_pil = [Image.fromarray(frame) for frame in frames] + return frames_pil class VideoTextDataset(torch.utils.data.Dataset): @@ -107,14 +105,14 @@ def main(args): model = torch.nn.DataParallel(model) # build dataset - dataset = VideoTextDataset(args.input, transform=preprocess, points=(0.5,)) + dataset = VideoTextDataset(args.input, transform=preprocess) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.bs, shuffle=False, - 
num_workers=args.num_workers, + num_workers=0, pin_memory=True, - prefetch_factor=args.prefetch_factor, + # prefetch_factor=args.prefetch_factor, ) # compute aesthetic scores diff --git a/tools/caption/README.md b/tools/caption/README.md index fd53303..df8932f 100644 --- a/tools/caption/README.md +++ b/tools/caption/README.md @@ -32,8 +32,8 @@ pip install -e . # install flash attention pip install flash-attn --no-build-isolation -# install colossalai and pyav -pip install colossalai pyav +# install colossalai and decord +pip install colossalai decord ``` Since only the 34B model's performance is comparable to GPT-4V, we only provide the usage of the 34B model. The 34B model is available [here](https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b), or run our script and it will be downloaded automatically. diff --git a/tools/caption/utils.py b/tools/caption/utils.py index 44226d5..36b0053 100644 --- a/tools/caption/utils.py +++ b/tools/caption/utils.py @@ -1,10 +1,12 @@ import os import time -import av +import decord +import numpy as np import pandas as pd import torch import torchvision.transforms as transforms +from PIL import Image from torchvision.datasets.folder import pil_loader PROMPTS = { @@ -31,16 +33,13 @@ def is_video(filename): def extract_frames(video_path, points=(0.1, 0.5, 0.9)): - container = av.open(video_path) - total_frames = container.streams.video[0].frames - frames = [] - for point in points: - target_frame = total_frames * point - target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate) - container.seek(target_timestamp) - frame = next(container.decode(video=0)).to_image() - frames.append(frame) - return frames, total_frames + container = decord.VideoReader(video_path, num_threads=1) + total_frames = len(container) + frame_inds = (np.array(points) * total_frames).astype(np.int32) + frame_inds[frame_inds >= total_frames] = total_frames - 1 + frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, 
C] + frames_pil = [Image.fromarray(frame) for frame in frames] + return frames_pil, total_frames class VideoTextDataset(torch.utils.data.Dataset): diff --git a/tools/datasets/README.md b/tools/datasets/README.md index e0fc1dc..4c11c6b 100644 --- a/tools/datasets/README.md +++ b/tools/datasets/README.md @@ -160,3 +160,15 @@ python -m tools.datasets.csvutil DATA.csv --aesmin 0.5 # output: DATA_matchmin_0.5.csv python -m tools.datasets.csvutil DATA.csv --matchmin 0.5 ``` + +## Frame extraction speed + +We use three libraries to extract frames from videos: `opencv`, `pyav` and `decord`. Our benchmark results of loading 256 videos' middle frames are as follows: + +| Library | Time (s) | + | ------- | -------- | +| opencv | 33 | +| decord | 28 | +| pyav | 10 | + +Although `pyav` is the fastest, it can only extract key frames rather than frames at arbitrary time points. Therefore, we use `decord` as the default library for frame extraction. For dataset management, where loading speed is not a bottleneck, we choose `opencv` as the default library for video information extraction.