mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-11 13:14:44 +02:00
[fix] use decord for tools
This commit is contained in:
parent
ff15a0acfb
commit
5ade5e5984
|
|
@ -198,7 +198,7 @@ the following steps:
|
|||
2. Split videos into clips. [[docs](/tools/scenedetect/README.md)]
|
||||
3. Generate video captions. [[docs](/tools/caption/README.md)]
|
||||
|
||||
Below is an example workflow to process data. However, we recommend that you read the detailed documentation for each tool and decide which tools to use based on your needs.
|
||||
Below is an example workflow to process data. However, we recommend that you read the detailed documentation for each tool and decide which tools to use based on your needs. This pipeline applies to both image and video data.
|
||||
|
||||
```bash
|
||||
# Suppose files under ~/dataset/
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ For videos, we extract the first, last, and the middle frames for evaluation. Th
|
|||
```bash
|
||||
# install clip
|
||||
pip install git+https://github.com/openai/CLIP.git
|
||||
pip install decord
|
||||
|
||||
# get pretrained model
|
||||
wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main/sac+logos+ava1-l14-linearMSE.pth -O pretrained_models/aesthetic.pth
|
||||
|
|
|
|||
|
|
@ -2,14 +2,15 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
import av
|
||||
import clip
|
||||
import decord
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from einops import rearrange
|
||||
from PIL import Image
|
||||
from torchvision.datasets.folder import pil_loader
|
||||
from tqdm import tqdm
|
||||
|
||||
|
|
@ -23,16 +24,13 @@ def is_video(filename):
|
|||
|
||||
|
||||
def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
|
||||
container = av.open(video_path)
|
||||
total_frames = container.streams.video[0].frames
|
||||
frames = []
|
||||
for point in points:
|
||||
target_frame = total_frames * point
|
||||
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
|
||||
container.seek(target_timestamp)
|
||||
frame = next(container.decode(video=0)).to_image()
|
||||
frames.append(frame)
|
||||
return frames
|
||||
container = decord.VideoReader(video_path, num_threads=1)
|
||||
total_frames = len(container)
|
||||
frame_inds = (np.array(points) * total_frames).astype(np.int32)
|
||||
frame_inds[frame_inds >= total_frames] = total_frames - 1
|
||||
frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C]
|
||||
frames_pil = [Image.fromarray(frame) for frame in frames]
|
||||
return frames_pil
|
||||
|
||||
|
||||
class VideoTextDataset(torch.utils.data.Dataset):
|
||||
|
|
@ -107,14 +105,14 @@ def main(args):
|
|||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# build dataset
|
||||
dataset = VideoTextDataset(args.input, transform=preprocess, points=(0.5,))
|
||||
dataset = VideoTextDataset(args.input, transform=preprocess)
|
||||
dataloader = torch.utils.data.DataLoader(
|
||||
dataset,
|
||||
batch_size=args.bs,
|
||||
shuffle=False,
|
||||
num_workers=args.num_workers,
|
||||
num_workers=0,
|
||||
pin_memory=True,
|
||||
prefetch_factor=args.prefetch_factor,
|
||||
# prefetch_factor=args.prefetch_factor,
|
||||
)
|
||||
|
||||
# compute aesthetic scores
|
||||
|
|
|
|||
|
|
@ -32,8 +32,8 @@ pip install -e .
|
|||
|
||||
# install flash attention
|
||||
pip install flash-attn --no-build-isolation
|
||||
# install colossalai and pyav
|
||||
pip install colossalai pyav
|
||||
# install colossalai and decord
|
||||
pip install colossalai decord
|
||||
```
|
||||
|
||||
Since only the 34B model's performance is comparable to GPT-4V, we only provide usage instructions for the 34B model. The 34B model is available [here](https://huggingface.co/liuhaotian/llava-v1.6-34b), or you can run our script and it will be downloaded automatically.
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
import os
|
||||
import time
|
||||
|
||||
import av
|
||||
import decord
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torchvision.transforms as transforms
|
||||
from PIL import Image
|
||||
from torchvision.datasets.folder import pil_loader
|
||||
|
||||
PROMPTS = {
|
||||
|
|
@ -31,16 +33,13 @@ def is_video(filename):
|
|||
|
||||
|
||||
def extract_frames(video_path, points=(0.1, 0.5, 0.9)):
|
||||
container = av.open(video_path)
|
||||
total_frames = container.streams.video[0].frames
|
||||
frames = []
|
||||
for point in points:
|
||||
target_frame = total_frames * point
|
||||
target_timestamp = int((target_frame * av.time_base) / container.streams.video[0].average_rate)
|
||||
container.seek(target_timestamp)
|
||||
frame = next(container.decode(video=0)).to_image()
|
||||
frames.append(frame)
|
||||
return frames, total_frames
|
||||
container = decord.VideoReader(video_path, num_threads=1)
|
||||
total_frames = len(container)
|
||||
frame_inds = (np.array(points) * total_frames).astype(np.int32)
|
||||
frame_inds[frame_inds >= total_frames] = total_frames - 1
|
||||
frames = container.get_batch(frame_inds).asnumpy() # [N, H, W, C]
|
||||
frames_pil = [Image.fromarray(frame) for frame in frames]
|
||||
return frames_pil, total_frames
|
||||
|
||||
|
||||
class VideoTextDataset(torch.utils.data.Dataset):
|
||||
|
|
|
|||
|
|
@ -160,3 +160,15 @@ python -m tools.datasets.csvutil DATA.csv --aesmin 0.5
|
|||
# output: DATA_matchmin_0.5.csv
|
||||
python -m tools.datasets.csvutil DATA.csv --matchmin 0.5
|
||||
```
|
||||
|
||||
## Frame extraction speed
|
||||
|
||||
We benchmarked three libraries for extracting frames from videos: `opencv`, `pyav` and `decord`. The time each takes to load the middle frame of 256 videos is as follows:
|
||||
|
||||
| Library | Time (s) |
|
||||
| ------- | -------- |
|
||||
| opencv | 33 |
|
||||
| decord | 28 |
|
||||
| pyav | 10 |
|
||||
|
||||
Although `pyav` is the fastest, it can only extract key frames rather than frames at arbitrary positions in the video. Therefore, we use `decord` as the default library for frame extraction. For dataset management, where loading speed is not a bottleneck, we choose `opencv` as the default library for video information extraction.
|
||||
|
|
|
|||
Loading…
Reference in a new issue