mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-11 05:13:31 +02:00
update data pipeline
This commit is contained in:
parent
72bd0313bf
commit
00c20261f0
11
README.md
11
README.md
|
|
@ -197,8 +197,9 @@ is [here](/docs/datasets.md). We provide tools to process video data. Our data p
|
|||
the following steps:
|
||||
|
||||
1. Manage datasets. [[docs](/tools/datasets/README.md)]
|
||||
2. Split videos into clips. [[docs](/tools/scenedetect/README.md)]
|
||||
3. Generate video captions. [[docs](/tools/caption/README.md)]
|
||||
2. Scene detection and video splitting. [[docs](/tools/scenedetect/README.md)]
|
||||
3. Score and filter videos. [[docs](/tools/scoring/README.md)]
|
||||
4. Generate video captions. [[docs](/tools/caption/README.md)]
|
||||
|
||||
Below is an example workflow to process data. However, we recommend you read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data.
|
||||
|
||||
|
|
@ -211,7 +212,7 @@ python -m tools.datasets.csvutil ~/dataset.csv --info --fmin 1 --output ~/datase
|
|||
|
||||
# 2. Filter dataset by aesthetic scores
|
||||
# output: ~/dataset/meta_aes.csv
|
||||
python -m tools.aesthetic.inference ~/dataset/meta.csv
|
||||
python -m tools.scoring.aesthetic.inference ~/dataset/meta.csv
|
||||
# sort and examine videos by aesthetic scores
|
||||
# output: ~/dataset/meta_aes_sort.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
|
||||
|
|
@ -227,11 +228,11 @@ torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/datase
|
|||
# merge generated results
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
|
||||
# remove empty captions and process captions (may need to re-caption lost ones)
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
|
||||
|
||||
# 4. Sanity check & prepare for training
|
||||
# sanity check
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --video-info --output ~/dataset/meta_ready.csv
|
||||
python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --info --output ~/dataset/meta_ready.csv
|
||||
# filter out videos less than 48 frames
|
||||
# output: ~/dataset/meta_ready_fmin48.csv
|
||||
python -m tools.datasets.csvutil ~/dataset_ready.csv --fmin 48
|
||||
|
|
|
|||
|
|
@ -27,9 +27,10 @@ The columns are defined as follows:
|
|||
- `fps`: the frame rate of the video. Optional.
|
||||
- `width`: the width of the video frame. Necessary for STDiT2.
|
||||
- `height`: the height of the video frame. Necessary for STDiT2.
|
||||
- `resolution`: height x width.
|
||||
- `aspect_ratio`: the aspect ratio of the video frame (height divided by width). Optional.
|
||||
- `aesthetic_score`: the aesthetic score by [aesthetic scorer](/tools/aesthetic/README.md). Optional.
|
||||
- `clip_score`: the clip score by [clip scorer](/tools/clip/README.md). Optional.
|
||||
- `aes`: the aesthetic score by [aesthetic scorer](/tools/aesthetic/README.md). Optional.
|
||||
- `match`: the clip score by [clip scorer](/tools/clip/README.md). Optional.
|
||||
|
||||
## Dataset to CSV
|
||||
|
||||
|
|
@ -161,6 +162,19 @@ python -m tools.datasets.csvutil DATA.csv --aesmin 0.5
|
|||
python -m tools.datasets.csvutil DATA.csv --matchmin 0.5
|
||||
```
|
||||
|
||||
## Analyze datasets
|
||||
|
||||
Since the dataset is provided in a CSV file, you can easily analyze the dataset using pandas (after applying `--info`). Here are some examples:
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
data = pd.read_csv('meta.csv')
|
||||
data.hist(column="resolution")
|
||||
plt.savefig('info.jpg')
|
||||
```
|
||||
|
||||
## Frame extraction speed
|
||||
|
||||
We use three libraries to extract frames from videos: `opencv`, `pyav` and `decord`. Our benchmark results of loading 256 videos' middle frames are as follows:
|
||||
|
|
|
|||
|
|
@ -57,8 +57,9 @@ def get_video_info(path):
|
|||
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
|
||||
float(cap.get(cv2.CAP_PROP_FPS)),
|
||||
)
|
||||
hw = height * width
|
||||
aspect_ratio = height / width if width > 0 else np.nan
|
||||
return num_frames, height, width, aspect_ratio, fps
|
||||
return num_frames, height, width, aspect_ratio, fps, hw
|
||||
|
||||
|
||||
LLAVA_PREFIX = [
|
||||
|
|
@ -99,11 +100,7 @@ def build_lang_detector(lang_to_detect):
|
|||
lang_dict = dict(en=Language.ENGLISH)
|
||||
assert lang_to_detect in lang_dict
|
||||
valid_lang = lang_dict[lang_to_detect]
|
||||
detector = (
|
||||
LanguageDetectorBuilder.from_all_spoken_languages()
|
||||
.with_low_accuracy_mode()
|
||||
.build()
|
||||
)
|
||||
detector = LanguageDetectorBuilder.from_all_spoken_languages().with_low_accuracy_mode().build()
|
||||
|
||||
def detect_lang(caption):
|
||||
confidence_values = detector.compute_language_confidence_values(caption)
|
||||
|
|
@ -124,19 +121,7 @@ def basic_clean(text):
|
|||
|
||||
|
||||
BAD_PUNCT_REGEX = re.compile(
|
||||
r"["
|
||||
+ "#®•©™&@·º½¾¿¡§~"
|
||||
+ "\)"
|
||||
+ "\("
|
||||
+ "\]"
|
||||
+ "\["
|
||||
+ "\}"
|
||||
+ "\{"
|
||||
+ "\|"
|
||||
+ "\\"
|
||||
+ "\/"
|
||||
+ "\*"
|
||||
+ r"]{1,}"
|
||||
r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
|
||||
) # noqa
|
||||
|
||||
|
||||
|
|
@ -237,14 +222,10 @@ def clean_caption(caption):
|
|||
caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
|
||||
caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
|
||||
caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
|
||||
caption = re.sub(
|
||||
r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption
|
||||
)
|
||||
caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
|
||||
caption = re.sub(r"\bpage\s+\d+\b", "", caption)
|
||||
|
||||
caption = re.sub(
|
||||
r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption
|
||||
) # j2d1a2a...
|
||||
caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
|
||||
|
||||
caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
|
||||
|
||||
|
|
@ -430,10 +411,11 @@ def main(args):
|
|||
if args.unescape:
|
||||
assert "text" in data.columns
|
||||
data["text"] = apply(data["text"], html.unescape)
|
||||
if "text" in data.columns:
|
||||
if args.clean_caption:
|
||||
assert "text" in data.columns
|
||||
data["text"] = apply(
|
||||
data["text"],
|
||||
partial(text_preprocessing, use_text_preprocessing=args.clean_caption),
|
||||
partial(text_preprocessing, use_text_preprocessing=True),
|
||||
)
|
||||
if args.info:
|
||||
info = apply(data["path"], get_video_info)
|
||||
|
|
@ -443,6 +425,7 @@ def main(args):
|
|||
data["width"],
|
||||
data["aspect_ratio"],
|
||||
data["fps"],
|
||||
data["resolution"],
|
||||
) = zip(*info)
|
||||
|
||||
# filtering
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@
|
|||
- [Optical Flow Score](#optical-flow-score)
|
||||
- [Matching Score](#matching-score)
|
||||
|
||||
|
||||
## Aesthetic Scoring
|
||||
|
||||
To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
|
||||
|
|
@ -32,8 +31,8 @@ wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main
|
|||
With `meta.csv` containing the paths to the videos, run the following command:
|
||||
|
||||
```bash
|
||||
# output: DATA_aes.csv
|
||||
python -m tools.aesthetic.inference meta.csv
|
||||
# output: meta_aes.csv
|
||||
python -m tools.scoring.aesthetic.inference meta.csv
|
||||
```
|
||||
|
||||
## Optical Flow Score
|
||||
|
|
|
|||
Loading…
Reference in a new issue