update data pipeline

Zangwei Zheng 2024-04-02 15:55:58 +08:00
parent 72bd0313bf
commit 00c20261f0
4 changed files with 34 additions and 37 deletions

@@ -197,8 +197,9 @@ is [here](/docs/datasets.md). We provide tools to process video data. Our data p
 the following steps:
 1. Manage datasets. [[docs](/tools/datasets/README.md)]
-2. Split videos into clips. [[docs](/tools/scenedetect/README.md)]
-3. Generate video captions. [[docs](/tools/caption/README.md)]
+2. Scene detection and video splitting. [[docs](/tools/scenedetect/README.md)]
+3. Score and filter videos. [[docs](/tools/scoring/README.md)]
+4. Generate video captions. [[docs](/tools/caption/README.md)]
 Below is an example workflow to process data. However, we recommend you to read the detailed documentation for each tool, and decide which tools to use based on your needs. This pipeline applies to both image and video data.
@@ -211,7 +212,7 @@ python -m tools.datasets.csvutil ~/dataset.csv --info --fmin 1 --output ~/datase
 # 2. Filter dataset by aesthetic scores
 # output: ~/dataset/meta_aes.csv
-python -m tools.aesthetic.inference ~/dataset/meta.csv
+python -m tools.scoring.aesthetic.inference ~/dataset/meta.csv
 # sort and examine videos by aesthetic scores
 # output: ~/dataset/meta_aes_sort.csv
 python -m tools.datasets.csvutil ~/dataset/meta_aes.csv --sort-descending aes
@@ -227,11 +228,11 @@ torchrun --nproc_per_node 8 --standalone -m tools.caption.caption_llava ~/datase
 # merge generated results
 python -m tools.datasets.csvutil ~/dataset/meta_aes_aesmin5_caption_part*.csv --output ~/dataset/meta_caption.csv
 # remove empty captions and process captions (may need to re-caption lost ones)
-python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
+python -m tools.datasets.csvutil ~/dataset/meta_caption.csv --clean-caption --remove-caption-prefix --remove-empty-caption --output ~/dataset/meta_caption_processed.csv
 # 4. Sanity check & prepare for training
 # sanity check
-python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --video-info --output ~/dataset/meta_ready.csv
+python -m tools.datasets.csvutil ~/dataset/meta_caption_processed.csv --ext --info --output ~/dataset/meta_ready.csv
 # filter out videos less than 48 frames
 # output: ~/dataset/meta_ready_fmin48.csv
 python -m tools.datasets.csvutil ~/dataset_ready.csv --fmin 48
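The `--fmin 48` step keeps only rows whose frame count is at least 48. A minimal stdlib sketch of that filter (the sample rows are made up; `num_frames` follows the column names in the datasets docs):

```python
import csv
import io

# Toy stand-in for `csvutil --fmin 48`: keep rows with num_frames >= 48.
raw = "path,num_frames\na.mp4,32\nb.mp4,96\n"
rows = [r for r in csv.DictReader(io.StringIO(raw)) if int(r["num_frames"]) >= 48]
# only b.mp4 survives the filter
```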

@@ -27,9 +27,10 @@ The columns are defined as follows:
 - `fps`: the frame rate of the video. Optional.
 - `width`: the width of the video frame. Necessary for STDiT2.
 - `height`: the height of the video frame. Necessary for STDiT2.
+- `resolution`: height x width.
 - `aspect_ratio`: the aspect ratio of the video frame (height divided by width). Optional.
-- `aesthetic_score`: the aesthetic score by [asethetic scorer](/tools/aesthetic/README.md). Optional.
-- `clip_score`: the clip score by [clip scorer](/tools/clip/README.md). Optional.
+- `aes`: the aesthetic score by [asethetic scorer](/tools/aesthetic/README.md). Optional.
+- `match`: the clip score by [clip scorer](/tools/clip/README.md). Optional.
 ## Dataset to CSV
@@ -161,6 +162,19 @@ python -m tools.datasets.csvutil DATA.csv --aesmin 0.5
 python -m tools.datasets.csvutil DATA.csv --matchmin 0.5
 ```
+## Analyze datasets
+Since the dataset is provided in a CSV file, you can easily analyze the dataset using pandas (after applying `--info`). Here are some examples:
+```python
+import pandas as pd
+import matplotlib.pyplot as plt
+data = pd.read_csv('meta.csv')
+data.hist(column="resolution")
+plt.savefig('info.jpg')
+```
 ## Frame extraction speed
 We use three libraries to extract frames from videos: `opencv`, `pyav` and `decord`. Our benchmark results of loading 256 video's middle frames are as follows:

@@ -57,8 +57,9 @@ def get_video_info(path):
         int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
         float(cap.get(cv2.CAP_PROP_FPS)),
     )
+    hw = height * width
     aspect_ratio = height / width if width > 0 else np.nan
-    return num_frames, height, width, aspect_ratio, fps
+    return num_frames, height, width, aspect_ratio, fps, hw
 LLAVA_PREFIX = [
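The change above appends the raw pixel count (`hw`, later stored in the `resolution` column) to the tuple returned by `get_video_info`. A minimal sketch of the new return shape, with a hypothetical `video_info_tuple` helper standing in for the OpenCV capture:

```python
import math

def video_info_tuple(num_frames, height, width, fps):
    # mirrors the updated get_video_info: resolution (height * width) is appended last
    hw = height * width
    aspect_ratio = height / width if width > 0 else math.nan
    return num_frames, height, width, aspect_ratio, fps, hw
```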
@@ -99,11 +100,7 @@ def build_lang_detector(lang_to_detect):
     lang_dict = dict(en=Language.ENGLISH)
     assert lang_to_detect in lang_dict
     valid_lang = lang_dict[lang_to_detect]
-    detector = (
-        LanguageDetectorBuilder.from_all_spoken_languages()
-        .with_low_accuracy_mode()
-        .build()
-    )
+    detector = LanguageDetectorBuilder.from_all_spoken_languages().with_low_accuracy_mode().build()
     def detect_lang(caption):
         confidence_values = detector.compute_language_confidence_values(caption)
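`detect_lang` keeps a caption only when the top-confidence language matches the requested one. A stdlib sketch of that selection step (the `(language, confidence)` pairs are assumed inputs; the real code obtains them from lingua's detector, and `is_valid_lang` is a hypothetical name):

```python
def is_valid_lang(confidence_values, valid_lang="ENGLISH"):
    # pick the language with the highest confidence and compare it
    # against the language we want to keep
    top_lang, _ = max(confidence_values, key=lambda pair: pair[1])
    return top_lang == valid_lang
```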
@@ -124,19 +121,7 @@ def basic_clean(text):
 BAD_PUNCT_REGEX = re.compile(
-    r"["
-    + "#®•©™&@·º½¾¿¡§~"
-    + "\)"
-    + "\("
-    + "\]"
-    + "\["
-    + "\}"
-    + "\{"
-    + "\|"
-    + "\\"
-    + "\/"
-    + "\*"
-    + r"]{1,}"
+    r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
 )  # noqa
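The single-line pattern concatenates to exactly the same character class as the multi-line version; only the source layout changed. A quick check that a run of the listed punctuation matches as one unit (the `strip_bad_punct` wrapper and the single-space replacement are illustrative assumptions):

```python
import re

# same character class as BAD_PUNCT_REGEX: a run of one or more listed symbols
BAD_PUNCT = re.compile(r"[" + "#®•©™&@·º½¾¿¡§~" + r"\)\(\]\[\}\{\|\\/*" + r"]{1,}")

def strip_bad_punct(text):
    return BAD_PUNCT.sub(" ", text)
```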
@@ -237,14 +222,10 @@ def clean_caption(caption):
     caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
     caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
     caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
-    caption = re.sub(
-        r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption
-    )
+    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
     caption = re.sub(r"\bpage\s+\d+\b", "", caption)
-    caption = re.sub(
-        r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption
-    )  # j2d1a2a...
+    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
     caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
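Both collapsed `re.sub` calls are semantically unchanged. A short check of the two patterns in isolation (the `drop_noise` wrapper is a hypothetical name for illustration):

```python
import re

def drop_noise(caption):
    # drop file-extension mentions such as "jpg image"
    caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
    # drop alphanumeric gibberish ids such as "j2d1a2a"
    caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)
    return caption
```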
@@ -430,10 +411,11 @@ def main(args):
     if args.unescape:
         assert "text" in data.columns
         data["text"] = apply(data["text"], html.unescape)
-    if "text" in data.columns:
+    if args.clean_caption:
+        assert "text" in data.columns
         data["text"] = apply(
             data["text"],
-            partial(text_preprocessing, use_text_preprocessing=args.clean_caption),
+            partial(text_preprocessing, use_text_preprocessing=True),
         )
     if args.info:
         info = apply(data["path"], get_video_info)
@@ -443,6 +425,7 @@
             data["width"],
             data["aspect_ratio"],
             data["fps"],
+            data["resolution"],
         ) = zip(*info)
     # filtering
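`zip(*info)` transposes the list of per-video tuples into per-column sequences, so the new `resolution` column must sit in the unpack at the same position it occupies in the returned tuple (last, after `fps`). A minimal sketch with made-up rows:

```python
# rows mirror get_video_info's return order:
# (num_frames, height, width, aspect_ratio, fps, resolution)
info = [
    (48, 720, 1280, 0.5625, 30.0, 921600),
    (96, 1080, 1920, 0.5625, 24.0, 2073600),
]
num_frames, height, width, aspect_ratio, fps, resolution = zip(*info)
```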

@ -7,7 +7,6 @@
- [Optical Flow Score](#optical-flow-score)
- [Matching Score](#matching-score)
## Aesthetic Scoring
To evaluate the aesthetic quality of videos, we use a pretrained model from [CLIP+MLP Aesthetic Score Predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor). This model is trained on 176K SAC (Simulacra Aesthetic Captions) pairs, 15K LAION-Logos (Logos) pairs, and 250K AVA (The Aesthetic Visual Analysis) image-text pairs.
@@ -32,8 +31,8 @@ wget https://github.com/christophschuhmann/improved-aesthetic-predictor/raw/main
 With `meta.csv` containing the paths to the videos, run the following command:
 ```bash
-# output: DATA_aes.csv
-python -m tools.aesthetic.inference meta.csv
+# output: meta_aes.csv
+python -m tools.scoring.aesthetic.inference meta.csv
 ```
## Optical Flow Score