update csvutil

This commit is contained in:
Zangwei Zheng 2024-03-25 17:08:35 +08:00
parent 5e3eca2d0f
commit 6140f1bbba
2 changed files with 61 additions and 13 deletions

View file

@ -25,12 +25,12 @@ python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos
## Dataset Format
The dataset should be provided in a CSV file, which is used both for training and data preprocessing. The CSV file should only contain the following columns (can be optional):
The dataset should be provided in a CSV file, which is used both for training and data preprocessing. The CSV file should only contain the following columns (can be optional). Aspect ratio is width divided by height.
```csv
path, text, num_frames, aesthetic_score, fps, width, height, aspect_ratio
/absolute/path/to/image1.jpg, caption1, num_of_frames, score1
/absolute/path/to/video2.mp4, caption2, num_of_frames, score2
path, text, num_frames, fps, width, height, aspect_ratio, aesthetic_score, clip_score
/absolute/path/to/image1.jpg, caption1, num_of_frames
/absolute/path/to/video2.mp4, caption2, num_of_frames
```
We use pandas to manage the CSV files. You can use the following code to read and write the CSV files:
@ -47,14 +47,20 @@ We provide `csvutil.py` to manage the CSV files. You can use the following comm
```bash
# csvutil takes multiple CSV files as input and merges them into one CSV file
python -m tools.datasets.csvutil DATA1.csv DATA2.csv
# filter frames between 128 and 256, with captions
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256 --remove-empty-caption
# compute the number of frames for each video
python -m tools.datasets.csvutil DATA.csv --relength
python -m tools.datasets.csvutil DATA.csv --video-info
# remove caption prefix
python -m tools.datasets.csvutil DATA.csv --remove-caption-prefix
# generate DATA_root.csv with absolute path
python -m tools.datasets.csvutil DATA.csv --abspath /absolute/path/to/dataset
# examine the first 10 rows of the CSV file
head -n 10 DATA1.csv
# count the number of rows in the CSV file (approximate line count)
wc -l DATA1.csv
```
To accelerate processing speed, you can install [pandarallel](https://github.com/nalepae/pandarallel):
@ -62,3 +68,15 @@ To accelerate processing speed, you can install [pandarallel](https://github.com
```bash
pip install pandarallel
```
To filter text language, you need to install [lingua](https://github.com/pemistahl/lingua-py):
```bash
pip install lingua-language-detector
```
To get video information, you need to install [opencv-python](https://github.com/opencv/opencv-python):
```bash
pip install opencv-python
```

View file

@ -23,11 +23,18 @@ def apply(df, func):
return df.progress_apply(func)
def get_video_length(path):
def get_video_info(path):
import cv2
cap = cv2.VideoCapture(path)
return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
num_frames, height, width, fps = (
int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
float(cap.get(cv2.CAP_PROP_FPS)),
)
aspect_ratio = width / height if height > 0 else np.nan
return num_frames, height, width, aspect_ratio, fps
LLAVA_PREFIX = [
@ -76,6 +83,8 @@ def parse_args():
parser.add_argument("--disable-parallel", action="store_true")
# special case
parser.add_argument("--shard", type=int, default=None)
parser.add_argument("--sort-descending", type=str, default=None)
parser.add_argument("--sort-ascending", type=str, default=None)
# path processing
parser.add_argument("--abspath", type=str, default=None)
@ -88,12 +97,13 @@ def parse_args():
parser.add_argument("--remove-caption-prefix", action="store_true")
parser.add_argument("--unescape", action="store_true")
# num_frames processing
parser.add_argument("--relength", action="store_true")
parser.add_argument("--video-info", action="store_true")
# num_frames filtering
parser.add_argument("--fmin", type=int, default=None)
parser.add_argument("--fmax", type=int, default=None)
# aesthetic filtering
parser.add_argument("--aesmin", type=float, default=None)
parser.add_argument("--matchmin", type=float, default=None)
return parser.parse_args()
@ -123,8 +133,8 @@ def get_output_path(args, input_name):
if args.unescape:
name += "_unescape"
# num_frames processing
if args.relength:
name += "_relength"
if args.video_info:
name += "_vinfo"
# num_frames filtering
if args.fmin is not None:
name += f"_fmin_{args.fmin}"
@ -133,6 +143,16 @@ def get_output_path(args, input_name):
# aesthetic filtering
if args.aesmin is not None:
name += f"_aesmin_{args.aesmin}"
# clip score filtering
if args.matchmin is not None:
name += f"_matchmin_{args.matchmin}"
# sort
if args.sort_descending is not None:
assert args.sort_ascending is None
name += "_sort"
if args.sort_ascending is not None:
assert args.sort_descending is None
name += "_sort"
output_path = os.path.join(dir_path, f"{name}.csv")
return output_path
@ -171,8 +191,9 @@ def main(args):
if args.unescape:
assert "text" in data.columns
data["text"] = apply(data["text"], html.unescape)
if args.relength:
data["num_frames"] = apply(data["path"], get_video_length)
if args.video_info:
info = apply(data["path"], get_video_info)
data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info)
# filtering
if args.remove_empty_caption:
@ -183,7 +204,7 @@ def main(args):
data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
if args.lang is not None:
assert "text" in data.columns
data = data[data["text"].progress_apply(detect_lang)] # cannot parallelize
data = data[data["text"].progress_apply(detect_lang)] # cannot parallelize
if args.fmin is not None:
assert "num_frames" in data.columns
data = data[data["num_frames"] >= args.fmin]
@ -193,8 +214,17 @@ def main(args):
if args.aesmin is not None:
assert "aesthetic_score" in data.columns
data = data[data["aesthetic_score"] >= args.aesmin]
if args.matchmin is not None:
assert "clip_score" in data.columns
data = data[data["clip_score"] >= args.matchmin]
print(f"Filtered number of samples: {len(data)}.")
# sort
if args.sort_descending is not None:
data = data.sort_values(by=args.sort_descending, ascending=False)
if args.sort_ascending is not None:
data = data.sort_values(by=args.sort_ascending, ascending=True)
# shard data
if args.shard is not None:
sharded_data = np.array_split(data, args.shard)