update csvutil

This commit is contained in:
Zangwei Zheng 2024-03-25 17:08:35 +08:00
parent 5e3eca2d0f
commit 6140f1bbba
2 changed files with 61 additions and 13 deletions

View file

@ -25,12 +25,12 @@ python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos
## Dataset Format
The dataset should be provided in a CSV file, which is used both for training and data preprocessing. The CSV file should only contain the following columns (can be optional):
The dataset should be provided in a CSV file, which is used both for training and data preprocessing. The CSV file should only contain the following columns (can be optional). Aspect ratio is width divided by height.
```csv
path, text, num_frames, aesthetic_score, fps, width, height, aspect_ratio
/absolute/path/to/image1.jpg, caption1, num_of_frames, score1
/absolute/path/to/video2.mp4, caption2, num_of_frames, score2
path, text, num_frames, fps, width, height, aspect_ratio, aesthetic_score, clip_score
/absolute/path/to/image1.jpg, caption1, num_of_frames
/absolute/path/to/video2.mp4, caption2, num_of_frames
```
We use pandas to manage the CSV files. You can use the following code to read and write the CSV files:
@ -47,14 +47,20 @@ We provide `csvutil.py` to manage the CSV files. You can use the following comm
```bash
# csvutil takes multiple CSV files as input and merges them into one CSV file
python -m tools.datasets.csvutil DATA1.csv DATA2.csv
# filter frames between 128 and 256, with captions
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256 --remove-empty-caption
# compute the number of frames for each video
python -m tools.datasets.csvutil DATA.csv --relength
python -m tools.datasets.csvutil DATA.csv --video-info
# remove caption prefix
python -m tools.datasets.csvutil DATA.csv --remove-caption-prefix
# generate DATA_root.csv with absolute path
python -m tools.datasets.csvutil DATA.csv --abspath /absolute/path/to/dataset
# examine the first 10 rows of the CSV file
head -n 10 DATA1.csv
# count the number of rows in the CSV file (approximate line count)
wc -l DATA1.csv
```
To accelerate processing speed, you can install [pandarallel](https://github.com/nalepae/pandarallel):
@ -62,3 +68,15 @@ To accelerate processing speed, you can install [pandarallel](https://github.com
```bash
pip install pandarallel
```
To filter text language, you need to install [lingua](https://github.com/pemistahl/lingua-py):
```bash
pip install lingua-language-detector
```
To get video information, you need to install [opencv-python](https://github.com/opencv/opencv-python):
```bash
pip install opencv-python
```

View file

@ -23,11 +23,18 @@ def apply(df, func):
return df.progress_apply(func)
def get_video_length(path):
def get_video_info(path):
import cv2
cap = cv2.VideoCapture(path)
return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
num_frames, height, width, fps = (
int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
float(cap.get(cv2.CAP_PROP_FPS)),
)
aspect_ratio = width / height if height > 0 else np.nan
return num_frames, height, width, aspect_ratio, fps
LLAVA_PREFIX = [
@ -76,6 +83,8 @@ def parse_args():
parser.add_argument("--disable-parallel", action="store_true")
# special case
parser.add_argument("--shard", type=int, default=None)
parser.add_argument("--sort-descending", type=str, default=None)
parser.add_argument("--sort-ascending", type=str, default=None)
# path processing
parser.add_argument("--abspath", type=str, default=None)
@ -88,12 +97,13 @@ def parse_args():
parser.add_argument("--remove-caption-prefix", action="store_true")
parser.add_argument("--unescape", action="store_true")
# num_frames processing
parser.add_argument("--relength", action="store_true")
parser.add_argument("--video-info", action="store_true")
# num_frames filtering
parser.add_argument("--fmin", type=int, default=None)
parser.add_argument("--fmax", type=int, default=None)
# aesthetic filtering
parser.add_argument("--aesmin", type=float, default=None)
parser.add_argument("--matchmin", type=float, default=None)
return parser.parse_args()
@ -123,8 +133,8 @@ def get_output_path(args, input_name):
if args.unescape:
name += "_unescape"
# num_frames processing
if args.relength:
name += "_relength"
if args.video_info:
name += "_vinfo"
# num_frames filtering
if args.fmin is not None:
name += f"_fmin_{args.fmin}"
@ -133,6 +143,16 @@ def get_output_path(args, input_name):
# aesthetic filtering
if args.aesmin is not None:
name += f"_aesmin_{args.aesmin}"
# clip score filtering
if args.matchmin is not None:
name += f"_matchmin_{args.matchmin}"
# sort
if args.sort_descending is not None:
assert args.sort_ascending is None
name += "_sort"
if args.sort_ascending is not None:
assert args.sort_descending is None
name += "_sort"
output_path = os.path.join(dir_path, f"{name}.csv")
return output_path
@ -171,8 +191,9 @@ def main(args):
if args.unescape:
assert "text" in data.columns
data["text"] = apply(data["text"], html.unescape)
if args.relength:
data["num_frames"] = apply(data["path"], get_video_length)
if args.video_info:
info = apply(data["path"], get_video_info)
data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info)
# filtering
if args.remove_empty_caption:
@ -183,7 +204,7 @@ def main(args):
data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
if args.lang is not None:
assert "text" in data.columns
data = data[data["text"].progress_apply(detect_lang)] # cannot parallelize
data = data[data["text"].progress_apply(detect_lang)] # cannot parallelize
if args.fmin is not None:
assert "num_frames" in data.columns
data = data[data["num_frames"] >= args.fmin]
@ -193,8 +214,17 @@ def main(args):
if args.aesmin is not None:
assert "aesthetic_score" in data.columns
data = data[data["aesthetic_score"] >= args.aesmin]
if args.matchmin is not None:
assert "clip_score" in data.columns
data = data[data["clip_score"] >= args.matchmin]
print(f"Filtered number of samples: {len(data)}.")
# sort
if args.sort_descending is not None:
data = data.sort_values(by=args.sort_descending, ascending=False)
if args.sort_ascending is not None:
data = data.sort_values(by=args.sort_ascending, ascending=True)
# shard data
if args.shard is not None:
sharded_data = np.array_split(data, args.shard)