mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-13 06:46:08 +02:00
update csvutil
This commit is contained in:
parent
5e3eca2d0f
commit
6140f1bbba
|
|
@ -25,12 +25,12 @@ python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos
|
|||
|
||||
## Dataset Format
|
||||
|
||||
The dataset should be provided in a CSV file, which is used both for training and data preprocessing. The CSV file should only contain the following columns (can be optional):
|
||||
The dataset should be provided in a CSV file, which is used both for training and data preprocessing. The CSV file should only contain the following columns (can be optional). Aspect ratio is width divided by height.
|
||||
|
||||
```csv
|
||||
path, text, num_frames, aesthetic_score, fps, width, height, aspect_ratio
|
||||
/absolute/path/to/image1.jpg, caption1, num_of_frames, score1
|
||||
/absolute/path/to/video2.mp4, caption2, num_of_frames, score2
|
||||
path, text, num_frames, fps, width, height, aspect_ratio, aesthetic_score, clip_score
|
||||
/absolute/path/to/image1.jpg, caption1, num_of_frames
|
||||
/absolute/path/to/video2.mp4, caption2, num_of_frames
|
||||
```
|
||||
|
||||
We use pandas to manage the CSV files. You can use the following code to read and write the CSV files:
|
||||
|
|
@ -47,14 +47,20 @@ We provide `csvutils.py` to manage the CSV files. You can use the following comm
|
|||
```bash
|
||||
# csvutil takes multiple CSV files as input and merge them into one CSV file
|
||||
python -m tools.datasets.csvutil DATA1.csv DATA2.csv
|
||||
|
||||
# filter frames between 128 and 256, with captions
|
||||
python -m tools.datasets.csvutil DATA.csv --fmin 128 --fmax 256 --remove-empty-caption
|
||||
# compute the number of frames for each video
|
||||
python -m tools.datasets.csvutil DATA.csv --relength
|
||||
python -m tools.datasets.csvutil DATA.csv --video-info
|
||||
# remove caption prefix
|
||||
python -m tools.datasets.csvutil DATA.csv --remove-caption-prefix
|
||||
# generate DATA_root.csv with absolute path
|
||||
python -m tools.datasets.csvutil DATA.csv --abspath /absolute/path/to/dataset
|
||||
|
||||
# examine the first 10 rows of the CSV file
|
||||
head -n 10 DATA1.csv
|
||||
# count the number of data in the CSV file (approximately)
|
||||
wc -l DATA1.csv
|
||||
```
|
||||
|
||||
To accelerate processing speed, you can install [pandarallel](https://github.com/nalepae/pandarallel):
|
||||
|
|
@ -62,3 +68,15 @@ To accelerate processing speed, you can install [pandarallel](https://github.com
|
|||
```bash
|
||||
pip install pandarallel
|
||||
```
|
||||
|
||||
To filter text language, you need to install [lingua](https://github.com/pemistahl/lingua-py):
|
||||
|
||||
```bash
|
||||
pip install lingua-language-detector
|
||||
```
|
||||
|
||||
To get video information, you need to install [opencv-python](https://github.com/opencv/opencv-python):
|
||||
|
||||
```bash
|
||||
pip install opencv-python
|
||||
```
|
||||
|
|
|
|||
|
|
@ -23,11 +23,18 @@ def apply(df, func):
|
|||
return df.progress_apply(func)
|
||||
|
||||
|
||||
def get_video_length(path):
|
||||
def get_video_info(path):
|
||||
import cv2
|
||||
|
||||
cap = cv2.VideoCapture(path)
|
||||
return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
num_frames, height, width, fps = (
|
||||
int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
|
||||
int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
|
||||
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
|
||||
float(cap.get(cv2.CAP_PROP_FPS)),
|
||||
)
|
||||
aspect_ratio = width / height if height > 0 else np.nan
|
||||
return num_frames, height, width, aspect_ratio, fps
|
||||
|
||||
|
||||
LLAVA_PREFIX = [
|
||||
|
|
@ -76,6 +83,8 @@ def parse_args():
|
|||
parser.add_argument("--disable-parallel", action="store_true")
|
||||
# special case
|
||||
parser.add_argument("--shard", type=int, default=None)
|
||||
parser.add_argument("--sort-descending", type=str, default=None)
|
||||
parser.add_argument("--sort-ascending", type=str, default=None)
|
||||
|
||||
# path processing
|
||||
parser.add_argument("--abspath", type=str, default=None)
|
||||
|
|
@ -88,12 +97,13 @@ def parse_args():
|
|||
parser.add_argument("--remove-caption-prefix", action="store_true")
|
||||
parser.add_argument("--unescape", action="store_true")
|
||||
# num_frames processing
|
||||
parser.add_argument("--relength", action="store_true")
|
||||
parser.add_argument("--video-info", action="store_true")
|
||||
# num_frames filtering
|
||||
parser.add_argument("--fmin", type=int, default=None)
|
||||
parser.add_argument("--fmax", type=int, default=None)
|
||||
# aesthetic filtering
|
||||
parser.add_argument("--aesmin", type=float, default=None)
|
||||
parser.add_argument("--matchmin", type=float, default=None)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
|
@ -123,8 +133,8 @@ def get_output_path(args, input_name):
|
|||
if args.unescape:
|
||||
name += "_unescape"
|
||||
# num_frames processing
|
||||
if args.relength:
|
||||
name += "_relength"
|
||||
if args.video_info:
|
||||
name += "_vinfo"
|
||||
# num_frames filtering
|
||||
if args.fmin is not None:
|
||||
name += f"_fmin_{args.fmin}"
|
||||
|
|
@ -133,6 +143,16 @@ def get_output_path(args, input_name):
|
|||
# aesthetic filtering
|
||||
if args.aesmin is not None:
|
||||
name += f"_aesmin_{args.aesmin}"
|
||||
# clip score filtering
|
||||
if args.matchmin is not None:
|
||||
name += f"_matchmin_{args.matchmin}"
|
||||
# sort
|
||||
if args.sort_descending is not None:
|
||||
assert args.sort_ascending is None
|
||||
name += "_sort"
|
||||
if args.sort_ascending is not None:
|
||||
assert args.sort_descending is None
|
||||
name += "_sort"
|
||||
|
||||
output_path = os.path.join(dir_path, f"{name}.csv")
|
||||
return output_path
|
||||
|
|
@ -171,8 +191,9 @@ def main(args):
|
|||
if args.unescape:
|
||||
assert "text" in data.columns
|
||||
data["text"] = apply(data["text"], html.unescape)
|
||||
if args.relength:
|
||||
data["num_frames"] = apply(data["path"], get_video_length)
|
||||
if args.video_info:
|
||||
info = apply(data["path"], get_video_info)
|
||||
data["num_frames"], data["height"], data["width"], data["aspect_ratio"], data["fps"] = zip(*info)
|
||||
|
||||
# filtering
|
||||
if args.remove_empty_caption:
|
||||
|
|
@ -183,7 +204,7 @@ def main(args):
|
|||
data = data[~data["text"].str.contains(r"(?P<url>https?://[^\s]+)", regex=True)]
|
||||
if args.lang is not None:
|
||||
assert "text" in data.columns
|
||||
data = data[data["text"].progress_apply(detect_lang)] # cannot parallelize
|
||||
data = data[data["text"].progress_apply(detect_lang)] # cannot parallelize
|
||||
if args.fmin is not None:
|
||||
assert "num_frames" in data.columns
|
||||
data = data[data["num_frames"] >= args.fmin]
|
||||
|
|
@ -193,8 +214,17 @@ def main(args):
|
|||
if args.aesmin is not None:
|
||||
assert "aesthetic_score" in data.columns
|
||||
data = data[data["aesthetic_score"] >= args.aesmin]
|
||||
if args.matchmin is not None:
|
||||
assert "clip_score" in data.columns
|
||||
data = data[data["clip_score"] >= args.matchmin]
|
||||
print(f"Filtered number of samples: {len(data)}.")
|
||||
|
||||
# sort
|
||||
if args.sort_descending is not None:
|
||||
data = data.sort_values(by=args.sort_descending, ascending=False)
|
||||
if args.sort_ascending is not None:
|
||||
data = data.sort_values(by=args.sort_ascending, ascending=True)
|
||||
|
||||
# shard data
|
||||
if args.shard is not None:
|
||||
sharded_data = np.array_split(data, args.shard)
|
||||
|
|
|
|||
Loading…
Reference in a new issue