From 987283fa1bf6fcb78f093ee464cff9e97fe31ac9 Mon Sep 17 00:00:00 2001 From: Zangwei Zheng Date: Sat, 30 Mar 2024 17:05:15 +0800 Subject: [PATCH] [fix] transform may not fit enough --- configs/opensora-v1-1/train/Vx360p.py | 2 +- opensora/datasets/aspect.py | 20 +++++++++++++++++++ opensora/datasets/utils.py | 5 +++-- opensora/datasets/video_transforms.py | 5 +++-- tools/datasets/README.md | 14 ++++++------- .../{convert_dataset.py => convert.py} | 0 tools/datasets/csvutil.py | 2 ++ 7 files changed, 36 insertions(+), 12 deletions(-) rename tools/datasets/{convert_dataset.py => convert.py} (100%) diff --git a/configs/opensora-v1-1/train/Vx360p.py b/configs/opensora-v1-1/train/Vx360p.py index 72afafa..81ff50a 100644 --- a/configs/opensora-v1-1/train/Vx360p.py +++ b/configs/opensora-v1-1/train/Vx360p.py @@ -15,7 +15,7 @@ bucket_config = { } # Define acceleration -num_workers = 4 +num_workers = 0 dtype = "bf16" grad_checkpoint = True plugin = "zero2" diff --git a/opensora/datasets/aspect.py b/opensora/datasets/aspect.py index 5a8ac05..f0ead5b 100644 --- a/opensora/datasets/aspect.py +++ b/opensora/datasets/aspect.py @@ -19,6 +19,25 @@ def get_aspect_ratios_dict( return est +# S = 8294400 +ASPECT_RATIO_4K = { + "0.39": (1798, 4610), + "0.42": (1866, 4442), + "0.48": (1996, 4158), + "0.50": (2036, 4072), + "0.52": (2076, 3992), + "0.56": (2160, 3840), + "0.66": (2340, 3546), + "0.75": (2494, 3326), + "1.00": (2880, 2880), + "1.33": (3322, 2498), + "1.52": (3550, 2336), + "1.78": (3842, 2158), + "1.92": (3990, 2078), + "2.00": (4072, 2036), + "2.10": (4174, 1988), +} + # S = 2073600 ASPECT_RATIO_1080P = { "0.39": (900, 2308), @@ -264,4 +283,5 @@ ASPECT_RATIOS = { "720p": (921600, ASPECT_RATIO_720P), "1024": (1048576, ASPECT_RATIO_1024), "1080p": (2073600, ASPECT_RATIO_1080P), + "4k": (8294400, ASPECT_RATIO_4K), } diff --git a/opensora/datasets/utils.py b/opensora/datasets/utils.py index 85d1d18..267d4a8 100644 --- a/opensora/datasets/utils.py +++ b/opensora/datasets/utils.py @@ -148,14 +148,15 @@ def resize_crop_to_fill(pil_image, image_size): th, tw = image_size rh, rw = th / h, tw / w if rh > rw: - sh, sw = th, int(w * rh) + sh, sw = th, round(w * rh) image = pil_image.resize((sw, sh), Image.BICUBIC) i = 0 j = int(round((sw - tw) / 2.0)) else: - sh, sw = int(h * rw), tw + sh, sw = round(h * rw), tw image = pil_image.resize((sw, sh), Image.BICUBIC) i = int(round((sh - th) / 2.0)) j = 0 arr = np.array(image) + assert i + th <= arr.shape[0] and j + tw <= arr.shape[1] return Image.fromarray(arr[i : i + th, j : j + tw]) diff --git a/opensora/datasets/video_transforms.py b/opensora/datasets/video_transforms.py index bccf0a3..8cf5046 100644 --- a/opensora/datasets/video_transforms.py +++ b/opensora/datasets/video_transforms.py @@ -111,15 +111,16 @@ def resize_crop_to_fill(clip, target_size): th, tw = target_size[0], target_size[1] rh, rw = th / h, tw / w if rh > rw: - sh, sw = th, int(w * rh) + sh, sw = th, round(w * rh) clip = resize(clip, (sh, sw), "bilinear") i = 0 j = int(round(sw - tw) / 2.0) else: - sh, sw = int(h * rw), tw + sh, sw = round(h * rw), tw clip = resize(clip, (sh, sw), "bilinear") i = int(round(sh - th) / 2.0) j = 0 + assert i + th <= clip.size(-2) and j + tw <= clip.size(-1) return crop(clip, i, j, th, tw) diff --git a/tools/datasets/README.md b/tools/datasets/README.md index 2ea5cea..3ec7a81 100644 --- a/tools/datasets/README.md +++ b/tools/datasets/README.md @@ -33,20 +33,20 @@ The columns are defined as follows: ## Dataset to CSV -As a start point, `convert_dataset.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file: +As a start point, `convert.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file: ```bash -python -m tools.datasets.convert_dataset DATASET-TYPE DATA_FOLDER +python -m tools.datasets.convert DATASET-TYPE DATA_FOLDER # general video folder -python -m tools.datasets.convert_dataset video VIDEO_FOLDER +python -m tools.datasets.convert video VIDEO_FOLDER # general image folder -python -m tools.datasets.convert_dataset image IMAGE_FOLDER +python -m tools.datasets.convert image IMAGE_FOLDER # imagenet -python -m tools.datasets.convert_dataset imagenet IMAGENET_FOLDER --split train +python -m tools.datasets.convert imagenet IMAGENET_FOLDER --split train # ucf101 -python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos +python -m tools.datasets.convert ucf101 UCF101_FOLDER --split videos # vidprom -python -m tools.datasets.convert_dataset vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv +python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv ``` ## Manage datasets diff --git a/tools/datasets/convert_dataset.py b/tools/datasets/convert.py similarity index 100% rename from tools/datasets/convert_dataset.py rename to tools/datasets/convert.py diff --git a/tools/datasets/csvutil.py b/tools/datasets/csvutil.py index 0ab0f47..110be8c 100644 --- a/tools/datasets/csvutil.py +++ b/tools/datasets/csvutil.py @@ -32,6 +32,8 @@ def get_video_info(path): ext = os.path.splitext(path)[1].lower() if ext in IMG_EXTENSIONS: im = cv2.imread(path) + if im is None: + return 0, 0, 0, np.nan, np.nan height, width = im.shape[:2] num_frames, fps = 1, np.nan else: