[fix] resize_crop_to_fill may resize to slightly less than the target size

2026-05-21 03:33:55 +02:00 · 2024-03-30 17:05:15 +08:00 · 2024-03-30 17:05:15 +08:00 · 987283fa1b
commit 987283fa1b
parent b44acc022c
7 changed files with 36 additions and 12 deletions
--- a/configs/opensora-v1-1/train/Vx360p.py
+++ b/configs/opensora-v1-1/train/Vx360p.py
@ -15,7 +15,7 @@ bucket_config = {
 }

 # Define acceleration
-num_workers = 4
+num_workers = 0
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
--- a/opensora/datasets/aspect.py
+++ b/opensora/datasets/aspect.py
@ -19,6 +19,25 @@ def get_aspect_ratios_dict(
    return est


+# S = 8294400
+ASPECT_RATIO_4K = {
+    "0.39": (1798, 4610),
+    "0.42": (1866, 4442),
+    "0.48": (1996, 4158),
+    "0.50": (2036, 4072),
+    "0.52": (2076, 3992),
+    "0.56": (2160, 3840),
+    "0.66": (2340, 3546),
+    "0.75": (2494, 3326),
+    "1.00": (2880, 2880),
+    "1.33": (3322, 2498),
+    "1.52": (3550, 2336),
+    "1.78": (3842, 2158),
+    "1.92": (3990, 2078),
+    "2.00": (4072, 2036),
+    "2.10": (4174, 1988),
+}
+
 # S = 2073600
 ASPECT_RATIO_1080P = {
    "0.39": (900, 2308),
@ -264,4 +283,5 @@ ASPECT_RATIOS = {
    "720p": (921600, ASPECT_RATIO_720P),
    "1024": (1048576, ASPECT_RATIO_1024),
    "1080p": (2073600, ASPECT_RATIO_1080P),
+    "4k": (8294400, ASPECT_RATIO_4K),
 }
--- a/opensora/datasets/utils.py
+++ b/opensora/datasets/utils.py
@ -148,14 +148,15 @@ def resize_crop_to_fill(pil_image, image_size):
    th, tw = image_size
    rh, rw = th / h, tw / w
    if rh > rw:
-        sh, sw = th, int(w * rh)
+        sh, sw = th, round(w * rh)
        image = pil_image.resize((sw, sh), Image.BICUBIC)
        i = 0
        j = int(round((sw - tw) / 2.0))
    else:
-        sh, sw = int(h * rw), tw
+        sh, sw = round(h * rw), tw
        image = pil_image.resize((sw, sh), Image.BICUBIC)
        i = int(round((sh - th) / 2.0))
        j = 0
    arr = np.array(image)
+    assert i + th <= arr.shape[0] and j + tw <= arr.shape[1]
    return Image.fromarray(arr[i : i + th, j : j + tw])
--- a/opensora/datasets/video_transforms.py
+++ b/opensora/datasets/video_transforms.py
@ -111,15 +111,16 @@ def resize_crop_to_fill(clip, target_size):
    th, tw = target_size[0], target_size[1]
    rh, rw = th / h, tw / w
    if rh > rw:
-        sh, sw = th, int(w * rh)
+        sh, sw = th, round(w * rh)
        clip = resize(clip, (sh, sw), "bilinear")
        i = 0
        j = int(round(sw - tw) / 2.0)
    else:
-        sh, sw = int(h * rw), tw
+        sh, sw = round(h * rw), tw
        clip = resize(clip, (sh, sw), "bilinear")
        i = int(round(sh - th) / 2.0)
        j = 0
+    assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
    return crop(clip, i, j, th, tw)


--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@ -33,20 +33,20 @@ The columns are defined as follows:

 ## Dataset to CSV

-As a start point, `convert_dataset.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file:
+As a starting point, `convert.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file:

 ```bash
-python -m tools.datasets.convert_dataset DATASET-TYPE DATA_FOLDER
+python -m tools.datasets.convert DATASET-TYPE DATA_FOLDER
 # general video folder
-python -m tools.datasets.convert_dataset video VIDEO_FOLDER
+python -m tools.datasets.convert video VIDEO_FOLDER
 # general image folder
-python -m tools.datasets.convert_dataset image IMAGE_FOLDER
+python -m tools.datasets.convert image IMAGE_FOLDER
 # imagenet
-python -m tools.datasets.convert_dataset imagenet IMAGENET_FOLDER --split train
+python -m tools.datasets.convert imagenet IMAGENET_FOLDER --split train
 # ucf101
-python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos
+python -m tools.datasets.convert ucf101 UCF101_FOLDER --split videos
 # vidprom
-python -m tools.datasets.convert_dataset vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv
+python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv
 ```

 ## Manage datasets
--- a/tools/datasets/convert_dataset.py
+++ b/tools/datasets/convert_dataset.py
--- a/tools/datasets/csvutil.py
+++ b/tools/datasets/csvutil.py
@ -32,6 +32,8 @@ def get_video_info(path):
    ext = os.path.splitext(path)[1].lower()
    if ext in IMG_EXTENSIONS:
        im = cv2.imread(path)
+        if im is None:
+            return 0, 0, 0, np.nan, np.nan
        height, width = im.shape[:2]
        num_frames, fps = 1, np.nan
    else: