From 987283fa1bf6fcb78f093ee464cff9e97fe31ac9 Mon Sep 17 00:00:00 2001
From: Zangwei Zheng <zangwei@comp.nus.edu.sg>
Date: Sat, 30 Mar 2024 17:05:15 +0800
Subject: [PATCH] [fix] resize_crop_to_fill may resize below the target size

---
 configs/opensora-v1-1/train/Vx360p.py         |  2 +-
 opensora/datasets/aspect.py                   | 20 +++++++++++++++++++
 opensora/datasets/utils.py                    |  5 +++--
 opensora/datasets/video_transforms.py         |  5 +++--
 tools/datasets/README.md                      | 14 ++++++-------
 .../{convert_dataset.py => convert.py}        |  0
 tools/datasets/csvutil.py                     |  2 ++
 7 files changed, 36 insertions(+), 12 deletions(-)
 rename tools/datasets/{convert_dataset.py => convert.py} (100%)

diff --git a/configs/opensora-v1-1/train/Vx360p.py b/configs/opensora-v1-1/train/Vx360p.py
index 72afafa..81ff50a 100644
--- a/configs/opensora-v1-1/train/Vx360p.py
+++ b/configs/opensora-v1-1/train/Vx360p.py
@@ -15,7 +15,7 @@ bucket_config = {
 }
 
 # Define acceleration
-num_workers = 4
+num_workers = 0
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
diff --git a/opensora/datasets/aspect.py b/opensora/datasets/aspect.py
index 5a8ac05..f0ead5b 100644
--- a/opensora/datasets/aspect.py
+++ b/opensora/datasets/aspect.py
@@ -19,6 +19,25 @@ def get_aspect_ratios_dict(
     return est
 
 
+# S = 8294400
+ASPECT_RATIO_4K = {
+    "0.39": (1798, 4610),
+    "0.42": (1866, 4442),
+    "0.48": (1996, 4158),
+    "0.50": (2036, 4072),
+    "0.52": (2076, 3992),
+    "0.56": (2160, 3840),
+    "0.66": (2340, 3546),
+    "0.75": (2494, 3326),
+    "1.00": (2880, 2880),
+    "1.33": (3322, 2498),
+    "1.52": (3550, 2336),
+    "1.78": (3842, 2158),
+    "1.92": (3990, 2078),
+    "2.00": (4072, 2036),
+    "2.10": (4174, 1988),
+}
+
 # S = 2073600
 ASPECT_RATIO_1080P = {
     "0.39": (900, 2308),
@@ -264,4 +283,5 @@ ASPECT_RATIOS = {
     "720p": (921600, ASPECT_RATIO_720P),
     "1024": (1048576, ASPECT_RATIO_1024),
     "1080p": (2073600, ASPECT_RATIO_1080P),
+    "4k": (8294400, ASPECT_RATIO_4K),
 }
diff --git a/opensora/datasets/utils.py b/opensora/datasets/utils.py
index 85d1d18..267d4a8 100644
--- a/opensora/datasets/utils.py
+++ b/opensora/datasets/utils.py
@@ -148,14 +148,15 @@ def resize_crop_to_fill(pil_image, image_size):
     th, tw = image_size
     rh, rw = th / h, tw / w
     if rh > rw:
-        sh, sw = th, int(w * rh)
+        sh, sw = th, round(w * rh)
         image = pil_image.resize((sw, sh), Image.BICUBIC)
         i = 0
         j = int(round((sw - tw) / 2.0))
     else:
-        sh, sw = int(h * rw), tw
+        sh, sw = round(h * rw), tw
         image = pil_image.resize((sw, sh), Image.BICUBIC)
         i = int(round((sh - th) / 2.0))
         j = 0
     arr = np.array(image)
+    assert i + th <= arr.shape[0] and j + tw <= arr.shape[1]
     return Image.fromarray(arr[i : i + th, j : j + tw])
diff --git a/opensora/datasets/video_transforms.py b/opensora/datasets/video_transforms.py
index bccf0a3..8cf5046 100644
--- a/opensora/datasets/video_transforms.py
+++ b/opensora/datasets/video_transforms.py
@@ -111,15 +111,16 @@ def resize_crop_to_fill(clip, target_size):
     th, tw = target_size[0], target_size[1]
     rh, rw = th / h, tw / w
     if rh > rw:
-        sh, sw = th, int(w * rh)
+        sh, sw = th, round(w * rh)
         clip = resize(clip, (sh, sw), "bilinear")
         i = 0
         j = int(round(sw - tw) / 2.0)
     else:
-        sh, sw = int(h * rw), tw
+        sh, sw = round(h * rw), tw
         clip = resize(clip, (sh, sw), "bilinear")
         i = int(round(sh - th) / 2.0)
         j = 0
+    assert i + th <= clip.size(-2) and j + tw <= clip.size(-1)
     return crop(clip, i, j, th, tw)
 
 
diff --git a/tools/datasets/README.md b/tools/datasets/README.md
index 2ea5cea..3ec7a81 100644
--- a/tools/datasets/README.md
+++ b/tools/datasets/README.md
@@ -33,20 +33,20 @@ The columns are defined as follows:
 
 ## Dataset to CSV
 
-As a start point, `convert_dataset.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file:
+As a starting point, `convert.py` is used to convert the dataset to a CSV file. You can use the following commands to convert the dataset to a CSV file:
 
 ```bash
-python -m tools.datasets.convert_dataset DATASET-TYPE DATA_FOLDER
+python -m tools.datasets.convert DATASET-TYPE DATA_FOLDER
 # general video folder
-python -m tools.datasets.convert_dataset video VIDEO_FOLDER
+python -m tools.datasets.convert video VIDEO_FOLDER
 # general image folder
-python -m tools.datasets.convert_dataset image IMAGE_FOLDER
+python -m tools.datasets.convert image IMAGE_FOLDER
 # imagenet
-python -m tools.datasets.convert_dataset imagenet IMAGENET_FOLDER --split train
+python -m tools.datasets.convert imagenet IMAGENET_FOLDER --split train
 # ucf101
-python -m tools.datasets.convert_dataset ucf101 UCF101_FOLDER --split videos
+python -m tools.datasets.convert ucf101 UCF101_FOLDER --split videos
 # vidprom
-python -m tools.datasets.convert_dataset vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv
+python -m tools.datasets.convert vidprom VIDPROM_FOLDER --info VidProM_semantic_unique.csv
 ```
 
 ## Manage datasets
diff --git a/tools/datasets/convert_dataset.py b/tools/datasets/convert.py
similarity index 100%
rename from tools/datasets/convert_dataset.py
rename to tools/datasets/convert.py
diff --git a/tools/datasets/csvutil.py b/tools/datasets/csvutil.py
index 0ab0f47..110be8c 100644
--- a/tools/datasets/csvutil.py
+++ b/tools/datasets/csvutil.py
@@ -32,6 +32,8 @@ def get_video_info(path):
     ext = os.path.splitext(path)[1].lower()
     if ext in IMG_EXTENSIONS:
         im = cv2.imread(path)
+        if im is None:
+            return 0, 0, 0, np.nan, np.nan
         height, width = im.shape[:2]
         num_frames, fps = 1, np.nan
     else: