From 861d465aad7587d01dc01bb67584c2b58d43b39b Mon Sep 17 00:00:00 2001
From: zhengzangw <zhengzangw@gmail.com>
Date: Tue, 11 Jun 2024 06:21:07 +0000
Subject: [PATCH] datautil: simplify camera-motion captions and add --update-text option

---
 tools/datasets/datautil.py | 57 +++++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/tools/datasets/datautil.py b/tools/datasets/datautil.py
index db073e9..d62a467 100644
--- a/tools/datasets/datautil.py
+++ b/tools/datasets/datautil.py
@@ -12,10 +12,10 @@ import numpy as np
 import pandas as pd
 from PIL import Image
 from tqdm import tqdm
-import torchvision
+
+from opensora.datasets.read_video import read_video
 
 from .utils import IMG_EXTENSIONS
-from opensora.datasets.read_video import read_video
 
 tqdm.pandas()
 
@@ -186,29 +186,29 @@ def remove_caption_prefix(caption):
 # ======================================================
 
 CMOTION_TEXT = {
-    "static": "The camera is static.",
-    "dynamic": "The camera is moving.",
-    "unknown": None,
-    "zoom in": "The camera is zooming in.",
-    "zoom out": "The camera is zooming out.",
-    "pan left": "The camera is panning left.",
-    "pan right": "The camera is panning right.",
-    "tilt up": "The camera is tilting up.",
-    "tilt down": "The camera is tilting down.",
-    "pan/tilt": "The camera is panning.",
+    "static": "static",
+    "pan_right": "pan right",
+    "pan_left": "pan left",
+    "zoom_in": "zoom in",
+    "zoom_out": "zoom out",
+    "tilt_up": "tilt up",
+    "tilt_down": "tilt down",
+    # "pan/tilt": "The camera is panning.",
+    # "dynamic": "The camera is moving.",
+    # "unknown": None,
 }
 CMOTION_PROBS = {
     # hard-coded probabilities
     "static": 1.0,
-    "dynamic": 1.0,
-    "unknown": 0.0,
-    "zoom in": 1.0,
-    "zoom out": 1.0,
-    "pan left": 1.0,
-    "pan right": 1.0,
-    "tilt up": 1.0,
-    "tilt down": 1.0,
-    "pan/tilt": 1.0,
+    "zoom_in": 1.0,
+    "zoom_out": 1.0,
+    "pan_left": 1.0,
+    "pan_right": 1.0,
+    "tilt_up": 1.0,
+    "tilt_down": 1.0,
+    # "dynamic": 1.0,
+    # "unknown": 0.0,
+    # "pan/tilt": 1.0,
 }
 
 
@@ -216,7 +216,7 @@ def merge_cmotion(caption, cmotion):
     text = CMOTION_TEXT[cmotion]
     prob = CMOTION_PROBS[cmotion]
     if text is not None and random.random() < prob:
-        caption = f"{caption} {text}"
+        caption = f"{caption} Camera motion: {text}."
     return caption
 
 
@@ -472,7 +472,7 @@ def read_data(input_paths):
         input_name += os.path.basename(input_path).split(".")[0]
         if i != len(input_list) - 1:
             input_name += "+"
-        print(f"Loaded {len(data[-1])} samples from \'{input_path}\'.")
+        print(f"Loaded {len(data[-1])} samples from '{input_path}'.")
     if len(data) == 0:
         print(f"No samples to process. Exit.")
         exit()
@@ -600,6 +600,14 @@ def main(args):
         data["text_len"] = apply(data["text"], lambda x: len(tokenizer(x)["input_ids"]))
     if args.score_to_text:
         data["text"] = apply(data, score_to_text, axis=1)
+    if args.update_text is not None:
+        data_new = pd.read_csv(args.update_text)
+        num_updated = data.path.isin(data_new.path).sum()
+        print(f"Number of updated samples: {num_updated}.")
+        data = data.set_index("path")
+        data_new = data_new[["path", "text"]].set_index("path")
+        data.update(data_new)
+        data = data.reset_index()
 
     # sort
     if args.sort is not None:
@@ -727,6 +735,7 @@ def parse_args():
     )
     parser.add_argument("--append-text", type=str, default=None, help="append text to the caption")
     parser.add_argument("--score-to-text", action="store_true", help="convert score to text")
+    parser.add_argument("--update-text", type=str, default=None, help="update the text with the given text")
 
     # score filtering
     parser.add_argument("--filesize", action="store_true", help="get the filesize of each video and image in MB")
@@ -806,6 +815,8 @@ def get_output_path(args, input_name):
         name += "_appendtext"
     if args.score_to_text:
         name += "_score2text"
+    if args.update_text is not None:
+        name += "_update"
 
     # score filtering
     if args.filesize: