diff --git a/tools/datasets/datautil.py b/tools/datasets/datautil.py
index e39197f..d12df80 100644
--- a/tools/datasets/datautil.py
+++ b/tools/datasets/datautil.py
@@ -604,6 +604,12 @@ def main(args):
         data = data.sort_values(by=args.sort_ascending, ascending=True)
 
     # filtering
+    if args.filesize:
+        assert "path" in data.columns, "--filesize requires a 'path' column"
+        data["filesize"] = apply(data["path"], lambda x: os.stat(x).st_size / 1024 / 1024)  # bytes -> MB
+    if args.fsmax is not None:
+        assert "filesize" in data.columns, "--fsmax requires a 'filesize' column (pass --filesize to compute it)"
+        data = data[data["filesize"] <= args.fsmax]
     if args.remove_empty_caption:
         assert "text" in data.columns
         data = data[data["text"].str.len() > 0]
@@ -715,6 +721,8 @@ def parse_args():
     parser.add_argument("--score-to-text", action="store_true", help="convert score to text")
 
     # score filtering
+    parser.add_argument("--filesize", action="store_true", help="get the filesize of each video and image in MB")
+    parser.add_argument("--fsmax", type=int, default=None, help="filter the dataset by maximum filesize in MB")
     parser.add_argument("--fmin", type=int, default=None, help="filter the dataset by minimum number of frames")
     parser.add_argument("--fmax", type=int, default=None, help="filter the dataset by maximum number of frames")
     parser.add_argument("--hwmax", type=int, default=None, help="filter the dataset by maximum resolution")
@@ -790,6 +798,10 @@ def get_output_path(args, input_name):
         name += "_score2text"
 
     # score filtering
+    if args.filesize:
+        name += "_filesize"  # mark outputs produced with --filesize
+    if args.fsmax is not None:
+        name += f"_fsmax{args.fsmax}"  # record the --fsmax threshold in the output name
     if args.fmin is not None:
         name += f"_fmin{args.fmin}"
     if args.fmax is not None: