automatically calculate scaled scores

This commit is contained in:
Shen-Chenhui 2024-06-05 06:37:23 +00:00
parent b3e62fe989
commit b8a93d5aba
6 changed files with 225 additions and 10 deletions

View file

@ -42,9 +42,9 @@ First, generate the relevant videos with the following commands:
```bash
# vbench tasks (4a 4b 4c ...)
bash eval/sample.sh /path/to/ckpt -4a
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4a
# launch 8 jobs at once (read the script for details)
bash eval/vbench/launch.sh /path/to/ckpt
bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name
```
After generation, install the VBench package by following the "Evaluation Dependencies" section of our [installation guide](../docs/installation.md). Then run the following commands to evaluate the generated samples.
@ -53,11 +53,31 @@ After generation, install the VBench package following our [installation](../doc
bash eval/vbench/vbench.sh /path/to/video_folder
```
Finally, we obtain the scaled scores for the model by:
```bash
python eval/vbench/tabulate_vbench_scores.py --score_dir path/to/evaluation_results/dir
```
## VBench-i2v
[VBench-i2v](https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v) is a benchmark for short image-to-video generation (beta version).
Similarly, install the VBench package by following the "Evaluation Dependencies" section of our [installation guide](../docs/installation.md). Then run the following commands to evaluate the generated samples.
TBD
```bash
# Step 1: generate the relevant videos
# vbench i2v tasks (5a 5b 5c ...)
bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -5a
# launch 8 jobs at once
bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name
# Step 2: run vbench to evaluate the generated samples
python eval/vbench_i2v/vbench_i2v.py
python eval/vbench_i2v/vbench_video_quality.py
# Step 3: obtain the scaled scores
python eval/vbench_i2v/tabulate_vbench_i2v_scores.py --score_dir path/to/evaluation_results/dir
```
## VAE

View file

@ -5,7 +5,7 @@ set -e
CKPT=$1
MODEL_NAME=$2
NUM_FRAMES=51
NUM_FRAMES=$3
if [[ $CKPT == *"ema"* ]]; then
parentdir=$(dirname $CKPT)

View file

@ -3,6 +3,68 @@ import json
import os
from ast import literal_eval
# Relative weights for combining the two aggregate scores into the final
# total score: quality counts 4x as much as semantics.
SEMANTIC_WEIGHT = 1
QUALITY_WEIGHT = 4
# Dimensions aggregated into the "quality score".
QUALITY_LIST = [
"subject consistency",
"background consistency",
"temporal flickering",
"motion smoothness",
"aesthetic quality",
"imaging quality",
"dynamic degree",]
# Dimensions aggregated into the "semantic score".
SEMANTIC_LIST = [
"object class",
"multiple objects",
"human action",
"color",
"spatial relationship",
"scene",
"appearance style",
"temporal style",
"overall consistency"
]
# Per-dimension Min/Max bounds used to min-max normalize raw scores into
# [0, 1] before weighting.
# NOTE(review): these constants appear to mirror the official VBench
# leaderboard normalization table — confirm against upstream VBench docs.
NORMALIZE_DIC = {
"subject consistency": {"Min": 0.1462, "Max": 1.0},
"background consistency": {"Min": 0.2615, "Max": 1.0},
"temporal flickering": {"Min": 0.6293, "Max": 1.0},
"motion smoothness": {"Min": 0.706, "Max": 0.9975},
"dynamic degree": {"Min": 0.0, "Max": 1.0},
"aesthetic quality": {"Min": 0.0, "Max": 1.0},
"imaging quality": {"Min": 0.0, "Max": 1.0},
"object class": {"Min": 0.0, "Max": 1.0},
"multiple objects": {"Min": 0.0, "Max": 1.0},
"human action": {"Min": 0.0, "Max": 1.0},
"color": {"Min": 0.0, "Max": 1.0},
"spatial relationship": {"Min": 0.0, "Max": 1.0},
"scene": {"Min": 0.0, "Max": 0.8222},
"appearance style": {"Min": 0.0009, "Max": 0.2855},
"temporal style": {"Min": 0.0, "Max": 0.364},
"overall consistency": {"Min": 0.0, "Max": 0.364}
}
# Per-dimension weight applied after normalization; "dynamic degree" is
# down-weighted to 0.5, all other dimensions count fully.
DIM_WEIGHT = {
"subject consistency":1,
"background consistency":1,
"temporal flickering":1,
"motion smoothness":1,
"aesthetic quality":1,
"imaging quality":1,
"dynamic degree":0.5,
"object class":1,
"multiple objects":1,
"human action":1,
"color":1,
"spatial relationship":1,
"scene":1,
"appearance style":1,
"temporal style":1,
"overall consistency":1
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--score_dir", type=str) # evaluation_results/samples_...
@ -19,7 +81,6 @@ if __name__ == "__main__":
assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"
full_results = {}
for res_file in res_files:
# first check if results is normal
info_file = res_file.split(res_postfix)[0] + info_postfix
@ -30,12 +91,37 @@ if __name__ == "__main__":
with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
data = json.load(f)
for key, val in data.items():
full_results[key] = format(val[0]*100, ".2f")
full_results[key] = format(val[0], ".4f")
scaled_results = {}
dims = set()
for key, val in full_results.items():
dim = key.replace("_", " ") if "_" in key else key
scaled_score = (float(val) - NORMALIZE_DIC[dim]["Min"]) / (NORMALIZE_DIC[dim]["Max"] - NORMALIZE_DIC[dim]["Min"])
scaled_score *= DIM_WEIGHT[dim]
scaled_results[dim] = scaled_score
dims.add(dim)
assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet"
quality_score = sum([scaled_results[i] for i in QUALITY_LIST]) / sum([DIM_WEIGHT[i] for i in QUALITY_LIST])
semantic_score = sum([scaled_results[i] for i in SEMANTIC_LIST]) / sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST])
scaled_results["quality score"] = quality_score
scaled_results["semantic score"] = semantic_score
scaled_results["total score"] = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT)
formated_scaled_results = {}
for key,val in scaled_results.items():
formated_scaled_results[key] = format(val*100, ".2f")+"%"
output_file_path = os.path.join(args.score_dir, "all_results.json")
with open(output_file_path, "w") as outfile:
json.dump(full_results, outfile, indent=4, sort_keys=True)
print(f"results saved to: {output_file_path}")
scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")
with open(scaled_file_path, "w") as outfile:
json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
print(f"results saved to: {scaled_file_path}")

View file

@ -4,8 +4,8 @@ set -x
set -e
CKPT=$1
NUM_FRAMES=51
MODEL_NAME=$2
NUM_FRAMES=$2
MODEL_NAME=$3
if [[ $CKPT == *"ema"* ]]; then
parentdir=$(dirname $CKPT)
@ -13,7 +13,6 @@ if [[ $CKPT == *"ema"* ]]; then
else
CKPT_BASE=$(basename $CKPT)
fi
# LOG_BASE=logs/sample/${MODEL_NAME}_${CKPT_BASE}
LOG_BASE=$(dirname $CKPT)/eval
echo "Logging to $LOG_BASE"

View file

@ -0,0 +1,110 @@
import argparse
import json
import os
from ast import literal_eval
# Relative weights for combining the i2v and quality aggregate scores into
# the final total score (equal weighting).
I2V_WEIGHT = 1.0
I2V_QUALITY_WEIGHT = 1.0
# Dimensions aggregated into the "i2v score" (video/image agreement).
I2V_LIST = [
"Video-Image Subject Consistency",
"Video-Image Background Consistency",
]
# Dimensions aggregated into the "quality score".
I2V_QUALITY_LIST = [
"Subject Consistency",
"Background Consistency",
"Motion Smoothness",
"Dynamic Degree",
"Aesthetic Quality",
"Imaging Quality",
"Temporal Flickering"
]
# Per-dimension weight applied after normalization; "Dynamic Degree" is
# down-weighted to 0.5 and "Video-Text Camera Motion" to 0.1.
# NOTE(review): "Video-Text Camera Motion" has weight and normalization
# entries but appears in neither I2V_LIST nor I2V_QUALITY_LIST, so it is
# scaled yet excluded from every aggregate — confirm this is intentional.
DIM_WEIGHT_I2V = {
"Video-Text Camera Motion": 0.1,
"Video-Image Subject Consistency": 1,
"Video-Image Background Consistency": 1,
"Subject Consistency": 1,
"Background Consistency": 1,
"Motion Smoothness": 1,
"Dynamic Degree": 0.5,
"Aesthetic Quality": 1,
"Imaging Quality": 1,
"Temporal Flickering": 1
}
# Per-dimension Min/Max bounds used to min-max normalize raw scores into
# [0, 1] before weighting.
# NOTE(review): values appear to mirror the official VBench-i2v
# normalization table — confirm against upstream VBench docs.
NORMALIZE_DIC_I2V = {
"Video-Text Camera Motion" :{"Min": 0.0, "Max":1.0 },
"Video-Image Subject Consistency":{"Min": 0.1462, "Max": 1.0},
"Video-Image Background Consistency":{"Min": 0.2615, "Max":1.0 },
"Subject Consistency":{"Min": 0.1462, "Max": 1.0},
"Background Consistency":{"Min": 0.2615, "Max": 1.0 },
"Motion Smoothness":{"Min": 0.7060, "Max": 0.9975},
"Dynamic Degree":{"Min": 0.0, "Max": 1.0},
"Aesthetic Quality":{"Min": 0.0, "Max": 1.0},
"Imaging Quality":{"Min": 0.0, "Max": 1.0},
"Temporal Flickering":{"Min":0.6293, "Max": 1.0}
}
def parse_args():
    """Parse command-line options for the score-tabulation script."""
    parser = argparse.ArgumentParser()
    # Directory holding the VBench-i2v evaluation result JSON files,
    # e.g. evaluation_results/samples_...
    parser.add_argument("--score_dir", type=str)
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()

    # Each evaluated dimension produces a pair of files:
    #   <dim>_eval_results.json (scores) and <dim>_full_info.json (metadata).
    res_postfix = "_eval_results.json"
    info_postfix = "_full_info.json"
    filenames = os.listdir(args.score_dir)
    res_files = [name for name in filenames if res_postfix in name]
    info_files = [name for name in filenames if info_postfix in name]
    assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files"

    # Collect the raw score of every dimension, formatted to 4 decimals.
    full_results = {}
    for res_file in res_files:
        # Sanity check: the matching info file must list at least one video.
        info_file = res_file.split(res_postfix)[0] + info_postfix
        with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f:
            info = json.load(f)
        assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list"
        # Read the raw results; val[0] is the dimension's aggregate score.
        with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f:
            data = json.load(f)
        for dim_name, val in data.items():
            full_results[dim_name] = format(val[0], ".4f")

    # Min-max normalize each dimension and apply its weight.
    scaled_results = {}
    dims = set()
    for dim, raw_score in full_results.items():
        bounds = NORMALIZE_DIC_I2V[dim]
        normalized = (float(raw_score) - bounds["Min"]) / (bounds["Max"] - bounds["Min"])
        scaled_results[dim] = normalized * DIM_WEIGHT_I2V[dim]
        dims.add(dim)
    assert len(dims) == len(NORMALIZE_DIC_I2V), f"{set(NORMALIZE_DIC_I2V.keys())-dims} not calculated yet"

    # Weighted averages over each dimension group, then the combined total.
    quality_score = sum(scaled_results[d] for d in I2V_QUALITY_LIST) / sum(DIM_WEIGHT_I2V[d] for d in I2V_QUALITY_LIST)
    i2v_score = sum(scaled_results[d] for d in I2V_LIST) / sum(DIM_WEIGHT_I2V[d] for d in I2V_LIST)
    scaled_results["quality score"] = quality_score
    scaled_results["i2v score"] = i2v_score
    scaled_results["total score"] = (quality_score * I2V_QUALITY_WEIGHT + i2v_score * I2V_WEIGHT) / (I2V_QUALITY_WEIGHT + I2V_WEIGHT)

    # Render scaled scores as percentage strings for the report.
    formated_scaled_results = {key: format(val * 100, ".2f") + "%" for key, val in scaled_results.items()}

    # Persist raw and scaled results next to the input score files.
    output_file_path = os.path.join(args.score_dir, "all_results.json")
    with open(output_file_path, "w") as outfile:
        json.dump(full_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {output_file_path}")

    scaled_file_path = os.path.join(args.score_dir, "scaled_results.json")
    with open(scaled_file_path, "w") as outfile:
        json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True)
    print(f"results saved to: {scaled_file_path}")

View file

@ -2,7 +2,7 @@ from vbench import VBench
VIDEO_PATH = ""
DIMENSIONS = ["subject consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality"]
DIMENSIONS = ["subject_consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality", "temporal_flickering"]
my_VBench = VBench("cuda", "vbench2_beta_i2v/vbench2_i2v_full_info.json", "evaluation_results")
my_VBench.evaluate(