diff --git a/eval/README.md b/eval/README.md index b80ab0c..701db17 100644 --- a/eval/README.md +++ b/eval/README.md @@ -42,9 +42,9 @@ First, generate the relevant videos with the following commands: ```bash # vbench tasks (4a 4b 4c ...) -bash eval/sample.sh /path/to/ckpt -4a +bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -4a # launch 8 jobs at once (you must read the script to understand the details) -bash eval/vbench/launch.sh /path/to/ckpt +bash eval/vbench/launch.sh /path/to/ckpt num_frames model_name ``` After generation, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples. @@ -53,11 +53,31 @@ After generation, install the VBench package following our [installation](../doc bash eval/vbench/vbench.sh /path/to/video_folder ``` +Finally, we obtain the scaled scores for the model by: +```bash +python eval/vbench/tabulate_vbench_scores.py --score_dir path/to/evaluation_results/dir +``` + ## VBench-i2v [VBench-i2v](https://github.com/Vchitect/VBench/tree/master/vbench2_beta_i2v) is a benchmark for short image to video generation (beta version). +Similarly, install the VBench package following our [installation](../docs/installation.md)'s sections of "Evaluation Dependencies". Then, run the following commands to evaluate the generated samples. -TBD +```bash +# Step 1: generate the relevant videos +# vbench i2v tasks (5a 5b 5c ...) +bash eval/sample.sh /path/to/ckpt num_frames model_name_for_log -5a +# launch 8 jobs at once +bash eval/vbench_i2v/launch.sh /path/to/ckpt num_frames model_name + +# Step 2: run vbench to evaluate the generated samples +python eval/vbench_i2v/vbench_i2v.py +python eval/vbench_i2v/vbench_video_quality.py + +# Step 3: obtain the scaled scores +python eval/vbench_i2v/tabulate_vbench_i2v_scores.py --score_dir path/to/evaluation_results/dir + +``` ## VAE diff --git a/eval/vbench/launch.sh b/eval/vbench/launch.sh index 0ee4009..436a937 100644 --- a/eval/vbench/launch.sh +++ b/eval/vbench/launch.sh @@ -5,7 +5,7 @@ set -e CKPT=$1 MODEL_NAME=$2 -NUM_FRAMES=51 +NUM_FRAMES=$3 if [[ $CKPT == *"ema"* ]]; then parentdir=$(dirname $CKPT) diff --git a/eval/vbench/tabulate_vbench_scores.py b/eval/vbench/tabulate_vbench_scores.py index 3df9805..c7f27b8 100644 --- a/eval/vbench/tabulate_vbench_scores.py +++ b/eval/vbench/tabulate_vbench_scores.py @@ -3,6 +3,68 @@ import json import os from ast import literal_eval +SEMANTIC_WEIGHT = 1 +QUALITY_WEIGHT = 4 + +QUALITY_LIST = [ + "subject consistency", + "background consistency", + "temporal flickering", + "motion smoothness", + "aesthetic quality", + "imaging quality", + "dynamic degree",] + +SEMANTIC_LIST = [ + "object class", + "multiple objects", + "human action", + "color", + "spatial relationship", + "scene", + "appearance style", + "temporal style", + "overall consistency" +] + +NORMALIZE_DIC = { + "subject consistency": {"Min": 0.1462, "Max": 1.0}, + "background consistency": {"Min": 0.2615, "Max": 1.0}, + "temporal flickering": {"Min": 0.6293, "Max": 1.0}, + "motion smoothness": {"Min": 0.706, "Max": 0.9975}, + "dynamic degree": {"Min": 0.0, "Max": 1.0}, + "aesthetic quality": {"Min": 0.0, "Max": 1.0}, + "imaging quality": {"Min": 0.0, "Max": 1.0}, + "object class": {"Min": 0.0, "Max": 1.0}, + "multiple objects": {"Min": 0.0, "Max": 1.0}, + "human action": {"Min": 0.0, "Max": 1.0}, + "color": {"Min": 0.0, "Max": 1.0}, + "spatial relationship": {"Min": 0.0, "Max": 1.0}, + "scene": {"Min": 0.0, "Max": 0.8222}, + "appearance style": {"Min": 0.0009, "Max": 0.2855}, + "temporal style": {"Min": 0.0, "Max": 0.364}, + "overall consistency": {"Min": 0.0, "Max": 0.364} +} + +DIM_WEIGHT = { +"subject consistency":1, +"background consistency":1, +"temporal flickering":1, +"motion smoothness":1, +"aesthetic quality":1, +"imaging quality":1, +"dynamic degree":0.5, +"object class":1, +"multiple objects":1, +"human action":1, +"color":1, +"spatial relationship":1, +"scene":1, +"appearance style":1, +"temporal style":1, +"overall consistency":1 +} + def parse_args(): parser = argparse.ArgumentParser() parser.add_argument("--score_dir", type=str) # evaluation_results/samples_... @@ -19,7 +81,6 @@ if __name__ == "__main__": assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files" full_results = {} - for res_file in res_files: # first check if results is normal info_file = res_file.split(res_postfix)[0] + info_postfix @@ -30,12 +91,37 @@ if __name__ == "__main__": with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f: data = json.load(f) for key, val in data.items(): - full_results[key] = format(val[0]*100, ".2f") + full_results[key] = format(val[0], ".4f") + scaled_results = {} + dims = set() + for key, val in full_results.items(): + dim = key.replace("_", " ") if "_" in key else key + scaled_score = (float(val) - NORMALIZE_DIC[dim]["Min"]) / (NORMALIZE_DIC[dim]["Max"] - NORMALIZE_DIC[dim]["Min"]) + scaled_score *= DIM_WEIGHT[dim] + scaled_results[dim] = scaled_score + dims.add(dim) + + assert len(dims) == len(NORMALIZE_DIC), f"{set(NORMALIZE_DIC.keys())-dims} not calculated yet" + + quality_score = sum([scaled_results[i] for i in QUALITY_LIST]) / sum([DIM_WEIGHT[i] for i in QUALITY_LIST]) + semantic_score = sum([scaled_results[i] for i in SEMANTIC_LIST]) / sum([DIM_WEIGHT[i] for i in SEMANTIC_LIST]) + scaled_results["quality score"] = quality_score + scaled_results["semantic score"] = semantic_score + scaled_results["total score"] = (quality_score * QUALITY_WEIGHT + semantic_score * SEMANTIC_WEIGHT) / (QUALITY_WEIGHT + SEMANTIC_WEIGHT) + + formated_scaled_results = {} + for key,val in scaled_results.items(): + formated_scaled_results[key] = format(val*100, ".2f")+"%" output_file_path = os.path.join(args.score_dir, "all_results.json") with open(output_file_path, "w") as outfile: json.dump(full_results, outfile, indent=4, sort_keys=True) print(f"results saved to: {output_file_path}") + + scaled_file_path = os.path.join(args.score_dir, "scaled_results.json") + with open(scaled_file_path, "w") as outfile: + json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True) + print(f"results saved to: {scaled_file_path}") diff --git a/eval/vbench_i2v/launch.sh b/eval/vbench_i2v/launch.sh index 8900b90..5a183e6 100644 --- a/eval/vbench_i2v/launch.sh +++ b/eval/vbench_i2v/launch.sh @@ -4,8 +4,8 @@ set -x set -e CKPT=$1 -NUM_FRAMES=51 -MODEL_NAME=$2 +NUM_FRAMES=$2 +MODEL_NAME=$3 if [[ $CKPT == *"ema"* ]]; then parentdir=$(dirname $CKPT) @@ -13,7 +13,6 @@ if [[ $CKPT == *"ema"* ]]; then else CKPT_BASE=$(basename $CKPT) fi -# LOG_BASE=logs/sample/${MODEL_NAME}_${CKPT_BASE} LOG_BASE=$(dirname $CKPT)/eval echo "Logging to $LOG_BASE" diff --git a/eval/vbench_i2v/tabulate_vbench_i2v_scores.py b/eval/vbench_i2v/tabulate_vbench_i2v_scores.py new file mode 100644 index 0000000..9f6dc77 --- /dev/null +++ b/eval/vbench_i2v/tabulate_vbench_i2v_scores.py @@ -0,0 +1,110 @@ +import argparse +import json +import os +from ast import literal_eval + +I2V_WEIGHT = 1.0 +I2V_QUALITY_WEIGHT = 1.0 + +I2V_LIST = [ + "Video-Image Subject Consistency", + "Video-Image Background Consistency", +] + +I2V_QUALITY_LIST = [ + "Subject Consistency", + "Background Consistency", + "Motion Smoothness", + "Dynamic Degree", + "Aesthetic Quality", + "Imaging Quality", + "Temporal Flickering" +] + +DIM_WEIGHT_I2V = { +"Video-Text Camera Motion": 0.1, +"Video-Image Subject Consistency": 1, +"Video-Image Background Consistency": 1, +"Subject Consistency": 1, +"Background Consistency": 1, +"Motion Smoothness": 1, +"Dynamic Degree": 0.5, +"Aesthetic Quality": 1, +"Imaging Quality": 1, +"Temporal Flickering": 1 +} + +NORMALIZE_DIC_I2V = { + "Video-Text Camera Motion" :{"Min": 0.0, "Max":1.0 }, + "Video-Image Subject Consistency":{"Min": 0.1462, "Max": 1.0}, + "Video-Image Background Consistency":{"Min": 0.2615, "Max":1.0 }, + "Subject Consistency":{"Min": 0.1462, "Max": 1.0}, + "Background Consistency":{"Min": 0.2615, "Max": 1.0 }, + "Motion Smoothness":{"Min": 0.7060, "Max": 0.9975}, + "Dynamic Degree":{"Min": 0.0, "Max": 1.0}, + "Aesthetic Quality":{"Min": 0.0, "Max": 1.0}, + "Imaging Quality":{"Min": 0.0, "Max": 1.0}, + "Temporal Flickering":{"Min":0.6293, "Max": 1.0} +} + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--score_dir", type=str) # evaluation_results/samples_... + args = parser.parse_args() + return args + +if __name__ == "__main__": + args = parse_args() + res_postfix = "_eval_results.json" + info_postfix = "_full_info.json" + files = os.listdir(args.score_dir) + res_files = [x for x in files if res_postfix in x] + info_files = [x for x in files if info_postfix in x] + assert len(res_files) == len(info_files), f"got {len(res_files)} res files, but {len(info_files)} info files" + + full_results = {} + for res_file in res_files: + # first check if results is normal + info_file = res_file.split(res_postfix)[0] + info_postfix + with open(os.path.join(args.score_dir, info_file), "r", encoding="utf-8") as f: + info = json.load(f) + assert len(info[0]["video_list"]) > 0, f"Error: {info_file} has 0 video list" + # read results + with open(os.path.join(args.score_dir, res_file), "r", encoding="utf-8") as f: + data = json.load(f) + for key, val in data.items(): + full_results[key] = format(val[0], ".4f") + + scaled_results = {} + dims = set() + for key, val in full_results.items(): + dim = key + scaled_score = (float(val) - NORMALIZE_DIC_I2V[dim]["Min"]) / (NORMALIZE_DIC_I2V[dim]["Max"] - NORMALIZE_DIC_I2V[dim]["Min"]) + scaled_score *= DIM_WEIGHT_I2V[dim] + scaled_results[dim] = scaled_score + dims.add(dim) + + assert len(dims) == len(NORMALIZE_DIC_I2V), f"{set(NORMALIZE_DIC_I2V.keys())-dims} not calculated yet" + + quality_score = sum([scaled_results[i] for i in I2V_QUALITY_LIST]) / sum([DIM_WEIGHT_I2V[i] for i in I2V_QUALITY_LIST]) + i2v_score = sum([scaled_results[i] for i in I2V_LIST]) / sum([DIM_WEIGHT_I2V[i] for i in I2V_LIST]) + + scaled_results["quality score"] = quality_score + scaled_results["i2v score"] = i2v_score + scaled_results["total score"] = (quality_score * I2V_QUALITY_WEIGHT + i2v_score * I2V_WEIGHT) / (I2V_QUALITY_WEIGHT + I2V_WEIGHT) + + formated_scaled_results = {} + for key,val in scaled_results.items(): + formated_scaled_results[key] = format(val*100, ".2f")+"%" + + output_file_path = os.path.join(args.score_dir, "all_results.json") + with open(output_file_path, "w") as outfile: + json.dump(full_results, outfile, indent=4, sort_keys=True) + print(f"results saved to: {output_file_path}") + + + scaled_file_path = os.path.join(args.score_dir, "scaled_results.json") + with open(scaled_file_path, "w") as outfile: + json.dump(formated_scaled_results, outfile, indent=4, sort_keys=True) + print(f"results saved to: {scaled_file_path}") \ No newline at end of file diff --git a/eval/vbench_i2v/vbench_video_quality.py b/eval/vbench_i2v/vbench_video_quality.py index 01be7bc..a291426 100644 --- a/eval/vbench_i2v/vbench_video_quality.py +++ b/eval/vbench_i2v/vbench_video_quality.py @@ -2,7 +2,7 @@ from vbench import VBench VIDEO_PATH = "" -DIMENSIONS = ["subject consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality"] +DIMENSIONS = ["subject_consistency", "background_consistency", "motion_smoothness", "dynamic_degree", "aesthetic_quality", "imaging_quality", "temporal_flickering"] my_VBench = VBench("cuda", "vbench2_beta_i2v/vbench2_i2v_full_info.json", "evaluation_results") my_VBench.evaluate(