diff --git a/.gitignore b/.gitignore index 7bfd505..a09f631 100644 --- a/.gitignore +++ b/.gitignore @@ -198,4 +198,5 @@ flash-attention datasets # inference scaling -temp/ \ No newline at end of file +temp* +samples* \ No newline at end of file diff --git a/README.md b/README.md index 72b6e8d..29a5755 100644 --- a/README.md +++ b/README.md @@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True ``` +### Inference Scaling + +We implemented an inference scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option. + +``` +torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px_t2i2v_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv +``` + +| Original |
num_subtree=3
num_scaling_steps=5
num_noise=1
time=16min |
num_subtree=7
num_scaling_steps=8
num_noise=1
time=1h | +|----------------------|----------------------------------------------------------------|----------------------------------------------------------------| +| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] | +| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] | + + ### Reproductivity To make the results reproducible, you can set the random seed by: diff --git a/configs/diffusion/inference/256px_inference_scaling.py b/configs/diffusion/inference/256px_inference_scaling.py deleted file mode 100644 index 9c9d4e5..0000000 --- a/configs/diffusion/inference/256px_inference_scaling.py +++ /dev/null @@ -1,28 +0,0 @@ -_base_ = [ # inherit grammer from mmengine - "256px.py", - "plugins/t2i2v.py", - "plugins/tp.py", # use tensor parallel -] -sampling_option = dict( - resolution="256px", # 256px or 768px - aspect_ratio="16:9", # 9:16 or 16:9 or 1:1 - num_frames=129, # number of frames - num_steps=50, # number of steps - shift=True, - temporal_reduction=4, - is_causal_vae=True, - guidance=7.5, # guidance for text-to-video - guidance_img=3.0, # guidance for image-to-video - text_osci=True, # enable text guidance oscillation - image_osci=True, # enable image guidance oscillation - scale_temporal_osci=True, - method="i2v_inference_scaling", # hard-coded for now - vbench_dimension_list=['subject_consistency'], - do_inference_scaling=True, - num_subtree=3, - backward_scale=0.78, - forward_scale=0.83, - scaling_steps=[1,2,4,7,9,15,20], - seed=None, # random seed for z - vbench_gpus=[4,5,6,7] -) \ No newline at end of file diff --git a/configs/diffusion/inference/768px_inference_scaling.py b/configs/diffusion/inference/768px_inference_scaling.py deleted file mode 100644 index 2e38397..0000000 --- a/configs/diffusion/inference/768px_inference_scaling.py +++ /dev/null @@ -1,17 +0,0 @@ -_base_ = [ # inherit grammer from mmengine - "256px.py", - "plugins/sp.py", # use sequence parallel - "plugins/t2i2v.py", -] -sampling_option = dict( 
- resolution="768px", # 256px or 768px - method="i2v_inference_scaling", # hard-coded for now - vbench_dimension_list=['subject_consistency'], - do_inference_scaling=True, - num_subtree=3, - backward_scale=0.78, - forward_scale=0.83, - scaling_steps=[1,2,4,7,9,15,20], - seed=None, # random seed for z - vbench_gpus=[4,5,6,7] -) \ No newline at end of file diff --git a/configs/diffusion/inference/t2i2v_256px_inference_scaling.py b/configs/diffusion/inference/t2i2v_256px_inference_scaling.py new file mode 100644 index 0000000..8372f6f --- /dev/null +++ b/configs/diffusion/inference/t2i2v_256px_inference_scaling.py @@ -0,0 +1,17 @@ +_base_ = [ # inherit grammar from mmengine + "256px.py", + "plugins/t2i2v.py", +] + +# update the inference scaling parameters +sampling_option = dict( + method="i2v_inference_scaling", + vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'], + do_inference_scaling=True, + num_subtree=3, + backward_scale=0.78, + forward_scale=0.83, + scaling_steps=[1,2,4,7,9,15,20], + vbench_gpus=[4,5,6,7], + seed=42 +) \ No newline at end of file diff --git a/configs/diffusion/inference/t2i2v_768px_inference_scaling.py b/configs/diffusion/inference/t2i2v_768px_inference_scaling.py new file mode 100644 index 0000000..3833767 --- /dev/null +++ b/configs/diffusion/inference/t2i2v_768px_inference_scaling.py @@ -0,0 +1,43 @@ +_base_ = [ # inherit grammar from mmengine + "768px.py", + "plugins/t2i2v.py", +] + +# # update the inference scaling parameters +# sampling_option = dict( +# method="i2v_inference_scaling", +# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'], +# do_inference_scaling=True, +# num_subtree=3, +# backward_scale=1.0, +# forward_scale=0.5, +# scaling_steps=[1,2,4,8,13], +# vbench_gpus=[4,5,6,7], +# seed=42 +# ) + +# second setting +sampling_option = 
dict( + method="i2v_inference_scaling", + vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'], + do_inference_scaling=True, + num_subtree=5, + backward_scale=1.0, + forward_scale=0.5, + scaling_steps=[1,2,3,4,6,8,10,13], + vbench_gpus=[6,7], + seed=42 +) + +# third setting +# sampling_option = dict( +# method="i2v_inference_scaling", +# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'], +# do_inference_scaling=True, +# num_subtree=8, +# backward_scale=0.78, +# forward_scale=0.83, +# scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35], +# vbench_gpus=[6,7], +# seed=42 +# ) diff --git a/demo b/demo new file mode 160000 index 0000000..5f49710 --- /dev/null +++ b/demo @@ -0,0 +1 @@ +Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23 diff --git a/opensora/utils/sampling.py b/opensora/utils/sampling.py index 907f7f2..92bb988 100644 --- a/opensora/utils/sampling.py +++ b/opensora/utils/sampling.py @@ -6,6 +6,7 @@ from dataclasses import dataclass, replace import json import subprocess from collections import defaultdict +import sys import torch import torchvision @@ -105,62 +106,69 @@ class SamplingOption: vbench_dimension_list: list = None -def find_highest_score_video(data): - video_scores = defaultdict(list) - normalization_rules = { - "subject_consistency": lambda e: e["video_results"], - "background_consistency": lambda e: e["video_results"], - "temporal_flickering": lambda e: e["video_results"], - "motion_smoothness": lambda e: e["video_results"], - - "dynamic_degree": lambda e: 1.0 if e["video_results"] else 0.0, - - "aesthetic_quality": lambda e: e["video_results"], - "imaging_quality": lambda e: e["video_results"] / 100, - - "human_action": lambda e: e["cor_num_per_video"], - - "temporal_style": lambda e: e["video_results"], - "overall_consistency": 
lambda e: e["video_results"] - } +NORMALIZE_DIC = { + "subject consistency": {"Min": 0.1462, "Max": 1.0}, + "background consistency": {"Min": 0.2615, "Max": 1.0}, + "motion smoothness": {"Min": 0.706, "Max": 0.9975}, + "dynamic degree": {"Min": 0.0, "Max": 1.0}, + "aesthetic quality": {"Min": 0.0, "Max": 1.0}, + "imaging quality": {"Min": 0.0, "Max": 1.0}, +} +DIM_WEIGHT = { + "subject consistency":1, + "background consistency":1, + "motion smoothness":1, + "aesthetic quality":1, + "imaging quality":1, + "dynamic degree":0.5, +} + + +def find_highest_score_video(data): + video_scores = defaultdict(dict) + for metric_name, metric_data in data.items(): if not isinstance(metric_data, list) or len(metric_data) < 2: continue - process_rule = normalization_rules.get(metric_name) - if not process_rule: + if metric_name not in NORMALIZE_DIC: continue + min_val = NORMALIZE_DIC[metric_name]["Min"] + max_val = NORMALIZE_DIC[metric_name]["Max"] + dim_weight = DIM_WEIGHT[metric_name] + for entry in metric_data[1]: try: path_parts = entry["video_path"].split("/") filename = path_parts[-1] video_index = int(filename.split(".")[0]) - score = process_rule(entry) - video_scores[video_index].append(score) + if "video_results" in entry: + raw_score = entry["video_results"] + elif "cor_num_per_video" in entry: + raw_score = entry["cor_num_per_video"] + else: + continue + + norm_score = (raw_score - min_val) / (max_val - min_val) * dim_weight + video_scores[video_index][metric_name] = norm_score except (KeyError, ValueError, IndexError): continue - avg_scores = {} + final_scores = {} for vid, scores in video_scores.items(): - if len(scores) == 0: - avg_scores[vid] = 0.0 - continue - - avg_scores[vid] = sum(scores) / len(scores) - - if not avg_scores: + if len(scores) > 0: + final_scores[vid] = sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores.keys()) + + if not final_scores: return -1 - max_score = max(avg_scores.values()) - candidates = sorted( - [vid for vid, score in 
avg_scores.items() if score == max_score] - ) - - return candidates[0] if candidates else -1 + max_score = max(final_scores.values()) + candidates = [vid for vid, score in final_scores.items() if score == max_score] + return min(candidates) if candidates else -1 def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption: @@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser): scale_temporal_osci = kwargs.pop("scale_temporal_osci", False) prompt = [prompt[0]] - prompt_short = prompt[0][:30].replace(" ", "_") + prompt_short = sanitize_filename(prompt[0]) save_dir = f'temp/{prompt_short}' os.makedirs(save_dir, exist_ok=True) @@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser): noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype) zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype) noise = torch.cat([noise, zeros, zeros], dim=0) - subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise - t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale - t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale + subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise + t_subtree = t_curr + t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device) subtree_noise_pred = model( img=subtree, @@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser): videos_path = f"{save_dir}/{i}_subtree" output_path = f"{save_dir}/{i}_subtree" - prompt_file = "temp/prompt.json" + prompt_file = "temp/prompt.json" # hard coded for now with open(prompt_file, "w") as fp: prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)} json.dump(prompt_json, fp) + python_path = os.path.dirname(sys.executable) minimal_env = { - "PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin", + "PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin", 
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus]) } cmd_args = [ 'vbench', 'evaluate', '--dimension', - ','.join(vbench_dimension_list), + ' '.join(vbench_dimension_list), '--videos_path', videos_path, '--mode', @@ -1060,3 +1069,23 @@ def prepare_api( return x return api_fn + + +def sanitize_filename(prompt): + """Sanitize the prompt to create a valid filename.""" + # Remove or replace special characters + invalid_chars = '<>:"/\\|?*\n\r\t' + filename = prompt.strip() + for char in invalid_chars: + filename = filename.replace(char, '_') + + # Replace multiple spaces/underscores with single underscore + filename = '_'.join(filter(None, filename.split())) + + # Limit length and ensure it's not empty + filename = filename[:30] if filename else "default" + + # Remove leading/trailing special characters + filename = filename.strip('._-') + + return filename or "default"