diff --git a/.gitignore b/.gitignore
index 7bfd505..a09f631 100644
--- a/.gitignore
+++ b/.gitignore
@@ -198,4 +198,5 @@ flash-attention
datasets
# inference scaling
-temp/
\ No newline at end of file
+temp*
+samples*
\ No newline at end of file
diff --git a/README.md b/README.md
index 72b6e8d..29a5755 100644
--- a/README.md
+++ b/README.md
@@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
```
+### Inference Scaling
+
+We implemented an inference scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option.
+
+```
+torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px_t2i2v_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv
+```
+
+| Original | num_subtree=3<br>num_scaling_steps=5<br>num_noise=1<br>time=16min | num_subtree=7<br>num_scaling_steps=8<br>num_noise=1<br>time=1h |
+|----------------------|----------------------------------------------------------------|----------------------------------------------------------------|
+| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
+| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
+
+
### Reproductivity
To make the results reproducible, you can set the random seed by:
diff --git a/configs/diffusion/inference/256px_inference_scaling.py b/configs/diffusion/inference/256px_inference_scaling.py
deleted file mode 100644
index 9c9d4e5..0000000
--- a/configs/diffusion/inference/256px_inference_scaling.py
+++ /dev/null
@@ -1,28 +0,0 @@
-_base_ = [ # inherit grammer from mmengine
- "256px.py",
- "plugins/t2i2v.py",
- "plugins/tp.py", # use tensor parallel
-]
-sampling_option = dict(
- resolution="256px", # 256px or 768px
- aspect_ratio="16:9", # 9:16 or 16:9 or 1:1
- num_frames=129, # number of frames
- num_steps=50, # number of steps
- shift=True,
- temporal_reduction=4,
- is_causal_vae=True,
- guidance=7.5, # guidance for text-to-video
- guidance_img=3.0, # guidance for image-to-video
- text_osci=True, # enable text guidance oscillation
- image_osci=True, # enable image guidance oscillation
- scale_temporal_osci=True,
- method="i2v_inference_scaling", # hard-coded for now
- vbench_dimension_list=['subject_consistency'],
- do_inference_scaling=True,
- num_subtree=3,
- backward_scale=0.78,
- forward_scale=0.83,
- scaling_steps=[1,2,4,7,9,15,20],
- seed=None, # random seed for z
- vbench_gpus=[4,5,6,7]
-)
\ No newline at end of file
diff --git a/configs/diffusion/inference/768px_inference_scaling.py b/configs/diffusion/inference/768px_inference_scaling.py
deleted file mode 100644
index 2e38397..0000000
--- a/configs/diffusion/inference/768px_inference_scaling.py
+++ /dev/null
@@ -1,17 +0,0 @@
-_base_ = [ # inherit grammer from mmengine
- "256px.py",
- "plugins/sp.py", # use sequence parallel
- "plugins/t2i2v.py",
-]
-sampling_option = dict(
- resolution="768px", # 256px or 768px
- method="i2v_inference_scaling", # hard-coded for now
- vbench_dimension_list=['subject_consistency'],
- do_inference_scaling=True,
- num_subtree=3,
- backward_scale=0.78,
- forward_scale=0.83,
- scaling_steps=[1,2,4,7,9,15,20],
- seed=None, # random seed for z
- vbench_gpus=[4,5,6,7]
-)
\ No newline at end of file
diff --git a/configs/diffusion/inference/t2i2v_256px_inference_scaling.py b/configs/diffusion/inference/t2i2v_256px_inference_scaling.py
new file mode 100644
index 0000000..8372f6f
--- /dev/null
+++ b/configs/diffusion/inference/t2i2v_256px_inference_scaling.py
@@ -0,0 +1,17 @@
+_base_ = [ # inherit grammar from mmengine
+ "256px.py",
+ "plugins/t2i2v.py",
+]
+
+# update the inference scaling parameters
+sampling_option = dict(
+ method="i2v_inference_scaling",
+ vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
+ do_inference_scaling=True,
+ num_subtree=3,
+ backward_scale=0.78,
+ forward_scale=0.83,
+ scaling_steps=[1,2,4,7,9,15,20],
+ vbench_gpus=[4,5,6,7],
+ seed=42
+)
\ No newline at end of file
diff --git a/configs/diffusion/inference/t2i2v_768px_inference_scaling.py b/configs/diffusion/inference/t2i2v_768px_inference_scaling.py
new file mode 100644
index 0000000..3833767
--- /dev/null
+++ b/configs/diffusion/inference/t2i2v_768px_inference_scaling.py
@@ -0,0 +1,43 @@
+_base_ = [ # inherit grammar from mmengine
+ "768px.py",
+ "plugins/t2i2v.py",
+]
+
+# # update the inference scaling parameters
+# sampling_option = dict(
+# method="i2v_inference_scaling",
+# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
+# do_inference_scaling=True,
+# num_subtree=3,
+# backward_scale=1.0,
+# forward_scale=0.5,
+# scaling_steps=[1,2,4,8,13],
+# vbench_gpus=[4,5,6,7],
+# seed=42
+# )
+
+# second setting
+sampling_option = dict(
+ method="i2v_inference_scaling",
+ vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
+ do_inference_scaling=True,
+ num_subtree=5,
+ backward_scale=1.0,
+ forward_scale=0.5,
+ scaling_steps=[1,2,3,4,6,8,10,13],
+ vbench_gpus=[6,7],
+ seed=42
+)
+
+# third setting
+# sampling_option = dict(
+# method="i2v_inference_scaling",
+# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
+# do_inference_scaling=True,
+# num_subtree=8,
+# backward_scale=0.78,
+# forward_scale=0.83,
+# scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35],
+# vbench_gpus=[6,7],
+# seed=42
+# )
diff --git a/demo b/demo
new file mode 160000
index 0000000..5f49710
--- /dev/null
+++ b/demo
@@ -0,0 +1 @@
+Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23
diff --git a/opensora/utils/sampling.py b/opensora/utils/sampling.py
index 907f7f2..92bb988 100644
--- a/opensora/utils/sampling.py
+++ b/opensora/utils/sampling.py
@@ -6,6 +6,7 @@ from dataclasses import dataclass, replace
import json
import subprocess
from collections import defaultdict
+import sys
import torch
import torchvision
@@ -105,62 +106,69 @@ class SamplingOption:
vbench_dimension_list: list = None
-def find_highest_score_video(data):
- video_scores = defaultdict(list)
- normalization_rules = {
- "subject_consistency": lambda e: e["video_results"],
- "background_consistency": lambda e: e["video_results"],
- "temporal_flickering": lambda e: e["video_results"],
- "motion_smoothness": lambda e: e["video_results"],
-
- "dynamic_degree": lambda e: 1.0 if e["video_results"] else 0.0,
-
- "aesthetic_quality": lambda e: e["video_results"],
- "imaging_quality": lambda e: e["video_results"] / 100,
-
- "human_action": lambda e: e["cor_num_per_video"],
-
- "temporal_style": lambda e: e["video_results"],
- "overall_consistency": lambda e: e["video_results"]
- }
+NORMALIZE_DIC = {
+ "subject consistency": {"Min": 0.1462, "Max": 1.0},
+ "background consistency": {"Min": 0.2615, "Max": 1.0},
+ "motion smoothness": {"Min": 0.706, "Max": 0.9975},
+ "dynamic degree": {"Min": 0.0, "Max": 1.0},
+ "aesthetic quality": {"Min": 0.0, "Max": 1.0},
+ "imaging quality": {"Min": 0.0, "Max": 1.0},
+}
+DIM_WEIGHT = {
+ "subject consistency":1,
+ "background consistency":1,
+ "motion smoothness":1,
+ "aesthetic quality":1,
+ "imaging quality":1,
+ "dynamic degree":0.5,
+}
+
+
+def find_highest_score_video(data):
+ video_scores = defaultdict(dict)
+
for metric_name, metric_data in data.items():
if not isinstance(metric_data, list) or len(metric_data) < 2:
continue
- process_rule = normalization_rules.get(metric_name)
- if not process_rule:
+ if metric_name not in NORMALIZE_DIC:
continue
+ min_val = NORMALIZE_DIC[metric_name]["Min"]
+ max_val = NORMALIZE_DIC[metric_name]["Max"]
+ dim_weight = DIM_WEIGHT[metric_name]
+
for entry in metric_data[1]:
try:
path_parts = entry["video_path"].split("/")
filename = path_parts[-1]
video_index = int(filename.split(".")[0])
- score = process_rule(entry)
- video_scores[video_index].append(score)
+ if "video_results" in entry:
+ raw_score = entry["video_results"]
+ elif "cor_num_per_video" in entry:
+ raw_score = entry["cor_num_per_video"]
+ else:
+ continue
+
+ norm_score = (raw_score - min_val) / (max_val - min_val) * dim_weight
+ video_scores[video_index][metric_name] = norm_score
except (KeyError, ValueError, IndexError):
continue
- avg_scores = {}
+ final_scores = {}
for vid, scores in video_scores.items():
- if len(scores) == 0:
- avg_scores[vid] = 0.0
- continue
-
- avg_scores[vid] = sum(scores) / len(scores)
-
- if not avg_scores:
+ if len(scores) > 0:
+ final_scores[vid] = sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores.keys())
+
+ if not final_scores:
return -1
- max_score = max(avg_scores.values())
- candidates = sorted(
- [vid for vid, score in avg_scores.items() if score == max_score]
- )
-
- return candidates[0] if candidates else -1
+ max_score = max(final_scores.values())
+ candidates = [vid for vid, score in final_scores.items() if score == max_score]
+ return min(candidates) if candidates else -1
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
@@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser):
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
prompt = [prompt[0]]
- prompt_short = prompt[0][:30].replace(" ", "_")
+ prompt_short = sanitize_filename(prompt[0])
save_dir = f'temp/{prompt_short}'
os.makedirs(save_dir, exist_ok=True)
@@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser):
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
noise = torch.cat([noise, zeros, zeros], dim=0)
- subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise
- t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale
- t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale
+ subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise
+ t_subtree = t_curr
+ t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
subtree_noise_pred = model(
img=subtree,
@@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser):
videos_path = f"{save_dir}/{i}_subtree"
output_path = f"{save_dir}/{i}_subtree"
- prompt_file = "temp/prompt.json"
+ prompt_file = "temp/prompt.json" # hard coded for now
with open(prompt_file, "w") as fp:
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
json.dump(prompt_json, fp)
+ python_path = os.path.dirname(sys.executable)
minimal_env = {
- "PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin",
+ "PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin",
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
}
cmd_args = [
'vbench',
'evaluate',
'--dimension',
- ','.join(vbench_dimension_list),
+ ' '.join(vbench_dimension_list),
'--videos_path',
videos_path,
'--mode',
@@ -1060,3 +1069,23 @@ def prepare_api(
return x
return api_fn
+
+
+def sanitize_filename(prompt):
+ """Sanitize the prompt to create a valid filename."""
+ # Remove or replace special characters
+ invalid_chars = '<>:"/\\|?*\n\r\t'
+ filename = prompt.strip()
+ for char in invalid_chars:
+ filename = filename.replace(char, '_')
+
+ # Replace multiple spaces/underscores with single underscore
+ filename = '_'.join(filter(None, filename.split()))
+
+ # Limit length and ensure it's not empty
+ filename = filename[:30] if filename else "default"
+
+ # Remove leading/trailing special characters
+ filename = filename.strip('._-')
+
+ return filename or "default"