add configs and clean code

This commit is contained in:
nicolaus 2025-03-19 17:33:32 +08:00
parent bb64366a85
commit a2e4e1689f
8 changed files with 149 additions and 89 deletions

3
.gitignore vendored
View file

@ -198,4 +198,5 @@ flash-attention
datasets
# inference scaling
temp/
temp*
samples*

View file

@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
```
### Inference Scaling
We implemented an inference scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option.
```
torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px_t2i2v_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv
```
| Original | <br>num_subtree=3<br>num_scaling_steps=5<br>num_noise=1<br>time=16min | <br>num_subtree=7<br>num_scaling_steps=8<br>num_noise=1<br>time=1h |
|----------------------|----------------------------------------------------------------|----------------------------------------------------------------|
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
### Reproducibility
To make the results reproducible, you can set the random seed by:

View file

@ -1,28 +0,0 @@
# Config preset: 256px text/image-to-video sampling with inference scaling
# and tensor parallelism. (This file was deleted in this commit; kept here
# for review.)
_base_ = [ # inherit grammar from mmengine
"256px.py",
"plugins/t2i2v.py",
"plugins/tp.py", # use tensor parallel
]
sampling_option = dict(
resolution="256px", # 256px or 768px
aspect_ratio="16:9", # 9:16 or 16:9 or 1:1
num_frames=129, # number of frames
num_steps=50, # number of denoising steps
shift=True,
temporal_reduction=4,
is_causal_vae=True,
guidance=7.5, # guidance for text-to-video
guidance_img=3.0, # guidance for image-to-video
text_osci=True, # enable text guidance oscillation
image_osci=True, # enable image guidance oscillation
scale_temporal_osci=True,
method="i2v_inference_scaling", # hard-coded for now
vbench_dimension_list=['subject_consistency'], # VBench metrics used to rank candidates
do_inference_scaling=True,
num_subtree=3, # candidate branches expanded at each scaling step
backward_scale=0.78, # NOTE(review): exact semantics live in the denoiser -- confirm there
forward_scale=0.83,
scaling_steps=[1,2,4,7,9,15,20], # denoising steps at which the scaling search runs
seed=None, # random seed for z
vbench_gpus=[4,5,6,7] # GPUs given to the VBench evaluation subprocess
)

View file

@ -1,17 +0,0 @@
# Config preset: 768px i2v inference scaling with sequence parallelism.
# (This file was deleted in this commit; kept here for review.)
_base_ = [ # inherit grammar from mmengine
"256px.py",
"plugins/sp.py", # use sequence parallel
"plugins/t2i2v.py",
]
sampling_option = dict(
resolution="768px", # 256px or 768px
method="i2v_inference_scaling", # hard-coded for now
vbench_dimension_list=['subject_consistency'], # VBench metrics used to rank candidates
do_inference_scaling=True,
num_subtree=3, # candidate branches expanded at each scaling step
backward_scale=0.78,
forward_scale=0.83,
scaling_steps=[1,2,4,7,9,15,20], # denoising steps at which the scaling search runs
seed=None, # random seed for z
vbench_gpus=[4,5,6,7] # GPUs given to the VBench evaluation subprocess
)

View file

@ -0,0 +1,17 @@
# Config preset: 256px t2i2v inference scaling, added in this commit.
_base_ = [ # inherit grammar from mmengine
"256px.py",
"plugins/t2i2v.py",
]
# update the inference scaling parameters
sampling_option = dict(
method="i2v_inference_scaling",
# VBench dimensions whose normalized, weighted scores rank the candidates
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
do_inference_scaling=True,
num_subtree=3, # candidate branches expanded at each scaling step
backward_scale=0.78,
forward_scale=0.83,
scaling_steps=[1,2,4,7,9,15,20], # denoising steps at which the scaling search runs
vbench_gpus=[4,5,6,7], # GPUs given to the VBench evaluation subprocess
seed=42 # fixed seed for reproducible sampling
)

View file

@ -0,0 +1,43 @@
# Config preset: 768px t2i2v inference scaling, added in this commit.
# Three experimental settings were tried; the first and third are kept
# commented out for reference, the second is active.
_base_ = [ # inherit grammar from mmengine
"768px.py",
"plugins/t2i2v.py",
]
# first setting (inactive)
# sampling_option = dict(
# method="i2v_inference_scaling",
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
# do_inference_scaling=True,
# num_subtree=3,
# backward_scale=1.0,
# forward_scale=0.5,
# scaling_steps=[1,2,4,8,13],
# vbench_gpus=[4,5,6,7],
# seed=42
# )
# second setting (active)
sampling_option = dict(
method="i2v_inference_scaling",
# VBench dimensions whose normalized, weighted scores rank the candidates
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
do_inference_scaling=True,
num_subtree=5, # candidate branches expanded at each scaling step
backward_scale=1.0,
forward_scale=0.5,
scaling_steps=[1,2,3,4,6,8,10,13], # denoising steps at which the scaling search runs
vbench_gpus=[6,7], # GPUs given to the VBench evaluation subprocess
seed=42 # fixed seed for reproducible sampling
)
# third setting (inactive)
# sampling_option = dict(
# method="i2v_inference_scaling",
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
# do_inference_scaling=True,
# num_subtree=8,
# backward_scale=0.78,
# forward_scale=0.83,
# scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35],
# vbench_gpus=[6,7],
# seed=42
# )

1
demo Submodule

@ -0,0 +1 @@
Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23

View file

@ -6,6 +6,7 @@ from dataclasses import dataclass, replace
import json
import subprocess
from collections import defaultdict
import sys
import torch
import torchvision
@ -105,62 +106,69 @@ class SamplingOption:
vbench_dimension_list: list = None
# Per-dimension min/max used to normalize raw VBench scores to [0, 1].
# NOTE(review): keys here use spaces ("subject consistency") while the
# configs' vbench_dimension_list uses underscores ("subject_consistency") --
# confirm which form the evaluation JSON actually reports, otherwise the
# lookups in find_highest_score_video will never match.
NORMALIZE_DIC = {
"subject consistency": {"Min": 0.1462, "Max": 1.0},
"background consistency": {"Min": 0.2615, "Max": 1.0},
"motion smoothness": {"Min": 0.706, "Max": 0.9975},
"dynamic degree": {"Min": 0.0, "Max": 1.0},
"aesthetic quality": {"Min": 0.0, "Max": 1.0},
"imaging quality": {"Min": 0.0, "Max": 1.0},
}
# Relative weight of each dimension in the final weighted average;
# dynamic degree is deliberately down-weighted to 0.5.
DIM_WEIGHT = {
"subject consistency":1,
"background consistency":1,
"motion smoothness":1,
"aesthetic quality":1,
"imaging quality":1,
"dynamic degree":0.5,
}
# NOTE(review): this span comes from a rendered diff hunk (@ -105,62 +106,69)
# and interleaves the OLD implementation (normalization_rules / process_rule /
# avg_scores) with the NEW one (NORMALIZE_DIC / DIM_WEIGHT / final_scores)
# without +/- markers. It is not valid as-is and must be reconstructed from
# the actual commit before use; comments below annotate both strands.
def find_highest_score_video(data):
# OLD strand: per-video list of unweighted scores.
video_scores = defaultdict(list)
# OLD strand: per-metric extraction rules applied to each result entry.
normalization_rules = {
"subject_consistency": lambda e: e["video_results"],
"background_consistency": lambda e: e["video_results"],
"temporal_flickering": lambda e: e["video_results"],
"motion_smoothness": lambda e: e["video_results"],
"dynamic_degree": lambda e: 1.0 if e["video_results"] else 0.0,
"aesthetic_quality": lambda e: e["video_results"],
"imaging_quality": lambda e: e["video_results"] / 100,
"human_action": lambda e: e["cor_num_per_video"],
"temporal_style": lambda e: e["video_results"],
"overall_consistency": lambda e: e["video_results"]
}
# NEW strand: per-video dict of metric -> normalized weighted score.
video_scores = defaultdict(dict)
for metric_name, metric_data in data.items():
# VBench JSON: [aggregate, per-video-entries]; skip malformed metrics.
if not isinstance(metric_data, list) or len(metric_data) < 2:
continue
process_rule = normalization_rules.get(metric_name)
if not process_rule:
if metric_name not in NORMALIZE_DIC:
continue
min_val = NORMALIZE_DIC[metric_name]["Min"]
max_val = NORMALIZE_DIC[metric_name]["Max"]
dim_weight = DIM_WEIGHT[metric_name]
for entry in metric_data[1]:
try:
# Video index is the numeric filename stem of the entry path.
path_parts = entry["video_path"].split("/")
filename = path_parts[-1]
video_index = int(filename.split(".")[0])
score = process_rule(entry)
video_scores[video_index].append(score)
if "video_results" in entry:
raw_score = entry["video_results"]
elif "cor_num_per_video" in entry:
raw_score = entry["cor_num_per_video"]
else:
continue
# Min-max normalize, then apply the dimension weight.
norm_score = (raw_score - min_val) / (max_val - min_val) * dim_weight
video_scores[video_index][metric_name] = norm_score
except (KeyError, ValueError, IndexError):
# Skip entries with missing keys or non-numeric filenames.
continue
avg_scores = {}
final_scores = {}
for vid, scores in video_scores.items():
if len(scores) == 0:
avg_scores[vid] = 0.0
continue
if len(scores) > 0:
# NEW strand: weighted average over the dimensions actually scored.
final_scores[vid] = sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores.keys())
avg_scores[vid] = sum(scores) / len(scores)
if not avg_scores:
if not final_scores:
# Sentinel: no scorable videos found.
return -1
max_score = max(avg_scores.values())
candidates = sorted(
[vid for vid, score in avg_scores.items() if score == max_score]
)
return candidates[0] if candidates else -1
max_score = max(final_scores.values())
# Ties broken by lowest video index for determinism.
candidates = [vid for vid, score in final_scores.items() if score == max_score]
return min(candidates) if candidates else -1
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser):
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
prompt = [prompt[0]]
prompt_short = prompt[0][:30].replace(" ", "_")
prompt_short = sanitize_filename(prompt[0])
save_dir = f'temp/{prompt_short}'
os.makedirs(save_dir, exist_ok=True)
@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser):
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
noise = torch.cat([noise, zeros, zeros], dim=0)
subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise
t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale
t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale
subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise
t_subtree = t_curr
t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
subtree_noise_pred = model(
img=subtree,
@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser):
videos_path = f"{save_dir}/{i}_subtree"
output_path = f"{save_dir}/{i}_subtree"
prompt_file = "temp/prompt.json"
prompt_file = "temp/prompt.json" # hard coded for now
with open(prompt_file, "w") as fp:
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
json.dump(prompt_json, fp)
python_path = os.path.dirname(sys.executable)
minimal_env = {
"PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin",
"PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin",
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
}
cmd_args = [
'vbench',
'evaluate',
'--dimension',
','.join(vbench_dimension_list),
' '.join(vbench_dimension_list),
'--videos_path',
videos_path,
'--mode',
@ -1060,3 +1069,23 @@ def prepare_api(
return x
return api_fn
def sanitize_filename(prompt):
    """Sanitize *prompt* into a short, filesystem-safe name component.

    Replaces characters that are invalid on common filesystems, collapses
    runs of whitespace/underscores into single underscores, truncates to
    30 characters, and falls back to ``"default"`` when nothing survives.

    Args:
        prompt: Arbitrary user prompt text.

    Returns:
        A non-empty string of at most 30 characters, safe to use as a
        directory or file name.
    """
    # Characters rejected by Windows and/or POSIX path handling.
    invalid_chars = '<>:"/\\|?*\n\r\t'
    filename = prompt.strip()
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    # Collapse runs of whitespace AND underscores into a single underscore.
    # (The original only split on whitespace, so consecutive underscores
    # produced by the replacements above were never collapsed, contradicting
    # the stated intent.)
    filename = '_'.join(filter(None, filename.replace('_', ' ').split()))
    # Limit length and ensure it's not empty.
    filename = filename[:30] if filename else "default"
    # Remove leading/trailing separator characters left by truncation.
    filename = filename.strip('._-')
    return filename or "default"