add configs and clean code

This commit is contained in:
nicolaus 2025-03-19 17:33:32 +08:00
parent bb64366a85
commit a2e4e1689f
8 changed files with 149 additions and 89 deletions

3
.gitignore vendored
View file

@ -198,4 +198,5 @@ flash-attention
datasets datasets
# inference scaling # inference scaling
temp/ temp*
samples*

View file

@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
``` ```
### Inference Scaling
We implemented an inference-scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option.
```
torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px_t2i2v_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv
```
| Original | <br>num_subtree=3<br>num_scaling_steps=5<br>num_noise=1<br>time=16min | <br>num_subtree=7<br>num_scaling_steps=8<br>num_noise=1<br>time=1h |
|----------------------|----------------------------------------------------------------|----------------------------------------------------------------|
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
### Reproductivity ### Reproductivity
To make the results reproducible, you can set the random seed by: To make the results reproducible, you can set the random seed by:

View file

@ -1,28 +0,0 @@
_base_ = [  # inherit config grammar from mmengine
    "256px.py",
    "plugins/t2i2v.py",
    "plugins/tp.py",  # use tensor parallelism
]
# Sampling options for 256px text/image-to-video generation with
# inference-time scaling (subtree search scored by VBench).
sampling_option = dict(
    resolution="256px",  # "256px" or "768px"
    aspect_ratio="16:9",  # "9:16", "16:9", or "1:1"
    num_frames=129,  # number of generated frames
    num_steps=50,  # number of denoising steps
    shift=True,
    temporal_reduction=4,
    is_causal_vae=True,
    guidance=7.5,  # classifier-free guidance for text-to-video
    guidance_img=3.0,  # guidance for image-to-video conditioning
    text_osci=True,  # enable text guidance oscillation
    image_osci=True,  # enable image guidance oscillation
    scale_temporal_osci=True,
    method="i2v_inference_scaling",  # hard-coded for now
    vbench_dimension_list=['subject_consistency'],  # VBench dimensions used for scoring candidates
    do_inference_scaling=True,
    num_subtree=3,  # number of candidate branches explored per scaling step
    backward_scale=0.78,
    forward_scale=0.83,
    scaling_steps=[1,2,4,7,9,15,20],  # denoising steps at which branching/selection happens
    seed=None,  # random seed for the initial noise z (None = nondeterministic)
    vbench_gpus=[4,5,6,7]  # GPUs assigned to the VBench evaluation subprocess
)

View file

@ -1,17 +0,0 @@
# Base configs (mmengine-style inheritance).
_base_ = [
    "256px.py",
    "plugins/sp.py",  # sequence parallelism
    "plugins/t2i2v.py",
]

# Sampling options for 768px generation with inference-time scaling.
sampling_option = {
    "resolution": "768px",  # "256px" or "768px"
    "method": "i2v_inference_scaling",  # hard-coded for now
    "vbench_dimension_list": ["subject_consistency"],  # VBench scoring dimensions
    "do_inference_scaling": True,
    "num_subtree": 3,  # candidate branches per scaling step
    "backward_scale": 0.78,
    "forward_scale": 0.83,
    "scaling_steps": [1, 2, 4, 7, 9, 15, 20],  # steps where branching happens
    "seed": None,  # random seed for the initial noise z
    "vbench_gpus": [4, 5, 6, 7],  # GPUs for the VBench subprocess
}

View file

@ -0,0 +1,17 @@
# Base configs (mmengine-style inheritance).
_base_ = [
    "256px.py",
    "plugins/t2i2v.py",
]

# Override the sampling parameters to enable inference-time scaling,
# scored across the full set of VBench quality dimensions.
sampling_option = {
    "method": "i2v_inference_scaling",
    "vbench_dimension_list": [
        "subject_consistency",
        "background_consistency",
        "motion_smoothness",
        "dynamic_degree",
        "aesthetic_quality",
        "imaging_quality",
    ],
    "do_inference_scaling": True,
    "num_subtree": 3,  # candidate branches per scaling step
    "backward_scale": 0.78,
    "forward_scale": 0.83,
    "scaling_steps": [1, 2, 4, 7, 9, 15, 20],  # steps where branching happens
    "vbench_gpus": [4, 5, 6, 7],  # GPUs for the VBench subprocess
    "seed": 42,  # fixed seed for reproducible noise z
}

View file

@ -0,0 +1,43 @@
_base_ = [  # inherit config grammar from mmengine
    "768px.py",
    "plugins/t2i2v.py",
]
# First setting (kept for reference, currently disabled):
# a small search (3 subtrees) over 5 scaling steps.
# sampling_option = dict(
#     method="i2v_inference_scaling",
#     vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
#     do_inference_scaling=True,
#     num_subtree=3,
#     backward_scale=1.0,
#     forward_scale=0.5,
#     scaling_steps=[1,2,4,8,13],
#     vbench_gpus=[4,5,6,7],
#     seed=42
# )
# Second setting (active): moderate search width with 8 scaling steps.
sampling_option = dict(
    method="i2v_inference_scaling",  # hard-coded for now
    vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],  # VBench dimensions used to score candidates
    do_inference_scaling=True,
    num_subtree=5,  # candidate branches explored per scaling step
    backward_scale=1.0,
    forward_scale=0.5,
    scaling_steps=[1,2,3,4,6,8,10,13],  # denoising steps where branching/selection happens
    vbench_gpus=[6,7],  # GPUs assigned to the VBench evaluation subprocess
    seed=42  # fixed seed for reproducible noise z
)
# Third setting (kept for reference, currently disabled):
# widest search (8 subtrees, 17 scaling steps) — most expensive.
# sampling_option = dict(
#     method="i2v_inference_scaling",
#     vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
#     do_inference_scaling=True,
#     num_subtree=8,
#     backward_scale=0.78,
#     forward_scale=0.83,
#     scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35],
#     vbench_gpus=[6,7],
#     seed=42
# )

1
demo Submodule

@ -0,0 +1 @@
Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23

View file

@ -6,6 +6,7 @@ from dataclasses import dataclass, replace
import json import json
import subprocess import subprocess
from collections import defaultdict from collections import defaultdict
import sys
import torch import torch
import torchvision import torchvision
@ -105,62 +106,69 @@ class SamplingOption:
vbench_dimension_list: list = None vbench_dimension_list: list = None
def find_highest_score_video(data): NORMALIZE_DIC = {
video_scores = defaultdict(list) "subject consistency": {"Min": 0.1462, "Max": 1.0},
normalization_rules = { "background consistency": {"Min": 0.2615, "Max": 1.0},
"subject_consistency": lambda e: e["video_results"], "motion smoothness": {"Min": 0.706, "Max": 0.9975},
"background_consistency": lambda e: e["video_results"], "dynamic degree": {"Min": 0.0, "Max": 1.0},
"temporal_flickering": lambda e: e["video_results"], "aesthetic quality": {"Min": 0.0, "Max": 1.0},
"motion_smoothness": lambda e: e["video_results"], "imaging quality": {"Min": 0.0, "Max": 1.0},
}
"dynamic_degree": lambda e: 1.0 if e["video_results"] else 0.0,
"aesthetic_quality": lambda e: e["video_results"],
"imaging_quality": lambda e: e["video_results"] / 100,
"human_action": lambda e: e["cor_num_per_video"],
"temporal_style": lambda e: e["video_results"],
"overall_consistency": lambda e: e["video_results"]
}
DIM_WEIGHT = {
"subject consistency":1,
"background consistency":1,
"motion smoothness":1,
"aesthetic quality":1,
"imaging quality":1,
"dynamic degree":0.5,
}
def find_highest_score_video(data):
video_scores = defaultdict(dict)
for metric_name, metric_data in data.items(): for metric_name, metric_data in data.items():
if not isinstance(metric_data, list) or len(metric_data) < 2: if not isinstance(metric_data, list) or len(metric_data) < 2:
continue continue
process_rule = normalization_rules.get(metric_name) if metric_name not in NORMALIZE_DIC:
if not process_rule:
continue continue
min_val = NORMALIZE_DIC[metric_name]["Min"]
max_val = NORMALIZE_DIC[metric_name]["Max"]
dim_weight = DIM_WEIGHT[metric_name]
for entry in metric_data[1]: for entry in metric_data[1]:
try: try:
path_parts = entry["video_path"].split("/") path_parts = entry["video_path"].split("/")
filename = path_parts[-1] filename = path_parts[-1]
video_index = int(filename.split(".")[0]) video_index = int(filename.split(".")[0])
score = process_rule(entry) if "video_results" in entry:
video_scores[video_index].append(score) raw_score = entry["video_results"]
elif "cor_num_per_video" in entry:
raw_score = entry["cor_num_per_video"]
else:
continue
norm_score = (raw_score - min_val) / (max_val - min_val) * dim_weight
video_scores[video_index][metric_name] = norm_score
except (KeyError, ValueError, IndexError): except (KeyError, ValueError, IndexError):
continue continue
avg_scores = {} final_scores = {}
for vid, scores in video_scores.items(): for vid, scores in video_scores.items():
if len(scores) == 0: if len(scores) > 0:
avg_scores[vid] = 0.0 final_scores[vid] = sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores.keys())
continue
if not final_scores:
avg_scores[vid] = sum(scores) / len(scores)
if not avg_scores:
return -1 return -1
max_score = max(avg_scores.values()) max_score = max(final_scores.values())
candidates = sorted( candidates = [vid for vid, score in final_scores.items() if score == max_score]
[vid for vid, score in avg_scores.items() if score == max_score] return min(candidates) if candidates else -1
)
return candidates[0] if candidates else -1
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption: def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser):
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False) scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
prompt = [prompt[0]] prompt = [prompt[0]]
prompt_short = prompt[0][:30].replace(" ", "_") prompt_short = sanitize_filename(prompt[0])
save_dir = f'temp/{prompt_short}' save_dir = f'temp/{prompt_short}'
os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True)
@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser):
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype) noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype) zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
noise = torch.cat([noise, zeros, zeros], dim=0) noise = torch.cat([noise, zeros, zeros], dim=0)
subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise
t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale t_subtree = t_curr
t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device) t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
subtree_noise_pred = model( subtree_noise_pred = model(
img=subtree, img=subtree,
@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser):
videos_path = f"{save_dir}/{i}_subtree" videos_path = f"{save_dir}/{i}_subtree"
output_path = f"{save_dir}/{i}_subtree" output_path = f"{save_dir}/{i}_subtree"
prompt_file = "temp/prompt.json" prompt_file = "temp/prompt.json" # hard coded for now
with open(prompt_file, "w") as fp: with open(prompt_file, "w") as fp:
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)} prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
json.dump(prompt_json, fp) json.dump(prompt_json, fp)
python_path = os.path.dirname(sys.executable)
minimal_env = { minimal_env = {
"PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin", "PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin",
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus]) "CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
} }
cmd_args = [ cmd_args = [
'vbench', 'vbench',
'evaluate', 'evaluate',
'--dimension', '--dimension',
','.join(vbench_dimension_list), ' '.join(vbench_dimension_list),
'--videos_path', '--videos_path',
videos_path, videos_path,
'--mode', '--mode',
@ -1060,3 +1069,23 @@ def prepare_api(
return x return x
return api_fn return api_fn
def sanitize_filename(prompt):
    """Sanitize a prompt string into a safe, short filename component.

    Replaces filesystem-unsafe characters, collapses runs of whitespace
    and replaced characters into single underscores, truncates to 30
    characters, and falls back to "default" when nothing usable remains.

    Args:
        prompt: Free-form prompt text (may contain any characters).

    Returns:
        A non-empty, filename-safe string of at most 30 characters.
    """
    # Characters that are invalid (or risky) in filenames on common OSes.
    invalid_chars = '<>:"/\\|?*\n\r\t'
    filename = prompt.strip()
    # Map each invalid character to a space so that adjacent invalid
    # characters collapse together in the whitespace-split below.
    # (The previous '_' substitution left uncollapsed runs like "a__b",
    # contradicting the "single underscore" intent.)
    for char in invalid_chars:
        filename = filename.replace(char, ' ')
    # Collapse every run of whitespace into a single underscore.
    filename = '_'.join(filter(None, filename.split()))
    # Limit length and ensure the result is not empty.
    filename = filename[:30] if filename else "default"
    # Trim separator characters left dangling by the truncation.
    filename = filename.strip('._-')
    return filename or "default"