mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-14 18:25:35 +02:00
add configs and clean code
This commit is contained in:
parent
bb64366a85
commit
a2e4e1689f
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -198,4 +198,5 @@ flash-attention
|
|||
datasets
|
||||
|
||||
# inference scaling
|
||||
temp/
|
||||
temp*
|
||||
samples*
|
||||
14
README.md
14
README.md
|
|
@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx
|
|||
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
|
||||
```
|
||||
|
||||
### Inference Scaling
|
||||
|
||||
We implemented an inference scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option.
|
||||
|
||||
```
|
||||
torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_768px_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv
|
||||
```
|
||||
|
||||
| Original | <br>num_subtree=3<br>num_scaling_steps=5<br>num_noise=1<br>time=16min | <br>num_subtree=7<br>num_scaling_steps=8<br>num_noise=1<br>time=1h |
|
||||
|----------------------|----------------------------------------------------------------|----------------------------------------------------------------|
|
||||
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
|
||||
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
|
||||
|
||||
|
||||
### Reproducibility
|
||||
|
||||
To make the results reproducible, you can set the random seed by:
|
||||
|
|
|
|||
|
|
@ -1,28 +0,0 @@
|
|||
_base_ = [ # inherit grammar from mmengine
|
||||
"256px.py",
|
||||
"plugins/t2i2v.py",
|
||||
"plugins/tp.py", # use tensor parallel
|
||||
]
|
||||
sampling_option = dict(
|
||||
resolution="256px", # 256px or 768px
|
||||
aspect_ratio="16:9", # 9:16 or 16:9 or 1:1
|
||||
num_frames=129, # number of frames
|
||||
num_steps=50, # number of steps
|
||||
shift=True,
|
||||
temporal_reduction=4,
|
||||
is_causal_vae=True,
|
||||
guidance=7.5, # guidance for text-to-video
|
||||
guidance_img=3.0, # guidance for image-to-video
|
||||
text_osci=True, # enable text guidance oscillation
|
||||
image_osci=True, # enable image guidance oscillation
|
||||
scale_temporal_osci=True,
|
||||
method="i2v_inference_scaling", # hard-coded for now
|
||||
vbench_dimension_list=['subject_consistency'],
|
||||
do_inference_scaling=True,
|
||||
num_subtree=3,
|
||||
backward_scale=0.78,
|
||||
forward_scale=0.83,
|
||||
scaling_steps=[1,2,4,7,9,15,20],
|
||||
seed=None, # random seed for z
|
||||
vbench_gpus=[4,5,6,7]
|
||||
)
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
_base_ = [ # inherit grammar from mmengine
|
||||
"256px.py",
|
||||
"plugins/sp.py", # use sequence parallel
|
||||
"plugins/t2i2v.py",
|
||||
]
|
||||
sampling_option = dict(
|
||||
resolution="768px", # 256px or 768px
|
||||
method="i2v_inference_scaling", # hard-coded for now
|
||||
vbench_dimension_list=['subject_consistency'],
|
||||
do_inference_scaling=True,
|
||||
num_subtree=3,
|
||||
backward_scale=0.78,
|
||||
forward_scale=0.83,
|
||||
scaling_steps=[1,2,4,7,9,15,20],
|
||||
seed=None, # random seed for z
|
||||
vbench_gpus=[4,5,6,7]
|
||||
)
|
||||
17
configs/diffusion/inference/t2i2v_256px_inference_scaling.py
Normal file
17
configs/diffusion/inference/t2i2v_256px_inference_scaling.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
_base_ = [ # inherit grammar from mmengine
|
||||
"256px.py",
|
||||
"plugins/t2i2v.py",
|
||||
]
|
||||
|
||||
# update the inference scaling parameters
|
||||
sampling_option = dict(
|
||||
method="i2v_inference_scaling",
|
||||
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||
do_inference_scaling=True,
|
||||
num_subtree=3,
|
||||
backward_scale=0.78,
|
||||
forward_scale=0.83,
|
||||
scaling_steps=[1,2,4,7,9,15,20],
|
||||
vbench_gpus=[4,5,6,7],
|
||||
seed=42
|
||||
)
|
||||
43
configs/diffusion/inference/t2i2v_768px_inference_scaling.py
Normal file
43
configs/diffusion/inference/t2i2v_768px_inference_scaling.py
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
_base_ = [ # inherit grammar from mmengine
|
||||
"768px.py",
|
||||
"plugins/t2i2v.py",
|
||||
]
|
||||
|
||||
# # update the inference scaling parameters
|
||||
# sampling_option = dict(
|
||||
# method="i2v_inference_scaling",
|
||||
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||
# do_inference_scaling=True,
|
||||
# num_subtree=3,
|
||||
# backward_scale=1.0,
|
||||
# forward_scale=0.5,
|
||||
# scaling_steps=[1,2,4,8,13],
|
||||
# vbench_gpus=[4,5,6,7],
|
||||
# seed=42
|
||||
# )
|
||||
|
||||
# second setting
|
||||
sampling_option = dict(
|
||||
method="i2v_inference_scaling",
|
||||
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||
do_inference_scaling=True,
|
||||
num_subtree=5,
|
||||
backward_scale=1.0,
|
||||
forward_scale=0.5,
|
||||
scaling_steps=[1,2,3,4,6,8,10,13],
|
||||
vbench_gpus=[6,7],
|
||||
seed=42
|
||||
)
|
||||
|
||||
# third setting
|
||||
# sampling_option = dict(
|
||||
# method="i2v_inference_scaling",
|
||||
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||
# do_inference_scaling=True,
|
||||
# num_subtree=8,
|
||||
# backward_scale=0.78,
|
||||
# forward_scale=0.83,
|
||||
# scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35],
|
||||
# vbench_gpus=[6,7],
|
||||
# seed=42
|
||||
# )
|
||||
1
demo
Submodule
1
demo
Submodule
|
|
@ -0,0 +1 @@
|
|||
Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23
|
||||
|
|
@ -6,6 +6,7 @@ from dataclasses import dataclass, replace
|
|||
import json
|
||||
import subprocess
|
||||
from collections import defaultdict
|
||||
import sys
|
||||
|
||||
import torch
|
||||
import torchvision
|
||||
|
|
@ -105,62 +106,69 @@ class SamplingOption:
|
|||
vbench_dimension_list: list = None
|
||||
|
||||
|
||||
NORMALIZE_DIC = {
|
||||
"subject consistency": {"Min": 0.1462, "Max": 1.0},
|
||||
"background consistency": {"Min": 0.2615, "Max": 1.0},
|
||||
"motion smoothness": {"Min": 0.706, "Max": 0.9975},
|
||||
"dynamic degree": {"Min": 0.0, "Max": 1.0},
|
||||
"aesthetic quality": {"Min": 0.0, "Max": 1.0},
|
||||
"imaging quality": {"Min": 0.0, "Max": 1.0},
|
||||
}
|
||||
|
||||
DIM_WEIGHT = {
|
||||
"subject consistency":1,
|
||||
"background consistency":1,
|
||||
"motion smoothness":1,
|
||||
"aesthetic quality":1,
|
||||
"imaging quality":1,
|
||||
"dynamic degree":0.5,
|
||||
}
|
||||
|
||||
|
||||
def find_highest_score_video(data):
|
||||
video_scores = defaultdict(list)
|
||||
normalization_rules = {
|
||||
"subject_consistency": lambda e: e["video_results"],
|
||||
"background_consistency": lambda e: e["video_results"],
|
||||
"temporal_flickering": lambda e: e["video_results"],
|
||||
"motion_smoothness": lambda e: e["video_results"],
|
||||
|
||||
"dynamic_degree": lambda e: 1.0 if e["video_results"] else 0.0,
|
||||
|
||||
"aesthetic_quality": lambda e: e["video_results"],
|
||||
"imaging_quality": lambda e: e["video_results"] / 100,
|
||||
|
||||
"human_action": lambda e: e["cor_num_per_video"],
|
||||
|
||||
"temporal_style": lambda e: e["video_results"],
|
||||
"overall_consistency": lambda e: e["video_results"]
|
||||
}
|
||||
video_scores = defaultdict(dict)
|
||||
|
||||
for metric_name, metric_data in data.items():
|
||||
if not isinstance(metric_data, list) or len(metric_data) < 2:
|
||||
continue
|
||||
|
||||
process_rule = normalization_rules.get(metric_name)
|
||||
if not process_rule:
|
||||
if metric_name not in NORMALIZE_DIC:
|
||||
continue
|
||||
|
||||
min_val = NORMALIZE_DIC[metric_name]["Min"]
|
||||
max_val = NORMALIZE_DIC[metric_name]["Max"]
|
||||
dim_weight = DIM_WEIGHT[metric_name]
|
||||
|
||||
for entry in metric_data[1]:
|
||||
try:
|
||||
path_parts = entry["video_path"].split("/")
|
||||
filename = path_parts[-1]
|
||||
video_index = int(filename.split(".")[0])
|
||||
|
||||
score = process_rule(entry)
|
||||
video_scores[video_index].append(score)
|
||||
if "video_results" in entry:
|
||||
raw_score = entry["video_results"]
|
||||
elif "cor_num_per_video" in entry:
|
||||
raw_score = entry["cor_num_per_video"]
|
||||
else:
|
||||
continue
|
||||
|
||||
norm_score = (raw_score - min_val) / (max_val - min_val) * dim_weight
|
||||
video_scores[video_index][metric_name] = norm_score
|
||||
|
||||
except (KeyError, ValueError, IndexError):
|
||||
continue
|
||||
|
||||
avg_scores = {}
|
||||
final_scores = {}
|
||||
for vid, scores in video_scores.items():
|
||||
if len(scores) == 0:
|
||||
avg_scores[vid] = 0.0
|
||||
continue
|
||||
if len(scores) > 0:
|
||||
final_scores[vid] = sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores.keys())
|
||||
|
||||
avg_scores[vid] = sum(scores) / len(scores)
|
||||
|
||||
if not avg_scores:
|
||||
if not final_scores:
|
||||
return -1
|
||||
|
||||
max_score = max(avg_scores.values())
|
||||
candidates = sorted(
|
||||
[vid for vid, score in avg_scores.items() if score == max_score]
|
||||
)
|
||||
|
||||
return candidates[0] if candidates else -1
|
||||
max_score = max(final_scores.values())
|
||||
candidates = [vid for vid, score in final_scores.items() if score == max_score]
|
||||
return min(candidates) if candidates else -1
|
||||
|
||||
|
||||
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
|
||||
|
|
@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser):
|
|||
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
|
||||
|
||||
prompt = [prompt[0]]
|
||||
prompt_short = prompt[0][:30].replace(" ", "_")
|
||||
prompt_short = sanitize_filename(prompt[0])
|
||||
save_dir = f'temp/{prompt_short}'
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
|
||||
|
|
@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser):
|
|||
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
|
||||
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
|
||||
noise = torch.cat([noise, zeros, zeros], dim=0)
|
||||
subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise
|
||||
t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale
|
||||
t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale
|
||||
subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise
|
||||
t_subtree = t_curr
|
||||
t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale
|
||||
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
|
||||
subtree_noise_pred = model(
|
||||
img=subtree,
|
||||
|
|
@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser):
|
|||
videos_path = f"{save_dir}/{i}_subtree"
|
||||
output_path = f"{save_dir}/{i}_subtree"
|
||||
|
||||
prompt_file = "temp/prompt.json"
|
||||
prompt_file = "temp/prompt.json" # hard coded for now
|
||||
with open(prompt_file, "w") as fp:
|
||||
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
|
||||
json.dump(prompt_json, fp)
|
||||
|
||||
python_path = os.path.dirname(sys.executable)
|
||||
minimal_env = {
|
||||
"PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin",
|
||||
"PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin",
|
||||
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
|
||||
}
|
||||
cmd_args = [
|
||||
'vbench',
|
||||
'evaluate',
|
||||
'--dimension',
|
||||
','.join(vbench_dimension_list),
|
||||
' '.join(vbench_dimension_list),
|
||||
'--videos_path',
|
||||
videos_path,
|
||||
'--mode',
|
||||
|
|
@ -1060,3 +1069,23 @@ def prepare_api(
|
|||
return x
|
||||
|
||||
return api_fn
|
||||
|
||||
|
||||
def sanitize_filename(prompt):
    """Turn a free-form prompt into a short, filesystem-safe name.

    Characters that are illegal or awkward in file names (``<>:"/\\|?*``),
    ASCII control characters, whitespace and underscores are all treated as
    separators, and every run of separators collapses into one underscore.
    The result is capped at 30 characters and has leading/trailing ``.``,
    ``_`` and ``-`` removed.

    Args:
        prompt: Prompt text to derive the name from.

    Returns:
        The sanitized name, or ``"default"`` when nothing usable remains.
    """
    max_length = 30
    separators = '<>:"/\\|?*_'

    # Map every separator to a space so that a single str.split() collapses
    # mixed runs such as ": " or "??" instead of leaving "__" behind.
    to_space = {ord(char): " " for char in separators}
    # Control characters (including NUL, which the OS rejects in paths) are
    # separators too; \n, \r and \t fall in this range.
    to_space.update({code: " " for code in range(32)})
    to_space[127] = " "

    words = prompt.translate(to_space).split()
    filename = "_".join(words)[:max_length]

    # Truncation can leave a dangling separator, and names made only of dots
    # (".", "..") must never be used as a directory component.
    return filename.strip("._-") or "default"
|
||||
|
|
|
|||
Loading…
Reference in a new issue