mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-04-14 18:25:35 +02:00
add configs and clean code
This commit is contained in:
parent
bb64366a85
commit
a2e4e1689f
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -198,4 +198,5 @@ flash-attention
|
||||||
datasets
|
datasets
|
||||||
|
|
||||||
# inference scaling
|
# inference scaling
|
||||||
temp/
|
temp*
|
||||||
|
samples*
|
||||||
14
README.md
14
README.md
|
|
@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx
|
||||||
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
|
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Inference Scaling
|
||||||
|
|
||||||
|
We implemented an inference scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option.
|
||||||
|
|
||||||
|
```
|
||||||
|
torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_768px_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv
|
||||||
|
```
|
||||||
|
|
||||||
|
| Original | <br>num_subtree=3<br>num_scaling_steps=5<br>num_noise=1<br>time=16min | <br>num_subtree=7<br>num_scaling_steps=8<br>num_noise=1<br>time=1h |
|
||||||
|
|----------------------|----------------------------------------------------------------|----------------------------------------------------------------|
|
||||||
|
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
|
||||||
|
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
|
||||||
|
|
||||||
|
|
||||||
### Reproducibility
|
### Reproducibility
|
||||||
|
|
||||||
To make the results reproducible, you can set the random seed by:
|
To make the results reproducible, you can set the random seed by:
|
||||||
|
|
|
||||||
|
|
@ -1,28 +0,0 @@
|
||||||
_base_ = [ # inherit grammer from mmengine
|
|
||||||
"256px.py",
|
|
||||||
"plugins/t2i2v.py",
|
|
||||||
"plugins/tp.py", # use tensor parallel
|
|
||||||
]
|
|
||||||
sampling_option = dict(
|
|
||||||
resolution="256px", # 256px or 768px
|
|
||||||
aspect_ratio="16:9", # 9:16 or 16:9 or 1:1
|
|
||||||
num_frames=129, # number of frames
|
|
||||||
num_steps=50, # number of steps
|
|
||||||
shift=True,
|
|
||||||
temporal_reduction=4,
|
|
||||||
is_causal_vae=True,
|
|
||||||
guidance=7.5, # guidance for text-to-video
|
|
||||||
guidance_img=3.0, # guidance for image-to-video
|
|
||||||
text_osci=True, # enable text guidance oscillation
|
|
||||||
image_osci=True, # enable image guidance oscillation
|
|
||||||
scale_temporal_osci=True,
|
|
||||||
method="i2v_inference_scaling", # hard-coded for now
|
|
||||||
vbench_dimension_list=['subject_consistency'],
|
|
||||||
do_inference_scaling=True,
|
|
||||||
num_subtree=3,
|
|
||||||
backward_scale=0.78,
|
|
||||||
forward_scale=0.83,
|
|
||||||
scaling_steps=[1,2,4,7,9,15,20],
|
|
||||||
seed=None, # random seed for z
|
|
||||||
vbench_gpus=[4,5,6,7]
|
|
||||||
)
|
|
||||||
|
|
@ -1,17 +0,0 @@
|
||||||
_base_ = [ # inherit grammer from mmengine
|
|
||||||
"256px.py",
|
|
||||||
"plugins/sp.py", # use sequence parallel
|
|
||||||
"plugins/t2i2v.py",
|
|
||||||
]
|
|
||||||
sampling_option = dict(
|
|
||||||
resolution="768px", # 256px or 768px
|
|
||||||
method="i2v_inference_scaling", # hard-coded for now
|
|
||||||
vbench_dimension_list=['subject_consistency'],
|
|
||||||
do_inference_scaling=True,
|
|
||||||
num_subtree=3,
|
|
||||||
backward_scale=0.78,
|
|
||||||
forward_scale=0.83,
|
|
||||||
scaling_steps=[1,2,4,7,9,15,20],
|
|
||||||
seed=None, # random seed for z
|
|
||||||
vbench_gpus=[4,5,6,7]
|
|
||||||
)
|
|
||||||
17
configs/diffusion/inference/t2i2v_256px_inference_scaling.py
Normal file
17
configs/diffusion/inference/t2i2v_256px_inference_scaling.py
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
_base_ = [ # inherit grammar from mmengine
|
||||||
|
"256px.py",
|
||||||
|
"plugins/t2i2v.py",
|
||||||
|
]
|
||||||
|
|
||||||
|
# update the inference scaling parameters
|
||||||
|
sampling_option = dict(
|
||||||
|
method="i2v_inference_scaling",
|
||||||
|
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||||
|
do_inference_scaling=True,
|
||||||
|
num_subtree=3,
|
||||||
|
backward_scale=0.78,
|
||||||
|
forward_scale=0.83,
|
||||||
|
scaling_steps=[1,2,4,7,9,15,20],
|
||||||
|
vbench_gpus=[4,5,6,7],
|
||||||
|
seed=42
|
||||||
|
)
|
||||||
43
configs/diffusion/inference/t2i2v_768px_inference_scaling.py
Normal file
43
configs/diffusion/inference/t2i2v_768px_inference_scaling.py
Normal file
|
|
@ -0,0 +1,43 @@
|
||||||
|
_base_ = [ # inherit grammar from mmengine
|
||||||
|
"768px.py",
|
||||||
|
"plugins/t2i2v.py",
|
||||||
|
]
|
||||||
|
|
||||||
|
# # update the inference scaling parameters
|
||||||
|
# sampling_option = dict(
|
||||||
|
# method="i2v_inference_scaling",
|
||||||
|
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||||
|
# do_inference_scaling=True,
|
||||||
|
# num_subtree=3,
|
||||||
|
# backward_scale=1.0,
|
||||||
|
# forward_scale=0.5,
|
||||||
|
# scaling_steps=[1,2,4,8,13],
|
||||||
|
# vbench_gpus=[4,5,6,7],
|
||||||
|
# seed=42
|
||||||
|
# )
|
||||||
|
|
||||||
|
# second setting
|
||||||
|
sampling_option = dict(
|
||||||
|
method="i2v_inference_scaling",
|
||||||
|
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||||
|
do_inference_scaling=True,
|
||||||
|
num_subtree=5,
|
||||||
|
backward_scale=1.0,
|
||||||
|
forward_scale=0.5,
|
||||||
|
scaling_steps=[1,2,3,4,6,8,10,13],
|
||||||
|
vbench_gpus=[6,7],
|
||||||
|
seed=42
|
||||||
|
)
|
||||||
|
|
||||||
|
# third setting
|
||||||
|
# sampling_option = dict(
|
||||||
|
# method="i2v_inference_scaling",
|
||||||
|
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
|
||||||
|
# do_inference_scaling=True,
|
||||||
|
# num_subtree=8,
|
||||||
|
# backward_scale=0.78,
|
||||||
|
# forward_scale=0.83,
|
||||||
|
# scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35],
|
||||||
|
# vbench_gpus=[6,7],
|
||||||
|
# seed=42
|
||||||
|
# )
|
||||||
1
demo
Submodule
1
demo
Submodule
|
|
@ -0,0 +1 @@
|
||||||
|
Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23
|
||||||
|
|
@ -6,6 +6,7 @@ from dataclasses import dataclass, replace
|
||||||
import json
|
import json
|
||||||
import subprocess
|
import subprocess
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
import sys
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torchvision
|
import torchvision
|
||||||
|
|
@ -105,62 +106,69 @@ class SamplingOption:
|
||||||
vbench_dimension_list: list = None
|
vbench_dimension_list: list = None
|
||||||
|
|
||||||
|
|
||||||
# Min/max of the raw score for each evaluation dimension. A raw score is
# rescaled as (raw - Min) / (Max - Min) before it is weighted.
# Keys are dimension names written with spaces.
NORMALIZE_DIC = {
    "subject consistency": {"Min": 0.1462, "Max": 1.0},
    "background consistency": {"Min": 0.2615, "Max": 1.0},
    "motion smoothness": {"Min": 0.706, "Max": 0.9975},
    "dynamic degree": {"Min": 0.0, "Max": 1.0},
    "aesthetic quality": {"Min": 0.0, "Max": 1.0},
    "imaging quality": {"Min": 0.0, "Max": 1.0},
}

# Relative weight of each dimension in the combined per-video score.
DIM_WEIGHT = {
    "subject consistency": 1,
    "background consistency": 1,
    "motion smoothness": 1,
    "aesthetic quality": 1,
    "imaging quality": 1,
    "dynamic degree": 0.5,
}

# Divisor applied to raw per-video scores before normalization. Per-video
# imaging-quality results are reported on a 0-100 scale, whereas the
# NORMALIZE_DIC range for that dimension is 0-1.
_RAW_SCORE_DIVISOR = {
    "imaging quality": 100.0,
}


def find_highest_score_video(data):
    """Return the index of the video with the best weighted metric score.

    Args:
        data: Mapping from metric name to a list whose second element is the
            list of per-video entries. Each entry is a dict holding a
            ``video_path`` (the file stem must be the integer video index)
            and a raw score under ``video_results`` or ``cor_num_per_video``.
            Metric names may be written with underscores
            (``subject_consistency``) or spaces (``subject consistency``).

    Returns:
        The index of the highest-scoring video, the smallest index when
        several videos tie, or ``-1`` when no video could be scored.
    """
    video_scores = defaultdict(dict)

    for metric_name, metric_data in data.items():
        if not isinstance(metric_data, list) or len(metric_data) < 2:
            continue

        # The lookup tables use spaces while dimension names passed to the
        # evaluator use underscores; accept both spellings.
        metric_key = str(metric_name).replace("_", " ")
        if metric_key not in NORMALIZE_DIC or metric_key not in DIM_WEIGHT:
            continue

        min_val = NORMALIZE_DIC[metric_key]["Min"]
        span = NORMALIZE_DIC[metric_key]["Max"] - min_val
        if span == 0:
            # A degenerate range cannot be normalized; ignore the metric.
            continue
        dim_weight = DIM_WEIGHT[metric_key]
        divisor = _RAW_SCORE_DIVISOR.get(metric_key, 1.0)

        for entry in metric_data[1]:
            try:
                stem = entry["video_path"].split("/")[-1].split(".")[0]
                video_index = int(stem)
                if "video_results" in entry:
                    # float() also maps boolean results to 1.0 / 0.0.
                    raw_score = float(entry["video_results"])
                elif "cor_num_per_video" in entry:
                    raw_score = float(entry["cor_num_per_video"])
                else:
                    continue
            except (KeyError, ValueError, IndexError, TypeError):
                # Best effort: skip entries that are malformed.
                continue

            norm_score = (raw_score / divisor - min_val) / span * dim_weight
            video_scores[video_index][metric_key] = norm_score

    # Weighted average over the dimensions actually present for each video.
    final_scores = {
        vid: sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores)
        for vid, scores in video_scores.items()
        if scores
    }
    if not final_scores:
        return -1

    max_score = max(final_scores.values())
    return min(vid for vid, score in final_scores.items() if score == max_score)
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
|
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
|
||||||
|
|
@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser):
|
||||||
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
|
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
|
||||||
|
|
||||||
prompt = [prompt[0]]
|
prompt = [prompt[0]]
|
||||||
prompt_short = prompt[0][:30].replace(" ", "_")
|
prompt_short = sanitize_filename(prompt[0])
|
||||||
save_dir = f'temp/{prompt_short}'
|
save_dir = f'temp/{prompt_short}'
|
||||||
os.makedirs(save_dir, exist_ok=True)
|
os.makedirs(save_dir, exist_ok=True)
|
||||||
|
|
||||||
|
|
@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser):
|
||||||
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
|
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
|
||||||
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
|
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
|
||||||
noise = torch.cat([noise, zeros, zeros], dim=0)
|
noise = torch.cat([noise, zeros, zeros], dim=0)
|
||||||
subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise
|
subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise
|
||||||
t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale
|
t_subtree = t_curr
|
||||||
t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale
|
t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale
|
||||||
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
|
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
|
||||||
subtree_noise_pred = model(
|
subtree_noise_pred = model(
|
||||||
img=subtree,
|
img=subtree,
|
||||||
|
|
@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser):
|
||||||
videos_path = f"{save_dir}/{i}_subtree"
|
videos_path = f"{save_dir}/{i}_subtree"
|
||||||
output_path = f"{save_dir}/{i}_subtree"
|
output_path = f"{save_dir}/{i}_subtree"
|
||||||
|
|
||||||
prompt_file = "temp/prompt.json"
|
prompt_file = "temp/prompt.json" # hard coded for now
|
||||||
with open(prompt_file, "w") as fp:
|
with open(prompt_file, "w") as fp:
|
||||||
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
|
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
|
||||||
json.dump(prompt_json, fp)
|
json.dump(prompt_json, fp)
|
||||||
|
|
||||||
|
python_path = os.path.dirname(sys.executable)
|
||||||
minimal_env = {
|
minimal_env = {
|
||||||
"PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin",
|
"PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin",
|
||||||
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
|
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
|
||||||
}
|
}
|
||||||
cmd_args = [
|
cmd_args = [
|
||||||
'vbench',
|
'vbench',
|
||||||
'evaluate',
|
'evaluate',
|
||||||
'--dimension',
|
'--dimension',
|
||||||
','.join(vbench_dimension_list),
|
' '.join(vbench_dimension_list),
|
||||||
'--videos_path',
|
'--videos_path',
|
||||||
videos_path,
|
videos_path,
|
||||||
'--mode',
|
'--mode',
|
||||||
|
|
@ -1060,3 +1069,23 @@ def prepare_api(
|
||||||
return x
|
return x
|
||||||
|
|
||||||
return api_fn
|
return api_fn
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_filename(prompt):
    """Turn a prompt into a short name that is safe to use as a path component.

    Surrounding whitespace is dropped. Every run of whitespace, underscores
    and filesystem-reserved characters (``<>:"/\\|?*``) becomes a single
    underscore. The result is cut to 30 characters and stripped of leading
    and trailing ``.``, ``_`` and ``-``.

    Args:
        prompt: The text prompt to derive the name from.

    Returns:
        The sanitized name, or ``"default"`` when nothing usable remains.
    """
    import re  # local import so the helper does not depend on file-level imports

    # One pass replaces the reserved characters and collapses repeated
    # separators, so "a: b" yields "a_b" rather than "a__b".
    filename = re.sub(r'[<>:"/\\|?*\s_]+', "_", prompt.strip())

    # Truncate first, then trim, so the cut never leaves a dangling separator.
    filename = filename[:30].strip("._-")

    return filename or "default"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue