add configs and clean code

This commit is contained in:
nicolaus 2025-03-19 17:33:32 +08:00
parent bb64366a85
commit a2e4e1689f
8 changed files with 149 additions and 89 deletions

3
.gitignore vendored
View file

@ -198,4 +198,5 @@ flash-attention
datasets datasets
# inference scaling # inference scaling
temp/ temp*
samples*

View file

@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
``` ```
### Inference Scaling
We implemented an inference-scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option.
```
torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px_t2i2v_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv
```
| Original | <br>num_subtree=3<br>num_scaling_steps=5<br>num_noise=1<br>time=16min | <br>num_subtree=7<br>num_scaling_steps=8<br>num_noise=1<br>time=1h |
|----------------------|----------------------------------------------------------------|----------------------------------------------------------------|
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
### Reproductivity ### Reproductivity
To make the results reproducible, you can set the random seed by: To make the results reproducible, you can set the random seed by:

View file

@ -1,28 +0,0 @@
_base_ = [  # inherit config grammar from mmengine
    "256px.py",
    "plugins/t2i2v.py",
    "plugins/tp.py",  # use tensor parallelism
]
# Sampling options for 256px text/image-to-video generation with
# inference-time scaling (subtree search scored by VBench).
sampling_option = dict(
    resolution="256px",  # "256px" or "768px"
    aspect_ratio="16:9",  # "9:16", "16:9", or "1:1"
    num_frames=129,  # number of generated frames
    num_steps=50,  # number of denoising steps
    shift=True,
    temporal_reduction=4,
    is_causal_vae=True,
    guidance=7.5,  # classifier-free guidance for text-to-video
    guidance_img=3.0,  # guidance for image-to-video conditioning
    text_osci=True,  # enable text guidance oscillation
    image_osci=True,  # enable image guidance oscillation
    scale_temporal_osci=True,
    method="i2v_inference_scaling",  # hard-coded for now
    vbench_dimension_list=['subject_consistency'],  # VBench dimensions used for scoring candidates
    do_inference_scaling=True,
    num_subtree=3,  # number of candidate branches explored per scaling step
    backward_scale=0.78,
    forward_scale=0.83,
    scaling_steps=[1,2,4,7,9,15,20],  # denoising steps at which branching/selection happens
    seed=None,  # random seed for the initial noise z (None = nondeterministic)
    vbench_gpus=[4,5,6,7]  # GPUs assigned to the VBench evaluation subprocess
)

View file

@ -1,17 +0,0 @@
# Base configs (mmengine-style inheritance).
_base_ = [
    "256px.py",
    "plugins/sp.py",  # sequence parallelism
    "plugins/t2i2v.py",
]

# Sampling options for 768px generation with inference-time scaling.
sampling_option = {
    "resolution": "768px",  # "256px" or "768px"
    "method": "i2v_inference_scaling",  # hard-coded for now
    "vbench_dimension_list": ["subject_consistency"],  # VBench scoring dimensions
    "do_inference_scaling": True,
    "num_subtree": 3,  # candidate branches per scaling step
    "backward_scale": 0.78,
    "forward_scale": 0.83,
    "scaling_steps": [1, 2, 4, 7, 9, 15, 20],  # steps where branching happens
    "seed": None,  # random seed for the initial noise z
    "vbench_gpus": [4, 5, 6, 7],  # GPUs for the VBench subprocess
}

View file

@ -0,0 +1,17 @@
# Base configs (mmengine-style inheritance).
_base_ = [
    "256px.py",
    "plugins/t2i2v.py",
]

# Override the sampling parameters to enable inference-time scaling,
# scored across the full set of VBench quality dimensions.
sampling_option = {
    "method": "i2v_inference_scaling",
    "vbench_dimension_list": [
        "subject_consistency",
        "background_consistency",
        "motion_smoothness",
        "dynamic_degree",
        "aesthetic_quality",
        "imaging_quality",
    ],
    "do_inference_scaling": True,
    "num_subtree": 3,  # candidate branches per scaling step
    "backward_scale": 0.78,
    "forward_scale": 0.83,
    "scaling_steps": [1, 2, 4, 7, 9, 15, 20],  # steps where branching happens
    "vbench_gpus": [4, 5, 6, 7],  # GPUs for the VBench subprocess
    "seed": 42,  # fixed seed for reproducible noise z
}

View file

@ -0,0 +1,43 @@
_base_ = [  # inherit config grammar from mmengine
    "768px.py",
    "plugins/t2i2v.py",
]
# First setting (kept for reference, currently disabled):
# a small search (3 subtrees) over 5 scaling steps.
# sampling_option = dict(
#     method="i2v_inference_scaling",
#     vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
#     do_inference_scaling=True,
#     num_subtree=3,
#     backward_scale=1.0,
#     forward_scale=0.5,
#     scaling_steps=[1,2,4,8,13],
#     vbench_gpus=[4,5,6,7],
#     seed=42
# )
# Second setting (active): moderate search width with 8 scaling steps.
sampling_option = dict(
    method="i2v_inference_scaling",  # hard-coded for now
    vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],  # VBench dimensions used to score candidates
    do_inference_scaling=True,
    num_subtree=5,  # candidate branches explored per scaling step
    backward_scale=1.0,
    forward_scale=0.5,
    scaling_steps=[1,2,3,4,6,8,10,13],  # denoising steps where branching/selection happens
    vbench_gpus=[6,7],  # GPUs assigned to the VBench evaluation subprocess
    seed=42  # fixed seed for reproducible noise z
)
# Third setting (kept for reference, currently disabled):
# widest search (8 subtrees, 17 scaling steps) — most expensive.
# sampling_option = dict(
#     method="i2v_inference_scaling",
#     vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
#     do_inference_scaling=True,
#     num_subtree=8,
#     backward_scale=0.78,
#     forward_scale=0.83,
#     scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35],
#     vbench_gpus=[6,7],
#     seed=42
# )

1
demo Submodule

@ -0,0 +1 @@
Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23

View file

@ -6,6 +6,7 @@ from dataclasses import dataclass, replace
import json import json
import subprocess import subprocess
from collections import defaultdict from collections import defaultdict
import sys
import torch import torch
import torchvision import torchvision
@ -105,62 +106,69 @@ class SamplingOption:
vbench_dimension_list: list = None vbench_dimension_list: list = None
def find_highest_score_video(data): NORMALIZE_DIC = {
video_scores = defaultdict(list) "subject consistency": {"Min": 0.1462, "Max": 1.0},
normalization_rules = { "background consistency": {"Min": 0.2615, "Max": 1.0},
"subject_consistency": lambda e: e["video_results"], "motion smoothness": {"Min": 0.706, "Max": 0.9975},
"background_consistency": lambda e: e["video_results"], "dynamic degree": {"Min": 0.0, "Max": 1.0},
"temporal_flickering": lambda e: e["video_results"], "aesthetic quality": {"Min": 0.0, "Max": 1.0},
"motion_smoothness": lambda e: e["video_results"], "imaging quality": {"Min": 0.0, "Max": 1.0},
}
"dynamic_degree": lambda e: 1.0 if e["video_results"] else 0.0,
"aesthetic_quality": lambda e: e["video_results"],
"imaging_quality": lambda e: e["video_results"] / 100,
"human_action": lambda e: e["cor_num_per_video"],
"temporal_style": lambda e: e["video_results"],
"overall_consistency": lambda e: e["video_results"]
}
DIM_WEIGHT = {
"subject consistency":1,
"background consistency":1,
"motion smoothness":1,
"aesthetic quality":1,
"imaging quality":1,
"dynamic degree":0.5,
}
def find_highest_score_video(data):
video_scores = defaultdict(dict)
for metric_name, metric_data in data.items(): for metric_name, metric_data in data.items():
if not isinstance(metric_data, list) or len(metric_data) < 2: if not isinstance(metric_data, list) or len(metric_data) < 2:
continue continue
process_rule = normalization_rules.get(metric_name) if metric_name not in NORMALIZE_DIC:
if not process_rule:
continue continue
min_val = NORMALIZE_DIC[metric_name]["Min"]
max_val = NORMALIZE_DIC[metric_name]["Max"]
dim_weight = DIM_WEIGHT[metric_name]
for entry in metric_data[1]: for entry in metric_data[1]:
try: try:
path_parts = entry["video_path"].split("/") path_parts = entry["video_path"].split("/")
filename = path_parts[-1] filename = path_parts[-1]
video_index = int(filename.split(".")[0]) video_index = int(filename.split(".")[0])
score = process_rule(entry) if "video_results" in entry:
video_scores[video_index].append(score) raw_score = entry["video_results"]
elif "cor_num_per_video" in entry:
raw_score = entry["cor_num_per_video"]
else:
continue
norm_score = (raw_score - min_val) / (max_val - min_val) * dim_weight
video_scores[video_index][metric_name] = norm_score
except (KeyError, ValueError, IndexError): except (KeyError, ValueError, IndexError):
continue continue
avg_scores = {} final_scores = {}
for vid, scores in video_scores.items(): for vid, scores in video_scores.items():
if len(scores) == 0: if len(scores) > 0:
avg_scores[vid] = 0.0 final_scores[vid] = sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores.keys())
continue
if not final_scores:
avg_scores[vid] = sum(scores) / len(scores)
if not avg_scores:
return -1 return -1
max_score = max(avg_scores.values()) max_score = max(final_scores.values())
candidates = sorted( candidates = [vid for vid, score in final_scores.items() if score == max_score]
[vid for vid, score in avg_scores.items() if score == max_score] return min(candidates) if candidates else -1
)
return candidates[0] if candidates else -1
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption: def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser):
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False) scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
prompt = [prompt[0]] prompt = [prompt[0]]
prompt_short = prompt[0][:30].replace(" ", "_") prompt_short = sanitize_filename(prompt[0])
save_dir = f'temp/{prompt_short}' save_dir = f'temp/{prompt_short}'
os.makedirs(save_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True)
@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser):
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype) noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype) zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
noise = torch.cat([noise, zeros, zeros], dim=0) noise = torch.cat([noise, zeros, zeros], dim=0)
subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise
t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale t_subtree = t_curr
t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device) t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
subtree_noise_pred = model( subtree_noise_pred = model(
img=subtree, img=subtree,
@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser):
videos_path = f"{save_dir}/{i}_subtree" videos_path = f"{save_dir}/{i}_subtree"
output_path = f"{save_dir}/{i}_subtree" output_path = f"{save_dir}/{i}_subtree"
prompt_file = "temp/prompt.json" prompt_file = "temp/prompt.json" # hard coded for now
with open(prompt_file, "w") as fp: with open(prompt_file, "w") as fp:
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)} prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
json.dump(prompt_json, fp) json.dump(prompt_json, fp)
python_path = os.path.dirname(sys.executable)
minimal_env = { minimal_env = {
"PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin", "PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin",
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus]) "CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
} }
cmd_args = [ cmd_args = [
'vbench', 'vbench',
'evaluate', 'evaluate',
'--dimension', '--dimension',
','.join(vbench_dimension_list), ' '.join(vbench_dimension_list),
'--videos_path', '--videos_path',
videos_path, videos_path,
'--mode', '--mode',
@ -1060,3 +1069,23 @@ def prepare_api(
return x return x
return api_fn return api_fn
def sanitize_filename(prompt):
    """Sanitize a prompt string into a safe, short filename component.

    Replaces filesystem-unsafe characters, collapses runs of whitespace
    and replaced characters into single underscores, truncates to 30
    characters, and falls back to "default" when nothing usable remains.

    Args:
        prompt: Free-form prompt text (may contain any characters).

    Returns:
        A non-empty, filename-safe string of at most 30 characters.
    """
    # Characters that are invalid (or risky) in filenames on common OSes.
    invalid_chars = '<>:"/\\|?*\n\r\t'
    filename = prompt.strip()
    # Map each invalid character to a space so that adjacent invalid
    # characters collapse together in the whitespace-split below.
    # (The previous '_' substitution left uncollapsed runs like "a__b",
    # contradicting the "single underscore" intent.)
    for char in invalid_chars:
        filename = filename.replace(char, ' ')
    # Collapse every run of whitespace into a single underscore.
    filename = '_'.join(filter(None, filename.split()))
    # Limit length and ensure the result is not empty.
    filename = filename[:30] if filename else "default"
    # Trim separator characters left dangling by the truncation.
    filename = filename.strip('._-')
    return filename or "default"