add configs and clean code

This commit is contained in:
nicolaus 2025-03-19 17:33:32 +08:00
parent bb64366a85
commit a2e4e1689f
8 changed files with 149 additions and 89 deletions

3
.gitignore vendored
View file

@ -198,4 +198,5 @@ flash-attention
datasets
# inference scaling
temp/
temp*
samples*

View file

@ -262,6 +262,20 @@ export OPENAI_API_KEY=sk-xxxx
torchrun --nproc_per_node 1 --standalone scripts/diffusion/inference.py configs/diffusion/inference/t2i2v_256px.py --save-dir samples --prompt "raining, sea" --refine-prompt True
```
### Inference Scaling
We implemented an inference scaling sampling method inspired by [Inference-Time Scaling for Diffusion Models beyond Scaling Denoising Steps](https://inference-scale-diffusion.github.io). You can spend more computational resources to get better results. Use it by specifying the sampling option.
```
torchrun --nproc_per_node 4 --standalone scripts/diffusion/inference.py configs/diffusion/inference/768px_t2i2v_inference_scaling.py --save-dir samples --dataset.data-path assets/texts/sora.csv
```
| Original | <br>num_subtree=3<br>num_scaling_steps=5<br>num_noise=1<br>time=16min | <br>num_subtree=7<br>num_scaling_steps=8<br>num_noise=1<br>time=1h |
|----------------------|----------------------------------------------------------------|----------------------------------------------------------------|
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
| [Video Placeholder 1] | [Video Placeholder 2] | [Video Placeholder 3] |
### Reproducibility
To make the results reproducible, you can set the random seed by:

View file

@ -1,28 +0,0 @@
# Config preset: 256px text/image-to-video sampling with inference scaling
# and tensor parallelism. (This file was deleted in this commit; kept here
# for review.)
_base_ = [ # inherit grammar from mmengine
"256px.py",
"plugins/t2i2v.py",
"plugins/tp.py", # use tensor parallel
]
sampling_option = dict(
resolution="256px", # 256px or 768px
aspect_ratio="16:9", # 9:16 or 16:9 or 1:1
num_frames=129, # number of frames
num_steps=50, # number of denoising steps
shift=True,
temporal_reduction=4,
is_causal_vae=True,
guidance=7.5, # guidance for text-to-video
guidance_img=3.0, # guidance for image-to-video
text_osci=True, # enable text guidance oscillation
image_osci=True, # enable image guidance oscillation
scale_temporal_osci=True,
method="i2v_inference_scaling", # hard-coded for now
vbench_dimension_list=['subject_consistency'], # VBench metrics used to rank candidates
do_inference_scaling=True,
num_subtree=3, # candidate branches expanded at each scaling step
backward_scale=0.78, # NOTE(review): exact semantics live in the denoiser -- confirm there
forward_scale=0.83,
scaling_steps=[1,2,4,7,9,15,20], # denoising steps at which the scaling search runs
seed=None, # random seed for z
vbench_gpus=[4,5,6,7] # GPUs given to the VBench evaluation subprocess
)

View file

@ -1,17 +0,0 @@
# Config preset: 768px i2v inference scaling with sequence parallelism.
# (This file was deleted in this commit; kept here for review.)
_base_ = [ # inherit grammar from mmengine
"256px.py",
"plugins/sp.py", # use sequence parallel
"plugins/t2i2v.py",
]
sampling_option = dict(
resolution="768px", # 256px or 768px
method="i2v_inference_scaling", # hard-coded for now
vbench_dimension_list=['subject_consistency'], # VBench metrics used to rank candidates
do_inference_scaling=True,
num_subtree=3, # candidate branches expanded at each scaling step
backward_scale=0.78,
forward_scale=0.83,
scaling_steps=[1,2,4,7,9,15,20], # denoising steps at which the scaling search runs
seed=None, # random seed for z
vbench_gpus=[4,5,6,7] # GPUs given to the VBench evaluation subprocess
)

View file

@ -0,0 +1,17 @@
# Config preset: 256px t2i2v inference scaling, added in this commit.
_base_ = [ # inherit grammar from mmengine
"256px.py",
"plugins/t2i2v.py",
]
# update the inference scaling parameters
sampling_option = dict(
method="i2v_inference_scaling",
# VBench dimensions whose normalized, weighted scores rank the candidates
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
do_inference_scaling=True,
num_subtree=3, # candidate branches expanded at each scaling step
backward_scale=0.78,
forward_scale=0.83,
scaling_steps=[1,2,4,7,9,15,20], # denoising steps at which the scaling search runs
vbench_gpus=[4,5,6,7], # GPUs given to the VBench evaluation subprocess
seed=42 # fixed seed for reproducible sampling
)

View file

@ -0,0 +1,43 @@
# Config preset: 768px t2i2v inference scaling, added in this commit.
# Three experimental settings were tried; the first and third are kept
# commented out for reference, the second is active.
_base_ = [ # inherit grammar from mmengine
"768px.py",
"plugins/t2i2v.py",
]
# first setting (inactive)
# sampling_option = dict(
# method="i2v_inference_scaling",
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
# do_inference_scaling=True,
# num_subtree=3,
# backward_scale=1.0,
# forward_scale=0.5,
# scaling_steps=[1,2,4,8,13],
# vbench_gpus=[4,5,6,7],
# seed=42
# )
# second setting (active)
sampling_option = dict(
method="i2v_inference_scaling",
# VBench dimensions whose normalized, weighted scores rank the candidates
vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
do_inference_scaling=True,
num_subtree=5, # candidate branches expanded at each scaling step
backward_scale=1.0,
forward_scale=0.5,
scaling_steps=[1,2,3,4,6,8,10,13], # denoising steps at which the scaling search runs
vbench_gpus=[6,7], # GPUs given to the VBench evaluation subprocess
seed=42 # fixed seed for reproducible sampling
)
# third setting (inactive)
# sampling_option = dict(
# method="i2v_inference_scaling",
# vbench_dimension_list=['subject_consistency', 'background_consistency', 'motion_smoothness', 'dynamic_degree', 'aesthetic_quality', 'imaging_quality'],
# do_inference_scaling=True,
# num_subtree=8,
# backward_scale=0.78,
# forward_scale=0.83,
# scaling_steps=[1,2,3,4,5,6,7,8,9,12,15,18,20,22,25,30,35],
# vbench_gpus=[6,7],
# seed=42
# )

1
demo Submodule

@ -0,0 +1 @@
Subproject commit 5f49710b9bfb3d3d74fca6363b6cb6b7d54aff23

View file

@ -6,6 +6,7 @@ from dataclasses import dataclass, replace
import json
import subprocess
from collections import defaultdict
import sys
import torch
import torchvision
@ -105,62 +106,69 @@ class SamplingOption:
vbench_dimension_list: list = None
# Per-dimension min/max used to normalize raw VBench scores to [0, 1].
# NOTE(review): keys here use spaces ("subject consistency") while the
# configs' vbench_dimension_list uses underscores ("subject_consistency") --
# confirm which form the evaluation JSON actually reports, otherwise the
# lookups in find_highest_score_video will never match.
NORMALIZE_DIC = {
"subject consistency": {"Min": 0.1462, "Max": 1.0},
"background consistency": {"Min": 0.2615, "Max": 1.0},
"motion smoothness": {"Min": 0.706, "Max": 0.9975},
"dynamic degree": {"Min": 0.0, "Max": 1.0},
"aesthetic quality": {"Min": 0.0, "Max": 1.0},
"imaging quality": {"Min": 0.0, "Max": 1.0},
}
# Relative weight of each dimension in the final weighted average;
# dynamic degree is deliberately down-weighted to 0.5.
DIM_WEIGHT = {
"subject consistency":1,
"background consistency":1,
"motion smoothness":1,
"aesthetic quality":1,
"imaging quality":1,
"dynamic degree":0.5,
}
# NOTE(review): this span comes from a rendered diff hunk (@ -105,62 +106,69)
# and interleaves the OLD implementation (normalization_rules / process_rule /
# avg_scores) with the NEW one (NORMALIZE_DIC / DIM_WEIGHT / final_scores)
# without +/- markers. It is not valid as-is and must be reconstructed from
# the actual commit before use; comments below annotate both strands.
def find_highest_score_video(data):
# OLD strand: per-video list of unweighted scores.
video_scores = defaultdict(list)
# OLD strand: per-metric extraction rules applied to each result entry.
normalization_rules = {
"subject_consistency": lambda e: e["video_results"],
"background_consistency": lambda e: e["video_results"],
"temporal_flickering": lambda e: e["video_results"],
"motion_smoothness": lambda e: e["video_results"],
"dynamic_degree": lambda e: 1.0 if e["video_results"] else 0.0,
"aesthetic_quality": lambda e: e["video_results"],
"imaging_quality": lambda e: e["video_results"] / 100,
"human_action": lambda e: e["cor_num_per_video"],
"temporal_style": lambda e: e["video_results"],
"overall_consistency": lambda e: e["video_results"]
}
# NEW strand: per-video dict of metric -> normalized weighted score.
video_scores = defaultdict(dict)
for metric_name, metric_data in data.items():
# VBench JSON: [aggregate, per-video-entries]; skip malformed metrics.
if not isinstance(metric_data, list) or len(metric_data) < 2:
continue
process_rule = normalization_rules.get(metric_name)
if not process_rule:
if metric_name not in NORMALIZE_DIC:
continue
min_val = NORMALIZE_DIC[metric_name]["Min"]
max_val = NORMALIZE_DIC[metric_name]["Max"]
dim_weight = DIM_WEIGHT[metric_name]
for entry in metric_data[1]:
try:
# Video index is the numeric filename stem of the entry path.
path_parts = entry["video_path"].split("/")
filename = path_parts[-1]
video_index = int(filename.split(".")[0])
score = process_rule(entry)
video_scores[video_index].append(score)
if "video_results" in entry:
raw_score = entry["video_results"]
elif "cor_num_per_video" in entry:
raw_score = entry["cor_num_per_video"]
else:
continue
# Min-max normalize, then apply the dimension weight.
norm_score = (raw_score - min_val) / (max_val - min_val) * dim_weight
video_scores[video_index][metric_name] = norm_score
except (KeyError, ValueError, IndexError):
# Skip entries with missing keys or non-numeric filenames.
continue
avg_scores = {}
final_scores = {}
for vid, scores in video_scores.items():
if len(scores) == 0:
avg_scores[vid] = 0.0
continue
if len(scores) > 0:
# NEW strand: weighted average over the dimensions actually scored.
final_scores[vid] = sum(scores.values()) / sum(DIM_WEIGHT[key] for key in scores.keys())
avg_scores[vid] = sum(scores) / len(scores)
if not avg_scores:
if not final_scores:
# Sentinel: no scorable videos found.
return -1
max_score = max(avg_scores.values())
candidates = sorted(
[vid for vid, score in avg_scores.items() if score == max_score]
)
return candidates[0] if candidates else -1
max_score = max(final_scores.values())
# Ties broken by lowest video index for determinism.
candidates = [vid for vid, score in final_scores.items() if score == max_score]
return min(candidates) if candidates else -1
def sanitize_sampling_option(sampling_option: SamplingOption) -> SamplingOption:
@ -364,7 +372,7 @@ class I2VScalingDenoiser(Denoiser):
scale_temporal_osci = kwargs.pop("scale_temporal_osci", False)
prompt = [prompt[0]]
prompt_short = prompt[0][:30].replace(" ", "_")
prompt_short = sanitize_filename(prompt[0])
save_dir = f'temp/{prompt_short}'
os.makedirs(save_dir, exist_ok=True)
@ -437,9 +445,9 @@ class I2VScalingDenoiser(Denoiser):
noise = torch.randn(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
zeros = torch.zeros(noise_shape, device=cond_x.device, dtype=cond_x.dtype)
noise = torch.cat([noise, zeros, zeros], dim=0)
subtree = img - (t_curr - timesteps[i-1]) * forward_scale * noise
t_subtree = t_curr - (t_curr - timesteps[i-1]) * forward_scale
t_subtree_prev = t_subtree + (t_curr - timesteps[i-1]) * backward_scale
subtree = img - (timesteps[i+1] - t_curr) * forward_scale * noise
t_subtree = t_curr
t_subtree_prev = t_subtree + (timesteps[i+1] - t_curr) * backward_scale
t_subtree_vec = torch.full((img.shape[0],), t_subtree, dtype=cond.dtype, device=cond.device)
subtree_noise_pred = model(
img=subtree,
@ -506,20 +514,21 @@ class I2VScalingDenoiser(Denoiser):
videos_path = f"{save_dir}/{i}_subtree"
output_path = f"{save_dir}/{i}_subtree"
prompt_file = "temp/prompt.json"
prompt_file = "temp/prompt.json" # hard coded for now
with open(prompt_file, "w") as fp:
prompt_json = {f"{index}.mp4": prompt[0] for index in range(num_subtree)}
json.dump(prompt_json, fp)
python_path = os.path.dirname(sys.executable)
minimal_env = {
"PATH": "/usr/local/bin:/usr/bin:/bin:/mnt/jfs-hdd/home/huangshijie/opensora_vbench/bin",
"PATH": f"{python_path}:/usr/local/bin:/usr/bin:/bin",
"CUDA_VISIBLE_DEVICES": ",".join([str(item) for item in vbench_gpus])
}
cmd_args = [
'vbench',
'evaluate',
'--dimension',
','.join(vbench_dimension_list),
' '.join(vbench_dimension_list),
'--videos_path',
videos_path,
'--mode',
@ -1060,3 +1069,23 @@ def prepare_api(
return x
return api_fn
def sanitize_filename(prompt):
    """Sanitize *prompt* into a short, filesystem-safe name component.

    Replaces characters that are invalid on common filesystems, collapses
    runs of whitespace/underscores into single underscores, truncates to
    30 characters, and falls back to ``"default"`` when nothing survives.

    Args:
        prompt: Arbitrary user prompt text.

    Returns:
        A non-empty string of at most 30 characters, safe to use as a
        directory or file name.
    """
    # Characters rejected by Windows and/or POSIX path handling.
    invalid_chars = '<>:"/\\|?*\n\r\t'
    filename = prompt.strip()
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    # Collapse runs of whitespace AND underscores into a single underscore.
    # (The original only split on whitespace, so consecutive underscores
    # produced by the replacements above were never collapsed, contradicting
    # the stated intent.)
    filename = '_'.join(filter(None, filename.replace('_', ' ').split()))
    # Limit length and ensure it's not empty.
    filename = filename[:30] if filename else "default"
    # Remove leading/trailing separator characters left by truncation.
    filename = filename.strip('._-')
    return filename or "default"