# Open-Sora/opensora/utils/train_utils.py
import math
import random
from collections import OrderedDict

import torch
import torch.distributed as dist
from colossalai.booster.plugin import LowLevelZeroPlugin

from opensora.acceleration.parallel_states import set_data_parallel_group, set_sequence_parallel_group
from opensora.acceleration.plugin import ZeroSeqParallelPlugin

from .misc import get_logger
def create_colossalai_plugin(plugin, dtype, grad_clip, sp_size, reduce_bucket_size_in_m: int = 20):
    """Build a ColossalAI booster plugin and register the matching process groups.

    Args:
        plugin: Plugin name, either "zero2" (pure data parallel ZeRO-2) or
            "zero2-seq" (ZeRO-2 combined with sequence parallelism).
        dtype: Mixed-precision mode string forwarded to the plugin.
        grad_clip: Gradient-clipping max norm forwarded to the plugin.
        sp_size: Sequence-parallel group size; must be 1 for "zero2" and
            > 1 for "zero2-seq".
        reduce_bucket_size_in_m: ZeRO gradient reduce-bucket size, in millions
            of elements.

    Returns:
        The constructed plugin instance.

    Raises:
        ValueError: If `plugin` is not a recognized name, or `sp_size` is
            inconsistent with the chosen plugin. (Was `assert` before, which
            is silently stripped under `python -O`.)
    """
    if plugin == "zero2":
        if sp_size != 1:
            raise ValueError("Zero2 plugin does not support sequence parallelism")
        plugin = LowLevelZeroPlugin(
            stage=2,
            precision=dtype,
            initial_scale=2**16,
            max_norm=grad_clip,
            reduce_bucket_size_in_m=reduce_bucket_size_in_m,
        )
        # Plain ZeRO-2: the data-parallel group is the whole world.
        set_data_parallel_group(dist.group.WORLD)
    elif plugin == "zero2-seq":
        if sp_size <= 1:
            raise ValueError("Zero2-seq plugin requires sequence parallelism (sp_size > 1)")
        plugin = ZeroSeqParallelPlugin(
            sp_size=sp_size,
            stage=2,
            precision=dtype,
            initial_scale=2**16,
            max_norm=grad_clip,
            reduce_bucket_size_in_m=reduce_bucket_size_in_m,
        )
        # The seq-parallel plugin derives its own sp/dp subgroups.
        set_sequence_parallel_group(plugin.sp_group)
        set_data_parallel_group(plugin.dp_group)
    else:
        raise ValueError(f"Unknown plugin {plugin}")
    return plugin
@torch.no_grad()
def update_ema(
    ema_model: torch.nn.Module, model: torch.nn.Module, optimizer=None, decay: float = 0.9999, sharded: bool = True
) -> None:
    """
    Step the EMA model towards the current model.

    For every trainable parameter except "pos_embed", the EMA weight is
    updated in place as ``ema = decay * ema + (1 - decay) * param``. When
    ``sharded`` is True and a parameter is stored in reduced precision, the
    fp32 master copy is read from the ZeRO optimizer's param store instead
    of the working parameter.
    """
    ema_params = OrderedDict(ema_model.named_parameters())
    for name, param in OrderedDict(model.named_parameters()).items():
        # Positional embedding is left untouched; frozen params are skipped.
        if name == "pos_embed" or not param.requires_grad:
            continue
        if sharded and param.data.dtype != torch.float32:
            # Low-precision working param: read the fp32 master weight held
            # by the ZeRO optimizer, keyed by the working parameter's id.
            source = optimizer._param_store.working_to_master_param[id(param)].data
        else:
            source = param.data
        ema_params[name].mul_(decay).add_(source, alpha=1 - decay)
class MaskGenerator:
    """Samples per-frame condition masks for video training.

    ``mask_ratios`` maps a mask-strategy name to the probability of sampling
    that strategy. In the returned boolean mask, ``False`` marks a frame kept
    as a condition frame and ``True`` marks a frame to be generated.
    """

    def __init__(self, mask_ratios):
        """
        Args:
            mask_ratios (dict[str, float]): strategy name -> sampling
                probability, each in [0, 1]. If "identity" is absent, it is
                added with the leftover probability so the total is exactly 1.
        """
        valid_mask_names = [
            "identity",
            "quarter_random",
            "quarter_head",
            "quarter_tail",
            "quarter_head_tail",
            "image_random",
            "image_head",
            "image_tail",
            "image_head_tail",
            "random",
            "intepolate",  # NOTE: historical typo kept so existing configs keep working
        ]
        assert all(
            mask_name in valid_mask_names for mask_name in mask_ratios.keys()
        ), f"mask_name should be one of {valid_mask_names}, got {mask_ratios.keys()}"
        assert all(
            mask_ratio >= 0 for mask_ratio in mask_ratios.values()
        ), f"mask_ratio should be greater than or equal to 0, got {mask_ratios.values()}"
        assert all(
            mask_ratio <= 1 for mask_ratio in mask_ratios.values()
        ), f"mask_ratio should be less than or equal to 1, got {mask_ratios.values()}"
        # Fill the leftover probability mass with "identity" (no masking);
        # the ratios must then sum to 1.
        if "identity" not in mask_ratios:
            mask_ratios["identity"] = 1.0 - sum(mask_ratios.values())
        assert math.isclose(
            sum(mask_ratios.values()), 1.0, abs_tol=1e-6
        ), f"sum of mask_ratios should be 1, got {sum(mask_ratios.values())}"
        get_logger().info("mask ratios: %s", mask_ratios)
        self.mask_ratios = mask_ratios

    def get_mask(self, x):
        """Sample one boolean frame mask for video tensor ``x``.

        Args:
            x: tensor whose dim 2 is the frame axis (presumably
                [B, C, T, H, W] — confirm with callers).

        Returns:
            torch.BoolTensor of shape (num_frames,). ``False`` marks
            condition frames; an all-``True`` mask means no conditioning.
        """
        # Roulette-wheel selection of a strategy. If floating-point error
        # leaves mask_name as None, every branch below is skipped and the
        # all-ones (identity) mask is returned.
        mask_type = random.random()
        mask_name = None
        prob_acc = 0.0
        for mask, mask_ratio in self.mask_ratios.items():
            prob_acc += mask_ratio
            if mask_type < prob_acc:
                mask_name = mask
                break

        num_frames = x.shape[2]
        # Up to a quarter of the frames may become condition frames, but
        # always allow at least one: for num_frames in {2, 3} the old
        # `num_frames // 4` was 0, making random.randint(1, 0) raise
        # ValueError for every "quarter_*" strategy.
        condition_frames_max = max(num_frames // 4, 1)
        mask = torch.ones(num_frames, dtype=torch.bool, device=x.device)
        if num_frames <= 1:
            # Single frame (or empty): nothing to condition on.
            return mask

        if mask_name == "quarter_random":
            random_size = random.randint(1, condition_frames_max)
            random_pos = random.randint(0, x.shape[2] - random_size)
            mask[random_pos : random_pos + random_size] = 0
        elif mask_name == "image_random":
            random_size = 1
            random_pos = random.randint(0, x.shape[2] - random_size)
            mask[random_pos : random_pos + random_size] = 0
        elif mask_name == "quarter_head":
            random_size = random.randint(1, condition_frames_max)
            mask[:random_size] = 0
        elif mask_name == "image_head":
            random_size = 1
            mask[:random_size] = 0
        elif mask_name == "quarter_tail":
            random_size = random.randint(1, condition_frames_max)
            mask[-random_size:] = 0
        elif mask_name == "image_tail":
            random_size = 1
            mask[-random_size:] = 0
        elif mask_name == "quarter_head_tail":
            random_size = random.randint(1, condition_frames_max)
            mask[:random_size] = 0
            mask[-random_size:] = 0
        elif mask_name == "image_head_tail":
            random_size = 1
            mask[:random_size] = 0
            mask[-random_size:] = 0
        elif mask_name == "intepolate":
            # Condition on every other frame, starting at frame 0 or 1.
            random_start = random.randint(0, 1)
            mask[random_start::2] = 0
        elif mask_name == "random":
            mask_ratio = random.uniform(0.1, 0.9)
            mask = torch.rand(num_frames, device=x.device) > mask_ratio
            # If mask is all False, set the last frame to True.
            if not mask.any():
                mask[-1] = 1

        return mask

    def get_masks(self, x):
        """Sample one independent mask per batch element.

        Args:
            x: batched video tensor; ``len(x)`` is the batch size.

        Returns:
            torch.BoolTensor of shape (batch, num_frames).
        """
        masks = [self.get_mask(x) for _ in range(len(x))]
        return torch.stack(masks, dim=0)