diff --git a/configs/opensora/train/16x256x256-spee.py b/configs/opensora/train/16x256x256-spee.py index 729a382..782fdf3 100644 --- a/configs/opensora/train/16x256x256-spee.py +++ b/configs/opensora/train/16x256x256-spee.py @@ -1,14 +1,14 @@ -num_frames = 16 -frame_interval = 3 -image_size = (256, 256) - # Define dataset -root = None -data_path = "CSV_PATH" -use_image_transform = False -num_workers = 4 +dataset = dict( + type="VideoTextDataset", + data_path=None, + num_frames=16, + frame_interval=3, + image_size=(256, 256), +) # Define acceleration +num_workers = 4 dtype = "bf16" grad_checkpoint = True plugin = "zero2" @@ -23,7 +23,7 @@ model = dict( enable_flashattn=True, enable_layernorm_kernel=True, ) -# mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07] +mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07] vae = dict( type="VideoAutoencoderKL", from_pretrained="stabilityai/sd-vae-ft-ema", diff --git a/opensora/schedulers/iddpm/gaussian_diffusion.py b/opensora/schedulers/iddpm/gaussian_diffusion.py index 9c51934..febac63 100644 --- a/opensora/schedulers/iddpm/gaussian_diffusion.py +++ b/opensora/schedulers/iddpm/gaussian_diffusion.py @@ -684,16 +684,13 @@ class GaussianDiffusion: true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t) out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs) kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]) + kl = mean_flat(kl, mask=mask) / np.log(2.0) decoder_nll = -discretized_gaussian_log_likelihood( x_start, means=out["mean"], log_scales=0.5 * out["log_variance"] ) assert decoder_nll.shape == x_start.shape - - if mask is not None: - kl = th.where(mask[:, None, :, None, None], kl, decoder_nll) - kl = mean_flat(kl) / np.log(2.0) - decoder_nll = mean_flat(decoder_nll) / np.log(2.0) + decoder_nll = mean_flat(decoder_nll, mask=mask) / np.log(2.0) # At the first timestep return the decoder NLL, # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t)) @@ -769,10 +766,10 @@ class GaussianDiffusion: }[self.model_mean_type] assert model_output.shape == target.shape == x_start.shape if weights is None: - terms["mse"] = mean_flat((target - model_output) ** 2) + terms["mse"] = mean_flat((target - model_output) ** 2, mask=mask) else: weight = _extract_into_tensor(weights, t, target.shape) - terms["mse"] = mean_flat(weight * (target - model_output) ** 2) + terms["mse"] = mean_flat(weight * (target - model_output) ** 2, mask=mask) if "vb" in terms: terms["loss"] = terms["mse"] + terms["vb"] else: diff --git a/opensora/utils/config_utils.py b/opensora/utils/config_utils.py index d820058..272e211 100644 --- a/opensora/utils/config_utils.py +++ b/opensora/utils/config_utils.py @@ -47,9 +47,7 @@ def merge_args(cfg, args, training=False): if args.ckpt_path is not None: cfg.model["from_pretrained"] = args.ckpt_path args.ckpt_path = None - if args.data_path is not None: - cfg.dataset["data_path"] = args.data_path - args.data_path = None + for k, v in vars(args).items(): if k in cfg and v is not None: @@ -66,6 +64,9 @@ def merge_args(cfg, args, training=False): cfg["prompt"] = load_prompts(cfg["prompt_path"]) else: # Training only + if args.data_path is not None: + cfg.dataset["data_path"] = args.data_path + args.data_path = None if "mask_ratios" not in cfg: cfg["mask_ratios"] = None