diff --git a/configs/opensora/train/16x256x256-spee.py b/configs/opensora/train/16x256x256-spee.py
index 729a382..782fdf3 100644
--- a/configs/opensora/train/16x256x256-spee.py
+++ b/configs/opensora/train/16x256x256-spee.py
@@ -1,14 +1,14 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,  # placeholder; merge_args copies args.data_path here for training runs
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
@@ -23,7 +23,7 @@ model = dict(
     enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
-# mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07]
+mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07]
 vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="stabilityai/sd-vae-ft-ema",
diff --git a/opensora/schedulers/iddpm/gaussian_diffusion.py b/opensora/schedulers/iddpm/gaussian_diffusion.py
index 9c51934..febac63 100644
--- a/opensora/schedulers/iddpm/gaussian_diffusion.py
+++ b/opensora/schedulers/iddpm/gaussian_diffusion.py
@@ -684,16 +684,13 @@ class GaussianDiffusion:
         true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)
         out = self.p_mean_variance(model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs)
         kl = normal_kl(true_mean, true_log_variance_clipped, out["mean"], out["log_variance"])
+        kl = mean_flat(kl, mask=mask) / np.log(2.0)
 
         decoder_nll = -discretized_gaussian_log_likelihood(
             x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
         )
         assert decoder_nll.shape == x_start.shape
-
-        if mask is not None:
-            kl = th.where(mask[:, None, :, None, None], kl, decoder_nll)
-        kl = mean_flat(kl) / np.log(2.0)
-        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
+        decoder_nll = mean_flat(decoder_nll, mask=mask) / np.log(2.0)
 
         # At the first timestep return the decoder NLL,
         # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
@@ -769,10 +766,10 @@ class GaussianDiffusion:
             }[self.model_mean_type]
             assert model_output.shape == target.shape == x_start.shape
             if weights is None:
-                terms["mse"] = mean_flat((target - model_output) ** 2)
+                terms["mse"] = mean_flat((target - model_output) ** 2, mask=mask)
             else:
                 weight = _extract_into_tensor(weights, t, target.shape)
-                terms["mse"] = mean_flat(weight * (target - model_output) ** 2)
+                terms["mse"] = mean_flat(weight * (target - model_output) ** 2, mask=mask)
             if "vb" in terms:
                 terms["loss"] = terms["mse"] + terms["vb"]
             else:
diff --git a/opensora/utils/config_utils.py b/opensora/utils/config_utils.py
index d820058..272e211 100644
--- a/opensora/utils/config_utils.py
+++ b/opensora/utils/config_utils.py
@@ -47,9 +47,7 @@ def merge_args(cfg, args, training=False):
     if args.ckpt_path is not None:
         cfg.model["from_pretrained"] = args.ckpt_path
         args.ckpt_path = None
-    if args.data_path is not None:
-        cfg.dataset["data_path"] = args.data_path
-        args.data_path = None
+
 
     for k, v in vars(args).items():
         if k in cfg and v is not None:
@@ -66,6 +64,9 @@ def merge_args(cfg, args, training=False):
             cfg["prompt"] = load_prompts(cfg["prompt_path"])
     else:
     # Training only
+        if args.data_path is not None:
+            cfg.dataset["data_path"] = args.data_path
+            args.data_path = None
         if "mask_ratios" not in cfg:
             cfg["mask_ratios"] = None