diff --git a/configs/opensora-v1-2/inference/sample.py b/configs/opensora-v1-2/inference/sample.py
index 0e84251..3e2c623 100644
--- a/configs/opensora-v1-2/inference/sample.py
+++ b/configs/opensora-v1-2/inference/sample.py
@@ -19,14 +19,12 @@ model = dict(
     qk_norm=True,
     enable_flash_attn=True,
     enable_layernorm_kernel=True,
-    force_huggingface=True,
 )
 vae = dict(
     type="OpenSoraVAE_V1_2",
     from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
     micro_frame_size=17,
     micro_batch_size=4,
-    force_huggingface=True,
 )
 text_encoder = dict(
     type="t5",
diff --git a/configs/opensora-v1-2/inference/sample_hf.py b/configs/opensora-v1-2/inference/sample_hf.py
new file mode 100644
index 0000000..0e84251
--- /dev/null
+++ b/configs/opensora-v1-2/inference/sample_hf.py
@@ -0,0 +1,44 @@
+resolution = "240p"
+aspect_ratio = "9:16"
+num_frames = 51
+fps = 24
+frame_interval = 1
+save_fps = 24
+
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "STDiT2"
+dtype = "bf16"
+condition_frame_length = 5
+align = 5
+
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    force_huggingface=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+    force_huggingface=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=7.0,
+)
+
+aes = 6.5
+flow = None
diff --git a/eval/sample.sh b/eval/sample.sh
index 0123309..241c229 100644
--- a/eval/sample.sh
+++ b/eval/sample.sh
@@ -39,7 +39,7 @@ DEFAULT_BS=1
 # called inside run_video_b
 function run_image() { # 14min
     # 1.1 1024x1024
-    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect_ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
+    eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 1024 --aspect-ratio 1:1 --sample-name image_1024_1_1 --batch-size $DEFAULT_BS
 
     # 1.2 240x426
     eval $CMD --ckpt-path $CKPT --prompt-path assets/texts/t2i_samples.txt --save-dir $OUTPUT --num-frames 1 --resolution 240p --aspect-ratio 9:16 --sample-name image_240p_9_16 --end-index 3 --batch-size $DEFAULT_BS
diff --git a/opensora/models/stdit/stdit3.py b/opensora/models/stdit/stdit3.py
index 8703b2d..bd9672d 100644
--- a/opensora/models/stdit/stdit3.py
+++ b/opensora/models/stdit/stdit3.py
@@ -448,7 +448,7 @@ class STDiT3(PreTrainedModel):
 @MODELS.register_module("STDiT3-XL/2")
 def STDiT3_XL_2(from_pretrained=None, **kwargs):
     force_huggingface = kwargs.pop("force_huggingface", False)
-    if force_huggingface or from_pretrained is not None and not os.path.isdir(from_pretrained):
+    if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained):
         model = STDiT3.from_pretrained(from_pretrained, **kwargs)
     else:
         config = STDiT3Config(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
@@ -460,7 +460,8 @@ def STDiT3_XL_2(from_pretrained=None, **kwargs):
 
 @MODELS.register_module("STDiT3-3B/2")
 def STDiT3_3B_2(from_pretrained=None, **kwargs):
-    if from_pretrained is not None and not os.path.isdir(from_pretrained):
+    force_huggingface = kwargs.pop("force_huggingface", False)
+    if force_huggingface or from_pretrained is not None and not os.path.exists(from_pretrained):
         model = STDiT3.from_pretrained(from_pretrained, **kwargs)
     else:
         config = STDiT3Config(depth=28, hidden_size=1872, patch_size=(1, 2, 2), num_heads=26, **kwargs)
diff --git a/opensora/models/vae/vae.py b/opensora/models/vae/vae.py
index bf50ec8..9802b02 100644
--- a/opensora/models/vae/vae.py
+++ b/opensora/models/vae/vae.py
@@ -277,7 +277,7 @@ def OpenSoraVAE_V1_2(
         scale=scale,
     )
 
-    if force_huggingface or (from_pretrained is not None and not os.path.isdir(from_pretrained)):
+    if force_huggingface or (from_pretrained is not None and not os.path.exists(from_pretrained)):
         model = VideoAutoencoderPipeline.from_pretrained(from_pretrained, **kwargs)
     else:
         config = VideoAutoencoderPipelineConfig(**kwargs)