diff --git a/.gitignore b/.gitignore
index b1848ea..7b81785 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,6 +169,7 @@ dataset/
 runs/
 checkpoints/
 outputs/
+samples/
 
 # Secret files
 hostfile
diff --git a/README.md b/README.md
index b97f507..6422cb4 100644
--- a/README.md
+++ b/README.md
@@ -86,9 +86,28 @@ Click for the original video.
 ## Installation
 
 ```bash
+# create a virtual env
+conda create -n opensora python=3.10
+
+# install torch
+# the command below is for CUDA 12.1; pick the install command that matches
+# your own CUDA version from https://pytorch.org/get-started/locally/
+pip3 install torch torchvision
+
+# install flash attention (optional)
+pip install packaging ninja
+pip install flash-attn --no-build-isolation
+
+# install apex (optional)
+pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
+
+# install xformers
+pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu121
+
+# install this project
 git clone https://github.com/hpcaitech/Open-Sora
 cd Open-Sora
-pip install xxx
+pip install -v -e .
 ```
 
 After installation, we suggest reading [structure.md](docs/structure.md) to learn the project structure and how to use the config files.
@@ -101,10 +120,21 @@ After installation, we suggest reading [structure.md](docs/structure.md) to lear
 
 ## Inference
 
+To run inference with our provided weights, first prepare the pretrained weights including XXX. [WIP]
+
+Then run the following commands to generate samples. See [here](docs/structure.md#inference-config-demos) to customize the configuration.
+
 ```bash
-python scripts/inference.py configs/opensora/inference/16x256x256.py
+# Sample a 16x256x256 video (~2s long)
+python scripts/inference.py configs/opensora/inference/16x256x256.py --ckpt-path ./path/to/your/ckpt.pth
+# Sample a 16x512x512 video (~2s long)
+python scripts/inference.py configs/opensora/inference/16x512x512.py
+# Sample a 64x512x512 video (~5s long)
+python scripts/inference.py configs/opensora/inference/64x512x512.py
 ```
 
+For inference with other models, see [commands.md](docs/commands.md).
+
 ## Data Processing
 
 ### Split video into clips
@@ -124,8 +154,7 @@ We provide code to split a long video into separate clips efficiently using `mul
 * [StabilityAI VAE](https://huggingface.co/stabilityai/sd-vae-ft-mse-original): A powerful image VAE model.
 * [CLIP](https://github.com/openai/CLIP): A powerful text-image embedding model.
 * [T5](https://github.com/google-research/text-to-text-transfer-transformer): A powerful text encoder.
-* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [LLaMA](https://github.com/meta-llama/llama) and [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
-* [PySceneDetect](https://github.com/Breakthrough/PySceneDetect): A powerful tool to split video into clips.
+* [LLaVA](https://github.com/haotian-liu/LLaVA): A powerful image captioning model based on [Yi-34B](https://huggingface.co/01-ai/Yi-34B).
 
 We are grateful for their exceptional work and generous contribution to open source.
 
diff --git a/configs/opensora/inference/16x256x256.py b/configs/opensora/inference/16x256x256.py
index 7b1fa99..4b2e4f2 100644
--- a/configs/opensora/inference/16x256x256.py
+++ b/configs/opensora/inference/16x256x256.py
@@ -1,15 +1,13 @@
-# sample size
 num_frames = 16
 fps = 24 // 3
 image_size = (256, 256)
 
-# model config
+# Define model
 model = dict(
     type="STDiT-XL/2",
     space_scale=0.5,
     time_scale=1.0,
-    from_pretrained="outputs/129-F16S3-PixArt-ST-XL-2/epoch83-global_step80000/ema.pt",
-    # from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
     type="VideoAutoencoderKL",
@@ -21,15 +19,13 @@ text_encoder = dict(
     model_max_length=120,
 )
 scheduler = dict(
-    # type="iddpm",
-    # num_sampling_steps=250,
-    type="dpm-solver",
-    num_sampling_steps=20,
+    type="iddpm",
+    num_sampling_steps=100,
     cfg_scale=7.0,
 )
 dtype = "fp16"
 
-# prompts
+# Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
diff --git a/configs/opensora/inference/16x512x512.py b/configs/opensora/inference/16x512x512.py
index 6ca7cf6..2837275 100644
--- a/configs/opensora/inference/16x512x512.py
+++ b/configs/opensora/inference/16x512x512.py
@@ -1,14 +1,13 @@
-# sample size
 num_frames = 16
 fps = 24 // 3
 image_size = (512, 512)
 
-# model config
+# Define model
 model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
-    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
     type="VideoAutoencoderKL",
@@ -23,13 +22,11 @@ text_encoder = dict(
 scheduler = dict(
     type="iddpm",
     num_sampling_steps=100,
-    # type = "dpm-solver",
-    # num_sampling_steps=20,
     cfg_scale=7.0,
 )
 dtype = "fp16"
 
-# prompts
+# Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
diff --git a/configs/opensora/inference/64x512x512-v2.py b/configs/opensora/inference/64x512x512-v2.py
deleted file mode 100644
index dfb9358..0000000
--- a/configs/opensora/inference/64x512x512-v2.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# sample size
-num_frames = 64
-fps = 24 // 2
-image_size = (512, 512)
-
-# model config
-model = dict(
-    type="STDiT-XL/2",
-    space_scale=1.0,
-    time_scale=2 / 3,
-    from_pretrained="outputs/524-F64S2-STDiT-XL-2/epoch4-global_step750/",
-)
-vae = dict(
-    type="VideoAutoencoderKL",
-    from_pretrained="stabilityai/sd-vae-ft-ema",
-    split=8,
-)
-text_encoder = dict(
-    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
-    model_max_length=120,
-)
-scheduler = dict(
-    type="iddpm",
-    num_sampling_steps=100,
-    # type = "dpm-solver",
-    # num_sampling_steps=20,
-    cfg_scale=7.0,
-)
-dtype = "fp16"
-
-# prompts
-batch_size = 1
-seed = 42
-prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"
diff --git a/configs/opensora/inference/64x512x512.py b/configs/opensora/inference/64x512x512.py
index 00b3e8f..25aad23 100644
--- a/configs/opensora/inference/64x512x512.py
+++ b/configs/opensora/inference/64x512x512.py
@@ -1,18 +1,18 @@
-# sample size
 num_frames = 64
 fps = 24 // 2
 image_size = (512, 512)
 
-# model config
+# Define model
 model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
-    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch96-global_step15000/",
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=8,
 )
 text_encoder = dict(
     type="t5",
@@ -22,13 +22,11 @@ text_encoder = dict(
 scheduler = dict(
     type="iddpm",
     num_sampling_steps=100,
-    # type = "dpm-solver",
-    # num_sampling_steps=20,
     cfg_scale=7.0,
 )
 dtype = "fp16"
 
-# prompts
+# Others
 batch_size = 1
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
diff --git a/configs/opensora/train/16x256x256.py b/configs/opensora/train/16x256x256.py
index ad2a62e..a64a318 100644
--- a/configs/opensora/train/16x256x256.py
+++ b/configs/opensora/train/16x256x256.py
@@ -1,22 +1,20 @@
-# sample size
 num_frames = 16
 frame_interval = 3
 image_size = (256, 256)
 
-# dataset
+# Define dataset
 root = None
-# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
-data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4
 
-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 
-# model config
+# Define model
 model = dict(
     type="STDiT-XL/2",
     space_scale=0.5,
@@ -40,7 +38,7 @@ scheduler = dict(
     timestep_respacing="",
 )
 
-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False
diff --git a/configs/opensora/train/16x512x512.py b/configs/opensora/train/16x512x512.py
index edcd92f..f827ce3 100644
--- a/configs/opensora/train/16x512x512.py
+++ b/configs/opensora/train/16x512x512.py
@@ -1,26 +1,25 @@
-# sample size
 num_frames = 16
 frame_interval = 3
 image_size = (512, 512)
 
-# dataset
+# Define dataset
 root = None
-data_path = "/home/zhaowangbo/data_hdd/csv/inter4k_pexels_rp_fmin_48.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4
 
-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 
-# model config
+# Define model
 model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
-    from_pretrained="outputs/285-F16S3-PixArt-ST-XL-2/epoch615-global_step24000/ema.pt",
+    from_pretrained=None,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
@@ -40,7 +39,7 @@ scheduler = dict(
     timestep_respacing="",
 )
 
-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False
diff --git a/configs/opensora/train/64x512x512-v2.py b/configs/opensora/train/64x512x512-sp.py
similarity index 69%
rename from configs/opensora/train/64x512x512-v2.py
rename to configs/opensora/train/64x512x512-sp.py
index f20a8f3..b0b9062 100644
--- a/configs/opensora/train/64x512x512-v2.py
+++ b/configs/opensora/train/64x512x512-sp.py
@@ -1,33 +1,32 @@
-# sample size
 num_frames = 64
 frame_interval = 2
 image_size = (512, 512)
 
-# dataset
+# Define dataset
 root = None
-data_path = "/mnt/hdd/data/csv/inter4k_pexels_rp_fmin_128.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4
 
-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
-plugin = "zero2"
-sp_size = 1
+plugin = "zero2-seq"
+sp_size = 2
 
-# model config
+# Define model
 model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
-    from_pretrained="outputs/314-F16S3-PixArt-ST-XL-2/epoch128-global_step20000/ema.pt",
+    from_pretrained=None,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
+    enable_sequence_parallelism=True,  # enable sequence parallelism here
 )
 vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="stabilityai/sd-vae-ft-ema",
-    split=8,
 )
 text_encoder = dict(
     type="t5",
@@ -40,16 +39,16 @@ scheduler = dict(
     timestep_respacing="",
 )
 
-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False
 
 epochs = 1000
 log_every = 10
-ckpt_every = 250
+ckpt_every = 1000
 load = None
 
-batch_size = 4
+batch_size = 1
 lr = 2e-5
 grad_clip = 1.0
diff --git a/configs/opensora/train/64x512x512.py b/configs/opensora/train/64x512x512.py
index 642abe4..d902849 100644
--- a/configs/opensora/train/64x512x512.py
+++ b/configs/opensora/train/64x512x512.py
@@ -1,34 +1,32 @@
-# sample size
 num_frames = 64
 frame_interval = 2
 image_size = (512, 512)
 
-# dataset
+# Define dataset
 root = None
-# data_path = "/mnt/hdd/data/csv/bak_00/pexels_inter4k_fmin_48_rp.csv"
-data_path = "/mnt/hdd/data/csv/ucf101_videos.csv"
+data_path = "CSV_PATH"
 use_image_transform = False
 num_workers = 4
 
-# acceleration
-dtype = "fp16"
+# Define acceleration
+dtype = "bf16"
 grad_checkpoint = True
-plugin = "zero2-seq"
-sp_size = 2
+plugin = "zero2"
+sp_size = 1
 
-# model config
+# Define model
 model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
-    from_pretrained="PixArt-XL-2-512x512.pth",
+    from_pretrained=None,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
-    enable_sequence_parallelism=True,  # enable sq here
 )
 vae = dict(
     type="VideoAutoencoderKL",
     from_pretrained="stabilityai/sd-vae-ft-ema",
+    split=8,    # split to lower memory usage
 )
 text_encoder = dict(
     type="t5",
@@ -41,16 +39,16 @@ scheduler = dict(
     timestep_respacing="",
 )
 
-# runtime
+# Others
 seed = 42
 outputs = "outputs"
 wandb = False
 
 epochs = 1000
 log_every = 10
-ckpt_every = 1000
+ckpt_every = 250
 load = None
 
-batch_size = 1
+batch_size = 4
 lr = 2e-5
 grad_clip = 1.0
diff --git a/docs/command.md b/docs/command.md
deleted file mode 100644
index e69de29..0000000
diff --git a/docs/commands.md b/docs/commands.md
new file mode 100644
index 0000000..261e179
--- /dev/null
+++ b/docs/commands.md
@@ -0,0 +1,9 @@
+# Commands
+
+## Inference
+
+### Inference with DiT pretrained on ImageNet
+
+
+
+## Training
diff --git a/docs/structure.md b/docs/structure.md
index deb967b..fbbcc05 100644
--- a/docs/structure.md
+++ b/docs/structure.md
@@ -68,6 +68,15 @@ Open-Sora
 
 ## Inference config demos
 
+To change the inference settings, you can either modify the corresponding config file directly or pass command-line arguments that override it (see [config_utils.py](/opensora/utils/config_utils.py)). To change the sampling prompts, edit the `.txt` file passed to the `--prompt_path` argument.
+
+```plaintext
+--prompt_path ./assets/texts/t2v_samples.txt  -> prompt_path
+--ckpt-path ./path/to/your/ckpt.pth           -> model["from_pretrained"]
+```
+
+The explanation of each field is provided below.
+
 ```python
 # Define sampling size
 num_frames = 64               # number of frames
diff --git a/opensora/utils/config_utils.py b/opensora/utils/config_utils.py
index 48f2d9a..5ef8150 100644
--- a/opensora/utils/config_utils.py
+++ b/opensora/utils/config_utils.py
@@ -32,13 +32,14 @@ def parse_args(training=False):
     else:
         parser.add_argument("--wandb", default=None, type=bool, help="enable wandb")
         parser.add_argument("--load", default=None, type=str, help="path to continue training")
+        parser.add_argument("--data-path", default=None, type=str, help="path to data csv")
 
     return parser.parse_args()
 
 
 def merge_args(cfg, args, training=False):
     if args.ckpt_path is not None:
-        cfg.model["from_pratrained"] = args.ckpt_path
+        cfg.model["from_pretrained"] = args.ckpt_path
         args.ckpt_path = None
 
     if not training:
diff --git a/requirements.txt b/requirements.txt
index d2e02b8..0724a86 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,28 +1,13 @@
-# should be installed by users
-torch>=1.13
 colossalai
-flash_attn # optional
-apex # optional
-
-# packages
 accelerate
 diffusers
-timm
-transformers
-xformers # necessary for PixArt
-
-# config & logging
-gdown
-pre-commit
-matplotlib
-mmengine
-tensorboard
-tqdm
-wandb
-
-# video
-pyav
-
-# text
-clip
 ftfy
+gdown
+mmengine
+pre-commit
+pyav
+tensorboard
+timm
+tqdm
+transformers
+wandb
diff --git a/setup.py b/setup.py
index 1ac64d2..b4b3f49 100644
--- a/setup.py
+++ b/setup.py
@@ -30,12 +30,10 @@ def fetch_readme() -> str:
 
 setup(
     name="opensora",
-    version="0.1.0",
+    version="1.0.0",
     packages=find_packages(
         exclude=(
-            "videos",
             "tests",
-            "figure",
             "*.egg-info",
         )
     ),