Docs/fix (#211)

* fix #210
* fix #209
2026-04-10 12:49:38 +02:00 · 2024-03-25 13:19:11 +08:00 · 2024-03-25 13:19:11 +08:00 · b454751f8f
commit b454751f8f
parent 7abee23c4c
14 changed files with 16 additions and 16 deletions
--- a/configs/opensora/train/16x512x512.py
+++ b/configs/opensora/train/16x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
 )
--- a/configs/opensora/train/360x512x512.py
+++ b/configs/opensora/train/360x512x512.py
@@ -31,7 +31,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
 )
--- a/configs/opensora/train/64x512x512-sp.py
+++ b/configs/opensora/train/64x512x512-sp.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
 )
--- a/configs/opensora/train/64x512x512.py
+++ b/configs/opensora/train/64x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
 )
--- a/configs/pixart/inference/16x256x256.py
+++ b/configs/pixart/inference/16x256x256.py
@@ -15,7 +15,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
 )
 scheduler = dict(
--- a/configs/pixart/inference/1x1024MS.py
+++ b/configs/pixart/inference/1x1024MS.py
@@ -17,7 +17,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
 )
 scheduler = dict(
--- a/configs/pixart/inference/1x256x256.py
+++ b/configs/pixart/inference/1x256x256.py
@@ -16,7 +16,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
 )
 scheduler = dict(
--- a/configs/pixart/inference/1x512x512.py
+++ b/configs/pixart/inference/1x512x512.py
@@ -16,7 +16,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
 )
 scheduler = dict(
--- a/configs/pixart/train/16x256x256.py
+++ b/configs/pixart/train/16x256x256.py
@@ -29,7 +29,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
 )
--- a/configs/pixart/train/1x512x512.py
+++ b/configs/pixart/train/1x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
 )
--- a/configs/pixart/train/64x512x512.py
+++ b/configs/pixart/train/64x512x512.py
@@ -30,7 +30,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
 )
--- a/docs/report_v1.md
+++ b/docs/report_v1.md
@@ -14,7 +14,7 @@ As shown in the figure, we insert a temporal attention right after each spatial

 To focus on video generation, we hope to train the model based on a powerful image generation model. [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is an efficiently trained high-quality image generation model with T5-conditioned DiT structure. We initialize our model with PixArt-α and initialize the projection layer of inserted temporal attention with zero. This initialization preserves model's ability of image generation at beginning, while Latte's architecture cannot. The inserted attention increases the number of parameter from 580M to 724M.

-![Architecture](https://i0.imgs.ovh/2024/03/16/erC1d.png)
+![Architecture](https://image.jiqizhixin.com/uploads/editor/ff49eaba-6b19-43d7-b65d-ad2ecdb9d555/640.jpeg)

 Drawing from the success of PixArt-α and Stable Video Diffusion, we also adopt a progressive training strategy: 16x256x256 on 366K pretraining datasets, and then 16x256x256, 16x512x512, and 64x512x512 on 20K datasets. With scaled position embedding, this strategy greatly reduces the computational cost.

--- a/docs/structure.md
+++ b/docs/structure.md
@@ -100,7 +100,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",                 # Select text encoder type (t5, clip)
-    from_pretrained="./pretrained_models/t5_ckpts", # Load from pretrained text encoder
+    from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
    model_max_length=120,      # Maximum length of input text
 )
 scheduler = dict(
@@ -153,7 +153,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,           # Enable shardformer for T5 acceleration
 )
--- a/docs/zh_CN/structure.md
+++ b/docs/zh_CN/structure.md
@@ -100,7 +100,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",                 # Select text encoder type (t5, clip)
-    from_pretrained="./pretrained_models/t5_ckpts", # Load from pretrained text encoder
+    from_pretrained="DeepFloyd/t5-v1_1-xxl", # Load from pretrained text encoder
    model_max_length=120,      # Maximum length of input text
 )
 scheduler = dict(
@@ -153,7 +153,7 @@ vae = dict(
 )
 text_encoder = dict(
    type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,           # Enable shardformer for T5 acceleration
 )