Merge branch 'dev/v1.1' of github.com:hpcaitech/Open-Sora-dev into dev/v1.1

This commit is contained in:
zhengzangw 2024-04-22 14:26:43 +00:00
commit 63e042cc43
6 changed files with 22 additions and 216 deletions

View file

@ -32,7 +32,7 @@ With `meta.csv` containing the paths to the videos, run the following command:
```bash
# output: meta_aes.csv
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta.csv --bs 1024 --num_workers 16
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16
```
This will generate multiple part files; you can use `python -m tools.datasets.csvutil DATA1.csv DATA2.csv` to merge these part files.
@ -51,7 +51,7 @@ wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmfl
Then run:
```bash
torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py --meta_path /path/to/meta.csv
torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv
```
The output should be `/path/to/meta_flow.csv` with column `flow`.
@ -64,7 +64,7 @@ For videos, we compute the matching score of the middle frame and the caption.
**Make sure** meta files contain the column `text`, which is the caption of the sample. Then run:
```bash
torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py --meta_path /path/to/meta.csv
torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv
```
The output should be `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment.

View file

@ -58,13 +58,18 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame):
meta.loc[unique_indices, "match"] = flat_scores[unique_indices_idx]
def main():
colossalai.launch_from_torch({})
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=16, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()
return args
def main():
colossalai.launch_from_torch({})
args = parse_args()
meta_path = args.meta_path
wo_ext, ext = os.path.splitext(meta_path)

View file

@ -1,96 +0,0 @@
# mmocr-style detector config: a DBNet head on a 'CLIPResNet' ResNet-50
# backbone with deformable convs, initialized from the released DBNet++
# ICDAR2015 checkpoint (see `init_cfg` below).
model = dict(
    type='DBNet',
    backbone=dict(
        type='CLIPResNet',
        depth=50,
        num_stages=4,
        # Expose all four stage outputs for the FPN-style neck.
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,  # -1: no stage frozen, train everything
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        style='pytorch',
        # Deformable convolution v2; enabled per stage via `stage_with_dcn`.
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        # init_cfg=dict(
        #     type='Pretrained',
        #     checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'),
        stage_with_dcn=(False, True, True, True),
    ),
    neck=dict(
        type='FPNC',
        # Channel widths of the four ResNet-50 stage outputs.
        in_channels=[256, 512, 1024, 2048],
        lateral_channels=256,
        # Adaptive Scale Fusion — the "++" part of DBNet++.
        asf_cfg=dict(attention_type='ScaleChannelSpatial'),
    ),
    det_head=dict(
        type='DBHead',
        in_channels=256,
        module_loss=dict(type='DBModuleLoss'),
        postprocessor=dict(
            type='DBPostprocessor', text_repr_type='quad',
            epsilon_ratio=0.002,
        ),
    ),
    data_preprocessor=dict(
        type='TextDetDataPreprocessor',
        # Standard ImageNet mean/std (RGB order after `bgr_to_rgb`).
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32,  # pad so H/W are multiples of the total stride
    ),
    # Full-model pretrained weights (DBNet++ R50-oCLIP, ICDAR2015).
    init_cfg=dict(
        type='Pretrained',
        checkpoint='https://download.openmmlab.com/mmocr/textdet/dbnetpp/'
        'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/'
        'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth',
    )
)
# Training-time data pipeline: load + annotations, photometric and geometric
# augmentation, then crop/resize/pad to a fixed 640x640 input.
train_pipeline = [
    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
    dict(
        type='LoadOCRAnnotations',
        with_bbox=True,
        with_polygon=True,
        with_label=True,
    ),
    # Color jitter via torchvision; brightness given as a 0-1 fraction.
    dict(
        type='TorchVisionWrapper',
        op='ColorJitter',
        brightness=32.0 / 255,
        saturation=0.5),
    # imgaug-based flips, small rotations and random rescaling.
    dict(
        type='ImgAugWrapper',
        args=[['Fliplr', 0.5],
              dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
    dict(type='RandomCrop', min_side_ratio=0.1),
    dict(type='Resize', scale=(640, 640), keep_ratio=True),
    dict(type='Pad', size=(640, 640)),
    dict(
        type='PackTextDetInputs',
        meta_keys=('img_path', 'ori_shape', 'img_shape'))
]
# Inference-time pipeline: no annotations are loaded (this config is used for
# prediction only — see the commented-out LoadOCRAnnotations step).
test_pipeline = [
    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
    dict(type='Resize', scale=(4068, 1024), keep_ratio=True),
    # dict(
    #     type='LoadOCRAnnotations',
    #     with_polygon=True,
    #     with_bbox=True,
    #     with_label=True,
    # ),
    dict(
        type='PackTextDetInputs',
        # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor',
        #            'instances'),
        meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'),
    )
]
# Visualization: write rendered predictions through the local (filesystem)
# backend; built via VISUALIZERS.build(cfg.visualizer) by the debug script.
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='TextDetLocalVisualizer',
    name='visualizer',
    vis_backends=vis_backends,
)

View file

@ -1,108 +0,0 @@
import os
import os.path as osp
from PIL import Image
import numpy as np
import torch
import mmcv
import mmengine
from mmengine import Config
from mmengine.registry import DefaultScope
from mmengine.dataset import Compose, default_collate
from mmocr.registry import MODELS, VISUALIZERS
def visualize(visualizer,
              inputs,
              preds,
              # return_vis: bool = False,
              show: bool = False,
              wait_time: int = 0,
              draw_pred: bool = True,
              pred_score_thr: float = 0.3,
              save_vis: bool = False,
              img_out_dir: str = ''):
    """Render each prediction on top of its input image.

    Args:
        visualizer: An mmocr visualizer providing ``add_datasample``.
        inputs (List[Union[str, np.ndarray]]): Image paths or BGR arrays.
        preds (List[Dict]): Model predictions, one per input.
        show (bool): Display each result in a popup window. Defaults to False.
        wait_time (float): Interval between popups in seconds. Defaults to 0.
        draw_pred (bool): Draw predicted bounding boxes. Defaults to True.
        pred_score_thr (float): Minimum box score to draw. Defaults to 0.3.
        save_vis (bool): Save rendered images to ``img_out_dir``.
            Defaults to False.
        img_out_dir (str): Output directory; no file is written when empty.
            Defaults to ''.

    Returns:
        List[np.ndarray] or None: Per-input visualizations where applicable.
    """

    def _as_rgb(item):
        # Accept either a path (read + decode) or an in-memory BGR array.
        if isinstance(item, str):
            raw = mmengine.fileio.get(item)
            return mmcv.imfrombytes(raw, channel_order='rgb')
        if isinstance(item, np.ndarray):
            return item.copy()[:, :, ::-1]  # to RGB
        raise ValueError('Unsupported input type: '
                         f'{type(item)}')

    rendered = []
    for item, prediction in zip(inputs, preds):
        img = _as_rgb(item)
        stem = osp.splitext(osp.basename(prediction.img_path))[0]
        if save_vis and img_out_dir:
            out_file = osp.join(img_out_dir, f'{osp.splitext(stem)[0]}.jpg')
        else:
            out_file = None
        rendered.append(
            visualizer.add_datasample(
                stem,
                img,
                prediction,
                show=show,
                wait_time=wait_time,
                draw_gt=False,  # inference only: no ground truth available
                draw_pred=draw_pred,
                pred_score_thr=pred_score_thr,
                out_file=out_file,
            ))
    return rendered
# Debug driver: build the DBNet++ model from the local config, run it on one
# demo image and pop up the visualized detections. Side effects throughout
# (downloads weights, reads files, opens a window) — not importable as a lib.
cfg = Config.fromfile('./tools/scoring/ocr/dbnetpp_debug.py')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
# Route registry lookups to the 'mmocr' scope before building components.
DefaultScope.get_instance('ocr', scope_name='mmocr')
model = MODELS.build(cfg.model)
model.init_weights()  # loads the pretrained checkpoint from cfg init_cfg
model.to(device)
pipeline = Compose(cfg.test_pipeline)
visualizer = VISUALIZERS.build(cfg.visualizer)
inputs = {
    'img_path': './assets/images/ocr/demo_text_ocr.jpg',
    # 'img_path': './assets/images/ocr/demo_text_det.jpg',
}
results = pipeline(inputs)
results['index'] = 0
# imgs = results['inputs'].unsqueeze(0)
# pred = model.predict(imgs, results['data_samples'])
data = default_collate([results])  # list[Dict] to Dict
with torch.no_grad():
    pred = model.test_step(data)
vis_results = visualize(visualizer, [x.img_path for x in data['data_samples']], pred, show=True)
# NOTE(review): leftover breakpoint anchor from interactive debugging.
x = 0

View file

@ -77,7 +77,7 @@ class VideoTextDataset(torch.utils.data.Dataset):
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=16, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()

View file

@ -58,15 +58,20 @@ class VideoTextDataset(torch.utils.data.Dataset):
return len(self.meta)
def parse_args():
    """Collect command-line options for the scoring run.

    Returns:
        argparse.Namespace: ``meta_path`` (positional CSV path), ``bs``
        (batch size, default 4) and ``num_workers`` (default 16).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
    cli.add_argument("--bs", type=int, default=4, help="Batch size")
    cli.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    return cli.parse_args()
def main():
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
colossalai.launch_from_torch({})
parser = argparse.ArgumentParser()
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=4, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()
args = parse_args()
meta_path = args.meta_path
wo_ext, ext = os.path.splitext(meta_path)