diff --git a/tools/scoring/README.md b/tools/scoring/README.md
index 4e18d71..e4ccae8 100644
--- a/tools/scoring/README.md
+++ b/tools/scoring/README.md
@@ -32,7 +32,7 @@ With `meta.csv` containing the paths to the videos, run the following command:
 
 ```bash
 # output: meta_aes.csv
-torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta.csv --bs 1024 --num_workers 16
+torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16
 ```
 
 This will generate multiple part files; you can merge them with `python -m tools.datasets.csvutil DATA1.csv DATA2.csv`.
@@ -51,7 +51,7 @@ wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmfl
 Then run:
 
 ```bash
-torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py --meta_path /path/to/meta.csv
+torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv
 ```
 
 The output should be `/path/to/meta_flow.csv` with column `flow`.
@@ -64,7 +64,7 @@ For videos, we compute the matching score of the middle frame and the caption.
 **Make sure** meta files contain the column `text`, which is the caption of the sample. Then run:
 
 ```bash
-torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py --meta_path /path/to/meta.csv
+torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv
 ```
 
 The output should be `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment.
diff --git a/tools/scoring/matching/inference.py b/tools/scoring/matching/inference.py
index b1751a8..7bedef1 100644
--- a/tools/scoring/matching/inference.py
+++ b/tools/scoring/matching/inference.py
@@ -58,13 +58,18 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame):
     meta.loc[unique_indices, "match"] = flat_scores[unique_indices_idx]
 
 
-def main():
-    colossalai.launch_from_torch({})
+def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
     parser.add_argument("--bs", type=int, default=16, help="Batch size")
     parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
     args = parser.parse_args()
+    return args
+
+
+def main():
+    colossalai.launch_from_torch({})
+    args = parse_args()
 
     meta_path = args.meta_path
     wo_ext, ext = os.path.splitext(meta_path)
diff --git a/tools/scoring/ocr/dbnetpp_debug.py b/tools/scoring/ocr/dbnetpp_debug.py
deleted file mode 100644
index c13a433..0000000
--- a/tools/scoring/ocr/dbnetpp_debug.py
+++ /dev/null
@@ -1,96 +0,0 @@
-model = dict(
-    type='DBNet',
-    backbone=dict(
-        type='CLIPResNet',
-        depth=50,
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=-1,
-        norm_cfg=dict(type='BN', requires_grad=True),
-        norm_eval=False,
-        style='pytorch',
-        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
-        # init_cfg=dict(
-        #     type='Pretrained',
-        #     checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'),
-        stage_with_dcn=(False, True, True, True),
-    ),
-    neck=dict(
-        type='FPNC',
-        in_channels=[256, 512, 1024, 2048],
-        lateral_channels=256,
-        asf_cfg=dict(attention_type='ScaleChannelSpatial'),
-    ),
-    det_head=dict(
-        type='DBHead',
-        in_channels=256,
-        module_loss=dict(type='DBModuleLoss'),
-        postprocessor=dict(
-            type='DBPostprocessor', text_repr_type='quad',
-            epsilon_ratio=0.002,
-        ),
-    ),
-    data_preprocessor=dict(
-        type='TextDetDataPreprocessor',
-        mean=[123.675, 116.28, 103.53],
-        std=[58.395, 57.12, 57.375],
-        bgr_to_rgb=True,
-        pad_size_divisor=32,
-    ),
-    init_cfg=dict(
-        type='Pretrained',
-        checkpoint='https://download.openmmlab.com/mmocr/textdet/dbnetpp/'
-        'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/'
-        'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth',
-    )
-)
-
-train_pipeline = [
-    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
-    dict(
-        type='LoadOCRAnnotations',
-        with_bbox=True,
-        with_polygon=True,
-        with_label=True,
-    ),
-    dict(
-        type='TorchVisionWrapper',
-        op='ColorJitter',
-        brightness=32.0 / 255,
-        saturation=0.5),
-    dict(
-        type='ImgAugWrapper',
-        args=[['Fliplr', 0.5],
-              dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
-    dict(type='RandomCrop', min_side_ratio=0.1),
-    dict(type='Resize', scale=(640, 640), keep_ratio=True),
-    dict(type='Pad', size=(640, 640)),
-    dict(
-        type='PackTextDetInputs',
-        meta_keys=('img_path', 'ori_shape', 'img_shape'))
-]
-
-test_pipeline = [
-    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
-    dict(type='Resize', scale=(4068, 1024), keep_ratio=True),
-    # dict(
-    #     type='LoadOCRAnnotations',
-    #     with_polygon=True,
-    #     with_bbox=True,
-    #     with_label=True,
-    # ),
-    dict(
-        type='PackTextDetInputs',
-        # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor',
-        #            'instances'),
-        meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'),
-    )
-]
-
-# Visualization
-vis_backends = [dict(type='LocalVisBackend')]
-visualizer = dict(
-    type='TextDetLocalVisualizer',
-    name='visualizer',
-    vis_backends=vis_backends,
-)
diff --git a/tools/scoring/ocr/debug.py b/tools/scoring/ocr/debug.py
deleted file mode 100644
index 6028888..0000000
--- a/tools/scoring/ocr/debug.py
+++ /dev/null
@@ -1,108 +0,0 @@
-import os
-import os.path as osp
-from PIL import Image
-import numpy as np
-import torch
-
-import mmcv
-import mmengine
-from mmengine import Config
-from mmengine.registry import DefaultScope
-from mmengine.dataset import Compose, default_collate
-from mmocr.registry import MODELS, VISUALIZERS
-
-
-def visualize(visualizer,
-              inputs,
-              preds,
-              # return_vis: bool = False,
-              show: bool = False,
-              wait_time: int = 0,
-              draw_pred: bool = True,
-              pred_score_thr: float = 0.3,
-              save_vis: bool = False,
-              img_out_dir: str = ''):
-    """Visualize predictions.
-
-    Args:
-        inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer.
-        preds (List[Dict]): Predictions of the model.
-        return_vis (bool): Whether to return the visualization result.
-            Defaults to False.
-        show (bool): Whether to display the image in a popup window.
-            Defaults to False.
-        wait_time (float): The interval of show (s). Defaults to 0.
-        draw_pred (bool): Whether to draw predicted bounding boxes.
-            Defaults to True.
-        pred_score_thr (float): Minimum score of bboxes to draw.
-            Defaults to 0.3.
-        save_vis (bool): Whether to save the visualization result. Defaults
-            to False.
-        img_out_dir (str): Output directory of visualization results.
-            If left as empty, no file will be saved. Defaults to ''.
-
-    Returns:
-        List[np.ndarray] or None: Returns visualization results only if
-            applicable.
-    """
-    results = []
-
-    for single_input, pred in zip(inputs, preds):
-        if isinstance(single_input, str):
-            img_bytes = mmengine.fileio.get(single_input)
-            img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
-        elif isinstance(single_input, np.ndarray):
-            img = single_input.copy()[:, :, ::-1]  # to RGB
-        else:
-            raise ValueError('Unsupported input type: '
-                             f'{type(single_input)}')
-        img_name = osp.splitext(osp.basename(pred.img_path))[0]
-
-        if save_vis and img_out_dir:
-            out_file = osp.splitext(img_name)[0]
-            out_file = f'{out_file}.jpg'
-            out_file = osp.join(img_out_dir, out_file)
-        else:
-            out_file = None
-
-        visualization = visualizer.add_datasample(
-            img_name,
-            img,
-            pred,
-            show=show,
-            wait_time=wait_time,
-            draw_gt=False,
-            draw_pred=draw_pred,
-            pred_score_thr=pred_score_thr,
-            out_file=out_file,
-        )
-        results.append(visualization)
-
-    return results
-
-
-cfg = Config.fromfile('./tools/scoring/ocr/dbnetpp_debug.py')
-
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-print(device)
-DefaultScope.get_instance('ocr', scope_name='mmocr')
-
-model = MODELS.build(cfg.model)
-model.init_weights()
-model.to(device)
-pipeline = Compose(cfg.test_pipeline)
-visualizer = VISUALIZERS.build(cfg.visualizer)
-
-inputs = {
-    'img_path': './assets/images/ocr/demo_text_ocr.jpg',
-    # 'img_path': './assets/images/ocr/demo_text_det.jpg',
-}
-results = pipeline(inputs)
-results['index'] = 0
-# imgs = results['inputs'].unsqueeze(0)
-# pred = model.predict(imgs, results['data_samples'])
-data = default_collate([results])  # list[Dict] to Dict
-with torch.no_grad():
-    pred = model.test_step(data)
-vis_results = visualize(visualizer, [x.img_path for x in data['data_samples']], pred, show=True)
-x = 0
\ No newline at end of file
diff --git a/tools/scoring/ocr/inference.py b/tools/scoring/ocr/inference.py
index 9c54bfd..3b15605 100644
--- a/tools/scoring/ocr/inference.py
+++ b/tools/scoring/ocr/inference.py
@@ -77,7 +77,7 @@ class VideoTextDataset(torch.utils.data.Dataset):
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
     parser.add_argument("--bs", type=int, default=16, help="Batch size")
     parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
     args = parser.parse_args()
diff --git a/tools/scoring/optical_flow/inference.py b/tools/scoring/optical_flow/inference.py
index 487121f..170b076 100644
--- a/tools/scoring/optical_flow/inference.py
+++ b/tools/scoring/optical_flow/inference.py
@@ -58,15 +58,20 @@ class VideoTextDataset(torch.utils.data.Dataset):
         return len(self.meta)
 
 
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
+    parser.add_argument("--bs", type=int, default=4, help="Batch size")
+    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
+    args = parser.parse_args()
+    return args
+
+
 def main():
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
     colossalai.launch_from_torch({})
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
-    parser.add_argument("--bs", type=int, default=4, help="Batch size")
-    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
-    args = parser.parse_args()
+    args = parse_args()
 
     meta_path = args.meta_path
     wo_ext, ext = os.path.splitext(meta_path)
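
Review note: every script touched above applies the same two-part refactor — `meta_path` becomes a positional argument instead of a `--meta_path` flag, and argument parsing moves out of `main()` into a standalone `parse_args()`. Below is a minimal, self-contained sketch of the resulting shape; the `print` placeholder stands in for the scripts' real scoring logic, and the distributed launch is only noted in a comment.

```python
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    # meta_path is now positional: invocations pass the CSV path directly,
    # e.g. `python script.py /path/to/meta.csv`, rather than via --meta_path.
    parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
    parser.add_argument("--bs", type=int, default=16, help="Batch size")
    parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    return parser.parse_args()


def main():
    # In the real scripts, colossalai.launch_from_torch({}) runs first;
    # parsing then collapses to a single call into the helper above.
    args = parse_args()
    print(args.meta_path, args.bs, args.num_workers)


if __name__ == "__main__":
    main()
```

The practical consequence for users: any existing invocation that passed `--meta_path /path/to/meta.csv` must drop the flag and pass the path positionally, as the updated README commands show.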
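
The README's merge step may also deserve a concrete example. A sketch, assuming two ranks wrote part files named `meta_aes_part0.csv` and `meta_aes_part1.csv` — these names are hypothetical, so substitute whatever part files your run actually produces:

```bash
# Merge per-rank part files into a single CSV with the repo's csvutil helper
# (part-file names below are placeholders).
python -m tools.datasets.csvutil meta_aes_part0.csv meta_aes_part1.csv
```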