Merge branch 'dev/v1.1' of github.com:hpcaitech/Open-Sora-dev into dev/v1.1

This commit is contained in:
zhengzangw 2024-04-22 14:26:43 +00:00
commit 63e042cc43
6 changed files with 22 additions and 216 deletions

View file

@ -32,7 +32,7 @@ With `meta.csv` containing the paths to the videos, run the following command:
```bash
# output: meta_aes.csv
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta.csv --bs 1024 --num_workers 16
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16
```
This will generate multiple part files; you can use `python -m tools.datasets.csvutil DATA1.csv DATA2.csv` to merge these part files.
@ -51,7 +51,7 @@ wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmfl
Then run:
```bash
torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py --meta_path /path/to/meta.csv
torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv
```
The output should be `/path/to/meta_flow.csv` with column `flow`.
@ -64,7 +64,7 @@ For videos, we compute the matching score of the middle frame and the caption.
**Make sure** meta files contain the column `text`, which is the caption of the sample. Then run:
```bash
torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py --meta_path /path/to/meta.csv
torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv
```
The output should be `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment.

View file

@ -58,13 +58,18 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame):
meta.loc[unique_indices, "match"] = flat_scores[unique_indices_idx]
def main():
colossalai.launch_from_torch({})
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=16, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()
return args
def main():
colossalai.launch_from_torch({})
args = parse_args()
meta_path = args.meta_path
wo_ext, ext = os.path.splitext(meta_path)

View file

@ -1,96 +0,0 @@
# mmocr-style detector config: a DBNet head on a 'CLIPResNet' ResNet-50
# backbone with deformable convs, initialized from the released DBNet++
# ICDAR2015 checkpoint (see `init_cfg` below).
model = dict(
    type='DBNet',
    backbone=dict(
        type='CLIPResNet',
        depth=50,
        num_stages=4,
        # Expose all four stage outputs for the FPN-style neck.
        out_indices=(0, 1, 2, 3),
        frozen_stages=-1,  # -1: no stage frozen, train everything
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=False,
        style='pytorch',
        # Deformable convolution v2; enabled per stage via `stage_with_dcn`.
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        # init_cfg=dict(
        #     type='Pretrained',
        #     checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'),
        stage_with_dcn=(False, True, True, True),
    ),
    neck=dict(
        type='FPNC',
        # Channel widths of the four ResNet-50 stage outputs.
        in_channels=[256, 512, 1024, 2048],
        lateral_channels=256,
        # Adaptive Scale Fusion — the "++" part of DBNet++.
        asf_cfg=dict(attention_type='ScaleChannelSpatial'),
    ),
    det_head=dict(
        type='DBHead',
        in_channels=256,
        module_loss=dict(type='DBModuleLoss'),
        postprocessor=dict(
            type='DBPostprocessor', text_repr_type='quad',
            epsilon_ratio=0.002,
        ),
    ),
    data_preprocessor=dict(
        type='TextDetDataPreprocessor',
        # Standard ImageNet mean/std (RGB order after `bgr_to_rgb`).
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        pad_size_divisor=32,  # pad so H/W are multiples of the total stride
    ),
    # Full-model pretrained weights (DBNet++ R50-oCLIP, ICDAR2015).
    init_cfg=dict(
        type='Pretrained',
        checkpoint='https://download.openmmlab.com/mmocr/textdet/dbnetpp/'
        'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/'
        'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth',
    )
)
# Training-time data pipeline: load + annotations, photometric and geometric
# augmentation, then crop/resize/pad to a fixed 640x640 input.
train_pipeline = [
    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
    dict(
        type='LoadOCRAnnotations',
        with_bbox=True,
        with_polygon=True,
        with_label=True,
    ),
    # Color jitter via torchvision; brightness given as a 0-1 fraction.
    dict(
        type='TorchVisionWrapper',
        op='ColorJitter',
        brightness=32.0 / 255,
        saturation=0.5),
    # imgaug-based flips, small rotations and random rescaling.
    dict(
        type='ImgAugWrapper',
        args=[['Fliplr', 0.5],
              dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
    dict(type='RandomCrop', min_side_ratio=0.1),
    dict(type='Resize', scale=(640, 640), keep_ratio=True),
    dict(type='Pad', size=(640, 640)),
    dict(
        type='PackTextDetInputs',
        meta_keys=('img_path', 'ori_shape', 'img_shape'))
]
# Inference-time pipeline: no annotations are loaded (this config is used for
# prediction only — see the commented-out LoadOCRAnnotations step).
test_pipeline = [
    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
    dict(type='Resize', scale=(4068, 1024), keep_ratio=True),
    # dict(
    #     type='LoadOCRAnnotations',
    #     with_polygon=True,
    #     with_bbox=True,
    #     with_label=True,
    # ),
    dict(
        type='PackTextDetInputs',
        # meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor',
        #            'instances'),
        meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'),
    )
]
# Visualization: write rendered predictions through the local (filesystem)
# backend; built via VISUALIZERS.build(cfg.visualizer) by the debug script.
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='TextDetLocalVisualizer',
    name='visualizer',
    vis_backends=vis_backends,
)

View file

@ -1,108 +0,0 @@
import os
import os.path as osp
from PIL import Image
import numpy as np
import torch
import mmcv
import mmengine
from mmengine import Config
from mmengine.registry import DefaultScope
from mmengine.dataset import Compose, default_collate
from mmocr.registry import MODELS, VISUALIZERS
def visualize(visualizer,
              inputs,
              preds,
              # return_vis: bool = False,
              show: bool = False,
              wait_time: int = 0,
              draw_pred: bool = True,
              pred_score_thr: float = 0.3,
              save_vis: bool = False,
              img_out_dir: str = ''):
    """Render each prediction on top of its input image.

    Args:
        visualizer: An mmocr visualizer providing ``add_datasample``.
        inputs (List[Union[str, np.ndarray]]): Image paths or BGR arrays.
        preds (List[Dict]): Model predictions, one per input.
        show (bool): Display each result in a popup window. Defaults to False.
        wait_time (float): Interval between popups in seconds. Defaults to 0.
        draw_pred (bool): Draw predicted bounding boxes. Defaults to True.
        pred_score_thr (float): Minimum box score to draw. Defaults to 0.3.
        save_vis (bool): Save rendered images to ``img_out_dir``.
            Defaults to False.
        img_out_dir (str): Output directory; no file is written when empty.
            Defaults to ''.

    Returns:
        List[np.ndarray] or None: Per-input visualizations where applicable.
    """

    def _as_rgb(item):
        # Accept either a path (read + decode) or an in-memory BGR array.
        if isinstance(item, str):
            raw = mmengine.fileio.get(item)
            return mmcv.imfrombytes(raw, channel_order='rgb')
        if isinstance(item, np.ndarray):
            return item.copy()[:, :, ::-1]  # to RGB
        raise ValueError('Unsupported input type: '
                         f'{type(item)}')

    rendered = []
    for item, prediction in zip(inputs, preds):
        img = _as_rgb(item)
        stem = osp.splitext(osp.basename(prediction.img_path))[0]
        if save_vis and img_out_dir:
            out_file = osp.join(img_out_dir, f'{osp.splitext(stem)[0]}.jpg')
        else:
            out_file = None
        rendered.append(
            visualizer.add_datasample(
                stem,
                img,
                prediction,
                show=show,
                wait_time=wait_time,
                draw_gt=False,  # inference only: no ground truth available
                draw_pred=draw_pred,
                pred_score_thr=pred_score_thr,
                out_file=out_file,
            ))
    return rendered
# Debug driver: build the DBNet++ model from the local config, run it on one
# demo image and pop up the visualized detections. Side effects throughout
# (downloads weights, reads files, opens a window) — not importable as a lib.
cfg = Config.fromfile('./tools/scoring/ocr/dbnetpp_debug.py')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
# Route registry lookups to the 'mmocr' scope before building components.
DefaultScope.get_instance('ocr', scope_name='mmocr')
model = MODELS.build(cfg.model)
model.init_weights()  # loads the pretrained checkpoint from cfg init_cfg
model.to(device)
pipeline = Compose(cfg.test_pipeline)
visualizer = VISUALIZERS.build(cfg.visualizer)
inputs = {
    'img_path': './assets/images/ocr/demo_text_ocr.jpg',
    # 'img_path': './assets/images/ocr/demo_text_det.jpg',
}
results = pipeline(inputs)
results['index'] = 0
# imgs = results['inputs'].unsqueeze(0)
# pred = model.predict(imgs, results['data_samples'])
data = default_collate([results])  # list[Dict] to Dict
with torch.no_grad():
    pred = model.test_step(data)
vis_results = visualize(visualizer, [x.img_path for x in data['data_samples']], pred, show=True)
# NOTE(review): leftover breakpoint anchor from interactive debugging.
x = 0

View file

@ -77,7 +77,7 @@ class VideoTextDataset(torch.utils.data.Dataset):
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=16, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()

View file

@ -58,15 +58,20 @@ class VideoTextDataset(torch.utils.data.Dataset):
return len(self.meta)
def parse_args():
    """Collect command-line options for the scoring run.

    Returns:
        argparse.Namespace: ``meta_path`` (positional CSV path), ``bs``
        (batch size, default 4) and ``num_workers`` (default 16).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("meta_path", type=str, help="Path to the input CSV file")
    cli.add_argument("--bs", type=int, default=4, help="Batch size")
    cli.add_argument("--num_workers", type=int, default=16, help="Number of workers")
    return cli.parse_args()
def main():
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
colossalai.launch_from_torch({})
parser = argparse.ArgumentParser()
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
parser.add_argument("--bs", type=int, default=4, help="Batch size")
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
args = parser.parse_args()
args = parse_args()
meta_path = args.meta_path
wo_ext, ext = os.path.splitext(meta_path)