mirror of
https://github.com/hpcaitech/Open-Sora.git
synced 2026-05-05 08:07:39 +02:00
Merge branch 'dev/v1.1' of github.com:hpcaitech/Open-Sora-dev into dev/v1.1
This commit is contained in:
commit
63e042cc43
|
|
@ -32,7 +32,7 @@ With `meta.csv` containing the paths to the videos, run the following command:
|
|||
|
||||
```bash
|
||||
# output: meta_aes.csv
|
||||
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference meta.csv --bs 1024 --num_workers 16
|
||||
torchrun --nproc_per_node 8 -m tools.scoring.aesthetic.inference /path/to/meta.csv --bs 1024 --num_workers 16
|
||||
```
|
||||
|
||||
This will generate multiple part files; use `python -m tools.datasets.csvutil DATA1.csv DATA2.csv` to merge them.
|
||||
|
|
@ -51,7 +51,7 @@ wget https://s3.eu-central-1.amazonaws.com/avg-projects/unimatch/pretrained/gmfl
|
|||
Then run:
|
||||
|
||||
```bash
|
||||
torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py --meta_path /path/to/meta.csv
|
||||
torchrun --standalone --nproc_per_node 8 tools/scoring/optical_flow/inference.py /path/to/meta.csv
|
||||
```
|
||||
|
||||
The output should be `/path/to/meta_flow.csv` with column `flow`.
|
||||
|
|
@ -64,7 +64,7 @@ For videos, we compute the matching score of the middle frame and the caption.
|
|||
**Make sure** meta files contain the column `text`, which is the caption of the sample. Then run:
|
||||
|
||||
```bash
|
||||
torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py --meta_path /path/to/meta.csv
|
||||
torchrun --standalone --nproc_per_node 8 tools/scoring/matching/inference.py /path/to/meta.csv
|
||||
```
|
||||
|
||||
The output should be `/path/to/meta_match.csv` with column `match`. Higher matching scores indicate better image-text/video-text alignment.
|
||||
|
|
|
|||
|
|
@ -58,13 +58,18 @@ def merge_scores(gathered_list: list, meta: pd.DataFrame):
|
|||
meta.loc[unique_indices, "match"] = flat_scores[unique_indices_idx]
|
||||
|
||||
|
||||
def main():
|
||||
colossalai.launch_from_torch({})
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("--bs", type=int, default=16, help="Batch size")
|
||||
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
colossalai.launch_from_torch({})
|
||||
args = parse_args()
|
||||
|
||||
meta_path = args.meta_path
|
||||
wo_ext, ext = os.path.splitext(meta_path)
|
||||
|
|
|
|||
|
|
@ -1,96 +0,0 @@
|
|||
model = dict(
|
||||
type='DBNet',
|
||||
backbone=dict(
|
||||
type='CLIPResNet',
|
||||
depth=50,
|
||||
num_stages=4,
|
||||
out_indices=(0, 1, 2, 3),
|
||||
frozen_stages=-1,
|
||||
norm_cfg=dict(type='BN', requires_grad=True),
|
||||
norm_eval=False,
|
||||
style='pytorch',
|
||||
dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
|
||||
# init_cfg=dict(
|
||||
# type='Pretrained',
|
||||
# checkpoint='https://download.openmmlab.com/mmocr/backbone/resnet50-oclip-7ba0c533.pth'),
|
||||
stage_with_dcn=(False, True, True, True),
|
||||
),
|
||||
neck=dict(
|
||||
type='FPNC',
|
||||
in_channels=[256, 512, 1024, 2048],
|
||||
lateral_channels=256,
|
||||
asf_cfg=dict(attention_type='ScaleChannelSpatial'),
|
||||
),
|
||||
det_head=dict(
|
||||
type='DBHead',
|
||||
in_channels=256,
|
||||
module_loss=dict(type='DBModuleLoss'),
|
||||
postprocessor=dict(
|
||||
type='DBPostprocessor', text_repr_type='quad',
|
||||
epsilon_ratio=0.002,
|
||||
),
|
||||
),
|
||||
data_preprocessor=dict(
|
||||
type='TextDetDataPreprocessor',
|
||||
mean=[123.675, 116.28, 103.53],
|
||||
std=[58.395, 57.12, 57.375],
|
||||
bgr_to_rgb=True,
|
||||
pad_size_divisor=32,
|
||||
),
|
||||
init_cfg=dict(
|
||||
type='Pretrained',
|
||||
checkpoint='https://download.openmmlab.com/mmocr/textdet/dbnetpp/'
|
||||
'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015/'
|
||||
'dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015_20221101_124139-4ecb39ac.pth',
|
||||
)
|
||||
)
|
||||
|
||||
train_pipeline = [
|
||||
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
||||
dict(
|
||||
type='LoadOCRAnnotations',
|
||||
with_bbox=True,
|
||||
with_polygon=True,
|
||||
with_label=True,
|
||||
),
|
||||
dict(
|
||||
type='TorchVisionWrapper',
|
||||
op='ColorJitter',
|
||||
brightness=32.0 / 255,
|
||||
saturation=0.5),
|
||||
dict(
|
||||
type='ImgAugWrapper',
|
||||
args=[['Fliplr', 0.5],
|
||||
dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
|
||||
dict(type='RandomCrop', min_side_ratio=0.1),
|
||||
dict(type='Resize', scale=(640, 640), keep_ratio=True),
|
||||
dict(type='Pad', size=(640, 640)),
|
||||
dict(
|
||||
type='PackTextDetInputs',
|
||||
meta_keys=('img_path', 'ori_shape', 'img_shape'))
|
||||
]
|
||||
|
||||
test_pipeline = [
|
||||
dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
|
||||
dict(type='Resize', scale=(4068, 1024), keep_ratio=True),
|
||||
# dict(
|
||||
# type='LoadOCRAnnotations',
|
||||
# with_polygon=True,
|
||||
# with_bbox=True,
|
||||
# with_label=True,
|
||||
# ),
|
||||
dict(
|
||||
type='PackTextDetInputs',
|
||||
# meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor',
|
||||
# 'instances'),
|
||||
meta_keys=('img_path', 'ori_shape', 'img_shape', 'scale_factor'),
|
||||
)
|
||||
]
|
||||
|
||||
# Visualization
|
||||
vis_backends = [dict(type='LocalVisBackend')]
|
||||
visualizer = dict(
|
||||
type='TextDetLocalVisualizer',
|
||||
name='visualizer',
|
||||
vis_backends=vis_backends,
|
||||
)
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
import os
|
||||
import os.path as osp
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
import mmcv
|
||||
import mmengine
|
||||
from mmengine import Config
|
||||
from mmengine.registry import DefaultScope
|
||||
from mmengine.dataset import Compose, default_collate
|
||||
from mmocr.registry import MODELS, VISUALIZERS
|
||||
|
||||
|
||||
def visualize(visualizer,
|
||||
inputs,
|
||||
preds,
|
||||
# return_vis: bool = False,
|
||||
show: bool = False,
|
||||
wait_time: int = 0,
|
||||
draw_pred: bool = True,
|
||||
pred_score_thr: float = 0.3,
|
||||
save_vis: bool = False,
|
||||
img_out_dir: str = ''):
|
||||
"""Visualize predictions.
|
||||
|
||||
Args:
|
||||
inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer.
|
||||
preds (List[Dict]): Predictions of the model.
|
||||
return_vis (bool): Whether to return the visualization result.
|
||||
Defaults to False.
|
||||
show (bool): Whether to display the image in a popup window.
|
||||
Defaults to False.
|
||||
wait_time (float): The interval of show (s). Defaults to 0.
|
||||
draw_pred (bool): Whether to draw predicted bounding boxes.
|
||||
Defaults to True.
|
||||
pred_score_thr (float): Minimum score of bboxes to draw.
|
||||
Defaults to 0.3.
|
||||
save_vis (bool): Whether to save the visualization result. Defaults
|
||||
to False.
|
||||
img_out_dir (str): Output directory of visualization results.
|
||||
If left as empty, no file will be saved. Defaults to ''.
|
||||
|
||||
Returns:
|
||||
List[np.ndarray] or None: Returns visualization results only if
|
||||
applicable.
|
||||
"""
|
||||
results = []
|
||||
|
||||
for single_input, pred in zip(inputs, preds):
|
||||
if isinstance(single_input, str):
|
||||
img_bytes = mmengine.fileio.get(single_input)
|
||||
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
|
||||
elif isinstance(single_input, np.ndarray):
|
||||
img = single_input.copy()[:, :, ::-1] # to RGB
|
||||
else:
|
||||
raise ValueError('Unsupported input type: '
|
||||
f'{type(single_input)}')
|
||||
img_name = osp.splitext(osp.basename(pred.img_path))[0]
|
||||
|
||||
if save_vis and img_out_dir:
|
||||
out_file = osp.splitext(img_name)[0]
|
||||
out_file = f'{out_file}.jpg'
|
||||
out_file = osp.join(img_out_dir, out_file)
|
||||
else:
|
||||
out_file = None
|
||||
|
||||
visualization = visualizer.add_datasample(
|
||||
img_name,
|
||||
img,
|
||||
pred,
|
||||
show=show,
|
||||
wait_time=wait_time,
|
||||
draw_gt=False,
|
||||
draw_pred=draw_pred,
|
||||
pred_score_thr=pred_score_thr,
|
||||
out_file=out_file,
|
||||
)
|
||||
results.append(visualization)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
cfg = Config.fromfile('./tools/scoring/ocr/dbnetpp_debug.py')
|
||||
|
||||
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||
print(device)
|
||||
DefaultScope.get_instance('ocr', scope_name='mmocr')
|
||||
|
||||
model = MODELS.build(cfg.model)
|
||||
model.init_weights()
|
||||
model.to(device)
|
||||
pipeline = Compose(cfg.test_pipeline)
|
||||
visualizer = VISUALIZERS.build(cfg.visualizer)
|
||||
|
||||
inputs = {
|
||||
'img_path': './assets/images/ocr/demo_text_ocr.jpg',
|
||||
# 'img_path': './assets/images/ocr/demo_text_det.jpg',
|
||||
}
|
||||
results = pipeline(inputs)
|
||||
results['index'] = 0
|
||||
# imgs = results['inputs'].unsqueeze(0)
|
||||
# pred = model.predict(imgs, results['data_samples'])
|
||||
data = default_collate([results]) # list[Dict] to Dict
|
||||
with torch.no_grad():
|
||||
pred = model.test_step(data)
|
||||
vis_results = visualize(visualizer, [x.img_path for x in data['data_samples']], pred, show=True)
|
||||
x = 0
|
||||
|
|
@ -77,7 +77,7 @@ class VideoTextDataset(torch.utils.data.Dataset):
|
|||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("--bs", type=int, default=16, help="Batch size")
|
||||
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
|
||||
args = parser.parse_args()
|
||||
|
|
|
|||
|
|
@ -58,15 +58,20 @@ class VideoTextDataset(torch.utils.data.Dataset):
|
|||
return len(self.meta)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("--bs", type=int, default=4, help="Batch size")
|
||||
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
torch.backends.cudnn.deterministic = True
|
||||
torch.backends.cudnn.benchmark = False
|
||||
colossalai.launch_from_torch({})
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--meta_path", type=str, help="Path to the input CSV file")
|
||||
parser.add_argument("--bs", type=int, default=4, help="Batch size")
|
||||
parser.add_argument("--num_workers", type=int, default=16, help="Number of workers")
|
||||
args = parser.parse_args()
|
||||
args = parse_args()
|
||||
|
||||
meta_path = args.meta_path
|
||||
wo_ext, ext = os.path.splitext(meta_path)
|
||||
|
|
|
|||
Loading…
Reference in a new issue