Open-Sora/copy_over_caption.py
2024-04-11 13:34:02 +08:00

47 lines
1.5 KiB
Python

import argparse
import json
import pandas as pd
from pandarallel import pandarallel
def process_csv(original_csv_path, original_video_format="mp4"):
pandarallel.initialize(progress_bar=True)
# Construct the new CSV file name by adding '_caption' before the '.csv' extension
caption_file = original_csv_path.replace(".csv", "_caption.csv")
df = pd.read_csv(original_csv_path)
# Add a new column for captions initialized with 'None'
df["caption"] = "None"
def process_row(row):
path = row["path"]
json_path = path.replace(original_video_format, "json")
with open(json_path, "r") as f:
json_data = json.load(f)
row["caption"] = json_data["caption"]
return row
# Iterate over each row to replace video format with json in the path, and extract captions
df = df.parallel_apply(process_row, axis=1)
# Save the modified DataFrame to a new CSV file
df.to_csv(caption_file, index=False)
print(f"New CSV file with captions is saved as {caption_file}")
if __name__ == "__main__":
# Setup argument parser
parser = argparse.ArgumentParser(description="Process a CSV file to add video captions.")
parser.add_argument("csv_path", type=str, help="The path to the original CSV file.")
parser.add_argument("video_format", nargs="?", default="mp4", help="The original video format (default: mp4).")
# Parse arguments
args = parser.parse_args()
# Call the function with the provided arguments
process_csv(args.csv_path, args.video_format)