I recently needed to build a program that automatically edits generated audio, video, and subtitles into a finished clip. That brings in subtitle placement, font-size adaptation, and the time matching and synchronization of subtitles, audio, and video. Here are the pitfalls I ran into.
My task is to stitch together automatically generated audio, video, and subtitles for a series of segments, so the first question is how to reconcile the audio and video durations. My initial approach used the video as the reference and time-stretched the audio to match it, but once implemented, the audio speed change noticeably hurt the overall viewing experience. So I switched to using the audio duration as the reference: if the video is longer, the audio is padded with silence; if the video is shorter, the video is extended by repeating (freezing) its last frame. This only works because the audio and video durations are always close, with a gap under 1 second, which is already guaranteed by the agent that generates the audio text.
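That "gap is always under 1 second" precondition is never actually checked in the script below. A small guard like the following (my own sketch; check_gap and MAX_GAP_SECONDS are not part of the original code) makes the assumption explicit before alignment runs, instead of silently freezing frames or padding silence for several seconds when an upstream step misbehaves:
MAX_GAP_SECONDS = 1.0  # the upstream agent is expected to keep audio/video within this gap

def check_gap(video_clip, audio_clip, name=""):
    # Fail fast if a generated pair drifts further apart than the pipeline assumes.
    gap = abs(audio_clip.duration - video_clip.duration)
    if gap > MAX_GAP_SECONDS:
        raise ValueError(f"{name}: audio/video durations differ by {gap:.2f}s (> {MAX_GAP_SECONDS}s)")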
A few problems that came up along the way:
Subtitle position and font-size adaptation
# Pick a reasonable line width for wrapping
max_chars_per_line = int(clip_width / (font_size * 0.6))  # adjust the ratio to the font's average glyph width
wrapped_subtitle = wrap_text(subtitle, max_chars_per_line)
# Create the TextClip
txt_clip = TextClip(wrapped_subtitle, fontsize=font_size, color='white', align='center',
                    size=(clip_width, None))
txt_clip = txt_clip.set_position(('center', 'bottom')).set_duration(video_clip.duration)
Matching audio, video, and subtitle timing
Subtitles are synchronized to the video timeline, which matches how people actually watch. The audio/video time-matching logic is as follows:
if audio_clip.duration > video_clip_duration:
    # Audio is longer than the video: pad the video by freezing the last frame
    freeze_duration = audio_clip.duration - video_clip_duration
    last_frame = video_clip.get_frame(video_clip_duration - 0.1)
    freeze_frame = ImageClip(last_frame).set_duration(freeze_duration).set_fps(video_clip.fps)
    video_clip = concatenate_videoclips([video_clip, freeze_frame])
elif audio_clip.duration < video_clip_duration:
    # Audio is shorter than the video: pad the remaining time with silence
    silence_duration = video_clip_duration - audio_clip.duration
    silence = AudioFileClip(blank_mp3_file).subclip(0, silence_duration)
    audio_clip = concatenate_audioclips([audio_clip, silence])
Audio noise in clips produced with moviepy
This turned out to be a version issue: moviepy `2.0.0` fixes it, but the version installed by default was `1.8`. My guess is that it comes from duration being computed differently from the actual video length: duration is measured in seconds, while the actual video length is derived from the frame count and the frame rate, so the appended frames never line up exactly, and over the overshoot the audio gets replayed as a short, stuttering noise.
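The real fix was upgrading, but if you are stuck on the older version, one mitigation I would try (my own sketch, not something the script below does; snap_to_frames is a hypothetical helper) is to snap the target duration to a whole number of frames before padding, so the appended audio cannot overshoot by a fraction of a frame:
def snap_to_frames(duration, fps):
    # Convert a duration in seconds to the length actually representable
    # at this frame rate: a whole number of frames divided by fps.
    n_frames = round(duration * fps)
    return n_frames / fps

# Example: pad/trim the audio against the frame-aligned video length
# rather than the raw video_clip.duration.
# target = snap_to_frames(video_clip.duration, video_clip.fps)
# audio_clip = audio_clip.subclip(0, min(audio_clip.duration, target))
The full script: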
import json
import os
import re
import shutil

from moviepy.editor import (VideoFileClip, AudioFileClip, concatenate_videoclips,
                            concatenate_audioclips, CompositeVideoClip, TextClip, ImageClip)
from pydub import AudioSegment
def extract_number(filename):
    # Extract the first number from a file name (used for sorting)
    numbers = re.findall(r'\d+', filename)
    return int(numbers[0]) if numbers else 0
def adjust_audio_speed(audio_path, target_duration):
    # Time-stretch the audio to the target duration (no longer used, kept for reference)
    audio = AudioSegment.from_file(audio_path)
    speed_change = audio.duration_seconds / target_duration
    new_audio = audio._spawn(audio.raw_data, overrides={
        "frame_rate": int(audio.frame_rate * speed_change)
    })
    new_audio = new_audio.set_frame_rate(audio.frame_rate)
    return new_audio
def merge_audio(audio_path):
    audio_files = [f for f in os.listdir(audio_path) if f.endswith('.mp3')]
    audio_groups = {}
    # Group files by their prefix (the part before the first '-')
    for audio in audio_files:
        category = audio.split('-')[0]
        if category not in audio_groups:
            audio_groups[category] = []
        audio_groups[category].append(audio)
    # Process each group
    for category, files in audio_groups.items():
        # Sort by file name to keep the correct order
        files.sort()
        combined = AudioSegment.empty()
        # Concatenate the audio files
        for file_name in files:
            audio_file_path = os.path.join(audio_path, file_name)
            sound = AudioSegment.from_mp3(audio_file_path)
            combined += sound
            # Delete the original file
            os.remove(audio_file_path)
        # Export the merged audio
        combined.export(os.path.join(audio_path, f"{category}.mp3"), format="mp3")
def merge_segment(video_path):
    # Concatenate 2 segments into one scene (a six-second video)
    # Video directory path
    video_dir = video_path
    # Collect all video files in the directory
    video_files = [f for f in os.listdir(video_dir) if f.endswith('.mp4')]
    # Group videos by category
    video_groups = {}
    for video in video_files:
        category = video.split('-')[0]
        if category not in video_groups:
            video_groups[category] = []
        video_groups[category].append(video)
    # Concatenate each group and delete the original files
    for category, videos in video_groups.items():
        # Sort by the numeric part of the file name so segments go 0, 1, 2, ...
        videos.sort(key=lambda x: int(x.split('-')[1].split('.')[0]))
        clips = [VideoFileClip(os.path.join(video_dir, v)) for v in videos]
        final_clip = concatenate_videoclips(clips, method="chain")
        output_filename = f"{category}.mp4"
        final_clip.write_videofile(os.path.join(video_dir, output_filename), codec='libx264')
        # Release resources
        for clip in clips:
            clip.close()
        final_clip.close()
        # Delete the original video files
        for video in videos:
            os.remove(os.path.join(video_dir, video))
    print("Segments concatenated and original videos removed.")
def extract_last_frame(video_path, output_path):
    # Grab the last frame of a video as an image (deprecated, no longer used)
    # Load the video
    clip = VideoFileClip(video_path)
    # Total duration of the video
    duration = clip.duration
    # Save the last frame as an image
    clip.save_frame(output_path, t=duration)
def load_subtitles(data):
    # Map scene id -> dialogue text, used later as the subtitle for that scene
    subtitles = {}
    for scene in data:
        subtitles[f"{scene['id']}"] = scene.get('dialogue', '')
    print(subtitles)
    return subtitles
def clip_auto(video_path, audio_path, temp_video_path, result_video_file, subtitles, blank_mp3_file):
    videos = sorted([file for file in os.listdir(video_path) if file.endswith('.mp4')])
    audios = sorted([file for file in os.listdir(audio_path) if file.endswith('.mp3')])
    temp_video_files = []
    for video_file, audio_file in zip(videos, audios):
        video_clip = VideoFileClip(os.path.join(video_path, video_file))
        video_clip_duration = video_clip.duration
        audio_file_path = os.path.join(audio_path, audio_file)
        audio_clip = AudioFileClip(audio_file_path)
        if audio_clip.duration > video_clip_duration:
            # Audio is longer than the video: pad the video by freezing the last frame
            freeze_duration = audio_clip.duration - video_clip_duration
            last_frame = video_clip.get_frame(video_clip_duration - 0.1)
            freeze_frame = ImageClip(last_frame).set_duration(freeze_duration).set_fps(video_clip.fps)
            video_clip = concatenate_videoclips([video_clip, freeze_frame])
        elif audio_clip.duration < video_clip_duration:
            # Audio is shorter than the video: pad the remaining time with silence
            silence_duration = video_clip_duration - audio_clip.duration
            silence = AudioFileClip(blank_mp3_file).subclip(0, silence_duration)
            audio_clip = concatenate_audioclips([audio_clip, silence])
        video_clip = video_clip.set_audio(audio_clip)
        subtitle_key = video_file.split('.')[0]
        subtitle = subtitles.get(subtitle_key, '')
        font_size = 25
        clip_width = video_clip.size[0]

        def wrap_text(text, width):
            import textwrap
            return '\n'.join(textwrap.wrap(text, width=width, break_long_words=False))

        # Pick a reasonable line width for wrapping
        max_chars_per_line = int(clip_width / (font_size * 0.6))  # adjust the ratio to the font's average glyph width
        wrapped_subtitle = wrap_text(subtitle, max_chars_per_line)
        # Create the TextClip
        txt_clip = TextClip(wrapped_subtitle, fontsize=font_size, color='white', align='center',
                            size=(clip_width, None))
        txt_clip = txt_clip.set_position(('center', 'bottom')).set_duration(video_clip.duration)
        video = CompositeVideoClip([video_clip, txt_clip])
        temp_output_file = os.path.join(temp_video_path, f"temp_{video_file}")
        video.write_videofile(temp_output_file, codec="libx264", fps=24)
        temp_video_files.append(temp_output_file)
    sorted_files = sorted(temp_video_files, key=lambda x: int(x.split('/')[-1].split('.')[0].split('_')[-1]))
    final_clips = [VideoFileClip(file) for file in sorted_files]
    final_clip = concatenate_videoclips(final_clips)
    final_clip.write_videofile(result_video_file, codec="libx264", fps=24)
    for clip in final_clips + [final_clip]:
        clip.close()
def copy_video(video_path, video_name, new_video_path):
    # Use the first two '-'-separated numbers of the video name as the new name
    new_video_name = "-".join(video_name.split("-")[:2]) + ".mp4"
    # Build the full destination path
    full_new_video_path = os.path.join(new_video_path, new_video_name)
    full_video_path = os.path.join(video_path, video_name)
    # Copy and rename the video file
    try:
        shutil.copy(full_video_path, full_new_video_path)
        print("chosen video copied:", full_new_video_path)
        return full_new_video_path
    except IOError as e:
        print("Failed to copy file:", e)
def copy_segment_video(scenes_data, trial_video_folder, video_folder):
    for scene in scenes_data:
        for segment in scene["segments"]:
            # Prefer the video entry keyed "1" if present, otherwise fall back to "0"
            if "1" in segment["vbench"][-1]["videos"][0]["0"]:
                key_name = "1"
            else:
                key_name = "0"
            print(segment["vbench"][-1]["videos"][0]["0"][key_name])
            copy_video(trial_video_folder, segment["vbench"][-1]["videos"][0]["0"][key_name], video_folder)
        if "transition" in scene:
            transition = scene["transition"]
            if "1" in transition["vbench"][-1]["videos"][0]["0"]:
                key_name = "1"
            else:
                key_name = "0"
            print(transition["vbench"][-1]["videos"][0]["0"][key_name])
            copy_video(trial_video_folder, transition["vbench"][-1]["videos"][0]["0"][key_name], video_folder)
def auto_clip_video(src, cfg):
    # Merge the audio segments
    merge_audio(cfg["paths"].get('audio_folder'))
    # Copy the chosen segment videos into video_folder
    copy_segment_video(src.get('scenes'), cfg["paths"].get('trial_video_folder'), cfg["paths"].get('video_folder'))
    # Merge two segments into one scene
    merge_segment(cfg["paths"].get('video_folder'))
    # Load the subtitles
    subtitles = load_subtitles(src.get('scenes'))
    # Synchronize audio, video, and subtitles
    clip_auto(cfg["paths"].get('video_folder'), cfg["paths"].get('audio_folder'), cfg["paths"].get('temp_video_folder'), cfg["paths"].get('result_video_file'), subtitles, cfg["paths"].get('blank_mp3_file'))
    print("Successfully synthesized final video!(clip_auto)")
    # Handle video_materials and temp_video
    # ...
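For reference, the cfg and src shapes that the functions above read from look roughly like this (the path values are placeholders of mine; only the keys actually accessed in the script are shown):
cfg = {
    "paths": {
        "audio_folder": "outputs/audio",
        "trial_video_folder": "outputs/trial_videos",
        "video_folder": "outputs/videos",
        "temp_video_folder": "outputs/temp",
        "result_video_file": "outputs/final.mp4",
        "blank_mp3_file": "assets/blank.mp3",
    }
}
# src is the scene JSON: src["scenes"] is a list of scenes, each with "id", "dialogue",
# "segments", and optionally "transition", as read by copy_segment_video and load_subtitles.
# auto_clip_video(src, cfg)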