DEV Community

drake
drake

Posted on

faster whisper从多媒体语音材料中抽取出文本-2

为脚本添加每个音频的时长统计,以及每个音频转换所用的耗时统计

安装依赖

pip install faster-whisper pydub 
Enter fullscreen mode Exit fullscreen mode
""" 批量转录当前目录下的 .mp3 文件,使用 faster-whisper 新增功能: - 每个音频的时长(秒) - 每个音频的转录耗时(秒) - 总计统计:总音频时长、总转录耗时、平均实时倍率 """ import os import sys import time from pathlib import Path from typing import List, Tuple from faster_whisper import WhisperModel from pydub import AudioSegment # ================== 配置区 ================== MODEL_SIZE = "small" # 可选: tiny, base, small, medium, large DEVICE = "cpu" # cpu 或 cuda COMPUTE_TYPE = "int8" # int8, float16, float32 (CPU 推荐 int8) VAD_FILTER = True # 启用语音活动检测,去除静音 OUTPUT_FORMAT = "txt" # 只输出 .txt VERBOSE = True # 是否显示详细日志 # ===========================================  def get_audio_duration(audio_path: Path) -> float: """使用 pydub 获取音频时长(秒)""" try: audio = AudioSegment.from_file(str(audio_path)) return len(audio) / 1000.0 # 毫秒 → 秒  except Exception as e: print(f"无法获取 {audio_path.name} 时长: {e}", file=sys.stderr) return 0.0 def transcribe_audio( audio_path: Path, model: WhisperModel ) -> Tuple[str, float, float]: """ 转录单个音频文件 返回: (文本内容, 音频时长秒, 转录耗时秒) """ duration = get_audio_duration(audio_path) print(f"转录: {audio_path.name} ({duration:.2f}s) → {audio_path.stem}.txt") start_time = time.perf_counter() segments, info = model.transcribe( str(audio_path), language=None, # 自动检测  beam_size=5, vad_filter=VAD_FILTER, vad_parameters=dict(min_silence_duration_ms=500), word_timestamps=False, ) elapsed = time.perf_counter() - start_time text_lines = [] for segment in segments: line = segment.text.strip() text_lines.append(line) if VERBOSE: print(f"[{segment.start:06.2f}s --> {segment.end:06.2f}s] {line}", flush=True) return "\n".join(text_lines), duration, elapsed def format_time(seconds: float) -> str: """将秒数格式化为 h:mm:ss""" hours = int(seconds // 3600) minutes = int((seconds % 3600) // 60) secs = seconds % 60 return f"{hours}:{minutes:02d}:{secs:05.2f}" def main(): print("=== faster-whisper 批量转录(带时长与耗时统计)===") current_dir = Path(".") mp3_files = sorted(current_dir.glob("*.mp3")) if not mp3_files: print("未找到 .mp3 文件,退出。") return # 加载模型(只加载一次)  
print(f"正在加载模型 {MODEL_SIZE} ({DEVICE}, {COMPUTE_TYPE})...") model = WhisperModel(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE) processed = 0 total_audio_duration = 0.0 total_transcribe_time = 0.0 results = [] for mp3_path in mp3_files: txt_path = mp3_path.with_suffix(".txt") if txt_path.exists(): duration = get_audio_duration(mp3_path) print(f"跳过: {txt_path.name} 已存在 ({duration:.2f}s)") total_audio_duration += duration continue try: text, duration, elapsed = transcribe_audio(mp3_path, model) txt_path.write_text(text, encoding="utf-8") total_audio_duration += duration total_transcribe_time += elapsed processed += 1 rtf = elapsed / duration if duration > 0 else float('inf') print(f"完成: {mp3_path.name} | 时长 {duration:.2f}s | 耗时 {elapsed:.2f}s | RTF {rtf:.2f}x") results.append((mp3_path.name, duration, elapsed, rtf)) except Exception as e: print(f"错误转录 {mp3_path.name}: {e}", file=sys.stderr) # ================== 汇总统计 ==================  print("\n" + "=" * 60) print("转录完成汇总") print("=" * 60) print(f"成功处理文件数 : {processed}") print(f"总音频时长 : {format_time(total_audio_duration)}") print(f"总转录耗时 : {format_time(total_transcribe_time)}") if total_audio_duration > 0: avg_rtf = total_transcribe_time / total_audio_duration print(f"平均实时倍率(RTF): {avg_rtf:.2f}x") else: print(f"平均实时倍率(RTF): N/A") if results: print(f"\n明细列表:") print(f"{'文件名':<40} {'音频时长':>10} {'转录耗时':>10} {'RTF':>8}") print("-" * 70) for name, dur, ela, rtf in results: print(f"{name:<40} {dur:10.2f}s {ela:10.2f}s {rtf:8.2f}x") print("=" * 60) if __name__ == "__main__": main() 
Enter fullscreen mode Exit fullscreen mode

Top comments (0)