refactor: Replace the yt-dlp subprocess call with the yt_dlp Python API (YoutubeDL) for subtitle download

This commit is contained in:
2025-01-03 13:52:50 +01:00
parent d4f96d72d8
commit 9a2f4d1c8c

View File

@@ -3,9 +3,9 @@ import argparse
import os
import re
import signal
import subprocess
import sys
from datetime import datetime
from yt_dlp import YoutubeDL
from pathlib import Path
from typing import Optional, Tuple
@@ -56,33 +56,27 @@ def ensure_transcript_dir() -> Path:
def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path]]:
"""Download subtitles from YouTube using yt-dlp."""
try:
cmd = ["yt-dlp", "--skip-download", "--write-auto-sub", "--sub-lang", "en"]
if quiet:
cmd.append("--quiet")
cmd.append(url)
ydl_opts = {
'skip_download': True,
'writeautomaticsub': True,
'subtitleslangs': ['en'],
'quiet': quiet,
'no_warnings': quiet,
}
print(f"Debug: Running command: {cmd}") # Debug line
if not quiet:
print(f"Debug: Downloading subtitles for {url}")
process = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True # This might help with encoding issues
)
try:
stdout, stderr = process.communicate(timeout=30)
print(f"Debug: stdout: {stdout}") # Debug line
print(f"Debug: stderr: {stderr}") # Debug line
if process.returncode != 0:
print(f"Error downloading subtitles: {stderr}", file=sys.stderr)
return False, None
except subprocess.TimeoutExpired:
process.kill()
print("Download timed out after 30 seconds", file=sys.stderr)
return False, None
with YoutubeDL(ydl_opts) as ydl:
ydl.download([url])
# Find the downloaded VTT file
current_dir = Path(".")
vtt_files = list(current_dir.glob("*.en.vtt"))
print(f"Debug: Found VTT files: {vtt_files}") # Debug line
if not quiet:
print(f"Debug: Found VTT files: {vtt_files}")
if not vtt_files:
print("No VTT file found after download", file=sys.stderr)
return False, None
@@ -91,10 +85,10 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
except Exception as e:
print(f"Error during subtitle download: {str(e)}", file=sys.stderr)
print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}") # Debug line
import traceback
traceback.print_exc() # This will print the full traceback
if not quiet:
print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}")
import traceback
traceback.print_exc()
return False, None