refactor: Simplify CLI, remove optional features, and streamline summary generation

This commit is contained in:
2025-01-03 14:54:45 +01:00
committed by Zev Averbach (aider)
parent 1fbe9094c2
commit d3451c19e4

View File

@@ -1,30 +1,47 @@
# Standard library imports remain the same
import argparse import argparse
import os import os
from pathlib import Path
import re import re
import signal
import sys import sys
from datetime import datetime from datetime import datetime
from yt_dlp import YoutubeDL
from pathlib import Path
from typing import Optional, Tuple
# Third-party packages
import anthropic import anthropic
from yt_dlp import YoutubeDL
# Local modules
from . import __version__ from . import __version__
# Add cost estimation constants # Add cost estimation constants
CLAUDE_COST_PER_1K_INPUT = 0.015 # Cost per 1K tokens for input CLAUDE_COST_PER_1K_INPUT = 0.80 / 1000
CLAUDE_COST_PER_1K_OUTPUT = 0.075 # Cost per 1K tokens for output CLAUDE_COST_PER_1K_OUTPUT = 4 / 1000
ESTIMATED_TOKENS_PER_CHAR = 0.25 # Rough estimate of tokens per character ESTIMATED_TOKENS_PER_CHAR = 0.25
HAIKU_OUTPUT_TOKENS = 50 # Haiku summaries are very short
def setup_terminal_control():
    """Best-effort attempt to make this process the terminal's foreground job.

    Steps, in order:
      1. Move the process into its own process group.
      2. Ignore the job-control signals SIGTTOU, SIGTTIN and SIGTSTP.
      3. If stdin is attached to a terminal, hand that terminal to the
         new process group.

    Any failure along the way is swallowed on purpose: the program is
    expected to keep running without terminal control (for example when a
    platform lacks these APIs -- presumably the reason ``termios`` is imported
    lazily; confirm against supported platforms).
    """
    job_control_signals = (signal.SIGTTOU, signal.SIGTTIN, signal.SIGTSTP)
    try:
        # Become the leader of a fresh process group.
        os.setpgrp()

        # Job-control signals must be ignored before touching the terminal's
        # foreground group below; see POSIX tcsetpgrp() for the SIGTTOU rules.
        for job_signal in job_control_signals:
            signal.signal(job_signal, signal.SIG_IGN)

        # Nothing more to do when stdin is not a terminal.
        if not sys.stdin.isatty():
            return

        import termios

        stdin_fd = sys.stdin.fileno()
        termios.tcsetpgrp(stdin_fd, os.getpgrp())  # type: ignore
    except Exception:
        # Terminal control is optional; carry on without it.
        pass
def estimate_api_cost(text: str, target_words: float) -> float:
"""Estimate the cost of sending text to Claude API.""" """Estimate the cost of sending text to Claude API."""
estimated_input_tokens = len(text) * ESTIMATED_TOKENS_PER_CHAR estimated_input_tokens = len(text) * ESTIMATED_TOKENS_PER_CHAR
estimated_output_tokens = HAIKU_OUTPUT_TOKENS estimated_output_tokens = target_words
input_cost = (estimated_input_tokens / 1000) * CLAUDE_COST_PER_1K_INPUT input_cost = (estimated_input_tokens / 1000) * CLAUDE_COST_PER_1K_INPUT
output_cost = (estimated_output_tokens / 1000) * CLAUDE_COST_PER_1K_OUTPUT output_cost = (estimated_output_tokens / 1000) * CLAUDE_COST_PER_1K_OUTPUT
@@ -47,7 +64,7 @@ def sanitize_filename(url: str) -> str:
return f"{timestamp}_{video_id if video_id else 'video'}" return f"{timestamp}_{video_id if video_id else 'video'}"
def ensure_yts_dirs() -> Tuple[Path, Path]: def ensure_yts_dirs() -> tuple[Path, Path]:
"""Create and return paths to transcript and summary directories.""" """Create and return paths to transcript and summary directories."""
base_dir = Path.home() / ".yts" base_dir = Path.home() / ".yts"
transcript_dir = base_dir / "transcripts" transcript_dir = base_dir / "transcripts"
@@ -57,23 +74,23 @@ def ensure_yts_dirs() -> Tuple[Path, Path]:
return transcript_dir, summary_dir return transcript_dir, summary_dir
def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path], int]: def download_subtitles(url: str, quiet: bool = False) -> tuple[bool, Path | None, int]:
"""Download subtitles from YouTube using yt-dlp and return success, path, and duration in minutes.""" """Download subtitles from YouTube using yt-dlp and return success, path, and duration in minutes."""
try: try:
ydl_opts = { ydl_opts = {
'skip_download': True, "skip_download": True,
'writeautomaticsub': True, "writeautomaticsub": True,
'subtitleslangs': ['en'], "subtitleslangs": ["en"],
'quiet': quiet, "quiet": quiet,
'no_warnings': quiet, "no_warnings": quiet,
} }
# First get video duration # First get video duration
info_opts = dict(ydl_opts) info_opts = dict(ydl_opts)
info_opts['extract_flat'] = True info_opts["extract_flat"] = True
with YoutubeDL(info_opts) as ydl: with YoutubeDL(info_opts) as ydl:
info = ydl.extract_info(url, download=False) info = ydl.extract_info(url, download=False)
duration_mins = int(info.get('duration', 0) / 60) duration_mins = int(info.get("duration", 0) / 60) # type: ignore
if not quiet: if not quiet:
print(f"Debug: Downloading subtitles for {url}") print(f"Debug: Downloading subtitles for {url}")
@@ -84,10 +101,10 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
# Find the downloaded VTT file # Find the downloaded VTT file
current_dir = Path(".") current_dir = Path(".")
vtt_files = list(current_dir.glob("*.en.vtt")) vtt_files = list(current_dir.glob("*.en.vtt"))
if not quiet: if not quiet:
print(f"Debug: Found VTT files: {vtt_files}") print(f"Debug: Found VTT files: {vtt_files}")
if not vtt_files: if not vtt_files:
print("No VTT file found after download", file=sys.stderr) print("No VTT file found after download", file=sys.stderr)
return False, None, 0 return False, None, 0
@@ -99,6 +116,7 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
if not quiet: if not quiet:
print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}") print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return False, None, 0 return False, None, 0
@@ -151,7 +169,7 @@ def process_file(input_path: Path, output_path: Path | None = None, quiet: bool
return "" return ""
def save_transcript(text: str, url: str, prompt: str) -> Path: def save_transcript(text: str, url: str) -> Path:
"""Save transcript with metadata to ~/.yts/transcripts/.""" """Save transcript with metadata to ~/.yts/transcripts/."""
transcript_dir, _ = ensure_yts_dirs() transcript_dir, _ = ensure_yts_dirs()
filename = sanitize_filename(url) + ".txt" filename = sanitize_filename(url) + ".txt"
@@ -160,7 +178,6 @@ def save_transcript(text: str, url: str, prompt: str) -> Path:
metadata = f"""URL: {url} metadata = f"""URL: {url}
Script Version: {__version__} Script Version: {__version__}
Timestamp: {datetime.now().isoformat()} Timestamp: {datetime.now().isoformat()}
Claude Prompt: {prompt}
--- ---
""" """
@@ -170,7 +187,7 @@ Claude Prompt: {prompt}
return filepath return filepath
def cleanup_files(vtt_path: Optional[Path]): def cleanup_files(vtt_path: Path | None):
"""Remove downloaded files after processing.""" """Remove downloaded files after processing."""
try: try:
if vtt_path and vtt_path.exists(): if vtt_path and vtt_path.exists():
@@ -185,7 +202,7 @@ def cleanup_files(vtt_path: Optional[Path]):
print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr) print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr)
def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -> str: def get_summary_from_claude(text: str, target_words: float) -> str:
"""Send text to Claude API for summarization.""" """Send text to Claude API for summarization."""
try: try:
api_key = os.environ.get("ANTHROPIC_API_KEY") api_key = os.environ.get("ANTHROPIC_API_KEY")
@@ -193,20 +210,17 @@ def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -
raise ValueError("ANTHROPIC_API_KEY environment variable not set") raise ValueError("ANTHROPIC_API_KEY environment variable not set")
client = anthropic.Anthropic() client = anthropic.Anthropic()
# Calculate target word count based on duration
target_words = max(500, (duration_mins // 10) * 500) prompt = f"Please summarize this transcript in {target_words} or less."
if prompt is None:
prompt = "Please summarize this transcript as a haiku. A haiku is a three-line poem with 5 syllables in the first line, 7 syllables in the second line, and 5 syllables in the third line."
message = client.messages.create( message = client.messages.create(
model="claude-3-sonnet-20240229", model="claude-3-5-haiku-latest",
max_tokens=2048, # Increased for longer summaries max_tokens=2048, # Increased for longer summaries
temperature=0, temperature=0,
system="You are a helpful assistant that summarizes transcripts accurately and concisely.", system="You are a helpful assistant that summarizes transcripts accurately.",
messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}], messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}],
) )
return message.content[0].text return message.content[0].text # type: ignore
except Exception as e: except Exception as e:
print(f"Error getting summary from Claude: {str(e)}", file=sys.stderr) print(f"Error getting summary from Claude: {str(e)}", file=sys.stderr)
@@ -214,16 +228,12 @@ def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -
def main(): def main():
setup_terminal_control()
parser = argparse.ArgumentParser(description="Download YouTube subtitles and get Claude summary") parser = argparse.ArgumentParser(description="Download YouTube subtitles and get Claude summary")
parser.add_argument("url", help="YouTube video URL", type=str) parser.add_argument("url", help="YouTube video URL", type=str)
parser.add_argument("-o", "--output", help="Output file for summary")
parser.add_argument("-q", "--quiet", action="store_true", help="Suppress status messages") parser.add_argument("-q", "--quiet", action="store_true", help="Suppress status messages")
parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files") parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files")
parser.add_argument("--transcript", action="store_true", help="Include full transcript in output") parser.add_argument("--transcript", action="store_true", help="Include full transcript in output")
parser.add_argument(
"--prompt", help="Custom prompt for Claude (default: auto-calculated based on video length)"
)
parser.add_argument("-y", "--yes", action="store_true", help="Skip cost confirmation")
args = parser.parse_args() args = parser.parse_args()
@@ -234,36 +244,24 @@ def main():
if not success: if not success:
cleanup_files(None) cleanup_files(None)
sys.exit(1) sys.exit(1)
target_words = max(500, (duration_mins // 10) * 500)
# Process the VTT file # Process the VTT file
cleaned_text = process_file(vtt_path, None, args.quiet) cleaned_text = process_file(vtt_path, None, args.quiet) # type: ignore
# Save transcript # Save transcript
transcript_path = save_transcript(cleaned_text, args.url, args.prompt) transcript_path = save_transcript(cleaned_text, args.url)
print(f"\nTranscript saved to: {transcript_path}") print(f"\nTranscript saved to: {transcript_path}")
# Estimate and display cost # Estimate and display cost
estimated_cost = estimate_api_cost(cleaned_text) estimated_cost = estimate_api_cost(cleaned_text, target_words)
print(f"\nEstimated API cost: ${estimated_cost:.4f}") print(f"\nEstimated API cost: ${estimated_cost:.4f}")
if not args.yes:
try:
response = input("\nDo you want to proceed with getting the summary? (Y/n): ").strip().lower()
if response == 'n':
print("Operation cancelled by user.")
cleanup_files(vtt_path)
sys.exit(0)
except (EOFError, KeyboardInterrupt):
print("\nOperation cancelled by user.")
cleanup_files(vtt_path)
sys.exit(0)
# Get summary from Claude # Get summary from Claude
if not args.quiet: if not args.quiet:
print("\nGetting summary from Claude...") print("\nGetting summary from Claude...")
summary = get_summary_from_claude(cleaned_text, duration_mins, args.prompt) summary = get_summary_from_claude(cleaned_text, target_words)
# Prepare output
output = "" output = ""
if args.transcript: if args.transcript:
output += "=== Full Transcript ===\n\n" output += "=== Full Transcript ===\n\n"
@@ -275,23 +273,14 @@ def main():
_, summary_dir = ensure_yts_dirs() _, summary_dir = ensure_yts_dirs()
summary_filename = sanitize_filename(args.url) + "_summary.txt" summary_filename = sanitize_filename(args.url) + "_summary.txt"
summary_path = summary_dir / summary_filename summary_path = summary_dir / summary_filename
with open(summary_path, "w", encoding="utf-8") as f: with open(summary_path, "w", encoding="utf-8") as f:
f.write(output) f.write(output)
print(f"\nSummary saved to: {summary_path}")
# Print the summary to console
print("\n=== Summary ===\n") print("\n=== Summary ===\n")
print(summary) print(summary)
print() # Extra newline for readability print()
print(f"\nSummary saved to: {summary_path}")
# If output path specified, also save there
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(output)
print(f"Summary also saved to: {args.output}")
# Cleanup downloaded files unless --keep-files is specified
if not args.keep_files: if not args.keep_files:
cleanup_files(vtt_path) cleanup_files(vtt_path)