refactor: Simplify CLI, remove optional features, and streamline summary generation

2025-01-03 14:54:45 +01:00
parent 1fbe9094c2
commit d3451c19e4
1 changed file with 60 additions and 71 deletions
--- a/summarize_yt/cli.py
+++ b/summarize_yt/cli.py
@@ -1,30 +1,47 @@
 # Standard library imports remain the same
 import argparse
 import os
 from pathlib import Path
 import re
 import signal
 import sys
 from datetime import datetime
 from yt_dlp import YoutubeDL
 from pathlib import Path
 from typing import Optional, Tuple
 # Third-party packages
 import anthropic
 from yt_dlp import YoutubeDL
 # Local modules
 from . import __version__
 # Add cost estimation constants
-CLAUDE_COST_PER_1K_INPUT = 0.015  # Cost per 1K tokens for input
+CLAUDE_COST_PER_1K_INPUT = 0.80 / 1000
-CLAUDE_COST_PER_1K_OUTPUT = 0.075  # Cost per 1K tokens for output
+CLAUDE_COST_PER_1K_OUTPUT = 4 / 1000
-ESTIMATED_TOKENS_PER_CHAR = 0.25  # Rough estimate of tokens per character
+ESTIMATED_TOKENS_PER_CHAR = 0.25
 HAIKU_OUTPUT_TOKENS = 50  # Haiku summaries are very short
-def estimate_api_cost(text: str) -> float:
+def setup_terminal_control():
    """Set up terminal control at program start."""
    try:
        # Put process in its own process group and take control of terminal
        os.setpgrp()
        # Ignore terminal control signals
        signal.signal(signal.SIGTTOU, signal.SIG_IGN)
        signal.signal(signal.SIGTTIN, signal.SIG_IGN)
        signal.signal(signal.SIGTSTP, signal.SIG_IGN)
        # Take control of terminal if we're running in one
        if sys.stdin.isatty():
            import termios
            termios.tcsetpgrp(sys.stdin.fileno(), os.getpgrp())  # type: ignore
    except Exception:
        # If we can't get terminal control, just continue
        pass
 def estimate_api_cost(text: str, target_words: float) -> float:
    """Estimate the cost of sending text to Claude API."""
    estimated_input_tokens = len(text) * ESTIMATED_TOKENS_PER_CHAR
-    estimated_output_tokens = HAIKU_OUTPUT_TOKENS
+    estimated_output_tokens = target_words
    input_cost = (estimated_input_tokens / 1000) * CLAUDE_COST_PER_1K_INPUT
    output_cost = (estimated_output_tokens / 1000) * CLAUDE_COST_PER_1K_OUTPUT
@@ -47,7 +64,7 @@ def sanitize_filename(url: str) -> str:
    return f"{timestamp}_{video_id if video_id else 'video'}"
-def ensure_yts_dirs() -> Tuple[Path, Path]:
+def ensure_yts_dirs() -> tuple[Path, Path]:
    """Create and return paths to transcript and summary directories."""
    base_dir = Path.home() / ".yts"
    transcript_dir = base_dir / "transcripts"
@@ -57,23 +74,23 @@ def ensure_yts_dirs() -> Tuple[Path, Path]:
    return transcript_dir, summary_dir
-def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path], int]:
+def download_subtitles(url: str, quiet: bool = False) -> tuple[bool, Path | None, int]:
    """Download subtitles from YouTube using yt-dlp and return success, path, and duration in minutes."""
    try:
        ydl_opts = {
-            'skip_download': True,
+            "skip_download": True,
-            'writeautomaticsub': True,
+            "writeautomaticsub": True,
-            'subtitleslangs': ['en'],
+            "subtitleslangs": ["en"],
-            'quiet': quiet,
+            "quiet": quiet,
-            'no_warnings': quiet,
+            "no_warnings": quiet,
        }
        # First get video duration
        info_opts = dict(ydl_opts)
-        info_opts['extract_flat'] = True
+        info_opts["extract_flat"] = True
        with YoutubeDL(info_opts) as ydl:
            info = ydl.extract_info(url, download=False)
-            duration_mins = int(info.get('duration', 0) / 60)
+            duration_mins = int(info.get("duration", 0) / 60)  # type: ignore
        if not quiet:
            print(f"Debug: Downloading subtitles for {url}")
@@ -99,6 +116,7 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
        if not quiet:
            print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}")
            import traceback
            traceback.print_exc()
        return False, None, 0
@@ -151,7 +169,7 @@ def process_file(input_path: Path, output_path: Path | None = None, quiet: bool
        return ""
-def save_transcript(text: str, url: str, prompt: str) -> Path:
+def save_transcript(text: str, url: str) -> Path:
    """Save transcript with metadata to ~/.yts/transcripts/."""
    transcript_dir, _ = ensure_yts_dirs()
    filename = sanitize_filename(url) + ".txt"
@@ -160,7 +178,6 @@ def save_transcript(text: str, url: str, prompt: str) -> Path:
    metadata = f"""URL: {url}
 Script Version: {__version__}
 Timestamp: {datetime.now().isoformat()}
 Claude Prompt: {prompt}
 ---
 """
@@ -170,7 +187,7 @@ Claude Prompt: {prompt}
    return filepath
-def cleanup_files(vtt_path: Optional[Path]):
+def cleanup_files(vtt_path: Path | None):
    """Remove downloaded files after processing."""
    try:
        if vtt_path and vtt_path.exists():
@@ -185,7 +202,7 @@ def cleanup_files(vtt_path: Optional[Path]):
        print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr)
-def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -> str:
+def get_summary_from_claude(text: str, target_words: float) -> str:
    """Send text to Claude API for summarization."""
    try:
        api_key = os.environ.get("ANTHROPIC_API_KEY")
@@ -193,20 +210,17 @@ def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -
            raise ValueError("ANTHROPIC_API_KEY environment variable not set")
        client = anthropic.Anthropic()
        # Calculate target word count based on duration
        target_words = max(500, (duration_mins // 10) * 500)
-        if prompt is None:
+        prompt = f"Please summarize this transcript in {target_words} or less."
            prompt = "Please summarize this transcript as a haiku. A haiku is a three-line poem with 5 syllables in the first line, 7 syllables in the second line, and 5 syllables in the third line."
        message = client.messages.create(
-            model="claude-3-sonnet-20240229",
+            model="claude-3-5-haiku-latest",
            max_tokens=2048,  # Increased for longer summaries
            temperature=0,
-            system="You are a helpful assistant that summarizes transcripts accurately and concisely.",
+            system="You are a helpful assistant that summarizes transcripts accurately.",
            messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}],
        )
-        return message.content[0].text
+        return message.content[0].text  # type: ignore
    except Exception as e:
        print(f"Error getting summary from Claude: {str(e)}", file=sys.stderr)
@@ -214,16 +228,12 @@ def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -
 def main():
    setup_terminal_control()
    parser = argparse.ArgumentParser(description="Download YouTube subtitles and get Claude summary")
    parser.add_argument("url", help="YouTube video URL", type=str)
    parser.add_argument("-o", "--output", help="Output file for summary")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress status messages")
    parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files")
    parser.add_argument("--transcript", action="store_true", help="Include full transcript in output")
    parser.add_argument(
        "--prompt", help="Custom prompt for Claude (default: auto-calculated based on video length)"
    )
    parser.add_argument("-y", "--yes", action="store_true", help="Skip cost confirmation")
    args = parser.parse_args()
@@ -234,36 +244,24 @@ def main():
    if not success:
        cleanup_files(None)
        sys.exit(1)
    target_words = max(500, (duration_mins // 10) * 500)
    # Process the VTT file
-    cleaned_text = process_file(vtt_path, None, args.quiet)
+    cleaned_text = process_file(vtt_path, None, args.quiet)  # type: ignore
    # Save transcript
-    transcript_path = save_transcript(cleaned_text, args.url, args.prompt)
+    transcript_path = save_transcript(cleaned_text, args.url)
    print(f"\nTranscript saved to: {transcript_path}")
    # Estimate and display cost
-    estimated_cost = estimate_api_cost(cleaned_text)
+    estimated_cost = estimate_api_cost(cleaned_text, target_words)
    print(f"\nEstimated API cost: ${estimated_cost:.4f}")
    if not args.yes:
        try:
            response = input("\nDo you want to proceed with getting the summary? (Y/n): ").strip().lower()
            if response == 'n':
                print("Operation cancelled by user.")
                cleanup_files(vtt_path)
                sys.exit(0)
        except (EOFError, KeyboardInterrupt):
            print("\nOperation cancelled by user.")
            cleanup_files(vtt_path)
            sys.exit(0)
    # Get summary from Claude
    if not args.quiet:
        print("\nGetting summary from Claude...")
-    summary = get_summary_from_claude(cleaned_text, duration_mins, args.prompt)
+    summary = get_summary_from_claude(cleaned_text, target_words)
    # Prepare output
    output = ""
    if args.transcript:
        output += "=== Full Transcript ===\n\n"
@@ -278,20 +276,11 @@ def main():
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(output)
    print(f"\nSummary saved to: {summary_path}")
    # Print the summary to console
    print("\n=== Summary ===\n")
    print(summary)
-    print()  # Extra newline for readability
+    print()
    print(f"\nSummary saved to: {summary_path}")
    # If output path specified, also save there
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(output)
        print(f"Summary also saved to: {args.output}")
    # Cleanup downloaded files unless --keep-files is specified
    if not args.keep_files:
        cleanup_files(vtt_path)