From 6d27e00f8aee3a06313b2698b9157fdd66c5ddec Mon Sep 17 00:00:00 2001 From: "Zev Averbach (aider)" Date: Fri, 3 Jan 2025 14:05:29 +0100 Subject: [PATCH] feat: Update summary generation to scale words based on video duration --- summarize_yt/cli.py | 63 ++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/summarize_yt/cli.py b/summarize_yt/cli.py index 8e7e0c8..1af1735 100644 --- a/summarize_yt/cli.py +++ b/summarize_yt/cli.py @@ -45,15 +45,18 @@ def sanitize_filename(url: str) -> str: return f"{timestamp}_{video_id if video_id else 'video'}" -def ensure_transcript_dir() -> Path: - """Create and return path to transcript directory.""" - transcript_dir = Path.home() / ".yts" / "transcripts" +def ensure_yts_dirs() -> Tuple[Path, Path]: + """Create and return paths to transcript and summary directories.""" + base_dir = Path.home() / ".yts" + transcript_dir = base_dir / "transcripts" + summary_dir = base_dir / "summaries" transcript_dir.mkdir(parents=True, exist_ok=True) - return transcript_dir + summary_dir.mkdir(parents=True, exist_ok=True) + return transcript_dir, summary_dir -def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path]]: - """Download subtitles from YouTube using yt-dlp.""" +def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path], int]: + """Download subtitles from YouTube using yt-dlp and return success, path, and duration in minutes.""" try: ydl_opts = { 'skip_download': True, @@ -62,6 +65,13 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa 'quiet': quiet, 'no_warnings': quiet, } + + # First get video duration + info_opts = dict(ydl_opts) + info_opts['extract_flat'] = True + with YoutubeDL(info_opts) as ydl: + info = ydl.extract_info(url, download=False) + duration_mins = int(info.get('duration', 0) / 60) if not quiet: print(f"Debug: Downloading subtitles for {url}") @@ -80,7 +90,7 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa print("No VTT file found after download", file=sys.stderr) return False, None - return True, vtt_files[0] + return True, vtt_files[0], duration_mins except Exception as e: print(f"Error during subtitle download: {str(e)}", file=sys.stderr) @@ -141,7 +151,7 @@ def process_file(input_path: Path, output_path: Path | None = None, quiet: bool def save_transcript(text: str, url: str, prompt: str) -> Path: """Save transcript with metadata to ~/.yts/transcripts/.""" - transcript_dir = ensure_transcript_dir() + transcript_dir, _ = ensure_yts_dirs() filename = sanitize_filename(url) + ".txt" filepath = transcript_dir / filename @@ -173,7 +183,7 @@ def cleanup_files(vtt_path: Optional[Path]): print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr) -def get_summary_from_claude(text: str, prompt: str = "Please summarize this transcript in 500 words or less") -> str: +def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -> str: """Send text to Claude API for summarization.""" try: api_key = os.environ.get("ANTHROPIC_API_KEY") @@ -181,9 +191,15 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran raise ValueError("ANTHROPIC_API_KEY environment variable not set") client = anthropic.Anthropic() + # Calculate target word count based on duration + target_words = max(500, (duration_mins // 10) * 500) + + if prompt is None: + prompt = f"Please summarize this transcript in approximately {target_words} words" + message = client.messages.create( model="claude-3-sonnet-20240229", - max_tokens=1024, + max_tokens=2048, # Increased for longer summaries temperature=0, system="You are a helpful assistant that summarizes transcripts accurately and concisely.", messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}], @@ -203,7 +219,7 @@ def main(): parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files") parser.add_argument("--transcript", action="store_true", help="Include full transcript in output") parser.add_argument( - "--prompt", default="Please summarize this transcript in 500 words or less", help="Custom prompt for Claude" + "--prompt", help="Custom prompt for Claude (default: auto-calculated based on video length)" ) parser.add_argument("-y", "--yes", action="store_true", help="Skip cost confirmation") @@ -212,7 +228,7 @@ def main(): # Download subtitles if not args.quiet: print("Downloading subtitles...") - success, vtt_path = download_subtitles(args.url, args.quiet) + success, vtt_path, duration_mins = download_subtitles(args.url, args.quiet) if not success: cleanup_files(None) sys.exit(1) @@ -243,7 +259,7 @@ def main(): # Get summary from Claude if not args.quiet: print("\nGetting summary from Claude...") - summary = get_summary_from_claude(cleaned_text, args.prompt) + summary = get_summary_from_claude(cleaned_text, duration_mins, args.prompt) # Prepare output output = "" @@ -253,15 +269,20 @@ def main(): output += "\n\n=== Summary ===\n\n" output += summary - # Output results - output_path = args.output - if output_path: - with open(output_path, "w", encoding="utf-8") as f: + # Save summary + _, summary_dir = ensure_yts_dirs() + summary_filename = sanitize_filename(args.url) + "_summary.txt" + summary_path = summary_dir / summary_filename + + with open(summary_path, "w", encoding="utf-8") as f: + f.write(output) + print(f"\nSummary saved to: {summary_path}") + + # If output path specified, also save there + if args.output: + with open(args.output, "w", encoding="utf-8") as f: f.write(output) - print(f"\nSummary saved to: {output_path}") - else: - print("\n=== Summary ===") - print(output) + print(f"Summary also saved to: {args.output}") # Cleanup downloaded files unless --keep-files is specified if not args.keep_files: