add some docs

fix: Update subtitle download error handling to return unknown title
feat: Add video title and enhanced metadata to transcripts and summaries
2025-01-03 15:16:23 +01:00 · 2025-01-03 14:54:54 +01:00 · 2025-01-03 14:54:47 +01:00 · 2025-01-03 14:54:45 +01:00 · 2025-01-03 14:12:40 +01:00 · 2025-01-03 14:09:33 +01:00
6 changed files with 154 additions and 111 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,12 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+uv.lock
+
+# Virtual environments
+.venv
+.aider*
--- a/.python-version
+++ b/.python-version
@@ -0,0 +1 @@
+3.12
--- a/README.md
+++ b/README.md
@@ -0,0 +1,20 @@
+# What?
+
+This is a CLI for summarizing YouTube videos.
+
+## Installation
+
+```bash
+> python3 -m build && uv pip install --system dist/*.whl 
+```
+
+## Usage
+
+```bash
+> yts <youtube URL>
+```
+
+## Default Behavior
+
+By default, transcripts are saved to `~/.yts/transcripts` and summaries to `~/.yts/summaries`.
+
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "summarize-yt"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "anthropic>=0.42.0",
+    "hatchling>=1.27.0",
+    "yt-dlp>=2024.12.23",
+]
+
+[project.scripts]
+yts = "summarize_yt.cli:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
--- a/summarize_yt/__init__.py
+++ b/summarize_yt/__init__.py
@@ -0,0 +1 @@
+__version__ = "0.1.1"
--- a/summarize_yt/cli.py
+++ b/summarize_yt/cli.py
@@ -1,30 +1,47 @@
-# Standard library imports remain the same
 import argparse
 import os
+from pathlib import Path
 import re
 import signal
 import sys
 from datetime import datetime
-from yt_dlp import YoutubeDL
-from pathlib import Path
-from typing import Optional, Tuple

-# Third-party packages
 import anthropic
+from yt_dlp import YoutubeDL

-# Local modules
 from . import __version__

 # Add cost estimation constants
-CLAUDE_COST_PER_1K_INPUT = 0.015  # Cost per 1K tokens for input
-CLAUDE_COST_PER_1K_OUTPUT = 0.075  # Cost per 1K tokens for output
-ESTIMATED_TOKENS_PER_CHAR = 0.25  # Rough estimate of tokens per character
+CLAUDE_COST_PER_1K_INPUT = 0.80 / 1000
+CLAUDE_COST_PER_1K_OUTPUT = 4 / 1000
+ESTIMATED_TOKENS_PER_CHAR = 0.25


-def estimate_api_cost(text: str) -> float:
+def setup_terminal_control():
+    """Set up terminal control at program start."""
+    try:
+        # Put process in its own process group and take control of terminal
+        os.setpgrp()
+
+        # Ignore terminal control signals
+        signal.signal(signal.SIGTTOU, signal.SIG_IGN)
+        signal.signal(signal.SIGTTIN, signal.SIG_IGN)
+        signal.signal(signal.SIGTSTP, signal.SIG_IGN)
+
+        # Take control of terminal if we're running in one
+        if sys.stdin.isatty():
+            import termios
+
+            termios.tcsetpgrp(sys.stdin.fileno(), os.getpgrp())  # type: ignore
+    except Exception:
+        # If we can't get terminal control, just continue
+        pass
+
+
+def estimate_api_cost(text: str, target_words: float) -> float:
    """Estimate the cost of sending text to Claude API."""
    estimated_input_tokens = len(text) * ESTIMATED_TOKENS_PER_CHAR
-    estimated_output_tokens = 1024  # max_tokens setting
+    estimated_output_tokens = target_words

    input_cost = (estimated_input_tokens / 1000) * CLAUDE_COST_PER_1K_INPUT
    output_cost = (estimated_output_tokens / 1000) * CLAUDE_COST_PER_1K_OUTPUT
@@ -34,36 +51,48 @@ def estimate_api_cost(text: str) -> float:

 def sanitize_filename(url: str) -> str:
    """Convert URL to safe filename, keeping video ID."""
-    # Extract video ID if it's a YouTube URL
+    # Clean URL and extract video ID
+    clean_url = url.split("&")[0]  # Remove everything after first &
    video_id = None
-    if "youtube.com" in url or "youtu.be" in url:
-        if "v=" in url:
-            video_id = url.split("v=")[1].split("&")[0]
+    if "youtube.com" in clean_url or "youtu.be" in clean_url:
+        if "v=" in clean_url:
+            video_id = clean_url.split("v=")[1]
        else:
-            video_id = url.split("/")[-1].split("?")[0]
+            video_id = clean_url.split("/")[-1].split("?")[0]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{timestamp}_{video_id if video_id else 'video'}"


-def ensure_transcript_dir() -> Path:
-    """Create and return path to transcript directory."""
-    transcript_dir = Path.home() / ".yts" / "transcripts"
+def ensure_yts_dirs() -> tuple[Path, Path]:
+    """Create and return paths to transcript and summary directories."""
+    base_dir = Path.home() / ".yts"
+    transcript_dir = base_dir / "transcripts"
+    summary_dir = base_dir / "summaries"
    transcript_dir.mkdir(parents=True, exist_ok=True)
-    return transcript_dir
+    summary_dir.mkdir(parents=True, exist_ok=True)
+    return transcript_dir, summary_dir


-def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path]]:
-    """Download subtitles from YouTube using yt-dlp."""
+def download_subtitles(url: str, quiet: bool = False) -> tuple[bool, Path | None, int, str]:
+    """Download subtitles from YouTube using yt-dlp and return success, path, duration in minutes, and title."""
    try:
        ydl_opts = {
-            'skip_download': True,
-            'writeautomaticsub': True,
-            'subtitleslangs': ['en'],
-            'quiet': quiet,
-            'no_warnings': quiet,
+            "skip_download": True,
+            "writeautomaticsub": True,
+            "subtitleslangs": ["en"],
+            "quiet": quiet,
+            "no_warnings": quiet,
        }

+        # First get video info
+        info_opts = dict(ydl_opts)
+        info_opts["extract_flat"] = True
+        with YoutubeDL(info_opts) as ydl:
+            info = ydl.extract_info(url, download=False)
+            duration_mins = int(info.get("duration", 0) / 60)  # type: ignore
+            title = info.get("title", "Unknown Title")  # type: ignore
+
        if not quiet:
            print(f"Debug: Downloading subtitles for {url}")

@@ -73,23 +102,24 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
        # Find the downloaded VTT file
        current_dir = Path(".")
        vtt_files = list(current_dir.glob("*.en.vtt"))
-        
+
        if not quiet:
            print(f"Debug: Found VTT files: {vtt_files}")
-            
+
        if not vtt_files:
            print("No VTT file found after download", file=sys.stderr)
-            return False, None
+            return False, None, 0, "Unknown Title"

-        return True, vtt_files[0]
+        return True, vtt_files[0], duration_mins, title

    except Exception as e:
        print(f"Error during subtitle download: {str(e)}", file=sys.stderr)
        if not quiet:
            print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}")
            import traceback
+
            traceback.print_exc()
-        return False, None
+        return False, None, 0, "Unknown Title"


 def clean_vtt_text(text: str) -> str:
@@ -140,16 +170,17 @@ def process_file(input_path: Path, output_path: Path | None = None, quiet: bool
        return ""


-def save_transcript(text: str, url: str, prompt: str) -> Path:
+def save_transcript(text: str, url: str, title: str) -> Path:
    """Save transcript with metadata to ~/.yts/transcripts/."""
-    transcript_dir = ensure_transcript_dir()
+    transcript_dir, _ = ensure_yts_dirs()
    filename = sanitize_filename(url) + ".txt"
    filepath = transcript_dir / filename

-    metadata = f"""URL: {url}
+    metadata = f"""Title: {title}
+URL: {url}
 Script Version: {__version__}
 Timestamp: {datetime.now().isoformat()}
-Claude Prompt: {prompt}
+Type: Transcript
 ---
 """

@@ -159,7 +190,7 @@ Claude Prompt: {prompt}
    return filepath


-def cleanup_files(vtt_path: Optional[Path]):
+def cleanup_files(vtt_path: Path | None):
    """Remove downloaded files after processing."""
    try:
        if vtt_path and vtt_path.exists():
@@ -174,7 +205,7 @@ def cleanup_files(vtt_path: Optional[Path]):
        print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr)


-def get_summary_from_claude(text: str, prompt: str = "Please summarize this transcript in 500 words or less") -> str:
+def get_summary_from_claude(text: str, target_words: float) -> str:
    """Send text to Claude API for summarization."""
    try:
        api_key = os.environ.get("ANTHROPIC_API_KEY")
@@ -182,14 +213,17 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran
            raise ValueError("ANTHROPIC_API_KEY environment variable not set")

        client = anthropic.Anthropic()
+
+        prompt = f"Please summarize this transcript in {target_words} or less."
+
        message = client.messages.create(
-            model="claude-3-sonnet-20240229",
-            max_tokens=1024,
+            model="claude-3-5-haiku-latest",
+            max_tokens=2048,  # Increased for longer summaries
            temperature=0,
-            system="You are a helpful assistant that summarizes transcripts accurately and concisely.",
+            system="You are a helpful assistant that summarizes transcripts accurately.",
            messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}],
        )
-        return message.content[0].text
+        return message.content[0].text  # type: ignore

    except Exception as e:
        print(f"Error getting summary from Claude: {str(e)}", file=sys.stderr)
@@ -197,92 +231,40 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran


 def main():
+    setup_terminal_control()
    parser = argparse.ArgumentParser(description="Download YouTube subtitles and get Claude summary")
    parser.add_argument("url", help="YouTube video URL", type=str)
-    parser.add_argument("-o", "--output", help="Output file for summary")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress status messages")
    parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files")
    parser.add_argument("--transcript", action="store_true", help="Include full transcript in output")
-    parser.add_argument(
-        "--prompt", default="Please summarize this transcript in 500 words or less", help="Custom prompt for Claude"
-    )
-    parser.add_argument("-y", "--yes", action="store_true", help="Skip cost confirmation")

    args = parser.parse_args()

-    # Set up signal handler for clean exit
-    def signal_handler(sig, frame):
-        print("\nCleaning up and exiting...")
-        cleanup_files(None)  # Clean any VTT files
-        print(f"Debug: Script terminated by signal at {datetime.now().isoformat()}")
-        sys.exit(0)
-
-    signal.signal(signal.SIGINT, signal_handler)
-
    # Download subtitles
    if not args.quiet:
        print("Downloading subtitles...")
-    success, vtt_path = download_subtitles(args.url, args.quiet)
+    success, vtt_path, duration_mins, video_title = download_subtitles(args.url, args.quiet)
    if not success:
        cleanup_files(None)
        sys.exit(1)
+    target_words = max(500, (duration_mins // 10) * 500)

    # Process the VTT file
-    cleaned_text = process_file(vtt_path, None, args.quiet)
+    cleaned_text = process_file(vtt_path, None, args.quiet)  # type: ignore

    # Save transcript
-    transcript_path = save_transcript(cleaned_text, args.url, args.prompt)
+    transcript_path = save_transcript(cleaned_text, args.url, video_title)
    print(f"\nTranscript saved to: {transcript_path}")

    # Estimate and display cost
-    estimated_cost = estimate_api_cost(cleaned_text)
+    estimated_cost = estimate_api_cost(cleaned_text, target_words)
    print(f"\nEstimated API cost: ${estimated_cost:.4f}")

-    if not args.yes:
-        try:
-            # Check if running in a terminal
-            if sys.stdin.isatty():
-                import tty
-                import termios
-
-                # Save the terminal settings
-                fd = sys.stdin.fileno()
-                old_settings = termios.tcgetattr(fd)
-                try:
-                    # Set the terminal to raw mode
-                    tty.setraw(sys.stdin.fileno())
-                    sys.stdout.write("\nDo you want to proceed with getting the summary? (y/N): ")
-                    sys.stdout.flush()
-                    # Read a single character
-                    char = sys.stdin.read(1)
-                    # Print a newline since we're in raw mode
-                    sys.stdout.write("\n")
-                    sys.stdout.flush()
-                finally:
-                    # Restore terminal settings
-                    termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
-            else:
-                # If not in a terminal, use regular input
-                sys.stdout.write("\nDo you want to proceed with getting the summary? (y/N): ")
-                sys.stdout.flush()
-                char = input().strip()
-
-            if not char or char.lower() != "y":
-                print("Operation cancelled by user.")
-                cleanup_files(vtt_path)
-                sys.exit(0)
-
-        except (EOFError, KeyboardInterrupt, termios.error):
-            print("\nOperation cancelled by user.")
-            cleanup_files(vtt_path)
-            sys.exit(0)
-
    # Get summary from Claude
    if not args.quiet:
        print("\nGetting summary from Claude...")
-    summary = get_summary_from_claude(cleaned_text, args.prompt)
+    summary = get_summary_from_claude(cleaned_text, target_words)

-    # Prepare output
    output = ""
    if args.transcript:
        output += "=== Full Transcript ===\n\n"
@@ -290,17 +272,26 @@ def main():
        output += "\n\n=== Summary ===\n\n"
    output += summary

-    # Output results
-    output_path = args.output
-    if output_path:
-        with open(output_path, "w", encoding="utf-8") as f:
-            f.write(output)
-        print(f"\nSummary saved to: {output_path}")
-    else:
-        print("\n=== Summary ===")
-        print(output)
+    # Save summary
+    _, summary_dir = ensure_yts_dirs()
+    summary_filename = sanitize_filename(args.url) + "_summary.txt"
+    summary_path = summary_dir / summary_filename
+
+    summary_metadata = f"""Title: {video_title}
+URL: {args.url}
+Script Version: {__version__}
+Timestamp: {datetime.now().isoformat()}
+Type: Summary
+---
+
+"""
+    with open(summary_path, "w", encoding="utf-8") as f:
+        f.write(summary_metadata + output)
+    print("\n=== Summary ===\n")
+    print(summary)
+    print()
+    print(f"\nSummary saved to: {summary_path}")

-    # Cleanup downloaded files unless --keep-files is specified
    if not args.keep_files:
        cleanup_files(vtt_path)
Author	SHA1	Message	Date
Zev Averbach	f7b369de95	add some docs	2025-01-03 15:16:23 +01:00
Zev Averbach (aider)	45c06d081c	fix: Update subtitle download error handling to return unknown title	2025-01-03 14:54:54 +01:00
Zev Averbach (aider)	4761a590ad	feat: Add video title and enhanced metadata to transcripts and summaries	2025-01-03 14:54:47 +01:00
Zev Averbach	d3451c19e4	refactor: Simplify CLI, remove optional features, and streamline summary generation	2025-01-03 14:54:45 +01:00
Zev Averbach (aider)	1fbe9094c2	feat: Update CLI to use Haiku model with Y default and accurate cost info	2025-01-03 14:12:40 +01:00
Zev Averbach (aider)	e5f35154a8	feat: Enhance CLI to clean URLs and display summary on console	2025-01-03 14:09:33 +01:00
Zev Averbach (aider)	3bcc5164e1	fix: Update subtitle download function to return duration_mins	2025-01-03 14:05:41 +01:00
Zev Averbach (aider)	6d27e00f8a	feat: Update summary generation to scale words based on video duration	2025-01-03 14:05:29 +01:00
Zev Averbach (aider)	c2c5f461fa	refactor: Remove signal handler from main() function	2025-01-03 13:59:24 +01:00
Zev Averbach (aider)	fc6a3bd13b	refactor: Simplify user input logic using input() instead of low-level terminal handling	2025-01-03 13:57:18 +01:00