Compare commits

..

10 Commits

6 changed files with 154 additions and 111 deletions

12
.gitignore vendored Normal file
View File

@@ -0,0 +1,12 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
uv.lock
# Virtual environments
.venv
.aider*

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.12

20
README.md Normal file
View File

@@ -0,0 +1,20 @@
# What?
This is a CLI for summarizing YouTube videos.
## Installation
```bash
> python3 -m build && uv pip install --system dist/*.whl
```
## Usage
```bash
> yts <youtube URL>
```
## Default Behavior
By default, transcripts are saved to `~/.yts/transcripts` and summaries are saved to `~/.yts/summaries`.

18
pyproject.toml Normal file
View File

@@ -0,0 +1,18 @@
[project]
name = "summarize-yt"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"anthropic>=0.42.0",
"hatchling>=1.27.0",
"yt-dlp>=2024.12.23",
]
[project.scripts]
yts = "summarize_yt.cli:main"
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

1
summarize_yt/__init__.py Normal file
View File

@@ -0,0 +1 @@
__version__ = "0.1.1"

View File

@@ -1,30 +1,47 @@
# Standard library imports remain the same
import argparse import argparse
import os import os
from pathlib import Path
import re import re
import signal import signal
import sys import sys
from datetime import datetime from datetime import datetime
from yt_dlp import YoutubeDL
from pathlib import Path
from typing import Optional, Tuple
# Third-party packages
import anthropic import anthropic
from yt_dlp import YoutubeDL
# Local modules
from . import __version__ from . import __version__
# Add cost estimation constants # Add cost estimation constants
CLAUDE_COST_PER_1K_INPUT = 0.015 # Cost per 1K tokens for input CLAUDE_COST_PER_1K_INPUT = 0.80 / 1000
CLAUDE_COST_PER_1K_OUTPUT = 0.075 # Cost per 1K tokens for output CLAUDE_COST_PER_1K_OUTPUT = 4 / 1000
ESTIMATED_TOKENS_PER_CHAR = 0.25 # Rough estimate of tokens per character ESTIMATED_TOKENS_PER_CHAR = 0.25
def estimate_api_cost(text: str) -> float: def setup_terminal_control():
"""Set up terminal control at program start."""
try:
# Put process in its own process group and take control of terminal
os.setpgrp()
# Ignore terminal control signals
signal.signal(signal.SIGTTOU, signal.SIG_IGN)
signal.signal(signal.SIGTTIN, signal.SIG_IGN)
signal.signal(signal.SIGTSTP, signal.SIG_IGN)
# Take control of terminal if we're running in one
if sys.stdin.isatty():
import termios
termios.tcsetpgrp(sys.stdin.fileno(), os.getpgrp()) # type: ignore
except Exception:
# If we can't get terminal control, just continue
pass
def estimate_api_cost(text: str, target_words: float) -> float:
"""Estimate the cost of sending text to Claude API.""" """Estimate the cost of sending text to Claude API."""
estimated_input_tokens = len(text) * ESTIMATED_TOKENS_PER_CHAR estimated_input_tokens = len(text) * ESTIMATED_TOKENS_PER_CHAR
estimated_output_tokens = 1024 # max_tokens setting estimated_output_tokens = target_words
input_cost = (estimated_input_tokens / 1000) * CLAUDE_COST_PER_1K_INPUT input_cost = (estimated_input_tokens / 1000) * CLAUDE_COST_PER_1K_INPUT
output_cost = (estimated_output_tokens / 1000) * CLAUDE_COST_PER_1K_OUTPUT output_cost = (estimated_output_tokens / 1000) * CLAUDE_COST_PER_1K_OUTPUT
@@ -34,36 +51,48 @@ def estimate_api_cost(text: str) -> float:
def sanitize_filename(url: str) -> str: def sanitize_filename(url: str) -> str:
"""Convert URL to safe filename, keeping video ID.""" """Convert URL to safe filename, keeping video ID."""
# Extract video ID if it's a YouTube URL # Clean URL and extract video ID
clean_url = url.split("&")[0] # Remove everything after first &
video_id = None video_id = None
if "youtube.com" in url or "youtu.be" in url: if "youtube.com" in clean_url or "youtu.be" in clean_url:
if "v=" in url: if "v=" in clean_url:
video_id = url.split("v=")[1].split("&")[0] video_id = clean_url.split("v=")[1]
else: else:
video_id = url.split("/")[-1].split("?")[0] video_id = clean_url.split("/")[-1].split("?")[0]
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
return f"{timestamp}_{video_id if video_id else 'video'}" return f"{timestamp}_{video_id if video_id else 'video'}"
def ensure_transcript_dir() -> Path: def ensure_yts_dirs() -> tuple[Path, Path]:
"""Create and return path to transcript directory.""" """Create and return paths to transcript and summary directories."""
transcript_dir = Path.home() / ".yts" / "transcripts" base_dir = Path.home() / ".yts"
transcript_dir = base_dir / "transcripts"
summary_dir = base_dir / "summaries"
transcript_dir.mkdir(parents=True, exist_ok=True) transcript_dir.mkdir(parents=True, exist_ok=True)
return transcript_dir summary_dir.mkdir(parents=True, exist_ok=True)
return transcript_dir, summary_dir
def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path]]: def download_subtitles(url: str, quiet: bool = False) -> tuple[bool, Path | None, int, str]:
"""Download subtitles from YouTube using yt-dlp.""" """Download subtitles from YouTube using yt-dlp and return success, path, duration in minutes, and title."""
try: try:
ydl_opts = { ydl_opts = {
'skip_download': True, "skip_download": True,
'writeautomaticsub': True, "writeautomaticsub": True,
'subtitleslangs': ['en'], "subtitleslangs": ["en"],
'quiet': quiet, "quiet": quiet,
'no_warnings': quiet, "no_warnings": quiet,
} }
# First get video info
info_opts = dict(ydl_opts)
info_opts["extract_flat"] = True
with YoutubeDL(info_opts) as ydl:
info = ydl.extract_info(url, download=False)
duration_mins = int(info.get("duration", 0) / 60) # type: ignore
title = info.get("title", "Unknown Title") # type: ignore
if not quiet: if not quiet:
print(f"Debug: Downloading subtitles for {url}") print(f"Debug: Downloading subtitles for {url}")
@@ -73,23 +102,24 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
# Find the downloaded VTT file # Find the downloaded VTT file
current_dir = Path(".") current_dir = Path(".")
vtt_files = list(current_dir.glob("*.en.vtt")) vtt_files = list(current_dir.glob("*.en.vtt"))
if not quiet: if not quiet:
print(f"Debug: Found VTT files: {vtt_files}") print(f"Debug: Found VTT files: {vtt_files}")
if not vtt_files: if not vtt_files:
print("No VTT file found after download", file=sys.stderr) print("No VTT file found after download", file=sys.stderr)
return False, None return False, None, 0, "Unknown Title"
return True, vtt_files[0] return True, vtt_files[0], duration_mins, title
except Exception as e: except Exception as e:
print(f"Error during subtitle download: {str(e)}", file=sys.stderr) print(f"Error during subtitle download: {str(e)}", file=sys.stderr)
if not quiet: if not quiet:
print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}") print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return False, None return False, None, 0, "Unknown Title"
def clean_vtt_text(text: str) -> str: def clean_vtt_text(text: str) -> str:
@@ -140,16 +170,17 @@ def process_file(input_path: Path, output_path: Path | None = None, quiet: bool
return "" return ""
def save_transcript(text: str, url: str, prompt: str) -> Path: def save_transcript(text: str, url: str, title: str) -> Path:
"""Save transcript with metadata to ~/.yts/transcripts/.""" """Save transcript with metadata to ~/.yts/transcripts/."""
transcript_dir = ensure_transcript_dir() transcript_dir, _ = ensure_yts_dirs()
filename = sanitize_filename(url) + ".txt" filename = sanitize_filename(url) + ".txt"
filepath = transcript_dir / filename filepath = transcript_dir / filename
metadata = f"""URL: {url} metadata = f"""Title: {title}
URL: {url}
Script Version: {__version__} Script Version: {__version__}
Timestamp: {datetime.now().isoformat()} Timestamp: {datetime.now().isoformat()}
Claude Prompt: {prompt} Type: Transcript
--- ---
""" """
@@ -159,7 +190,7 @@ Claude Prompt: {prompt}
return filepath return filepath
def cleanup_files(vtt_path: Optional[Path]): def cleanup_files(vtt_path: Path | None):
"""Remove downloaded files after processing.""" """Remove downloaded files after processing."""
try: try:
if vtt_path and vtt_path.exists(): if vtt_path and vtt_path.exists():
@@ -174,7 +205,7 @@ def cleanup_files(vtt_path: Optional[Path]):
print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr) print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr)
def get_summary_from_claude(text: str, prompt: str = "Please summarize this transcript in 500 words or less") -> str: def get_summary_from_claude(text: str, target_words: float) -> str:
"""Send text to Claude API for summarization.""" """Send text to Claude API for summarization."""
try: try:
api_key = os.environ.get("ANTHROPIC_API_KEY") api_key = os.environ.get("ANTHROPIC_API_KEY")
@@ -182,14 +213,17 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran
raise ValueError("ANTHROPIC_API_KEY environment variable not set") raise ValueError("ANTHROPIC_API_KEY environment variable not set")
client = anthropic.Anthropic() client = anthropic.Anthropic()
prompt = f"Please summarize this transcript in {target_words} or less."
message = client.messages.create( message = client.messages.create(
model="claude-3-sonnet-20240229", model="claude-3-5-haiku-latest",
max_tokens=1024, max_tokens=2048, # Increased for longer summaries
temperature=0, temperature=0,
system="You are a helpful assistant that summarizes transcripts accurately and concisely.", system="You are a helpful assistant that summarizes transcripts accurately.",
messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}], messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}],
) )
return message.content[0].text return message.content[0].text # type: ignore
except Exception as e: except Exception as e:
print(f"Error getting summary from Claude: {str(e)}", file=sys.stderr) print(f"Error getting summary from Claude: {str(e)}", file=sys.stderr)
@@ -197,92 +231,40 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran
def main(): def main():
setup_terminal_control()
parser = argparse.ArgumentParser(description="Download YouTube subtitles and get Claude summary") parser = argparse.ArgumentParser(description="Download YouTube subtitles and get Claude summary")
parser.add_argument("url", help="YouTube video URL", type=str) parser.add_argument("url", help="YouTube video URL", type=str)
parser.add_argument("-o", "--output", help="Output file for summary")
parser.add_argument("-q", "--quiet", action="store_true", help="Suppress status messages") parser.add_argument("-q", "--quiet", action="store_true", help="Suppress status messages")
parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files") parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files")
parser.add_argument("--transcript", action="store_true", help="Include full transcript in output") parser.add_argument("--transcript", action="store_true", help="Include full transcript in output")
parser.add_argument(
"--prompt", default="Please summarize this transcript in 500 words or less", help="Custom prompt for Claude"
)
parser.add_argument("-y", "--yes", action="store_true", help="Skip cost confirmation")
args = parser.parse_args() args = parser.parse_args()
# Set up signal handler for clean exit
def signal_handler(sig, frame):
print("\nCleaning up and exiting...")
cleanup_files(None) # Clean any VTT files
print(f"Debug: Script terminated by signal at {datetime.now().isoformat()}")
sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
# Download subtitles # Download subtitles
if not args.quiet: if not args.quiet:
print("Downloading subtitles...") print("Downloading subtitles...")
success, vtt_path = download_subtitles(args.url, args.quiet) success, vtt_path, duration_mins, video_title = download_subtitles(args.url, args.quiet)
if not success: if not success:
cleanup_files(None) cleanup_files(None)
sys.exit(1) sys.exit(1)
target_words = max(500, (duration_mins // 10) * 500)
# Process the VTT file # Process the VTT file
cleaned_text = process_file(vtt_path, None, args.quiet) cleaned_text = process_file(vtt_path, None, args.quiet) # type: ignore
# Save transcript # Save transcript
transcript_path = save_transcript(cleaned_text, args.url, args.prompt) transcript_path = save_transcript(cleaned_text, args.url, video_title)
print(f"\nTranscript saved to: {transcript_path}") print(f"\nTranscript saved to: {transcript_path}")
# Estimate and display cost # Estimate and display cost
estimated_cost = estimate_api_cost(cleaned_text) estimated_cost = estimate_api_cost(cleaned_text, target_words)
print(f"\nEstimated API cost: ${estimated_cost:.4f}") print(f"\nEstimated API cost: ${estimated_cost:.4f}")
if not args.yes:
try:
# Check if running in a terminal
if sys.stdin.isatty():
import tty
import termios
# Save the terminal settings
fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)
try:
# Set the terminal to raw mode
tty.setraw(sys.stdin.fileno())
sys.stdout.write("\nDo you want to proceed with getting the summary? (y/N): ")
sys.stdout.flush()
# Read a single character
char = sys.stdin.read(1)
# Print a newline since we're in raw mode
sys.stdout.write("\n")
sys.stdout.flush()
finally:
# Restore terminal settings
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
else:
# If not in a terminal, use regular input
sys.stdout.write("\nDo you want to proceed with getting the summary? (y/N): ")
sys.stdout.flush()
char = input().strip()
if not char or char.lower() != "y":
print("Operation cancelled by user.")
cleanup_files(vtt_path)
sys.exit(0)
except (EOFError, KeyboardInterrupt, termios.error):
print("\nOperation cancelled by user.")
cleanup_files(vtt_path)
sys.exit(0)
# Get summary from Claude # Get summary from Claude
if not args.quiet: if not args.quiet:
print("\nGetting summary from Claude...") print("\nGetting summary from Claude...")
summary = get_summary_from_claude(cleaned_text, args.prompt) summary = get_summary_from_claude(cleaned_text, target_words)
# Prepare output
output = "" output = ""
if args.transcript: if args.transcript:
output += "=== Full Transcript ===\n\n" output += "=== Full Transcript ===\n\n"
@@ -290,17 +272,26 @@ def main():
output += "\n\n=== Summary ===\n\n" output += "\n\n=== Summary ===\n\n"
output += summary output += summary
# Output results # Save summary
output_path = args.output _, summary_dir = ensure_yts_dirs()
if output_path: summary_filename = sanitize_filename(args.url) + "_summary.txt"
with open(output_path, "w", encoding="utf-8") as f: summary_path = summary_dir / summary_filename
f.write(output)
print(f"\nSummary saved to: {output_path}") summary_metadata = f"""Title: {video_title}
else: URL: {args.url}
print("\n=== Summary ===") Script Version: {__version__}
print(output) Timestamp: {datetime.now().isoformat()}
Type: Summary
---
"""
with open(summary_path, "w", encoding="utf-8") as f:
f.write(summary_metadata + output)
print("\n=== Summary ===\n")
print(summary)
print()
print(f"\nSummary saved to: {summary_path}")
# Cleanup downloaded files unless --keep-files is specified
if not args.keep_files: if not args.keep_files:
cleanup_files(vtt_path) cleanup_files(vtt_path)