Compare commits
10 Commits
9a2f4d1c8c
...
f7b369de95
| Author | SHA1 | Date | |
|---|---|---|---|
| f7b369de95 | |||
| 45c06d081c | |||
| 4761a590ad | |||
| d3451c19e4 | |||
| 1fbe9094c2 | |||
| e5f35154a8 | |||
| 3bcc5164e1 | |||
| 6d27e00f8a | |||
| c2c5f461fa | |||
| fc6a3bd13b |
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
# Python-generated files
|
||||
__pycache__/
|
||||
*.py[oc]
|
||||
build/
|
||||
dist/
|
||||
wheels/
|
||||
*.egg-info
|
||||
uv.lock
|
||||
|
||||
# Virtual environments
|
||||
.venv
|
||||
.aider*
|
||||
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.12
|
||||
20
README.md
Normal file
20
README.md
Normal file
@@ -0,0 +1,20 @@
|
||||
# What?
|
||||
|
||||
This is a CLI for summarizing YouTube videos.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
> python3 -m build && uv pip install --system dist/*.whl
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
> yts <youtube URL>
|
||||
```
|
||||
|
||||
## Default Behavior
|
||||
|
||||
By default, transcripts are saved to `~/.yts/transcripts` and summaries are saved to `~/.yts/summaries`.
|
||||
|
||||
18
pyproject.toml
Normal file
18
pyproject.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[project]
|
||||
name = "summarize-yt"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"anthropic>=0.42.0",
|
||||
"hatchling>=1.27.0",
|
||||
"yt-dlp>=2024.12.23",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
yts = "summarize_yt.cli:main"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
1
summarize_yt/__init__.py
Normal file
1
summarize_yt/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
__version__ = "0.1.1"
|
||||
@@ -1,30 +1,47 @@
|
||||
# Standard library imports
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from yt_dlp import YoutubeDL
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
# Third-party packages
|
||||
import anthropic
|
||||
from yt_dlp import YoutubeDL
|
||||
|
||||
# Local modules
|
||||
from . import __version__
|
||||
|
||||
# Cost estimation constants
|
||||
CLAUDE_COST_PER_1K_INPUT = 0.015 # Cost per 1K tokens for input
|
||||
CLAUDE_COST_PER_1K_OUTPUT = 0.075 # Cost per 1K tokens for output
|
||||
ESTIMATED_TOKENS_PER_CHAR = 0.25 # Rough estimate of tokens per character
|
||||
CLAUDE_COST_PER_1K_INPUT = 0.80 / 1000
|
||||
CLAUDE_COST_PER_1K_OUTPUT = 4 / 1000
|
||||
ESTIMATED_TOKENS_PER_CHAR = 0.25
|
||||
|
||||
|
||||
def estimate_api_cost(text: str) -> float:
|
||||
def setup_terminal_control():
|
||||
"""Set up terminal control at program start."""
|
||||
try:
|
||||
# Put process in its own process group and take control of terminal
|
||||
os.setpgrp()
|
||||
|
||||
# Ignore terminal control signals
|
||||
signal.signal(signal.SIGTTOU, signal.SIG_IGN)
|
||||
signal.signal(signal.SIGTTIN, signal.SIG_IGN)
|
||||
signal.signal(signal.SIGTSTP, signal.SIG_IGN)
|
||||
|
||||
# Take control of terminal if we're running in one
|
||||
if sys.stdin.isatty():
|
||||
import termios
|
||||
|
||||
termios.tcsetpgrp(sys.stdin.fileno(), os.getpgrp()) # type: ignore
|
||||
except Exception:
|
||||
# If we can't get terminal control, just continue
|
||||
pass
|
||||
|
||||
|
||||
def estimate_api_cost(text: str, target_words: float) -> float:
|
||||
"""Estimate the cost of sending text to Claude API."""
|
||||
estimated_input_tokens = len(text) * ESTIMATED_TOKENS_PER_CHAR
|
||||
estimated_output_tokens = 1024 # max_tokens setting
|
||||
estimated_output_tokens = target_words
|
||||
|
||||
input_cost = (estimated_input_tokens / 1000) * CLAUDE_COST_PER_1K_INPUT
|
||||
output_cost = (estimated_output_tokens / 1000) * CLAUDE_COST_PER_1K_OUTPUT
|
||||
@@ -34,36 +51,48 @@ def estimate_api_cost(text: str) -> float:
|
||||
|
||||
def sanitize_filename(url: str) -> str:
|
||||
"""Convert URL to safe filename, keeping video ID."""
|
||||
# Extract video ID if it's a YouTube URL
|
||||
# Clean URL and extract video ID
|
||||
clean_url = url.split("&")[0] # Remove everything after first &
|
||||
video_id = None
|
||||
if "youtube.com" in url or "youtu.be" in url:
|
||||
if "v=" in url:
|
||||
video_id = url.split("v=")[1].split("&")[0]
|
||||
if "youtube.com" in clean_url or "youtu.be" in clean_url:
|
||||
if "v=" in clean_url:
|
||||
video_id = clean_url.split("v=")[1]
|
||||
else:
|
||||
video_id = url.split("/")[-1].split("?")[0]
|
||||
video_id = clean_url.split("/")[-1].split("?")[0]
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
return f"{timestamp}_{video_id if video_id else 'video'}"
|
||||
|
||||
|
||||
def ensure_transcript_dir() -> Path:
|
||||
"""Create and return path to transcript directory."""
|
||||
transcript_dir = Path.home() / ".yts" / "transcripts"
|
||||
def ensure_yts_dirs() -> tuple[Path, Path]:
|
||||
"""Create and return paths to transcript and summary directories."""
|
||||
base_dir = Path.home() / ".yts"
|
||||
transcript_dir = base_dir / "transcripts"
|
||||
summary_dir = base_dir / "summaries"
|
||||
transcript_dir.mkdir(parents=True, exist_ok=True)
|
||||
return transcript_dir
|
||||
summary_dir.mkdir(parents=True, exist_ok=True)
|
||||
return transcript_dir, summary_dir
|
||||
|
||||
|
||||
def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path]]:
|
||||
"""Download subtitles from YouTube using yt-dlp."""
|
||||
def download_subtitles(url: str, quiet: bool = False) -> tuple[bool, Path | None, int, str]:
|
||||
"""Download subtitles from YouTube using yt-dlp and return success, path, duration in minutes, and title."""
|
||||
try:
|
||||
ydl_opts = {
|
||||
'skip_download': True,
|
||||
'writeautomaticsub': True,
|
||||
'subtitleslangs': ['en'],
|
||||
'quiet': quiet,
|
||||
'no_warnings': quiet,
|
||||
"skip_download": True,
|
||||
"writeautomaticsub": True,
|
||||
"subtitleslangs": ["en"],
|
||||
"quiet": quiet,
|
||||
"no_warnings": quiet,
|
||||
}
|
||||
|
||||
# First get video info
|
||||
info_opts = dict(ydl_opts)
|
||||
info_opts["extract_flat"] = True
|
||||
with YoutubeDL(info_opts) as ydl:
|
||||
info = ydl.extract_info(url, download=False)
|
||||
duration_mins = int(info.get("duration", 0) / 60) # type: ignore
|
||||
title = info.get("title", "Unknown Title") # type: ignore
|
||||
|
||||
if not quiet:
|
||||
print(f"Debug: Downloading subtitles for {url}")
|
||||
|
||||
@@ -73,23 +102,24 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
|
||||
# Find the downloaded VTT file
|
||||
current_dir = Path(".")
|
||||
vtt_files = list(current_dir.glob("*.en.vtt"))
|
||||
|
||||
|
||||
if not quiet:
|
||||
print(f"Debug: Found VTT files: {vtt_files}")
|
||||
|
||||
|
||||
if not vtt_files:
|
||||
print("No VTT file found after download", file=sys.stderr)
|
||||
return False, None
|
||||
return False, None, 0, "Unknown Title"
|
||||
|
||||
return True, vtt_files[0]
|
||||
return True, vtt_files[0], duration_mins, title
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during subtitle download: {str(e)}", file=sys.stderr)
|
||||
if not quiet:
|
||||
print(f"Debug: Full exception info: {type(e).__name__}: {str(e)}")
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
return False, None
|
||||
return False, None, 0, "Unknown Title"
|
||||
|
||||
|
||||
def clean_vtt_text(text: str) -> str:
|
||||
@@ -140,16 +170,17 @@ def process_file(input_path: Path, output_path: Path | None = None, quiet: bool
|
||||
return ""
|
||||
|
||||
|
||||
def save_transcript(text: str, url: str, prompt: str) -> Path:
|
||||
def save_transcript(text: str, url: str, title: str) -> Path:
|
||||
"""Save transcript with metadata to ~/.yts/transcripts/."""
|
||||
transcript_dir = ensure_transcript_dir()
|
||||
transcript_dir, _ = ensure_yts_dirs()
|
||||
filename = sanitize_filename(url) + ".txt"
|
||||
filepath = transcript_dir / filename
|
||||
|
||||
metadata = f"""URL: {url}
|
||||
metadata = f"""Title: {title}
|
||||
URL: {url}
|
||||
Script Version: {__version__}
|
||||
Timestamp: {datetime.now().isoformat()}
|
||||
Claude Prompt: {prompt}
|
||||
Type: Transcript
|
||||
---
|
||||
"""
|
||||
|
||||
@@ -159,7 +190,7 @@ Claude Prompt: {prompt}
|
||||
return filepath
|
||||
|
||||
|
||||
def cleanup_files(vtt_path: Optional[Path]):
|
||||
def cleanup_files(vtt_path: Path | None):
|
||||
"""Remove downloaded files after processing."""
|
||||
try:
|
||||
if vtt_path and vtt_path.exists():
|
||||
@@ -174,7 +205,7 @@ def cleanup_files(vtt_path: Optional[Path]):
|
||||
print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr)
|
||||
|
||||
|
||||
def get_summary_from_claude(text: str, prompt: str = "Please summarize this transcript in 500 words or less") -> str:
|
||||
def get_summary_from_claude(text: str, target_words: float) -> str:
|
||||
"""Send text to Claude API for summarization."""
|
||||
try:
|
||||
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
||||
@@ -182,14 +213,17 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran
|
||||
raise ValueError("ANTHROPIC_API_KEY environment variable not set")
|
||||
|
||||
client = anthropic.Anthropic()
|
||||
|
||||
prompt = f"Please summarize this transcript in {target_words} or less."
|
||||
|
||||
message = client.messages.create(
|
||||
model="claude-3-sonnet-20240229",
|
||||
max_tokens=1024,
|
||||
model="claude-3-5-haiku-latest",
|
||||
max_tokens=2048, # Increased for longer summaries
|
||||
temperature=0,
|
||||
system="You are a helpful assistant that summarizes transcripts accurately and concisely.",
|
||||
system="You are a helpful assistant that summarizes transcripts accurately.",
|
||||
messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}],
|
||||
)
|
||||
return message.content[0].text
|
||||
return message.content[0].text # type: ignore
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting summary from Claude: {str(e)}", file=sys.stderr)
|
||||
@@ -197,92 +231,40 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran
|
||||
|
||||
|
||||
def main():
|
||||
setup_terminal_control()
|
||||
parser = argparse.ArgumentParser(description="Download YouTube subtitles and get Claude summary")
|
||||
parser.add_argument("url", help="YouTube video URL", type=str)
|
||||
parser.add_argument("-o", "--output", help="Output file for summary")
|
||||
parser.add_argument("-q", "--quiet", action="store_true", help="Suppress status messages")
|
||||
parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files")
|
||||
parser.add_argument("--transcript", action="store_true", help="Include full transcript in output")
|
||||
parser.add_argument(
|
||||
"--prompt", default="Please summarize this transcript in 500 words or less", help="Custom prompt for Claude"
|
||||
)
|
||||
parser.add_argument("-y", "--yes", action="store_true", help="Skip cost confirmation")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Set up signal handler for clean exit
|
||||
def signal_handler(sig, frame):
|
||||
print("\nCleaning up and exiting...")
|
||||
cleanup_files(None) # Clean any VTT files
|
||||
print(f"Debug: Script terminated by signal at {datetime.now().isoformat()}")
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
# Download subtitles
|
||||
if not args.quiet:
|
||||
print("Downloading subtitles...")
|
||||
success, vtt_path = download_subtitles(args.url, args.quiet)
|
||||
success, vtt_path, duration_mins, video_title = download_subtitles(args.url, args.quiet)
|
||||
if not success:
|
||||
cleanup_files(None)
|
||||
sys.exit(1)
|
||||
target_words = max(500, (duration_mins // 10) * 500)
|
||||
|
||||
# Process the VTT file
|
||||
cleaned_text = process_file(vtt_path, None, args.quiet)
|
||||
cleaned_text = process_file(vtt_path, None, args.quiet) # type: ignore
|
||||
|
||||
# Save transcript
|
||||
transcript_path = save_transcript(cleaned_text, args.url, args.prompt)
|
||||
transcript_path = save_transcript(cleaned_text, args.url, video_title)
|
||||
print(f"\nTranscript saved to: {transcript_path}")
|
||||
|
||||
# Estimate and display cost
|
||||
estimated_cost = estimate_api_cost(cleaned_text)
|
||||
estimated_cost = estimate_api_cost(cleaned_text, target_words)
|
||||
print(f"\nEstimated API cost: ${estimated_cost:.4f}")
|
||||
|
||||
if not args.yes:
|
||||
try:
|
||||
# Check if running in a terminal
|
||||
if sys.stdin.isatty():
|
||||
import tty
|
||||
import termios
|
||||
|
||||
# Save the terminal settings
|
||||
fd = sys.stdin.fileno()
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
try:
|
||||
# Set the terminal to raw mode
|
||||
tty.setraw(sys.stdin.fileno())
|
||||
sys.stdout.write("\nDo you want to proceed with getting the summary? (y/N): ")
|
||||
sys.stdout.flush()
|
||||
# Read a single character
|
||||
char = sys.stdin.read(1)
|
||||
# Print a newline since we're in raw mode
|
||||
sys.stdout.write("\n")
|
||||
sys.stdout.flush()
|
||||
finally:
|
||||
# Restore terminal settings
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
else:
|
||||
# If not in a terminal, use regular input
|
||||
sys.stdout.write("\nDo you want to proceed with getting the summary? (y/N): ")
|
||||
sys.stdout.flush()
|
||||
char = input().strip()
|
||||
|
||||
if not char or char.lower() != "y":
|
||||
print("Operation cancelled by user.")
|
||||
cleanup_files(vtt_path)
|
||||
sys.exit(0)
|
||||
|
||||
except (EOFError, KeyboardInterrupt, termios.error):
|
||||
print("\nOperation cancelled by user.")
|
||||
cleanup_files(vtt_path)
|
||||
sys.exit(0)
|
||||
|
||||
# Get summary from Claude
|
||||
if not args.quiet:
|
||||
print("\nGetting summary from Claude...")
|
||||
summary = get_summary_from_claude(cleaned_text, args.prompt)
|
||||
summary = get_summary_from_claude(cleaned_text, target_words)
|
||||
|
||||
# Prepare output
|
||||
output = ""
|
||||
if args.transcript:
|
||||
output += "=== Full Transcript ===\n\n"
|
||||
@@ -290,17 +272,26 @@ def main():
|
||||
output += "\n\n=== Summary ===\n\n"
|
||||
output += summary
|
||||
|
||||
# Output results
|
||||
output_path = args.output
|
||||
if output_path:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(output)
|
||||
print(f"\nSummary saved to: {output_path}")
|
||||
else:
|
||||
print("\n=== Summary ===")
|
||||
print(output)
|
||||
# Save summary
|
||||
_, summary_dir = ensure_yts_dirs()
|
||||
summary_filename = sanitize_filename(args.url) + "_summary.txt"
|
||||
summary_path = summary_dir / summary_filename
|
||||
|
||||
summary_metadata = f"""Title: {video_title}
|
||||
URL: {args.url}
|
||||
Script Version: {__version__}
|
||||
Timestamp: {datetime.now().isoformat()}
|
||||
Type: Summary
|
||||
---
|
||||
|
||||
"""
|
||||
with open(summary_path, "w", encoding="utf-8") as f:
|
||||
f.write(summary_metadata + output)
|
||||
print("\n=== Summary ===\n")
|
||||
print(summary)
|
||||
print()
|
||||
print(f"\nSummary saved to: {summary_path}")
|
||||
|
||||
# Cleanup downloaded files unless --keep-files is specified
|
||||
if not args.keep_files:
|
||||
cleanup_files(vtt_path)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user