feat: Update summary generation to scale the target word count with video duration
This commit is contained in:
@@ -45,15 +45,18 @@ def sanitize_filename(url: str) -> str:
|
||||
return f"{timestamp}_{video_id if video_id else 'video'}"
|
||||
|
||||
|
||||
def ensure_transcript_dir() -> Path:
|
||||
"""Create and return path to transcript directory."""
|
||||
transcript_dir = Path.home() / ".yts" / "transcripts"
|
||||
def ensure_yts_dirs() -> Tuple[Path, Path]:
|
||||
"""Create and return paths to transcript and summary directories."""
|
||||
base_dir = Path.home() / ".yts"
|
||||
transcript_dir = base_dir / "transcripts"
|
||||
summary_dir = base_dir / "summaries"
|
||||
transcript_dir.mkdir(parents=True, exist_ok=True)
|
||||
return transcript_dir
|
||||
summary_dir.mkdir(parents=True, exist_ok=True)
|
||||
return transcript_dir, summary_dir
|
||||
|
||||
|
||||
def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path]]:
|
||||
"""Download subtitles from YouTube using yt-dlp."""
|
||||
def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Path], int]:
|
||||
"""Download subtitles from YouTube using yt-dlp and return success, path, and duration in minutes."""
|
||||
try:
|
||||
ydl_opts = {
|
||||
'skip_download': True,
|
||||
@@ -62,6 +65,13 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
|
||||
'quiet': quiet,
|
||||
'no_warnings': quiet,
|
||||
}
|
||||
|
||||
# First get video duration
|
||||
info_opts = dict(ydl_opts)
|
||||
info_opts['extract_flat'] = True
|
||||
with YoutubeDL(info_opts) as ydl:
|
||||
info = ydl.extract_info(url, download=False)
|
||||
duration_mins = int(info.get('duration', 0) / 60)
|
||||
|
||||
if not quiet:
|
||||
print(f"Debug: Downloading subtitles for {url}")
|
||||
@@ -80,7 +90,7 @@ def download_subtitles(url: str, quiet: bool = False) -> Tuple[bool, Optional[Pa
|
||||
print("No VTT file found after download", file=sys.stderr)
|
||||
return False, None
|
||||
|
||||
return True, vtt_files[0]
|
||||
return True, vtt_files[0], duration_mins
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during subtitle download: {str(e)}", file=sys.stderr)
|
||||
@@ -141,7 +151,7 @@ def process_file(input_path: Path, output_path: Path | None = None, quiet: bool
|
||||
|
||||
def save_transcript(text: str, url: str, prompt: str) -> Path:
|
||||
"""Save transcript with metadata to ~/.yts/transcripts/."""
|
||||
transcript_dir = ensure_transcript_dir()
|
||||
transcript_dir, _ = ensure_yts_dirs()
|
||||
filename = sanitize_filename(url) + ".txt"
|
||||
filepath = transcript_dir / filename
|
||||
|
||||
@@ -173,7 +183,7 @@ def cleanup_files(vtt_path: Optional[Path]):
|
||||
print(f"Warning: Could not clean up temporary files: {str(e)}", file=sys.stderr)
|
||||
|
||||
|
||||
def get_summary_from_claude(text: str, prompt: str = "Please summarize this transcript in 500 words or less") -> str:
|
||||
def get_summary_from_claude(text: str, duration_mins: int, prompt: str = None) -> str:
|
||||
"""Send text to Claude API for summarization."""
|
||||
try:
|
||||
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
||||
@@ -181,9 +191,15 @@ def get_summary_from_claude(text: str, prompt: str = "Please summarize this tran
|
||||
raise ValueError("ANTHROPIC_API_KEY environment variable not set")
|
||||
|
||||
client = anthropic.Anthropic()
|
||||
# Calculate target word count based on duration
|
||||
target_words = max(500, (duration_mins // 10) * 500)
|
||||
|
||||
if prompt is None:
|
||||
prompt = f"Please summarize this transcript in approximately {target_words} words"
|
||||
|
||||
message = client.messages.create(
|
||||
model="claude-3-sonnet-20240229",
|
||||
max_tokens=1024,
|
||||
max_tokens=2048, # Increased for longer summaries
|
||||
temperature=0,
|
||||
system="You are a helpful assistant that summarizes transcripts accurately and concisely.",
|
||||
messages=[{"role": "user", "content": f"{prompt}:\n\n{text}"}],
|
||||
@@ -203,7 +219,7 @@ def main():
|
||||
parser.add_argument("--keep-files", action="store_true", help="Don't delete downloaded VTT files")
|
||||
parser.add_argument("--transcript", action="store_true", help="Include full transcript in output")
|
||||
parser.add_argument(
|
||||
"--prompt", default="Please summarize this transcript in 500 words or less", help="Custom prompt for Claude"
|
||||
"--prompt", help="Custom prompt for Claude (default: auto-calculated based on video length)"
|
||||
)
|
||||
parser.add_argument("-y", "--yes", action="store_true", help="Skip cost confirmation")
|
||||
|
||||
@@ -212,7 +228,7 @@ def main():
|
||||
# Download subtitles
|
||||
if not args.quiet:
|
||||
print("Downloading subtitles...")
|
||||
success, vtt_path = download_subtitles(args.url, args.quiet)
|
||||
success, vtt_path, duration_mins = download_subtitles(args.url, args.quiet)
|
||||
if not success:
|
||||
cleanup_files(None)
|
||||
sys.exit(1)
|
||||
@@ -243,7 +259,7 @@ def main():
|
||||
# Get summary from Claude
|
||||
if not args.quiet:
|
||||
print("\nGetting summary from Claude...")
|
||||
summary = get_summary_from_claude(cleaned_text, args.prompt)
|
||||
summary = get_summary_from_claude(cleaned_text, duration_mins, args.prompt)
|
||||
|
||||
# Prepare output
|
||||
output = ""
|
||||
@@ -253,15 +269,20 @@ def main():
|
||||
output += "\n\n=== Summary ===\n\n"
|
||||
output += summary
|
||||
|
||||
# Output results
|
||||
output_path = args.output
|
||||
if output_path:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
# Save summary
|
||||
_, summary_dir = ensure_yts_dirs()
|
||||
summary_filename = sanitize_filename(args.url) + "_summary.txt"
|
||||
summary_path = summary_dir / summary_filename
|
||||
|
||||
with open(summary_path, "w", encoding="utf-8") as f:
|
||||
f.write(output)
|
||||
print(f"\nSummary saved to: {summary_path}")
|
||||
|
||||
# If output path specified, also save there
|
||||
if args.output:
|
||||
with open(args.output, "w", encoding="utf-8") as f:
|
||||
f.write(output)
|
||||
print(f"\nSummary saved to: {output_path}")
|
||||
else:
|
||||
print("\n=== Summary ===")
|
||||
print(output)
|
||||
print(f"Summary also saved to: {args.output}")
|
||||
|
||||
# Cleanup downloaded files unless --keep-files is specified
|
||||
if not args.keep_files:
|
||||
|
||||
Reference in New Issue
Block a user