mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-04-29 12:00:00 +00:00
improve yt transcript and title
This commit is contained in:
parent
399f175b0c
commit
3f997aa22c
2 changed files with 161 additions and 0 deletions
155
open_notebook/graphs/content_processing/youtube.py
Normal file
155
open_notebook/graphs/content_processing/youtube.py
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
import re
|
||||
import ssl
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from loguru import logger
|
||||
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
||||
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
||||
|
||||
from open_notebook.config import CONFIG
|
||||
from open_notebook.exceptions import NoTranscriptFound
|
||||
from open_notebook.graphs.content_processing.state import SourceState
|
||||
|
||||
# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so any code relying on that default skips certificate
# validation for the whole process, not just this module. Presumably a
# workaround for certificate errors while fetching YouTube data — confirm it
# is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context
|
||||
|
||||
|
||||
def get_video_title(video_id):
    """
    Fetch the title of a YouTube video by scraping its watch page.

    The title is read from the ``og:title`` meta tag of the page.

    Args:
        video_id (str): The YouTube video ID.

    Returns:
        str: The video title, or None if the page could not be fetched or
        does not carry an ``og:title`` tag. Failures are logged, not raised.
    """
    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        # Without an explicit timeout requests waits forever on a stalled
        # connection, which would block the whole processing step.
        response = requests.get(url, timeout=10)
        # Do not try to scrape a title out of an HTTP error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # YouTube stores title in a meta tag
        tag = soup.find("meta", property="og:title")
        if tag is None or not tag.get("content"):
            logger.error(
                f"Failed to get video title: no og:title meta tag for video {video_id}"
            )
            return None
        return tag["content"]

    except Exception as e:
        # Best-effort lookup: the title is optional, so never propagate.
        logger.error(f"Failed to get video title: {e}")
        return None
|
||||
|
||||
|
||||
def _extract_youtube_id(url):
    """
    Extract the YouTube video ID from a given URL using regular expressions.

    Recognised forms are ``youtu.be/<id>`` and, on ``youtube.com``,
    ``/embed/<id>``, ``/v/<id>``, ``/shorts/<id>``, ``/live/<id>`` and
    ``/watch`` URLs carrying a ``v=<id>`` query parameter.

    Args:
        url (str): The YouTube URL from which to extract the video ID.

    Returns:
        str: The extracted YouTube video ID or None if no valid ID is found
        (including when ``url`` is None or not a string).
    """
    # re.search raises TypeError on None / non-string input; the documented
    # contract for "nothing to extract" is a None result instead.
    if not isinstance(url, str):
        return None

    # Define a regular expression pattern to capture the YouTube video ID
    youtube_regex = (
        r"(?:https?://)?"  # Optional scheme
        r"(?:www\.)?"  # Optional www.
        r"(?:"
        r"youtu\.be/"  # Shortened URL
        r"|youtube\.com"  # Main URL
        r"(?:"  # Group start
        r"/embed/"  # Embed URL
        r"|/v/"  # Older video URL
        r"|/shorts/"  # Shorts URL
        r"|/live/"  # Live stream URL
        r"|/watch\?v="  # Standard watch URL
        r"|/watch\?.+&v="  # Other watch URL
        r")"  # Group end
        r")"  # End main group
        r"([\w-]{11})"  # 11 characters (YouTube video ID)
    )

    # Search the URL for the pattern
    match = re.search(youtube_regex, url)

    # Return the video ID if a match is found
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def _fetch_preferred(candidates, preferred_langs):
    """Fetch the first candidate whose language matches, honouring preference order; None if none match."""
    for lang in preferred_langs:
        for transcript in candidates:
            if transcript.language_code == lang:
                return transcript.fetch()
    return None


def get_best_transcript(video_id, preferred_langs=("en", "es", "pt")):
    """
    Fetch the most suitable transcript available for a YouTube video.

    Candidates are tried in three passes, each honouring the order of
    ``preferred_langs``:

    1. manually created transcripts,
    2. auto-generated transcripts,
    3. translatable transcripts; when none is in a preferred language the
       first one is translated to the first preferred language.

    In passes 1 and 2, when no candidate is in a preferred language the
    first candidate of that pass is used as-is.

    Args:
        video_id (str): The YouTube video ID.
        preferred_langs (Sequence[str]): Language codes in priority order.

    Returns:
        The fetched transcript as returned by the transcript object's
        ``fetch()``, or None if nothing suitable was found or any error
        occurred. Failures are logged, not raised.
    """
    try:
        # Normalise once so an empty or None preference list is handled
        # uniformly below instead of failing on preferred_langs[0].
        preferred_langs = tuple(preferred_langs or ())

        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        available = list(transcript_list)

        # NOTE(review): passes 1 and 2 only consider transcripts that are NOT
        # translatable, as the original code did; translatable ones are left
        # to pass 3 regardless of whether they are manual or generated.
        # Confirm this split is intended.
        manual_transcripts = [
            t for t in available if not t.is_generated and not t.is_translatable
        ]
        generated_transcripts = [
            t for t in available if t.is_generated and not t.is_translatable
        ]
        translated_transcripts = [t for t in available if t.is_translatable]

        # First try: manual transcripts; second try: auto-generated ones.
        for candidates in (manual_transcripts, generated_transcripts):
            if not candidates:
                continue
            try:
                fetched = _fetch_preferred(candidates, preferred_langs)
                if fetched is not None:
                    return fetched
                # If no preferred language found, return the first candidate
                return candidates[0].fetch()
            # NOTE(review): NoTranscriptFound is the class imported from
            # open_notebook.exceptions — verify the transcript library can
            # actually raise it, otherwise this handler never fires.
            except NoTranscriptFound:
                continue

        # Last try: translatable transcripts in preferred languages
        if translated_transcripts:
            try:
                fetched = _fetch_preferred(translated_transcripts, preferred_langs)
                if fetched is not None:
                    return fetched
                if not preferred_langs:
                    # Nothing to translate to: fall back to the original
                    # language, mirroring the fallback of the earlier passes.
                    return translated_transcripts[0].fetch()
                # If no preferred language found, return translation to first preferred language
                translation = translated_transcripts[0].translate(preferred_langs[0])
                return translation.fetch()
            except NoTranscriptFound:
                pass

        logger.error(
            f"Failed to get transcript for video {video_id}: No suitable transcript found"
        )
        return None

    except Exception as e:
        # Best-effort lookup: callers receive None rather than an exception.
        logger.error(f"Failed to get transcript for video {video_id}: {e}")
        return None
|
||||
|
||||
|
||||
def extract_youtube_transcript(state: SourceState):
    """
    Fetch the transcript text and title for the YouTube URL held in the state.

    Preferred transcript languages are read from
    ``CONFIG["youtube_transcripts"]["preferred_languages"]`` and default to
    English, Spanish and Portuguese.

    Args:
        state (SourceState): Processing state; its ``"url"`` entry is read.

    Returns:
        dict: ``"content"`` holds the transcript formatted as plain text, or
        None when no video ID or transcript could be obtained; ``"title"``
        holds the video title, or None when it could not be retrieved.
    """

    languages = CONFIG.get("youtube_transcripts", {}).get(
        "preferred_languages", ["en", "es", "pt"]
    )

    url = state.get("url")
    # Guard the missing-URL case here so the ID extraction is never handed None.
    video_id = _extract_youtube_id(url) if url else None
    if video_id is None:
        # Without an ID both lookups below would only issue pointless requests.
        logger.error(f"Could not extract a YouTube video ID from url: {url}")
        return {"content": None, "title": None}

    transcript = get_best_transcript(video_id, languages)

    logger.debug(f"Found transcript: {transcript}")
    try:
        title = get_video_title(video_id)
    except Exception as e:
        logger.critical(f"Failed to get video title for video_id: {video_id}")
        logger.exception(e)
        title = None

    if transcript is None:
        # get_best_transcript signals failure with None; formatting None would
        # raise an opaque TypeError, so report the absence explicitly.
        logger.error(f"No transcript available for video_id: {video_id}")
        content = None
    else:
        formatter = TextFormatter()
        content = formatter.format_transcript(transcript)

    return {
        "content": content,
        "title": title,
    }
|
||||
Loading…
Add table
Add a link
Reference in a new issue