import re
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup, Comment
from loguru import logger

from open_notebook.graphs.content_processing.state import ContentState

# future: better extraction methods
# https://github.com/buriy/python-readability
# also try readability: from readability import Document


def url_provider(state: ContentState):
    """
    Identify the provider for the URL held in *state*.

    Returns a dict with ``identified_type`` set to ``"youtube"`` for
    youtube.com / youtu.be links and ``"article"`` for any other URL.
    Returns an empty dict when *state* carries no URL.
    """
    return_dict = {}
    url = state.get("url")
    if url:
        if "youtube.com" in url or "youtu.be" in url:
            return_dict["identified_type"] = (
                "youtube"  # future: playlists, channels in the future
            )
        else:
            return_dict["identified_type"] = (
                "article"  # future: article providers in the future
            )
    return return_dict


async def extract_url_bs4(url: str):
    """
    Get the title and content of a URL using bs4.

    Accepts either a real URL (fetched with aiohttp) or raw HTML content
    passed directly (a string starting with ``<html``).

    Returns a dict with ``title``, ``content`` and ``url`` keys, or ``None``
    when the fetch or the parse fails.

    NOTE(review): the original source for this function was garbled
    (the fetch/parse body was missing and the ``startswith`` literals were
    stripped to empty strings); the logic below is a reconstruction —
    verify against the upstream implementation.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # If URL is actually HTML content, parse it directly; otherwise fetch it.
        if url.startswith("<html"):
            html = url
        else:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers) as response:
                    response.raise_for_status()
                    html = await response.text()

        soup = BeautifulSoup(html, "html.parser")

        # Strip non-content elements and HTML comments before extracting text.
        for element in soup(["script", "style", "noscript"]):
            element.decompose()
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        title = soup.title.get_text(strip=True) if soup.title else None
        # Collapse runs of blank lines left behind by block elements.
        content = re.sub(r"\n{3,}", "\n\n", soup.get_text(separator="\n", strip=True))

        return {
            "title": title,
            "content": content,
            # Only report a source URL when the input really was a URL,
            # not inline HTML.
            "url": url if not url.startswith("<html") else None,
        }
    except aiohttp.ClientError as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"Failed to process content: {e}")
        return None


async def extract_url_jina(url: str):
    """
    Get the content of a URL using Jina (the r.jina.ai reader endpoint).

    Jina responses usually begin with a ``Title: ...`` line; when present it
    is split off into the ``title`` key, otherwise the full text is returned
    under ``content`` alone.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(f"https://r.jina.ai/{url}") as response:
            text = await response.text()
            if text.startswith("Title:") and "\n" in text:
                title_end = text.index("\n")
                # len("Title:") == 6 — everything after the prefix is the title.
                title = text[6:title_end].strip()
                content = text[title_end + 1 :].strip()
                logger.debug(
                    f"Processed url: {url}, found title: {title}, content: {content[:100]}..."
                )
                return {"title": title, "content": content}
            else:
                logger.debug(
                    f"Processed url: {url}, does not have Title prefix, returning full content: {text[:100]}..."
                )
                return {"content": text}


async def extract_url(state: ContentState):
    """
    Extract title/content for the URL in *state*.

    Tries the local bs4 extractor first and falls back to the Jina reader
    when bs4 yields no content. Returns the extractor's dict, or ``None``
    when extraction fails entirely.
    """
    # NOTE(review): assert is stripped under -O; kept for interface
    # compatibility (callers may rely on AssertionError).
    assert state.get("url"), "No URL provided"
    url = state["url"]
    try:
        result = await extract_url_bs4(url)
        if not result or not result.get("content"):
            logger.debug(
                f"BS4 extraction failed for url {url}, falling back to Jina extractor"
            )
            result = await extract_url_jina(url)
        return result
    except Exception as e:
        logger.error(f"URL extraction failed for URL: {url}")
        logger.exception(e)
        return None