# Provenance: open_notebook/graphs/content_processing/url.py, added by commit
# ab9824ebedac9db781ecaa88d31d01dc887f204a
# ("[PATCH] default to bs4 for url processing", LUIS NOVO,
# Mon, 28 Oct 2024 16:32:38 -0300).

import re
from urllib.parse import urlparse

import requests  # type: ignore
from bs4 import BeautifulSoup, Comment
from loguru import logger

from open_notebook.graphs.content_processing.state import SourceState

# future: better extraction methods
# https://github.com/buriy/python-readability
# also try readability: from readability import Document

# Seconds to wait for a remote server before giving up; without a timeout
# `requests` may block forever on an unresponsive host.
_BS4_TIMEOUT = 10
_JINA_TIMEOUT = 30

# Lower-cased prefixes that mark the "url" argument as inline HTML content.
_HTML_PREFIXES = ("<!doctype html", "<html")

# Elements that never carry article text.
_UNWANTED_TAGS = ["script", "style", "nav", "footer", "iframe", "noscript"]

_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)


def url_provider(state: SourceState):
    """
    Identify the provider of the URL held in ``state["url"]``.

    Returns a dict with ``identified_type`` set to ``"youtube"`` when the URL
    mentions ``youtube.com`` or ``youtu.be`` and to ``"article"`` for any other
    non-empty URL. An empty dict is returned when the state has no URL.
    """
    url = state.get("url")
    if not url:
        return {}
    if "youtube.com" in url or "youtu.be" in url:
        # future: playlists, channels in the future
        return {"identified_type": "youtube"}
    # future: article providers in the future
    return {"identified_type": "article"}


def _looks_like_html(value: str) -> bool:
    """Return True when *value* is an HTML document rather than a URL."""
    return value.lstrip().lower().startswith(_HTML_PREFIXES)


def _find_title(soup):
    """Return the best available page title, or None when there is none."""
    for attrs in ({"property": "og:title"}, {"name": "twitter:title"}):
        meta = soup.find("meta", attrs=attrs)
        if meta is not None and meta.get("content"):
            return meta["content"].strip()
    for tag_name in ("title", "h1"):
        tag = soup.find(tag_name)
        if tag is not None:
            text = tag.get_text(strip=True)
            if text:
                return text
    return None


def _find_content(soup) -> str:
    """Return the readable text of the page, one non-empty line per row."""
    container = soup.find("article") or soup.find("main") or soup.body or soup
    raw_text = container.get_text(separator="\n")
    # Collapse runs of horizontal whitespace, then drop lines left empty.
    lines = (re.sub(r"[ \t\r\f\v]+", " ", line).strip() for line in raw_text.splitlines())
    return "\n".join(line for line in lines if line)


def extract_url_bs4(url: str):
    """
    Get the title and content of a URL using bs4

    ``url`` may also be a full HTML document (starting with a doctype or an
    ``<html`` tag); in that case nothing is downloaded and the ``domain`` and
    ``url`` fields of the result are None.

    Returns a dict with the keys ``title``, ``content``, ``domain`` and
    ``url``, or None when the page cannot be fetched or parsed (the failure
    is logged).
    """
    try:
        headers = {"User-Agent": _USER_AGENT}

        # If URL is actually HTML content
        is_html = _looks_like_html(url)
        if is_html:
            html = url
        else:
            response = requests.get(url, headers=headers, timeout=_BS4_TIMEOUT)
            response.raise_for_status()
            html = response.text

        soup = BeautifulSoup(html, "html.parser")

        # Read the title before pruning, since pruning only removes noise
        # elements but keeps the logic independent of the tag list above.
        title = _find_title(soup)

        for element in soup.find_all(_UNWANTED_TAGS):
            element.decompose()
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        content = _find_content(soup)

        return {
            "title": title,
            "content": content,
            "domain": None if is_html else urlparse(url).netloc,
            "url": None if is_html else url,
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to fetch URL {url}: {e}")
        return None
    except Exception as e:
        logger.error(f"Failed to process content: {e}")
        return None


def extract_url_jina(url: str):
    """
    Get the content of a URL using Jina

    The Jina reader endpoint (``https://r.jina.ai/<url>``) answers with plain
    text. When the first line starts with ``Title:`` it is split off and
    returned as ``title``; otherwise the whole body is returned as ``content``.

    Raises ``requests.exceptions.RequestException`` when the request fails,
    times out, or the service answers with an HTTP error status.
    """
    response = requests.get(f"https://r.jina.ai/{url}", timeout=_JINA_TIMEOUT)
    # Do not hand an HTTP error page back to the caller as article content.
    response.raise_for_status()
    text = response.text

    if text.startswith("Title:") and "\n" in text:
        first_line, _, remainder = text.partition("\n")
        title = first_line[len("Title:"):].strip()
        content = remainder.strip()
        logger.debug(
            f"Processed url: {url}, found title: {title}, content: {content[:100]}..."
        )
        return {"title": title, "content": content}

    logger.debug(
        f"Processed url: {url}, does not have Title prefix, returning full content: {text[:100]}..."
    )
    return {"content": text}


def extract_url(state: SourceState):
    """
    Extract title/content for ``state["url"]``.

    BeautifulSoup extraction is tried first; when it returns nothing or an
    empty ``content``, the Jina reader is used as a fallback.

    Returns the extractor's result dict, or None when extraction raised (the
    error is logged). Raises ``AssertionError`` when the state has no URL.
    """
    url = state.get("url")
    if not url:
        # Raised explicitly (rather than via `assert`) so the check survives
        # `python -O` while callers still see the same exception type.
        raise AssertionError("No URL provided")

    try:
        result = extract_url_bs4(url)
        if not result or not result.get("content"):
            logger.debug(
                f"BS4 extraction failed for url {url}, falling back to Jina extractor"
            )
            result = extract_url_jina(url)
        return result
    except Exception as e:
        logger.error(f"URL extraction failed for URL: {url}")
        logger.exception(e)
        return None