separate source and content graph

2026-04-29 12:00:00 +00:00 · 2024-11-10 13:30:03 -03:00 · 2024-11-10 13:30:03 -03:00 · 2e2a4947b3
commit 2e2a4947b3
parent b42a95b35f
12 changed files with 167 additions and 56 deletions
--- a/open_notebook/graphs/content_processing/__init__.py
+++ b/open_notebook/graphs/content_processing/__init__.py
@@ -14,14 +14,14 @@ from open_notebook.graphs.content_processing.pdf import (
    SUPPORTED_FITZ_TYPES,
    extract_pdf,
 )
-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState
 from open_notebook.graphs.content_processing.text import extract_txt
 from open_notebook.graphs.content_processing.url import extract_url, url_provider
 from open_notebook.graphs.content_processing.video import extract_best_audio_from_video
 from open_notebook.graphs.content_processing.youtube import extract_youtube_transcript


-def source_identification(state: SourceState):
+def source_identification(state: ContentState):
    """
    Identify the content source based on parameters
    """
@@ -37,7 +37,7 @@ def source_identification(state: SourceState):
    return {"source_type": doc_type}


-def file_type(state: SourceState):
+def file_type(state: ContentState):
    """
    Identify the file using python-magic
    """
@@ -45,10 +45,11 @@ def file_type(state: SourceState):
    file_path = state.get("file_path")
    if file_path is not None:
        return_dict["identified_type"] = magic.from_file(file_path, mime=True)
+        return_dict["title"] = os.path.basename(file_path)
    return return_dict


-def file_type_edge(data: SourceState):
+def file_type_edge(data: ContentState):
    assert data.get("identified_type"), "Type not identified"
    identified_type = data["identified_type"]

@@ -68,7 +69,7 @@ def file_type_edge(data: SourceState):
        )


-def delete_file(data: SourceState):
+def delete_file(data: ContentState):
    if data.get("delete_source"):
        logger.debug(f"Deleting file: {data.get('file_path')}")
        file_path = data.get("file_path")
@@ -82,7 +83,7 @@ def delete_file(data: SourceState):
        logger.debug("Not deleting file")


-workflow = StateGraph(SourceState)
+workflow = StateGraph(ContentState)
 workflow.add_node("source", source_identification)
 workflow.add_node("url_provider", url_provider)
 workflow.add_node("file_type", file_type)
--- a/open_notebook/graphs/content_processing/audio.py
+++ b/open_notebook/graphs/content_processing/audio.py
@@ -5,7 +5,7 @@ from loguru import logger
 from pydub import AudioSegment

 from open_notebook.domain.models import model_manager
-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState

 # todo: remove reference to model_manager
 # future: parallelize the transcription process
@@ -72,7 +72,7 @@ def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
    return output_files


-def extract_audio(data: SourceState):
+def extract_audio(data: ContentState):
    SPEECH_TO_TEXT_MODEL = model_manager.speech_to_text

    input_audio_path = data.get("file_path")
--- a/open_notebook/graphs/content_processing/office.py
+++ b/open_notebook/graphs/content_processing/office.py
@@ -3,7 +3,7 @@ from loguru import logger
 from openpyxl import load_workbook
 from pptx import Presentation

-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState

 SUPPORTED_OFFICE_TYPES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -251,7 +251,7 @@ def get_xlsx_info(file_path):
        return None


-def extract_office_content(state: SourceState):
+def extract_office_content(state: ContentState):
    """Universal function to extract content from Office files"""
    assert state.get("file_path"), "No file path provided"
    assert (
--- a/open_notebook/graphs/content_processing/pdf.py
+++ b/open_notebook/graphs/content_processing/pdf.py
@@ -4,7 +4,7 @@ import unicodedata
 import fitz  # type: ignore
 from loguru import logger

-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState

 # todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
 # todo: what else can we do to make the text more readable?
@@ -127,7 +127,7 @@ def _extract_text_from_pdf(pdf_path):
        doc.close()


-def extract_pdf(state: SourceState):
+def extract_pdf(state: ContentState):
    """
    Parse the text file and print its content.
    """
--- a/open_notebook/graphs/content_processing/state.py
+++ b/open_notebook/graphs/content_processing/state.py
@@ -1,7 +1,7 @@
 from typing_extensions import TypedDict


-class SourceState(TypedDict):
+class ContentState(TypedDict):
    content: str
    file_path: str
    url: str
--- a/open_notebook/graphs/content_processing/text.py
+++ b/open_notebook/graphs/content_processing/text.py
@@ -1,9 +1,9 @@
 from loguru import logger

-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState


-def extract_txt(state: SourceState):
+def extract_txt(state: ContentState):
    """
    Parse the text file and print its content.
    """
--- a/open_notebook/graphs/content_processing/url.py
+++ b/open_notebook/graphs/content_processing/url.py
@@ -5,14 +5,14 @@ import requests  # type: ignore
 from bs4 import BeautifulSoup, Comment
 from loguru import logger

-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState

 # future: better extraction methods
 # https://github.com/buriy/python-readability
 # also try readability: from readability import Document


-def url_provider(state: SourceState):
+def url_provider(state: ContentState):
    """
    Identify the provider
    """
@@ -173,7 +173,7 @@ def extract_url_jina(url: str):
        return {"content": text}


-def extract_url(state: SourceState):
+def extract_url(state: ContentState):
    assert state.get("url"), "No URL provided"
    url = state["url"]
    try:
--- a/open_notebook/graphs/content_processing/video.py
+++ b/open_notebook/graphs/content_processing/video.py
@@ -4,7 +4,7 @@ import subprocess

 from loguru import logger

-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState


 def extract_audio_from_video(input_file, output_file, stream_index):
@@ -102,7 +102,7 @@ def select_best_audio_stream(streams):
    return max(scored_streams, key=lambda x: x[0])[1]


-def extract_best_audio_from_video(data: SourceState):
+def extract_best_audio_from_video(data: ContentState):
    """
    Main function to extract the best audio stream from a video file
    """
--- a/open_notebook/graphs/content_processing/youtube.py
+++ b/open_notebook/graphs/content_processing/youtube.py
@@ -9,7 +9,7 @@ from youtube_transcript_api.formatters import TextFormatter  # type: ignore

 from open_notebook.config import CONFIG
 from open_notebook.exceptions import NoTranscriptFound
-from open_notebook.graphs.content_processing.state import SourceState
+from open_notebook.graphs.content_processing.state import ContentState

 ssl._create_default_https_context = ssl._create_unverified_context

@@ -129,7 +129,7 @@ def get_best_transcript(video_id, preferred_langs=["en", "es", "pt"]):
        return None


-def extract_youtube_transcript(state: SourceState):
+def extract_youtube_transcript(state: ContentState):
    """
    Parse the text file and print its content.
    """