From 1586a0bd780e3605cca71a26bb597117ca797ddf Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Wed, 7 May 2025 22:04:57 -0700 Subject: [PATCH] chore: Added direct handling for markdown files. - Fixed podcast imports. --- .gitignore | 2 +- .../app/routes/documents_routes.py | 88 ++++++++++++------- surfsense_backend/app/schemas/chats.py | 5 +- surfsense_backend/app/schemas/chunks.py | 5 +- surfsense_backend/app/schemas/documents.py | 5 +- surfsense_backend/app/schemas/podcasts.py | 5 +- .../app/schemas/search_source_connector.py | 5 +- surfsense_backend/app/schemas/search_space.py | 5 +- .../app/tasks/background_tasks.py | 46 ++++++++++ surfsense_backend/app/tasks/podcast_tasks.py | 11 ++- 10 files changed, 118 insertions(+), 59 deletions(-) diff --git a/.gitignore b/.gitignore index b67a7dd..1a7f226 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ .flashrank_cache* -podcasts/* +podcasts/ diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index aa42476..bbefbcd 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -6,7 +6,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead from app.users import current_active_user from app.utils.check_ownership import check_ownership -from app.tasks.background_tasks import add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document +from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document # Force asyncio to use standard event loop before unstructured imports import asyncio try: @@ -15,9 +15,8 @@ except RuntimeError: pass import os os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1" -from langchain_unstructured import UnstructuredLoader -from app.config import config -import json + + router = APIRouter() @@ -132,36 +131,57 @@ async def process_file_in_background( session: AsyncSession ): try: - # Use synchronous unstructured API to avoid event loop issues - from langchain_community.document_loaders import UnstructuredFileLoader - - # Process the file - loader = UnstructuredFileLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - - docs = loader.load() - - # Clean up the temp file - import os - try: - os.unlink(file_path) - except: - pass - - # Pass the documents to the existing background task - await add_received_file_document( - session, - filename, - docs, - search_space_id - ) + # Check if the file is a markdown file + if filename.lower().endswith(('.md', '.markdown')): + # For markdown files, read the content directly + with open(file_path, 'r', encoding='utf-8') as f: + markdown_content = f.read() + + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass + + # Process markdown directly through specialized function + await add_received_markdown_file_document( + session, + filename, + markdown_content, + search_space_id + ) + else: + # Use synchronous unstructured API to avoid event loop issues + from langchain_unstructured import UnstructuredLoader + + # Process the file + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + + docs = await loader.aload() + + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass + + # Pass the documents to the existing background task + await add_received_file_document( + session, + filename, + docs, + search_space_id + ) except Exception as e: import logging logging.error(f"Error processing file in background: {str(e)}") diff --git a/surfsense_backend/app/schemas/chats.py b/surfsense_backend/app/schemas/chats.py index f5eefc5..3d73ab2 100644 --- a/surfsense_backend/app/schemas/chats.py +++ b/surfsense_backend/app/schemas/chats.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List, Optional from app.db import ChatType -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from .base import IDModel, TimestampModel @@ -44,5 +44,4 @@ class ChatUpdate(ChatBase): pass class ChatRead(ChatBase, IDModel, TimestampModel): - class Config: - from_attributes = True \ No newline at end of file + model_config = ConfigDict(from_attributes=True) \ No newline at end of file diff --git a/surfsense_backend/app/schemas/chunks.py b/surfsense_backend/app/schemas/chunks.py index 4230981..de0764f 100644 --- a/surfsense_backend/app/schemas/chunks.py +++ b/surfsense_backend/app/schemas/chunks.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from .base import IDModel, TimestampModel class ChunkBase(BaseModel): @@ -12,5 +12,4 @@ class ChunkUpdate(ChunkBase): pass class ChunkRead(ChunkBase, IDModel, TimestampModel): - class Config: - from_attributes = True \ No newline at end of file + model_config = ConfigDict(from_attributes=True) \ No newline at end of file diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index dcd7187..6a99fcd 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -1,5 +1,5 @@ from typing import List, Any -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from sqlalchemy import JSON from .base import IDModel, TimestampModel from app.db import DocumentType @@ -37,6 +37,5 @@ class DocumentRead(BaseModel): created_at: datetime search_space_id: int - class Config: - from_attributes = True + model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/schemas/podcasts.py b/surfsense_backend/app/schemas/podcasts.py index 4132fb2..0356dd0 100644 --- a/surfsense_backend/app/schemas/podcasts.py +++ b/surfsense_backend/app/schemas/podcasts.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from typing import Any, List, Literal from .base import IDModel, TimestampModel @@ -15,8 +15,7 @@ class PodcastUpdate(PodcastBase): pass class PodcastRead(PodcastBase, IDModel, TimestampModel): - class Config: - from_attributes = True + model_config = ConfigDict(from_attributes=True) class PodcastGenerateRequest(BaseModel): type: Literal["DOCUMENT", "CHAT"] diff --git a/surfsense_backend/app/schemas/search_source_connector.py b/surfsense_backend/app/schemas/search_source_connector.py index cb7152e..b136757 100644 --- a/surfsense_backend/app/schemas/search_source_connector.py +++ b/surfsense_backend/app/schemas/search_source_connector.py @@ -1,7 +1,7 @@ from datetime import datetime import uuid from typing import Dict, Any, Optional -from pydantic import BaseModel, field_validator +from pydantic import BaseModel, field_validator, ConfigDict from .base import IDModel, TimestampModel from app.db import SearchSourceConnectorType @@ -106,5 +106,4 @@ class SearchSourceConnectorUpdate(BaseModel): class SearchSourceConnectorRead(SearchSourceConnectorBase, IDModel, TimestampModel): user_id: uuid.UUID - class Config: - from_attributes = True + model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/schemas/search_space.py b/surfsense_backend/app/schemas/search_space.py index feebcf1..2c99c45 100644 --- a/surfsense_backend/app/schemas/search_space.py +++ b/surfsense_backend/app/schemas/search_space.py @@ -1,7 +1,7 @@ from datetime import datetime import uuid from typing import Optional -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from .base import IDModel, TimestampModel class SearchSpaceBase(BaseModel): @@ -19,5 +19,4 @@ class SearchSpaceRead(SearchSpaceBase, IDModel, TimestampModel): created_at: datetime user_id: uuid.UUID - class Config: - from_attributes = True \ No newline at end of file + model_config = ConfigDict(from_attributes=True) \ No newline at end of file diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py index 68b56c4..099391f 100644 --- a/surfsense_backend/app/tasks/background_tasks.py +++ b/surfsense_backend/app/tasks/background_tasks.py @@ -194,6 +194,52 @@ async def add_extension_received_document( await session.rollback() raise RuntimeError(f"Failed to process extension document: {str(e)}") +async def add_received_markdown_file_document( + session: AsyncSession, + file_name: str, + file_in_markdown: str, + search_space_id: int +) -> Optional[Document]: + try: + + # Generate summary + summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance + summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed( + summary_content) + + # Process chunks + chunks = [ + Chunk(content=chunk.text, embedding=config.embedding_model_instance.embed(chunk.text)) + for chunk in config.chunker_instance.chunk(file_in_markdown) + ] + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "SAVED_AT": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError(f"Failed to process file document: {str(e)}") async def add_received_file_document( session: AsyncSession, diff --git a/surfsense_backend/app/tasks/podcast_tasks.py b/surfsense_backend/app/tasks/podcast_tasks.py index e148f54..12364e7 100644 --- a/surfsense_backend/app/tasks/podcast_tasks.py +++ b/surfsense_backend/app/tasks/podcast_tasks.py @@ -1,10 +1,9 @@ -from sqlalchemy.ext.asyncio import AsyncSession -from app.schemas import PodcastGenerateRequest -from typing import List -from sqlalchemy import select -from app.db import Chat, Podcast + from app.agents.podcaster.graph import graph as podcaster_graph -from surfsense_backend.app.agents.podcaster.state import State +from app.agents.podcaster.state import State +from app.db import Chat, Podcast +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession async def generate_document_podcast(