From 73751c0eb13243f5d1ac1a9efcf7177acba10b6a Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Fri, 30 May 2025 19:17:19 -0700
Subject: [PATCH] feat: Removed Hard Dependency on Unstructured.io - Added LlamaParse Support :)

---
 README.md                                 |  36 +++--
 surfsense_backend/.env.example            |   6 +-
 surfsense_backend/app/config/__init__.py  |  13 +-
 .../app/routes/documents_routes.py        |  97 +++++++++----
 .../app/tasks/background_tasks.py         |  79 ++++++++++-
 surfsense_backend/pyproject.toml          |   1 +
 surfsense_backend/uv.lock                 | 128 +++++++++++++++++-
 surfsense_web/.env.example                |   3 +-
 .../documents/upload/page.tsx             | 113 ++++++++++++----
 .../content/docs/docker-installation.mdx  |   5 +-
 .../content/docs/manual-installation.mdx  |   5 +-
 11 files changed, 402 insertions(+), 84 deletions(-)

diff --git a/README.md b/README.md
index 8e0eae5..00cf5ad 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ https://github.com/user-attachments/assets/bf64a6ca-934b-47ac-9e1b-edac5fe972ec
 ### 💡 **Idea**: Have your own highly customizable private NotebookLM and Perplexity integrated with external sources.
 ### 📁 **Multiple File Format Uploading Support**
-Save content from your own personal files *(Documents, images, videos and supports **34 file extensions**)* to your own personal knowledge base .
+Save content from your own personal files *(documents, images, videos; **50+ file extensions** supported)* to your own personal knowledge base.
 ### 🔍 **Powerful Search**
 Quickly research or find anything in your saved content .
 ### 💬 **Chat with your Saved Content**
@@ -66,35 +66,33 @@ Open source and easy to deploy locally.
 ### 📄 **Supported File Extensions**
-#### Document
+> **Note**: File format support depends on your ETL service configuration. LlamaCloud supports 100+ formats, while Unstructured supports 34+ core formats.
-`.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml` +#### Documents & Text +**LlamaCloud**: `.pdf`, `.doc`, `.docx`, `.docm`, `.dot`, `.dotm`, `.rtf`, `.txt`, `.xml`, `.epub`, `.odt`, `.wpd`, `.pages`, `.key`, `.numbers`, `.602`, `.abw`, `.cgm`, `.cwk`, `.hwp`, `.lwp`, `.mw`, `.mcw`, `.pbd`, `.sda`, `.sdd`, `.sdp`, `.sdw`, `.sgl`, `.sti`, `.sxi`, `.sxw`, `.stw`, `.sxg`, `.uof`, `.uop`, `.uot`, `.vor`, `.wps`, `.zabw` -#### Text & Markup +**Unstructured**: `.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml`, `.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org`, `.epub` -`.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org` +#### Presentations +**LlamaCloud**: `.ppt`, `.pptx`, `.pptm`, `.pot`, `.potm`, `.potx`, `.odp`, `.key` -#### Spreadsheets & Tables +**Unstructured**: `.ppt`, `.pptx` -`.xls`, `.xlsx`, `.csv`, `.tsv` +#### Spreadsheets & Data +**LlamaCloud**: `.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.xlw`, `.csv`, `.tsv`, `.ods`, `.fods`, `.numbers`, `.dbf`, `.123`, `.dif`, `.sylk`, `.slk`, `.prn`, `.et`, `.uos1`, `.uos2`, `.wk1`, `.wk2`, `.wk3`, `.wk4`, `.wks`, `.wq1`, `.wq2`, `.wb1`, `.wb2`, `.wb3`, `.qpw`, `.xlr`, `.eth` -#### Audio & Video - -`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm` +**Unstructured**: `.xls`, `.xlsx`, `.csv`, `.tsv` #### Images +**LlamaCloud**: `.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.svg`, `.tiff`, `.webp`, `.html`, `.htm`, `.web` -`.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic` - -#### Email & eBooks - -`.eml`, `.msg`, `.epub` - -#### PowerPoint Presentations & Other - -`.ppt`, `.pptx`, `.p7s` +**Unstructured**: `.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic` +#### Audio & Video *(Always Supported)* +`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm` +#### Email & Communication +**Unstructured**: `.eml`, `.msg`, `.p7s` ### 🔖 Cross Browser Extension - The SurfSense extension can be used to save any webpage you like. 
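The backend changes that follow wire up the new `ETL_SERVICE` switch: the config module reads the variable and loads the matching API key, and the document upload route branches between the Unstructured loader and LlamaParse. Below is a minimal sketch of that flow, condensed from the route handler changes in this patch; temp-file cleanup, database sessions, and error handling are omitted, and the helper name is illustrative rather than part of the codebase.

```python
import os


async def parse_file_to_markdown(file_path: str) -> list[str]:
    """Illustrative sketch of the ETL branching this patch introduces (simplified)."""
    etl_service = os.getenv("ETL_SERVICE", "UNSTRUCTURED")

    if etl_service == "UNSTRUCTURED":
        from langchain_unstructured import UnstructuredLoader

        # The patch uses the Unstructured loader in "elements" mode.
        loader = UnstructuredLoader(file_path, mode="elements", strategy="auto")
        elements = await loader.aload()
        # In the patch, these elements go to add_received_file_document_using_unstructured();
        # here we just return their text for illustration.
        return [element.page_content for element in elements]

    elif etl_service == "LLAMACLOUD":
        from llama_cloud_services import LlamaParse
        from llama_cloud_services.parse.utils import ResultType

        # The patch asks LlamaParse for markdown output.
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
            result_type=ResultType.MD,
        )
        result = await parser.aparse(file_path)
        docs = await result.aget_markdown_documents(split_by_page=False)
        # In the patch, each markdown document goes to add_received_file_document_using_llamacloud().
        return [doc.text for doc in docs]

    raise ValueError(f"Unknown ETL_SERVICE: {etl_service}")
```

Either branch hands its output to a background task that summarizes, chunks, embeds, and stores the document.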
diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index f9c43d1..c0032a9 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -30,9 +30,13 @@ STT_SERVICE="openai/whisper-1" OPENAI_API_KEY="sk-proj-iA" GEMINI_API_KEY="AIzaSyB6-1641124124124124124124124124124" -UNSTRUCTURED_API_KEY="Tpu3P0U8iy" FIRECRAWL_API_KEY="fcr-01J0000000000000000000000" +#File Parser Service +ETL_SERVICE="UNSTRUCTURED" or "LLAMACLOUD" +UNSTRUCTURED_API_KEY="Tpu3P0U8iy" +LLAMA_CLOUD_API_KEY="llx-nnn" + #OPTIONAL: Add these for LangSmith Observability LANGSMITH_TRACING=true LANGSMITH_ENDPOINT="https://api.smith.langchain.com" diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 81cd9a2..9135c32 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -96,9 +96,18 @@ class Config: # OAuth JWT SECRET_KEY = os.getenv("SECRET_KEY") - # Unstructured API Key - UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") + # ETL Service + ETL_SERVICE = os.getenv("ETL_SERVICE") + if ETL_SERVICE == "UNSTRUCTURED": + # Unstructured API Key + UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") + + elif ETL_SERVICE == "LLAMACLOUD": + # LlamaCloud API Key + LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") + + # Firecrawl API Key FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 5ea2327..acd246e 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead from app.users import current_active_user from app.utils.check_ownership import check_ownership -from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document +from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud from app.config import config as app_config # Force asyncio to use standard event loop before unstructured imports import asyncio @@ -101,8 +101,7 @@ async def create_documents( content = await file.read() with open(temp_path, "wb") as f: f.write(content) - - # Process in background to avoid uvloop conflicts + fastapi_background_tasks.add_task( process_file_in_background_with_new_session, temp_path, @@ -191,36 +190,74 @@ async def process_file_in_background( search_space_id ) else: - # Use synchronous unstructured API to avoid event loop issues - from langchain_unstructured import UnstructuredLoader + if app_config.ETL_SERVICE == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + + # Process the file + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) - # Process the file - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) + docs = await loader.aload() - docs = 
await loader.aload() + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass - # Clean up the temp file - import os - try: - os.unlink(file_path) - except: - pass + # Pass the documents to the existing background task + await add_received_file_document_using_unstructured( + session, + filename, + docs, + search_space_id + ) + elif app_config.ETL_SERVICE == "LLAMACLOUD": + from llama_cloud_services import LlamaParse + from llama_cloud_services.parse.utils import ResultType - # Pass the documents to the existing background task - await add_received_file_document( - session, - filename, - docs, - search_space_id - ) + + # Create LlamaParse parser instance + parser = LlamaParse( + api_key=app_config.LLAMA_CLOUD_API_KEY, + num_workers=1, # Use single worker for file processing + verbose=True, + language="en", + result_type=ResultType.MD + ) + + # Parse the file asynchronously + result = await parser.aparse(file_path) + + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass + + # Get markdown documents from the result + markdown_documents = await result.aget_markdown_documents(split_by_page=False) + + for doc in markdown_documents: + # Extract text content from the markdown documents + markdown_content = doc.text + + # Process the documents using our LlamaCloud background task + await add_received_file_document_using_llamacloud( + session, + filename, + llamacloud_markdown_document=markdown_content, + search_space_id=search_space_id + ) except Exception as e: import logging logging.error(f"Error processing file in background: {str(e)}") @@ -442,3 +479,5 @@ async def process_youtube_video_with_new_session( except Exception as e: import logging logging.error(f"Error processing YouTube video: {str(e)}") + + diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py index 1c6cd6d..f6b1eb2 100644 --- a/surfsense_backend/app/tasks/background_tasks.py +++ b/surfsense_backend/app/tasks/background_tasks.py @@ -289,7 +289,7 @@ async def add_received_markdown_file_document( raise RuntimeError(f"Failed to process file document: {str(e)}") -async def add_received_file_document( +async def add_received_file_document_using_unstructured( session: AsyncSession, file_name: str, unstructured_processed_elements: List[LangChainDocument], @@ -357,6 +357,83 @@ async def add_received_file_document( raise RuntimeError(f"Failed to process file document: {str(e)}") +async def add_received_file_document_using_llamacloud( + session: AsyncSession, + file_name: str, + llamacloud_markdown_document: str, + search_space_id: int, +) -> Optional[Document]: + """ + Process and store document content parsed by LlamaCloud. + + Args: + session: Database session + file_name: Name of the processed file + llamacloud_markdown_documents: List of markdown content from LlamaCloud parsing + search_space_id: ID of the search space + + Returns: + Document object if successful, None if failed + """ + try: + # Combine all markdown documents into one + file_in_markdown = llamacloud_markdown_document + + content_hash = generate_content_hash(file_in_markdown) + + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. 
Skipping processing.") + return existing_document + + # Generate summary + summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance + summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed(summary_content) + + # Process chunks + chunks = [ + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(file_in_markdown) + ] + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "ETL_SERVICE": "LLAMACLOUD", + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}") + + async def add_youtube_video_document( session: AsyncSession, url: str, search_space_id: int ): diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 1e5345b..dfa7559 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "langgraph>=0.3.29", "linkup-sdk>=0.2.4", "litellm>=1.61.4", + "llama-cloud-services>=0.6.25", "markdownify>=0.14.1", "notion-client>=2.3.0", "pgvector>=0.3.6", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 5f90ed9..968e5c9 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -110,6 +110,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, ] +[[package]] +name = "aiosqlite" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/13/7d/8bca2bf9a247c2c5dfeec1d7a5f40db6518f88d314b8bca9da29670d2671/aiosqlite-0.21.0.tar.gz", hash = "sha256:131bb8056daa3bc875608c631c678cda73922a2d4ba8aec373b19f18c17e7aa3", size = 13454 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f5/10/6c25ed6de94c49f88a91fa5018cb4c0f3625f31d5be9f771ebe5cc7cd506/aiosqlite-0.21.0-py3-none-any.whl", hash = "sha256:2549cf4057f95f53dcba16f2b64e8e2791d7e1adedb13197dd8ed77bb226d7d0", size = 15792 }, +] + [[package]] name = "alembic" version = "1.15.2" @@ -228,6 +240,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148 }, ] +[[package]] +name = "banks" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "griffe" }, + { name = "jinja2" }, + { name = "platformdirs" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/34/2b6697f02ffb68bee50e5fd37d6c64432244d3245603fd62950169dfed7e/banks-2.1.2.tar.gz", hash = 
"sha256:a0651db9d14b57fa2e115e78f68dbb1b36fe226ad6eef96192542908b1d20c1f", size = 173332 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4a/7fdca29d1db62f5f5c3446bf8f668beacdb0b5a8aff4247574ddfddc6bcd/banks-2.1.2-py3-none-any.whl", hash = "sha256:7fba451069f6bea376483b8136a0f29cb1e6883133626d00e077e20a3d102c0e", size = 28064 }, +] + [[package]] name = "bcrypt" version = "4.2.1" @@ -572,6 +600,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] +[[package]] +name = "dirtyjson" +version = "1.0.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/db/04/d24f6e645ad82ba0ef092fa17d9ef7a21953781663648a01c9371d9e8e98/dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd", size = 30782 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/69/1bcf70f81de1b4a9f21b3a62ec0c83bdff991c88d6cc2267d02408457e88/dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53", size = 25197 }, +] + [[package]] name = "distro" version = "1.9.0" @@ -988,6 +1025,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/38/08cc303ddddc4b3d7c628c3039a61a3aae36c241ed01393d00c2fd663473/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6", size = 1142112 }, ] +[[package]] +name = "griffe" +version = "1.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/3e/5aa9a61f7c3c47b0b52a1d930302992229d191bf4bc76447b324b731510a/griffe-1.7.3.tar.gz", hash = "sha256:52ee893c6a3a968b639ace8015bec9d36594961e156e23315c8e8e51401fa50b", size = 395137 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/c6/5c20af38c2a57c15d87f7f38bee77d63c1d2a3689f74fefaf35915dd12b2/griffe-1.7.3-py3-none-any.whl", hash = "sha256:c6b3ee30c2f0f17f30bcdef5068d6ab7a2a4f1b8bf1a3e74b56fffd21e1c5f75", size = 129303 }, +] + [[package]] name = "grpcio" version = "1.71.0" @@ -1604,6 +1653,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/c2/1b6c502909b7af9054736af61e27558a3341e8c1ba28e7f82473e6dd936f/litellm-1.61.4-py3-none-any.whl", hash = "sha256:e87e0d397a191795b4217f9299fc9b21eaacaab91409695f0a4780cceccda6e1", size = 6814517 }, ] +[[package]] +name = "llama-cloud" +version = "0.1.23" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/e4/d1a30167ed6690a408382be1cf7de220a506085f4371baaf067d65bad8fd/llama_cloud-0.1.23.tar.gz", hash = "sha256:3d84a24a860f046d39a106c06742ec0ea39a574ac42bbf91706fe025f44e233e", size = 101292 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/15/3b56acef877dbc5d01d7e1a782c2cc50ef8a08d5773121c3bc20546de582/llama_cloud-0.1.23-py3-none-any.whl", hash = "sha256:ce95b0705d85c99b3b27b0af0d16a17d9a81b14c96bf13c1063a1bd13d8d0446", size = 267343 }, +] + +[[package]] +name = "llama-cloud-services" +version = "0.6.25" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "llama-cloud" }, + 
{ name = "llama-index-core" }, + { name = "platformdirs" }, + { name = "pydantic" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/c0/89f89dfc2c2b6c2d5c1c5fde9f445696eb12f9c2a4e17637ab0aaf7cc373/llama_cloud_services-0.6.25.tar.gz", hash = "sha256:3608004b0cf984640a3a36657b8b40394d7ce2c48e3eb9dd24fc654df7643595", size = 32303 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/f1/99b8ef4a636dafd5f1ae1e1b19eb9f793f51573d782919bf01d9b9f797f4/llama_cloud_services-0.6.25-py3-none-any.whl", hash = "sha256:aef0afbbf0d6dc485e6566af2daeeefa8caa7bc7f6511d860036bc0aac15361b", size = 37231 }, +] + +[[package]] +name = "llama-index-core" +version = "0.12.39" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "aiosqlite" }, + { name = "banks" }, + { name = "dataclasses-json" }, + { name = "deprecated" }, + { name = "dirtyjson" }, + { name = "filetype" }, + { name = "fsspec" }, + { name = "httpx" }, + { name = "nest-asyncio" }, + { name = "networkx" }, + { name = "nltk" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sqlalchemy", extra = ["asyncio"] }, + { name = "tenacity" }, + { name = "tiktoken" }, + { name = "tqdm" }, + { name = "typing-extensions" }, + { name = "typing-inspect" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f7/45/163806502804ff75ace474f868cc33158774c4eb31d565133f32932e930e/llama_index_core-0.12.39.tar.gz", hash = "sha256:0cca9de59953542a3c2f1db61327c5204e0b1e997f31f1200e49392b2879593a", size = 7292040 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/a3/583d80764df75aefc9885f28dcc06a0e5aefc993fa5318186e70f2340d73/llama_index_core-0.12.39-py3-none-any.whl", hash = "sha256:c255ed87aa85e43893f2bb05870b61ce7701d7a6a931d174ba925def5856b4c2", size = 7664906 }, +] + [[package]] name = "lxml" version = "5.3.1" @@ -2468,6 +2583,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/6c/41c21c6c8af92b9fea313aa47c75de49e2f9a467964ee33eb0135d47eb64/pillow-11.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756", size = 2377651 }, ] +[[package]] +name = "platformdirs" +version = "4.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567 }, +] + [[package]] name = "playwright" version = "1.50.0" @@ -3392,7 +3516,7 @@ wheels = [ [[package]] name = "surf-new-backend" -version = "0.0.6" +version = "0.0.7" source = { virtual = "." 
} dependencies = [ { name = "alembic" }, @@ -3407,6 +3531,7 @@ dependencies = [ { name = "langgraph" }, { name = "linkup-sdk" }, { name = "litellm" }, + { name = "llama-cloud-services" }, { name = "markdownify" }, { name = "notion-client" }, { name = "pgvector" }, @@ -3438,6 +3563,7 @@ requires-dist = [ { name = "langgraph", specifier = ">=0.3.29" }, { name = "linkup-sdk", specifier = ">=0.2.4" }, { name = "litellm", specifier = ">=1.61.4" }, + { name = "llama-cloud-services", specifier = ">=0.6.25" }, { name = "markdownify", specifier = ">=0.14.1" }, { name = "notion-client", specifier = ">=2.3.0" }, { name = "pgvector", specifier = ">=0.3.6" }, diff --git a/surfsense_web/.env.example b/surfsense_web/.env.example index 3ab9d17..03f266b 100644 --- a/surfsense_web/.env.example +++ b/surfsense_web/.env.example @@ -1,2 +1,3 @@ NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000 -NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE \ No newline at end of file +NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE +NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx index e1adbe2..b8848b0 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx @@ -42,38 +42,95 @@ export default function FileUploader() { const router = useRouter(); const fileInputRef = useRef(null); - const acceptedFileTypes = { - 'image/bmp': ['.bmp'], - 'text/csv': ['.csv'], - 'application/msword': ['.doc'], - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], - 'message/rfc822': ['.eml'], - 'application/epub+zip': ['.epub'], - 'image/heic': ['.heic'], - 'text/html': ['.html'], - 'image/jpeg': ['.jpeg', '.jpg'], - 'image/png': ['.png'], - 'text/markdown': ['.md', '.markdown'], - 'application/vnd.ms-outlook': ['.msg'], - 'application/vnd.oasis.opendocument.text': ['.odt'], - 'text/x-org': ['.org'], - 'application/pkcs7-signature': ['.p7s'], - 'application/pdf': ['.pdf'], - 'application/vnd.ms-powerpoint': ['.ppt'], - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], - 'text/x-rst': ['.rst'], - 'application/rtf': ['.rtf'], - 'image/tiff': ['.tiff'], - 'text/plain': ['.txt'], - 'text/tab-separated-values': ['.tsv'], - 'application/vnd.ms-excel': ['.xls'], - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], - 'application/xml': ['.xml'], + // Audio files are always supported (using whisper) + const audioFileTypes = { 'audio/mpeg': ['.mp3', '.mpeg', '.mpga'], 'audio/mp4': ['.mp4', '.m4a'], 'audio/wav': ['.wav'], 'audio/webm': ['.webm'], - } + }; + + // Conditionally set accepted file types based on ETL service + const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD' + ? 
{ + // LlamaCloud supported file types + 'application/pdf': ['.pdf'], + 'application/msword': ['.doc'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'application/vnd.ms-word.document.macroEnabled.12': ['.docm'], + 'application/msword-template': ['.dot'], + 'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'], + 'application/vnd.ms-powerpoint': ['.ppt'], + 'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'], + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], + 'application/vnd.ms-powerpoint.template': ['.pot'], + 'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], + 'application/vnd.ms-excel': ['.xls'], + 'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'], + 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'], + 'application/vnd.ms-excel.workspace': ['.xlw'], + 'application/rtf': ['.rtf'], + 'application/xml': ['.xml'], + 'application/epub+zip': ['.epub'], + 'application/vnd.apple.keynote': ['.key'], + 'application/vnd.apple.pages': ['.pages'], + 'application/vnd.apple.numbers': ['.numbers'], + 'application/vnd.wordperfect': ['.wpd'], + 'application/vnd.oasis.opendocument.text': ['.odt'], + 'application/vnd.oasis.opendocument.presentation': ['.odp'], + 'application/vnd.oasis.opendocument.graphics': ['.odg'], + 'application/vnd.oasis.opendocument.spreadsheet': ['.ods'], + 'application/vnd.oasis.opendocument.formula': ['.fods'], + 'text/plain': ['.txt'], + 'text/csv': ['.csv'], + 'text/tab-separated-values': ['.tsv'], + 'text/html': ['.html', '.htm', '.web'], + 'image/jpeg': ['.jpg', '.jpeg'], + 'image/png': ['.png'], + 'image/gif': ['.gif'], + 'image/bmp': ['.bmp'], + 'image/svg+xml': ['.svg'], + 'image/tiff': ['.tiff'], + 'image/webp': ['.webp'], + 'application/dbase': ['.dbf'], + 'application/vnd.lotus-1-2-3': ['.123'], + 'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'], + 'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'], + // Audio files (always supported) + ...audioFileTypes, + } + : { + // Unstructured supported file types + 'image/bmp': ['.bmp'], + 'text/csv': ['.csv'], + 'application/msword': ['.doc'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'message/rfc822': ['.eml'], + 'application/epub+zip': ['.epub'], + 'image/heic': ['.heic'], + 'text/html': ['.html'], + 'image/jpeg': ['.jpeg', '.jpg'], + 'image/png': ['.png'], + 'text/markdown': ['.md', '.markdown'], + 'application/vnd.ms-outlook': ['.msg'], + 'application/vnd.oasis.opendocument.text': ['.odt'], + 'text/x-org': ['.org'], + 'application/pkcs7-signature': ['.p7s'], + 'application/pdf': ['.pdf'], + 'application/vnd.ms-powerpoint': ['.ppt'], + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], + 'text/x-rst': ['.rst'], + 'application/rtf': ['.rtf'], + 'image/tiff': ['.tiff'], + 'text/plain': ['.txt'], + 'text/tab-separated-values': ['.tsv'], + 'application/vnd.ms-excel': ['.xls'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], + 'application/xml': ['.xml'], + // 
Audio files (always supported) + ...audioFileTypes, + }; const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort() diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index aac7cc7..03d6874 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -90,7 +90,9 @@ Before you begin, ensure you have: | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | | STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | | LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | -| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing | +| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) | +| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) | +| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) | | FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | | STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | @@ -136,6 +138,7 @@ For other LLM providers, refer to the [LiteLLM documentation](https://docs.litel | ------------------------------- | ---------------------------------------------------------- | | NEXT_PUBLIC_FASTAPI_BACKEND_URL | URL of the backend service (e.g., `http://localhost:8000`) | | NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication | +| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface | 2. 
**Build and Start Containers** diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 72492c1..82b9f0f 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ b/surfsense_web/content/docs/manual-installation.mdx @@ -61,7 +61,9 @@ Edit the `.env` file and set the following variables: | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | | STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | | LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | -| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing | +| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) | +| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) | +| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) | | FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | | STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | @@ -182,6 +184,7 @@ Edit the `.env` file and set: | ------------------------------- | ------------------------------------------- | | NEXT_PUBLIC_FASTAPI_BACKEND_URL | Backend URL (e.g., `http://localhost:8000`) | | NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication | +| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface | ### 2. Install Dependencies