From 73751c0eb13243f5d1ac1a9efcf7177acba10b6a Mon Sep 17 00:00:00 2001
From: "DESKTOP-RTLN3BA\\$punk"
Date: Fri, 30 May 2025 19:17:19 -0700
Subject: [PATCH] feat: Removed Hard Dependency on Unstructured.io - Added LlamaParse Support :)

---
 README.md                                 |  36 +++--
 surfsense_backend/.env.example            |   6 +-
 surfsense_backend/app/config/__init__.py  |  13 +-
 .../app/routes/documents_routes.py        |  97 +++++++++----
 .../app/tasks/background_tasks.py         |  79 ++++++++++-
 surfsense_backend/pyproject.toml          |   1 +
 surfsense_backend/uv.lock                 | 128 +++++++++++++++++-
 surfsense_web/.env.example                |   3 +-
 .../documents/upload/page.tsx             | 113 ++++++++++++----
 .../content/docs/docker-installation.mdx  |   5 +-
 .../content/docs/manual-installation.mdx  |   5 +-
 11 files changed, 402 insertions(+), 84 deletions(-)

diff --git a/README.md b/README.md
index 8e0eae5..00cf5ad 100644
--- a/README.md
+++ b/README.md
@@ -31,7 +31,7 @@ https://github.com/user-attachments/assets/bf64a6ca-934b-47ac-9e1b-edac5fe972ec
 ### 💡 **Idea**: Have your own highly customizable private NotebookLM and Perplexity integrated with external sources.
 ### 📁 **Multiple File Format Uploading Support**
-Save content from your own personal files *(Documents, images, videos and supports **34 file extensions**)* to your own personal knowledge base .
+Save content from your own personal files *(documents, images, videos; **50+ file extensions** supported)* to your own personal knowledge base.
 ### 🔍 **Powerful Search**
 Quickly research or find anything in your saved content .
 ### 💬 **Chat with your Saved Content**
@@ -66,35 +66,33 @@ Open source and easy to deploy locally.
 ### 📄 **Supported File Extensions**
-#### Document
+> **Note**: File format support depends on your ETL service configuration. LlamaCloud supports 100+ formats, while Unstructured supports 34+ core formats.
-`.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml` +#### Documents & Text +**LlamaCloud**: `.pdf`, `.doc`, `.docx`, `.docm`, `.dot`, `.dotm`, `.rtf`, `.txt`, `.xml`, `.epub`, `.odt`, `.wpd`, `.pages`, `.key`, `.numbers`, `.602`, `.abw`, `.cgm`, `.cwk`, `.hwp`, `.lwp`, `.mw`, `.mcw`, `.pbd`, `.sda`, `.sdd`, `.sdp`, `.sdw`, `.sgl`, `.sti`, `.sxi`, `.sxw`, `.stw`, `.sxg`, `.uof`, `.uop`, `.uot`, `.vor`, `.wps`, `.zabw` -#### Text & Markup +**Unstructured**: `.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml`, `.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org`, `.epub` -`.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org` +#### Presentations +**LlamaCloud**: `.ppt`, `.pptx`, `.pptm`, `.pot`, `.potm`, `.potx`, `.odp`, `.key` -#### Spreadsheets & Tables +**Unstructured**: `.ppt`, `.pptx` -`.xls`, `.xlsx`, `.csv`, `.tsv` +#### Spreadsheets & Data +**LlamaCloud**: `.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.xlw`, `.csv`, `.tsv`, `.ods`, `.fods`, `.numbers`, `.dbf`, `.123`, `.dif`, `.sylk`, `.slk`, `.prn`, `.et`, `.uos1`, `.uos2`, `.wk1`, `.wk2`, `.wk3`, `.wk4`, `.wks`, `.wq1`, `.wq2`, `.wb1`, `.wb2`, `.wb3`, `.qpw`, `.xlr`, `.eth` -#### Audio & Video - -`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm` +**Unstructured**: `.xls`, `.xlsx`, `.csv`, `.tsv` #### Images +**LlamaCloud**: `.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.svg`, `.tiff`, `.webp`, `.html`, `.htm`, `.web` -`.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic` - -#### Email & eBooks - -`.eml`, `.msg`, `.epub` - -#### PowerPoint Presentations & Other - -`.ppt`, `.pptx`, `.p7s` +**Unstructured**: `.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic` +#### Audio & Video *(Always Supported)* +`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm` +#### Email & Communication +**Unstructured**: `.eml`, `.msg`, `.p7s` ### 🔖 Cross Browser Extension - The SurfSense extension can be used to save any webpage you like. 
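The backend changes that follow wire up the new `ETL_SERVICE` switch: the config module reads the variable and loads the matching API key, and the document upload route branches between the Unstructured loader and LlamaParse. Below is a minimal sketch of that flow, condensed from the route handler changes in this patch; temp-file cleanup, database sessions, and error handling are omitted, and the helper name is illustrative rather than part of the codebase.

```python
import os


async def parse_file_to_markdown(file_path: str) -> list[str]:
    """Illustrative sketch of the ETL branching this patch introduces (simplified)."""
    etl_service = os.getenv("ETL_SERVICE", "UNSTRUCTURED")

    if etl_service == "UNSTRUCTURED":
        from langchain_unstructured import UnstructuredLoader

        # The patch uses the Unstructured loader in "elements" mode.
        loader = UnstructuredLoader(file_path, mode="elements", strategy="auto")
        elements = await loader.aload()
        # In the patch, these elements go to add_received_file_document_using_unstructured();
        # here we just return their text for illustration.
        return [element.page_content for element in elements]

    elif etl_service == "LLAMACLOUD":
        from llama_cloud_services import LlamaParse
        from llama_cloud_services.parse.utils import ResultType

        # The patch asks LlamaParse for markdown output.
        parser = LlamaParse(
            api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
            result_type=ResultType.MD,
        )
        result = await parser.aparse(file_path)
        docs = await result.aget_markdown_documents(split_by_page=False)
        # In the patch, each markdown document goes to add_received_file_document_using_llamacloud().
        return [doc.text for doc in docs]

    raise ValueError(f"Unknown ETL_SERVICE: {etl_service}")
```

Either branch hands its output to a background task that summarizes, chunks, embeds, and stores the document.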
diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index f9c43d1..c0032a9 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -30,9 +30,13 @@ STT_SERVICE="openai/whisper-1" OPENAI_API_KEY="sk-proj-iA" GEMINI_API_KEY="AIzaSyB6-1641124124124124124124124124124" -UNSTRUCTURED_API_KEY="Tpu3P0U8iy" FIRECRAWL_API_KEY="fcr-01J0000000000000000000000" +#File Parser Service +ETL_SERVICE="UNSTRUCTURED" or "LLAMACLOUD" +UNSTRUCTURED_API_KEY="Tpu3P0U8iy" +LLAMA_CLOUD_API_KEY="llx-nnn" + #OPTIONAL: Add these for LangSmith Observability LANGSMITH_TRACING=true LANGSMITH_ENDPOINT="https://api.smith.langchain.com" diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 81cd9a2..9135c32 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -96,9 +96,18 @@ class Config: # OAuth JWT SECRET_KEY = os.getenv("SECRET_KEY") - # Unstructured API Key - UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") + # ETL Service + ETL_SERVICE = os.getenv("ETL_SERVICE") + if ETL_SERVICE == "UNSTRUCTURED": + # Unstructured API Key + UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY") + + elif ETL_SERVICE == "LLAMACLOUD": + # LlamaCloud API Key + LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY") + + # Firecrawl API Key FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 5ea2327..acd246e 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead from app.users import current_active_user from app.utils.check_ownership import check_ownership -from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document +from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud from app.config import config as app_config # Force asyncio to use standard event loop before unstructured imports import asyncio @@ -101,8 +101,7 @@ async def create_documents( content = await file.read() with open(temp_path, "wb") as f: f.write(content) - - # Process in background to avoid uvloop conflicts + fastapi_background_tasks.add_task( process_file_in_background_with_new_session, temp_path, @@ -191,36 +190,74 @@ async def process_file_in_background( search_space_id ) else: - # Use synchronous unstructured API to avoid event loop issues - from langchain_unstructured import UnstructuredLoader + if app_config.ETL_SERVICE == "UNSTRUCTURED": + from langchain_unstructured import UnstructuredLoader + + # Process the file + loader = UnstructuredLoader( + file_path, + mode="elements", + post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) - # Process the file - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) + docs = await loader.aload() - docs = 
await loader.aload() + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass - # Clean up the temp file - import os - try: - os.unlink(file_path) - except: - pass + # Pass the documents to the existing background task + await add_received_file_document_using_unstructured( + session, + filename, + docs, + search_space_id + ) + elif app_config.ETL_SERVICE == "LLAMACLOUD": + from llama_cloud_services import LlamaParse + from llama_cloud_services.parse.utils import ResultType - # Pass the documents to the existing background task - await add_received_file_document( - session, - filename, - docs, - search_space_id - ) + + # Create LlamaParse parser instance + parser = LlamaParse( + api_key=app_config.LLAMA_CLOUD_API_KEY, + num_workers=1, # Use single worker for file processing + verbose=True, + language="en", + result_type=ResultType.MD + ) + + # Parse the file asynchronously + result = await parser.aparse(file_path) + + # Clean up the temp file + import os + try: + os.unlink(file_path) + except: + pass + + # Get markdown documents from the result + markdown_documents = await result.aget_markdown_documents(split_by_page=False) + + for doc in markdown_documents: + # Extract text content from the markdown documents + markdown_content = doc.text + + # Process the documents using our LlamaCloud background task + await add_received_file_document_using_llamacloud( + session, + filename, + llamacloud_markdown_document=markdown_content, + search_space_id=search_space_id + ) except Exception as e: import logging logging.error(f"Error processing file in background: {str(e)}") @@ -442,3 +479,5 @@ async def process_youtube_video_with_new_session( except Exception as e: import logging logging.error(f"Error processing YouTube video: {str(e)}") + + diff --git a/surfsense_backend/app/tasks/background_tasks.py b/surfsense_backend/app/tasks/background_tasks.py index 1c6cd6d..f6b1eb2 100644 --- a/surfsense_backend/app/tasks/background_tasks.py +++ b/surfsense_backend/app/tasks/background_tasks.py @@ -289,7 +289,7 @@ async def add_received_markdown_file_document( raise RuntimeError(f"Failed to process file document: {str(e)}") -async def add_received_file_document( +async def add_received_file_document_using_unstructured( session: AsyncSession, file_name: str, unstructured_processed_elements: List[LangChainDocument], @@ -357,6 +357,83 @@ async def add_received_file_document( raise RuntimeError(f"Failed to process file document: {str(e)}") +async def add_received_file_document_using_llamacloud( + session: AsyncSession, + file_name: str, + llamacloud_markdown_document: str, + search_space_id: int, +) -> Optional[Document]: + """ + Process and store document content parsed by LlamaCloud. + + Args: + session: Database session + file_name: Name of the processed file + llamacloud_markdown_documents: List of markdown content from LlamaCloud parsing + search_space_id: ID of the search space + + Returns: + Document object if successful, None if failed + """ + try: + # Combine all markdown documents into one + file_in_markdown = llamacloud_markdown_document + + content_hash = generate_content_hash(file_in_markdown) + + # Check if document with this content hash already exists + existing_doc_result = await session.execute( + select(Document).where(Document.content_hash == content_hash) + ) + existing_document = existing_doc_result.scalars().first() + + if existing_document: + logging.info(f"Document with content hash {content_hash} already exists. 
Skipping processing.") + return existing_document + + # Generate summary + summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance + summary_result = await summary_chain.ainvoke({"document": file_in_markdown}) + summary_content = summary_result.content + summary_embedding = config.embedding_model_instance.embed(summary_content) + + # Process chunks + chunks = [ + Chunk( + content=chunk.text, + embedding=config.embedding_model_instance.embed(chunk.text), + ) + for chunk in config.chunker_instance.chunk(file_in_markdown) + ] + + # Create and store document + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=DocumentType.FILE, + document_metadata={ + "FILE_NAME": file_name, + "ETL_SERVICE": "LLAMACLOUD", + }, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + ) + + session.add(document) + await session.commit() + await session.refresh(document) + + return document + except SQLAlchemyError as db_error: + await session.rollback() + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}") + + async def add_youtube_video_document( session: AsyncSession, url: str, search_space_id: int ): diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 1e5345b..dfa7559 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "langgraph>=0.3.29", "linkup-sdk>=0.2.4", "litellm>=1.61.4", + "llama-cloud-services>=0.6.25", "markdownify>=0.14.1", "notion-client>=2.3.0", "pgvector>=0.3.6", diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 5f90ed9..968e5c9 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -110,6 +110,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 }, ] +[[package]] +name = "aiosqlite" +version = "0.21.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/13/7d/8bca2bf9a247c2c5dfeec1d7a5f40db6518f88d314b8bca9da29670d2671/aiosqlite-0.21.0.tar.gz", hash = "sha256:131bb8056daa3bc875608c631c678cda73922a2d4ba8aec373b19f18c17e7aa3", size = 13454 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f5/10/6c25ed6de94c49f88a91fa5018cb4c0f3625f31d5be9f771ebe5cc7cd506/aiosqlite-0.21.0-py3-none-any.whl", hash = "sha256:2549cf4057f95f53dcba16f2b64e8e2791d7e1adedb13197dd8ed77bb226d7d0", size = 15792 }, +] + [[package]] name = "alembic" version = "1.15.2" @@ -228,6 +240,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148 }, ] +[[package]] +name = "banks" +version = "2.1.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "deprecated" }, + { name = "griffe" }, + { name = "jinja2" }, + { name = "platformdirs" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/77/34/2b6697f02ffb68bee50e5fd37d6c64432244d3245603fd62950169dfed7e/banks-2.1.2.tar.gz", hash = 
"sha256:a0651db9d14b57fa2e115e78f68dbb1b36fe226ad6eef96192542908b1d20c1f", size = 173332 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4a/7fdca29d1db62f5f5c3446bf8f668beacdb0b5a8aff4247574ddfddc6bcd/banks-2.1.2-py3-none-any.whl", hash = "sha256:7fba451069f6bea376483b8136a0f29cb1e6883133626d00e077e20a3d102c0e", size = 28064 }, +] + [[package]] name = "bcrypt" version = "4.2.1" @@ -572,6 +600,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 }, ] +[[package]] +name = "dirtyjson" +version = "1.0.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/db/04/d24f6e645ad82ba0ef092fa17d9ef7a21953781663648a01c9371d9e8e98/dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd", size = 30782 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/69/1bcf70f81de1b4a9f21b3a62ec0c83bdff991c88d6cc2267d02408457e88/dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53", size = 25197 }, +] + [[package]] name = "distro" version = "1.9.0" @@ -988,6 +1025,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ac/38/08cc303ddddc4b3d7c628c3039a61a3aae36c241ed01393d00c2fd663473/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6", size = 1142112 }, ] +[[package]] +name = "griffe" +version = "1.7.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/3e/5aa9a61f7c3c47b0b52a1d930302992229d191bf4bc76447b324b731510a/griffe-1.7.3.tar.gz", hash = "sha256:52ee893c6a3a968b639ace8015bec9d36594961e156e23315c8e8e51401fa50b", size = 395137 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/c6/5c20af38c2a57c15d87f7f38bee77d63c1d2a3689f74fefaf35915dd12b2/griffe-1.7.3-py3-none-any.whl", hash = "sha256:c6b3ee30c2f0f17f30bcdef5068d6ab7a2a4f1b8bf1a3e74b56fffd21e1c5f75", size = 129303 }, +] + [[package]] name = "grpcio" version = "1.71.0" @@ -1604,6 +1653,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/c2/1b6c502909b7af9054736af61e27558a3341e8c1ba28e7f82473e6dd936f/litellm-1.61.4-py3-none-any.whl", hash = "sha256:e87e0d397a191795b4217f9299fc9b21eaacaab91409695f0a4780cceccda6e1", size = 6814517 }, ] +[[package]] +name = "llama-cloud" +version = "0.1.23" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "httpx" }, + { name = "pydantic" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/e4/d1a30167ed6690a408382be1cf7de220a506085f4371baaf067d65bad8fd/llama_cloud-0.1.23.tar.gz", hash = "sha256:3d84a24a860f046d39a106c06742ec0ea39a574ac42bbf91706fe025f44e233e", size = 101292 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/15/3b56acef877dbc5d01d7e1a782c2cc50ef8a08d5773121c3bc20546de582/llama_cloud-0.1.23-py3-none-any.whl", hash = "sha256:ce95b0705d85c99b3b27b0af0d16a17d9a81b14c96bf13c1063a1bd13d8d0446", size = 267343 }, +] + +[[package]] +name = "llama-cloud-services" +version = "0.6.25" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "llama-cloud" }, + 
{ name = "llama-index-core" }, + { name = "platformdirs" }, + { name = "pydantic" }, + { name = "python-dotenv" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/79/c0/89f89dfc2c2b6c2d5c1c5fde9f445696eb12f9c2a4e17637ab0aaf7cc373/llama_cloud_services-0.6.25.tar.gz", hash = "sha256:3608004b0cf984640a3a36657b8b40394d7ce2c48e3eb9dd24fc654df7643595", size = 32303 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/f1/99b8ef4a636dafd5f1ae1e1b19eb9f793f51573d782919bf01d9b9f797f4/llama_cloud_services-0.6.25-py3-none-any.whl", hash = "sha256:aef0afbbf0d6dc485e6566af2daeeefa8caa7bc7f6511d860036bc0aac15361b", size = 37231 }, +] + +[[package]] +name = "llama-index-core" +version = "0.12.39" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "aiosqlite" }, + { name = "banks" }, + { name = "dataclasses-json" }, + { name = "deprecated" }, + { name = "dirtyjson" }, + { name = "filetype" }, + { name = "fsspec" }, + { name = "httpx" }, + { name = "nest-asyncio" }, + { name = "networkx" }, + { name = "nltk" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "pydantic" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "sqlalchemy", extra = ["asyncio"] }, + { name = "tenacity" }, + { name = "tiktoken" }, + { name = "tqdm" }, + { name = "typing-extensions" }, + { name = "typing-inspect" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f7/45/163806502804ff75ace474f868cc33158774c4eb31d565133f32932e930e/llama_index_core-0.12.39.tar.gz", hash = "sha256:0cca9de59953542a3c2f1db61327c5204e0b1e997f31f1200e49392b2879593a", size = 7292040 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/a3/583d80764df75aefc9885f28dcc06a0e5aefc993fa5318186e70f2340d73/llama_index_core-0.12.39-py3-none-any.whl", hash = "sha256:c255ed87aa85e43893f2bb05870b61ce7701d7a6a931d174ba925def5856b4c2", size = 7664906 }, +] + [[package]] name = "lxml" version = "5.3.1" @@ -2468,6 +2583,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/6c/41c21c6c8af92b9fea313aa47c75de49e2f9a467964ee33eb0135d47eb64/pillow-11.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756", size = 2377651 }, ] +[[package]] +name = "platformdirs" +version = "4.3.8" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567 }, +] + [[package]] name = "playwright" version = "1.50.0" @@ -3392,7 +3516,7 @@ wheels = [ [[package]] name = "surf-new-backend" -version = "0.0.6" +version = "0.0.7" source = { virtual = "." 
} dependencies = [ { name = "alembic" }, @@ -3407,6 +3531,7 @@ dependencies = [ { name = "langgraph" }, { name = "linkup-sdk" }, { name = "litellm" }, + { name = "llama-cloud-services" }, { name = "markdownify" }, { name = "notion-client" }, { name = "pgvector" }, @@ -3438,6 +3563,7 @@ requires-dist = [ { name = "langgraph", specifier = ">=0.3.29" }, { name = "linkup-sdk", specifier = ">=0.2.4" }, { name = "litellm", specifier = ">=1.61.4" }, + { name = "llama-cloud-services", specifier = ">=0.6.25" }, { name = "markdownify", specifier = ">=0.14.1" }, { name = "notion-client", specifier = ">=2.3.0" }, { name = "pgvector", specifier = ">=0.3.6" }, diff --git a/surfsense_web/.env.example b/surfsense_web/.env.example index 3ab9d17..03f266b 100644 --- a/surfsense_web/.env.example +++ b/surfsense_web/.env.example @@ -1,2 +1,3 @@ NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000 -NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE \ No newline at end of file +NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE +NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD \ No newline at end of file diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx index e1adbe2..b8848b0 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx @@ -42,38 +42,95 @@ export default function FileUploader() { const router = useRouter(); const fileInputRef = useRef(null); - const acceptedFileTypes = { - 'image/bmp': ['.bmp'], - 'text/csv': ['.csv'], - 'application/msword': ['.doc'], - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], - 'message/rfc822': ['.eml'], - 'application/epub+zip': ['.epub'], - 'image/heic': ['.heic'], - 'text/html': ['.html'], - 'image/jpeg': ['.jpeg', '.jpg'], - 'image/png': ['.png'], - 'text/markdown': ['.md', '.markdown'], - 'application/vnd.ms-outlook': ['.msg'], - 'application/vnd.oasis.opendocument.text': ['.odt'], - 'text/x-org': ['.org'], - 'application/pkcs7-signature': ['.p7s'], - 'application/pdf': ['.pdf'], - 'application/vnd.ms-powerpoint': ['.ppt'], - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], - 'text/x-rst': ['.rst'], - 'application/rtf': ['.rtf'], - 'image/tiff': ['.tiff'], - 'text/plain': ['.txt'], - 'text/tab-separated-values': ['.tsv'], - 'application/vnd.ms-excel': ['.xls'], - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], - 'application/xml': ['.xml'], + // Audio files are always supported (using whisper) + const audioFileTypes = { 'audio/mpeg': ['.mp3', '.mpeg', '.mpga'], 'audio/mp4': ['.mp4', '.m4a'], 'audio/wav': ['.wav'], 'audio/webm': ['.webm'], - } + }; + + // Conditionally set accepted file types based on ETL service + const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD' + ? 
{ + // LlamaCloud supported file types + 'application/pdf': ['.pdf'], + 'application/msword': ['.doc'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'application/vnd.ms-word.document.macroEnabled.12': ['.docm'], + 'application/msword-template': ['.dot'], + 'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'], + 'application/vnd.ms-powerpoint': ['.ppt'], + 'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'], + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], + 'application/vnd.ms-powerpoint.template': ['.pot'], + 'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], + 'application/vnd.ms-excel': ['.xls'], + 'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'], + 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'], + 'application/vnd.ms-excel.workspace': ['.xlw'], + 'application/rtf': ['.rtf'], + 'application/xml': ['.xml'], + 'application/epub+zip': ['.epub'], + 'application/vnd.apple.keynote': ['.key'], + 'application/vnd.apple.pages': ['.pages'], + 'application/vnd.apple.numbers': ['.numbers'], + 'application/vnd.wordperfect': ['.wpd'], + 'application/vnd.oasis.opendocument.text': ['.odt'], + 'application/vnd.oasis.opendocument.presentation': ['.odp'], + 'application/vnd.oasis.opendocument.graphics': ['.odg'], + 'application/vnd.oasis.opendocument.spreadsheet': ['.ods'], + 'application/vnd.oasis.opendocument.formula': ['.fods'], + 'text/plain': ['.txt'], + 'text/csv': ['.csv'], + 'text/tab-separated-values': ['.tsv'], + 'text/html': ['.html', '.htm', '.web'], + 'image/jpeg': ['.jpg', '.jpeg'], + 'image/png': ['.png'], + 'image/gif': ['.gif'], + 'image/bmp': ['.bmp'], + 'image/svg+xml': ['.svg'], + 'image/tiff': ['.tiff'], + 'image/webp': ['.webp'], + 'application/dbase': ['.dbf'], + 'application/vnd.lotus-1-2-3': ['.123'], + 'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'], + 'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'], + // Audio files (always supported) + ...audioFileTypes, + } + : { + // Unstructured supported file types + 'image/bmp': ['.bmp'], + 'text/csv': ['.csv'], + 'application/msword': ['.doc'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'message/rfc822': ['.eml'], + 'application/epub+zip': ['.epub'], + 'image/heic': ['.heic'], + 'text/html': ['.html'], + 'image/jpeg': ['.jpeg', '.jpg'], + 'image/png': ['.png'], + 'text/markdown': ['.md', '.markdown'], + 'application/vnd.ms-outlook': ['.msg'], + 'application/vnd.oasis.opendocument.text': ['.odt'], + 'text/x-org': ['.org'], + 'application/pkcs7-signature': ['.p7s'], + 'application/pdf': ['.pdf'], + 'application/vnd.ms-powerpoint': ['.ppt'], + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], + 'text/x-rst': ['.rst'], + 'application/rtf': ['.rtf'], + 'image/tiff': ['.tiff'], + 'text/plain': ['.txt'], + 'text/tab-separated-values': ['.tsv'], + 'application/vnd.ms-excel': ['.xls'], + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], + 'application/xml': ['.xml'], + // 
Audio files (always supported) + ...audioFileTypes, + }; const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort() diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index aac7cc7..03d6874 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -90,7 +90,9 @@ Before you begin, ensure you have: | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | | STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | | LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | -| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing | +| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) | +| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) | +| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) | | FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | | STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | @@ -136,6 +138,7 @@ For other LLM providers, refer to the [LiteLLM documentation](https://docs.litel | ------------------------------- | ---------------------------------------------------------- | | NEXT_PUBLIC_FASTAPI_BACKEND_URL | URL of the backend service (e.g., `http://localhost:8000`) | | NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication | +| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface | 2. 
**Build and Start Containers** diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 72492c1..82b9f0f 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ b/surfsense_web/content/docs/manual-installation.mdx @@ -61,7 +61,9 @@ Edit the `.env` file and set the following variables: | FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) | | STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) | | LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) | -| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing | +| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) | +| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) | +| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) | | FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | | STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | @@ -182,6 +184,7 @@ Edit the `.env` file and set: | ------------------------------- | ------------------------------------------- | | NEXT_PUBLIC_FASTAPI_BACKEND_URL | Backend URL (e.g., `http://localhost:8000`) | | NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication | +| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface | ### 2. Install Dependencies