mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-01 10:09:08 +00:00
feat: Removed Hard Dependency on Unstructured.io
- Added Llamaparse Support :)
This commit is contained in:
parent
5737ea80c0
commit
73751c0eb1
11 changed files with 402 additions and 84 deletions
36
README.md
36
README.md
|
@ -31,7 +31,7 @@ https://github.com/user-attachments/assets/bf64a6ca-934b-47ac-9e1b-edac5fe972ec
|
|||
### 💡 **Idea**:
|
||||
Have your own highly customizable private NotebookLM and Perplexity integrated with external sources.
|
||||
### 📁 **Multiple File Format Uploading Support**
|
||||
Save content from your own personal files *(Documents, images, videos and supports **34 file extensions**)* to your own personal knowledge base .
|
||||
Save content from your own personal files *(Documents, images, videos and supports **50+ file extensions**)* to your own personal knowledge base .
|
||||
### 🔍 **Powerful Search**
|
||||
Quickly research or find anything in your saved content .
|
||||
### 💬 **Chat with your Saved Content**
|
||||
|
@ -66,35 +66,33 @@ Open source and easy to deploy locally.
|
|||
|
||||
### 📄 **Supported File Extensions**
|
||||
|
||||
#### Document
|
||||
> **Note**: File format support depends on your ETL service configuration. LlamaCloud supports 100+ formats, while Unstructured supports 34+ core formats.
|
||||
|
||||
`.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml`
|
||||
#### Documents & Text
|
||||
**LlamaCloud**: `.pdf`, `.doc`, `.docx`, `.docm`, `.dot`, `.dotm`, `.rtf`, `.txt`, `.xml`, `.epub`, `.odt`, `.wpd`, `.pages`, `.key`, `.numbers`, `.602`, `.abw`, `.cgm`, `.cwk`, `.hwp`, `.lwp`, `.mw`, `.mcw`, `.pbd`, `.sda`, `.sdd`, `.sdp`, `.sdw`, `.sgl`, `.sti`, `.sxi`, `.sxw`, `.stw`, `.sxg`, `.uof`, `.uop`, `.uot`, `.vor`, `.wps`, `.zabw`
|
||||
|
||||
#### Text & Markup
|
||||
**Unstructured**: `.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml`, `.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org`, `.epub`
|
||||
|
||||
`.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org`
|
||||
#### Presentations
|
||||
**LlamaCloud**: `.ppt`, `.pptx`, `.pptm`, `.pot`, `.potm`, `.potx`, `.odp`, `.key`
|
||||
|
||||
#### Spreadsheets & Tables
|
||||
**Unstructured**: `.ppt`, `.pptx`
|
||||
|
||||
`.xls`, `.xlsx`, `.csv`, `.tsv`
|
||||
#### Spreadsheets & Data
|
||||
**LlamaCloud**: `.xlsx`, `.xls`, `.xlsm`, `.xlsb`, `.xlw`, `.csv`, `.tsv`, `.ods`, `.fods`, `.numbers`, `.dbf`, `.123`, `.dif`, `.sylk`, `.slk`, `.prn`, `.et`, `.uos1`, `.uos2`, `.wk1`, `.wk2`, `.wk3`, `.wk4`, `.wks`, `.wq1`, `.wq2`, `.wb1`, `.wb2`, `.wb3`, `.qpw`, `.xlr`, `.eth`
|
||||
|
||||
#### Audio & Video
|
||||
|
||||
`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm`
|
||||
**Unstructured**: `.xls`, `.xlsx`, `.csv`, `.tsv`
|
||||
|
||||
#### Images
|
||||
**LlamaCloud**: `.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.svg`, `.tiff`, `.webp`, `.html`, `.htm`, `.web`
|
||||
|
||||
`.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic`
|
||||
|
||||
#### Email & eBooks
|
||||
|
||||
`.eml`, `.msg`, `.epub`
|
||||
|
||||
#### PowerPoint Presentations & Other
|
||||
|
||||
`.ppt`, `.pptx`, `.p7s`
|
||||
**Unstructured**: `.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic`
|
||||
|
||||
#### Audio & Video *(Always Supported)*
|
||||
`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm`
|
||||
|
||||
#### Email & Communication
|
||||
**Unstructured**: `.eml`, `.msg`, `.p7s`
|
||||
|
||||
### 🔖 Cross Browser Extension
|
||||
- The SurfSense extension can be used to save any webpage you like.
|
||||
|
|
|
@ -30,9 +30,13 @@ STT_SERVICE="openai/whisper-1"
|
|||
OPENAI_API_KEY="sk-proj-iA"
|
||||
GEMINI_API_KEY="AIzaSyB6-1641124124124124124124124124124"
|
||||
|
||||
UNSTRUCTURED_API_KEY="Tpu3P0U8iy"
|
||||
FIRECRAWL_API_KEY="fcr-01J0000000000000000000000"
|
||||
|
||||
#File Parser Service
|
||||
ETL_SERVICE="UNSTRUCTURED" or "LLAMACLOUD"
|
||||
UNSTRUCTURED_API_KEY="Tpu3P0U8iy"
|
||||
LLAMA_CLOUD_API_KEY="llx-nnn"
|
||||
|
||||
#OPTIONAL: Add these for LangSmith Observability
|
||||
LANGSMITH_TRACING=true
|
||||
LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
|
||||
|
|
|
@ -96,9 +96,18 @@ class Config:
|
|||
# OAuth JWT
|
||||
SECRET_KEY = os.getenv("SECRET_KEY")
|
||||
|
||||
# Unstructured API Key
|
||||
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
|
||||
# ETL Service
|
||||
ETL_SERVICE = os.getenv("ETL_SERVICE")
|
||||
|
||||
if ETL_SERVICE == "UNSTRUCTURED":
|
||||
# Unstructured API Key
|
||||
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
|
||||
|
||||
elif ETL_SERVICE == "LLAMACLOUD":
|
||||
# LlamaCloud API Key
|
||||
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
|
||||
|
||||
|
||||
# Firecrawl API Key
|
||||
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY", None)
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ from app.db import get_async_session, User, SearchSpace, Document, DocumentType
|
|||
from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
|
||||
from app.users import current_active_user
|
||||
from app.utils.check_ownership import check_ownership
|
||||
from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document
|
||||
from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud
|
||||
from app.config import config as app_config
|
||||
# Force asyncio to use standard event loop before unstructured imports
|
||||
import asyncio
|
||||
|
@ -101,8 +101,7 @@ async def create_documents(
|
|||
content = await file.read()
|
||||
with open(temp_path, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
# Process in background to avoid uvloop conflicts
|
||||
|
||||
fastapi_background_tasks.add_task(
|
||||
process_file_in_background_with_new_session,
|
||||
temp_path,
|
||||
|
@ -191,36 +190,74 @@ async def process_file_in_background(
|
|||
search_space_id
|
||||
)
|
||||
else:
|
||||
# Use synchronous unstructured API to avoid event loop issues
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
if app_config.ETL_SERVICE == "UNSTRUCTURED":
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
|
||||
# Process the file
|
||||
loader = UnstructuredLoader(
|
||||
file_path,
|
||||
mode="elements",
|
||||
post_processors=[],
|
||||
languages=["eng"],
|
||||
include_orig_elements=False,
|
||||
include_metadata=False,
|
||||
strategy="auto",
|
||||
)
|
||||
|
||||
# Process the file
|
||||
loader = UnstructuredLoader(
|
||||
file_path,
|
||||
mode="elements",
|
||||
post_processors=[],
|
||||
languages=["eng"],
|
||||
include_orig_elements=False,
|
||||
include_metadata=False,
|
||||
strategy="auto",
|
||||
)
|
||||
docs = await loader.aload()
|
||||
|
||||
docs = await loader.aload()
|
||||
# Clean up the temp file
|
||||
import os
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Clean up the temp file
|
||||
import os
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except:
|
||||
pass
|
||||
# Pass the documents to the existing background task
|
||||
await add_received_file_document_using_unstructured(
|
||||
session,
|
||||
filename,
|
||||
docs,
|
||||
search_space_id
|
||||
)
|
||||
elif app_config.ETL_SERVICE == "LLAMACLOUD":
|
||||
from llama_cloud_services import LlamaParse
|
||||
from llama_cloud_services.parse.utils import ResultType
|
||||
|
||||
# Pass the documents to the existing background task
|
||||
await add_received_file_document(
|
||||
session,
|
||||
filename,
|
||||
docs,
|
||||
search_space_id
|
||||
)
|
||||
|
||||
# Create LlamaParse parser instance
|
||||
parser = LlamaParse(
|
||||
api_key=app_config.LLAMA_CLOUD_API_KEY,
|
||||
num_workers=1, # Use single worker for file processing
|
||||
verbose=True,
|
||||
language="en",
|
||||
result_type=ResultType.MD
|
||||
)
|
||||
|
||||
# Parse the file asynchronously
|
||||
result = await parser.aparse(file_path)
|
||||
|
||||
# Clean up the temp file
|
||||
import os
|
||||
try:
|
||||
os.unlink(file_path)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Get markdown documents from the result
|
||||
markdown_documents = await result.aget_markdown_documents(split_by_page=False)
|
||||
|
||||
for doc in markdown_documents:
|
||||
# Extract text content from the markdown documents
|
||||
markdown_content = doc.text
|
||||
|
||||
# Process the documents using our LlamaCloud background task
|
||||
await add_received_file_document_using_llamacloud(
|
||||
session,
|
||||
filename,
|
||||
llamacloud_markdown_document=markdown_content,
|
||||
search_space_id=search_space_id
|
||||
)
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.error(f"Error processing file in background: {str(e)}")
|
||||
|
@ -442,3 +479,5 @@ async def process_youtube_video_with_new_session(
|
|||
except Exception as e:
|
||||
import logging
|
||||
logging.error(f"Error processing YouTube video: {str(e)}")
|
||||
|
||||
|
||||
|
|
|
@ -289,7 +289,7 @@ async def add_received_markdown_file_document(
|
|||
raise RuntimeError(f"Failed to process file document: {str(e)}")
|
||||
|
||||
|
||||
async def add_received_file_document(
|
||||
async def add_received_file_document_using_unstructured(
|
||||
session: AsyncSession,
|
||||
file_name: str,
|
||||
unstructured_processed_elements: List[LangChainDocument],
|
||||
|
@ -357,6 +357,83 @@ async def add_received_file_document(
|
|||
raise RuntimeError(f"Failed to process file document: {str(e)}")
|
||||
|
||||
|
||||
async def add_received_file_document_using_llamacloud(
|
||||
session: AsyncSession,
|
||||
file_name: str,
|
||||
llamacloud_markdown_document: str,
|
||||
search_space_id: int,
|
||||
) -> Optional[Document]:
|
||||
"""
|
||||
Process and store document content parsed by LlamaCloud.
|
||||
|
||||
Args:
|
||||
session: Database session
|
||||
file_name: Name of the processed file
|
||||
llamacloud_markdown_documents: List of markdown content from LlamaCloud parsing
|
||||
search_space_id: ID of the search space
|
||||
|
||||
Returns:
|
||||
Document object if successful, None if failed
|
||||
"""
|
||||
try:
|
||||
# Combine all markdown documents into one
|
||||
file_in_markdown = llamacloud_markdown_document
|
||||
|
||||
content_hash = generate_content_hash(file_in_markdown)
|
||||
|
||||
# Check if document with this content hash already exists
|
||||
existing_doc_result = await session.execute(
|
||||
select(Document).where(Document.content_hash == content_hash)
|
||||
)
|
||||
existing_document = existing_doc_result.scalars().first()
|
||||
|
||||
if existing_document:
|
||||
logging.info(f"Document with content hash {content_hash} already exists. Skipping processing.")
|
||||
return existing_document
|
||||
|
||||
# Generate summary
|
||||
summary_chain = SUMMARY_PROMPT_TEMPLATE | config.long_context_llm_instance
|
||||
summary_result = await summary_chain.ainvoke({"document": file_in_markdown})
|
||||
summary_content = summary_result.content
|
||||
summary_embedding = config.embedding_model_instance.embed(summary_content)
|
||||
|
||||
# Process chunks
|
||||
chunks = [
|
||||
Chunk(
|
||||
content=chunk.text,
|
||||
embedding=config.embedding_model_instance.embed(chunk.text),
|
||||
)
|
||||
for chunk in config.chunker_instance.chunk(file_in_markdown)
|
||||
]
|
||||
|
||||
# Create and store document
|
||||
document = Document(
|
||||
search_space_id=search_space_id,
|
||||
title=file_name,
|
||||
document_type=DocumentType.FILE,
|
||||
document_metadata={
|
||||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "LLAMACLOUD",
|
||||
},
|
||||
content=summary_content,
|
||||
embedding=summary_embedding,
|
||||
chunks=chunks,
|
||||
content_hash=content_hash,
|
||||
)
|
||||
|
||||
session.add(document)
|
||||
await session.commit()
|
||||
await session.refresh(document)
|
||||
|
||||
return document
|
||||
except SQLAlchemyError as db_error:
|
||||
await session.rollback()
|
||||
raise db_error
|
||||
except Exception as e:
|
||||
await session.rollback()
|
||||
raise RuntimeError(f"Failed to process file document using LlamaCloud: {str(e)}")
|
||||
|
||||
|
||||
async def add_youtube_video_document(
|
||||
session: AsyncSession, url: str, search_space_id: int
|
||||
):
|
||||
|
|
|
@ -17,6 +17,7 @@ dependencies = [
|
|||
"langgraph>=0.3.29",
|
||||
"linkup-sdk>=0.2.4",
|
||||
"litellm>=1.61.4",
|
||||
"llama-cloud-services>=0.6.25",
|
||||
"markdownify>=0.14.1",
|
||||
"notion-client>=2.3.0",
|
||||
"pgvector>=0.3.6",
|
||||
|
|
128
surfsense_backend/uv.lock
generated
128
surfsense_backend/uv.lock
generated
|
@ -110,6 +110,18 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/ec/6a/bc7e17a3e87a2985d3e8f4da4cd0f481060eb78fb08596c42be62c90a4d9/aiosignal-1.3.2-py2.py3-none-any.whl", hash = "sha256:45cde58e409a301715980c2b01d0c28bdde3770d8290b5eb2173759d9acb31a5", size = 7597 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aiosqlite"
|
||||
version = "0.21.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/13/7d/8bca2bf9a247c2c5dfeec1d7a5f40db6518f88d314b8bca9da29670d2671/aiosqlite-0.21.0.tar.gz", hash = "sha256:131bb8056daa3bc875608c631c678cda73922a2d4ba8aec373b19f18c17e7aa3", size = 13454 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f5/10/6c25ed6de94c49f88a91fa5018cb4c0f3625f31d5be9f771ebe5cc7cd506/aiosqlite-0.21.0-py3-none-any.whl", hash = "sha256:2549cf4057f95f53dcba16f2b64e8e2791d7e1adedb13197dd8ed77bb226d7d0", size = 15792 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "alembic"
|
||||
version = "1.15.2"
|
||||
|
@ -228,6 +240,22 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "banks"
|
||||
version = "2.1.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "deprecated" },
|
||||
{ name = "griffe" },
|
||||
{ name = "jinja2" },
|
||||
{ name = "platformdirs" },
|
||||
{ name = "pydantic" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/77/34/2b6697f02ffb68bee50e5fd37d6c64432244d3245603fd62950169dfed7e/banks-2.1.2.tar.gz", hash = "sha256:a0651db9d14b57fa2e115e78f68dbb1b36fe226ad6eef96192542908b1d20c1f", size = 173332 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/04/4a/7fdca29d1db62f5f5c3446bf8f668beacdb0b5a8aff4247574ddfddc6bcd/banks-2.1.2-py3-none-any.whl", hash = "sha256:7fba451069f6bea376483b8136a0f29cb1e6883133626d00e077e20a3d102c0e", size = 28064 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bcrypt"
|
||||
version = "4.2.1"
|
||||
|
@ -572,6 +600,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/6e/c6/ac0b6c1e2d138f1002bcf799d330bd6d85084fece321e662a14223794041/Deprecated-1.2.18-py2.py3-none-any.whl", hash = "sha256:bd5011788200372a32418f888e326a09ff80d0214bd961147cfed01b5c018eec", size = 9998 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirtyjson"
|
||||
version = "1.0.8"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/db/04/d24f6e645ad82ba0ef092fa17d9ef7a21953781663648a01c9371d9e8e98/dirtyjson-1.0.8.tar.gz", hash = "sha256:90ca4a18f3ff30ce849d100dcf4a003953c79d3a2348ef056f1d9c22231a25fd", size = 30782 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/68/69/1bcf70f81de1b4a9f21b3a62ec0c83bdff991c88d6cc2267d02408457e88/dirtyjson-1.0.8-py3-none-any.whl", hash = "sha256:125e27248435a58acace26d5c2c4c11a1c0de0a9c5124c5a94ba78e517d74f53", size = 25197 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "distro"
|
||||
version = "1.9.0"
|
||||
|
@ -988,6 +1025,18 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/ac/38/08cc303ddddc4b3d7c628c3039a61a3aae36c241ed01393d00c2fd663473/greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6", size = 1142112 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "griffe"
|
||||
version = "1.7.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "colorama" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a9/3e/5aa9a61f7c3c47b0b52a1d930302992229d191bf4bc76447b324b731510a/griffe-1.7.3.tar.gz", hash = "sha256:52ee893c6a3a968b639ace8015bec9d36594961e156e23315c8e8e51401fa50b", size = 395137 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/58/c6/5c20af38c2a57c15d87f7f38bee77d63c1d2a3689f74fefaf35915dd12b2/griffe-1.7.3-py3-none-any.whl", hash = "sha256:c6b3ee30c2f0f17f30bcdef5068d6ab7a2a4f1b8bf1a3e74b56fffd21e1c5f75", size = 129303 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "grpcio"
|
||||
version = "1.71.0"
|
||||
|
@ -1604,6 +1653,72 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/f9/c2/1b6c502909b7af9054736af61e27558a3341e8c1ba28e7f82473e6dd936f/litellm-1.61.4-py3-none-any.whl", hash = "sha256:e87e0d397a191795b4217f9299fc9b21eaacaab91409695f0a4780cceccda6e1", size = 6814517 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "llama-cloud"
|
||||
version = "0.1.23"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "certifi" },
|
||||
{ name = "httpx" },
|
||||
{ name = "pydantic" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5b/e4/d1a30167ed6690a408382be1cf7de220a506085f4371baaf067d65bad8fd/llama_cloud-0.1.23.tar.gz", hash = "sha256:3d84a24a860f046d39a106c06742ec0ea39a574ac42bbf91706fe025f44e233e", size = 101292 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/15/3b56acef877dbc5d01d7e1a782c2cc50ef8a08d5773121c3bc20546de582/llama_cloud-0.1.23-py3-none-any.whl", hash = "sha256:ce95b0705d85c99b3b27b0af0d16a17d9a81b14c96bf13c1063a1bd13d8d0446", size = 267343 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "llama-cloud-services"
|
||||
version = "0.6.25"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "click" },
|
||||
{ name = "llama-cloud" },
|
||||
{ name = "llama-index-core" },
|
||||
{ name = "platformdirs" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "python-dotenv" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/79/c0/89f89dfc2c2b6c2d5c1c5fde9f445696eb12f9c2a4e17637ab0aaf7cc373/llama_cloud_services-0.6.25.tar.gz", hash = "sha256:3608004b0cf984640a3a36657b8b40394d7ce2c48e3eb9dd24fc654df7643595", size = 32303 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/f1/99b8ef4a636dafd5f1ae1e1b19eb9f793f51573d782919bf01d9b9f797f4/llama_cloud_services-0.6.25-py3-none-any.whl", hash = "sha256:aef0afbbf0d6dc485e6566af2daeeefa8caa7bc7f6511d860036bc0aac15361b", size = 37231 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "llama-index-core"
|
||||
version = "0.12.39"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "aiohttp" },
|
||||
{ name = "aiosqlite" },
|
||||
{ name = "banks" },
|
||||
{ name = "dataclasses-json" },
|
||||
{ name = "deprecated" },
|
||||
{ name = "dirtyjson" },
|
||||
{ name = "filetype" },
|
||||
{ name = "fsspec" },
|
||||
{ name = "httpx" },
|
||||
{ name = "nest-asyncio" },
|
||||
{ name = "networkx" },
|
||||
{ name = "nltk" },
|
||||
{ name = "numpy" },
|
||||
{ name = "pillow" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pyyaml" },
|
||||
{ name = "requests" },
|
||||
{ name = "sqlalchemy", extra = ["asyncio"] },
|
||||
{ name = "tenacity" },
|
||||
{ name = "tiktoken" },
|
||||
{ name = "tqdm" },
|
||||
{ name = "typing-extensions" },
|
||||
{ name = "typing-inspect" },
|
||||
{ name = "wrapt" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f7/45/163806502804ff75ace474f868cc33158774c4eb31d565133f32932e930e/llama_index_core-0.12.39.tar.gz", hash = "sha256:0cca9de59953542a3c2f1db61327c5204e0b1e997f31f1200e49392b2879593a", size = 7292040 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/dd/a3/583d80764df75aefc9885f28dcc06a0e5aefc993fa5318186e70f2340d73/llama_index_core-0.12.39-py3-none-any.whl", hash = "sha256:c255ed87aa85e43893f2bb05870b61ce7701d7a6a931d174ba925def5856b4c2", size = 7664906 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lxml"
|
||||
version = "5.3.1"
|
||||
|
@ -2468,6 +2583,15 @@ wheels = [
|
|||
{ url = "https://files.pythonhosted.org/packages/cf/6c/41c21c6c8af92b9fea313aa47c75de49e2f9a467964ee33eb0135d47eb64/pillow-11.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:67cd427c68926108778a9005f2a04adbd5e67c442ed21d95389fe1d595458756", size = 2377651 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "platformdirs"
|
||||
version = "4.3.8"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/fe/8b/3c73abc9c759ecd3f1f7ceff6685840859e8070c4d947c93fae71f6a0bf2/platformdirs-4.3.8.tar.gz", hash = "sha256:3d512d96e16bcb959a814c9f348431070822a6496326a4be0911c40b5a74c2bc", size = 21362 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/fe/39/979e8e21520d4e47a0bbe349e2713c0aac6f3d853d0e5b34d76206c439aa/platformdirs-4.3.8-py3-none-any.whl", hash = "sha256:ff7059bb7eb1179e2685604f4aaf157cfd9535242bd23742eadc3c13542139b4", size = 18567 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "playwright"
|
||||
version = "1.50.0"
|
||||
|
@ -3392,7 +3516,7 @@ wheels = [
|
|||
|
||||
[[package]]
|
||||
name = "surf-new-backend"
|
||||
version = "0.0.6"
|
||||
version = "0.0.7"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "alembic" },
|
||||
|
@ -3407,6 +3531,7 @@ dependencies = [
|
|||
{ name = "langgraph" },
|
||||
{ name = "linkup-sdk" },
|
||||
{ name = "litellm" },
|
||||
{ name = "llama-cloud-services" },
|
||||
{ name = "markdownify" },
|
||||
{ name = "notion-client" },
|
||||
{ name = "pgvector" },
|
||||
|
@ -3438,6 +3563,7 @@ requires-dist = [
|
|||
{ name = "langgraph", specifier = ">=0.3.29" },
|
||||
{ name = "linkup-sdk", specifier = ">=0.2.4" },
|
||||
{ name = "litellm", specifier = ">=1.61.4" },
|
||||
{ name = "llama-cloud-services", specifier = ">=0.6.25" },
|
||||
{ name = "markdownify", specifier = ">=0.14.1" },
|
||||
{ name = "notion-client", specifier = ">=2.3.0" },
|
||||
{ name = "pgvector", specifier = ">=0.3.6" },
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
NEXT_PUBLIC_FASTAPI_BACKEND_URL=http://localhost:8000
|
||||
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
|
||||
NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE=LOCAL or GOOGLE
|
||||
NEXT_PUBLIC_ETL_SERVICE=UNSTRUCTURED or LLAMACLOUD
|
|
@ -42,38 +42,95 @@ export default function FileUploader() {
|
|||
const router = useRouter();
|
||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||
|
||||
const acceptedFileTypes = {
|
||||
'image/bmp': ['.bmp'],
|
||||
'text/csv': ['.csv'],
|
||||
'application/msword': ['.doc'],
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
||||
'message/rfc822': ['.eml'],
|
||||
'application/epub+zip': ['.epub'],
|
||||
'image/heic': ['.heic'],
|
||||
'text/html': ['.html'],
|
||||
'image/jpeg': ['.jpeg', '.jpg'],
|
||||
'image/png': ['.png'],
|
||||
'text/markdown': ['.md', '.markdown'],
|
||||
'application/vnd.ms-outlook': ['.msg'],
|
||||
'application/vnd.oasis.opendocument.text': ['.odt'],
|
||||
'text/x-org': ['.org'],
|
||||
'application/pkcs7-signature': ['.p7s'],
|
||||
'application/pdf': ['.pdf'],
|
||||
'application/vnd.ms-powerpoint': ['.ppt'],
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
|
||||
'text/x-rst': ['.rst'],
|
||||
'application/rtf': ['.rtf'],
|
||||
'image/tiff': ['.tiff'],
|
||||
'text/plain': ['.txt'],
|
||||
'text/tab-separated-values': ['.tsv'],
|
||||
'application/vnd.ms-excel': ['.xls'],
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||
'application/xml': ['.xml'],
|
||||
// Audio files are always supported (using whisper)
|
||||
const audioFileTypes = {
|
||||
'audio/mpeg': ['.mp3', '.mpeg', '.mpga'],
|
||||
'audio/mp4': ['.mp4', '.m4a'],
|
||||
'audio/wav': ['.wav'],
|
||||
'audio/webm': ['.webm'],
|
||||
}
|
||||
};
|
||||
|
||||
// Conditionally set accepted file types based on ETL service
|
||||
const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD'
|
||||
? {
|
||||
// LlamaCloud supported file types
|
||||
'application/pdf': ['.pdf'],
|
||||
'application/msword': ['.doc'],
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
||||
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
|
||||
'application/msword-template': ['.dot'],
|
||||
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
|
||||
'application/vnd.ms-powerpoint': ['.ppt'],
|
||||
'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'],
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
|
||||
'application/vnd.ms-powerpoint.template': ['.pot'],
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'],
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||
'application/vnd.ms-excel': ['.xls'],
|
||||
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
|
||||
'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'],
|
||||
'application/vnd.ms-excel.workspace': ['.xlw'],
|
||||
'application/rtf': ['.rtf'],
|
||||
'application/xml': ['.xml'],
|
||||
'application/epub+zip': ['.epub'],
|
||||
'application/vnd.apple.keynote': ['.key'],
|
||||
'application/vnd.apple.pages': ['.pages'],
|
||||
'application/vnd.apple.numbers': ['.numbers'],
|
||||
'application/vnd.wordperfect': ['.wpd'],
|
||||
'application/vnd.oasis.opendocument.text': ['.odt'],
|
||||
'application/vnd.oasis.opendocument.presentation': ['.odp'],
|
||||
'application/vnd.oasis.opendocument.graphics': ['.odg'],
|
||||
'application/vnd.oasis.opendocument.spreadsheet': ['.ods'],
|
||||
'application/vnd.oasis.opendocument.formula': ['.fods'],
|
||||
'text/plain': ['.txt'],
|
||||
'text/csv': ['.csv'],
|
||||
'text/tab-separated-values': ['.tsv'],
|
||||
'text/html': ['.html', '.htm', '.web'],
|
||||
'image/jpeg': ['.jpg', '.jpeg'],
|
||||
'image/png': ['.png'],
|
||||
'image/gif': ['.gif'],
|
||||
'image/bmp': ['.bmp'],
|
||||
'image/svg+xml': ['.svg'],
|
||||
'image/tiff': ['.tiff'],
|
||||
'image/webp': ['.webp'],
|
||||
'application/dbase': ['.dbf'],
|
||||
'application/vnd.lotus-1-2-3': ['.123'],
|
||||
'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'],
|
||||
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'],
|
||||
// Audio files (always supported)
|
||||
...audioFileTypes,
|
||||
}
|
||||
: {
|
||||
// Unstructured supported file types
|
||||
'image/bmp': ['.bmp'],
|
||||
'text/csv': ['.csv'],
|
||||
'application/msword': ['.doc'],
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
||||
'message/rfc822': ['.eml'],
|
||||
'application/epub+zip': ['.epub'],
|
||||
'image/heic': ['.heic'],
|
||||
'text/html': ['.html'],
|
||||
'image/jpeg': ['.jpeg', '.jpg'],
|
||||
'image/png': ['.png'],
|
||||
'text/markdown': ['.md', '.markdown'],
|
||||
'application/vnd.ms-outlook': ['.msg'],
|
||||
'application/vnd.oasis.opendocument.text': ['.odt'],
|
||||
'text/x-org': ['.org'],
|
||||
'application/pkcs7-signature': ['.p7s'],
|
||||
'application/pdf': ['.pdf'],
|
||||
'application/vnd.ms-powerpoint': ['.ppt'],
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
|
||||
'text/x-rst': ['.rst'],
|
||||
'application/rtf': ['.rtf'],
|
||||
'image/tiff': ['.tiff'],
|
||||
'text/plain': ['.txt'],
|
||||
'text/tab-separated-values': ['.tsv'],
|
||||
'application/vnd.ms-excel': ['.xls'],
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||
'application/xml': ['.xml'],
|
||||
// Audio files (always supported)
|
||||
...audioFileTypes,
|
||||
};
|
||||
|
||||
const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort()
|
||||
|
||||
|
|
|
@ -90,7 +90,9 @@ Before you begin, ensure you have:
|
|||
| FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) |
|
||||
| STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) |
|
||||
| LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) |
|
||||
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing |
|
||||
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) |
|
||||
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
|
||||
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |
|
||||
| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
|
||||
| TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) |
|
||||
| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
|
||||
|
@ -136,6 +138,7 @@ For other LLM providers, refer to the [LiteLLM documentation](https://docs.litel
|
|||
| ------------------------------- | ---------------------------------------------------------- |
|
||||
| NEXT_PUBLIC_FASTAPI_BACKEND_URL | URL of the backend service (e.g., `http://localhost:8000`) |
|
||||
| NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication |
|
||||
| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface |
|
||||
|
||||
2. **Build and Start Containers**
|
||||
|
||||
|
|
|
@ -61,7 +61,9 @@ Edit the `.env` file and set the following variables:
|
|||
| FAST_LLM | LiteLLM routed smaller, faster LLM (e.g., `openai/gpt-4o-mini`, `ollama/deepseek-r1:8b`) |
|
||||
| STRATEGIC_LLM | LiteLLM routed advanced LLM for complex tasks (e.g., `openai/gpt-4o`, `ollama/gemma3:12b`) |
|
||||
| LONG_CONTEXT_LLM | LiteLLM routed LLM for longer context windows (e.g., `gemini/gemini-2.0-flash`, `ollama/deepseek-r1:8b`) |
|
||||
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing |
|
||||
| ETL_SERVICE | Document parsing service: `UNSTRUCTURED` (supports 34+ formats) or `LLAMACLOUD` (supports 50+ formats including legacy document types) |
|
||||
| UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing (required if ETL_SERVICE=UNSTRUCTURED) |
|
||||
| LLAMA_CLOUD_API_KEY | API key for LlamaCloud service for document parsing (required if ETL_SERVICE=LLAMACLOUD) |
|
||||
| FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling |
|
||||
| TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) |
|
||||
| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) |
|
||||
|
@ -182,6 +184,7 @@ Edit the `.env` file and set:
|
|||
| ------------------------------- | ------------------------------------------- |
|
||||
| NEXT_PUBLIC_FASTAPI_BACKEND_URL | Backend URL (e.g., `http://localhost:8000`) |
|
||||
| NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE | Same value as set in backend AUTH_TYPE i.e `GOOGLE` for OAuth with Google, `LOCAL` for email/password authentication |
|
||||
| NEXT_PUBLIC_ETL_SERVICE | Document parsing service (should match backend ETL_SERVICE): `UNSTRUCTURED` or `LLAMACLOUD` - affects supported file formats in upload interface |
|
||||
|
||||
### 2. Install Dependencies
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue