diff --git a/README.md b/README.md index e8979cf..7ffecc9 100644 --- a/README.md +++ b/README.md @@ -27,28 +27,27 @@ https://github.com/user-attachments/assets/bf64a6ca-934b-47ac-9e1b-edac5fe972ec ## Key Features -### 1. Latest -#### 💡 **Idea**: +### 💡 **Idea**: Have your own highly customizable private NotebookLM and Perplexity integrated with external sources. -#### 📁 **Multiple File Format Uploading Support** -Save content from your own personal files *(Documents, images and supports **27 file extensions**)* to your own personal knowledge base . -#### 🔍 **Powerful Search** +### 📁 **Multiple File Format Uploading Support** +Save content from your own personal files *(Documents, images, videos and supports **34 file extensions**)* to your own personal knowledge base . +### 🔍 **Powerful Search** Quickly research or find anything in your saved content . -#### đŸ’Ŧ **Chat with your Saved Content** +### đŸ’Ŧ **Chat with your Saved Content** Interact in Natural Language and get cited answers. -#### 📄 **Cited Answers** +### 📄 **Cited Answers** Get Cited answers just like Perplexity. -#### 🔔 **Privacy & Local LLM Support** +### 🔔 **Privacy & Local LLM Support** Works Flawlessly with Ollama local LLMs. -#### 🏠 **Self Hostable** +### 🏠 **Self Hostable** Open source and easy to deploy locally. -#### đŸŽ™ī¸ Podcasts +### đŸŽ™ī¸ Podcasts - Blazingly fast podcast generation agent. (Creates a 3-minute podcast in under 20 seconds.) - Convert your chat conversations into engaging audio content - Support for multiple TTS providers (OpenAI, Azure, Google Vertex AI) -#### 📊 **Advanced RAG Techniques** +### 📊 **Advanced RAG Techniques** - Supports 150+ LLM's - Supports 6000+ Embedding Models. - Supports all major Rerankers (Pinecode, Cohere, Flashrank etc) @@ -56,7 +55,7 @@ Open source and easy to deploy locally. - Utilizes Hybrid Search (Semantic + Full Text Search combined with Reciprocal Rank Fusion). - RAG as a Service API Backend. 
-#### â„šī¸ **External Sources** +### â„šī¸ **External Sources** - Search Engines (Tavily, LinkUp) - Slack - Linear @@ -65,7 +64,39 @@ Open source and easy to deploy locally. - GitHub - and more to come..... -#### 🔖 Cross Browser Extension +### 📄 **Supported File Extensions** + +#### Document + +`.doc`, `.docx`, `.odt`, `.rtf`, `.pdf`, `.xml` + +#### Text & Markup + +`.txt`, `.md`, `.markdown`, `.rst`, `.html`, `.org` + +#### Spreadsheets & Tables + +`.xls`, `.xlsx`, `.csv`, `.tsv` + +#### Audio & Video + +`.mp3`, `.mpga`, `.m4a`, `.wav`, `.mp4`, `.mpeg`, `.webm` + +#### Images + +`.jpg`, `.jpeg`, `.png`, `.bmp`, `.tiff`, `.heic` + +#### Email & eBooks + +`.eml`, `.msg`, `.epub` + +#### PowerPoint Presentations & Other + +`.ppt`, `.pptx`, `.p7s` + + + +### 🔖 Cross Browser Extension - The SurfSense extension can be used to save any webpage you like. - Its main usecase is to save any webpages protected beyond authentication. @@ -209,16 +240,8 @@ Before installation, make sure to complete the [prerequisite setup steps](https: ## Future Work - Add More Connectors. - Patch minor bugs. -- Implement Canvas. -- Complete Hybrid Search. **[Done]** -- Add support for file uploads QA. **[Done]** -- Shift to WebSockets for Streaming responses. **[Deprecated in favor of AI SDK Stream Protocol]** -- Based on feedback, I will work on making it compatible with local models. 
**[Done]** -- Cross Browser Extension **[Done]** -- Critical Notifications **[Done | PAUSED]** -- Saving Chats **[Done]** -- Basic keyword search page for saved sessions **[Done]** -- Multi & Single Document Chat **[Done]** +- Document Chat **[REIMPLEMENT]** +- Document Podcasts diff --git a/surfsense_backend/.env.example b/surfsense_backend/.env.example index 53a8fb5..19a4115 100644 --- a/surfsense_backend/.env.example +++ b/surfsense_backend/.env.example @@ -18,6 +18,9 @@ LONG_CONTEXT_LLM="gemini/gemini-2.0-flash" #LiteLLM TTS Provider: https://docs.litellm.ai/docs/text_to_speech#supported-providers TTS_SERVICE="openai/tts-1" +#LiteLLM STT Provider: https://docs.litellm.ai/docs/audio_transcription#supported-providers +STT_SERVICE="openai/whisper-1" + # Chosen LiteLLM Providers Keys OPENAI_API_KEY="sk-proj-iA" GEMINI_API_KEY="AIzaSyB6-1641124124124124124124124124124" @@ -35,3 +38,5 @@ LANGSMITH_PROJECT="surfsense" FAST_LLM_API_BASE="" STRATEGIC_LLM_API_BASE="" LONG_CONTEXT_LLM_API_BASE="" +TTS_SERVICE_API_BASE="" +STT_SERVICE_API_BASE="" diff --git a/surfsense_backend/app/agents/podcaster/nodes.py b/surfsense_backend/app/agents/podcaster/nodes.py index 19a233a..9ea590a 100644 --- a/surfsense_backend/app/agents/podcaster/nodes.py +++ b/surfsense_backend/app/agents/podcaster/nodes.py @@ -135,14 +135,23 @@ async def create_merged_podcast_audio(state: State, config: RunnableConfig) -> D filename = f"{temp_dir}/{session_id}_{index}.mp3" try: - # Generate speech using litellm - response = await aspeech( - model=app_config.TTS_SERVICE, - voice=voice, - input=dialog, - max_retries=2, - timeout=600, - ) + if app_config.TTS_SERVICE_API_BASE: + response = await aspeech( + model=app_config.TTS_SERVICE, + api_base=app_config.TTS_SERVICE_API_BASE, + voice=voice, + input=dialog, + max_retries=2, + timeout=600, + ) + else: + response = await aspeech( + model=app_config.TTS_SERVICE, + voice=voice, + input=dialog, + max_retries=2, + timeout=600, + ) # Save the audio to a file - use 
proper streaming method with open(filename, 'wb') as f: diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 7fd032a..eed4627 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -6,7 +6,7 @@ from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker from dotenv import load_dotenv from langchain_community.chat_models import ChatLiteLLM from rerankers import Reranker -from litellm import speech + # Get the base directory of the project BASE_DIR = Path(__file__).resolve().parent.parent.parent @@ -97,6 +97,12 @@ class Config: # Litellm TTS Configuration TTS_SERVICE = os.getenv("TTS_SERVICE") + TTS_SERVICE_API_BASE = os.getenv("TTS_SERVICE_API_BASE") + + # Litellm STT Configuration + STT_SERVICE = os.getenv("STT_SERVICE") + STT_SERVICE_API_BASE = os.getenv("STT_SERVICE_API_BASE") + # Validation Checks # Check embedding dimension diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index bbefbcd..5ea2327 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1,3 +1,4 @@ +from litellm import atranscription from fastapi import APIRouter, Depends, BackgroundTasks, UploadFile, Form, HTTPException from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select @@ -7,6 +8,7 @@ from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead from app.users import current_active_user from app.utils.check_ownership import check_ownership from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document, add_crawled_url_document, add_youtube_video_document +from app.config import config as app_config # Force asyncio to use standard event loop before unstructured imports import asyncio try: @@ -17,9 +19,9 @@ import os 
os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1" - router = APIRouter() + @router.post("/documents/") async def create_documents( request: DocumentsCreate, @@ -30,19 +32,19 @@ async def create_documents( try: # Check if the user owns the search space await check_ownership(session, SearchSpace, request.search_space_id, user) - + if request.document_type == DocumentType.EXTENSION: for individual_document in request.content: fastapi_background_tasks.add_task( - process_extension_document_with_new_session, - individual_document, + process_extension_document_with_new_session, + individual_document, request.search_space_id ) elif request.document_type == DocumentType.CRAWLED_URL: - for url in request.content: + for url in request.content: fastapi_background_tasks.add_task( - process_crawled_url_with_new_session, - url, + process_crawled_url_with_new_session, + url, request.search_space_id ) elif request.document_type == DocumentType.YOUTUBE_VIDEO: @@ -57,7 +59,7 @@ async def create_documents( status_code=400, detail="Invalid document type" ) - + await session.commit() return {"message": "Documents processed successfully"} except HTTPException: @@ -69,6 +71,7 @@ async def create_documents( detail=f"Failed to process documents: {str(e)}" ) + @router.post("/documents/fileupload") async def create_documents( files: list[UploadFile], @@ -79,26 +82,26 @@ async def create_documents( ): try: await check_ownership(session, SearchSpace, search_space_id, user) - + if not files: raise HTTPException(status_code=400, detail="No files provided") - + for file in files: try: # Save file to a temporary location to avoid stream issues import tempfile import aiofiles import os - + # Create temp file with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file: temp_path = temp_file.name - + # Write uploaded file to temp file content = await file.read() with open(temp_path, "wb") as f: f.write(content) - + # Process in background to avoid uvloop 
conflicts fastapi_background_tasks.add_task( process_file_in_background_with_new_session, @@ -111,7 +114,7 @@ async def create_documents( status_code=422, detail=f"Failed to process file {file.filename}: {str(e)}" ) - + await session.commit() return {"message": "Files uploaded for processing"} except HTTPException: @@ -136,14 +139,14 @@ async def process_file_in_background( # For markdown files, read the content directly with open(file_path, 'r', encoding='utf-8') as f: markdown_content = f.read() - + # Clean up the temp file import os try: os.unlink(file_path) except: pass - + # Process markdown directly through specialized function await add_received_markdown_file_document( session, @@ -151,10 +154,46 @@ async def process_file_in_background( markdown_content, search_space_id ) + # Check if the file is an audio file + elif filename.lower().endswith(('.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.wav', '.webm')): + # Open the audio file for transcription + with open(file_path, "rb") as audio_file: + # Use LiteLLM for audio transcription + if app_config.STT_SERVICE_API_BASE: + transcription_response = await atranscription( + model=app_config.STT_SERVICE, + file=audio_file, + api_base=app_config.STT_SERVICE_API_BASE + ) + else: + transcription_response = await atranscription( + model=app_config.STT_SERVICE, + file=audio_file + ) + + # Extract the transcribed text + transcribed_text = transcription_response.get("text", "") + + # Add metadata about the transcription + transcribed_text = f"# Transcription of {filename}\n\n{transcribed_text}" + + # Clean up the temp file + try: + os.unlink(file_path) + except: + pass + + # Process transcription as markdown document + await add_received_markdown_file_document( + session, + filename, + transcribed_text, + search_space_id + ) else: # Use synchronous unstructured API to avoid event loop issues from langchain_unstructured import UnstructuredLoader - + # Process the file loader = UnstructuredLoader( file_path, @@ -165,16 +204,16
@@ async def process_file_in_background( include_metadata=False, strategy="auto", ) - + docs = await loader.aload() - + # Clean up the temp file import os try: os.unlink(file_path) except: pass - + # Pass the documents to the existing background task await add_received_file_document( session, @@ -186,6 +225,7 @@ async def process_file_in_background( import logging logging.error(f"Error processing file in background: {str(e)}") + @router.get("/documents/", response_model=List[DocumentRead]) async def read_documents( skip: int = 0, @@ -195,17 +235,18 @@ async def read_documents( user: User = Depends(current_active_user) ): try: - query = select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id) - + query = select(Document).join(SearchSpace).filter( + SearchSpace.user_id == user.id) + # Filter by search_space_id if provided if search_space_id is not None: query = query.filter(Document.search_space_id == search_space_id) - + result = await session.execute( query.offset(skip).limit(limit) ) db_documents = result.scalars().all() - + # Convert database objects to API-friendly format api_documents = [] for doc in db_documents: @@ -218,7 +259,7 @@ async def read_documents( created_at=doc.created_at, search_space_id=doc.search_space_id )) - + return api_documents except Exception as e: raise HTTPException( @@ -226,6 +267,7 @@ async def read_documents( detail=f"Failed to fetch documents: {str(e)}" ) + @router.get("/documents/{document_id}", response_model=DocumentRead) async def read_document( document_id: int, @@ -239,13 +281,13 @@ async def read_document( .filter(Document.id == document_id, SearchSpace.user_id == user.id) ) document = result.scalars().first() - + if not document: raise HTTPException( status_code=404, detail=f"Document with id {document_id} not found" ) - + # Convert database object to API-friendly format return DocumentRead( id=document.id, @@ -262,6 +304,7 @@ async def read_document( detail=f"Failed to fetch document: {str(e)}" ) + 
@router.put("/documents/{document_id}", response_model=DocumentRead) async def update_document( document_id: int, @@ -277,19 +320,19 @@ async def update_document( .filter(Document.id == document_id, SearchSpace.user_id == user.id) ) db_document = result.scalars().first() - + if not db_document: raise HTTPException( status_code=404, detail=f"Document with id {document_id} not found" ) - + update_data = document_update.model_dump(exclude_unset=True) for key, value in update_data.items(): setattr(db_document, key, value) await session.commit() await session.refresh(db_document) - + # Convert to DocumentRead for response return DocumentRead( id=db_document.id, @@ -309,6 +352,7 @@ async def update_document( detail=f"Failed to update document: {str(e)}" ) + @router.delete("/documents/{document_id}", response_model=dict) async def delete_document( document_id: int, @@ -323,13 +367,13 @@ async def delete_document( .filter(Document.id == document_id, SearchSpace.user_id == user.id) ) document = result.scalars().first() - + if not document: raise HTTPException( status_code=404, detail=f"Document with id {document_id} not found" ) - + await session.delete(document) await session.commit() return {"message": "Document deleted successfully"} @@ -340,16 +384,16 @@ async def delete_document( raise HTTPException( status_code=500, detail=f"Failed to delete document: {str(e)}" - ) - - + ) + + async def process_extension_document_with_new_session( individual_document, search_space_id: int ): """Create a new session and process extension document.""" from app.db import async_session_maker - + async with async_session_maker() as session: try: await add_extension_received_document(session, individual_document, search_space_id) @@ -357,13 +401,14 @@ async def process_extension_document_with_new_session( import logging logging.error(f"Error processing extension document: {str(e)}") + async def process_crawled_url_with_new_session( url: str, search_space_id: int ): """Create a new session 
and process crawled URL.""" from app.db import async_session_maker - + async with async_session_maker() as session: try: await add_crawled_url_document(session, url, search_space_id) @@ -371,6 +416,7 @@ async def process_crawled_url_with_new_session( import logging logging.error(f"Error processing crawled URL: {str(e)}") + async def process_file_in_background_with_new_session( file_path: str, filename: str, @@ -378,21 +424,21 @@ async def process_file_in_background_with_new_session( ): """Create a new session and process file.""" from app.db import async_session_maker - + async with async_session_maker() as session: await process_file_in_background(file_path, filename, search_space_id, session) + async def process_youtube_video_with_new_session( url: str, search_space_id: int ): """Create a new session and process YouTube video.""" from app.db import async_session_maker - + async with async_session_maker() as session: try: await add_youtube_video_document(session, url, search_space_id) except Exception as e: import logging logging.error(f"Error processing YouTube video: {str(e)}") - diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx index a6fb3d1..e1adbe2 100644 --- a/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx +++ b/surfsense_web/app/dashboard/[search_space_id]/documents/upload/page.tsx @@ -53,7 +53,7 @@ export default function FileUploader() { 'text/html': ['.html'], 'image/jpeg': ['.jpeg', '.jpg'], 'image/png': ['.png'], - 'text/markdown': ['.md'], + 'text/markdown': ['.md', '.markdown'], 'application/vnd.ms-outlook': ['.msg'], 'application/vnd.oasis.opendocument.text': ['.odt'], 'text/x-org': ['.org'], @@ -69,6 +69,10 @@ export default function FileUploader() { 'application/vnd.ms-excel': ['.xls'], 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], 'application/xml': ['.xml'], + 'audio/mpeg': 
['.mp3', '.mpeg', '.mpga'], + 'audio/mp4': ['.mp4', '.m4a'], + 'audio/wav': ['.wav'], + 'audio/webm': ['.webm'], } const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort() diff --git a/surfsense_web/content/docs/docker-installation.mdx b/surfsense_web/content/docs/docker-installation.mdx index 1545fd5..a150082 100644 --- a/surfsense_web/content/docs/docker-installation.mdx +++ b/surfsense_web/content/docs/docker-installation.mdx @@ -94,6 +94,7 @@ Before you begin, ensure you have: | UNSTRUCTURED_API_KEY | API key for Unstructured.io service for document parsing | | FIRECRAWL_API_KEY | API key for Firecrawl service for web crawling | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | +| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | Include API keys for the LLM providers you're using. For example: @@ -114,6 +115,8 @@ Include API keys for the LLM providers you're using. For example: | FAST_LLM_API_BASE | Custom API base URL for the fast LLM | | STRATEGIC_LLM_API_BASE | Custom API base URL for the strategic LLM | | LONG_CONTEXT_LLM_API_BASE | Custom API base URL for the long context LLM | +| TTS_SERVICE_API_BASE | Custom API base URL for the Text-to-Speech (TTS) service | +| STT_SERVICE_API_BASE | Custom API base URL for the Speech-to-Text (STT) service | For other LLM providers, refer to the [LiteLLM documentation](https://docs.litellm.ai/docs/providers). 
diff --git a/surfsense_web/content/docs/manual-installation.mdx b/surfsense_web/content/docs/manual-installation.mdx index 749aac2..b3999dc 100644 --- a/surfsense_web/content/docs/manual-installation.mdx +++ b/surfsense_web/content/docs/manual-installation.mdx @@ -65,6 +65,7 @@ Edit the `.env` file and set the following variables: | UNSTRUCTURED_API_KEY | API key for Unstructured.io service | | FIRECRAWL_API_KEY | API key for Firecrawl service (if using crawler) | | TTS_SERVICE | Text-to-Speech API provider for Podcasts (e.g., `openai/tts-1`, `azure/neural`, `vertex_ai/`). See [supported providers](https://docs.litellm.ai/docs/text_to_speech#supported-providers) | +| STT_SERVICE | Speech-to-Text API provider for Podcasts (e.g., `openai/whisper-1`). See [supported providers](https://docs.litellm.ai/docs/audio_transcription#supported-providers) | **Important**: Since LLM calls are routed through LiteLLM, include API keys for the LLM providers you're using: @@ -86,6 +87,8 @@ Edit the `.env` file and set the following variables: | FAST_LLM_API_BASE | Custom API base URL for the fast LLM | | STRATEGIC_LLM_API_BASE | Custom API base URL for the strategic LLM | | LONG_CONTEXT_LLM_API_BASE | Custom API base URL for the long context LLM | +| TTS_SERVICE_API_BASE | Custom API base URL for the Text-to-Speech (TTS) service | +| STT_SERVICE_API_BASE | Custom API base URL for the Speech-to-Text (STT) service | ### 2. Install Dependencies