Merge pull request #286 from MODSetter/dev

feat: added jump to source referencing of citations
Commit bc89959d2f by Rohan Verma, 2025-08-23 19:40:46 -07:00, committed by GitHub
9 changed files with 819 additions and 564 deletions


@ -5,10 +5,24 @@ from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Up
from litellm import atranscription
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.config import config as app_config
from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session
from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate
from app.db import (
Chunk,
Document,
DocumentType,
Log,
SearchSpace,
User,
get_async_session,
)
from app.schemas import (
DocumentRead,
DocumentsCreate,
DocumentUpdate,
DocumentWithChunksRead,
)
from app.services.task_logging_service import TaskLoggingService
from app.tasks.document_processors import (
add_crawled_url_document,
@ -140,6 +154,423 @@ async def create_documents_file_upload(
) from e
@router.get("/documents/", response_model=list[DocumentRead])
async def read_documents(
skip: int = 0,
limit: int = 3000,
search_space_id: int | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
query = (
select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
)
# Filter by search_space_id if provided
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
result = await session.execute(query.offset(skip).limit(limit))
db_documents = result.scalars().all()
# Convert database objects to API-friendly format
api_documents = []
for doc in db_documents:
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
content=doc.content,
created_at=doc.created_at,
search_space_id=doc.search_space_id,
)
)
return api_documents
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch documents: {e!s}"
) from e
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Convert database object to API-friendly format
return DocumentRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
content=document.content,
created_at=document.created_at,
search_space_id=document.search_space_id,
)
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch document: {e!s}"
) from e
@router.put("/documents/{document_id}", response_model=DocumentRead)
async def update_document(
document_id: int,
document_update: DocumentUpdate,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
db_document = result.scalars().first()
if not db_document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
update_data = document_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(db_document, key, value)
await session.commit()
await session.refresh(db_document)
# Convert to DocumentRead for response
return DocumentRead(
id=db_document.id,
title=db_document.title,
document_type=db_document.document_type,
document_metadata=db_document.document_metadata,
content=db_document.content,
created_at=db_document.created_at,
search_space_id=db_document.search_space_id,
)
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to update document: {e!s}"
) from e
@router.delete("/documents/{document_id}", response_model=dict)
async def delete_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
await session.delete(document)
await session.commit()
return {"message": "Document deleted successfully"}
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to delete document: {e!s}"
) from e
@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
async def get_document_by_chunk_id(
chunk_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
The document's embedding and chunk embeddings are excluded from the response.
"""
try:
# First, get the chunk and verify it exists
chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
chunk = chunk_result.scalars().first()
if not chunk:
raise HTTPException(
status_code=404, detail=f"Chunk with id {chunk_id} not found"
)
# Get the associated document and verify ownership
document_result = await session.execute(
select(Document)
.options(selectinload(Document.chunks))
.join(SearchSpace)
.filter(Document.id == chunk.document_id, SearchSpace.user_id == user.id)
)
document = document_result.scalars().first()
if not document:
raise HTTPException(
status_code=404,
detail="Document not found or you don't have access to it",
)
# Sort chunks by creation time
sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
# Return the document with its chunks
return DocumentWithChunksRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
content=document.content,
created_at=document.created_at,
search_space_id=document.search_space_id,
chunks=sorted_chunks,
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to retrieve document: {e!s}"
) from e
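A minimal sketch of how a client might call this new by-chunk endpoint and locate the cited chunk in the response. The helper name and control flow here are illustrative only; the `/api/v1` prefix, the `NEXT_PUBLIC_FASTAPI_BACKEND_URL` env var, and the `surfsense_bearer_token` storage key match the frontend hook added later in this PR, which wires the same request into React state.

// Hypothetical client helper (illustration only); the real consumer is useDocumentByChunk below.
async function jumpToSource(chunkId: number) {
	const res = await fetch(
		`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
		{
			headers: { Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}` },
		}
	);
	if (!res.ok) throw new Error(`Failed to fetch document for chunk ${chunkId}`);
	// DocumentWithChunksRead: document fields plus its chunks ordered by creation time
	const doc = await res.json();
	const referenced = doc.chunks.find((c: { id: number }) => c.id === chunkId);
	return { doc, referenced };
}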
async def process_extension_document_with_new_session(
individual_document, search_space_id: int, user_id: str
):
"""Create a new session and process extension document."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_extension_document",
source="document_processor",
message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}",
metadata={
"document_type": "EXTENSION",
"url": individual_document.metadata.VisitedWebPageURL,
"title": individual_document.metadata.VisitedWebPageTitle,
"user_id": user_id,
},
)
try:
result = await add_extension_received_document(
session, individual_document, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}",
{"document_id": result.id, "content_hash": result.content_hash},
)
else:
await task_logger.log_task_success(
log_entry,
f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing extension document: {e!s}")
async def process_crawled_url_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process crawled URL."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_crawled_url",
source="document_processor",
message=f"Starting URL crawling and processing for: {url}",
metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
)
try:
result = await add_crawled_url_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": result.id,
"title": result.title,
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"URL document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing crawled URL: {e!s}")
async def process_file_in_background_with_new_session(
file_path: str, filename: str, search_space_id: int, user_id: str
):
"""Create a new session and process file."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_file_upload",
source="document_processor",
message=f"Starting file processing for: {filename}",
metadata={
"document_type": "FILE",
"filename": filename,
"file_path": file_path,
"user_id": user_id,
},
)
try:
await process_file_in_background(
file_path,
filename,
search_space_id,
user_id,
session,
task_logger,
log_entry,
)
# Note: success/failure logging is handled within process_file_in_background
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process file: {filename}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing file: {e!s}")
async def process_youtube_video_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process YouTube video."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_youtube_video",
source="document_processor",
message=f"Starting YouTube video processing for: {url}",
metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id},
)
try:
result = await add_youtube_video_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed YouTube video: {result.title}",
{
"document_id": result.id,
"video_id": result.document_metadata.get("video_id"),
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"YouTube video document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process YouTube video: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing YouTube video: {e!s}")
async def process_file_in_background(
file_path: str,
filename: str,
@ -508,363 +939,3 @@ async def process_file_in_background(
logging.error(f"Error processing file in background: {e!s}")
raise # Re-raise so the wrapper can also handle it
@router.get("/documents/", response_model=list[DocumentRead])
async def read_documents(
skip: int = 0,
limit: int = 3000,
search_space_id: int | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
query = (
select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
)
# Filter by search_space_id if provided
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
result = await session.execute(query.offset(skip).limit(limit))
db_documents = result.scalars().all()
# Convert database objects to API-friendly format
api_documents = []
for doc in db_documents:
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
content=doc.content,
created_at=doc.created_at,
search_space_id=doc.search_space_id,
)
)
return api_documents
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch documents: {e!s}"
) from e
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Convert database object to API-friendly format
return DocumentRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
content=document.content,
created_at=document.created_at,
search_space_id=document.search_space_id,
)
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch document: {e!s}"
) from e
@router.put("/documents/{document_id}", response_model=DocumentRead)
async def update_document(
document_id: int,
document_update: DocumentUpdate,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
db_document = result.scalars().first()
if not db_document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
update_data = document_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(db_document, key, value)
await session.commit()
await session.refresh(db_document)
# Convert to DocumentRead for response
return DocumentRead(
id=db_document.id,
title=db_document.title,
document_type=db_document.document_type,
document_metadata=db_document.document_metadata,
content=db_document.content,
created_at=db_document.created_at,
search_space_id=db_document.search_space_id,
)
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to update document: {e!s}"
) from e
@router.delete("/documents/{document_id}", response_model=dict)
async def delete_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
await session.delete(document)
await session.commit()
return {"message": "Document deleted successfully"}
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to delete document: {e!s}"
) from e
async def process_extension_document_with_new_session(
individual_document, search_space_id: int, user_id: str
):
"""Create a new session and process extension document."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_extension_document",
source="document_processor",
message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}",
metadata={
"document_type": "EXTENSION",
"url": individual_document.metadata.VisitedWebPageURL,
"title": individual_document.metadata.VisitedWebPageTitle,
"user_id": user_id,
},
)
try:
result = await add_extension_received_document(
session, individual_document, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}",
{"document_id": result.id, "content_hash": result.content_hash},
)
else:
await task_logger.log_task_success(
log_entry,
f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing extension document: {e!s}")
async def process_crawled_url_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process crawled URL."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_crawled_url",
source="document_processor",
message=f"Starting URL crawling and processing for: {url}",
metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
)
try:
result = await add_crawled_url_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": result.id,
"title": result.title,
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"URL document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing crawled URL: {e!s}")
async def process_file_in_background_with_new_session(
file_path: str, filename: str, search_space_id: int, user_id: str
):
"""Create a new session and process file."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_file_upload",
source="document_processor",
message=f"Starting file processing for: {filename}",
metadata={
"document_type": "FILE",
"filename": filename,
"file_path": file_path,
"user_id": user_id,
},
)
try:
await process_file_in_background(
file_path,
filename,
search_space_id,
user_id,
session,
task_logger,
log_entry,
)
# Note: success/failure logging is handled within process_file_in_background
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process file: {filename}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing file: {e!s}")
async def process_youtube_video_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process YouTube video."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_youtube_video",
source="document_processor",
message=f"Starting YouTube video processing for: {url}",
metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id},
)
try:
result = await add_youtube_video_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed YouTube video: {result.title}",
{
"document_id": result.id,
"video_id": result.document_metadata.get("video_id"),
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"YouTube video document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process YouTube video: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing YouTube video: {e!s}")


@ -13,6 +13,7 @@ from .documents import (
DocumentRead,
DocumentsCreate,
DocumentUpdate,
DocumentWithChunksRead,
ExtensionDocumentContent,
ExtensionDocumentMetadata,
)
@ -53,6 +54,7 @@ __all__ = [
"DocumentBase",
"DocumentRead",
"DocumentUpdate",
"DocumentWithChunksRead",
"DocumentsCreate",
"ExtensionDocumentContent",
"ExtensionDocumentMetadata",


@ -4,6 +4,8 @@ from pydantic import BaseModel, ConfigDict
from app.db import DocumentType
from .chunks import ChunkRead
class ExtensionDocumentMetadata(BaseModel):
BrowsingSessionId: str
@ -45,3 +47,9 @@ class DocumentRead(BaseModel):
search_space_id: int
model_config = ConfigDict(from_attributes=True)
class DocumentWithChunksRead(DocumentRead):
chunks: list[ChunkRead] = []
model_config = ConfigDict(from_attributes=True)


@ -1,58 +1,202 @@
"use client";
import { ExternalLink } from "lucide-react";
import { ExternalLink, FileText, Loader2 } from "lucide-react";
import type React from "react";
import { useEffect, useRef, useState } from "react";
import { MarkdownViewer } from "@/components/markdown-viewer";
import { Button } from "@/components/ui/button";
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
import { ScrollArea } from "@/components/ui/scroll-area";
import {
Sheet,
SheetContent,
SheetDescription,
SheetHeader,
SheetTitle,
SheetTrigger,
} from "@/components/ui/sheet";
import { useDocumentByChunk } from "@/hooks/use-document-by-chunk";
import { cn } from "@/lib/utils";
export const CitationDisplay: React.FC<{ index: number; node: any }> = ({ index, node }) => {
const truncateText = (text: string, maxLength: number = 200) => {
if (text.length <= maxLength) return text;
return `${text.substring(0, maxLength)}...`;
const chunkId = Number(node?.id);
const sourceType = node?.metadata?.source_type;
const [isOpen, setIsOpen] = useState(false);
const { document, loading, error, fetchDocumentByChunk, clearDocument } = useDocumentByChunk();
const chunksContainerRef = useRef<HTMLDivElement>(null);
const highlightedChunkRef = useRef<HTMLDivElement>(null);
// Check if this is a source type that should render directly from node
const isDirectRenderSource = sourceType === "TAVILY_API" || sourceType === "LINKUP_API";
const handleOpenChange = async (open: boolean) => {
setIsOpen(open);
if (open && chunkId && !isDirectRenderSource) {
await fetchDocumentByChunk(chunkId);
} else if (!open && !isDirectRenderSource) {
clearDocument();
}
};
useEffect(() => {
// Scroll to highlighted chunk when document loads
if (document && highlightedChunkRef.current && chunksContainerRef.current) {
setTimeout(() => {
highlightedChunkRef.current?.scrollIntoView({
behavior: "smooth",
block: "start",
});
}, 100);
}
}, [document]);
const handleUrlClick = (e: React.MouseEvent, url: string) => {
e.preventDefault();
e.stopPropagation();
window.open(url, "_blank", "noopener,noreferrer");
};
const formatDocumentType = (type: string) => {
return type
.split("_")
.map((word) => word.charAt(0) + word.slice(1).toLowerCase())
.join(" ");
};
return (
<Popover>
<PopoverTrigger asChild>
<Sheet open={isOpen} onOpenChange={handleOpenChange}>
<SheetTrigger asChild>
<span className="text-[10px] font-bold bg-slate-500 hover:bg-slate-600 text-white rounded-full w-4 h-4 inline-flex items-center justify-center align-super cursor-pointer transition-colors">
{index + 1}
</span>
</PopoverTrigger>
<PopoverContent className="w-80 p-4 space-y-3 relative" align="start">
{/* External Link Button - Top Right */}
{node?.url && (
<Button
size="icon"
variant="ghost"
onClick={(e) => handleUrlClick(e, node.url)}
className="absolute top-3 right-3 inline-flex items-center justify-center w-6 h-6 text-blue-600 hover:text-blue-800 dark:text-blue-400 dark:hover:text-blue-200 hover:bg-blue-50 dark:hover:bg-blue-900/20 rounded transition-colors"
title="Open in new tab"
>
<ExternalLink size={14} />
</Button>
</SheetTrigger>
<SheetContent side="right" className="w-full sm:max-w-5xl lg:max-w-7xl">
<SheetHeader className="px-6 py-4 border-b">
<SheetTitle className="flex items-center gap-3 text-lg">
<FileText className="h-6 w-6" />
{document?.title || node?.metadata?.title || node?.metadata?.group_name || "Source"}
</SheetTitle>
<SheetDescription className="text-base mt-2">
{document
? formatDocumentType(document.document_type)
: sourceType && formatDocumentType(sourceType)}
</SheetDescription>
</SheetHeader>
{!isDirectRenderSource && loading && (
<div className="flex items-center justify-center h-64 px-6">
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
</div>
)}
{/* Heading */}
<div className="text-sm font-semibold text-slate-900 dark:text-slate-100 pr-8">
{node?.metadata?.group_name || "Source"}
</div>
{!isDirectRenderSource && error && (
<div className="flex items-center justify-center h-64 px-6">
<p className="text-sm text-destructive">{error}</p>
</div>
)}
{/* Source */}
<div className="text-xs text-slate-600 dark:text-slate-400 font-medium">
{node?.metadata?.title || "Untitled"}
</div>
{/* Direct render for TAVILY_API and LINKUP_API */}
{isDirectRenderSource && (
<ScrollArea className="h-[calc(100vh-10rem)]">
<div className="px-6 py-4">
{/* External Link */}
{node?.url && (
<div className="mb-8">
<Button
size="default"
variant="outline"
onClick={(e) => handleUrlClick(e, node.url)}
className="w-full py-3"
>
<ExternalLink className="mr-2 h-4 w-4" />
Open in Browser
</Button>
</div>
)}
{/* Body */}
<div className="text-xs text-slate-700 dark:text-slate-300 leading-relaxed">
{truncateText(node?.text || "No content available")}
</div>
</PopoverContent>
</Popover>
{/* Source Information */}
<div className="mb-8 p-6 bg-muted/50 rounded-lg border">
<h3 className="text-base font-semibold mb-4">Source Information</h3>
<div className="text-sm text-muted-foreground mb-3 font-medium">
{node?.metadata?.title || "Untitled"}
</div>
<div className="text-sm text-foreground leading-relaxed whitespace-pre-wrap">
{node?.text || "No content available"}
</div>
</div>
</div>
</ScrollArea>
)}
{/* API-fetched document content */}
{!isDirectRenderSource && document && (
<ScrollArea className="h-[calc(100vh-10rem)]">
<div className="px-6 py-4">
{/* Document Metadata */}
{document.document_metadata && Object.keys(document.document_metadata).length > 0 && (
<div className="mb-8 p-6 bg-muted/50 rounded-lg border">
<h3 className="text-base font-semibold mb-4">Document Information</h3>
<dl className="grid grid-cols-1 gap-3 text-sm">
{Object.entries(document.document_metadata).map(([key, value]) => (
<div key={key} className="flex gap-3">
<dt className="font-medium text-muted-foreground capitalize min-w-0 flex-shrink-0">
{key.replace(/_/g, " ")}:
</dt>
<dd className="text-foreground break-words">{String(value)}</dd>
</div>
))}
</dl>
</div>
)}
{/* External Link */}
{node?.url && (
<div className="mb-8">
<Button
size="default"
variant="outline"
onClick={(e) => handleUrlClick(e, node.url)}
className="w-full py-3"
>
<ExternalLink className="mr-2 h-4 w-4" />
Open in Browser
</Button>
</div>
)}
{/* Chunks */}
<div className="space-y-6" ref={chunksContainerRef}>
<h3 className="text-base font-semibold mb-4">Document Content</h3>
{document.chunks.map((chunk, idx) => (
<div
key={chunk.id}
ref={chunk.id === chunkId ? highlightedChunkRef : null}
className={cn(
"p-6 rounded-lg border transition-all duration-300",
chunk.id === chunkId
? "bg-primary/10 border-primary shadow-md ring-1 ring-primary/20"
: "bg-background border-border hover:bg-muted/50 hover:border-muted-foreground/20"
)}
>
<div className="mb-4 flex items-center justify-between">
<span className="text-sm font-medium text-muted-foreground">
Chunk {idx + 1} of {document.chunks.length}
</span>
{chunk.id === chunkId && (
<span className="text-sm font-medium text-primary bg-primary/10 px-3 py-1 rounded-full">
Referenced Chunk
</span>
)}
</div>
<div className="text-sm text-foreground whitespace-pre-wrap leading-relaxed">
<MarkdownViewer content={chunk.content} className="max-w-fit" />
</div>
</div>
))}
</div>
</div>
</ScrollArea>
)}
</SheetContent>
</Sheet>
);
};
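For reference, a sketch of the citation `node` shape this component expects, inferred from the properties it reads above; the interface name and the optionality markers are illustrative, not part of the PR.

// Inferred shape of the `node` prop consumed by CitationDisplay (assumption for illustration).
interface CitationNode {
	id: number | string; // chunk id, coerced with Number(node?.id)
	url?: string; // when present, rendered as an "Open in Browser" button
	text?: string; // raw snippet shown for direct-render sources
	metadata?: {
		source_type?: string; // "TAVILY_API" and "LINKUP_API" render directly from the node
		title?: string;
		group_name?: string;
	};
}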


@ -1,7 +1,7 @@
import { Check, Copy } from "lucide-react";
import Image from "next/image";
import { useTheme } from "next-themes";
import React, { useEffect, useMemo, useRef, useState } from "react";
import { useEffect, useMemo, useRef, useState } from "react";
import ReactMarkdown from "react-markdown";
import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
import { oneDark, oneLight } from "react-syntax-highlighter/dist/cjs/styles/prism";
@ -10,105 +10,51 @@ import rehypeSanitize from "rehype-sanitize";
import remarkGfm from "remark-gfm";
import { Button } from "@/components/ui/button";
import { cn } from "@/lib/utils";
import { Citation } from "./chat/Citation";
import type { Source } from "./chat/types";
import CopyButton from "./copy-button";
interface MarkdownViewerProps {
content: string;
className?: string;
getCitationSource?: (id: number) => Source | null;
type?: "user" | "ai";
}
export function MarkdownViewer({
content,
className,
getCitationSource,
type = "user",
}: MarkdownViewerProps) {
export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
const ref = useRef<HTMLDivElement>(null);
// Memoize the markdown components to prevent unnecessary re-renders
const components = useMemo(() => {
return {
// Define custom components for markdown elements
p: ({ node, children, ...props }: any) => {
// If there's no getCitationSource function, just render normally
if (!getCitationSource) {
return (
<p className="my-2" {...props}>
{children}
</p>
);
}
// Process citations within paragraph content
return (
<p className="my-2" {...props}>
{processCitationsInReactChildren(children, getCitationSource)}
</p>
);
},
a: ({ node, children, ...props }: any) => {
// Process citations within link content if needed
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<a className="text-primary hover:underline" {...props}>
{processedChildren}
</a>
);
},
li: ({ node, children, ...props }: any) => {
// Process citations within list item content
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return <li {...props}>{processedChildren}</li>;
},
p: ({ node, children, ...props }: any) => (
<p className="my-2" {...props}>
{children}
</p>
),
a: ({ node, children, ...props }: any) => (
<a className="text-primary hover:underline" {...props}>
{children}
</a>
),
li: ({ node, children, ...props }: any) => <li {...props}>{children}</li>,
ul: ({ node, ...props }: any) => <ul className="list-disc pl-5 my-2" {...props} />,
ol: ({ node, ...props }: any) => <ol className="list-decimal pl-5 my-2" {...props} />,
h1: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
{processedChildren}
</h1>
);
},
h2: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h2 className="text-xl font-bold mt-5 mb-2" {...props}>
{processedChildren}
</h2>
);
},
h3: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h3 className="text-lg font-bold mt-4 mb-2" {...props}>
{processedChildren}
</h3>
);
},
h4: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h4 className="text-base font-bold mt-3 mb-1" {...props}>
{processedChildren}
</h4>
);
},
h1: ({ node, children, ...props }: any) => (
<h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
{children}
</h1>
),
h2: ({ node, children, ...props }: any) => (
<h2 className="text-xl font-bold mt-5 mb-2" {...props}>
{children}
</h2>
),
h3: ({ node, children, ...props }: any) => (
<h3 className="text-lg font-bold mt-4 mb-2" {...props}>
{children}
</h3>
),
h4: ({ node, children, ...props }: any) => (
<h4 className="text-base font-bold mt-3 mb-1" {...props}>
{children}
</h4>
),
blockquote: ({ node, ...props }: any) => (
<blockquote className="border-l-4 border-muted pl-4 italic my-2" {...props} />
),
@ -154,7 +100,7 @@ export function MarkdownViewer({
);
},
};
}, [getCitationSource]);
}, []);
return (
<div className={cn("prose prose-sm dark:prose-invert max-w-none", className)} ref={ref}>
@ -165,7 +111,6 @@ export function MarkdownViewer({
>
{content}
</ReactMarkdown>
{type === "ai" && <CopyButton ref={ref} />}
</div>
);
}
@ -267,77 +212,3 @@ const CodeBlock = ({ children, language }: { children: string; language: string
</div>
);
};
// Helper function to process citations within React children
const processCitationsInReactChildren = (
children: React.ReactNode,
getCitationSource: (id: number) => Source | null
): React.ReactNode => {
// If children is not an array or string, just return it
if (!children || (typeof children !== "string" && !Array.isArray(children))) {
return children;
}
// Handle string content directly - this is where we process citation references
if (typeof children === "string") {
return processCitationsInText(children, getCitationSource);
}
// Handle arrays of children recursively
if (Array.isArray(children)) {
return React.Children.map(children, (child) => {
if (typeof child === "string") {
return processCitationsInText(child, getCitationSource);
}
return child;
});
}
return children;
};
// Process citation references in text content
const processCitationsInText = (
text: string,
getCitationSource: (id: number) => Source | null
): React.ReactNode[] => {
// Use improved regex to catch citation numbers more reliably
// This will match patterns like [1], [42], etc. including when they appear at the end of a line or sentence
const citationRegex = /\[(\d+)\]/g;
const parts: React.ReactNode[] = [];
let lastIndex = 0;
let match: RegExpExecArray | null = citationRegex.exec(text);
let position = 0;
while (match !== null) {
// Add text before the citation
if (match.index > lastIndex) {
parts.push(text.substring(lastIndex, match.index));
}
// Add the citation component
const citationId = parseInt(match[1], 10);
const source = getCitationSource(citationId);
parts.push(
<Citation
key={`citation-${citationId}-${position}`}
citationId={citationId}
citationText={match[0]}
position={position}
source={source}
/>
);
lastIndex = match.index + match[0].length;
position++;
match = citationRegex.exec(text);
}
// Add any remaining text after the last citation
if (lastIndex < text.length) {
parts.push(text.substring(lastIndex));
}
return parts;
};


@ -0,0 +1,56 @@
"use client";
import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area";
import type * as React from "react";
import { cn } from "@/lib/utils";
function ScrollArea({
className,
children,
...props
}: React.ComponentProps<typeof ScrollAreaPrimitive.Root>) {
return (
<ScrollAreaPrimitive.Root
data-slot="scroll-area"
className={cn("relative", className)}
{...props}
>
<ScrollAreaPrimitive.Viewport
data-slot="scroll-area-viewport"
className="focus-visible:ring-ring/50 size-full rounded-[inherit] transition-[color,box-shadow] outline-none focus-visible:ring-[3px] focus-visible:outline-1"
>
{children}
</ScrollAreaPrimitive.Viewport>
<ScrollBar />
<ScrollAreaPrimitive.Corner />
</ScrollAreaPrimitive.Root>
);
}
function ScrollBar({
className,
orientation = "vertical",
...props
}: React.ComponentProps<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>) {
return (
<ScrollAreaPrimitive.ScrollAreaScrollbar
data-slot="scroll-area-scrollbar"
orientation={orientation}
className={cn(
"flex touch-none p-px transition-colors select-none",
orientation === "vertical" && "h-full w-2.5 border-l border-l-transparent",
orientation === "horizontal" && "h-2.5 flex-col border-t border-t-transparent",
className
)}
{...props}
>
<ScrollAreaPrimitive.ScrollAreaThumb
data-slot="scroll-area-thumb"
className="bg-border relative flex-1 rounded-full"
/>
</ScrollAreaPrimitive.ScrollAreaScrollbar>
);
}
export { ScrollArea, ScrollBar };


@ -1,2 +1,3 @@
export * from "./use-document-by-chunk";
export * from "./use-logs";
export * from "./useSearchSourceConnectors";


@ -0,0 +1,106 @@
"use client";
import { useCallback, useState } from "react";
import { toast } from "sonner";
export interface Chunk {
id: number;
content: string;
document_id: number;
created_at: string;
}
export interface DocumentWithChunks {
id: number;
title: string;
document_type: DocumentType;
document_metadata: any;
content: string;
created_at: string;
search_space_id: number;
chunks: Chunk[];
}
export type DocumentType =
| "EXTENSION"
| "CRAWLED_URL"
| "SLACK_CONNECTOR"
| "NOTION_CONNECTOR"
| "FILE"
| "YOUTUBE_VIDEO"
| "GITHUB_CONNECTOR"
| "LINEAR_CONNECTOR"
| "DISCORD_CONNECTOR"
| "JIRA_CONNECTOR"
| "CONFLUENCE_CONNECTOR"
| "CLICKUP_CONNECTOR"
| "GOOGLE_CALENDAR_CONNECTOR"
| "GOOGLE_GMAIL_CONNECTOR";
export function useDocumentByChunk() {
const [document, setDocument] = useState<DocumentWithChunks | null>(null);
const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const fetchDocumentByChunk = useCallback(async (chunkId: number) => {
try {
setLoading(true);
setError(null);
setDocument(null);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
"Content-Type": "application/json",
},
method: "GET",
}
);
if (!response.ok) {
const errorText = await response.text();
let errorMessage = "Failed to fetch document";
try {
const errorData = JSON.parse(errorText);
errorMessage = errorData.detail || errorMessage;
} catch {
// If parsing fails, use default message
}
if (response.status === 404) {
errorMessage = "Chunk not found or you don't have access to it";
}
toast.error(errorMessage);
throw new Error(errorMessage);
}
const data: DocumentWithChunks = await response.json();
setDocument(data);
setError(null);
return data;
} catch (err: any) {
const errorMessage = err.message || "Failed to fetch document";
setError(errorMessage);
console.error("Error fetching document by chunk:", err);
throw err;
} finally {
setLoading(false);
}
}, []);
const clearDocument = useCallback(() => {
setDocument(null);
setError(null);
}, []);
return {
document,
loading,
error,
fetchDocumentByChunk,
clearDocument,
};
}
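A minimal usage sketch of the hook; the component name is hypothetical and error handling is trimmed, since CitationDisplay above is the real consumer in this PR.

// Hypothetical consumer of useDocumentByChunk (illustration only).
function ChunkPreview({ chunkId }: { chunkId: number }) {
	const { document, loading, error, fetchDocumentByChunk } = useDocumentByChunk();
	return (
		<button onClick={() => void fetchDocumentByChunk(chunkId)}>
			{loading ? "Loading…" : (error ?? document?.title ?? "Show source")}
		</button>
	);
}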


@ -12,11 +12,7 @@ const nextConfig: NextConfig = {
remotePatterns: [
{
protocol: "https",
hostname: "images.unsplash.com",
},
{
protocol: "https",
hostname: "static.vecteezy.com",
hostname: "**",
},
],
},