diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index a080fd3..b75c566 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -5,10 +5,24 @@ from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Up from litellm import atranscription from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select +from sqlalchemy.orm import selectinload from app.config import config as app_config -from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session -from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate +from app.db import ( + Chunk, + Document, + DocumentType, + Log, + SearchSpace, + User, + get_async_session, +) +from app.schemas import ( + DocumentRead, + DocumentsCreate, + DocumentUpdate, + DocumentWithChunksRead, +) from app.services.task_logging_service import TaskLoggingService from app.tasks.document_processors import ( add_crawled_url_document, @@ -140,6 +154,423 @@ async def create_documents_file_upload( ) from e +@router.get("/documents/", response_model=list[DocumentRead]) +async def read_documents( + skip: int = 0, + limit: int = 3000, + search_space_id: int | None = None, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + query = ( + select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id) + ) + + # Filter by search_space_id if provided + if search_space_id is not None: + query = query.filter(Document.search_space_id == search_space_id) + + result = await session.execute(query.offset(skip).limit(limit)) + db_documents = result.scalars().all() + + # Convert database objects to API-friendly format + api_documents = [] + for doc in db_documents: + api_documents.append( + DocumentRead( + id=doc.id, + title=doc.title, + document_type=doc.document_type, + document_metadata=doc.document_metadata, + content=doc.content, + created_at=doc.created_at, + search_space_id=doc.search_space_id, + ) + ) + + return api_documents + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to fetch documents: {e!s}" + ) from e + + +@router.get("/documents/{document_id}", response_model=DocumentRead) +async def read_document( + document_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + result = await session.execute( + select(Document) + .join(SearchSpace) + .filter(Document.id == document_id, SearchSpace.user_id == user.id) + ) + document = result.scalars().first() + + if not document: + raise HTTPException( + status_code=404, detail=f"Document with id {document_id} not found" + ) + + # Convert database object to API-friendly format + return DocumentRead( + id=document.id, + title=document.title, + document_type=document.document_type, + document_metadata=document.document_metadata, + content=document.content, + created_at=document.created_at, + search_space_id=document.search_space_id, + ) + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to fetch document: {e!s}" + ) from e + + +@router.put("/documents/{document_id}", response_model=DocumentRead) +async def update_document( + document_id: int, + document_update: DocumentUpdate, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + # Query the document directly instead of using 
read_document function + result = await session.execute( + select(Document) + .join(SearchSpace) + .filter(Document.id == document_id, SearchSpace.user_id == user.id) + ) + db_document = result.scalars().first() + + if not db_document: + raise HTTPException( + status_code=404, detail=f"Document with id {document_id} not found" + ) + + update_data = document_update.model_dump(exclude_unset=True) + for key, value in update_data.items(): + setattr(db_document, key, value) + await session.commit() + await session.refresh(db_document) + + # Convert to DocumentRead for response + return DocumentRead( + id=db_document.id, + title=db_document.title, + document_type=db_document.document_type, + document_metadata=db_document.document_metadata, + content=db_document.content, + created_at=db_document.created_at, + search_space_id=db_document.search_space_id, + ) + except HTTPException: + raise + except Exception as e: + await session.rollback() + raise HTTPException( + status_code=500, detail=f"Failed to update document: {e!s}" + ) from e + + +@router.delete("/documents/{document_id}", response_model=dict) +async def delete_document( + document_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + # Query the document directly instead of using read_document function + result = await session.execute( + select(Document) + .join(SearchSpace) + .filter(Document.id == document_id, SearchSpace.user_id == user.id) + ) + document = result.scalars().first() + + if not document: + raise HTTPException( + status_code=404, detail=f"Document with id {document_id} not found" + ) + + await session.delete(document) + await session.commit() + return {"message": "Document deleted successfully"} + except HTTPException: + raise + except Exception as e: + await session.rollback() + raise HTTPException( + status_code=500, detail=f"Failed to delete document: {e!s}" + ) from e + + +@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead) +async def get_document_by_chunk_id( + chunk_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Retrieves a document based on a chunk ID, including all its chunks ordered by creation time. + The document's embedding and chunk embeddings are excluded from the response. 
+ """ + try: + # First, get the chunk and verify it exists + chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id)) + chunk = chunk_result.scalars().first() + + if not chunk: + raise HTTPException( + status_code=404, detail=f"Chunk with id {chunk_id} not found" + ) + + # Get the associated document and verify ownership + document_result = await session.execute( + select(Document) + .options(selectinload(Document.chunks)) + .join(SearchSpace) + .filter(Document.id == chunk.document_id, SearchSpace.user_id == user.id) + ) + document = document_result.scalars().first() + + if not document: + raise HTTPException( + status_code=404, + detail="Document not found or you don't have access to it", + ) + + # Sort chunks by creation time + sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at) + + # Return the document with its chunks + return DocumentWithChunksRead( + id=document.id, + title=document.title, + document_type=document.document_type, + document_metadata=document.document_metadata, + content=document.content, + created_at=document.created_at, + search_space_id=document.search_space_id, + chunks=sorted_chunks, + ) + except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to retrieve document: {e!s}" + ) from e + + +async def process_extension_document_with_new_session( + individual_document, search_space_id: int, user_id: str +): + """Create a new session and process extension document.""" + from app.db import async_session_maker + from app.services.task_logging_service import TaskLoggingService + + async with async_session_maker() as session: + # Initialize task logging service + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="process_extension_document", + source="document_processor", + message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}", + metadata={ + "document_type": "EXTENSION", + "url": individual_document.metadata.VisitedWebPageURL, + "title": individual_document.metadata.VisitedWebPageTitle, + "user_id": user_id, + }, + ) + + try: + result = await add_extension_received_document( + session, individual_document, search_space_id, user_id + ) + + if result: + await task_logger.log_task_success( + log_entry, + f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}", + {"document_id": result.id, "content_hash": result.content_hash}, + ) + else: + await task_logger.log_task_success( + log_entry, + f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}", + {"duplicate_detected": True}, + ) + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}", + str(e), + {"error_type": type(e).__name__}, + ) + import logging + + logging.error(f"Error processing extension document: {e!s}") + + +async def process_crawled_url_with_new_session( + url: str, search_space_id: int, user_id: str +): + """Create a new session and process crawled URL.""" + from app.db import async_session_maker + from app.services.task_logging_service import TaskLoggingService + + async with async_session_maker() as session: + # Initialize task logging service + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + 
task_name="process_crawled_url", + source="document_processor", + message=f"Starting URL crawling and processing for: {url}", + metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id}, + ) + + try: + result = await add_crawled_url_document( + session, url, search_space_id, user_id + ) + + if result: + await task_logger.log_task_success( + log_entry, + f"Successfully crawled and processed URL: {url}", + { + "document_id": result.id, + "title": result.title, + "content_hash": result.content_hash, + }, + ) + else: + await task_logger.log_task_success( + log_entry, + f"URL document already exists (duplicate): {url}", + {"duplicate_detected": True}, + ) + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to crawl URL: {url}", + str(e), + {"error_type": type(e).__name__}, + ) + import logging + + logging.error(f"Error processing crawled URL: {e!s}") + + +async def process_file_in_background_with_new_session( + file_path: str, filename: str, search_space_id: int, user_id: str +): + """Create a new session and process file.""" + from app.db import async_session_maker + from app.services.task_logging_service import TaskLoggingService + + async with async_session_maker() as session: + # Initialize task logging service + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="process_file_upload", + source="document_processor", + message=f"Starting file processing for: {filename}", + metadata={ + "document_type": "FILE", + "filename": filename, + "file_path": file_path, + "user_id": user_id, + }, + ) + + try: + await process_file_in_background( + file_path, + filename, + search_space_id, + user_id, + session, + task_logger, + log_entry, + ) + + # Note: success/failure logging is handled within process_file_in_background + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to process file: {filename}", + str(e), + {"error_type": type(e).__name__}, + ) + import logging + + logging.error(f"Error processing file: {e!s}") + + +async def process_youtube_video_with_new_session( + url: str, search_space_id: int, user_id: str +): + """Create a new session and process YouTube video.""" + from app.db import async_session_maker + from app.services.task_logging_service import TaskLoggingService + + async with async_session_maker() as session: + # Initialize task logging service + task_logger = TaskLoggingService(session, search_space_id) + + # Log task start + log_entry = await task_logger.log_task_start( + task_name="process_youtube_video", + source="document_processor", + message=f"Starting YouTube video processing for: {url}", + metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id}, + ) + + try: + result = await add_youtube_video_document( + session, url, search_space_id, user_id + ) + + if result: + await task_logger.log_task_success( + log_entry, + f"Successfully processed YouTube video: {result.title}", + { + "document_id": result.id, + "video_id": result.document_metadata.get("video_id"), + "content_hash": result.content_hash, + }, + ) + else: + await task_logger.log_task_success( + log_entry, + f"YouTube video document already exists (duplicate): {url}", + {"duplicate_detected": True}, + ) + except Exception as e: + await task_logger.log_task_failure( + log_entry, + f"Failed to process YouTube video: {url}", + str(e), + {"error_type": type(e).__name__}, + ) + import logging + + logging.error(f"Error processing 
YouTube video: {e!s}") + + async def process_file_in_background( file_path: str, filename: str, @@ -508,363 +939,3 @@ async def process_file_in_background( logging.error(f"Error processing file in background: {e!s}") raise # Re-raise so the wrapper can also handle it - - -@router.get("/documents/", response_model=list[DocumentRead]) -async def read_documents( - skip: int = 0, - limit: int = 3000, - search_space_id: int | None = None, - session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user), -): - try: - query = ( - select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id) - ) - - # Filter by search_space_id if provided - if search_space_id is not None: - query = query.filter(Document.search_space_id == search_space_id) - - result = await session.execute(query.offset(skip).limit(limit)) - db_documents = result.scalars().all() - - # Convert database objects to API-friendly format - api_documents = [] - for doc in db_documents: - api_documents.append( - DocumentRead( - id=doc.id, - title=doc.title, - document_type=doc.document_type, - document_metadata=doc.document_metadata, - content=doc.content, - created_at=doc.created_at, - search_space_id=doc.search_space_id, - ) - ) - - return api_documents - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to fetch documents: {e!s}" - ) from e - - -@router.get("/documents/{document_id}", response_model=DocumentRead) -async def read_document( - document_id: int, - session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user), -): - try: - result = await session.execute( - select(Document) - .join(SearchSpace) - .filter(Document.id == document_id, SearchSpace.user_id == user.id) - ) - document = result.scalars().first() - - if not document: - raise HTTPException( - status_code=404, detail=f"Document with id {document_id} not found" - ) - - # Convert database object to API-friendly format - return DocumentRead( - id=document.id, - title=document.title, - document_type=document.document_type, - document_metadata=document.document_metadata, - content=document.content, - created_at=document.created_at, - search_space_id=document.search_space_id, - ) - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Failed to fetch document: {e!s}" - ) from e - - -@router.put("/documents/{document_id}", response_model=DocumentRead) -async def update_document( - document_id: int, - document_update: DocumentUpdate, - session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user), -): - try: - # Query the document directly instead of using read_document function - result = await session.execute( - select(Document) - .join(SearchSpace) - .filter(Document.id == document_id, SearchSpace.user_id == user.id) - ) - db_document = result.scalars().first() - - if not db_document: - raise HTTPException( - status_code=404, detail=f"Document with id {document_id} not found" - ) - - update_data = document_update.model_dump(exclude_unset=True) - for key, value in update_data.items(): - setattr(db_document, key, value) - await session.commit() - await session.refresh(db_document) - - # Convert to DocumentRead for response - return DocumentRead( - id=db_document.id, - title=db_document.title, - document_type=db_document.document_type, - document_metadata=db_document.document_metadata, - content=db_document.content, - created_at=db_document.created_at, - search_space_id=db_document.search_space_id, - ) - except 
HTTPException: - raise - except Exception as e: - await session.rollback() - raise HTTPException( - status_code=500, detail=f"Failed to update document: {e!s}" - ) from e - - -@router.delete("/documents/{document_id}", response_model=dict) -async def delete_document( - document_id: int, - session: AsyncSession = Depends(get_async_session), - user: User = Depends(current_active_user), -): - try: - # Query the document directly instead of using read_document function - result = await session.execute( - select(Document) - .join(SearchSpace) - .filter(Document.id == document_id, SearchSpace.user_id == user.id) - ) - document = result.scalars().first() - - if not document: - raise HTTPException( - status_code=404, detail=f"Document with id {document_id} not found" - ) - - await session.delete(document) - await session.commit() - return {"message": "Document deleted successfully"} - except HTTPException: - raise - except Exception as e: - await session.rollback() - raise HTTPException( - status_code=500, detail=f"Failed to delete document: {e!s}" - ) from e - - -async def process_extension_document_with_new_session( - individual_document, search_space_id: int, user_id: str -): - """Create a new session and process extension document.""" - from app.db import async_session_maker - from app.services.task_logging_service import TaskLoggingService - - async with async_session_maker() as session: - # Initialize task logging service - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="process_extension_document", - source="document_processor", - message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}", - metadata={ - "document_type": "EXTENSION", - "url": individual_document.metadata.VisitedWebPageURL, - "title": individual_document.metadata.VisitedWebPageTitle, - "user_id": user_id, - }, - ) - - try: - result = await add_extension_received_document( - session, individual_document, search_space_id, user_id - ) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}", - {"document_id": result.id, "content_hash": result.content_hash}, - ) - else: - await task_logger.log_task_success( - log_entry, - f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}", - {"duplicate_detected": True}, - ) - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}", - str(e), - {"error_type": type(e).__name__}, - ) - import logging - - logging.error(f"Error processing extension document: {e!s}") - - -async def process_crawled_url_with_new_session( - url: str, search_space_id: int, user_id: str -): - """Create a new session and process crawled URL.""" - from app.db import async_session_maker - from app.services.task_logging_service import TaskLoggingService - - async with async_session_maker() as session: - # Initialize task logging service - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="process_crawled_url", - source="document_processor", - message=f"Starting URL crawling and processing for: {url}", - metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id}, - ) - - try: - result = await add_crawled_url_document( 
- session, url, search_space_id, user_id - ) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully crawled and processed URL: {url}", - { - "document_id": result.id, - "title": result.title, - "content_hash": result.content_hash, - }, - ) - else: - await task_logger.log_task_success( - log_entry, - f"URL document already exists (duplicate): {url}", - {"duplicate_detected": True}, - ) - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to crawl URL: {url}", - str(e), - {"error_type": type(e).__name__}, - ) - import logging - - logging.error(f"Error processing crawled URL: {e!s}") - - -async def process_file_in_background_with_new_session( - file_path: str, filename: str, search_space_id: int, user_id: str -): - """Create a new session and process file.""" - from app.db import async_session_maker - from app.services.task_logging_service import TaskLoggingService - - async with async_session_maker() as session: - # Initialize task logging service - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="process_file_upload", - source="document_processor", - message=f"Starting file processing for: {filename}", - metadata={ - "document_type": "FILE", - "filename": filename, - "file_path": file_path, - "user_id": user_id, - }, - ) - - try: - await process_file_in_background( - file_path, - filename, - search_space_id, - user_id, - session, - task_logger, - log_entry, - ) - - # Note: success/failure logging is handled within process_file_in_background - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to process file: {filename}", - str(e), - {"error_type": type(e).__name__}, - ) - import logging - - logging.error(f"Error processing file: {e!s}") - - -async def process_youtube_video_with_new_session( - url: str, search_space_id: int, user_id: str -): - """Create a new session and process YouTube video.""" - from app.db import async_session_maker - from app.services.task_logging_service import TaskLoggingService - - async with async_session_maker() as session: - # Initialize task logging service - task_logger = TaskLoggingService(session, search_space_id) - - # Log task start - log_entry = await task_logger.log_task_start( - task_name="process_youtube_video", - source="document_processor", - message=f"Starting YouTube video processing for: {url}", - metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id}, - ) - - try: - result = await add_youtube_video_document( - session, url, search_space_id, user_id - ) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully processed YouTube video: {result.title}", - { - "document_id": result.id, - "video_id": result.document_metadata.get("video_id"), - "content_hash": result.content_hash, - }, - ) - else: - await task_logger.log_task_success( - log_entry, - f"YouTube video document already exists (duplicate): {url}", - {"duplicate_detected": True}, - ) - except Exception as e: - await task_logger.log_task_failure( - log_entry, - f"Failed to process YouTube video: {url}", - str(e), - {"error_type": type(e).__name__}, - ) - import logging - - logging.error(f"Error processing YouTube video: {e!s}") diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index c038d9c..ca9287a 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -13,6 +13,7 
@@ from .documents import ( DocumentRead, DocumentsCreate, DocumentUpdate, + DocumentWithChunksRead, ExtensionDocumentContent, ExtensionDocumentMetadata, ) @@ -53,6 +54,7 @@ __all__ = [ "DocumentBase", "DocumentRead", "DocumentUpdate", + "DocumentWithChunksRead", "DocumentsCreate", "ExtensionDocumentContent", "ExtensionDocumentMetadata", diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index b98ccfd..bdaf568 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -4,6 +4,8 @@ from pydantic import BaseModel, ConfigDict from app.db import DocumentType +from .chunks import ChunkRead + class ExtensionDocumentMetadata(BaseModel): BrowsingSessionId: str @@ -45,3 +47,9 @@ class DocumentRead(BaseModel): search_space_id: int model_config = ConfigDict(from_attributes=True) + + +class DocumentWithChunksRead(DocumentRead): + chunks: list[ChunkRead] = [] + + model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_web/components/chat/ChatCitation.tsx b/surfsense_web/components/chat/ChatCitation.tsx index 68f260b..34b7534 100644 --- a/surfsense_web/components/chat/ChatCitation.tsx +++ b/surfsense_web/components/chat/ChatCitation.tsx @@ -1,58 +1,202 @@ "use client"; -import { ExternalLink } from "lucide-react"; +import { ExternalLink, FileText, Loader2 } from "lucide-react"; import type React from "react"; +import { useEffect, useRef, useState } from "react"; +import { MarkdownViewer } from "@/components/markdown-viewer"; import { Button } from "@/components/ui/button"; -import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover"; +import { ScrollArea } from "@/components/ui/scroll-area"; +import { + Sheet, + SheetContent, + SheetDescription, + SheetHeader, + SheetTitle, + SheetTrigger, +} from "@/components/ui/sheet"; +import { useDocumentByChunk } from "@/hooks/use-document-by-chunk"; +import { cn } from "@/lib/utils"; export const CitationDisplay: React.FC<{ index: number; node: any }> = ({ index, node }) => { - const truncateText = (text: string, maxLength: number = 200) => { - if (text.length <= maxLength) return text; - return `${text.substring(0, maxLength)}...`; + const chunkId = Number(node?.id); + const sourceType = node?.metadata?.source_type; + const [isOpen, setIsOpen] = useState(false); + const { document, loading, error, fetchDocumentByChunk, clearDocument } = useDocumentByChunk(); + const chunksContainerRef = useRef(null); + const highlightedChunkRef = useRef(null); + + // Check if this is a source type that should render directly from node + const isDirectRenderSource = sourceType === "TAVILY_API" || sourceType === "LINKUP_API"; + + const handleOpenChange = async (open: boolean) => { + setIsOpen(open); + if (open && chunkId && !isDirectRenderSource) { + await fetchDocumentByChunk(chunkId); + } else if (!open && !isDirectRenderSource) { + clearDocument(); + } }; + useEffect(() => { + // Scroll to highlighted chunk when document loads + if (document && highlightedChunkRef.current && chunksContainerRef.current) { + setTimeout(() => { + highlightedChunkRef.current?.scrollIntoView({ + behavior: "smooth", + block: "start", + }); + }, 100); + } + }, [document]); + const handleUrlClick = (e: React.MouseEvent, url: string) => { e.preventDefault(); e.stopPropagation(); window.open(url, "_blank", "noopener,noreferrer"); }; + const formatDocumentType = (type: string) => { + return type + .split("_") + .map((word) => word.charAt(0) + word.slice(1).toLowerCase()) 
+			.join(" ");
+	};
+
+	return (
	[Editor's note: the JSX for this block was garbled in extraction (element tags stripped); only its recoverable structure is summarized here. The old Popover markup (numbered trigger button, top-right external-link button, group_name heading, source title, truncated text body) is removed. In its place the component returns a Sheet whose trigger is the same numbered citation button. The SheetHeader shows the fetched document title (falling back to node?.metadata?.title, node?.metadata?.group_name, or "Source") and the document type formatted via formatDocumentType. The body then renders: for non-direct sources, a Loader2 spinner while loading and the error text on failure; for direct-render sources (TAVILY_API / LINKUP_API, per isDirectRenderSource), a ScrollArea with an external-link button when node?.url is set and a "Source Information" panel showing node?.metadata?.title ("Untitled" fallback) and node?.text ("No content available" fallback); and for API-fetched documents, a ScrollArea with a "Document Information" panel listing each document_metadata key/value pair (keys de-underscored, values stringified), an external-link button when node?.url is set, and a "Document Content" list that maps document.chunks, labels each entry "Chunk {idx + 1} of {document.chunks.length}", marks the chunk whose id equals chunkId with a "Referenced Chunk" badge and the highlightedChunkRef, and renders chunk.content through MarkdownViewer.]
+ ); }; diff --git a/surfsense_web/components/markdown-viewer.tsx b/surfsense_web/components/markdown-viewer.tsx index 1fb0605..7b4ce57 100644 --- a/surfsense_web/components/markdown-viewer.tsx +++ b/surfsense_web/components/markdown-viewer.tsx @@ -1,7 +1,7 @@ import { Check, Copy } from "lucide-react"; import Image from "next/image"; import { useTheme } from "next-themes"; -import React, { useEffect, useMemo, useRef, useState } from "react"; +import { useEffect, useMemo, useRef, useState } from "react"; import ReactMarkdown from "react-markdown"; import { Prism as SyntaxHighlighter } from "react-syntax-highlighter"; import { oneDark, oneLight } from "react-syntax-highlighter/dist/cjs/styles/prism"; @@ -10,105 +10,51 @@ import rehypeSanitize from "rehype-sanitize"; import remarkGfm from "remark-gfm"; import { Button } from "@/components/ui/button"; import { cn } from "@/lib/utils"; -import { Citation } from "./chat/Citation"; -import type { Source } from "./chat/types"; -import CopyButton from "./copy-button"; interface MarkdownViewerProps { content: string; className?: string; - getCitationSource?: (id: number) => Source | null; - type?: "user" | "ai"; } -export function MarkdownViewer({ - content, - className, - getCitationSource, - type = "user", -}: MarkdownViewerProps) { +export function MarkdownViewer({ content, className }: MarkdownViewerProps) { const ref = useRef(null); // Memoize the markdown components to prevent unnecessary re-renders const components = useMemo(() => { return { // Define custom components for markdown elements - p: ({ node, children, ...props }: any) => { - // If there's no getCitationSource function, just render normally - if (!getCitationSource) { - return ( -

	[Editor's note: the JSX for the custom renderers was garbled in extraction (element tags stripped). Recoverable change: the old p, a, li, h1, h2, h3, and h4 renderers passed their children through processCitationsInReactChildren whenever getCitationSource was provided; the new renderers drop that step and simply render {children} with the same elements and classes, while ul and ol are unchanged. The unchanged blockquote renderer follows.]
			blockquote: ({ node, ...props }: any) => (
        ), @@ -154,7 +100,7 @@ export function MarkdownViewer({ ); }, }; - }, [getCitationSource]); + }, []); return (
@@ -165,7 +111,6 @@ export function MarkdownViewer({
	[Editor's note: markup garbled in extraction. This hunk closes the ReactMarkdown element around {content} and removes the trailing {type === "ai" && <CopyButton ... />} line (its props were lost in extraction), matching the dropped CopyButton import above.]
        ); } @@ -267,77 +212,3 @@ const CodeBlock = ({ children, language }: { children: string; language: string ); }; - -// Helper function to process citations within React children -const processCitationsInReactChildren = ( - children: React.ReactNode, - getCitationSource: (id: number) => Source | null -): React.ReactNode => { - // If children is not an array or string, just return it - if (!children || (typeof children !== "string" && !Array.isArray(children))) { - return children; - } - - // Handle string content directly - this is where we process citation references - if (typeof children === "string") { - return processCitationsInText(children, getCitationSource); - } - - // Handle arrays of children recursively - if (Array.isArray(children)) { - return React.Children.map(children, (child) => { - if (typeof child === "string") { - return processCitationsInText(child, getCitationSource); - } - return child; - }); - } - - return children; -}; - -// Process citation references in text content -const processCitationsInText = ( - text: string, - getCitationSource: (id: number) => Source | null -): React.ReactNode[] => { - // Use improved regex to catch citation numbers more reliably - // This will match patterns like [1], [42], etc. including when they appear at the end of a line or sentence - const citationRegex = /\[(\d+)\]/g; - const parts: React.ReactNode[] = []; - let lastIndex = 0; - let match: RegExpExecArray | null = citationRegex.exec(text); - let position = 0; - - while (match !== null) { - // Add text before the citation - if (match.index > lastIndex) { - parts.push(text.substring(lastIndex, match.index)); - } - - // Add the citation component - const citationId = parseInt(match[1], 10); - const source = getCitationSource(citationId); - - parts.push( - - ); - - lastIndex = match.index + match[0].length; - position++; - match = citationRegex.exec(text); - } - - // Add any remaining text after the last citation - if (lastIndex < text.length) { - parts.push(text.substring(lastIndex)); - } - - return parts; -}; diff --git a/surfsense_web/components/ui/scroll-area.tsx b/surfsense_web/components/ui/scroll-area.tsx new file mode 100644 index 0000000..a3e05bb --- /dev/null +++ b/surfsense_web/components/ui/scroll-area.tsx @@ -0,0 +1,56 @@ +"use client"; + +import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area"; +import type * as React from "react"; + +import { cn } from "@/lib/utils"; + +function ScrollArea({ + className, + children, + ...props +}: React.ComponentProps) { + return ( + + + {children} + + + + + ); +} + +function ScrollBar({ + className, + orientation = "vertical", + ...props +}: React.ComponentProps) { + return ( + + + + ); +} + +export { ScrollArea, ScrollBar }; diff --git a/surfsense_web/hooks/index.ts b/surfsense_web/hooks/index.ts index 666bac3..908d2ad 100644 --- a/surfsense_web/hooks/index.ts +++ b/surfsense_web/hooks/index.ts @@ -1,2 +1,3 @@ +export * from "./use-document-by-chunk"; export * from "./use-logs"; export * from "./useSearchSourceConnectors"; diff --git a/surfsense_web/hooks/use-document-by-chunk.ts b/surfsense_web/hooks/use-document-by-chunk.ts new file mode 100644 index 0000000..df2b1a7 --- /dev/null +++ b/surfsense_web/hooks/use-document-by-chunk.ts @@ -0,0 +1,106 @@ +"use client"; +import { useCallback, useState } from "react"; +import { toast } from "sonner"; + +export interface Chunk { + id: number; + content: string; + document_id: number; + created_at: string; +} + +export interface DocumentWithChunks { + id: number; + title: 
string; + document_type: DocumentType; + document_metadata: any; + content: string; + created_at: string; + search_space_id: number; + chunks: Chunk[]; +} + +export type DocumentType = + | "EXTENSION" + | "CRAWLED_URL" + | "SLACK_CONNECTOR" + | "NOTION_CONNECTOR" + | "FILE" + | "YOUTUBE_VIDEO" + | "GITHUB_CONNECTOR" + | "LINEAR_CONNECTOR" + | "DISCORD_CONNECTOR" + | "JIRA_CONNECTOR" + | "CONFLUENCE_CONNECTOR" + | "CLICKUP_CONNECTOR" + | "GOOGLE_CALENDAR_CONNECTOR" + | "GOOGLE_GMAIL_CONNECTOR"; + +export function useDocumentByChunk() { + const [document, setDocument] = useState(null); + const [loading, setLoading] = useState(false); + const [error, setError] = useState(null); + + const fetchDocumentByChunk = useCallback(async (chunkId: number) => { + try { + setLoading(true); + setError(null); + setDocument(null); + + const response = await fetch( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`, + { + headers: { + Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`, + "Content-Type": "application/json", + }, + method: "GET", + } + ); + + if (!response.ok) { + const errorText = await response.text(); + let errorMessage = "Failed to fetch document"; + + try { + const errorData = JSON.parse(errorText); + errorMessage = errorData.detail || errorMessage; + } catch { + // If parsing fails, use default message + } + + if (response.status === 404) { + errorMessage = "Chunk not found or you don't have access to it"; + } + + toast.error(errorMessage); + throw new Error(errorMessage); + } + + const data: DocumentWithChunks = await response.json(); + setDocument(data); + setError(null); + return data; + } catch (err: any) { + const errorMessage = err.message || "Failed to fetch document"; + setError(errorMessage); + console.error("Error fetching document by chunk:", err); + throw err; + } finally { + setLoading(false); + } + }, []); + + const clearDocument = useCallback(() => { + setDocument(null); + setError(null); + }, []); + + return { + document, + loading, + error, + fetchDocumentByChunk, + clearDocument, + }; +} diff --git a/surfsense_web/next.config.ts b/surfsense_web/next.config.ts index 3728a7b..460ef5d 100644 --- a/surfsense_web/next.config.ts +++ b/surfsense_web/next.config.ts @@ -12,11 +12,7 @@ const nextConfig: NextConfig = { remotePatterns: [ { protocol: "https", - hostname: "images.unsplash.com", - }, - { - protocol: "https", - hostname: "static.vecteezy.com", + hostname: "**", }, ], },
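
As a quick sanity check of the new `GET /api/v1/documents/by-chunk/{chunk_id}` endpoint, the sketch below calls it the same way the `useDocumentByChunk` hook does (bearer token in the `Authorization` header) and walks the returned `DocumentWithChunksRead` payload. It is illustrative only and not part of this diff: the base URL, token, example chunk id, and the use of `httpx` are assumptions.

```python
# Minimal sketch, not part of the PR: exercise GET /api/v1/documents/by-chunk/{chunk_id}.
# BASE_URL, TOKEN, and the example chunk id are placeholders/assumptions.
import httpx

BASE_URL = "http://localhost:8000"       # assumed backend address
TOKEN = "<surfsense_bearer_token>"       # assumed bearer token for current_active_user


def fetch_document_by_chunk(chunk_id: int) -> dict:
    """Return the DocumentWithChunksRead payload for the document owning chunk_id."""
    resp = httpx.get(
        f"{BASE_URL}/api/v1/documents/by-chunk/{chunk_id}",
        headers={"Authorization": f"Bearer {TOKEN}"},
        timeout=30.0,
    )
    resp.raise_for_status()  # 404 if the chunk or its document is not accessible
    return resp.json()


if __name__ == "__main__":
    doc = fetch_document_by_chunk(42)  # hypothetical chunk id
    print(doc["title"], doc["document_type"])
    # Chunks come back ordered by created_at, with embeddings excluded.
    for chunk in doc["chunks"]:
        print(f"- chunk {chunk['id']}: {chunk['content'][:80]}")
```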