Merge pull request #286 from MODSetter/dev

feat: added jump to source referencing of citations
Commit bc89959d2f by Rohan Verma, 2025-08-23 19:40:46 -07:00, committed by GitHub
9 changed files with 819 additions and 564 deletions


@ -5,10 +5,24 @@ from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Up
from litellm import atranscription
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.config import config as app_config
from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session
from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate
from app.db import (
Chunk,
Document,
DocumentType,
Log,
SearchSpace,
User,
get_async_session,
)
from app.schemas import (
DocumentRead,
DocumentsCreate,
DocumentUpdate,
DocumentWithChunksRead,
)
from app.services.task_logging_service import TaskLoggingService
from app.tasks.document_processors import (
add_crawled_url_document,
@ -140,6 +154,423 @@ async def create_documents_file_upload(
) from e
@router.get("/documents/", response_model=list[DocumentRead])
async def read_documents(
skip: int = 0,
limit: int = 3000,
search_space_id: int | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
query = (
select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
)
# Filter by search_space_id if provided
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
result = await session.execute(query.offset(skip).limit(limit))
db_documents = result.scalars().all()
# Convert database objects to API-friendly format
api_documents = []
for doc in db_documents:
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
content=doc.content,
created_at=doc.created_at,
search_space_id=doc.search_space_id,
)
)
return api_documents
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch documents: {e!s}"
) from e
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Convert database object to API-friendly format
return DocumentRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
content=document.content,
created_at=document.created_at,
search_space_id=document.search_space_id,
)
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch document: {e!s}"
) from e
@router.put("/documents/{document_id}", response_model=DocumentRead)
async def update_document(
document_id: int,
document_update: DocumentUpdate,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
db_document = result.scalars().first()
if not db_document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
update_data = document_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(db_document, key, value)
await session.commit()
await session.refresh(db_document)
# Convert to DocumentRead for response
return DocumentRead(
id=db_document.id,
title=db_document.title,
document_type=db_document.document_type,
document_metadata=db_document.document_metadata,
content=db_document.content,
created_at=db_document.created_at,
search_space_id=db_document.search_space_id,
)
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to update document: {e!s}"
) from e
@router.delete("/documents/{document_id}", response_model=dict)
async def delete_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
await session.delete(document)
await session.commit()
return {"message": "Document deleted successfully"}
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to delete document: {e!s}"
) from e
@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
async def get_document_by_chunk_id(
chunk_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
The document's embedding and chunk embeddings are excluded from the response.
"""
try:
# First, get the chunk and verify it exists
chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
chunk = chunk_result.scalars().first()
if not chunk:
raise HTTPException(
status_code=404, detail=f"Chunk with id {chunk_id} not found"
)
# Get the associated document and verify ownership
document_result = await session.execute(
select(Document)
.options(selectinload(Document.chunks))
.join(SearchSpace)
.filter(Document.id == chunk.document_id, SearchSpace.user_id == user.id)
)
document = document_result.scalars().first()
if not document:
raise HTTPException(
status_code=404,
detail="Document not found or you don't have access to it",
)
# Sort chunks by creation time
sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
# Return the document with its chunks
return DocumentWithChunksRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
content=document.content,
created_at=document.created_at,
search_space_id=document.search_space_id,
chunks=sorted_chunks,
)
except HTTPException:
raise
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to retrieve document: {e!s}"
) from e
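A minimal sketch of how a client might call this new by-chunk endpoint and locate the cited chunk in the response. The helper name and control flow here are illustrative only; the `/api/v1` prefix, the `NEXT_PUBLIC_FASTAPI_BACKEND_URL` env var, and the `surfsense_bearer_token` storage key match the frontend hook added later in this PR, which wires the same request into React state.

// Hypothetical client helper (illustration only); the real consumer is useDocumentByChunk below.
async function jumpToSource(chunkId: number) {
	const res = await fetch(
		`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
		{
			headers: { Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}` },
		}
	);
	if (!res.ok) throw new Error(`Failed to fetch document for chunk ${chunkId}`);
	// DocumentWithChunksRead: document fields plus its chunks ordered by creation time
	const doc = await res.json();
	const referenced = doc.chunks.find((c: { id: number }) => c.id === chunkId);
	return { doc, referenced };
}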
async def process_extension_document_with_new_session(
individual_document, search_space_id: int, user_id: str
):
"""Create a new session and process extension document."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_extension_document",
source="document_processor",
message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}",
metadata={
"document_type": "EXTENSION",
"url": individual_document.metadata.VisitedWebPageURL,
"title": individual_document.metadata.VisitedWebPageTitle,
"user_id": user_id,
},
)
try:
result = await add_extension_received_document(
session, individual_document, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}",
{"document_id": result.id, "content_hash": result.content_hash},
)
else:
await task_logger.log_task_success(
log_entry,
f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing extension document: {e!s}")
async def process_crawled_url_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process crawled URL."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_crawled_url",
source="document_processor",
message=f"Starting URL crawling and processing for: {url}",
metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
)
try:
result = await add_crawled_url_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": result.id,
"title": result.title,
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"URL document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing crawled URL: {e!s}")
async def process_file_in_background_with_new_session(
file_path: str, filename: str, search_space_id: int, user_id: str
):
"""Create a new session and process file."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_file_upload",
source="document_processor",
message=f"Starting file processing for: {filename}",
metadata={
"document_type": "FILE",
"filename": filename,
"file_path": file_path,
"user_id": user_id,
},
)
try:
await process_file_in_background(
file_path,
filename,
search_space_id,
user_id,
session,
task_logger,
log_entry,
)
# Note: success/failure logging is handled within process_file_in_background
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process file: {filename}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing file: {e!s}")
async def process_youtube_video_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process YouTube video."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_youtube_video",
source="document_processor",
message=f"Starting YouTube video processing for: {url}",
metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id},
)
try:
result = await add_youtube_video_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed YouTube video: {result.title}",
{
"document_id": result.id,
"video_id": result.document_metadata.get("video_id"),
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"YouTube video document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process YouTube video: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing YouTube video: {e!s}")
async def process_file_in_background(
file_path: str,
filename: str,
@ -508,363 +939,3 @@ async def process_file_in_background(
logging.error(f"Error processing file in background: {e!s}")
raise # Re-raise so the wrapper can also handle it
@router.get("/documents/", response_model=list[DocumentRead])
async def read_documents(
skip: int = 0,
limit: int = 3000,
search_space_id: int | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
query = (
select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
)
# Filter by search_space_id if provided
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
result = await session.execute(query.offset(skip).limit(limit))
db_documents = result.scalars().all()
# Convert database objects to API-friendly format
api_documents = []
for doc in db_documents:
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
content=doc.content,
created_at=doc.created_at,
search_space_id=doc.search_space_id,
)
)
return api_documents
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch documents: {e!s}"
) from e
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
# Convert database object to API-friendly format
return DocumentRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
content=document.content,
created_at=document.created_at,
search_space_id=document.search_space_id,
)
except Exception as e:
raise HTTPException(
status_code=500, detail=f"Failed to fetch document: {e!s}"
) from e
@router.put("/documents/{document_id}", response_model=DocumentRead)
async def update_document(
document_id: int,
document_update: DocumentUpdate,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
db_document = result.scalars().first()
if not db_document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
update_data = document_update.model_dump(exclude_unset=True)
for key, value in update_data.items():
setattr(db_document, key, value)
await session.commit()
await session.refresh(db_document)
# Convert to DocumentRead for response
return DocumentRead(
id=db_document.id,
title=db_document.title,
document_type=db_document.document_type,
document_metadata=db_document.document_metadata,
content=db_document.content,
created_at=db_document.created_at,
search_space_id=db_document.search_space_id,
)
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to update document: {e!s}"
) from e
@router.delete("/documents/{document_id}", response_model=dict)
async def delete_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
result = await session.execute(
select(Document)
.join(SearchSpace)
.filter(Document.id == document_id, SearchSpace.user_id == user.id)
)
document = result.scalars().first()
if not document:
raise HTTPException(
status_code=404, detail=f"Document with id {document_id} not found"
)
await session.delete(document)
await session.commit()
return {"message": "Document deleted successfully"}
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500, detail=f"Failed to delete document: {e!s}"
) from e
async def process_extension_document_with_new_session(
individual_document, search_space_id: int, user_id: str
):
"""Create a new session and process extension document."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_extension_document",
source="document_processor",
message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}",
metadata={
"document_type": "EXTENSION",
"url": individual_document.metadata.VisitedWebPageURL,
"title": individual_document.metadata.VisitedWebPageTitle,
"user_id": user_id,
},
)
try:
result = await add_extension_received_document(
session, individual_document, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}",
{"document_id": result.id, "content_hash": result.content_hash},
)
else:
await task_logger.log_task_success(
log_entry,
f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing extension document: {e!s}")
async def process_crawled_url_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process crawled URL."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_crawled_url",
source="document_processor",
message=f"Starting URL crawling and processing for: {url}",
metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
)
try:
result = await add_crawled_url_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{
"document_id": result.id,
"title": result.title,
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"URL document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing crawled URL: {e!s}")
async def process_file_in_background_with_new_session(
file_path: str, filename: str, search_space_id: int, user_id: str
):
"""Create a new session and process file."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_file_upload",
source="document_processor",
message=f"Starting file processing for: {filename}",
metadata={
"document_type": "FILE",
"filename": filename,
"file_path": file_path,
"user_id": user_id,
},
)
try:
await process_file_in_background(
file_path,
filename,
search_space_id,
user_id,
session,
task_logger,
log_entry,
)
# Note: success/failure logging is handled within process_file_in_background
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process file: {filename}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing file: {e!s}")
async def process_youtube_video_with_new_session(
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process YouTube video."""
from app.db import async_session_maker
from app.services.task_logging_service import TaskLoggingService
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_youtube_video",
source="document_processor",
message=f"Starting YouTube video processing for: {url}",
metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id},
)
try:
result = await add_youtube_video_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed YouTube video: {result.title}",
{
"document_id": result.id,
"video_id": result.document_metadata.get("video_id"),
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"YouTube video document already exists (duplicate): {url}",
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process YouTube video: {url}",
str(e),
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing YouTube video: {e!s}")


@ -13,6 +13,7 @@ from .documents import (
DocumentRead,
DocumentsCreate,
DocumentUpdate,
DocumentWithChunksRead,
ExtensionDocumentContent,
ExtensionDocumentMetadata,
)
@ -53,6 +54,7 @@ __all__ = [
"DocumentBase",
"DocumentRead",
"DocumentUpdate",
"DocumentWithChunksRead",
"DocumentsCreate",
"ExtensionDocumentContent",
"ExtensionDocumentMetadata",


@ -4,6 +4,8 @@ from pydantic import BaseModel, ConfigDict
from app.db import DocumentType
from .chunks import ChunkRead
class ExtensionDocumentMetadata(BaseModel):
BrowsingSessionId: str
@ -45,3 +47,9 @@ class DocumentRead(BaseModel):
search_space_id: int
model_config = ConfigDict(from_attributes=True)
class DocumentWithChunksRead(DocumentRead):
chunks: list[ChunkRead] = []
model_config = ConfigDict(from_attributes=True)


@ -1,58 +1,202 @@
"use client";
import { ExternalLink } from "lucide-react";
import { ExternalLink, FileText, Loader2 } from "lucide-react";
import type React from "react";
import { useEffect, useRef, useState } from "react";
import { MarkdownViewer } from "@/components/markdown-viewer";
import { Button } from "@/components/ui/button";
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
import { ScrollArea } from "@/components/ui/scroll-area";
import {
Sheet,
SheetContent,
SheetDescription,
SheetHeader,
SheetTitle,
SheetTrigger,
} from "@/components/ui/sheet";
import { useDocumentByChunk } from "@/hooks/use-document-by-chunk";
import { cn } from "@/lib/utils";
export const CitationDisplay: React.FC<{ index: number; node: any }> = ({ index, node }) => {
const truncateText = (text: string, maxLength: number = 200) => {
if (text.length <= maxLength) return text;
return `${text.substring(0, maxLength)}...`;
const chunkId = Number(node?.id);
const sourceType = node?.metadata?.source_type;
const [isOpen, setIsOpen] = useState(false);
const { document, loading, error, fetchDocumentByChunk, clearDocument } = useDocumentByChunk();
const chunksContainerRef = useRef<HTMLDivElement>(null);
const highlightedChunkRef = useRef<HTMLDivElement>(null);
// Check if this is a source type that should render directly from node
const isDirectRenderSource = sourceType === "TAVILY_API" || sourceType === "LINKUP_API";
const handleOpenChange = async (open: boolean) => {
setIsOpen(open);
if (open && chunkId && !isDirectRenderSource) {
await fetchDocumentByChunk(chunkId);
} else if (!open && !isDirectRenderSource) {
clearDocument();
}
};
useEffect(() => {
// Scroll to highlighted chunk when document loads
if (document && highlightedChunkRef.current && chunksContainerRef.current) {
setTimeout(() => {
highlightedChunkRef.current?.scrollIntoView({
behavior: "smooth",
block: "start",
});
}, 100);
}
}, [document]);
const handleUrlClick = (e: React.MouseEvent, url: string) => {
e.preventDefault();
e.stopPropagation();
window.open(url, "_blank", "noopener,noreferrer");
};
const formatDocumentType = (type: string) => {
return type
.split("_")
.map((word) => word.charAt(0) + word.slice(1).toLowerCase())
.join(" ");
};
return (
<Popover>
<PopoverTrigger asChild>
<Sheet open={isOpen} onOpenChange={handleOpenChange}>
<SheetTrigger asChild>
<span className="text-[10px] font-bold bg-slate-500 hover:bg-slate-600 text-white rounded-full w-4 h-4 inline-flex items-center justify-center align-super cursor-pointer transition-colors">
{index + 1}
</span>
</PopoverTrigger>
<PopoverContent className="w-80 p-4 space-y-3 relative" align="start">
{/* External Link Button - Top Right */}
{node?.url && (
<Button
size="icon"
variant="ghost"
onClick={(e) => handleUrlClick(e, node.url)}
className="absolute top-3 right-3 inline-flex items-center justify-center w-6 h-6 text-blue-600 hover:text-blue-800 dark:text-blue-400 dark:hover:text-blue-200 hover:bg-blue-50 dark:hover:bg-blue-900/20 rounded transition-colors"
title="Open in new tab"
>
<ExternalLink size={14} />
</Button>
</SheetTrigger>
<SheetContent side="right" className="w-full sm:max-w-5xl lg:max-w-7xl">
<SheetHeader className="px-6 py-4 border-b">
<SheetTitle className="flex items-center gap-3 text-lg">
<FileText className="h-6 w-6" />
{document?.title || node?.metadata?.title || node?.metadata?.group_name || "Source"}
</SheetTitle>
<SheetDescription className="text-base mt-2">
{document
? formatDocumentType(document.document_type)
: sourceType && formatDocumentType(sourceType)}
</SheetDescription>
</SheetHeader>
{!isDirectRenderSource && loading && (
<div className="flex items-center justify-center h-64 px-6">
<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
</div>
)}
{/* Heading */}
<div className="text-sm font-semibold text-slate-900 dark:text-slate-100 pr-8">
{node?.metadata?.group_name || "Source"}
</div>
{!isDirectRenderSource && error && (
<div className="flex items-center justify-center h-64 px-6">
<p className="text-sm text-destructive">{error}</p>
</div>
)}
{/* Source */}
<div className="text-xs text-slate-600 dark:text-slate-400 font-medium">
{node?.metadata?.title || "Untitled"}
</div>
{/* Direct render for TAVILY_API and LINKUP_API */}
{isDirectRenderSource && (
<ScrollArea className="h-[calc(100vh-10rem)]">
<div className="px-6 py-4">
{/* External Link */}
{node?.url && (
<div className="mb-8">
<Button
size="default"
variant="outline"
onClick={(e) => handleUrlClick(e, node.url)}
className="w-full py-3"
>
<ExternalLink className="mr-2 h-4 w-4" />
Open in Browser
</Button>
</div>
)}
{/* Body */}
<div className="text-xs text-slate-700 dark:text-slate-300 leading-relaxed">
{truncateText(node?.text || "No content available")}
</div>
</PopoverContent>
</Popover>
{/* Source Information */}
<div className="mb-8 p-6 bg-muted/50 rounded-lg border">
<h3 className="text-base font-semibold mb-4">Source Information</h3>
<div className="text-sm text-muted-foreground mb-3 font-medium">
{node?.metadata?.title || "Untitled"}
</div>
<div className="text-sm text-foreground leading-relaxed whitespace-pre-wrap">
{node?.text || "No content available"}
</div>
</div>
</div>
</ScrollArea>
)}
{/* API-fetched document content */}
{!isDirectRenderSource && document && (
<ScrollArea className="h-[calc(100vh-10rem)]">
<div className="px-6 py-4">
{/* Document Metadata */}
{document.document_metadata && Object.keys(document.document_metadata).length > 0 && (
<div className="mb-8 p-6 bg-muted/50 rounded-lg border">
<h3 className="text-base font-semibold mb-4">Document Information</h3>
<dl className="grid grid-cols-1 gap-3 text-sm">
{Object.entries(document.document_metadata).map(([key, value]) => (
<div key={key} className="flex gap-3">
<dt className="font-medium text-muted-foreground capitalize min-w-0 flex-shrink-0">
{key.replace(/_/g, " ")}:
</dt>
<dd className="text-foreground break-words">{String(value)}</dd>
</div>
))}
</dl>
</div>
)}
{/* External Link */}
{node?.url && (
<div className="mb-8">
<Button
size="default"
variant="outline"
onClick={(e) => handleUrlClick(e, node.url)}
className="w-full py-3"
>
<ExternalLink className="mr-2 h-4 w-4" />
Open in Browser
</Button>
</div>
)}
{/* Chunks */}
<div className="space-y-6" ref={chunksContainerRef}>
<h3 className="text-base font-semibold mb-4">Document Content</h3>
{document.chunks.map((chunk, idx) => (
<div
key={chunk.id}
ref={chunk.id === chunkId ? highlightedChunkRef : null}
className={cn(
"p-6 rounded-lg border transition-all duration-300",
chunk.id === chunkId
? "bg-primary/10 border-primary shadow-md ring-1 ring-primary/20"
: "bg-background border-border hover:bg-muted/50 hover:border-muted-foreground/20"
)}
>
<div className="mb-4 flex items-center justify-between">
<span className="text-sm font-medium text-muted-foreground">
Chunk {idx + 1} of {document.chunks.length}
</span>
{chunk.id === chunkId && (
<span className="text-sm font-medium text-primary bg-primary/10 px-3 py-1 rounded-full">
Referenced Chunk
</span>
)}
</div>
<div className="text-sm text-foreground whitespace-pre-wrap leading-relaxed">
<MarkdownViewer content={chunk.content} className="max-w-fit" />
</div>
</div>
))}
</div>
</div>
</ScrollArea>
)}
</SheetContent>
</Sheet>
);
};
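For reference, a sketch of the citation `node` shape this component expects, inferred from the properties it reads above; the interface name and the optionality markers are illustrative, not part of the PR.

// Inferred shape of the `node` prop consumed by CitationDisplay (assumption for illustration).
interface CitationNode {
	id: number | string; // chunk id, coerced with Number(node?.id)
	url?: string; // when present, rendered as an "Open in Browser" button
	text?: string; // raw snippet shown for direct-render sources
	metadata?: {
		source_type?: string; // "TAVILY_API" and "LINKUP_API" render directly from the node
		title?: string;
		group_name?: string;
	};
}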


@ -1,7 +1,7 @@
import { Check, Copy } from "lucide-react";
import Image from "next/image";
import { useTheme } from "next-themes";
import React, { useEffect, useMemo, useRef, useState } from "react";
import { useEffect, useMemo, useRef, useState } from "react";
import ReactMarkdown from "react-markdown";
import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
import { oneDark, oneLight } from "react-syntax-highlighter/dist/cjs/styles/prism";
@ -10,105 +10,51 @@ import rehypeSanitize from "rehype-sanitize";
import remarkGfm from "remark-gfm";
import { Button } from "@/components/ui/button";
import { cn } from "@/lib/utils";
import { Citation } from "./chat/Citation";
import type { Source } from "./chat/types";
import CopyButton from "./copy-button";
interface MarkdownViewerProps {
content: string;
className?: string;
getCitationSource?: (id: number) => Source | null;
type?: "user" | "ai";
}
export function MarkdownViewer({
content,
className,
getCitationSource,
type = "user",
}: MarkdownViewerProps) {
export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
const ref = useRef<HTMLDivElement>(null);
// Memoize the markdown components to prevent unnecessary re-renders
const components = useMemo(() => {
return {
// Define custom components for markdown elements
p: ({ node, children, ...props }: any) => {
// If there's no getCitationSource function, just render normally
if (!getCitationSource) {
return (
<p className="my-2" {...props}>
{children}
</p>
);
}
// Process citations within paragraph content
return (
<p className="my-2" {...props}>
{processCitationsInReactChildren(children, getCitationSource)}
</p>
);
},
a: ({ node, children, ...props }: any) => {
// Process citations within link content if needed
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<a className="text-primary hover:underline" {...props}>
{processedChildren}
</a>
);
},
li: ({ node, children, ...props }: any) => {
// Process citations within list item content
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return <li {...props}>{processedChildren}</li>;
},
p: ({ node, children, ...props }: any) => (
<p className="my-2" {...props}>
{children}
</p>
),
a: ({ node, children, ...props }: any) => (
<a className="text-primary hover:underline" {...props}>
{children}
</a>
),
li: ({ node, children, ...props }: any) => <li {...props}>{children}</li>,
ul: ({ node, ...props }: any) => <ul className="list-disc pl-5 my-2" {...props} />,
ol: ({ node, ...props }: any) => <ol className="list-decimal pl-5 my-2" {...props} />,
h1: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
{processedChildren}
</h1>
);
},
h2: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h2 className="text-xl font-bold mt-5 mb-2" {...props}>
{processedChildren}
</h2>
);
},
h3: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h3 className="text-lg font-bold mt-4 mb-2" {...props}>
{processedChildren}
</h3>
);
},
h4: ({ node, children, ...props }: any) => {
const processedChildren = getCitationSource
? processCitationsInReactChildren(children, getCitationSource)
: children;
return (
<h4 className="text-base font-bold mt-3 mb-1" {...props}>
{processedChildren}
</h4>
);
},
h1: ({ node, children, ...props }: any) => (
<h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
{children}
</h1>
),
h2: ({ node, children, ...props }: any) => (
<h2 className="text-xl font-bold mt-5 mb-2" {...props}>
{children}
</h2>
),
h3: ({ node, children, ...props }: any) => (
<h3 className="text-lg font-bold mt-4 mb-2" {...props}>
{children}
</h3>
),
h4: ({ node, children, ...props }: any) => (
<h4 className="text-base font-bold mt-3 mb-1" {...props}>
{children}
</h4>
),
blockquote: ({ node, ...props }: any) => (
<blockquote className="border-l-4 border-muted pl-4 italic my-2" {...props} />
),
@ -154,7 +100,7 @@ export function MarkdownViewer({
);
},
};
}, [getCitationSource]);
}, []);
return (
<div className={cn("prose prose-sm dark:prose-invert max-w-none", className)} ref={ref}>
@ -165,7 +111,6 @@ export function MarkdownViewer({
>
{content}
</ReactMarkdown>
{type === "ai" && <CopyButton ref={ref} />}
</div>
);
}
@ -267,77 +212,3 @@ const CodeBlock = ({ children, language }: { children: string; language: string
</div>
);
};
// Helper function to process citations within React children
const processCitationsInReactChildren = (
children: React.ReactNode,
getCitationSource: (id: number) => Source | null
): React.ReactNode => {
// If children is not an array or string, just return it
if (!children || (typeof children !== "string" && !Array.isArray(children))) {
return children;
}
// Handle string content directly - this is where we process citation references
if (typeof children === "string") {
return processCitationsInText(children, getCitationSource);
}
// Handle arrays of children recursively
if (Array.isArray(children)) {
return React.Children.map(children, (child) => {
if (typeof child === "string") {
return processCitationsInText(child, getCitationSource);
}
return child;
});
}
return children;
};
// Process citation references in text content
const processCitationsInText = (
text: string,
getCitationSource: (id: number) => Source | null
): React.ReactNode[] => {
// Use improved regex to catch citation numbers more reliably
// This will match patterns like [1], [42], etc. including when they appear at the end of a line or sentence
const citationRegex = /\[(\d+)\]/g;
const parts: React.ReactNode[] = [];
let lastIndex = 0;
let match: RegExpExecArray | null = citationRegex.exec(text);
let position = 0;
while (match !== null) {
// Add text before the citation
if (match.index > lastIndex) {
parts.push(text.substring(lastIndex, match.index));
}
// Add the citation component
const citationId = parseInt(match[1], 10);
const source = getCitationSource(citationId);
parts.push(
<Citation
key={`citation-${citationId}-${position}`}
citationId={citationId}
citationText={match[0]}
position={position}
source={source}
/>
);
lastIndex = match.index + match[0].length;
position++;
match = citationRegex.exec(text);
}
// Add any remaining text after the last citation
if (lastIndex < text.length) {
parts.push(text.substring(lastIndex));
}
return parts;
};


@ -0,0 +1,56 @@
"use client";
import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area";
import type * as React from "react";
import { cn } from "@/lib/utils";
function ScrollArea({
className,
children,
...props
}: React.ComponentProps<typeof ScrollAreaPrimitive.Root>) {
return (
<ScrollAreaPrimitive.Root
data-slot="scroll-area"
className={cn("relative", className)}
{...props}
>
<ScrollAreaPrimitive.Viewport
data-slot="scroll-area-viewport"
className="focus-visible:ring-ring/50 size-full rounded-[inherit] transition-[color,box-shadow] outline-none focus-visible:ring-[3px] focus-visible:outline-1"
>
{children}
</ScrollAreaPrimitive.Viewport>
<ScrollBar />
<ScrollAreaPrimitive.Corner />
</ScrollAreaPrimitive.Root>
);
}
function ScrollBar({
className,
orientation = "vertical",
...props
}: React.ComponentProps<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>) {
return (
<ScrollAreaPrimitive.ScrollAreaScrollbar
data-slot="scroll-area-scrollbar"
orientation={orientation}
className={cn(
"flex touch-none p-px transition-colors select-none",
orientation === "vertical" && "h-full w-2.5 border-l border-l-transparent",
orientation === "horizontal" && "h-2.5 flex-col border-t border-t-transparent",
className
)}
{...props}
>
<ScrollAreaPrimitive.ScrollAreaThumb
data-slot="scroll-area-thumb"
className="bg-border relative flex-1 rounded-full"
/>
</ScrollAreaPrimitive.ScrollAreaScrollbar>
);
}
export { ScrollArea, ScrollBar };


@ -1,2 +1,3 @@
export * from "./use-document-by-chunk";
export * from "./use-logs";
export * from "./useSearchSourceConnectors";


@ -0,0 +1,106 @@
"use client";
import { useCallback, useState } from "react";
import { toast } from "sonner";
export interface Chunk {
id: number;
content: string;
document_id: number;
created_at: string;
}
export interface DocumentWithChunks {
id: number;
title: string;
document_type: DocumentType;
document_metadata: any;
content: string;
created_at: string;
search_space_id: number;
chunks: Chunk[];
}
export type DocumentType =
| "EXTENSION"
| "CRAWLED_URL"
| "SLACK_CONNECTOR"
| "NOTION_CONNECTOR"
| "FILE"
| "YOUTUBE_VIDEO"
| "GITHUB_CONNECTOR"
| "LINEAR_CONNECTOR"
| "DISCORD_CONNECTOR"
| "JIRA_CONNECTOR"
| "CONFLUENCE_CONNECTOR"
| "CLICKUP_CONNECTOR"
| "GOOGLE_CALENDAR_CONNECTOR"
| "GOOGLE_GMAIL_CONNECTOR";
export function useDocumentByChunk() {
const [document, setDocument] = useState<DocumentWithChunks | null>(null);
const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const fetchDocumentByChunk = useCallback(async (chunkId: number) => {
try {
setLoading(true);
setError(null);
setDocument(null);
const response = await fetch(
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
{
headers: {
Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
"Content-Type": "application/json",
},
method: "GET",
}
);
if (!response.ok) {
const errorText = await response.text();
let errorMessage = "Failed to fetch document";
try {
const errorData = JSON.parse(errorText);
errorMessage = errorData.detail || errorMessage;
} catch {
// If parsing fails, use default message
}
if (response.status === 404) {
errorMessage = "Chunk not found or you don't have access to it";
}
toast.error(errorMessage);
throw new Error(errorMessage);
}
const data: DocumentWithChunks = await response.json();
setDocument(data);
setError(null);
return data;
} catch (err: any) {
const errorMessage = err.message || "Failed to fetch document";
setError(errorMessage);
console.error("Error fetching document by chunk:", err);
throw err;
} finally {
setLoading(false);
}
}, []);
const clearDocument = useCallback(() => {
setDocument(null);
setError(null);
}, []);
return {
document,
loading,
error,
fetchDocumentByChunk,
clearDocument,
};
}
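A minimal usage sketch of the hook; the component name is hypothetical and error handling is trimmed, since CitationDisplay above is the real consumer in this PR.

// Hypothetical consumer of useDocumentByChunk (illustration only).
function ChunkPreview({ chunkId }: { chunkId: number }) {
	const { document, loading, error, fetchDocumentByChunk } = useDocumentByChunk();
	return (
		<button onClick={() => void fetchDocumentByChunk(chunkId)}>
			{loading ? "Loading…" : (error ?? document?.title ?? "Show source")}
		</button>
	);
}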


@ -12,11 +12,7 @@ const nextConfig: NextConfig = {
remotePatterns: [
{
protocol: "https",
hostname: "images.unsplash.com",
},
{
protocol: "https",
hostname: "static.vecteezy.com",
hostname: "**",
},
],
},