mirror of https://github.com/MODSetter/SurfSense.git
synced 2025-09-02 02:29:08 +00:00

Merge pull request #286 from MODSetter/dev

Some checks failed: pre-commit / pre-commit (push) has been cancelled
feat: added jump to source referencing of citations
This commit is contained in: commit bc89959d2f

9 changed files with 819 additions and 564 deletions
@@ -5,10 +5,24 @@ from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Up
 from litellm import atranscription
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
+from sqlalchemy.orm import selectinload
 
 from app.config import config as app_config
-from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session
-from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate
+from app.db import (
+    Chunk,
+    Document,
+    DocumentType,
+    Log,
+    SearchSpace,
+    User,
+    get_async_session,
+)
+from app.schemas import (
+    DocumentRead,
+    DocumentsCreate,
+    DocumentUpdate,
+    DocumentWithChunksRead,
+)
 from app.services.task_logging_service import TaskLoggingService
 from app.tasks.document_processors import (
     add_crawled_url_document,
@@ -140,6 +154,423 @@ async def create_documents_file_upload(
         ) from e
 
 
+@router.get("/documents/", response_model=list[DocumentRead])
+async def read_documents(
+    skip: int = 0,
+    limit: int = 3000,
+    search_space_id: int | None = None,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    try:
+        query = (
+            select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
+        )
+
+        # Filter by search_space_id if provided
+        if search_space_id is not None:
+            query = query.filter(Document.search_space_id == search_space_id)
+
+        result = await session.execute(query.offset(skip).limit(limit))
+        db_documents = result.scalars().all()
+
+        # Convert database objects to API-friendly format
+        api_documents = []
+        for doc in db_documents:
+            api_documents.append(
+                DocumentRead(
+                    id=doc.id,
+                    title=doc.title,
+                    document_type=doc.document_type,
+                    document_metadata=doc.document_metadata,
+                    content=doc.content,
+                    created_at=doc.created_at,
+                    search_space_id=doc.search_space_id,
+                )
+            )
+
+        return api_documents
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to fetch documents: {e!s}"
+        ) from e
+
+
+@router.get("/documents/{document_id}", response_model=DocumentRead)
+async def read_document(
+    document_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    try:
+        result = await session.execute(
+            select(Document)
+            .join(SearchSpace)
+            .filter(Document.id == document_id, SearchSpace.user_id == user.id)
+        )
+        document = result.scalars().first()
+
+        if not document:
+            raise HTTPException(
+                status_code=404, detail=f"Document with id {document_id} not found"
+            )
+
+        # Convert database object to API-friendly format
+        return DocumentRead(
+            id=document.id,
+            title=document.title,
+            document_type=document.document_type,
+            document_metadata=document.document_metadata,
+            content=document.content,
+            created_at=document.created_at,
+            search_space_id=document.search_space_id,
+        )
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to fetch document: {e!s}"
+        ) from e
+
+
+@router.put("/documents/{document_id}", response_model=DocumentRead)
+async def update_document(
+    document_id: int,
+    document_update: DocumentUpdate,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    try:
+        # Query the document directly instead of using read_document function
+        result = await session.execute(
+            select(Document)
+            .join(SearchSpace)
+            .filter(Document.id == document_id, SearchSpace.user_id == user.id)
+        )
+        db_document = result.scalars().first()
+
+        if not db_document:
+            raise HTTPException(
+                status_code=404, detail=f"Document with id {document_id} not found"
+            )
+
+        update_data = document_update.model_dump(exclude_unset=True)
+        for key, value in update_data.items():
+            setattr(db_document, key, value)
+        await session.commit()
+        await session.refresh(db_document)
+
+        # Convert to DocumentRead for response
+        return DocumentRead(
+            id=db_document.id,
+            title=db_document.title,
+            document_type=db_document.document_type,
+            document_metadata=db_document.document_metadata,
+            content=db_document.content,
+            created_at=db_document.created_at,
+            search_space_id=db_document.search_space_id,
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        await session.rollback()
+        raise HTTPException(
+            status_code=500, detail=f"Failed to update document: {e!s}"
+        ) from e
+
+
+@router.delete("/documents/{document_id}", response_model=dict)
+async def delete_document(
+    document_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    try:
+        # Query the document directly instead of using read_document function
+        result = await session.execute(
+            select(Document)
+            .join(SearchSpace)
+            .filter(Document.id == document_id, SearchSpace.user_id == user.id)
+        )
+        document = result.scalars().first()
+
+        if not document:
+            raise HTTPException(
+                status_code=404, detail=f"Document with id {document_id} not found"
+            )
+
+        await session.delete(document)
+        await session.commit()
+        return {"message": "Document deleted successfully"}
+    except HTTPException:
+        raise
+    except Exception as e:
+        await session.rollback()
+        raise HTTPException(
+            status_code=500, detail=f"Failed to delete document: {e!s}"
+        ) from e
+
+
+@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
+async def get_document_by_chunk_id(
+    chunk_id: int,
+    session: AsyncSession = Depends(get_async_session),
+    user: User = Depends(current_active_user),
+):
+    """
+    Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
+    The document's embedding and chunk embeddings are excluded from the response.
+    """
+    try:
+        # First, get the chunk and verify it exists
+        chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
+        chunk = chunk_result.scalars().first()
+
+        if not chunk:
+            raise HTTPException(
+                status_code=404, detail=f"Chunk with id {chunk_id} not found"
+            )
+
+        # Get the associated document and verify ownership
+        document_result = await session.execute(
+            select(Document)
+            .options(selectinload(Document.chunks))
+            .join(SearchSpace)
+            .filter(Document.id == chunk.document_id, SearchSpace.user_id == user.id)
+        )
+        document = document_result.scalars().first()
+
+        if not document:
+            raise HTTPException(
+                status_code=404,
+                detail="Document not found or you don't have access to it",
+            )
+
+        # Sort chunks by creation time
+        sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
+
+        # Return the document with its chunks
+        return DocumentWithChunksRead(
+            id=document.id,
+            title=document.title,
+            document_type=document.document_type,
+            document_metadata=document.document_metadata,
+            content=document.content,
+            created_at=document.created_at,
+            search_space_id=document.search_space_id,
+            chunks=sorted_chunks,
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(
+            status_code=500, detail=f"Failed to retrieve document: {e!s}"
+        ) from e
+
+
+async def process_extension_document_with_new_session(
+    individual_document, search_space_id: int, user_id: str
+):
+    """Create a new session and process extension document."""
+    from app.db import async_session_maker
+    from app.services.task_logging_service import TaskLoggingService
+
+    async with async_session_maker() as session:
+        # Initialize task logging service
+        task_logger = TaskLoggingService(session, search_space_id)
+
+        # Log task start
+        log_entry = await task_logger.log_task_start(
+            task_name="process_extension_document",
+            source="document_processor",
+            message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}",
+            metadata={
+                "document_type": "EXTENSION",
+                "url": individual_document.metadata.VisitedWebPageURL,
+                "title": individual_document.metadata.VisitedWebPageTitle,
+                "user_id": user_id,
+            },
+        )
+
+        try:
+            result = await add_extension_received_document(
+                session, individual_document, search_space_id, user_id
+            )
+
+            if result:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}",
+                    {"document_id": result.id, "content_hash": result.content_hash},
+                )
+            else:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}",
+                    {"duplicate_detected": True},
+                )
+        except Exception as e:
+            await task_logger.log_task_failure(
+                log_entry,
+                f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}",
+                str(e),
+                {"error_type": type(e).__name__},
+            )
+            import logging
+
+            logging.error(f"Error processing extension document: {e!s}")
+
+
+async def process_crawled_url_with_new_session(
+    url: str, search_space_id: int, user_id: str
+):
+    """Create a new session and process crawled URL."""
+    from app.db import async_session_maker
+    from app.services.task_logging_service import TaskLoggingService
+
+    async with async_session_maker() as session:
+        # Initialize task logging service
+        task_logger = TaskLoggingService(session, search_space_id)
+
+        # Log task start
+        log_entry = await task_logger.log_task_start(
+            task_name="process_crawled_url",
+            source="document_processor",
+            message=f"Starting URL crawling and processing for: {url}",
+            metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
+        )
+
+        try:
+            result = await add_crawled_url_document(
+                session, url, search_space_id, user_id
+            )
+
+            if result:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"Successfully crawled and processed URL: {url}",
+                    {
+                        "document_id": result.id,
+                        "title": result.title,
+                        "content_hash": result.content_hash,
+                    },
+                )
+            else:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"URL document already exists (duplicate): {url}",
+                    {"duplicate_detected": True},
+                )
+        except Exception as e:
+            await task_logger.log_task_failure(
+                log_entry,
+                f"Failed to crawl URL: {url}",
+                str(e),
+                {"error_type": type(e).__name__},
+            )
+            import logging
+
+            logging.error(f"Error processing crawled URL: {e!s}")
+
+
+async def process_file_in_background_with_new_session(
+    file_path: str, filename: str, search_space_id: int, user_id: str
+):
+    """Create a new session and process file."""
+    from app.db import async_session_maker
+    from app.services.task_logging_service import TaskLoggingService
+
+    async with async_session_maker() as session:
+        # Initialize task logging service
+        task_logger = TaskLoggingService(session, search_space_id)
+
+        # Log task start
+        log_entry = await task_logger.log_task_start(
+            task_name="process_file_upload",
+            source="document_processor",
+            message=f"Starting file processing for: {filename}",
+            metadata={
+                "document_type": "FILE",
+                "filename": filename,
+                "file_path": file_path,
+                "user_id": user_id,
+            },
+        )
+
+        try:
+            await process_file_in_background(
+                file_path,
+                filename,
+                search_space_id,
+                user_id,
+                session,
+                task_logger,
+                log_entry,
+            )
+
+            # Note: success/failure logging is handled within process_file_in_background
+        except Exception as e:
+            await task_logger.log_task_failure(
+                log_entry,
+                f"Failed to process file: {filename}",
+                str(e),
+                {"error_type": type(e).__name__},
+            )
+            import logging
+
+            logging.error(f"Error processing file: {e!s}")
+
+
+async def process_youtube_video_with_new_session(
+    url: str, search_space_id: int, user_id: str
+):
+    """Create a new session and process YouTube video."""
+    from app.db import async_session_maker
+    from app.services.task_logging_service import TaskLoggingService
+
+    async with async_session_maker() as session:
+        # Initialize task logging service
+        task_logger = TaskLoggingService(session, search_space_id)
+
+        # Log task start
+        log_entry = await task_logger.log_task_start(
+            task_name="process_youtube_video",
+            source="document_processor",
+            message=f"Starting YouTube video processing for: {url}",
+            metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id},
+        )
+
+        try:
+            result = await add_youtube_video_document(
+                session, url, search_space_id, user_id
+            )
+
+            if result:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"Successfully processed YouTube video: {result.title}",
+                    {
+                        "document_id": result.id,
+                        "video_id": result.document_metadata.get("video_id"),
+                        "content_hash": result.content_hash,
+                    },
+                )
+            else:
+                await task_logger.log_task_success(
+                    log_entry,
+                    f"YouTube video document already exists (duplicate): {url}",
+                    {"duplicate_detected": True},
+                )
+        except Exception as e:
+            await task_logger.log_task_failure(
+                log_entry,
+                f"Failed to process YouTube video: {url}",
+                str(e),
+                {"error_type": type(e).__name__},
+            )
+            import logging
+
+            logging.error(f"Error processing YouTube video: {e!s}")
+
+
 async def process_file_in_background(
     file_path: str,
     filename: str,
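For orientation, here is a minimal sketch of how a client can call the new by-chunk endpoint. The /api/v1 prefix, the surfsense_bearer_token storage key, and the response shape follow the frontend hook added later in this pull request; the helper name itself is illustrative.

// Sketch only: fetch the parent document (with all of its chunks) for a citation's chunk id.
// Assumes the backend base URL is exposed as NEXT_PUBLIC_FASTAPI_BACKEND_URL and the bearer
// token is kept in localStorage under "surfsense_bearer_token", as in use-document-by-chunk.ts below.
async function fetchDocumentForChunk(chunkId: number) {
	const response = await fetch(
		`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
		{ headers: { Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}` } }
	);
	if (!response.ok) throw new Error(`Failed to retrieve document: ${response.status}`);
	return response.json(); // DocumentWithChunksRead: document fields plus an ordered chunks[] list
}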
@@ -508,363 +939,3 @@ async def process_file_in_background(
 
         logging.error(f"Error processing file in background: {e!s}")
         raise  # Re-raise so the wrapper can also handle it
[360 removed lines not repeated here: the read_documents, read_document, update_document, and delete_document endpoints and the process_extension_document_with_new_session, process_crawled_url_with_new_session, process_file_in_background_with_new_session, and process_youtube_video_with_new_session helpers, line for line the same code that the hunk above adds earlier in the file.]
@@ -13,6 +13,7 @@ from .documents import (
     DocumentRead,
     DocumentsCreate,
     DocumentUpdate,
+    DocumentWithChunksRead,
     ExtensionDocumentContent,
     ExtensionDocumentMetadata,
 )
@@ -53,6 +54,7 @@ __all__ = [
     "DocumentBase",
     "DocumentRead",
     "DocumentUpdate",
+    "DocumentWithChunksRead",
     "DocumentsCreate",
     "ExtensionDocumentContent",
     "ExtensionDocumentMetadata",
@@ -4,6 +4,8 @@ from pydantic import BaseModel, ConfigDict
 
 from app.db import DocumentType
 
+from .chunks import ChunkRead
+
 
 class ExtensionDocumentMetadata(BaseModel):
     BrowsingSessionId: str
@@ -45,3 +47,9 @@ class DocumentRead(BaseModel):
     search_space_id: int
 
     model_config = ConfigDict(from_attributes=True)
+
+
+class DocumentWithChunksRead(DocumentRead):
+    chunks: list[ChunkRead] = []
+
+    model_config = ConfigDict(from_attributes=True)
@@ -1,58 +1,202 @@
 "use client";
 
-import { ExternalLink } from "lucide-react";
+import { ExternalLink, FileText, Loader2 } from "lucide-react";
 import type React from "react";
+import { useEffect, useRef, useState } from "react";
+import { MarkdownViewer } from "@/components/markdown-viewer";
 import { Button } from "@/components/ui/button";
-import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
+import { ScrollArea } from "@/components/ui/scroll-area";
+import {
+	Sheet,
+	SheetContent,
+	SheetDescription,
+	SheetHeader,
+	SheetTitle,
+	SheetTrigger,
+} from "@/components/ui/sheet";
+import { useDocumentByChunk } from "@/hooks/use-document-by-chunk";
+import { cn } from "@/lib/utils";
 
 export const CitationDisplay: React.FC<{ index: number; node: any }> = ({ index, node }) => {
-	const truncateText = (text: string, maxLength: number = 200) => {
-		if (text.length <= maxLength) return text;
-		return `${text.substring(0, maxLength)}...`;
-	};
+	const chunkId = Number(node?.id);
+	const sourceType = node?.metadata?.source_type;
+	const [isOpen, setIsOpen] = useState(false);
+	const { document, loading, error, fetchDocumentByChunk, clearDocument } = useDocumentByChunk();
+	const chunksContainerRef = useRef<HTMLDivElement>(null);
+	const highlightedChunkRef = useRef<HTMLDivElement>(null);
+
+	// Check if this is a source type that should render directly from node
+	const isDirectRenderSource = sourceType === "TAVILY_API" || sourceType === "LINKUP_API";
+
+	const handleOpenChange = async (open: boolean) => {
+		setIsOpen(open);
+		if (open && chunkId && !isDirectRenderSource) {
+			await fetchDocumentByChunk(chunkId);
+		} else if (!open && !isDirectRenderSource) {
+			clearDocument();
+		}
+	};
+
+	useEffect(() => {
+		// Scroll to highlighted chunk when document loads
+		if (document && highlightedChunkRef.current && chunksContainerRef.current) {
+			setTimeout(() => {
+				highlightedChunkRef.current?.scrollIntoView({
+					behavior: "smooth",
+					block: "start",
+				});
+			}, 100);
+		}
+	}, [document]);
 
 	const handleUrlClick = (e: React.MouseEvent, url: string) => {
 		e.preventDefault();
 		e.stopPropagation();
 		window.open(url, "_blank", "noopener,noreferrer");
 	};
 
+	const formatDocumentType = (type: string) => {
+		return type
+			.split("_")
+			.map((word) => word.charAt(0) + word.slice(1).toLowerCase())
+			.join(" ");
+	};
+
 	return (
-		<Popover>
-			<PopoverTrigger asChild>
+		<Sheet open={isOpen} onOpenChange={handleOpenChange}>
+			<SheetTrigger asChild>
 				<span className="text-[10px] font-bold bg-slate-500 hover:bg-slate-600 text-white rounded-full w-4 h-4 inline-flex items-center justify-center align-super cursor-pointer transition-colors">
 					{index + 1}
 				</span>
-			</PopoverTrigger>
-			<PopoverContent className="w-80 p-4 space-y-3 relative" align="start">
-				{/* External Link Button - Top Right */}
-				{node?.url && (
-					<Button
-						size="icon"
-						variant="ghost"
-						onClick={(e) => handleUrlClick(e, node.url)}
-						className="absolute top-3 right-3 inline-flex items-center justify-center w-6 h-6 text-blue-600 hover:text-blue-800 dark:text-blue-400 dark:hover:text-blue-200 hover:bg-blue-50 dark:hover:bg-blue-900/20 rounded transition-colors"
-						title="Open in new tab"
-					>
-						<ExternalLink size={14} />
-					</Button>
-				)}
-
-				{/* Heading */}
-				<div className="text-sm font-semibold text-slate-900 dark:text-slate-100 pr-8">
-					{node?.metadata?.group_name || "Source"}
-				</div>
-
-				{/* Source */}
-				<div className="text-xs text-slate-600 dark:text-slate-400 font-medium">
-					{node?.metadata?.title || "Untitled"}
-				</div>
-
-				{/* Body */}
-				<div className="text-xs text-slate-700 dark:text-slate-300 leading-relaxed">
-					{truncateText(node?.text || "No content available")}
-				</div>
-			</PopoverContent>
-		</Popover>
+			</SheetTrigger>
+			<SheetContent side="right" className="w-full sm:max-w-5xl lg:max-w-7xl">
+				<SheetHeader className="px-6 py-4 border-b">
+					<SheetTitle className="flex items-center gap-3 text-lg">
+						<FileText className="h-6 w-6" />
+						{document?.title || node?.metadata?.title || node?.metadata?.group_name || "Source"}
+					</SheetTitle>
+					<SheetDescription className="text-base mt-2">
+						{document
+							? formatDocumentType(document.document_type)
+							: sourceType && formatDocumentType(sourceType)}
+					</SheetDescription>
+				</SheetHeader>
+
+				{!isDirectRenderSource && loading && (
+					<div className="flex items-center justify-center h-64 px-6">
+						<Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
+					</div>
+				)}
+
+				{!isDirectRenderSource && error && (
+					<div className="flex items-center justify-center h-64 px-6">
+						<p className="text-sm text-destructive">{error}</p>
+					</div>
+				)}
+
+				{/* Direct render for TAVILY_API and LINEAR_API */}
+				{isDirectRenderSource && (
+					<ScrollArea className="h-[calc(100vh-10rem)]">
+						<div className="px-6 py-4">
+							{/* External Link */}
+							{node?.url && (
+								<div className="mb-8">
+									<Button
+										size="default"
+										variant="outline"
+										onClick={(e) => handleUrlClick(e, node.url)}
+										className="w-full py-3"
+									>
+										<ExternalLink className="mr-2 h-4 w-4" />
+										Open in Browser
+									</Button>
+								</div>
+							)}
+
+							{/* Source Information */}
+							<div className="mb-8 p-6 bg-muted/50 rounded-lg border">
+								<h3 className="text-base font-semibold mb-4">Source Information</h3>
+								<div className="text-sm text-muted-foreground mb-3 font-medium">
+									{node?.metadata?.title || "Untitled"}
+								</div>
+								<div className="text-sm text-foreground leading-relaxed whitespace-pre-wrap">
+									{node?.text || "No content available"}
+								</div>
+							</div>
+						</div>
+					</ScrollArea>
+				)}
+
+				{/* API-fetched document content */}
+				{!isDirectRenderSource && document && (
+					<ScrollArea className="h-[calc(100vh-10rem)]">
+						<div className="px-6 py-4">
+							{/* Document Metadata */}
+							{document.document_metadata && Object.keys(document.document_metadata).length > 0 && (
+								<div className="mb-8 p-6 bg-muted/50 rounded-lg border">
+									<h3 className="text-base font-semibold mb-4">Document Information</h3>
+									<dl className="grid grid-cols-1 gap-3 text-sm">
+										{Object.entries(document.document_metadata).map(([key, value]) => (
+											<div key={key} className="flex gap-3">
+												<dt className="font-medium text-muted-foreground capitalize min-w-0 flex-shrink-0">
+													{key.replace(/_/g, " ")}:
+												</dt>
+												<dd className="text-foreground break-words">{String(value)}</dd>
+											</div>
+										))}
+									</dl>
+								</div>
+							)}
+
+							{/* External Link */}
+							{node?.url && (
+								<div className="mb-8">
+									<Button
+										size="default"
+										variant="outline"
+										onClick={(e) => handleUrlClick(e, node.url)}
+										className="w-full py-3"
+									>
+										<ExternalLink className="mr-2 h-4 w-4" />
+										Open in Browser
+									</Button>
+								</div>
+							)}
+
+							{/* Chunks */}
+							<div className="space-y-6" ref={chunksContainerRef}>
+								<h3 className="text-base font-semibold mb-4">Document Content</h3>
+								{document.chunks.map((chunk, idx) => (
+									<div
+										key={chunk.id}
+										ref={chunk.id === chunkId ? highlightedChunkRef : null}
+										className={cn(
+											"p-6 rounded-lg border transition-all duration-300",
+											chunk.id === chunkId
+												? "bg-primary/10 border-primary shadow-md ring-1 ring-primary/20"
+												: "bg-background border-border hover:bg-muted/50 hover:border-muted-foreground/20"
+										)}
+									>
+										<div className="mb-4 flex items-center justify-between">
+											<span className="text-sm font-medium text-muted-foreground">
+												Chunk {idx + 1} of {document.chunks.length}
+											</span>
+											{chunk.id === chunkId && (
+												<span className="text-sm font-medium text-primary bg-primary/10 px-3 py-1 rounded-full">
+													Referenced Chunk
+												</span>
+											)}
+										</div>
+										<div className="text-sm text-foreground whitespace-pre-wrap leading-relaxed">
+											<MarkdownViewer content={chunk.content} className="max-w-fit" />
+										</div>
+									</div>
+								))}
+							</div>
+						</div>
+					</ScrollArea>
+				)}
+			</SheetContent>
+		</Sheet>
 	);
 };
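For reference, a sketch of how the reworked CitationDisplay is mounted. The import path and node values are illustrative, but the prop shape (a numeric chunk id in node.id, plus metadata.source_type, title, url, and text) matches what the component reads above.

import { CitationDisplay } from "@/components/chat/Citation"; // import path illustrative

export function ExampleCitation() {
	// For non TAVILY_API / LINKUP_API sources, opening the sheet fetches the parent
	// document by chunk id and scrolls to the referenced chunk.
	return (
		<CitationDisplay
			index={0}
			node={{
				id: 4211, // chunk id (illustrative)
				url: "https://example.com/page",
				text: "snippet shown for direct-render sources",
				metadata: { source_type: "CRAWLED_URL", title: "Example page" },
			}}
		/>
	);
}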
@@ -1,7 +1,7 @@
 import { Check, Copy } from "lucide-react";
 import Image from "next/image";
 import { useTheme } from "next-themes";
-import React, { useEffect, useMemo, useRef, useState } from "react";
+import { useEffect, useMemo, useRef, useState } from "react";
 import ReactMarkdown from "react-markdown";
 import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
 import { oneDark, oneLight } from "react-syntax-highlighter/dist/cjs/styles/prism";
@@ -10,105 +10,51 @@ import rehypeSanitize from "rehype-sanitize";
 import remarkGfm from "remark-gfm";
 import { Button } from "@/components/ui/button";
 import { cn } from "@/lib/utils";
-import { Citation } from "./chat/Citation";
-import type { Source } from "./chat/types";
-import CopyButton from "./copy-button";
 
 interface MarkdownViewerProps {
 	content: string;
 	className?: string;
-	getCitationSource?: (id: number) => Source | null;
-	type?: "user" | "ai";
 }
 
-export function MarkdownViewer({
-	content,
-	className,
-	getCitationSource,
-	type = "user",
-}: MarkdownViewerProps) {
+export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
 	const ref = useRef<HTMLDivElement>(null);
 	// Memoize the markdown components to prevent unnecessary re-renders
 	const components = useMemo(() => {
 		return {
 			// Define custom components for markdown elements
-			p: ({ node, children, ...props }: any) => {
-				// If there's no getCitationSource function, just render normally
-				if (!getCitationSource) {
-					return (
-						<p className="my-2" {...props}>
-							{children}
-						</p>
-					);
-				}
-
-				// Process citations within paragraph content
-				return (
-					<p className="my-2" {...props}>
-						{processCitationsInReactChildren(children, getCitationSource)}
-					</p>
-				);
-			},
-			a: ({ node, children, ...props }: any) => {
-				// Process citations within link content if needed
-				const processedChildren = getCitationSource
-					? processCitationsInReactChildren(children, getCitationSource)
-					: children;
-				return (
-					<a className="text-primary hover:underline" {...props}>
-						{processedChildren}
-					</a>
-				);
-			},
-			li: ({ node, children, ...props }: any) => {
-				// Process citations within list item content
-				const processedChildren = getCitationSource
-					? processCitationsInReactChildren(children, getCitationSource)
-					: children;
-				return <li {...props}>{processedChildren}</li>;
-			},
+			p: ({ node, children, ...props }: any) => (
+				<p className="my-2" {...props}>
+					{children}
+				</p>
+			),
+			a: ({ node, children, ...props }: any) => (
+				<a className="text-primary hover:underline" {...props}>
+					{children}
+				</a>
+			),
+			li: ({ node, children, ...props }: any) => <li {...props}>{children}</li>,
 			ul: ({ node, ...props }: any) => <ul className="list-disc pl-5 my-2" {...props} />,
 			ol: ({ node, ...props }: any) => <ol className="list-decimal pl-5 my-2" {...props} />,
-			h1: ({ node, children, ...props }: any) => {
-				const processedChildren = getCitationSource
-					? processCitationsInReactChildren(children, getCitationSource)
-					: children;
-				return (
-					<h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
-						{processedChildren}
-					</h1>
-				);
-			},
-			h2: ({ node, children, ...props }: any) => {
-				const processedChildren = getCitationSource
-					? processCitationsInReactChildren(children, getCitationSource)
-					: children;
-				return (
-					<h2 className="text-xl font-bold mt-5 mb-2" {...props}>
-						{processedChildren}
-					</h2>
-				);
-			},
-			h3: ({ node, children, ...props }: any) => {
-				const processedChildren = getCitationSource
-					? processCitationsInReactChildren(children, getCitationSource)
-					: children;
-				return (
-					<h3 className="text-lg font-bold mt-4 mb-2" {...props}>
-						{processedChildren}
-					</h3>
-				);
-			},
-			h4: ({ node, children, ...props }: any) => {
-				const processedChildren = getCitationSource
-					? processCitationsInReactChildren(children, getCitationSource)
-					: children;
-				return (
-					<h4 className="text-base font-bold mt-3 mb-1" {...props}>
-						{processedChildren}
-					</h4>
-				);
-			},
+			h1: ({ node, children, ...props }: any) => (
+				<h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
+					{children}
+				</h1>
+			),
+			h2: ({ node, children, ...props }: any) => (
+				<h2 className="text-xl font-bold mt-5 mb-2" {...props}>
+					{children}
+				</h2>
+			),
+			h3: ({ node, children, ...props }: any) => (
+				<h3 className="text-lg font-bold mt-4 mb-2" {...props}>
+					{children}
+				</h3>
+			),
+			h4: ({ node, children, ...props }: any) => (
+				<h4 className="text-base font-bold mt-3 mb-1" {...props}>
+					{children}
+				</h4>
+			),
 			blockquote: ({ node, ...props }: any) => (
 				<blockquote className="border-l-4 border-muted pl-4 italic my-2" {...props} />
 			),
@@ -154,7 +100,7 @@ export function MarkdownViewer({
 			);
 		},
 		};
-	}, [getCitationSource]);
+	}, []);
 
 	return (
 		<div className={cn("prose prose-sm dark:prose-invert max-w-none", className)} ref={ref}>
@@ -165,7 +111,6 @@ export function MarkdownViewer({
 			>
 				{content}
 			</ReactMarkdown>
-			{type === "ai" && <CopyButton ref={ref} />}
 		</div>
 	);
 }
@@ -267,77 +212,3 @@ const CodeBlock = ({ children, language }: { children: string; language: string
 		</div>
 	);
 };
-
-// Helper function to process citations within React children
-const processCitationsInReactChildren = (
-	children: React.ReactNode,
-	getCitationSource: (id: number) => Source | null
-): React.ReactNode => {
-	// If children is not an array or string, just return it
-	if (!children || (typeof children !== "string" && !Array.isArray(children))) {
-		return children;
-	}
-
-	// Handle string content directly - this is where we process citation references
-	if (typeof children === "string") {
-		return processCitationsInText(children, getCitationSource);
-	}
-
-	// Handle arrays of children recursively
-	if (Array.isArray(children)) {
-		return React.Children.map(children, (child) => {
-			if (typeof child === "string") {
-				return processCitationsInText(child, getCitationSource);
-			}
-			return child;
-		});
-	}
-
-	return children;
-};
-
-// Process citation references in text content
-const processCitationsInText = (
-	text: string,
-	getCitationSource: (id: number) => Source | null
-): React.ReactNode[] => {
-	// Use improved regex to catch citation numbers more reliably
-	// This will match patterns like [1], [42], etc. including when they appear at the end of a line or sentence
-	const citationRegex = /\[(\d+)\]/g;
-	const parts: React.ReactNode[] = [];
-	let lastIndex = 0;
-	let match: RegExpExecArray | null = citationRegex.exec(text);
-	let position = 0;
-
-	while (match !== null) {
-		// Add text before the citation
-		if (match.index > lastIndex) {
-			parts.push(text.substring(lastIndex, match.index));
-		}
-
-		// Add the citation component
-		const citationId = parseInt(match[1], 10);
-		const source = getCitationSource(citationId);
-
-		parts.push(
-			<Citation
-				key={`citation-${citationId}-${position}`}
-				citationId={citationId}
-				citationText={match[0]}
-				position={position}
-				source={source}
-			/>
-		);
-
-		lastIndex = match.index + match[0].length;
-		position++;
-		match = citationRegex.exec(text);
-	}
-
-	// Add any remaining text after the last citation
-	if (lastIndex < text.length) {
-		parts.push(text.substring(lastIndex));
-	}
-
-	return parts;
-};
surfsense_web/components/ui/scroll-area.tsx (new file, 56 lines)
@@ -0,0 +1,56 @@
+"use client";
+
+import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area";
+import type * as React from "react";
+
+import { cn } from "@/lib/utils";
+
+function ScrollArea({
+	className,
+	children,
+	...props
+}: React.ComponentProps<typeof ScrollAreaPrimitive.Root>) {
+	return (
+		<ScrollAreaPrimitive.Root
+			data-slot="scroll-area"
+			className={cn("relative", className)}
+			{...props}
+		>
+			<ScrollAreaPrimitive.Viewport
+				data-slot="scroll-area-viewport"
+				className="focus-visible:ring-ring/50 size-full rounded-[inherit] transition-[color,box-shadow] outline-none focus-visible:ring-[3px] focus-visible:outline-1"
+			>
+				{children}
+			</ScrollAreaPrimitive.Viewport>
+			<ScrollBar />
+			<ScrollAreaPrimitive.Corner />
+		</ScrollAreaPrimitive.Root>
+	);
+}
+
+function ScrollBar({
+	className,
+	orientation = "vertical",
+	...props
+}: React.ComponentProps<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>) {
+	return (
+		<ScrollAreaPrimitive.ScrollAreaScrollbar
+			data-slot="scroll-area-scrollbar"
+			orientation={orientation}
+			className={cn(
+				"flex touch-none p-px transition-colors select-none",
+				orientation === "vertical" && "h-full w-2.5 border-l border-l-transparent",
+				orientation === "horizontal" && "h-2.5 flex-col border-t border-t-transparent",
+				className
+			)}
+			{...props}
+		>
+			<ScrollAreaPrimitive.ScrollAreaThumb
+				data-slot="scroll-area-thumb"
+				className="bg-border relative flex-1 rounded-full"
+			/>
+		</ScrollAreaPrimitive.ScrollAreaScrollbar>
+	);
+}
+
+export { ScrollArea, ScrollBar };
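A small usage sketch of the new ScrollArea wrapper (the height and class names are illustrative); the citation sheet above wraps its chunk list in it the same way.

import { ScrollArea } from "@/components/ui/scroll-area";

export function ExampleScrollableList({ items }: { items: string[] }) {
	return (
		<ScrollArea className="h-64 rounded-md border">
			{items.map((item) => (
				<div key={item} className="px-4 py-2">
					{item}
				</div>
			))}
		</ScrollArea>
	);
}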
@@ -1,2 +1,3 @@
+export * from "./use-document-by-chunk";
 export * from "./use-logs";
 export * from "./useSearchSourceConnectors";
surfsense_web/hooks/use-document-by-chunk.ts (new file, 106 lines)
@@ -0,0 +1,106 @@
+"use client";
+import { useCallback, useState } from "react";
+import { toast } from "sonner";
+
+export interface Chunk {
+	id: number;
+	content: string;
+	document_id: number;
+	created_at: string;
+}
+
+export interface DocumentWithChunks {
+	id: number;
+	title: string;
+	document_type: DocumentType;
+	document_metadata: any;
+	content: string;
+	created_at: string;
+	search_space_id: number;
+	chunks: Chunk[];
+}
+
+export type DocumentType =
+	| "EXTENSION"
+	| "CRAWLED_URL"
+	| "SLACK_CONNECTOR"
+	| "NOTION_CONNECTOR"
+	| "FILE"
+	| "YOUTUBE_VIDEO"
+	| "GITHUB_CONNECTOR"
+	| "LINEAR_CONNECTOR"
+	| "DISCORD_CONNECTOR"
+	| "JIRA_CONNECTOR"
+	| "CONFLUENCE_CONNECTOR"
+	| "CLICKUP_CONNECTOR"
+	| "GOOGLE_CALENDAR_CONNECTOR"
+	| "GOOGLE_GMAIL_CONNECTOR";
+
+export function useDocumentByChunk() {
+	const [document, setDocument] = useState<DocumentWithChunks | null>(null);
+	const [loading, setLoading] = useState(false);
+	const [error, setError] = useState<string | null>(null);
+
+	const fetchDocumentByChunk = useCallback(async (chunkId: number) => {
+		try {
+			setLoading(true);
+			setError(null);
+			setDocument(null);
+
+			const response = await fetch(
+				`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
+				{
+					headers: {
+						Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
+						"Content-Type": "application/json",
+					},
+					method: "GET",
+				}
+			);
+
+			if (!response.ok) {
+				const errorText = await response.text();
+				let errorMessage = "Failed to fetch document";
+
+				try {
+					const errorData = JSON.parse(errorText);
+					errorMessage = errorData.detail || errorMessage;
+				} catch {
+					// If parsing fails, use default message
+				}
+
+				if (response.status === 404) {
+					errorMessage = "Chunk not found or you don't have access to it";
+				}
+
+				toast.error(errorMessage);
+				throw new Error(errorMessage);
+			}
+
+			const data: DocumentWithChunks = await response.json();
+			setDocument(data);
+			setError(null);
+			return data;
+		} catch (err: any) {
+			const errorMessage = err.message || "Failed to fetch document";
+			setError(errorMessage);
+			console.error("Error fetching document by chunk:", err);
+			throw err;
+		} finally {
+			setLoading(false);
+		}
+	}, []);
+
+	const clearDocument = useCallback(() => {
+		setDocument(null);
+		setError(null);
+	}, []);
+
+	return {
+		document,
+		loading,
+		error,
+		fetchDocumentByChunk,
+		clearDocument,
+	};
+}
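A minimal sketch of consuming the hook from a component; the chunk id, labels, and markup are illustrative, while the returned fields match the hook above.

import { useDocumentByChunk } from "@/hooks/use-document-by-chunk";

export function ChunkSourcePreview({ chunkId }: { chunkId: number }) {
	const { document, loading, error, fetchDocumentByChunk, clearDocument } = useDocumentByChunk();

	return (
		<div>
			<button type="button" onClick={() => fetchDocumentByChunk(chunkId)}>
				Load source
			</button>
			<button type="button" onClick={clearDocument}>
				Clear
			</button>
			{loading && <p>Loading…</p>}
			{error && <p>{error}</p>}
			{document && (
				<ul>
					{document.chunks.map((chunk) => (
						<li key={chunk.id}>{chunk.content}</li>
					))}
				</ul>
			)}
		</div>
	);
}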
@@ -12,11 +12,7 @@ const nextConfig: NextConfig = {
 		remotePatterns: [
 			{
 				protocol: "https",
-				hostname: "images.unsplash.com",
-			},
-			{
-				protocol: "https",
-				hostname: "static.vecteezy.com",
+				hostname: "**",
 			},
 		],
 	},