Mirror of https://github.com/MODSetter/SurfSense.git, synced 2025-09-01 10:09:08 +00:00
feat: added jump to source referencing of citations
Parent: 9b91bea51d
Commit: 76732c36ba
8 changed files with 818 additions and 559 deletions
@ -5,10 +5,24 @@ from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Up
from litellm import atranscription
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload

from app.config import config as app_config
from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session
from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate
from app.db import (
    Chunk,
    Document,
    DocumentType,
    Log,
    SearchSpace,
    User,
    get_async_session,
)
from app.schemas import (
    DocumentRead,
    DocumentsCreate,
    DocumentUpdate,
    DocumentWithChunksRead,
)
from app.services.task_logging_service import TaskLoggingService
from app.tasks.document_processors import (
    add_crawled_url_document,
@ -140,6 +154,423 @@ async def create_documents_file_upload(
        ) from e


@router.get("/documents/", response_model=list[DocumentRead])
async def read_documents(
    skip: int = 0,
    limit: int = 3000,
    search_space_id: int | None = None,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    try:
        query = (
            select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
        )

        # Filter by search_space_id if provided
        if search_space_id is not None:
            query = query.filter(Document.search_space_id == search_space_id)

        result = await session.execute(query.offset(skip).limit(limit))
        db_documents = result.scalars().all()

        # Convert database objects to API-friendly format
        api_documents = []
        for doc in db_documents:
            api_documents.append(
                DocumentRead(
                    id=doc.id,
                    title=doc.title,
                    document_type=doc.document_type,
                    document_metadata=doc.document_metadata,
                    content=doc.content,
                    created_at=doc.created_at,
                    search_space_id=doc.search_space_id,
                )
            )

        return api_documents
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to fetch documents: {e!s}"
        ) from e


@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
    document_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    try:
        result = await session.execute(
            select(Document)
            .join(SearchSpace)
            .filter(Document.id == document_id, SearchSpace.user_id == user.id)
        )
        document = result.scalars().first()

        if not document:
            raise HTTPException(
                status_code=404, detail=f"Document with id {document_id} not found"
            )

        # Convert database object to API-friendly format
        return DocumentRead(
            id=document.id,
            title=document.title,
            document_type=document.document_type,
            document_metadata=document.document_metadata,
            content=document.content,
            created_at=document.created_at,
            search_space_id=document.search_space_id,
        )
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to fetch document: {e!s}"
        ) from e


@router.put("/documents/{document_id}", response_model=DocumentRead)
async def update_document(
    document_id: int,
    document_update: DocumentUpdate,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    try:
        # Query the document directly instead of using read_document function
        result = await session.execute(
            select(Document)
            .join(SearchSpace)
            .filter(Document.id == document_id, SearchSpace.user_id == user.id)
        )
        db_document = result.scalars().first()

        if not db_document:
            raise HTTPException(
                status_code=404, detail=f"Document with id {document_id} not found"
            )

        update_data = document_update.model_dump(exclude_unset=True)
        for key, value in update_data.items():
            setattr(db_document, key, value)
        await session.commit()
        await session.refresh(db_document)

        # Convert to DocumentRead for response
        return DocumentRead(
            id=db_document.id,
            title=db_document.title,
            document_type=db_document.document_type,
            document_metadata=db_document.document_metadata,
            content=db_document.content,
            created_at=db_document.created_at,
            search_space_id=db_document.search_space_id,
        )
    except HTTPException:
        raise
    except Exception as e:
        await session.rollback()
        raise HTTPException(
            status_code=500, detail=f"Failed to update document: {e!s}"
        ) from e


@router.delete("/documents/{document_id}", response_model=dict)
async def delete_document(
    document_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    try:
        # Query the document directly instead of using read_document function
        result = await session.execute(
            select(Document)
            .join(SearchSpace)
            .filter(Document.id == document_id, SearchSpace.user_id == user.id)
        )
        document = result.scalars().first()

        if not document:
            raise HTTPException(
                status_code=404, detail=f"Document with id {document_id} not found"
            )

        await session.delete(document)
        await session.commit()
        return {"message": "Document deleted successfully"}
    except HTTPException:
        raise
    except Exception as e:
        await session.rollback()
        raise HTTPException(
            status_code=500, detail=f"Failed to delete document: {e!s}"
        ) from e


@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
async def get_document_by_chunk_id(
    chunk_id: int,
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """
    Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
    The document's embedding and chunk embeddings are excluded from the response.
    """
    try:
        # First, get the chunk and verify it exists
        chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
        chunk = chunk_result.scalars().first()

        if not chunk:
            raise HTTPException(
                status_code=404, detail=f"Chunk with id {chunk_id} not found"
            )

        # Get the associated document and verify ownership
        document_result = await session.execute(
            select(Document)
            .options(selectinload(Document.chunks))
            .join(SearchSpace)
            .filter(Document.id == chunk.document_id, SearchSpace.user_id == user.id)
        )
        document = document_result.scalars().first()

        if not document:
            raise HTTPException(
                status_code=404,
                detail="Document not found or you don't have access to it",
            )

        # Sort chunks by creation time
        sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)

        # Return the document with its chunks
        return DocumentWithChunksRead(
            id=document.id,
            title=document.title,
            document_type=document.document_type,
            document_metadata=document.document_metadata,
            content=document.content,
            created_at=document.created_at,
            search_space_id=document.search_space_id,
            chunks=sorted_chunks,
        )
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to retrieve document: {e!s}"
        ) from e
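The docstring above describes the jump-to-source lookup: given a chunk id, the endpoint returns the owning document with all of its chunks, ordered by creation time and scoped to the current user's search spaces. A minimal sketch of calling it from the web app, assuming the same backend URL variable and bearer-token storage key that the use-document-by-chunk hook later in this commit uses; the chunk id is illustrative:

// Sketch only: resolve citation chunk 42 to its parent document.
const chunkId = 42; // illustrative; in the UI this comes from the citation node's id
const res = await fetch(
  `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
  { headers: { Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}` } }
);
if (!res.ok) throw new Error(`Lookup failed: ${res.status}`);
const doc = await res.json(); // shape matches DocumentWithChunksRead: { id, title, chunks: [...], ... }
console.log(doc.chunks.length, "chunks for", doc.title);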
async def process_extension_document_with_new_session(
    individual_document, search_space_id: int, user_id: str
):
    """Create a new session and process extension document."""
    from app.db import async_session_maker
    from app.services.task_logging_service import TaskLoggingService

    async with async_session_maker() as session:
        # Initialize task logging service
        task_logger = TaskLoggingService(session, search_space_id)

        # Log task start
        log_entry = await task_logger.log_task_start(
            task_name="process_extension_document",
            source="document_processor",
            message=f"Starting processing of extension document from {individual_document.metadata.VisitedWebPageTitle}",
            metadata={
                "document_type": "EXTENSION",
                "url": individual_document.metadata.VisitedWebPageURL,
                "title": individual_document.metadata.VisitedWebPageTitle,
                "user_id": user_id,
            },
        )

        try:
            result = await add_extension_received_document(
                session, individual_document, search_space_id, user_id
            )

            if result:
                await task_logger.log_task_success(
                    log_entry,
                    f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}",
                    {"document_id": result.id, "content_hash": result.content_hash},
                )
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}",
                    {"duplicate_detected": True},
                )
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}",
                str(e),
                {"error_type": type(e).__name__},
            )
            import logging

            logging.error(f"Error processing extension document: {e!s}")


async def process_crawled_url_with_new_session(
    url: str, search_space_id: int, user_id: str
):
    """Create a new session and process crawled URL."""
    from app.db import async_session_maker
    from app.services.task_logging_service import TaskLoggingService

    async with async_session_maker() as session:
        # Initialize task logging service
        task_logger = TaskLoggingService(session, search_space_id)

        # Log task start
        log_entry = await task_logger.log_task_start(
            task_name="process_crawled_url",
            source="document_processor",
            message=f"Starting URL crawling and processing for: {url}",
            metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
        )

        try:
            result = await add_crawled_url_document(
                session, url, search_space_id, user_id
            )

            if result:
                await task_logger.log_task_success(
                    log_entry,
                    f"Successfully crawled and processed URL: {url}",
                    {
                        "document_id": result.id,
                        "title": result.title,
                        "content_hash": result.content_hash,
                    },
                )
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"URL document already exists (duplicate): {url}",
                    {"duplicate_detected": True},
                )
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to crawl URL: {url}",
                str(e),
                {"error_type": type(e).__name__},
            )
            import logging

            logging.error(f"Error processing crawled URL: {e!s}")


async def process_file_in_background_with_new_session(
    file_path: str, filename: str, search_space_id: int, user_id: str
):
    """Create a new session and process file."""
    from app.db import async_session_maker
    from app.services.task_logging_service import TaskLoggingService

    async with async_session_maker() as session:
        # Initialize task logging service
        task_logger = TaskLoggingService(session, search_space_id)

        # Log task start
        log_entry = await task_logger.log_task_start(
            task_name="process_file_upload",
            source="document_processor",
            message=f"Starting file processing for: {filename}",
            metadata={
                "document_type": "FILE",
                "filename": filename,
                "file_path": file_path,
                "user_id": user_id,
            },
        )

        try:
            await process_file_in_background(
                file_path,
                filename,
                search_space_id,
                user_id,
                session,
                task_logger,
                log_entry,
            )

            # Note: success/failure logging is handled within process_file_in_background
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to process file: {filename}",
                str(e),
                {"error_type": type(e).__name__},
            )
            import logging

            logging.error(f"Error processing file: {e!s}")


async def process_youtube_video_with_new_session(
    url: str, search_space_id: int, user_id: str
):
    """Create a new session and process YouTube video."""
    from app.db import async_session_maker
    from app.services.task_logging_service import TaskLoggingService

    async with async_session_maker() as session:
        # Initialize task logging service
        task_logger = TaskLoggingService(session, search_space_id)

        # Log task start
        log_entry = await task_logger.log_task_start(
            task_name="process_youtube_video",
            source="document_processor",
            message=f"Starting YouTube video processing for: {url}",
            metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id},
        )

        try:
            result = await add_youtube_video_document(
                session, url, search_space_id, user_id
            )

            if result:
                await task_logger.log_task_success(
                    log_entry,
                    f"Successfully processed YouTube video: {result.title}",
                    {
                        "document_id": result.id,
                        "video_id": result.document_metadata.get("video_id"),
                        "content_hash": result.content_hash,
                    },
                )
            else:
                await task_logger.log_task_success(
                    log_entry,
                    f"YouTube video document already exists (duplicate): {url}",
                    {"duplicate_detected": True},
                )
        except Exception as e:
            await task_logger.log_task_failure(
                log_entry,
                f"Failed to process YouTube video: {url}",
                str(e),
                {"error_type": type(e).__name__},
            )
            import logging

            logging.error(f"Error processing YouTube video: {e!s}")


async def process_file_in_background(
    file_path: str,
    filename: str,
@ -508,363 +939,3 @@ async def process_file_in_background(
        logging.error(f"Error processing file in background: {e!s}")
        raise  # Re-raise so the wrapper can also handle it

(The rest of this hunk removes the old copies of the document endpoints and the *_with_new_session helpers, identical to the versions shown above, from their previous location later in the file.)
@ -13,6 +13,7 @@ from .documents import (
    DocumentRead,
    DocumentsCreate,
    DocumentUpdate,
    DocumentWithChunksRead,
    ExtensionDocumentContent,
    ExtensionDocumentMetadata,
)
@ -53,6 +54,7 @@ __all__ = [
    "DocumentBase",
    "DocumentRead",
    "DocumentUpdate",
    "DocumentWithChunksRead",
    "DocumentsCreate",
    "ExtensionDocumentContent",
    "ExtensionDocumentMetadata",
@ -4,6 +4,8 @@ from pydantic import BaseModel, ConfigDict

from app.db import DocumentType

from .chunks import ChunkRead


class ExtensionDocumentMetadata(BaseModel):
    BrowsingSessionId: str
@ -45,3 +47,9 @@ class DocumentRead(BaseModel):
    search_space_id: int

    model_config = ConfigDict(from_attributes=True)


class DocumentWithChunksRead(DocumentRead):
    chunks: list[ChunkRead] = []

    model_config = ConfigDict(from_attributes=True)
@ -1,58 +1,202 @@
"use client";

import { ExternalLink } from "lucide-react";
import { ExternalLink, FileText, Loader2 } from "lucide-react";
import type React from "react";
import { useEffect, useRef, useState } from "react";
import { MarkdownViewer } from "@/components/markdown-viewer";
import { Button } from "@/components/ui/button";
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
import { ScrollArea } from "@/components/ui/scroll-area";
import {
  Sheet,
  SheetContent,
  SheetDescription,
  SheetHeader,
  SheetTitle,
  SheetTrigger,
} from "@/components/ui/sheet";
import { useDocumentByChunk } from "@/hooks/use-document-by-chunk";
import { cn } from "@/lib/utils";

export const CitationDisplay: React.FC<{ index: number; node: any }> = ({ index, node }) => {
  const truncateText = (text: string, maxLength: number = 200) => {
    if (text.length <= maxLength) return text;
    return `${text.substring(0, maxLength)}...`;
  const chunkId = Number(node?.id);
  const sourceType = node?.metadata?.source_type;
  const [isOpen, setIsOpen] = useState(false);
  const { document, loading, error, fetchDocumentByChunk, clearDocument } = useDocumentByChunk();
  const chunksContainerRef = useRef<HTMLDivElement>(null);
  const highlightedChunkRef = useRef<HTMLDivElement>(null);

  // Check if this is a source type that should render directly from node
  const isDirectRenderSource = sourceType === "TAVILY_API" || sourceType === "LINKUP_API";

  const handleOpenChange = async (open: boolean) => {
    setIsOpen(open);
    if (open && chunkId && !isDirectRenderSource) {
      await fetchDocumentByChunk(chunkId);
    } else if (!open && !isDirectRenderSource) {
      clearDocument();
    }
  };

  useEffect(() => {
    // Scroll to highlighted chunk when document loads
    if (document && highlightedChunkRef.current && chunksContainerRef.current) {
      setTimeout(() => {
        highlightedChunkRef.current?.scrollIntoView({
          behavior: "smooth",
          block: "start",
        });
      }, 100);
    }
  }, [document]);

  const handleUrlClick = (e: React.MouseEvent, url: string) => {
    e.preventDefault();
    e.stopPropagation();
    window.open(url, "_blank", "noopener,noreferrer");
  };

  const formatDocumentType = (type: string) => {
    return type
      .split("_")
      .map((word) => word.charAt(0) + word.slice(1).toLowerCase())
      .join(" ");
  };

  return (
    <Popover>
      <PopoverTrigger asChild>
    <Sheet open={isOpen} onOpenChange={handleOpenChange}>
      <SheetTrigger asChild>
        <span className="text-[10px] font-bold bg-slate-500 hover:bg-slate-600 text-white rounded-full w-4 h-4 inline-flex items-center justify-center align-super cursor-pointer transition-colors">
          {index + 1}
        </span>
      </PopoverTrigger>
      <PopoverContent className="w-80 p-4 space-y-3 relative" align="start">
        {/* External Link Button - Top Right */}
        {node?.url && (
          <Button
            size="icon"
            variant="ghost"
            onClick={(e) => handleUrlClick(e, node.url)}
            className="absolute top-3 right-3 inline-flex items-center justify-center w-6 h-6 text-blue-600 hover:text-blue-800 dark:text-blue-400 dark:hover:text-blue-200 hover:bg-blue-50 dark:hover:bg-blue-900/20 rounded transition-colors"
            title="Open in new tab"
          >
            <ExternalLink size={14} />
          </Button>
      </SheetTrigger>
      <SheetContent side="right" className="w-full sm:max-w-5xl lg:max-w-7xl">
        <SheetHeader className="px-6 py-4 border-b">
          <SheetTitle className="flex items-center gap-3 text-lg">
            <FileText className="h-6 w-6" />
            {document?.title || node?.metadata?.title || node?.metadata?.group_name || "Source"}
          </SheetTitle>
          <SheetDescription className="text-base mt-2">
            {document
              ? formatDocumentType(document.document_type)
              : sourceType && formatDocumentType(sourceType)}
          </SheetDescription>
        </SheetHeader>

        {!isDirectRenderSource && loading && (
          <div className="flex items-center justify-center h-64 px-6">
            <Loader2 className="h-8 w-8 animate-spin text-muted-foreground" />
          </div>
        )}

        {/* Heading */}
        <div className="text-sm font-semibold text-slate-900 dark:text-slate-100 pr-8">
          {node?.metadata?.group_name || "Source"}
        </div>
        {!isDirectRenderSource && error && (
          <div className="flex items-center justify-center h-64 px-6">
            <p className="text-sm text-destructive">{error}</p>
          </div>
        )}

        {/* Source */}
        <div className="text-xs text-slate-600 dark:text-slate-400 font-medium">
          {node?.metadata?.title || "Untitled"}
        </div>
        {/* Direct render for TAVILY_API and LINEAR_API */}
        {isDirectRenderSource && (
          <ScrollArea className="h-[calc(100vh-10rem)]">
            <div className="px-6 py-4">
              {/* External Link */}
              {node?.url && (
                <div className="mb-8">
                  <Button
                    size="default"
                    variant="outline"
                    onClick={(e) => handleUrlClick(e, node.url)}
                    className="w-full py-3"
                  >
                    <ExternalLink className="mr-2 h-4 w-4" />
                    Open in Browser
                  </Button>
                </div>
              )}

              {/* Body */}
              <div className="text-xs text-slate-700 dark:text-slate-300 leading-relaxed">
                {truncateText(node?.text || "No content available")}
              </div>
      </PopoverContent>
    </Popover>
              {/* Source Information */}
              <div className="mb-8 p-6 bg-muted/50 rounded-lg border">
                <h3 className="text-base font-semibold mb-4">Source Information</h3>
                <div className="text-sm text-muted-foreground mb-3 font-medium">
                  {node?.metadata?.title || "Untitled"}
                </div>
                <div className="text-sm text-foreground leading-relaxed whitespace-pre-wrap">
                  {node?.text || "No content available"}
                </div>
              </div>
            </div>
          </ScrollArea>
        )}

        {/* API-fetched document content */}
        {!isDirectRenderSource && document && (
          <ScrollArea className="h-[calc(100vh-10rem)]">
            <div className="px-6 py-4">
              {/* Document Metadata */}
              {document.document_metadata && Object.keys(document.document_metadata).length > 0 && (
                <div className="mb-8 p-6 bg-muted/50 rounded-lg border">
                  <h3 className="text-base font-semibold mb-4">Document Information</h3>
                  <dl className="grid grid-cols-1 gap-3 text-sm">
                    {Object.entries(document.document_metadata).map(([key, value]) => (
                      <div key={key} className="flex gap-3">
                        <dt className="font-medium text-muted-foreground capitalize min-w-0 flex-shrink-0">
                          {key.replace(/_/g, " ")}:
                        </dt>
                        <dd className="text-foreground break-words">{String(value)}</dd>
                      </div>
                    ))}
                  </dl>
                </div>
              )}

              {/* External Link */}
              {node?.url && (
                <div className="mb-8">
                  <Button
                    size="default"
                    variant="outline"
                    onClick={(e) => handleUrlClick(e, node.url)}
                    className="w-full py-3"
                  >
                    <ExternalLink className="mr-2 h-4 w-4" />
                    Open in Browser
                  </Button>
                </div>
              )}

              {/* Chunks */}
              <div className="space-y-6" ref={chunksContainerRef}>
                <h3 className="text-base font-semibold mb-4">Document Content</h3>
                {document.chunks.map((chunk, idx) => (
                  <div
                    key={chunk.id}
                    ref={chunk.id === chunkId ? highlightedChunkRef : null}
                    className={cn(
                      "p-6 rounded-lg border transition-all duration-300",
                      chunk.id === chunkId
                        ? "bg-primary/10 border-primary shadow-md ring-1 ring-primary/20"
                        : "bg-background border-border hover:bg-muted/50 hover:border-muted-foreground/20"
                    )}
                  >
                    <div className="mb-4 flex items-center justify-between">
                      <span className="text-sm font-medium text-muted-foreground">
                        Chunk {idx + 1} of {document.chunks.length}
                      </span>
                      {chunk.id === chunkId && (
                        <span className="text-sm font-medium text-primary bg-primary/10 px-3 py-1 rounded-full">
                          Referenced Chunk
                        </span>
                      )}
                    </div>
                    <div className="text-sm text-foreground whitespace-pre-wrap leading-relaxed">
                      <MarkdownViewer content={chunk.content} className="max-w-fit" />
                    </div>
                  </div>
                ))}
              </div>
            </div>
          </ScrollArea>
        )}
      </SheetContent>
    </Sheet>
  );
};
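The reworked CitationDisplay above reads the citation's node.id as a chunk id: clicking the numbered superscript opens a Sheet, fetches the parent document via useDocumentByChunk (unless the source type renders directly, such as TAVILY_API or LINKUP_API), and scrolls the referenced chunk into view. A hedged usage sketch; the import path and the node values below are illustrative, not taken from this diff:

// Illustrative only: the component path and node contents are assumptions.
import { CitationDisplay } from "@/components/chat/CitationDisplay";

const node = {
  id: 42, // chunk id used for the by-chunk lookup
  text: "Cited passage…",
  url: "https://example.com/page",
  metadata: { source_type: "CRAWLED_URL", title: "Example page", group_name: "Crawled URLs" },
};

export function Example() {
  return <CitationDisplay index={0} node={node} />;
}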
@ -1,7 +1,7 @@
import { Check, Copy } from "lucide-react";
import Image from "next/image";
import { useTheme } from "next-themes";
import React, { useEffect, useMemo, useRef, useState } from "react";
import { useEffect, useMemo, useRef, useState } from "react";
import ReactMarkdown from "react-markdown";
import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
import { oneDark, oneLight } from "react-syntax-highlighter/dist/cjs/styles/prism";
@ -10,105 +10,51 @@ import rehypeSanitize from "rehype-sanitize";
import remarkGfm from "remark-gfm";
import { Button } from "@/components/ui/button";
import { cn } from "@/lib/utils";
import { Citation } from "./chat/Citation";
import type { Source } from "./chat/types";
import CopyButton from "./copy-button";

interface MarkdownViewerProps {
  content: string;
  className?: string;
  getCitationSource?: (id: number) => Source | null;
  type?: "user" | "ai";
}

export function MarkdownViewer({
  content,
  className,
  getCitationSource,
  type = "user",
}: MarkdownViewerProps) {
export function MarkdownViewer({ content, className }: MarkdownViewerProps) {
  const ref = useRef<HTMLDivElement>(null);
  // Memoize the markdown components to prevent unnecessary re-renders
  const components = useMemo(() => {
    return {
      // Define custom components for markdown elements
      p: ({ node, children, ...props }: any) => {
        // If there's no getCitationSource function, just render normally
        if (!getCitationSource) {
          return (
            <p className="my-2" {...props}>
              {children}
            </p>
          );
        }

        // Process citations within paragraph content
        return (
          <p className="my-2" {...props}>
            {processCitationsInReactChildren(children, getCitationSource)}
          </p>
        );
      },
      a: ({ node, children, ...props }: any) => {
        // Process citations within link content if needed
        const processedChildren = getCitationSource
          ? processCitationsInReactChildren(children, getCitationSource)
          : children;
        return (
          <a className="text-primary hover:underline" {...props}>
            {processedChildren}
          </a>
        );
      },
      li: ({ node, children, ...props }: any) => {
        // Process citations within list item content
        const processedChildren = getCitationSource
          ? processCitationsInReactChildren(children, getCitationSource)
          : children;
        return <li {...props}>{processedChildren}</li>;
      },
      p: ({ node, children, ...props }: any) => (
        <p className="my-2" {...props}>
          {children}
        </p>
      ),
      a: ({ node, children, ...props }: any) => (
        <a className="text-primary hover:underline" {...props}>
          {children}
        </a>
      ),
      li: ({ node, children, ...props }: any) => <li {...props}>{children}</li>,
      ul: ({ node, ...props }: any) => <ul className="list-disc pl-5 my-2" {...props} />,
      ol: ({ node, ...props }: any) => <ol className="list-decimal pl-5 my-2" {...props} />,
      h1: ({ node, children, ...props }: any) => {
        const processedChildren = getCitationSource
          ? processCitationsInReactChildren(children, getCitationSource)
          : children;
        return (
          <h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
            {processedChildren}
          </h1>
        );
      },
      h2: ({ node, children, ...props }: any) => {
        const processedChildren = getCitationSource
          ? processCitationsInReactChildren(children, getCitationSource)
          : children;
        return (
          <h2 className="text-xl font-bold mt-5 mb-2" {...props}>
            {processedChildren}
          </h2>
        );
      },
      h3: ({ node, children, ...props }: any) => {
        const processedChildren = getCitationSource
          ? processCitationsInReactChildren(children, getCitationSource)
          : children;
        return (
          <h3 className="text-lg font-bold mt-4 mb-2" {...props}>
            {processedChildren}
          </h3>
        );
      },
      h4: ({ node, children, ...props }: any) => {
        const processedChildren = getCitationSource
          ? processCitationsInReactChildren(children, getCitationSource)
          : children;
        return (
          <h4 className="text-base font-bold mt-3 mb-1" {...props}>
            {processedChildren}
          </h4>
        );
      },
      h1: ({ node, children, ...props }: any) => (
        <h1 className="text-2xl font-bold mt-6 mb-2" {...props}>
          {children}
        </h1>
      ),
      h2: ({ node, children, ...props }: any) => (
        <h2 className="text-xl font-bold mt-5 mb-2" {...props}>
          {children}
        </h2>
      ),
      h3: ({ node, children, ...props }: any) => (
        <h3 className="text-lg font-bold mt-4 mb-2" {...props}>
          {children}
        </h3>
      ),
      h4: ({ node, children, ...props }: any) => (
        <h4 className="text-base font-bold mt-3 mb-1" {...props}>
          {children}
        </h4>
      ),
      blockquote: ({ node, ...props }: any) => (
        <blockquote className="border-l-4 border-muted pl-4 italic my-2" {...props} />
      ),

@ -154,7 +100,7 @@ export function MarkdownViewer({
      );
    },
  };
  }, [getCitationSource]);
  }, []);

  return (
    <div className={cn("prose prose-sm dark:prose-invert max-w-none", className)} ref={ref}>

@ -165,7 +111,6 @@ export function MarkdownViewer({
      >
        {content}
      </ReactMarkdown>
      {type === "ai" && <CopyButton ref={ref} />}
    </div>
  );
}
@ -267,77 +212,3 @@ const CodeBlock = ({ children, language }: { children: string; language: string
    </div>
  );
};

// Helper function to process citations within React children
const processCitationsInReactChildren = (
  children: React.ReactNode,
  getCitationSource: (id: number) => Source | null
): React.ReactNode => {
  // If children is not an array or string, just return it
  if (!children || (typeof children !== "string" && !Array.isArray(children))) {
    return children;
  }

  // Handle string content directly - this is where we process citation references
  if (typeof children === "string") {
    return processCitationsInText(children, getCitationSource);
  }

  // Handle arrays of children recursively
  if (Array.isArray(children)) {
    return React.Children.map(children, (child) => {
      if (typeof child === "string") {
        return processCitationsInText(child, getCitationSource);
      }
      return child;
    });
  }

  return children;
};

// Process citation references in text content
const processCitationsInText = (
  text: string,
  getCitationSource: (id: number) => Source | null
): React.ReactNode[] => {
  // Use improved regex to catch citation numbers more reliably
  // This will match patterns like [1], [42], etc. including when they appear at the end of a line or sentence
  const citationRegex = /\[(\d+)\]/g;
  const parts: React.ReactNode[] = [];
  let lastIndex = 0;
  let match: RegExpExecArray | null = citationRegex.exec(text);
  let position = 0;

  while (match !== null) {
    // Add text before the citation
    if (match.index > lastIndex) {
      parts.push(text.substring(lastIndex, match.index));
    }

    // Add the citation component
    const citationId = parseInt(match[1], 10);
    const source = getCitationSource(citationId);

    parts.push(
      <Citation
        key={`citation-${citationId}-${position}`}
        citationId={citationId}
        citationText={match[0]}
        position={position}
        source={source}
      />
    );

    lastIndex = match.index + match[0].length;
    position++;
    match = citationRegex.exec(text);
  }

  // Add any remaining text after the last citation
  if (lastIndex < text.length) {
    parts.push(text.substring(lastIndex));
  }

  return parts;
};
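With this hunk, MarkdownViewer no longer scans text for [n] markers or accepts the getCitationSource and type props; citation rendering now lives in the citation component, and the viewer only needs content plus an optional className, as in the chunk rendering above. A minimal sketch of the simplified API (the wrapping component name is illustrative):

// Minimal sketch of the simplified viewer API after this change.
import { MarkdownViewer } from "@/components/markdown-viewer";

export function ChunkPreview({ markdown }: { markdown: string }) {
  return <MarkdownViewer content={markdown} className="max-w-fit" />;
}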
surfsense_web/components/ui/scroll-area.tsx (new file, 56 lines)
@ -0,0 +1,56 @@
"use client";

import * as ScrollAreaPrimitive from "@radix-ui/react-scroll-area";
import type * as React from "react";

import { cn } from "@/lib/utils";

function ScrollArea({
  className,
  children,
  ...props
}: React.ComponentProps<typeof ScrollAreaPrimitive.Root>) {
  return (
    <ScrollAreaPrimitive.Root
      data-slot="scroll-area"
      className={cn("relative", className)}
      {...props}
    >
      <ScrollAreaPrimitive.Viewport
        data-slot="scroll-area-viewport"
        className="focus-visible:ring-ring/50 size-full rounded-[inherit] transition-[color,box-shadow] outline-none focus-visible:ring-[3px] focus-visible:outline-1"
      >
        {children}
      </ScrollAreaPrimitive.Viewport>
      <ScrollBar />
      <ScrollAreaPrimitive.Corner />
    </ScrollAreaPrimitive.Root>
  );
}

function ScrollBar({
  className,
  orientation = "vertical",
  ...props
}: React.ComponentProps<typeof ScrollAreaPrimitive.ScrollAreaScrollbar>) {
  return (
    <ScrollAreaPrimitive.ScrollAreaScrollbar
      data-slot="scroll-area-scrollbar"
      orientation={orientation}
      className={cn(
        "flex touch-none p-px transition-colors select-none",
        orientation === "vertical" && "h-full w-2.5 border-l border-l-transparent",
        orientation === "horizontal" && "h-2.5 flex-col border-t border-t-transparent",
        className
      )}
      {...props}
    >
      <ScrollAreaPrimitive.ScrollAreaThumb
        data-slot="scroll-area-thumb"
        className="bg-border relative flex-1 rounded-full"
      />
    </ScrollAreaPrimitive.ScrollAreaScrollbar>
  );
}

export { ScrollArea, ScrollBar };
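The new ScrollArea wrapper is what CitationDisplay uses to keep long documents scrollable inside the sheet. A small usage sketch, mirroring that pattern (the component and data below are illustrative):

// Illustrative: constrain a long list to the viewport and let ScrollArea handle scrolling.
import { ScrollArea } from "@/components/ui/scroll-area";

export function ChunkList({ items }: { items: string[] }) {
  return (
    <ScrollArea className="h-[calc(100vh-10rem)]">
      {items.map((text, i) => (
        <p key={i} className="p-4">
          {text}
        </p>
      ))}
    </ScrollArea>
  );
}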
@ -1,2 +1,3 @@
export * from "./use-document-by-chunk";
export * from "./use-logs";
export * from "./useSearchSourceConnectors";
surfsense_web/hooks/use-document-by-chunk.ts (new file, 106 lines)
@ -0,0 +1,106 @@
"use client";
import { useCallback, useState } from "react";
import { toast } from "sonner";

export interface Chunk {
  id: number;
  content: string;
  document_id: number;
  created_at: string;
}

export interface DocumentWithChunks {
  id: number;
  title: string;
  document_type: DocumentType;
  document_metadata: any;
  content: string;
  created_at: string;
  search_space_id: number;
  chunks: Chunk[];
}

export type DocumentType =
  | "EXTENSION"
  | "CRAWLED_URL"
  | "SLACK_CONNECTOR"
  | "NOTION_CONNECTOR"
  | "FILE"
  | "YOUTUBE_VIDEO"
  | "GITHUB_CONNECTOR"
  | "LINEAR_CONNECTOR"
  | "DISCORD_CONNECTOR"
  | "JIRA_CONNECTOR"
  | "CONFLUENCE_CONNECTOR"
  | "CLICKUP_CONNECTOR"
  | "GOOGLE_CALENDAR_CONNECTOR"
  | "GOOGLE_GMAIL_CONNECTOR";

export function useDocumentByChunk() {
  const [document, setDocument] = useState<DocumentWithChunks | null>(null);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);

  const fetchDocumentByChunk = useCallback(async (chunkId: number) => {
    try {
      setLoading(true);
      setError(null);
      setDocument(null);

      const response = await fetch(
        `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/documents/by-chunk/${chunkId}`,
        {
          headers: {
            Authorization: `Bearer ${localStorage.getItem("surfsense_bearer_token")}`,
            "Content-Type": "application/json",
          },
          method: "GET",
        }
      );

      if (!response.ok) {
        const errorText = await response.text();
        let errorMessage = "Failed to fetch document";

        try {
          const errorData = JSON.parse(errorText);
          errorMessage = errorData.detail || errorMessage;
        } catch {
          // If parsing fails, use default message
        }

        if (response.status === 404) {
          errorMessage = "Chunk not found or you don't have access to it";
        }

        toast.error(errorMessage);
        throw new Error(errorMessage);
      }

      const data: DocumentWithChunks = await response.json();
      setDocument(data);
      setError(null);
      return data;
    } catch (err: any) {
      const errorMessage = err.message || "Failed to fetch document";
      setError(errorMessage);
      console.error("Error fetching document by chunk:", err);
      throw err;
    } finally {
      setLoading(false);
    }
  }, []);

  const clearDocument = useCallback(() => {
    setDocument(null);
    setError(null);
  }, []);

  return {
    document,
    loading,
    error,
    fetchDocumentByChunk,
    clearDocument,
  };
}
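useDocumentByChunk wraps the by-chunk endpoint with loading/error state and a clearDocument reset, which is how CitationDisplay drives the sheet. A hedged sketch of consuming it from another component (the component itself is illustrative; only the hook's returned values are used):

// Illustrative consumer of the new hook.
import { useDocumentByChunk } from "@/hooks/use-document-by-chunk";

export function JumpToSource({ chunkId }: { chunkId: number }) {
  const { document, loading, error, fetchDocumentByChunk, clearDocument } = useDocumentByChunk();

  return (
    <div>
      <button onClick={() => fetchDocumentByChunk(chunkId)} disabled={loading}>
        {loading ? "Loading…" : "Jump to source"}
      </button>
      {error && <p>{error}</p>}
      {document && (
        <p>
          {document.title}: {document.chunks.length} chunks
        </p>
      )}
      {document && <button onClick={clearDocument}>Close</button>}
    </div>
  );
}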