Fixed all ruff lint and formatting errors

Utkarsh-Patel-13 2025-07-24 14:43:48 -07:00
parent 0a03c42cc5
commit d359a59f6d
85 changed files with 5520 additions and 3870 deletions


@@ -1,23 +1,35 @@
from litellm import atranscription
from fastapi import APIRouter, Depends, BackgroundTasks, UploadFile, Form, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from typing import List
from app.db import Log, get_async_session, User, SearchSpace, Document, DocumentType
from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
from app.tasks.background_tasks import add_received_markdown_file_document, add_extension_received_document, add_received_file_document_using_unstructured, add_crawled_url_document, add_youtube_video_document, add_received_file_document_using_llamacloud, add_received_file_document_using_docling
from app.config import config as app_config
# Force asyncio to use standard event loop before unstructured imports
import asyncio
from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, UploadFile
from litellm import atranscription
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from app.config import config as app_config
from app.db import Document, DocumentType, Log, SearchSpace, User, get_async_session
from app.schemas import DocumentRead, DocumentsCreate, DocumentUpdate
from app.services.task_logging_service import TaskLoggingService
from app.tasks.background_tasks import (
add_crawled_url_document,
add_extension_received_document,
add_received_file_document_using_docling,
add_received_file_document_using_llamacloud,
add_received_file_document_using_unstructured,
add_received_markdown_file_document,
add_youtube_video_document,
)
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
try:
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
except RuntimeError:
except RuntimeError as e:
print("Error setting event loop policy", e)
pass
import os
os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"
@@ -29,7 +41,7 @@ async def create_documents(
request: DocumentsCreate,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
fastapi_background_tasks: BackgroundTasks = BackgroundTasks()
fastapi_background_tasks: BackgroundTasks = BackgroundTasks(),
):
try:
# Check if the user owns the search space
@@ -41,7 +53,7 @@ async def create_documents(
process_extension_document_with_new_session,
individual_document,
request.search_space_id,
str(user.id)
str(user.id),
)
elif request.document_type == DocumentType.CRAWLED_URL:
for url in request.content:
@@ -49,7 +61,7 @@ async def create_documents(
process_crawled_url_with_new_session,
url,
request.search_space_id,
str(user.id)
str(user.id),
)
elif request.document_type == DocumentType.YOUTUBE_VIDEO:
for url in request.content:
@@ -57,13 +69,10 @@ async def create_documents(
process_youtube_video_with_new_session,
url,
request.search_space_id,
str(user.id)
str(user.id),
)
else:
raise HTTPException(
status_code=400,
detail="Invalid document type"
)
raise HTTPException(status_code=400, detail="Invalid document type")
await session.commit()
return {"message": "Documents processed successfully"}
@@ -72,18 +81,17 @@ async def create_documents(
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500,
detail=f"Failed to process documents: {str(e)}"
)
status_code=500, detail=f"Failed to process documents: {e!s}"
) from e
@router.post("/documents/fileupload")
async def create_documents(
async def create_documents_file_upload(
files: list[UploadFile],
search_space_id: int = Form(...),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
fastapi_background_tasks: BackgroundTasks = BackgroundTasks()
fastapi_background_tasks: BackgroundTasks = BackgroundTasks(),
):
try:
await check_ownership(session, SearchSpace, search_space_id, user)
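The reworked raise statements above, and throughout this file, follow two patterns of the kind ruff's B904 and RUF010 rules request: re-raise with "from e" so the original traceback stays chained, and render the exception with an explicit conversion flag ({e!s}) rather than str(e). A minimal sketch of that shape, with the endpoint body as a stand-in:

from fastapi import HTTPException

async def do_work() -> dict:
    # Placeholder for the real database work.
    raise ValueError("boom")

async def endpoint() -> dict:
    try:
        return await do_work()
    except Exception as e:
        # Chain the original exception and format it with !s.
        raise HTTPException(
            status_code=500, detail=f"Failed to process documents: {e!s}"
        ) from e
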
@@ -94,31 +102,32 @@ async def create_documents(
for file in files:
try:
# Save file to a temporary location to avoid stream issues
import tempfile
import aiofiles
import os
import tempfile
# Create temp file
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
with tempfile.NamedTemporaryFile(
delete=False, suffix=os.path.splitext(file.filename)[1]
) as temp_file:
temp_path = temp_file.name
# Write uploaded file to temp file
content = await file.read()
with open(temp_path, "wb") as f:
f.write(content)
fastapi_background_tasks.add_task(
process_file_in_background_with_new_session,
temp_path,
file.filename,
search_space_id,
str(user.id)
str(user.id),
)
except Exception as e:
raise HTTPException(
status_code=422,
detail=f"Failed to process file {file.filename}: {str(e)}"
)
detail=f"Failed to process file {file.filename}: {e!s}",
) from e
await session.commit()
return {"message": "Files uploaded for processing"}
@@ -127,9 +136,8 @@ async def create_documents(
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500,
detail=f"Failed to upload files: {str(e)}"
)
status_code=500, detail=f"Failed to upload files: {e!s}"
) from e
async def process_file_in_background(
@@ -139,64 +147,71 @@ async def process_file_in_background(
user_id: str,
session: AsyncSession,
task_logger: TaskLoggingService,
log_entry: Log
log_entry: Log,
):
try:
# Check if the file is a markdown or text file
if filename.lower().endswith(('.md', '.markdown', '.txt')):
if filename.lower().endswith((".md", ".markdown", ".txt")):
await task_logger.log_task_progress(
log_entry,
f"Processing markdown/text file: {filename}",
{"file_type": "markdown", "processing_stage": "reading_file"}
{"file_type": "markdown", "processing_stage": "reading_file"},
)
# For markdown files, read the content directly
with open(file_path, 'r', encoding='utf-8') as f:
with open(file_path, encoding="utf-8") as f:
markdown_content = f.read()
# Clean up the temp file
import os
try:
os.unlink(file_path)
except:
except Exception as e:
print("Error deleting temp file", e)
pass
await task_logger.log_task_progress(
log_entry,
f"Creating document from markdown content: {filename}",
{"processing_stage": "creating_document", "content_length": len(markdown_content)}
{
"processing_stage": "creating_document",
"content_length": len(markdown_content),
},
)
# Process markdown directly through specialized function
result = await add_received_markdown_file_document(
session,
filename,
markdown_content,
search_space_id,
user_id
session, filename, markdown_content, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed markdown file: {filename}",
{"document_id": result.id, "content_hash": result.content_hash, "file_type": "markdown"}
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "markdown",
},
)
else:
await task_logger.log_task_success(
log_entry,
f"Markdown file already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "markdown"}
{"duplicate_detected": True, "file_type": "markdown"},
)
# Check if the file is an audio file
elif filename.lower().endswith(('.mp3', '.mp4', '.mpeg', '.mpga', '.m4a', '.wav', '.webm')):
elif filename.lower().endswith(
(".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
):
await task_logger.log_task_progress(
log_entry,
f"Processing audio file for transcription: {filename}",
{"file_type": "audio", "processing_stage": "starting_transcription"}
{"file_type": "audio", "processing_stage": "starting_transcription"},
)
# Open the audio file for transcription
with open(file_path, "rb") as audio_file:
# Use LiteLLM for audio transcription
@@ -205,65 +220,76 @@ async def process_file_in_background(
model=app_config.STT_SERVICE,
file=audio_file,
api_base=app_config.STT_SERVICE_API_BASE,
api_key=app_config.STT_SERVICE_API_KEY
api_key=app_config.STT_SERVICE_API_KEY,
)
else:
transcription_response = await atranscription(
model=app_config.STT_SERVICE,
api_key=app_config.STT_SERVICE_API_KEY,
file=audio_file
file=audio_file,
)
# Extract the transcribed text
transcribed_text = transcription_response.get("text", "")
# Add metadata about the transcription
transcribed_text = f"# Transcription of {filename}\n\n{transcribed_text}"
transcribed_text = (
f"# Transcription of {filename}\n\n{transcribed_text}"
)
await task_logger.log_task_progress(
log_entry,
f"Transcription completed, creating document: {filename}",
{"processing_stage": "transcription_complete", "transcript_length": len(transcribed_text)}
{
"processing_stage": "transcription_complete",
"transcript_length": len(transcribed_text),
},
)
# Clean up the temp file
try:
os.unlink(file_path)
except:
except Exception as e:
print("Error deleting temp file", e)
pass
# Process transcription as markdown document
result = await add_received_markdown_file_document(
session,
filename,
transcribed_text,
search_space_id,
user_id
session, filename, transcribed_text, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully transcribed and processed audio file: {filename}",
{"document_id": result.id, "content_hash": result.content_hash, "file_type": "audio", "transcript_length": len(transcribed_text)}
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "audio",
"transcript_length": len(transcribed_text),
},
)
else:
await task_logger.log_task_success(
log_entry,
f"Audio file transcript already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "audio"}
{"duplicate_detected": True, "file_type": "audio"},
)
else:
if app_config.ETL_SERVICE == "UNSTRUCTURED":
await task_logger.log_task_progress(
log_entry,
f"Processing file with Unstructured ETL: {filename}",
{"file_type": "document", "etl_service": "UNSTRUCTURED", "processing_stage": "loading"}
{
"file_type": "document",
"etl_service": "UNSTRUCTURED",
"processing_stage": "loading",
},
)
from langchain_unstructured import UnstructuredLoader
# Process the file
loader = UnstructuredLoader(
file_path,
@@ -280,212 +306,257 @@ async def process_file_in_background(
await task_logger.log_task_progress(
log_entry,
f"Unstructured ETL completed, creating document: {filename}",
{"processing_stage": "etl_complete", "elements_count": len(docs)}
{"processing_stage": "etl_complete", "elements_count": len(docs)},
)
# Clean up the temp file
import os
try:
os.unlink(file_path)
except:
except Exception as e:
print("Error deleting temp file", e)
pass
# Pass the documents to the existing background task
result = await add_received_file_document_using_unstructured(
session,
filename,
docs,
search_space_id,
user_id
session, filename, docs, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed file with Unstructured: {filename}",
{"document_id": result.id, "content_hash": result.content_hash, "file_type": "document", "etl_service": "UNSTRUCTURED"}
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "document",
"etl_service": "UNSTRUCTURED",
},
)
else:
await task_logger.log_task_success(
log_entry,
f"Document already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "document", "etl_service": "UNSTRUCTURED"}
{
"duplicate_detected": True,
"file_type": "document",
"etl_service": "UNSTRUCTURED",
},
)
elif app_config.ETL_SERVICE == "LLAMACLOUD":
await task_logger.log_task_progress(
log_entry,
f"Processing file with LlamaCloud ETL: {filename}",
{"file_type": "document", "etl_service": "LLAMACLOUD", "processing_stage": "parsing"}
{
"file_type": "document",
"etl_service": "LLAMACLOUD",
"processing_stage": "parsing",
},
)
from llama_cloud_services import LlamaParse
from llama_cloud_services.parse.utils import ResultType
# Create LlamaParse parser instance
parser = LlamaParse(
api_key=app_config.LLAMA_CLOUD_API_KEY,
num_workers=1, # Use single worker for file processing
verbose=True,
language="en",
result_type=ResultType.MD
result_type=ResultType.MD,
)
# Parse the file asynchronously
result = await parser.aparse(file_path)
# Clean up the temp file
import os
try:
os.unlink(file_path)
except:
except Exception as e:
print("Error deleting temp file", e)
pass
# Get markdown documents from the result
markdown_documents = await result.aget_markdown_documents(split_by_page=False)
markdown_documents = await result.aget_markdown_documents(
split_by_page=False
)
await task_logger.log_task_progress(
log_entry,
f"LlamaCloud parsing completed, creating documents: {filename}",
{"processing_stage": "parsing_complete", "documents_count": len(markdown_documents)}
{
"processing_stage": "parsing_complete",
"documents_count": len(markdown_documents),
},
)
for doc in markdown_documents:
# Extract text content from the markdown documents
markdown_content = doc.text
# Process the documents using our LlamaCloud background task
doc_result = await add_received_file_document_using_llamacloud(
session,
filename,
llamacloud_markdown_document=markdown_content,
search_space_id=search_space_id,
user_id=user_id
user_id=user_id,
)
if doc_result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed file with LlamaCloud: {filename}",
{"document_id": doc_result.id, "content_hash": doc_result.content_hash, "file_type": "document", "etl_service": "LLAMACLOUD"}
{
"document_id": doc_result.id,
"content_hash": doc_result.content_hash,
"file_type": "document",
"etl_service": "LLAMACLOUD",
},
)
else:
await task_logger.log_task_success(
log_entry,
f"Document already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "document", "etl_service": "LLAMACLOUD"}
{
"duplicate_detected": True,
"file_type": "document",
"etl_service": "LLAMACLOUD",
},
)
elif app_config.ETL_SERVICE == "DOCLING":
await task_logger.log_task_progress(
log_entry,
f"Processing file with Docling ETL: {filename}",
{"file_type": "document", "etl_service": "DOCLING", "processing_stage": "parsing"}
{
"file_type": "document",
"etl_service": "DOCLING",
"processing_stage": "parsing",
},
)
# Use Docling service for document processing
from app.services.docling_service import create_docling_service
# Create Docling service
docling_service = create_docling_service()
# Process the document
result = await docling_service.process_document(file_path, filename)
# Clean up the temp file
import os
try:
os.unlink(file_path)
except:
except Exception as e:
print("Error deleting temp file", e)
pass
await task_logger.log_task_progress(
log_entry,
f"Docling parsing completed, creating document: {filename}",
{"processing_stage": "parsing_complete", "content_length": len(result['content'])}
{
"processing_stage": "parsing_complete",
"content_length": len(result["content"]),
},
)
# Process the document using our Docling background task
doc_result = await add_received_file_document_using_docling(
session,
filename,
docling_markdown_document=result['content'],
docling_markdown_document=result["content"],
search_space_id=search_space_id,
user_id=user_id
user_id=user_id,
)
if doc_result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed file with Docling: {filename}",
{"document_id": doc_result.id, "content_hash": doc_result.content_hash, "file_type": "document", "etl_service": "DOCLING"}
{
"document_id": doc_result.id,
"content_hash": doc_result.content_hash,
"file_type": "document",
"etl_service": "DOCLING",
},
)
else:
await task_logger.log_task_success(
log_entry,
f"Document already exists (duplicate): {filename}",
{"duplicate_detected": True, "file_type": "document", "etl_service": "DOCLING"}
{
"duplicate_detected": True,
"file_type": "document",
"etl_service": "DOCLING",
},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process file: {filename}",
str(e),
{"error_type": type(e).__name__, "filename": filename}
{"error_type": type(e).__name__, "filename": filename},
)
import logging
logging.error(f"Error processing file in background: {str(e)}")
logging.error(f"Error processing file in background: {e!s}")
raise # Re-raise so the wrapper can also handle it
@router.get("/documents/", response_model=List[DocumentRead])
@router.get("/documents/", response_model=list[DocumentRead])
async def read_documents(
skip: int = 0,
limit: int = 300,
search_space_id: int = None,
search_space_id: int | None = None,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user)
user: User = Depends(current_active_user),
):
try:
query = select(Document).join(SearchSpace).filter(
SearchSpace.user_id == user.id)
query = (
select(Document).join(SearchSpace).filter(SearchSpace.user_id == user.id)
)
# Filter by search_space_id if provided
if search_space_id is not None:
query = query.filter(Document.search_space_id == search_space_id)
result = await session.execute(
query.offset(skip).limit(limit)
)
result = await session.execute(query.offset(skip).limit(limit))
db_documents = result.scalars().all()
# Convert database objects to API-friendly format
api_documents = []
for doc in db_documents:
api_documents.append(DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
content=doc.content,
created_at=doc.created_at,
search_space_id=doc.search_space_id
))
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
content=doc.content,
created_at=doc.created_at,
search_space_id=doc.search_space_id,
)
)
return api_documents
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Failed to fetch documents: {str(e)}"
)
status_code=500, detail=f"Failed to fetch documents: {e!s}"
) from e
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user)
user: User = Depends(current_active_user),
):
try:
result = await session.execute(
@@ -497,8 +568,7 @@ async def read_document(
if not document:
raise HTTPException(
status_code=404,
detail=f"Document with id {document_id} not found"
status_code=404, detail=f"Document with id {document_id} not found"
)
# Convert database object to API-friendly format
@@ -509,13 +579,12 @@ async def read_document(
document_metadata=document.document_metadata,
content=document.content,
created_at=document.created_at,
search_space_id=document.search_space_id
search_space_id=document.search_space_id,
)
except Exception as e:
raise HTTPException(
status_code=500,
detail=f"Failed to fetch document: {str(e)}"
)
status_code=500, detail=f"Failed to fetch document: {e!s}"
) from e
@router.put("/documents/{document_id}", response_model=DocumentRead)
@@ -523,7 +592,7 @@ async def update_document(
document_id: int,
document_update: DocumentUpdate,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user)
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
@@ -536,8 +605,7 @@ async def update_document(
if not db_document:
raise HTTPException(
status_code=404,
detail=f"Document with id {document_id} not found"
status_code=404, detail=f"Document with id {document_id} not found"
)
update_data = document_update.model_dump(exclude_unset=True)
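The update route relies on Pydantic's model_dump(exclude_unset=True) so that only fields the client actually sent are applied to the row. A small sketch of that partial-update pattern; the fields and the setattr loop below are illustrative, not copied from the app's DocumentUpdate schema:

from typing import Any

from pydantic import BaseModel

class DocumentUpdatePatch(BaseModel):
    # Illustrative fields; the real schema lives in app.schemas.
    title: str | None = None
    content: str | None = None

def apply_partial_update(db_document: Any, document_update: DocumentUpdatePatch) -> None:
    # exclude_unset keeps fields the client never sent out of the payload.
    update_data = document_update.model_dump(exclude_unset=True)
    for field, value in update_data.items():
        setattr(db_document, field, value)
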
@@ -554,23 +622,22 @@ async def update_document(
document_metadata=db_document.document_metadata,
content=db_document.content,
created_at=db_document.created_at,
search_space_id=db_document.search_space_id
search_space_id=db_document.search_space_id,
)
except HTTPException:
raise
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500,
detail=f"Failed to update document: {str(e)}"
)
status_code=500, detail=f"Failed to update document: {e!s}"
) from e
@router.delete("/documents/{document_id}", response_model=dict)
async def delete_document(
document_id: int,
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user)
user: User = Depends(current_active_user),
):
try:
# Query the document directly instead of using read_document function
@@ -583,8 +650,7 @@ async def delete_document(
if not document:
raise HTTPException(
status_code=404,
detail=f"Document with id {document_id} not found"
status_code=404, detail=f"Document with id {document_id} not found"
)
await session.delete(document)
@@ -595,15 +661,12 @@ async def delete_document(
except Exception as e:
await session.rollback()
raise HTTPException(
status_code=500,
detail=f"Failed to delete document: {str(e)}"
)
status_code=500, detail=f"Failed to delete document: {e!s}"
) from e
async def process_extension_document_with_new_session(
individual_document,
search_space_id: int,
user_id: str
individual_document, search_space_id: int, user_id: str
):
"""Create a new session and process extension document."""
from app.db import async_session_maker
@@ -612,7 +675,7 @@ async def process_extension_document_with_new_session(
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_extension_document",
@@ -622,40 +685,41 @@ async def process_extension_document_with_new_session(
"document_type": "EXTENSION",
"url": individual_document.metadata.VisitedWebPageURL,
"title": individual_document.metadata.VisitedWebPageTitle,
"user_id": user_id
}
"user_id": user_id,
},
)
try:
result = await add_extension_received_document(session, individual_document, search_space_id, user_id)
result = await add_extension_received_document(
session, individual_document, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed extension document: {individual_document.metadata.VisitedWebPageTitle}",
{"document_id": result.id, "content_hash": result.content_hash}
{"document_id": result.id, "content_hash": result.content_hash},
)
else:
await task_logger.log_task_success(
log_entry,
f"Extension document already exists (duplicate): {individual_document.metadata.VisitedWebPageTitle}",
{"duplicate_detected": True}
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process extension document: {individual_document.metadata.VisitedWebPageTitle}",
str(e),
{"error_type": type(e).__name__}
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing extension document: {str(e)}")
logging.error(f"Error processing extension document: {e!s}")
async def process_crawled_url_with_new_session(
url: str,
search_space_id: int,
user_id: str
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process crawled URL."""
from app.db import async_session_maker
@@ -664,50 +728,50 @@ async def process_crawled_url_with_new_session(
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_crawled_url",
source="document_processor",
message=f"Starting URL crawling and processing for: {url}",
metadata={
"document_type": "CRAWLED_URL",
"url": url,
"user_id": user_id
}
metadata={"document_type": "CRAWLED_URL", "url": url, "user_id": user_id},
)
try:
result = await add_crawled_url_document(session, url, search_space_id, user_id)
result = await add_crawled_url_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully crawled and processed URL: {url}",
{"document_id": result.id, "title": result.title, "content_hash": result.content_hash}
{
"document_id": result.id,
"title": result.title,
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"URL document already exists (duplicate): {url}",
{"duplicate_detected": True}
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to crawl URL: {url}",
str(e),
{"error_type": type(e).__name__}
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing crawled URL: {str(e)}")
logging.error(f"Error processing crawled URL: {e!s}")
async def process_file_in_background_with_new_session(
file_path: str,
filename: str,
search_space_id: int,
user_id: str
file_path: str, filename: str, search_space_id: int, user_id: str
):
"""Create a new session and process file."""
from app.db import async_session_maker
@@ -716,7 +780,7 @@ async def process_file_in_background_with_new_session(
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_file_upload",
@@ -726,29 +790,36 @@ async def process_file_in_background_with_new_session(
"document_type": "FILE",
"filename": filename,
"file_path": file_path,
"user_id": user_id
}
"user_id": user_id,
},
)
try:
await process_file_in_background(file_path, filename, search_space_id, user_id, session, task_logger, log_entry)
await process_file_in_background(
file_path,
filename,
search_space_id,
user_id,
session,
task_logger,
log_entry,
)
# Note: success/failure logging is handled within process_file_in_background
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process file: {filename}",
str(e),
{"error_type": type(e).__name__}
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing file: {str(e)}")
logging.error(f"Error processing file: {e!s}")
async def process_youtube_video_with_new_session(
url: str,
search_space_id: int,
user_id: str
url: str, search_space_id: int, user_id: str
):
"""Create a new session and process YouTube video."""
from app.db import async_session_maker
@@ -757,42 +828,43 @@ async def process_youtube_video_with_new_session(
async with async_session_maker() as session:
# Initialize task logging service
task_logger = TaskLoggingService(session, search_space_id)
# Log task start
log_entry = await task_logger.log_task_start(
task_name="process_youtube_video",
source="document_processor",
message=f"Starting YouTube video processing for: {url}",
metadata={
"document_type": "YOUTUBE_VIDEO",
"url": url,
"user_id": user_id
}
metadata={"document_type": "YOUTUBE_VIDEO", "url": url, "user_id": user_id},
)
try:
result = await add_youtube_video_document(session, url, search_space_id, user_id)
result = await add_youtube_video_document(
session, url, search_space_id, user_id
)
if result:
await task_logger.log_task_success(
log_entry,
f"Successfully processed YouTube video: {result.title}",
{"document_id": result.id, "video_id": result.document_metadata.get("video_id"), "content_hash": result.content_hash}
{
"document_id": result.id,
"video_id": result.document_metadata.get("video_id"),
"content_hash": result.content_hash,
},
)
else:
await task_logger.log_task_success(
log_entry,
f"YouTube video document already exists (duplicate): {url}",
{"duplicate_detected": True}
{"duplicate_detected": True},
)
except Exception as e:
await task_logger.log_task_failure(
log_entry,
f"Failed to process YouTube video: {url}",
str(e),
{"error_type": type(e).__name__}
{"error_type": type(e).__name__},
)
import logging
logging.error(f"Error processing YouTube video: {str(e)}")
logging.error(f"Error processing YouTube video: {e!s}")