open-notebook/open_notebook/graphs/content_processing/office.py
2024-11-11 17:32:35 -03:00

323 lines
11 KiB
Python

import asyncio
from functools import partial
from docx import Document
from loguru import logger
from openpyxl import load_workbook
from pptx import Presentation
from open_notebook.graphs.content_processing.state import ContentState
# MIME types of the Office Open XML formats this module knows how to extract.
_DOCX_MIME_TYPE = (
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
)
_PPTX_MIME_TYPE = (
    "application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
_XLSX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

# Kept as a list, in DOCX / PPTX / XLSX order.
SUPPORTED_OFFICE_TYPES = [_DOCX_MIME_TYPE, _PPTX_MIME_TYPE, _XLSX_MIME_TYPE]
def _docx_heading_level(style_name):
    """Map a heading style name to a Markdown heading depth between 1 and 6."""
    last_char = style_name[-1]
    level = int(last_char) if last_char.isdigit() else 1
    # Word defines Heading 1-9, but Markdown stops at six '#' characters.
    return min(max(level, 1), 6)


def _docx_direct_numbering(paragraph):
    """Return ``(num_id, level)`` if the paragraph has its own numPr, else None."""
    p_pr = getattr(paragraph._p, "pPr", None)
    num_pr = getattr(p_pr, "numPr", None)
    if num_pr is None:
        return None
    num_id = getattr(getattr(num_pr, "numId", None), "val", None)
    level = getattr(getattr(num_pr, "ilvl", None), "val", None) or 0
    return num_id, level


def _next_docx_ordinal(counters, num_id, level):
    """Advance and return the running item number for one list level."""
    key = (num_id, level)
    counters[key] = counters.get(key, 0) + 1
    # A new item at this level restarts every deeper level of the same list.
    for stale in [k for k in counters if k[0] == num_id and k[1] > level]:
        del counters[stale]
    return counters[key]


def _format_docx_runs(paragraph):
    """Join the paragraph's runs, wrapping bold/italic runs in Markdown markers."""
    parts = []
    for run in paragraph.runs:
        run_text = run.text
        if not run_text.strip():
            # Wrapping blank text would emit stray '****' / '**' sequences.
            parts.append(run_text)
            continue
        if run.bold and run.italic:
            marker = "***"
        elif run.bold:
            marker = "**"
        elif run.italic:
            marker = "*"
        else:
            marker = ""
        parts.append(f"{marker}{run_text}{marker}")
    return "".join(parts)


async def extract_docx_content_detailed(file_path):
    """Extract the paragraphs of a DOCX file as Markdown-flavoured text.

    Headings become ``#`` headings, paragraphs whose style name starts with
    "List" become list items, and every other paragraph is emitted run by run
    with bold/italic markers. Blank paragraphs are skipped. The blocking
    python-docx work runs in the default executor.

    Args:
        file_path: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        The extracted text with blocks separated by blank lines, or ``None``
        if the document could not be read (the error is logged).
    """

    def _extract():
        try:
            doc = Document(file_path)
            content = []
            # Running item numbers keyed by (numId, ilvl). numId only names the
            # numbering definition a paragraph belongs to, so the position has
            # to be counted here. Custom start values and number formats are
            # not taken into account.
            counters = {}
            for paragraph in doc.paragraphs:
                text = paragraph.text.strip()
                if not text:
                    continue
                style = (paragraph.style.name if paragraph.style else None) or "Normal"

                # One indent level (4 spaces) per full inch of left indent.
                indent = paragraph.paragraph_format.left_indent or 0
                indent_level = int(indent.pt / 72) if hasattr(indent, "pt") else 0
                indent_spaces = " " * (indent_level * 4)

                if "Heading" in style:
                    heading_marks = "#" * _docx_heading_level(style)
                    content.append(f"\n{heading_marks} {text}\n")
                elif style.startswith("List"):
                    # Heuristic: a list paragraph with its own numPr is treated
                    # as numbered, anything else as a bullet.
                    numbering = _docx_direct_numbering(paragraph)
                    if numbering is None:
                        content.append(f"{indent_spaces}* {text}")
                    else:
                        ordinal = _next_docx_ordinal(counters, *numbering)
                        content.append(f"{indent_spaces}{ordinal}. {text}")
                else:
                    content.append(f"{indent_spaces}{_format_docx_runs(paragraph)}")
            return "\n\n".join(content)
        except Exception as e:
            # Best-effort extraction: report the failure and signal it with None.
            logger.error(f"Failed to extract DOCX content: {e}")
            return None

    return await asyncio.get_running_loop().run_in_executor(None, _extract)
async def get_docx_info(file_path):
    """Collect core properties, extracted content and statistics for a DOCX file.

    Args:
        file_path: Path (or file-like object) accepted by ``docx.Document``.

    Returns:
        A dict with the keys ``"metadata"`` (core document properties),
        ``"content"`` (the result of ``extract_docx_content_detailed``) and
        ``"statistics"`` (paragraph, word and character counts), or ``None``
        if the document could not be read (the error is logged).
    """

    def _read_properties_and_stats():
        try:
            doc = Document(file_path)
            props = doc.core_properties
            core_props = {
                "author": props.author,
                "created": props.created,
                "modified": props.modified,
                "title": props.title,
                "subject": props.subject,
                "keywords": props.keywords,
                "category": props.category,
                "comments": props.comments,
            }
            texts = [p.text for p in doc.paragraphs]
            # Word and character counts ignore whitespace-only paragraphs;
            # the paragraph count does not.
            non_blank = [t for t in texts if t.strip()]
            stats = {
                "paragraph_count": len(texts),
                "word_count": sum(len(t.split()) for t in non_blank),
                "character_count": sum(len(t) for t in non_blank),
            }
            return core_props, stats
        except Exception as e:
            logger.error(f"Failed to get DOCX info: {e}")
            return None

    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, _read_properties_and_stats)
    if result is None:
        return None
    core_props, stats = result
    # The content extractor is a coroutine function, so it must be awaited here
    # rather than called from the synchronous worker, where it would only
    # produce a never-awaited coroutine object.
    content = await extract_docx_content_detailed(file_path)
    return {"metadata": core_props, "content": content, "statistics": stats}
async def extract_pptx_content(file_path):
    """Render the text of a PPTX file as Markdown, one section per slide.

    Each slide starts with a ``# Slide N`` heading, followed by its title (if
    the slide has a title shape) and the stripped text of every other shape
    that exposes non-blank text. Runs in the default executor.

    Returns the joined text, or ``None`` if extraction failed (error is logged).
    """

    def _collect_slide_text():
        try:
            presentation = Presentation(file_path)
            blocks = []
            for index, slide in enumerate(presentation.slides, 1):
                blocks.append(f"\n# Slide {index}\n")

                title_shape = slide.shapes.title
                if title_shape:
                    blocks.append(f"## {title_shape.text}\n")

                for shape in slide.shapes:
                    if not hasattr(shape, "text"):
                        continue
                    if not shape.text.strip():
                        continue
                    # The title was already emitted as a heading above.
                    if shape != title_shape:
                        blocks.append(shape.text.strip())
            return "\n\n".join(blocks)
        except Exception as e:
            logger.error(f"Failed to extract PPTX content: {e}")
            return None

    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, _collect_slide_text)
async def extract_xlsx_content(file_path, max_rows=10000, max_cols=100):
    """Render every worksheet of an XLSX file as a Markdown table.

    Row 1 of each sheet is used as the table header. Formula cells yield their
    cached values because the workbook is loaded with ``data_only=True``.
    The blocking openpyxl work runs in the default executor.

    Args:
        file_path: Path (or file-like object) accepted by ``load_workbook``.
        max_rows: Upper bound on the number of rows rendered per sheet.
        max_cols: Upper bound on the number of columns rendered per sheet.

    Returns:
        The Markdown text for all sheets, or ``None`` if the workbook could
        not be read (the error is logged).
    """

    def _cell_text(value):
        """Stringify a cell value so that it cannot break the table layout."""
        if value is None:
            return ""
        text = str(value)
        # Line breaks would end the table row; unescaped pipes would add columns.
        for line_break in ("\r\n", "\n", "\r"):
            text = text.replace(line_break, " ")
        return text.replace("|", "\\|")

    def _render_row(ws, row, max_col):
        cells = [
            _cell_text(ws.cell(row=row, column=col).value)
            for col in range(1, max_col + 1)
        ]
        return "| " + " | ".join(cells) + " |"

    def _extract():
        try:
            wb = load_workbook(file_path, data_only=True)
            content = []
            # Iterate worksheets only: chartsheets carry no cell grid and would
            # otherwise abort extraction of the whole workbook.
            for ws in wb.worksheets:
                content.append(f"\n# Sheet: {ws.title}\n")
                max_row = min(ws.max_row, max_rows)
                max_col = min(ws.max_column, max_cols)
                content.append(_render_row(ws, 1, max_col))
                content.append("| " + " | ".join(["---"] * max(max_col, 0)) + " |")
                content.extend(
                    _render_row(ws, row, max_col) for row in range(2, max_row + 1)
                )
            return "\n".join(content)
        except Exception as e:
            # Best-effort extraction: report the failure and signal it with None.
            logger.error(f"Failed to extract XLSX content: {e}")
            return None

    return await asyncio.get_running_loop().run_in_executor(None, _extract)
async def get_pptx_info(file_path):
    """Collect basic properties, extracted content and statistics for a PPTX file.

    Args:
        file_path: Path (or file-like object) accepted by ``pptx.Presentation``.

    Returns:
        A dict with the keys ``"metadata"`` (slide count and a blank title),
        ``"content"`` (the result of ``extract_pptx_content``) and
        ``"statistics"`` (slide, shape and text-bearing shape counts), or
        ``None`` if the presentation could not be read (the error is logged).
    """

    def _read_properties_and_stats():
        try:
            prs = Presentation(file_path)
            slides = list(prs.slides)
            props = {
                "slide_count": len(slides),
                # Always blank: this function does not read a title from the file.
                "title": "",
            }
            stats = {
                "slide_count": len(slides),
                "shape_count": sum(len(slide.shapes) for slide in slides),
                "text_frame_count": sum(
                    sum(1 for shape in slide.shapes if hasattr(shape, "text"))
                    for slide in slides
                ),
            }
            return props, stats
        except Exception as e:
            logger.error(f"Failed to get PPTX info: {e}")
            return None

    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, _read_properties_and_stats)
    if result is None:
        return None
    props, stats = result
    # The content extractor is a coroutine function, so it must be awaited here
    # rather than called from the synchronous worker, where it would only
    # produce a never-awaited coroutine object.
    content = await extract_pptx_content(file_path)
    return {"metadata": props, "content": content, "statistics": stats}
async def get_xlsx_info(file_path):
    """Collect workbook properties, extracted content and statistics for an XLSX file.

    Args:
        file_path: Path (or file-like object) accepted by ``load_workbook``.

    Returns:
        A dict with the keys ``"metadata"`` (sheet names and workbook
        properties), ``"content"`` (the result of ``extract_xlsx_content``)
        and ``"statistics"`` (sheet count and summed row/column extents), or
        ``None`` if the workbook could not be read (the error is logged).
    """

    def _read_properties_and_stats():
        try:
            wb = load_workbook(file_path, data_only=True)
            sheet_names = wb.sheetnames
            workbook_props = wb.properties
            props = {
                "sheet_count": len(sheet_names),
                "sheets": sheet_names,
                "title": workbook_props.title,
                "creator": workbook_props.creator,
                "created": workbook_props.created,
                "modified": workbook_props.modified,
            }
            worksheets = wb.worksheets
            stats = {
                "sheet_count": len(sheet_names),
                "total_rows": sum(sheet.max_row for sheet in worksheets),
                "total_columns": sum(sheet.max_column for sheet in worksheets),
            }
            return props, stats
        except Exception as e:
            logger.error(f"Failed to get XLSX info: {e}")
            return None

    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, _read_properties_and_stats)
    if result is None:
        return None
    props, stats = result
    # The content extractor is a coroutine function, so it must be awaited here
    # rather than called from the synchronous worker, where it would only
    # produce a never-awaited coroutine object.
    content = await extract_xlsx_content(file_path)
    return {"metadata": props, "content": content, "statistics": stats}
async def extract_office_content(state: ContentState):
    """Extract content and metadata from a supported Office file.

    Reads ``file_path`` and ``identified_type`` from ``state`` and dispatches
    to the DOCX, PPTX or XLSX extractor for that MIME type.

    Args:
        state: Mapping that provides ``"file_path"`` and ``"identified_type"``.

    Returns:
        ``{"content": ..., "metadata": ...}`` where ``metadata`` is the
        format's info dict without its ``"content"`` entry. ``metadata`` is an
        empty dict when the info helper failed and returned ``None``.

    Raises:
        AssertionError: If no file path is given or the type is not listed in
            ``SUPPORTED_OFFICE_TYPES``.
        Exception: If the type is listed but has no extractor wired up here.
    """
    file_path = state.get("file_path")
    doc_type = state.get("identified_type")
    # Raised explicitly (instead of `assert`) so the checks survive `python -O`
    # while callers still see the same exception type and message.
    if not file_path:
        raise AssertionError("No file path provided")
    if doc_type not in SUPPORTED_OFFICE_TYPES:
        raise AssertionError("Unsupported File Type")

    handlers = {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
            "DOCX",
            extract_docx_content_detailed,
            get_docx_info,
        ),
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": (
            "PPTX",
            extract_pptx_content,
            get_pptx_info,
        ),
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": (
            "XLSX",
            extract_xlsx_content,
            get_xlsx_info,
        ),
    }
    handler = handlers.get(doc_type)
    if handler is None:
        raise Exception(f"Unsupported file format: {doc_type}")
    label, extract_content, get_info = handler

    logger.debug(f"Extracting content from {label} file")
    content = await extract_content(file_path)
    info = await get_info(file_path)

    # The info helpers return None on failure; fall back to empty metadata
    # instead of crashing on the missing dict. Work on a copy so the helper's
    # result is not mutated.
    metadata = dict(info) if info is not None else {}
    metadata.pop("content", None)
    return {"content": content, "metadata": metadata}