# open-notebook/open_notebook/graphs/content_processing/office.py

import asyncio
from functools import partial

from docx import Document
from loguru import logger
from openpyxl import load_workbook
from pptx import Presentation

from open_notebook.graphs.content_processing.state import ContentState

# MIME types accepted by extract_office_content, listed in the order
# Word document, PowerPoint presentation, Excel spreadsheet. All three share
# the Office Open XML prefix and differ only in the trailing component.
SUPPORTED_OFFICE_TYPES = [
    "application/vnd.openxmlformats-officedocument." + suffix
    for suffix in (
        "wordprocessingml.document",
        "presentationml.presentation",
        "spreadsheetml.sheet",
    )
]


async def extract_docx_content_detailed(file_path):
    """Extract the text of a DOCX file as Markdown-style text.

    Paragraphs are visited in document order and blank ones are skipped. Each
    remaining paragraph is rendered according to its style name:

    * styles containing "Heading" become ``#`` headings, the level taken from
      the trailing digit of the style name and capped at Markdown's six levels;
    * styles starting with "List" become ``N.`` items when the paragraph
      carries numbering properties, otherwise ``*`` bullets;
    * anything else is emitted run by run with bold/italic markers.

    Left indentation is mapped to four spaces per whole inch (headings are
    never indented).

    Args:
        file_path: Passed straight to ``Document``.

    Returns:
        The rendered paragraphs joined by blank lines, or ``None`` if the
        document could not be processed (the error is logged).
    """

    def _format_run(run):
        """Wrap a run's text in the Markdown emphasis matching its styling."""
        run_text = run.text
        # Markers around empty/whitespace text render as literal asterisks,
        # so such runs are passed through untouched.
        if not run_text.strip():
            return run_text
        if run.bold and run.italic:
            return f"***{run_text}***"
        if run.bold:
            return f"**{run_text}**"
        if run.italic:
            return f"*{run_text}*"
        return run_text

    def _next_list_number(counters, num_pr):
        """Return the next ordinal for the list item described by *num_pr*.

        ``numId`` identifies which list the item belongs to, not its position,
        so positions are counted here per list and per nesting level.
        """
        num_id = getattr(getattr(num_pr, "numId", None), "val", None)
        level = getattr(getattr(num_pr, "ilvl", None), "val", None) or 0
        per_level = counters.setdefault(num_id, {})
        # A new item at this level restarts the numbering of deeper levels.
        for deeper in [lvl for lvl in per_level if lvl > level]:
            del per_level[deeper]
        per_level[level] = per_level.get(level, 0) + 1
        return per_level[level]

    def _extract():
        try:
            doc = Document(file_path)
            content = []
            list_counters = {}

            for paragraph in doc.paragraphs:
                text = paragraph.text.strip()
                if not text:
                    continue

                # Fall back to "Normal" when the style or its name is missing
                # so the string checks below never see None.
                style = (paragraph.style.name if paragraph.style else None) or "Normal"

                # Convert indent to spaces (1 level = 4 spaces)
                left_indent = paragraph.paragraph_format.left_indent
                indent_level = int(getattr(left_indent, "pt", 0) / 72)  # 72 pt = 1 inch
                indent_spaces = " " * (indent_level * 4)

                if "Heading" in style:
                    level = int(style[-1]) if style[-1].isdigit() else 1
                    # Markdown defines heading levels 1-6 only.
                    heading_marks = "#" * max(1, min(level, 6))
                    content.append(f"\n{heading_marks} {text}\n")

                elif style.startswith("List"):
                    num_pr = getattr(
                        getattr(paragraph._p, "pPr", None), "numPr", None
                    )
                    if num_pr is not None:
                        # Numbered list
                        number = _next_list_number(list_counters, num_pr)
                        content.append(f"{indent_spaces}{number}. {text}")
                    else:
                        # Bullet list
                        content.append(f"{indent_spaces}* {text}")

                else:
                    formatted = "".join(_format_run(run) for run in paragraph.runs)
                    # paragraph.text was non-empty, so if the runs produced
                    # nothing the text is held somewhere paragraph.runs does
                    # not cover; use the plain text rather than emit a blank.
                    if not formatted.strip():
                        formatted = text
                    content.append(f"{indent_spaces}{formatted}")

            return "\n\n".join(content)

        except Exception as e:
            logger.error(f"Failed to extract DOCX content: {e}")
            return None

    # The parsing is blocking, so it runs in the loop's default executor.
    return await asyncio.get_event_loop().run_in_executor(None, _extract)


async def get_docx_info(file_path):
    """Collect core properties, extracted content and statistics for a DOCX file.

    Args:
        file_path: Passed straight to ``Document`` and to
            ``extract_docx_content_detailed``.

    Returns:
        A dict with keys ``"metadata"`` (core document properties),
        ``"content"`` (the result of ``extract_docx_content_detailed``, which
        may itself be ``None``) and ``"statistics"`` (paragraph, word and
        character counts), or ``None`` if the document could not be read
        (the error is logged).
    """

    def _get_info():
        try:
            doc = Document(file_path)

            # Extract core properties if available
            props = doc.core_properties
            core_props = {
                "author": props.author,
                "created": props.created,
                "modified": props.modified,
                "title": props.title,
                "subject": props.subject,
                "keywords": props.keywords,
                "category": props.category,
                "comments": props.comments,
            }

            # Word/character counts only consider non-blank paragraphs, while
            # paragraph_count covers every paragraph.
            texts = [p.text for p in doc.paragraphs if p.text.strip()]
            stats = {
                "paragraph_count": len(doc.paragraphs),
                "word_count": sum(len(t.split()) for t in texts),
                "character_count": sum(len(t) for t in texts),
            }

            return core_props, stats

        except Exception as e:
            logger.error(f"Failed to get DOCX info: {e}")
            return None

    gathered = await asyncio.get_event_loop().run_in_executor(None, _get_info)
    if gathered is None:
        return None
    core_props, stats = gathered

    # The extractor is a coroutine function, so it must be awaited here on the
    # event loop; calling it from the worker thread would only create a
    # coroutine object that never runs.
    content = await extract_docx_content_detailed(file_path)

    return {"metadata": core_props, "content": content, "statistics": stats}


async def extract_pptx_content(file_path):
    """Extract the text of a PPTX file as Markdown-style text.

    Every slide produces a ``# Slide N`` heading, followed by a ``##`` heading
    for its title (when the slide has a non-blank title shape) and then the
    stripped text of every other shape that exposes a ``text`` attribute.

    Args:
        file_path: Passed straight to ``Presentation``.

    Returns:
        The pieces joined by blank lines, or ``None`` if the presentation
        could not be processed (the error is logged).
    """

    def _extract():
        try:
            prs = Presentation(file_path)
            content = []

            for slide_number, slide in enumerate(prs.slides, 1):
                content.append(f"\n# Slide {slide_number}\n")

                # Look the title shape up once per slide instead of once per
                # shape inside the loop below.
                title_shape = slide.shapes.title
                if title_shape is not None:
                    title_text = title_shape.text.strip()
                    # A blank title placeholder would yield an empty heading.
                    if title_text:
                        content.append(f"## {title_text}\n")

                # Extract text from all shapes except the title, which has
                # already been handled above.
                for shape in slide.shapes:
                    if not hasattr(shape, "text"):
                        continue
                    shape_text = shape.text.strip()
                    if shape_text and shape != title_shape:
                        content.append(shape_text)

            return "\n\n".join(content)

        except Exception as e:
            logger.error(f"Failed to extract PPTX content: {e}")
            return None

    # The parsing is blocking, so it runs in the loop's default executor.
    return await asyncio.get_event_loop().run_in_executor(None, _extract)


async def extract_xlsx_content(file_path, max_rows=10000, max_cols=100):
    """Extract the worksheets of an XLSX file as Markdown tables.

    Each worksheet produces a ``# Sheet: <title>`` heading followed by one
    table whose header is the sheet's first row. Sheets that are not
    worksheets (chart sheets) hold no cells and are skipped.

    Args:
        file_path: Passed straight to ``load_workbook``.
        max_rows: Upper bound on the number of rows rendered per sheet.
        max_cols: Upper bound on the number of columns rendered per sheet.

    Returns:
        The rendered lines joined by newlines, or ``None`` if the workbook
        could not be processed (the error is logged).
    """

    def _cell_text(value):
        """Render one cell value so it cannot break the table layout."""
        if value is None:
            return ""
        text = str(value)
        # A raw pipe would start a new column and a line break would end the
        # row, so pipes are escaped and line breaks flattened to spaces.
        text = text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ")
        return text.replace("|", "\\|")

    def _table_row(ws, row, col_count):
        """Build the Markdown table line for worksheet row *row*."""
        cells = [
            _cell_text(ws.cell(row=row, column=col).value)
            for col in range(1, col_count + 1)
        ]
        return "| " + " | ".join(cells) + " |"

    def _extract():
        try:
            wb = load_workbook(file_path, data_only=True)
            content = []

            # wb.worksheets rather than wb.sheetnames: the latter also names
            # chart sheets, which have no max_row/cell and would abort the
            # whole extraction.
            for ws in wb.worksheets:
                content.append(f"\n# Sheet: {ws.title}\n")

                # Clamp the populated area to the caller-supplied limits
                max_row = min(ws.max_row, max_rows)
                max_col = min(ws.max_column, max_cols)

                # Header row and separator
                content.append(_table_row(ws, 1, max_col))
                content.append("| " + " | ".join(["---"] * max_col) + " |")

                # Data rows
                for row in range(2, max_row + 1):
                    content.append(_table_row(ws, row, max_col))

            return "\n".join(content)

        except Exception as e:
            logger.error(f"Failed to extract XLSX content: {e}")
            return None

    # The parsing is blocking, so it runs in the loop's default executor.
    return await asyncio.get_event_loop().run_in_executor(None, _extract)


async def get_pptx_info(file_path):
    """Collect basic properties, extracted content and statistics for a PPTX file.

    Args:
        file_path: Passed straight to ``Presentation`` and to
            ``extract_pptx_content``.

    Returns:
        A dict with keys ``"metadata"`` (slide count and an empty title),
        ``"content"`` (the result of ``extract_pptx_content``, which may
        itself be ``None``) and ``"statistics"`` (slide, shape and
        text-bearing shape counts), or ``None`` if the presentation could not
        be read (the error is logged).
    """

    def _get_info():
        try:
            prs = Presentation(file_path)
            slide_count = len(prs.slides)

            # Extract basic properties
            props = {
                "slide_count": slide_count,
                "title": "",  # always blank: no document properties are read here
            }

            # Get presentation statistics
            stats = {
                "slide_count": slide_count,
                "shape_count": sum(len(slide.shapes) for slide in prs.slides),
                # Counts shapes exposing a ``text`` attribute
                "text_frame_count": sum(
                    sum(1 for shape in slide.shapes if hasattr(shape, "text"))
                    for slide in prs.slides
                ),
            }

            return props, stats

        except Exception as e:
            logger.error(f"Failed to get PPTX info: {e}")
            return None

    gathered = await asyncio.get_event_loop().run_in_executor(None, _get_info)
    if gathered is None:
        return None
    props, stats = gathered

    # The extractor is a coroutine function, so it must be awaited here on the
    # event loop; calling it from the worker thread would only create a
    # coroutine object that never runs.
    content = await extract_pptx_content(file_path)

    return {"metadata": props, "content": content, "statistics": stats}


async def get_xlsx_info(file_path):
    """Collect workbook properties, extracted content and statistics for an XLSX file.

    Args:
        file_path: Passed straight to ``load_workbook`` and to
            ``extract_xlsx_content``.

    Returns:
        A dict with keys ``"metadata"`` (sheet names/count and workbook
        properties), ``"content"`` (the result of ``extract_xlsx_content``,
        which may itself be ``None``) and ``"statistics"`` (sheet count and
        summed row/column extents of the worksheets), or ``None`` if the
        workbook could not be read (the error is logged).
    """

    def _get_info():
        try:
            wb = load_workbook(file_path, data_only=True)
            wb_props = wb.properties

            # Extract basic properties
            props = {
                "sheet_count": len(wb.sheetnames),
                "sheets": wb.sheetnames,
                "title": wb_props.title,
                "creator": wb_props.creator,
                "created": wb_props.created,
                "modified": wb_props.modified,
            }

            # Get workbook statistics
            stats = {
                "sheet_count": len(wb.sheetnames),
                "total_rows": sum(sheet.max_row for sheet in wb.worksheets),
                "total_columns": sum(sheet.max_column for sheet in wb.worksheets),
            }

            return props, stats

        except Exception as e:
            logger.error(f"Failed to get XLSX info: {e}")
            return None

    gathered = await asyncio.get_event_loop().run_in_executor(None, _get_info)
    if gathered is None:
        return None
    props, stats = gathered

    # The extractor is a coroutine function, so it must be awaited here on the
    # event loop; calling it from the worker thread would only create a
    # coroutine object that never runs.
    content = await extract_xlsx_content(file_path)

    return {"metadata": props, "content": content, "statistics": stats}


async def extract_office_content(state: ContentState):
    """Extract content and metadata from a supported Office file.

    Dispatches on ``state["identified_type"]`` to the DOCX, PPTX or XLSX
    helpers.

    Args:
        state: Must provide a truthy ``"file_path"`` and an
            ``"identified_type"`` listed in ``SUPPORTED_OFFICE_TYPES``.

    Returns:
        ``{"content": ..., "metadata": ...}`` where ``content`` is the
        extracted text (``None`` if extraction failed) and ``metadata`` is the
        info helper's dict without its ``"content"`` entry (an empty dict if
        the info helper failed).

    Raises:
        AssertionError: If the file path is missing or the type unsupported.
    """
    assert state.get("file_path"), "No file path provided"
    assert (
        state.get("identified_type") in SUPPORTED_OFFICE_TYPES
    ), "Unsupported File Type"
    file_path = state["file_path"]
    doc_type = state["identified_type"]

    if (
        doc_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        logger.debug("Extracting content from DOCX file")
        extract_content, get_info = extract_docx_content_detailed, get_docx_info
    elif (
        doc_type
        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ):
        logger.debug("Extracting content from PPTX file")
        extract_content, get_info = extract_pptx_content, get_pptx_info
    elif (
        doc_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ):
        logger.debug("Extracting content from XLSX file")
        extract_content, get_info = extract_xlsx_content, get_xlsx_info
    else:
        raise Exception(f"Unsupported file format: {doc_type}")

    # The info helpers log and return None on failure; treat that as "no
    # metadata" instead of failing on item deletion from None.
    info = await get_info(file_path)
    metadata = dict(info) if info is not None else {}

    if "content" in metadata:
        # Reuse the content carried by the info result so the file is not
        # extracted twice. The entry may still be a coroutine that was never
        # awaited; resolve it here rather than discarding it un-run.
        content = metadata.pop("content")
        if asyncio.iscoroutine(content):
            content = await content
    else:
        content = await extract_content(file_path)

    return {"content": content, "metadata": metadata}