Skyvern/tests/unit/workflow/test_file_parser_block.py
Shuchang Zheng 76b10eb007
Fix OSS frontend build: add useFeatureFlag stub (#5042)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 23:36:42 -07:00

253 lines
11 KiB
Python

"""
Tests for FileParserBlock DOCX support.
Covers file type detection (from the URL extension, or from the downloaded
file when the URL has no extension), validation, text extraction
(paragraphs + tables), token truncation, and error handling for DOCX files.
"""
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
import docx
import pytest
from skyvern.forge.sdk.workflow.exceptions import InvalidFileType
from skyvern.forge.sdk.workflow.models.block import BlockType, FileParserBlock
from skyvern.forge.sdk.workflow.models.parameter import OutputParameter, ParameterType
from skyvern.schemas.workflows import FileType
def _make_output_parameter(key: str) -> OutputParameter:
    """Build an ``OutputParameter`` fixture for the given key.

    Every field other than ``key`` is a fixed placeholder value. The two
    timestamps are each sampled from the current UTC time.
    """
    created = datetime.now(timezone.utc)
    modified = datetime.now(timezone.utc)
    return OutputParameter(
        parameter_type=ParameterType.OUTPUT,
        key=key,
        description="test",
        output_parameter_id="test-output-id",
        workflow_id="test-workflow-id",
        created_at=created,
        modified_at=modified,
    )
def _make_file_parser_block(file_url: str, file_type: FileType) -> FileParserBlock:
    """Build a ``FileParserBlock`` fixture for ``file_url`` and ``file_type``.

    The label and the output parameter key are fixed test placeholders.
    """
    output_parameter = _make_output_parameter("test_output")
    return FileParserBlock(
        label="test_file_parser",
        block_type=BlockType.FILE_URL_PARSER,
        output_parameter=output_parameter,
        file_url=file_url,
        file_type=file_type,
    )
def _create_docx(
    path: Path,
    paragraphs: list[str] | None = None,
    table_rows: list[list[str]] | None = None,
) -> Path:
    """Create a DOCX file with optional paragraphs and a single table.

    Args:
        path: Destination of the generated document.
        paragraphs: Paragraph texts, added in order ahead of the table.
        table_rows: Cell texts for one table, one inner list per row. Rows may
            have different lengths: the table is as wide as the longest row
            and cells that receive no value are left empty.

    Returns:
        ``path``, so the call can be used inline in a test.
    """
    doc = docx.Document()
    for text in paragraphs or ():
        doc.add_paragraph(text)
    if table_rows:
        # Size the table by the widest row. Sizing it by the first row alone
        # made any longer later row index past the last column.
        col_count = max(len(row) for row in table_rows)
        table = doc.add_table(rows=len(table_rows), cols=col_count)
        for row_idx, row_data in enumerate(table_rows):
            for col_idx, cell_text in enumerate(row_data):
                table.cell(row_idx, col_idx).text = cell_text
    doc.save(str(path))
    return path
class TestDetectFileTypeFromUrl:
    """Tests for _detect_file_type_from_url: URL extensions and extensionless URLs."""

    # Byte content written to disk wherever a test needs a file that looks like a PDF.
    _PDF_BYTES = b"%PDF-1.5\n1 0 obj\n<< /Type /Catalog >>\nendobj\n%%EOF"

    def _detect(self, url: str, file_path: str | None = None) -> FileType:
        """Run detection for ``url`` on a block that was constructed as CSV."""
        parser_block = _make_file_parser_block(url, FileType.CSV)
        return parser_block._detect_file_type_from_url(url, file_path=file_path)

    def test_docx_extension(self) -> None:
        detected = self._detect("https://example.com/file.docx")
        assert detected == FileType.DOCX

    def test_doc_extension_raises_error(self) -> None:
        # Legacy .doc (Word 97-2003) is not supported by python-docx
        with pytest.raises(InvalidFileType, match="Legacy .doc format"):
            self._detect("https://example.com/file.doc")

    def test_docx_with_query_params(self) -> None:
        detected = self._detect("https://example.com/file.docx?token=abc&v=1")
        assert detected == FileType.DOCX

    def test_docx_case_insensitive(self) -> None:
        detected = self._detect("https://example.com/file.DOCX")
        assert detected == FileType.DOCX

    def test_other_extensions_unchanged(self) -> None:
        expectations = (
            ("https://example.com/file.pdf", FileType.PDF),
            ("https://example.com/file.xlsx", FileType.EXCEL),
            ("https://example.com/file.csv", FileType.CSV),
            ("https://example.com/file.png", FileType.IMAGE),
        )
        for url, expected in expectations:
            assert self._detect(url) == expected

    def test_no_extension_without_file_path_falls_back_to_csv(self) -> None:
        detected = self._detect("https://example.com/34371136523")
        assert detected == FileType.CSV

    def test_no_extension_with_pdf_file_detected_as_pdf(self, tmp_path: Path) -> None:
        # The file on disk carries PDF content but has no extension either.
        local_file = tmp_path / "no_ext_file"
        local_file.write_bytes(self._PDF_BYTES)
        detected = self._detect("https://example.com/34371136523", file_path=str(local_file))
        assert detected == FileType.PDF

    def test_no_extension_with_unknown_file_falls_back_to_csv(self, tmp_path: Path) -> None:
        # Plain text file - filetype.guess returns None for text
        local_file = tmp_path / "unknown_file"
        local_file.write_text("just,some,csv,data\n1,2,3,4")
        detected = self._detect("https://example.com/some_file", file_path=str(local_file))
        assert detected == FileType.CSV

    def test_query_params_only_url_with_pdf_file(self, tmp_path: Path) -> None:
        # URL like /download?id=123 - no file extension visible
        local_file = tmp_path / "downloaded"
        local_file.write_bytes(self._PDF_BYTES)
        detected = self._detect("https://example.com/download?id=123", file_path=str(local_file))
        assert detected == FileType.PDF
class TestValidateFileType:
    """Tests for validate_file_type on genuine and bogus ``.docx`` files."""

    def test_valid_docx(self, tmp_path: Path) -> None:
        url = "https://example.com/valid.docx"
        docx_path = _create_docx(tmp_path / "valid.docx", paragraphs=["Hello"])
        parser_block = _make_file_parser_block(url, FileType.DOCX)
        # The test passes as long as no exception escapes this call.
        parser_block.validate_file_type(url, str(docx_path))

    def test_plain_text_with_docx_extension(self, tmp_path: Path) -> None:
        url = "https://example.com/fake.docx"
        fake_path = tmp_path / "fake.docx"
        fake_path.write_text("This is plain text, not a DOCX file.")
        parser_block = _make_file_parser_block(url, FileType.DOCX)
        with pytest.raises(InvalidFileType):
            parser_block.validate_file_type(url, str(fake_path))

    def test_empty_file(self, tmp_path: Path) -> None:
        url = "https://example.com/empty.docx"
        empty_path = tmp_path / "empty.docx"
        empty_path.write_bytes(b"")
        parser_block = _make_file_parser_block(url, FileType.DOCX)
        with pytest.raises(InvalidFileType):
            parser_block.validate_file_type(url, str(empty_path))
@pytest.mark.asyncio
class TestParseDocxFile:
    """Tests for the text that _parse_docx_file extracts from paragraphs and tables."""

    async def test_paragraphs_joined_by_newline(self, tmp_path: Path) -> None:
        docx_path = _create_docx(tmp_path / "paras.docx", paragraphs=["Hello", "World"])
        parser_block = _make_file_parser_block("https://example.com/paras.docx", FileType.DOCX)
        extracted = await parser_block._parse_docx_file(str(docx_path))
        assert extracted == "Hello\nWorld"

    async def test_empty_paragraphs_skipped(self, tmp_path: Path) -> None:
        # The empty and whitespace-only paragraphs must not show up in the output.
        texts = ["Hello", "", " ", "World"]
        docx_path = _create_docx(tmp_path / "blanks.docx", paragraphs=texts)
        parser_block = _make_file_parser_block("https://example.com/blanks.docx", FileType.DOCX)
        extracted = await parser_block._parse_docx_file(str(docx_path))
        assert extracted == "Hello\nWorld"

    async def test_table_rows_formatted_with_pipe(self, tmp_path: Path) -> None:
        rows = [["Name", "Age"], ["Alice", "30"]]
        docx_path = _create_docx(tmp_path / "table.docx", table_rows=rows)
        parser_block = _make_file_parser_block("https://example.com/table.docx", FileType.DOCX)
        extracted = await parser_block._parse_docx_file(str(docx_path))
        assert extracted == "Name | Age\nAlice | 30"

    async def test_mixed_paragraphs_and_tables(self, tmp_path: Path) -> None:
        rows = [["Col1", "Col2"], ["A", "B"]]
        docx_path = _create_docx(tmp_path / "mixed.docx", paragraphs=["Intro"], table_rows=rows)
        parser_block = _make_file_parser_block("https://example.com/mixed.docx", FileType.DOCX)
        extracted = await parser_block._parse_docx_file(str(docx_path))
        assert extracted == "Intro\nCol1 | Col2\nA | B"

    async def test_empty_document(self, tmp_path: Path) -> None:
        docx_path = _create_docx(tmp_path / "empty.docx")
        parser_block = _make_file_parser_block("https://example.com/empty.docx", FileType.DOCX)
        extracted = await parser_block._parse_docx_file(str(docx_path))
        assert extracted == ""

    async def test_empty_table_cells_skipped(self, tmp_path: Path) -> None:
        rows = [["Name", "", "Age"], ["", "", ""]]
        docx_path = _create_docx(tmp_path / "sparse.docx", table_rows=rows)
        parser_block = _make_file_parser_block("https://example.com/sparse.docx", FileType.DOCX)
        extracted = await parser_block._parse_docx_file(str(docx_path))
        # First row: "Name" and "Age" (empty cell skipped), second row: all empty -> skipped
        assert extracted == "Name | Age"

    async def test_multiple_tables(self, tmp_path: Path) -> None:
        # Built by hand because the shared helper only ever adds one table.
        document = docx.Document()
        first_table = document.add_table(rows=1, cols=2)
        first_table.rows[0].cells[0].text = "T1C1"
        first_table.rows[0].cells[1].text = "T1C2"
        second_table = document.add_table(rows=1, cols=2)
        second_table.rows[0].cells[0].text = "T2C1"
        second_table.rows[0].cells[1].text = "T2C2"
        docx_path = tmp_path / "multi_table.docx"
        document.save(str(docx_path))

        parser_block = _make_file_parser_block("https://example.com/multi_table.docx", FileType.DOCX)
        extracted = await parser_block._parse_docx_file(str(docx_path))
        assert extracted == "T1C1 | T1C2\nT2C1 | T2C2"
@pytest.mark.asyncio
class TestParseDocxFileTokenTruncation:
    """Tests for how _parse_docx_file behaves when given a small ``max_tokens``."""

    async def test_paragraphs_truncated(self, tmp_path: Path) -> None:
        # 100 paragraphs are far more than a 20-token budget is expected to hold.
        paragraphs = [f"This is paragraph number {i} with some text content." for i in range(100)]
        docx_path = _create_docx(tmp_path / "long.docx", paragraphs=paragraphs)
        parser_block = _make_file_parser_block("https://example.com/long.docx", FileType.DOCX)

        extracted = await parser_block._parse_docx_file(str(docx_path), max_tokens=20)

        kept_lines = extracted.split("\n")
        assert len(kept_lines) < len(paragraphs)
        # Every line that is kept must begin like one of the generated paragraphs.
        for kept_line in kept_lines:
            assert kept_line.startswith("This is paragraph number")

    async def test_tables_truncated(self, tmp_path: Path) -> None:
        table_rows = [[f"R{i}C1", f"R{i}C2", f"R{i}C3"] for i in range(100)]
        docx_path = _create_docx(tmp_path / "big_table.docx", table_rows=table_rows)
        parser_block = _make_file_parser_block("https://example.com/big_table.docx", FileType.DOCX)

        extracted = await parser_block._parse_docx_file(str(docx_path), max_tokens=20)

        kept_lines = extracted.split("\n")
        assert len(kept_lines) < len(table_rows)

    async def test_tables_skipped_when_paragraphs_exhaust_budget(self, tmp_path: Path) -> None:
        paragraphs = [f"Long paragraph {i} with lots of content to fill tokens." for i in range(100)]
        table_rows = [["Should", "Not", "Appear"]]
        docx_path = _create_docx(tmp_path / "para_heavy.docx", paragraphs=paragraphs, table_rows=table_rows)
        parser_block = _make_file_parser_block("https://example.com/para_heavy.docx", FileType.DOCX)

        extracted = await parser_block._parse_docx_file(str(docx_path), max_tokens=20)

        # None of the table's cell texts may appear in the output.
        for cell_text in ("Should", "Not", "Appear"):
            assert cell_text not in extracted
@pytest.mark.asyncio
class TestParseDocxFileErrorHandling:
    """Tests that _parse_docx_file raises InvalidFileType for unreadable input."""

    async def test_corrupt_file(self, tmp_path: Path) -> None:
        # Arbitrary bytes stored under a .docx name.
        corrupt_path = tmp_path / "corrupt.docx"
        corrupt_path.write_bytes(b"\x00\x01\x02\x03random bytes")
        parser_block = _make_file_parser_block("https://example.com/corrupt.docx", FileType.DOCX)
        with pytest.raises(InvalidFileType):
            await parser_block._parse_docx_file(str(corrupt_path))

    async def test_nonexistent_file(self, tmp_path: Path) -> None:
        # This path is never created on disk.
        missing_path = tmp_path / "nonexistent.docx"
        parser_block = _make_file_parser_block("https://example.com/missing.docx", FileType.DOCX)
        with pytest.raises(InvalidFileType):
            await parser_block._parse_docx_file(str(missing_path))