# Skyvern/tests/unit/test_cdp_download_interceptor.py
#
# 179 lines
# 8 KiB
# Python

"""Unit tests for CDPDownloadInterceptor pure functions (no browser needed)."""
import time
from skyvern.webeye.cdp_download_interceptor import extract_filename, is_download_response
class TestIsDownloadResponse:
    """Exercise is_download_response() across headers, status codes and resource types."""

    def test_attachment_header(self) -> None:
        """An attachment disposition on a CSV response is expected to count as a download."""
        response_headers = {"content-disposition": 'attachment; filename="report.csv"', "content-type": "text/csv"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is True

    def test_attachment_header_case_insensitive(self) -> None:
        """A capitalised 'Attachment' disposition value is expected to be recognised as well."""
        response_headers = {"content-disposition": 'Attachment; filename="report.csv"', "content-type": "text/csv"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is True

    def test_download_mime_type_pdf(self) -> None:
        """application/pdf alone (no disposition header) is expected to count as a download."""
        response_headers = {"content-type": "application/pdf"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is True

    def test_download_mime_type_zip(self) -> None:
        """application/zip alone is expected to count as a download."""
        response_headers = {"content-type": "application/zip"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is True

    def test_download_mime_type_xlsx(self) -> None:
        """The OOXML spreadsheet MIME type alone is expected to count as a download."""
        response_headers = {
            "content-type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        }
        verdict = is_download_response(response_headers, 200)
        assert verdict is True

    def test_download_mime_type_octet_stream(self) -> None:
        """application/octet-stream alone is expected to count as a download."""
        response_headers = {"content-type": "application/octet-stream"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is True

    def test_download_mime_type_with_charset(self) -> None:
        """A charset parameter after the MIME type must not prevent download detection."""
        response_headers = {"content-type": "application/pdf; charset=utf-8"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is True

    def test_html_not_download(self) -> None:
        """text/html is expected to be rejected."""
        response_headers = {"content-type": "text/html"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is False

    def test_json_not_download(self) -> None:
        """application/json is expected to be rejected."""
        response_headers = {"content-type": "application/json"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is False

    def test_json_with_attachment_not_download(self) -> None:
        """JSON is expected to be rejected even when sent with Content-Disposition: attachment."""
        response_headers = {
            "content-disposition": "attachment",
            "content-type": "application/json",
        }
        verdict = is_download_response(response_headers, 200)
        assert verdict is False

    def test_xml_not_download(self) -> None:
        """application/xml is expected to be rejected."""
        response_headers = {"content-type": "application/xml"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is False

    def test_grpc_not_download(self) -> None:
        """application/grpc is expected to be rejected."""
        response_headers = {"content-type": "application/grpc"}
        verdict = is_download_response(response_headers, 200)
        assert verdict is False

    def test_empty_headers_not_download(self) -> None:
        """A response carrying no headers at all is expected to be rejected."""
        no_headers: dict[str, str] = {}
        verdict = is_download_response(no_headers, 200)
        assert verdict is False

    # Resource type filtering

    def test_xhr_resource_type_not_download(self) -> None:
        """resource_type "XHR" is expected to be rejected despite download-like headers."""
        response_headers = {"content-disposition": "attachment", "content-type": "application/octet-stream"}
        verdict = is_download_response(response_headers, 200, resource_type="XHR")
        assert verdict is False

    def test_fetch_resource_type_not_download(self) -> None:
        """resource_type "Fetch" is expected to be rejected despite an attachment disposition."""
        response_headers = {"content-disposition": "attachment"}
        verdict = is_download_response(response_headers, 200, resource_type="Fetch")
        assert verdict is False

    def test_font_resource_type_not_download(self) -> None:
        """resource_type "Font" is expected to be rejected even for octet-stream content."""
        response_headers = {"content-type": "application/octet-stream"}
        verdict = is_download_response(response_headers, 200, resource_type="Font")
        assert verdict is False

    def test_stylesheet_resource_type_not_download(self) -> None:
        """resource_type "Stylesheet" is expected to be rejected even for octet-stream content."""
        response_headers = {"content-type": "application/octet-stream"}
        verdict = is_download_response(response_headers, 200, resource_type="Stylesheet")
        assert verdict is False

    def test_script_resource_type_not_download(self) -> None:
        """resource_type "Script" is expected to be rejected even for octet-stream content."""
        response_headers = {"content-type": "application/octet-stream"}
        verdict = is_download_response(response_headers, 200, resource_type="Script")
        assert verdict is False

    def test_image_resource_type_not_download(self) -> None:
        """resource_type "Image" is expected to be rejected even for octet-stream content."""
        response_headers = {"content-type": "application/octet-stream"}
        verdict = is_download_response(response_headers, 200, resource_type="Image")
        assert verdict is False

    def test_document_resource_type_is_download(self) -> None:
        """Document resource type (link click) should allow download detection."""
        response_headers = {"content-disposition": "attachment", "content-type": "application/pdf"}
        verdict = is_download_response(response_headers, 200, resource_type="Document")
        assert verdict is True

    def test_empty_resource_type_is_download(self) -> None:
        """An empty resource_type string must not block detection of a PDF download."""
        response_headers = {"content-type": "application/pdf"}
        verdict = is_download_response(response_headers, 200, resource_type="")
        assert verdict is True

    def test_error_status_code_not_download(self) -> None:
        """A 404 status is expected to be rejected even with download-like headers."""
        response_headers = {"content-disposition": "attachment", "content-type": "application/pdf"}
        verdict = is_download_response(response_headers, 404)
        assert verdict is False

    def test_server_error_not_download(self) -> None:
        """A 500 status is expected to be rejected even for octet-stream content."""
        response_headers = {"content-type": "application/octet-stream"}
        verdict = is_download_response(response_headers, 500)
        assert verdict is False
class TestExtractFilename:
    """Tests for extract_filename().

    Each test calls extract_filename(headers, url, index) and pins the
    returned name for one source of the filename: the Content-Disposition
    header, the URL path, or the generated fallback.
    """

    def test_rfc5987_filename_star(self) -> None:
        """A percent-encoded filename* value is expected to come back decoded."""
        headers = {"content-disposition": "attachment; filename*=UTF-8''my%20report%282024%29.pdf"}
        result = extract_filename(headers, "https://example.com/download", 1)
        assert result == "my report(2024).pdf"

    def test_regular_filename(self) -> None:
        """A quoted filename parameter is expected to come back without its quotes."""
        headers = {"content-disposition": 'attachment; filename="report.csv"'}
        result = extract_filename(headers, "https://example.com/download", 1)
        assert result == "report.csv"

    def test_unquoted_filename(self) -> None:
        """An unquoted filename parameter is expected to be accepted as-is."""
        headers = {"content-disposition": "attachment; filename=report.csv"}
        result = extract_filename(headers, "https://example.com/download", 1)
        assert result == "report.csv"

    def test_filename_star_takes_priority(self) -> None:
        """When both filename and filename* are present, filename* is expected to win."""
        headers = {
            "content-disposition": "attachment; filename=\"fallback.csv\"; filename*=UTF-8''preferred.csv",
        }
        result = extract_filename(headers, "https://example.com/download", 1)
        assert result == "preferred.csv"

    def test_url_path_fallback(self) -> None:
        """With no headers, the last URL path segment is expected to be used."""
        headers: dict[str, str] = {}
        result = extract_filename(headers, "https://example.com/files/document.pdf", 1)
        assert result == "document.pdf"

    def test_url_path_with_encoded_chars(self) -> None:
        """Percent-encoding in the URL path segment is expected to be decoded."""
        headers: dict[str, str] = {}
        result = extract_filename(headers, "https://example.com/files/my%20report.xlsx", 1)
        assert result == "my report.xlsx"

    def test_url_path_no_extension_uses_fallback(self) -> None:
        """A URL path segment without an extension is expected to yield a generated name."""
        headers: dict[str, str] = {}
        result = extract_filename(headers, "https://example.com/download", 1)
        assert result.startswith("download_")

    def test_fallback_format(self) -> None:
        """The generated fallback name is expected to be download_{timestamp}_{index}."""
        headers: dict[str, str] = {}
        # Bracket the call so the embedded timestamp can be range-checked.
        before = int(time.time())
        result = extract_filename(headers, "https://example.com/api/export", 42)
        after = int(time.time())

        # Should be download_{timestamp}_{index}
        parts = result.split("_")
        # Check the shape first so a malformed name fails with a readable
        # message instead of an IndexError/ValueError on the lines below.
        assert len(parts) == 3, f"unexpected fallback filename: {result!r}"
        assert parts[0] == "download"
        assert before <= int(parts[1]) <= after
        assert parts[2] == "42"

    def test_empty_content_disposition(self) -> None:
        """An empty Content-Disposition value is expected to fall back to the URL path."""
        headers = {"content-disposition": ""}
        result = extract_filename(headers, "https://example.com/files/data.csv", 1)
        assert result == "data.csv"

    def test_content_disposition_inline(self) -> None:
        """inline disposition without filename should fall back to URL."""
        headers = {"content-disposition": "inline"}
        result = extract_filename(headers, "https://example.com/files/report.pdf", 1)
        assert result == "report.pdf"

    def test_path_traversal_stripped(self) -> None:
        """Path traversal segments are returned verbatim by extract_filename().

        Despite the test name, nothing is stripped at this layer: the
        assertion pins the raw header value, traversal segments included.
        """
        headers = {"content-disposition": 'attachment; filename="../../etc/cron.d/evil"'}
        result = extract_filename(headers, "https://example.com/download", 1)
        # extract_filename returns the raw name; sanitization is done in _handle_download.
        # But verify the raw output so tests document the behavior.
        assert result == "../../etc/cron.d/evil"