mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2026-04-28 03:30:10 +00:00
fix(SKY-8537): deduplicate downloaded files by checksum (#5595)
Some checks are pending
Auto Create GitHub Release on Version Change / check-version-change (push) Waiting to run
Auto Create GitHub Release on Version Change / create-release (push) Blocked by required conditions
Run tests and pre-commit / Run tests and pre-commit hooks (push) Waiting to run
Run tests and pre-commit / Frontend Lint and Build (push) Waiting to run
Publish Fern Docs / run (push) Waiting to run
Build Skyvern SDK and publish to PyPI / check-version-change (push) Waiting to run
Build Skyvern SDK and publish to PyPI / run-ci (push) Blocked by required conditions
Build Skyvern SDK and publish to PyPI / build-sdk (push) Blocked by required conditions
This commit is contained in: parent b4cceab6c1, commit e35400f429
2 changed files with 151 additions and 2 deletions
@@ -61,6 +61,7 @@ from skyvern.experimentation.wait_utils import get_or_create_wait_config, get_wa
 from skyvern.forge import app
 from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.api.files import (
+    calculate_sha256_for_file,
     check_downloading_files_and_wait_for_download_to_complete,
     get_download_dir,
     list_files_in_directory,
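The only change in this hunk is the new calculate_sha256_for_file import. Its body is not part of this diff; as a rough, hypothetical sketch only (assuming the helper hashes the file in fixed-size chunks — the real implementation in skyvern.forge.sdk.api.files may differ in signature and details), it could look like:

    import hashlib

    def calculate_sha256_for_file(path: str, chunk_size: int = 1024 * 1024) -> str:
        # Hypothetical sketch, not Skyvern's actual code: hash the file in
        # chunks so large downloads never load fully into memory.
        sha256 = hashlib.sha256()
        with open(path, "rb") as f:
            # iter() with a b"" sentinel stops reading at EOF.
            for chunk in iter(lambda: f.read(chunk_size), b""):
                sha256.update(chunk)
        return sha256.hexdigest()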
@@ -522,9 +523,31 @@ class ActionHandler:
             timeout=task.download_timeout or BROWSER_DOWNLOAD_TIMEOUT,
         )

-        # Calculate newly downloaded file names
+        # Calculate newly downloaded file names and deduplicate local files by checksum.
+        # A single click can trigger multiple identical downloads (e.g., when an <a> click
+        # event bubbles to a parent <tr onclick> that opens the same URL).
+        # Only local files are deduplicated — remote URIs (s3://, azure://) from browser
+        # sessions are passed through as-is since we cannot hash or remove them locally.
         new_file_paths = set(list_files_after) - set(list_files_before)
-        downloaded_file_names = [os.path.basename(fp) for fp in new_file_paths]
+        seen_checksums: dict[str, str] = {}
+        deduplicated_paths: list[str] = []
+        for fp in sorted(new_file_paths):
+            if not os.path.isfile(fp):
+                deduplicated_paths.append(fp)
+                continue
+            checksum = calculate_sha256_for_file(fp)
+            if checksum in seen_checksums:
+                LOG.info(
+                    "Removing duplicate downloaded file from single action",
+                    file=os.path.basename(fp),
+                    duplicate_of=os.path.basename(seen_checksums[checksum]),
+                    checksum=checksum,
+                )
+                os.remove(fp)
+            else:
+                seen_checksums[checksum] = fp
+                deduplicated_paths.append(fp)
+        downloaded_file_names = [os.path.basename(fp) for fp in deduplicated_paths]
         if downloaded_file_names:
             results[-1].downloaded_files = downloaded_file_names
             action.downloaded_files = downloaded_file_names
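A note on the design: iterating over sorted(new_file_paths) makes the deduplication deterministic and tends to keep the original filename rather than a browser-suffixed copy, because "." sorts before "_" in ASCII, so the unsuffixed name is visited first and registered as the kept copy. A quick standalone illustration (not part of the diff):

    # "doc.pdf" is visited first and recorded in seen_checksums; the
    # suffixed copies then match its checksum and are removed.
    paths = ["doc_2.pdf", "doc.pdf", "doc_1.pdf"]
    print(sorted(paths))  # ['doc.pdf', 'doc_1.pdf', 'doc_2.pdf']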
tests/unit/test_download_dedup.py (new file, 126 lines)
@@ -0,0 +1,126 @@
"""Tests for per-action duplicate file deduplication.

When a single click triggers multiple identical downloads (e.g., due to event
bubbling on pages where both a <tr onclick> and a child <a href> point to the
same URL), the action handler should deduplicate newly downloaded files by
checksum so only one copy is kept.
"""

from __future__ import annotations

import os

from skyvern.forge.sdk.api.files import calculate_sha256_for_file


def _write_file(directory: str, name: str, content: bytes) -> str:
    path = os.path.join(directory, name)
    with open(path, "wb") as f:
        f.write(content)
    return path


def _deduplicate_new_files(new_file_paths: set[str]) -> list[str]:
    """Replicate the dedup logic from ActionHandler._handle_action_for_download."""
    seen_checksums: dict[str, str] = {}
    deduplicated: list[str] = []
    for fp in sorted(new_file_paths):
        if not os.path.isfile(fp):
            deduplicated.append(fp)
            continue
        checksum = calculate_sha256_for_file(fp)
        if checksum in seen_checksums:
            os.remove(fp)
        else:
            seen_checksums[checksum] = fp
            deduplicated.append(fp)
    return deduplicated


def test_duplicate_files_are_deduplicated(tmp_path):
    """Two files with identical content should keep only one."""
    dir_ = str(tmp_path)
    a = _write_file(dir_, "report.xlsx", b"identical content")
    b = _write_file(dir_, "report_1.xlsx", b"identical content")

    result = _deduplicate_new_files({a, b})

    assert len(result) == 1
    assert len(os.listdir(dir_)) == 1


def test_duplicate_file_is_removed_from_disk(tmp_path):
    """The duplicate file should be deleted from the local download directory."""
    dir_ = str(tmp_path)
    a = _write_file(dir_, "file_a.pdf", b"same bytes")
    b = _write_file(dir_, "file_b.pdf", b"same bytes")

    _deduplicate_new_files({a, b})

    assert len(os.listdir(dir_)) == 1


def test_different_files_are_not_deduplicated(tmp_path):
    """Files with different content should both be kept."""
    dir_ = str(tmp_path)
    a = _write_file(dir_, "invoice_jan.xlsx", b"january data")
    b = _write_file(dir_, "invoice_feb.xlsx", b"february data")

    result = _deduplicate_new_files({a, b})

    assert len(result) == 2
    assert len(os.listdir(dir_)) == 2


def test_three_duplicates_keeps_only_one(tmp_path):
    """Three identical files should keep one and delete two."""
    dir_ = str(tmp_path)
    a = _write_file(dir_, "doc.pdf", b"triplicate")
    b = _write_file(dir_, "doc_1.pdf", b"triplicate")
    c = _write_file(dir_, "doc_2.pdf", b"triplicate")

    result = _deduplicate_new_files({a, b, c})

    assert len(result) == 1
    assert len(os.listdir(dir_)) == 1


def test_mixed_unique_and_duplicate_files(tmp_path):
    """Mix of unique and duplicate files: only duplicates are removed."""
    dir_ = str(tmp_path)
    a = _write_file(dir_, "unique_a.xlsx", b"content A")
    b = _write_file(dir_, "unique_b.xlsx", b"content B")
    c = _write_file(dir_, "duplicate_of_a.xlsx", b"content A")

    result = _deduplicate_new_files({a, b, c})

    assert len(result) == 2
    assert len(os.listdir(dir_)) == 2


def test_original_filename_kept_over_suffixed_duplicate(tmp_path):
    """Sorted order keeps the original name over the _1 suffixed copy."""
    dir_ = str(tmp_path)
    a = _write_file(dir_, "Alignment-Feb-2026.xlsx", b"same content")
    b = _write_file(dir_, "Alignment-Feb-2026_1.xlsx", b"same content")

    result = _deduplicate_new_files({a, b})

    assert len(result) == 1
    kept = os.path.basename(result[0])
    assert kept == "Alignment-Feb-2026.xlsx"
    assert os.listdir(dir_) == ["Alignment-Feb-2026.xlsx"]


def test_remote_uris_are_passed_through(tmp_path):
    """S3/Azure URIs from browser sessions should not be hashed or removed."""
    dir_ = str(tmp_path)
    local = _write_file(dir_, "local.xlsx", b"local content")
    remote = "s3://skyvern-artifacts/v1/production/org/browser_sessions/bs_123/file.xlsx"

    result = _deduplicate_new_files({local, remote})

    assert len(result) == 2
    assert remote in result
    assert local in result
    assert len(os.listdir(dir_)) == 1  # only the local file on disk
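The tests rely only on pytest's built-in tmp_path fixture and the real calculate_sha256_for_file helper; assuming a standard development checkout with pytest available, they should run with:

    pytest tests/unit/test_download_dedup.py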