mirror of
https://github.com/Skyvern-AI/skyvern.git
synced 2025-09-13 00:39:40 +00:00
Implement FileURLParserBlock and FILE_URL WorkflowParameterType (#559)
This commit is contained in:
parent
8be94d7928
commit
6929a1d24d
7 changed files with 135 additions and 19 deletions
|
@ -9,10 +9,18 @@ import structlog
|
||||||
|
|
||||||
from skyvern.constants import REPO_ROOT_DIR
|
from skyvern.constants import REPO_ROOT_DIR
|
||||||
from skyvern.exceptions import DownloadFileMaxSizeExceeded
|
from skyvern.exceptions import DownloadFileMaxSizeExceeded
|
||||||
|
from skyvern.forge.sdk.api.aws import AsyncAWSClient
|
||||||
|
|
||||||
LOG = structlog.get_logger()
|
LOG = structlog.get_logger()
|
||||||
|
|
||||||
|
|
||||||
|
async def download_from_s3(client: AsyncAWSClient, s3_uri: str) -> str:
|
||||||
|
downloaded_bytes = await client.download_file(uri=s3_uri)
|
||||||
|
file_path = tempfile.NamedTemporaryFile(delete=False)
|
||||||
|
file_path.write(downloaded_bytes)
|
||||||
|
return file_path.name
|
||||||
|
|
||||||
|
|
||||||
async def download_file(url: str, max_size_mb: int | None = None) -> str:
|
async def download_file(url: str, max_size_mb: int | None = None) -> str:
|
||||||
try:
|
try:
|
||||||
async with aiohttp.ClientSession(raise_for_status=True) as session:
|
async with aiohttp.ClientSession(raise_for_status=True) as session:
|
||||||
|
|
|
@ -255,16 +255,21 @@ class WorkflowRunContext:
|
||||||
old_value=parameter.value,
|
old_value=parameter.value,
|
||||||
new_value=value,
|
new_value=value,
|
||||||
)
|
)
|
||||||
if not isinstance(value, dict):
|
if not isinstance(value, dict) and not isinstance(value, list):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"ContextParameter can't depend on an OutputParameter with a non-dict value. "
|
f"ContextParameter can only depend on an OutputParameter with a dict or list value. "
|
||||||
f"ContextParameter key: {parameter.key}, "
|
f"ContextParameter key: {parameter.key}, "
|
||||||
f"OutputParameter key: {output_parameter.key}, "
|
f"OutputParameter key: {output_parameter.key}, "
|
||||||
f"OutputParameter value: {value}"
|
f"OutputParameter value: {value}"
|
||||||
)
|
)
|
||||||
parameter.value = value.get(parameter.key)
|
if isinstance(value, dict):
|
||||||
self.parameters[parameter.key] = parameter
|
parameter.value = value.get(parameter.key)
|
||||||
self.values[parameter.key] = parameter.value
|
self.parameters[parameter.key] = parameter
|
||||||
|
self.values[parameter.key] = parameter.value
|
||||||
|
else:
|
||||||
|
parameter.value = value
|
||||||
|
self.parameters[parameter.key] = parameter
|
||||||
|
self.values[parameter.key] = parameter.value
|
||||||
|
|
||||||
async def register_block_parameters(
|
async def register_block_parameters(
|
||||||
self,
|
self,
|
||||||
|
|
|
@ -75,3 +75,11 @@ class ContextParameterSourceNotDefined(BaseWorkflowHTTPException):
|
||||||
f"Source parameter key {source_key} for context parameter {context_parameter_key} does not exist.",
|
f"Source parameter key {source_key} for context parameter {context_parameter_key} does not exist.",
|
||||||
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
|
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidFileType(BaseWorkflowHTTPException):
|
||||||
|
def __init__(self, file_url: str, file_type: str, error: str) -> None:
|
||||||
|
super().__init__(
|
||||||
|
f"File URL {file_url} is not a valid {file_type} file. Error: {error}",
|
||||||
|
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
|
||||||
|
)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import abc
|
import abc
|
||||||
|
import csv
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import smtplib
|
import smtplib
|
||||||
|
@ -25,12 +26,16 @@ from skyvern.exceptions import (
|
||||||
from skyvern.forge import app
|
from skyvern.forge import app
|
||||||
from skyvern.forge.prompts import prompt_engine
|
from skyvern.forge.prompts import prompt_engine
|
||||||
from skyvern.forge.sdk.api.aws import AsyncAWSClient
|
from skyvern.forge.sdk.api.aws import AsyncAWSClient
|
||||||
from skyvern.forge.sdk.api.files import download_file, get_path_for_workflow_download_directory
|
from skyvern.forge.sdk.api.files import download_file, download_from_s3, get_path_for_workflow_download_directory
|
||||||
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
|
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
|
||||||
from skyvern.forge.sdk.schemas.tasks import TaskOutput, TaskStatus
|
from skyvern.forge.sdk.schemas.tasks import TaskOutput, TaskStatus
|
||||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||||
from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
|
from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
|
||||||
from skyvern.forge.sdk.workflow.exceptions import InvalidEmailClientConfiguration, NoValidEmailRecipient
|
from skyvern.forge.sdk.workflow.exceptions import (
|
||||||
|
InvalidEmailClientConfiguration,
|
||||||
|
InvalidFileType,
|
||||||
|
NoValidEmailRecipient,
|
||||||
|
)
|
||||||
from skyvern.forge.sdk.workflow.models.parameter import (
|
from skyvern.forge.sdk.workflow.models.parameter import (
|
||||||
PARAMETER_TYPE,
|
PARAMETER_TYPE,
|
||||||
AWSSecretParameter,
|
AWSSecretParameter,
|
||||||
|
@ -50,6 +55,7 @@ class BlockType(StrEnum):
|
||||||
DOWNLOAD_TO_S3 = "download_to_s3"
|
DOWNLOAD_TO_S3 = "download_to_s3"
|
||||||
UPLOAD_TO_S3 = "upload_to_s3"
|
UPLOAD_TO_S3 = "upload_to_s3"
|
||||||
SEND_EMAIL = "send_email"
|
SEND_EMAIL = "send_email"
|
||||||
|
FILE_URL_PARSER = "file_url_parser"
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
@ -353,9 +359,8 @@ class ForLoopBlock(Block):
|
||||||
return list(parameters)
|
return list(parameters)
|
||||||
|
|
||||||
def get_loop_block_context_parameters(self, workflow_run_id: str, loop_data: Any) -> list[ContextParameter]:
|
def get_loop_block_context_parameters(self, workflow_run_id: str, loop_data: Any) -> list[ContextParameter]:
|
||||||
if not isinstance(loop_data, dict):
|
if not isinstance(loop_data, dict) and not isinstance(loop_data, list):
|
||||||
# TODO (kerem): Should we add support for other types?
|
raise ValueError("loop_data should be a dict or a list.")
|
||||||
raise ValueError("loop_data should be a dict")
|
|
||||||
|
|
||||||
context_parameters = []
|
context_parameters = []
|
||||||
for loop_block in self.loop_blocks:
|
for loop_block in self.loop_blocks:
|
||||||
|
@ -369,13 +374,19 @@ class ForLoopBlock(Block):
|
||||||
for context_parameter in context_parameters:
|
for context_parameter in context_parameters:
|
||||||
if context_parameter.source.key != self.loop_over.key:
|
if context_parameter.source.key != self.loop_over.key:
|
||||||
continue
|
continue
|
||||||
if context_parameter.key not in loop_data:
|
# If the loop_data is a dict, we need to check if the key exists in the loop_data
|
||||||
raise ContextParameterValueNotFound(
|
if isinstance(loop_data, dict):
|
||||||
parameter_key=context_parameter.key,
|
if context_parameter.key in loop_data:
|
||||||
existing_keys=list(loop_data.keys()),
|
context_parameter.value = loop_data[context_parameter.key]
|
||||||
workflow_run_id=workflow_run_id,
|
else:
|
||||||
)
|
raise ContextParameterValueNotFound(
|
||||||
context_parameter.value = loop_data[context_parameter.key]
|
parameter_key=context_parameter.key,
|
||||||
|
existing_keys=list(loop_data.keys()),
|
||||||
|
workflow_run_id=workflow_run_id,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# If the loop_data is a list, we can directly assign the loop_data to the context_parameter value
|
||||||
|
context_parameter.value = loop_data
|
||||||
|
|
||||||
return context_parameters
|
return context_parameters
|
||||||
|
|
||||||
|
@ -859,7 +870,7 @@ class SendEmailBlock(Block):
|
||||||
path = None
|
path = None
|
||||||
try:
|
try:
|
||||||
if filename.startswith("s3://"):
|
if filename.startswith("s3://"):
|
||||||
path = await self._download_from_s3(filename)
|
path = await download_from_s3(self.get_async_aws_client(), filename)
|
||||||
elif filename.startswith("http://") or filename.startswith("https://"):
|
elif filename.startswith("http://") or filename.startswith("https://"):
|
||||||
path = await download_file(filename)
|
path = await download_file(filename)
|
||||||
else:
|
else:
|
||||||
|
@ -947,6 +958,69 @@ class SendEmailBlock(Block):
|
||||||
return self.build_block_result(success=True, output_parameter_value=result_dict)
|
return self.build_block_result(success=True, output_parameter_value=result_dict)
|
||||||
|
|
||||||
|
|
||||||
|
class FileType(StrEnum):
|
||||||
|
CSV = "csv"
|
||||||
|
|
||||||
|
|
||||||
|
class FileParserBlock(Block):
|
||||||
|
block_type: Literal[BlockType.FILE_URL_PARSER] = BlockType.FILE_URL_PARSER
|
||||||
|
|
||||||
|
file_url: str
|
||||||
|
file_type: FileType
|
||||||
|
|
||||||
|
def get_all_parameters(
|
||||||
|
self,
|
||||||
|
workflow_run_id: str,
|
||||||
|
) -> list[PARAMETER_TYPE]:
|
||||||
|
workflow_run_context = self.get_workflow_run_context(workflow_run_id)
|
||||||
|
if self.file_url and workflow_run_context.has_parameter(self.file_url):
|
||||||
|
return [workflow_run_context.get_parameter(self.file_url)]
|
||||||
|
return []
|
||||||
|
|
||||||
|
def validate_file_type(self, file_url_used: str, file_path: str) -> None:
|
||||||
|
if self.file_type == FileType.CSV:
|
||||||
|
try:
|
||||||
|
with open(file_path, "r") as file:
|
||||||
|
csv.Sniffer().sniff(file.read(1024))
|
||||||
|
except csv.Error as e:
|
||||||
|
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||||
|
|
||||||
|
async def execute(self, workflow_run_id: str, **kwargs: dict) -> BlockResult:
|
||||||
|
workflow_run_context = self.get_workflow_run_context(workflow_run_id)
|
||||||
|
file_url_to_use = self.file_url
|
||||||
|
if (
|
||||||
|
self.file_url
|
||||||
|
and workflow_run_context.has_parameter(self.file_url)
|
||||||
|
and workflow_run_context.has_value(self.file_url)
|
||||||
|
):
|
||||||
|
file_url_parameter_value = workflow_run_context.get_value(self.file_url)
|
||||||
|
if file_url_parameter_value:
|
||||||
|
LOG.info(
|
||||||
|
"FileParserBlock: File URL is parameterized, using parameter value",
|
||||||
|
file_url_parameter_value=file_url_parameter_value,
|
||||||
|
file_url_parameter_key=self.file_url,
|
||||||
|
)
|
||||||
|
file_url_to_use = file_url_parameter_value
|
||||||
|
|
||||||
|
# Download the file
|
||||||
|
if file_url_to_use.startswith("s3://"):
|
||||||
|
file_path = await download_from_s3(self.get_async_aws_client(), file_url_to_use)
|
||||||
|
else:
|
||||||
|
file_path = await download_file(file_url_to_use)
|
||||||
|
# Validate the file type
|
||||||
|
self.validate_file_type(file_url_to_use, file_path)
|
||||||
|
# Parse the file into a list of dictionaries where each dictionary represents a row in the file
|
||||||
|
parsed_data = []
|
||||||
|
with open(file_path, "r") as file:
|
||||||
|
if self.file_type == FileType.CSV:
|
||||||
|
reader = csv.DictReader(file)
|
||||||
|
for row in reader:
|
||||||
|
parsed_data.append(row)
|
||||||
|
# Record the parsed data
|
||||||
|
await self.record_output_parameter_value(workflow_run_context, workflow_run_id, parsed_data)
|
||||||
|
return self.build_block_result(success=True, output_parameter_value=parsed_data)
|
||||||
|
|
||||||
|
|
||||||
BlockSubclasses = Union[
|
BlockSubclasses = Union[
|
||||||
ForLoopBlock,
|
ForLoopBlock,
|
||||||
TaskBlock,
|
TaskBlock,
|
||||||
|
@ -955,5 +1029,6 @@ BlockSubclasses = Union[
|
||||||
DownloadToS3Block,
|
DownloadToS3Block,
|
||||||
UploadToS3Block,
|
UploadToS3Block,
|
||||||
SendEmailBlock,
|
SendEmailBlock,
|
||||||
|
FileParserBlock,
|
||||||
]
|
]
|
||||||
BlockTypeVar = Annotated[BlockSubclasses, Field(discriminator="block_type")]
|
BlockTypeVar = Annotated[BlockSubclasses, Field(discriminator="block_type")]
|
||||||
|
|
|
@ -67,6 +67,7 @@ class WorkflowParameterType(StrEnum):
|
||||||
FLOAT = "float"
|
FLOAT = "float"
|
||||||
BOOLEAN = "boolean"
|
BOOLEAN = "boolean"
|
||||||
JSON = "json"
|
JSON = "json"
|
||||||
|
FILE_URL = "file_url"
|
||||||
|
|
||||||
def convert_value(self, value: str | None) -> str | int | float | bool | dict | list | None:
|
def convert_value(self, value: str | None) -> str | int | float | bool | dict | list | None:
|
||||||
if value is None:
|
if value is None:
|
||||||
|
@ -81,6 +82,8 @@ class WorkflowParameterType(StrEnum):
|
||||||
return value.lower() in ["true", "1"]
|
return value.lower() in ["true", "1"]
|
||||||
elif self == WorkflowParameterType.JSON:
|
elif self == WorkflowParameterType.JSON:
|
||||||
return json.loads(value)
|
return json.loads(value)
|
||||||
|
elif self == WorkflowParameterType.FILE_URL:
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
class WorkflowParameter(Parameter):
|
class WorkflowParameter(Parameter):
|
||||||
|
|
|
@ -4,7 +4,7 @@ from typing import Annotated, Any, Literal
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
from skyvern.forge.sdk.schemas.tasks import ProxyLocation
|
from skyvern.forge.sdk.schemas.tasks import ProxyLocation
|
||||||
from skyvern.forge.sdk.workflow.models.block import BlockType
|
from skyvern.forge.sdk.workflow.models.block import BlockType, FileType
|
||||||
from skyvern.forge.sdk.workflow.models.parameter import ParameterType, WorkflowParameterType
|
from skyvern.forge.sdk.workflow.models.parameter import ParameterType, WorkflowParameterType
|
||||||
|
|
||||||
|
|
||||||
|
@ -162,6 +162,13 @@ class SendEmailBlockYAML(BlockYAML):
|
||||||
file_attachments: list[str] | None = None
|
file_attachments: list[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class FileParserBlockYAML(BlockYAML):
|
||||||
|
block_type: Literal[BlockType.FILE_URL_PARSER] = BlockType.FILE_URL_PARSER # type: ignore
|
||||||
|
|
||||||
|
file_url: str
|
||||||
|
file_type: FileType
|
||||||
|
|
||||||
|
|
||||||
PARAMETER_YAML_SUBCLASSES = (
|
PARAMETER_YAML_SUBCLASSES = (
|
||||||
AWSSecretParameterYAML
|
AWSSecretParameterYAML
|
||||||
| BitwardenLoginCredentialParameterYAML
|
| BitwardenLoginCredentialParameterYAML
|
||||||
|
@ -179,6 +186,7 @@ BLOCK_YAML_SUBCLASSES = (
|
||||||
| DownloadToS3BlockYAML
|
| DownloadToS3BlockYAML
|
||||||
| UploadToS3BlockYAML
|
| UploadToS3BlockYAML
|
||||||
| SendEmailBlockYAML
|
| SendEmailBlockYAML
|
||||||
|
| FileParserBlockYAML
|
||||||
)
|
)
|
||||||
BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]
|
BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@ from skyvern.forge.sdk.workflow.models.block import (
|
||||||
BlockTypeVar,
|
BlockTypeVar,
|
||||||
CodeBlock,
|
CodeBlock,
|
||||||
DownloadToS3Block,
|
DownloadToS3Block,
|
||||||
|
FileParserBlock,
|
||||||
ForLoopBlock,
|
ForLoopBlock,
|
||||||
SendEmailBlock,
|
SendEmailBlock,
|
||||||
TaskBlock,
|
TaskBlock,
|
||||||
|
@ -1052,4 +1053,12 @@ class WorkflowService:
|
||||||
file_attachments=block_yaml.file_attachments or [],
|
file_attachments=block_yaml.file_attachments or [],
|
||||||
continue_on_failure=block_yaml.continue_on_failure,
|
continue_on_failure=block_yaml.continue_on_failure,
|
||||||
)
|
)
|
||||||
|
elif block_yaml.block_type == BlockType.FILE_URL_PARSER:
|
||||||
|
return FileParserBlock(
|
||||||
|
label=block_yaml.label,
|
||||||
|
output_parameter=output_parameter,
|
||||||
|
file_url=block_yaml.file_url,
|
||||||
|
file_type=block_yaml.file_type,
|
||||||
|
continue_on_failure=block_yaml.continue_on_failure,
|
||||||
|
)
|
||||||
raise ValueError(f"Invalid block type {block_yaml.block_type}")
|
raise ValueError(f"Invalid block type {block_yaml.block_type}")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue