mirror of
https://github.com/unslothai/unsloth.git
synced 2026-04-28 03:19:57 +00:00
feat(studio): multi-file unstructured seed upload with better backend extraction (#4468)
* fix(recipe-studio): prevent fitView from zooming to wrong location on recipe load * feat: add pymupdf/python-docx deps and unstructured uploads storage root * feat: add POST /seed/upload-unstructured-file endpoint * feat: add multi-file chunking with source_file column * feat: update frontend types and API layer for multi-file upload * feat: round-robin preview rows across source files Ensures every uploaded file is represented in the preview table by cycling through sources instead of just taking the first N rows. * fix: disable OCR, fix auto-load timing, fix persistence on reload - Disable pymupdf4llm OCR with write_images=False, show_progress=False - Replace onAllUploaded callback with useEffect that detects uploading→done transition (avoids stale closure reading empty file IDs) - Fix importer to preserve file IDs from saved recipes instead of clearing (clearing only happens at share time via sanitizeSeedForShare) * fix: harden unstructured upload with input validation and state fixes Validate block_id/file_id with alphanumeric regex to prevent path traversal, use exact stem match for file deletion, add error handling for metadata writes and empty files, fix React stale closures and object mutations in upload loop, and correct validation logic for unstructured seed resolved_paths. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix: address PR review - legacy path import, share sanitizer, sync effect Promote legacy source.path into resolved_paths for old unstructured recipes, clear source.paths in share sanitizer to prevent leaking local filesystem paths, and gate file sync effect to dialog open transition so users can actually delete all uploaded files. 
* fix: CSV column fix (BOM + whitespace + unnamed index re-save) for #4470 * fix: harden unstructured upload flow and polish dialog UX * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
parent
f113f3511d
commit
dd283b0605
49 changed files with 1216 additions and 315 deletions
|
|
@ -9,7 +9,7 @@ from __future__ import annotations
|
|||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
|
||||
class RecipePayload(BaseModel):
|
||||
|
|
@ -76,13 +76,41 @@ class SeedInspectRequest(BaseModel):
|
|||
|
||||
|
||||
class SeedInspectUploadRequest(BaseModel):
|
||||
filename: str = Field(min_length = 1)
|
||||
content_base64: str = Field(min_length = 1)
|
||||
# Legacy single-file flow (mutually exclusive with file_ids)
|
||||
filename: str | None = None
|
||||
content_base64: str | None = None
|
||||
# Multi-file flow (mutually exclusive with content_base64)
|
||||
block_id: str | None = None
|
||||
file_ids: list[str] | None = None
|
||||
file_names: list[str] | None = None
|
||||
# Shared fields
|
||||
preview_size: int = Field(default = 10, ge = 1, le = 50)
|
||||
seed_source_type: str | None = None
|
||||
unstructured_chunk_size: int | None = Field(default = None, ge = 1, le = 20000)
|
||||
unstructured_chunk_overlap: int | None = Field(default = None, ge = 0, le = 20000)
|
||||
|
||||
@model_validator(mode = "after")
|
||||
def _check_mutual_exclusivity(self) -> "SeedInspectUploadRequest":
|
||||
has_legacy = self.content_base64 is not None
|
||||
has_multi = self.file_ids is not None
|
||||
if has_legacy and has_multi:
|
||||
raise ValueError("Provide either content_base64 or file_ids, not both")
|
||||
if not has_legacy and not has_multi:
|
||||
raise ValueError("Provide either content_base64 or file_ids")
|
||||
if has_multi:
|
||||
if len(self.file_ids) == 0:
|
||||
raise ValueError("file_ids must not be empty")
|
||||
if not self.block_id:
|
||||
raise ValueError("block_id is required when using file_ids")
|
||||
if self.file_names is None or len(self.file_ids) != len(self.file_names):
|
||||
raise ValueError(
|
||||
"file_names must be provided and same length as file_ids"
|
||||
)
|
||||
if has_legacy:
|
||||
if not self.filename:
|
||||
raise ValueError("filename is required when using content_base64")
|
||||
return self
|
||||
|
||||
|
||||
class SeedInspectResponse(BaseModel):
|
||||
dataset_name: str
|
||||
|
|
@ -91,6 +119,15 @@ class SeedInspectResponse(BaseModel):
|
|||
preview_rows: list[dict[str, Any]] = Field(default_factory = list)
|
||||
split: str | None = None
|
||||
subset: str | None = None
|
||||
resolved_paths: list[str] | None = None
|
||||
|
||||
|
||||
class UnstructuredFileUploadResponse(BaseModel):
|
||||
file_id: str
|
||||
filename: str
|
||||
size_bytes: int
|
||||
status: str # "ok" or "error"
|
||||
error: str | None = None
|
||||
|
||||
|
||||
class McpToolsListRequest(BaseModel):
|
||||
|
|
|
|||
|
|
@ -13,6 +13,9 @@ requires-python = ">=3.11"
|
|||
dependencies = [
|
||||
"data-designer-engine>=0.5.1,<0.6",
|
||||
"pandas>=2,<3",
|
||||
"pymupdf>=1.24.0",
|
||||
"pymupdf4llm>=0.0.17",
|
||||
"mammoth>=1.8.0",
|
||||
]
|
||||
|
||||
[project.entry-points."data_designer.plugins"]
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@ import re
|
|||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from utils.paths import ensure_dir, unstructured_seed_cache_root
|
||||
|
||||
DEFAULT_CHUNK_SIZE = 1200
|
||||
|
|
@ -59,6 +61,59 @@ def build_unstructured_preview_rows(
|
|||
]
|
||||
|
||||
|
||||
def build_multi_file_preview_rows(
    *,
    file_entries: list[tuple[Path, str]],
    preview_size: int,
    chunk_size: int | None,
    chunk_overlap: int | None,
) -> list[dict[str, str]]:
    """Chunk several files and return a preview that samples across all of them.

    Args:
        file_entries: ``(extracted_txt_path, original_filename)`` pairs.
        preview_size: Maximum number of rows to return.
        chunk_size: Requested chunk size; passed through ``_to_int`` with
            ``DEFAULT_CHUNK_SIZE`` as the fallback.
        chunk_overlap: Requested overlap; passed through ``_to_int`` with
            ``DEFAULT_CHUNK_OVERLAP`` as the fallback.

    Returns:
        Up to ``preview_size`` rows picked round-robin by source file.
    """
    effective_size = _to_int(chunk_size, DEFAULT_CHUNK_SIZE)
    effective_overlap = _to_int(chunk_overlap, DEFAULT_CHUNK_OVERLAP)

    # Only the rows are needed here; the parquet path is irrelevant for a preview.
    materialized = materialize_multi_file_unstructured_seed(
        file_entries = file_entries,
        chunk_size = effective_size,
        chunk_overlap = effective_overlap,
    )
    all_rows = materialized[1]
    return _round_robin_preview(all_rows, preview_size)
|
||||
|
||||
|
||||
def _round_robin_preview(
    rows: list[dict[str, str]],
    preview_size: int,
) -> list[dict[str, str]]:
    """Pick preview rows round-robin across source files so every file is represented.

    Rows are grouped by their ``source_file`` value (rows without one share the
    ``""`` group). Groups are visited in order of first appearance and one row
    is taken from each group per pass until ``preview_size`` rows are collected
    or every group is exhausted.

    Args:
        rows: Chunk rows, each normally carrying a ``source_file`` key.
        preview_size: Maximum number of rows to return.

    Returns:
        At most ``preview_size`` rows; ``[]`` when ``rows`` is empty or
        ``preview_size`` is not positive.
    """
    if not rows or preview_size <= 0:
        return []

    # Plain dicts keep insertion order, so groups stay in first-appearance order.
    grouped: dict[str, list[dict[str, str]]] = {}
    for row in rows:
        grouped.setdefault(row.get("source_file", ""), []).append(row)

    # A private sentinel marks exhaustion so no row value can be mistaken for it.
    exhausted = object()
    pending = [iter(group) for group in grouped.values()]
    picked: list[dict[str, str]] = []

    while pending and len(picked) < preview_size:
        still_open = []
        for group_iter in pending:
            if len(picked) >= preview_size:
                break
            candidate = next(group_iter, exhausted)
            if candidate is exhausted:
                continue
            picked.append(candidate)
            still_open.append(group_iter)
        pending = still_open

    return picked
|
||||
|
||||
|
||||
def materialize_unstructured_seed_dataset(
|
||||
*,
|
||||
source_path: Path,
|
||||
|
|
@ -103,6 +158,43 @@ def materialize_unstructured_seed_dataset(
|
|||
return parquet_path, rows
|
||||
|
||||
|
||||
def materialize_multi_file_unstructured_seed(
    *,
    file_entries: list[tuple[Path, str]],  # (extracted_txt_path, original_filename)
    chunk_size: int,
    chunk_overlap: int,
) -> tuple[Path, list[dict[str, str]]]:
    """Chunk multiple files into one parquet dataset with a ``source_file`` column.

    The result is cached under ``_CACHE_DIR`` using a key derived from the file
    entries and the resolved chunk settings; a cache hit is read back instead of
    re-chunking.

    Args:
        file_entries: ``(extracted_txt_path, original_filename)`` pairs.
        chunk_size: Requested chunk size (normalised by ``resolve_chunking``).
        chunk_overlap: Requested overlap (normalised by ``resolve_chunking``).

    Returns:
        ``(parquet_path, rows)`` where each row has ``chunk_text`` and
        ``source_file``.

    Raises:
        ValueError: If no file produced any chunk.
    """
    chunk_size, chunk_overlap = resolve_chunking(chunk_size, chunk_overlap)
    digest = _compute_multi_file_cache_key(file_entries, chunk_size, chunk_overlap)
    parquet_path = _CACHE_DIR / f"{digest}.parquet"

    if parquet_path.exists():
        cached_rows = pd.read_parquet(parquet_path).to_dict(orient = "records")
        return parquet_path, cached_rows

    combined: list[dict[str, str]] = [
        {"chunk_text": piece, "source_file": original_name}
        for extracted_path, original_name in file_entries
        for piece in split_text_into_chunks(
            text = load_unstructured_text_file(extracted_path),
            chunk_size = chunk_size,
            chunk_overlap = chunk_overlap,
        )
    ]
    if not combined:
        raise ValueError("No text found in any uploaded files.")

    frame = pd.DataFrame(combined)
    ensure_dir(_CACHE_DIR)
    # Write to a sibling temp file first, then rename it onto the final path.
    staging = _CACHE_DIR / f"{digest}.tmp.parquet"
    frame.to_parquet(staging, index = False)
    staging.replace(parquet_path)
    return parquet_path, combined
|
||||
|
||||
|
||||
def load_unstructured_text_file(path: Path) -> str:
|
||||
ext = path.suffix.lower()
|
||||
if ext not in {".txt", ".md"}:
|
||||
|
|
@ -193,3 +285,17 @@ def _compute_cache_key(
|
|||
]
|
||||
).encode("utf-8")
|
||||
return hashlib.sha256(payload).hexdigest()
|
||||
|
||||
|
||||
def _compute_multi_file_cache_key(
    file_entries: list[tuple[Path, str]],
    chunk_size: int,
    chunk_overlap: int,
) -> str:
    """Return a SHA-256 hex digest identifying a set of files plus chunk settings.

    Entries are ordered by their display name before hashing. Each entry
    contributes its path, size, modification time (ns) and display name, and
    the chunk settings are appended as a final line.
    """

    def _fingerprint(entry: tuple[Path, str]) -> str:
        file_path, display_name = entry
        info = file_path.stat()
        return f"{file_path}|{info.st_size}|{info.st_mtime_ns}|{display_name}"

    ordered = sorted(file_entries, key = lambda entry: entry[1])
    lines = [_fingerprint(entry) for entry in ordered]
    lines.append(f"cs={chunk_size}|co={chunk_overlap}")
    return hashlib.sha256("\n".join(lines).encode()).hexdigest()
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from __future__ import annotations
|
|||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
from pydantic import Field, field_validator, model_validator
|
||||
|
||||
from data_designer.config.seed_source import SeedSource
|
||||
|
||||
|
|
@ -15,27 +15,37 @@ from .chunking import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, resolve_chunkin
|
|||
|
||||
class UnstructuredSeedSource(SeedSource):
|
||||
seed_type: Literal["unstructured"] = "unstructured"
|
||||
path: str = Field(..., min_length = 1)
|
||||
paths: list[str] = Field(min_length = 1)
|
||||
|
||||
@model_validator(mode = "before")
|
||||
@classmethod
|
||||
def _normalize_legacy_path(cls, data):
|
||||
if isinstance(data, dict) and "paths" not in data and data.get("path"):
|
||||
data = dict(data)
|
||||
data["paths"] = [data["path"]]
|
||||
return data
|
||||
|
||||
chunk_size: int = DEFAULT_CHUNK_SIZE
|
||||
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
|
||||
|
||||
@field_validator("path", mode = "after")
|
||||
@field_validator("paths")
|
||||
@classmethod
|
||||
def _validate_path(cls, value: str) -> str:
|
||||
path = Path(value).expanduser()
|
||||
if not path.is_file():
|
||||
raise ValueError(f"Unstructured seed path is not a file: {path}")
|
||||
return value
|
||||
def _validate_paths(cls, v: list[str]) -> list[str]:
|
||||
for p in v:
|
||||
expanded = Path(p).expanduser()
|
||||
if not expanded.is_file():
|
||||
raise ValueError(f"Seed file does not exist: {expanded}")
|
||||
return v
|
||||
|
||||
@field_validator("chunk_size", mode = "after")
|
||||
@field_validator("chunk_size")
|
||||
@classmethod
|
||||
def _validate_chunk_size(cls, value: int) -> int:
|
||||
size, _ = resolve_chunking(value, 0)
|
||||
return size
|
||||
def _resolve_chunk_size(cls, v: int) -> int:
|
||||
cs, _ = resolve_chunking(v, 0)
|
||||
return cs
|
||||
|
||||
@field_validator("chunk_overlap", mode = "after")
|
||||
@field_validator("chunk_overlap")
|
||||
@classmethod
|
||||
def _validate_chunk_overlap(cls, value: int, info) -> int:
|
||||
size = info.data.get("chunk_size", cls.model_fields["chunk_size"].default)
|
||||
_, overlap = resolve_chunking(size, value)
|
||||
return overlap
|
||||
def _resolve_chunk_overlap(cls, v: int, info) -> int:
|
||||
cs = info.data.get("chunk_size", DEFAULT_CHUNK_SIZE)
|
||||
_, co = resolve_chunking(cs, v)
|
||||
return co
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ from pathlib import Path
|
|||
import data_designer.lazy_heavy_imports as lazy
|
||||
from data_designer.engine.resources.seed_reader import SeedReader
|
||||
|
||||
from .chunking import materialize_unstructured_seed_dataset
|
||||
from .config import UnstructuredSeedSource
|
||||
|
||||
|
||||
|
|
@ -17,8 +16,25 @@ class UnstructuredSeedReader(SeedReader[UnstructuredSeedSource]):
|
|||
return lazy.duckdb.connect()
|
||||
|
||||
def get_dataset_uri(self) -> str:
|
||||
path, _ = materialize_unstructured_seed_dataset(
|
||||
source_path = Path(self.source.path),
|
||||
from .chunking import materialize_multi_file_unstructured_seed
|
||||
import json as json_mod
|
||||
|
||||
file_entries: list[tuple[Path, str]] = []
|
||||
for p in self.source.paths:
|
||||
path_obj = Path(p)
|
||||
file_id = path_obj.name.replace(".extracted.txt", "")
|
||||
meta_path = path_obj.parent / f"{file_id}.meta.json"
|
||||
orig_name = path_obj.name
|
||||
if meta_path.exists():
|
||||
try:
|
||||
meta = json_mod.loads(meta_path.read_text())
|
||||
orig_name = meta.get("original_filename", path_obj.name)
|
||||
except (json_mod.JSONDecodeError, OSError):
|
||||
pass
|
||||
file_entries.append((path_obj, orig_name))
|
||||
|
||||
path, _ = materialize_multi_file_unstructured_seed(
|
||||
file_entries = file_entries,
|
||||
chunk_size = self.source.chunk_size,
|
||||
chunk_overlap = self.source.chunk_overlap,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -17,3 +17,6 @@ ruff<1,>=0.14.10
|
|||
scipy<2,>=1.11.0
|
||||
sqlfluff<4,>=3.2.0
|
||||
tiktoken<1,>=0.8.0
|
||||
pymupdf>=1.24.0
|
||||
pymupdf4llm>=0.0.17
|
||||
mammoth>=1.8.0
|
||||
|
|
|
|||
|
|
@ -7,23 +7,27 @@ from __future__ import annotations
|
|||
|
||||
import base64
|
||||
import binascii
|
||||
import json
|
||||
import re
|
||||
from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File as FastAPIFile, Form
|
||||
from data_designer_unstructured_seed.chunking import (
|
||||
build_unstructured_preview_rows,
|
||||
normalize_unstructured_text,
|
||||
resolve_chunking,
|
||||
)
|
||||
from core.data_recipe.jsonable import to_preview_jsonable
|
||||
from utils.paths import ensure_dir, seed_uploads_root
|
||||
from utils.paths import ensure_dir, seed_uploads_root, unstructured_uploads_root
|
||||
|
||||
from models.data_recipe import (
|
||||
SeedInspectRequest,
|
||||
SeedInspectResponse,
|
||||
SeedInspectUploadRequest,
|
||||
UnstructuredFileUploadResponse,
|
||||
)
|
||||
|
||||
router = APIRouter()
|
||||
|
|
@ -31,8 +35,21 @@ router = APIRouter()
|
|||
DATA_EXTS = (".parquet", ".jsonl", ".json", ".csv")
|
||||
DEFAULT_SPLIT = "train"
|
||||
LOCAL_UPLOAD_EXTS = {".csv", ".json", ".jsonl"}
|
||||
UNSTRUCTURED_UPLOAD_EXTS = {".txt", ".md"}
|
||||
UNSTRUCTURED_ALLOWED_EXTS = {".pdf", ".docx", ".txt", ".md"}
|
||||
SEED_UPLOAD_DIR = seed_uploads_root()
|
||||
UNSTRUCTURED_UPLOAD_ROOT = unstructured_uploads_root()
|
||||
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||
MAX_TOTAL_SIZE = 100 * 1024 * 1024 # 100MB
|
||||
|
||||
_SAFE_ID_RE = re.compile(r"^[a-zA-Z0-9_-]+$")


def _validate_safe_id(value: str, label: str) -> str:
    """Return ``value`` if it is a safe identifier, otherwise raise HTTP 400.

    A safe identifier is non-empty and consists only of ASCII letters, digits,
    underscores and dashes. Callers join these IDs onto upload directories, so
    anything else (path separators, dots, whitespace) must be rejected.

    Args:
        value: The identifier to check.
        label: Name of the field, used in the error message.

    Raises:
        HTTPException: 400 when ``value`` is empty or contains other characters.
    """
    # fullmatch, not match: "$" also matches just before a trailing newline,
    # so match() would let an ID such as "abc\n" through.
    if not value or not _SAFE_ID_RE.fullmatch(value):
        raise HTTPException(
            400, f"Invalid {label}: must be alphanumeric/dash/underscore only"
        )
    return value
|
||||
|
||||
|
||||
def _serialize_preview_value(value: Any) -> Any:
|
||||
|
|
@ -177,7 +194,17 @@ def _read_preview_rows_from_local_file(
|
|||
ext = path.suffix.lower()
|
||||
try:
|
||||
if ext == ".csv":
|
||||
df = pd.read_csv(path, nrows = preview_size)
|
||||
df = pd.read_csv(path, nrows = preview_size, encoding = "utf-8-sig")
|
||||
df.columns = df.columns.str.strip()
|
||||
unnamed = [c for c in df.columns if c == "" or c.startswith("Unnamed:")]
|
||||
if unnamed:
|
||||
df = df.drop(columns = unnamed)
|
||||
full_df = pd.read_csv(path, encoding = "utf-8-sig")
|
||||
full_df.columns = full_df.columns.str.strip()
|
||||
full_df = full_df.drop(columns = unnamed)
|
||||
tmp_csv = path.with_suffix(".tmp.csv")
|
||||
full_df.to_csv(tmp_csv, index = False, encoding = "utf-8")
|
||||
tmp_csv.replace(path)
|
||||
elif ext == ".jsonl":
|
||||
df = pd.read_json(path, lines = True).head(preview_size)
|
||||
elif ext == ".json":
|
||||
|
|
@ -220,6 +247,36 @@ def _read_preview_rows_from_unstructured_file(
|
|||
return _serialize_preview_rows(rows)
|
||||
|
||||
|
||||
def _read_preview_rows_from_multi_files(
    *,
    block_id: str,
    file_ids: list[str],
    file_names: list[str],
    preview_size: int,
    chunk_size: int | None,
    chunk_overlap: int | None,
) -> list[dict[str, str]]:
    """Build preview rows from previously uploaded files of one block.

    Each file ID is resolved to ``<upload root>/<block_id>/<file_id>.extracted.txt``
    and paired with its display name before chunking.

    Args:
        block_id: Upload block the files belong to.
        file_ids: IDs of the uploaded files.
        file_names: Display names, positionally matching ``file_ids``.
        preview_size: Maximum number of preview rows.
        chunk_size: Optional chunk size override.
        chunk_overlap: Optional chunk overlap override.

    Raises:
        HTTPException: 400 for an unsafe ID or mismatched list lengths, 404 when
            the extracted text of a file is missing.
    """
    from data_designer_unstructured_seed.chunking import build_multi_file_preview_rows

    _validate_safe_id(block_id, "block_id")
    # zip() would silently drop the surplus of the longer list; fail loudly instead.
    if len(file_ids) != len(file_names):
        raise HTTPException(400, "file_names must be the same length as file_ids")

    block_dir = UNSTRUCTURED_UPLOAD_ROOT / block_id
    file_entries: list[tuple[Path, str]] = []
    for fid, fname in zip(file_ids, file_names):
        # The ID becomes part of a filesystem path, so validate it here as well
        # rather than relying on every caller having done so.
        _validate_safe_id(fid, "file_id")
        extracted = block_dir / f"{fid}.extracted.txt"
        if not extracted.exists():
            raise HTTPException(
                404, f"Extracted text not found for file: {fname} (id: {fid})"
            )
        file_entries.append((extracted, fname))

    return build_multi_file_preview_rows(
        file_entries = file_entries,
        preview_size = preview_size,
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap,
    )
|
||||
|
||||
|
||||
@router.post("/seed/inspect", response_model = SeedInspectResponse)
|
||||
def inspect_seed_dataset(payload: SeedInspectRequest) -> SeedInspectResponse:
|
||||
dataset_name = payload.dataset_name.strip()
|
||||
|
|
@ -306,14 +363,202 @@ def inspect_seed_dataset(payload: SeedInspectRequest) -> SeedInspectResponse:
|
|||
)
|
||||
|
||||
|
||||
def _extract_text_from_file(file_path: Path, ext: str) -> str:
    """Extract text from an uploaded file, converting to markdown where possible.

    Args:
        file_path: Location of the raw upload on disk.
        ext: Lower-cased file extension including the dot
            (``.txt``, ``.md``, ``.pdf`` or ``.docx``).

    Returns:
        The extracted text after ``normalize_unstructured_text``.

    Raises:
        ValueError: If ``ext`` is not one of the supported extensions.
    """
    if ext in {".txt", ".md"}:
        # utf-8-sig decodes plain UTF-8 too, but drops a leading byte-order
        # mark so it does not end up inside the extracted text.
        raw = file_path.read_text(encoding = "utf-8-sig", errors = "ignore")
    elif ext == ".pdf":
        # Imported lazily so the dependency is only loaded for PDF uploads.
        import pymupdf4llm

        raw = pymupdf4llm.to_markdown(
            str(file_path), write_images = False, show_progress = False
        )
    elif ext == ".docx":
        # Imported lazily so the dependency is only loaded for DOCX uploads.
        import mammoth

        with file_path.open("rb") as f:
            result = mammoth.convert_to_markdown(f)
        raw = result.value
    else:
        raise ValueError(f"Unsupported file type: {ext}")

    return normalize_unstructured_text(raw)
|
||||
|
||||
|
||||
def _get_block_total_size(block_dir: Path, file_ids: list[str]) -> int:
    """Sum raw upload sizes for tracked file IDs only.

    Derived artefacts (``*.extracted.txt`` and ``*.meta.json``) and anything
    that is not a regular file are ignored. A file counts when the part of its
    name before the first dot is one of ``file_ids``.

    Returns:
        Total size in bytes, or ``0`` when the directory is missing or no IDs
        are tracked.
    """
    if not file_ids or not block_dir.exists():
        return 0

    tracked = set(file_ids)
    derived_suffixes = (".extracted.txt", ".meta.json")
    return sum(
        entry.stat().st_size
        for entry in block_dir.iterdir()
        if entry.is_file()
        and not entry.name.endswith(derived_suffixes)
        and entry.name.split(".")[0] in tracked
    )
|
||||
|
||||
|
||||
@router.post("/seed/upload-unstructured-file")
async def upload_unstructured_file(
    file: UploadFile = FastAPIFile(...),
    block_id: str = Form(...),
    existing_file_ids: str = Form(""),
) -> UnstructuredFileUploadResponse:
    """Store one uploaded document for a block and extract its text.

    The raw upload is written to ``<upload root>/<block_id>/<file_id><ext>``,
    its extracted text to ``<file_id>.extracted.txt`` and a small metadata
    record to ``<file_id>.meta.json``.

    Args:
        file: The uploaded document.
        block_id: Safe identifier of the block the file belongs to.
        existing_file_ids: Comma-separated IDs the client reports as already
            uploaded; only these count toward the per-block total size.

    Returns:
        A response with ``status="ok"`` on success, or ``status="error"`` plus
        a message when extraction or metadata persistence fails (any files
        written for this upload are removed in that case).

    Raises:
        HTTPException: 400 for an invalid ``block_id``, an unsupported
            extension or an empty file; 413 when a size limit is exceeded.
    """
    _validate_safe_id(block_id, "block_id")

    tracked_ids = [fid.strip() for fid in existing_file_ids.split(",") if fid.strip()]

    original_filename = file.filename or "upload"
    ext = Path(original_filename).suffix.lower()
    if ext not in UNSTRUCTURED_ALLOWED_EXTS:
        raise HTTPException(
            400,
            f"Unsupported file type: {ext}. Allowed: {', '.join(sorted(UNSTRUCTURED_ALLOWED_EXTS))}",
        )

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    content = await file.read(MAX_FILE_SIZE + 1)
    size_bytes = len(content)

    if size_bytes == 0:
        raise HTTPException(400, "Empty file not allowed")

    if size_bytes > MAX_FILE_SIZE:
        raise HTTPException(
            413, f"File too large. Maximum is {MAX_FILE_SIZE // (1024 * 1024)}MB."
        )

    block_dir = UNSTRUCTURED_UPLOAD_ROOT / block_id
    ensure_dir(block_dir)
    current_total = _get_block_total_size(block_dir, file_ids = tracked_ids)
    if current_total + size_bytes > MAX_TOTAL_SIZE:
        raise HTTPException(
            413, f"Total upload limit ({MAX_TOTAL_SIZE // (1024 * 1024)}MB) exceeded"
        )

    file_id = uuid4().hex
    raw_path = block_dir / f"{file_id}{ext}"
    extracted_path = block_dir / f"{file_id}.extracted.txt"
    raw_path.write_bytes(content)

    def _failure(message: str) -> UnstructuredFileUploadResponse:
        # Remove whatever was written for this upload, then report the error.
        raw_path.unlink(missing_ok = True)
        extracted_path.unlink(missing_ok = True)
        return UnstructuredFileUploadResponse(
            file_id = file_id,
            filename = original_filename,
            size_bytes = size_bytes,
            status = "error",
            error = message,
        )

    # NOTE(review): extraction runs synchronously inside this async handler;
    # confirm that blocking here is acceptable for large PDF/DOCX uploads.
    try:
        extracted_text = _extract_text_from_file(raw_path, ext)
        if not extracted_text or not extracted_text.strip():
            return _failure("No extractable text found in file")
        extracted_path.write_text(extracted_text, encoding = "utf-8")
    except Exception as e:
        return _failure(f"Text extraction failed: {type(e).__name__}: {e}")

    try:
        meta_path = block_dir / f"{file_id}.meta.json"
        meta_path.write_text(
            json.dumps(
                {"original_filename": original_filename, "size_bytes": size_bytes}
            ),
            encoding = "utf-8",
        )
    except OSError:
        return _failure("Failed to save file metadata")

    return UnstructuredFileUploadResponse(
        file_id = file_id,
        filename = original_filename,
        size_bytes = size_bytes,
        status = "ok",
    )
|
||||
|
||||
|
||||
@router.delete("/seed/unstructured-file/{block_id}/{file_id}")
async def remove_unstructured_file(block_id: str, file_id: str):
    """Delete every stored artefact of one uploaded file.

    All entries in the block directory whose name up to the first dot equals
    ``file_id`` are removed. The block directory itself is removed afterwards
    when it is left empty (best effort).

    Raises:
        HTTPException: 400 for unsafe IDs, 404 when the block directory or the
            file does not exist.
    """
    _validate_safe_id(block_id, "block_id")
    _validate_safe_id(file_id, "file_id")

    block_dir = UNSTRUCTURED_UPLOAD_ROOT / block_id
    if not block_dir.exists():
        raise HTTPException(404, "Block not found")

    # Exact stem comparison: one ID must never match another ID's files.
    matches = [
        entry for entry in block_dir.iterdir() if entry.name.split(".")[0] == file_id
    ]
    for entry in matches:
        entry.unlink(missing_ok = True)

    if not matches:
        raise HTTPException(404, "File not found")

    try:
        remaining = next(block_dir.iterdir(), None)
        if remaining is None:
            block_dir.rmdir()
    except OSError:
        # Directory cleanup is opportunistic; the file removal already succeeded.
        pass

    return {"status": "ok"}
|
||||
|
||||
|
||||
@router.post("/seed/inspect-upload", response_model = SeedInspectResponse)
|
||||
def inspect_seed_upload(payload: SeedInspectUploadRequest) -> SeedInspectResponse:
|
||||
if payload.file_ids is not None:
|
||||
if len(payload.file_ids) == 0:
|
||||
raise HTTPException(400, "file_ids must not be empty")
|
||||
_validate_safe_id(payload.block_id, "block_id")
|
||||
for fid in payload.file_ids:
|
||||
_validate_safe_id(fid, "file_id")
|
||||
preview_rows = _read_preview_rows_from_multi_files(
|
||||
block_id = payload.block_id,
|
||||
file_ids = payload.file_ids,
|
||||
file_names = payload.file_names,
|
||||
preview_size = payload.preview_size,
|
||||
chunk_size = payload.unstructured_chunk_size,
|
||||
chunk_overlap = payload.unstructured_chunk_overlap,
|
||||
)
|
||||
columns = ["chunk_text", "source_file"] if preview_rows else []
|
||||
resolved_paths = [
|
||||
str(UNSTRUCTURED_UPLOAD_ROOT / payload.block_id / f"{fid}.extracted.txt")
|
||||
for fid in payload.file_ids
|
||||
]
|
||||
return SeedInspectResponse(
|
||||
dataset_name = "unstructured_seed",
|
||||
resolved_path = resolved_paths[0] if resolved_paths else "",
|
||||
resolved_paths = resolved_paths,
|
||||
columns = columns,
|
||||
preview_rows = _serialize_preview_rows(preview_rows),
|
||||
)
|
||||
|
||||
seed_source_type = _normalize_optional_text(payload.seed_source_type) or "local"
|
||||
filename = _sanitize_filename(payload.filename)
|
||||
ext = Path(filename).suffix.lower()
|
||||
# Legacy single-file unstructured path only supports .txt/.md
|
||||
# PDF/DOCX extraction uses the multi-file upload endpoint instead
|
||||
_LEGACY_UNSTRUCTURED_EXTS = {".txt", ".md"}
|
||||
if seed_source_type == "unstructured":
|
||||
if ext not in UNSTRUCTURED_UPLOAD_EXTS:
|
||||
allowed = ", ".join(sorted(UNSTRUCTURED_UPLOAD_EXTS))
|
||||
if ext not in _LEGACY_UNSTRUCTURED_EXTS:
|
||||
allowed = ", ".join(sorted(_LEGACY_UNSTRUCTURED_EXTS))
|
||||
raise HTTPException(
|
||||
status_code = 400,
|
||||
detail = f"unsupported file type: {ext}. allowed: {allowed}",
|
||||
|
|
@ -329,8 +574,7 @@ def inspect_seed_upload(payload: SeedInspectUploadRequest) -> SeedInspectRespons
|
|||
file_bytes = _decode_base64_payload(payload.content_base64)
|
||||
if not file_bytes:
|
||||
raise HTTPException(status_code = 400, detail = "empty upload payload")
|
||||
max_size_bytes = 50 * 1024 * 1024
|
||||
if len(file_bytes) > max_size_bytes:
|
||||
if len(file_bytes) > MAX_FILE_SIZE:
|
||||
raise HTTPException(status_code = 413, detail = "file too large (max 50MB)")
|
||||
|
||||
ensure_dir(SEED_UPLOAD_DIR)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ from .storage_roots import (
|
|||
tmp_root,
|
||||
seed_uploads_root,
|
||||
unstructured_seed_cache_root,
|
||||
unstructured_uploads_root,
|
||||
oxc_validator_tmp_root,
|
||||
tensorboard_root,
|
||||
ensure_dir,
|
||||
|
|
@ -47,6 +48,7 @@ __all__ = [
|
|||
"tmp_root",
|
||||
"seed_uploads_root",
|
||||
"unstructured_seed_cache_root",
|
||||
"unstructured_uploads_root",
|
||||
"oxc_validator_tmp_root",
|
||||
"tensorboard_root",
|
||||
"ensure_dir",
|
||||
|
|
|
|||
|
|
@ -54,13 +54,17 @@ def tmp_root() -> Path:
|
|||
|
||||
|
||||
def seed_uploads_root() -> Path:
|
||||
return tmp_root() / "seed-uploads"
|
||||
return datasets_root() / "seed-uploads"
|
||||
|
||||
|
||||
def unstructured_seed_cache_root() -> Path:
|
||||
return tmp_root() / "unstructured-seed-cache"
|
||||
|
||||
|
||||
def unstructured_uploads_root() -> Path:
|
||||
return datasets_root() / "unstructured-uploads"
|
||||
|
||||
|
||||
def oxc_validator_tmp_root() -> Path:
|
||||
return tmp_root() / "oxc-validator"
|
||||
|
||||
|
|
@ -104,6 +108,7 @@ def ensure_studio_directories() -> None:
|
|||
datasets_root,
|
||||
dataset_uploads_root,
|
||||
recipe_datasets_root,
|
||||
unstructured_uploads_root,
|
||||
outputs_root,
|
||||
exports_root,
|
||||
auth_root,
|
||||
|
|
|
|||
|
|
@ -103,13 +103,22 @@ export type SeedInspectRequest = {
|
|||
};
|
||||
|
||||
export type SeedInspectUploadRequest = {
|
||||
filename: string;
|
||||
// base64 payload without data URL prefix
|
||||
content_base64: string;
|
||||
// Legacy single-file
|
||||
filename?: string;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
content_base64?: string;
|
||||
// Multi-file
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
block_id?: string;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
file_ids?: string[];
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
file_names?: string[];
|
||||
// Shared
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
preview_size?: number;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
seed_source_type?: "local" | "unstructured";
|
||||
seed_source_type?: string;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
unstructured_chunk_size?: number;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
|
|
@ -126,6 +135,8 @@ export type SeedInspectResponse = {
|
|||
preview_rows: Record<string, unknown>[];
|
||||
split?: string | null;
|
||||
subset?: string | null;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
resolved_paths?: string[] | null;
|
||||
};
|
||||
|
||||
export type ValidateError = {
|
||||
|
|
@ -372,3 +383,64 @@ export async function streamRecipeJobEvents(options: {
|
|||
}
|
||||
|
||||
// NOTE: preview endpoints removed from harness.
|
||||
|
||||
/** Result of uploading a single unstructured file to the backend. */
type UnstructuredFileUploadResponse = {
  // biome-ignore lint/style/useNamingConvention: api schema
  file_id: string;
  filename: string;
  // biome-ignore lint/style/useNamingConvention: api schema
  size_bytes: number;
  status: "ok" | "error";
  // The backend model declares `error: str | None = None`, so the JSON value
  // may be null as well as a message.
  error?: string | null;
};
|
||||
|
||||
/**
 * Uploads one file for an unstructured seed block.
 *
 * A 413 response is reported as a resolved `status: "error"` result so the
 * caller can show it next to the file; any other non-OK response throws.
 *
 * @param file - The file to upload.
 * @param blockId - Identifier of the seed block the file belongs to.
 * @param signal - Optional abort signal for cancelling the request.
 * @param existingFileIds - IDs already uploaded for this block; sent as a
 *   comma-separated `existing_file_ids` form field when non-empty.
 * @returns The backend's upload response, or a synthesized error result on 413.
 * @throws Error when the response is not OK and not a 413.
 */
export async function uploadUnstructuredFile(
  file: File,
  blockId: string,
  signal?: AbortSignal,
  existingFileIds?: string[],
): Promise<UnstructuredFileUploadResponse> {
  const body = new FormData();
  body.append("file", file);
  body.append("block_id", blockId);
  if (existingFileIds?.length) {
    body.append("existing_file_ids", existingFileIds.join(","));
  }

  const endpoint = `${DATA_DESIGNER_API_BASE}/seed/upload-unstructured-file`;
  const res = await authFetch(endpoint, { method: "POST", body, signal });

  if (res.ok) {
    return res.json();
  }

  const tooLarge = res.status === 413;
  const fallback = tooLarge ? "File too large" : "Upload failed";
  // Fall back to the generic message when the error body is not JSON.
  const payload = await res.json().catch(() => ({ detail: fallback }));
  const message = typeof payload.detail === "string" ? payload.detail : fallback;

  if (!tooLarge) {
    throw new Error(message);
  }
  return {
    file_id: "",
    filename: file.name,
    size_bytes: file.size,
    status: "error",
    error: message,
  };
}
|
||||
|
||||
/**
 * Deletes a previously uploaded unstructured file from the backend.
 *
 * A 404 is treated as success, so removing a file that is already gone does
 * not raise.
 *
 * @param blockId - Identifier of the seed block.
 * @param fileId - Identifier of the uploaded file.
 * @throws Error when the backend responds with any other failure status.
 */
export async function removeUnstructuredFile(
  blockId: string,
  fileId: string,
): Promise<void> {
  const url = `${DATA_DESIGNER_API_BASE}/seed/unstructured-file/${encodeURIComponent(blockId)}/${encodeURIComponent(fileId)}`;
  const res = await authFetch(url, { method: "DELETE" });

  const alreadyGone = res.status === 404;
  if (res.ok || alreadyGone) {
    return;
  }
  throw new Error("Failed to remove file");
}
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ import {
|
|||
useUpdateNodeInternals,
|
||||
} from "@xyflow/react";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { getFitNodeIdsIgnoringNotes } from "../../utils/graph/fit-view";
|
||||
import { buildFitViewOptions } from "../../utils/graph/fit-view";
|
||||
|
||||
type LayoutControlsProps = {
|
||||
direction: "LR" | "TB";
|
||||
|
|
@ -36,10 +36,7 @@ export function LayoutControls({
|
|||
requestAnimationFrame(() => {
|
||||
refreshNodeInternals();
|
||||
requestAnimationFrame(() => {
|
||||
fitView({
|
||||
duration: 250,
|
||||
nodes: getFitNodeIdsIgnoringNotes(getNodes()),
|
||||
});
|
||||
fitView(buildFitViewOptions(getNodes()));
|
||||
});
|
||||
});
|
||||
}, [fitView, getNodes, onLayout, refreshNodeInternals]);
|
||||
|
|
@ -51,10 +48,7 @@ export function LayoutControls({
|
|||
requestAnimationFrame(() => {
|
||||
refreshNodeInternals();
|
||||
requestAnimationFrame(() => {
|
||||
fitView({
|
||||
duration: 250,
|
||||
nodes: getFitNodeIdsIgnoringNotes(getNodes()),
|
||||
});
|
||||
fitView(buildFitViewOptions(getNodes()));
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ import { type ReactElement, useCallback } from "react";
|
|||
import { Lock, LockOpen, Maximize2, Minus, Plus } from "lucide-react";
|
||||
import { Panel, useReactFlow } from "@xyflow/react";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { getFitNodeIdsIgnoringNotes } from "../../utils/graph/fit-view";
|
||||
import { buildFitViewOptions } from "../../utils/graph/fit-view";
|
||||
import { RECIPE_FLOATING_ICON_BUTTON_CLASS } from "../recipe-floating-icon-button-class";
|
||||
|
||||
type ViewportControlsProps = {
|
||||
|
|
@ -30,10 +30,7 @@ export function ViewportControls({
|
|||
}, [zoomOut]);
|
||||
|
||||
const handleFitView = useCallback(() => {
|
||||
fitView({
|
||||
duration: 250,
|
||||
nodes: getFitNodeIdsIgnoringNotes(getNodes()),
|
||||
});
|
||||
fitView(buildFitViewOptions(getNodes()));
|
||||
}, [fitView, getNodes]);
|
||||
|
||||
return (
|
||||
|
|
|
|||
|
|
@ -45,7 +45,9 @@ export function InlineSeed({ config, onUpdate }: InlineSeedProps): ReactElement
|
|||
const isLocal = mode === "local";
|
||||
const fileName = isLocal
|
||||
? config.local_file_name?.trim()
|
||||
: config.unstructured_file_name?.trim();
|
||||
: config.unstructured_file_names?.length
|
||||
? `${config.unstructured_file_names.length} file${config.unstructured_file_names.length !== 1 ? "s" : ""}`
|
||||
: undefined;
|
||||
|
||||
return (
|
||||
<div className="corner-squircle flex items-center gap-2 rounded-md border border-border/60 bg-muted/30 px-2 py-2">
|
||||
|
|
|
|||
|
|
@ -256,9 +256,10 @@ function getConfigSummary(config: NodeConfig | undefined): string {
|
|||
}
|
||||
if (
|
||||
seedSourceType === "unstructured" &&
|
||||
config.unstructured_file_name?.trim()
|
||||
config.unstructured_file_names?.length
|
||||
) {
|
||||
return config.unstructured_file_name.trim();
|
||||
const count = config.unstructured_file_names.length;
|
||||
return `${count} file${count !== 1 ? "s" : ""} uploaded`;
|
||||
}
|
||||
if (config.hf_path.trim()) {
|
||||
return config.hf_path.trim();
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ export function ExpressionDialog({
|
|||
value={config.name}
|
||||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Output type"
|
||||
htmlFor={dtypeId}
|
||||
|
|
@ -82,7 +82,7 @@ export function ExpressionDialog({
|
|||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Formula"
|
||||
htmlFor={exprId}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ export function ImportDialog({
|
|||
<DialogHeader>
|
||||
<DialogTitle>Import recipe</DialogTitle>
|
||||
</DialogHeader>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Recipe JSON"
|
||||
htmlFor={payloadId}
|
||||
|
|
|
|||
|
|
@ -215,7 +215,7 @@ export function LlmGeneralTab({
|
|||
</p>
|
||||
</div>
|
||||
) : null}
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Model preset"
|
||||
htmlFor={modelAliasId}
|
||||
|
|
@ -262,7 +262,7 @@ export function LlmGeneralTab({
|
|||
</p>
|
||||
)}
|
||||
{(hasToolProfiles || Boolean(config.tool_alias?.trim())) && (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Tool access (optional)"
|
||||
htmlFor={toolAliasId}
|
||||
|
|
@ -304,7 +304,7 @@ export function LlmGeneralTab({
|
|||
</div>
|
||||
)}
|
||||
{config.llm_type === "code" && (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Code language"
|
||||
htmlFor={codeLangId}
|
||||
|
|
@ -327,7 +327,7 @@ export function LlmGeneralTab({
|
|||
</Select>
|
||||
</div>
|
||||
)}
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Prompt"
|
||||
htmlFor={promptId}
|
||||
|
|
@ -377,7 +377,7 @@ export function LlmGeneralTab({
|
|||
/>
|
||||
</div>
|
||||
{imageContext.enabled && (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Image field"
|
||||
htmlFor={imageContextColumnId}
|
||||
|
|
@ -414,7 +414,7 @@ export function LlmGeneralTab({
|
|||
</div>
|
||||
)}
|
||||
{config.llm_type === "structured" && (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Response format"
|
||||
htmlFor={outputFormatId}
|
||||
|
|
@ -441,7 +441,7 @@ export function LlmGeneralTab({
|
|||
/>
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-3 space-y-4">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Instructions (optional)"
|
||||
htmlFor={systemPromptId}
|
||||
|
|
@ -465,7 +465,7 @@ export function LlmGeneralTab({
|
|||
</p>
|
||||
)}
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Save trace details"
|
||||
htmlFor={traceModeId}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ export function MarkdownNoteDialog({
|
|||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Markdown"
|
||||
htmlFor={markdownId}
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ export function ModelConfigDialog({
|
|||
generation defaults you want to reuse.
|
||||
</p>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Provider connection"
|
||||
htmlFor={providerId}
|
||||
|
|
@ -120,7 +120,7 @@ export function ModelConfigDialog({
|
|||
: "Matching blocks are linked automatically on the canvas."}
|
||||
</p>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Model ID"
|
||||
htmlFor={modelId}
|
||||
|
|
@ -144,7 +144,7 @@ export function ModelConfigDialog({
|
|||
</p>
|
||||
</div>
|
||||
<div className="grid gap-3 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Temperature"
|
||||
htmlFor={tempId}
|
||||
|
|
@ -159,7 +159,7 @@ export function ModelConfigDialog({
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Top-p"
|
||||
htmlFor={topPId}
|
||||
|
|
@ -174,7 +174,7 @@ export function ModelConfigDialog({
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Max tokens"
|
||||
htmlFor={maxTokensId}
|
||||
|
|
@ -189,7 +189,7 @@ export function ModelConfigDialog({
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Timeout (seconds)"
|
||||
htmlFor={timeoutId}
|
||||
|
|
@ -214,7 +214,7 @@ export function ModelConfigDialog({
|
|||
/>
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-3 space-y-4">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Advanced request fields (JSON)"
|
||||
htmlFor={extraBodyId}
|
||||
|
|
|
|||
|
|
@ -52,7 +52,7 @@ export function ModelProviderDialog({
|
|||
service requires one.
|
||||
</p>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Endpoint"
|
||||
htmlFor={endpointId}
|
||||
|
|
@ -66,7 +66,7 @@ export function ModelProviderDialog({
|
|||
onChange={(event) => updateField("endpoint", event.target.value)}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="API key (optional)"
|
||||
htmlFor={apiKeyId}
|
||||
|
|
@ -87,7 +87,7 @@ export function ModelProviderDialog({
|
|||
/>
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-3 space-y-4">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="API key environment variable"
|
||||
htmlFor={apiKeyEnvId}
|
||||
|
|
@ -101,7 +101,7 @@ export function ModelProviderDialog({
|
|||
onChange={(event) => updateField("api_key_env", event.target.value)}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Extra headers (JSON)"
|
||||
htmlFor={extraHeadersId}
|
||||
|
|
@ -115,7 +115,7 @@ export function ModelProviderDialog({
|
|||
onChange={(event) => updateField("extra_headers", event.target.value)}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Extra body (JSON)"
|
||||
htmlFor={extraBodyId}
|
||||
|
|
|
|||
|
|
@ -157,7 +157,7 @@ function DraftInputField({
|
|||
placeholder,
|
||||
}: DraftInputFieldProps): ReactElement {
|
||||
return (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label={label} htmlFor={id} hint={hint} />
|
||||
<Input
|
||||
id={id}
|
||||
|
|
@ -330,7 +330,7 @@ function RunDialogBody({
|
|||
</p>
|
||||
</DialogHeader>
|
||||
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Run type"
|
||||
hint="Start with a quick check or generate the full dataset."
|
||||
|
|
@ -358,7 +358,7 @@ function RunDialogBody({
|
|||
</div>
|
||||
|
||||
{kind === "full" && (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Run name"
|
||||
htmlFor="run-name"
|
||||
|
|
@ -380,7 +380,7 @@ function RunDialogBody({
|
|||
</div>
|
||||
)}
|
||||
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="Records" htmlFor="run-rows" hint={rowHint} />
|
||||
<Input
|
||||
id="run-rows"
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ export function ProcessorsDialog({
|
|||
{schemaProcessor && (
|
||||
<div className="space-y-3">
|
||||
<AvailableVariables configId="" />
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Name"
|
||||
htmlFor={nameId}
|
||||
|
|
@ -106,7 +106,7 @@ export function ProcessorsDialog({
|
|||
onChange={(event) => updateSchema({ name: event.target.value })}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Template (JSON)"
|
||||
htmlFor={templateId}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ export function BernoulliDialog({
|
|||
value={config.name}
|
||||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Probability (p)"
|
||||
htmlFor={pId}
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ export function CategoryDialog({
|
|||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="space-y-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Values"
|
||||
hint="Define allowed categorical values for this column."
|
||||
|
|
@ -127,7 +127,7 @@ export function CategoryDialog({
|
|||
/>
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-2 space-y-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Weights (optional)"
|
||||
hint="Set selection probability per value."
|
||||
|
|
@ -239,7 +239,7 @@ export function CategoryDialog({
|
|||
}}
|
||||
placeholder="Type a conditional value and press Enter"
|
||||
/>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<p className="text-xs font-semibold uppercase text-muted-foreground">
|
||||
Rule weights (optional)
|
||||
</p>
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ export function DatetimeDialog({
|
|||
/>
|
||||
<div className="grid gap-3">
|
||||
<div className="grid gap-2 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Start"
|
||||
htmlFor={startId}
|
||||
|
|
@ -66,7 +66,7 @@ export function DatetimeDialog({
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="End"
|
||||
htmlFor={endId}
|
||||
|
|
@ -83,7 +83,7 @@ export function DatetimeDialog({
|
|||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Unit"
|
||||
htmlFor={unitId}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ export function GaussianDialog({
|
|||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="grid gap-3 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Mean"
|
||||
htmlFor={meanId}
|
||||
|
|
@ -47,7 +47,7 @@ export function GaussianDialog({
|
|||
onChange={(event) => onUpdate({ mean: event.target.value })}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Std"
|
||||
htmlFor={stdId}
|
||||
|
|
@ -62,7 +62,7 @@ export function GaussianDialog({
|
|||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Convert to"
|
||||
htmlFor={convertId}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ export function PersonDialog({
|
|||
<p className="text-sm text-foreground">Faker</p>
|
||||
</div>
|
||||
<div className="grid gap-3 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Locale"
|
||||
htmlFor={localeId}
|
||||
|
|
@ -73,7 +73,7 @@ export function PersonDialog({
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Sex"
|
||||
htmlFor={sexId}
|
||||
|
|
@ -95,7 +95,7 @@ export function PersonDialog({
|
|||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Age range"
|
||||
htmlFor={ageRangeId}
|
||||
|
|
@ -111,7 +111,7 @@ export function PersonDialog({
|
|||
placeholder="18-70"
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="City"
|
||||
htmlFor={cityId}
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ export function SubcategoryDialog({
|
|||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="space-y-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Parent category column"
|
||||
htmlFor={parentSelectId}
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ export function TimedeltaDialog({
|
|||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="grid gap-3 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="dt_min"
|
||||
htmlFor={dtMinId}
|
||||
|
|
@ -59,7 +59,7 @@ export function TimedeltaDialog({
|
|||
onChange={(event) => updateField("dt_min", event.target.value)}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="dt_max"
|
||||
htmlFor={dtMaxId}
|
||||
|
|
@ -74,7 +74,7 @@ export function TimedeltaDialog({
|
|||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Unit"
|
||||
htmlFor={unitId}
|
||||
|
|
@ -98,7 +98,7 @@ export function TimedeltaDialog({
|
|||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Reference datetime column"
|
||||
htmlFor={referenceId}
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ export function UniformDialog({
|
|||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="grid gap-3 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Low"
|
||||
htmlFor={lowId}
|
||||
|
|
@ -47,7 +47,7 @@ export function UniformDialog({
|
|||
onChange={(event) => onUpdate({ low: event.target.value })}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="High"
|
||||
htmlFor={highId}
|
||||
|
|
@ -62,7 +62,7 @@ export function UniformDialog({
|
|||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Convert to"
|
||||
htmlFor={convertId}
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ export function UuidDialog({
|
|||
value={config.name}
|
||||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="UUID format (optional)"
|
||||
htmlFor={uuidId}
|
||||
|
|
|
|||
|
|
@ -37,10 +37,9 @@ import {
|
|||
TabsList,
|
||||
TabsTrigger,
|
||||
} from "@/components/ui/tabs";
|
||||
import mammoth from "mammoth";
|
||||
import { type ReactElement, useCallback, useEffect, useMemo, useRef, useState } from "react";
|
||||
import { extractText, getDocumentProxy } from "unpdf";
|
||||
import { cn } from "@/lib/utils";
|
||||
import { UnstructuredDropZone, type FileEntry } from "./unstructured-drop-zone";
|
||||
import { inspectSeedDataset, inspectSeedUpload } from "../../api";
|
||||
import { resolveImagePreview } from "../../utils/image-preview";
|
||||
import type {
|
||||
|
|
@ -64,7 +63,6 @@ const SELECTION_OPTIONS: Array<{ value: SeedSelectionType; label: string }> = [
|
|||
];
|
||||
|
||||
const LOCAL_ACCEPT = ".csv,.json,.jsonl";
|
||||
const UNSTRUCTURED_ACCEPT = ".txt,.pdf,.docx";
|
||||
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024;
|
||||
const DEFAULT_CHUNK_SIZE = 1200;
|
||||
const DEFAULT_CHUNK_OVERLAP = 200;
|
||||
|
|
@ -112,20 +110,20 @@ function getPreviewEmptyStateCopy(mode: SeedConfig["seed_source_type"]): {
|
|||
} {
|
||||
if (mode === "local") {
|
||||
return {
|
||||
title: "No local preview yet",
|
||||
description: "Choose a CSV/JSON/JSONL file, then click Load to fetch 10 rows.",
|
||||
title: "No preview yet",
|
||||
description: "Upload a CSV, JSON, or JSONL file and click Load to see a sample.",
|
||||
};
|
||||
}
|
||||
if (mode === "unstructured") {
|
||||
return {
|
||||
title: "No chunk preview yet",
|
||||
title: "No preview yet",
|
||||
description:
|
||||
"Choose a TXT/PDF/DOCX file, then click Load to extract + preview chunk_text rows.",
|
||||
"Upload your documents and the preview will appear once processing is done.",
|
||||
};
|
||||
}
|
||||
return {
|
||||
title: "No dataset preview yet",
|
||||
description: "Pick a Hugging Face dataset and click Load to fetch 10 sample rows.",
|
||||
title: "No preview yet",
|
||||
description: "Select a Hugging Face dataset and click Load to see a sample.",
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -177,42 +175,6 @@ async function fileToBase64Payload(file: File): Promise<string> {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Reads the textual content of an uploaded unstructured document.
 *
 * The file type is decided purely from the lower-cased file name suffix:
 * - `.txt`  -> the file's own text
 * - `.pdf`  -> text extracted through `getDocumentProxy` / `extractText`
 *              with `mergePages: true`
 * - `.docx` -> raw text extracted through `mammoth.extractRawText`
 *
 * @param file - The document selected by the user.
 * @returns The extracted text, exactly as produced by the chosen extractor.
 * @throws Error when the file name has none of the supported suffixes.
 */
async function extractUnstructuredText(file: File): Promise<string> {
  const normalizedName = file.name.toLowerCase();
  const hasSuffix = (suffix: string): boolean => normalizedName.endsWith(suffix);

  if (hasSuffix(".txt")) {
    return file.text();
  }

  if (hasSuffix(".pdf")) {
    const bytes = new Uint8Array(await file.arrayBuffer());
    const documentProxy = await getDocumentProxy(bytes);
    const extracted = await extractText(documentProxy, { mergePages: true });
    return extracted.text;
  }

  if (hasSuffix(".docx")) {
    const arrayBuffer = await file.arrayBuffer();
    const rawText = await mammoth.extractRawText({ arrayBuffer });
    return rawText.value;
  }

  throw new Error("Unsupported unstructured file type");
}
|
||||
|
||||
/**
 * Produces the file that is actually uploaded for an unstructured seed.
 *
 * Files whose lower-cased name ends in `.txt` or `.md` are returned as-is.
 * Anything else is run through `extractUnstructuredText`, trimmed, has its
 * line endings normalised to `\n`, and is wrapped in a new `text/plain` file
 * named after the original with a trailing `.pdf` / `.docx` swapped for `.txt`.
 *
 * @param file - The document selected by the user.
 * @returns Either the original file or a freshly built plain-text file.
 * @throws Error("No text found in file.") when extraction yields only
 *   whitespace; errors thrown by `extractUnstructuredText` propagate unchanged.
 */
async function toUnstructuredUploadFile(file: File): Promise<File> {
  const normalizedName = file.name.toLowerCase();
  const isPlainText = [".txt", ".md"].some((suffix) =>
    normalizedName.endsWith(suffix),
  );
  if (isPlainText) {
    return file;
  }

  const extracted = await extractUnstructuredText(file);
  const trimmed = extracted.trim();
  if (trimmed.length === 0) {
    throw new Error("No text found in file.");
  }

  // Collapse Windows (\r\n) and bare \r line endings into \n.
  const unixText = trimmed.replace(/\r\n/g, "\n").replace(/\r/g, "\n");

  // A name that is nothing but the extension would leave an empty stem,
  // so fall back to a fixed placeholder in that case.
  const strippedName = file.name.replace(/\.(pdf|docx)$/i, "");
  const baseName = strippedName || "unstructured_seed";

  return new File([unixText], `${baseName}.txt`, {
    type: "text/plain",
  });
}
|
||||
|
||||
export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactElement {
|
||||
const [inspectError, setInspectError] = useState<string | null>(null);
|
||||
const [isInspecting, setIsInspecting] = useState(false);
|
||||
|
|
@ -220,16 +182,84 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
const [previewRows, setPreviewRows] = useState<Record<string, unknown>[]>([]);
|
||||
const [expandedPreviewRows, setExpandedPreviewRows] = useState<Record<number, boolean>>({});
|
||||
const [localFile, setLocalFile] = useState<File | null>(null);
|
||||
const [unstructuredFile, setUnstructuredFile] = useState<File | null>(null);
|
||||
const [unstructuredFiles, setUnstructuredFiles] = useState<FileEntry[]>(() => {
|
||||
if (config.unstructured_file_ids?.length) {
|
||||
return config.unstructured_file_ids.map((id, i) => ({
|
||||
id,
|
||||
name: config.unstructured_file_names?.[i] ?? "Unknown",
|
||||
size: config.unstructured_file_sizes?.[i] ?? 0,
|
||||
status: "ok" as const,
|
||||
}));
|
||||
}
|
||||
return [];
|
||||
});
|
||||
|
||||
const mode = config.seed_source_type ?? "hf";
|
||||
const previewEmpty = getPreviewEmptyStateCopy(mode);
|
||||
|
||||
const prevModeRef = useRef(mode);
|
||||
useEffect(() => {
|
||||
const prevMode = prevModeRef.current;
|
||||
prevModeRef.current = mode;
|
||||
setInspectError(null);
|
||||
setLocalFile(null);
|
||||
setUnstructuredFile(null);
|
||||
}, [mode]);
|
||||
if (prevMode === "unstructured" && mode !== "unstructured") {
|
||||
setUnstructuredFiles([]);
|
||||
}
|
||||
if (prevMode !== "unstructured" && mode === "unstructured") {
|
||||
if (config.unstructured_file_ids?.length) {
|
||||
setUnstructuredFiles(
|
||||
config.unstructured_file_ids.map((id, i) => ({
|
||||
id,
|
||||
name: config.unstructured_file_names?.[i] ?? "Unknown",
|
||||
size: config.unstructured_file_sizes?.[i] ?? 0,
|
||||
status: "ok" as const,
|
||||
})),
|
||||
);
|
||||
} else {
|
||||
setUnstructuredFiles([]);
|
||||
}
|
||||
}
|
||||
}, [mode]); // eslint-disable-line react-hooks/exhaustive-deps
|
||||
|
||||
const didSyncFilesRef = useRef(false);
|
||||
useEffect(() => {
|
||||
if (!open) {
|
||||
didSyncFilesRef.current = false;
|
||||
return;
|
||||
}
|
||||
if (didSyncFilesRef.current) return;
|
||||
if (mode !== "unstructured") return;
|
||||
if (unstructuredFiles.length > 0) return;
|
||||
if (!config.unstructured_file_ids?.length) return;
|
||||
didSyncFilesRef.current = true;
|
||||
setUnstructuredFiles(
|
||||
config.unstructured_file_ids.map((id, i) => ({
|
||||
id,
|
||||
name: config.unstructured_file_names?.[i] ?? "Unknown",
|
||||
size: config.unstructured_file_sizes?.[i] ?? 0,
|
||||
status: "ok" as const,
|
||||
})),
|
||||
);
|
||||
}, [open, mode, unstructuredFiles.length, config.unstructured_file_ids, config.unstructured_file_names, config.unstructured_file_sizes]);
|
||||
|
||||
const handleUnstructuredFilesChange = useCallback(
|
||||
(updater: FileEntry[] | ((prev: FileEntry[]) => FileEntry[])) => {
|
||||
setUnstructuredFiles((prev) => {
|
||||
const next = typeof updater === "function" ? updater(prev) : updater;
|
||||
const okFiles = next.filter((f) => f.status === "ok");
|
||||
queueMicrotask(() => {
|
||||
onUpdate({
|
||||
unstructured_file_ids: okFiles.map((f) => f.id),
|
||||
unstructured_file_names: okFiles.map((f) => f.name),
|
||||
unstructured_file_sizes: okFiles.map((f) => f.size),
|
||||
});
|
||||
});
|
||||
return next;
|
||||
});
|
||||
},
|
||||
[onUpdate],
|
||||
);
|
||||
|
||||
useEffect(() => {
|
||||
setPreviewRows(config.seed_preview_rows ?? []);
|
||||
|
|
@ -256,14 +286,16 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
if (!localFile) return null;
|
||||
return `local:${localFile.name}|${localFile.size}|${localFile.lastModified}`;
|
||||
}
|
||||
if (!unstructuredFile) return null;
|
||||
const okFiles = unstructuredFiles.filter((f) => f.status === "ok");
|
||||
if (okFiles.length === 0) return null;
|
||||
const { chunkSize, chunkOverlap } = resolveChunking(config);
|
||||
return `unstructured:${unstructuredFile.name}|${unstructuredFile.size}|${unstructuredFile.lastModified}|${chunkSize}|${chunkOverlap}`;
|
||||
const fileKey = okFiles.map((f) => `${f.id}|${f.name}`).join(",");
|
||||
return `unstructured:${fileKey}|${chunkSize}|${chunkOverlap}`;
|
||||
}, [
|
||||
config,
|
||||
localFile,
|
||||
mode,
|
||||
unstructuredFile,
|
||||
unstructuredFiles,
|
||||
]);
|
||||
|
||||
const loadSeedMetadata = useCallback(async (opts?: { silent?: boolean }): Promise<boolean> => {
|
||||
|
|
@ -295,7 +327,9 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
hf_split: response.split ?? "",
|
||||
hf_subset: response.subset ?? "",
|
||||
local_file_name: "",
|
||||
unstructured_file_name: "",
|
||||
unstructured_file_ids: [],
|
||||
unstructured_file_names: [],
|
||||
unstructured_file_sizes: [],
|
||||
});
|
||||
setPreviewRows(response.preview_rows ?? []);
|
||||
setLastLoadedKey(loadKey);
|
||||
|
|
@ -326,50 +360,56 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
hf_subset: "",
|
||||
hf_split: "",
|
||||
local_file_name: localFile.name,
|
||||
unstructured_file_name: "",
|
||||
unstructured_file_ids: [],
|
||||
unstructured_file_names: [],
|
||||
unstructured_file_sizes: [],
|
||||
});
|
||||
setPreviewRows(response.preview_rows ?? []);
|
||||
setLastLoadedKey(loadKey);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!unstructuredFile) {
|
||||
throw new Error("Select a PDF/DOCX/TXT file first.");
|
||||
}
|
||||
if (unstructuredFile.size > MAX_UPLOAD_BYTES) {
|
||||
throw new Error("File too large (max 50MB).");
|
||||
if (mode === "unstructured") {
|
||||
const fileIds = unstructuredFiles
|
||||
.filter((f) => f.status === "ok")
|
||||
.map((f) => f.id);
|
||||
const fileNames = unstructuredFiles
|
||||
.filter((f) => f.status === "ok")
|
||||
.map((f) => f.name);
|
||||
|
||||
if (fileIds.length === 0) {
|
||||
setInspectError("No files uploaded");
|
||||
return false;
|
||||
}
|
||||
|
||||
const { chunkSize, chunkOverlap } = resolveChunking(config);
|
||||
const response = await inspectSeedUpload({
|
||||
block_id: config.id,
|
||||
file_ids: fileIds,
|
||||
file_names: fileNames,
|
||||
preview_size: 10,
|
||||
seed_source_type: "unstructured",
|
||||
unstructured_chunk_size: chunkSize,
|
||||
unstructured_chunk_overlap: chunkOverlap,
|
||||
});
|
||||
|
||||
onUpdate({
|
||||
hf_path: response.resolved_path,
|
||||
resolved_paths: response.resolved_paths ?? [],
|
||||
seed_columns: response.columns,
|
||||
seed_preview_rows: response.preview_rows ?? [],
|
||||
unstructured_file_ids: fileIds,
|
||||
unstructured_file_names: fileNames,
|
||||
unstructured_file_sizes: unstructuredFiles
|
||||
.filter((f) => f.status === "ok")
|
||||
.map((f) => f.size),
|
||||
});
|
||||
setPreviewRows(response.preview_rows ?? []);
|
||||
setLastLoadedKey(loadKey);
|
||||
return true;
|
||||
}
|
||||
|
||||
const { chunkSize, chunkOverlap } = resolveChunking(config);
|
||||
const uploadFile = await toUnstructuredUploadFile(unstructuredFile);
|
||||
if (uploadFile.size > MAX_UPLOAD_BYTES) {
|
||||
throw new Error("Processed text is too large (max 50MB).");
|
||||
}
|
||||
const payload = await fileToBase64Payload(uploadFile);
|
||||
const response = await inspectSeedUpload({
|
||||
filename: uploadFile.name,
|
||||
content_base64: payload,
|
||||
preview_size: 10,
|
||||
seed_source_type: "unstructured",
|
||||
unstructured_chunk_size: chunkSize,
|
||||
unstructured_chunk_overlap: chunkOverlap,
|
||||
});
|
||||
onUpdate({
|
||||
hf_path: response.resolved_path,
|
||||
seed_columns: response.columns,
|
||||
seed_drop_columns: (config.seed_drop_columns ?? []).filter((name) =>
|
||||
response.columns.includes(name),
|
||||
),
|
||||
seed_preview_rows: response.preview_rows ?? [],
|
||||
hf_repo_id: "",
|
||||
hf_subset: "",
|
||||
hf_split: "",
|
||||
local_file_name: "",
|
||||
unstructured_file_name: unstructuredFile.name,
|
||||
});
|
||||
setPreviewRows(response.preview_rows ?? []);
|
||||
setLastLoadedKey(loadKey);
|
||||
return true;
|
||||
return false;
|
||||
} catch (error) {
|
||||
if (!opts?.silent) {
|
||||
setInspectError(getErrorMessage(error, "Failed to load seed metadata."));
|
||||
|
|
@ -385,7 +425,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
localFile,
|
||||
mode,
|
||||
onUpdate,
|
||||
unstructuredFile,
|
||||
unstructuredFiles,
|
||||
]);
|
||||
|
||||
useEffect(() => {
|
||||
|
|
@ -401,6 +441,21 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
void loadSeedMetadata({ silent: true });
|
||||
}, [getCurrentLoadKey, isInspecting, lastLoadedKey, loadSeedMetadata, open]);
|
||||
|
||||
const wasUploadingRef = useRef(false);
|
||||
useEffect(() => {
|
||||
if (mode !== "unstructured") return;
|
||||
const isUploading = unstructuredFiles.some((f) => f.status === "uploading");
|
||||
if (isUploading) {
|
||||
wasUploadingRef.current = true;
|
||||
} else if (wasUploadingRef.current) {
|
||||
wasUploadingRef.current = false;
|
||||
const hasOk = unstructuredFiles.some((f) => f.status === "ok");
|
||||
if (hasOk) {
|
||||
void loadSeedMetadata({ silent: true });
|
||||
}
|
||||
}
|
||||
}, [mode, unstructuredFiles, loadSeedMetadata]);
|
||||
|
||||
const previewColumns = useMemo(() => {
|
||||
const loadedColumns = config.seed_columns ?? [];
|
||||
if (loadedColumns.length > 0) return loadedColumns;
|
||||
|
|
@ -434,10 +489,10 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
</TabsList>
|
||||
|
||||
<TabsContent value="config" className="min-w-0 pt-3">
|
||||
<div className="space-y-4">
|
||||
<div className="space-y-3">
|
||||
{mode === "hf" && (
|
||||
<>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Dataset"
|
||||
htmlFor={datasetId}
|
||||
|
|
@ -474,7 +529,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
</div>
|
||||
</div>
|
||||
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="HF token (optional)"
|
||||
htmlFor={tokenId}
|
||||
|
|
@ -493,7 +548,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
)}
|
||||
|
||||
{mode === "local" && (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Structured file"
|
||||
hint="Upload CSV, JSON, or JSONL seed file."
|
||||
|
|
@ -526,7 +581,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
</Button>
|
||||
</div>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Upload-only. Max 50MB.
|
||||
Max 50MB per file.
|
||||
</p>
|
||||
{(localFile?.name || config.local_file_name?.trim()) && (
|
||||
<p className="text-xs text-muted-foreground">
|
||||
|
|
@ -537,49 +592,12 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
)}
|
||||
|
||||
{mode === "unstructured" && (
|
||||
<div className="grid gap-2">
|
||||
<FieldLabel
|
||||
label="Unstructured file"
|
||||
hint="Upload PDF, DOCX, or TXT. We chunk text into seed rows."
|
||||
/>
|
||||
<div className="flex items-center gap-2">
|
||||
<Input
|
||||
className="nodrag flex-1"
|
||||
type="file"
|
||||
accept={UNSTRUCTURED_ACCEPT}
|
||||
onChange={(event) => {
|
||||
const file = event.target.files?.[0] ?? null;
|
||||
setUnstructuredFile(file);
|
||||
onUpdate({
|
||||
hf_path: "",
|
||||
seed_columns: [],
|
||||
seed_drop_columns: [],
|
||||
seed_preview_rows: [],
|
||||
unstructured_file_name: file?.name ?? "",
|
||||
});
|
||||
}}
|
||||
/>
|
||||
<Button
|
||||
type="button"
|
||||
variant="outline"
|
||||
className="nodrag shrink-0"
|
||||
onClick={() => void loadSeedMetadata()}
|
||||
disabled={isInspecting || !unstructuredFile}
|
||||
>
|
||||
{isInspecting ? "Loading..." : "Load"}
|
||||
</Button>
|
||||
</div>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
File is converted to text, then chunked server-side into chunk_text rows. Max 50MB.
|
||||
</p>
|
||||
{(unstructuredFile?.name ||
|
||||
config.unstructured_file_name?.trim()) && (
|
||||
<p className="text-xs text-muted-foreground">
|
||||
Selected:{" "}
|
||||
{unstructuredFile?.name ?? config.unstructured_file_name?.trim()}
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
<UnstructuredDropZone
|
||||
blockId={config.id}
|
||||
files={unstructuredFiles}
|
||||
onFilesChange={handleUnstructuredFilesChange}
|
||||
disabled={isInspecting}
|
||||
/>
|
||||
)}
|
||||
|
||||
{inspectError && <p className="text-xs text-red-600">{inspectError}</p>}
|
||||
|
|
@ -633,7 +651,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
/>
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-2 space-y-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Sampling strategy"
|
||||
htmlFor={samplingId}
|
||||
|
|
@ -658,7 +676,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
</Select>
|
||||
</div>
|
||||
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Selection strategy"
|
||||
htmlFor={selectionId}
|
||||
|
|
@ -685,7 +703,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
|
||||
{mode === "unstructured" && (
|
||||
<div className="grid grid-cols-2 gap-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Chunk size"
|
||||
htmlFor={chunkSizeId}
|
||||
|
|
@ -701,7 +719,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Chunk overlap"
|
||||
htmlFor={chunkOverlapId}
|
||||
|
|
@ -725,7 +743,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
|
||||
{config.selection_type === "index_range" && (
|
||||
<div className="grid grid-cols-2 gap-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="Start" hint="Inclusive start row index for index_range." />
|
||||
<Input
|
||||
className="nodrag"
|
||||
|
|
@ -734,7 +752,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
onChange={(event) => onUpdate({ selection_start: event.target.value })}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="End" hint="Inclusive end row index for index_range." />
|
||||
<Input
|
||||
className="nodrag"
|
||||
|
|
@ -748,7 +766,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
|
||||
{config.selection_type === "partition_block" && (
|
||||
<div className="grid grid-cols-2 gap-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="Index" hint="Partition index to load." />
|
||||
<Input
|
||||
className="nodrag"
|
||||
|
|
@ -757,7 +775,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
|
|||
onChange={(event) => onUpdate({ selection_index: event.target.value })}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="Partitions" hint="Total number of partitions." />
|
||||
<Input
|
||||
className="nodrag"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,240 @@
|
|||
import { useCallback, useRef, useState } from "react";
|
||||
import { CloudUploadIcon, Cancel01Icon, Loading03Icon, CheckmarkCircle02Icon, Alert02Icon } from "@hugeicons/core-free-icons";
|
||||
import { HugeiconsIcon } from "@hugeicons/react";
|
||||
import { uploadUnstructuredFile, removeUnstructuredFile } from "../../api";
|
||||
|
||||
const ACCEPTED_EXTENSIONS = [".txt", ".pdf", ".docx", ".md"];
|
||||
const MAX_FILE_SIZE = 50 * 1024 * 1024;
|
||||
const MAX_TOTAL_SIZE = 100 * 1024 * 1024;
|
||||
|
||||
/**
 * One row in the drop zone's file list, tracking a single upload from the
 * moment it is queued until it succeeds, fails, or is removed.
 */
type FileEntry = {
  /**
   * Identifier taken from the upload response (`file_id`). It is the empty
   * string while the upload is in flight and when the upload threw.
   */
  id: string;
  /** Original file name as reported by the browser `File` object. */
  name: string;
  /** File size in bytes, as reported by the browser `File` object. */
  size: number;
  /** Current upload state of this entry. */
  status: "uploading" | "ok" | "error";
  /** Error text shown next to the entry; "Cancelled" when the upload was aborted. */
  error?: string;
  /** Controller whose signal is handed to the upload call so removal can abort it. */
  abortController?: AbortController;
};
|
||||
|
||||
/** Either a replacement file list or a functional updater over the previous list. */
type FileEntriesUpdate = FileEntry[] | ((prev: FileEntry[]) => FileEntry[]);

/** Props accepted by {@link UnstructuredDropZone}. */
type UnstructuredDropZoneProps = {
  /** Block identifier forwarded to the upload and remove API calls. */
  blockId: string;
  /** Current file list; the component is controlled and never stores it itself. */
  files: FileEntry[];
  /** Called with the next file list, or with an updater derived from the previous one. */
  onFilesChange: (files: FileEntriesUpdate) => void;
  /** When true, dropping and click-to-browse are ignored and the zone is dimmed. */
  disabled?: boolean;
};
|
||||
|
||||
/**
 * Formats a byte count as a short human-readable label.
 *
 * Values below 1024 are shown as whole bytes, values below 1024 * 1024 as KB
 * with one decimal, and everything else as MB with one decimal.
 *
 * @param bytes - Size in bytes.
 * @returns The formatted label, e.g. `"512 B"`, `"1.5 KB"` or `"50.0 MB"`.
 */
function formatSize(bytes: number): string {
  const bytesPerKb = 1024;
  const bytesPerMb = bytesPerKb * bytesPerKb;

  if (bytes < bytesPerKb) return `${bytes} B`;

  // Pick the unit from the *rounded* KB value: a size just short of one MB
  // would otherwise round up and be displayed as "1024.0 KB".
  const kbLabel = (bytes / bytesPerKb).toFixed(1);
  if (bytes < bytesPerMb && Number(kbLabel) < bytesPerKb) return `${kbLabel} KB`;

  return `${(bytes / bytesPerMb).toFixed(1)} MB`;
}
|
||||
|
||||
/**
 * Reports whether a file name ends in one of the accepted extensions,
 * compared case-insensitively.
 *
 * @param name - File name to inspect.
 * @returns `true` when the text from the last "." onward is an accepted extension.
 */
function isValidExtension(name: string): boolean {
  const lastDot = name.lastIndexOf(".");
  // With no "." in the name, lastDot is -1 and slice(-1) yields only the final
  // character, which cannot equal any of the dotted accepted extensions.
  const suffix = name.slice(lastDot);
  const normalized = suffix.toLowerCase();
  return ACCEPTED_EXTENSIONS.some((allowed) => allowed === normalized);
}
|
||||
|
||||
/**
 * Controlled drag-and-drop / click-to-browse uploader for unstructured seed
 * files.
 *
 * The component keeps no file list of its own: it renders `files` and reports
 * every change through `onFilesChange`. Accepted files are uploaded one after
 * another via `uploadUnstructuredFile`, and each entry's status is updated as
 * its upload settles.
 *
 * @param props.blockId - Block identifier passed to the upload/remove API calls.
 * @param props.files - Current file entries to render.
 * @param props.onFilesChange - Receives the next list or an updater function.
 * @param props.disabled - When true, dropping and browsing are ignored.
 */
export function UnstructuredDropZone({
  blockId,
  files,
  onFilesChange,
  disabled,
}: UnstructuredDropZoneProps) {
  const inputRef = useRef<HTMLInputElement>(null);
  // Mirror of the latest `files` prop so the async upload loop and the stable
  // callbacks below read current data instead of a stale closure.
  const filesRef = useRef(files);
  filesRef.current = files;
  const [isDragOver, setIsDragOver] = useState(false);

  const totalSize = files.reduce((sum, f) => sum + f.size, 0);

  const handleFiles = useCallback(
    async (newFiles: File[]) => {
      // Files with an unsupported extension or above the per-file limit are
      // dropped from the batch without any feedback.
      const valid = newFiles.filter((f) => {
        if (!isValidExtension(f.name)) return false;
        if (f.size > MAX_FILE_SIZE) return false;
        return true;
      });

      if (valid.length === 0) return;

      // The whole batch is rejected when it would push the list past the
      // combined size limit.
      const addedSize = valid.reduce((s, f) => s + f.size, 0);
      const currentTotal = filesRef.current.reduce((sum, f) => sum + f.size, 0);
      if (currentTotal + addedSize > MAX_TOTAL_SIZE) return;

      const entries: FileEntry[] = valid.map((f) => ({
        id: "",
        name: f.name,
        size: f.size,
        status: "uploading" as const,
        abortController: new AbortController(),
      }));

      onFilesChange((prev) => [...prev, ...entries]);

      // Uploads run sequentially; each iteration awaits the previous one.
      for (let i = 0; i < valid.length; i++) {
        const file = valid[i];
        const entry = entries[i];
        let updatedId = "";
        let updatedStatus: FileEntry["status"] = "error";
        let updatedError: string | undefined;
        try {
          // Ids of entries that already have one, re-read on every iteration.
          // NOTE(review): how the API uses this list is not visible here.
          const existingIds = filesRef.current.filter((f) => f.id).map((f) => f.id);
          const result = await uploadUnstructuredFile(
            file,
            blockId,
            entry.abortController?.signal,
            existingIds,
          );
          updatedId = result.file_id;
          updatedStatus = result.status === "ok" ? "ok" : "error";
          updatedError = result.error;
        } catch (e) {
          if (e instanceof DOMException && e.name === "AbortError") {
            updatedError = "Cancelled";
          } else {
            updatedError = e instanceof Error ? e.message : "Upload failed";
          }
        }
        // Entries are matched by object identity, so an entry that was removed
        // while uploading is simply not found and nothing is re-added.
        onFilesChange((prev) =>
          prev.map((f) =>
            f === entry
              ? { ...f, id: updatedId, status: updatedStatus, error: updatedError }
              : f,
          ),
        );
      }
    },
    [blockId, onFilesChange],
  );

  // Ids for which a delete request has already been issued, so a repeated
  // remove for the same id does not call the API twice.
  const deletedIdsRef = useRef(new Set<string>());
  const handleRemove = useCallback(
    (index: number) => {
      const entry = filesRef.current[index];
      if (!entry) return;
      if (entry.status === "uploading" && entry.abortController) {
        entry.abortController.abort();
      }
      if (entry.id && entry.status === "ok" && !deletedIdsRef.current.has(entry.id)) {
        deletedIdsRef.current.add(entry.id);
        // Best-effort delete: a failure is deliberately ignored and the entry
        // is removed from the list regardless.
        void removeUnstructuredFile(blockId, entry.id).catch(() => {});
      }
      onFilesChange((prev) => prev.filter((_, i) => i !== index));
    },
    [blockId, onFilesChange],
  );

  const handleDrop = useCallback(
    (e: React.DragEvent) => {
      e.preventDefault();
      setIsDragOver(false);
      if (disabled) return;
      const dropped = Array.from(e.dataTransfer.files);
      // handleFiles reports per-file failures through state; the returned
      // promise is intentionally not awaited.
      void handleFiles(dropped);
    },
    [disabled, handleFiles],
  );

  const handleDragOver = useCallback(
    (e: React.DragEvent) => {
      e.preventDefault();
      if (!disabled) setIsDragOver(true);
    },
    [disabled],
  );

  const handleDragLeave = useCallback(() => setIsDragOver(false), []);

  const handleClick = useCallback(() => {
    if (!disabled) inputRef.current?.click();
  }, [disabled]);

  // Keyboard equivalent of clicking the zone, so it is operable without a mouse.
  const handleKeyDown = useCallback(
    (e: React.KeyboardEvent) => {
      if (e.key === "Enter" || e.key === " ") {
        e.preventDefault();
        handleClick();
      }
    },
    [handleClick],
  );

  const handleInputChange = useCallback(
    (e: React.ChangeEvent<HTMLInputElement>) => {
      const selected = Array.from(e.target.files || []);
      void handleFiles(selected);
      // Reset the input so picking the same file again fires another change event.
      e.target.value = "";
    },
    [handleFiles],
  );

  const successFiles = files.filter((f) => f.status === "ok");

  return (
    <div className="space-y-2">
      <div
        role="button"
        tabIndex={disabled ? -1 : 0}
        aria-disabled={disabled}
        className={`nodrag flex cursor-pointer flex-col items-center justify-center rounded-md border-2 border-dashed px-4 py-6 text-center transition-colors ${
          isDragOver
            ? "border-primary bg-primary/5"
            : "border-muted-foreground/25 hover:border-muted-foreground/50"
        } ${disabled ? "pointer-events-none opacity-50" : ""}`}
        onDrop={handleDrop}
        onDragOver={handleDragOver}
        onDragLeave={handleDragLeave}
        onClick={handleClick}
        onKeyDown={handleKeyDown}
      >
        <HugeiconsIcon icon={CloudUploadIcon} className="text-muted-foreground mb-2 size-8" />
        <p className="text-muted-foreground text-sm">
          Drop files here or click to browse
        </p>
        <p className="text-muted-foreground/60 mt-1 text-xs">
          PDF, DOCX, TXT, MD - up to 50MB each, 100MB total
        </p>
      </div>

      <input
        ref={inputRef}
        type="file"
        accept={ACCEPTED_EXTENSIONS.join(",")}
        multiple
        className="hidden"
        onChange={handleInputChange}
      />

      {files.length > 0 && (
        <div className="space-y-1">
          {files.map((entry, i) => (
            <div
              key={`${entry.name}-${i}`}
              className="flex items-center gap-2 rounded-md border px-3 py-1.5 text-sm"
            >
              {entry.status === "uploading" && (
                <HugeiconsIcon icon={Loading03Icon} className="text-muted-foreground size-4 animate-spin" />
              )}
              {entry.status === "ok" && (
                <HugeiconsIcon icon={CheckmarkCircle02Icon} className="size-4 text-green-500" />
              )}
              {entry.status === "error" && (
                <HugeiconsIcon icon={Alert02Icon} className="size-4 text-red-500" />
              )}
              <span className="flex-1 truncate">{entry.name}</span>
              <span className="text-muted-foreground text-xs">
                {formatSize(entry.size)}
              </span>
              {entry.error && (
                <span className="text-xs text-red-500">{entry.error}</span>
              )}
              <button
                type="button"
                aria-label={`Remove ${entry.name}`}
                className="ml-auto inline-flex size-7 shrink-0 items-center justify-center rounded-md text-muted-foreground transition hover:bg-destructive/10 hover:text-destructive"
                onClick={(e) => {
                  // Keep the click from reaching any clickable ancestor.
                  e.stopPropagation();
                  handleRemove(i);
                }}
              >
                <HugeiconsIcon icon={Cancel01Icon} className="size-3.5" />
              </button>
            </div>
          ))}
          <div className="text-muted-foreground flex justify-between px-1 text-xs">
            <span>{successFiles.length} file{successFiles.length !== 1 ? "s" : ""} uploaded</span>
            <span>{formatSize(totalSize)} / 100MB</span>
          </div>
        </div>
      )}
    </div>
  );
}
|
||||
|
||||
export type { FileEntry };
|
||||
|
|
@ -18,7 +18,7 @@ export function FieldLabel({
|
|||
hint,
|
||||
}: FieldLabelProps): ReactElement {
|
||||
return (
|
||||
<div className="flex min-w-0 items-start gap-1.5 text-xs font-semibold uppercase text-muted-foreground">
|
||||
<div className="flex min-w-0 items-center gap-1 text-xs font-semibold uppercase text-muted-foreground">
|
||||
{htmlFor ? (
|
||||
<label className="min-w-0 cursor-pointer" htmlFor={htmlFor}>
|
||||
<span className="break-words">{label}</span>
|
||||
|
|
@ -31,7 +31,7 @@ export function FieldLabel({
|
|||
<TooltipTrigger asChild={true}>
|
||||
<button
|
||||
type="button"
|
||||
className="inline-flex size-6 shrink-0 items-center justify-center rounded-full text-muted-foreground/80 transition hover:text-foreground"
|
||||
className="inline-flex size-4 shrink-0 items-center justify-center rounded-full text-muted-foreground/80 transition hover:text-foreground"
|
||||
aria-label={`More info: ${label}`}
|
||||
title={`More info about ${label}`}
|
||||
>
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ export function NameField({
|
|||
const fallbackId = useId();
|
||||
const inputId = id ?? fallbackId;
|
||||
return (
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label={label ?? "Field name"}
|
||||
htmlFor={inputId}
|
||||
|
|
|
|||
|
|
@ -165,7 +165,7 @@ function McpServerCard({
|
|||
</div>
|
||||
)}
|
||||
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="Server name" hint="Name shown in this tool access setup." />
|
||||
<Input
|
||||
className="nodrag"
|
||||
|
|
@ -194,7 +194,7 @@ function McpServerCard({
|
|||
|
||||
{provider.provider_type === "stdio" ? (
|
||||
<div className="space-y-4">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="Command" hint="Command used to start the tool server." />
|
||||
<Input
|
||||
className="nodrag"
|
||||
|
|
@ -293,7 +293,7 @@ function McpServerCard({
|
|||
</div>
|
||||
) : (
|
||||
<div className="space-y-4">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel label="Endpoint" hint="URL for the tool server." />
|
||||
<Input
|
||||
className="nodrag"
|
||||
|
|
@ -305,7 +305,7 @@ function McpServerCard({
|
|||
/>
|
||||
</div>
|
||||
<div className="grid gap-2 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="API key environment variable"
|
||||
hint="Optional environment variable that stores the API key."
|
||||
|
|
@ -322,7 +322,7 @@ function McpServerCard({
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="API key"
|
||||
hint="Optional API key."
|
||||
|
|
@ -705,7 +705,7 @@ export function ToolProfileDialog({
|
|||
)}
|
||||
</div>
|
||||
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Tools this setup may use"
|
||||
hint="Leave this empty to allow every tool from these servers."
|
||||
|
|
@ -740,7 +740,7 @@ export function ToolProfileDialog({
|
|||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-3">
|
||||
<div className="grid gap-3 sm:grid-cols-2">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Max tool-use turns"
|
||||
hint="How many back-and-forth tool calls an AI step can make."
|
||||
|
|
@ -756,7 +756,7 @@ export function ToolProfileDialog({
|
|||
}
|
||||
/>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Timeout (seconds)"
|
||||
hint="How long to wait when loading or calling tools."
|
||||
|
|
|
|||
|
|
@ -107,7 +107,7 @@ export function ValidatorDialog({
|
|||
value={config.name}
|
||||
onChange={(value) => onUpdate({ name: value })}
|
||||
/>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Code to check"
|
||||
htmlFor={targetColumnId}
|
||||
|
|
@ -158,7 +158,7 @@ export function ValidatorDialog({
|
|||
</div>
|
||||
{config.validator_type === "oxc" && (
|
||||
<div className="grid gap-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Check mode"
|
||||
htmlFor={oxcModeId}
|
||||
|
|
@ -197,7 +197,7 @@ export function ValidatorDialog({
|
|||
</Combobox>
|
||||
</div>
|
||||
</div>
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Code shape"
|
||||
htmlFor={oxcCodeShapeId}
|
||||
|
|
@ -249,7 +249,7 @@ export function ValidatorDialog({
|
|||
/>
|
||||
</CollapsibleTrigger>
|
||||
<CollapsibleContent className="mt-3">
|
||||
<div className="grid gap-2">
|
||||
<div className="grid gap-1.5">
|
||||
<FieldLabel
|
||||
label="Batch size"
|
||||
htmlFor={batchSizeId}
|
||||
|
|
|
|||
|
|
@ -144,7 +144,9 @@ function sanitizeSeedForShare(payload: unknown): unknown {
|
|||
ui.seed_drop_columns = [];
|
||||
ui.seed_preview_rows = [];
|
||||
ui.local_file_name = "";
|
||||
ui.unstructured_file_name = "";
|
||||
ui.unstructured_file_ids = [];
|
||||
ui.unstructured_file_names = [];
|
||||
ui.unstructured_file_sizes = [];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -152,12 +154,20 @@ function sanitizeSeedForShare(payload: unknown): unknown {
|
|||
if (source && "path" in source) {
|
||||
source.path = "";
|
||||
}
|
||||
if (source && "paths" in source) {
|
||||
source.paths = [];
|
||||
}
|
||||
if (seedConfig) {
|
||||
seedConfig.resolved_paths = [];
|
||||
}
|
||||
if (ui) {
|
||||
ui.seed_columns = [];
|
||||
ui.seed_drop_columns = [];
|
||||
ui.seed_preview_rows = [];
|
||||
ui.local_file_name = "";
|
||||
ui.unstructured_file_name = "";
|
||||
ui.unstructured_file_ids = [];
|
||||
ui.unstructured_file_names = [];
|
||||
ui.unstructured_file_sizes = [];
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -58,7 +58,12 @@ import { useRecipeStudioActions } from "./hooks/use-recipe-studio-actions";
|
|||
import { useRecipeStudioStore } from "./stores/recipe-studio";
|
||||
import type { RecipeNodeData } from "./types";
|
||||
import { getGraphWarnings } from "./utils/graph-warnings";
|
||||
import { getFitNodeIdsIgnoringNotes } from "./utils/graph/fit-view";
|
||||
import {
|
||||
FIT_VIEW_DURATION_MS,
|
||||
FIT_VIEW_MAX_ZOOM,
|
||||
FIT_VIEW_PADDING,
|
||||
getFitViewTargetNodes,
|
||||
} from "./utils/graph/fit-view";
|
||||
import { buildRecipePayload } from "./utils/payload";
|
||||
import type { RecipePayload } from "./utils/payload/types";
|
||||
import { buildDefaultSchemaTransform } from "./utils/processors";
|
||||
|
|
@ -71,7 +76,19 @@ const EDGE_TYPES: EdgeTypes = {
|
|||
};
|
||||
const COMPLETE_ISLAND_VISIBLE_MS = 7_000;
|
||||
const TAB_SWITCH_FIT_DELAY_MS = 110;
|
||||
const FIT_ANIMATION_MS = 340;
|
||||
/**
|
||||
* Maximum RAF iterations to wait for React Flow's ResizeObserver to populate
|
||||
* `node.measured` dimensions before calling fitView. ~20 frames ≈ 333 ms at
|
||||
* 60 fps — more than enough for the render → layout → ResizeObserver cycle.
|
||||
*/
|
||||
const MAX_FIT_VIEW_RETRIES = 20;
|
||||
/**
|
||||
* After all target nodes appear measured, wait this many extra stable frames
|
||||
* before firing fitView. This absorbs `updateNodeInternals` calls from
|
||||
* InternalsSync and individual node mount effects that can transiently reset
|
||||
* measurements.
|
||||
*/
|
||||
const FIT_VIEW_STABLE_FRAMES = 3;
|
||||
|
||||
export type PersistRecipeInput = {
|
||||
id: string | null;
|
||||
|
|
@ -421,40 +438,69 @@ export function RecipeStudioPage({
|
|||
const scheduleFitView = useCallback(
|
||||
({ delayMs = 0 }: { delayMs?: number } = {}) => {
|
||||
if (!reactFlowInstance) {
|
||||
return () => {};
|
||||
// eslint-disable-next-line @typescript-eslint/no-empty-function
|
||||
return () => {
|
||||
/* no-op: instance not available */
|
||||
};
|
||||
}
|
||||
|
||||
let timeoutId = 0;
|
||||
let frameId = 0;
|
||||
let retryFrameId = 0;
|
||||
let cancelled = false;
|
||||
|
||||
const fitWithCurrentNodes = () => {
|
||||
const targetNodes = getFitNodeIdsIgnoringNotes(
|
||||
reactFlowInstance.getNodes(),
|
||||
/** Check whether every primary workflow node has been measured. */
|
||||
const allTargetsMeasured = (targets: Node[]): boolean =>
|
||||
targets.length > 0 &&
|
||||
targets.every(
|
||||
(n) => n.measured?.width != null && n.measured?.height != null,
|
||||
);
|
||||
if (targetNodes.length === 0) {
|
||||
return false;
|
||||
|
||||
/** Execute fitView on the current primary workflow nodes. */
|
||||
const doFit = () => {
|
||||
const targets = getFitViewTargetNodes(reactFlowInstance.getNodes());
|
||||
if (targets.length === 0) {
|
||||
return;
|
||||
}
|
||||
viewportMovedSinceAutoFitRef.current = false;
|
||||
reactFlowInstance.fitView({
|
||||
duration: FIT_ANIMATION_MS,
|
||||
nodes: targetNodes,
|
||||
duration: FIT_VIEW_DURATION_MS,
|
||||
maxZoom: FIT_VIEW_MAX_ZOOM,
|
||||
padding: FIT_VIEW_PADDING,
|
||||
nodes: targets.map((n) => ({ id: n.id })),
|
||||
});
|
||||
return true;
|
||||
};
|
||||
|
||||
const runFit = () => {
|
||||
if (fitWithCurrentNodes()) {
|
||||
let retries = 0;
|
||||
let stableCount = 0;
|
||||
const poll = () => {
|
||||
if (cancelled) {
|
||||
return;
|
||||
}
|
||||
|
||||
retryFrameId = window.requestAnimationFrame(() => {
|
||||
fitWithCurrentNodes();
|
||||
});
|
||||
if (retries >= MAX_FIT_VIEW_RETRIES) {
|
||||
// Timed out waiting — fit with whatever we have (graceful fallback).
|
||||
doFit();
|
||||
return;
|
||||
}
|
||||
const targets = getFitViewTargetNodes(reactFlowInstance.getNodes());
|
||||
if (allTargetsMeasured(targets)) {
|
||||
stableCount++;
|
||||
// Wait a few extra frames after measurements appear to let
|
||||
// updateNodeInternals (InternalsSync, node mount effects) settle.
|
||||
if (stableCount >= FIT_VIEW_STABLE_FRAMES) {
|
||||
doFit();
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
// Measurements were reset (e.g. by updateNodeInternals) — restart
|
||||
// the stability counter.
|
||||
stableCount = 0;
|
||||
}
|
||||
retries++;
|
||||
frameId = window.requestAnimationFrame(poll);
|
||||
};
|
||||
|
||||
const start = () => {
|
||||
frameId = window.requestAnimationFrame(runFit);
|
||||
frameId = window.requestAnimationFrame(poll);
|
||||
};
|
||||
|
||||
if (delayMs > 0) {
|
||||
|
|
@ -464,15 +510,13 @@ export function RecipeStudioPage({
|
|||
}
|
||||
|
||||
return () => {
|
||||
cancelled = true;
|
||||
if (timeoutId) {
|
||||
window.clearTimeout(timeoutId);
|
||||
}
|
||||
if (frameId) {
|
||||
window.cancelAnimationFrame(frameId);
|
||||
}
|
||||
if (retryFrameId) {
|
||||
window.cancelAnimationFrame(retryFrameId);
|
||||
}
|
||||
};
|
||||
},
|
||||
[reactFlowInstance],
|
||||
|
|
|
|||
|
|
@ -406,7 +406,10 @@ export const useRecipeStudioStore = create<RecipeStudioState>((set, get) => ({
|
|||
hf_token: "",
|
||||
hf_endpoint: "https://huggingface.co",
|
||||
local_file_name: "",
|
||||
unstructured_file_name: "",
|
||||
unstructured_file_ids: [],
|
||||
unstructured_file_names: [],
|
||||
unstructured_file_sizes: [],
|
||||
resolved_paths: [],
|
||||
seed_columns: [],
|
||||
seed_drop_columns: [],
|
||||
seed_preview_rows: [],
|
||||
|
|
|
|||
|
|
@ -333,7 +333,10 @@ export type SeedConfig = {
|
|||
hf_token?: string;
|
||||
hf_endpoint?: string;
|
||||
local_file_name?: string;
|
||||
unstructured_file_name?: string;
|
||||
unstructured_file_ids?: string[];
|
||||
unstructured_file_names?: string[];
|
||||
unstructured_file_sizes?: number[];
|
||||
resolved_paths?: string[];
|
||||
// ui-only
|
||||
seed_preview_rows?: Record<string, unknown>[];
|
||||
// ui-only (string for input ergonomics)
|
||||
|
|
|
|||
|
|
@ -366,7 +366,9 @@ export function makeSeedConfig(
|
|||
hf_token: "",
|
||||
hf_endpoint: "https://huggingface.co",
|
||||
local_file_name: "",
|
||||
unstructured_file_name: "",
|
||||
unstructured_file_ids: [],
|
||||
unstructured_file_names: [],
|
||||
unstructured_file_sizes: [],
|
||||
seed_preview_rows: [],
|
||||
unstructured_chunk_size: "1200",
|
||||
unstructured_chunk_overlap: "200",
|
||||
|
|
|
|||
|
|
@ -1,8 +1,14 @@
|
|||
// SPDX-License-Identifier: AGPL-3.0-only
|
||||
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
|
||||
|
||||
import type { Node } from "@xyflow/react";
|
||||
import type { FitViewOptions, Node } from "@xyflow/react";
|
||||
|
||||
/** Cap auto-fit zoom so the view doesn't punch in too tight on small graphs. */
|
||||
export const FIT_VIEW_MAX_ZOOM = 1.1;
|
||||
export const FIT_VIEW_PADDING = 0.12;
|
||||
export const FIT_VIEW_DURATION_MS = 340;
|
||||
|
||||
/** Markdown note nodes are decorative and should not affect the fitView bbox. */
|
||||
function isMarkdownNoteNode(node: Node): boolean {
|
||||
if (node.type !== "builder") {
|
||||
return false;
|
||||
|
|
@ -13,8 +19,43 @@ function isMarkdownNoteNode(node: Node): boolean {
|
|||
return (node.data as { kind?: string }).kind === "note";
|
||||
}
|
||||
|
||||
export function getFitNodeIdsIgnoringNotes(nodes: Node[]): Array<{ id: string }> {
|
||||
const nodesWithoutNotes = nodes.filter((node) => !isMarkdownNoteNode(node));
|
||||
const targetNodes = nodesWithoutNotes.length > 0 ? nodesWithoutNotes : nodes;
|
||||
return targetNodes.map((node) => ({ id: node.id }));
|
||||
/**
 * Identifies aux nodes (llm-prompt-input, llm-judge-score), the satellite
 * overlays, by their `"aux"` node type.
 */
function isAuxNode(node: Node): boolean {
  const { type } = node;
  return type === "aux";
}
|
||||
|
||||
/**
 * Selects the nodes that an automatic fitView should frame.
 *
 * Markdown notes and aux overlay nodes are left out so the viewport is sized
 * around the primary workflow blocks. If that leaves nothing, the input array
 * itself is returned so there is still something to fit.
 *
 * Full {@link Node} objects are returned (not just ids) so callers can read
 * `node.measured` directly.
 *
 * @param nodes - All nodes currently in the graph.
 * @returns The primary workflow nodes, or `nodes` when none qualify.
 */
export function getFitViewTargetNodes(nodes: Node[]): Node[] {
  const primaryNodes: Node[] = [];
  for (const candidate of nodes) {
    if (isMarkdownNoteNode(candidate) || isAuxNode(candidate)) {
      continue;
    }
    primaryNodes.push(candidate);
  }
  return primaryNodes.length === 0 ? nodes : primaryNodes;
}
|
||||
|
||||
/**
 * Assembles the shared {@link FitViewOptions} used for auto-fitting the graph.
 *
 * The defaults are the module's duration, max zoom and padding constants, with
 * the primary workflow nodes as targets. Any key present in `overrides` wins
 * over the corresponding default.
 *
 * @param nodes - All nodes currently in the graph.
 * @param overrides - Optional option values that replace the defaults.
 * @returns The merged fitView options.
 */
export function buildFitViewOptions(
  nodes: Node[],
  overrides?: Partial<FitViewOptions>,
): FitViewOptions {
  // fitView only needs ids, so strip each target down to `{ id }`.
  const targetRefs = getFitViewTargetNodes(nodes).map(({ id }) => ({ id }));
  const defaults: FitViewOptions = {
    duration: FIT_VIEW_DURATION_MS,
    maxZoom: FIT_VIEW_MAX_ZOOM,
    padding: FIT_VIEW_PADDING,
    nodes: targetRefs,
  };
  return { ...defaults, ...overrides };
}
|
||||
|
|
|
|||
|
|
@ -43,7 +43,9 @@ type UiInput = {
|
|||
seed_drop_columns?: unknown;
|
||||
seed_preview_rows?: unknown;
|
||||
local_file_name?: unknown;
|
||||
unstructured_file_name?: unknown;
|
||||
unstructured_file_ids?: unknown;
|
||||
unstructured_file_names?: unknown;
|
||||
unstructured_file_sizes?: unknown;
|
||||
unstructured_chunk_size?: unknown;
|
||||
unstructured_chunk_overlap?: unknown;
|
||||
advanced_open_by_node?: unknown;
|
||||
|
|
@ -408,8 +410,16 @@ export function importRecipePayload(input: string): ImportResult {
|
|||
.map((row) => ({ ...row }))
|
||||
: undefined;
|
||||
const uiLocalFileName = readString(ui?.local_file_name) ?? undefined;
|
||||
const uiUnstructuredFileName =
|
||||
readString(ui?.unstructured_file_name) ?? undefined;
|
||||
// Preserve file IDs/names from saved recipes (cleared at share time by sanitizeSeedForShare)
|
||||
const uiUnstructuredFileIds: string[] = Array.isArray(ui?.unstructured_file_ids)
|
||||
? (ui.unstructured_file_ids as string[]).filter((v): v is string => typeof v === "string")
|
||||
: [];
|
||||
const uiUnstructuredFileNames: string[] = Array.isArray(ui?.unstructured_file_names)
|
||||
? (ui.unstructured_file_names as string[]).filter((v): v is string => typeof v === "string")
|
||||
: [];
|
||||
const uiUnstructuredFileSizes: number[] = Array.isArray(ui?.unstructured_file_sizes)
|
||||
? (ui.unstructured_file_sizes as number[]).filter((v): v is number => typeof v === "number")
|
||||
: [];
|
||||
const uiUnstructuredChunkSize = readStringNumber(ui?.unstructured_chunk_size);
|
||||
const uiUnstructuredChunkOverlap = readStringNumber(
|
||||
ui?.unstructured_chunk_overlap,
|
||||
|
|
@ -449,7 +459,9 @@ export function importRecipePayload(input: string): ImportResult {
|
|||
: payloadSeedDropColumns,
|
||||
seed_preview_rows: uiSeedPreviewRows,
|
||||
local_file_name: uiLocalFileName,
|
||||
unstructured_file_name: uiUnstructuredFileName,
|
||||
unstructuredFileIds: uiUnstructuredFileIds,
|
||||
unstructuredFileNames: uiUnstructuredFileNames,
|
||||
unstructuredFileSizes: uiUnstructuredFileSizes,
|
||||
unstructured_chunk_size: uiUnstructuredChunkSize,
|
||||
unstructured_chunk_overlap: uiUnstructuredChunkOverlap,
|
||||
});
|
||||
|
|
|
|||
|
|
@ -30,7 +30,9 @@ function makeDefaultSeedConfig(id: string): SeedConfig {
|
|||
hf_token: "",
|
||||
hf_endpoint: "https://huggingface.co",
|
||||
local_file_name: "",
|
||||
unstructured_file_name: "",
|
||||
unstructured_file_ids: [],
|
||||
unstructured_file_names: [],
|
||||
unstructured_file_sizes: [],
|
||||
seed_preview_rows: [],
|
||||
unstructured_chunk_size: "1200",
|
||||
unstructured_chunk_overlap: "200",
|
||||
|
|
@ -72,7 +74,10 @@ function parseSeedSettings(seedConfigRaw: unknown): Partial<SeedConfig> {
|
|||
let hf_endpoint = "https://huggingface.co";
|
||||
let hf_repo_id = "";
|
||||
let local_file_name = "";
|
||||
let unstructured_file_name = "";
|
||||
let unstructuredFileIds: string[] = [];
|
||||
let unstructuredFileNames: string[] = [];
|
||||
let unstructuredFileSizes: number[] = [];
|
||||
let resolved_paths: string[] = [];
|
||||
let unstructured_chunk_size = "1200";
|
||||
let unstructured_chunk_overlap = "200";
|
||||
const sourceRaw = seedConfigRaw.source;
|
||||
|
|
@ -91,8 +96,15 @@ function parseSeedSettings(seedConfigRaw: unknown): Partial<SeedConfig> {
|
|||
local_file_name = sourcePath.split("/").pop() ?? sourcePath;
|
||||
} else if (seedType === "unstructured") {
|
||||
seed_source_type = "unstructured";
|
||||
hf_path = sourcePath;
|
||||
unstructured_file_name = sourcePath.split("/").pop() ?? sourcePath;
|
||||
const paths = Array.isArray(sourceRaw.paths) ? sourceRaw.paths : [];
|
||||
const stringPaths = paths.filter((p): p is string => typeof p === "string");
|
||||
if (stringPaths.length === 0 && sourcePath) {
|
||||
stringPaths.push(sourcePath);
|
||||
}
|
||||
hf_path = stringPaths[0] ?? sourcePath;
|
||||
resolved_paths = stringPaths;
|
||||
unstructuredFileIds = [];
|
||||
unstructuredFileNames = [];
|
||||
unstructured_chunk_size = readNumberString(sourceRaw.chunk_size) || "1200";
|
||||
unstructured_chunk_overlap = readNumberString(sourceRaw.chunk_overlap) || "200";
|
||||
}
|
||||
|
|
@ -129,7 +141,10 @@ function parseSeedSettings(seedConfigRaw: unknown): Partial<SeedConfig> {
|
|||
hf_token,
|
||||
hf_endpoint,
|
||||
local_file_name,
|
||||
unstructured_file_name,
|
||||
unstructured_file_ids: unstructuredFileIds,
|
||||
unstructured_file_names: unstructuredFileNames,
|
||||
unstructured_file_sizes: unstructuredFileSizes,
|
||||
resolved_paths,
|
||||
unstructured_chunk_size,
|
||||
unstructured_chunk_overlap,
|
||||
sampling_strategy,
|
||||
|
|
@ -150,7 +165,9 @@ export function parseSeedConfig(
|
|||
seed_drop_columns?: string[];
|
||||
seed_preview_rows?: Record<string, unknown>[];
|
||||
local_file_name?: string;
|
||||
unstructured_file_name?: string;
|
||||
unstructuredFileIds?: string[];
|
||||
unstructuredFileNames?: string[];
|
||||
unstructuredFileSizes?: number[];
|
||||
unstructured_chunk_size?: string;
|
||||
unstructured_chunk_overlap?: string;
|
||||
},
|
||||
|
|
@ -181,8 +198,14 @@ export function parseSeedConfig(
|
|||
...(options?.local_file_name !== undefined
|
||||
? { local_file_name: options.local_file_name }
|
||||
: {}),
|
||||
...(options?.unstructured_file_name !== undefined
|
||||
? { unstructured_file_name: options.unstructured_file_name }
|
||||
...(options?.unstructuredFileIds !== undefined
|
||||
? { unstructured_file_ids: options.unstructuredFileIds }
|
||||
: {}),
|
||||
...(options?.unstructuredFileNames !== undefined
|
||||
? { unstructured_file_names: options.unstructuredFileNames }
|
||||
: {}),
|
||||
...(options?.unstructuredFileSizes !== undefined
|
||||
? { unstructured_file_sizes: options.unstructuredFileSizes }
|
||||
: {}),
|
||||
...(options?.unstructured_chunk_size !== undefined
|
||||
? { unstructured_chunk_size: options.unstructured_chunk_size }
|
||||
|
|
|
|||
|
|
@ -430,8 +430,10 @@ export function buildRecipePayload(
|
|||
local_file_name: firstSeed.local_file_name,
|
||||
}),
|
||||
...(firstSeed &&
|
||||
firstSeed.unstructured_file_name !== undefined && {
|
||||
unstructured_file_name: firstSeed.unstructured_file_name,
|
||||
firstSeed.unstructured_file_ids !== undefined && {
|
||||
unstructured_file_ids: firstSeed.unstructured_file_ids,
|
||||
unstructured_file_names: firstSeed.unstructured_file_names,
|
||||
unstructured_file_sizes: firstSeed.unstructured_file_sizes,
|
||||
}),
|
||||
...(firstSeed &&
|
||||
firstSeed.unstructured_chunk_size !== undefined && {
|
||||
|
|
|
|||
|
|
@ -71,7 +71,7 @@ export function buildSeedConfig(
|
|||
return {
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
seed_type: "unstructured",
|
||||
path,
|
||||
paths: config.resolved_paths?.length ? config.resolved_paths : [config.hf_path],
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
chunk_size: chunkSize,
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
|
|
|
|||
|
|
@ -70,8 +70,15 @@ export type RecipePayload = {
|
|||
seed_drop_columns?: string[];
|
||||
seed_preview_rows?: Record<string, unknown>[];
|
||||
local_file_name?: string;
|
||||
unstructured_file_name?: string;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
unstructured_file_ids?: string[];
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
unstructured_file_names?: string[];
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
unstructured_file_sizes?: number[];
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
unstructured_chunk_size?: string;
|
||||
// biome-ignore lint/style/useNamingConvention: api schema
|
||||
unstructured_chunk_overlap?: string;
|
||||
// ui-only: per-node advanced accordion state
|
||||
advanced_open_by_node?: Record<string, boolean>;
|
||||
|
|
|
|||
|
|
@ -271,7 +271,11 @@ export function getConfigErrors(config: NodeConfig | null): string[] {
|
|||
if (seedSourceType === "hf" && !config.hf_repo_id.trim()) {
|
||||
errors.push("Choose a Hugging Face dataset.");
|
||||
}
|
||||
if (!config.hf_path.trim()) {
|
||||
const hasPath =
|
||||
seedSourceType === "unstructured"
|
||||
? (config.resolved_paths?.length ?? 0) > 0
|
||||
: Boolean(config.hf_path.trim());
|
||||
if (!hasPath) {
|
||||
errors.push("Load the source-data preview first.");
|
||||
}
|
||||
if (
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue