feat(studio): multi-file unstructured seed upload with better backend extraction (#4468)

* fix(recipe-studio): prevent fitView from zooming to wrong location on recipe load

* feat: add pymupdf/python-docx deps and unstructured uploads storage root

* feat: add POST /seed/upload-unstructured-file endpoint

* feat: add multi-file chunking with source_file column

* feat: update frontend types and API layer for multi-file upload

* feat: round-robin preview rows across source files

Ensures every uploaded file is represented in the preview table
by cycling through sources instead of just taking the first N rows.

* fix: disable OCR, fix auto-load timing, fix persistence on reload

- Disable pymupdf4llm OCR with write_images=False, show_progress=False
- Replace onAllUploaded callback with useEffect that detects uploading→done
  transition (avoids stale closure reading empty file IDs)
- Fix importer to preserve file IDs from saved recipes instead of clearing
  (clearing only happens at share time via sanitizeSeedForShare)

* fix: harden unstructured upload with input validation and state fixes

Validate block_id/file_id with alphanumeric regex to prevent path
traversal, use exact stem match for file deletion, add error handling
for metadata writes and empty files, fix React stale closures and
object mutations in upload loop, and correct validation logic for
unstructured seed resolved_paths.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix: address PR review - legacy path import, share sanitizer, sync effect

Promote legacy source.path into resolved_paths for old unstructured
recipes, clear source.paths in share sanitizer to prevent leaking local
filesystem paths, and gate file sync effect to dialog open transition
so users can actually delete all uploaded files.

* fix: CSV column fix (BOM + whitespace + unnamed index re-save) for #4470

* fix: harden unstructured upload flow and polish dialog UX

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Wasim Yousef Said 2026-03-20 21:22:42 +01:00 committed by GitHub
parent f113f3511d
commit dd283b0605
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
49 changed files with 1216 additions and 315 deletions

View file

@ -9,7 +9,7 @@ from __future__ import annotations
from typing import Any
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, model_validator
class RecipePayload(BaseModel):
@ -76,13 +76,41 @@ class SeedInspectRequest(BaseModel):
class SeedInspectUploadRequest(BaseModel):
filename: str = Field(min_length = 1)
content_base64: str = Field(min_length = 1)
# Legacy single-file flow (mutually exclusive with file_ids)
filename: str | None = None
content_base64: str | None = None
# Multi-file flow (mutually exclusive with content_base64)
block_id: str | None = None
file_ids: list[str] | None = None
file_names: list[str] | None = None
# Shared fields
preview_size: int = Field(default = 10, ge = 1, le = 50)
seed_source_type: str | None = None
unstructured_chunk_size: int | None = Field(default = None, ge = 1, le = 20000)
unstructured_chunk_overlap: int | None = Field(default = None, ge = 0, le = 20000)
@model_validator(mode = "after")
def _check_mutual_exclusivity(self) -> "SeedInspectUploadRequest":
has_legacy = self.content_base64 is not None
has_multi = self.file_ids is not None
if has_legacy and has_multi:
raise ValueError("Provide either content_base64 or file_ids, not both")
if not has_legacy and not has_multi:
raise ValueError("Provide either content_base64 or file_ids")
if has_multi:
if len(self.file_ids) == 0:
raise ValueError("file_ids must not be empty")
if not self.block_id:
raise ValueError("block_id is required when using file_ids")
if self.file_names is None or len(self.file_ids) != len(self.file_names):
raise ValueError(
"file_names must be provided and same length as file_ids"
)
if has_legacy:
if not self.filename:
raise ValueError("filename is required when using content_base64")
return self
class SeedInspectResponse(BaseModel):
dataset_name: str
@ -91,6 +119,15 @@ class SeedInspectResponse(BaseModel):
preview_rows: list[dict[str, Any]] = Field(default_factory = list)
split: str | None = None
subset: str | None = None
resolved_paths: list[str] | None = None
class UnstructuredFileUploadResponse(BaseModel):
file_id: str
filename: str
size_bytes: int
status: str # "ok" or "error"
error: str | None = None
class McpToolsListRequest(BaseModel):

View file

@ -13,6 +13,9 @@ requires-python = ">=3.11"
dependencies = [
"data-designer-engine>=0.5.1,<0.6",
"pandas>=2,<3",
"pymupdf>=1.24.0",
"pymupdf4llm>=0.0.17",
"mammoth>=1.8.0",
]
[project.entry-points."data_designer.plugins"]

View file

@ -8,6 +8,8 @@ import re
from pathlib import Path
from typing import Any
import pandas as pd
from utils.paths import ensure_dir, unstructured_seed_cache_root
DEFAULT_CHUNK_SIZE = 1200
@ -59,6 +61,59 @@ def build_unstructured_preview_rows(
]
def build_multi_file_preview_rows(
    *,
    file_entries: list[tuple[Path, str]],
    preview_size: int,
    chunk_size: int | None,
    chunk_overlap: int | None,
) -> list[dict[str, str]]:
    """Materialize chunks for every uploaded file and sample a round-robin preview.

    ``chunk_size``/``chunk_overlap`` may be ``None``; they fall back to the
    module defaults via ``_to_int`` before chunking.
    """
    effective_size = _to_int(chunk_size, DEFAULT_CHUNK_SIZE)
    effective_overlap = _to_int(chunk_overlap, DEFAULT_CHUNK_OVERLAP)
    _, all_rows = materialize_multi_file_unstructured_seed(
        file_entries=file_entries,
        chunk_size=effective_size,
        chunk_overlap=effective_overlap,
    )
    return _round_robin_preview(all_rows, preview_size)
def _round_robin_preview(
rows: list[dict[str, str]],
preview_size: int,
) -> list[dict[str, str]]:
"""Pick preview rows round-robin across source files so every file is represented."""
if not rows or preview_size <= 0:
return []
# Group rows by source_file, preserving order of first appearance
from collections import OrderedDict
grouped: OrderedDict[str, list[dict[str, str]]] = OrderedDict()
for row in rows:
key = row.get("source_file", "")
if key not in grouped:
grouped[key] = []
grouped[key].append(row)
result: list[dict[str, str]] = []
iterators = [iter(chunks) for chunks in grouped.values()]
while len(result) < preview_size and iterators:
exhausted: list[int] = []
for i, it in enumerate(iterators):
if len(result) >= preview_size:
break
val = next(it, None)
if val is not None:
result.append(val)
else:
exhausted.append(i)
for i in reversed(exhausted):
iterators.pop(i)
return result
def materialize_unstructured_seed_dataset(
*,
source_path: Path,
@ -103,6 +158,43 @@ def materialize_unstructured_seed_dataset(
return parquet_path, rows
def materialize_multi_file_unstructured_seed(
    *,
    file_entries: list[tuple[Path, str]],  # (extracted_txt_path, original_filename)
    chunk_size: int,
    chunk_overlap: int,
) -> tuple[Path, list[dict[str, str]]]:
    """Chunk multiple files and combine into one parquet dataset with source_file column.

    Args:
        file_entries: Pairs of (path to an extracted text file, original upload
            filename); the filename is stored in each row's ``source_file`` column.
        chunk_size: Requested chunk size; normalized through ``resolve_chunking``.
        chunk_overlap: Requested overlap; normalized through ``resolve_chunking``.

    Returns:
        Tuple of (path to the cached parquet file, list of row dicts with
        ``chunk_text`` and ``source_file`` keys).

    Raises:
        ValueError: If no text was produced from any of the files.
    """
    chunk_size, chunk_overlap = resolve_chunking(chunk_size, chunk_overlap)
    # The cache key covers file identity (path/size/mtime) plus chunk params,
    # so a cache hit means identical inputs were already chunked.
    cache_key = _compute_multi_file_cache_key(file_entries, chunk_size, chunk_overlap)
    cached = _CACHE_DIR / f"{cache_key}.parquet"
    if cached.exists():
        df = pd.read_parquet(cached)
        rows = df.to_dict(orient="records")
        return cached, rows
    all_rows: list[dict[str, str]] = []
    for txt_path, orig_name in file_entries:
        text = load_unstructured_text_file(txt_path)
        chunks = split_text_into_chunks(
            text=text,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        for chunk in chunks:
            all_rows.append({"chunk_text": chunk, "source_file": orig_name})
    if not all_rows:
        raise ValueError("No text found in any uploaded files.")
    df = pd.DataFrame(all_rows)
    ensure_dir(_CACHE_DIR)
    # Write to a temp path then atomically rename so concurrent readers never
    # observe a half-written parquet at the final cache location.
    tmp = _CACHE_DIR / f"{cache_key}.tmp.parquet"
    df.to_parquet(tmp, index=False)
    tmp.replace(cached)
    return cached, all_rows
def load_unstructured_text_file(path: Path) -> str:
ext = path.suffix.lower()
if ext not in {".txt", ".md"}:
@ -193,3 +285,17 @@ def _compute_cache_key(
]
).encode("utf-8")
return hashlib.sha256(payload).hexdigest()
def _compute_multi_file_cache_key(
file_entries: list[tuple[Path, str]],
chunk_size: int,
chunk_overlap: int,
) -> str:
parts: list[str] = []
for path, name in sorted(file_entries, key = lambda e: e[1]):
st = path.stat()
parts.append(f"{path}|{st.st_size}|{st.st_mtime_ns}|{name}")
parts.append(f"cs={chunk_size}|co={chunk_overlap}")
raw = "\n".join(parts)
return hashlib.sha256(raw.encode()).hexdigest()

View file

@ -6,7 +6,7 @@ from __future__ import annotations
from pathlib import Path
from typing import Literal
from pydantic import Field, field_validator
from pydantic import Field, field_validator, model_validator
from data_designer.config.seed_source import SeedSource
@ -15,27 +15,37 @@ from .chunking import DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, resolve_chunkin
class UnstructuredSeedSource(SeedSource):
seed_type: Literal["unstructured"] = "unstructured"
path: str = Field(..., min_length = 1)
paths: list[str] = Field(min_length = 1)
@model_validator(mode = "before")
@classmethod
def _normalize_legacy_path(cls, data):
if isinstance(data, dict) and "paths" not in data and data.get("path"):
data = dict(data)
data["paths"] = [data["path"]]
return data
chunk_size: int = DEFAULT_CHUNK_SIZE
chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
@field_validator("path", mode = "after")
@field_validator("paths")
@classmethod
def _validate_path(cls, value: str) -> str:
path = Path(value).expanduser()
if not path.is_file():
raise ValueError(f"Unstructured seed path is not a file: {path}")
return value
def _validate_paths(cls, v: list[str]) -> list[str]:
for p in v:
expanded = Path(p).expanduser()
if not expanded.is_file():
raise ValueError(f"Seed file does not exist: {expanded}")
return v
@field_validator("chunk_size", mode = "after")
@field_validator("chunk_size")
@classmethod
def _validate_chunk_size(cls, value: int) -> int:
size, _ = resolve_chunking(value, 0)
return size
def _resolve_chunk_size(cls, v: int) -> int:
cs, _ = resolve_chunking(v, 0)
return cs
@field_validator("chunk_overlap", mode = "after")
@field_validator("chunk_overlap")
@classmethod
def _validate_chunk_overlap(cls, value: int, info) -> int:
size = info.data.get("chunk_size", cls.model_fields["chunk_size"].default)
_, overlap = resolve_chunking(size, value)
return overlap
def _resolve_chunk_overlap(cls, v: int, info) -> int:
cs = info.data.get("chunk_size", DEFAULT_CHUNK_SIZE)
_, co = resolve_chunking(cs, v)
return co

View file

@ -8,7 +8,6 @@ from pathlib import Path
import data_designer.lazy_heavy_imports as lazy
from data_designer.engine.resources.seed_reader import SeedReader
from .chunking import materialize_unstructured_seed_dataset
from .config import UnstructuredSeedSource
@ -17,8 +16,25 @@ class UnstructuredSeedReader(SeedReader[UnstructuredSeedSource]):
return lazy.duckdb.connect()
def get_dataset_uri(self) -> str:
path, _ = materialize_unstructured_seed_dataset(
source_path = Path(self.source.path),
from .chunking import materialize_multi_file_unstructured_seed
import json as json_mod
file_entries: list[tuple[Path, str]] = []
for p in self.source.paths:
path_obj = Path(p)
file_id = path_obj.name.replace(".extracted.txt", "")
meta_path = path_obj.parent / f"{file_id}.meta.json"
orig_name = path_obj.name
if meta_path.exists():
try:
meta = json_mod.loads(meta_path.read_text())
orig_name = meta.get("original_filename", path_obj.name)
except (json_mod.JSONDecodeError, OSError):
pass
file_entries.append((path_obj, orig_name))
path, _ = materialize_multi_file_unstructured_seed(
file_entries = file_entries,
chunk_size = self.source.chunk_size,
chunk_overlap = self.source.chunk_overlap,
)

View file

@ -17,3 +17,6 @@ ruff<1,>=0.14.10
scipy<2,>=1.11.0
sqlfluff<4,>=3.2.0
tiktoken<1,>=0.8.0
pymupdf>=1.24.0
pymupdf4llm>=0.0.17
mammoth>=1.8.0

View file

@ -7,23 +7,27 @@ from __future__ import annotations
import base64
import binascii
import json
import re
from itertools import islice
from pathlib import Path
from typing import Any
from uuid import uuid4
from fastapi import APIRouter, HTTPException
from fastapi import APIRouter, HTTPException, UploadFile, File as FastAPIFile, Form
from data_designer_unstructured_seed.chunking import (
build_unstructured_preview_rows,
normalize_unstructured_text,
resolve_chunking,
)
from core.data_recipe.jsonable import to_preview_jsonable
from utils.paths import ensure_dir, seed_uploads_root
from utils.paths import ensure_dir, seed_uploads_root, unstructured_uploads_root
from models.data_recipe import (
SeedInspectRequest,
SeedInspectResponse,
SeedInspectUploadRequest,
UnstructuredFileUploadResponse,
)
router = APIRouter()
@ -31,8 +35,21 @@ router = APIRouter()
DATA_EXTS = (".parquet", ".jsonl", ".json", ".csv")
DEFAULT_SPLIT = "train"
LOCAL_UPLOAD_EXTS = {".csv", ".json", ".jsonl"}
UNSTRUCTURED_UPLOAD_EXTS = {".txt", ".md"}
UNSTRUCTURED_ALLOWED_EXTS = {".pdf", ".docx", ".txt", ".md"}
SEED_UPLOAD_DIR = seed_uploads_root()
UNSTRUCTURED_UPLOAD_ROOT = unstructured_uploads_root()
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
MAX_TOTAL_SIZE = 100 * 1024 * 1024 # 100MB
_SAFE_ID_RE = re.compile(r"^[a-zA-Z0-9_-]+$")
def _validate_safe_id(value: str, label: str) -> str:
    """Reject identifiers that could enable path traversal in upload paths.

    Only non-empty strings of ``[a-zA-Z0-9_-]`` pass; anything else raises
    a 400 so the value is never used to build a filesystem path.
    """
    if value and _SAFE_ID_RE.match(value):
        return value
    raise HTTPException(
        400, f"Invalid {label}: must be alphanumeric/dash/underscore only"
    )
def _serialize_preview_value(value: Any) -> Any:
@ -177,7 +194,17 @@ def _read_preview_rows_from_local_file(
ext = path.suffix.lower()
try:
if ext == ".csv":
df = pd.read_csv(path, nrows = preview_size)
df = pd.read_csv(path, nrows = preview_size, encoding = "utf-8-sig")
df.columns = df.columns.str.strip()
unnamed = [c for c in df.columns if c == "" or c.startswith("Unnamed:")]
if unnamed:
df = df.drop(columns = unnamed)
full_df = pd.read_csv(path, encoding = "utf-8-sig")
full_df.columns = full_df.columns.str.strip()
full_df = full_df.drop(columns = unnamed)
tmp_csv = path.with_suffix(".tmp.csv")
full_df.to_csv(tmp_csv, index = False, encoding = "utf-8")
tmp_csv.replace(path)
elif ext == ".jsonl":
df = pd.read_json(path, lines = True).head(preview_size)
elif ext == ".json":
@ -220,6 +247,36 @@ def _read_preview_rows_from_unstructured_file(
return _serialize_preview_rows(rows)
def _read_preview_rows_from_multi_files(
    *,
    block_id: str,
    file_ids: list[str],
    file_names: list[str],
    preview_size: int,
    chunk_size: int | None,
    chunk_overlap: int | None,
) -> list[dict[str, str]]:
    """Build preview rows from the extracted text of previously uploaded files.

    Raises a 404 if any file's extracted text is missing from the block
    directory (e.g. the upload was deleted or never completed).
    """
    from data_designer_unstructured_seed.chunking import build_multi_file_preview_rows

    _validate_safe_id(block_id, "block_id")
    block_dir = UNSTRUCTURED_UPLOAD_ROOT / block_id
    entries: list[tuple[Path, str]] = []
    for fid, fname in zip(file_ids, file_names):
        extracted_path = block_dir / f"{fid}.extracted.txt"
        if not extracted_path.exists():
            raise HTTPException(
                404, f"Extracted text not found for file: {fname} (id: {fid})"
            )
        entries.append((extracted_path, fname))
    return build_multi_file_preview_rows(
        file_entries=entries,
        preview_size=preview_size,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
@router.post("/seed/inspect", response_model = SeedInspectResponse)
def inspect_seed_dataset(payload: SeedInspectRequest) -> SeedInspectResponse:
dataset_name = payload.dataset_name.strip()
@ -306,14 +363,202 @@ def inspect_seed_dataset(payload: SeedInspectRequest) -> SeedInspectResponse:
)
def _extract_text_from_file(file_path: Path, ext: str) -> str:
    """Extract text from uploaded file based on extension, converting to markdown where possible.

    Extractors are imported lazily so pdf/docx dependencies are only loaded
    when a file of that type is actually uploaded.
    """
    if ext == ".pdf":
        import pymupdf4llm

        raw = pymupdf4llm.to_markdown(
            str(file_path), write_images=False, show_progress=False
        )
    elif ext == ".docx":
        import mammoth

        with open(str(file_path), "rb") as handle:
            raw = mammoth.convert_to_markdown(handle).value
    elif ext in {".txt", ".md"}:
        raw = file_path.read_text(encoding="utf-8", errors="ignore")
    else:
        raise ValueError(f"Unsupported file type: {ext}")
    return normalize_unstructured_text(raw)
def _get_block_total_size(block_dir: Path, file_ids: list[str]) -> int:
"""Sum raw upload sizes for tracked file IDs only."""
if not block_dir.exists() or not file_ids:
return 0
id_set = set(file_ids)
total = 0
for f in block_dir.iterdir():
if not f.is_file():
continue
if f.name.endswith(".extracted.txt") or f.name.endswith(".meta.json"):
continue
stem = f.name.split(".")[0]
if stem in id_set:
total += f.stat().st_size
return total
@router.post("/seed/upload-unstructured-file")
async def upload_unstructured_file(
    file: UploadFile = FastAPIFile(...),
    block_id: str = Form(...),
    existing_file_ids: str = Form(""),
) -> UnstructuredFileUploadResponse:
    """Accept one uploaded file, extract its text, and persist all artifacts.

    Stores three files under the block directory: the raw upload
    (``<file_id><ext>``), the extracted text (``<file_id>.extracted.txt``),
    and metadata (``<file_id>.meta.json``). Extraction/metadata failures are
    reported via ``status="error"`` rather than an HTTP error so the frontend
    can surface per-file results; validation failures (bad id, bad type,
    size limits) raise HTTPException.

    Args:
        file: The multipart upload.
        block_id: Safe identifier for the owning seed block (validated).
        existing_file_ids: Comma-separated IDs already tracked by the client,
            used to compute the running total against the per-block quota.
    """
    _validate_safe_id(block_id, "block_id")
    tracked_ids = [fid.strip() for fid in existing_file_ids.split(",") if fid.strip()]
    original_filename = file.filename or "upload"
    ext = Path(original_filename).suffix.lower()
    if ext not in UNSTRUCTURED_ALLOWED_EXTS:
        raise HTTPException(
            400,
            f"Unsupported file type: {ext}. Allowed: {', '.join(sorted(UNSTRUCTURED_ALLOWED_EXTS))}",
        )
    content = await file.read()
    size_bytes = len(content)
    if size_bytes == 0:
        raise HTTPException(400, "Empty file not allowed")
    if size_bytes > MAX_FILE_SIZE:
        raise HTTPException(
            413, f"File too large ({size_bytes} bytes). Maximum is 50MB."
        )
    block_dir = UNSTRUCTURED_UPLOAD_ROOT / block_id
    ensure_dir(block_dir)
    # Quota only counts files the client still tracks, so deleted uploads
    # don't consume budget.
    current_total = _get_block_total_size(block_dir, file_ids=tracked_ids)
    if current_total + size_bytes > MAX_TOTAL_SIZE:
        raise HTTPException(
            413, f"Total upload limit ({MAX_TOTAL_SIZE // (1024 * 1024)}MB) exceeded"
        )
    file_id = uuid4().hex
    raw_path = block_dir / f"{file_id}{ext}"
    raw_path.write_bytes(content)
    extracted_path = block_dir / f"{file_id}.extracted.txt"
    try:
        extracted_text = _extract_text_from_file(raw_path, ext)
        if not extracted_text or not extracted_text.strip():
            # Nothing usable extracted: clean up the raw upload and report
            # a per-file error instead of failing the request.
            raw_path.unlink(missing_ok=True)
            return UnstructuredFileUploadResponse(
                file_id=file_id,
                filename=original_filename,
                size_bytes=size_bytes,
                status="error",
                error="No extractable text found in file",
            )
        extracted_path.write_text(extracted_text, encoding="utf-8")
    except Exception as e:
        # Extraction blew up (corrupt pdf/docx etc.) — remove any partial
        # artifacts so the block directory never holds orphaned files.
        raw_path.unlink(missing_ok=True)
        extracted_path.unlink(missing_ok=True)
        return UnstructuredFileUploadResponse(
            file_id=file_id,
            filename=original_filename,
            size_bytes=size_bytes,
            status="error",
            error=f"Text extraction failed: {type(e).__name__}: {e}",
        )
    try:
        meta_path = block_dir / f"{file_id}.meta.json"
        meta_path.write_text(
            json.dumps(
                {"original_filename": original_filename, "size_bytes": size_bytes}
            ),
            encoding="utf-8",
        )
    except OSError:
        # Metadata is required for later display/lookup — without it the
        # upload is unusable, so roll back both stored files.
        raw_path.unlink(missing_ok=True)
        extracted_path.unlink(missing_ok=True)
        return UnstructuredFileUploadResponse(
            file_id=file_id,
            filename=original_filename,
            size_bytes=size_bytes,
            status="error",
            error="Failed to save file metadata",
        )
    return UnstructuredFileUploadResponse(
        file_id=file_id,
        filename=original_filename,
        size_bytes=size_bytes,
        status="ok",
    )
@router.delete("/seed/unstructured-file/{block_id}/{file_id}")
async def remove_unstructured_file(block_id: str, file_id: str):
    """Delete every artifact (raw upload, extracted text, metadata) for one file.

    Matches by exact filename stem so ``file_id`` cannot partially match
    another upload. Removes the block directory itself once it is empty.

    Raises:
        HTTPException: 404 if the block directory or the file does not exist.
    """
    _validate_safe_id(block_id, "block_id")
    _validate_safe_id(file_id, "file_id")
    block_dir = UNSTRUCTURED_UPLOAD_ROOT / block_id
    if not block_dir.exists():
        raise HTTPException(404, "Block not found")
    deleted = False
    # Snapshot the listing first: unlinking entries while the iterdir()
    # generator is live has platform-dependent behavior.
    for f in list(block_dir.iterdir()):
        stem = f.name.split(".")[0]
        if stem == file_id:
            f.unlink(missing_ok=True)
            deleted = True
    if not deleted:
        raise HTTPException(404, "File not found")
    try:
        if not any(block_dir.iterdir()):
            block_dir.rmdir()
    except OSError:
        pass
    return {"status": "ok"}
@router.post("/seed/inspect-upload", response_model = SeedInspectResponse)
def inspect_seed_upload(payload: SeedInspectUploadRequest) -> SeedInspectResponse:
if payload.file_ids is not None:
if len(payload.file_ids) == 0:
raise HTTPException(400, "file_ids must not be empty")
_validate_safe_id(payload.block_id, "block_id")
for fid in payload.file_ids:
_validate_safe_id(fid, "file_id")
preview_rows = _read_preview_rows_from_multi_files(
block_id = payload.block_id,
file_ids = payload.file_ids,
file_names = payload.file_names,
preview_size = payload.preview_size,
chunk_size = payload.unstructured_chunk_size,
chunk_overlap = payload.unstructured_chunk_overlap,
)
columns = ["chunk_text", "source_file"] if preview_rows else []
resolved_paths = [
str(UNSTRUCTURED_UPLOAD_ROOT / payload.block_id / f"{fid}.extracted.txt")
for fid in payload.file_ids
]
return SeedInspectResponse(
dataset_name = "unstructured_seed",
resolved_path = resolved_paths[0] if resolved_paths else "",
resolved_paths = resolved_paths,
columns = columns,
preview_rows = _serialize_preview_rows(preview_rows),
)
seed_source_type = _normalize_optional_text(payload.seed_source_type) or "local"
filename = _sanitize_filename(payload.filename)
ext = Path(filename).suffix.lower()
# Legacy single-file unstructured path only supports .txt/.md
# PDF/DOCX extraction uses the multi-file upload endpoint instead
_LEGACY_UNSTRUCTURED_EXTS = {".txt", ".md"}
if seed_source_type == "unstructured":
if ext not in UNSTRUCTURED_UPLOAD_EXTS:
allowed = ", ".join(sorted(UNSTRUCTURED_UPLOAD_EXTS))
if ext not in _LEGACY_UNSTRUCTURED_EXTS:
allowed = ", ".join(sorted(_LEGACY_UNSTRUCTURED_EXTS))
raise HTTPException(
status_code = 400,
detail = f"unsupported file type: {ext}. allowed: {allowed}",
@ -329,8 +574,7 @@ def inspect_seed_upload(payload: SeedInspectUploadRequest) -> SeedInspectRespons
file_bytes = _decode_base64_payload(payload.content_base64)
if not file_bytes:
raise HTTPException(status_code = 400, detail = "empty upload payload")
max_size_bytes = 50 * 1024 * 1024
if len(file_bytes) > max_size_bytes:
if len(file_bytes) > MAX_FILE_SIZE:
raise HTTPException(status_code = 413, detail = "file too large (max 50MB)")
ensure_dir(SEED_UPLOAD_DIR)

View file

@ -19,6 +19,7 @@ from .storage_roots import (
tmp_root,
seed_uploads_root,
unstructured_seed_cache_root,
unstructured_uploads_root,
oxc_validator_tmp_root,
tensorboard_root,
ensure_dir,
@ -47,6 +48,7 @@ __all__ = [
"tmp_root",
"seed_uploads_root",
"unstructured_seed_cache_root",
"unstructured_uploads_root",
"oxc_validator_tmp_root",
"tensorboard_root",
"ensure_dir",

View file

@ -54,13 +54,17 @@ def tmp_root() -> Path:
def seed_uploads_root() -> Path:
return tmp_root() / "seed-uploads"
return datasets_root() / "seed-uploads"
def unstructured_seed_cache_root() -> Path:
return tmp_root() / "unstructured-seed-cache"
def unstructured_uploads_root() -> Path:
return datasets_root() / "unstructured-uploads"
def oxc_validator_tmp_root() -> Path:
return tmp_root() / "oxc-validator"
@ -104,6 +108,7 @@ def ensure_studio_directories() -> None:
datasets_root,
dataset_uploads_root,
recipe_datasets_root,
unstructured_uploads_root,
outputs_root,
exports_root,
auth_root,

View file

@ -103,13 +103,22 @@ export type SeedInspectRequest = {
};
export type SeedInspectUploadRequest = {
filename: string;
// base64 payload without data URL prefix
content_base64: string;
// Legacy single-file
filename?: string;
// biome-ignore lint/style/useNamingConvention: api schema
content_base64?: string;
// Multi-file
// biome-ignore lint/style/useNamingConvention: api schema
block_id?: string;
// biome-ignore lint/style/useNamingConvention: api schema
file_ids?: string[];
// biome-ignore lint/style/useNamingConvention: api schema
file_names?: string[];
// Shared
// biome-ignore lint/style/useNamingConvention: api schema
preview_size?: number;
// biome-ignore lint/style/useNamingConvention: api schema
seed_source_type?: "local" | "unstructured";
seed_source_type?: string;
// biome-ignore lint/style/useNamingConvention: api schema
unstructured_chunk_size?: number;
// biome-ignore lint/style/useNamingConvention: api schema
@ -126,6 +135,8 @@ export type SeedInspectResponse = {
preview_rows: Record<string, unknown>[];
split?: string | null;
subset?: string | null;
// biome-ignore lint/style/useNamingConvention: api schema
resolved_paths?: string[] | null;
};
export type ValidateError = {
@ -372,3 +383,64 @@ export async function streamRecipeJobEvents(options: {
}
// NOTE: preview endpoints removed from harness.
type UnstructuredFileUploadResponse = {
// biome-ignore lint/style/useNamingConvention: api schema
file_id: string;
filename: string;
// biome-ignore lint/style/useNamingConvention: api schema
size_bytes: number;
status: "ok" | "error";
error?: string;
};
export async function uploadUnstructuredFile(
  file: File,
  blockId: string,
  signal?: AbortSignal,
  existingFileIds?: string[],
): Promise<UnstructuredFileUploadResponse> {
  const body = new FormData();
  body.append("file", file);
  body.append("block_id", blockId);
  if (existingFileIds?.length) {
    body.append("existing_file_ids", existingFileIds.join(","));
  }
  const response = await authFetch(`${DATA_DESIGNER_API_BASE}/seed/upload-unstructured-file`, {
    method: "POST",
    body,
    signal,
  });
  // 413 (payload too large) is surfaced as a per-file error result rather
  // than a thrown exception, so the caller can keep uploading other files.
  if (response.status === 413) {
    const payload = await response.json().catch(() => ({ detail: "File too large" }));
    const message = typeof payload.detail === "string" ? payload.detail : "File too large";
    return {
      file_id: "",
      filename: file.name,
      size_bytes: file.size,
      status: "error",
      error: message,
    };
  }
  if (!response.ok) {
    const payload = await response.json().catch(() => ({ detail: "Upload failed" }));
    const message = typeof payload.detail === "string" ? payload.detail : "Upload failed";
    throw new Error(message);
  }
  return response.json();
}
export async function removeUnstructuredFile(
  blockId: string,
  fileId: string,
): Promise<void> {
  const url = `${DATA_DESIGNER_API_BASE}/seed/unstructured-file/${encodeURIComponent(blockId)}/${encodeURIComponent(fileId)}`;
  const res = await authFetch(url, { method: "DELETE" });
  // A 404 means the file is already gone — treat the delete as idempotent.
  if (res.ok || res.status === 404) {
    return;
  }
  throw new Error("Failed to remove file");
}

View file

@ -8,7 +8,7 @@ import {
useUpdateNodeInternals,
} from "@xyflow/react";
import { Button } from "@/components/ui/button";
import { getFitNodeIdsIgnoringNotes } from "../../utils/graph/fit-view";
import { buildFitViewOptions } from "../../utils/graph/fit-view";
type LayoutControlsProps = {
direction: "LR" | "TB";
@ -36,10 +36,7 @@ export function LayoutControls({
requestAnimationFrame(() => {
refreshNodeInternals();
requestAnimationFrame(() => {
fitView({
duration: 250,
nodes: getFitNodeIdsIgnoringNotes(getNodes()),
});
fitView(buildFitViewOptions(getNodes()));
});
});
}, [fitView, getNodes, onLayout, refreshNodeInternals]);
@ -51,10 +48,7 @@ export function LayoutControls({
requestAnimationFrame(() => {
refreshNodeInternals();
requestAnimationFrame(() => {
fitView({
duration: 250,
nodes: getFitNodeIdsIgnoringNotes(getNodes()),
});
fitView(buildFitViewOptions(getNodes()));
});
});
});

View file

@ -5,7 +5,7 @@ import { type ReactElement, useCallback } from "react";
import { Lock, LockOpen, Maximize2, Minus, Plus } from "lucide-react";
import { Panel, useReactFlow } from "@xyflow/react";
import { Button } from "@/components/ui/button";
import { getFitNodeIdsIgnoringNotes } from "../../utils/graph/fit-view";
import { buildFitViewOptions } from "../../utils/graph/fit-view";
import { RECIPE_FLOATING_ICON_BUTTON_CLASS } from "../recipe-floating-icon-button-class";
type ViewportControlsProps = {
@ -30,10 +30,7 @@ export function ViewportControls({
}, [zoomOut]);
const handleFitView = useCallback(() => {
fitView({
duration: 250,
nodes: getFitNodeIdsIgnoringNotes(getNodes()),
});
fitView(buildFitViewOptions(getNodes()));
}, [fitView, getNodes]);
return (

View file

@ -45,7 +45,9 @@ export function InlineSeed({ config, onUpdate }: InlineSeedProps): ReactElement
const isLocal = mode === "local";
const fileName = isLocal
? config.local_file_name?.trim()
: config.unstructured_file_name?.trim();
: config.unstructured_file_names?.length
? `${config.unstructured_file_names.length} file${config.unstructured_file_names.length !== 1 ? "s" : ""}`
: undefined;
return (
<div className="corner-squircle flex items-center gap-2 rounded-md border border-border/60 bg-muted/30 px-2 py-2">

View file

@ -256,9 +256,10 @@ function getConfigSummary(config: NodeConfig | undefined): string {
}
if (
seedSourceType === "unstructured" &&
config.unstructured_file_name?.trim()
config.unstructured_file_names?.length
) {
return config.unstructured_file_name.trim();
const count = config.unstructured_file_names.length;
return `${count} file${count !== 1 ? "s" : ""} uploaded`;
}
if (config.hf_path.trim()) {
return config.hf_path.trim();

View file

@ -58,7 +58,7 @@ export function ExpressionDialog({
value={config.name}
onChange={(value) => onUpdate({ name: value })}
/>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Output type"
htmlFor={dtypeId}
@ -82,7 +82,7 @@ export function ExpressionDialog({
</SelectContent>
</Select>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Formula"
htmlFor={exprId}

View file

@ -58,7 +58,7 @@ export function ImportDialog({
<DialogHeader>
<DialogTitle>Import recipe</DialogTitle>
</DialogHeader>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Recipe JSON"
htmlFor={payloadId}

View file

@ -215,7 +215,7 @@ export function LlmGeneralTab({
</p>
</div>
) : null}
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Model preset"
htmlFor={modelAliasId}
@ -262,7 +262,7 @@ export function LlmGeneralTab({
</p>
)}
{(hasToolProfiles || Boolean(config.tool_alias?.trim())) && (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Tool access (optional)"
htmlFor={toolAliasId}
@ -304,7 +304,7 @@ export function LlmGeneralTab({
</div>
)}
{config.llm_type === "code" && (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Code language"
htmlFor={codeLangId}
@ -327,7 +327,7 @@ export function LlmGeneralTab({
</Select>
</div>
)}
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Prompt"
htmlFor={promptId}
@ -377,7 +377,7 @@ export function LlmGeneralTab({
/>
</div>
{imageContext.enabled && (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Image field"
htmlFor={imageContextColumnId}
@ -414,7 +414,7 @@ export function LlmGeneralTab({
</div>
)}
{config.llm_type === "structured" && (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Response format"
htmlFor={outputFormatId}
@ -441,7 +441,7 @@ export function LlmGeneralTab({
/>
</CollapsibleTrigger>
<CollapsibleContent className="mt-3 space-y-4">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Instructions (optional)"
htmlFor={systemPromptId}
@ -465,7 +465,7 @@ export function LlmGeneralTab({
</p>
)}
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Save trace details"
htmlFor={traceModeId}

View file

@ -58,7 +58,7 @@ export function MarkdownNoteDialog({
</div>
</div>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Markdown"
htmlFor={markdownId}

View file

@ -72,7 +72,7 @@ export function ModelConfigDialog({
generation defaults you want to reuse.
</p>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Provider connection"
htmlFor={providerId}
@ -120,7 +120,7 @@ export function ModelConfigDialog({
: "Matching blocks are linked automatically on the canvas."}
</p>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Model ID"
htmlFor={modelId}
@ -144,7 +144,7 @@ export function ModelConfigDialog({
</p>
</div>
<div className="grid gap-3 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Temperature"
htmlFor={tempId}
@ -159,7 +159,7 @@ export function ModelConfigDialog({
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Top-p"
htmlFor={topPId}
@ -174,7 +174,7 @@ export function ModelConfigDialog({
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Max tokens"
htmlFor={maxTokensId}
@ -189,7 +189,7 @@ export function ModelConfigDialog({
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Timeout (seconds)"
htmlFor={timeoutId}
@ -214,7 +214,7 @@ export function ModelConfigDialog({
/>
</CollapsibleTrigger>
<CollapsibleContent className="mt-3 space-y-4">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Advanced request fields (JSON)"
htmlFor={extraBodyId}

View file

@ -52,7 +52,7 @@ export function ModelProviderDialog({
service requires one.
</p>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Endpoint"
htmlFor={endpointId}
@ -66,7 +66,7 @@ export function ModelProviderDialog({
onChange={(event) => updateField("endpoint", event.target.value)}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="API key (optional)"
htmlFor={apiKeyId}
@ -87,7 +87,7 @@ export function ModelProviderDialog({
/>
</CollapsibleTrigger>
<CollapsibleContent className="mt-3 space-y-4">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="API key environment variable"
htmlFor={apiKeyEnvId}
@ -101,7 +101,7 @@ export function ModelProviderDialog({
onChange={(event) => updateField("api_key_env", event.target.value)}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Extra headers (JSON)"
htmlFor={extraHeadersId}
@ -115,7 +115,7 @@ export function ModelProviderDialog({
onChange={(event) => updateField("extra_headers", event.target.value)}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Extra body (JSON)"
htmlFor={extraBodyId}

View file

@ -157,7 +157,7 @@ function DraftInputField({
placeholder,
}: DraftInputFieldProps): ReactElement {
return (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label={label} htmlFor={id} hint={hint} />
<Input
id={id}
@ -330,7 +330,7 @@ function RunDialogBody({
</p>
</DialogHeader>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Run type"
hint="Start with a quick check or generate the full dataset."
@ -358,7 +358,7 @@ function RunDialogBody({
</div>
{kind === "full" && (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Run name"
htmlFor="run-name"
@ -380,7 +380,7 @@ function RunDialogBody({
</div>
)}
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="Records" htmlFor="run-rows" hint={rowHint} />
<Input
id="run-rows"

View file

@ -93,7 +93,7 @@ export function ProcessorsDialog({
{schemaProcessor && (
<div className="space-y-3">
<AvailableVariables configId="" />
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Name"
htmlFor={nameId}
@ -106,7 +106,7 @@ export function ProcessorsDialog({
onChange={(event) => updateSchema({ name: event.target.value })}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Template (JSON)"
htmlFor={templateId}

View file

@ -23,7 +23,7 @@ export function BernoulliDialog({
value={config.name}
onChange={(value) => onUpdate({ name: value })}
/>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Probability (p)"
htmlFor={pId}

View file

@ -89,7 +89,7 @@ export function CategoryDialog({
onChange={(value) => onUpdate({ name: value })}
/>
<div className="space-y-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Values"
hint="Define allowed categorical values for this column."
@ -127,7 +127,7 @@ export function CategoryDialog({
/>
</CollapsibleTrigger>
<CollapsibleContent className="mt-2 space-y-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Weights (optional)"
hint="Set selection probability per value."
@ -239,7 +239,7 @@ export function CategoryDialog({
}}
placeholder="Type a conditional value and press Enter"
/>
<div className="grid gap-2">
<div className="grid gap-1.5">
<p className="text-xs font-semibold uppercase text-muted-foreground">
Rule weights (optional)
</p>

View file

@ -50,7 +50,7 @@ export function DatetimeDialog({
/>
<div className="grid gap-3">
<div className="grid gap-2 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Start"
htmlFor={startId}
@ -66,7 +66,7 @@ export function DatetimeDialog({
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="End"
htmlFor={endId}
@ -83,7 +83,7 @@ export function DatetimeDialog({
/>
</div>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Unit"
htmlFor={unitId}

View file

@ -33,7 +33,7 @@ export function GaussianDialog({
onChange={(value) => onUpdate({ name: value })}
/>
<div className="grid gap-3 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Mean"
htmlFor={meanId}
@ -47,7 +47,7 @@ export function GaussianDialog({
onChange={(event) => onUpdate({ mean: event.target.value })}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Std"
htmlFor={stdId}
@ -62,7 +62,7 @@ export function GaussianDialog({
/>
</div>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Convert to"
htmlFor={convertId}

View file

@ -58,7 +58,7 @@ export function PersonDialog({
<p className="text-sm text-foreground">Faker</p>
</div>
<div className="grid gap-3 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Locale"
htmlFor={localeId}
@ -73,7 +73,7 @@ export function PersonDialog({
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Sex"
htmlFor={sexId}
@ -95,7 +95,7 @@ export function PersonDialog({
</SelectContent>
</Select>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Age range"
htmlFor={ageRangeId}
@ -111,7 +111,7 @@ export function PersonDialog({
placeholder="18-70"
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="City"
htmlFor={cityId}

View file

@ -74,7 +74,7 @@ export function SubcategoryDialog({
onChange={(value) => onUpdate({ name: value })}
/>
<div className="space-y-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Parent category column"
htmlFor={parentSelectId}

View file

@ -45,7 +45,7 @@ export function TimedeltaDialog({
onChange={(value) => onUpdate({ name: value })}
/>
<div className="grid gap-3 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="dt_min"
htmlFor={dtMinId}
@ -59,7 +59,7 @@ export function TimedeltaDialog({
onChange={(event) => updateField("dt_min", event.target.value)}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="dt_max"
htmlFor={dtMaxId}
@ -74,7 +74,7 @@ export function TimedeltaDialog({
/>
</div>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Unit"
htmlFor={unitId}
@ -98,7 +98,7 @@ export function TimedeltaDialog({
</SelectContent>
</Select>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Reference datetime column"
htmlFor={referenceId}

View file

@ -33,7 +33,7 @@ export function UniformDialog({
onChange={(value) => onUpdate({ name: value })}
/>
<div className="grid gap-3 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Low"
htmlFor={lowId}
@ -47,7 +47,7 @@ export function UniformDialog({
onChange={(event) => onUpdate({ low: event.target.value })}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="High"
htmlFor={highId}
@ -62,7 +62,7 @@ export function UniformDialog({
/>
</div>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Convert to"
htmlFor={convertId}

View file

@ -29,7 +29,7 @@ export function UuidDialog({
value={config.name}
onChange={(value) => onUpdate({ name: value })}
/>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="UUID format (optional)"
htmlFor={uuidId}

View file

@ -37,10 +37,9 @@ import {
TabsList,
TabsTrigger,
} from "@/components/ui/tabs";
import mammoth from "mammoth";
import { type ReactElement, useCallback, useEffect, useMemo, useRef, useState } from "react";
import { extractText, getDocumentProxy } from "unpdf";
import { cn } from "@/lib/utils";
import { UnstructuredDropZone, type FileEntry } from "./unstructured-drop-zone";
import { inspectSeedDataset, inspectSeedUpload } from "../../api";
import { resolveImagePreview } from "../../utils/image-preview";
import type {
@ -64,7 +63,6 @@ const SELECTION_OPTIONS: Array<{ value: SeedSelectionType; label: string }> = [
];
const LOCAL_ACCEPT = ".csv,.json,.jsonl";
const UNSTRUCTURED_ACCEPT = ".txt,.pdf,.docx";
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024;
const DEFAULT_CHUNK_SIZE = 1200;
const DEFAULT_CHUNK_OVERLAP = 200;
@ -112,20 +110,20 @@ function getPreviewEmptyStateCopy(mode: SeedConfig["seed_source_type"]): {
} {
if (mode === "local") {
return {
title: "No local preview yet",
description: "Choose a CSV/JSON/JSONL file, then click Load to fetch 10 rows.",
title: "No preview yet",
description: "Upload a CSV, JSON, or JSONL file and click Load to see a sample.",
};
}
if (mode === "unstructured") {
return {
title: "No chunk preview yet",
title: "No preview yet",
description:
"Choose a TXT/PDF/DOCX file, then click Load to extract + preview chunk_text rows.",
"Upload your documents and the preview will appear once processing is done.",
};
}
return {
title: "No dataset preview yet",
description: "Pick a Hugging Face dataset and click Load to fetch 10 sample rows.",
title: "No preview yet",
description: "Select a Hugging Face dataset and click Load to see a sample.",
};
}
@ -177,42 +175,6 @@ async function fileToBase64Payload(file: File): Promise<string> {
});
}
async function extractUnstructuredText(file: File): Promise<string> {
const lower = file.name.toLowerCase();
if (lower.endsWith(".txt")) {
return file.text();
}
if (lower.endsWith(".pdf")) {
const buffer = new Uint8Array(await file.arrayBuffer());
const pdf = await getDocumentProxy(buffer);
const { text } = await extractText(pdf, { mergePages: true });
return text;
}
if (lower.endsWith(".docx")) {
const arrayBuffer = await file.arrayBuffer();
const { value } = await mammoth.extractRawText({ arrayBuffer });
return value;
}
throw new Error("Unsupported unstructured file type");
}
async function toUnstructuredUploadFile(file: File): Promise<File> {
const lower = file.name.toLowerCase();
if (lower.endsWith(".txt") || lower.endsWith(".md")) {
return file;
}
const text = (await extractUnstructuredText(file)).trim();
if (!text) {
throw new Error("No text found in file.");
}
const normalized = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
const stem = file.name.replace(/\.(pdf|docx)$/i, "") || "unstructured_seed";
return new File([normalized], `${stem}.txt`, {
type: "text/plain",
});
}
export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactElement {
const [inspectError, setInspectError] = useState<string | null>(null);
const [isInspecting, setIsInspecting] = useState(false);
@ -220,16 +182,84 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
const [previewRows, setPreviewRows] = useState<Record<string, unknown>[]>([]);
const [expandedPreviewRows, setExpandedPreviewRows] = useState<Record<number, boolean>>({});
const [localFile, setLocalFile] = useState<File | null>(null);
const [unstructuredFile, setUnstructuredFile] = useState<File | null>(null);
const [unstructuredFiles, setUnstructuredFiles] = useState<FileEntry[]>(() => {
if (config.unstructured_file_ids?.length) {
return config.unstructured_file_ids.map((id, i) => ({
id,
name: config.unstructured_file_names?.[i] ?? "Unknown",
size: config.unstructured_file_sizes?.[i] ?? 0,
status: "ok" as const,
}));
}
return [];
});
const mode = config.seed_source_type ?? "hf";
const previewEmpty = getPreviewEmptyStateCopy(mode);
const prevModeRef = useRef(mode);
useEffect(() => {
const prevMode = prevModeRef.current;
prevModeRef.current = mode;
setInspectError(null);
setLocalFile(null);
setUnstructuredFile(null);
}, [mode]);
if (prevMode === "unstructured" && mode !== "unstructured") {
setUnstructuredFiles([]);
}
if (prevMode !== "unstructured" && mode === "unstructured") {
if (config.unstructured_file_ids?.length) {
setUnstructuredFiles(
config.unstructured_file_ids.map((id, i) => ({
id,
name: config.unstructured_file_names?.[i] ?? "Unknown",
size: config.unstructured_file_sizes?.[i] ?? 0,
status: "ok" as const,
})),
);
} else {
setUnstructuredFiles([]);
}
}
}, [mode]); // eslint-disable-line react-hooks/exhaustive-deps
const didSyncFilesRef = useRef(false);
useEffect(() => {
if (!open) {
didSyncFilesRef.current = false;
return;
}
if (didSyncFilesRef.current) return;
if (mode !== "unstructured") return;
if (unstructuredFiles.length > 0) return;
if (!config.unstructured_file_ids?.length) return;
didSyncFilesRef.current = true;
setUnstructuredFiles(
config.unstructured_file_ids.map((id, i) => ({
id,
name: config.unstructured_file_names?.[i] ?? "Unknown",
size: config.unstructured_file_sizes?.[i] ?? 0,
status: "ok" as const,
})),
);
}, [open, mode, unstructuredFiles.length, config.unstructured_file_ids, config.unstructured_file_names, config.unstructured_file_sizes]);
const handleUnstructuredFilesChange = useCallback(
(updater: FileEntry[] | ((prev: FileEntry[]) => FileEntry[])) => {
setUnstructuredFiles((prev) => {
const next = typeof updater === "function" ? updater(prev) : updater;
const okFiles = next.filter((f) => f.status === "ok");
queueMicrotask(() => {
onUpdate({
unstructured_file_ids: okFiles.map((f) => f.id),
unstructured_file_names: okFiles.map((f) => f.name),
unstructured_file_sizes: okFiles.map((f) => f.size),
});
});
return next;
});
},
[onUpdate],
);
useEffect(() => {
setPreviewRows(config.seed_preview_rows ?? []);
@ -256,14 +286,16 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
if (!localFile) return null;
return `local:${localFile.name}|${localFile.size}|${localFile.lastModified}`;
}
if (!unstructuredFile) return null;
const okFiles = unstructuredFiles.filter((f) => f.status === "ok");
if (okFiles.length === 0) return null;
const { chunkSize, chunkOverlap } = resolveChunking(config);
return `unstructured:${unstructuredFile.name}|${unstructuredFile.size}|${unstructuredFile.lastModified}|${chunkSize}|${chunkOverlap}`;
const fileKey = okFiles.map((f) => `${f.id}|${f.name}`).join(",");
return `unstructured:${fileKey}|${chunkSize}|${chunkOverlap}`;
}, [
config,
localFile,
mode,
unstructuredFile,
unstructuredFiles,
]);
const loadSeedMetadata = useCallback(async (opts?: { silent?: boolean }): Promise<boolean> => {
@ -295,7 +327,9 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
hf_split: response.split ?? "",
hf_subset: response.subset ?? "",
local_file_name: "",
unstructured_file_name: "",
unstructured_file_ids: [],
unstructured_file_names: [],
unstructured_file_sizes: [],
});
setPreviewRows(response.preview_rows ?? []);
setLastLoadedKey(loadKey);
@ -326,50 +360,56 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
hf_subset: "",
hf_split: "",
local_file_name: localFile.name,
unstructured_file_name: "",
unstructured_file_ids: [],
unstructured_file_names: [],
unstructured_file_sizes: [],
});
setPreviewRows(response.preview_rows ?? []);
setLastLoadedKey(loadKey);
return true;
}
if (!unstructuredFile) {
throw new Error("Select a PDF/DOCX/TXT file first.");
}
if (unstructuredFile.size > MAX_UPLOAD_BYTES) {
throw new Error("File too large (max 50MB).");
if (mode === "unstructured") {
const fileIds = unstructuredFiles
.filter((f) => f.status === "ok")
.map((f) => f.id);
const fileNames = unstructuredFiles
.filter((f) => f.status === "ok")
.map((f) => f.name);
if (fileIds.length === 0) {
setInspectError("No files uploaded");
return false;
}
const { chunkSize, chunkOverlap } = resolveChunking(config);
const response = await inspectSeedUpload({
block_id: config.id,
file_ids: fileIds,
file_names: fileNames,
preview_size: 10,
seed_source_type: "unstructured",
unstructured_chunk_size: chunkSize,
unstructured_chunk_overlap: chunkOverlap,
});
onUpdate({
hf_path: response.resolved_path,
resolved_paths: response.resolved_paths ?? [],
seed_columns: response.columns,
seed_preview_rows: response.preview_rows ?? [],
unstructured_file_ids: fileIds,
unstructured_file_names: fileNames,
unstructured_file_sizes: unstructuredFiles
.filter((f) => f.status === "ok")
.map((f) => f.size),
});
setPreviewRows(response.preview_rows ?? []);
setLastLoadedKey(loadKey);
return true;
}
const { chunkSize, chunkOverlap } = resolveChunking(config);
const uploadFile = await toUnstructuredUploadFile(unstructuredFile);
if (uploadFile.size > MAX_UPLOAD_BYTES) {
throw new Error("Processed text is too large (max 50MB).");
}
const payload = await fileToBase64Payload(uploadFile);
const response = await inspectSeedUpload({
filename: uploadFile.name,
content_base64: payload,
preview_size: 10,
seed_source_type: "unstructured",
unstructured_chunk_size: chunkSize,
unstructured_chunk_overlap: chunkOverlap,
});
onUpdate({
hf_path: response.resolved_path,
seed_columns: response.columns,
seed_drop_columns: (config.seed_drop_columns ?? []).filter((name) =>
response.columns.includes(name),
),
seed_preview_rows: response.preview_rows ?? [],
hf_repo_id: "",
hf_subset: "",
hf_split: "",
local_file_name: "",
unstructured_file_name: unstructuredFile.name,
});
setPreviewRows(response.preview_rows ?? []);
setLastLoadedKey(loadKey);
return true;
return false;
} catch (error) {
if (!opts?.silent) {
setInspectError(getErrorMessage(error, "Failed to load seed metadata."));
@ -385,7 +425,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
localFile,
mode,
onUpdate,
unstructuredFile,
unstructuredFiles,
]);
useEffect(() => {
@ -401,6 +441,21 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
void loadSeedMetadata({ silent: true });
}, [getCurrentLoadKey, isInspecting, lastLoadedKey, loadSeedMetadata, open]);
const wasUploadingRef = useRef(false);
useEffect(() => {
if (mode !== "unstructured") return;
const isUploading = unstructuredFiles.some((f) => f.status === "uploading");
if (isUploading) {
wasUploadingRef.current = true;
} else if (wasUploadingRef.current) {
wasUploadingRef.current = false;
const hasOk = unstructuredFiles.some((f) => f.status === "ok");
if (hasOk) {
void loadSeedMetadata({ silent: true });
}
}
}, [mode, unstructuredFiles, loadSeedMetadata]);
const previewColumns = useMemo(() => {
const loadedColumns = config.seed_columns ?? [];
if (loadedColumns.length > 0) return loadedColumns;
@ -434,10 +489,10 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
</TabsList>
<TabsContent value="config" className="min-w-0 pt-3">
<div className="space-y-4">
<div className="space-y-3">
{mode === "hf" && (
<>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Dataset"
htmlFor={datasetId}
@ -474,7 +529,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
</div>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="HF token (optional)"
htmlFor={tokenId}
@ -493,7 +548,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
)}
{mode === "local" && (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Structured file"
hint="Upload CSV, JSON, or JSONL seed file."
@ -526,7 +581,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
</Button>
</div>
<p className="text-xs text-muted-foreground">
Upload-only. Max 50MB.
Max 50MB per file.
</p>
{(localFile?.name || config.local_file_name?.trim()) && (
<p className="text-xs text-muted-foreground">
@ -537,49 +592,12 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
)}
{mode === "unstructured" && (
<div className="grid gap-2">
<FieldLabel
label="Unstructured file"
hint="Upload PDF, DOCX, or TXT. We chunk text into seed rows."
/>
<div className="flex items-center gap-2">
<Input
className="nodrag flex-1"
type="file"
accept={UNSTRUCTURED_ACCEPT}
onChange={(event) => {
const file = event.target.files?.[0] ?? null;
setUnstructuredFile(file);
onUpdate({
hf_path: "",
seed_columns: [],
seed_drop_columns: [],
seed_preview_rows: [],
unstructured_file_name: file?.name ?? "",
});
}}
/>
<Button
type="button"
variant="outline"
className="nodrag shrink-0"
onClick={() => void loadSeedMetadata()}
disabled={isInspecting || !unstructuredFile}
>
{isInspecting ? "Loading..." : "Load"}
</Button>
</div>
<p className="text-xs text-muted-foreground">
File is converted to text, then chunked server-side into chunk_text rows. Max 50MB.
</p>
{(unstructuredFile?.name ||
config.unstructured_file_name?.trim()) && (
<p className="text-xs text-muted-foreground">
Selected:{" "}
{unstructuredFile?.name ?? config.unstructured_file_name?.trim()}
</p>
)}
</div>
<UnstructuredDropZone
blockId={config.id}
files={unstructuredFiles}
onFilesChange={handleUnstructuredFilesChange}
disabled={isInspecting}
/>
)}
{inspectError && <p className="text-xs text-red-600">{inspectError}</p>}
@ -633,7 +651,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
/>
</CollapsibleTrigger>
<CollapsibleContent className="mt-2 space-y-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Sampling strategy"
htmlFor={samplingId}
@ -658,7 +676,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
</Select>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Selection strategy"
htmlFor={selectionId}
@ -685,7 +703,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
{mode === "unstructured" && (
<div className="grid grid-cols-2 gap-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Chunk size"
htmlFor={chunkSizeId}
@ -701,7 +719,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Chunk overlap"
htmlFor={chunkOverlapId}
@ -725,7 +743,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
{config.selection_type === "index_range" && (
<div className="grid grid-cols-2 gap-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="Start" hint="Inclusive start row index for index_range." />
<Input
className="nodrag"
@ -734,7 +752,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
onChange={(event) => onUpdate({ selection_start: event.target.value })}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="End" hint="Inclusive end row index for index_range." />
<Input
className="nodrag"
@ -748,7 +766,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
{config.selection_type === "partition_block" && (
<div className="grid grid-cols-2 gap-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="Index" hint="Partition index to load." />
<Input
className="nodrag"
@ -757,7 +775,7 @@ export function SeedDialog({ config, onUpdate, open }: SeedDialogProps): ReactEl
onChange={(event) => onUpdate({ selection_index: event.target.value })}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="Partitions" hint="Total number of partitions." />
<Input
className="nodrag"

View file

@ -0,0 +1,240 @@
import { useCallback, useRef, useState } from "react";
import { CloudUploadIcon, Cancel01Icon, Loading03Icon, CheckmarkCircle02Icon, Alert02Icon } from "@hugeicons/core-free-icons";
import { HugeiconsIcon } from "@hugeicons/react";
import { uploadUnstructuredFile, removeUnstructuredFile } from "../../api";
// File types the backend extraction pipeline accepts for unstructured seeds.
const ACCEPTED_EXTENSIONS = [".txt", ".pdf", ".docx", ".md"];
// Per-file cap (50MB) and aggregate cap across all tracked files (100MB).
// Keep in sync with the limits shown in the drop-zone helper copy below.
const MAX_FILE_SIZE = 50 * 1024 * 1024;
const MAX_TOTAL_SIZE = 100 * 1024 * 1024;
// One tracked upload in the drop zone.
// - `id` is empty until the server assigns one on successful upload.
// - `abortController` is only meaningful while status is "uploading";
//   it lets the user cancel an in-flight request.
type FileEntry = {
  id: string;
  name: string;
  size: number;
  status: "uploading" | "ok" | "error";
  error?: string;
  abortController?: AbortController;
};
// Props for UnstructuredDropZone. The parent owns the file list;
// `onFilesChange` accepts either a replacement array or a functional
// updater (React-setState style) so async callers avoid stale state.
type UnstructuredDropZoneProps = {
  blockId: string;
  files: FileEntry[];
  onFilesChange: (files: FileEntry[] | ((prev: FileEntry[]) => FileEntry[])) => void;
  disabled?: boolean;
};
/**
 * Render a byte count as a short human-readable size string.
 * Uses binary thresholds (1024) with one decimal for KB/MB.
 */
function formatSize(bytes: number): string {
  const KIB = 1024;
  const MIB = KIB * KIB;
  if (bytes >= MIB) {
    return `${(bytes / MIB).toFixed(1)} MB`;
  }
  if (bytes >= KIB) {
    return `${(bytes / KIB).toFixed(1)} KB`;
  }
  return `${bytes} B`;
}
/**
 * True when the file name ends with one of the supported extensions,
 * case-insensitively. Names without any accepted suffix (including
 * dotless names) are rejected.
 */
function isValidExtension(name: string): boolean {
  const lowered = name.toLowerCase();
  return ACCEPTED_EXTENSIONS.some((ext) => lowered.endsWith(ext));
}
/**
 * Drag-and-drop multi-file upload zone for unstructured seed documents
 * (PDF/DOCX/TXT/MD). Validates extension and size limits client-side,
 * uploads each accepted file to the backend sequentially, tracks per-file
 * status (uploading / ok / error), and supports cancelling in-flight
 * uploads or removing finished ones (with best-effort server cleanup).
 * The file list itself is owned by the parent via `files`/`onFilesChange`
 * so it survives dialog close/reopen.
 */
export function UnstructuredDropZone({
  blockId,
  files,
  onFilesChange,
  disabled,
}: UnstructuredDropZoneProps) {
  const inputRef = useRef<HTMLInputElement>(null);
  // Mirror the latest `files` prop in a ref so async code (the upload
  // loop, the remove handler) reads fresh state instead of a stale closure.
  const filesRef = useRef(files);
  filesRef.current = files;
  const [isDragOver, setIsDragOver] = useState(false);
  const totalSize = files.reduce((sum, f) => sum + f.size, 0);
  // Validate, register, and sequentially upload a batch of picked/dropped files.
  const handleFiles = useCallback(
    async (newFiles: File[]) => {
      // Drop unsupported extensions and oversize files up front.
      // NOTE(review): rejected files are discarded silently (no user
      // feedback) — consider surfacing a message for skipped files.
      const valid = newFiles.filter((f) => {
        if (!isValidExtension(f.name)) return false;
        if (f.size > MAX_FILE_SIZE) return false;
        return true;
      });
      if (valid.length === 0) return;
      const addedSize = valid.reduce((s, f) => s + f.size, 0);
      const currentTotal = filesRef.current.reduce((sum, f) => sum + f.size, 0);
      // Enforce the aggregate size cap across already-tracked + new files.
      // NOTE(review): exceeding the cap also rejects silently.
      if (currentTotal + addedSize > MAX_TOTAL_SIZE) return;
      // Insert placeholder entries immediately so spinners render at once;
      // ids/statuses are patched in as each upload settles.
      const entries: FileEntry[] = valid.map((f) => ({
        id: "",
        name: f.name,
        size: f.size,
        status: "uploading" as const,
        abortController: new AbortController(),
      }));
      onFilesChange((prev) => [...prev, ...entries]);
      // Upload one file at a time; each iteration patches only its own entry.
      for (let i = 0; i < valid.length; i++) {
        const file = valid[i];
        const entry = entries[i];
        let updatedId = "";
        let updatedStatus: FileEntry["status"] = "error";
        let updatedError: string | undefined;
        try {
          // Ids of files already uploaded, read from the ref so earlier
          // iterations of this same loop are included.
          const existingIds = filesRef.current.filter((f) => f.id).map((f) => f.id);
          const result = await uploadUnstructuredFile(
            file,
            blockId,
            entry.abortController?.signal,
            existingIds,
          );
          updatedId = result.file_id;
          updatedStatus = result.status === "ok" ? "ok" : "error";
          updatedError = result.error;
        } catch (e) {
          // Distinguish a user-initiated cancel from a real failure.
          if (e instanceof DOMException && e.name === "AbortError") {
            updatedError = "Cancelled";
          } else {
            updatedError = e instanceof Error ? e.message : "Upload failed";
          }
        }
        // Patch by object identity: `entry` is the exact placeholder object
        // spread into state above, so reference equality locates it even if
        // other entries were added/removed meanwhile.
        onFilesChange((prev) =>
          prev.map((f) =>
            f === entry
              ? { ...f, id: updatedId, status: updatedStatus, error: updatedError }
              : f,
          ),
        );
      }
    },
    [blockId, onFilesChange],
  );
  // Guards against issuing duplicate server-side deletes for the same id
  // (e.g. rapid double-clicks on the remove button).
  const deletedIdsRef = useRef(new Set<string>());
  const handleRemove = useCallback(
    (index: number) => {
      const entry = filesRef.current[index];
      if (!entry) return;
      // Cancel an in-flight upload before dropping its entry.
      if (entry.status === "uploading" && entry.abortController) {
        entry.abortController.abort();
      }
      // Best-effort server-side cleanup for files that finished uploading;
      // failures are intentionally swallowed (local removal still proceeds).
      if (entry.id && entry.status === "ok" && !deletedIdsRef.current.has(entry.id)) {
        deletedIdsRef.current.add(entry.id);
        void removeUnstructuredFile(blockId, entry.id).catch(() => {});
      }
      onFilesChange((prev) => prev.filter((_, i) => i !== index));
    },
    [blockId, onFilesChange],
  );
  const handleDrop = useCallback(
    (e: React.DragEvent) => {
      e.preventDefault();
      setIsDragOver(false);
      if (disabled) return;
      const dropped = Array.from(e.dataTransfer.files);
      handleFiles(dropped);
    },
    [disabled, handleFiles],
  );
  const handleDragOver = useCallback(
    (e: React.DragEvent) => {
      // preventDefault is required for the drop event to fire at all.
      e.preventDefault();
      if (!disabled) setIsDragOver(true);
    },
    [disabled],
  );
  const handleDragLeave = useCallback(() => setIsDragOver(false), []);
  const handleClick = useCallback(() => {
    if (!disabled) inputRef.current?.click();
  }, [disabled]);
  const handleInputChange = useCallback(
    (e: React.ChangeEvent<HTMLInputElement>) => {
      const selected = Array.from(e.target.files || []);
      handleFiles(selected);
      // Reset so picking the same file again re-triggers onChange.
      e.target.value = "";
    },
    [handleFiles],
  );
  const successFiles = files.filter((f) => f.status === "ok");
  return (
    <div className="space-y-2">
      {/* Click/drop target */}
      <div
        className={`nodrag flex cursor-pointer flex-col items-center justify-center rounded-md border-2 border-dashed px-4 py-6 text-center transition-colors ${
          isDragOver
            ? "border-primary bg-primary/5"
            : "border-muted-foreground/25 hover:border-muted-foreground/50"
        } ${disabled ? "pointer-events-none opacity-50" : ""}`}
        onDrop={handleDrop}
        onDragOver={handleDragOver}
        onDragLeave={handleDragLeave}
        onClick={handleClick}
      >
        <HugeiconsIcon icon={CloudUploadIcon} className="text-muted-foreground mb-2 size-8" />
        <p className="text-muted-foreground text-sm">
          Drop files here or click to browse
        </p>
        <p className="text-muted-foreground/60 mt-1 text-xs">
          PDF, DOCX, TXT, MD - up to 50MB each, 100MB total
        </p>
      </div>
      {/* Hidden native picker driven by the click target above */}
      <input
        ref={inputRef}
        type="file"
        accept={ACCEPTED_EXTENSIONS.join(",")}
        multiple
        className="hidden"
        onChange={handleInputChange}
      />
      {files.length > 0 && (
        <div className="space-y-1">
          {files.map((entry, i) => (
            <div
              // NOTE(review): name+index keys are unstable across removals;
              // a per-entry client id would be safer if reordering is added.
              key={`${entry.name}-${i}`}
              className="flex items-center gap-2 rounded-md border px-3 py-1.5 text-sm"
            >
              {entry.status === "uploading" && (
                <HugeiconsIcon icon={Loading03Icon} className="text-muted-foreground size-4 animate-spin" />
              )}
              {entry.status === "ok" && (
                <HugeiconsIcon icon={CheckmarkCircle02Icon} className="size-4 text-green-500" />
              )}
              {entry.status === "error" && (
                <HugeiconsIcon icon={Alert02Icon} className="size-4 text-red-500" />
              )}
              <span className="flex-1 truncate">{entry.name}</span>
              <span className="text-muted-foreground text-xs">
                {formatSize(entry.size)}
              </span>
              {entry.error && (
                <span className="text-xs text-red-500">{entry.error}</span>
              )}
              <button
                type="button"
                className="ml-auto inline-flex size-7 shrink-0 items-center justify-center rounded-md text-muted-foreground transition hover:bg-destructive/10 hover:text-destructive"
                onClick={(e) => {
                  // Keep the click from bubbling to the drop-zone opener.
                  e.stopPropagation();
                  handleRemove(i);
                }}
              >
                <HugeiconsIcon icon={Cancel01Icon} className="size-3.5" />
              </button>
            </div>
          ))}
          {/* Summary row: successful uploads + running total vs the cap */}
          <div className="text-muted-foreground flex justify-between px-1 text-xs">
            <span>{successFiles.length} file{successFiles.length !== 1 ? "s" : ""} uploaded</span>
            <span>{formatSize(totalSize)} / 100MB</span>
          </div>
        </div>
      )}
    </div>
  );
}
export type { FileEntry };

View file

@ -18,7 +18,7 @@ export function FieldLabel({
hint,
}: FieldLabelProps): ReactElement {
return (
<div className="flex min-w-0 items-start gap-1.5 text-xs font-semibold uppercase text-muted-foreground">
<div className="flex min-w-0 items-center gap-1 text-xs font-semibold uppercase text-muted-foreground">
{htmlFor ? (
<label className="min-w-0 cursor-pointer" htmlFor={htmlFor}>
<span className="break-words">{label}</span>
@ -31,7 +31,7 @@ export function FieldLabel({
<TooltipTrigger asChild={true}>
<button
type="button"
className="inline-flex size-6 shrink-0 items-center justify-center rounded-full text-muted-foreground/80 transition hover:text-foreground"
className="inline-flex size-4 shrink-0 items-center justify-center rounded-full text-muted-foreground/80 transition hover:text-foreground"
aria-label={`More info: ${label}`}
title={`More info about ${label}`}
>

View file

@ -23,7 +23,7 @@ export function NameField({
const fallbackId = useId();
const inputId = id ?? fallbackId;
return (
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label={label ?? "Field name"}
htmlFor={inputId}

View file

@ -165,7 +165,7 @@ function McpServerCard({
</div>
)}
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="Server name" hint="Name shown in this tool access setup." />
<Input
className="nodrag"
@ -194,7 +194,7 @@ function McpServerCard({
{provider.provider_type === "stdio" ? (
<div className="space-y-4">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="Command" hint="Command used to start the tool server." />
<Input
className="nodrag"
@ -293,7 +293,7 @@ function McpServerCard({
</div>
) : (
<div className="space-y-4">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel label="Endpoint" hint="URL for the tool server." />
<Input
className="nodrag"
@ -305,7 +305,7 @@ function McpServerCard({
/>
</div>
<div className="grid gap-2 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="API key environment variable"
hint="Optional environment variable that stores the API key."
@ -322,7 +322,7 @@ function McpServerCard({
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="API key"
hint="Optional API key."
@ -705,7 +705,7 @@ export function ToolProfileDialog({
)}
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Tools this setup may use"
hint="Leave this empty to allow every tool from these servers."
@ -740,7 +740,7 @@ export function ToolProfileDialog({
</CollapsibleTrigger>
<CollapsibleContent className="mt-3">
<div className="grid gap-3 sm:grid-cols-2">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Max tool-use turns"
hint="How many back-and-forth tool calls an AI step can make."
@ -756,7 +756,7 @@ export function ToolProfileDialog({
}
/>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Timeout (seconds)"
hint="How long to wait when loading or calling tools."

View file

@ -107,7 +107,7 @@ export function ValidatorDialog({
value={config.name}
onChange={(value) => onUpdate({ name: value })}
/>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Code to check"
htmlFor={targetColumnId}
@ -158,7 +158,7 @@ export function ValidatorDialog({
</div>
{config.validator_type === "oxc" && (
<div className="grid gap-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Check mode"
htmlFor={oxcModeId}
@ -197,7 +197,7 @@ export function ValidatorDialog({
</Combobox>
</div>
</div>
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Code shape"
htmlFor={oxcCodeShapeId}
@ -249,7 +249,7 @@ export function ValidatorDialog({
/>
</CollapsibleTrigger>
<CollapsibleContent className="mt-3">
<div className="grid gap-2">
<div className="grid gap-1.5">
<FieldLabel
label="Batch size"
htmlFor={batchSizeId}

View file

@ -144,7 +144,9 @@ function sanitizeSeedForShare(payload: unknown): unknown {
ui.seed_drop_columns = [];
ui.seed_preview_rows = [];
ui.local_file_name = "";
ui.unstructured_file_name = "";
ui.unstructured_file_ids = [];
ui.unstructured_file_names = [];
ui.unstructured_file_sizes = [];
}
}
@ -152,12 +154,20 @@ function sanitizeSeedForShare(payload: unknown): unknown {
if (source && "path" in source) {
source.path = "";
}
if (source && "paths" in source) {
source.paths = [];
}
if (seedConfig) {
seedConfig.resolved_paths = [];
}
if (ui) {
ui.seed_columns = [];
ui.seed_drop_columns = [];
ui.seed_preview_rows = [];
ui.local_file_name = "";
ui.unstructured_file_name = "";
ui.unstructured_file_ids = [];
ui.unstructured_file_names = [];
ui.unstructured_file_sizes = [];
}
}

View file

@ -58,7 +58,12 @@ import { useRecipeStudioActions } from "./hooks/use-recipe-studio-actions";
import { useRecipeStudioStore } from "./stores/recipe-studio";
import type { RecipeNodeData } from "./types";
import { getGraphWarnings } from "./utils/graph-warnings";
import { getFitNodeIdsIgnoringNotes } from "./utils/graph/fit-view";
import {
FIT_VIEW_DURATION_MS,
FIT_VIEW_MAX_ZOOM,
FIT_VIEW_PADDING,
getFitViewTargetNodes,
} from "./utils/graph/fit-view";
import { buildRecipePayload } from "./utils/payload";
import type { RecipePayload } from "./utils/payload/types";
import { buildDefaultSchemaTransform } from "./utils/processors";
@ -71,7 +76,19 @@ const EDGE_TYPES: EdgeTypes = {
};
const COMPLETE_ISLAND_VISIBLE_MS = 7_000;
const TAB_SWITCH_FIT_DELAY_MS = 110;
const FIT_ANIMATION_MS = 340;
/**
 * Maximum RAF iterations to wait for React Flow's ResizeObserver to populate
 * `node.measured` dimensions before calling fitView. ~20 frames ≈ 333 ms at
 * 60 fps — more than enough for the render → layout → ResizeObserver cycle.
 */
const MAX_FIT_VIEW_RETRIES = 20;
/**
* After all target nodes appear measured, wait this many extra stable frames
* before firing fitView. This absorbs `updateNodeInternals` calls from
* InternalsSync and individual node mount effects that can transiently reset
* measurements.
*/
const FIT_VIEW_STABLE_FRAMES = 3;
export type PersistRecipeInput = {
id: string | null;
@ -421,40 +438,69 @@ export function RecipeStudioPage({
const scheduleFitView = useCallback(
({ delayMs = 0 }: { delayMs?: number } = {}) => {
if (!reactFlowInstance) {
return () => {};
// eslint-disable-next-line @typescript-eslint/no-empty-function
return () => {
/* no-op: instance not available */
};
}
let timeoutId = 0;
let frameId = 0;
let retryFrameId = 0;
let cancelled = false;
const fitWithCurrentNodes = () => {
const targetNodes = getFitNodeIdsIgnoringNotes(
reactFlowInstance.getNodes(),
/** Check whether every primary workflow node has been measured. */
const allTargetsMeasured = (targets: Node[]): boolean =>
targets.length > 0 &&
targets.every(
(n) => n.measured?.width != null && n.measured?.height != null,
);
if (targetNodes.length === 0) {
return false;
/** Execute fitView on the current primary workflow nodes. */
const doFit = () => {
const targets = getFitViewTargetNodes(reactFlowInstance.getNodes());
if (targets.length === 0) {
return;
}
viewportMovedSinceAutoFitRef.current = false;
reactFlowInstance.fitView({
duration: FIT_ANIMATION_MS,
nodes: targetNodes,
duration: FIT_VIEW_DURATION_MS,
maxZoom: FIT_VIEW_MAX_ZOOM,
padding: FIT_VIEW_PADDING,
nodes: targets.map((n) => ({ id: n.id })),
});
return true;
};
const runFit = () => {
if (fitWithCurrentNodes()) {
let retries = 0;
let stableCount = 0;
const poll = () => {
if (cancelled) {
return;
}
retryFrameId = window.requestAnimationFrame(() => {
fitWithCurrentNodes();
});
if (retries >= MAX_FIT_VIEW_RETRIES) {
// Timed out waiting — fit with whatever we have (graceful fallback).
doFit();
return;
}
const targets = getFitViewTargetNodes(reactFlowInstance.getNodes());
if (allTargetsMeasured(targets)) {
stableCount++;
// Wait a few extra frames after measurements appear to let
// updateNodeInternals (InternalsSync, node mount effects) settle.
if (stableCount >= FIT_VIEW_STABLE_FRAMES) {
doFit();
return;
}
} else {
// Measurements were reset (e.g. by updateNodeInternals) — restart
// the stability counter.
stableCount = 0;
}
retries++;
frameId = window.requestAnimationFrame(poll);
};
const start = () => {
frameId = window.requestAnimationFrame(runFit);
frameId = window.requestAnimationFrame(poll);
};
if (delayMs > 0) {
@ -464,15 +510,13 @@ export function RecipeStudioPage({
}
return () => {
cancelled = true;
if (timeoutId) {
window.clearTimeout(timeoutId);
}
if (frameId) {
window.cancelAnimationFrame(frameId);
}
if (retryFrameId) {
window.cancelAnimationFrame(retryFrameId);
}
};
},
[reactFlowInstance],

View file

@ -406,7 +406,10 @@ export const useRecipeStudioStore = create<RecipeStudioState>((set, get) => ({
hf_token: "",
hf_endpoint: "https://huggingface.co",
local_file_name: "",
unstructured_file_name: "",
unstructured_file_ids: [],
unstructured_file_names: [],
unstructured_file_sizes: [],
resolved_paths: [],
seed_columns: [],
seed_drop_columns: [],
seed_preview_rows: [],

View file

@ -333,7 +333,10 @@ export type SeedConfig = {
hf_token?: string;
hf_endpoint?: string;
local_file_name?: string;
unstructured_file_name?: string;
unstructured_file_ids?: string[];
unstructured_file_names?: string[];
unstructured_file_sizes?: number[];
resolved_paths?: string[];
// ui-only
seed_preview_rows?: Record<string, unknown>[];
// ui-only (string for input ergonomics)

View file

@ -366,7 +366,9 @@ export function makeSeedConfig(
hf_token: "",
hf_endpoint: "https://huggingface.co",
local_file_name: "",
unstructured_file_name: "",
unstructured_file_ids: [],
unstructured_file_names: [],
unstructured_file_sizes: [],
seed_preview_rows: [],
unstructured_chunk_size: "1200",
unstructured_chunk_overlap: "200",

View file

@ -1,8 +1,14 @@
// SPDX-License-Identifier: AGPL-3.0-only
// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
import type { Node } from "@xyflow/react";
import type { FitViewOptions, Node } from "@xyflow/react";
/** Cap auto-fit zoom so the view doesn't punch in too tight on small graphs. */
export const FIT_VIEW_MAX_ZOOM = 1.1;
export const FIT_VIEW_PADDING = 0.12;
export const FIT_VIEW_DURATION_MS = 340;
/** Markdown note nodes are decorative and should not affect the fitView bbox. */
function isMarkdownNoteNode(node: Node): boolean {
if (node.type !== "builder") {
return false;
@ -13,8 +19,43 @@ function isMarkdownNoteNode(node: Node): boolean {
return (node.data as { kind?: string }).kind === "note";
}
export function getFitNodeIdsIgnoringNotes(nodes: Node[]): Array<{ id: string }> {
const nodesWithoutNotes = nodes.filter((node) => !isMarkdownNoteNode(node));
const targetNodes = nodesWithoutNotes.length > 0 ? nodesWithoutNotes : nodes;
return targetNodes.map((node) => ({ id: node.id }));
/**
 * Identifies aux nodes (llm-prompt-input, llm-judge-score), which are
 * satellite overlays rather than primary workflow blocks.
 */
const isAuxNode = (node: Node): boolean => node.type === "aux";
/**
 * Selects the primary workflow nodes that fitView should frame.
 *
 * Markdown notes and aux (LLM input overlay) nodes are skipped so the
 * viewport centers on the real workflow blocks. If that filter leaves
 * nothing, all nodes are returned as a fallback.
 *
 * Full {@link Node} objects are returned (not just ids) so callers can
 * read `node.measured` without looking nodes up again.
 */
export function getFitViewTargetNodes(nodes: Node[]): Node[] {
  const primary: Node[] = [];
  for (const node of nodes) {
    if (isMarkdownNoteNode(node) || isAuxNode(node)) {
      continue;
    }
    primary.push(node);
  }
  return primary.length === 0 ? nodes : primary;
}
/**
 * Assembles the canonical {@link FitViewOptions} for the primary workflow
 * nodes. All `fitView` call sites should route through this helper so that
 * zoom cap, padding, and node filtering stay consistent across the studio.
 *
 * @param nodes - Current nodes from the React Flow instance.
 * @param overrides - Optional per-call option overrides merged on top of
 *   the shared defaults (and the computed node list).
 */
export function buildFitViewOptions(
  nodes: Node[],
  overrides?: Partial<FitViewOptions>,
): FitViewOptions {
  const targetIds = getFitViewTargetNodes(nodes).map((node) => ({
    id: node.id,
  }));
  const base: FitViewOptions = {
    duration: FIT_VIEW_DURATION_MS,
    maxZoom: FIT_VIEW_MAX_ZOOM,
    padding: FIT_VIEW_PADDING,
    nodes: targetIds,
  };
  return { ...base, ...overrides };
}

View file

@ -43,7 +43,9 @@ type UiInput = {
seed_drop_columns?: unknown;
seed_preview_rows?: unknown;
local_file_name?: unknown;
unstructured_file_name?: unknown;
unstructured_file_ids?: unknown;
unstructured_file_names?: unknown;
unstructured_file_sizes?: unknown;
unstructured_chunk_size?: unknown;
unstructured_chunk_overlap?: unknown;
advanced_open_by_node?: unknown;
@ -408,8 +410,16 @@ export function importRecipePayload(input: string): ImportResult {
.map((row) => ({ ...row }))
: undefined;
const uiLocalFileName = readString(ui?.local_file_name) ?? undefined;
const uiUnstructuredFileName =
readString(ui?.unstructured_file_name) ?? undefined;
// Preserve file IDs/names from saved recipes (cleared at share time by sanitizeSeedForShare)
const uiUnstructuredFileIds: string[] = Array.isArray(ui?.unstructured_file_ids)
? (ui.unstructured_file_ids as string[]).filter((v): v is string => typeof v === "string")
: [];
const uiUnstructuredFileNames: string[] = Array.isArray(ui?.unstructured_file_names)
? (ui.unstructured_file_names as string[]).filter((v): v is string => typeof v === "string")
: [];
const uiUnstructuredFileSizes: number[] = Array.isArray(ui?.unstructured_file_sizes)
? (ui.unstructured_file_sizes as number[]).filter((v): v is number => typeof v === "number")
: [];
const uiUnstructuredChunkSize = readStringNumber(ui?.unstructured_chunk_size);
const uiUnstructuredChunkOverlap = readStringNumber(
ui?.unstructured_chunk_overlap,
@ -449,7 +459,9 @@ export function importRecipePayload(input: string): ImportResult {
: payloadSeedDropColumns,
seed_preview_rows: uiSeedPreviewRows,
local_file_name: uiLocalFileName,
unstructured_file_name: uiUnstructuredFileName,
unstructuredFileIds: uiUnstructuredFileIds,
unstructuredFileNames: uiUnstructuredFileNames,
unstructuredFileSizes: uiUnstructuredFileSizes,
unstructured_chunk_size: uiUnstructuredChunkSize,
unstructured_chunk_overlap: uiUnstructuredChunkOverlap,
});

View file

@ -30,7 +30,9 @@ function makeDefaultSeedConfig(id: string): SeedConfig {
hf_token: "",
hf_endpoint: "https://huggingface.co",
local_file_name: "",
unstructured_file_name: "",
unstructured_file_ids: [],
unstructured_file_names: [],
unstructured_file_sizes: [],
seed_preview_rows: [],
unstructured_chunk_size: "1200",
unstructured_chunk_overlap: "200",
@ -72,7 +74,10 @@ function parseSeedSettings(seedConfigRaw: unknown): Partial<SeedConfig> {
let hf_endpoint = "https://huggingface.co";
let hf_repo_id = "";
let local_file_name = "";
let unstructured_file_name = "";
let unstructuredFileIds: string[] = [];
let unstructuredFileNames: string[] = [];
let unstructuredFileSizes: number[] = [];
let resolved_paths: string[] = [];
let unstructured_chunk_size = "1200";
let unstructured_chunk_overlap = "200";
const sourceRaw = seedConfigRaw.source;
@ -91,8 +96,15 @@ function parseSeedSettings(seedConfigRaw: unknown): Partial<SeedConfig> {
local_file_name = sourcePath.split("/").pop() ?? sourcePath;
} else if (seedType === "unstructured") {
seed_source_type = "unstructured";
hf_path = sourcePath;
unstructured_file_name = sourcePath.split("/").pop() ?? sourcePath;
const paths = Array.isArray(sourceRaw.paths) ? sourceRaw.paths : [];
const stringPaths = paths.filter((p): p is string => typeof p === "string");
if (stringPaths.length === 0 && sourcePath) {
stringPaths.push(sourcePath);
}
hf_path = stringPaths[0] ?? sourcePath;
resolved_paths = stringPaths;
unstructuredFileIds = [];
unstructuredFileNames = [];
unstructured_chunk_size = readNumberString(sourceRaw.chunk_size) || "1200";
unstructured_chunk_overlap = readNumberString(sourceRaw.chunk_overlap) || "200";
}
@ -129,7 +141,10 @@ function parseSeedSettings(seedConfigRaw: unknown): Partial<SeedConfig> {
hf_token,
hf_endpoint,
local_file_name,
unstructured_file_name,
unstructured_file_ids: unstructuredFileIds,
unstructured_file_names: unstructuredFileNames,
unstructured_file_sizes: unstructuredFileSizes,
resolved_paths,
unstructured_chunk_size,
unstructured_chunk_overlap,
sampling_strategy,
@ -150,7 +165,9 @@ export function parseSeedConfig(
seed_drop_columns?: string[];
seed_preview_rows?: Record<string, unknown>[];
local_file_name?: string;
unstructured_file_name?: string;
unstructuredFileIds?: string[];
unstructuredFileNames?: string[];
unstructuredFileSizes?: number[];
unstructured_chunk_size?: string;
unstructured_chunk_overlap?: string;
},
@ -181,8 +198,14 @@ export function parseSeedConfig(
...(options?.local_file_name !== undefined
? { local_file_name: options.local_file_name }
: {}),
...(options?.unstructured_file_name !== undefined
? { unstructured_file_name: options.unstructured_file_name }
...(options?.unstructuredFileIds !== undefined
? { unstructured_file_ids: options.unstructuredFileIds }
: {}),
...(options?.unstructuredFileNames !== undefined
? { unstructured_file_names: options.unstructuredFileNames }
: {}),
...(options?.unstructuredFileSizes !== undefined
? { unstructured_file_sizes: options.unstructuredFileSizes }
: {}),
...(options?.unstructured_chunk_size !== undefined
? { unstructured_chunk_size: options.unstructured_chunk_size }

View file

@ -430,8 +430,10 @@ export function buildRecipePayload(
local_file_name: firstSeed.local_file_name,
}),
...(firstSeed &&
firstSeed.unstructured_file_name !== undefined && {
unstructured_file_name: firstSeed.unstructured_file_name,
firstSeed.unstructured_file_ids !== undefined && {
unstructured_file_ids: firstSeed.unstructured_file_ids,
unstructured_file_names: firstSeed.unstructured_file_names,
unstructured_file_sizes: firstSeed.unstructured_file_sizes,
}),
...(firstSeed &&
firstSeed.unstructured_chunk_size !== undefined && {

View file

@ -71,7 +71,7 @@ export function buildSeedConfig(
return {
// biome-ignore lint/style/useNamingConvention: api schema
seed_type: "unstructured",
path,
paths: config.resolved_paths?.length ? config.resolved_paths : [config.hf_path],
// biome-ignore lint/style/useNamingConvention: api schema
chunk_size: chunkSize,
// biome-ignore lint/style/useNamingConvention: api schema

View file

@ -70,8 +70,15 @@ export type RecipePayload = {
seed_drop_columns?: string[];
seed_preview_rows?: Record<string, unknown>[];
local_file_name?: string;
unstructured_file_name?: string;
// biome-ignore lint/style/useNamingConvention: api schema
unstructured_file_ids?: string[];
// biome-ignore lint/style/useNamingConvention: api schema
unstructured_file_names?: string[];
// biome-ignore lint/style/useNamingConvention: api schema
unstructured_file_sizes?: number[];
// biome-ignore lint/style/useNamingConvention: api schema
unstructured_chunk_size?: string;
// biome-ignore lint/style/useNamingConvention: api schema
unstructured_chunk_overlap?: string;
// ui-only: per-node advanced accordion state
advanced_open_by_node?: Record<string, boolean>;

View file

@ -271,7 +271,11 @@ export function getConfigErrors(config: NodeConfig | null): string[] {
if (seedSourceType === "hf" && !config.hf_repo_id.trim()) {
errors.push("Choose a Hugging Face dataset.");
}
if (!config.hf_path.trim()) {
const hasPath =
seedSourceType === "unstructured"
? (config.resolved_paths?.length ?? 0) > 0
: Boolean(config.hf_path.trim());
if (!hasPath) {
errors.push("Load the source-data preview first.");
}
if (