agent-zero/python/survey_assistant/extract.py
nic b7efe4992a
Add human-in-the-loop survey helper (GUI + CLI + Agent tool) (#181)
* Add human-in-the-loop survey helper

Co-authored-by: nic <nicsins@users.noreply.github.com>

* Make survey helper launcher robust and add CLI fallback

Co-authored-by: nic <nicsins@users.noreply.github.com>

* Detect missing display for tkinter GUI

Co-authored-by: nic <nicsins@users.noreply.github.com>

* Add prediction dataset + review workflow for uncertain survey answers

Co-authored-by: nic <nicsins@users.noreply.github.com>

---------

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: nic <nicsins@users.noreply.github.com>
2026-03-17 04:02:15 -05:00

218 lines
6.7 KiB
Python

from __future__ import annotations
from dataclasses import dataclass, asdict, field
from typing import Any, Iterable
from bs4 import BeautifulSoup, Tag
@dataclass
class ExtractedField:
    """
    Normalized description of one form control (or one grouped question).

    Only ``kind`` is mandatory; every other attribute has a default so a
    field can be built from whatever the markup happens to provide.
    """

    # Element family: input|textarea|select
    kind: str
    # For inputs, the type: text|radio|checkbox|...
    input_type: str | None = None
    # Value of the control's ``name`` attribute, if any
    name: str | None = None
    # Value of the control's ``id`` attribute, if any
    id: str | None = None
    # Human-readable caption associated with the control
    label: str | None = None
    # Whether the control is flagged as mandatory
    required: bool = False
    # Selectable choices, shaped as [{label, value}]
    options: list[dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the field as a plain dictionary built by ``asdict``."""
        as_mapping = asdict(self)
        return as_mapping
def _text(el: Tag | None) -> str:
if not el:
return ""
return " ".join(el.get_text(" ", strip=True).split())
def _first_non_empty(*vals: str | None) -> str | None:
for v in vals:
if v is None:
continue
vv = " ".join(str(v).strip().split())
if vv:
return vv
return None
def _is_hidden_input(tag: Tag) -> bool:
if tag.name != "input":
return False
t = (tag.get("type") or "").lower()
return t == "hidden"
def _is_ignorable_control(tag: Tag) -> bool:
if tag.name == "input":
t = (tag.get("type") or "text").lower()
return t in {"submit", "button", "reset", "image", "file", "hidden"}
if tag.name in {"button"}:
return True
return False
def _infer_label(soup: BeautifulSoup, field_tag: Tag) -> str | None:
    """
    Work out a caption for a single control.

    Sources are tried in order and the first one with non-empty text wins:
    a ``<label for=...>`` matching the control's id, a ``<label>`` wrapping
    the control, then the ``aria-label``, ``placeholder`` and ``name``
    attributes. Returns ``None`` when every source is empty.
    """
    label_candidates = []

    # Explicit association through <label for="...">
    ident = field_tag.get("id")
    if ident:
        label_candidates.append(soup.find("label", attrs={"for": ident}))

    # Implicit association: the control sits inside a <label>
    label_candidates.append(field_tag.find_parent("label"))

    for candidate in label_candidates:
        if not isinstance(candidate, Tag):
            continue
        caption = _text(candidate)
        if caption:
            return caption

    # Attribute fallbacks, most descriptive first
    fallback_attrs = ("aria-label", "placeholder", "name")
    return _first_non_empty(*(field_tag.get(attr) for attr in fallback_attrs))
def _infer_group_label(soup: BeautifulSoup, first_input: Tag) -> str | None:
    """
    Guess the question text for a radio/checkbox group.

    Order of preference:
      1. the ``<legend>`` of an enclosing ``<fieldset>``;
      2. the nearest preceding heading/paragraph/div/span/label whose text
         has at least 4 characters, looked up from the input and then from
         each of its ancestors (at most 6 lookups in total);
      3. the input's ``name`` or ``aria-label`` attribute.

    NOTE(review): ``find_previous`` walks backwards in document order, so it
    can return an element that encloses the input (e.g. a wrapping label);
    the length threshold is the only guard against that - worth confirming
    against real survey markup.
    """
    fieldset = first_input.find_parent("fieldset")
    legend = fieldset.find("legend") if isinstance(fieldset, Tag) else None
    if isinstance(legend, Tag):
        legend_text = _text(legend)
        if legend_text:
            return legend_text

    # Look for nearby text before the input (common in survey builders),
    # widening the starting point one ancestor at a time.
    node: Tag | None = first_input
    hops = 0
    while node and hops < 6:
        candidate = node.find_previous(
            ["h1", "h2", "h3", "h4", "h5", "h6", "p", "div", "span", "label"]
        )
        if isinstance(candidate, Tag):
            caption = _text(candidate)
            # Very short text is assumed to be an option label, not a question
            if len(caption) >= 4:
                return caption
        parent = node.parent
        node = parent if isinstance(parent, Tag) else None
        hops += 1

    return _first_non_empty(first_input.get("name"), first_input.get("aria-label"))
def _iter_controls(soup: BeautifulSoup) -> Iterable[Tag]:
    """
    Yield, in document order, every ``<input>``, ``<textarea>`` and
    ``<select>`` that should be presented as a question.

    Controls rejected by ``_is_ignorable_control`` or ``_is_hidden_input``
    are dropped, as are elements carrying a ``hidden`` attribute or
    ``aria-hidden="true"``.
    """
    for node in soup.find_all(["input", "textarea", "select"]):
        if not isinstance(node, Tag) or _is_ignorable_control(node):
            continue
        # Elements explicitly flagged as invisible on the tag itself
        marked_hidden = node.has_attr("hidden") or node.get("aria-hidden") == "true"
        if marked_hidden or _is_hidden_input(node):
            continue
        yield node
def _declares_required(tag: Tag) -> bool:
    """Tell whether ``tag`` carries ``required`` or ``aria-required="true"``."""
    return bool(tag.has_attr("required") or (tag.get("aria-required") == "true"))


def _option_label(soup: BeautifulSoup, opt: Tag) -> str | None:
    """
    Return the caption for one radio/checkbox option.

    Tried in order: ``<label for=...>`` matching the option's id, a
    ``<label>`` wrapping the option, then the ``aria-label``, ``value`` and
    ``id`` attributes. Returns ``None`` when all of them are empty.
    """
    captions: list[str | None] = []
    opt_id = opt.get("id")
    if opt_id:
        explicit = soup.find("label", attrs={"for": opt_id})
        if isinstance(explicit, Tag):
            captions.append(_text(explicit))
    # `<label><input ...> Text</label>` has no `for` attribute at all
    wrapper = opt.find_parent("label")
    if isinstance(wrapper, Tag):
        captions.append(_text(wrapper))
    return _first_non_empty(
        *captions, opt.get("aria-label"), opt.get("value"), opt_id
    )


def extract_form_fields(html: str, *, max_fields: int = 200) -> list[ExtractedField]:
    """
    Extract likely survey/form fields from HTML into a normalized structure.
    This is intentionally "best-effort" and works across many survey builders.

    Args:
        html: Markup to inspect; ``None`` or an empty string gives ``[]``.
        max_fields: Upper bound on the number of controls scanned. Because
            radio/checkbox controls sharing a name collapse into a single
            field, the returned list may be shorter than this. Zero or a
            negative value gives ``[]``.

    Returns:
        One ``ExtractedField`` per text-like input, textarea, select, or
        radio/checkbox group, in document order.
    """
    soup = BeautifulSoup(html or "", "html.parser")

    # First pass: gather raw controls in order. The cap is checked before
    # appending so that max_fields <= 0 really yields nothing.
    controls: list[Tag] = []
    for tag in _iter_controls(soup):
        if len(controls) >= max_fields:
            break
        controls.append(tag)

    # Group radio/checkbox by (type, name) so they show up as one question
    seen_groups: set[tuple[str, str]] = set()
    out: list[ExtractedField] = []

    for c in controls:
        kind = c.name
        name = c.get("name")
        cid = c.get("id")
        required = _declares_required(c)
        input_type = (c.get("type") or "text").lower() if kind == "input" else None

        if kind == "input" and input_type in {"radio", "checkbox"} and name:
            group_key = (input_type, name)
            if group_key in seen_groups:
                continue
            seen_groups.add(group_key)

            options: list[dict[str, Any]] = []
            for opt in soup.find_all("input", attrs={"name": name}):
                if not isinstance(opt, Tag):
                    continue
                # Only same-typed inputs belong to the group; this also
                # drops hidden/text inputs that merely reuse the name.
                if (opt.get("type") or "text").lower() != input_type:
                    continue
                # The flag may sit on any member, not just the first one
                if _declares_required(opt):
                    required = True
                options.append(
                    {
                        "label": _option_label(soup, opt),
                        "value": opt.get("value"),
                    }
                )

            group_label = _infer_group_label(soup, c)
            out.append(
                ExtractedField(
                    kind="input",
                    input_type=input_type,
                    name=name,
                    id=cid,
                    label=group_label or _infer_label(soup, c),
                    required=required,
                    options=options,
                )
            )
            continue

        if kind == "select":
            options = [
                {"label": _text(opt) or opt.get("value"), "value": opt.get("value")}
                for opt in c.find_all("option")
                if isinstance(opt, Tag)
            ]
            out.append(
                ExtractedField(
                    kind="select",
                    input_type=None,
                    name=name,
                    id=cid,
                    label=_infer_label(soup, c),
                    required=required,
                    options=options,
                )
            )
            continue

        # Plain inputs (text, number, email, ...) and textareas
        out.append(
            ExtractedField(
                kind=str(kind),
                input_type=input_type,
                name=name,
                id=cid,
                label=_infer_label(soup, c),
                required=required,
            )
        )

    return out