🎨 add example skills (#1370)

Co-authored-by: Douglas <douglas.ym.lai@gmail.com>
2026-04-29 04:00:09 +00:00 · 2026-02-26 12:58:39 +08:00 · 2026-02-26 12:58:39 +08:00 · 343050b47f
commit 343050b47f
parent fb740bbe3f
200 changed files with 74603 additions and 33 deletions
--- a/resources/example-skills/pdf/scripts/extract_form_structure.py
+++ b/resources/example-skills/pdf/scripts/extract_form_structure.py
@ -0,0 +1,129 @@
+# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
+
+"""
+Extract form structure from a non-fillable PDF.
+
+This script analyzes the PDF to find:
+- Text labels with their exact coordinates
+- Horizontal lines (row boundaries)
+- Checkboxes (small rectangles)
+
+Output: A JSON file with the form structure that can be used to generate
+accurate field coordinates for filling.
+
+Usage: python extract_form_structure.py <input.pdf> <output.json>
+"""
+
+import json
+import sys
+import pdfplumber
+
+
+def extract_form_structure(pdf_path):
+    structure = {
+        "pages": [],
+        "labels": [],
+        "lines": [],
+        "checkboxes": [],
+        "row_boundaries": []
+    }
+
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages, 1):
+            structure["pages"].append({
+                "page_number": page_num,
+                "width": float(page.width),
+                "height": float(page.height)
+            })
+
+            words = page.extract_words()
+            for word in words:
+                structure["labels"].append({
+                    "page": page_num,
+                    "text": word["text"],
+                    "x0": round(float(word["x0"]), 1),
+                    "top": round(float(word["top"]), 1),
+                    "x1": round(float(word["x1"]), 1),
+                    "bottom": round(float(word["bottom"]), 1)
+                })
+
+            for line in page.lines:
+                if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5:
+                    structure["lines"].append({
+                        "page": page_num,
+                        "y": round(float(line["top"]), 1),
+                        "x0": round(float(line["x0"]), 1),
+                        "x1": round(float(line["x1"]), 1)
+                    })
+
+            for rect in page.rects:
+                width = float(rect["x1"]) - float(rect["x0"])
+                height = float(rect["bottom"]) - float(rect["top"])
+                if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
+                    structure["checkboxes"].append({
+                        "page": page_num,
+                        "x0": round(float(rect["x0"]), 1),
+                        "top": round(float(rect["top"]), 1),
+                        "x1": round(float(rect["x1"]), 1),
+                        "bottom": round(float(rect["bottom"]), 1),
+                        "center_x": round((float(rect["x0"]) + float(rect["x1"])) / 2, 1),
+                        "center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1)
+                    })
+
+    lines_by_page = {}
+    for line in structure["lines"]:
+        page = line["page"]
+        if page not in lines_by_page:
+            lines_by_page[page] = []
+        lines_by_page[page].append(line["y"])
+
+    for page, y_coords in lines_by_page.items():
+        y_coords = sorted(set(y_coords))
+        for i in range(len(y_coords) - 1):
+            structure["row_boundaries"].append({
+                "page": page,
+                "row_top": y_coords[i],
+                "row_bottom": y_coords[i + 1],
+                "row_height": round(y_coords[i + 1] - y_coords[i], 1)
+            })
+
+    return structure
+
+
+def main():
+    if len(sys.argv) != 3:
+        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
+        sys.exit(1)
+
+    pdf_path = sys.argv[1]
+    output_path = sys.argv[2]
+
+    print(f"Extracting structure from {pdf_path}...")
+    structure = extract_form_structure(pdf_path)
+
+    with open(output_path, "w") as f:
+        json.dump(structure, f, indent=2)
+
+    print(f"Found:")
+    print(f"  - {len(structure['pages'])} pages")
+    print(f"  - {len(structure['labels'])} text labels")
+    print(f"  - {len(structure['lines'])} horizontal lines")
+    print(f"  - {len(structure['checkboxes'])} checkboxes")
+    print(f"  - {len(structure['row_boundaries'])} row boundaries")
+    print(f"Saved to {output_path}")
+
+
+if __name__ == "__main__":
+    main()