mirror of
https://github.com/eigent-ai/eigent.git
synced 2026-04-29 04:00:09 +00:00
🎨 add example skills (#1370)
Some checks are pending
CodeQL Advanced / Analyze (actions) (push) Waiting to run
CodeQL Advanced / Analyze (javascript-typescript) (push) Waiting to run
CodeQL Advanced / Analyze (python) (push) Waiting to run
Pre-commit / pre-commit (push) Waiting to run
Test / Run Python Tests (push) Waiting to run
Some checks are pending
CodeQL Advanced / Analyze (actions) (push) Waiting to run
CodeQL Advanced / Analyze (javascript-typescript) (push) Waiting to run
CodeQL Advanced / Analyze (python) (push) Waiting to run
Pre-commit / pre-commit (push) Waiting to run
Test / Run Python Tests (push) Waiting to run
Co-authored-by: Douglas <douglas.ym.lai@gmail.com>
This commit is contained in:
parent
fb740bbe3f
commit
343050b47f
200 changed files with 74603 additions and 33 deletions
129
resources/example-skills/pdf/scripts/extract_form_structure.py
Executable file
129
resources/example-skills/pdf/scripts/extract_form_structure.py
Executable file
|
|
@ -0,0 +1,129 @@
|
|||
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
|
||||
|
||||
"""
|
||||
Extract form structure from a non-fillable PDF.
|
||||
|
||||
This script analyzes the PDF to find:
|
||||
- Text labels with their exact coordinates
|
||||
- Horizontal lines (row boundaries)
|
||||
- Checkboxes (small rectangles)
|
||||
|
||||
Output: A JSON file with the form structure that can be used to generate
|
||||
accurate field coordinates for filling.
|
||||
|
||||
Usage: python extract_form_structure.py <input.pdf> <output.json>
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import pdfplumber
|
||||
|
||||
|
||||
def _round1(value):
    """Return *value* converted to float and rounded to one decimal place."""
    return round(float(value), 1)


def extract_form_structure(pdf_path):
    """Collect the layout elements of a PDF that help locate form fields.

    Every page of the document is scanned with pdfplumber and the findings
    are accumulated, across all pages, into a single dictionary.

    Args:
        pdf_path: Path of the PDF to analyze; passed to ``pdfplumber.open``.

    Returns:
        A JSON-serializable dict with these keys:

        - ``"pages"``: one entry per page with its 1-based ``page_number``,
          ``width`` and ``height``.
        - ``"labels"``: one entry per extracted word with its ``text`` and
          bounding box (``x0``, ``top``, ``x1``, ``bottom``).
        - ``"lines"``: line objects spanning more than half the page width,
          each with ``y`` (the line's ``top``), ``x0`` and ``x1``.
        - ``"checkboxes"``: small, nearly square rectangles with their
          bounding box and center point.
        - ``"row_boundaries"``: for each page, the gaps between consecutive
          distinct ``y`` values of the kept lines (``row_top``,
          ``row_bottom``, ``row_height``).

        All coordinates except the page dimensions are rounded to one
        decimal place.
    """
    structure = {
        "pages": [],
        "labels": [],
        "lines": [],
        "checkboxes": [],
        "row_boundaries": [],
    }

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            # Convert once so the width threshold below is plain float
            # arithmetic even if pdfplumber reports a non-float numeric
            # (e.g. Decimal, which cannot be multiplied by a float).
            page_width = float(page.width)
            structure["pages"].append({
                "page_number": page_num,
                "width": page_width,
                "height": float(page.height),
            })

            structure["labels"].extend(
                {
                    "page": page_num,
                    "text": word["text"],
                    "x0": _round1(word["x0"]),
                    "top": _round1(word["top"]),
                    "x1": _round1(word["x1"]),
                    "bottom": _round1(word["bottom"]),
                }
                for word in page.extract_words()
            )

            # Only lines wider than half the page are kept; they are the
            # ones later treated as row separators.
            min_line_width = page_width * 0.5
            for line in page.lines:
                if abs(float(line["x1"]) - float(line["x0"])) <= min_line_width:
                    continue
                structure["lines"].append({
                    "page": page_num,
                    "y": _round1(line["top"]),
                    "x0": _round1(line["x0"]),
                    "x1": _round1(line["x1"]),
                })

            for rect in page.rects:
                x0 = float(rect["x0"])
                x1 = float(rect["x1"])
                top = float(rect["top"])
                bottom = float(rect["bottom"])
                width = x1 - x0
                height = bottom - top
                # A checkbox candidate is 5-15 units on each side and
                # roughly square (sides differ by less than 2 units).
                if not (5 <= width <= 15 and 5 <= height <= 15):
                    continue
                if abs(width - height) >= 2:
                    continue
                structure["checkboxes"].append({
                    "page": page_num,
                    "x0": round(x0, 1),
                    "top": round(top, 1),
                    "x1": round(x1, 1),
                    "bottom": round(bottom, 1),
                    "center_x": round((x0 + x1) / 2, 1),
                    "center_y": round((top + bottom) / 2, 1),
                })

    # Group the y positions of the kept lines by page.
    ys_by_page = {}
    for line in structure["lines"]:
        ys_by_page.setdefault(line["page"], []).append(line["y"])

    # Each pair of neighbouring distinct y values delimits one row.
    for page_no, ys in ys_by_page.items():
        ordered = sorted(set(ys))
        for row_top, row_bottom in zip(ordered, ordered[1:]):
            structure["row_boundaries"].append({
                "page": page_no,
                "row_top": row_top,
                "row_bottom": row_bottom,
                "row_height": round(row_bottom - row_top, 1),
            })

    return structure
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Expects exactly two arguments, the input PDF path and the output JSON
    path. Prints a usage line and exits with status 1 otherwise. On success
    the extracted structure is written as indented JSON and a short summary
    of what was found is printed.
    """
    if len(sys.argv) != 3:
        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
        sys.exit(1)

    _, pdf_path, output_path = sys.argv

    print(f"Extracting structure from {pdf_path}...")
    structure = extract_form_structure(pdf_path)

    with open(output_path, "w") as out_file:
        json.dump(structure, out_file, indent=2)

    # (key in the structure dict, wording used in the summary line)
    summary_rows = (
        ("pages", "pages"),
        ("labels", "text labels"),
        ("lines", "horizontal lines"),
        ("checkboxes", "checkboxes"),
        ("row_boundaries", "row boundaries"),
    )
    print("Found:")
    for key, description in summary_rows:
        print(f" - {len(structure[key])} {description}")
    print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue