mirror of
https://github.com/eigent-ai/eigent.git
synced 2026-04-29 12:10:24 +00:00
Some checks are pending
CodeQL Advanced / Analyze (actions) (push) Waiting to run
CodeQL Advanced / Analyze (javascript-typescript) (push) Waiting to run
CodeQL Advanced / Analyze (python) (push) Waiting to run
Pre-commit / pre-commit (push) Waiting to run
Test / Run Python Tests (push) Waiting to run
Co-authored-by: Douglas <douglas.ym.lai@gmail.com>
125 lines
4.3 KiB
Python
Executable file
125 lines
4.3 KiB
Python
Executable file
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# ========= Copyright 2025-2026 @ Eigent.ai All Rights Reserved. =========
|
|
|
|
"""
|
|
Command line tool to validate Office document XML files against XSD schemas and tracked changes.
|
|
|
|
Usage:
|
|
python validate.py <path> [--original <original_file>] [-v] [--auto-repair] [--author NAME]
|
|
|
|
The first argument can be either:
|
|
- An unpacked directory containing the Office document XML files
|
|
- A packed Office file (.docx/.pptx/.xlsx) which will be unpacked to a temp directory
|
|
|
|
Auto-repair fixes:
|
|
- paraId/durableId values that exceed OOXML limits
|
|
- Missing xml:space="preserve" on w:t elements with whitespace
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
import tempfile
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Validate Office document XML files")
|
|
parser.add_argument(
|
|
"path",
|
|
help="Path to unpacked directory or packed Office file (.docx/.pptx/.xlsx)",
|
|
)
|
|
parser.add_argument(
|
|
"--original",
|
|
required=False,
|
|
default=None,
|
|
help="Path to original file (.docx/.pptx/.xlsx). If omitted, all XSD errors are reported and redlining validation is skipped.",
|
|
)
|
|
parser.add_argument(
|
|
"-v",
|
|
"--verbose",
|
|
action="store_true",
|
|
help="Enable verbose output",
|
|
)
|
|
parser.add_argument(
|
|
"--auto-repair",
|
|
action="store_true",
|
|
help="Automatically repair common issues (hex IDs, whitespace preservation)",
|
|
)
|
|
parser.add_argument(
|
|
"--author",
|
|
default="Claude",
|
|
help="Author name for redlining validation (default: Claude)",
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
path = Path(args.path)
|
|
assert path.exists(), f"Error: {path} does not exist"
|
|
|
|
original_file = None
|
|
if args.original:
|
|
original_file = Path(args.original)
|
|
assert original_file.is_file(), f"Error: {original_file} is not a file"
|
|
assert original_file.suffix.lower() in [".docx", ".pptx", ".xlsx"], (
|
|
f"Error: {original_file} must be a .docx, .pptx, or .xlsx file"
|
|
)
|
|
|
|
file_extension = (original_file or path).suffix.lower()
|
|
assert file_extension in [".docx", ".pptx", ".xlsx"], (
|
|
f"Error: Cannot determine file type from {path}. Use --original or provide a .docx/.pptx/.xlsx file."
|
|
)
|
|
|
|
if path.is_file() and path.suffix.lower() in [".docx", ".pptx", ".xlsx"]:
|
|
temp_dir = tempfile.mkdtemp()
|
|
with zipfile.ZipFile(path, "r") as zf:
|
|
zf.extractall(temp_dir)
|
|
unpacked_dir = Path(temp_dir)
|
|
else:
|
|
assert path.is_dir(), f"Error: {path} is not a directory or Office file"
|
|
unpacked_dir = path
|
|
|
|
match file_extension:
|
|
case ".docx":
|
|
validators = [
|
|
DOCXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose),
|
|
]
|
|
if original_file:
|
|
validators.append(
|
|
RedliningValidator(unpacked_dir, original_file, verbose=args.verbose, author=args.author)
|
|
)
|
|
case ".pptx":
|
|
validators = [
|
|
PPTXSchemaValidator(unpacked_dir, original_file, verbose=args.verbose),
|
|
]
|
|
case _:
|
|
print(f"Error: Validation not supported for file type {file_extension}")
|
|
sys.exit(1)
|
|
|
|
if args.auto_repair:
|
|
total_repairs = sum(v.repair() for v in validators)
|
|
if total_repairs:
|
|
print(f"Auto-repaired {total_repairs} issue(s)")
|
|
|
|
success = all(v.validate() for v in validators)
|
|
|
|
if success:
|
|
print("All validations PASSED!")
|
|
|
|
sys.exit(0 if success else 1)
|
|
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|